avr-crypto-lib/serpent/serpent-asm.S

755 lines
10 KiB
ArmAsm

/* serpent_asm.S */
/*
This file is part of the AVR-Crypto-Lib.
Copyright (C) 2006-2015 Daniel Otte (bg@nerilex.org)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* File: serpent-asm.S
* Author: Daniel Otte
* Date: 2008-08-07
* License: GPLv3 or later
* Description: Assembly implementation of the serpent linear transformation,
*              key schedule (serpent_init), encryption and decryption.
*
*/
#include <avr/io.h>
#include "avr-asm-macros.S"
/*
static void serpent_lt(uint8_t *b){
X0 = rotl32(X0, 13);
X2 = rotl32(X2, 3);
X1 ^= X0 ^ X2;
X3 ^= X2 ^ (X0 << 3);
X1 = rotl32(X1, 1);
X3 = rotl32(X3, 7);
X0 ^= X1 ^ X3;
X2 ^= X3 ^ (X1 << 7);
X0 = rotl32(X0, 5);
X2 = rotr32(X2, 10);
}
*/
#if 0
/*
 * Disabled draft of serpent_lt that keeps the whole state in registers.
 * Everything up to the matching #endif is dropped by the preprocessor, so
 * none of the symbols defined here exist in the assembled file; the
 * serpent_lt that is actually built is the memory-based one further down.
 * NOTE(review): this draft is not assembled or verified; treat it as a sketch.
 */
/* register assignment: A = X0, B = X1, C = X2, D = X3, T = scratch word */
A0 = 4
A1 = 5
A2 = 6
A3 = 7
B0 = 8
B1 = 9
B2 = 10
B3 = 11
C0 = 12
C1 = 13
C2 = 14
C3 = 15
D0 = 16
D1 = 17
D2 = 18
D3 = 19
T0 = 20
T1 = 21
T2 = 22
T3 = 23
serpent_lt:
push_range 4, 17
movw r26, r24
/* load X0 with its bytes in swapped order, then shift right 3 times */
ld A2, X+
ld A3, X+
ld A0, X+
ld A1, X+
ldi r20, 3
mov r0, A0
1:
lsr r0
ror A3
ror A2
ror A1
ror A0
dec r20
brne 1b
/* load X1 unchanged */
ld B0, X+
ld B1, X+
ld B2, X+
ld B3, X+
/* load X2 with its bytes in swapped order, then shift right 3 times */
ld C2, X+
ld C3, X+
ld C0, X+
ld C1, X+
ldi r20, 3
mov r0, C0
1:
lsr r0
ror C3
ror C2
ror C1
ror C0
dec r20
brne 1b
/* load X3 unchanged */
ld D0, X+
ld D1, X+
ld D2, X+
ld D3, X+
/* X1 ^= X0 ^ X2; */
eor B0, A0
eor B0, C0
eor B1, A1
eor B1, C1
eor B2, A2
eor B2, C2
eor B3, A3
eor B3, C3
/* X3 ^= X2 ^ (X0 << 3); */
/* NOTE(review): the XORs below target C0..C3 and use B0..B3, which does not
   match the statement in the comment above - confirm before re-enabling */
mov T0, A0
mov T1, A1
mov T2, A2
mov T3, A3
ldi r24, 3
1:
lsl T0
rol T1
rol T2
rol T3
dec r24
brne 1b
eor C0, B0
eor C0, T0
eor C1, B1
eor C1, T1
eor C2, B2
eor C2, T2
eor C3, B3
eor C3, T3
/* X1 = rotl32(X1, 1); */
mov r0, B3
lsl r0
rol B0
rol B1
rol B2
rol B3
/* X3 = rotl32(X3, 7); */
mov r0, D3
mov D3, D2
mov D2, D1
mov D1, D0
mov D0, r0
lsr r0
ror D3
ror D2
ror D1
ror D0
/* X0 ^= X1 ^ X3; */
eor A0, B0
eor A0, D0
eor A1, B1
eor A1, D1
eor A2, B2
eor A2, D2
eor A3, B3
eor A3, D3
/* X2 ^= X3 ^ (X1 << 7); */
mov T1, B0
mov T2, B1
mov T3, B2
clr T0
mov r0, B3
lsr r0
ror T2
ror T1
ror T0
eor C0, D0
eor C0, T0
eor C1, D1
eor C1, T1
eor C2, D2
eor C2, T2
eor C3, D3
eor C3, T3
/* X0 = rotl32(X0, 5); */
ldi r24, 5
mov r0, A3
1:
lsl r0
rol A0
rol A1
rol A2
rol A3
dec r24
brne 1b
/* X2 = rotr32(X2, 10); */
mov r0, C0
mov C0, C1
mov C1, C2
mov C2, C3
mov C3, r0
ldi r24, 2
1:
lsr r0
ror C2
ror C1
ror C0
ror C3
dec r24
brne 1b
/* write registers D3 down to A0 back through X: Z addresses the register
   file as data memory, starting one past D3 */
clr r31
ldi r30, D3+1
ldi r24, 16
1:
ld r0, -Z
st -X, r0
dec r24
brne 1b
pop_range 4, 17
ret
#endif
/* scratch registers shared by memrotr32, memrotl32 and memeor32 */
T0 = 22 /* word byte 0 (lowest address) */
T1 = 23 /* word byte 1 */
T2 = 24 /* word byte 2; memeor32 uses it as its byte counter */
T3 = 25 /* word byte 3 (highest address) */
TT = 21 /* copy of the byte whose bits wrap around during a rotation */
/*
 * memrotr32: rotate the 4-byte word at X right by r20 bits, in place.
 * The byte at the lowest address is the least significant one.
 *   in:  X (r26:r27) -> word, r20 = bit count
 *   out: word rotated in memory, X unchanged, r20 = 0
 *   trashes: T0-T3, TT, r20
 * TT supplies the wrapped-around bits from an 8-bit copy of the low byte,
 * so only counts 1..8 give a true rotation. The count is decremented before
 * it is tested, so r20 = 0 would run the loop 256 times.
 */
memrotr32:
	ld T0, X+
	mov TT, T0              /* low byte feeds the bits that wrap to the top */
	ld T1, X+
	ld T2, X+
	ld T3, X+
.Lmemrotr32_bit:
	lsr TT                  /* carry = next bit leaving the low end */
	ror T3
	ror T2
	ror T1
	ror T0
	dec r20
	brne .Lmemrotr32_bit
	/* store back high to low; the pre-decrements return X to the word start */
	st -X, T3
	st -X, T2
	st -X, T1
	st -X, T0
	ret
/*
 * memrotl32: rotate the 4-byte word at X left by r20 bits, in place.
 * The byte at the lowest address is the least significant one.
 *   in:  X (r26:r27) -> word, r20 = bit count
 *   out: word rotated in memory, X unchanged, r20 = 0
 *   trashes: T0-T3, TT, r20
 * TT supplies the wrapped-around bits from an 8-bit copy of the high byte,
 * so only counts 1..8 give a true rotation. The count is decremented before
 * it is tested, so r20 = 0 would run the loop 256 times.
 */
memrotl32:
	ld T0, X+
	ld T1, X+
	ld T2, X+
	ld T3, X+
	mov TT, T3              /* high byte feeds the bits that wrap to the bottom */
.Lmemrotl32_bit:
	lsl TT                  /* carry = next bit leaving the high end */
	rol T0
	rol T1
	rol T2
	rol T3
	dec r20
	brne .Lmemrotl32_bit
	/* store back high to low; the pre-decrements return X to the word start */
	st -X, T3
	st -X, T2
	st -X, T1
	st -X, T0
	ret
/*
 * memeor32: XOR the 4 bytes at Z into the 4 bytes at X.
 *   in:  X (r26:r27) -> destination, Z (r30:r31) -> source
 *   out: destination updated; X and Z both advanced by 4
 *   trashes: T0, T1, T2
 */
memeor32:
	ldi T2, 4               /* bytes left to process */
.Lmemeor32_byte:
	ld T1, Z+               /* source byte */
	ld T0, X                /* destination byte */
	eor T0, T1
	st X+, T0
	dec T2
	brne .Lmemeor32_byte
	ret
/*
 * serpent_lt: the linear transformation shown in the C reference at the top
 * of this file, applied in place to the four 32-bit words X0..X3 at b.
 * param b is passed in r24:r25
 * The words stay in memory and are reached through X (r26:r27) and
 * Z (r30:r31). memrotl32/memrotr32 leave X where it was, memeor32 advances
 * both X and Z by 4; every adiw/sbiw below compensates for exactly that.
 * The temporary word T lives in r18..r21 and is handed to memeor32 by
 * pointing Z at data address 18, which relies on the register file being
 * mapped into the data address space.
 * trashes: r18-r27, r30-r31
 */
serpent_lt:
/* X0 := X0 <<< 13 (as 7 + 6; the helper rotates correctly by at most 8) */
movw r26, r24
ldi r20, 7
rcall memrotl32
ldi r20, 6
rcall memrotl32
/* X2 := X2 <<< 3 */
adiw r26, 8
ldi r20, 3
rcall memrotl32
/* X1 ^= X2 (Z = X2, X = X1; afterwards X = X2, Z = X3) */
movw r30, r26
sbiw r26, 4
rcall memeor32
/* X1 ^= X0 (X back to X1, Z = X0; afterwards X = X2, Z = X1) */
sbiw r26, 4
sbiw r30, 12
rcall memeor32
/* X3 ^= X2 (Z = X2, X = X3; afterwards X = end of state) */
movw r30, r26
adiw r26, 4
rcall memeor32
/* T := X0 (r18 = lowest byte) */
sbiw r26, 16
ld r18, X+
ld r19, X+
ld r20, X+
ld r21, X+
/* T := T<<3 */
ldi r22, 3
1:
lsl r18
rol r19
rol r20
rol r21
dec r22
brne 1b
clr r31
/* X3 ^= T (X moves from X1 to X3; Z = 18 addresses r18..r21) */
adiw r26, 8
ldi r30, 18
rcall memeor32
/* X1 := X1<<<1 */
sbiw r26, 12
ldi r20, 1
rcall memrotl32
/* X3 := X3<<<7 */
adiw r26, 8
ldi r20, 7
rcall memrotl32
/* X0 ^= X3 (Z = X3, X = X0; afterwards X = X1) */
movw r30, r26
sbiw r26, 12
rcall memeor32
/* X0 ^= X1 (Z = X1, X = X0; afterwards X = X1, Z = X2) */
movw r30, r26
sbiw r26, 4
rcall memeor32
/* X2 ^= X3 (X = X2, Z = X3; afterwards X = X3) */
adiw r26, 4
adiw r30, 4
rcall memeor32
/* T := X1<<<8 (bytes loaded one position up, top byte lands in r18) */
sbiw r26, 8
ld r19, X+
ld r20, X+
ld r21, X+
ld r18, X+
/* T := T>>>1; T &= 0xffffff80, which leaves T = X1<<7 */
lsr r18
ror r21
ror r20
ror r19
clr r18 /* clr leaves the carry flag alone, so the next ror still sees it */
ror r18
clr r31
ldi r30, 18
/* X2 ^= T (X is at X2 after the four loads above) */
rcall memeor32
/* X0 := X0 <<< 5 */
sbiw r26, 12
ldi r20, 5
rcall memrotl32
/* X2 := X2 >>> 10 (as 7 + 3; X = X0 + 8 addresses X2) */
adiw r26, 8
ldi r20, 7
rcall memrotr32
ldi r20, 3
rcall memrotr32
ret
/*
 * serpent_inv_lt: undoes serpent_lt by applying the inverse of each of its
 * steps in reverse order, in place on the four 32-bit words X0..X3 at b.
 * param b is passed in r24:r25
 * Uses the same conventions as serpent_lt: memrotl32/memrotr32 leave X
 * unchanged, memeor32 advances X and Z by 4, and the temporary word T in
 * r18..r21 is read by memeor32 through Z = 18 (register file addressed as
 * data memory).
 * trashes: r18-r27, r30-r31
 */
serpent_inv_lt:
/* X0 := X0 >>> 5 */
movw r26, r24
ldi r20, 5
rcall memrotr32
/* X2 := X2 <<< 10 (as 7 + 3) */
adiw r26, 8
ldi r20, 7
rcall memrotl32
ldi r20, 3
rcall memrotl32
/* X2 ^= X3 (afterwards X = X3, Z = end of state) */
movw r30, r26
adiw r30, 4
rcall memeor32
/* step back: X = X2, Z = X1 */
sbiw r26, 4
sbiw r30, 12
/* T := X1<<7 (load as X1<<<8, rotate right by one, clear the low 7 bits) */
ld r19, Z+
ld r20, Z+
ld r21, Z+
ld r18, Z+
lsr r18
ror r21
ror r20
ror r19
clr r18 /* clr leaves the carry flag alone, so the next ror still sees it */
ror r18
clr r31
/* X2 ^= T (afterwards X = X3) */
ldi r30, 18
rcall memeor32
/* X0 ^= X1 (X = X0, Z = X1; afterwards X = X1, Z = X2) */
sbiw r26, 12
movw r30, r26
adiw r30, 4
rcall memeor32
/* X0 ^= X3 (X = X0, Z = X3; afterwards X = X1, Z = end of state) */
sbiw r26, 4
adiw r30, 4
rcall memeor32
/* X1 := X1>>>1 */
ldi r20, 1
rcall memrotr32
/* X3 := X3>>>7 */
adiw r26, 8
ldi r20, 7
rcall memrotr32
/* X3 ^= X2 (Z moves from end of state back to X2; afterwards Z = X3) */
sbiw r30, 8
rcall memeor32
/* X back to X3 */
sbiw r26, 4
/* T:= X0<<3 */
sbiw r30, 12
ld r18, Z+
ld r19, Z+
ld r20, Z+
ld r21, Z+
ldi r24, 3
1:
lsl r18
rol r19
rol r20
rol r21
dec r24
brne 1b
/* X3 ^= T (afterwards X = end of state) */
clr r31
ldi r30, 18
rcall memeor32
/* X1 ^= X0 (X = X1, Z = X0; afterwards X = X2, Z = X1) */
sbiw r26, 12
movw r30, r26
sbiw r30, 4
rcall memeor32
/* X1 ^= X2 (X = X1, Z = X2; afterwards X = X2) */
movw r26, r30
adiw r30, 4
rcall memeor32
/* X2 := X2 >>> 3 */
ldi r20, 3
rcall memrotr32
/* X0 := X0 >>> 13 (as 7 + 6) */
sbiw r26, 8
ldi r20, 7
rcall memrotr32
ldi r20, 6
rcall memrotr32
ret
/*
#define GOLDEN_RATIO 0x9e3779b9l
static uint32_t serpent_gen_w(uint32_t * b, uint8_t i){
uint32_t ret;
ret = b[0] ^ b[3] ^ b[5] ^ b[7] ^ GOLDEN_RATIO ^ (uint32_t)i;
ret = rotl32(ret, 11);
return ret;
}
*/
/*
 * serpent_gen_w: compute one word of the key expansion,
 *   rotl32(b[0] ^ b[3] ^ b[5] ^ b[7] ^ 0x9e3779b9 ^ i, 11)
 * param b is passed in r24:r25
 * param i is passed in r22
 * return value is returned in r22.r23.r24.r25 (r22 is the low byte)
 */
/* trashes:
 * r20-r25, r30-r31
 */
serpent_gen_w:
	movw r30, r24
	/* r22..r25 = b[0] ^ i */
	ld r21, Z+
	eor r22, r21
	ld r23, Z+
	ld r24, Z+
	ld r25, Z+
	/* fold in b[3], b[5], b[7]: Z sits on b[2], b[4], b[6] in turn and the
	   word right behind it is read with a displacement */
	adiw r30, 4
	ldi r20, 3
.Lserpent_gen_w_xor:
	ldd r21, Z+4
	eor r22, r21
	ldd r21, Z+5
	eor r23, r21
	ldd r21, Z+6
	eor r24, r21
	ldd r21, Z+7
	eor r25, r21
	adiw r30, 8
	dec r20
	brne .Lserpent_gen_w_xor
	/* ^0x9e3779b9l, most significant byte first */
	ldi r21, 0x9e
	eor r25, r21
	ldi r21, 0x37
	eor r24, r21
	ldi r21, 0x79
	eor r23, r21
	ldi r21, 0xb9
	eor r22, r21
	/* <<<11: move every byte one position up (8 bits) ... */
	mov r21, r25
	mov r25, r24
	mov r24, r23
	mov r23, r22
	mov r22, r21
	/* ... then 3 single-bit steps; r21 mirrors the top byte and supplies
	   the bit that wraps around into r22 */
	mov r21, r25
	ldi r20, 3
.Lserpent_gen_w_rot:
	lsl r21
	rol r22
	rol r23
	rol r24
	rol r25
	dec r20
	brne .Lserpent_gen_w_rot
	ret
/*
 * void serpent_init(const void *key, uint16_t keysize_b, serpent_ctx_t *ctx)
 *
 * Expands the key into 132 32-bit words stored at ctx and runs sbox128 over
 * each of the 33 resulting 16-byte blocks.
 *
 * Keys shorter than 256 bit are padded in a 32-byte stack buffer: the key
 * bytes are copied, a single 1 bit is set at bit position keysize_b and the
 * rest stays zero. Key bits above keysize_b in a partial last byte are taken
 * from the caller as they are. keysize_b >= 256 uses the first 32 key bytes
 * without padding.
 */
/*
 * param key is passed in r24:r25
 * param keysize is passed in r22:r23
 * param ctx is passed in r20:r21
 */
.global serpent_init
serpent_init:
	/* stack_alloc comes from avr-asm-macros.S; the code below assumes it
	   leaves Z one byte below the reserved area - TODO confirm */
	stack_alloc 32
	adiw r30, 1             /* Z -> first byte of the buffer */
	push_ r30, r31          /* keep the buffer address for later */
	/* split keysize_b into a byte count (r22) and the leftover bit count
	   (r23). The bit count has to live in its own register because the
	   copy loop below counts r22 down to zero. */
	clt                     /* T = 0: short key, padding bit needed */
	tst r23
	breq 1f
	set                     /* T = 1: keysize_b >= 256, no padding bit */
	ldi r22, 32
	rjmp 2f
1:
	mov r23, r22
	andi r23, 0x07          /* r23 = keysize_b % 8 */
	lsr r22
	lsr r22
	lsr r22                 /* r22 = keysize_b / 8 */
	tst r23
	breq 2f
	inc r22                 /* a partial last byte is copied as well */
2:
	ldi r27, 32
3:	/* set buffer to zero (r1 is expected to hold 0) */
	st Z+, r1
	dec r27
	brne 3b
	movw r26, r24           /* X points to the key */
	sbiw r30, 32            /* Z back to the start of the buffer */
	tst r22
	breq 5f                 /* if keylength_b==0 */
4:	/* copy keybytes to buffer; r19 keeps the last byte copied */
	ld r19, X+
	st Z+, r19
	dec r22
	brne 4b
5:
	brts 7f                 /* full-size key: nothing to append */
	ldi r18, 0x01
	tst r23
	brne 6f
	st Z, r18               /* whole bytes only: the next byte becomes 0x01 */
	rjmp 7f
6:	/* shift the one to the right position inside the partial last byte */
	lsl r18
	dec r23
	brne 6b
	or r18, r19
	st -Z, r18
7:	/* post "appending 1 thing" buffer is ready for subkey generation */
	movw r26, r20           /* X points to the context */
	pop_ r19, r18           /* r18:r19 points to the buffer */
	push r16
	clr r16                 /* r16 = index of the word being generated */
8:
	movw r24, r18
	mov r22, r16
	rcall serpent_gen_w     /* new word in r22..r25; r18, r19 and X survive */
	movw r30, r18
	ldi r20, 7*4
1:	/* the memmove: slide the buffer down by one word */
	ldd r0, Z+4
	st Z+, r0
	dec r20
	brne 1b
	/* store new word in buffer and context */
	st Z+, r22
	st Z+, r23
	st Z+, r24
	st Z+, r25
	st X+, r22
	st X+, r23
	st X+, r24
	st X+, r25
	inc r16
	cpi r16, 132
	brne 8b
	push_ r28, r29
	movw r28, r26
	subi r28, lo8(132*4)
	sbci r29, hi8(132*4)    /* Y = start of the context again */
	ldi r16, 33
2:	/* sbox128 on each 16-byte block, with index r16+2 counting 35 down to 3 */
	/* NOTE(review): sbox128 is defined elsewhere; presumably it only uses
	   the low 3 bits of the index - verify */
	movw r24, r28
	adiw r28, 16
	ldi r22, 2
	add r22, r16
	rcall sbox128
	dec r16
	brne 2b
	pop_ r29, r28, r16
	stack_free 32
	ret
/*
 * void serpent_enc(void *buffer, const serpent_ctx_t *ctx){
 *
 * Encrypts the 16 bytes at buffer in place. Rounds 0..30 each XOR a 16-byte
 * round key into the buffer, run sbox128 with the round number and apply
 * serpent_lt. Round 31 does the key XOR and sbox128 only, followed by one
 * more key XOR with the 33rd round key.
 */
/*
 * param buffer is passed in r24:r25
 * param ctx is passed in r22:r23
 */
/* NOTE(review): memxor and sbox128 are defined elsewhere; the calls below
 * pass (dest, src, 16-bit length in r20:r21) and (block, index in r22) -
 * confirm against their definitions. r1 is expected to hold 0.
 */
.global serpent_enc
serpent_enc:
	push_ r12, r13, r14, r15, r16
	clr r16                 /* r16 = round number */
	movw r14, r24           /* r14:r15 = buffer */
	movw r12, r22           /* r12:r13 = current round key */
1:
	/* buffer ^= round key, then advance the key pointer by 16 */
	movw r24, r14
	movw r22, r12
	ldi r20, 16
	add r12, r20
	adc r13, r1
	clr r21
	rcall memxor
	movw r24, r14
	mov r22, r16
	rcall sbox128
	movw r24, r14
	rcall serpent_lt
	inc r16
	cpi r16, 31
	brne 1b
	/* round 31: key XOR and sbox only, no linear transformation */
	movw r24, r14
	movw r22, r12
	ldi r20, 16
	add r12, r20
	adc r13, r1
	clr r21
	rcall memxor
	movw r24, r14
	mov r22, r16
	rcall sbox128
	/* final key XOR; the round counter is no longer needed, so it is not
	   advanced again before r16 is restored */
	movw r24, r14
	movw r22, r12
	ldi r20, 16
	clr r21
	pop_ r16, r15, r14, r13, r12
	rjmp memxor             /* tail call: memxor returns to our caller */
/*
 * void serpent_dec(void *buffer, const serpent_ctx_t *ctx){
 *
 * Decrypts the 16 bytes at buffer in place by walking the round keys
 * backwards: XOR with the last round key, inv_sbox128 with index 31, XOR with
 * the previous key, then for rounds 30 down to 0 serpent_inv_lt, inv_sbox128
 * and a key XOR.
 */
/*
 * param buffer is passed in r24:r25
 * param ctx is passed in r22:r23
 */
/* NOTE(review): memxor and inv_sbox128 are defined elsewhere; the calls below
 * pass (dest, src, 16-bit length in r20:r21) and (block, index in r22) -
 * confirm against their definitions. r1 is expected to hold 0.
 */
.global serpent_dec
serpent_dec:
	push_ r12, r13, r14, r15, r16
	movw r14, r24           /* r14:r15 = buffer */
	/* r22:r23 = ctx + 32*16, the last round key. lo8(32*16) is 0, so only
	   the high byte needs adjusting. */
	ldi r16, hi8(32*16)
	add r23, r16
	movw r12, r22           /* r12:r13 = current round key */
	ldi r20, 16
	clr r21
	rcall memxor
	movw r24, r14
	ldi r22, 31
	rcall inv_sbox128       /* rcall, like the call inside the loop below */
	/* step back one round key and XOR it in */
	movw r24, r14
	ldi r20, 16
	sub r12, r20
	sbc r13, r1
	movw r22, r12
	clr r21
	rcall memxor
	ldi r16, 31
1:
	dec r16                 /* r16 = round number, 30 down to 0 */
	movw r24, r14
	rcall serpent_inv_lt
	movw r24, r14
	mov r22, r16
	rcall inv_sbox128
	movw r24, r14
	ldi r20, 16
	sub r12, r20
	sbc r13, r1
	movw r22, r12
	clr r21
	rcall memxor
	tst r16
	brne 1b
	pop_ r16, r15, r14, r13, r12
	ret