avr-crypto-lib/noekeon_asm.S

622 lines
10 KiB
ArmAsm

/*
* noekeon assembler implementation for avr
* author: Daniel Otte
* email: daniel.otte@rub.de
* license: GPLv3
*/
#include <avr/io.h>
/*
 * push_all: save r2-r17, r28 and r29 (the call-saved registers of the
 * avr-gcc ABI) on the stack, followed by the status register SREG.
 * r28 is reused as scratch for reading SREG; its own value has already
 * been pushed at that point. Must be undone with pop_all.
 */
.macro push_all
.irp reg, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 28, 29
    push r\reg
.endr
    in r28, _SFR_IO_ADDR(SREG)
    push r28
.endm
/*
 * pop_all: inverse of push_all. Restores SREG first (via r28 as scratch),
 * then r29, r28 and r17 down to r2, in the exact reverse of the push order.
 * Finally r1 is cleared so that it holds zero when control returns to the
 * caller.
 */
.macro pop_all
    pop r28
    out _SFR_IO_ADDR(SREG), r28
.irp reg, 29, 28, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2
    pop r\reg
.endr
    clr r1
.endm
/*
 * xchg a b: exchange the contents of registers a and b with three XORs,
 * so no scratch register is needed. The flags are changed by eor.
 * If a and b name the same register, that register ends up zero.
 */
.macro xchg a b
eor \a, \b
eor \b, \a
eor \a, \b
.endm
/*
 * op32 op a b: apply the two-operand instruction op byte by byte to two
 * 32-bit values. The operands are symbol prefixes; the suffixes _0 .. _3
 * are appended to form the four register names of each value
 * (e.g. "op32 eor, state2, state0" emits eor state2_0, state0_0 and so on).
 */
.macro op32 op a b
\op \a\()_0, \b\()_0
\op \a\()_1, \b\()_1
\op \a\()_2, \b\()_2
\op \a\()_3, \b\()_3
.endm
/*
 * op32_4t op a b c d w x y z: apply the two-operand instruction op to the
 * four explicitly named operand pairs (a,w), (b,x), (c,y), (d,z), in that
 * order. Unlike op32, the operand names are used as given.
 */
.macro op32_4t op a b c d w x y z
\op \a, \w
\op \b, \x
\op \c, \y
\op \d, \z
.endm
/*
 * op32_prefix op p q a b c d w x y z: like op32_4t, but every destination
 * name is formed as prefix p followed by a/b/c/d, and every source name as
 * prefix q followed by w/x/y/z. This allows a permuted byte order on either
 * side (e.g. "op32_prefix eor, temp_, r, a,b,c,d, 1,1,1,1" emits
 * eor temp_a, r1 ... eor temp_d, r1).
 */
.macro op32_prefix op p q a b c d w x y z
\op \p\()\a, \q\()\w
\op \p\()\b, \q\()\x
\op \p\()\c, \q\()\y
\op \p\()\d, \q\()\z
.endm
; === bigendian_rotl32 ===
; this function rotates a 32-bit big-endian word n bits to the left
;  param1: the 32-bit value
;          given in r25,r24,r23,r22 (r22 is most significant)
;  param2: the 8-bit parameter giving the number of bits to rotate
;          given in r20 (a value of 0 runs 256 passes, which leaves the
;          word unchanged)
;  return: the rotated 32-bit word
;          given in r25,r24,r23,r22
;  clobbers: r20 (zero on return), r1 (zero on return), flags
bigendian_rotl32:
2:
    /*
     * Fetch the bit that wraps around from the CURRENT most significant
     * byte on every pass. Taking a single copy of r22 before the loop only
     * provides eight valid wrap-around bits, which makes the result wrong
     * for rotation counts above 8.
     */
    mov r1, r22
    lsl r1              /* C = bit 7 of r22 */
    rol r25             /* least significant byte takes the wrapped bit */
    rol r24
    rol r23
    rol r22
    dec r20
    brne 2b
bigendian_rotl32_exit:
    clr r1              /* r1 was used as scratch; hand it back as zero */
    ret
/******************************************************************************/
; === bigendian_rotr32 ===
; this function rotates a 32-bit big-endian word n bits to the right
;  param1: the 32-bit value
;          given in r25,r24,r23,r22 (r22 is most significant)
;  param2: the 8-bit parameter giving the number of bits to rotate
;          given in r20 (a value of 0 runs 256 passes, which leaves the
;          word unchanged)
;  return: the rotated 32-bit word
;          given in r25,r24,r23,r22
;  clobbers: r20 (zero on return), r1 (zero on return), flags
bigendian_rotr32:
2:
    /*
     * Fetch the bit that wraps around from the CURRENT least significant
     * byte on every pass. Taking a single copy of r25 before the loop only
     * provides eight valid wrap-around bits, which makes the result wrong
     * for rotation counts above 8.
     */
    mov r1, r25
    lsr r1              /* C = bit 0 of r25 */
    ror r22             /* most significant byte takes the wrapped bit */
    ror r23
    ror r24
    ror r25
    dec r20
    brne 2b
bigendian_rotr32_exit:
    clr r1              /* r1 was used as scratch; hand it back as zero */
    ret
/******************************************************************************/
/*
void theta(uint32_t* k, uint32_t* a){
uint32_t temp;
temp = a[0] ^ a[2]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
a[1] ^= temp;
a[3] ^= temp;
a[0] ^= k[0];
a[1] ^= k[1];
a[2] ^= k[2];
a[3] ^= k[3];
temp = a[1] ^ a[3]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
a[0] ^= temp;
a[2] ^= temp;
}
*/
/*
 * Table of 16 one-byte round constants. noekeon_enc and noekeon_dec fetch
 * entries from it with lpm, i.e. it is read from program memory.
 * The encryption loop reads entries 0..14 in ascending order and the
 * decryption loop reads them in descending order; both routines load the
 * values 0x80 and 0xD4 as immediates instead of taking them from here.
 */
round_const: .byte 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, \
0x2F, 0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, \
0xD4
/*
 * Register allocation of the 128-bit cipher state: the four 32-bit words
 * a[0]..a[3] occupy r2..r17. state<w>_<b> is the register number holding
 * byte b of word a[w].
 */
;-- a[0]
.set state0_0, 2
.set state0_1, 3
.set state0_2, 4
.set state0_3, 5
;-- a[1]
.set state1_0, 6
.set state1_1, 7
.set state1_2, 8
.set state1_3, 9
;-- a[2]
.set state2_0, 10
.set state2_1, 11
.set state2_2, 12
.set state2_3, 13
;-- a[3]
.set state3_0, 14
.set state3_1, 15
.set state3_2, 16
.set state3_3, 17
; === theta ===
; linear mixing step, operating in place on the state registers
;
;  param1: the state in r2-r17
;  param2: pointer to k in X (r26,r27); X is unchanged on return
;
;  clobbers: r0, r1 (zero on return), r18-r21, Y (r28,r29), flags
;
;  NOTE: the key addition walks the state through the Y pointer starting at
;  data address 2, so it relies on the register file being visible in the
;  data address space (r2 at address 2).
;
.set temp_a, 18
.set temp_b, 19
.set temp_c, 20
.set temp_d, 21
theta:
    /* temp = a[0] ^ a[2] */
    mov temp_a, state0_0
    mov temp_b, state0_1
    mov temp_c, state0_2
    mov temp_d, state0_3
    eor temp_a, state2_0
    eor temp_b, state2_1
    eor temp_c, state2_2
    eor temp_d, state2_3
    /*
     * temp ^= temp>>>8 ^ temp<<<8
     * r1 collects the XOR of all four bytes; XORing it into every byte
     * leaves each byte as the XOR of the other three, which is the wanted
     * result rotated by two byte positions.
     */
    mov r1, temp_a
    eor r1, temp_b
    eor r1, temp_c
    eor r1, temp_d
    eor temp_a, r1
    eor temp_b, r1
    eor temp_c, r1
    eor temp_d, r1
    /* temp is now held in the byte order c,d,a,b */
    /* a[1] ^= temp */
    op32_prefix eor, state1_, temp_, 0,1,2,3, c,d,a,b
    /* a[3] ^= temp */
    op32_prefix eor, state3_, temp_, 0,1,2,3, c,d,a,b
    /* state ^= k (X points to k, Y walks over r2..r17) */
    ldi r28, 2
    clr r29
    ldi temp_a, 16
1:
    ld r1, X+
    ld r0, Y
    eor r1, r0
    st Y+, r1
    dec temp_a
    brne 1b
    sbiw r26, 16            /* set X back to the start of the key */
    /* temp = a[1] ^ a[3]; temp ^= temp>>>8 ^ temp<<<8 */
    op32_prefix mov, temp_, state1_, a,b,c,d, 0,1,2,3
    op32_prefix eor, temp_, state3_, a,b,c,d, 0,1,2,3
    mov r1, temp_a
    eor r1, temp_b
    eor r1, temp_c
    eor r1, temp_d
    op32_prefix eor, temp_, r, a,b,c,d, 1,1,1,1
    /* temp is again held in the byte order c,d,a,b */
    /* a[0] ^= temp */
    op32_prefix eor, state0_, temp_, 0,1,2,3, c,d,a,b
    /* a[2] ^= temp */
    op32_prefix eor, state2_, temp_, 0,1,2,3, c,d,a,b
    clr r1
    ret
/******************************************************************************/
; === noekeon_enc ===
; encrypts one 16-byte block in place
;
; param1: pointer to buffer/state (r24,r25)
; param2: pointer to k (r22,r23)
;
; The state is kept in r2-r17 for the whole computation and the key pointer
; lives in X. Stack layout while the round loop runs (pushed in this order):
;   state pointer low, state pointer high, round counter, then per round the
;   two round constants that "round" pops again.
;
.global noekeon_enc
noekeon_enc:
push_all
/* load state */
movw r26, r22 /* X = key pointer, used by theta */
ldi r28, 2
clr r29 /* Y points at r2 aka state0_0 */
movw r30, r24 /* Z points at state */
push r30 /* keep the state pointer for the write-back at the end */
push r31
ldi r22, 16
push r22 /* 16 is also the number of rounds and gets pushed here */
1:
ld r0, Z+
st Y+, r0
dec r22
brne 1b
/* state loaded */
push r1 /* push round constant 2 (0x00); r1 is expected to hold zero here */
ldi r20, 0x80
push r20 /* push round constant 1 of the first round (0x80) */
rjmp 3f
2:
/* r22 = rounds left (15..1): select table entry 15 - r22 */
ldi r30, lo8(round_const+15)
ldi r31, hi8(round_const+15)
sub r30, r22
sbci r31, 0
clr r1
push r1 /* push round constant 2 (0x00) */
lpm r0, Z
push r0 /* push round constant 1 (from the table) */
3:
call round /* pops rc2 & rc1 */
pop r22 /* fetch, decrement and store the round counter */
dec r22
push r22
brne 2b
pop r22 /* drop the round counter */
/* final step: add the last constant 0xD4, then one more theta */
ldi r22, 0xD4
eor state0_3, r22
call theta
/* write the state back to the buffer */
pop r31
pop r30
clr r29
ldi r28, 2
ldi r22, 16
1:
ld r0, Y+
st Z+, r0
dec r22
brne 1b
pop_all
ret
/******************************************************************************/
/******************************************************************************/
; === noekeon_dec ===
; decrypts one 16-byte block in place
;
; param1: pointer to buffer/state (r24,r25)
; param2: pointer to k (r22,r23)
;
; A 16-byte working key is built on the stack by running theta over the key
; with an all-zero key vector; the rounds then use that working key.
; The stack pointer is changed with interrupts masked, so an interrupt can
; never run on a half-updated SPH:SPL pair.
; NOTE(review): the working key is not overwritten before the stack space is
; released.
;
.global noekeon_dec
noekeon_dec:
    push_all
    /* allocate 16 bytes on the stack */
    in r30, _SFR_IO_ADDR(SPL)
    in r31, _SFR_IO_ADDR(SPH)
    sbiw r30, 16
    in r0, _SFR_IO_ADDR(SREG)       /* remember the interrupt state */
    cli
    out _SFR_IO_ADDR(SPH), r31
    out _SFR_IO_ADDR(SREG), r0      /* restore the interrupt state ... */
    out _SFR_IO_ADDR(SPL), r30      /* ... SPL is written before any interrupt is taken */
    adiw r30, 1                     /* Z = first byte of the stack buffer */
    /* push state pointer */
    push r24
    push r25
    movw r26, r22                   /* move key ptr to X */
    /* set stackkey to zero (r1 is expected to hold zero here) */
    ldi r22, 16
1:
    st Z+, r1
    dec r22
    brne 1b
    /* copy key to state */
    clr r29
    ldi r28, 2
    ldi r22, 16
1:
    ld r0, X+
    st Y+, r0
    dec r22
    brne 1b
    movw r26, r30
    sbiw r26, 16                    /* set X back to beginning of stack key */
    call theta                      /* state = theta(zero key, key) */
    /* mov state to stackkey */
    clr r29
    ldi r28, 2
    ldi r22, 16
1:
    ld r0, Y+
    st X+, r0
    dec r22
    brne 1b
    sbiw r26, 16                    /* set X back to beginning of stack key */
    /* move data from stateptr to state */
    pop r31                         /* peek at the saved state pointer ... */
    pop r30
    push r30                        /* ... and put it back for the write-back */
    push r31
    clr r29
    ldi r28, 2
    ldi r22, 16
    push r22                        /* round counter (16) */
1:
    ld r0, Z+
    st Y+, r0
    dec r22
    brne 1b
    ;--- round loop ----
    ldi r20, 0xD4
    push r20                        /* push round constant 2 (0xD4) */
    push r22                        /* push round constant 1 (0x00, r22 is zero after the loop) */
    rjmp 3f
2:
    /* r22 = rounds left (15..1): select table entry r22 - 1 */
    ldi r30, lo8(round_const-1)
    ldi r31, hi8(round_const-1)
    clr r1
    add r30, r22
    adc r31, r1
    lpm r0, Z
    push r0                         /* push round constant 2 (from the table) */
    push r1                         /* push round constant 1 (0x00) */
3:
    call round                      /* pops rc2 & rc1 */
    pop r22                         /* fetch, decrement and store the round counter */
    dec r22
    push r22
    brne 2b
    ;----
    pop r22                         /* drop the round counter */
    call theta
    ldi r22, 0x80
    eor state0_3, r22
write_state_back:
    /* write state back */
    pop r31                         /* pop state pointer */
    pop r30
    clr r29
    ldi r28, 2
    ldi r22, 16
1:
    ld r0, Y+
    st Z+, r0
    dec r22
    brne 1b
    /* remove key from stack */
    in r30, _SFR_IO_ADDR(SPL)
    in r31, _SFR_IO_ADDR(SPH)
    adiw r30, 16
    in r0, _SFR_IO_ADDR(SREG)       /* same interrupt-safe SP update as above */
    cli
    out _SFR_IO_ADDR(SPH), r31
    out _SFR_IO_ADDR(SREG), r0
    out _SFR_IO_ADDR(SPL), r30
    pop_all
    ret
/******************************************************************************/
/*
 * round: one cipher round on the state in r2-r17, key pointer in X.
 * The two round constants are passed on the stack: the caller pushes
 * constant 2 first and constant 1 last, then calls this routine.
 * Both constants are popped here, so the caller must not remove them.
 *
 * Sequence: a[0] ^= rc1; theta; a[0] ^= rc2; pi (rotate left); gamma;
 *           pi (rotate right).
 *
 * Clobbers r1 (zero on return), r20, r22-r25, Z and everything theta uses.
 * NOTE(review): exactly two bytes of return address are popped and pushed
 * back, so this presumably does not work on devices where call stores a
 * three-byte return address - confirm for the targeted parts.
 */
round:
pop r24 /* take the return address off the stack to reach the constants */
pop r25
pop r1 /* round constant 1 */
eor state0_3, r1
call theta
pop r1 /* round constant 2 */
eor state0_3, r1
push r25 /* put the return address back in its original byte order */
push r24
pi_gamma_pi:
/* first pi: rotate with bigendian_rotl32 (pi halves Z for icall) */
ldi r30, lo8(bigendian_rotl32)
ldi r31, hi8(bigendian_rotl32)
call pi
/* pi1 done; now gamma */
call gamma_1
/* a[0] <-> a[3] */
xchg state0_0, state3_0
xchg state0_1, state3_1
xchg state0_2, state3_2
xchg state0_3, state3_3
/* a[2] ^= a[0] ^ a[1] ^ a[3] */
op32 eor, state2, state0
op32 eor, state2, state1
op32 eor, state2, state3
/*
eor state2_0, state0_0
eor state2_1, state0_1
eor state2_2, state0_2
eor state2_3, state0_3
eor state2_0, state1_0
eor state2_1, state1_1
eor state2_2, state1_2
eor state2_3, state1_3
eor state2_0, state3_0
eor state2_1, state3_1
eor state2_2, state3_2
eor state2_3, state3_3
*/
call gamma_1
/* second pi: same rotation amounts, but to the right */
ldi r30, lo8(bigendian_rotr32)
ldi r31, hi8(bigendian_rotr32)
call pi
ret
/*
 * gamma_1: non-linear half step on the state registers, processed one byte
 * position at a time:
 *   a[1] ^= ~(a[3] | a[2])
 *   a[0] ^=   a[2] & a[1]
 * r1 is used as scratch and is NOT cleared on return.
 */
gamma_1:
    /* a[1] ^= ~(a[3]|a[2]) */
.irp i, 0, 1, 2, 3
    mov r1, state3_\i
    or r1, state2_\i
    com r1
    eor state1_\i, r1
.endr
    /* a[0] ^= a[2]&a[1] */
.irp i, 0, 1, 2, 3
    mov r1, state2_\i
    and r1, state1_\i
    eor state0_\i, r1
.endr
    ret
/*
 * pi_rotate_word idx cnt: helper for pi. Copies state word a[idx] into
 * r22..r25, calls the rotation routine addressed by Z with the bit count
 * cnt in r20, and copies the result back into the state registers.
 */
.macro pi_rotate_word idx cnt
    mov r22, state\idx\()_0
    mov r23, state\idx\()_1
    mov r24, state\idx\()_2
    mov r25, state\idx\()_3
    ldi r20, \cnt
    icall
    mov state\idx\()_0, r22
    mov state\idx\()_1, r23
    mov state\idx\()_2, r24
    mov state\idx\()_3, r25
.endm
/*
 * pi: rotate a[1] by 1, a[2] by 5 and a[3] by 2 bits using the rotation
 * routine whose byte address the caller placed in Z (bigendian_rotl32 or
 * bigendian_rotr32). Z is halved first because icall takes a word address.
 * Clobbers r20, r22-r25, Z and whatever the rotation routine clobbers.
 */
pi:
    lsr r31
    ror r30
    pi_rotate_word 1, 1     /* a[1] rotated by 1 */
    pi_rotate_word 2, 5     /* a[2] rotated by 5 */
    pi_rotate_word 3, 2     /* a[3] rotated by 2 */
    ret
/******************************************************************************/
/*
void noekeon_init(void* key, noekeon_ctx_t* ctx){
uint8_t nullv[16];
memset(nullv, 0, 16);
memcpy(ctx, key, 16);
noekeon_enc(ctx, nullv);
}
*/
; === noekeon_init ===
; copies the 16-byte key into the context and encrypts it in place with an
; all-zero key vector that is built on the stack
;
; param1: pointer to key (r24,r25)
; param2: pointer to context (r22,r23)
;
; The stack pointer is changed with interrupts masked, so an interrupt can
; never run on a half-updated SPH:SPL pair.
;
.global noekeon_init
noekeon_init:
    /* allocate 16 bytes on the stack for nullv */
    in r30, _SFR_IO_ADDR(SPL)
    in r31, _SFR_IO_ADDR(SPH)
    sbiw r30, 16
    in r0, _SFR_IO_ADDR(SREG)       /* remember the interrupt state */
    cli
    out _SFR_IO_ADDR(SPH), r31
    out _SFR_IO_ADDR(SREG), r0      /* restore the interrupt state ... */
    out _SFR_IO_ADDR(SPL), r30      /* ... SPL is written before any interrupt is taken */
    movw r26, r22                   /* X = ctx */
    adiw r30, 1                     /* Z = first byte of nullv */
    movw r22, r30                   /* second argument for noekeon_enc: nullv */
    /* set nullv(stack) to zero (r1 is expected to hold zero here) */
    ldi r20, 16
1:
    st Z+, r1
    dec r20
    brne 1b
    /* copy key data to ctx */
    movw r30, r24
    ldi r20, 16
1:
    ld r1, Z+                       /* r1 doubles as copy scratch ... */
    st X+, r1
    dec r20
    brne 1b
    clr r1                          /* ... and is zeroed again afterwards */
    sbiw r26, 16
    movw r24, r26                   /* first argument for noekeon_enc: ctx */
    call noekeon_enc
    /* release nullv */
    in r30, _SFR_IO_ADDR(SPL)
    in r31, _SFR_IO_ADDR(SPH)
    adiw r30, 16
    in r0, _SFR_IO_ADDR(SREG)       /* same interrupt-safe SP update as above */
    cli
    out _SFR_IO_ADDR(SPH), r31
    out _SFR_IO_ADDR(SREG), r0
    out _SFR_IO_ADDR(SPL), r30
    ret