avr-crypto-lib/dsa/noekeon_asm.S

642 lines
11 KiB
ArmAsm

/* noekeon_asm.S */
/*
This file is part of the AVR-Crypto-Lib.
Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* noekeon assembler implementation for avr
* author: Daniel Otte
* email: daniel.otte@rub.de
* license: GPLv3
*/
#include <avr/io.h>
/*
 * push_all - save the registers this file uses as working storage.
 *
 * Pushes r2..r17 and then r28, r29 (the Y pointer), in exactly that
 * order. pop_all undoes this by popping in the reverse order.
 * NOTE(review): this looks like the call-saved register set of the
 * avr-gcc ABI - confirm against the toolchain documentation.
 */
.macro push_all
    .irp reg, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, r16, r17, r28, r29
    push    \reg
    .endr
.endm
/*
 * pop_all - restore the registers saved by push_all.
 *
 * Pops r29, r28 and then r17 down to r2, i.e. the exact reverse of the
 * push order, and finally clears r1 so that r1 holds zero afterwards.
 */
.macro pop_all
    .irp reg, r29, r28, r17, r16, r15, r14, r13, r12, r11, r10, r9, r8, r7, r6, r5, r4, r3, r2
    pop     \reg
    .endr
    clr     r1
.endm
/*
 * push_all_func - out-of-line version of push_all, entered with rcall.
 *
 * The two bytes on top of the stack (the return address of the rcall)
 * are moved into Z first, so that the saved registers end up on the
 * stack of the caller. Control returns to the caller through ijmp.
 * Clobbers r30, r31.
 * NOTE(review): exactly two return-address bytes are popped; confirm
 * that devices with a larger program counter are out of scope.
 */
push_all_func:
    pop     r31                 ; return address -> Z
    pop     r30
    push_all
    ijmp                        ; back to the caller
/*
 * pop_all_func - out-of-line version of pop_all, entered with rcall.
 *
 * The two return-address bytes are taken off the stack into Z, the
 * saved registers are restored (pop_all also clears r1) and control
 * goes back to the caller through ijmp.
 * Clobbers r30, r31.
 * NOTE(review): exactly two return-address bytes are popped; confirm
 * that devices with a larger program counter are out of scope.
 */
pop_all_func:
    pop     r31                 ; return address -> Z
    pop     r30
    pop_all
    ijmp                        ; back to the caller
/*
 * xchg a, b - exchange the contents of two registers.
 *
 * Uses the three-step exclusive-or swap, so no scratch register is
 * needed. a and b must be different registers: if both name the same
 * register the first eor clears it and the value is lost.
 */
.macro xchg a b
    eor     \a, \b
    eor     \b, \a
    eor     \a, \b
.endm
/*
 * op32 op, a, b - apply a two-operand instruction to a 32-bit word.
 *
 * a and b are symbol prefixes; the instruction is emitted four times,
 * for a_0/b_0, a_1/b_1, a_2/b_2 and a_3/b_3, in that order.
 * Example: "op32 eor, state2, state0" xors state0 into state2.
 */
.macro op32 op a b
    .irp idx, 0, 1, 2, 3
    \op     \a\()_\idx, \b\()_\idx
    .endr
.endm
/*
 * op32_4t op, a, b, c, d, w, x, y, z - apply a two-operand instruction
 * to four explicitly named operand pairs.
 *
 * Emits, in this order: op a,w / op b,x / op c,y / op d,z.
 * Unlike op32 the operands are given in full, so the four destination
 * bytes and the four source bytes may be listed in any order.
 */
.macro op32_4t op a b c d w x y z
    \op     \a, \w
    \op     \b, \x
    \op     \c, \y
    \op     \d, \z
.endm
/*
 * op32_prefix op, p, q, a, b, c, d, w, x, y, z - like op32_4t, but the
 * operand names are built by pasting a prefix and a suffix together.
 *
 * Destination operands are p<a>, p<b>, p<c>, p<d>; source operands are
 * q<w>, q<x>, q<y>, q<z>. The four instructions are emitted in that
 * order. Example:
 *   op32_prefix mov, temp_, state0_, a,b,c,d, 0,1,2,3
 * expands to mov temp_a,state0_0 ... mov temp_d,state0_3.
 */
.macro op32_prefix op p q a b c d w x y z
    op32_4t \op, \p\()\a, \p\()\b, \p\()\c, \p\()\d, \q\()\w, \q\()\x, \q\()\y, \q\()\z
.endm
/*
 * === bigendian_rotl32 ===
 * Rotate a 32-bit big-endian word to the left, one bit per loop pass.
 *
 *  param1: the 32-bit value in r22,r23,r24,r25
 *          (r22 is the most significant byte, r25 the least)
 *  param2: the number of bits to rotate, in r20
 *  return: the rotated word in r22,r23,r24,r25
 *
 * r1 is used as scratch and is zero again on return; r20 is zero on
 * return. The counter is tested after the loop body, so r20 = 0 on
 * entry does not skip the loop.
 * r1 is loaded with the top byte once and shifted along with it to
 * feed the wrap-around bit into the carry. It therefore only supplies
 * eight valid bits: the result is correct for 1..8 bit steps, which
 * covers the counts 1, 5 and 2 used by pi.
 */
bigendian_rotl32:
    mov     r1, r22             ; shadow copy of the most significant byte
.Lrotl32_next_bit:
    rol     r1                  ; carry = bit leaving the top of the word
    rol     r25                 ; ... enters at the bottom
    rol     r24
    rol     r23
    rol     r22
    dec     r20                 ; dec leaves the carry flag alone
    brne    .Lrotl32_next_bit
bigendian_rotl32_exit:
    clr     r1
    ret
/******************************************************************************/
/*
 * === bigendian_rotr32 ===
 * Rotate a 32-bit big-endian word to the right, one bit per loop pass.
 *
 *  param1: the 32-bit value in r22,r23,r24,r25
 *          (r22 is the most significant byte, r25 the least)
 *  param2: the number of bits to rotate, in r20
 *  return: the rotated word in r22,r23,r24,r25
 *
 * r1 is used as scratch and is zero again on return; r20 is zero on
 * return. The counter is tested after the loop body, so r20 = 0 on
 * entry does not skip the loop.
 * r1 is loaded with the lowest byte once and shifted along with it to
 * feed the wrap-around bit into the carry. It therefore only supplies
 * eight valid bits: the result is correct for 1..8 bit steps, which
 * covers the counts 1, 5 and 2 used by pi.
 */
bigendian_rotr32:
    mov     r1, r25             ; shadow copy of the least significant byte
.Lrotr32_next_bit:
    ror     r1                  ; carry = bit leaving the bottom of the word
    ror     r22                 ; ... enters at the top
    ror     r23
    ror     r24
    ror     r25
    dec     r20                 ; dec leaves the carry flag alone
    brne    .Lrotr32_next_bit
bigendian_rotr32_exit:
    clr     r1
    ret
/******************************************************************************/
/*
void theta(uint32_t* k, uint32_t* a){
uint32_t temp;
temp = a[0] ^ a[2]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
a[1] ^= temp;
a[3] ^= temp;
a[0] ^= k[0];
a[1] ^= k[1];
a[2] ^= k[2];
a[3] ^= k[3];
temp = a[1] ^ a[3]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
a[0] ^= temp;
a[2] ^= temp;
}
*/
/*
 * Round constant table (16 bytes), read from program memory with lpm
 * by noekeon_enc and noekeon_dec. The constant 0x80 that those
 * functions also use is not part of the table; they load it directly.
 */
round_const:
    .byte 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, 0x2F
    .byte 0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, 0xD4
/*
 * Register allocation of the 128-bit cipher state.
 *
 * stateW_B is the register number holding byte B of word a[W]. The
 * sixteen state bytes occupy r2..r17 in memory order, which is why the
 * load and store loops in this file can walk them with a pointer that
 * starts at address 2.
 */
;-- a[0]
.set state0_0, 2
.set state0_1, 3
.set state0_2, 4
.set state0_3, 5
;-- a[1]
.set state1_0, 6
.set state1_1, 7
.set state1_2, 8
.set state1_3, 9
;-- a[2]
.set state2_0, 10
.set state2_1, 11
.set state2_2, 12
.set state2_3, 13
;-- a[3]
.set state3_0, 14
.set state3_1, 15
.set state3_2, 16
.set state3_3, 17
/*
 * === theta ===
 * Linear mixing step that also xors the key into the state.
 *
 *  param1: the state in r2-r17 (updated in place)
 *  param2: pointer to k in X (r26,r27); X is unchanged on return
 *
 * Clobbers r0, r18-r21 and Y (r28,r29); r1 is zero on return.
 *
 * How temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8) is computed:
 * each byte of the result is the xor of three of the four bytes of
 * temp. With r1 = xor of all four bytes, "byte ^ r1" is the xor of the
 * other three. That value belongs two byte positions further on, so
 * the result is read in the order temp_c, temp_d, temp_a, temp_b.
 */
temp_a = 18
temp_b = 19
temp_c = 20
temp_d = 21
theta:
    /* temp = a[0] ^ a[2] */
    op32_prefix mov, temp_, state0_, a,b,c,d, 0,1,2,3
    op32_prefix eor, temp_, state2_, a,b,c,d, 0,1,2,3
    /* r1 = xor of the four bytes of temp */
    mov     r1, temp_a
    eor     r1, temp_b
    eor     r1, temp_c
    eor     r1, temp_d
    /* temp ^= temp>>>8 ^ temp<<<8 (bytes end up in order c,d,a,b) */
    op32_prefix eor, temp_, r, a,b,c,d, 1,1,1,1
    /* a[1] ^= temp */
    op32_4t eor, state1_0, state1_1, state1_2, state1_3, temp_c, temp_d, temp_a, temp_b
    /* a[3] ^= temp */
    op32_4t eor, state3_0, state3_1, state3_2, state3_3, temp_c, temp_d, temp_a, temp_b

    /* state ^= k: Y walks the state registers r2..r17 through their
       data-space addresses, X walks the key */
    ldi     r28, state0_0
    clr     r29
    ldi     temp_a, 16          ; temp_a doubles as the byte counter
.Ltheta_add_key:
    ld      r1, X+
    ld      r0, Y
    eor     r1, r0
    st      Y+, r1
    dec     temp_a
    brne    .Ltheta_add_key
    sbiw    r26, 16             ; set X back to the start of the key

    /* temp = a[1] ^ a[3] */
    op32_prefix mov, temp_, state1_, a,b,c,d, 0,1,2,3
    op32_prefix eor, temp_, state3_, a,b,c,d, 0,1,2,3
    /* r1 = xor of the four bytes of temp */
    mov     r1, temp_a
    eor     r1, temp_b
    eor     r1, temp_c
    eor     r1, temp_d
    /* temp ^= temp>>>8 ^ temp<<<8 (bytes end up in order c,d,a,b) */
    op32_prefix eor, temp_, r, a,b,c,d, 1,1,1,1
    /* a[0] ^= temp */
    op32_4t eor, state0_0, state0_1, state0_2, state0_3, temp_c, temp_d, temp_a, temp_b
    /* a[2] ^= temp */
    op32_4t eor, state2_0, state2_1, state2_2, state2_3, temp_c, temp_d, temp_a, temp_b

    clr     r1
    ret
/******************************************************************************/
#ifndef NOEKEON_NO_ENC
/*
 * === noekeon_enc ===
 * Encrypt one 16-byte block in place.
 *
 *  param1: pointer to buffer (r24,r25)
 *  param2: pointer to k      (r22,r23)
 *
 * The block is loaded into the state registers r2-r17, sixteen calls
 * of round are made, then 0xD4 is xored into state0_3, theta is run
 * once more and the state is written back to the buffer.
 *
 * Every call of round takes two constants from the stack: rc1 (xored
 * into state0_3 before theta) and rc2 (xored in after theta). Here rc2
 * is always 0x00; rc1 is 0x80 for the first call and then
 * round_const[0] .. round_const[14].
 *
 * Stack layout while the rounds run (top first):
 *   rc1, rc2 (consumed by round), round counter, buffer pointer.
 */
.global noekeon_enc
noekeon_enc:
    rcall   push_all_func

    /* load the block into the state registers */
    movw    r26, r22            ; X = k, used by theta
    ldi     r28, state0_0
    clr     r29                 ; Y = data-space address of r2
    movw    r30, r24            ; Z = buffer
    push    r30                 ; keep the buffer pointer for the write-back
    push    r31
    ldi     r22, 16
    push    r22                 ; 16 is also the round counter
.Lenc_load_state:
    ld      r0, Z+
    st      Y+, r0
    dec     r22
    brne    .Lenc_load_state

    /* constants for the first round */
    push    r1                  ; rc2 = 0x00 (r1 has to be zero here)
    ldi     r20, 0x80
    push    r20                 ; rc1 = 0x80
    rjmp    .Lenc_do_round

.Lenc_next_round:
    /* r22 = remaining rounds (15..1): Z = &round_const[15 - r22] */
    ldi     r30, lo8(round_const+15)
    ldi     r31, hi8(round_const+15)
    sub     r30, r22
    sbci    r31, 0
    clr     r1
    push    r1                  ; rc2 = 0x00
    lpm     r0, Z
    push    r0                  ; rc1 from the table
.Lenc_do_round:
    rcall   round               ; pops rc1 and rc2
    pop     r22
    dec     r22
    push    r22
    brne    .Lenc_next_round

    pop     r22                 ; drop the round counter

    /* final step: constant, then theta */
    ldi     r22, 0xD4
    eor     state0_3, r22
    rcall   theta

    /* write the state back to the buffer */
    pop     r31
    pop     r30
    clr     r29
    ldi     r28, state0_0
    ldi     r22, 16
.Lenc_store_state:
    ld      r0, Y+
    st      Z+, r0
    dec     r22
    brne    .Lenc_store_state

    rcall   pop_all_func
    ret
#endif
/******************************************************************************/
/******************************************************************************/
#ifndef NOEKEON_NO_DEC
/*
 * === noekeon_dec ===
 * Decrypt one 16-byte block in place.
 *
 *  param1: pointer to buffer/state (r24,r25)
 *  param2: pointer to k            (r22,r23)
 *
 * Steps:
 *  1. Reserve a 16-byte working key on the stack and fill it with zero.
 *  2. Load k into the state registers, run theta with the all-zero
 *     buffer as key and store the result as the working key.
 *  3. Load the block into the state registers and call round sixteen
 *     times with the working key. rc1 is always 0x00; rc2 is 0xD4 for
 *     the first call and then round_const[14] .. round_const[0].
 *  4. Run theta once more, xor 0x80 into state0_3, write the state back
 *     to the buffer and release the stack space.
 *
 * The stack pointer is a 16-bit value written with two out
 * instructions. An interrupt taken between them would run on a
 * half-updated stack pointer, so both updates are done with interrupts
 * masked: the status register is saved in r0, cli is executed, the
 * high byte is written, the status register is restored and the low
 * byte is written by the very next instruction.
 *
 * Stack layout while the rounds run (top first):
 *   rc1, rc2 (consumed by round), round counter, buffer pointer,
 *   16-byte working key, saved registers.
 */
.global noekeon_dec
noekeon_dec:
    rcall   push_all_func

    /* allocate 16 bytes on the stack (interrupt safe) */
    in      r30, _SFR_IO_ADDR(SPL)
    in      r31, _SFR_IO_ADDR(SPH)
    sbiw    r30, 16
    in      r0, _SFR_IO_ADDR(SREG)
    cli
    out     _SFR_IO_ADDR(SPH), r31
    out     _SFR_IO_ADDR(SREG), r0
    out     _SFR_IO_ADDR(SPL), r30
    adiw    r30, 1              ; Z = first byte of the working key

    /* push state pointer */
    push    r24
    push    r25
    movw    r26, r22            ; move key ptr to X

    /* set the working key to zero */
    ldi     r22, 16
.Ldec_clear_key:
    st      Z+, r1
    dec     r22
    brne    .Ldec_clear_key

    /* copy k into the state registers */
    clr     r29
    ldi     r28, 2              ; Y = data-space address of r2
    ldi     r22, 16
.Ldec_load_key:
    ld      r0, X+
    st      Y+, r0
    dec     r22
    brne    .Ldec_load_key

    movw    r26, r30
    sbiw    r26, 16             ; X = start of the (zeroed) working key
    rcall   theta

    /* move the state to the working key */
    clr     r29
    ldi     r28, 2
    ldi     r22, 16
.Ldec_store_key:
    ld      r0, Y+
    st      X+, r0
    dec     r22
    brne    .Ldec_store_key
    sbiw    r26, 16             ; X = start of the working key again

    /* load the block from the buffer into the state registers */
    pop     r31                 ; fetch the buffer pointer ...
    pop     r30
    push    r30                 ; ... and keep it for the write-back
    push    r31
    clr     r29
    ldi     r28, 2
    ldi     r22, 16
    push    r22                 ; 16 is also the round counter
.Ldec_load_state:
    ld      r0, Z+
    st      Y+, r0
    dec     r22
    brne    .Ldec_load_state

    /* constants for the first round */
    ldi     r20, 0xD4
    push    r20                 ; rc2 = 0xD4
    push    r22                 ; rc1 = 0x00 (r22 is zero after the loop)
    rjmp    .Ldec_do_round

.Ldec_next_round:
    /* r22 = remaining rounds (15..1): Z = &round_const[r22 - 1] */
    ldi     r30, lo8(round_const-1)
    ldi     r31, hi8(round_const-1)
    clr     r1
    add     r30, r22
    adc     r31, r1
    lpm     r0, Z
    push    r0                  ; rc2 from the table
    push    r1                  ; rc1 = 0x00
.Ldec_do_round:
    rcall   round               ; pops rc1 and rc2
    pop     r22
    dec     r22
    push    r22
    brne    .Ldec_next_round

    pop     r22                 ; drop the round counter

    /* final step: theta, then constant */
    rcall   theta
    ldi     r22, 0x80
    eor     state0_3, r22

write_state_back:
    /* write state back */
    pop     r31                 ; pop state pointer
    pop     r30
    clr     r29
    ldi     r28, 2
    ldi     r22, 16
.Ldec_store_state:
    ld      r0, Y+
    st      Z+, r0
    dec     r22
    brne    .Ldec_store_state

    /* remove the working key from the stack (interrupt safe) */
    in      r30, _SFR_IO_ADDR(SPL)
    in      r31, _SFR_IO_ADDR(SPH)
    adiw    r30, 16
    in      r0, _SFR_IO_ADDR(SREG)
    cli
    out     _SFR_IO_ADDR(SPH), r31
    out     _SFR_IO_ADDR(SREG), r0
    out     _SFR_IO_ADDR(SPL), r30

    rcall   pop_all_func
    ret
#endif
/******************************************************************************/
/*
 * round - one cipher round on the state in r2-r17.
 *
 * Stack on entry (top first): return address (two bytes), rc1, rc2.
 * Both constants are consumed; on return only the caller data below
 * them is left. X must point to the key used by theta.
 *
 *   state0_3 ^= rc1; theta; state0_3 ^= rc2;
 *   pi (rotate left); gamma; pi (rotate right)
 *
 * Clobbers r0, r18-r25, Y and Z; r1 is zero on return.
 * pi_gamma_pi is the entry point for the part after theta.
 * NOTE(review): the return address is handled as exactly two bytes;
 * confirm that devices with a larger program counter are out of scope.
 */
round:
    pop     r24                 ; park the return address in r24,r25
    pop     r25
    pop     r1                  ; rc1
    eor     state0_3, r1
    rcall   theta
    pop     r1                  ; rc2
    eor     state0_3, r1
    push    r25                 ; return address back on the stack
    push    r24
pi_gamma_pi:
    /* pi1: rotate the words to the left */
    ldi     r30, pm_lo8(bigendian_rotl32)
    ldi     r31, pm_hi8(bigendian_rotl32)
    rcall   pi

    /* gamma: nonlinear step, linear step, nonlinear step */
    rcall   gamma_1
    /* a[0] <-> a[3] */
    .irp i, 0, 1, 2, 3
    xchg    state0_\i, state3_\i
    .endr
    /* a[2] ^= a[0] ^ a[1] ^ a[3] */
    op32    eor, state2, state0
    op32    eor, state2, state1
    op32    eor, state2, state3
    rcall   gamma_1

    /* pi2: rotate the words back to the right */
    ldi     r30, pm_lo8(bigendian_rotr32)
    ldi     r31, pm_hi8(bigendian_rotr32)
    rcall   pi
    ret
/*
 * gamma_1 - nonlinear half step of gamma, applied byte by byte.
 *
 *   a[1] ^= ~(a[3] | a[2])
 *   a[0] ^=   a[2] & a[1]      (uses the updated a[1])
 *
 * Works on the state registers in place. r1 is used as scratch and is
 * NOT cleared on return.
 */
gamma_1:
    /* a[1] ^= ~(a[3]|a[2]) */
    .irp i, 0, 1, 2, 3
    mov     r1, state3_\i
    or      r1, state2_\i
    com     r1
    eor     state1_\i, r1
    .endr
    /* a[0] ^= a[2]&a[1] */
    .irp i, 0, 1, 2, 3
    mov     r1, state2_\i
    and     r1, state1_\i
    eor     state0_\i, r1
    .endr
    ret
/*
 * pi_rotate_word word, count - helper for pi.
 *
 * Copies the four bytes of the given state word into r22..r25, loads
 * the bit count into r20, calls the routine Z points to and copies the
 * result back into the state word.
 */
.macro pi_rotate_word word, count
    mov     r22, \word\()_0
    mov     r23, \word\()_1
    mov     r24, \word\()_2
    mov     r25, \word\()_3
    ldi     r20, \count
    icall
    mov     \word\()_0, r22
    mov     \word\()_1, r23
    mov     \word\()_2, r24
    mov     \word\()_3, r25
.endm

/*
 * pi - rotate a[1], a[2] and a[3] by 1, 5 and 2 bits.
 *
 * The direction is chosen by the caller: Z must hold the program
 * address of bigendian_rotl32 (rotate left) or bigendian_rotr32
 * (rotate right), which is reached through icall. a[0] is not touched.
 * Clobbers r20 and r22-r25; the rotate routines leave r1 cleared.
 */
pi:
    pi_rotate_word state1, 1    ; a[1] rotated by 1
    pi_rotate_word state2, 5    ; a[2] rotated by 5
    pi_rotate_word state3, 2    ; a[3] rotated by 2
    ret
/******************************************************************************/
/*
void noekeon_init(void* key, noekeon_ctx_t* ctx){
uint8_t nullv[16];
memset(nullv, 0, 16);
memcpy(ctx, key, 16);
noekeon_enc(ctx, nullv);
}
*/
#ifndef NOEKEON_NO_INIT
/*
 * === noekeon_init ===
 * Derive the cipher context from a key: the key is copied to the
 * context and the context is then encrypted in place with an all-zero
 * key (see the C reference above this function).
 *
 *  param1: pointer to key     (r24,r25)
 *  param2: pointer to context (r22,r23)
 *
 * A 16-byte all-zero buffer is built on the stack for the call to
 * noekeon_enc and released afterwards.
 *
 * The stack pointer is a 16-bit value written with two out
 * instructions. An interrupt taken between them would run on a
 * half-updated stack pointer, so both updates are done with interrupts
 * masked: the status register is saved in r0, cli is executed, the
 * high byte is written, the status register is restored and the low
 * byte is written by the very next instruction.
 *
 * noekeon_enc must be available, i.e. NOEKEON_NO_ENC must not be
 * defined when this function is built.
 */
.global noekeon_init
noekeon_init:
    /* allocate 16 bytes on the stack (interrupt safe) */
    in      r30, _SFR_IO_ADDR(SPL)
    in      r31, _SFR_IO_ADDR(SPH)
    sbiw    r30, 16
    in      r0, _SFR_IO_ADDR(SREG)
    cli
    out     _SFR_IO_ADDR(SPH), r31
    out     _SFR_IO_ADDR(SREG), r0
    out     _SFR_IO_ADDR(SPL), r30

    movw    r26, r22            ; X = context
    adiw    r30, 1              ; Z = first byte of the stack buffer
    movw    r22, r30            ; second argument of noekeon_enc: zero key

    /* set nullv (stack) to zero */
    ldi     r20, 16
.Linit_clear_nullv:
    st      Z+, r1
    dec     r20
    brne    .Linit_clear_nullv

    /* copy key data to ctx */
    movw    r30, r24
    ldi     r20, 16
.Linit_copy_key:
    ld      r1, Z+              ; r1 is borrowed as scratch ...
    st      X+, r1
    dec     r20
    brne    .Linit_copy_key
    clr     r1                  ; ... and set back to zero

    sbiw    r26, 16             ; X = start of the context again
    movw    r24, r26            ; first argument of noekeon_enc: context
    rcall   noekeon_enc

    /* release the stack buffer (interrupt safe) */
    in      r30, _SFR_IO_ADDR(SPL)
    in      r31, _SFR_IO_ADDR(SPH)
    adiw    r30, 16
    in      r0, _SFR_IO_ADDR(SREG)
    cli
    out     _SFR_IO_ADDR(SPH), r31
    out     _SFR_IO_ADDR(SREG), r0
    out     _SFR_IO_ADDR(SPL), r30
    ret
#endif