From 06a565f432ed3f51cbd9d88807b9860474c38938 Mon Sep 17 00:00:00 2001 From: bg Date: Sat, 12 Apr 2008 16:06:44 +0000 Subject: [PATCH] + noekeon_asm.S (more will follow) --- noekeon.c | 6 + noekeon.mk | 5 +- noekeon_asm.S | 634 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 643 insertions(+), 2 deletions(-) create mode 100644 noekeon_asm.S diff --git a/noekeon.c b/noekeon.c index dd68b65..fc25d62 100644 --- a/noekeon.c +++ b/noekeon.c @@ -11,6 +11,7 @@ #include #include #include "noekeon.h" +#include "uart.h" #define ROUND_NR 16 @@ -50,6 +51,7 @@ void pi2(uint32_t* a){ static void theta(uint32_t* k, uint32_t* a){ uint32_t temp; + temp = a[0] ^ a[2]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8); a[1] ^= temp; a[3] ^= temp; @@ -62,6 +64,7 @@ void theta(uint32_t* k, uint32_t* a){ temp = a[1] ^ a[3]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8); a[0] ^= temp; a[2] ^= temp; + } static @@ -139,6 +142,9 @@ void noekeon_dec(void* buffer, void* key){ memcpy(dkey, key, 16); theta((uint32_t*)nullv, (uint32_t*)dkey); + uart_putstr_P(PSTR("\r\nTheta: ")); + uart_hexdump(dkey, 16); + for(i=ROUND_NR-1; i>=0; --i){ rc = pgm_read_byte(rc_tab+i); noekeon_round((uint32_t*)dkey, (uint32_t*)buffer, 0, rc); diff --git a/noekeon.mk b/noekeon.mk index 3524623..901c7fc 100644 --- a/noekeon.mk +++ b/noekeon.mk @@ -5,9 +5,10 @@ ALGO_NAME := NOEKEON BLOCK_CIPHERS += $(ALGO_NAME) -$(ALGO_NAME)_OBJ := noekeon.o +$(ALGO_NAME)_OBJ := noekeon_asmC.o noekeon_asm.o +#$(ALGO_NAME)_OBJ := noekeon.o $(ALGO_NAME)_TEST_BIN := main-noekeon-test.o debug.o uart.o serial-tools.o \ - noekeon.o nessie_bc_test.o \ + noekeon_asmC.o noekeon_asm.o nessie_bc_test.o \ nessie_common.o cli.o performance_test.o $(ALGO_NAME)_NESSIE_TEST := "nessie" $(ALGO_NAME)_PEROFRMANCE_TEST := "performance" diff --git a/noekeon_asm.S b/noekeon_asm.S new file mode 100644 index 0000000..f3f904d --- /dev/null +++ b/noekeon_asm.S @@ -0,0 +1,634 @@ +/* + * noekeon assembler implementation for avr + * author: Daniel Otte + 
* email: daniel.otte@rub.de
+ * license: GPLv3
+ */
+
+/* SREG, SPL, SPH and _SFR_IO_ADDR() used throughout this file come from here */
+#include <avr/io.h>
+
+/* push_all: save r2-r17, r28, r29 and finally SREG on the stack.
+ * r28 is reused as scratch for SREG once its own value has been pushed. */
+.macro push_all
+    push r2
+    push r3
+    push r4
+    push r5
+    push r6
+    push r7
+    push r8
+    push r9
+    push r10
+    push r11
+    push r12
+    push r13
+    push r14
+    push r15
+    push r16
+    push r17
+    push r28
+    push r29
+    in r28, _SFR_IO_ADDR(SREG)
+    push r28
+.endm
+
+/* pop_all: exact mirror of push_all (SREG first, then r29, r28, r17..r2);
+ * r1 is cleared at the end. */
+.macro pop_all
+    pop r28
+    out _SFR_IO_ADDR(SREG), r28
+    pop r29
+    pop r28
+    pop r17
+    pop r16
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop r11
+    pop r10
+    pop r9
+    pop r8
+    pop r7
+    pop r6
+    pop r5
+    pop r4
+    pop r3
+    pop r2
+    clr r1
+.endm
+
+/* xchg a b: swap two registers with three XORs (no scratch register).
+ * a and b must be different registers, otherwise the register ends up zero. */
+.macro xchg a b
+    eor \a, \b
+    eor \b, \a
+    eor \a, \b
+.endm
+
+/* op32 op a b: apply the two-operand instruction \op byte-wise to the
+ * 32-bit values named \a_0..\a_3 (destination) and \b_0..\b_3 (source). */
+.macro op32 op a b
+    \op \a\()_0, \b\()_0
+    \op \a\()_1, \b\()_1
+    \op \a\()_2, \b\()_2
+    \op \a\()_3, \b\()_3
+.endm
+
+
+/* op32_4t op a b c d w x y z: apply \op to the pairs (a,w) (b,x) (c,y) (d,z) */
+.macro op32_4t op a b c d w x y z
+    \op \a, \w
+    \op \b, \x
+    \op \c, \y
+    \op \d, \z
+.endm
+
+
+/* op32_prefix op p q a b c d w x y z: like op32_4t, but the operand names
+ * are built by pasting prefix \p in front of a..d and prefix \q in front
+ * of w..z (e.g. p=temp_, a=a gives temp_a). */
+.macro op32_prefix op p q a b c d w x y z
+    \op \p\()\a, \q\()\w
+    \op \p\()\b, \q\()\x
+    \op \p\()\c, \q\()\y
+    \op \p\()\d, \q\()\z
+.endm
+
+.global bigendian_rotl32
+; === bigendian_rotl32 ===
+; this function rotates a 32bit bigendian word n bits to the left
+;  param1: the 32-bit value
+;          given in r25,r24,r23,r22 (r22 is most significant)
+;  param2: the 8-bit parameter giving the number of bits to rotate
+;          given in r20; must be in 1..8 (r1 only holds 8 look-ahead bits,
+;          and 0 would make the dec/brne loop run 256 times)
+;  return: the rotated 32-bit word
+;          given in r25,r24,r23,r22
+;  SREG is saved in r0 and restored on exit; r1 is used as scratch and
+;  cleared again; r20 is 0 on return.
+
+bigendian_rotl32:
+    in r0, _SFR_IO_ADDR(SREG)
+    /* copy high bit of r22 to carry */
+    mov r1, r22
+2:
+    rol r1      /* carry = next bit leaving the most significant byte */
+
+    rol r25     /* least significant byte first, carry ripples upwards */
+    rol r24
+    rol r23
+    rol r22
+
+    dec r20     /* dec leaves the carry flag untouched */
+    brne 2b
+bigendian_rotl32_exit:
+    clr r1
+    out _SFR_IO_ADDR(SREG), r0
+    ret
+
+
+/******************************************************************************/
+
+.global bigendian_rotr32
+; === bigendian_rotr32 ===
+; this function rotates a 32bit bigendian word n bits to the right
+;  param1: the 32-bit value
+;          given in r25,r24,r23,r22 (r22 is most significant)
+;  param2: the 8-bit parameter giving the number of bits to rotate
+;          given in r20
+;  return: the rotated 32-bit
word
+;          given in r25,r24,r23,r22
+;  (this header belongs to bigendian_rotr32, defined right below)
+;  n must be in 1..8, for the same reason as in the rotate-left variant:
+;  r1 only carries 8 look-ahead bits.
+;  SREG is saved in r0 and restored on exit; r1 is used as scratch and
+;  cleared again; r20 is 0 on return.
+
+bigendian_rotr32:
+    in r0, _SFR_IO_ADDR(SREG)
+    /* copy low bit of r25 (the least significant byte) to carry */
+
+    mov r1, r25
+2:
+    ror r1      /* carry = next bit leaving the least significant byte */
+
+    ror r22     /* most significant byte first, carry ripples downwards */
+    ror r23
+    ror r24
+    ror r25
+    dec r20     /* dec leaves the carry flag untouched */
+    brne 2b
+bigendian_rotr32_exit:
+    clr r1
+    out _SFR_IO_ADDR(SREG), r0
+    ret
+
+/******************************************************************************/
+/*
+void theta(uint32_t* k, uint32_t* a){
+    uint32_t temp;
+    temp = a[0] ^ a[2]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
+    a[1] ^= temp;
+    a[3] ^= temp;
+
+    a[0] ^= k[0];
+    a[1] ^= k[1];
+    a[2] ^= k[2];
+    a[3] ^= k[3];
+
+    temp = a[1] ^ a[3]; temp ^= ROTR32(temp, 8) ^ ROTL32(temp, 8);
+    a[0] ^= temp;
+    a[2] ^= temp;
+}
+*/
+
+; round constant table, read with lpm. The first round constant (0x80) is not
+; part of the table; the callers load it as an immediate.
+round_const: .byte 0x1B, 0x36, 0x6C, 0xD8, 0xAB, 0x4D, 0x9A, \
+    0x2F, 0x5E, 0xBC, 0x63, 0xC6, 0x97, 0x35, 0x6A, \
+    0xD4
+
+; register allocation of the cipher state: byte j of word a[i] lives in
+; register number state<i>_<j>, i.e. the 16 state bytes occupy r2..r17 in
+; memory order.
+;-- a[0]
+state0_0 = 2
+state0_1 = 3
+state0_2 = 4
+state0_3 = 5
+;-- a[1]
+state1_0 = 6
+state1_1 = 7
+state1_2 = 8
+state1_3 = 9
+;-- a[2]
+state2_0 = 10
+state2_1 = 11
+state2_2 = 12
+state2_3 = 13
+;-- a[3]
+state3_0 = 14
+state3_1 = 15
+state3_2 = 16
+state3_3 = 17
+
+; === theta ===
+;
+;  param1: the state in r2-r17
+;  param2: pointer to k in X (r26,r27)
+;
+;  X is advanced by 16 while reading k and set back before returning.
+;  clobbers: r0, r1 (cleared on exit), r18-r21 (temp_a..temp_d), Y (r28,r29)
+;
+temp_a = 18
+temp_b = 19
+temp_c = 20
+temp_d = 21
+
+theta:
+    /* temp = a[0] ^ a[2]; temp ^= temp>>>8 ^ temp<<<8 */
+    op32_prefix mov, temp_, state0_, a,b,c,d, 0,1,2,3
+    op32_prefix eor, temp_, state2_, a,b,c,d, 0,1,2,3
+
+    /* r1 = XOR of all four bytes of temp */
+    mov r1, temp_a
+    eor r1, temp_b
+    eor r1, temp_c
+    eor r1, temp_d
+
+    /* each byte becomes the XOR of the three other bytes */
+    op32_prefix eor, temp_, r, a,b,c,d, 1,1,1,1
+
+    /* temp is now a little bit mixed c,d,a,b (if abcd is normal order) */
+    /* a[1] ^= temp */
+    eor state1_0, temp_c
+    eor state1_1, temp_d
+    eor state1_2, temp_a
+    eor state1_3, temp_b
+    /* a[3] ^= temp */
+    eor state3_0, temp_c
+    eor state3_1, temp_d
+    eor state3_2, temp_a
+    eor state3_3, temp_b
+
+    /* state ^ k (X points to K) */
+    /* the state registers are accessed as memory through Y (address 2 = r2) */
+    ldi r28, 2
+    clr r29 /* Y points to r2 aka state0_0 */
+    ldi temp_a, 16
+1:
+    ld r1, X+
+    ld r0, Y
+    eor r1, r0
+    st Y+, r1
+    dec temp_a
+    brne 1b
+    sbiw r26, 16 /* set X back to key */
+
+    /* temp = a[1] ^ a[3]; temp ^= temp>>>8 ^ temp<<<8 (same trick as above) */
+    mov temp_a, state1_0
+    mov temp_b, state1_1
+    mov temp_c, state1_2
+    mov temp_d, state1_3
+    eor temp_a, state3_0
+    eor temp_b, state3_1
+    eor temp_c, state3_2
+    eor temp_d, state3_3
+    mov r1, temp_a
+    eor r1, temp_b
+    eor r1, temp_c
+    eor r1, temp_d
+    eor temp_a, r1
+    eor temp_b, r1
+    eor temp_c, r1
+    eor temp_d, r1
+    /* temp is now a little bit mixed c,d,a,b (if abcd is normal order) */
+    /* a[0] ^= temp */
+    eor state0_0, temp_c
+    eor state0_1, temp_d
+    eor state0_2, temp_a
+    eor state0_3, temp_b
+    /* a[2] ^= temp */
+    eor state2_0, temp_c
+    eor state2_1, temp_d
+    eor state2_2, temp_a
+    eor state2_3, temp_b
+
+    clr r1
+    ret
+
+/******************************************************************************/
+; === noekeon_enc ===
+;
+;  param1: pointer to buffer/state (r24,r25)
+;  param2: pointer to k (r22,r23)
+;
+;  The 16-byte buffer is loaded into r2-r17, processed in place and written
+;  back. Stack layout while the rounds run (top first): round counter,
+;  state pointer high, state pointer low, then the registers from push_all.
+;  Round constant 1 is 0x80 for the first round and round_const[0..14] for
+;  the following 15 rounds; round constant 2 is always 0. After the rounds
+;  0xD4 is XORed into state0_3 and theta is applied once more.
+;
+.global noekeon_enc
+noekeon_enc:
+    push_all
+    /* load state */
+    movw r26, r22 /* X points at the key and stays there for all theta calls */
+    ldi r28, 2
+    clr r29 /* Y points at r2 aka state0_0 */
+    movw r30, r24 /* Z points at state */
+    push r30
+    push r31
+    ldi r22, 16
+    push r22 /* 16 is also the number of rounds and gets pushed here */
+1:
+    ld r0, Z+
+    st Y+, r0
+    dec r22
+    brne 1b
+    /* state loaded */
+    push r1 /* push round constant 2 (0x00); r1 is expected to be zero here */
+    ldi r20, 0x80
+    push r20 /* push round constant 1 (0x80) */
+    rjmp 3f
+2:
+    /* r22 = remaining rounds (15..1): Z = &round_const[15 - r22] */
+    ldi r30, lo8(round_const+15)
+    ldi r31, hi8(round_const+15)
+    sub r30, r22
+    sbci r31, 0
+    clr r1
+    push r1 /* round constant 2 (0x00) */
+    lpm r0, Z
+    push r0 /* round constant 1 from the table */
+3:
+    call round /* pops rc2 & rc1 */
+    pop r22 /* round counter (r22 was clobbered inside round) */
+    dec r22
+    push r22
+    brne 2b
+
+    pop r22 /* drop the round counter */
+
+    ldi r22, 0xD4
+    eor state0_3, r22
+    call theta
+
+    /* write the state registers back to the buffer */
+    pop r31
+    pop r30
+    clr r29
+    ldi r28, 2
+    ldi r22, 16
+1:
+    ld r0, Y+
+    st Z+, r0
+    dec r22
+    brne 1b
+
+    pop_all
+    ret
+/******************************************************************************/
+/******************************************************************************/
+; === noekeon_dec ===
+;
+;  param1: pointer to buffer/state (r24,r25)
+;
param2: pointer to k (r22,r23)
+;
+;  A 16-byte working key is kept on the stack. It is first zeroed and used as
+;  k for theta while the caller's key sits in the state registers; the result
+;  is stored back into the stack copy and serves as key for all rounds.
+;  16 rounds are run with round constant 1 = 0 and round constant 2 = 0xD4,
+;  then round_const[14] down to round_const[0]; afterwards theta is applied
+;  once more and 0x80 is XORed into state0_3.
+;  SPH and SPL can only be written one byte at a time, so both stack pointer
+;  updates are done with interrupts disabled (SREG is saved in r0 and
+;  restored right after).
+;
+.global noekeon_dec
+noekeon_dec:
+    push_all
+    /* allocate 16 bytes on the stack */
+    in r30, _SFR_IO_ADDR(SPL)
+    in r31, _SFR_IO_ADDR(SPH)
+    sbiw r30, 16
+    in r0, _SFR_IO_ADDR(SREG)
+    cli /* an interrupt between the two writes would see a bogus SP */
+    out _SFR_IO_ADDR(SPH), r31
+    out _SFR_IO_ADDR(SPL), r30
+    out _SFR_IO_ADDR(SREG), r0
+
+    adiw r30, 1 /* Z = first byte of the allocated area */
+    /* push state pointer */
+    push r24
+    push r25
+    movw r26, r22 /* move key ptr to X */
+
+    /* set stackkey to zero */
+    ldi r22, 16
+1:  st Z+, r1
+    dec r22
+    brne 1b
+
+    /* copy key to state */
+    clr r29
+    ldi r28, 2
+    ldi r22, 16
+1:  ld r0, X+
+    st Y+, r0
+    dec r22
+    brne 1b
+
+    movw r26, r30
+    sbiw r26, 16 /* set X back to beginning of stack key */
+    call theta
+
+    /* mov state to stackkey */
+    clr r29
+    ldi r28, 2
+    ldi r22, 16
+1:  ld r0, Y+
+    st X+, r0
+    dec r22
+    brne 1b
+    sbiw r26, 16 /* set X back to beginning of stack key */
+
+    /* move data from stateptr to state */
+    pop r31 /* fetch the state pointer and push it again for the write-back */
+    pop r30
+    push r30
+    push r31
+    clr r29
+    ldi r28, 2
+    ldi r22, 16
+    push r22 /* round counter (16) */
+1:  ld r0, Z+
+    st Y+, r0
+    dec r22
+    brne 1b
+
+;--- snip 8< ----
+
+    ldi r20, 0xD4
+    push r20 /* push round constant 2 (0xD4) */
+    push r22 /* push round constant 1 (0x00), r22 is zero after the loop */
+    rjmp 3f
+2:
+    /* r22 = remaining rounds (15..1): Z = &round_const[r22 - 1] */
+    ldi r30, lo8(round_const-1)
+    ldi r31, hi8(round_const-1)
+    clr r1
+    add r30, r22
+    adc r31, r1
+    lpm r0, Z
+    push r0 /* round constant 2 from the table */
+    push r1 /* round constant 1 (0x00) */
+3:
+    call round /* pops rc2 & rc1 */
+    pop r22 /* round counter (r22 was clobbered inside round) */
+    dec r22
+    push r22
+    brne 2b
+;----
+    pop r22 /* drop the round counter */
+
+    call theta
+    ldi r22, 0x80
+    eor state0_3, r22
+
+write_state_back:
+    /* write state back */
+    pop r31 /* pop state pointer */
+    pop r30
+    clr r29
+    ldi r28, 2
+    ldi r22, 16
+1:
+    ld r0, Y+
+    st Z+, r0
+    dec r22
+    brne 1b
+
+    /* remove key from stack */
+    in r30, _SFR_IO_ADDR(SPL)
+    in r31, _SFR_IO_ADDR(SPH)
+    adiw r30, 16
+    in r0, _SFR_IO_ADDR(SREG)
+    cli /* same reason as for the allocation above */
+    out _SFR_IO_ADDR(SPH), r31
+    out _SFR_IO_ADDR(SPL), r30
+    out _SFR_IO_ADDR(SREG), r0
+    pop_all
+    ret
+/******************************************************************************/
+
+; === round ===
+;  One cipher round on the state in r2-r17 with the key pointer in X.
+;  Stack protocol: the caller pushes round constant 2, then round constant 1,
+;  then executes `call round`. round takes its own return address off the
+;  stack, pops both constants and pushes the return address back, so the
+;  constants are gone when it returns.
+;  NOTE(review): exactly two return-address bytes are popped; confirm that no
+;  target with a 3-byte program counter is supposed to be supported.
+;  clobbers: r0, r1, r18-r25, r28-r31
+round:
+    pop r24 /* return address */
+    pop r25
+    pop r1 /* round constant 1 */
+    eor state0_3, r1
+    call theta
+    pop r1 /* round constant 2 */
+    eor state0_3, r1
+    push r25 /* put the return address back */
+    push r24
+pi_gamma_pi:
+    clc /* carry clear: pi rotates to the left */
+    call pi
+    /* pi1 done; now gamma */
+    call gamma_1
+    /* a[0] <-> a[3] */
+    xchg state0_0, state3_0
+    xchg state0_1, state3_1
+    xchg state0_2, state3_2
+    xchg state0_3, state3_3
+    /* a[2] ^= a[0] ^ a[1] ^ a[3] */
+    op32 eor, state2, state0
+    op32 eor, state2, state1
+    op32 eor, state2, state3
+/* expanded form of the three op32 lines above:
+    eor state2_0, state0_0
+    eor state2_1, state0_1
+    eor state2_2, state0_2
+    eor state2_3, state0_3
+    eor state2_0, state1_0
+    eor state2_1, state1_1
+    eor state2_2, state1_2
+    eor state2_3, state1_3
+    eor state2_0, state3_0
+    eor state2_1, state3_1
+    eor state2_2, state3_2
+    eor state2_3, state3_3
+*/
+    call gamma_1
+    sec /* carry set: pi rotates to the right */
+    call pi
+    ret
+
+; === gamma_1 ===
+;  The non-linear step that gamma applies twice (before and after the linear
+;  middle part), byte-wise on the state registers.
+;  r1 is used as scratch and is NOT cleared on return.
+gamma_1:
+    /* a[1] ^= ~(a[3]|a[2])*/
+    mov r1, state3_0
+    or r1, state2_0
+    com r1
+    eor state1_0, r1
+
+    mov r1, state3_1
+    or r1, state2_1
+    com r1
+    eor state1_1, r1
+
+    mov r1, state3_2
+    or r1, state2_2
+    com r1
+    eor state1_2, r1
+
+    mov r1, state3_3
+    or r1, state2_3
+    com r1
+    eor state1_3, r1
+
+    /* a[0] ^= a[2]&a[1] */
+    mov r1, state2_0
+    and r1, state1_0
+    eor state0_0, r1
+
+    mov r1, state2_1
+    and r1, state1_1
+    eor state0_1, r1
+
+    mov r1, state2_2
+    and r1, state1_2
+    eor state0_2, r1
+
+    mov r1, state2_3
+    and r1, state1_3
+    eor state0_3, r1
+    ret
+
+; === pi ===
+;  Rotates a[1] by 1, a[2] by 5 and a[3] by 2 bits.
+;  Direction is selected by the carry flag on entry:
+;  carry clear = rotate left (bigendian_rotl32),
+;  carry set   = rotate right (bigendian_rotr32).
+;  clobbers: r0, r1 (cleared by the rotate helper), r20, r22-r25, Z
+pi:
+    brcs 1f
+    ldi r30, lo8(bigendian_rotl32)
+    ldi r31, hi8(bigendian_rotl32)
+    rjmp 2f
+1:
+    ldi r30, lo8(bigendian_rotr32)
+    ldi r31, hi8(bigendian_rotr32)
+2:
+    /* icall takes a word address: halve the byte address in Z */
+    lsr r31
+    ror r30
+    /* a[1] <<<= 1 (or >>>= 1 when called with carry set) */
+    mov r22, state1_0
+    mov r23, state1_1
+    mov r24, state1_2
+    mov r25, state1_3
+    ldi r20, 1
+    icall
+    mov state1_0, r22
+    mov state1_1, r23
+    mov state1_2, r24
+    mov state1_3, r25
+    /* a[2] <<<= 5 (or >>>= 5 when called with carry set) */
+    mov r22, state2_0
+    mov r23, state2_1
+    mov r24, state2_2
+    mov r25, state2_3
+    ldi r20, 5
+    icall
+    mov state2_0, r22
+    mov state2_1, r23
+    mov state2_2, r24
+    mov state2_3, r25
+    /* a[3] <<<= 2 (or >>>= 2 when called with carry set) */
+    mov r22, state3_0
+    mov r23, state3_1
+    mov r24, state3_2
+    mov r25, state3_3
+    ldi r20, 2
+    icall
+    mov state3_0, r22
+    mov state3_1, r23
+    mov state3_2, r24
+    mov state3_3, r25
+    ret
+
+;-------
trash follows --------
+; NOTE(review): the code below comes after the final `ret` of the preceding
+; routine and carries no global label, so nothing appears to jump here; it
+; looks like a leftover draft of the key/state set-up and is a candidate for
+; removal.
+
+
+
+    /* load state */
+    movw r26, r22
+    ldi r28, 2
+    clr r29 /* Y points at r2 aka state0_0 */
+    ldi r22, 16
+1:  /* copy key to state */
+    ld r0, X+
+    st Y+, r0
+    dec r22
+    brne 1b
+
+    movw r26, r30
+
+    clr r1
+    ldi r22, 16
+1:  /* set key to zero */
+    st Z+, r1
+    dec r22
+    brne 1b
+
+    call theta
+
+    ldi r22, 16
+1:  /* write key back */
+    ld r0, -Y
+    st -Z, r0
+    dec r22
+    brne 1b
+
+;   movw r26, r30 /* move keypointer to X */
+;   adiw r26, 1
+    movw r30, r24 /* Z points at state */
+    push r30 /* push state pointer */
+    push r31
+
+    ;--
+    clr r29
+    ldi r28, 2
+    ;--
+    ldi r22, 16
+    push r22 /* 16 is also the number of rounds and gets pushed here */
+    ldi r22, 16
+1:  /* load state */
+    ld r0, Z+
+    st Y+, r0
+    dec r22
+    brne 1b
+    /* state loaded */
+
+
+;------- ------------- --------
+
+