From 96789d49fd89502f9c20dbc0611e401b1a417880 Mon Sep 17 00:00:00 2001 From: bg Date: Mon, 12 Jan 2009 11:19:16 +0000 Subject: [PATCH] speed up --- aes_enc-asm.S | 397 +++++++++++++++++++++-------------------------- aes_enc.c | 31 ++-- aes_sboxes-asm.S | 65 ++++++++ mkfiles/aes.mk | 2 +- 4 files changed, 257 insertions(+), 238 deletions(-) create mode 100644 aes_sboxes-asm.S diff --git a/aes_enc-asm.S b/aes_enc-asm.S index ed04b6f..72b9e6e 100644 --- a/aes_enc-asm.S +++ b/aes_enc-asm.S @@ -52,9 +52,9 @@ gf256mul: 3: rjmp 1b 4: - brcc 2f + brcc 5f eor P, B -2: +5: ret .global aes256_enc @@ -118,6 +118,10 @@ aes_encrypt_core: .irp param,ST00, ST01, ST02, ST03, ST10, ST11, ST12, ST13, ST20, ST21, ST22, ST23, ST30, ST31, ST32, ST33 ld \param, Z+ .endr + + ldi xREDUCER, 0x1b /* load reducer */ + ldi r31, hi8(aes_sbox) + /* key whitening */ 1: .irp param,ST00, ST01, ST02, ST03, ST10, ST11, ST12, ST13, ST20, ST21, ST22, ST23, ST30, ST31, ST32, ST33 @@ -132,81 +136,46 @@ aes_encrypt_core: set 3: - ldi r30, lo8(aes_sbox) - ldi r31, hi8(aes_sbox) - movw SBOX_SAVE0, r30 /* encryption loop */ /* SBOX substitution and shifting */ - movw r30, SBOX_SAVE0 - add r30, ST00 - adc r31, r1 + mov r30, ST00 lpm ST00, Z - movw r30, SBOX_SAVE0 - add r30, ST10 - adc r31, r1 + mov r30, ST10 lpm ST10, Z - movw r30, SBOX_SAVE0 - add r30, ST20 - adc r31, r1 + mov r30, ST20 lpm ST20, Z - movw r30, SBOX_SAVE0 - add r30, ST30 - adc r31, r1 + mov r30, ST30 lpm ST30, Z - movw r30, SBOX_SAVE0 - add r30, ST01 - adc r31, r1 + mov r30, ST01 lpm T0, Z - movw r30, SBOX_SAVE0 - add r30, ST11 - adc r31, r1 + mov r30, ST11 lpm ST01, Z - movw r30, SBOX_SAVE0 - add r30, ST21 - adc r31, r1 + mov r30, ST21 lpm ST11, Z - movw r30, SBOX_SAVE0 - add r30, ST31 - adc r31, r1 + mov r30, ST31 lpm ST21, Z mov ST31, T0 - movw r30, SBOX_SAVE0 - add r30, ST02 - adc r31, r1 + mov r30, ST02 lpm T0, Z - movw r30, SBOX_SAVE0 - add r30, ST12 - adc r31, r1 + mov r30, ST12 lpm T1, Z - movw r30, SBOX_SAVE0 - add r30, ST22 - adc 
r31, r1 + mov r30, ST22 lpm ST02, Z - movw r30, SBOX_SAVE0 - add r30, ST32 - adc r31, r1 + mov r30, ST32 lpm ST12, Z mov ST22, T0 mov ST32, T1 - movw r30, SBOX_SAVE0 - add r30, ST03 - adc r31, r1 + mov r30, ST03 lpm T0, Z - movw r30, SBOX_SAVE0 - add r30, ST13 - adc r31, r1 + mov r30, ST13 lpm T1, Z - movw r30, SBOX_SAVE0 - add r30, ST23 - adc r31, r1 + mov r30, ST23 lpm T2, Z - movw r30, SBOX_SAVE0 - add r30, ST33 - adc r31, r1 + mov r30, ST33 lpm ST03, Z mov ST13, T0 mov ST23, T1 @@ -216,199 +185,187 @@ aes_encrypt_core: brtc 2f rjmp 1b 2: - ldi xREDUCER, 0x1b /* load reducer */ - ldi A, 2 - mov B, ST00 - rcall gf256mul - mov T0, r0 - ldi A, 3 - mov B, ST01 - rcall gf256mul - eor T0, r0 - eor T0, ST02 - eor T0, ST03 + mov r1, ST00 + eor r1, ST01 + eor r1, ST02 + eor r1, ST03 + + mov T0, ST00 + eor T0, ST01 + lsl T0 + brcc 3f + eor T0, xREDUCER +3: + eor T0, r1 + eor T0, ST00 - mov T1, ST00 - ldi A, 2 - mov B, ST01 - rcall gf256mul - eor T1, r0 - ldi A, 3 - mov B, ST02 - rcall gf256mul - eor T1, r0 - eor T1, ST03 + mov T1, ST01 + eor T1, ST02 + lsl T1 + brcc 3f + eor T1, xREDUCER +3: + eor T1, r1 + eor T1, ST01 - mov T2, ST00 - eor T2, ST01 - ldi A, 2 - mov B, ST02 - rcall gf256mul - eor T2, r0 - ldi A, 3 - mov B, ST03 - rcall gf256mul - eor T2, r0 - - ldi A, 3 - mov B, ST00 - rcall gf256mul - mov T3, r0 - eor T3, ST01 - eor T3, ST02 - ldi A, 2 - mov B, ST03 - rcall gf256mul - eor T3, r0 + mov T2, ST02 + eor T2, ST03 + lsl T2 + brcc 3f + eor T2, xREDUCER +3: + eor T2, r1 + eor T2, ST02 + mov T3, ST03 + eor T3, ST00 + lsl T3 + brcc 3f + eor T3, xREDUCER +3: + eor T3, r1 + eor T3, ST03 + mov ST00, T0 mov ST01, T1 mov ST02, T2 mov ST03, T3 - - ldi A, 2 - mov B, ST10 - rcall gf256mul - mov T0, r0 - ldi A, 3 - mov B, ST11 - rcall gf256mul - eor T0, r0 - eor T0, ST12 - eor T0, ST13 + + + mov r1, ST10 + eor r1, ST11 + eor r1, ST12 + eor r1, ST13 + + mov T0, ST10 + eor T0, ST11 + lsl T0 + brcc 3f + eor T0, xREDUCER +3: + eor T0, r1 + eor T0, ST10 - mov T1, ST10 - ldi A, 2 
- mov B, ST11 - rcall gf256mul - eor T1, r0 - ldi A, 3 - mov B, ST12 - rcall gf256mul - eor T1, r0 - eor T1, ST13 + mov T1, ST11 + eor T1, ST12 + lsl T1 + brcc 3f + eor T1, xREDUCER +3: + eor T1, r1 + eor T1, ST11 - mov T2, ST10 - eor T2, ST11 - ldi A, 2 - mov B, ST12 - rcall gf256mul - eor T2, r0 - ldi A, 3 - mov B, ST13 - rcall gf256mul - eor T2, r0 - - ldi A, 3 - mov B, ST10 - rcall gf256mul - mov T3, r0 - eor T3, ST11 - eor T3, ST12 - ldi A, 2 - mov B, ST13 - rcall gf256mul - eor T3, r0 + mov T2, ST12 + eor T2, ST13 + lsl T2 + brcc 3f + eor T2, xREDUCER +3: + eor T2, r1 + eor T2, ST12 + mov T3, ST13 + eor T3, ST10 + lsl T3 + brcc 3f + eor T3, xREDUCER +3: + eor T3, r1 + eor T3, ST13 + mov ST10, T0 mov ST11, T1 mov ST12, T2 mov ST13, T3 - ldi A, 2 - mov B, ST20 - rcall gf256mul - mov T0, r0 - ldi A, 3 - mov B, ST21 - rcall gf256mul - eor T0, r0 - eor T0, ST22 - eor T0, ST23 + mov r1, ST20 + eor r1, ST21 + eor r1, ST22 + eor r1, ST23 + + mov T0, ST20 + eor T0, ST21 + lsl T0 + brcc 3f + eor T0, xREDUCER +3: + eor T0, r1 + eor T0, ST20 - mov T1, ST20 - ldi A, 2 - mov B, ST21 - rcall gf256mul - eor T1, r0 - ldi A, 3 - mov B, ST22 - rcall gf256mul - eor T1, r0 - eor T1, ST23 + mov T1, ST21 + eor T1, ST22 + lsl T1 + brcc 3f + eor T1, xREDUCER +3: + eor T1, r1 + eor T1, ST21 - mov T2, ST20 - eor T2, ST21 - ldi A, 2 - mov B, ST22 - rcall gf256mul - eor T2, r0 - ldi A, 3 - mov B, ST23 - rcall gf256mul - eor T2, r0 - - ldi A, 3 - mov B, ST20 - rcall gf256mul - mov T3, r0 - eor T3, ST21 - eor T3, ST22 - ldi A, 2 - mov B, ST23 - rcall gf256mul - eor T3, r0 + mov T2, ST22 + eor T2, ST23 + lsl T2 + brcc 3f + eor T2, xREDUCER +3: + eor T2, r1 + eor T2, ST22 + mov T3, ST23 + eor T3, ST20 + lsl T3 + brcc 3f + eor T3, xREDUCER +3: + eor T3, r1 + eor T3, ST23 + mov ST20, T0 mov ST21, T1 mov ST22, T2 mov ST23, T3 - ldi A, 2 - mov B, ST30 - rcall gf256mul - mov T0, r0 - ldi A, 3 - mov B, ST31 - rcall gf256mul - eor T0, r0 - eor T0, ST32 - eor T0, ST33 + mov r1, ST30 + eor r1, ST31 + 
eor r1, ST32 + eor r1, ST33 + + mov T0, ST30 + eor T0, ST31 + lsl T0 + brcc 3f + eor T0, xREDUCER +3: + eor T0, r1 + eor T0, ST30 - mov T1, ST30 - ldi A, 2 - mov B, ST31 - rcall gf256mul - eor T1, r0 - ldi A, 3 - mov B, ST32 - rcall gf256mul - eor T1, r0 - eor T1, ST33 + mov T1, ST31 + eor T1, ST32 + lsl T1 + brcc 3f + eor T1, xREDUCER +3: + eor T1, r1 + eor T1, ST31 - mov T2, ST30 - eor T2, ST31 - ldi A, 2 - mov B, ST32 - rcall gf256mul - eor T2, r0 - ldi A, 3 - mov B, ST33 - rcall gf256mul - eor T2, r0 - - ldi A, 3 - mov B, ST30 - rcall gf256mul - mov T3, r0 - eor T3, ST31 - eor T3, ST32 - ldi A, 2 - mov B, ST33 - rcall gf256mul - eor T3, r0 + mov T2, ST32 + eor T2, ST33 + lsl T2 + brcc 3f + eor T2, xREDUCER +3: + eor T2, r1 + eor T2, ST32 + mov T3, ST33 + eor T3, ST30 + lsl T3 + brcc 3f + eor T3, xREDUCER +3: + eor T3, r1 + eor T3, ST33 + mov ST30, T0 mov ST31, T1 mov ST32, T2 @@ -438,7 +395,7 @@ exit: st Z+, ST31 st Z+, ST32 st Z+, ST33 - + clr r1 pop r29 pop r28 pop_range 2, 17 diff --git a/aes_enc.c b/aes_enc.c index 244dcff..a9a2607 100644 --- a/aes_enc.c +++ b/aes_enc.c @@ -51,7 +51,7 @@ void aes_shiftcol(void* data, uint8_t shift){ static void aes_enc_round(aes_cipher_state_t* state, const aes_roundkey_t* k){ - uint8_t tmp[16]; + uint8_t tmp[16], t; uint8_t i; /* subBytes */ for(i=0; i<16; ++i){ @@ -63,26 +63,23 @@ void aes_enc_round(aes_cipher_state_t* state, const aes_roundkey_t* k){ aes_shiftcol(tmp+3, 3); /* mixColums */ for(i=0; i<4; ++i){ + t = tmp[4*i+0] ^ tmp[4*i+1] ^ tmp[4*i+2] ^ tmp[4*i+3]; state->s[4*i+0] = - GF256MUL_2(tmp[4*i+0]) - ^ GF256MUL_3(tmp[4*i+1]) - ^ GF256MUL_1(tmp[4*i+2]) - ^ GF256MUL_1(tmp[4*i+3]); + GF256MUL_2(tmp[4*i+0]^tmp[4*i+1]) + ^ tmp[4*i+0] + ^ t; state->s[4*i+1] = - GF256MUL_1(tmp[4*i+0]) - ^ GF256MUL_2(tmp[4*i+1]) - ^ GF256MUL_3(tmp[4*i+2]) - ^ GF256MUL_1(tmp[4*i+3]); + GF256MUL_2(tmp[4*i+1]^tmp[4*i+2]) + ^ tmp[4*i+1] + ^ t; state->s[4*i+2] = - GF256MUL_1(tmp[4*i+0]) - ^ GF256MUL_1(tmp[4*i+1]) - ^ GF256MUL_2(tmp[4*i+2]) - 
^ GF256MUL_3(tmp[4*i+3]); + GF256MUL_2(tmp[4*i+2]^tmp[4*i+3]) + ^ tmp[4*i+2] + ^ t; state->s[4*i+3] = - GF256MUL_3(tmp[4*i+0]) - ^ GF256MUL_1(tmp[4*i+1]) - ^ GF256MUL_1(tmp[4*i+2]) - ^ GF256MUL_2(tmp[4*i+3]); + GF256MUL_2(tmp[4*i+3]^tmp[4*i+0]) + ^ tmp[4*i+3] + ^ t; } /* addKey */ diff --git a/aes_sboxes-asm.S b/aes_sboxes-asm.S new file mode 100644 index 0000000..8ce54b6 --- /dev/null +++ b/aes_sboxes-asm.S @@ -0,0 +1,65 @@ +/* aes_sbox-asm.S */ +/* + This file is part of the Crypto-avr-lib/microcrypt-lib. + Copyright (C) 2008, 2009 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/ +/** + * \file aes_dec-asm.S + * \email daniel.otte@rub.de + * \author Daniel Otte + * \date 2009-01-10 + * \license GPLv3 or later + * + */ + +.balign 256 +.global aes_sbox +aes_sbox: +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + +.global aes_invsbox +aes_invsbox: +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 
0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d diff --git a/mkfiles/aes.mk b/mkfiles/aes.mk index 47d1da6..34cf3ca 100644 --- a/mkfiles/aes.mk +++ b/mkfiles/aes.mk @@ -5,7 +5,7 @@ ALGO_NAME := AES BLOCK_CIPHERS += $(ALGO_NAME) -$(ALGO_NAME)_OBJ := aes_enc-asm.o aes_dec.o aes_sbox.o aes_invsbox.o aes.o \ +$(ALGO_NAME)_OBJ := aes_enc-asm.o aes_dec.o aes_sboxes-asm.o aes.o \ aes_keyschedule-asm.o gf256mul.o \ aes128_dec.o aes192_dec.o aes256_dec.o $(ALGO_NAME)_TEST_BIN := main-aes-test.o debug.o uart.o serial-tools.o \