From 5ac75cfae217122b540c1a6d258054230dc534c3 Mon Sep 17 00:00:00 2001 From: bg Date: Tue, 25 Nov 2008 01:23:22 +0000 Subject: [PATCH] new MD5 ins ASM with C (working on pure ASM implementation) plus enhancments in asm Macros small changes in MD5 C-implementation (migth be a little faster) little bug fixed in Nessie-Hash-Test --- Makefile | 20 +- arcfour.c | 10 +- arcfour.h | 40 +-- avr-asm-macros.S | 4 +- avr-makefile.inc | 4 +- config.h | 8 +- md5-asm.S | 521 ++++++++++++++++++++++++++++++++++++ md5-stub.c | 135 ++++++++++ md5.c | 27 +- mkfiles/md5_asm.mk | 12 + sha256-asm.S | 14 +- test_src/cli.c | 6 +- test_src/main-md5-test.c | 13 +- test_src/nessie_hash_test.c | 16 +- test_src/performance_test.c | 4 +- test_src/uart.c | 3 + 16 files changed, 769 insertions(+), 68 deletions(-) create mode 100644 md5-asm.S create mode 100644 md5-stub.c create mode 100644 mkfiles/md5_asm.mk diff --git a/Makefile b/Makefile index 57abdad..359e592 100644 --- a/Makefile +++ b/Makefile @@ -14,15 +14,18 @@ include mkfiles/*.mk ALGORITHMS = $(BLOCK_CIPHERS) $(STREAM_CIPHERS) $(HASHES) $(PRNGS) $(MACS) ALGORITHMS_OBJ = $(patsubst %,%_OBJ, $(ALGORITHMS)) +ALGORITHMS_TEST_BIN = $(patsubst %,%_TEST_BIN, $(ALGORITHMS)) + define OBJinBINDIR_TEMPLATE $(1) = $(2) endef + $(foreach a, $(ALGORITHMS_OBJ), $(eval $(call OBJinBINDIR_TEMPLATE, $(a), $(patsubst %.o,$(BIN_DIR)%.o,$($(a)))))) -ALGORITHMS_TEST_BIN = $(patsubst %,%_TEST_BIN, $(ALGORITHMS)) + $(foreach a, $(ALGORITHMS_TEST_BIN), $(eval $(call OBJinBINDIR_TEMPLATE, $(a), $(patsubst %.o,$(TESTBIN_DIR)%.o,$($(a)))))) -ALGORITHMS_TEST_BIN_IMM = $(foreach a, $(ALGORITHMS_TEST_BIN), $($(a))) +#ALGORITHMS_TEST_BIN_IMM = $(foreach a, $(ALGORITHMS_TEST_BIN), $($(a))) ALGORITHMS_NESSIE_TEST = $(patsubst %,%_NESSIE_TEST, $(ALGORITHMS)) ALGORITHMS_PERFORMANCE_TEST = $(patsubst %,%_PERORMANCE_TEST, $(ALGORITHMS)) @@ -73,7 +76,7 @@ info: @echo " $(MACS)" @echo " PRNG functions:" @echo " $(PRNGS)" -# @echo " ALGORITHMS_TEST_BIN" +# @echo " ALGORITHMS_TEST_BIN:" # @echo " $(ALGORITHMS_TEST_BIN)" # @echo " ALGORITHMS_TEST_TARGET_ELF:" # @echo " $(ALGORITHMS_TEST_TARGET_ELF)" @@ -116,6 +119,15 @@ endef $(foreach algo, $(ALGORITHMS), $(eval $(call OBJ_TEMPLATE, $(algo), $($(algo)_OBJ)))) + +#------------------------------------------------------------------------------- + +define TESTBIN_TEMPLATE +$(1)_TEST_BIN: $(2) +endef + +$(foreach algo, $(ALGORITHMS), $(eval $(call TESTBIN_TEMPLATE, $(algo), $($(algo)_TEST_BIN)))) + #------------------------------------------------------------------------------- $(BLOCK_CIPHERS_OBJ): $(patsubst %,%_OBJ, $(BLOCK_CIPHERS)) @@ -124,8 +136,6 @@ $(HASHES_OBJ): $(patsubst %,%_OBJ, $(HASHES)) $(PRNGS_OBJ): $(patsubst %,%_OBJ, $(PRNGS)) $(MACS_OBJ): $(patsubst %,%_OBJ, $(MACS)) -$(ALGORITHMS_TEST_BIN): $(ALGORITHMS_TEST_BIN_IMM) - #------------------------------------------------------------------------------- define SIZE_TEMPLATE diff --git a/arcfour.c b/arcfour.c index 93b2e26..5dcb84f 100644 --- a/arcfour.c +++ b/arcfour.c @@ -16,16 +16,16 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -/* +/* * File: arcfour.c * Author: Daniel Otte * email: daniel.otte@rub.de * Date: 2006-06-07 * License: GPLv3 or later * Description: Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm. - * + * */ - + #include #include "arcfour.h" @@ -38,7 +38,7 @@ void arcfour_init(const void *key, uint8_t length_B, arcfour_ctx_t *ctx){ uint16_t x,y=0; for(x=0; x<= 255; ++x) ctx->s[x]=x; - + for(x=0; x<= 255; ++x){ y += ctx->s[x] + ((uint8_t*)key)[x % length_B]; y &= 0xff; @@ -46,7 +46,7 @@ void arcfour_init(const void *key, uint8_t length_B, arcfour_ctx_t *ctx){ t = ctx->s[y]; ctx->s[y] = ctx->s[x]; ctx->s[x] = t; - } + } ctx->i = ctx->j = 0; } diff --git a/arcfour.h b/arcfour.h index 23fb458..9adf28d 100644 --- a/arcfour.h +++ b/arcfour.h @@ -16,29 +16,29 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -/* +/* * File: arcfour.h * Author: Daniel Otte * Date: 2006-06-07 * License: GPLv3+ - * Description: Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm. + * Description: Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm. */ - -/** + +/** * \file arcfour.h * \author Daniel Otte * \date 2006-06-07 * \license GPLv3+ - * \brief Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm. - * + * \brief Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm. + * * This header file defines the interface of the ARCFOUR cipher implementation. - * + * * This implementation aims to be compatible with the ARCFOUR description - * availabe at + * available at * http://www.mozilla.org/projects/security/pki/nss/draft-kaukonen-cipher-arcfour-03.txt */ - - + + #ifndef ARCFOUR_H_ #define ARCFOUR_H_ @@ -46,46 +46,46 @@ /** \typedef arcfour_ctx_t * \brief type for arcfour context - * + * * A variable of this type may contain a complete ARCFOUR context. * The context is used to store the state of the cipher and gets * created by the arcfour_init(arcfour_ctx_t *c, uint8_t *key, uint8_t length_B) * function. The context is of the fixed size of 258 bytes */ - + /** \struct arcfour_ctx_st * \brief base for ::arcfour_ctx_t - * + * * The struct holds the two indices and the S-Box */ typedef struct arcfour_ctx_st { uint8_t i,j; uint8_t s[256]; } arcfour_ctx_t; - + /** \fn void arcfour_init(arcfour_ctx_t *ctx, void *key, uint8_t length_B) * \brief setup a context with a key - * + * * This function sets up a ::arcfour_ctx_t context using * the supplied key of the given length. * \param ctx pointer to the context * \param key pointer to the key * \param length_B length of the key in bytes (between 1 and 255) */ - + void arcfour_init(const void *key, uint8_t length_B, arcfour_ctx_t *ctx); /** \fn uint8_t arcfour_gen(arcfour_ctx_t *ctx) * \brief generates a byte of keystream - * + * * This function generates the next byte of keystream - * from the supplied ::arcfour_ctx_t context which is updated acordingly - * + * from the supplied ::arcfour_ctx_t context which is updated accordingly + * * \param ctx pointer to the context * \return byte of keystream */ - + uint8_t arcfour_gen(arcfour_ctx_t *ctx); #endif diff --git a/avr-asm-macros.S b/avr-asm-macros.S index 2acb4a1..f878be8 100644 --- a/avr-asm-macros.S +++ b/avr-asm-macros.S @@ -67,8 +67,8 @@ sbiw \reg1, \size cli out _SFR_IO_ADDR(SPH), \reg2 - out _SFR_IO_ADDR(SPL), \reg1 out _SFR_IO_ADDR(SREG), r0 + out _SFR_IO_ADDR(SPL), \reg1 .endm .macro stack_free size:req, reg1=r30, reg2=r31 @@ -78,8 +78,8 @@ adiw \reg1, \size cli out _SFR_IO_ADDR(SPH), \reg2 - out _SFR_IO_ADDR(SPL), \reg1 out _SFR_IO_ADDR(SREG), r0 + out _SFR_IO_ADDR(SPL), \reg1 .endm /******************************************************************************* diff --git a/avr-makefile.inc b/avr-makefile.inc index ac8e647..08457a3 100644 --- a/avr-makefile.inc +++ b/avr-makefile.inc @@ -1,9 +1,9 @@ -OBJ = $(SERPENT_OBJ) MCU_TARGET = atmega644 OPTIMIZE = -Os DEFS = -D$(call uc, $(MCU_TARGET)) FLASHCMD = avrdude -p $(MCU_TARGET) -P /dev/ttyUSB0 -c avr911 -U flash:w:# no space at the end +#FLASHCMD = avrdude -p $(MCU_TARGET) -c usbasp -U flash:w:# no space at the end DEP_DIR = deps/ BIN_DIR = bin/ TESTBIN_DIR = test_bin/ @@ -21,7 +21,7 @@ CC = avr-gcc override CFLAGS = -MMD -MF$(DEP_DIR)$(patsubst %.c,%.d,$(patsubst $(TESTSRC_DIR)%,%,$<)) -I. -gdwarf-2 -pedantic -std=c99 -Wall -Wstrict-prototypes $(OPTIMIZE) -mmcu=$(MCU_TARGET) $(DEFS) override LDFLAGS = -gdwarf-2 -Wl,-Map, -override ASFLAGS = -mmcu=$(MCU_TARGET) -gdwarf-2 +override ASFLAGS = -mmcu=$(MCU_TARGET) -Wa,--gdwarf-2 SIZESTAT_FILE = sizestats.txt diff --git a/config.h b/config.h index 06548bd..529ff51 100644 --- a/config.h +++ b/config.h @@ -20,13 +20,17 @@ #define __CONFIG_H__ #include + #define F_CPU 16000000 /* Oszillator-Frequenz in Hz */ +// #define F_CPU 14745600 + + #define DEBUG uart /* uart.[ch] defines */ #define UART_INTERRUPT 1 -#define UART_BAUD_RATE 38400 +#define UART_BAUD_RATE 9600 #define UART_RXBUFSIZE 16 #define UART_TXBUFSIZE 16 #define UART_LINE_BUFFER_SIZE 40 @@ -41,8 +45,6 @@ #define UART_CTS_BIT 1 */ -//#define ATMEGA644 /* this is now done by make */ - #define CLI_AUTO_HELP #endif diff --git a/md5-asm.S b/md5-asm.S new file mode 100644 index 0000000..ee5f942 --- /dev/null +++ b/md5-asm.S @@ -0,0 +1,521 @@ +/* md5-asm.S */ +/* + This file is part of the Crypto-avr-lib/microcrypt-lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/* + * Author: Daniel Otte + * License: GPLv3 or later + * Date: 2008-11-15 +*/ + +.include "avr-asm-macros.S" + +;########################################################### +; S-BOX + +T_table: +.hword 0xa478, 0xd76a, 0xb756, 0xe8c7, 0x70db, 0x2420, 0xceee, 0xc1bd, 0x0faf, 0xf57c +.hword 0xc62a, 0x4787, 0x4613, 0xa830, 0x9501, 0xfd46, 0x98d8, 0x6980, 0xf7af, 0x8b44 +.hword 0x5bb1, 0xffff, 0xd7be, 0x895c, 0x1122, 0x6b90, 0x7193, 0xfd98, 0x438e, 0xa679 +.hword 0x0821, 0x49b4, 0x2562, 0xf61e, 0xb340, 0xc040, 0x5a51, 0x265e, 0xc7aa, 0xe9b6 +.hword 0x105d, 0xd62f, 0x1453, 0x0244, 0xe681, 0xd8a1, 0xfbc8, 0xe7d3, 0xcde6, 0x21e1 +.hword 0x07d6, 0xc337, 0x0d87, 0xf4d5, 0x14ed, 0x455a, 0xe905, 0xa9e3, 0xa3f8, 0xfcef +.hword 0x02d9, 0x676f, 0x4c8a, 0x8d2a, 0x3942, 0xfffa, 0xf681, 0x8771, 0x6122, 0x6d9d +.hword 0x380c, 0xfde5, 0xea44, 0xa4be, 0xcfa9, 0x4bde, 0x4b60, 0xf6bb, 0xbc70, 0xbebf +.hword 0x7ec6, 0x289b, 0x27fa, 0xeaa1, 0x3085, 0xd4ef, 0x1d05, 0x0488, 0xd039, 0xd9d4 +.hword 0x99e5, 0xe6db, 0x7cf8, 0x1fa2, 0x5665, 0xc4ac, 0x2244, 0xf429, 0xff97, 0x432a +.hword 0x23a7, 0xab94, 0xa039, 0xfc93, 0x59c3, 0x655b, 0xcc92, 0x8f0c, 0xf47d, 0xffef +.hword 0x5dd1, 0x8584, 0x7e4f, 0x6fa8, 0xe6e0, 0xfe2c, 0x4314, 0xa301, 0x11a1, 0x4e08 +.hword 0x7e82, 0xf753, 0xf235, 0xbd3a, 0xd2bb, 0x2ad7, 0xd391, 0xeb86 + + +#define MD5_init_fast + +.global md5_init +#ifndef MD5_init_fast +;########################################################### +;void md5_init(md5_ctx_t *state) +; param1: (r24,r25) 16-bit pointer to sha256_ctx_t struct in ram +; modifys: Z(r30,r31), X(r25,r26) +; size = 9+5*4 WORDS = 29 WORDS = 58 Bytes +md5_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8(md5_init_vector) + ldi r31, hi8(md5_init_vector) + ldi r24, 16+4 +md5_init_vloop: + lpm r0, Z+ + st X+, r0 + dec r24 + brne md5_init_vloop + ret + +md5_init_vector: +.hword 0x2301, 0x6745 +.hword 0xAB89, 0xEFCD +.hword 0xDCFE, 0x98BA +.hword 0x5476, 0x1032 +.hword 0x0000, 0x0000 + +#else +;########################################################### +.global md5_init_fast +;void md5_init(md5_ctx_t *state) +; param1: (r24,r25) 16-bit pointer to sha256_ctx_t struct in ram +; modifys: r23, r22 +; cycles = 1+16*3+4*2+4 = 1+48+12 = 61 +; size = 1+16*2+4+1 WORDS = 38 WORDS = 76 Bytes +md5_init: +md5_init_fast: + movw r26, r24 + ldi r24, 0x01 + st X+, r24 + ldi r24, 0x23 + st X+, r24 + ldi r24, 0x45 + st X+, r24 + ldi r24, 0x67 + st X+, r24 + ldi r24, 0x89 + st X+, r24 + ldi r24, 0xAB + st X+, r24 + ldi r24, 0xCD + st X+, r24 + ldi r24, 0xEF + st X+, r24 + ldi r24, 0xFE + st X+, r24 + ldi r24, 0xDC + st X+, r24 + ldi r24, 0xBA + st X+, r24 + ldi r24, 0x98 + st X+, r24 + ldi r24, 0x76 + st X+, r24 + ldi r24, 0x54 + st X+, r24 + ldi r24, 0x32 + st X+, r24 + ldi r24, 0x10 + st X+, r24 + st X+, r1 + st X+, r1 + st X+, r1 + st X+, r1 + ret +#endif +;########################################################### + +/* +static +uint32_t md5_F(uint32_t x, uint32_t y, uint32_t z){ + return ((x&y)|((~x)&z)); +} +*/ +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 +md5_F: + and r18, r22 + and r19, r23 + and r20, r24 + and r21, r25 + com r22 + com r23 + com r24 + com r25 + and r22, r14 + and r23, r15 + and r24, r16 + and r25, r17 + or r22, r18 + or r23, r19 + or r24, r20 + or r25, r21 + rjmp md5_core_F_exit + +/* +static +uint32_t md5_G(uint32_t x, uint32_t y, uint32_t z){ + return ((x&z)|((~z)&y)); +} +*/ + +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 +md5_G: + and r22, r14 + and r23, r15 + and r24, r16 + and r25, r17 + com r14 + com r15 + com r16 + com r17 + and r18, r14 + and r19, r15 + and r20, r16 + and r21, r17 + or r22, r18 + or r23, r19 + or r24, r20 + or r25, r21 + rjmp md5_core_F_exit +/* +static +uint32_t md5_H(uint32_t x, uint32_t y, uint32_t z){ + return (x^y^z); +} +*/ +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 +md5_H: + eor r22, r18 + eor r22, r14 + eor r23, r19 + eor r23, r15 + eor r24, r20 + eor r24, r16 + eor r25, r21 + eor r25, r17 + rjmp md5_core_F_exit +/* +static +uint32_t md5_I(uint32_t x, uint32_t y, uint32_t z){ + return (y ^ (x | (~z))); +} +*/ + +jump_table: + rjmp md5_F + rjmp md5_G + rjmp md5_H +; rjmp md5_I + +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 +md5_I: + com r14 + com r15 + com r16 + com r17 + or r22, r14 + or r23, r15 + or r24, r16 + or r25, r17 + eor r22, r18 + eor r23, r19 + eor r24, r20 + eor r25, r21 + rjmp md5_core_F_exit + +as_table: +; (as+0)&3 (as+3)&3 (as+1)&3 (as+2)&3 +; Z X Y +; AS_SAVE0 AS_SAVE1 AS_SAVE2 AS_SAVE3 +.byte 1*4, 0*4, 2*4, 3*4 ;as=1 +.byte 2*4, 1*4, 3*4, 0*4 ;as=2 +.byte 3*4, 2*4, 0*4, 1*4 ;as=3 +.byte 0*4, 3*4, 1*4, 2*4 ;as=4 + +;########################################################### +.global md5_core +md5_core: + mov r21, r20 + mov r20, r18 + mov r19, r16 + mov r18, r14 +; rjmp md5_core_asm +/* +void md5_core(uint32_t* a, void* block, uint8_t as, uint8_t s, uint8_t i, uint8_t fi){ + uint32_t t; + md5_func_t* funcs[]={md5_F, md5_G, md5_H, md5_I}; + as &= 0x3; + / * a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). * / + t = a[as] + funcs[fi](a[(as+1)&3], a[(as+2)&3], a[(as+3)&3]) + *((uint32_t*)block) + md5_T[i] ; + a[as]=a[(as+1)&3] + ROTL32(t, s); +} +*/ +; a: r24-r25 +; block: r22-r23 +; as: r21 +; s: r20 +; i: r19 +; fi: r18 +P_A0 = 24 +P_A1 = 25 +P_B0 = 22 +P_B1 = 23 +P_AS = 21 +P_S = 20 +P_I = 19 +P_FI = 18 + +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 + + +AS_SAVE0 = 4 +AS_SAVE1 = 5 +AS_SAVE2 = 6 +AS_SAVE3 = 7 +FI_SAVE = 8 +S_SAVE = 9 +ACCU0 = 10 +ACCU1 = 11 +ACCU2 = 12 +ACCU3 = 13 +ARG_X0 = 22 +ARG_X1 = 23 +ARG_X2 = 24 +ARG_X3 = 25 +ARG_Y0 = 18 +ARG_Y1 = 19 +ARG_Y2 = 20 +ARG_Y3 = 21 +ARG_Z0 = 14 +ARG_Z1 = 15 +ARG_Z2 = 16 +ARG_Z3 = 17 + + +md5_core_asm: + push r28 + push r29 + push_range 4, 17 + ldi r30, lo8(T_table) + ldi r31, hi8(T_table) + lsl P_I + rol r1 + lsl P_I + rol r1 + add r30, P_I + adc r31, r1 + clr r1 + mov FI_SAVE, r18 + /* loading T[i] into ACCU */ + lpm ACCU0, Z+ + lpm ACCU1, Z+ + lpm ACCU2, Z+ + lpm ACCU3, Z + /* add *block to ACCU */ + movw r30, P_B0 + ld r0, Z+ + add ACCU0, r0 + ld r0, Z+ + adc ACCU1, r0 + ld r0, Z+ + adc ACCU2, r0 + ld r0, Z+ + adc ACCU3, r0 + /* add a[as+0&3] to ACCU */ + ldi r30, lo8(as_table) + ldi r31, hi8(as_table) + dec P_AS + andi P_AS, 0x03 + lsl P_AS + lsl P_AS + add r30, r21 + adc r31, r1 ; Z points to the correct row in as_table + lpm AS_SAVE0, Z+ + lpm AS_SAVE1, Z+ + lpm AS_SAVE2, Z+ + lpm AS_SAVE3, Z + movw r26, r24 ; X points to a[0] + add r26, AS_SAVE0 + adc r27, r1 ; X points at a[as&3] + ld r0, X+ + add ACCU0, r0 + ld r0, X+ + adc ACCU1, r0 + ld r0, X+ + adc ACCU2, r0 + ld r0, X+ + adc ACCU3, r0 + mov S_SAVE, r20 + + movw r28, r24 + /* loading z value */ + movw r26, r28 + add r26, AS_SAVE1 + adc r27, r1 + ld ARG_Z0, X+ + ld ARG_Z1, X+ + ld ARG_Z2, X+ + ld ARG_Z3, X + + /* loading x value */ + movw r26, r28 + add r26, AS_SAVE2 + adc r27, r1 + ld ARG_X0, X+ + ld ARG_X1, X+ + ld ARG_X2, X+ + ld ARG_X3, X + + /* loading y value */ + movw r26, r28 + add r26, AS_SAVE3 + adc r27, r1 + ldi r30, pm_lo8(jump_table) + ldi r31, pm_hi8(jump_table) + add r30, FI_SAVE + adc r31, r1 ; Z points to the correct entry in our jump table + ld ARG_Y0, X+ + ld ARG_Y1, X+ + ld ARG_Y2, X+ + ld ARG_Y3, X + + ijmp /* calls the function pointed by Z */ +md5_core_F_exit: + + /* add ACCU to result of f() */ + add r22, ACCU0 + adc r23, ACCU1 + adc r24, ACCU2 + adc r25, ACCU3 + + /* rotate */ + mov r20, S_SAVE +rotl32: + cpi r20, 8 + brlo bitrotl + mov r21, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + subi r20, 8 + rjmp rotl32 +bitrotl: + mov r21, r25 +bitrotl_loop: + tst r20 + breq fixrotl +bitrotl_loop2: + lsl r21 + rol r22 + rol r23 + rol r24 + rol r25 + dec r20 + brne bitrotl_loop2 +fixrotl: + + /* add a[(as+1)&3] */ + movw r26, r28 + add r26, AS_SAVE2 + adc r27, r1 + ld r0, X+ + add r22, r0 + ld r0, X+ + adc r23, r0 + ld r0, X+ + adc r24, r0 + ld r0, X + adc r25, r0 + + /* store result */ + movw r26, r28 + add r26, AS_SAVE0 + adc r27, r1 + st X+, r22 + st X+, r23 + st X+, r24 + st X , r25 +md5_core_exit: + pop_range 4, 17 + pop r29 + pop r28 + ret + +;################################################################### +/* +void md5_nextBlock(md5_ctx_t *state, void* block){ + uint32_t a[4]; + uint8_t m,n,i=0; + + a[0]=state->a[0]; + a[1]=state->a[1]; + a[2]=state->a[2]; + a[3]=state->a[3]; + + / * round 1 * / + uint8_t s1t[]={7,12,17,22}; // 1,-1 1,4 2,-1 3,-2 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[m*4+n]), 4-n, s1t[n],i++,0); + } + } + / * round 2 * / + uint8_t s2t[]={5,9,14,20}; // 1,-3 1,1 2,-2 2,4 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[(1+m*4+n*5)&0xf]), 4-n, s2t[n],i++,1); + } + } + / * round 3 * / + uint8_t s3t[]={4,11,16,23}; // 0,4 1,3 2,0 3,-1 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[(5-m*4+n*3)&0xf]), 4-n, s3t[n],i++,2); + } + } + / * round 4 * / + uint8_t s4t[]={6,10,15,21}; // 1,-2 1,2 2,-1 3,-3 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[(0-m*4+n*7)&0xf]), 4-n, s4t[n],i++,3); + } + } + state->a[0] += a[0]; + state->a[1] += a[1]; + state->a[2] += a[2]; + state->a[3] += a[3]; + state->counter++; +} +*/ +/* +shift_table: + .byte 7,12,17,22 + .byte 5, 9,14,20 + .byte 4,11,16,23 + .byte 6,10,15,21 + +md5_nextBlock: + stack_alloc 4*4 + + + + + stack_free 4*4 + + +*/ + + + + + + + diff --git a/md5-stub.c b/md5-stub.c new file mode 100644 index 0000000..f9fb945 --- /dev/null +++ b/md5-stub.c @@ -0,0 +1,135 @@ +/* md5-asm.c */ +/* + This file is part of the Crypto-avr-lib/microcrypt-lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + + #include "md5.h" + #include "uart.h" + #include + #include + + #undef DEBUG + +void md5_core(uint32_t* a, void* block, uint8_t as, uint8_t s, uint8_t i, uint8_t fi); + +/* +#define ROTL32(x,n) (((x)<<(n)) | ((x)>>(32-(n)))) + +static +void md5_core(uint32_t* a, void* block, uint8_t as, uint8_t s, uint8_t i, uint8_t fi){ + uint32_t t; + md5_func_t* funcs[]={md5_F, md5_G, md5_H, md5_I}; + as &= 0x3; + // * a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). * / +#ifdef DEBUG + char funcc[]={'*', '-', '+', '~'}; + uart_putstr("\r\n DBG: md5_core ["); + uart_putc(funcc[fi]); + uart_hexdump(&as, 1); uart_putc(' '); + uart_hexdump(&k, 1); uart_putc(' '); + uart_hexdump(&s, 1); uart_putc(' '); + uart_hexdump(&i, 1); uart_putc(']'); +#endif + t = a[as] + funcs[fi](a[(as+1)&3], a[(as+2)&3], a[(as+3)&3]) + *((uint32_t*)block) + md5_T[i] ; + a[as]=a[(as+1)&3] + ROTL32(t, s); +} +*/ + + +void md5_nextBlock(md5_ctx_t *state, void* block){ + uint32_t a[4]; + uint8_t m,n,i=0; + /* this requires other mixed sboxes */ +#ifdef DEBUG + uart_putstr("\r\n DBG: md5_nextBlock: block:\r\n"); + uart_hexdump(block, 16); uart_putstr("\r\n"); + uart_hexdump(block+16, 16); uart_putstr("\r\n"); + uart_hexdump(block+32, 16); uart_putstr("\r\n"); + uart_hexdump(block+48, 16); uart_putstr("\r\n"); +#endif + + a[0]=state->a[0]; + a[1]=state->a[1]; + a[2]=state->a[2]; + a[3]=state->a[3]; + + /* round 1 */ + uint8_t s1t[]={7,12,17,22}; // 1,-1 1,4 2,-1 3,-2 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[m*4+n]), 4-n, s1t[n],i++,0); + } + } + /* round 2 */ + uint8_t s2t[]={5,9,14,20}; // 1,-3 1,1 2,-2 2,4 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[(1+m*4+n*5)&0xf]), 4-n, s2t[n],i++,1); + } + } + /* round 3 */ + uint8_t s3t[]={4,11,16,23}; // 0,4 1,3 2,0 3,-1 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[(5-m*4+n*3)&0xf]), 4-n, s3t[n],i++,2); + } + } + /* round 4 */ + uint8_t s4t[]={6,10,15,21}; // 1,-2 1,2 2,-1 3,-3 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[(0-m*4+n*7)&0xf]), 4-n, s4t[n],i++,3); + } + } + state->a[0] += a[0]; + state->a[1] += a[1]; + state->a[2] += a[2]; + state->a[3] += a[3]; + state->counter++; +} + +void md5_lastBlock(md5_ctx_t *state, void* block, uint16_t length_b){ + uint16_t l; + uint8_t b[64]; + while (length_b >= 512){ + md5_nextBlock(state, block); + length_b -= 512; + block = ((uint8_t*)block) + 512/8; + } + memset(b, 0, 64); + memcpy(b, block, length_b/8); + /* insert padding one */ + l=length_b/8; + if(length_b%8){ + uint8_t t; + t = ((uint8_t*)block)[l]; + t |= (0x80>>(length_b%8)); + b[l]=t; + }else{ + b[l]=0x80; + } + /* insert length value */ + if(l+sizeof(uint64_t) >= 512/8){ + md5_nextBlock(state, b); + state->counter--; + memset(b, 0, 64); + } + *((uint64_t*)&b[64-sizeof(uint64_t)]) = (state->counter * 512) + length_b; + md5_nextBlock(state, b); +} + + diff --git a/md5.c b/md5.c index 88f4f68..c995b2a 100644 --- a/md5.c +++ b/md5.c @@ -40,19 +40,23 @@ void md5_init(md5_ctx_t *s){ s->a[2] = 0x98badcfe; s->a[3] = 0x10325476; } - + +static uint32_t md5_F(uint32_t x, uint32_t y, uint32_t z){ return ((x&y)|((~x)&z)); } +static uint32_t md5_G(uint32_t x, uint32_t y, uint32_t z){ return ((x&z)|((~z)&y)); } +static uint32_t md5_H(uint32_t x, uint32_t y, uint32_t z){ return (x^y^z); } +static uint32_t md5_I(uint32_t x, uint32_t y, uint32_t z){ return (y ^ (x | (~z))); } @@ -61,7 +65,8 @@ typedef uint32_t md5_func_t(uint32_t, uint32_t, uint32_t); #define ROTL32(x,n) (((x)<<(n)) | ((x)>>(32-(n)))) -void md5_core(uint32_t* a, uint8_t as, void* block, uint8_t k, uint8_t s, uint8_t i, uint8_t fi){ +static +void md5_core(uint32_t* a, void* block, uint8_t as, uint8_t s, uint8_t i, uint8_t fi){ uint32_t t; md5_func_t* funcs[]={md5_F, md5_G, md5_H, md5_I}; as &= 0x3; @@ -75,7 +80,7 @@ void md5_core(uint32_t* a, uint8_t as, void* block, uint8_t k, uint8_t s, uint8_ uart_hexdump(&s, 1); uart_putc(' '); uart_hexdump(&i, 1); uart_putc(']'); #endif - t = a[as] + funcs[fi](a[(as+1)&3], a[(as+2)&3], a[(as+3)&3]) + ((uint32_t*)block)[k] + md5_T[i] ; + t = a[as] + funcs[fi](a[(as+1)&3], a[(as+2)&3], a[(as+3)&3]) + *((uint32_t*)block) + md5_T[i] ; a[as]=a[(as+1)&3] + ROTL32(t, s); } @@ -97,31 +102,31 @@ void md5_nextBlock(md5_ctx_t *state, void* block){ a[3]=state->a[3]; /* round 1 */ - uint8_t s1t[]={7,12,17,22}; + uint8_t s1t[]={7,12,17,22}; // 1,-1 1,4 2,-1 3,-2 for(m=0;m<4;++m){ for(n=0;n<4;++n){ - md5_core(a, 4-n, block, m*4+n, s1t[n],i++,0); + md5_core(a, &(((uint32_t*)block)[m*4+n]), 4-n, s1t[n],i++,0); } } /* round 2 */ - uint8_t s2t[]={5,9,14,20}; + uint8_t s2t[]={5,9,14,20}; // 1,-3 1,1 2,-2 2,4 for(m=0;m<4;++m){ for(n=0;n<4;++n){ - md5_core(a, 4-n, block, (1+m*4+n*5)&0xf, s2t[n],i++,1); + md5_core(a, &(((uint32_t*)block)[(1+m*4+n*5)&0xf]), 4-n, s2t[n],i++,1); } } /* round 3 */ - uint8_t s3t[]={4,11,16,23}; + uint8_t s3t[]={4,11,16,23}; // 0,4 1,3 2,0 3,-1 for(m=0;m<4;++m){ for(n=0;n<4;++n){ - md5_core(a, 4-n, block, (5-m*4+n*3)&0xf, s3t[n],i++,2); + md5_core(a, &(((uint32_t*)block)[(5-m*4+n*3)&0xf]), 4-n, s3t[n],i++,2); } } /* round 4 */ - uint8_t s4t[]={6,10,15,21}; + uint8_t s4t[]={6,10,15,21}; // 1,-2 1,2 2,-1 3,-3 for(m=0;m<4;++m){ for(n=0;n<4;++n){ - md5_core(a, 4-n, block, (0-m*4+n*7)&0xf, s4t[n],i++,3); + md5_core(a, &(((uint32_t*)block)[(0-m*4+n*7)&0xf]), 4-n, s4t[n],i++,3); } } state->a[0] += a[0]; diff --git a/mkfiles/md5_asm.mk b/mkfiles/md5_asm.mk new file mode 100644 index 0000000..8cba137 --- /dev/null +++ b/mkfiles/md5_asm.mk @@ -0,0 +1,12 @@ +# Makefile for MD5 +ALGO_NAME := MD5_ASM + +# comment out the following line for removement of MD5 from the build process +HASHES += $(ALGO_NAME) + +$(ALGO_NAME)_OBJ := md5-asm.o md5-stub.o +$(ALGO_NAME)_TEST_BIN := main-md5-test.o debug.o uart.o serial-tools.o \ + nessie_hash_test.o nessie_common.o cli.o performance_test.o +$(ALGO_NAME)_NESSIE_TEST := "nessie" +$(ALGO_NAME)_PEROFRMANCE_TEST := "performance" + diff --git a/sha256-asm.S b/sha256-asm.S index 403506e..6795604 100644 --- a/sha256-asm.S +++ b/sha256-asm.S @@ -919,24 +919,18 @@ sha256_kv: ; round-key-vector stored in ProgMem ; state->length=0; ; memcpy(state->h, sha256_init_vector, 8*4); ;} -; param1: (Func3,r24) 16-bit pointer to sha256_ctx_t struct in ram +; param1: (r23,r24) 16-bit pointer to sha256_ctx_t struct in ram ; modifys: Z(r30,r31), Func1, r22 sha256_init: movw r26, r24 ; (24,25) --> (26,27) load X with param1 ldi r30, lo8((sha256_init_vector)) ldi r31, hi8((sha256_init_vector)) - ldi r22, 32 + ldi r22, 32+8 sha256_init_vloop: lpm r23, Z+ st X+, r23 dec r22 brne sha256_init_vloop - ldi r22, 8 - clr r1 ;this should not be needed -sha256_init_lloop: - st X+, r1 - dec r22 - brne sha256_init_lloop ret sha256_init_vector: @@ -948,6 +942,8 @@ sha256_init_vector: .word 0x688C, 0x9B05 .word 0xD9AB, 0x1F83 .word 0xCD19, 0x5BE0 +.word 0x0000, 0x0000 +.word 0x0000, 0x0000 ;########################################################### @@ -968,7 +964,7 @@ rotl32: mov r23, r22 mov r22, r21 subi r20, 8 - rjmp rotr32 + rjmp rotl32 bitrotl: clr r21 clc diff --git a/test_src/cli.c b/test_src/cli.c index e8ed063..37bbb25 100644 --- a/test_src/cli.c +++ b/test_src/cli.c @@ -80,9 +80,11 @@ int16_t execcommand_d0_P(const char* str, PGM_P v, void(*fpt[])(void) ){ if(i!=-1){ if(fpt[i]) fpt[i](); + return i; + }else{ + cli_auto_help_P(v); + return -1; } - cli_auto_help_P(v); - return -1; } diff --git a/test_src/main-md5-test.c b/test_src/main-md5-test.c index 490ee48..c3e40da 100644 --- a/test_src/main-md5-test.c +++ b/test_src/main-md5-test.c @@ -85,7 +85,12 @@ void testrun_nessie_md5(void){ void testrun_md5(void){ md5_ctx_t s; - char* testv[]={"", "a", "abc", "message digest", "abcdefghijklmnopqrstuvwxyz", + char* testv[]={ + "", + "a", + "abc", + "message digest", + "abcdefghijklmnopqrstuvwxyz", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", "12345678901234567890123456789012345678901234567890123456789012345678901234567890"}; uint8_t i; @@ -94,10 +99,10 @@ void testrun_md5(void){ for(i=0; i<7; ++i){ uart_putstr("\r\n MD5 (\""); uart_putstr(testv[i]); - uart_putstr("\") = \r\n"); + uart_putstr("\") = \r\n\t"); md5_init(&s); md5_lastBlock(&s, testv[i], strlen(testv[i])*8); - uart_hexdump(&s.a[0], 16); + uart_hexdump(&(s.a[0]), 16); } } @@ -141,7 +146,7 @@ void testrun_performance_md5(void){ /***************************************************************************** - * main * + * main * *****************************************************************************/ int main (void){ diff --git a/test_src/nessie_hash_test.c b/test_src/nessie_hash_test.c index a870b82..27b9892 100644 --- a/test_src/nessie_hash_test.c +++ b/test_src/nessie_hash_test.c @@ -151,12 +151,22 @@ static void tv4_hash(void){ uint8_t ctx[nessie_hash_ctx.ctx_size_B]; uint8_t hash[(nessie_hash_ctx.hashsize_b+7)/8]; - uint8_t block[256/8]; - uint16_t n=256; + uint8_t block[nessie_hash_ctx.hashsize_b/8]; + uint16_t n=nessie_hash_ctx.hashsize_b; uint32_t i; uart_putstr_P(PSTR("\r\n message=")); - uart_putstr(PSTR("256 zero bits")); + if(nessie_hash_ctx.hashsize_b>=10000) + uart_putc('0' + (nessie_hash_ctx.hashsize_b/10000)%10); + if(nessie_hash_ctx.hashsize_b>=1000) + uart_putc('0' + (nessie_hash_ctx.hashsize_b/1000)%10); + if(nessie_hash_ctx.hashsize_b>=100) + uart_putc('0' + (nessie_hash_ctx.hashsize_b/100)%10); + if(nessie_hash_ctx.hashsize_b>=10) + uart_putc('0' + (nessie_hash_ctx.hashsize_b/10)%10); + uart_putc('0' + nessie_hash_ctx.hashsize_b%10); + + uart_putstr_P(PSTR(" zero bits")); memset(block, 0, 256/8); nessie_hash_ctx.hash_init(ctx); diff --git a/test_src/performance_test.c b/test_src/performance_test.c index 04a26bd..5853c84 100644 --- a/test_src/performance_test.c +++ b/test_src/performance_test.c @@ -51,13 +51,13 @@ ISR(TIMER1_OVF_vect){ } void calibrateTimer(void){ + volatile uint8_t i; startTimer(1); stopTimer(); const_overhead = TCNT1; startTimer(1); TCNT1=0xFFFE; - ; ; ; ; -// asm volatile("NOP\n"::); asm volatile("NOP\n"::); + i++; stopTimer(); int_overhead = TCNT1; } diff --git a/test_src/uart.c b/test_src/uart.c index e35106c..cf952ed 100644 --- a/test_src/uart.c +++ b/test_src/uart.c @@ -33,6 +33,9 @@ #define UBRRH UBRR0H #define UBRRL UBRR0L #define URSEL UMSEL +#define USART_UDRE_vect USART0_UDRE_vect +#define USART_RXC_vect USART0_RX_vect +#define UCSRA UCSR0A #endif #ifdef ATMEGA644