From b246a2a0589f234db6247255555df98f4c281c41 Mon Sep 17 00:00:00 2001 From: bg Date: Fri, 2 Sep 2011 18:42:52 +0000 Subject: [PATCH] optimized xtea asm implementation --- arcfour/arcfour.c | 16 +- avr-ethernut2.1-makefile.inc | 51 +++ avr-makefile.inc | 8 +- bcal/bcal_xtea.c | 2 +- mkfiles/a5_1.mk | 4 +- test_src/avr-asm-macros.S | 4 +- test_src/config.h | 2 +- test_src/dump-asm.S | 4 + test_src/main-xtea-test.c | 21 +- test_src/uart_defs.h | 37 +- test_src/uart_i.h | 156 +++++-- xtea/xtea-asm.S | 761 ++++++++++------------------------- xtea/xtea.c | 1 - 13 files changed, 454 insertions(+), 613 deletions(-) create mode 100644 avr-ethernut2.1-makefile.inc diff --git a/arcfour/arcfour.c b/arcfour/arcfour.c index 0d929ef..6c068a8 100644 --- a/arcfour/arcfour.c +++ b/arcfour/arcfour.c @@ -36,18 +36,24 @@ void arcfour_init(const void *key, uint16_t length_b, arcfour_ctx_t *ctx){ uint8_t t; uint8_t length_B = length_b/8; - uint16_t x,y=0; - for(x=0; x<= 255; ++x) + uint8_t x=0,y=0; + uint8_t *kptr=key; + do{ ctx->s[x]=x; + }while(++x); - for(x=0; x<= 255; ++x){ - y += ctx->s[x] + ((uint8_t*)key)[x % length_B]; + do{ + y += ctx->s[x] + *kptr++; + if(x==length_B){ + kptr = key; + } y &= 0xff; /* ctx->s[y] <--> ctx->s[x] */ t = ctx->s[y]; ctx->s[y] = ctx->s[x]; ctx->s[x] = t; - } + }while(++x); + ctx->i = ctx->j = 0; } diff --git a/avr-ethernut2.1-makefile.inc b/avr-ethernut2.1-makefile.inc new file mode 100644 index 0000000..b296705 --- /dev/null +++ b/avr-ethernut2.1-makefile.inc @@ -0,0 +1,51 @@ + +MCU_TARGET = atmega128 +F_CPU = 14745600 +OPTIMIZE = -Os # -Os +DEBUG = -gdwarf-2 +WARNING = -pedantic -Wall -Wstrict-prototypes +PROGRAMMER = jtagmkII +PROG_PORT = usb +DEFS = -D$(call uc, $(MCU_TARGET)) -DF_CPU=$(F_CPU) +FLASHCMD = avrdude -p $(MCU_TARGET) -P $(PROG_PORT) -c $(PROGRAMMER) -U flash:w:# no space at the end +#FLASHCMD = avrdude -p $(MCU_TARGET) -c usbasp -U flash:w:# no space at the end +RESETCMD = avrdude -p $(MCU_TARGET) -P $(PROG_PORT) -c $(PROGRAMMER) +DEP_DIR = deps/ +TEST_DIR = test/ +BIN_DIR = bin/ +TESTSRC_DIR = test_src/ +#uisp -dprog=bsd -dlpt=/dev/parport1 --upload if=$(PRG).hex +ERASECMD = +TESTPORT = /dev/ttyUSB0 +TESTPORTBAUDR = 115200 +TESTLOG_DIR = testlog/# +TESTPREFIX = nessie- +SPEEDTOOL = host/get_performance.rb +SPEEDLOG_DIR = speed_log/ +SPEEDPREFIX = +SPEEDCMD = performance +SIZE_DIR = size_log/# +LIST_DIR = listings/# +STAT_DIR = stats/# +AUTOASM_DIR = autoasm/# +AUTOASM_OPT = -S +CC = avr-gcc +CSTD = c99 + +override CFLAGS_A = -MMD -MF$(DEP_DIR)$(patsubst %.o,%.d,$(notdir $(1))) $(DEBUG) $(WARNING) -std=$(CSTD) $(OPTIMIZE) -mmcu=$(MCU_TARGET) $(DEFS) +override CFLAGS = -MMD -MF$(DEP_DIR)$(patsubst %.o,%.d,$(notdir $@)) $(DEBUG) $(WARNING) -std=$(CSTD) $(OPTIMIZE) -mmcu=$(MCU_TARGET) $(DEFS) + +override LDFLAGS = -gdwarf-2 -Wl,-Map, +override ASFLAGS = -mmcu=$(MCU_TARGET) -Wa,--gdwarf-2 + +SIZESTAT_FILE = sizestats.txt + +OBJCOPY = avr-objcopy +OBJDUMP = avr-objdump +SIZE = avr-size +READELF = readelf +RUBY = ruby +GET_TEST = host/get_test.rb +MAKE = make +MAKE2GRAPH = ~/bin/make2graph.rb +TWOPI = twopi diff --git a/avr-makefile.inc b/avr-makefile.inc index 9de0323..4396b24 100644 --- a/avr-makefile.inc +++ b/avr-makefile.inc @@ -1,13 +1,15 @@ MCU_TARGET = atmega644 +F_CPU = 20000000 OPTIMIZE = -Os # -Os DEBUG = -gdwarf-2 WARNING = -pedantic -Wall -Wstrict-prototypes PROGRAMMER = avr911 -DEFS = -D$(call uc, $(MCU_TARGET)) -FLASHCMD = avrdude -p $(MCU_TARGET) -P /dev/ttyUSB0 -c $(PROGRAMMER) -U flash:w:# no space at the end +PROG_PORT = /dev/ttyUSB0 +DEFS = -D$(call uc, $(MCU_TARGET)) -DF_CPU=$(F_CPU) +FLASHCMD = avrdude -p $(MCU_TARGET) -P $(PROG_PORT) -c $(PROGRAMMER) -U flash:w:# no space at the end #FLASHCMD = avrdude -p $(MCU_TARGET) -c usbasp -U flash:w:# no space at the end -RESETCMD = avrdude -p $(MCU_TARGET) -P /dev/ttyUSB0 -c $(PROGRAMMER) +RESETCMD = avrdude -p $(MCU_TARGET) -P $(PROG_PORT) -c $(PROGRAMMER) DEP_DIR = deps/ TEST_DIR = test/ BIN_DIR = bin/ diff --git a/bcal/bcal_xtea.c b/bcal/bcal_xtea.c index 5902c10..bd6263b 100644 --- a/bcal/bcal_xtea.c +++ b/bcal/bcal_xtea.c @@ -48,7 +48,7 @@ void xtea_dummy_dec(void* block, void* key){ const bcdesc_t xtea_desc PROGMEM = { BCDESC_TYPE_BLOCKCIPHER, - BC_INIT_TYPE_2, + BC_INIT_TYPE_1, xtea_str, 16, 64, diff --git a/mkfiles/a5_1.mk b/mkfiles/a5_1.mk index 5dfcadd..8c0c030 100644 --- a/mkfiles/a5_1.mk +++ b/mkfiles/a5_1.mk @@ -5,7 +5,9 @@ ALGO_NAME := A51 STREAM_CIPHERS += $(ALGO_NAME) $(ALGO_NAME)_OBJ := A5_1.o -$(ALGO_NAME)_TEST_BIN := main-a5_1-test.o nessie_stream_test.o nessie_common.o $(CLI_STD) +$(ALGO_NAME)_DIR := a51/ +$(ALGO_NAME)_INCDIR := memxor/ scal/ +$(ALGO_NAME)_TEST_BIN := main-a5_1-test.o $(CLI_STD) $(SCAL_STD) $(ALGO_NAME)_NESSIE_TEST := "nessie" $(ALGO_NAME)_PERFORMANCE_TEST := "performance" diff --git a/test_src/avr-asm-macros.S b/test_src/avr-asm-macros.S index 14f907c..80d8006 100644 --- a/test_src/avr-asm-macros.S +++ b/test_src/avr-asm-macros.S @@ -157,7 +157,7 @@ .macro CLEAR_BIT_IO io:req bit:req reg:req .if _SFR_IO_REG_P(\io) - cbi _SFR_IO_ADDR(\io), bit + cbi _SFR_IO_ADDR(\io), \bit .else lds \reg, _SFR_MEM_ADDR(\io) andi \reg, ~_BV(\bit) @@ -167,7 +167,7 @@ .macro SET_BIT_IO io:req bit:req reg:req .if _SFR_IO_REG_P(\io) - sbi _SFR_IO_ADDR(\io),bit + sbi _SFR_IO_ADDR(\io), \bit .else lds \reg, _SFR_MEM_ADDR(\io) ori \reg, _BV(\bit) diff --git a/test_src/config.h b/test_src/config.h index 63d1902..513729e 100644 --- a/test_src/config.h +++ b/test_src/config.h @@ -19,7 +19,7 @@ #ifndef __CONFIG_H__ #define __CONFIG_H__ #include -#define F_CPU 20000000 +//#define F_CPU 20000000 // #define F_CPU 16000000 /* oscillator-frequency in Hz */ // #define F_CPU 14745600 diff --git a/test_src/dump-asm.S b/test_src/dump-asm.S index f38b48a..26f04bd 100644 --- a/test_src/dump-asm.S +++ b/test_src/dump-asm.S @@ -415,6 +415,10 @@ ram_read_block: * param addr: r20:r23 * param length: r18 */ +#ifdef EEWE +# define EEPE EEWE +#endif + .global ee_read_block ee_read_block: movw r26, r24 diff --git a/test_src/main-xtea-test.c b/test_src/main-xtea-test.c index 99a8eb7..778d9cd 100644 --- a/test_src/main-xtea-test.c +++ b/test_src/main-xtea-test.c @@ -54,6 +54,25 @@ void testrun_performance_xtea(void){ bcal_performance_multiple(algolist); } +void test_xtea(void){ + uint8_t key[16]; + uint8_t data[8]; + + memset(key, 0, 16); + key[0] = 0x80; + memset(data, 0, 8); + cli_putstr_P(PSTR("\r\n*** XTEA test ***\r\n key: ")); + cli_hexdump(key, 16); + cli_putstr_P(PSTR("\r\n plain: ")); + cli_hexdump(data, 8); + xtea_enc(data, data, key); + cli_putstr_P(PSTR("\r\n crypt: ")); + cli_hexdump(data, 8); + xtea_dec(data, data, key); + cli_putstr_P(PSTR("\r\n plain: ")); + cli_hexdump(data, 8); +} + /***************************************************************************** * main * *****************************************************************************/ @@ -65,7 +84,7 @@ const char echo_str[] PROGMEM = "echo"; cmdlist_entry_t cmdlist[] PROGMEM = { { nessie_str, NULL, testrun_nessie_xtea}, - { test_str, NULL, testrun_nessie_xtea}, + { test_str, NULL, test_xtea}, { performance_str, NULL, testrun_performance_xtea}, { echo_str, (void*)1, (void_fpt)echo_ctrl}, { NULL, NULL, NULL} diff --git a/test_src/uart_defs.h b/test_src/uart_defs.h index dac0a48..8b71e86 100644 --- a/test_src/uart_defs.h +++ b/test_src/uart_defs.h @@ -1,6 +1,6 @@ /* uart_defs.h */ /* - This file is part of the AVR-uart_ni. + This file is part of the AVR-uart_i. Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de) This program is free software: you can redistribute it and/or modify @@ -16,19 +16,30 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ - +/** + * \file uart_defs.h + * \email daniel.otte@rub.de + * \author Daniel Otte + * \date 2009-07-24 + * \license GPLv3 or later + * \addtogroup uart_config + * \brief definitions for uart configuration + * \details + * This file declares some macros for use in uart configuration + */ +/*@{*/ #ifndef UART_DEFS_H_ #define UART_DEFS_H_ -#define UART_PARATY_NONE 0 /** \def UART_PARATY_NONE define no paraty */ -#define UART_PARATY_EVEN 2 /** \def UART_PARATY_EVEN define even paraty */ -#define UART_PARATY_ODD 3 /** \def UART_PARATY_ODD define odd paraty */ -#define UART_STOPBITS_1 0 /** \def UART_STOPBITS_1 define 1 stop bit */ -#define UART_STOPBITS_2 1 /** \def UART_STOPBITS_2 define 2 stop bits */ -#define UART_DATABITS_5 0 /** \def UART_DATABITS_5 define 5 data bits */ -#define UART_DATABITS_6 1 /** \def UART_DATABITS_6 define 6 data bits */ -#define UART_DATABITS_7 2 /** \def UART_DATABITS_7 define 7 data bits */ -#define UART_DATABITS_8 3 /** \def UART_DATABITS_8 define 8 data bits */ -#define UART_DATABITS_9 7 /** \def UART_DATABITS_9 define 9 data bits */ - +#define UART_PARATY_NONE 0 /**< define no paraty */ +#define UART_PARATY_EVEN 2 /**< define even paraty */ +#define UART_PARATY_ODD 3 /**< define odd paraty */ +#define UART_STOPBITS_1 0 /**< define 1 stop bit */ +#define UART_STOPBITS_2 1 /**< define 2 stop bits */ +#define UART_DATABITS_5 0 /**< define 5 data bits */ +#define UART_DATABITS_6 1 /**< define 6 data bits */ +#define UART_DATABITS_7 2 /**< define 7 data bits */ +#define UART_DATABITS_8 3 /**< define 8 data bits */ +#define UART_DATABITS_9 7 /**< define 9 data bits */ +/*@}*/ #endif /* UART_DEFS_H_ */ diff --git a/test_src/uart_i.h b/test_src/uart_i.h index aff5f85..b9a65c0 100644 --- a/test_src/uart_i.h +++ b/test_src/uart_i.h @@ -19,13 +19,54 @@ /** * \file uart_i.h * \email daniel.otte@rub.de - * \author Daniel Otte + * \author Daniel Otte * \date 2009-07-24 * \license GPLv3 or later - * \ingroup uart_i - * \brief declaration for non-interrupt uart + * \defgroup uart_i + * \brief declaration for interrupt based uart + * \details + * This implementation of the uart-interface of AVR microcontrollers uses the + * interrup architecture and can be used to handle serial communication in the + * background. + * The uart is configured at compile-time by some special defines starting with + * \a UART0_ for configuring the first uart and \a UART1_ for the second. + * Some settings use symbolic values defined in uart_defs.h . + * The following options are available: + * - \a *_I enables the interrupt based driver for this uart + * - \a 0 disables driver + * - \a 1 enables driver + * - \a *_BAUD_RATE sets the baudrate for this uart (value is the baudrate) + * - \a *_STOPBITS sets the amount of stop bits for this uart + * - \a UART_STOPBITS_1 for one stop bit + * - \a UART_STOPBITS_2 for two stop bits + * - \a *_DATABITS sets the amount of data bits for this uart + * - \a UART_DATABITS_5 for five data bits + * - \a UART_DATABITS_6 for six data bits + * - \a UART_DATABITS_7 for seven data bits + * - \a UART_DATABITS_8 for eight data bits + * - \a *_PARATY sets the mode for paraty calculation for this uart + * - \a UART_PARATY_NONE ignore paraty + * - \a UART_PARATY_ODD odd paraty + * - \a UART_PARATY_EVEN even paraty + * - \a *_RXBUFFER_SIZE size of the recieve buffer in bytes + * - \a *_TXBUFFER_SIZE size of the transmitt buffer in bytes + * - \a *_SWFLOWCTRL enable/diasable software flow control (via XON & XOFF) + * - \a 0 disable software flow control + * - \a 1 enable software flow control + * - \a *_THRESH_HIGH set upper limit for the rx buffer, which causes an XOFF + * to be send when crossed (only relevant if software flow + * control is enabled) + * - \a *_THRESH_LOW set lower limit for the rx buffer, which causes an XON to + * be send when crossed and an XOFF has been send previously + * (only relevant if software flow control is enabled) + * - \a *_HOOK enable/disable implementation of the hook feature + * (\ref uart0_sethook()) + * - \a 0 disable hook feature + * - \a 1 enable hook feature + * */ +/*@{*/ #ifndef UART_I_H_ #define UART_I_H_ @@ -33,61 +74,82 @@ #include "circularbytebuffer.h" #include +/** + * \brief storage type for uart0 context + * + * This type is used to store uart0 specific global variables. + * It contains a pointer to the buffer instances and when neccessary + * a pointer to the hook function and an indicator if the hook is + * currently executed. + * If software flow control is enabled it also contains flags for flow control. + */ typedef struct{ - circularbytebuffer_t rxb; - circularbytebuffer_t txb; + circularbytebuffer_t rxb; /**< recieve buffer */ + circularbytebuffer_t txb; /**< transmitt buffer*/ #if UART0_HOOK - void(*hook)(uint8_t); - volatile uint8_t hook_running; + void(*hook)(uint8_t); /**< pointer to the hook function */ + volatile uint8_t hook_running; /**< flag indicating if the hook is running */ #endif #if UART0_SWFLOWCTRL - volatile uint8_t txon; - volatile uint8_t rxon; + volatile uint8_t txon; /**< flag indicating if we are allowed to send data */ + volatile uint8_t rxon; /**< flag indicating if we have send an \a XOFF */ #endif } uart0_ctx_t; +/** + * \brief storage type for uart1 context + * + * This type is used to store uart1 specific global variables. + * It contains a pointer to the buffer instances and when neccessary + * a pointer to the hook function and an indicator if the hook is + * currently executed. + * If software flow control is enabled it also contains flags for flow control. + */ typedef struct{ - circularbytebuffer_t rxb; - circularbytebuffer_t txb; + circularbytebuffer_t rxb; /**< recieve buffer */ + circularbytebuffer_t txb; /**< transmitt buffer */ #if UART1_HOOK - void(*hook)(uint8_t); - volatile uint8_t hook_running; + void(*hook)(uint8_t); /**< pointer to the hook function */ + volatile uint8_t hook_running; /**< flag indicating if the hook is running */ #endif #if UART1_SWFLOWCTRL - volatile uint8_t txon; - volatile uint8_t rxon; + volatile uint8_t txon; /**< flag indicating if we are allowed to send data */ + volatile uint8_t rxon; /**< flag indicating if we have send an \a XOFF */ #endif } uart1_ctx_t; #if UART0_I -/** \fn uart0_init(void) +/** * \brief initialize uart0. - * This function initializes the first uart according to the parameter specifyed + * + * This function initializes the first uart according to the parameter specified * in config.h . */ void uart0_init(void); -/** \fn uart0_putc(uint16_t) +/** * \brief send data through uart0. - * This function sends data through the first uart + * + * This function sends data through the first uart * (the data size is debfined in config.h). * \param c data to send */ void uart0_putc(uint16_t c); -/** \fn uart0_getc(void) +/** * \brief read data from uart0. - * This function reads data from the first uart + * + * This function reads data from the first uart * (the data size is debfined in config.h). * \return data recived by uart0 */ uint16_t uart0_getc(void); -/** \fn uart0_dataavail(void) +/** * \brief checks if data is available. - * + * * This function checks the state of the input buffer of uart0 and * returns if data is available or not. * \return zero if no data is available else a value different from zero is returned @@ -95,6 +157,20 @@ uint16_t uart0_getc(void); uint8_t uart0_dataavail(void); #if UART0_HOOK +/** + * \brief sets the hook for uart0. + * + * This function modifys the way the software handels incomming data. + * When the hook is set to \a NULL (which is the default) incomming data is buffered + * in a special ringbuffer and read by \ref uart0_getc(). If the hook is set to a + * different value, this value is interpret as a function pointer. The hook (the + * function where the function pointer points to) is called with the recieved data + * as single parameter. Any value returned by the hook is discarded. + + * \note If the hook is set \ref uart0_getc() will not return, as the + * ringbuffer is bypassed. + * \param fpt pointer to thae handler function for recieved data + */ void uart0_sethook(void(*fpt)(uint8_t)); #endif @@ -102,38 +178,58 @@ void uart0_sethook(void(*fpt)(uint8_t)); #endif /* UART0_I */ #if UART1_I -/** \fn uart1_init(void) +/** * \brief initialize uart1. + * * This function initializes the second uart according to the parameter specifyed * in config.h . */ void uart1_init(void); -/** \fn uart1_putc(uint16_t) +/** * \brief send data through uart1. - * This function sends data through the second uart + * + * This function sends data through the second uart * (the data size is debfined in config.h). * \param c data to send */ void uart1_putc(uint16_t c); -/** \fn uart1_getc(void) +/** * \brief read data from uart1. - * This function reads data from the second uart + * + * This function reads data from the second uart * (the data size is debfined in config.h). * \return data recived by uart1 */ uint16_t uart1_getc(void); -/** \fn uart1_dataavail(void) +/** * \brief checks if data is available. + * * This function checks the state of the input buffer of uart1 and * returns if data is available or not. * \return zero if no data is available else a value different from zero is returned */ uint8_t uart1_dataavail(void); -void uart0_sethook(void(*fpt)(uint8_t)); +/** + * \brief sets the hook for uart1. + * + * This function modifys the way the software handels incomming data. + * When the hook is set to \a NULL (which is the default) incomming data is buffered + * in a special ringbuffer and read by \ref uart1_getc(). If the hook is set to a + * different value, this value is interpret as a function pointer. The hook (the + * function where the function pointer points to) is called with the recieved data + * as single parameter. Any value returned by the hook is discarded. + + * \note If the hook is set \ref uart1_getc() will not return, as the + * ringbuffer is bypassed. + * \param fpt pointer to thae handler function for recieved data + */ +void uart1_sethook(void(*fpt)(uint8_t)); #endif +/*@}*/ + #endif /* UART_I_H_ */ diff --git a/xtea/xtea-asm.S b/xtea/xtea-asm.S index 826f123..f8aac8c 100644 --- a/xtea/xtea-asm.S +++ b/xtea/xtea-asm.S @@ -1,7 +1,7 @@ -/* xtea-asm.S */ +/* xtea-enc.S */ /* - This file is part of the AVR-Crypto-Lib. - Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + This file is part of the ARM-Crypto-Lib. + Copyright (C) 2006-2011 Daniel Otte (daniel.otte@rub.de) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,570 +16,221 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -/* xtea-asm.S - * Author: Daniel Otte - * Date: 2006-06-06 - * License: GPLv3 or later - * Implementation of XTEA for AVR - * include xtea.h in your C-Project to use this functions. -*/ -V01 = 2 -V02 = 3 -V03 = 4 -V04 = 5 -V11 = 6 -V12 = 7 -V13 = 8 -V14 = 9 -Accu1 = 14 -Accu2 = 15 -Accu3 = 16 -Accu4 = 17 -Sum1 = 18 -Sum2 = 19 -Sum3 = 20 -Sum4 = 21 -Func1 = 22 -Func2 = 23 -Func3 = 24 -Func4 = 25 -C = 28 /* der kleine Zaehler fuer zwischendurch */ +#include "avr-asm-macros.S" + +B0 = 4 +B1 = 5 +B2 = 6 +B3 = 7 + +A0 = 8 +A1 = 9 +A2 = 10 +A3 = 11 + +V10 = 12 +V11 = 13 +V12 = 14 +V13 = 15 + +V00 = 16 +V01 = 17 +V02 = 18 +V03 = 19 + +S0 = 20 +S1 = 21 +S2 = 22 +S3 = 23 + +xchg_V0V1: + movw r26, V10 + movw V10, V00 + movw V00, r26 + movw r26, V12 + movw V12, V02 + movw V02, r26 + ret + +eor_AB: + eor A0, B0 + eor A1, B1 + eor A2, B2 + eor A3, B3 + ret + +g_func: + movw A0, V10 + movw A2, V12 + movw B0, V10 + movw B2, V12 + + ldi r24, 4 +10: + lsl A0 + rol A1 + rol A2 + rol A3 + dec r24 + brne 10b + + ldi r24, 5 +10: + lsr B3 + ror B2 + ror B1 + ror B0 + dec r24 + brne 10b + + rcall eor_AB + + add A0, V10 + adc A1, V11 + adc A2, V12 + adc A3, V13 + + ret + +sum_plus_k: + andi r24, (3<<2) + movw r26, r30 + add r26, r24 + adc r27, r1 + ld B0, X+ + ld B1, X+ + ld B2, X+ + ld B3, X+ + add B0, S0 + adc B1, S1 + adc B2, S2 + adc B3, S3 + rcall eor_AB + brtc 20f + add V00, A0 + adc V01, A1 + adc V02, A2 + adc V03, A3 + ret +20: sub V00, A0 + sbc V01, A1 + sbc V02, A2 + sbc V03, A3 + ret + +main1: + rcall g_func + mov r24, S0 + lsl r24 + lsl r24 + rcall sum_plus_k + ret + +main2: + rcall xchg_V0V1 + rcall g_func + mov r24, S1 + lsr r24 + rcall sum_plus_k + rcall xchg_V0V1 + ret .global xtea_enc -; == xtea_enc == -; xtea encrytion function -; param1: 16-bit pointer to destination for encrypted block -; given in r25,r24 -; param2: 16-bit pointer to the block (64-bit) which is to encrypt -; given in r23,r22 -; param3: 16-bit pointer to the key (128-bit) -; given in r21,r20 -; xtea_enc: - /* prolog */ - push r2 - push r3 - push r4 - push r5 - push r6 - push r7 - push r8 - push r9 - push r14 - push r15 - push r16 - push r17 - push r28 - - /* load the block */ - movw r26, r22 /* X points to block */ - movw r30, r20 /* Z points to key */ - ld V01, X+ - ld V02, X+ - ld V03, X+ - ld V04, X+ - ld V11, X+ - ld V12, X+ - ld V13, X+ - ld V14, X+ -; push r25 -; push r24 - movw r26, r24 /* X points to destination */ - - ldi Func1, 32 - mov r0, Func1 /* r0 is cycle-counter */ - clr Sum1 - clr Sum2 - movw Sum3, Sum1 - clt - -1: - movw Accu1, V11 - movw Accu3, V13 - ldi C, 4 -2: lsl Accu1 - rol Accu2 - rol Accu3 - rol Accu4 - dec C - brne 2b /* Accu == V1 << 4 */ - - movw Func1, V11 - movw Func3, V13 - ldi C, 5 -3: lsr Func4 - ror Func3 - ror Func2 - ror Func1 - dec C - brne 3b /* Func == V1 >> 5 */ - - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 - add Accu1, V11 - adc Accu2, V12 - adc Accu3, V13 - adc Accu4, V14 /* Accu == ( (V1<<4)^(V1>>5) ) + V1 */ - - brtc 4f - mov C, Sum2 - lsr C - andi C,(0x03 <<2) - clt - rjmp 5f -4: - mov C, Sum1 /* calc key offset */ - andi C, 0x03 - lsl C - lsl C set - -5: - add r30, C - adc r31, r1 - ld Func1, Z - ldd Func2, Z+1 - ldd Func3, Z+2 - ldd Func4, Z+3 /* Func = key[sum & 3] */ - sub r30, C - sbci r31, 0 - add Func1, Sum1 - adc Func2, Sum2 - adc Func3, Sum3 - adc Func4, Sum4 - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 /* Accu = ((V1<<4 ^ V1>>5) + V1) ^ (sum + key[sum&3]) */ - add Accu1, V01 - adc Accu2, V02 - adc Accu3, V03 - adc Accu4, V04 - - movw V01, V11 - movw V03, V13 - movw V11, Accu1 - movw V13, Accu3 - - /* sum += delta */ /* delta == 0x9E3779B9 */ - brtc 6f - ldi C, 0xB9 - add Sum1, C - ldi C, 0x79 - adc Sum2, C - ldi C, 0x37 - adc Sum3, C - ldi C, 0x9E - adc Sum4, C - rjmp 1b - -6: - dec r0 - breq 7f - rjmp 1b - - 7: - /* write block back */ - ; pop r26 - ; pop r27 - st X+, V01 - st X+, V02 - st X+, V03 - st X+, V04 - st X+, V11 - st X+, V12 - st X+, V13 - st X+, V14 - - /* epilog */ - pop r28 - pop r17 - pop r16 - pop r15 - pop r14 - pop r9 - pop r8 - pop r7 - pop r6 - pop r5 - pop r4 - pop r3 - pop r2 - ret +xtea_intro: + clr r27 + ldi r26, 4 + ldi r30, 14 +10: + ld r0, X+ + push r0 + dec r30 + brne 10b + + push r24 + push r25 + movw r30, r20 +/* load block */ + movw r26, r22 + ld V00, X+ + ld V01, X+ + ld V02, X+ + ld V03, X+ + ld V10, X+ + ld V11, X+ + ld V12, X+ + ld V13, X+ + ldi r24, 32 + mov r0, r24 + brtc xtea_dec_start + clr S0 + clr S1 + movw S2, S0 + +10: + rcall main1 + subi S0, 0x47 + sbci S1, 0x86 + sbci S2, 0xC8 + sbci S3, 0x61 + rcall main2 + + dec r0 + brne 10b + +/* store back */ +xtea_enc_exit: + pop r27 + pop r26 + st X+, V00 + st X+, V01 + st X+, V02 + st X+, V03 + st X+, V10 + st X+, V11 + st X+, V12 + st X+, V13 + + clr r27 + ldi r26, 18 + ldi r24, 14 +10: + pop r0 + st -X, r0 + dec r24 + brne 10b + ret + + +/******************************************************************************/ +/******************************************************************************/ +/******************************************************************************/ +/******************************************************************************/ -;#################################################################### - - /* #endif TWO_IN_ONE */ - - /* #ifdef TWO_IN_ONE */ - /* now we use the same base-structure for enc- and decryption - to indicate operation mode we use the highest bit of param3 (16 bit pointer to key), - this is ok, since even the larges atmel today has "only" 8k of ram, - but you shouldn't use this feature while using external ram. - */ -.global xtea_enc - ori r21, 0x80 - .global xtea_dec -; == xtea_dec == -; xtea decrytion function -; param1: 16-bit pointer to destination for decrypted block -; given in r25,r24 -; param2: 16-bit pointer to the block (64-bit) which is to derypt -; given in r23,r22 -; param3: 16-bit pointer to the key (128-bit) -; given in r21,r20 -; -/* -void xtea_dec(uint32_t* dest, uint32_t* v, uint32_t* k) { - uint32_t v0=v[0], v1=v[1], i; - uint32_t sum=0xC6EF3720, delta=0x9E3779B9; - for(i=0; i<32; i++) { - v1 -= ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); - sum -= delta; - v0 -= ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); - } - dest[0]=v0; dest[1]=v1; -} -*/ - xtea_dec: - /* prolog */ - push r2 - push r3 - push r4 - push r5 - push r6 - push r7 - push r8 - push r9 - push r14 - push r15 - push r16 - push r17 - push r28 - /* load the block */ - movw r26, r22 /* Z points to block */ - movw r30, r20 /* X points to key */ - ld V01, X+ - ld V02, X+ - ld V03, X+ - ld V04, X+ - ld V11, X+ - ld V12, X+ - ld V13, X+ - ld V14, X+ - movw r26, r24 /* Z points to destination */ - - ldi Sum1, 32 - mov r0, Sum1 /* r1 is cycle-counter */ - ldi Sum1, 0x20 /* sum = 0xC6EF3720 */ - ldi Sum2, 0x37 - ldi Sum3, 0xEF - ldi Sum4, 0xC6 clt + rjmp xtea_intro +xtea_dec_start: + ldi S0, 0x20 /* sum = 0xC6EF3720 */ + ldi S1, 0x37 + ldi S2, 0xEF + ldi S3, 0xC6 -1: - movw Accu1, V01 - movw Accu3, V03 - ldi C, 4 -2: lsl Accu1 - rol Accu2 - rol Accu3 - rol Accu4 - dec C - brne 2b /* Accu == V0 << 4 */ +10: + rcall main2 + subi S0, 0xB9 + sbci S1, 0x79 + sbci S2, 0x37 + sbci S3, 0x9E + rcall main1 - movw Func1, V01 - movw Func3, V03 - ldi C, 5 -3: lsr Func4 - ror Func3 - ror Func2 - ror Func1 - dec C - brne 3b /* Func == V0 >> 5 */ - - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 - add Accu1, V01 - adc Accu2, V02 - adc Accu3, V03 - adc Accu4, V04 /* Accu == ( (V0<<4)^(V0>>5) ) + V0 */ - - brts 4f - mov C, Sum2 - lsr C - andi C,(0x03 <<2) - set - rjmp 5f -4: - mov C, Sum1 /* calc key offset */ - andi C, 0x03 - lsl C - lsl C - clt - -5: - add r30, C - adc r31, r1 - ld Func1, Z - ldd Func2, Z+1 - ldd Func3, Z+2 - ldd Func4, Z+3 /* Func = key[sum & 3] */ - sub r30, C - sbci r31, 0 - add Func1, Sum1 - adc Func2, Sum2 - adc Func3, Sum3 - adc Func4, Sum4 - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 /* Accu = ((V0<<4 ^ V0>>5) + V0) ^ (sum + key[sum&3]) */ - sub V11, Accu1 - sbc V12, Accu2 - sbc V13, Accu3 - sbc V14, Accu4 - - movw Accu1, V01 - movw Accu3, V03 - movw V01, V11 - movw V03, V13 - movw V11, Accu1 - movw V13, Accu3 - - /* sum += delta */ /* delta == 0x9E3779B9 */ - brtc 6f - subi Sum1, 0xB9 - sbci Sum2, 0x79 - sbci Sum3, 0x37 - sbci Sum4, 0x9E - rjmp 1b - -6: dec r0 - breq 7f - rjmp 1b - -7: - /* write block back */ - st X+, V01 - st X+, V02 - st X+, V03 - st X+, V04 - st X+, V11 - st X+, V12 - st X+, V13 - st X+, V14 - - /* epilog */ - pop r28 - pop r17 - pop r16 - pop r15 - pop r14 - pop r9 - pop r8 - pop r7 - pop r6 - pop r5 - pop r4 - pop r3 - pop r2 - ret - - /* #endif */ + brne 10b +/* store back */ + rjmp xtea_enc_exit -;#################################################################### - - #ifdef TWO_IN_ONE - /* now we use the same base-structure for enc- and decryption - to indicate operation mode we use the highest bit of param3 (16 bit pointer to key), - this is ok, since even the larges atmel today has "only" 8k of ram, - but you shouldn't use this feature while using external ram. - */ -.global xtea_enc - ori r21, 0x80 - -.global xtea_dec -; == xtea_dec == -; xtea decrytion function -; param1: 16-bit pointer to destination for decrypted block -; given in r25,r24 -; param2: 16-bit pointer to the block (64-bit) which is to derypt -; given in r23,r22 -; param3: 16-bit pointer to the key (128-bit) -; given in r21,r20 -; -/* -void xtea_dec(uint32_t* dest, uint32_t* v, uint32_t* k) { - uint32_t v0=v[0], v1=v[1], i; - uint32_t sum=0xC6EF3720, delta=0x9E3779B9; - for(i=0; i<32; i++) { - v1 -= ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); - sum -= delta; - v0 -= ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); - } - dest[0]=v0; dest[1]=v1; -} -*/ -xtea_dec: - /* prolog */ - push r2 - push r3 - push r4 - push r5 - push r6 - push r7 - push r8 - push r9 - push r14 - push r15 - push r16 - push r17 - push r28 - /* set T-bit if we are going to encrypt, clear otherwise */ - bst r21, 7 - andi r21, 0x7f /* fix r21:r22 to a real addr */ - /* load the block */ - movw r26, r22 /* Z points to block */ - movw r30, r20 /* X points to key */ - ld V01, X+ - ld V02, X+ - ld V03, X+ - ld V04, X+ - ld V11, X+ - ld V12, X+ - ld V13, X+ - ld V14, X+ - movw r26, r24 /* Z points to destination */ - - ldi Sum1, 32 - mov r0, Sum1 /* r1 is cycle-counter */ - ldi Sum1, 0x20 /* sum = 0xC6EF3720 */ - ldi Sum2, 0x37 - ldi Sum3, 0xEF - ldi Sum4, 0xC6 - clt - -1: - movw Accu1, V01 - movw Accu3, V03 - ldi C, 4 -2: lsl Accu1 - rol Accu2 - rol Accu3 - rol Accu4 - dec C - brne 2b /* Accu == V0 << 4 */ - - movw Func1, V01 - movw Func3, V03 - ldi C, 5 -3: lsr Func4 - ror Func3 - ror Func2 - ror Func1 - dec C - brne 3b /* Func == V0 >> 5 */ - - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 - add Accu1, V01 - adc Accu2, V02 - adc Accu3, V03 - adc Accu4, V04 /* Accu == ( (V0<<4)^(V0>>5) ) + V0 */ - - brts 4f - mov C, Sum2 - lsr C - andi C,(0x03 <<2) - set - rjmp 5f -4: - mov C, Sum1 /* calc key offset */ - andi C, 0x03 - lsl C - lsl C - clt - -5: - add r30, C - adc r31, r1 - ld Func1, Z - ldd Func2, Z+1 - ldd Func3, Z+2 - ldd Func4, Z+3 /* Func = key[sum & 3] */ - sub r30, C - sbci r31, 0 - add Func1, Sum1 - adc Func2, Sum2 - adc Func3, Sum3 - adc Func4, Sum4 - eor Accu1, Func1 - eor Accu2, Func2 - eor Accu3, Func3 - eor Accu4, Func4 /* Accu = ((V0<<4 ^ V0>>5) + V0) ^ (sum + key[sum&3]) */ - sub V11, Accu1 - sbc V12, Accu2 - sbc V13, Accu3 - sbc V14, Accu4 - - movw Accu1, V01 - movw Accu3, V03 - movw V01, V11 - movw V03, V13 - movw V11, Accu1 - movw V13, Accu3 - - /* sum += delta */ /* delta == 0x9E3779B9 */ - brtc 6f - subi Sum1, 0xB9 - sbci Sum2, 0x79 - sbci Sum3, 0x37 - sbci Sum4, 0x9E - rjmp 1b - -6: - dec r0 - breq 7f - rjmp 1b - -7: - /* write block back */ - st X+, V01 - st X+, V02 - st X+, V03 - st X+, V04 - st X+, V11 - st X+, V12 - st X+, V13 - st X+, V14 - - /* epilog */ - pop r28 - pop r17 - pop r16 - pop r15 - pop r14 - pop r9 - pop r8 - pop r7 - pop r6 - pop r5 - pop r4 - pop r3 - pop r2 - ret - - #endif diff --git a/xtea/xtea.c b/xtea/xtea.c index 4605cb0..c517e9b 100644 --- a/xtea/xtea.c +++ b/xtea/xtea.c @@ -24,7 +24,6 @@ */ #include - void xtea_enc(void* dest, const void* v, const void* k) { uint8_t i;