diff --git a/A5_1.c b/A5_1.c
index 526e6cf..a22d185 100644
--- a/A5_1.c
+++ b/A5_1.c
@@ -17,10 +17,11 @@
along with this program. If not, see .
*/
/*
- * File: A5_1.c
- * Author: Daniel Otte
- * Date: 24.06.2006
- * License: GPL
+ * File: A5_1.c
+ * Author: Daniel Otte
+ * email: daniel.otte@rub.de
+ * Date: 2006-06-24
+ * License: GPLv3 or later
* Description: Implementation of the A5/1 stream cipher algorithm, as used in GSM.
* ! Warning, this is weak crypto !
*
diff --git a/Makefile b/Makefile
index 09457be..c8a52b7 100644
--- a/Makefile
+++ b/Makefile
@@ -30,6 +30,10 @@ PRG = remove_me
#-------------------------------------------------------------------------------
+all: $(foreach algo, $(ALGORITHMS), $(algo)_OBJ)
+
+#-------------------------------------------------------------------------------
+
define BLA_TEMPLATE2
$(2): $(3)
@echo "[gcc]: $$@"
@@ -151,11 +155,6 @@ $(foreach algo, $(ALGORITHMS),$(eval $(call FLASH_TEMPLATE, $(algo), \
$(patsubst %.o,%.hex,$(firstword $($(algo)_TEST_BIN)))) ))
#-------------------------------------------------------------------------------
-
-.PHONY: all
-all: $(foreach algo, $(ALGORITHMS), $(algo)_OBJ)
-#all: $(PRG).elf lst text eeprom
-
.PHONY: clean
clean:
diff --git a/arcfour-asm.S b/arcfour-asm.S
index eafd771..ec0eeeb 100644
--- a/arcfour-asm.S
+++ b/arcfour-asm.S
@@ -17,10 +17,10 @@
along with this program. If not, see .
*/
/*
- * File: arcfour-asm.S
- * Author: Daniel Otte
- * Date: 07.06.2006
- * License: GPL
+ * File: arcfour-asm.S
+ * Author: Daniel Otte
+ * Date: 2006-06-07
+ * License: GPLv3 or later
* Description: Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm.
*
*/
diff --git a/arcfour.c b/arcfour.c
index abed9dd..f8d01a6 100644
--- a/arcfour.c
+++ b/arcfour.c
@@ -17,10 +17,11 @@
along with this program. If not, see .
*/
/*
- * File: arcfour.c
- * Author: Daniel Otte
- * Date: 07.06.2006
- * License: GPL
+ * File: arcfour.c
+ * Author: Daniel Otte
+ * email: daniel.otte@rub.de
+ * Date: 2006-06-07
+ * License: GPLv3 or later
* Description: Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm.
*
*/
diff --git a/camellia-asm.S b/camellia-asm.S
index bcc190e..6e58ca5 100644
--- a/camellia-asm.S
+++ b/camellia-asm.S
@@ -17,10 +17,10 @@
along with this program. If not, see .
*/
/*
- * File: camellis-asm.S
- * Author: Daniel Otte
- * Date: 10.11.2006
- * License: GPL
+ * File: camellia-asm.S
+ * Author: Daniel Otte
+ * Date: 2006-11-10
+ * License: GPLv3 or later
* Description: Implementation of the camellia block cipher algorithm.
*
*/
diff --git a/cast5.c b/cast5.c
index a938bfb..51e9e93 100644
--- a/cast5.c
+++ b/cast5.c
@@ -19,9 +19,10 @@
/*
* \file cast5.c
* \author Daniel Otte
- * \date 26.07.2006
+ * \email daniel.otte@rub.de
+ * \date 2006-07-26
* \par License:
- * GPL
+ * GPLv3 or later
* \brief Implementation of the CAST5 (aka CAST-128) cipher algorithm as described in RFC 2144
*
*/
diff --git a/cli.c b/cli.c
index 1b5467c..8c15f5f 100644
--- a/cli.c
+++ b/cli.c
@@ -20,7 +20,7 @@
*
* author: Daniel Otte
* email: daniel.otte@rub.de
- * license: GPLv3
+ * license: GPLv3 or later
*
* components to help implementing simple command based interaction
*
diff --git a/des.c b/des.c
index dc16750..d4b8ce1 100644
--- a/des.c
+++ b/des.c
@@ -17,12 +17,13 @@
along with this program. If not, see .
*/
/**
- * \file des.c
- * \author Daniel Otte
- * \date 2007-06-16
- * \brief DES and EDE-DES implementation
+ * \file des.c
+ * \author Daniel Otte
+ * \email daniel.otte@rub.de
+ * \date 2007-06-16
+ * \brief DES and EDE-DES implementation
* \par License
- * GPL
+ * GPLv3 or later
*
*/
#include "config.h"
diff --git a/entropium.c b/entropium.c
index 1bbf583..fdbf13c 100644
--- a/entropium.c
+++ b/entropium.c
@@ -17,11 +17,12 @@
along with this program. If not, see .
*/
/**
- * \file entropium.c
- * \author Daniel Otte
- * \date 17.05.2006
+ * \file entropium.c
+ * \author Daniel Otte
+ * \email daniel.otte@rub.de
+ * \date 2006-05-17
* \par License:
- * GPL
+ * GPLv3 or later
* \brief This file contains an implementaition of a pseudo-random-number generator.
*
* Extension 1:
diff --git a/grain.c b/grain.c
index 612d18b..05571f4 100644
--- a/grain.c
+++ b/grain.c
@@ -20,7 +20,7 @@
*
* author: Daniel Otte
* email: daniel.otte@rub.de
- * license: GPLv3
+ * license: GPLv3 or later
*
*/
diff --git a/grain_h_lutgen.c b/grain_h_lutgen.c
deleted file mode 100644
index 4b5ede0..0000000
--- a/grain_h_lutgen.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/**
- *
- * author: Daniel Otte
- * email: daniel.otte@rub.de
- * license: GPLv3
- *
- * this program generate a lookuptable for the h-function in grain
- */
-
-#include
-#include
-
-#define X(i) ((x)>>((i)))
-uint8_t h(uint8_t x){
- uint8_t h;
-
- h = (X(1)) ^ (X(4)) ^
- (X(0)&X(3)) ^ (X(2)&X(3)) ^ (X(3)&X(4)) ^
- (X(0)&X(1)&X(2)) ^ (X(0)&X(2)&X(3)) ^ (X(0)&X(2)&X(4)) ^
- (X(1)&X(2)&X(4)) ^ (X(2)&X(3)&X(4)) ;
-
- return h&1;
-}
-
-int main(void){
- uint8_t i;
- uint32_t lut;
- puts(
- "/* \n"
- " * author: Daniel Otte \n"
- " * email: daniel.otte@rub.de \n"
- " * license: GPLv3 \n"
- " * \n"
- " * this program generate a lookuptable for the h-function in grain \n"
- " * \n"
- " */ \n");
- puts("/* \n"
- " * x0 x1 x2 x3 x4 - h");
-
- for(i=0; i<0x20; ++i){
- printf(" * %c %c %c %c %c - %c\n",
- (i&0x01)?'1':'0',
- (i&0x02)?'1':'0',
- (i&0x04)?'1':'0',
- (i&0x08)?'1':'0',
- (i&0x10)?'1':'0',
- (h(i))?'1':'0' );
- lut >>=1;
- lut |= h(i)?0x80000000:0x00000000;
- if(i%4==3){
- puts(" * --");
- }
- }
- puts(" */\n");
- printf(" uint8_t lut[4]= {0x%2.2X, 0x%2.2X, 0x%2.2X, 0x%2.2X} \n",
- lut&0xFF, (lut>>8)&0xFF, (lut>>16)&0xFF, (lut>>24)&0xFF);
-
- return 0;
-}
-
diff --git a/grain_nfsr_lutgen.c b/grain_nfsr_lutgen.c
deleted file mode 100644
index 9b9277d..0000000
--- a/grain_nfsr_lutgen.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- *
- * author: Daniel Otte
- * email: daniel.otte@rub.de
- * license: GPLv3
- *
- * this program generate a lookuptable for the nfsr-feedback-function in grain
- */
-
-#include
-#include
-
-#define X(i) ((x)>>((i)))
-#define B63 X(0)
-#define B60 X(3)
-#define B52 X(5)
-#define B45 X(6)
-#define B37 X(4)
-#define B33 X(8)
-#define B28 X(2)
-#define B21 X(9)
-#define B15 X(1)
-#define B09 X(7)
-
-uint8_t g(uint16_t x){
- uint8_t a,b,d,e;
- uint8_t ret;
-
- ret = B60 ^ B52 ^ B45 ^ B37 ^ B33 ^ B28 ^ B21 ^ B09;
- ret ^= (a = B63 & B60);
- ret ^= (b = B37 & B33);
- ret ^= B15 & B09;
- ret ^= (d = B60 & B52 & B45);
- ret ^= (e = B33 & B28 & B21);
- ret ^= B63 & B45 & B28 & B09;
- ret ^= b & B60 & B52;
- ret ^= a & B21 & B15;
- ret ^= d & B63 & B37;
- ret ^= e & B15 & B09;
- ret ^= e & B52 & B45 & B37;
-
- return ret&1;
-}
-
-int main(void){
- uint16_t i;
- uint8_t t, lut[128]={0}; /* 2**10 / 8 == 2**(10-3) == 2**7 == 128 */
- puts(
- "/* \n"
- " * author: Daniel Otte \n"
- " * email: daniel.otte@rub.de \n"
- " * license: GPLv3 \n"
- " * \n"
- " * this program generate a lookuptable for the h-function in grain \n"
- " * \n"
- " */ \n");
- puts("/* \n"
- " * b63 b15 b28 b60 b37 b52 b45 b09 b33 b21 - g");
-
- for(i=0; i<0x0400; ++i){
- t = g(i);
- printf(" * %c %c %c %c %c %c %c %c %c %c - %c\n",
- (i&0x01)?'1':'0',
- (i&0x02)?'1':'0',
- (i&0x04)?'1':'0',
- (i&0x08)?'1':'0',
- (i&0x10)?'1':'0',
- (i&0x20)?'1':'0',
- (i&0x40)?'1':'0',
- (i&0x80)?'1':'0',
- (i&0x0100)?'1':'0',
- (i&0x0200)?'1':'0',
- t?'1':'0' );
- lut[i/8] |= t<<(i%8);
-// if(i%4==3){
-// puts(" * --");
-// }
- }
- puts(" */\n");
-
- printf(" uint8_t g_lut[128]= {");
- for(i=0; i<128; ++i){
- if(i%16==0){
- printf("\n\t");
- }
- printf("0x%2.2X%c ", lut[i], (i!=127)?',':' ');
- }
- printf("};\n\n");
- return 0;
-}
-
diff --git a/hmac-sha256.c b/hmac-sha256.c
index c57ba95..a0ad1dc 100644
--- a/hmac-sha256.c
+++ b/hmac-sha256.c
@@ -19,9 +19,9 @@
/**
*
* implementation of HMAC as described in RFC2104
- * Author: Daniel Otte
- *
- * License: GPL
+ * Author: Daniel Otte
+ * email: daniel.otte@rub.de
+ * License: GPLv3 or later
**/
/*
diff --git a/main-seed-test.c b/main-seed-test.c
index 6bff1d5..813cb5c 100644
--- a/main-seed-test.c
+++ b/main-seed-test.c
@@ -18,11 +18,12 @@
*/
/**
* \file main-seed-test.c
- * \author Daniel Otte
+ * \author Daniel Otte
+ * \email daniel.otte@rub.de
* \date 2007-06-01
* \brief test suit for SEED
* \par License
- * GPL
+ * GPLv3 or later
*
*/
#include "config.h"
diff --git a/main-shabea-test.c b/main-shabea-test.c
index aac85c1..a83d0c7 100644
--- a/main-shabea-test.c
+++ b/main-shabea-test.c
@@ -22,7 +22,7 @@
* \date 2007-06-07
* \brief test suit for SHABEA
* \par License
- * GPL
+ * GPLv3 or later
*
*/
#include "config.h"
diff --git a/md5.c b/md5.c
index bd43a38..5edb36b 100644
--- a/md5.c
+++ b/md5.c
@@ -19,9 +19,9 @@
/*
* \file md5.c
* \author Daniel Otte
- * \date 31.07.2006
+ * \date 2006-07-31
* \par License:
- * GPL
+ * GPLv3 or later
* \brief Implementation of the MD5 hash algorithm as described in RFC 1321
*
*/
diff --git a/noekeon.c b/noekeon.c
index 5ae3ec2..7627cc1 100644
--- a/noekeon.c
+++ b/noekeon.c
@@ -19,7 +19,7 @@
/*
* author: Daniel Otte
* email: daniel.otte@rub.de
- * license: GPLv3
+ * license: GPLv3 or later
*
*
*
diff --git a/noekeon_genrc.c b/noekeon_genrc.c
deleted file mode 100644
index cb8fac2..0000000
--- a/noekeon_genrc.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- *
- * author: Daniel Otte
- * email: daniel.otte@rub.de
- * license: GPLv3
- *
- */
-
-#include
-#include
-
-uint8_t getnextrc(uint8_t a){
- if((a&0x80) != 0){
- return (a<<1) ^ 0x1B;
- } else {
- return (a<<1);
- }
-}
-
-#define N 32
-
-int main(void){
- uint8_t c=0x80;
- uint32_t i;
- puts("\nNoekeon Round Constants:");
- for(i=0; i.
*/
-/*
- * Author: Daniel Otte
- *
- * License: GPL
-*/
-; SHA1 implementation in assembler for AVR
-SHA1_BLOCK_BITS = 512
-SHA1_HASH_BITS = 160
-
-.macro precall
- /* push r18 - r27, r30 - r31*/
- push r0
- push r1
- push r18
- push r19
- push r20
- push r21
- push r22
- push r23
- push r24
- push r25
- push r26
- push r27
- push r30
- push r31
- clr r1
-.endm
-
-.macro postcall
- pop r31
- pop r30
- pop r27
- pop r26
- pop r25
- pop r24
- pop r23
- pop r22
- pop r21
- pop r20
- pop r19
- pop r18
- pop r1
- pop r0
-.endm
-
-
-.macro hexdump length
- push r27
- push r26
- ldi r25, '\r'
- mov r24, r25
- call uart_putc
- ldi r25, '\n'
- mov r24, r25
- call uart_putc
- pop r26
- pop r27
- movw r24, r26
-.if \length > 16
- ldi r22, lo8(16)
- ldi r23, hi8(16)
- push r27
- push r26
- call uart_hexdump
- pop r26
- pop r27
- adiw r26, 16
- hexdump \length-16
-.else
- ldi r22, lo8(\length)
- ldi r23, hi8(\length)
- call uart_hexdump
-.endif
-.endm
-
-.macro delay
-/*
- push r0
- push r1
- clr r0
-1: clr r1
-2: dec r1
- brne 2b
- dec r0
- brne 1b
- pop r1
- pop r0 // */
-.endm
-
-/* X points to Block */
-.macro dbg_hexdump length
-/*
- precall
- hexdump \length
- postcall
- // */
-.endm
-
-
-
-.section .text
-
-SPL = 0x3D
-SPH = 0x3E
-SREG = 0x3F
-
-
-;
-;sha1_ctx_t is:
-;
-; [h0][h1][h2][h3][h4][length]
-; hn is 32 bit large, length is 64 bit large
-
-;###########################################################
-
-.global sha1_ctx2hash
-; === sha1_ctx2hash ===
-; this function converts a state into a normal hash (bytestring)
-; param1: the 16-bit destination pointer
-; given in r25,r24 (r25 is most significant)
-; param2: the 16-bit pointer to sha1_ctx structure
-; given in r23,r22
-sha1_ctx2hash:
- movw r26, r22
- movw r30, r24
- ldi r21, 5
- sbiw r26, 4
-1:
- ldi r20, 4
- adiw r26, 8
-2:
- ld r0, -X
- st Z+, r0
- dec r20
- brne 2b
-
- dec r21
- brne 1b
-
- ret
-
-;###########################################################
-
-.global sha1
-; === sha1 ===
-; this function calculates SHA-1 hashes from messages in RAM
-; param1: the 16-bit hash destination pointer
-; given in r25,r24 (r25 is most significant)
-; param2: the 16-bit pointer to message
-; given in r23,r22
-; param3: 32-bit length value (length of message in bits)
-; given in r21,r20,r19,r18
-sha1:
-sha1_prolog:
- push r8
- push r9
- push r10
- push r11
- push r12
- push r13
- push r16
- push r17
- in r16, SPL
- in r17, SPH
- subi r16, 5*4+8
- sbci r17, 0
- in r0, SREG
- cli
- out SPL, r16
- out SPH, r17
- out SREG, r0
-
- push r25
- push r24
- inc r16
- adc r17, r1
-
- movw r8, r18 /* backup of length*/
- movw r10, r20
-
- movw r12, r22 /* backup pf msg-ptr */
-
- movw r24, r16
- rcall sha1_init
- /* if length >= 512 */
-1:
- tst r11
- brne 4f
- tst r10
- brne 4f
- mov r19, r9
- cpi r19, 0x02
- brlo 4f
-
- movw r24, r16
- movw r22, r12
- rcall sha1_nextBlock
- ldi r19, 0x64
- add r22, r19
- adc r23, r1
- /* length -= 512 */
- ldi r19, 0x02
- sub r9, r19
- sbc r10, r1
- sbc r11, r1
- rjmp 1b
-
-4:
- movw r24, r16
- movw r22, r12
- movw r20, r8
- rcall sha1_lastBlock
-
- pop r24
- pop r25
- movw r22, r16
- rcall sha1_ctx2hash
-
-sha1_epilog:
- in r30, SPL
- in r31, SPH
- adiw r30, 5*4+8
- in r0, SREG
- cli
- out SPL, r30
- out SPH, r31
- out SREG, r0
- pop r17
- pop r16
- pop r13
- pop r12
- pop r11
- pop r10
- pop r9
- pop r8
- ret
-
-;###########################################################
-
-
-; block MUST NOT be larger than 64 bytes
-
-.global sha1_lastBlock
-; === sha1_lastBlock ===
-; this function does padding & Co. for calculating SHA-1 hashes
-; param1: the 16-bit pointer to sha1_ctx structure
-; given in r25,r24 (r25 is most significant)
-; param2: an 16-bit pointer to 64 byte block to hash
-; given in r23,r22
-; param3: an 16-bit integer specifing length of block in bits
-; given in r21,r20
-sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)
-
-
-sha1_lastBlock:
- tst r20
- brne sha1_lastBlock_prolog
- cpi r21, 0x02
- brne sha1_lastBlock_prolog
- push r25
- push r24
- push r23
- push r22
- rcall sha1_nextBlock
- pop r22
- pop r23
- pop r24
- pop r25
- clr r21
- clr r22
-sha1_lastBlock_prolog:
- /* allocate space on stack */
- in r30, SPL
- in r31, SPH
- in r1, SREG
- subi r30, lo8(64)
- sbci r31, hi8(64) /* ??? */
- cli
- out SPL, r30
- out SPH, r31
- out SREG,r1
-
- adiw r30, 1 /* SP points to next free byte on stack */
- mov r18, r20 /* r20 = LSB(length) */
- lsr r18
- lsr r18
- lsr r18
- bst r21, 0 /* may be we should explain this ... */
- bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
-
-
- movw r26, r22 /* X points to begin of msg */
- tst r18
- breq sha1_lastBlock_post_copy
- mov r1, r18
-sha1_lastBlock_copy_loop:
- ld r0, X+
- st Z+, r0
- dec r1
- brne sha1_lastBlock_copy_loop
-sha1_lastBlock_post_copy:
-sha1_lastBlock_insert_stuffing_bit:
- ldi r19, 0x80
- mov r0,r19
- ldi r19, 0x07
- and r19, r20 /* if we are in bitmode */
- breq 2f /* no bitmode */
-1:
- lsr r0
- dec r19
- brne 1b
- ld r19, X
-/* maybe we should do some ANDing here, just for safety */
- or r0, r19
-2:
- st Z+, r0
- inc r18
-
-/* checking stuff here */
- cpi r18, 64-8+1
- brsh 0f
- rjmp sha1_lastBlock_insert_zeros
-0:
- /* oh shit, we landed here */
- /* first we have to fill it up with zeros */
- ldi r19, 64
- sub r19, r18
- breq 2f
-1:
- st Z+, r1
- dec r19
- brne 1b
-2:
- sbiw r30, 63
- sbiw r30, 1
- movw r22, r30
-
- push r31
- push r30
- push r25
- push r24
- push r21
- push r20
- rcall sha1_nextBlock
- pop r20
- pop r21
- pop r24
- pop r25
- pop r30
- pop r31
-
- /* now we should subtract 512 from length */
- movw r26, r24
- adiw r26, 4*5+1 /* we can skip the lowest byte */
- ld r19, X
- subi r19, hi8(512)
- st X+, r19
- ldi r18, 6
-1:
- ld r19, X
- sbci r19, 0
- st X+, r19
- dec r18
- brne 1b
-
-; clr r18 /* not neccessary ;-) */
- /* reset Z pointer to begin of block */
-
-sha1_lastBlock_insert_zeros:
- ldi r19, 64-8
- sub r19, r18
- breq sha1_lastBlock_insert_length
- clr r1
-1:
- st Z+, r1 /* r1 is still zero */
- dec r19
- brne 1b
-
-; rjmp sha1_lastBlock_epilog
-sha1_lastBlock_insert_length:
- movw r26, r24 /* X points to state */
- adiw r26, 5*4 /* X points to (state.length) */
- adiw r30, 8 /* Z points one after the last byte of block */
- ld r0, X+
- add r0, r20
- st -Z, r0
- ld r0, X+
- adc r0, r21
- st -Z, r0
- ldi r19, 6
-1:
- ld r0, X+
- adc r0, r1
- st -Z, r0
- dec r19
- brne 1b
-
- sbiw r30, 64-8
- movw r22, r30
- rcall sha1_nextBlock
-
-sha1_lastBlock_epilog:
- in r30, SPL
- in r31, SPH
- in r1, SREG
- adiw r30, 63 ; lo8(64)
- adiw r30, 1 ; hi8(64)
- cli
- out SPL, r30
- out SPH, r31
- out SREG,r1
- clr r1
- clr r0
- ret
-
-/**/
-;###########################################################
-
-.global sha1_nextBlock
-; === sha1_nextBlock ===
-; this is the core function for calculating SHA-1 hashes
-; param1: the 16-bit pointer to sha1_ctx structure
-; given in r25,r24 (r25 is most significant)
-; param2: an 16-bit pointer to 64 byte block to hash
-; given in r23,r22
-sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)
-
-xtmp = 0
-xNULL = 1
-W1 = 10
-W2 = 11
-T1 = 12
-T2 = 13
-T3 = 14
-T4 = 15
-LoopC = 16
-S = 17
-tmp1 = 18
-tmp2 = 19
-tmp3 = 20
-tmp4 = 21
-F1 = 22
-F2 = 23
-F3 = 24
-F4 = 25
-
-/* byteorder: high number <--> high significance */
-sha1_nextBlock:
- ; initial, let's make some space ready for local vars
- /* replace push & pop by mem ops? */
- push r10
- push r11
- push r12
- push r13
- push r14
- push r15
- push r16
- push r17
- push r28
- push r29
- in r20, SPL
- in r21, SPH
- movw r18, r20 ;backup SP
-; movw r26, r20 ; X points to free space on stack /* maybe removeable? */
- movw r30, r22 ; Z points to message
- subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
- sbci r21, hi8(sha1_nextBlock_localSpace)
- movw r26, r20 ; X points to free space on stack
- in r0, SREG
- cli ; we want to be uninterrupted while updating SP
- out SPL, r20
- out SPH, r21
- out SREG, r0
-
- push r18
- push r19 /* push old SP on new stack */
- push r24
- push r25 /* param1 will be needed later */
-
- /* load a[] with state */
- movw 28, r24 /* load pointer to state in Y */
- adiw r26, 1 ; X++
-
- ldi LoopC, 5*4
-1: ld tmp1, Y+
- st X+, tmp1
- dec LoopC
- brne 1b
-
- movw W1, r26 /* save pointer to w[0] */
- /* load w[] with endian fixed message */
- /* we might also use the changeendian32() function at bottom */
- movw r30, r22 /* mv param2 (ponter to msg) to Z */
- ldi LoopC, 16
-1:
- ldd tmp1, Z+3
- st X+, tmp1
- ldd tmp1, Z+2
- st X+, tmp1
- ldd tmp1, Z+1
- st X+, tmp1
- ld tmp1, Z
- st X+, tmp1
- adiw r30, 4
- dec LoopC
- brne 1b
-
- ;clr LoopC /* LoopC is named t in FIPS 180-2 */
- clr xtmp
-sha1_nextBlock_mainloop:
- mov S, LoopC
- lsl S
- lsl S
- andi S, 0x3C /* S is a bytepointer so *4 */
- /* load w[s] */
- movw r26, W1
- add r26, S /* X points at w[s] */
- adc r27, xNULL
- ld T1, X+
- ld T2, X+
- ld T3, X+
- ld T4, X+
-
- /**/
- push r26
- push r27
- push T4
- push T3
- push T2
- push T1
- in r26, SPL
- in r27, SPH
- adiw r26, 1
- dbg_hexdump 4
- pop T1
- pop T2
- pop T3
- pop T4
- pop r27
- pop r26
- /**/
-
- cpi LoopC, 16
- brlt sha1_nextBlock_mainloop_core
- /* update w[s] */
- ldi tmp1, 2*4
- rcall 1f
- ldi tmp1, 8*4
- rcall 1f
- ldi tmp1, 13*4
- rcall 1f
- rjmp 2f
-1: /* this might be "outsourced" to save the jump above */
- add tmp1, S
- andi tmp1, 0x3f
- movw r26, W1
- add r26, tmp1
- adc r27, xNULL
- ld tmp2, X+
- eor T1, tmp2
- ld tmp2, X+
- eor T2, tmp2
- ld tmp2, X+
- eor T3, tmp2
- ld tmp2, X+
- eor T4, tmp2
- ret
-2: /* now we just hav to do a ROTL(T) and save T back */
- mov tmp2, T4
- rol tmp2
- rol T1
- rol T2
- rol T3
- rol T4
- movw r26, W1
- add r26, S
- adc r27, xNULL
- st X+, T1
- st X+, T2
- st X+, T3
- st X+, T4
-
-sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/
- /* T already contains w[s] */
- movw r26, W1
- sbiw r26, 4*1 /* X points at a[4] aka e */
- ld tmp1, X+
- add T1, tmp1
- ld tmp1, X+
- adc T2, tmp1
- ld tmp1, X+
- adc T3, tmp1
- ld tmp1, X+
- adc T4, tmp1 /* T = w[s]+e */
- sbiw r26, 4*5 /* X points at a[0] aka a */
- ld F1, X+
- ld F2, X+
- ld F3, X+
- ld F4, X+
- mov tmp1, F4 /* X points at a[1] aka b */
- ldi tmp2, 5
-1:
- rol tmp1
- rol F1
- rol F2
- rol F3
- rol F4
- dec tmp2
- brne 1b
-
- add T1, F1
- adc T2, F2
- adc T3, F3
- adc T4, F4 /* T = ROTL(a,5) + e + w[s] */
-
- /* now we have to do this fucking conditional stuff */
- ldi r30, lo8(sha1_nextBlock_xTable)
- ldi r31, hi8(sha1_nextBlock_xTable)
- add r30, xtmp
- adc r31, xNULL
- lpm tmp1, Z
- cp tmp1, LoopC
- brne 1f
- inc xtmp
-1: ldi r30, lo8(sha1_nextBlock_KTable)
- ldi r31, hi8(sha1_nextBlock_KTable)
- lsl xtmp
- lsl xtmp
- add r30, xtmp
- adc r31, xNULL
- lsr xtmp
- lsr xtmp
-
- lpm tmp1, Z+
- add T1, tmp1
- lpm tmp1, Z+
- adc T2, tmp1
- lpm tmp1, Z+
- adc T3, tmp1
- lpm tmp1, Z+
- adc T4, tmp1
- /* T = ROTL(a,5) + e + kt + w[s] */
-
- /* wo Z-4 gerade auf kt zeigt ... */
- movw r28, r26 /* copy X in Y */
- adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */
- lsr r31
- ror r30
-
- icall
- mov F1, tmp1
- icall
- mov F2, tmp1
- icall
- mov F3, tmp1
- icall
-
- add T1, F1
- adc T2, F2
- adc T3, F3
- adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
- /* X points still at a[1] aka b, Y points at a[2] aka c */
- /* update a[] */
-sha1_nextBlock_update_a:
- /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
- //adiw r28, 3*4 /* Y should point at a[4] aka e */
- movw r28, W1
- sbiw r28, 4
-
- ldi tmp2, 4*4
-1:
- ld tmp1, -Y
- std Y+4, tmp1
- dec tmp2
- brne 1b
- /* Y points at a[0] aka a*/
-
- movw r28, W1
- sbiw r28, 5*4
- /* store T in a[0] aka a */
- st Y+, T1
- st Y+, T2
- st Y+, T3
- st Y+, T4
- /* Y points at a[1] aka b*/
-
- /* rotate c */
- ldd T1, Y+1*4
- ldd T2, Y+1*4+1
- ldd T3, Y+1*4+2
- ldd T4, Y+1*4+3
- mov tmp1, T1
- ldi tmp2, 2
-1: ror tmp1
- ror T4
- ror T3
- ror T2
- ror T1
- dec tmp2
- brne 1b
- std Y+1*4+0, T1
- std Y+1*4+1, T2
- std Y+1*4+2, T3
- std Y+1*4+3, T4
-
- push r27
- push r26
- movw r26, W1
- sbiw r26, 4*5
- dbg_hexdump 4*5
- pop r26
- pop r27
-
- inc LoopC
- cpi LoopC, 80
- brge 1f
- jmp sha1_nextBlock_mainloop
-/**************************************/
-1:
- /* littel patch */
- sbiw r28, 4
-
-/* add a[] to state and inc length */
- pop r27
- pop r26 /* now X points to state (and Y still at a[0]) */
- ldi tmp4, 5
-1: clc
- ldi tmp3, 4
-2: ld tmp1, X
- ld tmp2, Y+
- adc tmp1, tmp2
- st X+, tmp1
- dec tmp3
- brne 2b
- dec tmp4
- brne 1b
-
- /* now length += 512 */
- adiw r26, 1 /* we skip the least significant byte */
- ld tmp1, X
- ldi tmp2, hi8(512) /* 2 */
- add tmp1, tmp2
- st X+, tmp1
- ldi tmp2, 6
-1:
- ld tmp1, X
- adc tmp1, xNULL
- st X+, tmp1
- dec tmp2
- brne 1b
-
-; EPILOG
-sha1_nextBlock_epilog:
-/* now we should clean up the stack */
- pop r21
- pop r20
- in r0, SREG
- cli ; we want to be uninterrupted while updating SP
- out SPL, r20
- out SPH, r21
- out SREG, r0
-
- clr r1
- pop r29
- pop r28
- pop r17
- pop r16
- pop r15
- pop r14
- pop r13
- pop r12
- pop r11
- pop r10
- ret
-
-sha1_nextBlock_xTable:
-.byte 20,40,60,0
-sha1_nextBlock_KTable:
-.int 0x5a827999
-.int 0x6ed9eba1
-.int 0x8f1bbcdc
-.int 0xca62c1d6
-sha1_nextBlock_JumpTable:
-jmp sha1_nextBlock_Ch
-jmp sha1_nextBlock_Parity
-jmp sha1_nextBlock_Maj
-jmp sha1_nextBlock_Parity
-
- /* X and Y still point at a[1] aka b ; return value in tmp1 */
-sha1_nextBlock_Ch:
- ld tmp1, Y+
- mov tmp2, tmp1
- com tmp2
- ldd tmp3, Y+3 /* load from c */
- and tmp1, tmp3
- ldd tmp3, Y+7 /* load from d */
- and tmp2, tmp3
- eor tmp1, tmp2
- /**
- precall
- ldi r24, lo8(ch_str)
- ldi r25, hi8(ch_str)
- call uart_putstr_P
- postcall
- /**/
- ret
-
-sha1_nextBlock_Maj:
- ld tmp1, Y+
- mov tmp2, tmp1
- ldd tmp3, Y+3 /* load from c */
- and tmp1, tmp3
- ldd tmp4, Y+7 /* load from d */
- and tmp2, tmp4
- eor tmp1, tmp2
- and tmp3, tmp4
- eor tmp1, tmp3
- /**
- precall
- ldi r24, lo8(maj_str)
- ldi r25, hi8(maj_str)
- call uart_putstr_P
- postcall
- /**/
- ret
-
-sha1_nextBlock_Parity:
- ld tmp1, Y+
- ldd tmp2, Y+3 /* load from c */
- eor tmp1, tmp2
- ldd tmp2, Y+7 /* load from d */
- eor tmp1, tmp2
-
- /**
- precall
- ldi r24, lo8(parity_str)
- ldi r25, hi8(parity_str)
- call uart_putstr_P
- postcall
- /**/
- ret
-/*
-ch_str: .asciz "\r\nCh"
-maj_str: .asciz "\r\nMaj"
-parity_str: .asciz "\r\nParity"
-*/
-;###########################################################
-
-.global sha1_init
-;void sha1_init(sha1_ctx_t *state){
-; DEBUG_S("\r\nSHA1_INIT");
-; state->h[0] = 0x67452301;
-; state->h[1] = 0xefcdab89;
-; state->h[2] = 0x98badcfe;
-; state->h[3] = 0x10325476;
-; state->h[4] = 0xc3d2e1f0;
-; state->length = 0;
-;}
-; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram
-; modifys: Z(r30,r31), Func1, r22
-sha1_init:
- movw r26, r24 ; (24,25) --> (26,27) load X with param1
- ldi r30, lo8((sha1_init_vector))
- ldi r31, hi8((sha1_init_vector))
- ldi r22, 5*4 /* bytes to copy */
-sha1_init_vloop:
- lpm r23, Z+
- st X+, r23
- dec r22
- brne sha1_init_vloop
- ldi r22, 8
- clr r1 /* this should not be needed */
-sha1_init_lloop:
- st X+, r1
- dec r22
- brne sha1_init_lloop
- ret
-
-sha1_init_vector:
-.int 0x67452301;
-.int 0xefcdab89;
-.int 0x98badcfe;
-.int 0x10325476;
-.int 0xc3d2e1f0;
-/*
-;###########################################################
-
-.global rotl32
-; === ROTL32 ===
-; function that rotates a 32 bit word to the left
-; param1: the 32-bit word to rotate
-; given in r25,r24,r23,r22 (r25 is most significant)
-; param2: an 8-bit value telling how often to rotate
-; given in r20
-; modifys: r21, r22
-rotl32:
- cpi r20, 8
- brlo bitrotl
- mov r21, r25
- mov r25, r24
- mov r24, r23
- mov r23, r22
- mov r22, r21
- subi r20, 8
- rjmp rotr32
-bitrotl:
- clr r21
- clc
-bitrotl_loop:
- tst r20
- breq fixrotl
- rol r22
- rol r23
- rol r24
- rol r25
- rol r21
- dec r20
- rjmp bitrotl_loop
-fixrotl:
- or r22, r21
- ret
-
-
-;###########################################################
-
-.global rotr32
-; === ROTR32 ===
-; function that rotates a 32 bit word to the right
-; param1: the 32-bit word to rotate
-; given in r25,r24,r23,22 (r25 is most significant)
-; param2: an 8-bit value telling how often to rotate
-; given in r20
-; modifys: r21, r22
-rotr32:
- cpi r20, 8
- brlo bitrotr
- mov r21, r22
- mov r22, r23
- mov r23, r24
- mov r24, r25
- mov r25, r21
- subi r20, 8
- rjmp rotr32
-bitrotr:
- clr r21
- clc
-bitrotr_loop:
- tst r20
- breq fixrotr
- ror r25
- ror r24
- ror r23
- ror r22
- ror r21
- dec r20
- rjmp bitrotr_loop
-fixrotr:
- or r25, r21
- ret
-
-
-;###########################################################
-
-.global change_endian32
-; === change_endian32 ===
-; function that changes the endianess of a 32-bit word
-; param1: the 32-bit word
-; given in r25,r24,r23,22 (r25 is most significant)
-; modifys: r21, r22
-change_endian32:
- movw r20, r22 ; (r22,r23) --> (r20,r21)
- mov r22, r25
- mov r23, r24
- mov r24, r21
- mov r25, r20
- ret
-*/
+/*
+ * Author: Daniel Otte
+ *
+ * License: GPLv3 or later
+*/
+; SHA1 implementation in assembler for AVR
+SHA1_BLOCK_BITS = 512
+SHA1_HASH_BITS = 160
+
+.macro precall
+ /* push r18 - r27, r30 - r31*/
+ push r0
+ push r1
+ push r18
+ push r19
+ push r20
+ push r21
+ push r22
+ push r23
+ push r24
+ push r25
+ push r26
+ push r27
+ push r30
+ push r31
+ clr r1
+.endm
+
+.macro postcall
+ pop r31
+ pop r30
+ pop r27
+ pop r26
+ pop r25
+ pop r24
+ pop r23
+ pop r22
+ pop r21
+ pop r20
+ pop r19
+ pop r18
+ pop r1
+ pop r0
+.endm
+
+
+.macro hexdump length
+ push r27
+ push r26
+ ldi r25, '\r'
+ mov r24, r25
+ call uart_putc
+ ldi r25, '\n'
+ mov r24, r25
+ call uart_putc
+ pop r26
+ pop r27
+ movw r24, r26
+.if \length > 16
+ ldi r22, lo8(16)
+ ldi r23, hi8(16)
+ push r27
+ push r26
+ call uart_hexdump
+ pop r26
+ pop r27
+ adiw r26, 16
+ hexdump \length-16
+.else
+ ldi r22, lo8(\length)
+ ldi r23, hi8(\length)
+ call uart_hexdump
+.endif
+.endm
+
+.macro delay
+/*
+ push r0
+ push r1
+ clr r0
+1: clr r1
+2: dec r1
+ brne 2b
+ dec r0
+ brne 1b
+ pop r1
+ pop r0 // */
+.endm
+
+/* X points to Block */
+.macro dbg_hexdump length
+/*
+ precall
+ hexdump \length
+ postcall
+ // */
+.endm
+
+
+
+.section .text
+
+SPL = 0x3D
+SPH = 0x3E
+SREG = 0x3F
+
+
+;
+;sha1_ctx_t is:
+;
+; [h0][h1][h2][h3][h4][length]
+; hn is 32 bit large, length is 64 bit large
+
+;###########################################################
+
+.global sha1_ctx2hash
+; === sha1_ctx2hash ===
+; this function converts a state into a normal hash (bytestring)
+; param1: the 16-bit destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to sha1_ctx structure
+; given in r23,r22
+sha1_ctx2hash:
+ movw r26, r22
+ movw r30, r24
+ ldi r21, 5
+ sbiw r26, 4
+1:
+ ldi r20, 4
+ adiw r26, 8
+2:
+ ld r0, -X
+ st Z+, r0
+ dec r20
+ brne 2b
+
+ dec r21
+ brne 1b
+
+ ret
+
+;###########################################################
+
+.global sha1
+; === sha1 ===
+; this function calculates SHA-1 hashes from messages in RAM
+; param1: the 16-bit hash destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to message
+; given in r23,r22
+; param3: 32-bit length value (length of message in bits)
+; given in r21,r20,r19,r18
+sha1:
+sha1_prolog:
+ push r8
+ push r9
+ push r10
+ push r11
+ push r12
+ push r13
+ push r16
+ push r17
+ in r16, SPL
+ in r17, SPH
+ subi r16, 5*4+8
+ sbci r17, 0
+ in r0, SREG
+ cli
+ out SPL, r16
+ out SPH, r17
+ out SREG, r0
+
+ push r25
+ push r24
+ inc r16
+ adc r17, r1
+
+ movw r8, r18 /* backup of length*/
+ movw r10, r20
+
+ movw r12, r22 /* backup pf msg-ptr */
+
+ movw r24, r16
+ rcall sha1_init
+ /* if length >= 512 */
+1:
+ tst r11
+ brne 4f
+ tst r10
+ brne 4f
+ mov r19, r9
+ cpi r19, 0x02
+ brlo 4f
+
+ movw r24, r16
+ movw r22, r12
+ rcall sha1_nextBlock
+ ldi r19, 0x64
+ add r22, r19
+ adc r23, r1
+ /* length -= 512 */
+ ldi r19, 0x02
+ sub r9, r19
+ sbc r10, r1
+ sbc r11, r1
+ rjmp 1b
+
+4:
+ movw r24, r16
+ movw r22, r12
+ movw r20, r8
+ rcall sha1_lastBlock
+
+ pop r24
+ pop r25
+ movw r22, r16
+ rcall sha1_ctx2hash
+
+sha1_epilog:
+ in r30, SPL
+ in r31, SPH
+ adiw r30, 5*4+8
+ in r0, SREG
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG, r0
+ pop r17
+ pop r16
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ ret
+
+;###########################################################
+
+
+; block MUST NOT be larger than 64 bytes
+
+.global sha1_lastBlock
+; === sha1_lastBlock ===
+; this function does padding & Co. for calculating SHA-1 hashes
+; param1: the 16-bit pointer to sha1_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: an 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+; param3: an 16-bit integer specifing length of block in bits
+; given in r21,r20
+sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)
+
+
+sha1_lastBlock:
+ tst r20
+ brne sha1_lastBlock_prolog
+ cpi r21, 0x02
+ brne sha1_lastBlock_prolog
+ push r25
+ push r24
+ push r23
+ push r22
+ rcall sha1_nextBlock
+ pop r22
+ pop r23
+ pop r24
+ pop r25
+ clr r21
+ clr r22
+sha1_lastBlock_prolog:
+ /* allocate space on stack */
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ subi r30, lo8(64)
+ sbci r31, hi8(64) /* ??? */
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+
+ adiw r30, 1 /* SP points to next free byte on stack */
+ mov r18, r20 /* r20 = LSB(length) */
+ lsr r18
+ lsr r18
+ lsr r18
+ bst r21, 0 /* may be we should explain this ... */
+ bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
+
+
+ movw r26, r22 /* X points to begin of msg */
+ tst r18
+ breq sha1_lastBlock_post_copy
+ mov r1, r18
+sha1_lastBlock_copy_loop:
+ ld r0, X+
+ st Z+, r0
+ dec r1
+ brne sha1_lastBlock_copy_loop
+sha1_lastBlock_post_copy:
+sha1_lastBlock_insert_stuffing_bit:
+ ldi r19, 0x80
+ mov r0,r19
+ ldi r19, 0x07
+ and r19, r20 /* if we are in bitmode */
+ breq 2f /* no bitmode */
+1:
+ lsr r0
+ dec r19
+ brne 1b
+ ld r19, X
+/* maybe we should do some ANDing here, just for safety */
+ or r0, r19
+2:
+ st Z+, r0
+ inc r18
+
+/* checking stuff here */
+ cpi r18, 64-8+1
+ brsh 0f
+ rjmp sha1_lastBlock_insert_zeros
+0:
+ /* oh shit, we landed here */
+ /* first we have to fill it up with zeros */
+ ldi r19, 64
+ sub r19, r18
+ breq 2f
+1:
+ st Z+, r1
+ dec r19
+ brne 1b
+2:
+ sbiw r30, 63
+ sbiw r30, 1
+ movw r22, r30
+
+ push r31
+ push r30
+ push r25
+ push r24
+ push r21
+ push r20
+ rcall sha1_nextBlock
+ pop r20
+ pop r21
+ pop r24
+ pop r25
+ pop r30
+ pop r31
+
+ /* now we should subtract 512 from length */
+ movw r26, r24
+ adiw r26, 4*5+1 /* we can skip the lowest byte */
+ ld r19, X
+ subi r19, hi8(512)
+ st X+, r19
+ ldi r18, 6
+1:
+ ld r19, X
+ sbci r19, 0
+ st X+, r19
+ dec r18
+ brne 1b
+
+; clr r18 /* not neccessary ;-) */
+ /* reset Z pointer to begin of block */
+
+sha1_lastBlock_insert_zeros:
+ ldi r19, 64-8
+ sub r19, r18
+ breq sha1_lastBlock_insert_length
+ clr r1
+1:
+ st Z+, r1 /* r1 is still zero */
+ dec r19
+ brne 1b
+
+; rjmp sha1_lastBlock_epilog
+sha1_lastBlock_insert_length:
+ movw r26, r24 /* X points to state */
+ adiw r26, 5*4 /* X points to (state.length) */
+ adiw r30, 8 /* Z points one after the last byte of block */
+ ld r0, X+
+ add r0, r20
+ st -Z, r0
+ ld r0, X+
+ adc r0, r21
+ st -Z, r0
+ ldi r19, 6
+1:
+ ld r0, X+
+ adc r0, r1
+ st -Z, r0
+ dec r19
+ brne 1b
+
+ sbiw r30, 64-8
+ movw r22, r30
+ rcall sha1_nextBlock
+
+sha1_lastBlock_epilog:
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ adiw r30, 63 ; lo8(64)
+ adiw r30, 1 ; hi8(64)
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+ clr r1
+ clr r0
+ ret
+
+/**/
+;###########################################################
+
+.global sha1_nextBlock
+; === sha1_nextBlock ===
+; this is the core function for calculating SHA-1 hashes
+; param1: the 16-bit pointer to sha1_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: an 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)
+
+xtmp = 0
+xNULL = 1
+W1 = 10
+W2 = 11
+T1 = 12
+T2 = 13
+T3 = 14
+T4 = 15
+LoopC = 16
+S = 17
+tmp1 = 18
+tmp2 = 19
+tmp3 = 20
+tmp4 = 21
+F1 = 22
+F2 = 23
+F3 = 24
+F4 = 25
+
+/* byteorder: high number <--> high significance */
+sha1_nextBlock:
+ ; initial, let's make some space ready for local vars
+ /* replace push & pop by mem ops? */
+ push r10
+ push r11
+ push r12
+ push r13
+ push r14
+ push r15
+ push r16
+ push r17
+ push r28
+ push r29
+ in r20, SPL
+ in r21, SPH
+ movw r18, r20 ;backup SP
+; movw r26, r20 ; X points to free space on stack /* maybe removeable? */
+ movw r30, r22 ; Z points to message
+ subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
+ sbci r21, hi8(sha1_nextBlock_localSpace)
+ movw r26, r20 ; X points to free space on stack
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+
+ push r18
+ push r19 /* push old SP on new stack */
+ push r24
+ push r25 /* param1 will be needed later */
+
+ /* load a[] with state */
+ movw 28, r24 /* load pointer to state in Y */
+ adiw r26, 1 ; X++
+
+ ldi LoopC, 5*4
+1: ld tmp1, Y+
+ st X+, tmp1
+ dec LoopC
+ brne 1b
+
+ movw W1, r26 /* save pointer to w[0] */
+ /* load w[] with endian fixed message */
+ /* we might also use the changeendian32() function at bottom */
+ movw r30, r22 /* mv param2 (ponter to msg) to Z */
+ ldi LoopC, 16
+1:
+ ldd tmp1, Z+3
+ st X+, tmp1
+ ldd tmp1, Z+2
+ st X+, tmp1
+ ldd tmp1, Z+1
+ st X+, tmp1
+ ld tmp1, Z
+ st X+, tmp1
+ adiw r30, 4
+ dec LoopC
+ brne 1b
+
+ ;clr LoopC /* LoopC is named t in FIPS 180-2 */
+ clr xtmp
+sha1_nextBlock_mainloop:
+ mov S, LoopC
+ lsl S
+ lsl S
+ andi S, 0x3C /* S is a bytepointer so *4 */
+ /* load w[s] */
+ movw r26, W1
+ add r26, S /* X points at w[s] */
+ adc r27, xNULL
+ ld T1, X+
+ ld T2, X+
+ ld T3, X+
+ ld T4, X+
+
+ /**/
+ push r26
+ push r27
+ push T4
+ push T3
+ push T2
+ push T1
+ in r26, SPL
+ in r27, SPH
+ adiw r26, 1
+ dbg_hexdump 4
+ pop T1
+ pop T2
+ pop T3
+ pop T4
+ pop r27
+ pop r26
+ /**/
+
+ cpi LoopC, 16
+ brlt sha1_nextBlock_mainloop_core
+ /* update w[s] */
+ ldi tmp1, 2*4
+ rcall 1f
+ ldi tmp1, 8*4
+ rcall 1f
+ ldi tmp1, 13*4
+ rcall 1f
+ rjmp 2f
+1: /* this might be "outsourced" to save the jump above */
+ add tmp1, S
+ andi tmp1, 0x3f
+ movw r26, W1
+ add r26, tmp1
+ adc r27, xNULL
+ ld tmp2, X+
+ eor T1, tmp2
+ ld tmp2, X+
+ eor T2, tmp2
+ ld tmp2, X+
+ eor T3, tmp2
+ ld tmp2, X+
+ eor T4, tmp2
+ ret
+2: /* now we just hav to do a ROTL(T) and save T back */
+ mov tmp2, T4
+ rol tmp2
+ rol T1
+ rol T2
+ rol T3
+ rol T4
+ movw r26, W1
+ add r26, S
+ adc r27, xNULL
+ st X+, T1
+ st X+, T2
+ st X+, T3
+ st X+, T4
+
+sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/
+ /* T already contains w[s] */
+ movw r26, W1
+ sbiw r26, 4*1 /* X points at a[4] aka e */
+ ld tmp1, X+
+ add T1, tmp1
+ ld tmp1, X+
+ adc T2, tmp1
+ ld tmp1, X+
+ adc T3, tmp1
+ ld tmp1, X+
+ adc T4, tmp1 /* T = w[s]+e */
+ sbiw r26, 4*5 /* X points at a[0] aka a */
+ ld F1, X+
+ ld F2, X+
+ ld F3, X+
+ ld F4, X+
+ mov tmp1, F4 /* X points at a[1] aka b */
+ ldi tmp2, 5
+1:
+ rol tmp1
+ rol F1
+ rol F2
+ rol F3
+ rol F4
+ dec tmp2
+ brne 1b
+
+ add T1, F1
+ adc T2, F2
+ adc T3, F3
+ adc T4, F4 /* T = ROTL(a,5) + e + w[s] */
+
+ /* now we have to do this fucking conditional stuff */
+ ldi r30, lo8(sha1_nextBlock_xTable)
+ ldi r31, hi8(sha1_nextBlock_xTable)
+ add r30, xtmp
+ adc r31, xNULL
+ lpm tmp1, Z
+ cp tmp1, LoopC
+ brne 1f
+ inc xtmp
+1: ldi r30, lo8(sha1_nextBlock_KTable)
+ ldi r31, hi8(sha1_nextBlock_KTable)
+ lsl xtmp
+ lsl xtmp
+ add r30, xtmp
+ adc r31, xNULL
+ lsr xtmp
+ lsr xtmp
+
+ lpm tmp1, Z+
+ add T1, tmp1
+ lpm tmp1, Z+
+ adc T2, tmp1
+ lpm tmp1, Z+
+ adc T3, tmp1
+ lpm tmp1, Z+
+ adc T4, tmp1
+ /* T = ROTL(a,5) + e + kt + w[s] */
+
+ /* wo Z-4 gerade auf kt zeigt ... */
+ movw r28, r26 /* copy X in Y */
+ adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */
+ lsr r31
+ ror r30
+
+ icall
+ mov F1, tmp1
+ icall
+ mov F2, tmp1
+ icall
+ mov F3, tmp1
+ icall
+
+ add T1, F1
+ adc T2, F2
+ adc T3, F3
+ adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
+ /* X points still at a[1] aka b, Y points at a[2] aka c */
+ /* update a[] */
+sha1_nextBlock_update_a:
+ /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
+ //adiw r28, 3*4 /* Y should point at a[4] aka e */
+ movw r28, W1
+ sbiw r28, 4
+
+ ldi tmp2, 4*4
+1:
+ ld tmp1, -Y
+ std Y+4, tmp1
+ dec tmp2
+ brne 1b
+ /* Y points at a[0] aka a*/
+
+ movw r28, W1
+ sbiw r28, 5*4
+ /* store T in a[0] aka a */
+ st Y+, T1
+ st Y+, T2
+ st Y+, T3
+ st Y+, T4
+ /* Y points at a[1] aka b*/
+
+ /* rotate c */
+ ldd T1, Y+1*4
+ ldd T2, Y+1*4+1
+ ldd T3, Y+1*4+2
+ ldd T4, Y+1*4+3
+ mov tmp1, T1
+ ldi tmp2, 2
+1: ror tmp1
+ ror T4
+ ror T3
+ ror T2
+ ror T1
+ dec tmp2
+ brne 1b
+ std Y+1*4+0, T1
+ std Y+1*4+1, T2
+ std Y+1*4+2, T3
+ std Y+1*4+3, T4
+
+ push r27
+ push r26
+ movw r26, W1
+ sbiw r26, 4*5
+ dbg_hexdump 4*5
+ pop r26
+ pop r27
+
+ inc LoopC
+ cpi LoopC, 80
+ brge 1f
+ jmp sha1_nextBlock_mainloop
+/**************************************/
+1:
+ /* littel patch */
+ sbiw r28, 4
+
+/* add a[] to state and inc length */
+ pop r27
+ pop r26 /* now X points to state (and Y still at a[0]) */
+ ldi tmp4, 5
+1: clc
+ ldi tmp3, 4
+2: ld tmp1, X
+ ld tmp2, Y+
+ adc tmp1, tmp2
+ st X+, tmp1
+ dec tmp3
+ brne 2b
+ dec tmp4
+ brne 1b
+
+ /* now length += 512 */
+ adiw r26, 1 /* we skip the least significant byte */
+ ld tmp1, X
+ ldi tmp2, hi8(512) /* 2 */
+ add tmp1, tmp2
+ st X+, tmp1
+ ldi tmp2, 6
+1:
+ ld tmp1, X
+ adc tmp1, xNULL
+ st X+, tmp1
+ dec tmp2
+ brne 1b
+
+; EPILOG
+sha1_nextBlock_epilog:
+/* now we should clean up the stack */
+ pop r21
+ pop r20
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+
+ clr r1
+ pop r29
+ pop r28
+ pop r17
+ pop r16
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ ret
+
+sha1_nextBlock_xTable:
+.byte 20,40,60,0
+sha1_nextBlock_KTable:
+.int 0x5a827999
+.int 0x6ed9eba1
+.int 0x8f1bbcdc
+.int 0xca62c1d6
+sha1_nextBlock_JumpTable:
+jmp sha1_nextBlock_Ch
+jmp sha1_nextBlock_Parity
+jmp sha1_nextBlock_Maj
+jmp sha1_nextBlock_Parity
+
+ /* X and Y still point at a[1] aka b ; return value in tmp1 */
+sha1_nextBlock_Ch:
+ ld tmp1, Y+
+ mov tmp2, tmp1
+ com tmp2
+ ldd tmp3, Y+3 /* load from c */
+ and tmp1, tmp3
+ ldd tmp3, Y+7 /* load from d */
+ and tmp2, tmp3
+ eor tmp1, tmp2
+ /**
+ precall
+ ldi r24, lo8(ch_str)
+ ldi r25, hi8(ch_str)
+ call uart_putstr_P
+ postcall
+ /**/
+ ret
+
+sha1_nextBlock_Maj:
+ ld tmp1, Y+
+ mov tmp2, tmp1
+ ldd tmp3, Y+3 /* load from c */
+ and tmp1, tmp3
+ ldd tmp4, Y+7 /* load from d */
+ and tmp2, tmp4
+ eor tmp1, tmp2
+ and tmp3, tmp4
+ eor tmp1, tmp3
+ /**
+ precall
+ ldi r24, lo8(maj_str)
+ ldi r25, hi8(maj_str)
+ call uart_putstr_P
+ postcall
+ /**/
+ ret
+
+sha1_nextBlock_Parity:
+ ld tmp1, Y+
+ ldd tmp2, Y+3 /* load from c */
+ eor tmp1, tmp2
+ ldd tmp2, Y+7 /* load from d */
+ eor tmp1, tmp2
+
+ /**
+ precall
+ ldi r24, lo8(parity_str)
+ ldi r25, hi8(parity_str)
+ call uart_putstr_P
+ postcall
+ /**/
+ ret
+/*
+ch_str: .asciz "\r\nCh"
+maj_str: .asciz "\r\nMaj"
+parity_str: .asciz "\r\nParity"
+*/
+;###########################################################
+
+.global sha1_init
+;void sha1_init(sha1_ctx_t *state){
+; DEBUG_S("\r\nSHA1_INIT");
+; state->h[0] = 0x67452301;
+; state->h[1] = 0xefcdab89;
+; state->h[2] = 0x98badcfe;
+; state->h[3] = 0x10325476;
+; state->h[4] = 0xc3d2e1f0;
+; state->length = 0;
+;}
+; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram
+; modifys: Z(r30,r31), Func1, r22
+sha1_init:
+ movw r26, r24 ; (24,25) --> (26,27) load X with param1
+ ldi r30, lo8((sha1_init_vector))
+ ldi r31, hi8((sha1_init_vector))
+ ldi r22, 5*4 /* bytes to copy */
+sha1_init_vloop:
+ lpm r23, Z+
+ st X+, r23
+ dec r22
+ brne sha1_init_vloop
+ ldi r22, 8
+ clr r1 /* this should not be needed */
+sha1_init_lloop:
+ st X+, r1
+ dec r22
+ brne sha1_init_lloop
+ ret
+
+sha1_init_vector:
+.int 0x67452301;
+.int 0xefcdab89;
+.int 0x98badcfe;
+.int 0x10325476;
+.int 0xc3d2e1f0;
+/*
+;###########################################################
+
+.global rotl32
+; === ROTL32 ===
+; function that rotates a 32 bit word to the left
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifys: r21, r22
+rotl32:
+ cpi r20, 8
+ brlo bitrotl
+ mov r21, r25
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotl:
+ clr r21
+ clc
+bitrotl_loop:
+ tst r20
+ breq fixrotl
+ rol r22
+ rol r23
+ rol r24
+ rol r25
+ rol r21
+ dec r20
+ rjmp bitrotl_loop
+fixrotl:
+ or r22, r21
+ ret
+
+
+;###########################################################
+
+.global rotr32
+; === ROTR32 ===
+; function that rotates a 32 bit word to the right
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifys: r21, r22
+rotr32:
+ cpi r20, 8
+ brlo bitrotr
+ mov r21, r22
+ mov r22, r23
+ mov r23, r24
+ mov r24, r25
+ mov r25, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotr:
+ clr r21
+ clc
+bitrotr_loop:
+ tst r20
+ breq fixrotr
+ ror r25
+ ror r24
+ ror r23
+ ror r22
+ ror r21
+ dec r20
+ rjmp bitrotr_loop
+fixrotr:
+ or r25, r21
+ ret
+
+
+;###########################################################
+
+.global change_endian32
+; === change_endian32 ===
+; function that changes the endianess of a 32-bit word
+; param1: the 32-bit word
+; given in r25,r24,r23,22 (r25 is most significant)
+; modifys: r21, r22
+change_endian32:
+ movw r20, r22 ; (r22,r23) --> (r20,r21)
+ mov r22, r25
+ mov r23, r24
+ mov r24, r21
+ mov r25, r20
+ ret
+*/
diff --git a/sha256-asm.S b/sha256-asm.S
index 392bf42..403506e 100644
--- a/sha256-asm.S
+++ b/sha256-asm.S
@@ -16,1028 +16,1028 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
-/*
- * Author: Daniel Otte
- *
- * License: GPL
-*/
-; sha-256 implementation in assembler
-SHA256_BLOCK_BITS = 512
-SHA256_HASH_BITS = 256
-
-.macro precall
- /* push r18 - r27, r30 - r31*/
- push r0
- push r1
- push r18
- push r19
- push r20
- push r21
- push r22
- push r23
- push r24
- push r25
- push r26
- push r27
- push r30
- push r31
- clr r1
-.endm
-
-.macro postcall
- pop r31
- pop r30
- pop r27
- pop r26
- pop r25
- pop r24
- pop r23
- pop r22
- pop r21
- pop r20
- pop r19
- pop r18
- pop r1
- pop r0
-.endm
-
-
-.macro hexdump length
- push r27
- push r26
- ldi r25, '\r'
- mov r24, r25
- call uart_putc
- ldi r25, '\n'
- mov r24, r25
- call uart_putc
- pop r26
- pop r27
- movw r24, r26
-.if \length > 16
- ldi r22, lo8(16)
- ldi r23, hi8(16)
- push r27
- push r26
- call uart_hexdump
- pop r26
- pop r27
- adiw r26, 16
- hexdump \length-16
-.else
- ldi r22, lo8(\length)
- ldi r23, hi8(\length)
- call uart_hexdump
-.endif
-.endm
-
-/* X points to Block */
-.macro dbg_hexdump length
- precall
- hexdump \length
- postcall
-.endm
-
-.section .text
-
-SPL = 0x3D
-SPH = 0x3E
-SREG = 0x3F
-
-
-;
-;sha256_ctx_t is:
-;
-; [h0][h1][h2][h3][h4][h5][h6][h7][length]
-; hn is 32 bit large, length is 64 bit large
-
-;###########################################################
-
-.global sha256_ctx2hash
-; === sha256_ctx2hash ===
-; this function converts a state into a normal hash (bytestring)
-; param1: the 16-bit destination pointer
-; given in r25,r24 (r25 is most significant)
-; param2: the 16-bit pointer to sha256_ctx structure
-; given in r23,r22
-sha256_ctx2hash:
- movw r26, r22
- movw r30, r24
- ldi r21, 8
- sbiw r26, 4
-1:
- ldi r20, 4
- adiw r26, 8
-2:
- ld r0, -X
- st Z+, r0
- dec r20
- brne 2b
-
- dec r21
- brne 1b
-
- ret
-
-;###########################################################
-
-.global sha256
-; === sha256 ===
-; this function calculates SHA-256 hashes from messages in RAM
-; param1: the 16-bit hash destination pointer
-; given in r25,r24 (r25 is most significant)
-; param2: the 16-bit pointer to message
-; given in r23,r22
-; param3: 32-bit length value (length of message in bits)
-; given in r21,r20,r19,r18
-sha256:
-sha256_prolog:
- push r8
- push r9
- push r10
- push r11
- push r12
- push r13
- push r16
- push r17
- in r16, SPL
- in r17, SPH
- subi r16, 8*4+8
- sbci r17, 0
- in r0, SREG
- cli
- out SPL, r16
- out SPH, r17
- out SREG, r0
-
- push r25
- push r24
- inc r16
- adc r17, r1
-
- movw r8, r18 /* backup of length*/
- movw r10, r20
-
- movw r12, r22 /* backup pf msg-ptr */
-
- movw r24, r16
- rcall sha256_init
- /* if length >= 512 */
-1:
- tst r11
- brne 4f
- tst r10
- brne 4f
- mov r19, r9
- cpi r19, 0x02
- brlo 4f
-
- movw r24, r16
- movw r22, r12
- rcall sha256_nextBlock
- ldi r19, 0x64
- add r22, r19
- adc r23, r1
- /* length -= 512 */
- ldi r19, 0x02
- sub r9, r19
- sbc r10, r1
- sbc r11, r1
- rjmp 1b
-
-4:
- movw r24, r16
- movw r22, r12
- movw r20, r8
- rcall sha256_lastBlock
-
- pop r24
- pop r25
- movw r22, r16
- rcall sha256_ctx2hash
-
-sha256_epilog:
- in r30, SPL
- in r31, SPH
- adiw r30, 8*4+8
- in r0, SREG
- cli
- out SPL, r30
- out SPH, r31
- out SREG, r0
- pop r17
- pop r16
- pop r13
- pop r12
- pop r11
- pop r10
- pop r9
- pop r8
- ret
-
-;###########################################################
-
-
-; block MUST NOT be larger than 64 bytes
-
-.global sha256_lastBlock
-; === sha256_lastBlock ===
-; this function does padding & Co. for calculating SHA-256 hashes
-; param1: the 16-bit pointer to sha256_ctx structure
-; given in r25,r24 (r25 is most significant)
-; param2: an 16-bit pointer to 64 byte block to hash
-; given in r23,r22
-; param3: an 16-bit integer specifing length of block in bits
-; given in r21,r20
-sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
-
-
-sha256_lastBlock:
- tst r20
- brne sha256_lastBlock_prolog
- cpi r21, 0x02
- brne sha256_lastBlock_prolog
- push r25
- push r24
- push r23
- push r22
- rcall sha256_nextBlock
- pop r22
- pop r23
- pop r24
- pop r25
- clr r21
- clr r22
-sha256_lastBlock_prolog:
- /* allocate space on stack */
- in r30, SPL
- in r31, SPH
- in r1, SREG
- subi r30, lo8(64)
- sbci r31, hi8(64)
- cli
- out SPL, r30
- out SPH, r31
- out SREG,r1
-
- adiw r30, 1 /* SP points to next free byte on stack */
- mov r18, r20 /* r20 = LSB(length) */
- lsr r18
- lsr r18
- lsr r18
- bst r21, 0 /* may be we should explain this ... */
- bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
-
-
- movw r26, r22 /* X points to begin of msg */
- tst r18
- breq sha256_lastBlock_post_copy
- mov r1, r18
-sha256_lastBlock_copy_loop:
- ld r0, X+
- st Z+, r0
- dec r1
- brne sha256_lastBlock_copy_loop
-sha256_lastBlock_post_copy:
-sha256_lastBlock_insert_stuffing_bit:
- ldi r19, 0x80
- mov r0,r19
- ldi r19, 0x07
- and r19, r20 /* if we are in bitmode */
- breq 2f /* no bitmode */
-1:
- lsr r0
- dec r19
- brne 1b
- ld r19, X
-/* maybe we should do some ANDing here, just for safety */
- or r0, r19
-2:
- st Z+, r0
- inc r18
-
-/* checking stuff here */
- cpi r18, 64-8+1
- brsh 0f
- rjmp sha256_lastBlock_insert_zeros
-0:
- /* oh shit, we landed here */
- /* first we have to fill it up with zeros */
- ldi r19, 64
- sub r19, r18
- breq 2f
-1:
- st Z+, r1
- dec r19
- brne 1b
-2:
- sbiw r30, 63
- sbiw r30, 1
- movw r22, r30
-
- push r31
- push r30
- push r25
- push r24
- push r21
- push r20
- rcall sha256_nextBlock
- pop r20
- pop r21
- pop r24
- pop r25
- pop r30
- pop r31
-
- /* now we should subtract 512 from length */
- movw r26, r24
- adiw r26, 4*8+1 /* we can skip the lowest byte */
- ld r19, X
- subi r19, hi8(512)
- st X+, r19
- ldi r18, 6
-1:
- ld r19, X
- sbci r19, 0
- st X+, r19
- dec r18
- brne 1b
-
-; clr r18 /* not neccessary ;-) */
- /* reset Z pointer to begin of block */
-
-sha256_lastBlock_insert_zeros:
- ldi r19, 64-8
- sub r19, r18
- breq sha256_lastBlock_insert_length
- clr r1
-1:
- st Z+, r1 /* r1 is still zero */
- dec r19
- brne 1b
-
-; rjmp sha256_lastBlock_epilog
-sha256_lastBlock_insert_length:
- movw r26, r24 /* X points to state */
- adiw r26, 8*4 /* X points to (state.length) */
- adiw r30, 8 /* Z points one after the last byte of block */
- ld r0, X+
- add r0, r20
- st -Z, r0
- ld r0, X+
- adc r0, r21
- st -Z, r0
- ldi r19, 6
-1:
- ld r0, X+
- adc r0, r1
- st -Z, r0
- dec r19
- brne 1b
-
- sbiw r30, 64-8
- movw r22, r30
- rcall sha256_nextBlock
-
-sha256_lastBlock_epilog:
- in r30, SPL
- in r31, SPH
- in r1, SREG
- adiw r30, 63 ; lo8(64)
- adiw r30, 1 ; hi8(64)
- cli
- out SPL, r30
- out SPH, r31
- out SREG,r1
- clr r1
- clr r0
- ret
-
-/**/
-;###########################################################
-
-.global sha256_nextBlock
-; === sha256_nextBlock ===
-; this is the core function for calculating SHA-256 hashes
-; param1: the 16-bit pointer to sha256_ctx structure
-; given in r25,r24 (r25 is most significant)
-; param2: an 16-bit pointer to 64 byte block to hash
-; given in r23,r22
-sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
-
-Bck1 = 12
-Bck2 = 13
-Bck3 = 14
-Bck4 = 15
-Func1 = 22
-Func2 = 23
-Func3 = 24
-Func4 = 25
-Accu1 = 16
-Accu2 = 17
-Accu3 = 18
-Accu4 = 19
-XAccu1 = 8
-XAccu2 = 9
-XAccu3 = 10
-XAccu4 = 11
-T1 = 4
-T2 = 5
-T3 = 6
-T4 = 7
-LoopC = 1
-/* byteorder: high number <--> high significance */
-sha256_nextBlock:
- ; initial, let's make some space ready for local vars
- push r4 /* replace push & pop by mem ops? */
- push r5
- push r6
- push r7
- push r8
- push r9
- push r10
- push r11
- push r12
- push r13
- push r14
- push r15
- push r16
- push r17
- push r28
- push r29
- in r20, SPL
- in r21, SPH
- movw r18, r20 ;backup SP
-; movw r26, r20 ; X points to free space on stack
- movw r30, r22 ; Z points to message
- subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
- sbci r21, hi8(sha256_nextBlock_localSpace)
- movw r26, r20 ; X points to free space on stack
- in r0, SREG
- cli ; we want to be uninterrupted while updating SP
- out SPL, r20
- out SPH, r21
- out SREG, r0
- push r18
- push r19
- push r24
- push r25 /* param1 will be needed later */
- ; now we fill the w array with message (think about endianess)
- adiw r26, 1 ; X++
- ldi r20, 16
-sha256_nextBlock_wcpyloop:
- ld r23, Z+
- ld r22, Z+
- ld r19, Z+
- ld r18, Z+
- st X+, r18
- st X+, r19
- st X+, r22
- st X+, r23
- dec r20
- brne sha256_nextBlock_wcpyloop
-/* for (i=16; i<64; ++i){
- w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
- } */
- /* r25,r24,r23,r24 (r21,r20) are function values
- r19,r18,r17,r16 are the accumulator
- r15,r14,r13,rBck1 are backup1
- r11,r10,r9 ,r8 are xor accu
- r1 is round counter */
-
- ldi r20, 64-16
- mov LoopC, r20
-sha256_nextBlock_wcalcloop:
- movw r30, r26 ; cp X to Z
- sbiw r30, 63
- sbiw r30, 1 ; substract 64 = 16*4
- ld Accu1, Z+
- ld Accu2, Z+
- ld Accu3, Z+
- ld Accu4, Z+ /* w[i] = w[i-16] */
- ld Bck1, Z+
- ld Bck2, Z+
- ld Bck3, Z+
- ld Bck4, Z+ /* backup = w[i-15] */
- /* now sigma 0 */
- mov Func1, Bck2
- mov Func2, Bck3
- mov Func3, Bck4
- mov Func4, Bck1 /* prerotated by 8 */
- ldi r20, 1
- rcall bitrotl
- movw XAccu1, Func1
- movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
- movw Func1, Bck3
- movw Func3, Bck1 /* prerotated by 16 */
- ldi r20, 2
- rcall bitrotr
- eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4
- ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/
-sigma0_shr:
- lsr Bck4
- ror Bck3
- ror Bck2
- ror Bck1
- dec Func2
- brne sigma0_shr
- eor XAccu1, Bck1
- eor XAccu2, Bck2
- eor XAccu3, Bck3
- eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */
- add Accu1, XAccu1
- adc Accu2, XAccu2
- adc Accu3, XAccu3
- adc Accu4, XAccu4 /* finished with sigma0 */
- ldd Func1, Z+7*4 /* now accu += w[i-7] */
- ldd Func2, Z+7*4+1
- ldd Func3, Z+7*4+2
- ldd Func4, Z+7*4+3
- add Accu1, Func1
- adc Accu2, Func2
- adc Accu3, Func3
- adc Accu4, Func4
- ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
- ldd Bck2, Z+12*4+1
- ldd Bck3, Z+12*4+2
- ldd Bck4, Z+12*4+3
- /* now sigma 1 */
- movw Func1, Bck3
- movw Func3, Bck1 /* prerotated by 16 */
- ldi r20, 1
- rcall bitrotr
- movw XAccu3, Func3
- movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */
-; movw Func1, Bck3
-; movw Func3, Bck1 /* prerotated by 16 */
- ldi r20, 2
- rcall bitrotr
- eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4
- ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
-sigma1_shr:
- lsr Bck4
- ror Bck3
- ror Bck2
- dec Func2
- brne sigma1_shr
- eor XAccu1, Bck2
- eor XAccu2, Bck3
- eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */
- add Accu1, XAccu1
- adc Accu2, XAccu2
- adc Accu3, XAccu3
- adc Accu4, XAccu4 /* finished with sigma0 */
- /* now let's store the shit */
- st X+, Accu1
- st X+, Accu2
- st X+, Accu3
- st X+, Accu4
- dec LoopC
- breq 3f ; skip if zero
- rjmp sha256_nextBlock_wcalcloop
-3:
- /* we are finished with w array X points one byte post w */
-/* init a array */
- pop r31
- pop r30
- push r30
- push r31
- ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
-init_a_array:
- ld r1, Z+
- st X+, r1
- dec r25
- brne init_a_array
-
-/* now the real fun begins */
-/* for (i=0; i<64; ++i){
- t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
- t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
- memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
- a[4] += t1;
- a[0] = t1 + t2;
- } */
- /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
- sbiw r26, 8*4 /* X still points at a[7]+1*/
- movw r28, r26
- ldi r30, lo8(sha256_kv)
- ldi r31, hi8(sha256_kv)
- dec r27 /* X - (64*4 == 256) */
- ldi r25, 64
- mov LoopC, r25
-sha256_main_loop:
- /* now calculate t1 */
- /*CH(x,y,z) = (x&y)^((~x)&z)*/
- ldd T1, Y+5*4
- ldd T2, Y+5*4+1
- ldd T3, Y+5*4+2
- ldd T4, Y+5*4+3 /* y in T */
- ldd Func1, Y+4*4
- ldd Func2, Y+4*4+1
- ldd Func3, Y+4*4+2
- ldd Func4, Y+4*4+3 /* x in Func */
- ldd Bck1, Y+6*4
- ldd Bck2, Y+6*4+1
- ldd Bck3, Y+6*4+2
- ldd Bck4, Y+6*4+3 /* z in Bck */
- and T1, Func1
- and T2, Func2
- and T3, Func3
- and T4, Func4
- com Func1
- com Func2
- com Func3
- com Func4
- and Bck1, Func1
- and Bck2, Func2
- and Bck3, Func3
- and Bck4, Func4
- eor T1, Bck1
- eor T2, Bck2
- eor T3, Bck3
- eor T4, Bck4 /* done, CH(x,y,z) is in T */
- /* now SIGMA1(a[4]) */
- ldd Bck4, Y+4*4 /* think about using it from Func reg above*/
- ldd Bck1, Y+4*4+1
- ldd Bck2, Y+4*4+2
- ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
- movw Func1, Bck1
- movw Func3, Bck3
- ldi r20, 2
- rcall bitrotl /* rotr(x,6) */
- movw XAccu1, Func1
- movw XAccu3, Func3
- movw Func1, Bck1
- movw Func3, Bck3
- ldi r20, 3
- rcall bitrotr /* rotr(x,11) */
- eor XAccu1, Func1
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4
- movw Func1, Bck3 /* this prerotates furteh 16 bits*/
- movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
- ldi r20, 1
- rcall bitrotr /* rotr(x,11) */
- eor XAccu1, Func1
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
- add T1, XAccu1
- adc T2, XAccu2
- adc T3, XAccu3
- adc T4, XAccu4
- /* now we've to add a[7], w[i] and k[i] */
- ldd XAccu1, Y+4*7
- ldd XAccu2, Y+4*7+1
- ldd XAccu3, Y+4*7+2
- ldd XAccu4, Y+4*7+3
- add T1, XAccu1
- adc T2, XAccu2
- adc T3, XAccu3
- adc T4, XAccu4 /* add a[7] */
- ld XAccu1, X+
- ld XAccu2, X+
- ld XAccu3, X+
- ld XAccu4, X+
- add T1, XAccu1
- adc T2, XAccu2
- adc T3, XAccu3
- adc T4, XAccu4 /* add w[i] */
- lpm XAccu1, Z+
- lpm XAccu2, Z+
- lpm XAccu3, Z+
- lpm XAccu4, Z+
- add T1, XAccu1
- adc T2, XAccu2
- adc T3, XAccu3
- adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
- /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/
- /* starting with MAJ(x,y,z) */
- ldd Func1, Y+4*0+0
- ldd Func2, Y+4*0+1
- ldd Func3, Y+4*0+2
- ldd Func4, Y+4*0+3 /* load x=a[0] */
- ldd XAccu1, Y+4*1+0
- ldd XAccu2, Y+4*1+1
- ldd XAccu3, Y+4*1+2
- ldd XAccu4, Y+4*1+3 /* load y=a[1] */
- and XAccu1, Func1
- and XAccu2, Func2
- and XAccu3, Func3
- and XAccu4, Func4 /* XAccu == (x & y) */
- ldd Bck1, Y+4*2+0
- ldd Bck2, Y+4*2+1
- ldd Bck3, Y+4*2+2
- ldd Bck4, Y+4*2+3 /* load z=a[2] */
- and Func1, Bck1
- and Func2, Bck2
- and Func3, Bck3
- and Func4, Bck4
- eor XAccu1, Func1
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
- ldd Func1, Y+4*1+0
- ldd Func2, Y+4*1+1
- ldd Func3, Y+4*1+2
- ldd Func4, Y+4*1+3 /* load y=a[1] */
- and Func1, Bck1
- and Func2, Bck2
- and Func3, Bck3
- and Func4, Bck4
- eor XAccu1, Func1
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
- /* SIGMA0(a[0]) */
- ldd Bck1, Y+4*0+0 /* we should combine this with above */
- ldd Bck2, Y+4*0+1
- ldd Bck3, Y+4*0+2
- ldd Bck4, Y+4*0+3
- movw Func1, Bck1
- movw Func3, Bck3
- ldi r20, 2
- rcall bitrotr
- movw Accu1, Func1
- movw Accu3, Func3 /* Accu = shr(a[0], 2) */
- movw Func1, Bck3
- movw Func3, Bck1 /* prerotate by 16 bits */
- ldi r20, 3
- rcall bitrotl
- eor Accu1, Func1
- eor Accu2, Func2
- eor Accu3, Func3
- eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */
- mov Func1, Bck4
- mov Func2, Bck1
- mov Func3, Bck2
- mov Func4, Bck3 /* prerotate by 24 bits */
- ldi r20, 2
- rcall bitrotl
- eor Accu1, Func1
- eor Accu2, Func2
- eor Accu3, Func3
- eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */
- add Accu1, XAccu1 /* add previous result (MAJ)*/
- adc Accu2, XAccu2
- adc Accu3, XAccu3
- adc Accu4, XAccu4
- /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
- /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
-
- ldi r21, 7*4
- adiw r28, 7*4
-a_shift_loop:
- ld r25, -Y /* warning: this is PREdecrement */
- std Y+4, r25
- dec r21
- brne a_shift_loop
-
- ldd Bck1, Y+4*4+0
- ldd Bck2, Y+4*4+1
- ldd Bck3, Y+4*4+2
- ldd Bck4, Y+4*4+3
- add Bck1, T1
- adc Bck2, T2
- adc Bck3, T3
- adc Bck4, T4
- std Y+4*4+0, Bck1
- std Y+4*4+1, Bck2
- std Y+4*4+2, Bck3
- std Y+4*4+3, Bck4
- add Accu1, T1
- adc Accu2, T2
- adc Accu3, T3
- adc Accu4, T4
- std Y+4*0+0, Accu1
- std Y+4*0+1, Accu2
- std Y+4*0+2, Accu3
- std Y+4*0+3, Accu4 /* a array updated */
-
-
- dec LoopC
- breq update_state
- rjmp sha256_main_loop ;brne sha256_main_loop
-update_state:
- /* update state */
- /* pointers to state should still exist on the stack ;-) */
- pop r31
- pop r30
- ldi r21, 8
-update_state_loop:
- ldd Accu1, Z+0
- ldd Accu2, Z+1
- ldd Accu3, Z+2
- ldd Accu4, Z+3
- ld Func1, Y+
- ld Func2, Y+
- ld Func3, Y+
- ld Func4, Y+
- add Accu1, Func1
- adc Accu2, Func2
- adc Accu3, Func3
- adc Accu4, Func4
- st Z+, Accu1
- st Z+, Accu2
- st Z+, Accu3
- st Z+, Accu4
- dec r21
- brne update_state_loop
- /* now we just have to update the length */
- adiw r30, 1 /* since we add 512, we can simply skip the LSB */
- ldi r21, 2
- ldi r22, 6
- ld r20, Z
- add r20, r21
- st Z+, r20
- clr r21
-sha256_nextBlock_fix_length:
- brcc sha256_nextBlock_epilog
- ld r20, Z
- adc r20, r21
- st Z+, r20
- dec r22
- brne sha256_nextBlock_fix_length
-
-; EPILOG
-sha256_nextBlock_epilog:
-/* now we should clean up the stack */
-
- pop r21
- pop r20
- in r0, SREG
- cli ; we want to be uninterrupted while updating SP
- out SPL, r20
- out SPH, r21
- out SREG, r0
-
- clr r1
- pop r29
- pop r28
- pop r17
- pop r16
- pop r15
- pop r14
- pop r13
- pop r12
- pop r11
- pop r10
- pop r9
- pop r8
- pop r7
- pop r6
- pop r5
- pop r4
- ret
-
-sha256_kv: ; round-key-vector stored in ProgMem
-.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
-.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
-.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
-.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
-.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
-.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
-.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
-.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
-
-
-;###########################################################
-
-.global sha256_init
-;uint32_t sha256_init_vector[]={
-; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
-;
-;void sha256_init(sha256_ctx_t *state){
-; state->length=0;
-; memcpy(state->h, sha256_init_vector, 8*4);
-;}
-; param1: (Func3,r24) 16-bit pointer to sha256_ctx_t struct in ram
-; modifys: Z(r30,r31), Func1, r22
-sha256_init:
- movw r26, r24 ; (24,25) --> (26,27) load X with param1
- ldi r30, lo8((sha256_init_vector))
- ldi r31, hi8((sha256_init_vector))
- ldi r22, 32
-sha256_init_vloop:
- lpm r23, Z+
- st X+, r23
- dec r22
- brne sha256_init_vloop
- ldi r22, 8
- clr r1 ;this should not be needed
-sha256_init_lloop:
- st X+, r1
- dec r22
- brne sha256_init_lloop
- ret
-
-sha256_init_vector:
-.word 0xE667, 0x6A09
-.word 0xAE85, 0xBB67
-.word 0xF372, 0x3C6E
-.word 0xF53A, 0xA54F
-.word 0x527F, 0x510E
-.word 0x688C, 0x9B05
-.word 0xD9AB, 0x1F83
-.word 0xCD19, 0x5BE0
-
-;###########################################################
-
-.global rotl32
-; === ROTL32 ===
-; function that rotates a 32 bit word to the left
-; param1: the 32-bit word to rotate
-; given in r25,r24,r23,r22 (r25 is most significant)
-; param2: an 8-bit value telling how often to rotate
-; given in r20
-; modifys: r21, r22
-rotl32:
- cpi r20, 8
- brlo bitrotl
- mov r21, r25
- mov r25, r24
- mov r24, r23
- mov r23, r22
- mov r22, r21
- subi r20, 8
- rjmp rotr32
-bitrotl:
- clr r21
- clc
-bitrotl_loop:
- tst r20
- breq fixrotl
- rol r22
- rol r23
- rol r24
- rol r25
- rol r21
- dec r20
- rjmp bitrotl_loop
-fixrotl:
- or r22, r21
- ret
-
-
-;###########################################################
-
-.global rotr32
-; === ROTR32 ===
-; function that rotates a 32 bit word to the right
-; param1: the 32-bit word to rotate
-; given in r25,r24,r23,22 (r25 is most significant)
-; param2: an 8-bit value telling how often to rotate
-; given in r20
-; modifys: r21, r22
-rotr32:
- cpi r20, 8
- brlo bitrotr
- mov r21, r22
- mov r22, r23
- mov r23, r24
- mov r24, r25
- mov r25, r21
- subi r20, 8
- rjmp rotr32
-bitrotr:
- clr r21
- clc
-bitrotr_loop:
- tst r20
- breq fixrotr
- ror r25
- ror r24
- ror r23
- ror r22
- ror r21
- dec r20
- rjmp bitrotr_loop
-fixrotr:
- or r25, r21
- ret
-
-
-;###########################################################
-
-.global change_endian32
-; === change_endian32 ===
-; function that changes the endianess of a 32-bit word
-; param1: the 32-bit word
-; given in r25,r24,r23,22 (r25 is most significant)
-; modifys: r21, r22
-change_endian32:
- movw r20, r22 ; (r22,r23) --> (r20,r21)
- mov r22, r25
- mov r23, r24
- mov r24, r21
- mov r25, r20
- ret
-
+/*
+ * Author: Daniel Otte
+ *
+ * License: GPLv3 or later
+*/
+; sha-256 implementation in assembler
+SHA256_BLOCK_BITS = 512
+SHA256_HASH_BITS = 256
+
+.macro precall
+ /* push r18 - r27, r30 - r31*/
+ push r0
+ push r1
+ push r18
+ push r19
+ push r20
+ push r21
+ push r22
+ push r23
+ push r24
+ push r25
+ push r26
+ push r27
+ push r30
+ push r31
+ clr r1
+.endm
+
+.macro postcall
+ pop r31
+ pop r30
+ pop r27
+ pop r26
+ pop r25
+ pop r24
+ pop r23
+ pop r22
+ pop r21
+ pop r20
+ pop r19
+ pop r18
+ pop r1
+ pop r0
+.endm
+
+
+.macro hexdump length
+ push r27
+ push r26
+ ldi r25, '\r'
+ mov r24, r25
+ call uart_putc
+ ldi r25, '\n'
+ mov r24, r25
+ call uart_putc
+ pop r26
+ pop r27
+ movw r24, r26
+.if \length > 16
+ ldi r22, lo8(16)
+ ldi r23, hi8(16)
+ push r27
+ push r26
+ call uart_hexdump
+ pop r26
+ pop r27
+ adiw r26, 16
+ hexdump \length-16
+.else
+ ldi r22, lo8(\length)
+ ldi r23, hi8(\length)
+ call uart_hexdump
+.endif
+.endm
+
+/* X points to Block */
+.macro dbg_hexdump length
+ precall
+ hexdump \length
+ postcall
+.endm
+
+.section .text
+
+SPL = 0x3D
+SPH = 0x3E
+SREG = 0x3F
+
+
+;
+;sha256_ctx_t is:
+;
+; [h0][h1][h2][h3][h4][h5][h6][h7][length]
+; hn is 32 bit large, length is 64 bit large
+
+;###########################################################
+
+.global sha256_ctx2hash
+; === sha256_ctx2hash ===
+; this function converts a state into a normal hash (bytestring)
+; param1: the 16-bit destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to sha256_ctx structure
+; given in r23,r22
+sha256_ctx2hash:
+ movw r26, r22
+ movw r30, r24
+ ldi r21, 8
+ sbiw r26, 4
+1:
+ ldi r20, 4
+ adiw r26, 8
+2:
+ ld r0, -X
+ st Z+, r0
+ dec r20
+ brne 2b
+
+ dec r21
+ brne 1b
+
+ ret
+
+;###########################################################
+
+.global sha256
+; === sha256 ===
+; this function calculates SHA-256 hashes from messages in RAM
+; param1: the 16-bit hash destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to message
+; given in r23,r22
+; param3: 32-bit length value (length of message in bits)
+; given in r21,r20,r19,r18
+sha256:
+sha256_prolog:
+ push r8
+ push r9
+ push r10
+ push r11
+ push r12
+ push r13
+ push r16
+ push r17
+ in r16, SPL
+ in r17, SPH
+ subi r16, 8*4+8
+ sbci r17, 0
+ in r0, SREG
+ cli
+ out SPL, r16
+ out SPH, r17
+ out SREG, r0
+
+ push r25
+ push r24
+ inc r16
+ adc r17, r1
+
+ movw r8, r18 /* backup of length*/
+ movw r10, r20
+
+	movw r12, r22 /* backup of msg-ptr */
+
+ movw r24, r16
+ rcall sha256_init
+ /* if length >= 512 */
+1:
+ tst r11
+ brne 4f
+ tst r10
+ brne 4f
+ mov r19, r9
+ cpi r19, 0x02
+ brlo 4f
+
+ movw r24, r16
+ movw r22, r12
+ rcall sha256_nextBlock
+ ldi r19, 0x64
+ add r22, r19
+ adc r23, r1
+ /* length -= 512 */
+ ldi r19, 0x02
+ sub r9, r19
+ sbc r10, r1
+ sbc r11, r1
+ rjmp 1b
+
+4:
+ movw r24, r16
+ movw r22, r12
+ movw r20, r8
+ rcall sha256_lastBlock
+
+ pop r24
+ pop r25
+ movw r22, r16
+ rcall sha256_ctx2hash
+
+sha256_epilog:
+ in r30, SPL
+ in r31, SPH
+ adiw r30, 8*4+8
+ in r0, SREG
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG, r0
+ pop r17
+ pop r16
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ ret
+
+;###########################################################
+
+
+; block MUST NOT be larger than 64 bytes
+
+.global sha256_lastBlock
+; === sha256_lastBlock ===
+; this function does padding & Co. for calculating SHA-256 hashes
+; param1: the 16-bit pointer to sha256_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: an 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+; param3: a 16-bit integer specifying length of block in bits
+; given in r21,r20
+sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
+
+
+sha256_lastBlock:
+ tst r20
+ brne sha256_lastBlock_prolog
+ cpi r21, 0x02
+ brne sha256_lastBlock_prolog
+ push r25
+ push r24
+ push r23
+ push r22
+ rcall sha256_nextBlock
+ pop r22
+ pop r23
+ pop r24
+ pop r25
+ clr r21
+ clr r22
+sha256_lastBlock_prolog:
+ /* allocate space on stack */
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ subi r30, lo8(64)
+ sbci r31, hi8(64)
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+
+ adiw r30, 1 /* SP points to next free byte on stack */
+ mov r18, r20 /* r20 = LSB(length) */
+ lsr r18
+ lsr r18
+ lsr r18
+	bst r21, 0 /* maybe we should explain this ... */
+ bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
+
+
+ movw r26, r22 /* X points to begin of msg */
+ tst r18
+ breq sha256_lastBlock_post_copy
+ mov r1, r18
+sha256_lastBlock_copy_loop:
+ ld r0, X+
+ st Z+, r0
+ dec r1
+ brne sha256_lastBlock_copy_loop
+sha256_lastBlock_post_copy:
+sha256_lastBlock_insert_stuffing_bit:
+ ldi r19, 0x80
+ mov r0,r19
+ ldi r19, 0x07
+ and r19, r20 /* if we are in bitmode */
+ breq 2f /* no bitmode */
+1:
+ lsr r0
+ dec r19
+ brne 1b
+ ld r19, X
+/* maybe we should do some ANDing here, just for safety */
+ or r0, r19
+2:
+ st Z+, r0
+ inc r18
+
+/* checking stuff here */
+ cpi r18, 64-8+1
+ brsh 0f
+ rjmp sha256_lastBlock_insert_zeros
+0:
+ /* oh shit, we landed here */
+ /* first we have to fill it up with zeros */
+ ldi r19, 64
+ sub r19, r18
+ breq 2f
+1:
+ st Z+, r1
+ dec r19
+ brne 1b
+2:
+ sbiw r30, 63
+ sbiw r30, 1
+ movw r22, r30
+
+ push r31
+ push r30
+ push r25
+ push r24
+ push r21
+ push r20
+ rcall sha256_nextBlock
+ pop r20
+ pop r21
+ pop r24
+ pop r25
+ pop r30
+ pop r31
+
+ /* now we should subtract 512 from length */
+ movw r26, r24
+ adiw r26, 4*8+1 /* we can skip the lowest byte */
+ ld r19, X
+ subi r19, hi8(512)
+ st X+, r19
+ ldi r18, 6
+1:
+ ld r19, X
+ sbci r19, 0
+ st X+, r19
+ dec r18
+ brne 1b
+
+; clr r18 /* not necessary ;-) */
+ /* reset Z pointer to begin of block */
+
+sha256_lastBlock_insert_zeros:
+ ldi r19, 64-8
+ sub r19, r18
+ breq sha256_lastBlock_insert_length
+ clr r1
+1:
+ st Z+, r1 /* r1 is still zero */
+ dec r19
+ brne 1b
+
+; rjmp sha256_lastBlock_epilog
+sha256_lastBlock_insert_length:
+ movw r26, r24 /* X points to state */
+ adiw r26, 8*4 /* X points to (state.length) */
+ adiw r30, 8 /* Z points one after the last byte of block */
+ ld r0, X+
+ add r0, r20
+ st -Z, r0
+ ld r0, X+
+ adc r0, r21
+ st -Z, r0
+ ldi r19, 6
+1:
+ ld r0, X+
+ adc r0, r1
+ st -Z, r0
+ dec r19
+ brne 1b
+
+ sbiw r30, 64-8
+ movw r22, r30
+ rcall sha256_nextBlock
+
+sha256_lastBlock_epilog:
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ adiw r30, 63 ; lo8(64)
+ adiw r30, 1 ; hi8(64)
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+ clr r1
+ clr r0
+ ret
+
+/**/
+;###########################################################
+
+.global sha256_nextBlock
+; === sha256_nextBlock ===
+; this is the core function for calculating SHA-256 hashes
+; param1: the 16-bit pointer to sha256_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: an 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
+
+Bck1 = 12
+Bck2 = 13
+Bck3 = 14
+Bck4 = 15
+Func1 = 22
+Func2 = 23
+Func3 = 24
+Func4 = 25
+Accu1 = 16
+Accu2 = 17
+Accu3 = 18
+Accu4 = 19
+XAccu1 = 8
+XAccu2 = 9
+XAccu3 = 10
+XAccu4 = 11
+T1 = 4
+T2 = 5
+T3 = 6
+T4 = 7
+LoopC = 1
+/* byteorder: high number <--> high significance */
+sha256_nextBlock:
+ ; initial, let's make some space ready for local vars
+ push r4 /* replace push & pop by mem ops? */
+ push r5
+ push r6
+ push r7
+ push r8
+ push r9
+ push r10
+ push r11
+ push r12
+ push r13
+ push r14
+ push r15
+ push r16
+ push r17
+ push r28
+ push r29
+ in r20, SPL
+ in r21, SPH
+ movw r18, r20 ;backup SP
+; movw r26, r20 ; X points to free space on stack
+ movw r30, r22 ; Z points to message
+ subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
+ sbci r21, hi8(sha256_nextBlock_localSpace)
+ movw r26, r20 ; X points to free space on stack
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+ push r18
+ push r19
+ push r24
+ push r25 /* param1 will be needed later */
+ ; now we fill the w array with message (think about endianess)
+ adiw r26, 1 ; X++
+ ldi r20, 16
+sha256_nextBlock_wcpyloop:
+ ld r23, Z+
+ ld r22, Z+
+ ld r19, Z+
+ ld r18, Z+
+ st X+, r18
+ st X+, r19
+ st X+, r22
+ st X+, r23
+ dec r20
+ brne sha256_nextBlock_wcpyloop
+/* for (i=16; i<64; ++i){
+ w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
+ } */
+	/* r25,r24,r23,r22 (r21,r20) are function values
+ r19,r18,r17,r16 are the accumulator
+	   r15,r14,r13,r12 are backup1
+ r11,r10,r9 ,r8 are xor accu
+ r1 is round counter */
+
+ ldi r20, 64-16
+ mov LoopC, r20
+sha256_nextBlock_wcalcloop:
+ movw r30, r26 ; cp X to Z
+ sbiw r30, 63
+	sbiw r30, 1 ; subtract 64 = 16*4
+ ld Accu1, Z+
+ ld Accu2, Z+
+ ld Accu3, Z+
+ ld Accu4, Z+ /* w[i] = w[i-16] */
+ ld Bck1, Z+
+ ld Bck2, Z+
+ ld Bck3, Z+
+ ld Bck4, Z+ /* backup = w[i-15] */
+ /* now sigma 0 */
+ mov Func1, Bck2
+ mov Func2, Bck3
+ mov Func3, Bck4
+ mov Func4, Bck1 /* prerotated by 8 */
+ ldi r20, 1
+ rcall bitrotl
+ movw XAccu1, Func1
+ movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
+ movw Func1, Bck3
+ movw Func3, Bck1 /* prerotated by 16 */
+ ldi r20, 2
+ rcall bitrotr
+ eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4
+ ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/
+sigma0_shr:
+ lsr Bck4
+ ror Bck3
+ ror Bck2
+ ror Bck1
+ dec Func2
+ brne sigma0_shr
+ eor XAccu1, Bck1
+ eor XAccu2, Bck2
+ eor XAccu3, Bck3
+	eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma0(w[i-15]) */
+ add Accu1, XAccu1
+ adc Accu2, XAccu2
+ adc Accu3, XAccu3
+ adc Accu4, XAccu4 /* finished with sigma0 */
+ ldd Func1, Z+7*4 /* now accu += w[i-7] */
+ ldd Func2, Z+7*4+1
+ ldd Func3, Z+7*4+2
+ ldd Func4, Z+7*4+3
+ add Accu1, Func1
+ adc Accu2, Func2
+ adc Accu3, Func3
+ adc Accu4, Func4
+ ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
+ ldd Bck2, Z+12*4+1
+ ldd Bck3, Z+12*4+2
+ ldd Bck4, Z+12*4+3
+ /* now sigma 1 */
+ movw Func1, Bck3
+ movw Func3, Bck1 /* prerotated by 16 */
+ ldi r20, 1
+ rcall bitrotr
+ movw XAccu3, Func3
+ movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */
+; movw Func1, Bck3
+; movw Func3, Bck1 /* prerotated by 16 */
+ ldi r20, 2
+ rcall bitrotr
+ eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4
+ ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
+sigma1_shr:
+ lsr Bck4
+ ror Bck3
+ ror Bck2
+ dec Func2
+ brne sigma1_shr
+ eor XAccu1, Bck2
+ eor XAccu2, Bck3
+	eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-2]) */
+ add Accu1, XAccu1
+ adc Accu2, XAccu2
+ adc Accu3, XAccu3
+	adc Accu4, XAccu4 /* finished with sigma1 */
+ /* now let's store the shit */
+ st X+, Accu1
+ st X+, Accu2
+ st X+, Accu3
+ st X+, Accu4
+ dec LoopC
+ breq 3f ; skip if zero
+ rjmp sha256_nextBlock_wcalcloop
+3:
+ /* we are finished with w array X points one byte post w */
+/* init a array */
+ pop r31
+ pop r30
+ push r30
+ push r31
+ ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
+init_a_array:
+ ld r1, Z+
+ st X+, r1
+ dec r25
+ brne init_a_array
+
+/* now the real fun begins */
+/* for (i=0; i<64; ++i){
+ t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
+ t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
+ memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
+ a[4] += t1;
+ a[0] = t1 + t2;
+ } */
+ /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
+ sbiw r26, 8*4 /* X still points at a[7]+1*/
+ movw r28, r26
+ ldi r30, lo8(sha256_kv)
+ ldi r31, hi8(sha256_kv)
+ dec r27 /* X - (64*4 == 256) */
+ ldi r25, 64
+ mov LoopC, r25
+sha256_main_loop:
+ /* now calculate t1 */
+ /*CH(x,y,z) = (x&y)^((~x)&z)*/
+ ldd T1, Y+5*4
+ ldd T2, Y+5*4+1
+ ldd T3, Y+5*4+2
+ ldd T4, Y+5*4+3 /* y in T */
+ ldd Func1, Y+4*4
+ ldd Func2, Y+4*4+1
+ ldd Func3, Y+4*4+2
+ ldd Func4, Y+4*4+3 /* x in Func */
+ ldd Bck1, Y+6*4
+ ldd Bck2, Y+6*4+1
+ ldd Bck3, Y+6*4+2
+ ldd Bck4, Y+6*4+3 /* z in Bck */
+ and T1, Func1
+ and T2, Func2
+ and T3, Func3
+ and T4, Func4
+ com Func1
+ com Func2
+ com Func3
+ com Func4
+ and Bck1, Func1
+ and Bck2, Func2
+ and Bck3, Func3
+ and Bck4, Func4
+ eor T1, Bck1
+ eor T2, Bck2
+ eor T3, Bck3
+ eor T4, Bck4 /* done, CH(x,y,z) is in T */
+ /* now SIGMA1(a[4]) */
+ ldd Bck4, Y+4*4 /* think about using it from Func reg above*/
+ ldd Bck1, Y+4*4+1
+ ldd Bck2, Y+4*4+2
+ ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
+ movw Func1, Bck1
+ movw Func3, Bck3
+ ldi r20, 2
+ rcall bitrotl /* rotr(x,6) */
+ movw XAccu1, Func1
+ movw XAccu3, Func3
+ movw Func1, Bck1
+ movw Func3, Bck3
+ ldi r20, 3
+ rcall bitrotr /* rotr(x,11) */
+ eor XAccu1, Func1
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4
+	movw Func1, Bck3 /* this prerotates further 16 bits*/
+ movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
+ ldi r20, 1
+ rcall bitrotr /* rotr(x,11) */
+ eor XAccu1, Func1
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
+ add T1, XAccu1
+ adc T2, XAccu2
+ adc T3, XAccu3
+ adc T4, XAccu4
+ /* now we've to add a[7], w[i] and k[i] */
+ ldd XAccu1, Y+4*7
+ ldd XAccu2, Y+4*7+1
+ ldd XAccu3, Y+4*7+2
+ ldd XAccu4, Y+4*7+3
+ add T1, XAccu1
+ adc T2, XAccu2
+ adc T3, XAccu3
+ adc T4, XAccu4 /* add a[7] */
+ ld XAccu1, X+
+ ld XAccu2, X+
+ ld XAccu3, X+
+ ld XAccu4, X+
+ add T1, XAccu1
+ adc T2, XAccu2
+ adc T3, XAccu3
+ adc T4, XAccu4 /* add w[i] */
+ lpm XAccu1, Z+
+ lpm XAccu2, Z+
+ lpm XAccu3, Z+
+ lpm XAccu4, Z+
+ add T1, XAccu1
+ adc T2, XAccu2
+ adc T3, XAccu3
+ adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
+	/*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*I did too much x86 asm, I always see 4 32bit regs*/
+ /* starting with MAJ(x,y,z) */
+ ldd Func1, Y+4*0+0
+ ldd Func2, Y+4*0+1
+ ldd Func3, Y+4*0+2
+ ldd Func4, Y+4*0+3 /* load x=a[0] */
+ ldd XAccu1, Y+4*1+0
+ ldd XAccu2, Y+4*1+1
+ ldd XAccu3, Y+4*1+2
+ ldd XAccu4, Y+4*1+3 /* load y=a[1] */
+ and XAccu1, Func1
+ and XAccu2, Func2
+ and XAccu3, Func3
+ and XAccu4, Func4 /* XAccu == (x & y) */
+ ldd Bck1, Y+4*2+0
+ ldd Bck2, Y+4*2+1
+ ldd Bck3, Y+4*2+2
+ ldd Bck4, Y+4*2+3 /* load z=a[2] */
+ and Func1, Bck1
+ and Func2, Bck2
+ and Func3, Bck3
+ and Func4, Bck4
+ eor XAccu1, Func1
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
+ ldd Func1, Y+4*1+0
+ ldd Func2, Y+4*1+1
+ ldd Func3, Y+4*1+2
+ ldd Func4, Y+4*1+3 /* load y=a[1] */
+ and Func1, Bck1
+ and Func2, Bck2
+ and Func3, Bck3
+ and Func4, Bck4
+ eor XAccu1, Func1
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
+ /* SIGMA0(a[0]) */
+ ldd Bck1, Y+4*0+0 /* we should combine this with above */
+ ldd Bck2, Y+4*0+1
+ ldd Bck3, Y+4*0+2
+ ldd Bck4, Y+4*0+3
+ movw Func1, Bck1
+ movw Func3, Bck3
+ ldi r20, 2
+ rcall bitrotr
+ movw Accu1, Func1
+ movw Accu3, Func3 /* Accu = shr(a[0], 2) */
+ movw Func1, Bck3
+ movw Func3, Bck1 /* prerotate by 16 bits */
+ ldi r20, 3
+ rcall bitrotl
+ eor Accu1, Func1
+ eor Accu2, Func2
+ eor Accu3, Func3
+ eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */
+ mov Func1, Bck4
+ mov Func2, Bck1
+ mov Func3, Bck2
+ mov Func4, Bck3 /* prerotate by 24 bits */
+ ldi r20, 2
+ rcall bitrotl
+ eor Accu1, Func1
+ eor Accu2, Func2
+ eor Accu3, Func3
+ eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */
+ add Accu1, XAccu1 /* add previous result (MAJ)*/
+ adc Accu2, XAccu2
+ adc Accu3, XAccu3
+ adc Accu4, XAccu4
+ /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
+ /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
+
+ ldi r21, 7*4
+ adiw r28, 7*4
+a_shift_loop:
+ ld r25, -Y /* warning: this is PREdecrement */
+ std Y+4, r25
+ dec r21
+ brne a_shift_loop
+
+ ldd Bck1, Y+4*4+0
+ ldd Bck2, Y+4*4+1
+ ldd Bck3, Y+4*4+2
+ ldd Bck4, Y+4*4+3
+ add Bck1, T1
+ adc Bck2, T2
+ adc Bck3, T3
+ adc Bck4, T4
+ std Y+4*4+0, Bck1
+ std Y+4*4+1, Bck2
+ std Y+4*4+2, Bck3
+ std Y+4*4+3, Bck4
+ add Accu1, T1
+ adc Accu2, T2
+ adc Accu3, T3
+ adc Accu4, T4
+ std Y+4*0+0, Accu1
+ std Y+4*0+1, Accu2
+ std Y+4*0+2, Accu3
+ std Y+4*0+3, Accu4 /* a array updated */
+
+
+ dec LoopC
+ breq update_state
+ rjmp sha256_main_loop ;brne sha256_main_loop
+update_state:
+ /* update state */
+ /* pointers to state should still exist on the stack ;-) */
+ pop r31
+ pop r30
+ ldi r21, 8
+update_state_loop:
+ ldd Accu1, Z+0
+ ldd Accu2, Z+1
+ ldd Accu3, Z+2
+ ldd Accu4, Z+3
+ ld Func1, Y+
+ ld Func2, Y+
+ ld Func3, Y+
+ ld Func4, Y+
+ add Accu1, Func1
+ adc Accu2, Func2
+ adc Accu3, Func3
+ adc Accu4, Func4
+ st Z+, Accu1
+ st Z+, Accu2
+ st Z+, Accu3
+ st Z+, Accu4
+ dec r21
+ brne update_state_loop
+ /* now we just have to update the length */
+ adiw r30, 1 /* since we add 512, we can simply skip the LSB */
+ ldi r21, 2
+ ldi r22, 6
+ ld r20, Z
+ add r20, r21
+ st Z+, r20
+ clr r21
+sha256_nextBlock_fix_length:
+ brcc sha256_nextBlock_epilog
+ ld r20, Z
+ adc r20, r21
+ st Z+, r20
+ dec r22
+ brne sha256_nextBlock_fix_length
+
+; EPILOG
+sha256_nextBlock_epilog:
+/* now we should clean up the stack */
+
+ pop r21
+ pop r20
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+
+ clr r1
+ pop r29
+ pop r28
+ pop r17
+ pop r16
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ pop r7
+ pop r6
+ pop r5
+ pop r4
+ ret
+
+sha256_kv: ; round-key-vector stored in ProgMem
+.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
+.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
+.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
+.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
+.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
+.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
+.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
+.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
+
+
+;###########################################################
+
+.global sha256_init
+;uint32_t sha256_init_vector[]={
+; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
+;
+;void sha256_init(sha256_ctx_t *state){
+; state->length=0;
+; memcpy(state->h, sha256_init_vector, 8*4);
+;}
+; param1: (Func3,r24) 16-bit pointer to sha256_ctx_t struct in ram
+; modifies: Z(r30,r31), Func1, r22
+sha256_init:
+ movw r26, r24 ; (24,25) --> (26,27) load X with param1
+ ldi r30, lo8((sha256_init_vector))
+ ldi r31, hi8((sha256_init_vector))
+ ldi r22, 32
+sha256_init_vloop:
+ lpm r23, Z+
+ st X+, r23
+ dec r22
+ brne sha256_init_vloop
+ ldi r22, 8
+ clr r1 ;this should not be needed
+sha256_init_lloop:
+ st X+, r1
+ dec r22
+ brne sha256_init_lloop
+ ret
+
+sha256_init_vector:
+.word 0xE667, 0x6A09
+.word 0xAE85, 0xBB67
+.word 0xF372, 0x3C6E
+.word 0xF53A, 0xA54F
+.word 0x527F, 0x510E
+.word 0x688C, 0x9B05
+.word 0xD9AB, 0x1F83
+.word 0xCD19, 0x5BE0
+
+;###########################################################
+
+.global rotl32
+; === ROTL32 ===
+; function that rotates a 32 bit word to the left
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifies: r21, r22
+rotl32:
+ cpi r20, 8
+ brlo bitrotl
+ mov r21, r25
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotl:
+ clr r21
+ clc
+bitrotl_loop:
+ tst r20
+ breq fixrotl
+ rol r22
+ rol r23
+ rol r24
+ rol r25
+ rol r21
+ dec r20
+ rjmp bitrotl_loop
+fixrotl:
+ or r22, r21
+ ret
+
+
+;###########################################################
+
+.global rotr32
+; === ROTR32 ===
+; function that rotates a 32 bit word to the right
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifies: r21, r22
+rotr32:
+ cpi r20, 8
+ brlo bitrotr
+ mov r21, r22
+ mov r22, r23
+ mov r23, r24
+ mov r24, r25
+ mov r25, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotr:
+ clr r21
+ clc
+bitrotr_loop:
+ tst r20
+ breq fixrotr
+ ror r25
+ ror r24
+ ror r23
+ ror r22
+ ror r21
+ dec r20
+ rjmp bitrotr_loop
+fixrotr:
+ or r25, r21
+ ret
+
+
+;###########################################################
+
+.global change_endian32
+; === change_endian32 ===
+; function that changes the endianess of a 32-bit word
+; param1: the 32-bit word
+; given in r25,r24,r23,r22 (r25 is most significant)
+; modifies: r21, r22
+change_endian32:
+ movw r20, r22 ; (r22,r23) --> (r20,r21)
+ mov r22, r25
+ mov r23, r24
+ mov r24, r21
+ mov r25, r20
+ ret
+
diff --git a/xtea-asm.S b/xtea-asm.S
index f3c5b12..20f1d63 100644
--- a/xtea-asm.S
+++ b/xtea-asm.S
@@ -17,9 +17,9 @@
along with this program. If not, see .
*/
/* xtea-asm.S
- * Author: Daniel Otte
- * Date: 06.06.2006
- * License: GPL
+ * Author: Daniel Otte
+ * Date: 2006-06-06
+ * License: GPLv3 or later
* Implementation of XTEA for AVR
* include xtea.h in your C-Project to use this functions.
*/