diff --git a/A5_1.c b/A5_1.c
index 526e6cf..a22d185 100644
--- a/A5_1.c
+++ b/A5_1.c
@@ -17,10 +17,11 @@
along with this program. If not, see .
*/
/*
- * File: A5_1.c
- * Author: Daniel Otte
- * Date: 24.06.2006
- * License: GPL
+ * File: A5_1.c
+ * Author: Daniel Otte
+ * email: daniel.otte@rub.de
+ * Date: 2006-06-24
+ * License: GPLv3 or later
* Description: Implementation of the A5/1 stream cipher algorithm, as used in GSM.
* ! Warning, this is weak crypto !
*
diff --git a/Makefile b/Makefile
index 09457be..c8a52b7 100644
--- a/Makefile
+++ b/Makefile
@@ -30,6 +30,10 @@ PRG = remove_me
#-------------------------------------------------------------------------------
+all: $(foreach algo, $(ALGORITHMS), $(algo)_OBJ)
+
+#-------------------------------------------------------------------------------
+
define BLA_TEMPLATE2
$(2): $(3)
@echo "[gcc]: $$@"
@@ -151,11 +155,6 @@ $(foreach algo, $(ALGORITHMS),$(eval $(call FLASH_TEMPLATE, $(algo), \
$(patsubst %.o,%.hex,$(firstword $($(algo)_TEST_BIN)))) ))
#-------------------------------------------------------------------------------
-
-.PHONY: all
-all: $(foreach algo, $(ALGORITHMS), $(algo)_OBJ)
-#all: $(PRG).elf lst text eeprom
-
.PHONY: clean
clean:
diff --git a/arcfour-asm.S b/arcfour-asm.S
index eafd771..ec0eeeb 100644
--- a/arcfour-asm.S
+++ b/arcfour-asm.S
@@ -17,10 +17,10 @@
along with this program. If not, see .
*/
/*
- * File: arcfour-asm.S
- * Author: Daniel Otte
- * Date: 07.06.2006
- * License: GPL
+ * File: arcfour-asm.S
+ * Author: Daniel Otte
+ * Date: 2006-06-07
+ * License: GPLv3 or later
* Description: Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm.
*
*/
diff --git a/arcfour.c b/arcfour.c
index abed9dd..f8d01a6 100644
--- a/arcfour.c
+++ b/arcfour.c
@@ -17,10 +17,11 @@
along with this program. If not, see .
*/
/*
- * File: arcfour.c
- * Author: Daniel Otte
- * Date: 07.06.2006
- * License: GPL
+ * File: arcfour.c
+ * Author: Daniel Otte
+ * email: daniel.otte@rub.de
+ * Date: 2006-06-07
+ * License: GPLv3 or later
* Description: Implementation of the ARCFOUR (RC4 compatible) stream cipher algorithm.
*
*/
diff --git a/camellia-asm.S b/camellia-asm.S
index bcc190e..6e58ca5 100644
--- a/camellia-asm.S
+++ b/camellia-asm.S
@@ -17,10 +17,10 @@
along with this program. If not, see .
*/
/*
- * File: camellis-asm.S
- * Author: Daniel Otte
- * Date: 10.11.2006
- * License: GPL
+ * File: camellia-asm.S
+ * Author: Daniel Otte
+ * Date: 2006-11-10
+ * License: GPLv3 or later
* Description: Implementation of the camellia block cipher algorithm.
*
*/
diff --git a/cast5.c b/cast5.c
index a938bfb..51e9e93 100644
--- a/cast5.c
+++ b/cast5.c
@@ -19,9 +19,10 @@
/*
* \file cast5.c
* \author Daniel Otte
- * \date 26.07.2006
+ * \email daniel.otte@rub.de
+ * \date 2006-07-26
* \par License:
- * GPL
+ * GPLv3 or later
* \brief Implementation of the CAST5 (aka CAST-128) cipher algorithm as described in RFC 2144
*
*/
diff --git a/cli.c b/cli.c
index 1b5467c..8c15f5f 100644
--- a/cli.c
+++ b/cli.c
@@ -20,7 +20,7 @@
*
* author: Daniel Otte
* email: daniel.otte@rub.de
- * license: GPLv3
+ * license: GPLv3 or later
*
* components to help implementing simple command based interaction
*
diff --git a/des.c b/des.c
index dc16750..d4b8ce1 100644
--- a/des.c
+++ b/des.c
@@ -17,12 +17,13 @@
along with this program. If not, see .
*/
/**
- * \file des.c
- * \author Daniel Otte
- * \date 2007-06-16
- * \brief DES and EDE-DES implementation
+ * \file des.c
+ * \author Daniel Otte
+ * \email daniel.otte@rub.de
+ * \date 2007-06-16
+ * \brief DES and EDE-DES implementation
* \par License
- * GPL
+ * GPLv3 or later
*
*/
#include "config.h"
diff --git a/entropium.c b/entropium.c
index 1bbf583..fdbf13c 100644
--- a/entropium.c
+++ b/entropium.c
@@ -17,11 +17,12 @@
along with this program. If not, see .
*/
/**
- * \file entropium.c
- * \author Daniel Otte
- * \date 17.05.2006
+ * \file entropium.c
+ * \author Daniel Otte
+ * \email daniel.otte@rub.de
+ * \date 2006-05-17
* \par License:
- * GPL
+ * GPLv3 or later
* \brief This file contains an implementaition of a pseudo-random-number generator.
*
* Extension 1:
diff --git a/grain.c b/grain.c
index 612d18b..05571f4 100644
--- a/grain.c
+++ b/grain.c
@@ -20,7 +20,7 @@
*
* author: Daniel Otte
* email: daniel.otte@rub.de
- * license: GPLv3
+ * license: GPLv3 or later
*
*/
diff --git a/grain_h_lutgen.c b/grain_h_lutgen.c
deleted file mode 100644
index 4b5ede0..0000000
--- a/grain_h_lutgen.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/**
- *
- * author: Daniel Otte
- * email: daniel.otte@rub.de
- * license: GPLv3
- *
- * this program generate a lookuptable for the h-function in grain
- */
-
-#include
-#include
-
-#define X(i) ((x)>>((i)))
-uint8_t h(uint8_t x){
- uint8_t h;
-
- h = (X(1)) ^ (X(4)) ^
- (X(0)&X(3)) ^ (X(2)&X(3)) ^ (X(3)&X(4)) ^
- (X(0)&X(1)&X(2)) ^ (X(0)&X(2)&X(3)) ^ (X(0)&X(2)&X(4)) ^
- (X(1)&X(2)&X(4)) ^ (X(2)&X(3)&X(4)) ;
-
- return h&1;
-}
-
-int main(void){
- uint8_t i;
- uint32_t lut;
- puts(
- "/* \n"
- " * author: Daniel Otte \n"
- " * email: daniel.otte@rub.de \n"
- " * license: GPLv3 \n"
- " * \n"
- " * this program generate a lookuptable for the h-function in grain \n"
- " * \n"
- " */ \n");
- puts("/* \n"
- " * x0 x1 x2 x3 x4 - h");
-
- for(i=0; i<0x20; ++i){
- printf(" * %c %c %c %c %c - %c\n",
- (i&0x01)?'1':'0',
- (i&0x02)?'1':'0',
- (i&0x04)?'1':'0',
- (i&0x08)?'1':'0',
- (i&0x10)?'1':'0',
- (h(i))?'1':'0' );
- lut >>=1;
- lut |= h(i)?0x80000000:0x00000000;
- if(i%4==3){
- puts(" * --");
- }
- }
- puts(" */\n");
- printf(" uint8_t lut[4]= {0x%2.2X, 0x%2.2X, 0x%2.2X, 0x%2.2X} \n",
- lut&0xFF, (lut>>8)&0xFF, (lut>>16)&0xFF, (lut>>24)&0xFF);
-
- return 0;
-}
-
diff --git a/grain_nfsr_lutgen.c b/grain_nfsr_lutgen.c
deleted file mode 100644
index 9b9277d..0000000
--- a/grain_nfsr_lutgen.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/**
- *
- * author: Daniel Otte
- * email: daniel.otte@rub.de
- * license: GPLv3
- *
- * this program generate a lookuptable for the nfsr-feedback-function in grain
- */
-
-#include
-#include
-
-#define X(i) ((x)>>((i)))
-#define B63 X(0)
-#define B60 X(3)
-#define B52 X(5)
-#define B45 X(6)
-#define B37 X(4)
-#define B33 X(8)
-#define B28 X(2)
-#define B21 X(9)
-#define B15 X(1)
-#define B09 X(7)
-
-uint8_t g(uint16_t x){
- uint8_t a,b,d,e;
- uint8_t ret;
-
- ret = B60 ^ B52 ^ B45 ^ B37 ^ B33 ^ B28 ^ B21 ^ B09;
- ret ^= (a = B63 & B60);
- ret ^= (b = B37 & B33);
- ret ^= B15 & B09;
- ret ^= (d = B60 & B52 & B45);
- ret ^= (e = B33 & B28 & B21);
- ret ^= B63 & B45 & B28 & B09;
- ret ^= b & B60 & B52;
- ret ^= a & B21 & B15;
- ret ^= d & B63 & B37;
- ret ^= e & B15 & B09;
- ret ^= e & B52 & B45 & B37;
-
- return ret&1;
-}
-
-int main(void){
- uint16_t i;
- uint8_t t, lut[128]={0}; /* 2**10 / 8 == 2**(10-3) == 2**7 == 128 */
- puts(
- "/* \n"
- " * author: Daniel Otte \n"
- " * email: daniel.otte@rub.de \n"
- " * license: GPLv3 \n"
- " * \n"
- " * this program generate a lookuptable for the h-function in grain \n"
- " * \n"
- " */ \n");
- puts("/* \n"
- " * b63 b15 b28 b60 b37 b52 b45 b09 b33 b21 - g");
-
- for(i=0; i<0x0400; ++i){
- t = g(i);
- printf(" * %c %c %c %c %c %c %c %c %c %c - %c\n",
- (i&0x01)?'1':'0',
- (i&0x02)?'1':'0',
- (i&0x04)?'1':'0',
- (i&0x08)?'1':'0',
- (i&0x10)?'1':'0',
- (i&0x20)?'1':'0',
- (i&0x40)?'1':'0',
- (i&0x80)?'1':'0',
- (i&0x0100)?'1':'0',
- (i&0x0200)?'1':'0',
- t?'1':'0' );
- lut[i/8] |= t<<(i%8);
-// if(i%4==3){
-// puts(" * --");
-// }
- }
- puts(" */\n");
-
- printf(" uint8_t g_lut[128]= {");
- for(i=0; i<128; ++i){
- if(i%16==0){
- printf("\n\t");
- }
- printf("0x%2.2X%c ", lut[i], (i!=127)?',':' ');
- }
- printf("};\n\n");
- return 0;
-}
-
diff --git a/hmac-sha256.c b/hmac-sha256.c
index c57ba95..a0ad1dc 100644
--- a/hmac-sha256.c
+++ b/hmac-sha256.c
@@ -19,9 +19,9 @@
/**
*
* implementation of HMAC as described in RFC2104
- * Author: Daniel Otte
- *
- * License: GPL
+ * Author: Daniel Otte
+ * email: daniel.otte@rub.de
+ * License: GPLv3 or later
**/
/*
diff --git a/main-seed-test.c b/main-seed-test.c
index 6bff1d5..813cb5c 100644
--- a/main-seed-test.c
+++ b/main-seed-test.c
@@ -18,11 +18,12 @@
*/
/**
* \file main-seed-test.c
- * \author Daniel Otte
+ * \author Daniel Otte
+ * \email daniel.otte@rub.de
* \date 2007-06-01
* \brief test suit for SEED
* \par License
- * GPL
+ * GPLv3 or later
*
*/
#include "config.h"
diff --git a/main-shabea-test.c b/main-shabea-test.c
index aac85c1..a83d0c7 100644
--- a/main-shabea-test.c
+++ b/main-shabea-test.c
@@ -22,7 +22,7 @@
* \date 2007-06-07
* \brief test suit for SHABEA
* \par License
- * GPL
+ * GPLv3 or later
*
*/
#include "config.h"
diff --git a/md5.c b/md5.c
index bd43a38..5edb36b 100644
--- a/md5.c
+++ b/md5.c
@@ -19,9 +19,9 @@
/*
* \file md5.c
* \author Daniel Otte
- * \date 31.07.2006
+ * \date 2006-07-31
* \par License:
- * GPL
+ * GPLv3 or later
* \brief Implementation of the MD5 hash algorithm as described in RFC 1321
*
*/
diff --git a/noekeon.c b/noekeon.c
index 5ae3ec2..7627cc1 100644
--- a/noekeon.c
+++ b/noekeon.c
@@ -19,7 +19,7 @@
/*
* author: Daniel Otte
* email: daniel.otte@rub.de
- * license: GPLv3
+ * license: GPLv3 or later
*
*
*
diff --git a/noekeon_genrc.c b/noekeon_genrc.c
deleted file mode 100644
index cb8fac2..0000000
--- a/noekeon_genrc.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/**
- *
- * author: Daniel Otte
- * email: daniel.otte@rub.de
- * license: GPLv3
- *
- */
-
-#include
-#include
-
-uint8_t getnextrc(uint8_t a){
- if((a&0x80) != 0){
- return (a<<1) ^ 0x1B;
- } else {
- return (a<<1);
- }
-}
-
-#define N 32
-
-int main(void){
- uint8_t c=0x80;
- uint32_t i;
- puts("\nNoekeon Round Constants:");
- for(i=0; i.
*/
-/*
- * Author: Daniel Otte
- *
- * License: GPL
-*/
-; SHA1 implementation in assembler for AVR
-SHA1_BLOCK_BITS = 512
-SHA1_HASH_BITS = 160
-
-.macro precall
- /* push r18 - r27, r30 - r31*/
- push r0
- push r1
- push r18
- push r19
- push r20
- push r21
- push r22
- push r23
- push r24
- push r25
- push r26
- push r27
- push r30
- push r31
- clr r1
-.endm
-
-.macro postcall
- pop r31
- pop r30
- pop r27
- pop r26
- pop r25
- pop r24
- pop r23
- pop r22
- pop r21
- pop r20
- pop r19
- pop r18
- pop r1
- pop r0
-.endm
-
-
-.macro hexdump length
- push r27
- push r26
- ldi r25, '\r'
- mov r24, r25
- call uart_putc
- ldi r25, '\n'
- mov r24, r25
- call uart_putc
- pop r26
- pop r27
- movw r24, r26
-.if \length > 16
- ldi r22, lo8(16)
- ldi r23, hi8(16)
- push r27
- push r26
- call uart_hexdump
- pop r26
- pop r27
- adiw r26, 16
- hexdump \length-16
-.else
- ldi r22, lo8(\length)
- ldi r23, hi8(\length)
- call uart_hexdump
-.endif
-.endm
-
-.macro delay
-/*
- push r0
- push r1
- clr r0
-1: clr r1
-2: dec r1
- brne 2b
- dec r0
- brne 1b
- pop r1
- pop r0 // */
-.endm
-
-/* X points to Block */
-.macro dbg_hexdump length
-/*
- precall
- hexdump \length
- postcall
- // */
-.endm
-
-
-
-.section .text
-
-SPL = 0x3D
-SPH = 0x3E
-SREG = 0x3F
-
-
-;
-;sha1_ctx_t is:
-;
-; [h0][h1][h2][h3][h4][length]
-; hn is 32 bit large, length is 64 bit large
-
-;###########################################################
-
-.global sha1_ctx2hash
-; === sha1_ctx2hash ===
-; this function converts a state into a normal hash (bytestring)
-; param1: the 16-bit destination pointer
-; given in r25,r24 (r25 is most significant)
-; param2: the 16-bit pointer to sha1_ctx structure
-; given in r23,r22
-sha1_ctx2hash:
- movw r26, r22
- movw r30, r24
- ldi r21, 5
- sbiw r26, 4
-1:
- ldi r20, 4
- adiw r26, 8
-2:
- ld r0, -X
- st Z+, r0
- dec r20
- brne 2b
-
- dec r21
- brne 1b
-
- ret
-
-;###########################################################
-
-.global sha1
-; === sha1 ===
-; this function calculates SHA-1 hashes from messages in RAM
-; param1: the 16-bit hash destination pointer
-; given in r25,r24 (r25 is most significant)
-; param2: the 16-bit pointer to message
-; given in r23,r22
-; param3: 32-bit length value (length of message in bits)
-; given in r21,r20,r19,r18
-sha1:
-sha1_prolog:
- push r8
- push r9
- push r10
- push r11
- push r12
- push r13
- push r16
- push r17
- in r16, SPL
- in r17, SPH
- subi r16, 5*4+8
- sbci r17, 0
- in r0, SREG
- cli
- out SPL, r16
- out SPH, r17
- out SREG, r0
-
- push r25
- push r24
- inc r16
- adc r17, r1
-
- movw r8, r18 /* backup of length*/
- movw r10, r20
-
- movw r12, r22 /* backup pf msg-ptr */
-
- movw r24, r16
- rcall sha1_init
- /* if length >= 512 */
-1:
- tst r11
- brne 4f
- tst r10
- brne 4f
- mov r19, r9
- cpi r19, 0x02
- brlo 4f
-
- movw r24, r16
- movw r22, r12
- rcall sha1_nextBlock
- ldi r19, 0x64
- add r22, r19
- adc r23, r1
- /* length -= 512 */
- ldi r19, 0x02
- sub r9, r19
- sbc r10, r1
- sbc r11, r1
- rjmp 1b
-
-4:
- movw r24, r16
- movw r22, r12
- movw r20, r8
- rcall sha1_lastBlock
-
- pop r24
- pop r25
- movw r22, r16
- rcall sha1_ctx2hash
-
-sha1_epilog:
- in r30, SPL
- in r31, SPH
- adiw r30, 5*4+8
- in r0, SREG
- cli
- out SPL, r30
- out SPH, r31
- out SREG, r0
- pop r17
- pop r16
- pop r13
- pop r12
- pop r11
- pop r10
- pop r9
- pop r8
- ret
-
-;###########################################################
-
-
-; block MUST NOT be larger than 64 bytes
-
-.global sha1_lastBlock
-; === sha1_lastBlock ===
-; this function does padding & Co. for calculating SHA-1 hashes
-; param1: the 16-bit pointer to sha1_ctx structure
-; given in r25,r24 (r25 is most significant)
-; param2: an 16-bit pointer to 64 byte block to hash
-; given in r23,r22
-; param3: an 16-bit integer specifing length of block in bits
-; given in r21,r20
-sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)
-
-
-sha1_lastBlock:
- tst r20
- brne sha1_lastBlock_prolog
- cpi r21, 0x02
- brne sha1_lastBlock_prolog
- push r25
- push r24
- push r23
- push r22
- rcall sha1_nextBlock
- pop r22
- pop r23
- pop r24
- pop r25
- clr r21
- clr r22
-sha1_lastBlock_prolog:
- /* allocate space on stack */
- in r30, SPL
- in r31, SPH
- in r1, SREG
- subi r30, lo8(64)
- sbci r31, hi8(64) /* ??? */
- cli
- out SPL, r30
- out SPH, r31
- out SREG,r1
-
- adiw r30, 1 /* SP points to next free byte on stack */
- mov r18, r20 /* r20 = LSB(length) */
- lsr r18
- lsr r18
- lsr r18
- bst r21, 0 /* may be we should explain this ... */
- bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
-
-
- movw r26, r22 /* X points to begin of msg */
- tst r18
- breq sha1_lastBlock_post_copy
- mov r1, r18
-sha1_lastBlock_copy_loop:
- ld r0, X+
- st Z+, r0
- dec r1
- brne sha1_lastBlock_copy_loop
-sha1_lastBlock_post_copy:
-sha1_lastBlock_insert_stuffing_bit:
- ldi r19, 0x80
- mov r0,r19
- ldi r19, 0x07
- and r19, r20 /* if we are in bitmode */
- breq 2f /* no bitmode */
-1:
- lsr r0
- dec r19
- brne 1b
- ld r19, X
-/* maybe we should do some ANDing here, just for safety */
- or r0, r19
-2:
- st Z+, r0
- inc r18
-
-/* checking stuff here */
- cpi r18, 64-8+1
- brsh 0f
- rjmp sha1_lastBlock_insert_zeros
-0:
- /* oh shit, we landed here */
- /* first we have to fill it up with zeros */
- ldi r19, 64
- sub r19, r18
- breq 2f
-1:
- st Z+, r1
- dec r19
- brne 1b
-2:
- sbiw r30, 63
- sbiw r30, 1
- movw r22, r30
-
- push r31
- push r30
- push r25
- push r24
- push r21
- push r20
- rcall sha1_nextBlock
- pop r20
- pop r21
- pop r24
- pop r25
- pop r30
- pop r31
-
- /* now we should subtract 512 from length */
- movw r26, r24
- adiw r26, 4*5+1 /* we can skip the lowest byte */
- ld r19, X
- subi r19, hi8(512)
- st X+, r19
- ldi r18, 6
-1:
- ld r19, X
- sbci r19, 0
- st X+, r19
- dec r18
- brne 1b
-
-; clr r18 /* not neccessary ;-) */
- /* reset Z pointer to begin of block */
-
-sha1_lastBlock_insert_zeros:
- ldi r19, 64-8
- sub r19, r18
- breq sha1_lastBlock_insert_length
- clr r1
-1:
- st Z+, r1 /* r1 is still zero */
- dec r19
- brne 1b
-
-; rjmp sha1_lastBlock_epilog
-sha1_lastBlock_insert_length:
- movw r26, r24 /* X points to state */
- adiw r26, 5*4 /* X points to (state.length) */
- adiw r30, 8 /* Z points one after the last byte of block */
- ld r0, X+
- add r0, r20
- st -Z, r0
- ld r0, X+
- adc r0, r21
- st -Z, r0
- ldi r19, 6
-1:
- ld r0, X+
- adc r0, r1
- st -Z, r0
- dec r19
- brne 1b
-
- sbiw r30, 64-8
- movw r22, r30
- rcall sha1_nextBlock
-
-sha1_lastBlock_epilog:
- in r30, SPL
- in r31, SPH
- in r1, SREG
- adiw r30, 63 ; lo8(64)
- adiw r30, 1 ; hi8(64)
- cli
- out SPL, r30
- out SPH, r31
- out SREG,r1
- clr r1
- clr r0
- ret
-
-/**/
-;###########################################################
-
-.global sha1_nextBlock
-; === sha1_nextBlock ===
-; this is the core function for calculating SHA-1 hashes
-; param1: the 16-bit pointer to sha1_ctx structure
-; given in r25,r24 (r25 is most significant)
-; param2: an 16-bit pointer to 64 byte block to hash
-; given in r23,r22
-sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)
-
-xtmp = 0
-xNULL = 1
-W1 = 10
-W2 = 11
-T1 = 12
-T2 = 13
-T3 = 14
-T4 = 15
-LoopC = 16
-S = 17
-tmp1 = 18
-tmp2 = 19
-tmp3 = 20
-tmp4 = 21
-F1 = 22
-F2 = 23
-F3 = 24
-F4 = 25
-
-/* byteorder: high number <--> high significance */
-sha1_nextBlock:
- ; initial, let's make some space ready for local vars
- /* replace push & pop by mem ops? */
- push r10
- push r11
- push r12
- push r13
- push r14
- push r15
- push r16
- push r17
- push r28
- push r29
- in r20, SPL
- in r21, SPH
- movw r18, r20 ;backup SP
-; movw r26, r20 ; X points to free space on stack /* maybe removeable? */
- movw r30, r22 ; Z points to message
- subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
- sbci r21, hi8(sha1_nextBlock_localSpace)
- movw r26, r20 ; X points to free space on stack
- in r0, SREG
- cli ; we want to be uninterrupted while updating SP
- out SPL, r20
- out SPH, r21
- out SREG, r0
-
- push r18
- push r19 /* push old SP on new stack */
- push r24
- push r25 /* param1 will be needed later */
-
- /* load a[] with state */
- movw 28, r24 /* load pointer to state in Y */
- adiw r26, 1 ; X++
-
- ldi LoopC, 5*4
-1: ld tmp1, Y+
- st X+, tmp1
- dec LoopC
- brne 1b
-
- movw W1, r26 /* save pointer to w[0] */
- /* load w[] with endian fixed message */
- /* we might also use the changeendian32() function at bottom */
- movw r30, r22 /* mv param2 (ponter to msg) to Z */
- ldi LoopC, 16
-1:
- ldd tmp1, Z+3
- st X+, tmp1
- ldd tmp1, Z+2
- st X+, tmp1
- ldd tmp1, Z+1
- st X+, tmp1
- ld tmp1, Z
- st X+, tmp1
- adiw r30, 4
- dec LoopC
- brne 1b
-
- ;clr LoopC /* LoopC is named t in FIPS 180-2 */
- clr xtmp
-sha1_nextBlock_mainloop:
- mov S, LoopC
- lsl S
- lsl S
- andi S, 0x3C /* S is a bytepointer so *4 */
- /* load w[s] */
- movw r26, W1
- add r26, S /* X points at w[s] */
- adc r27, xNULL
- ld T1, X+
- ld T2, X+
- ld T3, X+
- ld T4, X+
-
- /**/
- push r26
- push r27
- push T4
- push T3
- push T2
- push T1
- in r26, SPL
- in r27, SPH
- adiw r26, 1
- dbg_hexdump 4
- pop T1
- pop T2
- pop T3
- pop T4
- pop r27
- pop r26
- /**/
-
- cpi LoopC, 16
- brlt sha1_nextBlock_mainloop_core
- /* update w[s] */
- ldi tmp1, 2*4
- rcall 1f
- ldi tmp1, 8*4
- rcall 1f
- ldi tmp1, 13*4
- rcall 1f
- rjmp 2f
-1: /* this might be "outsourced" to save the jump above */
- add tmp1, S
- andi tmp1, 0x3f
- movw r26, W1
- add r26, tmp1
- adc r27, xNULL
- ld tmp2, X+
- eor T1, tmp2
- ld tmp2, X+
- eor T2, tmp2
- ld tmp2, X+
- eor T3, tmp2
- ld tmp2, X+
- eor T4, tmp2
- ret
-2: /* now we just hav to do a ROTL(T) and save T back */
- mov tmp2, T4
- rol tmp2
- rol T1
- rol T2
- rol T3
- rol T4
- movw r26, W1
- add r26, S
- adc r27, xNULL
- st X+, T1
- st X+, T2
- st X+, T3
- st X+, T4
-
-sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/
- /* T already contains w[s] */
- movw r26, W1
- sbiw r26, 4*1 /* X points at a[4] aka e */
- ld tmp1, X+
- add T1, tmp1
- ld tmp1, X+
- adc T2, tmp1
- ld tmp1, X+
- adc T3, tmp1
- ld tmp1, X+
- adc T4, tmp1 /* T = w[s]+e */
- sbiw r26, 4*5 /* X points at a[0] aka a */
- ld F1, X+
- ld F2, X+
- ld F3, X+
- ld F4, X+
- mov tmp1, F4 /* X points at a[1] aka b */
- ldi tmp2, 5
-1:
- rol tmp1
- rol F1
- rol F2
- rol F3
- rol F4
- dec tmp2
- brne 1b
-
- add T1, F1
- adc T2, F2
- adc T3, F3
- adc T4, F4 /* T = ROTL(a,5) + e + w[s] */
-
- /* now we have to do this fucking conditional stuff */
- ldi r30, lo8(sha1_nextBlock_xTable)
- ldi r31, hi8(sha1_nextBlock_xTable)
- add r30, xtmp
- adc r31, xNULL
- lpm tmp1, Z
- cp tmp1, LoopC
- brne 1f
- inc xtmp
-1: ldi r30, lo8(sha1_nextBlock_KTable)
- ldi r31, hi8(sha1_nextBlock_KTable)
- lsl xtmp
- lsl xtmp
- add r30, xtmp
- adc r31, xNULL
- lsr xtmp
- lsr xtmp
-
- lpm tmp1, Z+
- add T1, tmp1
- lpm tmp1, Z+
- adc T2, tmp1
- lpm tmp1, Z+
- adc T3, tmp1
- lpm tmp1, Z+
- adc T4, tmp1
- /* T = ROTL(a,5) + e + kt + w[s] */
-
- /* wo Z-4 gerade auf kt zeigt ... */
- movw r28, r26 /* copy X in Y */
- adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */
- lsr r31
- ror r30
-
- icall
- mov F1, tmp1
- icall
- mov F2, tmp1
- icall
- mov F3, tmp1
- icall
-
- add T1, F1
- adc T2, F2
- adc T3, F3
- adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
- /* X points still at a[1] aka b, Y points at a[2] aka c */
- /* update a[] */
-sha1_nextBlock_update_a:
- /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
- //adiw r28, 3*4 /* Y should point at a[4] aka e */
- movw r28, W1
- sbiw r28, 4
-
- ldi tmp2, 4*4
-1:
- ld tmp1, -Y
- std Y+4, tmp1
- dec tmp2
- brne 1b
- /* Y points at a[0] aka a*/
-
- movw r28, W1
- sbiw r28, 5*4
- /* store T in a[0] aka a */
- st Y+, T1
- st Y+, T2
- st Y+, T3
- st Y+, T4
- /* Y points at a[1] aka b*/
-
- /* rotate c */
- ldd T1, Y+1*4
- ldd T2, Y+1*4+1
- ldd T3, Y+1*4+2
- ldd T4, Y+1*4+3
- mov tmp1, T1
- ldi tmp2, 2
-1: ror tmp1
- ror T4
- ror T3
- ror T2
- ror T1
- dec tmp2
- brne 1b
- std Y+1*4+0, T1
- std Y+1*4+1, T2
- std Y+1*4+2, T3
- std Y+1*4+3, T4
-
- push r27
- push r26
- movw r26, W1
- sbiw r26, 4*5
- dbg_hexdump 4*5
- pop r26
- pop r27
-
- inc LoopC
- cpi LoopC, 80
- brge 1f
- jmp sha1_nextBlock_mainloop
-/**************************************/
-1:
- /* littel patch */
- sbiw r28, 4
-
-/* add a[] to state and inc length */
- pop r27
- pop r26 /* now X points to state (and Y still at a[0]) */
- ldi tmp4, 5
-1: clc
- ldi tmp3, 4
-2: ld tmp1, X
- ld tmp2, Y+
- adc tmp1, tmp2
- st X+, tmp1
- dec tmp3
- brne 2b
- dec tmp4
- brne 1b
-
- /* now length += 512 */
- adiw r26, 1 /* we skip the least significant byte */
- ld tmp1, X
- ldi tmp2, hi8(512) /* 2 */
- add tmp1, tmp2
- st X+, tmp1
- ldi tmp2, 6
-1:
- ld tmp1, X
- adc tmp1, xNULL
- st X+, tmp1
- dec tmp2
- brne 1b
-
-; EPILOG
-sha1_nextBlock_epilog:
-/* now we should clean up the stack */
- pop r21
- pop r20
- in r0, SREG
- cli ; we want to be uninterrupted while updating SP
- out SPL, r20
- out SPH, r21
- out SREG, r0
-
- clr r1
- pop r29
- pop r28
- pop r17
- pop r16
- pop r15
- pop r14
- pop r13
- pop r12
- pop r11
- pop r10
- ret
-
-sha1_nextBlock_xTable:
-.byte 20,40,60,0
-sha1_nextBlock_KTable:
-.int 0x5a827999
-.int 0x6ed9eba1
-.int 0x8f1bbcdc
-.int 0xca62c1d6
-sha1_nextBlock_JumpTable:
-jmp sha1_nextBlock_Ch
-jmp sha1_nextBlock_Parity
-jmp sha1_nextBlock_Maj
-jmp sha1_nextBlock_Parity
-
- /* X and Y still point at a[1] aka b ; return value in tmp1 */
-sha1_nextBlock_Ch:
- ld tmp1, Y+
- mov tmp2, tmp1
- com tmp2
- ldd tmp3, Y+3 /* load from c */
- and tmp1, tmp3
- ldd tmp3, Y+7 /* load from d */
- and tmp2, tmp3
- eor tmp1, tmp2
- /**
- precall
- ldi r24, lo8(ch_str)
- ldi r25, hi8(ch_str)
- call uart_putstr_P
- postcall
- /**/
- ret
-
-sha1_nextBlock_Maj:
- ld tmp1, Y+
- mov tmp2, tmp1
- ldd tmp3, Y+3 /* load from c */
- and tmp1, tmp3
- ldd tmp4, Y+7 /* load from d */
- and tmp2, tmp4
- eor tmp1, tmp2
- and tmp3, tmp4
- eor tmp1, tmp3
- /**
- precall
- ldi r24, lo8(maj_str)
- ldi r25, hi8(maj_str)
- call uart_putstr_P
- postcall
- /**/
- ret
-
-sha1_nextBlock_Parity:
- ld tmp1, Y+
- ldd tmp2, Y+3 /* load from c */
- eor tmp1, tmp2
- ldd tmp2, Y+7 /* load from d */
- eor tmp1, tmp2
-
- /**
- precall
- ldi r24, lo8(parity_str)
- ldi r25, hi8(parity_str)
- call uart_putstr_P
- postcall
- /**/
- ret
-/*
-ch_str: .asciz "\r\nCh"
-maj_str: .asciz "\r\nMaj"
-parity_str: .asciz "\r\nParity"
-*/
-;###########################################################
-
-.global sha1_init
-;void sha1_init(sha1_ctx_t *state){
-; DEBUG_S("\r\nSHA1_INIT");
-; state->h[0] = 0x67452301;
-; state->h[1] = 0xefcdab89;
-; state->h[2] = 0x98badcfe;
-; state->h[3] = 0x10325476;
-; state->h[4] = 0xc3d2e1f0;
-; state->length = 0;
-;}
-; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram
-; modifys: Z(r30,r31), Func1, r22
-sha1_init:
- movw r26, r24 ; (24,25) --> (26,27) load X with param1
- ldi r30, lo8((sha1_init_vector))
- ldi r31, hi8((sha1_init_vector))
- ldi r22, 5*4 /* bytes to copy */
-sha1_init_vloop:
- lpm r23, Z+
- st X+, r23
- dec r22
- brne sha1_init_vloop
- ldi r22, 8
- clr r1 /* this should not be needed */
-sha1_init_lloop:
- st X+, r1
- dec r22
- brne sha1_init_lloop
- ret
-
-sha1_init_vector:
-.int 0x67452301;
-.int 0xefcdab89;
-.int 0x98badcfe;
-.int 0x10325476;
-.int 0xc3d2e1f0;
-/*
-;###########################################################
-
-.global rotl32
-; === ROTL32 ===
-; function that rotates a 32 bit word to the left
-; param1: the 32-bit word to rotate
-; given in r25,r24,r23,r22 (r25 is most significant)
-; param2: an 8-bit value telling how often to rotate
-; given in r20
-; modifys: r21, r22
-rotl32:
- cpi r20, 8
- brlo bitrotl
- mov r21, r25
- mov r25, r24
- mov r24, r23
- mov r23, r22
- mov r22, r21
- subi r20, 8
- rjmp rotr32
-bitrotl:
- clr r21
- clc
-bitrotl_loop:
- tst r20
- breq fixrotl
- rol r22
- rol r23
- rol r24
- rol r25
- rol r21
- dec r20
- rjmp bitrotl_loop
-fixrotl:
- or r22, r21
- ret
-
-
-;###########################################################
-
-.global rotr32
-; === ROTR32 ===
-; function that rotates a 32 bit word to the right
-; param1: the 32-bit word to rotate
-; given in r25,r24,r23,22 (r25 is most significant)
-; param2: an 8-bit value telling how often to rotate
-; given in r20
-; modifys: r21, r22
-rotr32:
- cpi r20, 8
- brlo bitrotr
- mov r21, r22
- mov r22, r23
- mov r23, r24
- mov r24, r25
- mov r25, r21
- subi r20, 8
- rjmp rotr32
-bitrotr:
- clr r21
- clc
-bitrotr_loop:
- tst r20
- breq fixrotr
- ror r25
- ror r24
- ror r23
- ror r22
- ror r21
- dec r20
- rjmp bitrotr_loop
-fixrotr:
- or r25, r21
- ret
-
-
-;###########################################################
-
-.global change_endian32
-; === change_endian32 ===
-; function that changes the endianess of a 32-bit word
-; param1: the 32-bit word
-; given in r25,r24,r23,22 (r25 is most significant)
-; modifys: r21, r22
-change_endian32:
- movw r20, r22 ; (r22,r23) --> (r20,r21)
- mov r22, r25
- mov r23, r24
- mov r24, r21
- mov r25, r20
- ret
-*/
+/*
+ * Author: Daniel Otte
+ *
+ * License: GPLv3 or later
+*/
+; SHA1 implementation in assembler for AVR
+SHA1_BLOCK_BITS = 512
+SHA1_HASH_BITS = 160
+
+.macro precall
+ /* push r18 - r27, r30 - r31*/
+ push r0
+ push r1
+ push r18
+ push r19
+ push r20
+ push r21
+ push r22
+ push r23
+ push r24
+ push r25
+ push r26
+ push r27
+ push r30
+ push r31
+ clr r1
+.endm
+
+.macro postcall
+ pop r31
+ pop r30
+ pop r27
+ pop r26
+ pop r25
+ pop r24
+ pop r23
+ pop r22
+ pop r21
+ pop r20
+ pop r19
+ pop r18
+ pop r1
+ pop r0
+.endm
+
+
+.macro hexdump length
+ push r27
+ push r26
+ ldi r25, '\r'
+ mov r24, r25
+ call uart_putc
+ ldi r25, '\n'
+ mov r24, r25
+ call uart_putc
+ pop r26
+ pop r27
+ movw r24, r26
+.if \length > 16
+ ldi r22, lo8(16)
+ ldi r23, hi8(16)
+ push r27
+ push r26
+ call uart_hexdump
+ pop r26
+ pop r27
+ adiw r26, 16
+ hexdump \length-16
+.else
+ ldi r22, lo8(\length)
+ ldi r23, hi8(\length)
+ call uart_hexdump
+.endif
+.endm
+
+.macro delay
+/*
+ push r0
+ push r1
+ clr r0
+1: clr r1
+2: dec r1
+ brne 2b
+ dec r0
+ brne 1b
+ pop r1
+ pop r0 // */
+.endm
+
+/* X points to Block */
+.macro dbg_hexdump length
+/*
+ precall
+ hexdump \length
+ postcall
+ // */
+.endm
+
+
+
+.section .text
+
+SPL = 0x3D
+SPH = 0x3E
+SREG = 0x3F
+
+
+;
+;sha1_ctx_t is:
+;
+; [h0][h1][h2][h3][h4][length]
+; hn is 32 bit large, length is 64 bit large
+
+;###########################################################
+
+.global sha1_ctx2hash
+; === sha1_ctx2hash ===
+; this function converts a state into a normal hash (bytestring)
+; param1: the 16-bit destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to sha1_ctx structure
+; given in r23,r22
+sha1_ctx2hash:
+ movw r26, r22
+ movw r30, r24
+ ldi r21, 5
+ sbiw r26, 4
+1:
+ ldi r20, 4
+ adiw r26, 8
+2:
+ ld r0, -X
+ st Z+, r0
+ dec r20
+ brne 2b
+
+ dec r21
+ brne 1b
+
+ ret
+
+;###########################################################
+
+.global sha1
+; === sha1 ===
+; this function calculates SHA-1 hashes from messages in RAM
+; param1: the 16-bit hash destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to message
+; given in r23,r22
+; param3: 32-bit length value (length of message in bits)
+; given in r21,r20,r19,r18
+sha1:
+sha1_prolog:
+ push r8
+ push r9
+ push r10
+ push r11
+ push r12
+ push r13
+ push r16
+ push r17
+ in r16, SPL
+ in r17, SPH
+ subi r16, 5*4+8
+ sbci r17, 0
+ in r0, SREG
+ cli
+ out SPL, r16
+ out SPH, r17
+ out SREG, r0
+
+ push r25
+ push r24
+ inc r16
+ adc r17, r1
+
+ movw r8, r18 /* backup of length*/
+ movw r10, r20
+
+ movw r12, r22 /* backup pf msg-ptr */
+
+ movw r24, r16
+ rcall sha1_init
+ /* if length >= 512 */
+1:
+ tst r11
+ brne 4f
+ tst r10
+ brne 4f
+ mov r19, r9
+ cpi r19, 0x02
+ brlo 4f
+
+ movw r24, r16
+ movw r22, r12
+ rcall sha1_nextBlock
+ ldi r19, 0x64
+ add r22, r19
+ adc r23, r1
+ /* length -= 512 */
+ ldi r19, 0x02
+ sub r9, r19
+ sbc r10, r1
+ sbc r11, r1
+ rjmp 1b
+
+4:
+ movw r24, r16
+ movw r22, r12
+ movw r20, r8
+ rcall sha1_lastBlock
+
+ pop r24
+ pop r25
+ movw r22, r16
+ rcall sha1_ctx2hash
+
+sha1_epilog:
+ in r30, SPL
+ in r31, SPH
+ adiw r30, 5*4+8
+ in r0, SREG
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG, r0
+ pop r17
+ pop r16
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ ret
+
+;###########################################################
+
+
+; block MUST NOT be larger than 64 bytes
+
+.global sha1_lastBlock
+; === sha1_lastBlock ===
+; this function does padding & Co. for calculating SHA-1 hashes
+; param1: the 16-bit pointer to sha1_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: an 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+; param3: an 16-bit integer specifing length of block in bits
+; given in r21,r20
+sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)
+
+
+sha1_lastBlock:
+ tst r20
+ brne sha1_lastBlock_prolog
+ cpi r21, 0x02
+ brne sha1_lastBlock_prolog
+ push r25
+ push r24
+ push r23
+ push r22
+ rcall sha1_nextBlock
+ pop r22
+ pop r23
+ pop r24
+ pop r25
+ clr r21
+ clr r22
+sha1_lastBlock_prolog:
+ /* allocate space on stack */
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ subi r30, lo8(64)
+ sbci r31, hi8(64) /* ??? */
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+
+ adiw r30, 1 /* SP points to next free byte on stack */
+ mov r18, r20 /* r20 = LSB(length) */
+ lsr r18
+ lsr r18
+ lsr r18
+ bst r21, 0 /* may be we should explain this ... */
+ bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
+
+
+ movw r26, r22 /* X points to begin of msg */
+ tst r18
+ breq sha1_lastBlock_post_copy
+ mov r1, r18
+sha1_lastBlock_copy_loop:
+ ld r0, X+
+ st Z+, r0
+ dec r1
+ brne sha1_lastBlock_copy_loop
+sha1_lastBlock_post_copy:
+sha1_lastBlock_insert_stuffing_bit:
+ ldi r19, 0x80
+ mov r0,r19
+ ldi r19, 0x07
+ and r19, r20 /* if we are in bitmode */
+ breq 2f /* no bitmode */
+1:
+ lsr r0
+ dec r19
+ brne 1b
+ ld r19, X
+/* maybe we should do some ANDing here, just for safety */
+ or r0, r19
+2:
+ st Z+, r0
+ inc r18
+
+/* checking stuff here */
+ cpi r18, 64-8+1
+ brsh 0f
+ rjmp sha1_lastBlock_insert_zeros
+0:
+ /* oh shit, we landed here */
+ /* first we have to fill it up with zeros */
+ ldi r19, 64
+ sub r19, r18
+ breq 2f
+1:
+ st Z+, r1
+ dec r19
+ brne 1b
+2:
+ sbiw r30, 63
+ sbiw r30, 1
+ movw r22, r30
+
+ push r31
+ push r30
+ push r25
+ push r24
+ push r21
+ push r20
+ rcall sha1_nextBlock
+ pop r20
+ pop r21
+ pop r24
+ pop r25
+ pop r30
+ pop r31
+
+ /* now we should subtract 512 from length */
+ movw r26, r24
+ adiw r26, 4*5+1 /* we can skip the lowest byte */
+ ld r19, X
+ subi r19, hi8(512)
+ st X+, r19
+ ldi r18, 6
+1:
+ ld r19, X
+ sbci r19, 0
+ st X+, r19
+ dec r18
+ brne 1b
+
+; clr r18 /* not neccessary ;-) */
+ /* reset Z pointer to begin of block */
+
+sha1_lastBlock_insert_zeros:
+ ldi r19, 64-8
+ sub r19, r18
+ breq sha1_lastBlock_insert_length
+ clr r1
+1:
+ st Z+, r1 /* r1 is still zero */
+ dec r19
+ brne 1b
+
+; rjmp sha1_lastBlock_epilog
+sha1_lastBlock_insert_length:
+ movw r26, r24 /* X points to state */
+ adiw r26, 5*4 /* X points to (state.length) */
+ adiw r30, 8 /* Z points one after the last byte of block */
+ ld r0, X+
+ add r0, r20
+ st -Z, r0
+ ld r0, X+
+ adc r0, r21
+ st -Z, r0
+ ldi r19, 6
+1:
+ ld r0, X+
+ adc r0, r1
+ st -Z, r0
+ dec r19
+ brne 1b
+
+ sbiw r30, 64-8
+ movw r22, r30
+ rcall sha1_nextBlock
+
+sha1_lastBlock_epilog:
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ adiw r30, 63 ; lo8(64)
+ adiw r30, 1 ; hi8(64)
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+ clr r1
+ clr r0
+ ret
+
+/**/
+;###########################################################
+
+.global sha1_nextBlock
+; === sha1_nextBlock ===
+; this is the core function for calculating SHA-1 hashes
+; param1: the 16-bit pointer to sha1_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: an 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)
+
+xtmp = 0
+xNULL = 1
+W1 = 10
+W2 = 11
+T1 = 12
+T2 = 13
+T3 = 14
+T4 = 15
+LoopC = 16
+S = 17
+tmp1 = 18
+tmp2 = 19
+tmp3 = 20
+tmp4 = 21
+F1 = 22
+F2 = 23
+F3 = 24
+F4 = 25
+
+/* byteorder: high number <--> high significance */
+sha1_nextBlock:
+ ; initial, let's make some space ready for local vars
+ /* replace push & pop by mem ops? */
+ push r10
+ push r11
+ push r12
+ push r13
+ push r14
+ push r15
+ push r16
+ push r17
+ push r28
+ push r29
+ in r20, SPL
+ in r21, SPH
+ movw r18, r20 ;backup SP
+; movw r26, r20 ; X points to free space on stack /* maybe removeable? */
+ movw r30, r22 ; Z points to message
+ subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
+ sbci r21, hi8(sha1_nextBlock_localSpace)
+ movw r26, r20 ; X points to free space on stack
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+
+ push r18
+ push r19 /* push old SP on new stack */
+ push r24
+ push r25 /* param1 will be needed later */
+
+ /* load a[] with state */
+ movw 28, r24 /* load pointer to state in Y */
+ adiw r26, 1 ; X++
+
+ ldi LoopC, 5*4
+1: ld tmp1, Y+
+ st X+, tmp1
+ dec LoopC
+ brne 1b
+
+ movw W1, r26 /* save pointer to w[0] */
+ /* load w[] with endian fixed message */
+ /* we might also use the changeendian32() function at bottom */
+ movw r30, r22 /* mv param2 (ponter to msg) to Z */
+ ldi LoopC, 16
+1:
+ ldd tmp1, Z+3
+ st X+, tmp1
+ ldd tmp1, Z+2
+ st X+, tmp1
+ ldd tmp1, Z+1
+ st X+, tmp1
+ ld tmp1, Z
+ st X+, tmp1
+ adiw r30, 4
+ dec LoopC
+ brne 1b
+
+ ;clr LoopC /* LoopC is named t in FIPS 180-2 */
+ clr xtmp
+sha1_nextBlock_mainloop:
+ mov S, LoopC
+ lsl S
+ lsl S
+ andi S, 0x3C /* S is a bytepointer so *4 */
+ /* load w[s] */
+ movw r26, W1
+ add r26, S /* X points at w[s] */
+ adc r27, xNULL
+ ld T1, X+
+ ld T2, X+
+ ld T3, X+
+ ld T4, X+
+
+ /**/
+ push r26
+ push r27
+ push T4
+ push T3
+ push T2
+ push T1
+ in r26, SPL
+ in r27, SPH
+ adiw r26, 1
+ dbg_hexdump 4
+ pop T1
+ pop T2
+ pop T3
+ pop T4
+ pop r27
+ pop r26
+ /**/
+
+ cpi LoopC, 16
+ brlt sha1_nextBlock_mainloop_core
+ /* update w[s] */
+ ldi tmp1, 2*4
+ rcall 1f
+ ldi tmp1, 8*4
+ rcall 1f
+ ldi tmp1, 13*4
+ rcall 1f
+ rjmp 2f
+1: /* this might be "outsourced" to save the jump above */
+ add tmp1, S
+ andi tmp1, 0x3f
+ movw r26, W1
+ add r26, tmp1
+ adc r27, xNULL
+ ld tmp2, X+
+ eor T1, tmp2
+ ld tmp2, X+
+ eor T2, tmp2
+ ld tmp2, X+
+ eor T3, tmp2
+ ld tmp2, X+
+ eor T4, tmp2
+ ret
+2: /* now we just hav to do a ROTL(T) and save T back */
+ mov tmp2, T4
+ rol tmp2
+ rol T1
+ rol T2
+ rol T3
+ rol T4
+ movw r26, W1
+ add r26, S
+ adc r27, xNULL
+ st X+, T1
+ st X+, T2
+ st X+, T3
+ st X+, T4
+
+sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/
+ /* T already contains w[s] */
+ movw r26, W1
+ sbiw r26, 4*1 /* X points at a[4] aka e */
+ ld tmp1, X+
+ add T1, tmp1
+ ld tmp1, X+
+ adc T2, tmp1
+ ld tmp1, X+
+ adc T3, tmp1
+ ld tmp1, X+
+ adc T4, tmp1 /* T = w[s]+e */
+ sbiw r26, 4*5 /* X points at a[0] aka a */
+ ld F1, X+
+ ld F2, X+
+ ld F3, X+
+ ld F4, X+
+ mov tmp1, F4 /* X points at a[1] aka b */
+ ldi tmp2, 5
+1:
+ rol tmp1
+ rol F1
+ rol F2
+ rol F3
+ rol F4
+ dec tmp2
+ brne 1b
+
+ add T1, F1
+ adc T2, F2
+ adc T3, F3
+ adc T4, F4 /* T = ROTL(a,5) + e + w[s] */
+
+ /* now we have to do this fucking conditional stuff */
+ ldi r30, lo8(sha1_nextBlock_xTable)
+ ldi r31, hi8(sha1_nextBlock_xTable)
+ add r30, xtmp
+ adc r31, xNULL
+ lpm tmp1, Z
+ cp tmp1, LoopC
+ brne 1f
+ inc xtmp
+1: ldi r30, lo8(sha1_nextBlock_KTable)
+ ldi r31, hi8(sha1_nextBlock_KTable)
+ lsl xtmp
+ lsl xtmp
+ add r30, xtmp
+ adc r31, xNULL
+ lsr xtmp
+ lsr xtmp
+
+ lpm tmp1, Z+
+ add T1, tmp1
+ lpm tmp1, Z+
+ adc T2, tmp1
+ lpm tmp1, Z+
+ adc T3, tmp1
+ lpm tmp1, Z+
+ adc T4, tmp1
+ /* T = ROTL(a,5) + e + kt + w[s] */
+
+ /* wo Z-4 gerade auf kt zeigt ... */
+ movw r28, r26 /* copy X in Y */
+ adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */
+ lsr r31
+ ror r30
+
+ icall
+ mov F1, tmp1
+ icall
+ mov F2, tmp1
+ icall
+ mov F3, tmp1
+ icall
+
+ add T1, F1
+ adc T2, F2
+ adc T3, F3
+ adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
+ /* X points still at a[1] aka b, Y points at a[2] aka c */
+ /* update a[] */
+sha1_nextBlock_update_a:
+ /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
+ //adiw r28, 3*4 /* Y should point at a[4] aka e */
+ movw r28, W1
+ sbiw r28, 4
+
+ ldi tmp2, 4*4
+1:
+ ld tmp1, -Y
+ std Y+4, tmp1
+ dec tmp2
+ brne 1b
+ /* Y points at a[0] aka a*/
+
+ movw r28, W1
+ sbiw r28, 5*4
+ /* store T in a[0] aka a */
+ st Y+, T1
+ st Y+, T2
+ st Y+, T3
+ st Y+, T4
+ /* Y points at a[1] aka b*/
+
+ /* rotate c */
+ ldd T1, Y+1*4
+ ldd T2, Y+1*4+1
+ ldd T3, Y+1*4+2
+ ldd T4, Y+1*4+3
+ mov tmp1, T1
+ ldi tmp2, 2
+1: ror tmp1
+ ror T4
+ ror T3
+ ror T2
+ ror T1
+ dec tmp2
+ brne 1b
+ std Y+1*4+0, T1
+ std Y+1*4+1, T2
+ std Y+1*4+2, T3
+ std Y+1*4+3, T4
+
+ push r27
+ push r26
+ movw r26, W1
+ sbiw r26, 4*5
+ dbg_hexdump 4*5
+ pop r26
+ pop r27
+
+ inc LoopC
+ cpi LoopC, 80
+ brge 1f
+ jmp sha1_nextBlock_mainloop
+/**************************************/
+1:
+ /* littel patch */
+ sbiw r28, 4
+
+/* add a[] to state and inc length */
+ pop r27
+ pop r26 /* now X points to state (and Y still at a[0]) */
+ ldi tmp4, 5
+1: clc
+ ldi tmp3, 4
+2: ld tmp1, X
+ ld tmp2, Y+
+ adc tmp1, tmp2
+ st X+, tmp1
+ dec tmp3
+ brne 2b
+ dec tmp4
+ brne 1b
+
+ /* now length += 512 */
+ adiw r26, 1 /* we skip the least significant byte */
+ ld tmp1, X
+ ldi tmp2, hi8(512) /* 2 */
+ add tmp1, tmp2
+ st X+, tmp1
+ ldi tmp2, 6
+1:
+ ld tmp1, X
+ adc tmp1, xNULL
+ st X+, tmp1
+ dec tmp2
+ brne 1b
+
+; EPILOG
+sha1_nextBlock_epilog:
+/* now we should clean up the stack */
+ pop r21
+ pop r20
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+
+ clr r1
+ pop r29
+ pop r28
+ pop r17
+ pop r16
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ ret
+
+sha1_nextBlock_xTable:
+.byte 20,40,60,0
+sha1_nextBlock_KTable:
+.int 0x5a827999
+.int 0x6ed9eba1
+.int 0x8f1bbcdc
+.int 0xca62c1d6
+sha1_nextBlock_JumpTable:
+jmp sha1_nextBlock_Ch
+jmp sha1_nextBlock_Parity
+jmp sha1_nextBlock_Maj
+jmp sha1_nextBlock_Parity
+
+ /* X and Y still point at a[1] aka b ; return value in tmp1 */
+sha1_nextBlock_Ch:
+ ld tmp1, Y+
+ mov tmp2, tmp1
+ com tmp2
+ ldd tmp3, Y+3 /* load from c */
+ and tmp1, tmp3
+ ldd tmp3, Y+7 /* load from d */
+ and tmp2, tmp3
+ eor tmp1, tmp2
+ /**
+ precall
+ ldi r24, lo8(ch_str)
+ ldi r25, hi8(ch_str)
+ call uart_putstr_P
+ postcall
+ /**/
+ ret
+
+sha1_nextBlock_Maj:
+ ld tmp1, Y+
+ mov tmp2, tmp1
+ ldd tmp3, Y+3 /* load from c */
+ and tmp1, tmp3
+ ldd tmp4, Y+7 /* load from d */
+ and tmp2, tmp4
+ eor tmp1, tmp2
+ and tmp3, tmp4
+ eor tmp1, tmp3
+ /**
+ precall
+ ldi r24, lo8(maj_str)
+ ldi r25, hi8(maj_str)
+ call uart_putstr_P
+ postcall
+ /**/
+ ret
+
+sha1_nextBlock_Parity:
+ ld tmp1, Y+
+ ldd tmp2, Y+3 /* load from c */
+ eor tmp1, tmp2
+ ldd tmp2, Y+7 /* load from d */
+ eor tmp1, tmp2
+
+ /**
+ precall
+ ldi r24, lo8(parity_str)
+ ldi r25, hi8(parity_str)
+ call uart_putstr_P
+ postcall
+ /**/
+ ret
+/*
+ch_str: .asciz "\r\nCh"
+maj_str: .asciz "\r\nMaj"
+parity_str: .asciz "\r\nParity"
+*/
+;###########################################################
+
+.global sha1_init
+;void sha1_init(sha1_ctx_t *state){
+; DEBUG_S("\r\nSHA1_INIT");
+; state->h[0] = 0x67452301;
+; state->h[1] = 0xefcdab89;
+; state->h[2] = 0x98badcfe;
+; state->h[3] = 0x10325476;
+; state->h[4] = 0xc3d2e1f0;
+; state->length = 0;
+;}
+; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram
+; modifys: Z(r30,r31), Func1, r22
+sha1_init:
+ movw r26, r24 ; (24,25) --> (26,27) load X with param1
+ ldi r30, lo8((sha1_init_vector))
+ ldi r31, hi8((sha1_init_vector))
+ ldi r22, 5*4 /* bytes to copy */
+sha1_init_vloop:
+ lpm r23, Z+
+ st X+, r23
+ dec r22
+ brne sha1_init_vloop
+ ldi r22, 8
+ clr r1 /* this should not be needed */
+sha1_init_lloop:
+ st X+, r1
+ dec r22
+ brne sha1_init_lloop
+ ret
+
+sha1_init_vector:
+.int 0x67452301;
+.int 0xefcdab89;
+.int 0x98badcfe;
+.int 0x10325476;
+.int 0xc3d2e1f0;
+/*
+;###########################################################
+
+.global rotl32
+; === ROTL32 ===
+; function that rotates a 32 bit word to the left
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifys: r21, r22
+rotl32:
+ cpi r20, 8
+ brlo bitrotl
+ mov r21, r25
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotl:
+ clr r21
+ clc
+bitrotl_loop:
+ tst r20
+ breq fixrotl
+ rol r22
+ rol r23
+ rol r24
+ rol r25
+ rol r21
+ dec r20
+ rjmp bitrotl_loop
+fixrotl:
+ or r22, r21
+ ret
+
+
+;###########################################################
+
+.global rotr32
+; === ROTR32 ===
+; function that rotates a 32 bit word to the right
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifys: r21, r22
+rotr32:
+ cpi r20, 8
+ brlo bitrotr
+ mov r21, r22
+ mov r22, r23
+ mov r23, r24
+ mov r24, r25
+ mov r25, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotr:
+ clr r21
+ clc
+bitrotr_loop:
+ tst r20
+ breq fixrotr
+ ror r25
+ ror r24
+ ror r23
+ ror r22
+ ror r21
+ dec r20
+ rjmp bitrotr_loop
+fixrotr:
+ or r25, r21
+ ret
+
+
+;###########################################################
+
+.global change_endian32
+; === change_endian32 ===
+; function that changes the endianess of a 32-bit word
+; param1: the 32-bit word
+; given in r25,r24,r23,22 (r25 is most significant)
+; modifys: r21, r22
+change_endian32:
+ movw r20, r22 ; (r22,r23) --> (r20,r21)
+ mov r22, r25
+ mov r23, r24
+ mov r24, r21
+ mov r25, r20
+ ret
+*/
diff --git a/sha256-asm.S b/sha256-asm.S
index 392bf42..403506e 100644
--- a/sha256-asm.S
+++ b/sha256-asm.S
@@ -16,1028 +16,1028 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
-/*
- * Author: Daniel Otte
- *
- * License: GPL
-*/
-; sha-256 implementation in assembler
-SHA256_BLOCK_BITS = 512
-SHA256_HASH_BITS = 256
-
-.macro precall
- /* push r18 - r27, r30 - r31*/
- push r0
- push r1
- push r18
- push r19
- push r20
- push r21
- push r22
- push r23
- push r24
- push r25
- push r26
- push r27
- push r30
- push r31
- clr r1
-.endm
-
-.macro postcall
- pop r31
- pop r30
- pop r27
- pop r26
- pop r25
- pop r24
- pop r23
- pop r22
- pop r21
- pop r20
- pop r19
- pop r18
- pop r1
- pop r0
-.endm
-
-
-.macro hexdump length
- push r27
- push r26
- ldi r25, '\r'
- mov r24, r25
- call uart_putc
- ldi r25, '\n'
- mov r24, r25
- call uart_putc
- pop r26
- pop r27
- movw r24, r26
-.if \length > 16
- ldi r22, lo8(16)
- ldi r23, hi8(16)
- push r27
- push r26
- call uart_hexdump
- pop r26
- pop r27
- adiw r26, 16
- hexdump \length-16
-.else
- ldi r22, lo8(\length)
- ldi r23, hi8(\length)
- call uart_hexdump
-.endif
-.endm
-
-/* X points to Block */
-.macro dbg_hexdump length
- precall
- hexdump \length
- postcall
-.endm
-
-.section .text
-
-SPL = 0x3D
-SPH = 0x3E
-SREG = 0x3F
-
-
-;
-;sha256_ctx_t is:
-;
-; [h0][h1][h2][h3][h4][h5][h6][h7][length]
-; hn is 32 bit large, length is 64 bit large
-
-;###########################################################
-
-.global sha256_ctx2hash
-; === sha256_ctx2hash ===
-; this function converts a state into a normal hash (bytestring)
-; param1: the 16-bit destination pointer
-; given in r25,r24 (r25 is most significant)
-; param2: the 16-bit pointer to sha256_ctx structure
-; given in r23,r22
-sha256_ctx2hash:
- movw r26, r22
- movw r30, r24
- ldi r21, 8
- sbiw r26, 4
-1:
- ldi r20, 4
- adiw r26, 8
-2:
- ld r0, -X
- st Z+, r0
- dec r20
- brne 2b
-
- dec r21
- brne 1b
-
- ret
-
-;###########################################################
-
-.global sha256
-; === sha256 ===
-; this function calculates SHA-256 hashes from messages in RAM
-; param1: the 16-bit hash destination pointer
-; given in r25,r24 (r25 is most significant)
-; param2: the 16-bit pointer to message
-; given in r23,r22
-; param3: 32-bit length value (length of message in bits)
-; given in r21,r20,r19,r18
-sha256:
-sha256_prolog:
- push r8
- push r9
- push r10
- push r11
- push r12
- push r13
- push r16
- push r17
- in r16, SPL
- in r17, SPH
- subi r16, 8*4+8
- sbci r17, 0
- in r0, SREG
- cli
- out SPL, r16
- out SPH, r17
- out SREG, r0
-
- push r25
- push r24
- inc r16
- adc r17, r1
-
- movw r8, r18 /* backup of length*/
- movw r10, r20
-
- movw r12, r22 /* backup pf msg-ptr */
-
- movw r24, r16
- rcall sha256_init
- /* if length >= 512 */
-1:
- tst r11
- brne 4f
- tst r10
- brne 4f
- mov r19, r9
- cpi r19, 0x02
- brlo 4f
-
- movw r24, r16
- movw r22, r12
- rcall sha256_nextBlock
- ldi r19, 0x64
- add r22, r19
- adc r23, r1
- /* length -= 512 */
- ldi r19, 0x02
- sub r9, r19
- sbc r10, r1
- sbc r11, r1
- rjmp 1b
-
-4:
- movw r24, r16
- movw r22, r12
- movw r20, r8
- rcall sha256_lastBlock
-
- pop r24
- pop r25
- movw r22, r16
- rcall sha256_ctx2hash
-
-sha256_epilog:
- in r30, SPL
- in r31, SPH
- adiw r30, 8*4+8
- in r0, SREG
- cli
- out SPL, r30
- out SPH, r31
- out SREG, r0
- pop r17
- pop r16
- pop r13
- pop r12
- pop r11
- pop r10
- pop r9
- pop r8
- ret
-
-;###########################################################
-
-
-; block MUST NOT be larger than 64 bytes
-
-.global sha256_lastBlock
-; === sha256_lastBlock ===
-; this function does padding & Co. for calculating SHA-256 hashes
-; param1: the 16-bit pointer to sha256_ctx structure
-; given in r25,r24 (r25 is most significant)
-; param2: an 16-bit pointer to 64 byte block to hash
-; given in r23,r22
-; param3: an 16-bit integer specifing length of block in bits
-; given in r21,r20
-sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
-
-
-sha256_lastBlock:
- tst r20
- brne sha256_lastBlock_prolog
- cpi r21, 0x02
- brne sha256_lastBlock_prolog
- push r25
- push r24
- push r23
- push r22
- rcall sha256_nextBlock
- pop r22
- pop r23
- pop r24
- pop r25
- clr r21
- clr r22
-sha256_lastBlock_prolog:
- /* allocate space on stack */
- in r30, SPL
- in r31, SPH
- in r1, SREG
- subi r30, lo8(64)
- sbci r31, hi8(64)
- cli
- out SPL, r30
- out SPH, r31
- out SREG,r1
-
- adiw r30, 1 /* SP points to next free byte on stack */
- mov r18, r20 /* r20 = LSB(length) */
- lsr r18
- lsr r18
- lsr r18
- bst r21, 0 /* may be we should explain this ... */
- bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
-
-
- movw r26, r22 /* X points to begin of msg */
- tst r18
- breq sha256_lastBlock_post_copy
- mov r1, r18
-sha256_lastBlock_copy_loop:
- ld r0, X+
- st Z+, r0
- dec r1
- brne sha256_lastBlock_copy_loop
-sha256_lastBlock_post_copy:
-sha256_lastBlock_insert_stuffing_bit:
- ldi r19, 0x80
- mov r0,r19
- ldi r19, 0x07
- and r19, r20 /* if we are in bitmode */
- breq 2f /* no bitmode */
-1:
- lsr r0
- dec r19
- brne 1b
- ld r19, X
-/* maybe we should do some ANDing here, just for safety */
- or r0, r19
-2:
- st Z+, r0
- inc r18
-
-/* checking stuff here */
- cpi r18, 64-8+1
- brsh 0f
- rjmp sha256_lastBlock_insert_zeros
-0:
- /* oh shit, we landed here */
- /* first we have to fill it up with zeros */
- ldi r19, 64
- sub r19, r18
- breq 2f
-1:
- st Z+, r1
- dec r19
- brne 1b
-2:
- sbiw r30, 63
- sbiw r30, 1
- movw r22, r30
-
- push r31
- push r30
- push r25
- push r24
- push r21
- push r20
- rcall sha256_nextBlock
- pop r20
- pop r21
- pop r24
- pop r25
- pop r30
- pop r31
-
- /* now we should subtract 512 from length */
- movw r26, r24
- adiw r26, 4*8+1 /* we can skip the lowest byte */
- ld r19, X
- subi r19, hi8(512)
- st X+, r19
- ldi r18, 6
-1:
- ld r19, X
- sbci r19, 0
- st X+, r19
- dec r18
- brne 1b
-
-; clr r18 /* not neccessary ;-) */
- /* reset Z pointer to begin of block */
-
-sha256_lastBlock_insert_zeros:
- ldi r19, 64-8
- sub r19, r18
- breq sha256_lastBlock_insert_length
- clr r1
-1:
- st Z+, r1 /* r1 is still zero */
- dec r19
- brne 1b
-
-; rjmp sha256_lastBlock_epilog
-sha256_lastBlock_insert_length:
- movw r26, r24 /* X points to state */
- adiw r26, 8*4 /* X points to (state.length) */
- adiw r30, 8 /* Z points one after the last byte of block */
- ld r0, X+
- add r0, r20
- st -Z, r0
- ld r0, X+
- adc r0, r21
- st -Z, r0
- ldi r19, 6
-1:
- ld r0, X+
- adc r0, r1
- st -Z, r0
- dec r19
- brne 1b
-
- sbiw r30, 64-8
- movw r22, r30
- rcall sha256_nextBlock
-
-sha256_lastBlock_epilog:
- in r30, SPL
- in r31, SPH
- in r1, SREG
- adiw r30, 63 ; lo8(64)
- adiw r30, 1 ; hi8(64)
- cli
- out SPL, r30
- out SPH, r31
- out SREG,r1
- clr r1
- clr r0
- ret
-
-/**/
-;###########################################################
-
-.global sha256_nextBlock
-; === sha256_nextBlock ===
-; this is the core function for calculating SHA-256 hashes
-; param1: the 16-bit pointer to sha256_ctx structure
-; given in r25,r24 (r25 is most significant)
-; param2: an 16-bit pointer to 64 byte block to hash
-; given in r23,r22
-sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
-
-Bck1 = 12
-Bck2 = 13
-Bck3 = 14
-Bck4 = 15
-Func1 = 22
-Func2 = 23
-Func3 = 24
-Func4 = 25
-Accu1 = 16
-Accu2 = 17
-Accu3 = 18
-Accu4 = 19
-XAccu1 = 8
-XAccu2 = 9
-XAccu3 = 10
-XAccu4 = 11
-T1 = 4
-T2 = 5
-T3 = 6
-T4 = 7
-LoopC = 1
-/* byteorder: high number <--> high significance */
-sha256_nextBlock:
- ; initial, let's make some space ready for local vars
- push r4 /* replace push & pop by mem ops? */
- push r5
- push r6
- push r7
- push r8
- push r9
- push r10
- push r11
- push r12
- push r13
- push r14
- push r15
- push r16
- push r17
- push r28
- push r29
- in r20, SPL
- in r21, SPH
- movw r18, r20 ;backup SP
-; movw r26, r20 ; X points to free space on stack
- movw r30, r22 ; Z points to message
- subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
- sbci r21, hi8(sha256_nextBlock_localSpace)
- movw r26, r20 ; X points to free space on stack
- in r0, SREG
- cli ; we want to be uninterrupted while updating SP
- out SPL, r20
- out SPH, r21
- out SREG, r0
- push r18
- push r19
- push r24
- push r25 /* param1 will be needed later */
- ; now we fill the w array with message (think about endianess)
- adiw r26, 1 ; X++
- ldi r20, 16
-sha256_nextBlock_wcpyloop:
- ld r23, Z+
- ld r22, Z+
- ld r19, Z+
- ld r18, Z+
- st X+, r18
- st X+, r19
- st X+, r22
- st X+, r23
- dec r20
- brne sha256_nextBlock_wcpyloop
-/* for (i=16; i<64; ++i){
- w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
- } */
- /* r25,r24,r23,r24 (r21,r20) are function values
- r19,r18,r17,r16 are the accumulator
- r15,r14,r13,rBck1 are backup1
- r11,r10,r9 ,r8 are xor accu
- r1 is round counter */
-
- ldi r20, 64-16
- mov LoopC, r20
-sha256_nextBlock_wcalcloop:
- movw r30, r26 ; cp X to Z
- sbiw r30, 63
- sbiw r30, 1 ; substract 64 = 16*4
- ld Accu1, Z+
- ld Accu2, Z+
- ld Accu3, Z+
- ld Accu4, Z+ /* w[i] = w[i-16] */
- ld Bck1, Z+
- ld Bck2, Z+
- ld Bck3, Z+
- ld Bck4, Z+ /* backup = w[i-15] */
- /* now sigma 0 */
- mov Func1, Bck2
- mov Func2, Bck3
- mov Func3, Bck4
- mov Func4, Bck1 /* prerotated by 8 */
- ldi r20, 1
- rcall bitrotl
- movw XAccu1, Func1
- movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
- movw Func1, Bck3
- movw Func3, Bck1 /* prerotated by 16 */
- ldi r20, 2
- rcall bitrotr
- eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4
- ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/
-sigma0_shr:
- lsr Bck4
- ror Bck3
- ror Bck2
- ror Bck1
- dec Func2
- brne sigma0_shr
- eor XAccu1, Bck1
- eor XAccu2, Bck2
- eor XAccu3, Bck3
- eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */
- add Accu1, XAccu1
- adc Accu2, XAccu2
- adc Accu3, XAccu3
- adc Accu4, XAccu4 /* finished with sigma0 */
- ldd Func1, Z+7*4 /* now accu += w[i-7] */
- ldd Func2, Z+7*4+1
- ldd Func3, Z+7*4+2
- ldd Func4, Z+7*4+3
- add Accu1, Func1
- adc Accu2, Func2
- adc Accu3, Func3
- adc Accu4, Func4
- ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
- ldd Bck2, Z+12*4+1
- ldd Bck3, Z+12*4+2
- ldd Bck4, Z+12*4+3
- /* now sigma 1 */
- movw Func1, Bck3
- movw Func3, Bck1 /* prerotated by 16 */
- ldi r20, 1
- rcall bitrotr
- movw XAccu3, Func3
- movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */
-; movw Func1, Bck3
-; movw Func3, Bck1 /* prerotated by 16 */
- ldi r20, 2
- rcall bitrotr
- eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4
- ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
-sigma1_shr:
- lsr Bck4
- ror Bck3
- ror Bck2
- dec Func2
- brne sigma1_shr
- eor XAccu1, Bck2
- eor XAccu2, Bck3
- eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */
- add Accu1, XAccu1
- adc Accu2, XAccu2
- adc Accu3, XAccu3
- adc Accu4, XAccu4 /* finished with sigma0 */
- /* now let's store the shit */
- st X+, Accu1
- st X+, Accu2
- st X+, Accu3
- st X+, Accu4
- dec LoopC
- breq 3f ; skip if zero
- rjmp sha256_nextBlock_wcalcloop
-3:
- /* we are finished with w array X points one byte post w */
-/* init a array */
- pop r31
- pop r30
- push r30
- push r31
- ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
-init_a_array:
- ld r1, Z+
- st X+, r1
- dec r25
- brne init_a_array
-
-/* now the real fun begins */
-/* for (i=0; i<64; ++i){
- t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
- t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
- memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
- a[4] += t1;
- a[0] = t1 + t2;
- } */
- /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
- sbiw r26, 8*4 /* X still points at a[7]+1*/
- movw r28, r26
- ldi r30, lo8(sha256_kv)
- ldi r31, hi8(sha256_kv)
- dec r27 /* X - (64*4 == 256) */
- ldi r25, 64
- mov LoopC, r25
-sha256_main_loop:
- /* now calculate t1 */
- /*CH(x,y,z) = (x&y)^((~x)&z)*/
- ldd T1, Y+5*4
- ldd T2, Y+5*4+1
- ldd T3, Y+5*4+2
- ldd T4, Y+5*4+3 /* y in T */
- ldd Func1, Y+4*4
- ldd Func2, Y+4*4+1
- ldd Func3, Y+4*4+2
- ldd Func4, Y+4*4+3 /* x in Func */
- ldd Bck1, Y+6*4
- ldd Bck2, Y+6*4+1
- ldd Bck3, Y+6*4+2
- ldd Bck4, Y+6*4+3 /* z in Bck */
- and T1, Func1
- and T2, Func2
- and T3, Func3
- and T4, Func4
- com Func1
- com Func2
- com Func3
- com Func4
- and Bck1, Func1
- and Bck2, Func2
- and Bck3, Func3
- and Bck4, Func4
- eor T1, Bck1
- eor T2, Bck2
- eor T3, Bck3
- eor T4, Bck4 /* done, CH(x,y,z) is in T */
- /* now SIGMA1(a[4]) */
- ldd Bck4, Y+4*4 /* think about using it from Func reg above*/
- ldd Bck1, Y+4*4+1
- ldd Bck2, Y+4*4+2
- ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
- movw Func1, Bck1
- movw Func3, Bck3
- ldi r20, 2
- rcall bitrotl /* rotr(x,6) */
- movw XAccu1, Func1
- movw XAccu3, Func3
- movw Func1, Bck1
- movw Func3, Bck3
- ldi r20, 3
- rcall bitrotr /* rotr(x,11) */
- eor XAccu1, Func1
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4
- movw Func1, Bck3 /* this prerotates furteh 16 bits*/
- movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
- ldi r20, 1
- rcall bitrotr /* rotr(x,11) */
- eor XAccu1, Func1
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
- add T1, XAccu1
- adc T2, XAccu2
- adc T3, XAccu3
- adc T4, XAccu4
- /* now we've to add a[7], w[i] and k[i] */
- ldd XAccu1, Y+4*7
- ldd XAccu2, Y+4*7+1
- ldd XAccu3, Y+4*7+2
- ldd XAccu4, Y+4*7+3
- add T1, XAccu1
- adc T2, XAccu2
- adc T3, XAccu3
- adc T4, XAccu4 /* add a[7] */
- ld XAccu1, X+
- ld XAccu2, X+
- ld XAccu3, X+
- ld XAccu4, X+
- add T1, XAccu1
- adc T2, XAccu2
- adc T3, XAccu3
- adc T4, XAccu4 /* add w[i] */
- lpm XAccu1, Z+
- lpm XAccu2, Z+
- lpm XAccu3, Z+
- lpm XAccu4, Z+
- add T1, XAccu1
- adc T2, XAccu2
- adc T3, XAccu3
- adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
- /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/
- /* starting with MAJ(x,y,z) */
- ldd Func1, Y+4*0+0
- ldd Func2, Y+4*0+1
- ldd Func3, Y+4*0+2
- ldd Func4, Y+4*0+3 /* load x=a[0] */
- ldd XAccu1, Y+4*1+0
- ldd XAccu2, Y+4*1+1
- ldd XAccu3, Y+4*1+2
- ldd XAccu4, Y+4*1+3 /* load y=a[1] */
- and XAccu1, Func1
- and XAccu2, Func2
- and XAccu3, Func3
- and XAccu4, Func4 /* XAccu == (x & y) */
- ldd Bck1, Y+4*2+0
- ldd Bck2, Y+4*2+1
- ldd Bck3, Y+4*2+2
- ldd Bck4, Y+4*2+3 /* load z=a[2] */
- and Func1, Bck1
- and Func2, Bck2
- and Func3, Bck3
- and Func4, Bck4
- eor XAccu1, Func1
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
- ldd Func1, Y+4*1+0
- ldd Func2, Y+4*1+1
- ldd Func3, Y+4*1+2
- ldd Func4, Y+4*1+3 /* load y=a[1] */
- and Func1, Bck1
- and Func2, Bck2
- and Func3, Bck3
- and Func4, Bck4
- eor XAccu1, Func1
- eor XAccu2, Func2
- eor XAccu3, Func3
- eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
- /* SIGMA0(a[0]) */
- ldd Bck1, Y+4*0+0 /* we should combine this with above */
- ldd Bck2, Y+4*0+1
- ldd Bck3, Y+4*0+2
- ldd Bck4, Y+4*0+3
- movw Func1, Bck1
- movw Func3, Bck3
- ldi r20, 2
- rcall bitrotr
- movw Accu1, Func1
- movw Accu3, Func3 /* Accu = shr(a[0], 2) */
- movw Func1, Bck3
- movw Func3, Bck1 /* prerotate by 16 bits */
- ldi r20, 3
- rcall bitrotl
- eor Accu1, Func1
- eor Accu2, Func2
- eor Accu3, Func3
- eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */
- mov Func1, Bck4
- mov Func2, Bck1
- mov Func3, Bck2
- mov Func4, Bck3 /* prerotate by 24 bits */
- ldi r20, 2
- rcall bitrotl
- eor Accu1, Func1
- eor Accu2, Func2
- eor Accu3, Func3
- eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */
- add Accu1, XAccu1 /* add previous result (MAJ)*/
- adc Accu2, XAccu2
- adc Accu3, XAccu3
- adc Accu4, XAccu4
- /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
- /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
-
- ldi r21, 7*4
- adiw r28, 7*4
-a_shift_loop:
- ld r25, -Y /* warning: this is PREdecrement */
- std Y+4, r25
- dec r21
- brne a_shift_loop
-
- ldd Bck1, Y+4*4+0
- ldd Bck2, Y+4*4+1
- ldd Bck3, Y+4*4+2
- ldd Bck4, Y+4*4+3
- add Bck1, T1
- adc Bck2, T2
- adc Bck3, T3
- adc Bck4, T4
- std Y+4*4+0, Bck1
- std Y+4*4+1, Bck2
- std Y+4*4+2, Bck3
- std Y+4*4+3, Bck4
- add Accu1, T1
- adc Accu2, T2
- adc Accu3, T3
- adc Accu4, T4
- std Y+4*0+0, Accu1
- std Y+4*0+1, Accu2
- std Y+4*0+2, Accu3
- std Y+4*0+3, Accu4 /* a array updated */
-
-
- dec LoopC
- breq update_state
- rjmp sha256_main_loop ;brne sha256_main_loop
-update_state:
- /* update state */
- /* pointers to state should still exist on the stack ;-) */
- pop r31
- pop r30
- ldi r21, 8
-update_state_loop:
- ldd Accu1, Z+0
- ldd Accu2, Z+1
- ldd Accu3, Z+2
- ldd Accu4, Z+3
- ld Func1, Y+
- ld Func2, Y+
- ld Func3, Y+
- ld Func4, Y+
- add Accu1, Func1
- adc Accu2, Func2
- adc Accu3, Func3
- adc Accu4, Func4
- st Z+, Accu1
- st Z+, Accu2
- st Z+, Accu3
- st Z+, Accu4
- dec r21
- brne update_state_loop
- /* now we just have to update the length */
- adiw r30, 1 /* since we add 512, we can simply skip the LSB */
- ldi r21, 2
- ldi r22, 6
- ld r20, Z
- add r20, r21
- st Z+, r20
- clr r21
-sha256_nextBlock_fix_length:
- brcc sha256_nextBlock_epilog
- ld r20, Z
- adc r20, r21
- st Z+, r20
- dec r22
- brne sha256_nextBlock_fix_length
-
-; EPILOG
-sha256_nextBlock_epilog:
-/* now we should clean up the stack */
-
- pop r21
- pop r20
- in r0, SREG
- cli ; we want to be uninterrupted while updating SP
- out SPL, r20
- out SPH, r21
- out SREG, r0
-
- clr r1
- pop r29
- pop r28
- pop r17
- pop r16
- pop r15
- pop r14
- pop r13
- pop r12
- pop r11
- pop r10
- pop r9
- pop r8
- pop r7
- pop r6
- pop r5
- pop r4
- ret
-
-sha256_kv: ; round-key-vector stored in ProgMem
-.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
-.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
-.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
-.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
-.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
-.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
-.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
-.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
-
-
-;###########################################################
-
-.global sha256_init
-;uint32_t sha256_init_vector[]={
-; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
-; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
-;
-;void sha256_init(sha256_ctx_t *state){
-; state->length=0;
-; memcpy(state->h, sha256_init_vector, 8*4);
-;}
-; param1: (Func3,r24) 16-bit pointer to sha256_ctx_t struct in ram
-; modifys: Z(r30,r31), Func1, r22
-sha256_init:
- movw r26, r24 ; (24,25) --> (26,27) load X with param1
- ldi r30, lo8((sha256_init_vector))
- ldi r31, hi8((sha256_init_vector))
- ldi r22, 32
-sha256_init_vloop:
- lpm r23, Z+
- st X+, r23
- dec r22
- brne sha256_init_vloop
- ldi r22, 8
- clr r1 ;this should not be needed
-sha256_init_lloop:
- st X+, r1
- dec r22
- brne sha256_init_lloop
- ret
-
-sha256_init_vector:
-.word 0xE667, 0x6A09
-.word 0xAE85, 0xBB67
-.word 0xF372, 0x3C6E
-.word 0xF53A, 0xA54F
-.word 0x527F, 0x510E
-.word 0x688C, 0x9B05
-.word 0xD9AB, 0x1F83
-.word 0xCD19, 0x5BE0
-
-;###########################################################
-
-.global rotl32
-; === ROTL32 ===
-; function that rotates a 32 bit word to the left
-; param1: the 32-bit word to rotate
-; given in r25,r24,r23,r22 (r25 is most significant)
-; param2: an 8-bit value telling how often to rotate
-; given in r20
-; modifys: r21, r22
-rotl32:
- cpi r20, 8
- brlo bitrotl
- mov r21, r25
- mov r25, r24
- mov r24, r23
- mov r23, r22
- mov r22, r21
- subi r20, 8
- rjmp rotr32
-bitrotl:
- clr r21
- clc
-bitrotl_loop:
- tst r20
- breq fixrotl
- rol r22
- rol r23
- rol r24
- rol r25
- rol r21
- dec r20
- rjmp bitrotl_loop
-fixrotl:
- or r22, r21
- ret
-
-
-;###########################################################
-
-.global rotr32
-; === ROTR32 ===
-; function that rotates a 32 bit word to the right
-; param1: the 32-bit word to rotate
-; given in r25,r24,r23,22 (r25 is most significant)
-; param2: an 8-bit value telling how often to rotate
-; given in r20
-; modifys: r21, r22
-rotr32:
- cpi r20, 8
- brlo bitrotr
- mov r21, r22
- mov r22, r23
- mov r23, r24
- mov r24, r25
- mov r25, r21
- subi r20, 8
- rjmp rotr32
-bitrotr:
- clr r21
- clc
-bitrotr_loop:
- tst r20
- breq fixrotr
- ror r25
- ror r24
- ror r23
- ror r22
- ror r21
- dec r20
- rjmp bitrotr_loop
-fixrotr:
- or r25, r21
- ret
-
-
-;###########################################################
-
-.global change_endian32
-; === change_endian32 ===
-; function that changes the endianess of a 32-bit word
-; param1: the 32-bit word
-; given in r25,r24,r23,22 (r25 is most significant)
-; modifys: r21, r22
-change_endian32:
- movw r20, r22 ; (r22,r23) --> (r20,r21)
- mov r22, r25
- mov r23, r24
- mov r24, r21
- mov r25, r20
- ret
-
+/*
+ * Author: Daniel Otte
+ *
+ * License: GPLv3 or later
+*/
+; sha-256 implementation in assembler
+SHA256_BLOCK_BITS = 512
+SHA256_HASH_BITS = 256
+
+.macro precall
+ /* push r18 - r27, r30 - r31*/
+ push r0
+ push r1
+ push r18
+ push r19
+ push r20
+ push r21
+ push r22
+ push r23
+ push r24
+ push r25
+ push r26
+ push r27
+ push r30
+ push r31
+ clr r1
+.endm
+
+.macro postcall
+ pop r31
+ pop r30
+ pop r27
+ pop r26
+ pop r25
+ pop r24
+ pop r23
+ pop r22
+ pop r21
+ pop r20
+ pop r19
+ pop r18
+ pop r1
+ pop r0
+.endm
+
+
+.macro hexdump length
+ push r27
+ push r26
+ ldi r25, '\r'
+ mov r24, r25
+ call uart_putc
+ ldi r25, '\n'
+ mov r24, r25
+ call uart_putc
+ pop r26
+ pop r27
+ movw r24, r26
+.if \length > 16
+ ldi r22, lo8(16)
+ ldi r23, hi8(16)
+ push r27
+ push r26
+ call uart_hexdump
+ pop r26
+ pop r27
+ adiw r26, 16
+ hexdump \length-16
+.else
+ ldi r22, lo8(\length)
+ ldi r23, hi8(\length)
+ call uart_hexdump
+.endif
+.endm
+
+/* X points to Block */
+.macro dbg_hexdump length
+ precall
+ hexdump \length
+ postcall
+.endm
+
+.section .text
+
+SPL = 0x3D
+SPH = 0x3E
+SREG = 0x3F
+
+
+;
+;sha256_ctx_t is:
+;
+; [h0][h1][h2][h3][h4][h5][h6][h7][length]
+; hn is 32 bit large, length is 64 bit large
+
+;###########################################################
+
+.global sha256_ctx2hash
+; === sha256_ctx2hash ===
+; this function converts a state into a normal hash (bytestring)
+; param1: the 16-bit destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to sha256_ctx structure
+; given in r23,r22
+sha256_ctx2hash:
+ movw r26, r22
+ movw r30, r24
+ ldi r21, 8
+ sbiw r26, 4
+1:
+ ldi r20, 4
+ adiw r26, 8
+2:
+ ld r0, -X
+ st Z+, r0
+ dec r20
+ brne 2b
+
+ dec r21
+ brne 1b
+
+ ret
+
+;###########################################################
+
+.global sha256
+; === sha256 ===
+; this function calculates SHA-256 hashes from messages in RAM
+; param1: the 16-bit hash destination pointer
+; given in r25,r24 (r25 is most significant)
+; param2: the 16-bit pointer to message
+; given in r23,r22
+; param3: 32-bit length value (length of message in bits)
+; given in r21,r20,r19,r18
+sha256:
+sha256_prolog:
+ push r8
+ push r9
+ push r10
+ push r11
+ push r12
+ push r13
+ push r16
+ push r17
+ in r16, SPL
+ in r17, SPH
+ subi r16, 8*4+8
+ sbci r17, 0
+ in r0, SREG
+ cli
+ out SPL, r16
+ out SPH, r17
+ out SREG, r0
+
+ push r25
+ push r24
+ inc r16
+ adc r17, r1
+
+ movw r8, r18 /* backup of length*/
+ movw r10, r20
+
+	movw r12, r22 /* backup of msg-ptr */
+
+ movw r24, r16
+ rcall sha256_init
+ /* if length >= 512 */
+1:
+ tst r11
+ brne 4f
+ tst r10
+ brne 4f
+ mov r19, r9
+ cpi r19, 0x02
+ brlo 4f
+
+ movw r24, r16
+ movw r22, r12
+ rcall sha256_nextBlock
+ ldi r19, 0x64
+ add r22, r19
+ adc r23, r1
+ /* length -= 512 */
+ ldi r19, 0x02
+ sub r9, r19
+ sbc r10, r1
+ sbc r11, r1
+ rjmp 1b
+
+4:
+ movw r24, r16
+ movw r22, r12
+ movw r20, r8
+ rcall sha256_lastBlock
+
+ pop r24
+ pop r25
+ movw r22, r16
+ rcall sha256_ctx2hash
+
+sha256_epilog:
+ in r30, SPL
+ in r31, SPH
+ adiw r30, 8*4+8
+ in r0, SREG
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG, r0
+ pop r17
+ pop r16
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ ret
+
+;###########################################################
+
+
+; block MUST NOT be larger than 64 bytes
+
+.global sha256_lastBlock
+; === sha256_lastBlock ===
+; this function does padding & Co. for calculating SHA-256 hashes
+; param1: the 16-bit pointer to sha256_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: an 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+; param3: a 16-bit integer specifying length of block in bits
+; given in r21,r20
+sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1)
+
+
+sha256_lastBlock:
+ tst r20
+ brne sha256_lastBlock_prolog
+ cpi r21, 0x02
+ brne sha256_lastBlock_prolog
+ push r25
+ push r24
+ push r23
+ push r22
+ rcall sha256_nextBlock
+ pop r22
+ pop r23
+ pop r24
+ pop r25
+ clr r21
+ clr r22
+sha256_lastBlock_prolog:
+ /* allocate space on stack */
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ subi r30, lo8(64)
+ sbci r31, hi8(64)
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+
+ adiw r30, 1 /* SP points to next free byte on stack */
+ mov r18, r20 /* r20 = LSB(length) */
+ lsr r18
+ lsr r18
+ lsr r18
+	bst r21, 0 /* maybe we should explain this ... */
+ bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
+
+
+ movw r26, r22 /* X points to begin of msg */
+ tst r18
+ breq sha256_lastBlock_post_copy
+ mov r1, r18
+sha256_lastBlock_copy_loop:
+ ld r0, X+
+ st Z+, r0
+ dec r1
+ brne sha256_lastBlock_copy_loop
+sha256_lastBlock_post_copy:
+sha256_lastBlock_insert_stuffing_bit:
+ ldi r19, 0x80
+ mov r0,r19
+ ldi r19, 0x07
+ and r19, r20 /* if we are in bitmode */
+ breq 2f /* no bitmode */
+1:
+ lsr r0
+ dec r19
+ brne 1b
+ ld r19, X
+/* maybe we should do some ANDing here, just for safety */
+ or r0, r19
+2:
+ st Z+, r0
+ inc r18
+
+/* checking stuff here */
+ cpi r18, 64-8+1
+ brsh 0f
+ rjmp sha256_lastBlock_insert_zeros
+0:
+ /* oh shit, we landed here */
+ /* first we have to fill it up with zeros */
+ ldi r19, 64
+ sub r19, r18
+ breq 2f
+1:
+ st Z+, r1
+ dec r19
+ brne 1b
+2:
+ sbiw r30, 63
+ sbiw r30, 1
+ movw r22, r30
+
+ push r31
+ push r30
+ push r25
+ push r24
+ push r21
+ push r20
+ rcall sha256_nextBlock
+ pop r20
+ pop r21
+ pop r24
+ pop r25
+ pop r30
+ pop r31
+
+ /* now we should subtract 512 from length */
+ movw r26, r24
+ adiw r26, 4*8+1 /* we can skip the lowest byte */
+ ld r19, X
+ subi r19, hi8(512)
+ st X+, r19
+ ldi r18, 6
+1:
+ ld r19, X
+ sbci r19, 0
+ st X+, r19
+ dec r18
+ brne 1b
+
+; clr r18 /* not necessary ;-) */
+ /* reset Z pointer to begin of block */
+
+sha256_lastBlock_insert_zeros:
+ ldi r19, 64-8
+ sub r19, r18
+ breq sha256_lastBlock_insert_length
+ clr r1
+1:
+ st Z+, r1 /* r1 is still zero */
+ dec r19
+ brne 1b
+
+; rjmp sha256_lastBlock_epilog
+sha256_lastBlock_insert_length:
+ movw r26, r24 /* X points to state */
+ adiw r26, 8*4 /* X points to (state.length) */
+ adiw r30, 8 /* Z points one after the last byte of block */
+ ld r0, X+
+ add r0, r20
+ st -Z, r0
+ ld r0, X+
+ adc r0, r21
+ st -Z, r0
+ ldi r19, 6
+1:
+ ld r0, X+
+ adc r0, r1
+ st -Z, r0
+ dec r19
+ brne 1b
+
+ sbiw r30, 64-8
+ movw r22, r30
+ rcall sha256_nextBlock
+
+sha256_lastBlock_epilog:
+ in r30, SPL
+ in r31, SPH
+ in r1, SREG
+ adiw r30, 63 ; lo8(64)
+ adiw r30, 1 ; hi8(64)
+ cli
+ out SPL, r30
+ out SPH, r31
+ out SREG,r1
+ clr r1
+ clr r0
+ ret
+
+/**/
+;###########################################################
+
+.global sha256_nextBlock
+; === sha256_nextBlock ===
+; this is the core function for calculating SHA-256 hashes
+; param1: the 16-bit pointer to sha256_ctx structure
+; given in r25,r24 (r25 is most significant)
+; param2: an 16-bit pointer to 64 byte block to hash
+; given in r23,r22
+sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte)
+
+Bck1 = 12
+Bck2 = 13
+Bck3 = 14
+Bck4 = 15
+Func1 = 22
+Func2 = 23
+Func3 = 24
+Func4 = 25
+Accu1 = 16
+Accu2 = 17
+Accu3 = 18
+Accu4 = 19
+XAccu1 = 8
+XAccu2 = 9
+XAccu3 = 10
+XAccu4 = 11
+T1 = 4
+T2 = 5
+T3 = 6
+T4 = 7
+LoopC = 1
+/* byteorder: high number <--> high significance */
+sha256_nextBlock:
+ ; initial, let's make some space ready for local vars
+ push r4 /* replace push & pop by mem ops? */
+ push r5
+ push r6
+ push r7
+ push r8
+ push r9
+ push r10
+ push r11
+ push r12
+ push r13
+ push r14
+ push r15
+ push r16
+ push r17
+ push r28
+ push r29
+ in r20, SPL
+ in r21, SPH
+ movw r18, r20 ;backup SP
+; movw r26, r20 ; X points to free space on stack
+ movw r30, r22 ; Z points to message
+ subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63
+ sbci r21, hi8(sha256_nextBlock_localSpace)
+ movw r26, r20 ; X points to free space on stack
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+ push r18
+ push r19
+ push r24
+ push r25 /* param1 will be needed later */
+ ; now we fill the w array with message (think about endianess)
+ adiw r26, 1 ; X++
+ ldi r20, 16
+sha256_nextBlock_wcpyloop:
+ ld r23, Z+
+ ld r22, Z+
+ ld r19, Z+
+ ld r18, Z+
+ st X+, r18
+ st X+, r19
+ st X+, r22
+ st X+, r23
+ dec r20
+ brne sha256_nextBlock_wcpyloop
+/* for (i=16; i<64; ++i){
+ w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16];
+ } */
+	/* r25,r24,r23,r22 (r21,r20) are function values
+ r19,r18,r17,r16 are the accumulator
+	   r15,r14,r13,r12 are backup1
+ r11,r10,r9 ,r8 are xor accu
+ r1 is round counter */
+
+ ldi r20, 64-16
+ mov LoopC, r20
+sha256_nextBlock_wcalcloop:
+ movw r30, r26 ; cp X to Z
+ sbiw r30, 63
+	sbiw r30, 1 ; subtract 64 = 16*4
+ ld Accu1, Z+
+ ld Accu2, Z+
+ ld Accu3, Z+
+ ld Accu4, Z+ /* w[i] = w[i-16] */
+ ld Bck1, Z+
+ ld Bck2, Z+
+ ld Bck3, Z+
+ ld Bck4, Z+ /* backup = w[i-15] */
+ /* now sigma 0 */
+ mov Func1, Bck2
+ mov Func2, Bck3
+ mov Func3, Bck4
+ mov Func4, Bck1 /* prerotated by 8 */
+ ldi r20, 1
+ rcall bitrotl
+ movw XAccu1, Func1
+ movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */
+ movw Func1, Bck3
+ movw Func3, Bck1 /* prerotated by 16 */
+ ldi r20, 2
+ rcall bitrotr
+ eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4
+ ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/
+sigma0_shr:
+ lsr Bck4
+ ror Bck3
+ ror Bck2
+ ror Bck1
+ dec Func2
+ brne sigma0_shr
+ eor XAccu1, Bck1
+ eor XAccu2, Bck2
+ eor XAccu3, Bck3
+	eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma0(w[i-15]) */
+ add Accu1, XAccu1
+ adc Accu2, XAccu2
+ adc Accu3, XAccu3
+ adc Accu4, XAccu4 /* finished with sigma0 */
+ ldd Func1, Z+7*4 /* now accu += w[i-7] */
+ ldd Func2, Z+7*4+1
+ ldd Func3, Z+7*4+2
+ ldd Func4, Z+7*4+3
+ add Accu1, Func1
+ adc Accu2, Func2
+ adc Accu3, Func3
+ adc Accu4, Func4
+ ldd Bck1, Z+12*4 /* now backup = w[i-2]*/
+ ldd Bck2, Z+12*4+1
+ ldd Bck3, Z+12*4+2
+ ldd Bck4, Z+12*4+3
+ /* now sigma 1 */
+ movw Func1, Bck3
+ movw Func3, Bck1 /* prerotated by 16 */
+ ldi r20, 1
+ rcall bitrotr
+ movw XAccu3, Func3
+ movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */
+; movw Func1, Bck3
+; movw Func3, Bck1 /* prerotated by 16 */
+ ldi r20, 2
+ rcall bitrotr
+ eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4
+ ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/
+sigma1_shr:
+ lsr Bck4
+ ror Bck3
+ ror Bck2
+ dec Func2
+ brne sigma1_shr
+ eor XAccu1, Bck2
+ eor XAccu2, Bck3
+	eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-2]) */
+ add Accu1, XAccu1
+ adc Accu2, XAccu2
+ adc Accu3, XAccu3
+	adc Accu4, XAccu4 /* finished with sigma1 */
+ /* now let's store the shit */
+ st X+, Accu1
+ st X+, Accu2
+ st X+, Accu3
+ st X+, Accu4
+ dec LoopC
+ breq 3f ; skip if zero
+ rjmp sha256_nextBlock_wcalcloop
+3:
+ /* we are finished with w array X points one byte post w */
+/* init a array */
+ pop r31
+ pop r30
+ push r30
+ push r31
+ ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */
+init_a_array:
+ ld r1, Z+
+ st X+, r1
+ dec r25
+ brne init_a_array
+
+/* now the real fun begins */
+/* for (i=0; i<64; ++i){
+ t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i];
+ t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]);
+ memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0];
+ a[4] += t1;
+ a[0] = t1 + t2;
+ } */
+ /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */
+ sbiw r26, 8*4 /* X still points at a[7]+1*/
+ movw r28, r26
+ ldi r30, lo8(sha256_kv)
+ ldi r31, hi8(sha256_kv)
+ dec r27 /* X - (64*4 == 256) */
+ ldi r25, 64
+ mov LoopC, r25
+sha256_main_loop:
+ /* now calculate t1 */
+ /*CH(x,y,z) = (x&y)^((~x)&z)*/
+ ldd T1, Y+5*4
+ ldd T2, Y+5*4+1
+ ldd T3, Y+5*4+2
+ ldd T4, Y+5*4+3 /* y in T */
+ ldd Func1, Y+4*4
+ ldd Func2, Y+4*4+1
+ ldd Func3, Y+4*4+2
+ ldd Func4, Y+4*4+3 /* x in Func */
+ ldd Bck1, Y+6*4
+ ldd Bck2, Y+6*4+1
+ ldd Bck3, Y+6*4+2
+ ldd Bck4, Y+6*4+3 /* z in Bck */
+ and T1, Func1
+ and T2, Func2
+ and T3, Func3
+ and T4, Func4
+ com Func1
+ com Func2
+ com Func3
+ com Func4
+ and Bck1, Func1
+ and Bck2, Func2
+ and Bck3, Func3
+ and Bck4, Func4
+ eor T1, Bck1
+ eor T2, Bck2
+ eor T3, Bck3
+ eor T4, Bck4 /* done, CH(x,y,z) is in T */
+ /* now SIGMA1(a[4]) */
+ ldd Bck4, Y+4*4 /* think about using it from Func reg above*/
+ ldd Bck1, Y+4*4+1
+ ldd Bck2, Y+4*4+2
+ ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */
+ movw Func1, Bck1
+ movw Func3, Bck3
+ ldi r20, 2
+ rcall bitrotl /* rotr(x,6) */
+ movw XAccu1, Func1
+ movw XAccu3, Func3
+ movw Func1, Bck1
+ movw Func3, Bck3
+ ldi r20, 3
+ rcall bitrotr /* rotr(x,11) */
+ eor XAccu1, Func1
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4
+	movw Func1, Bck3 /* this prerotates further 16 bits*/
+ movw Func3, Bck1 /* so we have now prerotated by 24 bits*/
+ ldi r20, 1
+ rcall bitrotr /* rotr(x,11) */
+ eor XAccu1, Func1
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4 /* finished with SIGMA1, add it to T */
+ add T1, XAccu1
+ adc T2, XAccu2
+ adc T3, XAccu3
+ adc T4, XAccu4
+ /* now we've to add a[7], w[i] and k[i] */
+ ldd XAccu1, Y+4*7
+ ldd XAccu2, Y+4*7+1
+ ldd XAccu3, Y+4*7+2
+ ldd XAccu4, Y+4*7+3
+ add T1, XAccu1
+ adc T2, XAccu2
+ adc T3, XAccu3
+ adc T4, XAccu4 /* add a[7] */
+ ld XAccu1, X+
+ ld XAccu2, X+
+ ld XAccu3, X+
+ ld XAccu4, X+
+ add T1, XAccu1
+ adc T2, XAccu2
+ adc T3, XAccu3
+ adc T4, XAccu4 /* add w[i] */
+ lpm XAccu1, Z+
+ lpm XAccu2, Z+
+ lpm XAccu3, Z+
+ lpm XAccu4, Z+
+ add T1, XAccu1
+ adc T2, XAccu2
+ adc T3, XAccu3
+ adc T4, XAccu4 /* add k[i] */ /* finished with t1 */
+	/*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*I did too much x86 asm, I always see 4 32bit regs*/
+ /* starting with MAJ(x,y,z) */
+ ldd Func1, Y+4*0+0
+ ldd Func2, Y+4*0+1
+ ldd Func3, Y+4*0+2
+ ldd Func4, Y+4*0+3 /* load x=a[0] */
+ ldd XAccu1, Y+4*1+0
+ ldd XAccu2, Y+4*1+1
+ ldd XAccu3, Y+4*1+2
+ ldd XAccu4, Y+4*1+3 /* load y=a[1] */
+ and XAccu1, Func1
+ and XAccu2, Func2
+ and XAccu3, Func3
+ and XAccu4, Func4 /* XAccu == (x & y) */
+ ldd Bck1, Y+4*2+0
+ ldd Bck2, Y+4*2+1
+ ldd Bck3, Y+4*2+2
+ ldd Bck4, Y+4*2+3 /* load z=a[2] */
+ and Func1, Bck1
+ and Func2, Bck2
+ and Func3, Bck3
+ and Func4, Bck4
+ eor XAccu1, Func1
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */
+ ldd Func1, Y+4*1+0
+ ldd Func2, Y+4*1+1
+ ldd Func3, Y+4*1+2
+ ldd Func4, Y+4*1+3 /* load y=a[1] */
+ and Func1, Bck1
+ and Func2, Bck2
+ and Func3, Bck3
+ and Func4, Bck4
+ eor XAccu1, Func1
+ eor XAccu2, Func2
+ eor XAccu3, Func3
+ eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */
+ /* SIGMA0(a[0]) */
+ ldd Bck1, Y+4*0+0 /* we should combine this with above */
+ ldd Bck2, Y+4*0+1
+ ldd Bck3, Y+4*0+2
+ ldd Bck4, Y+4*0+3
+ movw Func1, Bck1
+ movw Func3, Bck3
+ ldi r20, 2
+ rcall bitrotr
+ movw Accu1, Func1
+ movw Accu3, Func3 /* Accu = shr(a[0], 2) */
+ movw Func1, Bck3
+ movw Func3, Bck1 /* prerotate by 16 bits */
+ ldi r20, 3
+ rcall bitrotl
+ eor Accu1, Func1
+ eor Accu2, Func2
+ eor Accu3, Func3
+ eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */
+ mov Func1, Bck4
+ mov Func2, Bck1
+ mov Func3, Bck2
+ mov Func4, Bck3 /* prerotate by 24 bits */
+ ldi r20, 2
+ rcall bitrotl
+ eor Accu1, Func1
+ eor Accu2, Func2
+ eor Accu3, Func3
+ eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */
+ add Accu1, XAccu1 /* add previous result (MAJ)*/
+ adc Accu2, XAccu2
+ adc Accu3, XAccu3
+ adc Accu4, XAccu4
+ /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/
+ /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */
+
+ ldi r21, 7*4
+ adiw r28, 7*4
+a_shift_loop:
+ ld r25, -Y /* warning: this is PREdecrement */
+ std Y+4, r25
+ dec r21
+ brne a_shift_loop
+
+ ldd Bck1, Y+4*4+0
+ ldd Bck2, Y+4*4+1
+ ldd Bck3, Y+4*4+2
+ ldd Bck4, Y+4*4+3
+ add Bck1, T1
+ adc Bck2, T2
+ adc Bck3, T3
+ adc Bck4, T4
+ std Y+4*4+0, Bck1
+ std Y+4*4+1, Bck2
+ std Y+4*4+2, Bck3
+ std Y+4*4+3, Bck4
+ add Accu1, T1
+ adc Accu2, T2
+ adc Accu3, T3
+ adc Accu4, T4
+ std Y+4*0+0, Accu1
+ std Y+4*0+1, Accu2
+ std Y+4*0+2, Accu3
+ std Y+4*0+3, Accu4 /* a array updated */
+
+
+ dec LoopC
+ breq update_state
+ rjmp sha256_main_loop ;brne sha256_main_loop
+update_state:
+ /* update state */
+ /* pointers to state should still exist on the stack ;-) */
+ pop r31
+ pop r30
+ ldi r21, 8
+update_state_loop:
+ ldd Accu1, Z+0
+ ldd Accu2, Z+1
+ ldd Accu3, Z+2
+ ldd Accu4, Z+3
+ ld Func1, Y+
+ ld Func2, Y+
+ ld Func3, Y+
+ ld Func4, Y+
+ add Accu1, Func1
+ adc Accu2, Func2
+ adc Accu3, Func3
+ adc Accu4, Func4
+ st Z+, Accu1
+ st Z+, Accu2
+ st Z+, Accu3
+ st Z+, Accu4
+ dec r21
+ brne update_state_loop
+ /* now we just have to update the length */
+ adiw r30, 1 /* since we add 512, we can simply skip the LSB */
+ ldi r21, 2
+ ldi r22, 6
+ ld r20, Z
+ add r20, r21
+ st Z+, r20
+ clr r21
+sha256_nextBlock_fix_length:
+ brcc sha256_nextBlock_epilog
+ ld r20, Z
+ adc r20, r21
+ st Z+, r20
+ dec r22
+ brne sha256_nextBlock_fix_length
+
+; EPILOG
+sha256_nextBlock_epilog:
+/* now we should clean up the stack */
+
+ pop r21
+ pop r20
+ in r0, SREG
+ cli ; we want to be uninterrupted while updating SP
+ out SPL, r20
+ out SPH, r21
+ out SREG, r0
+
+ clr r1
+ pop r29
+ pop r28
+ pop r17
+ pop r16
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop r11
+ pop r10
+ pop r9
+ pop r8
+ pop r7
+ pop r6
+ pop r5
+ pop r4
+ ret
+
+sha256_kv: ; round-key-vector stored in ProgMem
+.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c
+.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b
+.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9
+.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429
+.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272
+.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a
+.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e
+.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671
+
+
+;###########################################################
+
+.global sha256_init
+;uint32_t sha256_init_vector[]={
+; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 };
+;
+;void sha256_init(sha256_ctx_t *state){
+; state->length=0;
+; memcpy(state->h, sha256_init_vector, 8*4);
+;}
+; param1: (Func3,r24) 16-bit pointer to sha256_ctx_t struct in ram
+; modifies: Z(r30,r31), Func1, r22
+sha256_init:
+ movw r26, r24 ; (24,25) --> (26,27) load X with param1
+ ldi r30, lo8((sha256_init_vector))
+ ldi r31, hi8((sha256_init_vector))
+ ldi r22, 32
+sha256_init_vloop:
+ lpm r23, Z+
+ st X+, r23
+ dec r22
+ brne sha256_init_vloop
+ ldi r22, 8
+ clr r1 ;this should not be needed
+sha256_init_lloop:
+ st X+, r1
+ dec r22
+ brne sha256_init_lloop
+ ret
+
+sha256_init_vector:
+.word 0xE667, 0x6A09
+.word 0xAE85, 0xBB67
+.word 0xF372, 0x3C6E
+.word 0xF53A, 0xA54F
+.word 0x527F, 0x510E
+.word 0x688C, 0x9B05
+.word 0xD9AB, 0x1F83
+.word 0xCD19, 0x5BE0
+
+;###########################################################
+
+.global rotl32
+; === ROTL32 ===
+; function that rotates a 32 bit word to the left
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifies: r21, r22
+rotl32:
+ cpi r20, 8
+ brlo bitrotl
+ mov r21, r25
+ mov r25, r24
+ mov r24, r23
+ mov r23, r22
+ mov r22, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotl:
+ clr r21
+ clc
+bitrotl_loop:
+ tst r20
+ breq fixrotl
+ rol r22
+ rol r23
+ rol r24
+ rol r25
+ rol r21
+ dec r20
+ rjmp bitrotl_loop
+fixrotl:
+ or r22, r21
+ ret
+
+
+;###########################################################
+
+.global rotr32
+; === ROTR32 ===
+; function that rotates a 32 bit word to the right
+; param1: the 32-bit word to rotate
+; given in r25,r24,r23,r22 (r25 is most significant)
+; param2: an 8-bit value telling how often to rotate
+; given in r20
+; modifies: r21, r22
+rotr32:
+ cpi r20, 8
+ brlo bitrotr
+ mov r21, r22
+ mov r22, r23
+ mov r23, r24
+ mov r24, r25
+ mov r25, r21
+ subi r20, 8
+ rjmp rotr32
+bitrotr:
+ clr r21
+ clc
+bitrotr_loop:
+ tst r20
+ breq fixrotr
+ ror r25
+ ror r24
+ ror r23
+ ror r22
+ ror r21
+ dec r20
+ rjmp bitrotr_loop
+fixrotr:
+ or r25, r21
+ ret
+
+
+;###########################################################
+
+.global change_endian32
+; === change_endian32 ===
+; function that changes the endianess of a 32-bit word
+; param1: the 32-bit word
+; given in r25,r24,r23,r22 (r25 is most significant)
+; modifies: r21, r22
+change_endian32:
+ movw r20, r22 ; (r22,r23) --> (r20,r21)
+ mov r22, r25
+ mov r23, r24
+ mov r24, r21
+ mov r25, r20
+ ret
+
diff --git a/xtea-asm.S b/xtea-asm.S
index f3c5b12..20f1d63 100644
--- a/xtea-asm.S
+++ b/xtea-asm.S
@@ -17,9 +17,9 @@
along with this program. If not, see .
*/
/* xtea-asm.S
- * Author: Daniel Otte
- * Date: 06.06.2006
- * License: GPL
+ * Author: Daniel Otte
+ * Date: 2006-06-06
+ * License: GPLv3 or later
* Implementation of XTEA for AVR
* include xtea.h in your C-Project to use this functions.
*/