diff --git a/arcfour-asm.S b/arcfour/arcfour-asm.S similarity index 100% rename from arcfour-asm.S rename to arcfour/arcfour-asm.S diff --git a/arcfour.c b/arcfour/arcfour.c similarity index 100% rename from arcfour.c rename to arcfour/arcfour.c diff --git a/arcfour.h b/arcfour/arcfour.h similarity index 100% rename from arcfour.h rename to arcfour/arcfour.h diff --git a/base64_dec.c b/base64/base64_dec.c similarity index 100% rename from base64_dec.c rename to base64/base64_dec.c diff --git a/base64_dec.h b/base64/base64_dec.h similarity index 100% rename from base64_dec.h rename to base64/base64_dec.h diff --git a/base64_enc.c b/base64/base64_enc.c similarity index 100% rename from base64_enc.c rename to base64/base64_enc.c diff --git a/base64_enc.h b/base64/base64_enc.h similarity index 100% rename from base64_enc.h rename to base64/base64_enc.h diff --git a/bmw_large.c b/bmw/bmw_large.c similarity index 100% rename from bmw_large.c rename to bmw/bmw_large.c diff --git a/bmw_large.h b/bmw/bmw_large.h similarity index 100% rename from bmw_large.h rename to bmw/bmw_large.h diff --git a/bmw_small.c b/bmw/bmw_small.c similarity index 100% rename from bmw_small.c rename to bmw/bmw_small.c diff --git a/bmw_small.h b/bmw/bmw_small.h similarity index 100% rename from bmw_small.h rename to bmw/bmw_small.h diff --git a/cast5-sbox.h b/cast5/cast5-sbox.h similarity index 100% rename from cast5-sbox.h rename to cast5/cast5-sbox.h diff --git a/cast5.c b/cast5/cast5.c similarity index 100% rename from cast5.c rename to cast5/cast5.c diff --git a/cast5.h b/cast5/cast5.h similarity index 100% rename from cast5.h rename to cast5/cast5.h diff --git a/cast6.c b/cast6/cast6.c similarity index 100% rename from cast6.c rename to cast6/cast6.c diff --git a/cast6.h b/cast6/cast6.h similarity index 100% rename from cast6.h rename to cast6/cast6.h diff --git a/cast6_sboxes.h b/cast6/cast6_sboxes.h similarity index 100% rename from cast6_sboxes.h rename to cast6/cast6_sboxes.h diff --git a/des.c b/des/des.c similarity index 100% rename from des.c rename to des/des.c diff --git a/des.h b/des/des.h similarity index 100% rename from des.h rename to des/des.h diff --git a/entropium.c b/entropium/entropium.c similarity index 100% rename from entropium.c rename to entropium/entropium.c diff --git a/entropium.h b/entropium/entropium.h similarity index 100% rename from entropium.h rename to entropium/entropium.h diff --git a/sha256-asm.S b/entropium/sha256-asm.S similarity index 100% rename from sha256-asm.S rename to entropium/sha256-asm.S diff --git a/sha256.h b/entropium/sha256.h similarity index 100% rename from sha256.h rename to entropium/sha256.h diff --git a/grain.c b/grain/grain.c similarity index 100% rename from grain.c rename to grain/grain.c diff --git a/grain.h b/grain/grain.h similarity index 100% rename from grain.h rename to grain/grain.h diff --git a/hmac-md5/base64_dec.c b/hmac-md5/base64_dec.c new file mode 100644 index 0000000..f057f54 --- /dev/null +++ b/hmac-md5/base64_dec.c @@ -0,0 +1,246 @@ +/* base64_dec.c */ +/* + * This file is part of the AVR-Crypto-Lib. + * Copyright (C) 2006, 2007, 2008 Daniel Otte (daniel.otte@rub.de) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +/** + * base64 decoder (RFC3548) + * Author: Daniel Otte + * License: GPLv3 + * + * + */ + +#include +#include "base64_dec.h" + +#include "test_src/cli.h" + +/* + #define USE_GCC_EXTENSION +*/ +#if 1 + +#ifdef USE_GCC_EXTENSION + +static +int ascii2bit6(char a){ + switch(a){ + case 'A'...'Z': + return a-'A'; + case 'a'...'z': + return a-'a'+26; + case '0'...'9': + return a-'0'+52; + case '+': + case '-': + return 62; + case '/': + case '_': + return 63; + default: + return -1; + } +} + +#else + +static +uint8_t ascii2bit6(char a){ + int r; + switch(a>>4){ + case 0x5: + case 0x4: + r=a-'A'; + if(r<0 || r>25){ + return -1; + } else { + return r; + } + case 0x7: + case 0x6: + r=a-'a'; + if(r<0 || r>25){ + return -1; + } else { + return r+26; + } + break; + case 0x3: + if(a>'9') + return -1; + return a-'0'+52; + default: + break; + } + switch (a){ + case '+': + case '-': + return 62; + case '/': + case '_': + return 63; + default: + return 0xff; + } +} + +#endif + +#else + +static +uint8_t ascii2bit6(uint8_t a){ + if(a>='A' && a<='Z'){ + return a-'A'; + } else { + if(a>='a' && a<= 'z'){ + return a-'a'+26; + } else { + if(a>='0' && a<='9'){ + return a-'0'+52; + } else { + if(a=='+' || a=='-'){ + return 62; + } else { + if(a=='/' || a=='_'){ + return 63; + } else { + return 0xff; + } + } + } + } + } +} + +#endif + +int base64_binlength(char* str, uint8_t strict){ + int l=0; + uint8_t term=0; + for(;;){ + if(*str=='\0') + break; + if(*str=='\n' || *str=='\r'){ + str++; + continue; + } + if(*str=='='){ + term++; + str++; + if(term==2){ + break; + } + continue; + } + if(term) + return -1; + if(ascii2bit6(*str)==-1){ + if(strict) + return -1; + } else { + l++; + } + str++; + } + switch(term){ + case 0: + if(l%4!=0) + return -1; + return l/4*3; + case 1: + if(l%4!=3) + return -1; + return (l+1)/4*3-1; + case 2: + if(l%4!=2) + return -1; + return (l+2)/4*3-2; + default: + return -1; + } +} + +/* + |543210543210543210543210| + |765432107654321076543210| + + . . . . 
+ |54321054|32105432|10543210| + |76543210|76543210|76543210| + +*/ + +int base64dec(void* dest, char* b64str, uint8_t strict){ + uint8_t buffer[4]; + uint8_t idx=0; + uint8_t term=0; + for(;;){ +// cli_putstr_P(PSTR("\r\n DBG: got 0x")); +// cli_hexdump(b64str, 1); + buffer[idx]= ascii2bit6(*b64str); +// cli_putstr_P(PSTR(" --> 0x")); +// cli_hexdump(buffer+idx, 1); + + if(buffer[idx]==0xFF){ + if(*b64str=='='){ + term++; + b64str++; + if(term==2) + goto finalize; /* definitly the end */ + }else{ + if(*b64str == '\0'){ + goto finalize; /* definitly the end */ + }else{ + if(*b64str == '\r' || *b64str == '\n' || !(strict)){ + b64str++; /* charcters that we simply ignore */ + }else{ + return -1; + } + } + } + }else{ + if(term) + return -1; /* this happens if we get a '=' in the stream */ + idx++; + b64str++; + } + if(idx==4){ + ((uint8_t*)dest)[0] = buffer[0]<<2 | buffer[1]>>4; + ((uint8_t*)dest)[1] = buffer[1]<<4 | buffer[2]>>2; + ((uint8_t*)dest)[2] = buffer[2]<<6 | buffer[3]; + dest = (uint8_t*)dest +3; + idx=0; + } + } + finalize: + /* the final touch */ + if(idx==0) + return 0; + if(term==1){ + ((uint8_t*)dest)[0] = buffer[0]<<2 | buffer[1]>>4; + ((uint8_t*)dest)[1] = buffer[1]<<4 | buffer[2]>>2; + return 0; + } + if(term==2){ + ((uint8_t*)dest)[0] = buffer[0]<<2 | buffer[1]>>4; + return 0; + } + return -1; +} diff --git a/hmac-md5/base64_dec.h b/hmac-md5/base64_dec.h new file mode 100644 index 0000000..39beff8 --- /dev/null +++ b/hmac-md5/base64_dec.h @@ -0,0 +1,29 @@ +/* base64_dec.h */ +/* + * This file is part of the AVR-Crypto-Lib. + * Copyright (C) 2006, 2007, 2008 Daniel Otte (daniel.otte@rub.de) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +#ifndef BASE64_DEC_H_ +#define BASE64_DEC_H_ + +#include + +int base64_binlength(char* str, uint8_t strict); +int base64dec(void* dest, char* b64str, uint8_t strict); + +#endif /*BASE64_DEC_H_*/ diff --git a/hmac-md5/base64_enc.c b/hmac-md5/base64_enc.c new file mode 100644 index 0000000..400f25c --- /dev/null +++ b/hmac-md5/base64_enc.c @@ -0,0 +1,117 @@ +/* base64_enc.c */ +/* + * This file is part of the AVR-Crypto-Lib. + * Copyright (C) 2006, 2007, 2008 Daniel Otte (daniel.otte@rub.de) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
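
For orientation, a minimal host-side usage sketch of the decoder declared in base64_dec.h above: base64_binlength() reports the decoded byte count (or -1 for malformed input), and base64dec() then unpacks each group of four 6-bit symbols into three bytes, returning 0 on success. The sample string and buffer size are illustrative assumptions, not part of the library.

#include <stdio.h>
#include "base64_dec.h"

int main(void){
    char b64[] = "SGVsbG8h";            /* example input; decodes to "Hello!" */
    uint8_t bin[32];                    /* assumed large enough for this demo */
    int n = base64_binlength(b64, 1);   /* strict mode: unknown characters are rejected */
    if(n < 0 || n > (int)sizeof(bin)){
        return 1;                       /* malformed input or demo buffer too small */
    }
    if(base64dec(bin, b64, 1) != 0){
        return 1;                       /* decoder signals errors with -1 */
    }
    printf("%.*s\n", n, (char*)bin);    /* n decoded bytes are now in bin */
    return 0;
}
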
+ */ + + +/** + * base64 encoder (RFC3548) + * Author: Daniel Otte + * License: GPLv3 + * + * + */ + +#include +#include "base64_enc.h" + +#if 1 +#include + +char base64_alphabet[64] PROGMEM = { + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', + 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', + 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', + 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', + 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', + 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', + 'w', 'x', 'y', 'z', '0', '1', '2', '3', + '4', '5', '6', '7', '8', '9', '+', '/' }; + +static +char bit6toAscii(uint8_t a){ + a &= (uint8_t)0x3F; + return pgm_read_byte(base64_alphabet+a); +} + +#else + +static +char bit6toAscii(uint8_t a){ + a &= (uint8_t)0x3F; + + if(a<=25){ + return a+'A'; + } else { + if(a<=51){ + return a-26+'a'; + } else { + if(a<=61){ + return a-52+'0'; + } else { + if(a==62){ + return '+'; + } else { + return '/'; /* a == 63 */ + } + } + } + } +} + +#endif + +void base64enc(char* dest, void* src, uint16_t length){ + uint16_t i,j; + uint8_t a[4]; + for(i=0; i>2; + a[1]= (((((uint8_t*)src)[i*3+0])<<4) | ((((uint8_t*)src)[i*3+1])>>4)) & 0x3F; + a[2]= (((((uint8_t*)src)[i*3+1])<<2) | ((((uint8_t*)src)[i*3+2])>>6)) & 0x3F; + a[3]= (((uint8_t*)src)[i*3+2]) & 0x3F; + for(j=0; j<4; ++j){ + *dest++=bit6toAscii(a[j]); + } + } + /* now we do the rest */ + switch(length%3){ + case 0: + break; + case 1: + a[0]=(((uint8_t*)src)[i*3+0])>>2; + a[1]=((((uint8_t*)src)[i*3+0])<<4)&0x3F; + *dest++ = bit6toAscii(a[0]); + *dest++ = bit6toAscii(a[1]); + *dest++ = '='; + *dest++ = '='; + break; + case 2: + a[0]= (((uint8_t*)src)[i*3+0])>>2; + a[1]= (((((uint8_t*)src)[i*3+0])<<4) | ((((uint8_t*)src)[i*3+1])>>4)) & 0x3F; + a[2]= ((((uint8_t*)src)[i*3+1])<<2) & 0x3F; + *dest++ = bit6toAscii(a[0]); + *dest++ = bit6toAscii(a[1]); + *dest++ = bit6toAscii(a[2]); + *dest++ = '='; + break; + default: /* this will not happen! */ + break; + } +/* finalize: */ + *dest='\0'; +} + diff --git a/hmac-md5/base64_enc.h b/hmac-md5/base64_enc.h new file mode 100644 index 0000000..9065132 --- /dev/null +++ b/hmac-md5/base64_enc.h @@ -0,0 +1,28 @@ +/* base64_enc.h */ +/* + * This file is part of the AVR-Crypto-Lib. + * Copyright (C) 2006, 2007, 2008 Daniel Otte (daniel.otte@rub.de) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
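
The matching encoder call is straightforward; the one subtlety is sizing the destination, since base64enc() emits four characters per started group of three input bytes, pads with '=', and appends a terminating NUL. A short host-side sketch (the payload bytes are an illustrative assumption):

#include <stdio.h>
#include "base64_enc.h"

int main(void){
    uint8_t data[5] = { 0xDE, 0xAD, 0xBE, 0xEF, 0x42 };    /* example payload */
    /* 4 output characters per started 3-byte group, plus the trailing '\0' */
    char txt[4 * ((sizeof(data) + 2) / 3) + 1];
    base64enc(txt, data, sizeof(data));
    puts(txt);          /* prints "3q2+70I=" for the bytes above */
    return 0;
}
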
+ */ + + +#ifndef BASE64_ENC_H_ +#define BASE64_ENC_H_ + +#include + +void base64enc(char* dest, void* src, uint16_t length); + +#endif /*BASE64_ENC_H_*/ diff --git a/hmac-md5.c b/hmac-md5/hmac-md5.c similarity index 100% rename from hmac-md5.c rename to hmac-md5/hmac-md5.c diff --git a/hmac-md5.h b/hmac-md5/hmac-md5.h similarity index 100% rename from hmac-md5.h rename to hmac-md5/hmac-md5.h diff --git a/md5-asm.S b/hmac-md5/md5-asm.S similarity index 100% rename from md5-asm.S rename to hmac-md5/md5-asm.S diff --git a/md5.h b/hmac-md5/md5.h similarity index 100% rename from md5.h rename to hmac-md5/md5.h diff --git a/hmac-sha1.c b/hmac-sha1/hmac-sha1.c similarity index 100% rename from hmac-sha1.c rename to hmac-sha1/hmac-sha1.c diff --git a/hmac-sha1.h b/hmac-sha1/hmac-sha1.h similarity index 100% rename from hmac-sha1.h rename to hmac-sha1/hmac-sha1.h diff --git a/sha1-asm.S b/hmac-sha1/sha1-asm.S similarity index 100% rename from sha1-asm.S rename to hmac-sha1/sha1-asm.S diff --git a/sha1.h b/hmac-sha1/sha1.h similarity index 100% rename from sha1.h rename to hmac-sha1/sha1.h diff --git a/hmac-sha256.c b/hmac-sha256/hmac-sha256.c similarity index 100% rename from hmac-sha256.c rename to hmac-sha256/hmac-sha256.c diff --git a/hmac-sha256.h b/hmac-sha256/hmac-sha256.h similarity index 100% rename from hmac-sha256.h rename to hmac-sha256/hmac-sha256.h diff --git a/hmac-sha256/sha256-asm.S b/hmac-sha256/sha256-asm.S new file mode 100644 index 0000000..d9eb6b6 --- /dev/null +++ b/hmac-sha256/sha256-asm.S @@ -0,0 +1,1042 @@ +/* sha256-asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
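
The hmac-md5, hmac-sha1 and hmac-sha256 modules above are only renamed by this patch, so their sources are not shown here. The construction they implement is the standard RFC 2104 HMAC; the sketch below expresses it over the sha256.h interface that appears later in this diff. The function name, the short-key restriction and the short-message assumption are mine for illustration, not the code in hmac-sha256.c.

#include <string.h>
#include "sha256.h"

/* Textbook HMAC: H((key ^ opad) || H((key ^ ipad) || msg)).
 * Assumes key_bytes <= SHA256_BLOCK_BYTES (longer keys would be hashed down
 * first, omitted here) and a message short enough for the uint16_t bit count. */
void hmac_sha256_sketch(sha256_hash_t *dest,
                        const void *key, uint8_t key_bytes,
                        const void *msg, uint16_t msg_bytes){
    uint8_t pad[SHA256_BLOCK_BYTES];
    sha256_hash_t inner;
    sha256_ctx_t ctx;
    uint8_t i;

    /* inner hash: H((key ^ ipad) || msg); ipad is a block of 0x36 bytes */
    memset(pad, 0x36, sizeof(pad));
    for(i = 0; i < key_bytes; ++i)
        pad[i] ^= ((const uint8_t*)key)[i];
    sha256_init(&ctx);
    sha256_nextBlock(&ctx, pad);
    sha256_lastBlock(&ctx, msg, msg_bytes * 8);     /* the API takes the length in bits */
    sha256_ctx2hash(&inner, &ctx);

    /* outer hash: H((key ^ opad) || inner); opad is a block of 0x5c bytes */
    memset(pad, 0x5c, sizeof(pad));
    for(i = 0; i < key_bytes; ++i)
        pad[i] ^= ((const uint8_t*)key)[i];
    sha256_init(&ctx);
    sha256_nextBlock(&ctx, pad);
    sha256_lastBlock(&ctx, inner, SHA256_HASH_BYTES * 8);
    sha256_ctx2hash(dest, &ctx);
}
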
+*/ +/* + * Author: Daniel Otte + * + * License: GPLv3 or later +*/ +; sha-256 implementation in assembler +SHA256_BLOCK_BITS = 512 +SHA256_HASH_BITS = 256 + +.macro precall + /* push r18 - r27, r30 - r31*/ + push r0 + push r1 + push r18 + push r19 + push r20 + push r21 + push r22 + push r23 + push r24 + push r25 + push r26 + push r27 + push r30 + push r31 + clr r1 +.endm + +.macro postcall + pop r31 + pop r30 + pop r27 + pop r26 + pop r25 + pop r24 + pop r23 + pop r22 + pop r21 + pop r20 + pop r19 + pop r18 + pop r1 + pop r0 +.endm + + +.macro hexdump length + push r27 + push r26 + ldi r25, '\r' + mov r24, r25 + call uart_putc + ldi r25, '\n' + mov r24, r25 + call uart_putc + pop r26 + pop r27 + movw r24, r26 +.if \length > 16 + ldi r22, lo8(16) + ldi r23, hi8(16) + push r27 + push r26 + call uart_hexdump + pop r26 + pop r27 + adiw r26, 16 + hexdump \length-16 +.else + ldi r22, lo8(\length) + ldi r23, hi8(\length) + call uart_hexdump +.endif +.endm + +/* X points to Block */ +.macro dbg_hexdump length + precall + hexdump \length + postcall +.endm + +.section .text + +SPL = 0x3D +SPH = 0x3E +SREG = 0x3F + + +; +;sha256_ctx_t is: +; +; [h0][h1][h2][h3][h4][h5][h6][h7][length] +; hn is 32 bit large, length is 64 bit large + +;########################################################### + +.global sha256_ctx2hash +; === sha256_ctx2hash === +; this function converts a state into a normal hash (bytestring) +; param1: the 16-bit destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to sha256_ctx structure +; given in r23,r22 +sha256_ctx2hash: + movw r26, r22 + movw r30, r24 + ldi r21, 8 + sbiw r26, 4 +1: + ldi r20, 4 + adiw r26, 8 +2: + ld r0, -X + st Z+, r0 + dec r20 + brne 2b + + dec r21 + brne 1b + + ret + +;########################################################### + +.global sha256 +; === sha256 === +; this function calculates SHA-256 hashes from messages in RAM +; param1: the 16-bit hash destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to message +; given in r23,r22 +; param3: 32-bit length value (length of message in bits) +; given in r21,r20,r19,r18 +sha256: +sha256_prolog: + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r16 + push r17 + in r16, SPL + in r17, SPH + subi r16, 8*4+8 + sbci r17, 0 + in r0, SREG + cli + out SPL, r16 + out SPH, r17 + out SREG, r0 + + push r25 + push r24 + inc r16 + adc r17, r1 + + movw r8, r18 /* backup of length*/ + movw r10, r20 + + movw r12, r22 /* backup pf msg-ptr */ + + movw r24, r16 + rcall sha256_init + /* if length >= 512 */ +1: + tst r11 + brne 4f + tst r10 + brne 4f + mov r19, r9 + cpi r19, 0x02 + brlo 4f + + movw r24, r16 + movw r22, r12 + rcall sha256_nextBlock + ldi r19, 0x64 + add r22, r19 + adc r23, r1 + /* length -= 512 */ + ldi r19, 0x02 + sub r9, r19 + sbc r10, r1 + sbc r11, r1 + rjmp 1b + +4: + movw r24, r16 + movw r22, r12 + movw r20, r8 + rcall sha256_lastBlock + + pop r24 + pop r25 + movw r22, r16 + rcall sha256_ctx2hash + +sha256_epilog: + in r30, SPL + in r31, SPH + adiw r30, 8*4+8 + in r0, SREG + cli + out SPL, r30 + out SPH, r31 + out SREG, r0 + pop r17 + pop r16 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + ret + +;########################################################### + + +; block MUST NOT be larger than 64 bytes + +.global sha256_lastBlock +; === sha256_lastBlock === +; this function does padding & Co. 
for calculating SHA-256 hashes +; param1: the 16-bit pointer to sha256_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +; param3: an 16-bit integer specifing length of block in bits +; given in r21,r20 +sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1) + + +sha256_lastBlock: + cpi r21, 0x02 + brlo sha256_lastBlock_prolog + push r25 + push r24 + push r23 + push r22 + push r21 + push r20 + rcall sha256_nextBlock + pop r20 + pop r21 + pop r22 + pop r23 + pop r24 + pop r25 + subi r21, 0x02 + subi r23, -2 + rjmp sha256_lastBlock +sha256_lastBlock_prolog: + /* allocate space on stack */ + in r30, SPL + in r31, SPH + in r1, SREG + subi r30, lo8(64) + sbci r31, hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + + adiw r30, 1 /* SP points to next free byte on stack */ + mov r18, r20 /* r20 = LSB(length) */ + lsr r18 + lsr r18 + lsr r18 + bst r21, 0 /* may be we should explain this ... */ + bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ + + + movw r26, r22 /* X points to begin of msg */ + tst r18 + breq sha256_lastBlock_post_copy + mov r1, r18 +sha256_lastBlock_copy_loop: + ld r0, X+ + st Z+, r0 + dec r1 + brne sha256_lastBlock_copy_loop +sha256_lastBlock_post_copy: +sha256_lastBlock_insert_stuffing_bit: + ldi r19, 0x80 + mov r0,r19 + ldi r19, 0x07 + and r19, r20 /* if we are in bitmode */ + breq 2f /* no bitmode */ +1: + lsr r0 + dec r19 + brne 1b + ld r19, X +/* maybe we should do some ANDing here, just for safety */ + or r0, r19 +2: + st Z+, r0 + inc r18 + +/* checking stuff here */ + cpi r18, 64-8+1 + brsh 0f + rjmp sha256_lastBlock_insert_zeros +0: + /* oh shit, we landed here */ + /* first we have to fill it up with zeros */ + ldi r19, 64 + sub r19, r18 + breq 2f +1: + st Z+, r1 + dec r19 + brne 1b +2: + sbiw r30, 63 + sbiw r30, 1 + movw r22, r30 + + push r31 + push r30 + push r25 + push r24 + push r21 + push r20 + rcall sha256_nextBlock + pop r20 + pop r21 + pop r24 + pop r25 + pop r30 + pop r31 + + /* now we should subtract 512 from length */ + movw r26, r24 + adiw r26, 4*8+1 /* we can skip the lowest byte */ + ld r19, X + subi r19, hi8(512) + st X+, r19 + ldi r18, 6 +1: + ld r19, X + sbci r19, 0 + st X+, r19 + dec r18 + brne 1b + +; clr r18 /* not neccessary ;-) */ + /* reset Z pointer to begin of block */ + +sha256_lastBlock_insert_zeros: + ldi r19, 64-8 + sub r19, r18 + breq sha256_lastBlock_insert_length + clr r1 +1: + st Z+, r1 /* r1 is still zero */ + dec r19 + brne 1b + +; rjmp sha256_lastBlock_epilog +sha256_lastBlock_insert_length: + movw r26, r24 /* X points to state */ + adiw r26, 8*4 /* X points to (state.length) */ + adiw r30, 8 /* Z points one after the last byte of block */ + ld r0, X+ + add r0, r20 + st -Z, r0 + ld r0, X+ + adc r0, r21 + st -Z, r0 + ldi r19, 6 +1: + ld r0, X+ + adc r0, r1 + st -Z, r0 + dec r19 + brne 1b + + sbiw r30, 64-8 + movw r22, r30 + rcall sha256_nextBlock + +sha256_lastBlock_epilog: + in r30, SPL + in r31, SPH + in r1, SREG + adiw r30, 63 ; lo8(64) + adiw r30, 1 ; hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + clr r1 + clr r0 + ret + +/**/ +;########################################################### + +.global sha256_nextBlock +; === sha256_nextBlock === +; this is the core function for calculating SHA-256 hashes +; param1: the 16-bit pointer to sha256_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +sha256_nextBlock_localSpace = (64+8)*4 ; 64 
32-bit values for w array and 8 32-bit values for a array (total 288 byte) + +Bck1 = 12 +Bck2 = 13 +Bck3 = 14 +Bck4 = 15 +Func1 = 22 +Func2 = 23 +Func3 = 24 +Func4 = 25 +Accu1 = 16 +Accu2 = 17 +Accu3 = 18 +Accu4 = 19 +XAccu1 = 8 +XAccu2 = 9 +XAccu3 = 10 +XAccu4 = 11 +T1 = 4 +T2 = 5 +T3 = 6 +T4 = 7 +LoopC = 1 +/* byteorder: high number <--> high significance */ +sha256_nextBlock: + ; initial, let's make some space ready for local vars + push r4 /* replace push & pop by mem ops? */ + push r5 + push r6 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + in r20, SPL + in r21, SPH + movw r18, r20 ;backup SP +; movw r26, r20 ; X points to free space on stack + movw r30, r22 ; Z points to message + subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63 + sbci r21, hi8(sha256_nextBlock_localSpace) + movw r26, r20 ; X points to free space on stack + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + push r18 + push r19 + push r24 + push r25 /* param1 will be needed later */ + ; now we fill the w array with message (think about endianess) + adiw r26, 1 ; X++ + ldi r20, 16 +sha256_nextBlock_wcpyloop: + ld r23, Z+ + ld r22, Z+ + ld r19, Z+ + ld r18, Z+ + st X+, r18 + st X+, r19 + st X+, r22 + st X+, r23 + dec r20 + brne sha256_nextBlock_wcpyloop +/* for (i=16; i<64; ++i){ + w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16]; + } */ + /* r25,r24,r23,r24 (r21,r20) are function values + r19,r18,r17,r16 are the accumulator + r15,r14,r13,rBck1 are backup1 + r11,r10,r9 ,r8 are xor accu + r1 is round counter */ + + ldi r20, 64-16 + mov LoopC, r20 +sha256_nextBlock_wcalcloop: + movw r30, r26 ; cp X to Z + sbiw r30, 63 + sbiw r30, 1 ; substract 64 = 16*4 + ld Accu1, Z+ + ld Accu2, Z+ + ld Accu3, Z+ + ld Accu4, Z+ /* w[i] = w[i-16] */ + ld Bck1, Z+ + ld Bck2, Z+ + ld Bck3, Z+ + ld Bck4, Z+ /* backup = w[i-15] */ + /* now sigma 0 */ + mov Func1, Bck2 + mov Func2, Bck3 + mov Func3, Bck4 + mov Func4, Bck1 /* prerotated by 8 */ + ldi r20, 1 + rcall bitrotl + movw XAccu1, Func1 + movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/ +sigma0_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + ror Bck1 + dec Func2 + brne sigma0_shr + eor XAccu1, Bck1 + eor XAccu2, Bck2 + eor XAccu3, Bck3 + eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + ldd Func1, Z+7*4 /* now accu += w[i-7] */ + ldd Func2, Z+7*4+1 + ldd Func3, Z+7*4+2 + ldd Func4, Z+7*4+3 + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + ldd Bck1, Z+12*4 /* now backup = w[i-2]*/ + ldd Bck2, Z+12*4+1 + ldd Bck3, Z+12*4+2 + ldd Bck4, Z+12*4+3 + /* now sigma 1 */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 1 + rcall bitrotr + movw XAccu3, Func3 + movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */ +; movw Func1, Bck3 +; movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ 
/*we can destroy backup now*/ +sigma1_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + dec Func2 + brne sigma1_shr + eor XAccu1, Bck2 + eor XAccu2, Bck3 + eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + /* now let's store the shit */ + st X+, Accu1 + st X+, Accu2 + st X+, Accu3 + st X+, Accu4 + dec LoopC + breq 3f ; skip if zero + rjmp sha256_nextBlock_wcalcloop +3: + /* we are finished with w array X points one byte post w */ +/* init a array */ + pop r31 + pop r30 + push r30 + push r31 + ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */ +init_a_array: + ld r1, Z+ + st X+, r1 + dec r25 + brne init_a_array + +/* now the real fun begins */ +/* for (i=0; i<64; ++i){ + t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i]; + t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]); + memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; + a[4] += t1; + a[0] = t1 + t2; + } */ + /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */ + sbiw r26, 8*4 /* X still points at a[7]+1*/ + movw r28, r26 + ldi r30, lo8(sha256_kv) + ldi r31, hi8(sha256_kv) + dec r27 /* X - (64*4 == 256) */ + ldi r25, 64 + mov LoopC, r25 +sha256_main_loop: + /* now calculate t1 */ + /*CH(x,y,z) = (x&y)^((~x)&z)*/ + ldd T1, Y+5*4 + ldd T2, Y+5*4+1 + ldd T3, Y+5*4+2 + ldd T4, Y+5*4+3 /* y in T */ + ldd Func1, Y+4*4 + ldd Func2, Y+4*4+1 + ldd Func3, Y+4*4+2 + ldd Func4, Y+4*4+3 /* x in Func */ + ldd Bck1, Y+6*4 + ldd Bck2, Y+6*4+1 + ldd Bck3, Y+6*4+2 + ldd Bck4, Y+6*4+3 /* z in Bck */ + and T1, Func1 + and T2, Func2 + and T3, Func3 + and T4, Func4 + com Func1 + com Func2 + com Func3 + com Func4 + and Bck1, Func1 + and Bck2, Func2 + and Bck3, Func3 + and Bck4, Func4 + eor T1, Bck1 + eor T2, Bck2 + eor T3, Bck3 + eor T4, Bck4 /* done, CH(x,y,z) is in T */ + /* now SIGMA1(a[4]) */ + ldd Bck4, Y+4*4 /* think about using it from Func reg above*/ + ldd Bck1, Y+4*4+1 + ldd Bck2, Y+4*4+2 + ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */ + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotl /* rotr(x,6) */ + movw XAccu1, Func1 + movw XAccu3, Func3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 3 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + movw Func1, Bck3 /* this prerotates furteh 16 bits*/ + movw Func3, Bck1 /* so we have now prerotated by 24 bits*/ + ldi r20, 1 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* finished with SIGMA1, add it to T */ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 + /* now we've to add a[7], w[i] and k[i] */ + ldd XAccu1, Y+4*7 + ldd XAccu2, Y+4*7+1 + ldd XAccu3, Y+4*7+2 + ldd XAccu4, Y+4*7+3 + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add a[7] */ + ld XAccu1, X+ + ld XAccu2, X+ + ld XAccu3, X+ + ld XAccu4, X+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add w[i] */ + lpm XAccu1, Z+ + lpm XAccu2, Z+ + lpm XAccu3, Z+ + lpm XAccu4, Z+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add k[i] */ /* finished with t1 */ + /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/ + /* starting with MAJ(x,y,z) */ + ldd Func1, Y+4*0+0 + ldd Func2, Y+4*0+1 + ldd Func3, Y+4*0+2 + ldd Func4, Y+4*0+3 /* load x=a[0] */ + 
ldd XAccu1, Y+4*1+0 + ldd XAccu2, Y+4*1+1 + ldd XAccu3, Y+4*1+2 + ldd XAccu4, Y+4*1+3 /* load y=a[1] */ + and XAccu1, Func1 + and XAccu2, Func2 + and XAccu3, Func3 + and XAccu4, Func4 /* XAccu == (x & y) */ + ldd Bck1, Y+4*2+0 + ldd Bck2, Y+4*2+1 + ldd Bck3, Y+4*2+2 + ldd Bck4, Y+4*2+3 /* load z=a[2] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */ + ldd Func1, Y+4*1+0 + ldd Func2, Y+4*1+1 + ldd Func3, Y+4*1+2 + ldd Func4, Y+4*1+3 /* load y=a[1] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */ + /* SIGMA0(a[0]) */ + ldd Bck1, Y+4*0+0 /* we should combine this with above */ + ldd Bck2, Y+4*0+1 + ldd Bck3, Y+4*0+2 + ldd Bck4, Y+4*0+3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotr + movw Accu1, Func1 + movw Accu3, Func3 /* Accu = shr(a[0], 2) */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotate by 16 bits */ + ldi r20, 3 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */ + mov Func1, Bck4 + mov Func2, Bck1 + mov Func3, Bck2 + mov Func4, Bck3 /* prerotate by 24 bits */ + ldi r20, 2 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */ + add Accu1, XAccu1 /* add previous result (MAJ)*/ + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 + /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/ + /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */ + + ldi r21, 7*4 + adiw r28, 7*4 +a_shift_loop: + ld r25, -Y /* warning: this is PREdecrement */ + std Y+4, r25 + dec r21 + brne a_shift_loop + + ldd Bck1, Y+4*4+0 + ldd Bck2, Y+4*4+1 + ldd Bck3, Y+4*4+2 + ldd Bck4, Y+4*4+3 + add Bck1, T1 + adc Bck2, T2 + adc Bck3, T3 + adc Bck4, T4 + std Y+4*4+0, Bck1 + std Y+4*4+1, Bck2 + std Y+4*4+2, Bck3 + std Y+4*4+3, Bck4 + add Accu1, T1 + adc Accu2, T2 + adc Accu3, T3 + adc Accu4, T4 + std Y+4*0+0, Accu1 + std Y+4*0+1, Accu2 + std Y+4*0+2, Accu3 + std Y+4*0+3, Accu4 /* a array updated */ + + + dec LoopC + breq update_state + rjmp sha256_main_loop ;brne sha256_main_loop +update_state: + /* update state */ + /* pointers to state should still exist on the stack ;-) */ + pop r31 + pop r30 + ldi r21, 8 +update_state_loop: + ldd Accu1, Z+0 + ldd Accu2, Z+1 + ldd Accu3, Z+2 + ldd Accu4, Z+3 + ld Func1, Y+ + ld Func2, Y+ + ld Func3, Y+ + ld Func4, Y+ + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + st Z+, Accu1 + st Z+, Accu2 + st Z+, Accu3 + st Z+, Accu4 + dec r21 + brne update_state_loop + /* now we just have to update the length */ + adiw r30, 1 /* since we add 512, we can simply skip the LSB */ + ldi r21, 2 + ldi r22, 6 + ld r20, Z + add r20, r21 + st Z+, r20 + clr r21 +sha256_nextBlock_fix_length: + brcc sha256_nextBlock_epilog + ld r20, Z + adc r20, r21 + st Z+, r20 + dec r22 + brne sha256_nextBlock_fix_length + +; EPILOG +sha256_nextBlock_epilog: +/* now we should clean up the stack */ + + pop r21 + pop r20 + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + + clr r1 + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + pop r7 + pop r6 + pop r5 + pop r4 + 
ret + +sha256_kv: ; round-key-vector stored in ProgMem +.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c +.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b +.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9 +.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429 +.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272 +.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a +.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e +.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671 + + +;########################################################### + +.global sha256_init +;uint32_t sha256_init_vector[]={ +; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, +; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; +; +;void sha256_init(sha256_ctx_t *state){ +; state->length=0; +; memcpy(state->h, sha256_init_vector, 8*4); +;} +; param1: (r23,r24) 16-bit pointer to sha256_ctx_t struct in ram +; modifys: Z(r30,r31), Func1, r22 +sha256_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8((sha256_init_vector)) + ldi r31, hi8((sha256_init_vector)) + ldi r22, 32+8 +sha256_init_vloop: + lpm r23, Z+ + st X+, r23 + dec r22 + brne sha256_init_vloop + ret + +sha256_init_vector: +.word 0xE667, 0x6A09 +.word 0xAE85, 0xBB67 +.word 0xF372, 0x3C6E +.word 0xF53A, 0xA54F +.word 0x527F, 0x510E +.word 0x688C, 0x9B05 +.word 0xD9AB, 0x1F83 +.word 0xCD19, 0x5BE0 +.word 0x0000, 0x0000 +.word 0x0000, 0x0000 + +;########################################################### + +.global rotl32 +; === ROTL32 === +; function that rotates a 32 bit word to the left +; param1: the 32-bit word to rotate +; given in r25,r24,r23,r22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotl32: + cpi r20, 8 + brlo bitrotl + mov r21, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + subi r20, 8 + rjmp rotl32 +bitrotl: + clr r21 + clc +bitrotl_loop: + tst r20 + breq fixrotl + rol r22 + rol r23 + rol r24 + rol r25 + rol r21 + dec r20 + rjmp bitrotl_loop +fixrotl: + or r22, r21 + ret + + +;########################################################### + +.global rotr32 +; === ROTR32 === +; function that rotates a 32 bit word to the right +; param1: the 32-bit word to rotate +; given in r25,r24,r23,22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotr32: + cpi r20, 8 + brlo bitrotr + mov r21, r22 + mov r22, r23 + mov r23, r24 + mov r24, r25 + mov r25, r21 + subi r20, 8 + rjmp rotr32 +bitrotr: + clr r21 + clc +bitrotr_loop: + tst r20 + breq fixrotr + ror r25 + ror r24 + ror r23 + ror r22 + ror r21 + dec r20 + rjmp bitrotr_loop +fixrotr: + or r25, r21 + ret + + +;########################################################### + +.global change_endian32 +; === change_endian32 === +; function that changes the endianess of a 32-bit word +; param1: the 
32-bit word +; given in r25,r24,r23,22 (r25 is most significant) +; modifys: r21, r22 +change_endian32: + movw r20, r22 ; (r22,r23) --> (r20,r21) + mov r22, r25 + mov r23, r24 + mov r24, r21 + mov r25, r20 + ret + diff --git a/hmac-sha256/sha256.h b/hmac-sha256/sha256.h new file mode 100644 index 0000000..24960a3 --- /dev/null +++ b/hmac-sha256/sha256.h @@ -0,0 +1,122 @@ +/* sha256.h */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/** + * \file sha256.h + * \author Daniel Otte + * \date 2006-05-16 + * \license GPLv3 or later + * + */ + +#ifndef SHA256_H_ +#define SHA256_H_ + +#define __LITTLE_ENDIAN__ + + +#include + +/** \def SHA256_HASH_BITS + * defines the size of a SHA-256 hash value in bits + */ + +/** \def SHA256_HASH_BYTES + * defines the size of a SHA-256 hash value in bytes + */ + +/** \def SHA256_BLOCK_BITS + * defines the size of a SHA-256 input block in bits + */ + +/** \def SHA256_BLOCK_BYTES + * defines the size of a SHA-256 input block in bytes + */ + +#define SHA256_HASH_BITS 256 +#define SHA256_HASH_BYTES (SHA256_HASH_BITS/8) +#define SHA256_BLOCK_BITS 512 +#define SHA256_BLOCK_BYTES (SHA256_BLOCK_BITS/8) + +/** \typedef sha256_ctx_t + * \brief SHA-256 context type + * + * A variable of this type may hold the state of a SHA-256 hashing process + */ +typedef struct { + uint32_t h[8]; + uint64_t length; +} sha256_ctx_t; + +/** \typedef sha256_hash_t + * \brief SHA-256 hash value type + * + * A variable of this type may hold the hash value produced by the + * sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state) function. + */ +typedef uint8_t sha256_hash_t[SHA256_HASH_BYTES]; + +/** \fn void sha256_init(sha256_ctx_t *state) + * \brief initialise a SHA-256 context + * + * This function sets a ::sha256_ctx_t to the initial values for hashing. + * \param state pointer to the SHA-256 hashing context + */ +void sha256_init(sha256_ctx_t *state); + +/** \fn void sha256_nextBlock (sha256_ctx_t* state, const void* block) + * \brief update the context with a given block + * + * This function updates the SHA-256 hash context by processing the given block + * of fixed length. + * \param state pointer to the SHA-256 hash context + * \param block pointer to the block of fixed length (512 bit = 64 byte) + */ +void sha256_nextBlock (sha256_ctx_t* state, const void* block); + +/** \fn void sha256_lastBlock(sha256_ctx_t* state, const void* block, uint16_t length_b) + * \brief finalize the context with the given block + * + * This function finalizes the SHA-256 hash context by processing the given block + * of variable length. 
+ * \param state pointer to the SHA-256 hash context + * \param block pointer to the block of fixed length (512 bit = 64 byte) + * \param length_b the length of the block in bits + */ +void sha256_lastBlock(sha256_ctx_t* state, const void* block, uint16_t length_b); + +/** \fn void sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state) + * \brief convert the hash state into the hash value + * This function reads the context and writes the hash value to the destination + * \param dest pointer to the location where the hash value should be written + * \param state pointer to the SHA-256 hash context + */ +void sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state); + +/** \fn void sha256(sha256_hash_t* dest, const void* msg, uint32_t length_b) + * \brief simple SHA-256 hashing function for direct hashing + * + * This function automaticaly hashes a given message of arbitary length with + * the SHA-256 hashing algorithm. + * \param dest pointer to the location where the hash value is going to be written to + * \param msg pointer to the message thats going to be hashed + * \param length_b length of the message in bits + */ +void sha256(sha256_hash_t* dest, const void* msg, uint32_t length_b); + +#endif /*SHA256_H_*/ diff --git a/md5/md5-asm.S b/md5/md5-asm.S new file mode 100644 index 0000000..de3b170 --- /dev/null +++ b/md5/md5-asm.S @@ -0,0 +1,977 @@ +/* md5-asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
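
To make the assembly above easier to follow, here is a plain-C sketch of the same block transform, written from the pseudo-code comments embedded in sha256-asm.S. The function name and the extern k[] declaration are illustrative; the library itself keeps the round constants in the program-memory table sha256_kv and the state update inside sha256_nextBlock.

#include <stdint.h>
#include <string.h>

#define ROTR32(x,n)  (((x) >> (n)) | ((x) << (32 - (n))))
#define CH(x,y,z)    (((x) & (y)) ^ ((~(x)) & (z)))
#define MAJ(x,y,z)   (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define SIGMA0(x)    (ROTR32(x,2)  ^ ROTR32(x,13) ^ ROTR32(x,22))
#define SIGMA1(x)    (ROTR32(x,6)  ^ ROTR32(x,11) ^ ROTR32(x,25))
#define sigma_a(x)   (ROTR32(x,7)  ^ ROTR32(x,18) ^ ((x) >> 3))
#define sigma_b(x)   (ROTR32(x,17) ^ ROTR32(x,19) ^ ((x) >> 10))

extern const uint32_t k[64];   /* assumed: the 64 FIPS 180-2 round constants, same values as sha256_kv */

void sha256_compress_ref(uint32_t h[8], const uint8_t block[64]){
    uint32_t w[64], a[8], t1, t2;
    uint8_t i;
    for(i = 0; i < 16; ++i)    /* message words are big-endian in the block */
        w[i] = ((uint32_t)block[4*i] << 24) | ((uint32_t)block[4*i+1] << 16)
             | ((uint32_t)block[4*i+2] << 8) |  (uint32_t)block[4*i+3];
    for(i = 16; i < 64; ++i)   /* message schedule expansion */
        w[i] = sigma_b(w[i-2]) + w[i-7] + sigma_a(w[i-15]) + w[i-16];
    memcpy(a, h, sizeof(a));
    for(i = 0; i < 64; ++i){
        t1 = a[7] + SIGMA1(a[4]) + CH(a[4], a[5], a[6]) + k[i] + w[i];
        t2 = SIGMA0(a[0]) + MAJ(a[0], a[1], a[2]);
        memmove(&a[1], &a[0], 7 * sizeof(uint32_t));   /* a[7]=a[6]; ...; a[1]=a[0]; */
        a[4] += t1;
        a[0]  = t1 + t2;
    }
    for(i = 0; i < 8; ++i)     /* add the working variables back into the state */
        h[i] += a[i];
}
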
+*/ +/* + * Author: Daniel Otte + * License: GPLv3 or later + * Date: 2008-11-15 +*/ + + +#include "avr-asm-macros.S" + +;########################################################### +; S-BOX + +T_table: +.hword 0xa478, 0xd76a, 0xb756, 0xe8c7, 0x70db, 0x2420, 0xceee, 0xc1bd, 0x0faf, 0xf57c +.hword 0xc62a, 0x4787, 0x4613, 0xa830, 0x9501, 0xfd46, 0x98d8, 0x6980, 0xf7af, 0x8b44 +.hword 0x5bb1, 0xffff, 0xd7be, 0x895c, 0x1122, 0x6b90, 0x7193, 0xfd98, 0x438e, 0xa679 +.hword 0x0821, 0x49b4, 0x2562, 0xf61e, 0xb340, 0xc040, 0x5a51, 0x265e, 0xc7aa, 0xe9b6 +.hword 0x105d, 0xd62f, 0x1453, 0x0244, 0xe681, 0xd8a1, 0xfbc8, 0xe7d3, 0xcde6, 0x21e1 +.hword 0x07d6, 0xc337, 0x0d87, 0xf4d5, 0x14ed, 0x455a, 0xe905, 0xa9e3, 0xa3f8, 0xfcef +.hword 0x02d9, 0x676f, 0x4c8a, 0x8d2a, 0x3942, 0xfffa, 0xf681, 0x8771, 0x6122, 0x6d9d +.hword 0x380c, 0xfde5, 0xea44, 0xa4be, 0xcfa9, 0x4bde, 0x4b60, 0xf6bb, 0xbc70, 0xbebf +.hword 0x7ec6, 0x289b, 0x27fa, 0xeaa1, 0x3085, 0xd4ef, 0x1d05, 0x0488, 0xd039, 0xd9d4 +.hword 0x99e5, 0xe6db, 0x7cf8, 0x1fa2, 0x5665, 0xc4ac, 0x2244, 0xf429, 0xff97, 0x432a +.hword 0x23a7, 0xab94, 0xa039, 0xfc93, 0x59c3, 0x655b, 0xcc92, 0x8f0c, 0xf47d, 0xffef +.hword 0x5dd1, 0x8584, 0x7e4f, 0x6fa8, 0xe6e0, 0xfe2c, 0x4314, 0xa301, 0x11a1, 0x4e08 +.hword 0x7e82, 0xf753, 0xf235, 0xbd3a, 0xd2bb, 0x2ad7, 0xd391, 0xeb86 + + +#define MD5_init_fast + +.global md5_init +#ifndef MD5_init_fast +;########################################################### +;void md5_init(md5_ctx_t *state) +; param1: (r24,r25) 16-bit pointer to sha256_ctx_t struct in ram +; modifys: Z(r30,r31), X(r25,r26) +; size = 9+5*4 WORDS = 29 WORDS = 58 Bytes +md5_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8(md5_init_vector) + ldi r31, hi8(md5_init_vector) + ldi r24, 16+4 +md5_init_vloop: + lpm r0, Z+ + st X+, r0 + dec r24 + brne md5_init_vloop + ret + +md5_init_vector: +.hword 0x2301, 0x6745 +.hword 0xAB89, 0xEFCD +.hword 0xDCFE, 0x98BA +.hword 0x5476, 0x1032 +.hword 0x0000, 0x0000 + +#else +;########################################################### +.global md5_init_fast +;void md5_init(md5_ctx_t *state) +; param1: (r24,r25) 16-bit pointer to sha256_ctx_t struct in ram +; modifys: r23, r22 +; cycles = 1+16*3+4*2+4 = 1+48+12 = 61 +; size = 1+16*2+4+1 WORDS = 38 WORDS = 76 Bytes +md5_init: +md5_init_fast: + movw r26, r24 + ldi r24, 0x01 + st X+, r24 + ldi r24, 0x23 + st X+, r24 + ldi r24, 0x45 + st X+, r24 + ldi r24, 0x67 + st X+, r24 + ldi r24, 0x89 + st X+, r24 + ldi r24, 0xAB + st X+, r24 + ldi r24, 0xCD + st X+, r24 + ldi r24, 0xEF + st X+, r24 + ldi r24, 0xFE + st X+, r24 + ldi r24, 0xDC + st X+, r24 + ldi r24, 0xBA + st X+, r24 + ldi r24, 0x98 + st X+, r24 + ldi r24, 0x76 + st X+, r24 + ldi r24, 0x54 + st X+, r24 + ldi r24, 0x32 + st X+, r24 + ldi r24, 0x10 + st X+, r24 + st X+, r1 + st X+, r1 + st X+, r1 + st X+, r1 + ret +#endif +;########################################################### + +/* +static +uint32_t md5_F(uint32_t x, uint32_t y, uint32_t z){ + return ((x&y)|((~x)&z)); +} +*/ +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 +md5_F: + and r18, r22 + and r19, r23 + and r20, r24 + and r21, r25 + com r22 + com r23 + com r24 + com r25 + and r22, r14 + and r23, r15 + and r24, r16 + and r25, r17 + or r22, r18 + or r23, r19 + or r24, r20 + or r25, r21 + rjmp md5_core_F_exit + +/* +static +uint32_t md5_G(uint32_t x, uint32_t y, uint32_t z){ + return ((x&z)|((~z)&y)); +} +*/ + +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 +md5_G: + and r22, r14 + and r23, r15 + and r24, r16 + and r25, r17 + com r14 + com r15 + 
com r16 + com r17 + and r18, r14 + and r19, r15 + and r20, r16 + and r21, r17 + or r22, r18 + or r23, r19 + or r24, r20 + or r25, r21 + rjmp md5_core_F_exit +/* +static +uint32_t md5_H(uint32_t x, uint32_t y, uint32_t z){ + return (x^y^z); +} +*/ +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 +md5_H: + eor r22, r18 + eor r22, r14 + eor r23, r19 + eor r23, r15 + eor r24, r20 + eor r24, r16 + eor r25, r21 + eor r25, r17 + rjmp md5_core_F_exit +/* +static +uint32_t md5_I(uint32_t x, uint32_t y, uint32_t z){ + return (y ^ (x | (~z))); +} +*/ + +jump_table: + rjmp md5_F + rjmp md5_G + rjmp md5_H +; rjmp md5_I + +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 +md5_I: + com r14 + com r15 + com r16 + com r17 + or r22, r14 + or r23, r15 + or r24, r16 + or r25, r17 + eor r22, r18 + eor r23, r19 + eor r24, r20 + eor r25, r21 + rjmp md5_core_F_exit + +as_table: +; (as+0)&3 (as+3)&3 (as+1)&3 (as+2)&3 +; Z X Y +; AS_SAVE0 AS_SAVE1 AS_SAVE2 AS_SAVE3 +.byte 1*4, 0*4, 2*4, 3*4 ;as=1 +.byte 2*4, 1*4, 3*4, 0*4 ;as=2 +.byte 3*4, 2*4, 0*4, 1*4 ;as=3 +.byte 0*4, 3*4, 1*4, 2*4 ;as=4 + +;########################################################### +.global md5_core +md5_core: + mov r21, r20 + mov r20, r18 + mov r19, r16 + mov r18, r14 +; rjmp md5_core_asm +/* +void md5_core(uint32_t* a, void* block, uint8_t as, uint8_t s, uint8_t i, uint8_t fi){ + uint32_t t; + md5_func_t* funcs[]={md5_F, md5_G, md5_H, md5_I}; + as &= 0x3; + / * a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). * / + t = a[as] + funcs[fi](a[(as+1)&3], a[(as+2)&3], a[(as+3)&3]) + *((uint32_t*)block) + md5_T[i] ; + a[as]=a[(as+1)&3] + ROTL32(t, s); +} +*/ +; a: r24-r25 +; block: r22-r23 +; as: r21 +; s: r20 +; i: r19 +; fi: r18 +P_A0 = 24 +P_A1 = 25 +P_B0 = 22 +P_B1 = 23 +P_AS = 21 +P_S = 20 +P_I = 19 +P_FI = 18 + +; x: r22-r25 +; y: r18-r21 +; z: r14-r17 + + +AS_SAVE0 = 4 +AS_SAVE1 = 5 +AS_SAVE2 = 6 +AS_SAVE3 = 7 +FI_SAVE = 8 +S_SAVE = 9 +ACCU0 = 10 +ACCU1 = 11 +ACCU2 = 12 +ACCU3 = 13 +ARG_X0 = 22 +ARG_X1 = 23 +ARG_X2 = 24 +ARG_X3 = 25 +ARG_Y0 = 18 +ARG_Y1 = 19 +ARG_Y2 = 20 +ARG_Y3 = 21 +ARG_Z0 = 14 +ARG_Z1 = 15 +ARG_Z2 = 16 +ARG_Z3 = 17 + + +md5_core_asm: + push r16 + push r17 + push_range 4, 8 + ldi r30, lo8(T_table) + ldi r31, hi8(T_table) + lsl P_I + rol r1 + lsl P_I + rol r1 + add r30, P_I + adc r31, r1 + clr r1 + mov FI_SAVE, r18 + /* loading T[i] into ACCU */ + lpm ACCU0, Z+ + lpm ACCU1, Z+ + lpm ACCU2, Z+ + lpm ACCU3, Z + /* add *block to ACCU */ + movw r30, P_B0 + ld r0, Z+ + add ACCU0, r0 + ld r0, Z+ + adc ACCU1, r0 + ld r0, Z+ + adc ACCU2, r0 + ld r0, Z+ + adc ACCU3, r0 + /* add a[as+0&3] to ACCU */ + ldi r30, lo8(as_table) + ldi r31, hi8(as_table) + dec P_AS + andi P_AS, 0x03 + lsl P_AS + lsl P_AS + add r30, r21 + adc r31, r1 ; Z points to the correct row in as_table + lpm AS_SAVE0, Z+ + lpm AS_SAVE1, Z+ + lpm AS_SAVE2, Z+ + lpm AS_SAVE3, Z + movw r26, r24 ; X points to a[0] + add r26, AS_SAVE0 + adc r27, r1 ; X points at a[as&3] + ld r0, X+ + add ACCU0, r0 + ld r0, X+ + adc ACCU1, r0 + ld r0, X+ + adc ACCU2, r0 + ld r0, X+ + adc ACCU3, r0 + mov S_SAVE, r20 + + movw r28, r24 + /* loading z value */ + movw r26, r28 + add r26, AS_SAVE1 + adc r27, r1 + ld ARG_Z0, X+ + ld ARG_Z1, X+ + ld ARG_Z2, X+ + ld ARG_Z3, X + + /* loading x value */ + movw r26, r28 + add r26, AS_SAVE2 + adc r27, r1 + ld ARG_X0, X+ + ld ARG_X1, X+ + ld ARG_X2, X+ + ld ARG_X3, X + + /* loading y value */ + movw r26, r28 + add r26, AS_SAVE3 + adc r27, r1 + ldi r30, pm_lo8(jump_table) + ldi r31, pm_hi8(jump_table) + add r30, FI_SAVE + adc r31, r1 ; Z points to the correct entry in our 
jump table + ld ARG_Y0, X+ + ld ARG_Y1, X+ + ld ARG_Y2, X+ + ld ARG_Y3, X + + ijmp /* calls the function pointed by Z */ +md5_core_F_exit: + + /* add ACCU to result of f() */ + add r22, ACCU0 + adc r23, ACCU1 + adc r24, ACCU2 + adc r25, ACCU3 + + /* rotate */ + mov r20, S_SAVE +rotl32: + cpi r20, 8 + brlo bitrotl + mov r21, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + subi r20, 8 + rjmp rotl32 +bitrotl: + mov r21, r25 +bitrotl_loop: + tst r20 + breq fixrotl +bitrotl_loop2: + lsl r21 + rol r22 + rol r23 + rol r24 + rol r25 + dec r20 + brne bitrotl_loop2 +fixrotl: + + /* add a[(as+1)&3] */ + movw r26, r28 + add r26, AS_SAVE2 + adc r27, r1 + ld r0, X+ + add r22, r0 + ld r0, X+ + adc r23, r0 + ld r0, X+ + adc r24, r0 + ld r0, X + adc r25, r0 + + /* store result */ + movw r26, r28 + add r26, AS_SAVE0 + adc r27, r1 + st X+, r22 + st X+, r23 + st X+, r24 + st X , r25 +md5_core_exit: + pop_range 4, 8 + pop r17 + pop r16 + ret + +;################################################################### +/* +void md5_nextBlock(md5_ctx_t *state, void* block){ + uint32_t a[4]; + uint8_t m,n,i=0; + + a[0]=state->a[0]; + a[1]=state->a[1]; + a[2]=state->a[2]; + a[3]=state->a[3]; + + / * round 1 * / + uint8_t s1t[]={7,12,17,22}; // 1,-1 1,4 2,-1 3,-2 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[m*4+n]), 4-n, s1t[n],i++,0); + } + } + / * round 2 * / + uint8_t s2t[]={5,9,14,20}; // 1,-3 1,1 2,-2 2,4 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[(1+m*4+n*5)&0xf]), 4-n, s2t[n],i++,1); + } + } + / * round 3 * / + uint8_t s3t[]={4,11,16,23}; // 0,4 1,3 2,0 3,-1 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[(5-m*4+n*3)&0xf]), 4-n, s3t[n],i++,2); + } + } + / * round 4 * / + uint8_t s4t[]={6,10,15,21}; // 1,-2 1,2 2,-1 3,-3 + for(m=0;m<4;++m){ + for(n=0;n<4;++n){ + md5_core(a, &(((uint32_t*)block)[(0-m*4+n*7)&0xf]), 4-n, s4t[n],i++,3); + } + } + state->a[0] += a[0]; + state->a[1] += a[1]; + state->a[2] += a[2]; + state->a[3] += a[3]; + state->counter++; +} +*/ + +shift_table_1: .byte 7,12,17,22 +shift_table_2: .byte 5, 9,14,20 +shift_table_3: .byte 4,11,16,23 +shift_table_4: .byte 6,10,15,21 + +index_table_r2: +;(1+m*4+n*5)&0xf: + .byte 0x04, 0x18, 0x2c, 0x00 + .byte 0x14, 0x28, 0x3c, 0x10 + .byte 0x24, 0x38, 0x0c, 0x20 + .byte 0x34, 0x08, 0x1c, 0x30 + +index_table_r3: +;(5-m*4+n*3)&0xf: + .byte 0x14, 0x20, 0x2c, 0x38 + .byte 0x04, 0x10, 0x1c, 0x28 + .byte 0x34, 0x00, 0x0c, 0x18 + .byte 0x24, 0x30, 0x3c, 0x08 + +index_table_r4: +;(0-m*4+n*7)&0xf: + .byte 0x00, 0x1c, 0x38, 0x14 + .byte 0x30, 0x0c, 0x28, 0x04 + .byte 0x20, 0x3c, 0x18, 0x34 + .byte 0x10, 0x2c, 0x08, 0x24 + +APTR_REG = 2 +BPTR_REG = 4 +N_REG = 6 +M_REG = 7 +I_REG = 8 +.global md5_nextBlock +md5_nextBlock: + stack_alloc 16 + push_range 2, 17 + push r28 + push r29 + push r24 + push r25 + adiw r30, 1 /* Z now points to the beginning of the allocated memory */ + movw r2, r30 + movw r4, r22 + movw r26, r24 + ldi r20, 16 +1: + ld r0, X+ + st Z+, r0 + dec r20 + brne 1b + /* state now copied to stack memory */ + clr I_REG + /* Round 1 */ + clr M_REG + ldi r17, 4 +1: + clr N_REG + ldi r16, 4 +2: + movw r24, APTR_REG + movw r22, BPTR_REG + mov r0, M_REG + lsl r0 + lsl r0 + add r0, N_REG + lsl r0 + lsl r0 + add r22, r0 + adc r23, r1 + mov r21, r16 + ldi r30, lo8(shift_table_1) + ldi r31, hi8(shift_table_1) + add r30, N_REG + adc r31, r1 + lpm r20, Z + mov r19, I_REG + ldi r18, 0 + rcall md5_core_asm + inc I_REG + inc N_REG + dec r16 + brne 2b + inc M_REG + dec 
r17 + brne 1b + + /* Round 2 */ + clr M_REG + ldi r17, 4 +1: + clr N_REG + ldi r16, 4 +2: + movw r24, APTR_REG + movw r22, BPTR_REG + ldi r30, lo8(index_table_r2) + ldi r31, hi8(index_table_r2) + mov r0, M_REG + lsl r0 + lsl r0 + add r0, N_REG + add r30, r0 + adc r31, r1 + lpm r0, Z + add r22, r0 + adc r23, r1 + mov r21, r16 + ldi r30, lo8(shift_table_2) + ldi r31, hi8(shift_table_2) + add r30, N_REG + adc r31, r1 + lpm r20, Z + mov r19, I_REG + ldi r18, 1 + rcall md5_core_asm + inc I_REG + inc N_REG + dec r16 + brne 2b + inc M_REG + dec r17 + brne 1b + + /* Round 3 */ + clr M_REG + ldi r17, 4 +1: + clr N_REG + ldi r16, 4 +2: + movw r24, APTR_REG + movw r22, BPTR_REG + ldi r30, lo8(index_table_r3) + ldi r31, hi8(index_table_r3) + mov r0, M_REG + lsl r0 + lsl r0 + add r0, N_REG + add r30, r0 + adc r31, r1 + lpm r0, Z + add r22, r0 + adc r23, r1 + mov r21, r16 + ldi r30, lo8(shift_table_3) + ldi r31, hi8(shift_table_3) + add r30, N_REG + adc r31, r1 + lpm r20, Z + mov r19, I_REG + ldi r18, 2 + rcall md5_core_asm + inc I_REG + inc N_REG + dec r16 + brne 2b + inc M_REG + dec r17 + brne 1b + + /* Round 4 */ + clr M_REG + ldi r17, 4 +1: + clr N_REG + ldi r16, 4 +2: + movw r24, APTR_REG + movw r22, BPTR_REG + ldi r30, lo8(index_table_r4) + ldi r31, hi8(index_table_r4) + mov r0, M_REG + lsl r0 + lsl r0 + add r0, N_REG + add r30, r0 + adc r31, r1 + lpm r0, Z + add r22, r0 + adc r23, r1 + mov r21, r16 + ldi r30, lo8(shift_table_4) + ldi r31, hi8(shift_table_4) + add r30, N_REG + adc r31, r1 + lpm r20, Z + mov r19, I_REG + ldi r18, 3 + rcall md5_core_asm + inc I_REG + inc N_REG + dec r16 + brne 2b + inc M_REG + dec r17 + brne 1b + + + pop r27 + pop r26 /* X now points to the context */ + movw r30, APTR_REG + ldi r16, 4 +1: + ld r0, X + ld r2, Z+ + add r0, r2 + st X+, r0 + ld r0, X + ld r2, Z+ + adc r0, r2 + st X+, r0 + ld r0, X + ld r2, Z+ + adc r0, r2 + st X+, r0 + ld r0, X + ld r2, Z+ + adc r0, r2 + st X+, r0 + dec r16 + brne 1b + + ld r0, X + inc r0 + st X+, r0 + brne 2f + ld r0, X + inc r0 + st X+, r0 + brne 2f + ld r0, X + inc r0 + st X+, r0 + brne 2f + ld r0, X + inc r0 + st X+, r0 +2: + + pop r29 + pop r28 + pop_range 2, 17 + stack_free 16 + ret + +;############################################################################### +/* +void md5_lastBlock(md5_ctx_t *state, const void* block, uint16_t length_b){ + uint16_t l; + uint8_t b[64]; + while (length_b >= 512){ + md5_nextBlock(state, block); + length_b -= 512; + block = ((uint8_t*)block) + 512/8; + } + memset(b, 0, 64); + memcpy(b, block, length_b/8); + / * insert padding one * / + l=length_b/8; + if(length_b%8){ + uint8_t t; + t = ((uint8_t*)block)[l]; + t |= (0x80>>(length_b%8)); + b[l]=t; + }else{ + b[l]=0x80; + } + / * insert length value * / + if(l+sizeof(uint64_t) >= 512/8){ + md5_nextBlock(state, b); + state->counter--; + memset(b, 0, 64-8); + } + *((uint64_t*)&b[64-sizeof(uint64_t)]) = (state->counter * 512) + length_b; + md5_nextBlock(state, b); +} +*/ +; state_ptr : r24,r25 +; block_ptr : r22,r23 +; length_b : r20,r21 +.global md5_lastBlock +md5_lastBlock: + stack_alloc_large 64 + push_range 12, 17 + push r30 + push r31 + movw r16, r20 /* length_b */ + movw r14, r22 /* block_ptr */ + movw r12, r24 /* state_ptr */ + ldi r18, 64 +2: + cpi r17, 2 /* hi8(512) */ + brlo 2f +1: + movw r24, r12 + movw r22, r14 + rcall md5_nextBlock + add r14, r18 + adc r15, r1 + subi r17, 2 + rjmp 2b +2: + pop r31 + pop r30 + + adiw r30, 1 /* adjust Z to point to buffer */ + movw r26, r14 + movw r24, r16 + adiw r24, 7 + + lsr r25 + ror r24 + lsr r25 + 
ror r24 + lsr r24 /* r24 now holds how many bytes are to copy */ + ldi r18, 64 + sub r18, r24 /* r18 will hold the amount of used bytes in buffer */ + tst r24 +4: + breq 5f + ld r0, X+ + st Z+, r0 + dec r24 + rjmp 4b /* Z points to the byte after msg in buffer */ +5: /* append 1-bit */ + mov r20, r16 + ldi r19, 0x80 + andi r20, 0x07 + brne bit_fucking + st Z+, r19 + dec r18 /* 'allocate' another byte in buffer */ + rjmp after_bit_fucking +bit_fucking: +1: + lsr r19 + dec r20 + brne 1b + or r0, r19 + st -Z, r0 + adiw r30, 1 +after_bit_fucking: + clt + cpi r18, 8 + brmi 2f + set /* store in t if the counter will also fit in this block (1 if fit)*/ +2: + tst r18 + breq 2f +1: /* fill remaning buffer with zeros */ + st Z+, r1 + dec r18 + brne 1b +2: + sbiw r30, 63 + sbiw r30, 1 + movw r14, r30 /* r14:r15 now points to buffer */ + brts load_counter + /* counter does not fit, finalize this block */ + movw r24, r12 + movw r22, r14 + rcall md5_nextBlock + movw r30, r14 + ldi r20, 64-8 +3: + st Z+, r1 + dec r20 + brne 3b + +load_counter: + movw r26, r12 /* X points to state */ + adiw r26, 16 + ld r19, X+ + ld r20, X+ + ld r21, X+ + ld r22, X+ + brts post_counter_decrement /* do not decremen because counter fits */ +counter_decrement: + subi r19, 1 + sbci r20, 0 + sbci r21, 0 + sbci r22, 0 +post_counter_decrement: + clr r18 + clr r23 + lsl r19 + rol r20 + rol r21 + rol r22 + rol r23 + mov r18, r16 /* r16:r17 length_b */ + add r19, r17 + adc r20, r1 + adc r21, r1 + adc r22, r1 + adc r23, r1 + movw r30, r14 + adiw r30, 64-8 + st Z+, r18 + st Z+, r19 + st Z+, r20 + st Z+, r21 + st Z+, r22 + st Z+, r23 + st Z+, r1 + st Z, r1 + + sbiw r30, 63 +; sbiw r30, 1 + movw r24, r12 + movw r22, r30 + rcall md5_nextBlock +md5_lastBlock_exit: + pop_range 12, 17 + stack_free_large 64 + ret + + +;############################################################################### + + +.global md5_ctx2hash +md5_ctx2hash: + movw r26, r24 + movw r30, r22 + ldi r22, 16 +1: + ld r0, Z+ + st X+, r0 + dec r22 + brne 1b + ret + + +;############################################################################### + + +.global md5 +md5: + stack_alloc 20 + push_range 8, 17 + adiw r30, 1 + movw r8, r30 /* ctx */ + movw r10, r24 /* dest */ + movw r12, r22 /* msg */ + movw r14, r18 /* length (low) */ + movw r16, r20 /* length (high) */ + movw r24, r30 + rcall md5_init +1: + tst r16 + brne next_round + tst r17 + breq last_round +next_round: + movw r24, r8 + movw r22, r12 + rcall md5_nextBlock + ldi r22, 64 + add r12, r22 + adc r13, r1 + ldi r22, 2 + sub r15, r22 + sbci r16, 0 + sbci r17, 0 + rjmp 1b +last_round: + movw r24, r8 + movw r22, r12 + movw r20, r14 + rcall md5_lastBlock + movw r24, r10 + movw r22, r8 + rcall md5_ctx2hash + pop_range 8, 17 + stack_free 20 + ret + + + diff --git a/md5.c b/md5/md5.c similarity index 100% rename from md5.c rename to md5/md5.c diff --git a/md5/md5.h b/md5/md5.h new file mode 100644 index 0000000..6b65c4a --- /dev/null +++ b/md5/md5.h @@ -0,0 +1,55 @@ +/* md5.h */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/* + * File: md5.h + * Author: Daniel Otte + * Date: 31.07.2006 + * License: GPL + * Description: Implementation of the MD5 hash algorithm as described in RFC 1321 + * + */ + + +#ifndef MD5_H_ +#define MD5_H_ + +#include + + +#define MD5_HASH_BITS 128 +#define MD5_HASH_BYTES (MD5_HASH_BITS/8) +#define MD5_BLOCK_BITS 512 +#define MD5_BLOCK_BYTES (MD5_BLOCK_BITS/8) + + +typedef struct md5_ctx_st { + uint32_t a[4]; + uint32_t counter; +} md5_ctx_t; + +typedef uint8_t md5_hash_t[MD5_HASH_BYTES]; + + +void md5_init(md5_ctx_t *s); +void md5_nextBlock(md5_ctx_t *state, const void* block); +void md5_lastBlock(md5_ctx_t *state, const void* block, uint16_t length); +void md5_ctx2hash(md5_hash_t* dest, const md5_ctx_t* state); +void md5(md5_hash_t* dest, const void* msg, uint32_t length_b); + +#endif /*MD5_H_*/ diff --git a/md5_sbox.h b/md5/md5_sbox.h similarity index 100% rename from md5_sbox.h rename to md5/md5_sbox.h diff --git a/mickey128.c b/mickey128/mickey128.c similarity index 100% rename from mickey128.c rename to mickey128/mickey128.c diff --git a/mickey128.h b/mickey128/mickey128.h similarity index 100% rename from mickey128.h rename to mickey128/mickey128.h diff --git a/mkfiles/arcfour.mk b/mkfiles/arcfour.mk index d62c144..ebc6858 100644 --- a/mkfiles/arcfour.mk +++ b/mkfiles/arcfour.mk @@ -4,6 +4,7 @@ ALGO_NAME := ARCFOUR # comment out the following line for removement of ARCFOUR from the build process STREAM_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := arcfour/ $(ALGO_NAME)_OBJ := arcfour-asm.o $(ALGO_NAME)_TEST_BIN := main-arcfour-test.o $(CLI_STD) \ nessie_stream_test.o nessie_common.o performance_test.o diff --git a/mkfiles/arcfour_c.mk b/mkfiles/arcfour_c.mk index 40a3a84..5bee9f0 100644 --- a/mkfiles/arcfour_c.mk +++ b/mkfiles/arcfour_c.mk @@ -4,10 +4,9 @@ ALGO_NAME := ARCFOUR_C # comment out the following line for removement of ARCFOUR from the build process STREAM_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := arcfour/ $(ALGO_NAME)_OBJ := arcfour.o -$(ALGO_NAME)_TEST_BIN := main-arcfour-test.o $(CLI_STD) \ - nessie_stream_test.o nessie_common.o \ - performance_test.o +$(ALGO_NAME)_TEST_BIN := main-arcfour-test.o $(CLI_STD) nessie_stream_test.o nessie_common.o performance_test.o $(ALGO_NAME)_NESSIE_TEST := "nessie" $(ALGO_NAME)_PERFORMANCE_TEST := "performance" diff --git a/mkfiles/base64.mk b/mkfiles/base64.mk index 58883c3..013b7ee 100644 --- a/mkfiles/base64.mk +++ b/mkfiles/base64.mk @@ -4,7 +4,7 @@ ALGO_NAME := BASE64 # comment out the following line for removement of base64 from the build process ENCODINGS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := base64/ $(ALGO_NAME)_OBJ := base64_enc.o base64_dec.o $(ALGO_NAME)_TEST_BIN := main-base64-test.o $(CLI_STD) \ performance_test.o noekeon_asm.o noekeon_prng.o memxor.o diff --git a/mkfiles/bmw_c.mk b/mkfiles/bmw_c.mk index 6305932..585bbb2 100644 --- a/mkfiles/bmw_c.mk +++ b/mkfiles/bmw_c.mk @@ -4,7 +4,7 @@ ALGO_NAME := BMW_C # comment out the following line for removement of BlueMidnightWish from the build process HASHES += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := bmw/ $(ALGO_NAME)_OBJ := bmw_small.o bmw_large.o $(ALGO_NAME)_TEST_BIN := main-bmw-test.o hfal_bmw_small.o hfal_bmw_large.o $(CLI_STD) $(HFAL_STD) $(ALGO_NAME)_NESSIE_TEST := test nessie diff --git a/mkfiles/cast5.mk b/mkfiles/cast5.mk index 08ead6e..318a0e5 100644 --- a/mkfiles/cast5.mk +++ b/mkfiles/cast5.mk 
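The md5/md5.h header above fixes the public interface that md5-asm.S implements; all length arguments are bit counts, matching the comments in the assembly. A minimal usage sketch (the message text and the helper name md5_example are illustrative only):

    #include <string.h>
    #include "md5.h"

    /* hash a short RAM buffer, once with the one-shot wrapper and once
       with the streaming interface */
    void md5_example(void)
    {
        static const char msg[] = "abc";
        md5_hash_t hash;
        md5_ctx_t ctx;

        /* one-shot wrapper, length in bits */
        md5(&hash, msg, strlen(msg) * 8);

        /* streaming interface: a message longer than 64 bytes would first be
           fed to md5_nextBlock() in full 64-byte blocks */
        md5_init(&ctx);
        md5_lastBlock(&ctx, msg, strlen(msg) * 8);
        md5_ctx2hash(&hash, &ctx);
    }
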
@@ -4,6 +4,7 @@ ALGO_NAME := CAST5 # comment out the following line for removement of CAST5 from the build process BLOCK_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := cast5/ $(ALGO_NAME)_OBJ := cast5.o $(ALGO_NAME)_TEST_BIN := main-cast5-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/cast6.mk b/mkfiles/cast6.mk index 3f8539f..f28800a 100644 --- a/mkfiles/cast6.mk +++ b/mkfiles/cast6.mk @@ -4,7 +4,7 @@ ALGO_NAME := CAST6 # comment out the following line for removement of CAST6 from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := cast6/ $(ALGO_NAME)_OBJ := cast6.o $(ALGO_NAME)_TEST_BIN := main-cast6-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/des.mk b/mkfiles/des.mk index 5d9540d..180d9e1 100644 --- a/mkfiles/des.mk +++ b/mkfiles/des.mk @@ -4,6 +4,7 @@ ALGO_NAME := DES # comment out the following line for removement of DES from the build process BLOCK_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := des/ $(ALGO_NAME)_OBJ := des.o $(ALGO_NAME)_TEST_BIN := main-des-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/entropium.mk b/mkfiles/entropium.mk index 02ad75b..e87b3de 100644 --- a/mkfiles/entropium.mk +++ b/mkfiles/entropium.mk @@ -4,6 +4,7 @@ ALGO_NAME := ENTROPIUM # comment out the following line for removement of PRNG from the build process PRNGS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := entropium/ $(ALGO_NAME)_OBJ := entropium.o sha256-asm.o $(ALGO_NAME)_TEST_BIN := main-entropium-test.o $(CLI_STD) performance_test.o diff --git a/mkfiles/grain.mk b/mkfiles/grain.mk index 1b0da65..5e6638e 100644 --- a/mkfiles/grain.mk +++ b/mkfiles/grain.mk @@ -4,6 +4,7 @@ ALGO_NAME := GRAIN # comment out the following line for removement of Grain from the build process STREAM_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := grain/ $(ALGO_NAME)_OBJ := grain.o $(ALGO_NAME)_TEST_BIN := main-grain-test.o $(CLI_STD) \ nessie_stream_test.o nessie_common.o performance_test.o diff --git a/mkfiles/hmac-md5.mk b/mkfiles/hmac-md5.mk index 7d3f644..fd23627 100644 --- a/mkfiles/hmac-md5.mk +++ b/mkfiles/hmac-md5.mk @@ -4,9 +4,10 @@ ALGO_NAME := HMAC-MD5 # comment out the following line for removement of HMAC-MD5 from the build process MACS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := hmac-md5/ $(ALGO_NAME)_OBJ := hmac-md5.o md5-asm.o $(ALGO_NAME)_TEST_BIN := main-hmac-md5-test.o $(CLI_STD) \ - nessie_mac_test.o nessie_common.o base64_enc.o base64_dec.o + nessie_mac_test.o nessie_common.o $(ALGO_NAME)_NESSIE_TEST := "nessie" $(ALGO_NAME)_PERFORMANCE_TEST := "performance" diff --git a/mkfiles/hmac-sha1.mk b/mkfiles/hmac-sha1.mk index 9087400..40958a7 100644 --- a/mkfiles/hmac-sha1.mk +++ b/mkfiles/hmac-sha1.mk @@ -4,6 +4,7 @@ ALGO_NAME := HMAC-SHA1 # comment out the following line for removement of HMAC-SHA1 from the build process MACS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := hmac-sha1/ $(ALGO_NAME)_OBJ := hmac-sha1.o sha1-asm.o $(ALGO_NAME)_TEST_BIN := main-hmac-sha1-test.o $(CLI_STD) \ nessie_mac_test.o nessie_common.o diff --git a/mkfiles/hmac-sha256.mk b/mkfiles/hmac-sha256.mk index ba221ab..4b25ea9 100644 --- a/mkfiles/hmac-sha256.mk +++ b/mkfiles/hmac-sha256.mk @@ -4,6 +4,7 @@ ALGO_NAME := HMAC-SHA256 # comment out the following line for removement of HMAC-SHA256 from the build process MACS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := hmac-sha256/ $(ALGO_NAME)_OBJ := hmac-sha256.o sha256-asm.o $(ALGO_NAME)_TEST_BIN := main-hmac-sha256-test.o $(CLI_STD) \ nessie_mac_test.o 
nessie_common.o diff --git a/mkfiles/md5.mk b/mkfiles/md5.mk index 025e9eb..bebdaa3 100644 --- a/mkfiles/md5.mk +++ b/mkfiles/md5.mk @@ -4,6 +4,7 @@ ALGO_NAME := MD5 # comment out the following line for removement of MD5 from the build process HASHES += $(ALGO_NAME) +$(ALGO_NAME)_DIR := md5/ $(ALGO_NAME)_OBJ := md5-asm.o $(ALGO_NAME)_TEST_BIN := main-md5-test.o hfal_md5.o $(CLI_STD) $(HFAL_STD) $(ALGO_NAME)_NESSIE_TEST := "nessie" diff --git a/mkfiles/md5_c.mk b/mkfiles/md5_c.mk index 7b6bb1a..d7421e7 100644 --- a/mkfiles/md5_c.mk +++ b/mkfiles/md5_c.mk @@ -4,6 +4,7 @@ ALGO_NAME := MD5_C # comment out the following line for removement of MD5 from the build process HASHES += $(ALGO_NAME) +$(ALGO_NAME)_DIR := md5/ $(ALGO_NAME)_OBJ := md5.o $(ALGO_NAME)_TEST_BIN := main-md5-test.o hfal_md5.o $(CLI_STD) $(HFAL_STD) $(ALGO_NAME)_NESSIE_TEST := "nessie" diff --git a/mkfiles/mickey128.mk b/mkfiles/mickey128.mk index 6c95c96..c7bc17d 100644 --- a/mkfiles/mickey128.mk +++ b/mkfiles/mickey128.mk @@ -4,6 +4,7 @@ ALGO_NAME := MICKEY128 # comment out the following line for removement of Mickey128 from the build process STREAM_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := mickey128/ $(ALGO_NAME)_OBJ := mickey128.o $(ALGO_NAME)_TEST_BIN := main-mickey128-test.o $(CLI_STD) \ nessie_stream_test.o nessie_common.o diff --git a/mkfiles/present.mk b/mkfiles/present.mk index 3a4012c..3c73f82 100644 --- a/mkfiles/present.mk +++ b/mkfiles/present.mk @@ -4,7 +4,7 @@ ALGO_NAME := PRESENT # comment out the following line for removement of present from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := present/ $(ALGO_NAME)_OBJ := present.o $(ALGO_NAME)_TEST_BIN := main-present-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/rc5.mk b/mkfiles/rc5.mk index 3a5f128..be82175 100644 --- a/mkfiles/rc5.mk +++ b/mkfiles/rc5.mk @@ -4,7 +4,7 @@ ALGO_NAME := RC5 # comment out the following line for removement of RC5 from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := rc5/ $(ALGO_NAME)_OBJ := rc5.o $(ALGO_NAME)_TEST_BIN := main-rc5-test.o $(CLI_STD) nessie_bc_test.o \ nessie_common.o performance_test.o diff --git a/mkfiles/rc6.mk b/mkfiles/rc6.mk index e0bc603..a58b138 100644 --- a/mkfiles/rc6.mk +++ b/mkfiles/rc6.mk @@ -4,7 +4,7 @@ ALGO_NAME := RC6 # comment out the following line for removement of RC6 from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := rc6/ $(ALGO_NAME)_OBJ := rc6.o $(ALGO_NAME)_TEST_BIN := main-rc6-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/seed.mk b/mkfiles/seed.mk index 1c2b605..07ceb3e 100644 --- a/mkfiles/seed.mk +++ b/mkfiles/seed.mk @@ -4,6 +4,7 @@ ALGO_NAME := SEED # comment out the following line for removement of SEED from the build process BLOCK_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := seed/ $(ALGO_NAME)_OBJ := seed-asm.o $(ALGO_NAME)_TEST_BIN := main-seed-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/seed_C.mk b/mkfiles/seed_C.mk index 14aa8ac..7c77666 100644 --- a/mkfiles/seed_C.mk +++ b/mkfiles/seed_C.mk @@ -4,6 +4,7 @@ ALGO_NAME := SEED_C # comment out the following line for removement of SEED from the build process BLOCK_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := seed/ $(ALGO_NAME)_OBJ := seed_C.o $(ALGO_NAME)_TEST_BIN := main-seed-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/serpent-bitslice.mk b/mkfiles/serpent-bitslice.mk index 
6a9b76a..5719143 100644 --- a/mkfiles/serpent-bitslice.mk +++ b/mkfiles/serpent-bitslice.mk @@ -4,7 +4,7 @@ ALGO_NAME := SERPENT_BITSLICE # comment out the following line for removement of serpent from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := serpent/ $(ALGO_NAME)_OBJ := serpent-asm.o serpent-sboxes-bitslice-asm.o memxor.o $(ALGO_NAME)_TEST_BIN := main-serpent-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/serpent_asm_bitslice.mk b/mkfiles/serpent_asm_bitslice.mk index afd0868..a5956c3 100644 --- a/mkfiles/serpent_asm_bitslice.mk +++ b/mkfiles/serpent_asm_bitslice.mk @@ -4,7 +4,7 @@ ALGO_NAME := SERPENT_ASM_BITSLICE # comment out the following line for removement of serpent from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := serpent/ $(ALGO_NAME)_OBJ := serpent-sboxes-bitslice-asm.o serpent-asm.o memxor.o $(ALGO_NAME)_TEST_BIN := main-serpent-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/serpent_asm_fast.mk b/mkfiles/serpent_asm_fast.mk index 3e3a4fb..d9ff725 100644 --- a/mkfiles/serpent_asm_fast.mk +++ b/mkfiles/serpent_asm_fast.mk @@ -4,7 +4,7 @@ ALGO_NAME := SERPENT_ASM_FAST # comment out the following line for removement of serpent from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := serpent/ $(ALGO_NAME)_OBJ := serpent-asm.o serpent-sboxes-fast.o memxor.o $(ALGO_NAME)_TEST_BIN := main-serpent-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/serpent_asm_small.mk b/mkfiles/serpent_asm_small.mk index 6dde94b..4d6750e 100644 --- a/mkfiles/serpent_asm_small.mk +++ b/mkfiles/serpent_asm_small.mk @@ -4,7 +4,7 @@ ALGO_NAME := SERPENT_ASM_SMALL # comment out the following line for removement of serpent from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := serpent/ $(ALGO_NAME)_OBJ := serpent-asm.o serpent-sboxes-small.o memxor.o $(ALGO_NAME)_TEST_BIN := main-serpent-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/serpent_c.mk b/mkfiles/serpent_c.mk index dd3a69b..f52ced4 100644 --- a/mkfiles/serpent_c.mk +++ b/mkfiles/serpent_c.mk @@ -4,7 +4,7 @@ ALGO_NAME := SERPENT_C # comment out the following line for removement of serpent from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := serpent/ $(ALGO_NAME)_OBJ := serpent.o serpent-sboxes_c.o memxor.o $(ALGO_NAME)_TEST_BIN := main-serpent-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/sha1.mk b/mkfiles/sha1.mk index 45df051..c986aeb 100644 --- a/mkfiles/sha1.mk +++ b/mkfiles/sha1.mk @@ -4,6 +4,7 @@ ALGO_NAME := SHA1 # comment out the following line for removement of SHA1 from the build process HASHES += $(ALGO_NAME) +$(ALGO_NAME)_DIR := sha1/ $(ALGO_NAME)_OBJ := sha1-asm.o $(ALGO_NAME)_TEST_BIN := main-sha1-test.o hfal_sha1.o $(CLI_STD) $(HFAL_STD) dump-decl.o dump-asm.o $(ALGO_NAME)_NESSIE_TEST := "nessie" diff --git a/mkfiles/sha1_c.mk b/mkfiles/sha1_c.mk index 4b0b7ae..6998cbc 100644 --- a/mkfiles/sha1_c.mk +++ b/mkfiles/sha1_c.mk @@ -4,6 +4,7 @@ ALGO_NAME := SHA1_C # comment out the following line for removement of SHA1 from the build process HASHES += $(ALGO_NAME) +$(ALGO_NAME)_DIR := sha1/ $(ALGO_NAME)_OBJ := sha1.o $(ALGO_NAME)_TEST_BIN := main-sha1-test.o hfal_sha1.o dump-asm.o dump-decl.o $(CLI_STD) $(HFAL_STD) $(ALGO_NAME)_NESSIE_TEST := "nessie" diff --git a/mkfiles/sha256.mk b/mkfiles/sha256.mk 
index 93ebdc7..9e56ed6 100644 --- a/mkfiles/sha256.mk +++ b/mkfiles/sha256.mk @@ -4,6 +4,7 @@ ALGO_NAME := SHA256 # comment out the following line for removement of SHA256 from the build process HASHES += $(ALGO_NAME) +$(ALGO_NAME)_DIR := sha256/ $(ALGO_NAME)_OBJ := sha256-asm.o $(ALGO_NAME)_TEST_BIN := main-sha256-test.o dump-asm.o dump-decl.o hfal_sha256.o $(CLI_STD) $(HFAL_STD) diff --git a/mkfiles/sha256_c.mk b/mkfiles/sha256_c.mk index d52fe88..6c58677 100644 --- a/mkfiles/sha256_c.mk +++ b/mkfiles/sha256_c.mk @@ -4,6 +4,7 @@ ALGO_NAME := SHA256_C # comment out the following line for removement of SHA256 from the build process HASHES += $(ALGO_NAME) +$(ALGO_NAME)_DIR := sha256/ $(ALGO_NAME)_OBJ := sha256.o $(ALGO_NAME)_TEST_BIN := main-sha256-test.o $(CLI_STD) $(HFAL_STD) hfal_sha256.o dump-asm.o dump-decl.o $(ALGO_NAME)_NESSIE_TEST := "nessie" diff --git a/mkfiles/shabea.mk b/mkfiles/shabea.mk index e5f4a68..9c05cb3 100644 --- a/mkfiles/shabea.mk +++ b/mkfiles/shabea.mk @@ -4,6 +4,7 @@ ALGO_NAME := SHABEA # comment out the following line for removement of SHABEA from the build process BLOCK_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := shabea/ $(ALGO_NAME)_OBJ := shabea.o sha256-asm.o memxor.o $(ALGO_NAME)_TEST_BIN := main-shabea-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/shacal1enc.mk b/mkfiles/shacal1enc.mk index 63be6c8..3003638 100644 --- a/mkfiles/shacal1enc.mk +++ b/mkfiles/shacal1enc.mk @@ -1,10 +1,10 @@ -# Makefile for present +# Makefile for shacal1 ALGO_NAME := SHACAL1ENC -# comment out the following line for removement of present from the build process +# comment out the following line for removement of shacal1 from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := shacal1/ $(ALGO_NAME)_OBJ := shacal1_enc.o sha1-asm.o $(ALGO_NAME)_TEST_BIN := main-shacal1_enc-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/shacal2enc.mk b/mkfiles/shacal2enc.mk index e8a91a2..d5f0d9d 100644 --- a/mkfiles/shacal2enc.mk +++ b/mkfiles/shacal2enc.mk @@ -1,10 +1,10 @@ -# Makefile for present +# Makefile for shacal2 ALGO_NAME := SHACAL2ENC -# comment out the following line for removement of present from the build process +# comment out the following line for removement of shacal2 from the build process BLOCK_CIPHERS += $(ALGO_NAME) - +$(ALGO_NAME)_DIR := shacal2/ $(ALGO_NAME)_OBJ := shacal2_enc.o sha256-asm.o $(ALGO_NAME)_TEST_BIN := main-shacal2_enc-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/skipjack.mk b/mkfiles/skipjack.mk index 56579b2..ef859df 100644 --- a/mkfiles/skipjack.mk +++ b/mkfiles/skipjack.mk @@ -4,6 +4,7 @@ ALGO_NAME := SKIPJACK # comment out the following line for removement of skipjack from the build process BLOCK_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := skipjack/ $(ALGO_NAME)_OBJ := skipjack.o $(ALGO_NAME)_TEST_BIN := main-skipjack-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/tdes.mk b/mkfiles/tdes.mk index 7bdc41e..09bb96c 100644 --- a/mkfiles/tdes.mk +++ b/mkfiles/tdes.mk @@ -1,9 +1,10 @@ -# Makefile for DES +# Makefile for Triple-DES ALGO_NAME := TDES -# comment out the following line for removement of DES from the build process +# comment out the following line for removement of Triple-DES from the build process BLOCK_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := des/ $(ALGO_NAME)_OBJ := des.o $(ALGO_NAME)_TEST_BIN := main-tdes-test.o $(CLI_STD) \ 
nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/trivium.mk b/mkfiles/trivium.mk index bbfba3b..a668ee7 100644 --- a/mkfiles/trivium.mk +++ b/mkfiles/trivium.mk @@ -4,6 +4,7 @@ ALGO_NAME := TRIVIUM # comment out the following line for removement of Trivium from the build process STREAM_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := trivium/ $(ALGO_NAME)_OBJ := trivium.o $(ALGO_NAME)_TEST_BIN := main-trivium-test.o $(CLI_STD) \ nessie_stream_test.o nessie_common.o performance_test.o diff --git a/mkfiles/xtea.mk b/mkfiles/xtea.mk index 68adcfb..f2d1169 100644 --- a/mkfiles/xtea.mk +++ b/mkfiles/xtea.mk @@ -4,6 +4,7 @@ ALGO_NAME := XTEA # comment out the following line for removement of XTEA from the build process BLOCK_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := xtea/ $(ALGO_NAME)_OBJ := xtea-asm.o $(ALGO_NAME)_TEST_BIN := main-xtea-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/mkfiles/xtea_c.mk b/mkfiles/xtea_c.mk index dec8f8c..5bbd680 100644 --- a/mkfiles/xtea_c.mk +++ b/mkfiles/xtea_c.mk @@ -4,6 +4,7 @@ ALGO_NAME := XTEA_C # comment out the following line for removement of XTEA from the build process BLOCK_CIPHERS += $(ALGO_NAME) +$(ALGO_NAME)_DIR := xtea/ $(ALGO_NAME)_OBJ := xtea.o $(ALGO_NAME)_TEST_BIN := main-xtea-test.o $(CLI_STD) \ nessie_bc_test.o nessie_common.o performance_test.o diff --git a/present.c b/present/present.c similarity index 100% rename from present.c rename to present/present.c diff --git a/present.h b/present/present.h similarity index 100% rename from present.h rename to present/present.h diff --git a/rc5.c b/rc5/rc5.c similarity index 100% rename from rc5.c rename to rc5/rc5.c diff --git a/rc5.h b/rc5/rc5.h similarity index 100% rename from rc5.h rename to rc5/rc5.h diff --git a/rc6.c b/rc6/rc6.c similarity index 100% rename from rc6.c rename to rc6/rc6.c diff --git a/rc6.h b/rc6/rc6.h similarity index 100% rename from rc6.h rename to rc6/rc6.h diff --git a/seed-asm.S b/seed/seed-asm.S similarity index 100% rename from seed-asm.S rename to seed/seed-asm.S diff --git a/seed.h b/seed/seed.h similarity index 100% rename from seed.h rename to seed/seed.h diff --git a/seed_C.c b/seed/seed_C.c similarity index 100% rename from seed_C.c rename to seed/seed_C.c diff --git a/seed_sbox.h b/seed/seed_sbox.h similarity index 100% rename from seed_sbox.h rename to seed/seed_sbox.h diff --git a/serpent/memxor.S b/serpent/memxor.S new file mode 100644 index 0000000..a32058b --- /dev/null +++ b/serpent/memxor.S @@ -0,0 +1,66 @@ +/* memxor.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +/* + * File: memxor.S + * Author: Daniel Otte + * Date: 2008-08-07 + * License: GPLv3 or later + * Description: memxor, XORing one block into another + * + */ + +/* + * void memxor(void* dest, const void* src, uint16_t n); + */ + /* + * param dest is passed in r24:r25 + * param src is passed in r22:r23 + * param n is passed in r20:r21 + */ +.global memxor +memxor: + movw r30, r24 + movw r26, r22 + movw r24, r20 + adiw r24, 0 + breq 2f +1: + ld r20, X+ + ld r21, Z + eor r20, r21 + st Z+, r20 + sbiw r24, 1 + brne 1b +2: + ret + + + + + + + + + + + + + + diff --git a/serpent/memxor.h b/serpent/memxor.h new file mode 100644 index 0000000..a62a616 --- /dev/null +++ b/serpent/memxor.h @@ -0,0 +1,7 @@ +#ifndef MEMXOR_H_ +#define MEMXOR_H_ +#include + +void memxor(void* dest, const void* src, uint16_t n); + +#endif diff --git a/serpent-asm.S b/serpent/serpent-asm.S similarity index 100% rename from serpent-asm.S rename to serpent/serpent-asm.S diff --git a/serpent-sboxes-bitslice-asm.S b/serpent/serpent-sboxes-bitslice-asm.S similarity index 100% rename from serpent-sboxes-bitslice-asm.S rename to serpent/serpent-sboxes-bitslice-asm.S diff --git a/serpent-sboxes-bitslice.c b/serpent/serpent-sboxes-bitslice.c similarity index 100% rename from serpent-sboxes-bitslice.c rename to serpent/serpent-sboxes-bitslice.c diff --git a/serpent-sboxes-fast.S b/serpent/serpent-sboxes-fast.S similarity index 100% rename from serpent-sboxes-fast.S rename to serpent/serpent-sboxes-fast.S diff --git a/serpent-sboxes-small.S b/serpent/serpent-sboxes-small.S similarity index 100% rename from serpent-sboxes-small.S rename to serpent/serpent-sboxes-small.S diff --git a/serpent-sboxes.h b/serpent/serpent-sboxes.h similarity index 100% rename from serpent-sboxes.h rename to serpent/serpent-sboxes.h diff --git a/serpent-sboxes_c.c b/serpent/serpent-sboxes_c.c similarity index 100% rename from serpent-sboxes_c.c rename to serpent/serpent-sboxes_c.c diff --git a/serpent.c b/serpent/serpent.c similarity index 100% rename from serpent.c rename to serpent/serpent.c diff --git a/serpent.h b/serpent/serpent.h similarity index 100% rename from serpent.h rename to serpent/serpent.h diff --git a/sha1/sha1-asm.S b/sha1/sha1-asm.S new file mode 100644 index 0000000..f571685 --- /dev/null +++ b/sha1/sha1-asm.S @@ -0,0 +1,886 @@ +/* sha1-asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ +/* + * Author: Daniel Otte + * + * License: GPLv3 or later +*/ +; SHA1 implementation in assembler for AVR +SHA1_BLOCK_BITS = 512 +SHA1_HASH_BITS = 160 + +.macro precall + /* push r18 - r27, r30 - r31*/ + push r0 + push r1 + push r18 + push r19 + push r20 + push r21 + push r22 + push r23 + push r24 + push r25 + push r26 + push r27 + push r30 + push r31 + clr r1 +.endm + +.macro postcall + pop r31 + pop r30 + pop r27 + pop r26 + pop r25 + pop r24 + pop r23 + pop r22 + pop r21 + pop r20 + pop r19 + pop r18 + pop r1 + pop r0 +.endm + + +.macro hexdump length + push r27 + push r26 + ldi r25, '\r' + mov r24, r25 + call uart_putc + ldi r25, '\n' + mov r24, r25 + call uart_putc + pop r26 + pop r27 + movw r24, r26 +.if \length > 16 + ldi r22, lo8(16) + ldi r23, hi8(16) + push r27 + push r26 + call uart_hexdump + pop r26 + pop r27 + adiw r26, 16 + hexdump \length-16 +.else + ldi r22, lo8(\length) + ldi r23, hi8(\length) + call uart_hexdump +.endif +.endm + +.macro delay +/* + push r0 + push r1 + clr r0 +1: clr r1 +2: dec r1 + brne 2b + dec r0 + brne 1b + pop r1 + pop r0 // */ +.endm + +/* X points to Block */ +.macro dbg_hexdump length +/* + precall + hexdump \length + postcall + // */ +.endm + + + +.section .text + +SPL = 0x3D +SPH = 0x3E +SREG = 0x3F + + +; +;sha1_ctx_t is: +; +; [h0][h1][h2][h3][h4][length] +; hn is 32 bit large, length is 64 bit large + +;########################################################### + +.global sha1_ctx2hash +; === sha1_ctx2hash === +; this function converts a state into a normal hash (bytestring) +; param1: the 16-bit destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to sha1_ctx structure +; given in r23,r22 +sha1_ctx2hash: + movw r26, r22 + movw r30, r24 + ldi r21, 5 + sbiw r26, 4 +1: + ldi r20, 4 + adiw r26, 8 +2: + ld r0, -X + st Z+, r0 + dec r20 + brne 2b + + dec r21 + brne 1b + + ret + +;########################################################### + +.global sha1 +; === sha1 === +; this function calculates SHA-1 hashes from messages in RAM +; param1: the 16-bit hash destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to message +; given in r23,r22 +; param3: 32-bit length value (length of message in bits) +; given in r21,r20,r19,r18 +sha1: +sha1_prolog: + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r16 + push r17 + in r16, SPL + in r17, SPH + subi r16, 5*4+8 + sbci r17, 0 + in r0, SREG + cli + out SPL, r16 + out SPH, r17 + out SREG, r0 + + push r25 + push r24 + inc r16 + adc r17, r1 + + movw r8, r18 /* backup of length*/ + movw r10, r20 + + movw r12, r22 /* backup pf msg-ptr */ + + movw r24, r16 + rcall sha1_init + /* if length >= 512 */ +1: + tst r11 + brne 4f + tst r10 + brne 4f + mov r19, r9 + cpi r19, 0x02 + brlo 4f + + movw r24, r16 + movw r22, r12 + rcall sha1_nextBlock + ldi r19, 0x64 + add r22, r19 + adc r23, r1 + /* length -= 512 */ + ldi r19, 0x02 + sub r9, r19 + sbc r10, r1 + sbc r11, r1 + rjmp 1b + +4: + movw r24, r16 + movw r22, r12 + movw r20, r8 + rcall sha1_lastBlock + + pop r24 + pop r25 + movw r22, r16 + rcall sha1_ctx2hash + +sha1_epilog: + in r30, SPL + in r31, SPH + adiw r30, 5*4+8 + in r0, SREG + cli + out SPL, r30 + out SPH, r31 + out SREG, r0 + pop r17 + pop r16 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + ret + +;########################################################### + + +; block MUST NOT be larger than 64 bytes + +.global sha1_lastBlock +; === sha1_lastBlock === +; this function does padding & 
Co. for calculating SHA-1 hashes +; param1: the 16-bit pointer to sha1_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +; param3: an 16-bit integer specifing length of block in bits +; given in r21,r20 +sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1) + + +sha1_lastBlock: + cpi r21, 0x02 + brlo sha1_lastBlock_prolog + push r25 + push r24 + push r23 + push r22 + push r21 + push r20 + rcall sha1_nextBlock + pop r20 + pop r21 + pop r22 + pop r23 + pop r24 + pop r25 + subi r21, 2 + subi r23, -2 + rjmp sha1_lastBlock +sha1_lastBlock_prolog: + /* allocate space on stack */ + in r30, SPL + in r31, SPH + in r1, SREG + subi r30, lo8(64) + sbci r31, hi8(64) /* ??? */ + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + + adiw r30, 1 /* SP points to next free byte on stack */ + mov r18, r20 /* r20 = LSB(length) */ + lsr r18 + lsr r18 + lsr r18 + bst r21, 0 /* may be we should explain this ... */ + bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ + + + movw r26, r22 /* X points to begin of msg */ + tst r18 + breq sha1_lastBlock_post_copy + mov r1, r18 +sha1_lastBlock_copy_loop: + ld r0, X+ + st Z+, r0 + dec r1 + brne sha1_lastBlock_copy_loop +sha1_lastBlock_post_copy: +sha1_lastBlock_insert_stuffing_bit: + ldi r19, 0x80 + mov r0,r19 + ldi r19, 0x07 + and r19, r20 /* if we are in bitmode */ + breq 2f /* no bitmode */ +1: + lsr r0 + dec r19 + brne 1b + ld r19, X +/* maybe we should do some ANDing here, just for safety */ + or r0, r19 +2: + st Z+, r0 + inc r18 + +/* checking stuff here */ + cpi r18, 64-8+1 + brsh 0f + rjmp sha1_lastBlock_insert_zeros +0: + /* oh shit, we landed here */ + /* first we have to fill it up with zeros */ + ldi r19, 64 + sub r19, r18 + breq 2f +1: + st Z+, r1 + dec r19 + brne 1b +2: + sbiw r30, 63 + sbiw r30, 1 + movw r22, r30 + + push r31 + push r30 + push r25 + push r24 + push r21 + push r20 + rcall sha1_nextBlock + pop r20 + pop r21 + pop r24 + pop r25 + pop r30 + pop r31 + + /* now we should subtract 512 from length */ + movw r26, r24 + adiw r26, 4*5+1 /* we can skip the lowest byte */ + ld r19, X + subi r19, hi8(512) + st X+, r19 + ldi r18, 6 +1: + ld r19, X + sbci r19, 0 + st X+, r19 + dec r18 + brne 1b + +; clr r18 /* not neccessary ;-) */ + /* reset Z pointer to begin of block */ + +sha1_lastBlock_insert_zeros: + ldi r19, 64-8 + sub r19, r18 + breq sha1_lastBlock_insert_length + clr r1 +1: + st Z+, r1 /* r1 is still zero */ + dec r19 + brne 1b + +; rjmp sha1_lastBlock_epilog +sha1_lastBlock_insert_length: + movw r26, r24 /* X points to state */ + adiw r26, 5*4 /* X points to (state.length) */ + adiw r30, 8 /* Z points one after the last byte of block */ + ld r0, X+ + add r0, r20 + st -Z, r0 + ld r0, X+ + adc r0, r21 + st -Z, r0 + ldi r19, 6 +1: + ld r0, X+ + adc r0, r1 + st -Z, r0 + dec r19 + brne 1b + + sbiw r30, 64-8 + movw r22, r30 + rcall sha1_nextBlock + +sha1_lastBlock_epilog: + in r30, SPL + in r31, SPH + in r1, SREG + adiw r30, 63 ; lo8(64) + adiw r30, 1 ; hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + clr r1 + clr r0 + ret + +/**/ +;########################################################### + +.global sha1_nextBlock +; === sha1_nextBlock === +; this is the core function for calculating SHA-1 hashes +; param1: the 16-bit pointer to sha1_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit 
values for a array (total 84 byte) + +xtmp = 0 +xNULL = 1 +W1 = 10 +W2 = 11 +T1 = 12 +T2 = 13 +T3 = 14 +T4 = 15 +LoopC = 16 +S = 17 +tmp1 = 18 +tmp2 = 19 +tmp3 = 20 +tmp4 = 21 +F1 = 22 +F2 = 23 +F3 = 24 +F4 = 25 + +/* byteorder: high number <--> high significance */ +sha1_nextBlock: + ; initial, let's make some space ready for local vars + /* replace push & pop by mem ops? */ + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + in r20, SPL + in r21, SPH + movw r18, r20 ;backup SP +; movw r26, r20 ; X points to free space on stack /* maybe removeable? */ + movw r30, r22 ; Z points to message + subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63 + sbci r21, hi8(sha1_nextBlock_localSpace) + movw r26, r20 ; X points to free space on stack + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + + push r18 + push r19 /* push old SP on new stack */ + push r24 + push r25 /* param1 will be needed later */ + + /* load a[] with state */ + movw 28, r24 /* load pointer to state in Y */ + adiw r26, 1 ; X++ + + ldi LoopC, 5*4 +1: ld tmp1, Y+ + st X+, tmp1 + dec LoopC + brne 1b + + movw W1, r26 /* save pointer to w[0] */ + /* load w[] with endian fixed message */ + /* we might also use the changeendian32() function at bottom */ + movw r30, r22 /* mv param2 (ponter to msg) to Z */ + ldi LoopC, 16 +1: + ldd tmp1, Z+3 + st X+, tmp1 + ldd tmp1, Z+2 + st X+, tmp1 + ldd tmp1, Z+1 + st X+, tmp1 + ld tmp1, Z + st X+, tmp1 + adiw r30, 4 + dec LoopC + brne 1b + + ;clr LoopC /* LoopC is named t in FIPS 180-2 */ + clr xtmp +sha1_nextBlock_mainloop: + mov S, LoopC + lsl S + lsl S + andi S, 0x3C /* S is a bytepointer so *4 */ + /* load w[s] */ + movw r26, W1 + add r26, S /* X points at w[s] */ + adc r27, xNULL + ld T1, X+ + ld T2, X+ + ld T3, X+ + ld T4, X+ + + /**/ + push r26 + push r27 + push T4 + push T3 + push T2 + push T1 + in r26, SPL + in r27, SPH + adiw r26, 1 + dbg_hexdump 4 + pop T1 + pop T2 + pop T3 + pop T4 + pop r27 + pop r26 + /**/ + + cpi LoopC, 16 + brlt sha1_nextBlock_mainloop_core + /* update w[s] */ + ldi tmp1, 2*4 + rcall 1f + ldi tmp1, 8*4 + rcall 1f + ldi tmp1, 13*4 + rcall 1f + rjmp 2f +1: /* this might be "outsourced" to save the jump above */ + add tmp1, S + andi tmp1, 0x3f + movw r26, W1 + add r26, tmp1 + adc r27, xNULL + ld tmp2, X+ + eor T1, tmp2 + ld tmp2, X+ + eor T2, tmp2 + ld tmp2, X+ + eor T3, tmp2 + ld tmp2, X+ + eor T4, tmp2 + ret +2: /* now we just hav to do a ROTL(T) and save T back */ + mov tmp2, T4 + rol tmp2 + rol T1 + rol T2 + rol T3 + rol T4 + movw r26, W1 + add r26, S + adc r27, xNULL + st X+, T1 + st X+, T2 + st X+, T3 + st X+, T4 + +sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/ + /* T already contains w[s] */ + movw r26, W1 + sbiw r26, 4*1 /* X points at a[4] aka e */ + ld tmp1, X+ + add T1, tmp1 + ld tmp1, X+ + adc T2, tmp1 + ld tmp1, X+ + adc T3, tmp1 + ld tmp1, X+ + adc T4, tmp1 /* T = w[s]+e */ + sbiw r26, 4*5 /* X points at a[0] aka a */ + ld F1, X+ + ld F2, X+ + ld F3, X+ + ld F4, X+ + mov tmp1, F4 /* X points at a[1] aka b */ + ldi tmp2, 5 +1: + rol tmp1 + rol F1 + rol F2 + rol F3 + rol F4 + dec tmp2 + brne 1b + + add T1, F1 + adc T2, F2 + adc T3, F3 + adc T4, F4 /* T = ROTL(a,5) + e + w[s] */ + + /* now we have to do this fucking conditional stuff */ + ldi r30, lo8(sha1_nextBlock_xTable) + ldi r31, hi8(sha1_nextBlock_xTable) + add r30, xtmp + adc r31, xNULL + lpm tmp1, Z + cp tmp1, LoopC + brne 1f + inc xtmp 
+1: ldi r30, lo8(sha1_nextBlock_KTable) + ldi r31, hi8(sha1_nextBlock_KTable) + lsl xtmp + lsl xtmp + add r30, xtmp + adc r31, xNULL + lsr xtmp + lsr xtmp + + lpm tmp1, Z+ + add T1, tmp1 + lpm tmp1, Z+ + adc T2, tmp1 + lpm tmp1, Z+ + adc T3, tmp1 + lpm tmp1, Z+ + adc T4, tmp1 + /* T = ROTL(a,5) + e + kt + w[s] */ + + /* Z-4 is just pointing to kt ... */ + movw r28, r26 /* copy X in Y */ + adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */ + lsr r31 + ror r30 + + icall + mov F1, tmp1 + icall + mov F2, tmp1 + icall + mov F3, tmp1 + icall + + add T1, F1 + adc T2, F2 + adc T3, F3 + adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */ + /* X points still at a[1] aka b, Y points at a[2] aka c */ + /* update a[] */ +sha1_nextBlock_update_a: + /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/ + //adiw r28, 3*4 /* Y should point at a[4] aka e */ + movw r28, W1 + sbiw r28, 4 + + ldi tmp2, 4*4 +1: + ld tmp1, -Y + std Y+4, tmp1 + dec tmp2 + brne 1b + /* Y points at a[0] aka a*/ + + movw r28, W1 + sbiw r28, 5*4 + /* store T in a[0] aka a */ + st Y+, T1 + st Y+, T2 + st Y+, T3 + st Y+, T4 + /* Y points at a[1] aka b*/ + + /* rotate c */ + ldd T1, Y+1*4 + ldd T2, Y+1*4+1 + ldd T3, Y+1*4+2 + ldd T4, Y+1*4+3 + mov tmp1, T1 + ldi tmp2, 2 +1: ror tmp1 + ror T4 + ror T3 + ror T2 + ror T1 + dec tmp2 + brne 1b + std Y+1*4+0, T1 + std Y+1*4+1, T2 + std Y+1*4+2, T3 + std Y+1*4+3, T4 + + push r27 + push r26 + movw r26, W1 + sbiw r26, 4*5 + dbg_hexdump 4*5 + pop r26 + pop r27 + + inc LoopC + cpi LoopC, 80 + brge 1f + rjmp sha1_nextBlock_mainloop +/**************************************/ +1: + /* littel patch */ + sbiw r28, 4 + +/* add a[] to state and inc length */ + pop r27 + pop r26 /* now X points to state (and Y still at a[0]) */ + ldi tmp4, 5 +1: clc + ldi tmp3, 4 +2: ld tmp1, X + ld tmp2, Y+ + adc tmp1, tmp2 + st X+, tmp1 + dec tmp3 + brne 2b + dec tmp4 + brne 1b + + /* now length += 512 */ + adiw r26, 1 /* we skip the least significant byte */ + ld tmp1, X + ldi tmp2, hi8(512) /* 2 */ + add tmp1, tmp2 + st X+, tmp1 + ldi tmp2, 6 +1: + ld tmp1, X + adc tmp1, xNULL + st X+, tmp1 + dec tmp2 + brne 1b + +; EPILOG +sha1_nextBlock_epilog: +/* now we should clean up the stack */ + pop r21 + pop r20 + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + + clr r1 + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret + +sha1_nextBlock_xTable: +.byte 20,40,60,0 +sha1_nextBlock_KTable: +.int 0x5a827999 +.int 0x6ed9eba1 +.int 0x8f1bbcdc +.int 0xca62c1d6 +sha1_nextBlock_JumpTable: +rjmp sha1_nextBlock_Ch + nop +rjmp sha1_nextBlock_Parity + nop +rjmp sha1_nextBlock_Maj + nop +rjmp sha1_nextBlock_Parity + + /* X and Y still point at a[1] aka b ; return value in tmp1 */ +sha1_nextBlock_Ch: + ld tmp1, Y+ + mov tmp2, tmp1 + com tmp2 + ldd tmp3, Y+3 /* load from c */ + and tmp1, tmp3 + ldd tmp3, Y+7 /* load from d */ + and tmp2, tmp3 + eor tmp1, tmp2 + ret + +sha1_nextBlock_Maj: + ld tmp1, Y+ + mov tmp2, tmp1 + ldd tmp3, Y+3 /* load from c */ + and tmp1, tmp3 + ldd tmp4, Y+7 /* load from d */ + and tmp2, tmp4 + eor tmp1, tmp2 + and tmp3, tmp4 + eor tmp1, tmp3 + ret + +sha1_nextBlock_Parity: + ld tmp1, Y+ + ldd tmp2, Y+3 /* load from c */ + eor tmp1, tmp2 + ldd tmp2, Y+7 /* load from d */ + eor tmp1, tmp2 + ret +/* +ch_str: .asciz "\r\nCh" +maj_str: .asciz "\r\nMaj" +parity_str: .asciz "\r\nParity" +*/ +;########################################################### + 
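The K table, jump table and the Ch/Maj/Parity routines above encode the round-dependent part of the SHA-1 compression loop. In C terms (per FIPS 180-2; macro and variable names below are purely illustrative), each of the 80 rounds of sha1_nextBlock computes:

    #define ROTL32(x,n)   (((x) << (n)) | ((x) >> (32 - (n))))
    #define CH(b,c,d)     (((b) & (c)) ^ (~(b) & (d)))               /* t =  0..19 */
    #define PARITY(b,c,d) ((b) ^ (c) ^ (d))                          /* t = 20..39, 60..79 */
    #define MAJ(b,c,d)    (((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d)))  /* t = 40..59 */

    /* for t >= 16 the 16-word window is refreshed first, matching the
       in-place update with offsets 13, 8 and 2 words above:
       w[t & 15] = ROTL32(w[(t+13) & 15] ^ w[(t+8) & 15] ^ w[(t+2) & 15] ^ w[t & 15], 1); */

    /* one round t; f_t/k_t are the function/constant pair for the current
       20-round group (k_t from sha1_nextBlock_KTable, the switch points
       20, 40, 60 from sha1_nextBlock_xTable) */
    uint32_t tmp = ROTL32(a, 5) + f_t(b, c, d) + e + k_t + w[t & 15];
    e = d;  d = c;  c = ROTL32(b, 30);  b = a;  a = tmp;
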
+.global sha1_init +;void sha1_init(sha1_ctx_t *state){ +; DEBUG_S("\r\nSHA1_INIT"); +; state->h[0] = 0x67452301; +; state->h[1] = 0xefcdab89; +; state->h[2] = 0x98badcfe; +; state->h[3] = 0x10325476; +; state->h[4] = 0xc3d2e1f0; +; state->length = 0; +;} +; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram +; modifys: Z(r30,r31), Func1, r22 +sha1_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8((sha1_init_vector)) + ldi r31, hi8((sha1_init_vector)) + ldi r22, 5*4 /* bytes to copy */ +sha1_init_vloop: + lpm r23, Z+ + st X+, r23 + dec r22 + brne sha1_init_vloop + ldi r22, 8 +sha1_init_lloop: + st X+, r1 + dec r22 + brne sha1_init_lloop + ret + +sha1_init_vector: +.int 0x67452301; +.int 0xefcdab89; +.int 0x98badcfe; +.int 0x10325476; +.int 0xc3d2e1f0; + diff --git a/sha1.c b/sha1/sha1.c similarity index 100% rename from sha1.c rename to sha1/sha1.c diff --git a/sha1/sha1.h b/sha1/sha1.h new file mode 100644 index 0000000..6675d20 --- /dev/null +++ b/sha1/sha1.h @@ -0,0 +1,117 @@ +/* sha1.h */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/** + * \file sha1.h + * \author Daniel Otte + * \email daniel.otte@rub.de + * \date 2006-10-08 + * \license GPLv3 or later + * \brief SHA-1 declaration. + * \ingroup SHA-1 + * + */ + +#ifndef SHA1_H_ +#define SHA1_H_ + +#include +/** \def SHA1_HASH_BITS + * definees the size of a SHA-1 hash in bits + */ + +/** \def SHA1_HASH_BYTES + * definees the size of a SHA-1 hash in bytes + */ + +/** \def SHA1_BLOCK_BITS + * definees the size of a SHA-1 input block in bits + */ + +/** \def SHA1_BLOCK_BYTES + * definees the size of a SHA-1 input block in bytes + */ +#define SHA1_HASH_BITS 160 +#define SHA1_HASH_BYTES (SHA1_HASH_BITS/8) +#define SHA1_BLOCK_BITS 512 +#define SHA1_BLOCK_BYTES (SHA1_BLOCK_BITS/8) + +/** \typedef sha1_ctx_t + * \brief SHA-1 context type + * + * A vatiable of this type may hold the state of a SHA-1 hashing process + */ +typedef struct { + uint32_t h[5]; + uint64_t length; +} sha1_ctx_t; + +/** \typedef sha1_hash_t + * \brief hash value type + * A variable of this type may hold a SHA-1 hash value + */ +typedef uint8_t sha1_hash_t[SHA1_HASH_BITS/8]; + +/** \fn sha1_init(sha1_ctx_t *state) + * \brief initializes a SHA-1 context + * This function sets a ::sha1_ctx_t variable to the initialization vector + * for SHA-1 hashing. 
+ * \param state pointer to the SHA-1 context variable + */ +void sha1_init(sha1_ctx_t *state); + +/** \fn sha1_nextBlock(sha1_ctx_t *state, const void* block) + * \brief process one input block + * This function processes one input block and updates the hash context + * accordingly + * \param state pointer to the state variable to update + * \param block pointer to the message block to process + */ +void sha1_nextBlock (sha1_ctx_t *state, const void* block); + +/** \fn sha1_lastBlock(sha1_ctx_t *state, const void* block, uint16_t length_b) + * \brief processes the given block and finalizes the context + * This function processes the last block in a SHA-1 hashing process. + * The block should have a maximum length of a single input block. + * \param state pointer to the state variable to update and finalize + * \param block pointer to themessage block to process + * \param length_b length of the message block in bits + */ +void sha1_lastBlock (sha1_ctx_t *state, const void* block, uint16_t length_b); + +/** \fn sha1_ctx2hash(sha1_hash_t *dest, sha1_ctx_t *state) + * \brief convert a state variable into an actual hash value + * Writes the hash value corresponding to the state to the memory pointed by dest. + * \param dest pointer to the hash value destination + * \param state pointer to the hash context + */ +void sha1_ctx2hash (sha1_hash_t *dest, sha1_ctx_t *state); + +/** \fn sha1(sha1_hash_t *dest, const void* msg, uint32_t length_b) + * \brief hashing a message which in located entirely in RAM + * This function automatically hashes a message which is entirely in RAM with + * the SHA-1 hashing algorithm. + * \param dest pointer to the hash value destination + * \param msg pointer to the message which should be hashed + * \param length_b length of the message in bits + */ +void sha1(sha1_hash_t *dest, const void* msg, uint32_t length_b); + + + +#endif /*SHA1_H_*/ diff --git a/sha256/sha256-asm.S b/sha256/sha256-asm.S new file mode 100644 index 0000000..d9eb6b6 --- /dev/null +++ b/sha256/sha256-asm.S @@ -0,0 +1,1042 @@ +/* sha256-asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ +/* + * Author: Daniel Otte + * + * License: GPLv3 or later +*/ +; sha-256 implementation in assembler +SHA256_BLOCK_BITS = 512 +SHA256_HASH_BITS = 256 + +.macro precall + /* push r18 - r27, r30 - r31*/ + push r0 + push r1 + push r18 + push r19 + push r20 + push r21 + push r22 + push r23 + push r24 + push r25 + push r26 + push r27 + push r30 + push r31 + clr r1 +.endm + +.macro postcall + pop r31 + pop r30 + pop r27 + pop r26 + pop r25 + pop r24 + pop r23 + pop r22 + pop r21 + pop r20 + pop r19 + pop r18 + pop r1 + pop r0 +.endm + + +.macro hexdump length + push r27 + push r26 + ldi r25, '\r' + mov r24, r25 + call uart_putc + ldi r25, '\n' + mov r24, r25 + call uart_putc + pop r26 + pop r27 + movw r24, r26 +.if \length > 16 + ldi r22, lo8(16) + ldi r23, hi8(16) + push r27 + push r26 + call uart_hexdump + pop r26 + pop r27 + adiw r26, 16 + hexdump \length-16 +.else + ldi r22, lo8(\length) + ldi r23, hi8(\length) + call uart_hexdump +.endif +.endm + +/* X points to Block */ +.macro dbg_hexdump length + precall + hexdump \length + postcall +.endm + +.section .text + +SPL = 0x3D +SPH = 0x3E +SREG = 0x3F + + +; +;sha256_ctx_t is: +; +; [h0][h1][h2][h3][h4][h5][h6][h7][length] +; hn is 32 bit large, length is 64 bit large + +;########################################################### + +.global sha256_ctx2hash +; === sha256_ctx2hash === +; this function converts a state into a normal hash (bytestring) +; param1: the 16-bit destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to sha256_ctx structure +; given in r23,r22 +sha256_ctx2hash: + movw r26, r22 + movw r30, r24 + ldi r21, 8 + sbiw r26, 4 +1: + ldi r20, 4 + adiw r26, 8 +2: + ld r0, -X + st Z+, r0 + dec r20 + brne 2b + + dec r21 + brne 1b + + ret + +;########################################################### + +.global sha256 +; === sha256 === +; this function calculates SHA-256 hashes from messages in RAM +; param1: the 16-bit hash destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to message +; given in r23,r22 +; param3: 32-bit length value (length of message in bits) +; given in r21,r20,r19,r18 +sha256: +sha256_prolog: + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r16 + push r17 + in r16, SPL + in r17, SPH + subi r16, 8*4+8 + sbci r17, 0 + in r0, SREG + cli + out SPL, r16 + out SPH, r17 + out SREG, r0 + + push r25 + push r24 + inc r16 + adc r17, r1 + + movw r8, r18 /* backup of length*/ + movw r10, r20 + + movw r12, r22 /* backup pf msg-ptr */ + + movw r24, r16 + rcall sha256_init + /* if length >= 512 */ +1: + tst r11 + brne 4f + tst r10 + brne 4f + mov r19, r9 + cpi r19, 0x02 + brlo 4f + + movw r24, r16 + movw r22, r12 + rcall sha256_nextBlock + ldi r19, 0x64 + add r22, r19 + adc r23, r1 + /* length -= 512 */ + ldi r19, 0x02 + sub r9, r19 + sbc r10, r1 + sbc r11, r1 + rjmp 1b + +4: + movw r24, r16 + movw r22, r12 + movw r20, r8 + rcall sha256_lastBlock + + pop r24 + pop r25 + movw r22, r16 + rcall sha256_ctx2hash + +sha256_epilog: + in r30, SPL + in r31, SPH + adiw r30, 8*4+8 + in r0, SREG + cli + out SPL, r30 + out SPH, r31 + out SREG, r0 + pop r17 + pop r16 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + ret + +;########################################################### + + +; block MUST NOT be larger than 64 bytes + +.global sha256_lastBlock +; === sha256_lastBlock === +; this function does padding & Co. 
for calculating SHA-256 hashes +; param1: the 16-bit pointer to sha256_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +; param3: an 16-bit integer specifing length of block in bits +; given in r21,r20 +sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1) + + +sha256_lastBlock: + cpi r21, 0x02 + brlo sha256_lastBlock_prolog + push r25 + push r24 + push r23 + push r22 + push r21 + push r20 + rcall sha256_nextBlock + pop r20 + pop r21 + pop r22 + pop r23 + pop r24 + pop r25 + subi r21, 0x02 + subi r23, -2 + rjmp sha256_lastBlock +sha256_lastBlock_prolog: + /* allocate space on stack */ + in r30, SPL + in r31, SPH + in r1, SREG + subi r30, lo8(64) + sbci r31, hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + + adiw r30, 1 /* SP points to next free byte on stack */ + mov r18, r20 /* r20 = LSB(length) */ + lsr r18 + lsr r18 + lsr r18 + bst r21, 0 /* may be we should explain this ... */ + bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ + + + movw r26, r22 /* X points to begin of msg */ + tst r18 + breq sha256_lastBlock_post_copy + mov r1, r18 +sha256_lastBlock_copy_loop: + ld r0, X+ + st Z+, r0 + dec r1 + brne sha256_lastBlock_copy_loop +sha256_lastBlock_post_copy: +sha256_lastBlock_insert_stuffing_bit: + ldi r19, 0x80 + mov r0,r19 + ldi r19, 0x07 + and r19, r20 /* if we are in bitmode */ + breq 2f /* no bitmode */ +1: + lsr r0 + dec r19 + brne 1b + ld r19, X +/* maybe we should do some ANDing here, just for safety */ + or r0, r19 +2: + st Z+, r0 + inc r18 + +/* checking stuff here */ + cpi r18, 64-8+1 + brsh 0f + rjmp sha256_lastBlock_insert_zeros +0: + /* oh shit, we landed here */ + /* first we have to fill it up with zeros */ + ldi r19, 64 + sub r19, r18 + breq 2f +1: + st Z+, r1 + dec r19 + brne 1b +2: + sbiw r30, 63 + sbiw r30, 1 + movw r22, r30 + + push r31 + push r30 + push r25 + push r24 + push r21 + push r20 + rcall sha256_nextBlock + pop r20 + pop r21 + pop r24 + pop r25 + pop r30 + pop r31 + + /* now we should subtract 512 from length */ + movw r26, r24 + adiw r26, 4*8+1 /* we can skip the lowest byte */ + ld r19, X + subi r19, hi8(512) + st X+, r19 + ldi r18, 6 +1: + ld r19, X + sbci r19, 0 + st X+, r19 + dec r18 + brne 1b + +; clr r18 /* not neccessary ;-) */ + /* reset Z pointer to begin of block */ + +sha256_lastBlock_insert_zeros: + ldi r19, 64-8 + sub r19, r18 + breq sha256_lastBlock_insert_length + clr r1 +1: + st Z+, r1 /* r1 is still zero */ + dec r19 + brne 1b + +; rjmp sha256_lastBlock_epilog +sha256_lastBlock_insert_length: + movw r26, r24 /* X points to state */ + adiw r26, 8*4 /* X points to (state.length) */ + adiw r30, 8 /* Z points one after the last byte of block */ + ld r0, X+ + add r0, r20 + st -Z, r0 + ld r0, X+ + adc r0, r21 + st -Z, r0 + ldi r19, 6 +1: + ld r0, X+ + adc r0, r1 + st -Z, r0 + dec r19 + brne 1b + + sbiw r30, 64-8 + movw r22, r30 + rcall sha256_nextBlock + +sha256_lastBlock_epilog: + in r30, SPL + in r31, SPH + in r1, SREG + adiw r30, 63 ; lo8(64) + adiw r30, 1 ; hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + clr r1 + clr r0 + ret + +/**/ +;########################################################### + +.global sha256_nextBlock +; === sha256_nextBlock === +; this is the core function for calculating SHA-256 hashes +; param1: the 16-bit pointer to sha256_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +sha256_nextBlock_localSpace = (64+8)*4 ; 64 
32-bit values for w array and 8 32-bit values for a array (total 288 byte) + +Bck1 = 12 +Bck2 = 13 +Bck3 = 14 +Bck4 = 15 +Func1 = 22 +Func2 = 23 +Func3 = 24 +Func4 = 25 +Accu1 = 16 +Accu2 = 17 +Accu3 = 18 +Accu4 = 19 +XAccu1 = 8 +XAccu2 = 9 +XAccu3 = 10 +XAccu4 = 11 +T1 = 4 +T2 = 5 +T3 = 6 +T4 = 7 +LoopC = 1 +/* byteorder: high number <--> high significance */ +sha256_nextBlock: + ; initial, let's make some space ready for local vars + push r4 /* replace push & pop by mem ops? */ + push r5 + push r6 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + in r20, SPL + in r21, SPH + movw r18, r20 ;backup SP +; movw r26, r20 ; X points to free space on stack + movw r30, r22 ; Z points to message + subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63 + sbci r21, hi8(sha256_nextBlock_localSpace) + movw r26, r20 ; X points to free space on stack + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + push r18 + push r19 + push r24 + push r25 /* param1 will be needed later */ + ; now we fill the w array with message (think about endianess) + adiw r26, 1 ; X++ + ldi r20, 16 +sha256_nextBlock_wcpyloop: + ld r23, Z+ + ld r22, Z+ + ld r19, Z+ + ld r18, Z+ + st X+, r18 + st X+, r19 + st X+, r22 + st X+, r23 + dec r20 + brne sha256_nextBlock_wcpyloop +/* for (i=16; i<64; ++i){ + w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16]; + } */ + /* r25,r24,r23,r24 (r21,r20) are function values + r19,r18,r17,r16 are the accumulator + r15,r14,r13,rBck1 are backup1 + r11,r10,r9 ,r8 are xor accu + r1 is round counter */ + + ldi r20, 64-16 + mov LoopC, r20 +sha256_nextBlock_wcalcloop: + movw r30, r26 ; cp X to Z + sbiw r30, 63 + sbiw r30, 1 ; substract 64 = 16*4 + ld Accu1, Z+ + ld Accu2, Z+ + ld Accu3, Z+ + ld Accu4, Z+ /* w[i] = w[i-16] */ + ld Bck1, Z+ + ld Bck2, Z+ + ld Bck3, Z+ + ld Bck4, Z+ /* backup = w[i-15] */ + /* now sigma 0 */ + mov Func1, Bck2 + mov Func2, Bck3 + mov Func3, Bck4 + mov Func4, Bck1 /* prerotated by 8 */ + ldi r20, 1 + rcall bitrotl + movw XAccu1, Func1 + movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/ +sigma0_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + ror Bck1 + dec Func2 + brne sigma0_shr + eor XAccu1, Bck1 + eor XAccu2, Bck2 + eor XAccu3, Bck3 + eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + ldd Func1, Z+7*4 /* now accu += w[i-7] */ + ldd Func2, Z+7*4+1 + ldd Func3, Z+7*4+2 + ldd Func4, Z+7*4+3 + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + ldd Bck1, Z+12*4 /* now backup = w[i-2]*/ + ldd Bck2, Z+12*4+1 + ldd Bck3, Z+12*4+2 + ldd Bck4, Z+12*4+3 + /* now sigma 1 */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 1 + rcall bitrotr + movw XAccu3, Func3 + movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */ +; movw Func1, Bck3 +; movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ 
/*we can destroy backup now*/ +sigma1_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + dec Func2 + brne sigma1_shr + eor XAccu1, Bck2 + eor XAccu2, Bck3 + eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + /* now let's store the shit */ + st X+, Accu1 + st X+, Accu2 + st X+, Accu3 + st X+, Accu4 + dec LoopC + breq 3f ; skip if zero + rjmp sha256_nextBlock_wcalcloop +3: + /* we are finished with w array X points one byte post w */ +/* init a array */ + pop r31 + pop r30 + push r30 + push r31 + ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */ +init_a_array: + ld r1, Z+ + st X+, r1 + dec r25 + brne init_a_array + +/* now the real fun begins */ +/* for (i=0; i<64; ++i){ + t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i]; + t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]); + memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; + a[4] += t1; + a[0] = t1 + t2; + } */ + /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */ + sbiw r26, 8*4 /* X still points at a[7]+1*/ + movw r28, r26 + ldi r30, lo8(sha256_kv) + ldi r31, hi8(sha256_kv) + dec r27 /* X - (64*4 == 256) */ + ldi r25, 64 + mov LoopC, r25 +sha256_main_loop: + /* now calculate t1 */ + /*CH(x,y,z) = (x&y)^((~x)&z)*/ + ldd T1, Y+5*4 + ldd T2, Y+5*4+1 + ldd T3, Y+5*4+2 + ldd T4, Y+5*4+3 /* y in T */ + ldd Func1, Y+4*4 + ldd Func2, Y+4*4+1 + ldd Func3, Y+4*4+2 + ldd Func4, Y+4*4+3 /* x in Func */ + ldd Bck1, Y+6*4 + ldd Bck2, Y+6*4+1 + ldd Bck3, Y+6*4+2 + ldd Bck4, Y+6*4+3 /* z in Bck */ + and T1, Func1 + and T2, Func2 + and T3, Func3 + and T4, Func4 + com Func1 + com Func2 + com Func3 + com Func4 + and Bck1, Func1 + and Bck2, Func2 + and Bck3, Func3 + and Bck4, Func4 + eor T1, Bck1 + eor T2, Bck2 + eor T3, Bck3 + eor T4, Bck4 /* done, CH(x,y,z) is in T */ + /* now SIGMA1(a[4]) */ + ldd Bck4, Y+4*4 /* think about using it from Func reg above*/ + ldd Bck1, Y+4*4+1 + ldd Bck2, Y+4*4+2 + ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */ + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotl /* rotr(x,6) */ + movw XAccu1, Func1 + movw XAccu3, Func3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 3 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + movw Func1, Bck3 /* this prerotates furteh 16 bits*/ + movw Func3, Bck1 /* so we have now prerotated by 24 bits*/ + ldi r20, 1 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* finished with SIGMA1, add it to T */ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 + /* now we've to add a[7], w[i] and k[i] */ + ldd XAccu1, Y+4*7 + ldd XAccu2, Y+4*7+1 + ldd XAccu3, Y+4*7+2 + ldd XAccu4, Y+4*7+3 + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add a[7] */ + ld XAccu1, X+ + ld XAccu2, X+ + ld XAccu3, X+ + ld XAccu4, X+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add w[i] */ + lpm XAccu1, Z+ + lpm XAccu2, Z+ + lpm XAccu3, Z+ + lpm XAccu4, Z+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add k[i] */ /* finished with t1 */ + /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/ + /* starting with MAJ(x,y,z) */ + ldd Func1, Y+4*0+0 + ldd Func2, Y+4*0+1 + ldd Func3, Y+4*0+2 + ldd Func4, Y+4*0+3 /* load x=a[0] */ + 
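As a reference for the helper names used in the comments of this routine (CH and SIGMA1 above, MAJ and SIGMA0 below, SIGMA_a/SIGMA_b in the w-expansion), the FIPS 180-2 definitions are, in C (macro names illustrative; the assembly obtains the rotations by combining byte "prerotations" with small bit rotations):

    #define ROTR32(x,n)  (((x) >> (n)) | ((x) << (32 - (n))))
    /* round helpers, used for t1 and t2 in the pseudo code above */
    #define CH(x,y,z)    (((x) & (y)) ^ (~(x) & (z)))
    #define MAJ(x,y,z)   (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
    #define SIGMA0(x)    (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
    #define SIGMA1(x)    (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
    /* message-schedule helpers, SIGMA_a/SIGMA_b in the w-loop above */
    #define sigma_a(x)   (ROTR32(x, 7) ^ ROTR32(x, 18) ^ ((x) >> 3))
    #define sigma_b(x)   (ROTR32(x, 17) ^ ROTR32(x, 19) ^ ((x) >> 10))
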
ldd XAccu1, Y+4*1+0 + ldd XAccu2, Y+4*1+1 + ldd XAccu3, Y+4*1+2 + ldd XAccu4, Y+4*1+3 /* load y=a[1] */ + and XAccu1, Func1 + and XAccu2, Func2 + and XAccu3, Func3 + and XAccu4, Func4 /* XAccu == (x & y) */ + ldd Bck1, Y+4*2+0 + ldd Bck2, Y+4*2+1 + ldd Bck3, Y+4*2+2 + ldd Bck4, Y+4*2+3 /* load z=a[2] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */ + ldd Func1, Y+4*1+0 + ldd Func2, Y+4*1+1 + ldd Func3, Y+4*1+2 + ldd Func4, Y+4*1+3 /* load y=a[1] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */ + /* SIGMA0(a[0]) */ + ldd Bck1, Y+4*0+0 /* we should combine this with above */ + ldd Bck2, Y+4*0+1 + ldd Bck3, Y+4*0+2 + ldd Bck4, Y+4*0+3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotr + movw Accu1, Func1 + movw Accu3, Func3 /* Accu = shr(a[0], 2) */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotate by 16 bits */ + ldi r20, 3 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */ + mov Func1, Bck4 + mov Func2, Bck1 + mov Func3, Bck2 + mov Func4, Bck3 /* prerotate by 24 bits */ + ldi r20, 2 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */ + add Accu1, XAccu1 /* add previous result (MAJ)*/ + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 + /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/ + /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */ + + ldi r21, 7*4 + adiw r28, 7*4 +a_shift_loop: + ld r25, -Y /* warning: this is PREdecrement */ + std Y+4, r25 + dec r21 + brne a_shift_loop + + ldd Bck1, Y+4*4+0 + ldd Bck2, Y+4*4+1 + ldd Bck3, Y+4*4+2 + ldd Bck4, Y+4*4+3 + add Bck1, T1 + adc Bck2, T2 + adc Bck3, T3 + adc Bck4, T4 + std Y+4*4+0, Bck1 + std Y+4*4+1, Bck2 + std Y+4*4+2, Bck3 + std Y+4*4+3, Bck4 + add Accu1, T1 + adc Accu2, T2 + adc Accu3, T3 + adc Accu4, T4 + std Y+4*0+0, Accu1 + std Y+4*0+1, Accu2 + std Y+4*0+2, Accu3 + std Y+4*0+3, Accu4 /* a array updated */ + + + dec LoopC + breq update_state + rjmp sha256_main_loop ;brne sha256_main_loop +update_state: + /* update state */ + /* pointers to state should still exist on the stack ;-) */ + pop r31 + pop r30 + ldi r21, 8 +update_state_loop: + ldd Accu1, Z+0 + ldd Accu2, Z+1 + ldd Accu3, Z+2 + ldd Accu4, Z+3 + ld Func1, Y+ + ld Func2, Y+ + ld Func3, Y+ + ld Func4, Y+ + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + st Z+, Accu1 + st Z+, Accu2 + st Z+, Accu3 + st Z+, Accu4 + dec r21 + brne update_state_loop + /* now we just have to update the length */ + adiw r30, 1 /* since we add 512, we can simply skip the LSB */ + ldi r21, 2 + ldi r22, 6 + ld r20, Z + add r20, r21 + st Z+, r20 + clr r21 +sha256_nextBlock_fix_length: + brcc sha256_nextBlock_epilog + ld r20, Z + adc r20, r21 + st Z+, r20 + dec r22 + brne sha256_nextBlock_fix_length + +; EPILOG +sha256_nextBlock_epilog: +/* now we should clean up the stack */ + + pop r21 + pop r20 + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + + clr r1 + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + pop r7 + pop r6 + pop r5 + pop r4 + 
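
For orientation while reading the register-level code above: the block transform is the one quoted in its own pseudo-code comments, i.e. the FIPS 180-2 message schedule followed by 64 rounds of t1/t2 updates and a final state addition. The C sketch below is a reference model of that computation, not the code the assembler produces; the rotation amounts are the standard ones that the assembly reaches by "prerotating" whole bytes (the movw/mov shuffles) and finishing with bitrotl/bitrotr. The names sha256_compress_ref and k[] are illustrative only; k[] stands in for the sha256_kv constants kept in program memory.

#include <stdint.h>
#include <string.h>

#define ROTR(x,n) (((x) >> (n)) | ((x) << (32 - (n))))
#define SHR(x,n)  ((x) >> (n))

/* small sigmas used in the w expansion (SIGMA_a / SIGMA_b in the comments) */
#define SIGMA_A(x) (ROTR(x,7)  ^ ROTR(x,18) ^ SHR(x,3))
#define SIGMA_B(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))
/* big sigmas, Ch and Maj used in the round function */
#define SIGMA_0(x) (ROTR(x,2)  ^ ROTR(x,13) ^ ROTR(x,22))
#define SIGMA_1(x) (ROTR(x,6)  ^ ROTR(x,11) ^ ROTR(x,25))
#define CH(x,y,z)  (((x) & (y)) ^ (~(x) & (z)))
#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

extern const uint32_t k[64];       /* stand-in for the sha256_kv round constants */

void sha256_compress_ref(uint32_t h[8], const uint32_t m[16]){
    uint32_t w[64], a[8], t1, t2;
    uint8_t i;
    for(i = 0; i < 16; ++i)
        w[i] = m[i];               /* message words, already endian-fixed (wcpyloop) */
    for(i = 16; i < 64; ++i)
        w[i] = SIGMA_B(w[i-2]) + w[i-7] + SIGMA_A(w[i-15]) + w[i-16];
    memcpy(a, h, 8 * sizeof(uint32_t));
    for(i = 0; i < 64; ++i){
        t1 = a[7] + SIGMA_1(a[4]) + CH(a[4], a[5], a[6]) + k[i] + w[i];
        t2 = SIGMA_0(a[0]) + MAJ(a[0], a[1], a[2]);
        memmove(&a[1], &a[0], 7 * sizeof(uint32_t));   /* a_shift_loop above */
        a[4] += t1;
        a[0]  = t1 + t2;
    }
    for(i = 0; i < 8; ++i)
        h[i] += a[i];              /* update_state_loop above */
}

The shifting-a[] formulation (memmove plus the two adds) is exactly what a_shift_loop and the two std blocks perform in place on the stack.
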
ret + +sha256_kv: ; round-key-vector stored in ProgMem +.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c +.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b +.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9 +.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429 +.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272 +.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a +.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e +.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671 + + +;########################################################### + +.global sha256_init +;uint32_t sha256_init_vector[]={ +; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, +; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; +; +;void sha256_init(sha256_ctx_t *state){ +; state->length=0; +; memcpy(state->h, sha256_init_vector, 8*4); +;} +; param1: (r23,r24) 16-bit pointer to sha256_ctx_t struct in ram +; modifys: Z(r30,r31), Func1, r22 +sha256_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8((sha256_init_vector)) + ldi r31, hi8((sha256_init_vector)) + ldi r22, 32+8 +sha256_init_vloop: + lpm r23, Z+ + st X+, r23 + dec r22 + brne sha256_init_vloop + ret + +sha256_init_vector: +.word 0xE667, 0x6A09 +.word 0xAE85, 0xBB67 +.word 0xF372, 0x3C6E +.word 0xF53A, 0xA54F +.word 0x527F, 0x510E +.word 0x688C, 0x9B05 +.word 0xD9AB, 0x1F83 +.word 0xCD19, 0x5BE0 +.word 0x0000, 0x0000 +.word 0x0000, 0x0000 + +;########################################################### + +.global rotl32 +; === ROTL32 === +; function that rotates a 32 bit word to the left +; param1: the 32-bit word to rotate +; given in r25,r24,r23,r22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotl32: + cpi r20, 8 + brlo bitrotl + mov r21, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + subi r20, 8 + rjmp rotl32 +bitrotl: + clr r21 + clc +bitrotl_loop: + tst r20 + breq fixrotl + rol r22 + rol r23 + rol r24 + rol r25 + rol r21 + dec r20 + rjmp bitrotl_loop +fixrotl: + or r22, r21 + ret + + +;########################################################### + +.global rotr32 +; === ROTR32 === +; function that rotates a 32 bit word to the right +; param1: the 32-bit word to rotate +; given in r25,r24,r23,22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotr32: + cpi r20, 8 + brlo bitrotr + mov r21, r22 + mov r22, r23 + mov r23, r24 + mov r24, r25 + mov r25, r21 + subi r20, 8 + rjmp rotr32 +bitrotr: + clr r21 + clc +bitrotr_loop: + tst r20 + breq fixrotr + ror r25 + ror r24 + ror r23 + ror r22 + ror r21 + dec r20 + rjmp bitrotr_loop +fixrotr: + or r25, r21 + ret + + +;########################################################### + +.global change_endian32 +; === change_endian32 === +; function that changes the endianess of a 32-bit word +; param1: the 
32-bit word +; given in r25,r24,r23,22 (r25 is most significant) +; modifys: r21, r22 +change_endian32: + movw r20, r22 ; (r22,r23) --> (r20,r21) + mov r22, r25 + mov r23, r24 + mov r24, r21 + mov r25, r20 + ret + diff --git a/sha256.c b/sha256/sha256.c similarity index 100% rename from sha256.c rename to sha256/sha256.c diff --git a/sha256/sha256.h b/sha256/sha256.h new file mode 100644 index 0000000..24960a3 --- /dev/null +++ b/sha256/sha256.h @@ -0,0 +1,122 @@ +/* sha256.h */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/** + * \file sha256.h + * \author Daniel Otte + * \date 2006-05-16 + * \license GPLv3 or later + * + */ + +#ifndef SHA256_H_ +#define SHA256_H_ + +#define __LITTLE_ENDIAN__ + + +#include + +/** \def SHA256_HASH_BITS + * defines the size of a SHA-256 hash value in bits + */ + +/** \def SHA256_HASH_BYTES + * defines the size of a SHA-256 hash value in bytes + */ + +/** \def SHA256_BLOCK_BITS + * defines the size of a SHA-256 input block in bits + */ + +/** \def SHA256_BLOCK_BYTES + * defines the size of a SHA-256 input block in bytes + */ + +#define SHA256_HASH_BITS 256 +#define SHA256_HASH_BYTES (SHA256_HASH_BITS/8) +#define SHA256_BLOCK_BITS 512 +#define SHA256_BLOCK_BYTES (SHA256_BLOCK_BITS/8) + +/** \typedef sha256_ctx_t + * \brief SHA-256 context type + * + * A variable of this type may hold the state of a SHA-256 hashing process + */ +typedef struct { + uint32_t h[8]; + uint64_t length; +} sha256_ctx_t; + +/** \typedef sha256_hash_t + * \brief SHA-256 hash value type + * + * A variable of this type may hold the hash value produced by the + * sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state) function. + */ +typedef uint8_t sha256_hash_t[SHA256_HASH_BYTES]; + +/** \fn void sha256_init(sha256_ctx_t *state) + * \brief initialise a SHA-256 context + * + * This function sets a ::sha256_ctx_t to the initial values for hashing. + * \param state pointer to the SHA-256 hashing context + */ +void sha256_init(sha256_ctx_t *state); + +/** \fn void sha256_nextBlock (sha256_ctx_t* state, const void* block) + * \brief update the context with a given block + * + * This function updates the SHA-256 hash context by processing the given block + * of fixed length. + * \param state pointer to the SHA-256 hash context + * \param block pointer to the block of fixed length (512 bit = 64 byte) + */ +void sha256_nextBlock (sha256_ctx_t* state, const void* block); + +/** \fn void sha256_lastBlock(sha256_ctx_t* state, const void* block, uint16_t length_b) + * \brief finalize the context with the given block + * + * This function finalizes the SHA-256 hash context by processing the given block + * of variable length. 
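
Taken together, the declarations in this header give the usual streaming interface (init, nextBlock for complete 64-byte blocks, lastBlock for the tail, ctx2hash to extract the digest) plus a one-shot sha256() helper. A hedged usage sketch of the streaming side follows; hash_stream_example and its parameter names are illustrative, not part of the library.

#include <stdint.h>
#include "sha256.h"

/* hash a message held in RAM; msg / len_bytes are made-up example names */
void hash_stream_example(sha256_hash_t *digest,
                         const uint8_t *msg, uint32_t len_bytes){
    sha256_ctx_t ctx;
    sha256_init(&ctx);
    while(len_bytes >= SHA256_BLOCK_BYTES){   /* feed all full 512-bit blocks */
        sha256_nextBlock(&ctx, msg);
        msg       += SHA256_BLOCK_BYTES;
        len_bytes -= SHA256_BLOCK_BYTES;
    }
    /* the remainder (possibly empty) goes through lastBlock; note that the
     * length argument is given in bits, not bytes */
    sha256_lastBlock(&ctx, msg, (uint16_t)(len_bytes * 8));
    sha256_ctx2hash(digest, &ctx);
}

The one-shot sha256(dest, msg, length_b) declared further down wraps the same sequence, again taking the message length in bits.
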
+ * \param state pointer to the SHA-256 hash context + * \param block pointer to the block of fixed length (512 bit = 64 byte) + * \param length_b the length of the block in bits + */ +void sha256_lastBlock(sha256_ctx_t* state, const void* block, uint16_t length_b); + +/** \fn void sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state) + * \brief convert the hash state into the hash value + * This function reads the context and writes the hash value to the destination + * \param dest pointer to the location where the hash value should be written + * \param state pointer to the SHA-256 hash context + */ +void sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state); + +/** \fn void sha256(sha256_hash_t* dest, const void* msg, uint32_t length_b) + * \brief simple SHA-256 hashing function for direct hashing + * + * This function automaticaly hashes a given message of arbitary length with + * the SHA-256 hashing algorithm. + * \param dest pointer to the location where the hash value is going to be written to + * \param msg pointer to the message thats going to be hashed + * \param length_b length of the message in bits + */ +void sha256(sha256_hash_t* dest, const void* msg, uint32_t length_b); + +#endif /*SHA256_H_*/ diff --git a/shabea/memxor.S b/shabea/memxor.S new file mode 100644 index 0000000..a32058b --- /dev/null +++ b/shabea/memxor.S @@ -0,0 +1,66 @@ +/* memxor.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +/* + * File: memxor.S + * Author: Daniel Otte + * Date: 2008-08-07 + * License: GPLv3 or later + * Description: memxor, XORing one block into another + * + */ + +/* + * void memxor(void* dest, const void* src, uint16_t n); + */ + /* + * param dest is passed in r24:r25 + * param src is passed in r22:r23 + * param n is passed in r20:r21 + */ +.global memxor +memxor: + movw r30, r24 + movw r26, r22 + movw r24, r20 + adiw r24, 0 + breq 2f +1: + ld r20, X+ + ld r21, Z + eor r20, r21 + st Z+, r20 + sbiw r24, 1 + brne 1b +2: + ret + + + + + + + + + + + + + + diff --git a/shabea/memxor.h b/shabea/memxor.h new file mode 100644 index 0000000..a62a616 --- /dev/null +++ b/shabea/memxor.h @@ -0,0 +1,7 @@ +#ifndef MEMXOR_H_ +#define MEMXOR_H_ +#include + +void memxor(void* dest, const void* src, uint16_t n); + +#endif diff --git a/shabea/sha256-asm.S b/shabea/sha256-asm.S new file mode 100644 index 0000000..d9eb6b6 --- /dev/null +++ b/shabea/sha256-asm.S @@ -0,0 +1,1042 @@ +/* sha256-asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
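
The memxor routine above is small enough that a C restatement makes its register comments concrete: Z walks the destination, X walks the source, and each byte of dest is XORed with the corresponding byte of src in place. A behavioural sketch, assuming nothing beyond the declared prototype (memxor_ref is an illustrative name, not a library symbol):

#include <stdint.h>

/* C model of memxor above: dest[i] ^= src[i] for i = 0 .. n-1 */
void memxor_ref(void *dest, const void *src, uint16_t n){
    uint8_t *d = (uint8_t*)dest;             /* Z pointer in the assembly */
    const uint8_t *s = (const uint8_t*)src;  /* X pointer in the assembly */
    while(n--){
        *d++ ^= *s++;
    }
}

Only dest is written back, matching the "XORing one block into another" description in the file header.
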
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/* + * Author: Daniel Otte + * + * License: GPLv3 or later +*/ +; sha-256 implementation in assembler +SHA256_BLOCK_BITS = 512 +SHA256_HASH_BITS = 256 + +.macro precall + /* push r18 - r27, r30 - r31*/ + push r0 + push r1 + push r18 + push r19 + push r20 + push r21 + push r22 + push r23 + push r24 + push r25 + push r26 + push r27 + push r30 + push r31 + clr r1 +.endm + +.macro postcall + pop r31 + pop r30 + pop r27 + pop r26 + pop r25 + pop r24 + pop r23 + pop r22 + pop r21 + pop r20 + pop r19 + pop r18 + pop r1 + pop r0 +.endm + + +.macro hexdump length + push r27 + push r26 + ldi r25, '\r' + mov r24, r25 + call uart_putc + ldi r25, '\n' + mov r24, r25 + call uart_putc + pop r26 + pop r27 + movw r24, r26 +.if \length > 16 + ldi r22, lo8(16) + ldi r23, hi8(16) + push r27 + push r26 + call uart_hexdump + pop r26 + pop r27 + adiw r26, 16 + hexdump \length-16 +.else + ldi r22, lo8(\length) + ldi r23, hi8(\length) + call uart_hexdump +.endif +.endm + +/* X points to Block */ +.macro dbg_hexdump length + precall + hexdump \length + postcall +.endm + +.section .text + +SPL = 0x3D +SPH = 0x3E +SREG = 0x3F + + +; +;sha256_ctx_t is: +; +; [h0][h1][h2][h3][h4][h5][h6][h7][length] +; hn is 32 bit large, length is 64 bit large + +;########################################################### + +.global sha256_ctx2hash +; === sha256_ctx2hash === +; this function converts a state into a normal hash (bytestring) +; param1: the 16-bit destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to sha256_ctx structure +; given in r23,r22 +sha256_ctx2hash: + movw r26, r22 + movw r30, r24 + ldi r21, 8 + sbiw r26, 4 +1: + ldi r20, 4 + adiw r26, 8 +2: + ld r0, -X + st Z+, r0 + dec r20 + brne 2b + + dec r21 + brne 1b + + ret + +;########################################################### + +.global sha256 +; === sha256 === +; this function calculates SHA-256 hashes from messages in RAM +; param1: the 16-bit hash destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to message +; given in r23,r22 +; param3: 32-bit length value (length of message in bits) +; given in r21,r20,r19,r18 +sha256: +sha256_prolog: + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r16 + push r17 + in r16, SPL + in r17, SPH + subi r16, 8*4+8 + sbci r17, 0 + in r0, SREG + cli + out SPL, r16 + out SPH, r17 + out SREG, r0 + + push r25 + push r24 + inc r16 + adc r17, r1 + + movw r8, r18 /* backup of length*/ + movw r10, r20 + + movw r12, r22 /* backup pf msg-ptr */ + + movw r24, r16 + rcall sha256_init + /* if length >= 512 */ +1: + tst r11 + brne 4f + tst r10 + brne 4f + mov r19, r9 + cpi r19, 0x02 + brlo 4f + + movw r24, r16 + movw r22, r12 + rcall sha256_nextBlock + ldi r19, 0x64 + add r22, r19 + adc r23, r1 + /* length -= 512 */ + ldi r19, 0x02 + sub r9, r19 + sbc r10, r1 + sbc r11, r1 + rjmp 1b + +4: + movw r24, r16 + movw r22, r12 + movw r20, r8 + rcall sha256_lastBlock + + pop r24 + pop r25 + movw r22, r16 + rcall sha256_ctx2hash + +sha256_epilog: + in r30, SPL + in r31, SPH + adiw r30, 8*4+8 + in r0, SREG + cli + out SPL, r30 + out SPH, r31 + out SREG, r0 + pop r17 + pop r16 + pop r13 + 
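
As its comments indicate, the sha256 entry point above is only a driver: it allocates a context on the stack, feeds complete 512-bit blocks to sha256_nextBlock, and leaves the tail plus padding to sha256_lastBlock before extracting the digest with sha256_ctx2hash. A rough C model of that intent follows; the register bookkeeping and the exact loop tests of the assembly are deliberately simplified, and sha256_ref / p are illustrative names.

#include <stdint.h>
#include "sha256.h"

/* what the assembly driver above is meant to do, expressed in C */
void sha256_ref(sha256_hash_t *dest, const void *msg, uint32_t length_b){
    sha256_ctx_t ctx;                        /* stack-allocated, as above */
    const uint8_t *p = (const uint8_t*)msg;
    sha256_init(&ctx);
    while(length_b >= SHA256_BLOCK_BITS){    /* "if length >= 512" */
        sha256_nextBlock(&ctx, p);
        p        += SHA256_BLOCK_BYTES;
        length_b -= SHA256_BLOCK_BITS;       /* "length -= 512" */
    }
    sha256_lastBlock(&ctx, p, (uint16_t)length_b);
    sha256_ctx2hash(dest, &ctx);
}
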
pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + ret + +;########################################################### + + +; block MUST NOT be larger than 64 bytes + +.global sha256_lastBlock +; === sha256_lastBlock === +; this function does padding & Co. for calculating SHA-256 hashes +; param1: the 16-bit pointer to sha256_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +; param3: an 16-bit integer specifing length of block in bits +; given in r21,r20 +sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1) + + +sha256_lastBlock: + cpi r21, 0x02 + brlo sha256_lastBlock_prolog + push r25 + push r24 + push r23 + push r22 + push r21 + push r20 + rcall sha256_nextBlock + pop r20 + pop r21 + pop r22 + pop r23 + pop r24 + pop r25 + subi r21, 0x02 + subi r23, -2 + rjmp sha256_lastBlock +sha256_lastBlock_prolog: + /* allocate space on stack */ + in r30, SPL + in r31, SPH + in r1, SREG + subi r30, lo8(64) + sbci r31, hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + + adiw r30, 1 /* SP points to next free byte on stack */ + mov r18, r20 /* r20 = LSB(length) */ + lsr r18 + lsr r18 + lsr r18 + bst r21, 0 /* may be we should explain this ... */ + bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ + + + movw r26, r22 /* X points to begin of msg */ + tst r18 + breq sha256_lastBlock_post_copy + mov r1, r18 +sha256_lastBlock_copy_loop: + ld r0, X+ + st Z+, r0 + dec r1 + brne sha256_lastBlock_copy_loop +sha256_lastBlock_post_copy: +sha256_lastBlock_insert_stuffing_bit: + ldi r19, 0x80 + mov r0,r19 + ldi r19, 0x07 + and r19, r20 /* if we are in bitmode */ + breq 2f /* no bitmode */ +1: + lsr r0 + dec r19 + brne 1b + ld r19, X +/* maybe we should do some ANDing here, just for safety */ + or r0, r19 +2: + st Z+, r0 + inc r18 + +/* checking stuff here */ + cpi r18, 64-8+1 + brsh 0f + rjmp sha256_lastBlock_insert_zeros +0: + /* oh shit, we landed here */ + /* first we have to fill it up with zeros */ + ldi r19, 64 + sub r19, r18 + breq 2f +1: + st Z+, r1 + dec r19 + brne 1b +2: + sbiw r30, 63 + sbiw r30, 1 + movw r22, r30 + + push r31 + push r30 + push r25 + push r24 + push r21 + push r20 + rcall sha256_nextBlock + pop r20 + pop r21 + pop r24 + pop r25 + pop r30 + pop r31 + + /* now we should subtract 512 from length */ + movw r26, r24 + adiw r26, 4*8+1 /* we can skip the lowest byte */ + ld r19, X + subi r19, hi8(512) + st X+, r19 + ldi r18, 6 +1: + ld r19, X + sbci r19, 0 + st X+, r19 + dec r18 + brne 1b + +; clr r18 /* not neccessary ;-) */ + /* reset Z pointer to begin of block */ + +sha256_lastBlock_insert_zeros: + ldi r19, 64-8 + sub r19, r18 + breq sha256_lastBlock_insert_length + clr r1 +1: + st Z+, r1 /* r1 is still zero */ + dec r19 + brne 1b + +; rjmp sha256_lastBlock_epilog +sha256_lastBlock_insert_length: + movw r26, r24 /* X points to state */ + adiw r26, 8*4 /* X points to (state.length) */ + adiw r30, 8 /* Z points one after the last byte of block */ + ld r0, X+ + add r0, r20 + st -Z, r0 + ld r0, X+ + adc r0, r21 + st -Z, r0 + ldi r19, 6 +1: + ld r0, X+ + adc r0, r1 + st -Z, r0 + dec r19 + brne 1b + + sbiw r30, 64-8 + movw r22, r30 + rcall sha256_nextBlock + +sha256_lastBlock_epilog: + in r30, SPL + in r31, SPH + in r1, SREG + adiw r30, 63 ; lo8(64) + adiw r30, 1 ; hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + clr r1 + clr r0 + ret + +/**/ +;########################################################### + +.global sha256_nextBlock +; === sha256_nextBlock === +; this is the core 
function for calculating SHA-256 hashes +; param1: the 16-bit pointer to sha256_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +sha256_nextBlock_localSpace = (64+8)*4 ; 64 32-bit values for w array and 8 32-bit values for a array (total 288 byte) + +Bck1 = 12 +Bck2 = 13 +Bck3 = 14 +Bck4 = 15 +Func1 = 22 +Func2 = 23 +Func3 = 24 +Func4 = 25 +Accu1 = 16 +Accu2 = 17 +Accu3 = 18 +Accu4 = 19 +XAccu1 = 8 +XAccu2 = 9 +XAccu3 = 10 +XAccu4 = 11 +T1 = 4 +T2 = 5 +T3 = 6 +T4 = 7 +LoopC = 1 +/* byteorder: high number <--> high significance */ +sha256_nextBlock: + ; initial, let's make some space ready for local vars + push r4 /* replace push & pop by mem ops? */ + push r5 + push r6 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + in r20, SPL + in r21, SPH + movw r18, r20 ;backup SP +; movw r26, r20 ; X points to free space on stack + movw r30, r22 ; Z points to message + subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63 + sbci r21, hi8(sha256_nextBlock_localSpace) + movw r26, r20 ; X points to free space on stack + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + push r18 + push r19 + push r24 + push r25 /* param1 will be needed later */ + ; now we fill the w array with message (think about endianess) + adiw r26, 1 ; X++ + ldi r20, 16 +sha256_nextBlock_wcpyloop: + ld r23, Z+ + ld r22, Z+ + ld r19, Z+ + ld r18, Z+ + st X+, r18 + st X+, r19 + st X+, r22 + st X+, r23 + dec r20 + brne sha256_nextBlock_wcpyloop +/* for (i=16; i<64; ++i){ + w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16]; + } */ + /* r25,r24,r23,r24 (r21,r20) are function values + r19,r18,r17,r16 are the accumulator + r15,r14,r13,rBck1 are backup1 + r11,r10,r9 ,r8 are xor accu + r1 is round counter */ + + ldi r20, 64-16 + mov LoopC, r20 +sha256_nextBlock_wcalcloop: + movw r30, r26 ; cp X to Z + sbiw r30, 63 + sbiw r30, 1 ; substract 64 = 16*4 + ld Accu1, Z+ + ld Accu2, Z+ + ld Accu3, Z+ + ld Accu4, Z+ /* w[i] = w[i-16] */ + ld Bck1, Z+ + ld Bck2, Z+ + ld Bck3, Z+ + ld Bck4, Z+ /* backup = w[i-15] */ + /* now sigma 0 */ + mov Func1, Bck2 + mov Func2, Bck3 + mov Func3, Bck4 + mov Func4, Bck1 /* prerotated by 8 */ + ldi r20, 1 + rcall bitrotl + movw XAccu1, Func1 + movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/ +sigma0_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + ror Bck1 + dec Func2 + brne sigma0_shr + eor XAccu1, Bck1 + eor XAccu2, Bck2 + eor XAccu3, Bck3 + eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + ldd Func1, Z+7*4 /* now accu += w[i-7] */ + ldd Func2, Z+7*4+1 + ldd Func3, Z+7*4+2 + ldd Func4, Z+7*4+3 + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + ldd Bck1, Z+12*4 /* now backup = w[i-2]*/ + ldd Bck2, Z+12*4+1 + ldd Bck3, Z+12*4+2 + ldd Bck4, Z+12*4+3 + /* now sigma 1 */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 1 + rcall bitrotr + movw XAccu3, Func3 + movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */ +; 
movw Func1, Bck3 +; movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ /*we can destroy backup now*/ +sigma1_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + dec Func2 + brne sigma1_shr + eor XAccu1, Bck2 + eor XAccu2, Bck3 + eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + /* now let's store the shit */ + st X+, Accu1 + st X+, Accu2 + st X+, Accu3 + st X+, Accu4 + dec LoopC + breq 3f ; skip if zero + rjmp sha256_nextBlock_wcalcloop +3: + /* we are finished with w array X points one byte post w */ +/* init a array */ + pop r31 + pop r30 + push r30 + push r31 + ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */ +init_a_array: + ld r1, Z+ + st X+, r1 + dec r25 + brne init_a_array + +/* now the real fun begins */ +/* for (i=0; i<64; ++i){ + t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i]; + t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]); + memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; + a[4] += t1; + a[0] = t1 + t2; + } */ + /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */ + sbiw r26, 8*4 /* X still points at a[7]+1*/ + movw r28, r26 + ldi r30, lo8(sha256_kv) + ldi r31, hi8(sha256_kv) + dec r27 /* X - (64*4 == 256) */ + ldi r25, 64 + mov LoopC, r25 +sha256_main_loop: + /* now calculate t1 */ + /*CH(x,y,z) = (x&y)^((~x)&z)*/ + ldd T1, Y+5*4 + ldd T2, Y+5*4+1 + ldd T3, Y+5*4+2 + ldd T4, Y+5*4+3 /* y in T */ + ldd Func1, Y+4*4 + ldd Func2, Y+4*4+1 + ldd Func3, Y+4*4+2 + ldd Func4, Y+4*4+3 /* x in Func */ + ldd Bck1, Y+6*4 + ldd Bck2, Y+6*4+1 + ldd Bck3, Y+6*4+2 + ldd Bck4, Y+6*4+3 /* z in Bck */ + and T1, Func1 + and T2, Func2 + and T3, Func3 + and T4, Func4 + com Func1 + com Func2 + com Func3 + com Func4 + and Bck1, Func1 + and Bck2, Func2 + and Bck3, Func3 + and Bck4, Func4 + eor T1, Bck1 + eor T2, Bck2 + eor T3, Bck3 + eor T4, Bck4 /* done, CH(x,y,z) is in T */ + /* now SIGMA1(a[4]) */ + ldd Bck4, Y+4*4 /* think about using it from Func reg above*/ + ldd Bck1, Y+4*4+1 + ldd Bck2, Y+4*4+2 + ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */ + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotl /* rotr(x,6) */ + movw XAccu1, Func1 + movw XAccu3, Func3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 3 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + movw Func1, Bck3 /* this prerotates furteh 16 bits*/ + movw Func3, Bck1 /* so we have now prerotated by 24 bits*/ + ldi r20, 1 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* finished with SIGMA1, add it to T */ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 + /* now we've to add a[7], w[i] and k[i] */ + ldd XAccu1, Y+4*7 + ldd XAccu2, Y+4*7+1 + ldd XAccu3, Y+4*7+2 + ldd XAccu4, Y+4*7+3 + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add a[7] */ + ld XAccu1, X+ + ld XAccu2, X+ + ld XAccu3, X+ + ld XAccu4, X+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add w[i] */ + lpm XAccu1, Z+ + lpm XAccu2, Z+ + lpm XAccu3, Z+ + lpm XAccu4, Z+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add k[i] */ /* 
finished with t1 */ + /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/ + /* starting with MAJ(x,y,z) */ + ldd Func1, Y+4*0+0 + ldd Func2, Y+4*0+1 + ldd Func3, Y+4*0+2 + ldd Func4, Y+4*0+3 /* load x=a[0] */ + ldd XAccu1, Y+4*1+0 + ldd XAccu2, Y+4*1+1 + ldd XAccu3, Y+4*1+2 + ldd XAccu4, Y+4*1+3 /* load y=a[1] */ + and XAccu1, Func1 + and XAccu2, Func2 + and XAccu3, Func3 + and XAccu4, Func4 /* XAccu == (x & y) */ + ldd Bck1, Y+4*2+0 + ldd Bck2, Y+4*2+1 + ldd Bck3, Y+4*2+2 + ldd Bck4, Y+4*2+3 /* load z=a[2] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */ + ldd Func1, Y+4*1+0 + ldd Func2, Y+4*1+1 + ldd Func3, Y+4*1+2 + ldd Func4, Y+4*1+3 /* load y=a[1] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */ + /* SIGMA0(a[0]) */ + ldd Bck1, Y+4*0+0 /* we should combine this with above */ + ldd Bck2, Y+4*0+1 + ldd Bck3, Y+4*0+2 + ldd Bck4, Y+4*0+3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotr + movw Accu1, Func1 + movw Accu3, Func3 /* Accu = shr(a[0], 2) */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotate by 16 bits */ + ldi r20, 3 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */ + mov Func1, Bck4 + mov Func2, Bck1 + mov Func3, Bck2 + mov Func4, Bck3 /* prerotate by 24 bits */ + ldi r20, 2 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */ + add Accu1, XAccu1 /* add previous result (MAJ)*/ + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 + /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/ + /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */ + + ldi r21, 7*4 + adiw r28, 7*4 +a_shift_loop: + ld r25, -Y /* warning: this is PREdecrement */ + std Y+4, r25 + dec r21 + brne a_shift_loop + + ldd Bck1, Y+4*4+0 + ldd Bck2, Y+4*4+1 + ldd Bck3, Y+4*4+2 + ldd Bck4, Y+4*4+3 + add Bck1, T1 + adc Bck2, T2 + adc Bck3, T3 + adc Bck4, T4 + std Y+4*4+0, Bck1 + std Y+4*4+1, Bck2 + std Y+4*4+2, Bck3 + std Y+4*4+3, Bck4 + add Accu1, T1 + adc Accu2, T2 + adc Accu3, T3 + adc Accu4, T4 + std Y+4*0+0, Accu1 + std Y+4*0+1, Accu2 + std Y+4*0+2, Accu3 + std Y+4*0+3, Accu4 /* a array updated */ + + + dec LoopC + breq update_state + rjmp sha256_main_loop ;brne sha256_main_loop +update_state: + /* update state */ + /* pointers to state should still exist on the stack ;-) */ + pop r31 + pop r30 + ldi r21, 8 +update_state_loop: + ldd Accu1, Z+0 + ldd Accu2, Z+1 + ldd Accu3, Z+2 + ldd Accu4, Z+3 + ld Func1, Y+ + ld Func2, Y+ + ld Func3, Y+ + ld Func4, Y+ + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + st Z+, Accu1 + st Z+, Accu2 + st Z+, Accu3 + st Z+, Accu4 + dec r21 + brne update_state_loop + /* now we just have to update the length */ + adiw r30, 1 /* since we add 512, we can simply skip the LSB */ + ldi r21, 2 + ldi r22, 6 + ld r20, Z + add r20, r21 + st Z+, r20 + clr r21 +sha256_nextBlock_fix_length: + brcc sha256_nextBlock_epilog + ld r20, Z + adc r20, r21 + st Z+, r20 + dec r22 + brne sha256_nextBlock_fix_length + +; EPILOG +sha256_nextBlock_epilog: +/* now we should clean up the stack */ + + pop r21 + pop r20 + in r0, SREG + 
cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + + clr r1 + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + pop r7 + pop r6 + pop r5 + pop r4 + ret + +sha256_kv: ; round-key-vector stored in ProgMem +.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c +.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b +.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9 +.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429 +.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272 +.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a +.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e +.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671 + + +;########################################################### + +.global sha256_init +;uint32_t sha256_init_vector[]={ +; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, +; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; +; +;void sha256_init(sha256_ctx_t *state){ +; state->length=0; +; memcpy(state->h, sha256_init_vector, 8*4); +;} +; param1: (r23,r24) 16-bit pointer to sha256_ctx_t struct in ram +; modifys: Z(r30,r31), Func1, r22 +sha256_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8((sha256_init_vector)) + ldi r31, hi8((sha256_init_vector)) + ldi r22, 32+8 +sha256_init_vloop: + lpm r23, Z+ + st X+, r23 + dec r22 + brne sha256_init_vloop + ret + +sha256_init_vector: +.word 0xE667, 0x6A09 +.word 0xAE85, 0xBB67 +.word 0xF372, 0x3C6E +.word 0xF53A, 0xA54F +.word 0x527F, 0x510E +.word 0x688C, 0x9B05 +.word 0xD9AB, 0x1F83 +.word 0xCD19, 0x5BE0 +.word 0x0000, 0x0000 +.word 0x0000, 0x0000 + +;########################################################### + +.global rotl32 +; === ROTL32 === +; function that rotates a 32 bit word to the left +; param1: the 32-bit word to rotate +; given in r25,r24,r23,r22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotl32: + cpi r20, 8 + brlo bitrotl + mov r21, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + subi r20, 8 + rjmp rotl32 +bitrotl: + clr r21 + clc +bitrotl_loop: + tst r20 + breq fixrotl + rol r22 + rol r23 + rol r24 + rol r25 + rol r21 + dec r20 + rjmp bitrotl_loop +fixrotl: + or r22, r21 + ret + + +;########################################################### + +.global rotr32 +; === ROTR32 === +; function that rotates a 32 bit word to the right +; param1: the 32-bit word to rotate +; given in r25,r24,r23,22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotr32: + cpi r20, 8 + brlo bitrotr + mov r21, r22 + mov r22, r23 + mov r23, r24 + mov r24, r25 + mov r25, r21 + subi r20, 8 + rjmp rotr32 +bitrotr: + clr r21 + clc +bitrotr_loop: + tst r20 + breq fixrotr + ror r25 + ror r24 + ror r23 + ror r22 
+ ror r21 + dec r20 + rjmp bitrotr_loop +fixrotr: + or r25, r21 + ret + + +;########################################################### + +.global change_endian32 +; === change_endian32 === +; function that changes the endianess of a 32-bit word +; param1: the 32-bit word +; given in r25,r24,r23,22 (r25 is most significant) +; modifys: r21, r22 +change_endian32: + movw r20, r22 ; (r22,r23) --> (r20,r21) + mov r22, r25 + mov r23, r24 + mov r24, r21 + mov r25, r20 + ret + diff --git a/shabea/sha256.h b/shabea/sha256.h new file mode 100644 index 0000000..24960a3 --- /dev/null +++ b/shabea/sha256.h @@ -0,0 +1,122 @@ +/* sha256.h */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/** + * \file sha256.h + * \author Daniel Otte + * \date 2006-05-16 + * \license GPLv3 or later + * + */ + +#ifndef SHA256_H_ +#define SHA256_H_ + +#define __LITTLE_ENDIAN__ + + +#include + +/** \def SHA256_HASH_BITS + * defines the size of a SHA-256 hash value in bits + */ + +/** \def SHA256_HASH_BYTES + * defines the size of a SHA-256 hash value in bytes + */ + +/** \def SHA256_BLOCK_BITS + * defines the size of a SHA-256 input block in bits + */ + +/** \def SHA256_BLOCK_BYTES + * defines the size of a SHA-256 input block in bytes + */ + +#define SHA256_HASH_BITS 256 +#define SHA256_HASH_BYTES (SHA256_HASH_BITS/8) +#define SHA256_BLOCK_BITS 512 +#define SHA256_BLOCK_BYTES (SHA256_BLOCK_BITS/8) + +/** \typedef sha256_ctx_t + * \brief SHA-256 context type + * + * A variable of this type may hold the state of a SHA-256 hashing process + */ +typedef struct { + uint32_t h[8]; + uint64_t length; +} sha256_ctx_t; + +/** \typedef sha256_hash_t + * \brief SHA-256 hash value type + * + * A variable of this type may hold the hash value produced by the + * sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state) function. + */ +typedef uint8_t sha256_hash_t[SHA256_HASH_BYTES]; + +/** \fn void sha256_init(sha256_ctx_t *state) + * \brief initialise a SHA-256 context + * + * This function sets a ::sha256_ctx_t to the initial values for hashing. + * \param state pointer to the SHA-256 hashing context + */ +void sha256_init(sha256_ctx_t *state); + +/** \fn void sha256_nextBlock (sha256_ctx_t* state, const void* block) + * \brief update the context with a given block + * + * This function updates the SHA-256 hash context by processing the given block + * of fixed length. + * \param state pointer to the SHA-256 hash context + * \param block pointer to the block of fixed length (512 bit = 64 byte) + */ +void sha256_nextBlock (sha256_ctx_t* state, const void* block); + +/** \fn void sha256_lastBlock(sha256_ctx_t* state, const void* block, uint16_t length_b) + * \brief finalize the context with the given block + * + * This function finalizes the SHA-256 hash context by processing the given block + * of variable length. 
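
The rotl32 / rotr32 / change_endian32 helpers shown just before this header rotate by whole bytes first (register moves) and only then by single bits, and swap byte order respectively. The C models below restate that behaviour for reference; the _ref suffixed names are illustrative, not library symbols, and n is assumed to be in the range 0..31.

#include <stdint.h>

uint32_t rotl32_ref(uint32_t x, uint8_t n){
    while(n >= 8){ x = (x << 8) | (x >> 24); n -= 8; }   /* whole-byte moves */
    while(n--)   { x = (x << 1) | (x >> 31); }           /* remaining bits   */
    return x;
}

uint32_t rotr32_ref(uint32_t x, uint8_t n){
    while(n >= 8){ x = (x >> 8) | (x << 24); n -= 8; }
    while(n--)   { x = (x >> 1) | (x << 31); }
    return x;
}

uint32_t change_endian32_ref(uint32_t x){                /* byte swap        */
    return (x << 24) | ((x << 8) & 0x00ff0000UL)
         | ((x >> 8) & 0x0000ff00UL) | (x >> 24);
}

The byte-then-bit split is also what the "prerotate by 8/16/24 bits" comments in the block function exploit: the byte part is free register renaming, only the residue costs a rol/ror loop.
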
+ * \param state pointer to the SHA-256 hash context + * \param block pointer to the block of fixed length (512 bit = 64 byte) + * \param length_b the length of the block in bits + */ +void sha256_lastBlock(sha256_ctx_t* state, const void* block, uint16_t length_b); + +/** \fn void sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state) + * \brief convert the hash state into the hash value + * This function reads the context and writes the hash value to the destination + * \param dest pointer to the location where the hash value should be written + * \param state pointer to the SHA-256 hash context + */ +void sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state); + +/** \fn void sha256(sha256_hash_t* dest, const void* msg, uint32_t length_b) + * \brief simple SHA-256 hashing function for direct hashing + * + * This function automaticaly hashes a given message of arbitary length with + * the SHA-256 hashing algorithm. + * \param dest pointer to the location where the hash value is going to be written to + * \param msg pointer to the message thats going to be hashed + * \param length_b length of the message in bits + */ +void sha256(sha256_hash_t* dest, const void* msg, uint32_t length_b); + +#endif /*SHA256_H_*/ diff --git a/shabea.c b/shabea/shabea.c similarity index 100% rename from shabea.c rename to shabea/shabea.c diff --git a/shabea.h b/shabea/shabea.h similarity index 100% rename from shabea.h rename to shabea/shabea.h diff --git a/shacal1/sha1-asm.S b/shacal1/sha1-asm.S new file mode 100644 index 0000000..f571685 --- /dev/null +++ b/shacal1/sha1-asm.S @@ -0,0 +1,886 @@ +/* sha1-asm.S */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ +/* + * Author: Daniel Otte + * + * License: GPLv3 or later +*/ +; SHA1 implementation in assembler for AVR +SHA1_BLOCK_BITS = 512 +SHA1_HASH_BITS = 160 + +.macro precall + /* push r18 - r27, r30 - r31*/ + push r0 + push r1 + push r18 + push r19 + push r20 + push r21 + push r22 + push r23 + push r24 + push r25 + push r26 + push r27 + push r30 + push r31 + clr r1 +.endm + +.macro postcall + pop r31 + pop r30 + pop r27 + pop r26 + pop r25 + pop r24 + pop r23 + pop r22 + pop r21 + pop r20 + pop r19 + pop r18 + pop r1 + pop r0 +.endm + + +.macro hexdump length + push r27 + push r26 + ldi r25, '\r' + mov r24, r25 + call uart_putc + ldi r25, '\n' + mov r24, r25 + call uart_putc + pop r26 + pop r27 + movw r24, r26 +.if \length > 16 + ldi r22, lo8(16) + ldi r23, hi8(16) + push r27 + push r26 + call uart_hexdump + pop r26 + pop r27 + adiw r26, 16 + hexdump \length-16 +.else + ldi r22, lo8(\length) + ldi r23, hi8(\length) + call uart_hexdump +.endif +.endm + +.macro delay +/* + push r0 + push r1 + clr r0 +1: clr r1 +2: dec r1 + brne 2b + dec r0 + brne 1b + pop r1 + pop r0 // */ +.endm + +/* X points to Block */ +.macro dbg_hexdump length +/* + precall + hexdump \length + postcall + // */ +.endm + + + +.section .text + +SPL = 0x3D +SPH = 0x3E +SREG = 0x3F + + +; +;sha1_ctx_t is: +; +; [h0][h1][h2][h3][h4][length] +; hn is 32 bit large, length is 64 bit large + +;########################################################### + +.global sha1_ctx2hash +; === sha1_ctx2hash === +; this function converts a state into a normal hash (bytestring) +; param1: the 16-bit destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to sha1_ctx structure +; given in r23,r22 +sha1_ctx2hash: + movw r26, r22 + movw r30, r24 + ldi r21, 5 + sbiw r26, 4 +1: + ldi r20, 4 + adiw r26, 8 +2: + ld r0, -X + st Z+, r0 + dec r20 + brne 2b + + dec r21 + brne 1b + + ret + +;########################################################### + +.global sha1 +; === sha1 === +; this function calculates SHA-1 hashes from messages in RAM +; param1: the 16-bit hash destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to message +; given in r23,r22 +; param3: 32-bit length value (length of message in bits) +; given in r21,r20,r19,r18 +sha1: +sha1_prolog: + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r16 + push r17 + in r16, SPL + in r17, SPH + subi r16, 5*4+8 + sbci r17, 0 + in r0, SREG + cli + out SPL, r16 + out SPH, r17 + out SREG, r0 + + push r25 + push r24 + inc r16 + adc r17, r1 + + movw r8, r18 /* backup of length*/ + movw r10, r20 + + movw r12, r22 /* backup pf msg-ptr */ + + movw r24, r16 + rcall sha1_init + /* if length >= 512 */ +1: + tst r11 + brne 4f + tst r10 + brne 4f + mov r19, r9 + cpi r19, 0x02 + brlo 4f + + movw r24, r16 + movw r22, r12 + rcall sha1_nextBlock + ldi r19, 0x64 + add r22, r19 + adc r23, r1 + /* length -= 512 */ + ldi r19, 0x02 + sub r9, r19 + sbc r10, r1 + sbc r11, r1 + rjmp 1b + +4: + movw r24, r16 + movw r22, r12 + movw r20, r8 + rcall sha1_lastBlock + + pop r24 + pop r25 + movw r22, r16 + rcall sha1_ctx2hash + +sha1_epilog: + in r30, SPL + in r31, SPH + adiw r30, 5*4+8 + in r0, SREG + cli + out SPL, r30 + out SPH, r31 + out SREG, r0 + pop r17 + pop r16 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + ret + +;########################################################### + + +; block MUST NOT be larger than 64 bytes + +.global sha1_lastBlock +; === sha1_lastBlock === +; this function does padding & 
Co. for calculating SHA-1 hashes +; param1: the 16-bit pointer to sha1_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +; param3: an 16-bit integer specifing length of block in bits +; given in r21,r20 +sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1) + + +sha1_lastBlock: + cpi r21, 0x02 + brlo sha1_lastBlock_prolog + push r25 + push r24 + push r23 + push r22 + push r21 + push r20 + rcall sha1_nextBlock + pop r20 + pop r21 + pop r22 + pop r23 + pop r24 + pop r25 + subi r21, 2 + subi r23, -2 + rjmp sha1_lastBlock +sha1_lastBlock_prolog: + /* allocate space on stack */ + in r30, SPL + in r31, SPH + in r1, SREG + subi r30, lo8(64) + sbci r31, hi8(64) /* ??? */ + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + + adiw r30, 1 /* SP points to next free byte on stack */ + mov r18, r20 /* r20 = LSB(length) */ + lsr r18 + lsr r18 + lsr r18 + bst r21, 0 /* may be we should explain this ... */ + bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ + + + movw r26, r22 /* X points to begin of msg */ + tst r18 + breq sha1_lastBlock_post_copy + mov r1, r18 +sha1_lastBlock_copy_loop: + ld r0, X+ + st Z+, r0 + dec r1 + brne sha1_lastBlock_copy_loop +sha1_lastBlock_post_copy: +sha1_lastBlock_insert_stuffing_bit: + ldi r19, 0x80 + mov r0,r19 + ldi r19, 0x07 + and r19, r20 /* if we are in bitmode */ + breq 2f /* no bitmode */ +1: + lsr r0 + dec r19 + brne 1b + ld r19, X +/* maybe we should do some ANDing here, just for safety */ + or r0, r19 +2: + st Z+, r0 + inc r18 + +/* checking stuff here */ + cpi r18, 64-8+1 + brsh 0f + rjmp sha1_lastBlock_insert_zeros +0: + /* oh shit, we landed here */ + /* first we have to fill it up with zeros */ + ldi r19, 64 + sub r19, r18 + breq 2f +1: + st Z+, r1 + dec r19 + brne 1b +2: + sbiw r30, 63 + sbiw r30, 1 + movw r22, r30 + + push r31 + push r30 + push r25 + push r24 + push r21 + push r20 + rcall sha1_nextBlock + pop r20 + pop r21 + pop r24 + pop r25 + pop r30 + pop r31 + + /* now we should subtract 512 from length */ + movw r26, r24 + adiw r26, 4*5+1 /* we can skip the lowest byte */ + ld r19, X + subi r19, hi8(512) + st X+, r19 + ldi r18, 6 +1: + ld r19, X + sbci r19, 0 + st X+, r19 + dec r18 + brne 1b + +; clr r18 /* not neccessary ;-) */ + /* reset Z pointer to begin of block */ + +sha1_lastBlock_insert_zeros: + ldi r19, 64-8 + sub r19, r18 + breq sha1_lastBlock_insert_length + clr r1 +1: + st Z+, r1 /* r1 is still zero */ + dec r19 + brne 1b + +; rjmp sha1_lastBlock_epilog +sha1_lastBlock_insert_length: + movw r26, r24 /* X points to state */ + adiw r26, 5*4 /* X points to (state.length) */ + adiw r30, 8 /* Z points one after the last byte of block */ + ld r0, X+ + add r0, r20 + st -Z, r0 + ld r0, X+ + adc r0, r21 + st -Z, r0 + ldi r19, 6 +1: + ld r0, X+ + adc r0, r1 + st -Z, r0 + dec r19 + brne 1b + + sbiw r30, 64-8 + movw r22, r30 + rcall sha1_nextBlock + +sha1_lastBlock_epilog: + in r30, SPL + in r31, SPH + in r1, SREG + adiw r30, 63 ; lo8(64) + adiw r30, 1 ; hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + clr r1 + clr r0 + ret + +/**/ +;########################################################### + +.global sha1_nextBlock +; === sha1_nextBlock === +; this is the core function for calculating SHA-1 hashes +; param1: the 16-bit pointer to sha1_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit 
values for a array (total 84 byte) + +xtmp = 0 +xNULL = 1 +W1 = 10 +W2 = 11 +T1 = 12 +T2 = 13 +T3 = 14 +T4 = 15 +LoopC = 16 +S = 17 +tmp1 = 18 +tmp2 = 19 +tmp3 = 20 +tmp4 = 21 +F1 = 22 +F2 = 23 +F3 = 24 +F4 = 25 + +/* byteorder: high number <--> high significance */ +sha1_nextBlock: + ; initial, let's make some space ready for local vars + /* replace push & pop by mem ops? */ + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + in r20, SPL + in r21, SPH + movw r18, r20 ;backup SP +; movw r26, r20 ; X points to free space on stack /* maybe removeable? */ + movw r30, r22 ; Z points to message + subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63 + sbci r21, hi8(sha1_nextBlock_localSpace) + movw r26, r20 ; X points to free space on stack + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + + push r18 + push r19 /* push old SP on new stack */ + push r24 + push r25 /* param1 will be needed later */ + + /* load a[] with state */ + movw 28, r24 /* load pointer to state in Y */ + adiw r26, 1 ; X++ + + ldi LoopC, 5*4 +1: ld tmp1, Y+ + st X+, tmp1 + dec LoopC + brne 1b + + movw W1, r26 /* save pointer to w[0] */ + /* load w[] with endian fixed message */ + /* we might also use the changeendian32() function at bottom */ + movw r30, r22 /* mv param2 (ponter to msg) to Z */ + ldi LoopC, 16 +1: + ldd tmp1, Z+3 + st X+, tmp1 + ldd tmp1, Z+2 + st X+, tmp1 + ldd tmp1, Z+1 + st X+, tmp1 + ld tmp1, Z + st X+, tmp1 + adiw r30, 4 + dec LoopC + brne 1b + + ;clr LoopC /* LoopC is named t in FIPS 180-2 */ + clr xtmp +sha1_nextBlock_mainloop: + mov S, LoopC + lsl S + lsl S + andi S, 0x3C /* S is a bytepointer so *4 */ + /* load w[s] */ + movw r26, W1 + add r26, S /* X points at w[s] */ + adc r27, xNULL + ld T1, X+ + ld T2, X+ + ld T3, X+ + ld T4, X+ + + /**/ + push r26 + push r27 + push T4 + push T3 + push T2 + push T1 + in r26, SPL + in r27, SPH + adiw r26, 1 + dbg_hexdump 4 + pop T1 + pop T2 + pop T3 + pop T4 + pop r27 + pop r26 + /**/ + + cpi LoopC, 16 + brlt sha1_nextBlock_mainloop_core + /* update w[s] */ + ldi tmp1, 2*4 + rcall 1f + ldi tmp1, 8*4 + rcall 1f + ldi tmp1, 13*4 + rcall 1f + rjmp 2f +1: /* this might be "outsourced" to save the jump above */ + add tmp1, S + andi tmp1, 0x3f + movw r26, W1 + add r26, tmp1 + adc r27, xNULL + ld tmp2, X+ + eor T1, tmp2 + ld tmp2, X+ + eor T2, tmp2 + ld tmp2, X+ + eor T3, tmp2 + ld tmp2, X+ + eor T4, tmp2 + ret +2: /* now we just hav to do a ROTL(T) and save T back */ + mov tmp2, T4 + rol tmp2 + rol T1 + rol T2 + rol T3 + rol T4 + movw r26, W1 + add r26, S + adc r27, xNULL + st X+, T1 + st X+, T2 + st X+, T3 + st X+, T4 + +sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/ + /* T already contains w[s] */ + movw r26, W1 + sbiw r26, 4*1 /* X points at a[4] aka e */ + ld tmp1, X+ + add T1, tmp1 + ld tmp1, X+ + adc T2, tmp1 + ld tmp1, X+ + adc T3, tmp1 + ld tmp1, X+ + adc T4, tmp1 /* T = w[s]+e */ + sbiw r26, 4*5 /* X points at a[0] aka a */ + ld F1, X+ + ld F2, X+ + ld F3, X+ + ld F4, X+ + mov tmp1, F4 /* X points at a[1] aka b */ + ldi tmp2, 5 +1: + rol tmp1 + rol F1 + rol F2 + rol F3 + rol F4 + dec tmp2 + brne 1b + + add T1, F1 + adc T2, F2 + adc T3, F3 + adc T4, F4 /* T = ROTL(a,5) + e + w[s] */ + + /* now we have to do this fucking conditional stuff */ + ldi r30, lo8(sha1_nextBlock_xTable) + ldi r31, hi8(sha1_nextBlock_xTable) + add r30, xtmp + adc r31, xNULL + lpm tmp1, Z + cp tmp1, LoopC + brne 1f + inc xtmp 
+1: ldi r30, lo8(sha1_nextBlock_KTable) + ldi r31, hi8(sha1_nextBlock_KTable) + lsl xtmp + lsl xtmp + add r30, xtmp + adc r31, xNULL + lsr xtmp + lsr xtmp + + lpm tmp1, Z+ + add T1, tmp1 + lpm tmp1, Z+ + adc T2, tmp1 + lpm tmp1, Z+ + adc T3, tmp1 + lpm tmp1, Z+ + adc T4, tmp1 + /* T = ROTL(a,5) + e + kt + w[s] */ + + /* Z-4 is just pointing to kt ... */ + movw r28, r26 /* copy X in Y */ + adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */ + lsr r31 + ror r30 + + icall + mov F1, tmp1 + icall + mov F2, tmp1 + icall + mov F3, tmp1 + icall + + add T1, F1 + adc T2, F2 + adc T3, F3 + adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */ + /* X points still at a[1] aka b, Y points at a[2] aka c */ + /* update a[] */ +sha1_nextBlock_update_a: + /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/ + //adiw r28, 3*4 /* Y should point at a[4] aka e */ + movw r28, W1 + sbiw r28, 4 + + ldi tmp2, 4*4 +1: + ld tmp1, -Y + std Y+4, tmp1 + dec tmp2 + brne 1b + /* Y points at a[0] aka a*/ + + movw r28, W1 + sbiw r28, 5*4 + /* store T in a[0] aka a */ + st Y+, T1 + st Y+, T2 + st Y+, T3 + st Y+, T4 + /* Y points at a[1] aka b*/ + + /* rotate c */ + ldd T1, Y+1*4 + ldd T2, Y+1*4+1 + ldd T3, Y+1*4+2 + ldd T4, Y+1*4+3 + mov tmp1, T1 + ldi tmp2, 2 +1: ror tmp1 + ror T4 + ror T3 + ror T2 + ror T1 + dec tmp2 + brne 1b + std Y+1*4+0, T1 + std Y+1*4+1, T2 + std Y+1*4+2, T3 + std Y+1*4+3, T4 + + push r27 + push r26 + movw r26, W1 + sbiw r26, 4*5 + dbg_hexdump 4*5 + pop r26 + pop r27 + + inc LoopC + cpi LoopC, 80 + brge 1f + rjmp sha1_nextBlock_mainloop +/**************************************/ +1: + /* littel patch */ + sbiw r28, 4 + +/* add a[] to state and inc length */ + pop r27 + pop r26 /* now X points to state (and Y still at a[0]) */ + ldi tmp4, 5 +1: clc + ldi tmp3, 4 +2: ld tmp1, X + ld tmp2, Y+ + adc tmp1, tmp2 + st X+, tmp1 + dec tmp3 + brne 2b + dec tmp4 + brne 1b + + /* now length += 512 */ + adiw r26, 1 /* we skip the least significant byte */ + ld tmp1, X + ldi tmp2, hi8(512) /* 2 */ + add tmp1, tmp2 + st X+, tmp1 + ldi tmp2, 6 +1: + ld tmp1, X + adc tmp1, xNULL + st X+, tmp1 + dec tmp2 + brne 1b + +; EPILOG +sha1_nextBlock_epilog: +/* now we should clean up the stack */ + pop r21 + pop r20 + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + + clr r1 + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret + +sha1_nextBlock_xTable: +.byte 20,40,60,0 +sha1_nextBlock_KTable: +.int 0x5a827999 +.int 0x6ed9eba1 +.int 0x8f1bbcdc +.int 0xca62c1d6 +sha1_nextBlock_JumpTable: +rjmp sha1_nextBlock_Ch + nop +rjmp sha1_nextBlock_Parity + nop +rjmp sha1_nextBlock_Maj + nop +rjmp sha1_nextBlock_Parity + + /* X and Y still point at a[1] aka b ; return value in tmp1 */ +sha1_nextBlock_Ch: + ld tmp1, Y+ + mov tmp2, tmp1 + com tmp2 + ldd tmp3, Y+3 /* load from c */ + and tmp1, tmp3 + ldd tmp3, Y+7 /* load from d */ + and tmp2, tmp3 + eor tmp1, tmp2 + ret + +sha1_nextBlock_Maj: + ld tmp1, Y+ + mov tmp2, tmp1 + ldd tmp3, Y+3 /* load from c */ + and tmp1, tmp3 + ldd tmp4, Y+7 /* load from d */ + and tmp2, tmp4 + eor tmp1, tmp2 + and tmp3, tmp4 + eor tmp1, tmp3 + ret + +sha1_nextBlock_Parity: + ld tmp1, Y+ + ldd tmp2, Y+3 /* load from c */ + eor tmp1, tmp2 + ldd tmp2, Y+7 /* load from d */ + eor tmp1, tmp2 + ret +/* +ch_str: .asciz "\r\nCh" +maj_str: .asciz "\r\nMaj" +parity_str: .asciz "\r\nParity" +*/ +;########################################################### + 
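
The jump-table dispatch above (sha1_nextBlock_JumpTable selecting Ch, Parity, Maj, Parity) together with the xTable/KTable pair implements the per-20-round choice of f_t and K_t from FIPS 180-2, and the w[s] update earlier in the loop is the ROTL1 recurrence computed on a 16-word circular buffer. A plain-C reference of the same selection, schedule and round follows; it is a model, not the generated code, and sha1_f / sha1_k / sha1_schedule / sha1_round are illustrative names.

#include <stdint.h>

#define ROTL32(x,n) (((x) << (n)) | ((x) >> (32 - (n))))

/* round-dependent constants, as in sha1_nextBlock_KTable */
static const uint32_t sha1_k[4] = {
    0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
};

/* f_t: Ch for t < 20, Parity for 20..39, Maj for 40..59, Parity again for 60..79 */
uint32_t sha1_f(uint8_t t, uint32_t b, uint32_t c, uint32_t d){
    switch(t / 20){
    case 0:  return (b & c) ^ (~b & d);            /* Ch     */
    case 2:  return (b & c) ^ (b & d) ^ (c & d);   /* Maj    */
    default: return b ^ c ^ d;                     /* Parity */
    }
}

/* message schedule on a 16-word circular buffer, as in the w[s] update above:
 * w[t] = ROTL1(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16]); with indices taken mod 16
 * the taps t-3, t-8 and t-14 become the +13, +8 and +2 offsets used in the code */
uint32_t sha1_schedule(uint32_t w[16], uint8_t t){
    if(t >= 16){
        uint32_t x = w[t % 16] ^ w[(t + 13) % 16] ^ w[(t + 8) % 16] ^ w[(t + 2) % 16];
        w[t % 16] = ROTL32(x, 1);
    }
    return w[t % 16];
}

/* one round, t = 0..79; a[0..4] correspond to a,b,c,d,e */
void sha1_round(uint32_t a[5], uint32_t w[16], uint8_t t){
    uint32_t T = ROTL32(a[0], 5) + sha1_f(t, a[1], a[2], a[3]) + a[4]
               + sha1_k[t / 20] + sha1_schedule(w, t);
    a[4] = a[3];
    a[3] = a[2];
    a[2] = ROTL32(a[1], 30);   /* the "rotate c" step above (rotr 2 == rotl 30) */
    a[1] = a[0];
    a[0] = T;
}
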
+.global sha1_init +;void sha1_init(sha1_ctx_t *state){ +; DEBUG_S("\r\nSHA1_INIT"); +; state->h[0] = 0x67452301; +; state->h[1] = 0xefcdab89; +; state->h[2] = 0x98badcfe; +; state->h[3] = 0x10325476; +; state->h[4] = 0xc3d2e1f0; +; state->length = 0; +;} +; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram +; modifys: Z(r30,r31), Func1, r22 +sha1_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8((sha1_init_vector)) + ldi r31, hi8((sha1_init_vector)) + ldi r22, 5*4 /* bytes to copy */ +sha1_init_vloop: + lpm r23, Z+ + st X+, r23 + dec r22 + brne sha1_init_vloop + ldi r22, 8 +sha1_init_lloop: + st X+, r1 + dec r22 + brne sha1_init_lloop + ret + +sha1_init_vector: +.int 0x67452301; +.int 0xefcdab89; +.int 0x98badcfe; +.int 0x10325476; +.int 0xc3d2e1f0; + diff --git a/shacal1/sha1.h b/shacal1/sha1.h new file mode 100644 index 0000000..6675d20 --- /dev/null +++ b/shacal1/sha1.h @@ -0,0 +1,117 @@ +/* sha1.h */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/** + * \file sha1.h + * \author Daniel Otte + * \email daniel.otte@rub.de + * \date 2006-10-08 + * \license GPLv3 or later + * \brief SHA-1 declaration. + * \ingroup SHA-1 + * + */ + +#ifndef SHA1_H_ +#define SHA1_H_ + +#include +/** \def SHA1_HASH_BITS + * definees the size of a SHA-1 hash in bits + */ + +/** \def SHA1_HASH_BYTES + * definees the size of a SHA-1 hash in bytes + */ + +/** \def SHA1_BLOCK_BITS + * definees the size of a SHA-1 input block in bits + */ + +/** \def SHA1_BLOCK_BYTES + * definees the size of a SHA-1 input block in bytes + */ +#define SHA1_HASH_BITS 160 +#define SHA1_HASH_BYTES (SHA1_HASH_BITS/8) +#define SHA1_BLOCK_BITS 512 +#define SHA1_BLOCK_BYTES (SHA1_BLOCK_BITS/8) + +/** \typedef sha1_ctx_t + * \brief SHA-1 context type + * + * A vatiable of this type may hold the state of a SHA-1 hashing process + */ +typedef struct { + uint32_t h[5]; + uint64_t length; +} sha1_ctx_t; + +/** \typedef sha1_hash_t + * \brief hash value type + * A variable of this type may hold a SHA-1 hash value + */ +typedef uint8_t sha1_hash_t[SHA1_HASH_BITS/8]; + +/** \fn sha1_init(sha1_ctx_t *state) + * \brief initializes a SHA-1 context + * This function sets a ::sha1_ctx_t variable to the initialization vector + * for SHA-1 hashing. 
+ * \param state pointer to the SHA-1 context variable
+ */
+void sha1_init(sha1_ctx_t *state);
+
+/** \fn sha1_nextBlock(sha1_ctx_t *state, const void* block)
+ * \brief process one input block
+ * This function processes one input block and updates the hash context
+ * accordingly.
+ * \param state pointer to the state variable to update
+ * \param block pointer to the message block to process
+ */
+void sha1_nextBlock (sha1_ctx_t *state, const void* block);
+
+/** \fn sha1_lastBlock(sha1_ctx_t *state, const void* block, uint16_t length_b)
+ * \brief processes the given block and finalizes the context
+ * This function processes the last block in a SHA-1 hashing process.
+ * The block should have a maximum length of a single input block.
+ * \param state pointer to the state variable to update and finalize
+ * \param block pointer to the message block to process
+ * \param length_b length of the message block in bits
+ */
+void sha1_lastBlock (sha1_ctx_t *state, const void* block, uint16_t length_b);
+
+/** \fn sha1_ctx2hash(sha1_hash_t *dest, sha1_ctx_t *state)
+ * \brief convert a state variable into an actual hash value
+ * Writes the hash value corresponding to the state to the memory pointed by dest.
+ * \param dest pointer to the hash value destination
+ * \param state pointer to the hash context
+ */
+void sha1_ctx2hash (sha1_hash_t *dest, sha1_ctx_t *state);
+
+/** \fn sha1(sha1_hash_t *dest, const void* msg, uint32_t length_b)
+ * \brief hashing a message which is located entirely in RAM
+ * This function automatically hashes a message which is entirely in RAM with
+ * the SHA-1 hashing algorithm.
+ * \param dest pointer to the hash value destination
+ * \param msg pointer to the message which should be hashed
+ * \param length_b length of the message in bits
+ */
+void sha1(sha1_hash_t *dest, const void* msg, uint32_t length_b);
+
+
+
+#endif /*SHA1_H_*/
diff --git a/shacal1_enc.c b/shacal1/shacal1_enc.c
similarity index 100%
rename from shacal1_enc.c
rename to shacal1/shacal1_enc.c
diff --git a/shacal1_enc.h b/shacal1/shacal1_enc.h
similarity index 100%
rename from shacal1_enc.h
rename to shacal1/shacal1_enc.h
diff --git a/shacal2/sha256-asm.S b/shacal2/sha256-asm.S
new file mode 100644
index 0000000..d9eb6b6
--- /dev/null
+++ b/shacal2/sha256-asm.S
@@ -0,0 +1,1042 @@
+/* sha256-asm.S */
+/*
+    This file is part of the AVR-Crypto-Lib.
+    Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/ +/* + * Author: Daniel Otte + * + * License: GPLv3 or later +*/ +; sha-256 implementation in assembler +SHA256_BLOCK_BITS = 512 +SHA256_HASH_BITS = 256 + +.macro precall + /* push r18 - r27, r30 - r31*/ + push r0 + push r1 + push r18 + push r19 + push r20 + push r21 + push r22 + push r23 + push r24 + push r25 + push r26 + push r27 + push r30 + push r31 + clr r1 +.endm + +.macro postcall + pop r31 + pop r30 + pop r27 + pop r26 + pop r25 + pop r24 + pop r23 + pop r22 + pop r21 + pop r20 + pop r19 + pop r18 + pop r1 + pop r0 +.endm + + +.macro hexdump length + push r27 + push r26 + ldi r25, '\r' + mov r24, r25 + call uart_putc + ldi r25, '\n' + mov r24, r25 + call uart_putc + pop r26 + pop r27 + movw r24, r26 +.if \length > 16 + ldi r22, lo8(16) + ldi r23, hi8(16) + push r27 + push r26 + call uart_hexdump + pop r26 + pop r27 + adiw r26, 16 + hexdump \length-16 +.else + ldi r22, lo8(\length) + ldi r23, hi8(\length) + call uart_hexdump +.endif +.endm + +/* X points to Block */ +.macro dbg_hexdump length + precall + hexdump \length + postcall +.endm + +.section .text + +SPL = 0x3D +SPH = 0x3E +SREG = 0x3F + + +; +;sha256_ctx_t is: +; +; [h0][h1][h2][h3][h4][h5][h6][h7][length] +; hn is 32 bit large, length is 64 bit large + +;########################################################### + +.global sha256_ctx2hash +; === sha256_ctx2hash === +; this function converts a state into a normal hash (bytestring) +; param1: the 16-bit destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to sha256_ctx structure +; given in r23,r22 +sha256_ctx2hash: + movw r26, r22 + movw r30, r24 + ldi r21, 8 + sbiw r26, 4 +1: + ldi r20, 4 + adiw r26, 8 +2: + ld r0, -X + st Z+, r0 + dec r20 + brne 2b + + dec r21 + brne 1b + + ret + +;########################################################### + +.global sha256 +; === sha256 === +; this function calculates SHA-256 hashes from messages in RAM +; param1: the 16-bit hash destination pointer +; given in r25,r24 (r25 is most significant) +; param2: the 16-bit pointer to message +; given in r23,r22 +; param3: 32-bit length value (length of message in bits) +; given in r21,r20,r19,r18 +sha256: +sha256_prolog: + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r16 + push r17 + in r16, SPL + in r17, SPH + subi r16, 8*4+8 + sbci r17, 0 + in r0, SREG + cli + out SPL, r16 + out SPH, r17 + out SREG, r0 + + push r25 + push r24 + inc r16 + adc r17, r1 + + movw r8, r18 /* backup of length*/ + movw r10, r20 + + movw r12, r22 /* backup pf msg-ptr */ + + movw r24, r16 + rcall sha256_init + /* if length >= 512 */ +1: + tst r11 + brne 4f + tst r10 + brne 4f + mov r19, r9 + cpi r19, 0x02 + brlo 4f + + movw r24, r16 + movw r22, r12 + rcall sha256_nextBlock + ldi r19, 0x64 + add r22, r19 + adc r23, r1 + /* length -= 512 */ + ldi r19, 0x02 + sub r9, r19 + sbc r10, r1 + sbc r11, r1 + rjmp 1b + +4: + movw r24, r16 + movw r22, r12 + movw r20, r8 + rcall sha256_lastBlock + + pop r24 + pop r25 + movw r22, r16 + rcall sha256_ctx2hash + +sha256_epilog: + in r30, SPL + in r31, SPH + adiw r30, 8*4+8 + in r0, SREG + cli + out SPL, r30 + out SPH, r31 + out SREG, r0 + pop r17 + pop r16 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + ret + +;########################################################### + + +; block MUST NOT be larger than 64 bytes + +.global sha256_lastBlock +; === sha256_lastBlock === +; this function does padding & Co. 
for calculating SHA-256 hashes +; param1: the 16-bit pointer to sha256_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +; param3: an 16-bit integer specifing length of block in bits +; given in r21,r20 +sha256_lastBlock_localSpace = (SHA256_BLOCK_BITS/8+1) + + +sha256_lastBlock: + cpi r21, 0x02 + brlo sha256_lastBlock_prolog + push r25 + push r24 + push r23 + push r22 + push r21 + push r20 + rcall sha256_nextBlock + pop r20 + pop r21 + pop r22 + pop r23 + pop r24 + pop r25 + subi r21, 0x02 + subi r23, -2 + rjmp sha256_lastBlock +sha256_lastBlock_prolog: + /* allocate space on stack */ + in r30, SPL + in r31, SPH + in r1, SREG + subi r30, lo8(64) + sbci r31, hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + + adiw r30, 1 /* SP points to next free byte on stack */ + mov r18, r20 /* r20 = LSB(length) */ + lsr r18 + lsr r18 + lsr r18 + bst r21, 0 /* may be we should explain this ... */ + bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */ + + + movw r26, r22 /* X points to begin of msg */ + tst r18 + breq sha256_lastBlock_post_copy + mov r1, r18 +sha256_lastBlock_copy_loop: + ld r0, X+ + st Z+, r0 + dec r1 + brne sha256_lastBlock_copy_loop +sha256_lastBlock_post_copy: +sha256_lastBlock_insert_stuffing_bit: + ldi r19, 0x80 + mov r0,r19 + ldi r19, 0x07 + and r19, r20 /* if we are in bitmode */ + breq 2f /* no bitmode */ +1: + lsr r0 + dec r19 + brne 1b + ld r19, X +/* maybe we should do some ANDing here, just for safety */ + or r0, r19 +2: + st Z+, r0 + inc r18 + +/* checking stuff here */ + cpi r18, 64-8+1 + brsh 0f + rjmp sha256_lastBlock_insert_zeros +0: + /* oh shit, we landed here */ + /* first we have to fill it up with zeros */ + ldi r19, 64 + sub r19, r18 + breq 2f +1: + st Z+, r1 + dec r19 + brne 1b +2: + sbiw r30, 63 + sbiw r30, 1 + movw r22, r30 + + push r31 + push r30 + push r25 + push r24 + push r21 + push r20 + rcall sha256_nextBlock + pop r20 + pop r21 + pop r24 + pop r25 + pop r30 + pop r31 + + /* now we should subtract 512 from length */ + movw r26, r24 + adiw r26, 4*8+1 /* we can skip the lowest byte */ + ld r19, X + subi r19, hi8(512) + st X+, r19 + ldi r18, 6 +1: + ld r19, X + sbci r19, 0 + st X+, r19 + dec r18 + brne 1b + +; clr r18 /* not neccessary ;-) */ + /* reset Z pointer to begin of block */ + +sha256_lastBlock_insert_zeros: + ldi r19, 64-8 + sub r19, r18 + breq sha256_lastBlock_insert_length + clr r1 +1: + st Z+, r1 /* r1 is still zero */ + dec r19 + brne 1b + +; rjmp sha256_lastBlock_epilog +sha256_lastBlock_insert_length: + movw r26, r24 /* X points to state */ + adiw r26, 8*4 /* X points to (state.length) */ + adiw r30, 8 /* Z points one after the last byte of block */ + ld r0, X+ + add r0, r20 + st -Z, r0 + ld r0, X+ + adc r0, r21 + st -Z, r0 + ldi r19, 6 +1: + ld r0, X+ + adc r0, r1 + st -Z, r0 + dec r19 + brne 1b + + sbiw r30, 64-8 + movw r22, r30 + rcall sha256_nextBlock + +sha256_lastBlock_epilog: + in r30, SPL + in r31, SPH + in r1, SREG + adiw r30, 63 ; lo8(64) + adiw r30, 1 ; hi8(64) + cli + out SPL, r30 + out SPH, r31 + out SREG,r1 + clr r1 + clr r0 + ret + +/**/ +;########################################################### + +.global sha256_nextBlock +; === sha256_nextBlock === +; this is the core function for calculating SHA-256 hashes +; param1: the 16-bit pointer to sha256_ctx structure +; given in r25,r24 (r25 is most significant) +; param2: an 16-bit pointer to 64 byte block to hash +; given in r23,r22 +sha256_nextBlock_localSpace = (64+8)*4 ; 64 
32-bit values for w array and 8 32-bit values for a array (total 288 byte) + +Bck1 = 12 +Bck2 = 13 +Bck3 = 14 +Bck4 = 15 +Func1 = 22 +Func2 = 23 +Func3 = 24 +Func4 = 25 +Accu1 = 16 +Accu2 = 17 +Accu3 = 18 +Accu4 = 19 +XAccu1 = 8 +XAccu2 = 9 +XAccu3 = 10 +XAccu4 = 11 +T1 = 4 +T2 = 5 +T3 = 6 +T4 = 7 +LoopC = 1 +/* byteorder: high number <--> high significance */ +sha256_nextBlock: + ; initial, let's make some space ready for local vars + push r4 /* replace push & pop by mem ops? */ + push r5 + push r6 + push r7 + push r8 + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push r16 + push r17 + push r28 + push r29 + in r20, SPL + in r21, SPH + movw r18, r20 ;backup SP +; movw r26, r20 ; X points to free space on stack + movw r30, r22 ; Z points to message + subi r20, lo8(sha256_nextBlock_localSpace) ;sbiw can do only up to 63 + sbci r21, hi8(sha256_nextBlock_localSpace) + movw r26, r20 ; X points to free space on stack + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + push r18 + push r19 + push r24 + push r25 /* param1 will be needed later */ + ; now we fill the w array with message (think about endianess) + adiw r26, 1 ; X++ + ldi r20, 16 +sha256_nextBlock_wcpyloop: + ld r23, Z+ + ld r22, Z+ + ld r19, Z+ + ld r18, Z+ + st X+, r18 + st X+, r19 + st X+, r22 + st X+, r23 + dec r20 + brne sha256_nextBlock_wcpyloop +/* for (i=16; i<64; ++i){ + w[i] = SIGMA_b(w[i-2]) + w[i-7] + SIGMA_a(w[i-15]) + w[i-16]; + } */ + /* r25,r24,r23,r24 (r21,r20) are function values + r19,r18,r17,r16 are the accumulator + r15,r14,r13,rBck1 are backup1 + r11,r10,r9 ,r8 are xor accu + r1 is round counter */ + + ldi r20, 64-16 + mov LoopC, r20 +sha256_nextBlock_wcalcloop: + movw r30, r26 ; cp X to Z + sbiw r30, 63 + sbiw r30, 1 ; substract 64 = 16*4 + ld Accu1, Z+ + ld Accu2, Z+ + ld Accu3, Z+ + ld Accu4, Z+ /* w[i] = w[i-16] */ + ld Bck1, Z+ + ld Bck2, Z+ + ld Bck3, Z+ + ld Bck4, Z+ /* backup = w[i-15] */ + /* now sigma 0 */ + mov Func1, Bck2 + mov Func2, Bck3 + mov Func3, Bck4 + mov Func4, Bck1 /* prerotated by 8 */ + ldi r20, 1 + rcall bitrotl + movw XAccu1, Func1 + movw XAccu3, Func3 /* store ROTR(w[i-15],7) in xor accu */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-15], 18)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 3 /* now shr3 */ /*we can destroy backup now*/ +sigma0_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + ror Bck1 + dec Func2 + brne sigma0_shr + eor XAccu1, Bck1 + eor XAccu2, Bck2 + eor XAccu3, Bck3 + eor XAccu4, Bck4 /* xor SHR(w[i-15], 3)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + ldd Func1, Z+7*4 /* now accu += w[i-7] */ + ldd Func2, Z+7*4+1 + ldd Func3, Z+7*4+2 + ldd Func4, Z+7*4+3 + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + ldd Bck1, Z+12*4 /* now backup = w[i-2]*/ + ldd Bck2, Z+12*4+1 + ldd Bck3, Z+12*4+2 + ldd Bck4, Z+12*4+3 + /* now sigma 1 */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 1 + rcall bitrotr + movw XAccu3, Func3 + movw XAccu1, Func1 /* store in ROTR(w[i-2], 17) xor accu */ +; movw Func1, Bck3 +; movw Func3, Bck1 /* prerotated by 16 */ + ldi r20, 2 + rcall bitrotr + eor XAccu1, Func1 /* xor ROTR(w[i-2], 19)*/ + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + ldi Func2, 2 /* now shr10 (dirty trick, skipping a byte) */ 
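+	/* for reference, in C-like notation (ROTR/SHR on 32-bit words) this loop
+	 * computes the message schedule given above, with:
+	 *   SIGMA_a(x) = ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)
+	 *   SIGMA_b(x) = ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)
+	 * the byte-wise "prerotate" register moves realize the ROTRs, so only
+	 * rotations by one or two bits are left for bitrotl/bitrotr */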
/*we can destroy backup now*/ +sigma1_shr: + lsr Bck4 + ror Bck3 + ror Bck2 + dec Func2 + brne sigma1_shr + eor XAccu1, Bck2 + eor XAccu2, Bck3 + eor XAccu3, Bck4 /* xor SHR(w[i-2], 10)*/ /* xor accu == sigma1(w[i-15]) */ + add Accu1, XAccu1 + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 /* finished with sigma0 */ + /* now let's store the shit */ + st X+, Accu1 + st X+, Accu2 + st X+, Accu3 + st X+, Accu4 + dec LoopC + breq 3f ; skip if zero + rjmp sha256_nextBlock_wcalcloop +3: + /* we are finished with w array X points one byte post w */ +/* init a array */ + pop r31 + pop r30 + push r30 + push r31 + ldi r25, 8*4 /* 8 32-bit values to copy from ctx to a array */ +init_a_array: + ld r1, Z+ + st X+, r1 + dec r25 + brne init_a_array + +/* now the real fun begins */ +/* for (i=0; i<64; ++i){ + t1 = a[7] + SIGMA1(a[4]) + CH(a[4],a[5],a[6]) + k[i] + w[i]; + t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]); + memmove(&(a[1]), &(a[0]), 7*4); // a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; + a[4] += t1; + a[0] = t1 + t2; + } */ + /* Y points to a[0], Z ('cause lpm wants it) points to k[i], X points to w[i] */ + sbiw r26, 8*4 /* X still points at a[7]+1*/ + movw r28, r26 + ldi r30, lo8(sha256_kv) + ldi r31, hi8(sha256_kv) + dec r27 /* X - (64*4 == 256) */ + ldi r25, 64 + mov LoopC, r25 +sha256_main_loop: + /* now calculate t1 */ + /*CH(x,y,z) = (x&y)^((~x)&z)*/ + ldd T1, Y+5*4 + ldd T2, Y+5*4+1 + ldd T3, Y+5*4+2 + ldd T4, Y+5*4+3 /* y in T */ + ldd Func1, Y+4*4 + ldd Func2, Y+4*4+1 + ldd Func3, Y+4*4+2 + ldd Func4, Y+4*4+3 /* x in Func */ + ldd Bck1, Y+6*4 + ldd Bck2, Y+6*4+1 + ldd Bck3, Y+6*4+2 + ldd Bck4, Y+6*4+3 /* z in Bck */ + and T1, Func1 + and T2, Func2 + and T3, Func3 + and T4, Func4 + com Func1 + com Func2 + com Func3 + com Func4 + and Bck1, Func1 + and Bck2, Func2 + and Bck3, Func3 + and Bck4, Func4 + eor T1, Bck1 + eor T2, Bck2 + eor T3, Bck3 + eor T4, Bck4 /* done, CH(x,y,z) is in T */ + /* now SIGMA1(a[4]) */ + ldd Bck4, Y+4*4 /* think about using it from Func reg above*/ + ldd Bck1, Y+4*4+1 + ldd Bck2, Y+4*4+2 + ldd Bck3, Y+4*4+3 /* load prerotate by 8-bit */ + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotl /* rotr(x,6) */ + movw XAccu1, Func1 + movw XAccu3, Func3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 3 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 + movw Func1, Bck3 /* this prerotates furteh 16 bits*/ + movw Func3, Bck1 /* so we have now prerotated by 24 bits*/ + ldi r20, 1 + rcall bitrotr /* rotr(x,11) */ + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* finished with SIGMA1, add it to T */ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 + /* now we've to add a[7], w[i] and k[i] */ + ldd XAccu1, Y+4*7 + ldd XAccu2, Y+4*7+1 + ldd XAccu3, Y+4*7+2 + ldd XAccu4, Y+4*7+3 + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add a[7] */ + ld XAccu1, X+ + ld XAccu2, X+ + ld XAccu3, X+ + ld XAccu4, X+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add w[i] */ + lpm XAccu1, Z+ + lpm XAccu2, Z+ + lpm XAccu3, Z+ + lpm XAccu4, Z+ + add T1, XAccu1 + adc T2, XAccu2 + adc T3, XAccu3 + adc T4, XAccu4 /* add k[i] */ /* finished with t1 */ + /*now t2 = SIGMA0(a[0]) + MAJ(a[0],a[1],a[2]) */ /*i did to much x86 asm, i always see 4 32bit regs*/ + /* starting with MAJ(x,y,z) */ + ldd Func1, Y+4*0+0 + ldd Func2, Y+4*0+1 + ldd Func3, Y+4*0+2 + ldd Func4, Y+4*0+3 /* load x=a[0] */ + 
ldd XAccu1, Y+4*1+0 + ldd XAccu2, Y+4*1+1 + ldd XAccu3, Y+4*1+2 + ldd XAccu4, Y+4*1+3 /* load y=a[1] */ + and XAccu1, Func1 + and XAccu2, Func2 + and XAccu3, Func3 + and XAccu4, Func4 /* XAccu == (x & y) */ + ldd Bck1, Y+4*2+0 + ldd Bck2, Y+4*2+1 + ldd Bck3, Y+4*2+2 + ldd Bck4, Y+4*2+3 /* load z=a[2] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == (x & y) ^ (x & z) */ + ldd Func1, Y+4*1+0 + ldd Func2, Y+4*1+1 + ldd Func3, Y+4*1+2 + ldd Func4, Y+4*1+3 /* load y=a[1] */ + and Func1, Bck1 + and Func2, Bck2 + and Func3, Bck3 + and Func4, Bck4 + eor XAccu1, Func1 + eor XAccu2, Func2 + eor XAccu3, Func3 + eor XAccu4, Func4 /* XAccu == Maj(x,y,z) == (x & y) ^ (x & z) ^ (y & z) */ + /* SIGMA0(a[0]) */ + ldd Bck1, Y+4*0+0 /* we should combine this with above */ + ldd Bck2, Y+4*0+1 + ldd Bck3, Y+4*0+2 + ldd Bck4, Y+4*0+3 + movw Func1, Bck1 + movw Func3, Bck3 + ldi r20, 2 + rcall bitrotr + movw Accu1, Func1 + movw Accu3, Func3 /* Accu = shr(a[0], 2) */ + movw Func1, Bck3 + movw Func3, Bck1 /* prerotate by 16 bits */ + ldi r20, 3 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 13) */ + mov Func1, Bck4 + mov Func2, Bck1 + mov Func3, Bck2 + mov Func4, Bck3 /* prerotate by 24 bits */ + ldi r20, 2 + rcall bitrotl + eor Accu1, Func1 + eor Accu2, Func2 + eor Accu3, Func3 + eor Accu4, Func4 /* Accu ^= shr(a[0], 22) */ + add Accu1, XAccu1 /* add previous result (MAJ)*/ + adc Accu2, XAccu2 + adc Accu3, XAccu3 + adc Accu4, XAccu4 + /* now we are finished with the computing stuff (t1 in T, t2 in Accu)*/ + /* a[7]=a[6]; a[6]=a[5]; a[5]=a[4]; a[4]=a[3]; a[3]=a[2]; a[2]=a[1]; a[1]=a[0]; */ + + ldi r21, 7*4 + adiw r28, 7*4 +a_shift_loop: + ld r25, -Y /* warning: this is PREdecrement */ + std Y+4, r25 + dec r21 + brne a_shift_loop + + ldd Bck1, Y+4*4+0 + ldd Bck2, Y+4*4+1 + ldd Bck3, Y+4*4+2 + ldd Bck4, Y+4*4+3 + add Bck1, T1 + adc Bck2, T2 + adc Bck3, T3 + adc Bck4, T4 + std Y+4*4+0, Bck1 + std Y+4*4+1, Bck2 + std Y+4*4+2, Bck3 + std Y+4*4+3, Bck4 + add Accu1, T1 + adc Accu2, T2 + adc Accu3, T3 + adc Accu4, T4 + std Y+4*0+0, Accu1 + std Y+4*0+1, Accu2 + std Y+4*0+2, Accu3 + std Y+4*0+3, Accu4 /* a array updated */ + + + dec LoopC + breq update_state + rjmp sha256_main_loop ;brne sha256_main_loop +update_state: + /* update state */ + /* pointers to state should still exist on the stack ;-) */ + pop r31 + pop r30 + ldi r21, 8 +update_state_loop: + ldd Accu1, Z+0 + ldd Accu2, Z+1 + ldd Accu3, Z+2 + ldd Accu4, Z+3 + ld Func1, Y+ + ld Func2, Y+ + ld Func3, Y+ + ld Func4, Y+ + add Accu1, Func1 + adc Accu2, Func2 + adc Accu3, Func3 + adc Accu4, Func4 + st Z+, Accu1 + st Z+, Accu2 + st Z+, Accu3 + st Z+, Accu4 + dec r21 + brne update_state_loop + /* now we just have to update the length */ + adiw r30, 1 /* since we add 512, we can simply skip the LSB */ + ldi r21, 2 + ldi r22, 6 + ld r20, Z + add r20, r21 + st Z+, r20 + clr r21 +sha256_nextBlock_fix_length: + brcc sha256_nextBlock_epilog + ld r20, Z + adc r20, r21 + st Z+, r20 + dec r22 + brne sha256_nextBlock_fix_length + +; EPILOG +sha256_nextBlock_epilog: +/* now we should clean up the stack */ + + pop r21 + pop r20 + in r0, SREG + cli ; we want to be uninterrupted while updating SP + out SPL, r20 + out SPH, r21 + out SREG, r0 + + clr r1 + pop r29 + pop r28 + pop r17 + pop r16 + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop r8 + pop r7 + pop r6 + pop r5 + pop r4 + 
ret + +sha256_kv: ; round-key-vector stored in ProgMem +.word 0x2f98, 0x428a, 0x4491, 0x7137, 0xfbcf, 0xb5c0, 0xdba5, 0xe9b5, 0xc25b, 0x3956, 0x11f1, 0x59f1, 0x82a4, 0x923f, 0x5ed5, 0xab1c +.word 0xaa98, 0xd807, 0x5b01, 0x1283, 0x85be, 0x2431, 0x7dc3, 0x550c, 0x5d74, 0x72be, 0xb1fe, 0x80de, 0x06a7, 0x9bdc, 0xf174, 0xc19b +.word 0x69c1, 0xe49b, 0x4786, 0xefbe, 0x9dc6, 0x0fc1, 0xa1cc, 0x240c, 0x2c6f, 0x2de9, 0x84aa, 0x4a74, 0xa9dc, 0x5cb0, 0x88da, 0x76f9 +.word 0x5152, 0x983e, 0xc66d, 0xa831, 0x27c8, 0xb003, 0x7fc7, 0xbf59, 0x0bf3, 0xc6e0, 0x9147, 0xd5a7, 0x6351, 0x06ca, 0x2967, 0x1429 +.word 0x0a85, 0x27b7, 0x2138, 0x2e1b, 0x6dfc, 0x4d2c, 0x0d13, 0x5338, 0x7354, 0x650a, 0x0abb, 0x766a, 0xc92e, 0x81c2, 0x2c85, 0x9272 +.word 0xe8a1, 0xa2bf, 0x664b, 0xa81a, 0x8b70, 0xc24b, 0x51a3, 0xc76c, 0xe819, 0xd192, 0x0624, 0xd699, 0x3585, 0xf40e, 0xa070, 0x106a +.word 0xc116, 0x19a4, 0x6c08, 0x1e37, 0x774c, 0x2748, 0xbcb5, 0x34b0, 0x0cb3, 0x391c, 0xaa4a, 0x4ed8, 0xca4f, 0x5b9c, 0x6ff3, 0x682e +.word 0x82ee, 0x748f, 0x636f, 0x78a5, 0x7814, 0x84c8, 0x0208, 0x8cc7, 0xfffa, 0x90be, 0x6ceb, 0xa450, 0xa3f7, 0xbef9, 0x78f2, 0xc671 + + +;########################################################### + +.global sha256_init +;uint32_t sha256_init_vector[]={ +; 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, +; 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 }; +; +;void sha256_init(sha256_ctx_t *state){ +; state->length=0; +; memcpy(state->h, sha256_init_vector, 8*4); +;} +; param1: (r23,r24) 16-bit pointer to sha256_ctx_t struct in ram +; modifys: Z(r30,r31), Func1, r22 +sha256_init: + movw r26, r24 ; (24,25) --> (26,27) load X with param1 + ldi r30, lo8((sha256_init_vector)) + ldi r31, hi8((sha256_init_vector)) + ldi r22, 32+8 +sha256_init_vloop: + lpm r23, Z+ + st X+, r23 + dec r22 + brne sha256_init_vloop + ret + +sha256_init_vector: +.word 0xE667, 0x6A09 +.word 0xAE85, 0xBB67 +.word 0xF372, 0x3C6E +.word 0xF53A, 0xA54F +.word 0x527F, 0x510E +.word 0x688C, 0x9B05 +.word 0xD9AB, 0x1F83 +.word 0xCD19, 0x5BE0 +.word 0x0000, 0x0000 +.word 0x0000, 0x0000 + +;########################################################### + +.global rotl32 +; === ROTL32 === +; function that rotates a 32 bit word to the left +; param1: the 32-bit word to rotate +; given in r25,r24,r23,r22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotl32: + cpi r20, 8 + brlo bitrotl + mov r21, r25 + mov r25, r24 + mov r24, r23 + mov r23, r22 + mov r22, r21 + subi r20, 8 + rjmp rotl32 +bitrotl: + clr r21 + clc +bitrotl_loop: + tst r20 + breq fixrotl + rol r22 + rol r23 + rol r24 + rol r25 + rol r21 + dec r20 + rjmp bitrotl_loop +fixrotl: + or r22, r21 + ret + + +;########################################################### + +.global rotr32 +; === ROTR32 === +; function that rotates a 32 bit word to the right +; param1: the 32-bit word to rotate +; given in r25,r24,r23,22 (r25 is most significant) +; param2: an 8-bit value telling how often to rotate +; given in r20 +; modifys: r21, r22 +rotr32: + cpi r20, 8 + brlo bitrotr + mov r21, r22 + mov r22, r23 + mov r23, r24 + mov r24, r25 + mov r25, r21 + subi r20, 8 + rjmp rotr32 +bitrotr: + clr r21 + clc +bitrotr_loop: + tst r20 + breq fixrotr + ror r25 + ror r24 + ror r23 + ror r22 + ror r21 + dec r20 + rjmp bitrotr_loop +fixrotr: + or r25, r21 + ret + + +;########################################################### + +.global change_endian32 +; === change_endian32 === +; function that changes the endianess of a 32-bit word +; param1: the 
32-bit word +; given in r25,r24,r23,22 (r25 is most significant) +; modifys: r21, r22 +change_endian32: + movw r20, r22 ; (r22,r23) --> (r20,r21) + mov r22, r25 + mov r23, r24 + mov r24, r21 + mov r25, r20 + ret + diff --git a/shacal2/sha256.h b/shacal2/sha256.h new file mode 100644 index 0000000..24960a3 --- /dev/null +++ b/shacal2/sha256.h @@ -0,0 +1,122 @@ +/* sha256.h */ +/* + This file is part of the AVR-Crypto-Lib. + Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/** + * \file sha256.h + * \author Daniel Otte + * \date 2006-05-16 + * \license GPLv3 or later + * + */ + +#ifndef SHA256_H_ +#define SHA256_H_ + +#define __LITTLE_ENDIAN__ + + +#include + +/** \def SHA256_HASH_BITS + * defines the size of a SHA-256 hash value in bits + */ + +/** \def SHA256_HASH_BYTES + * defines the size of a SHA-256 hash value in bytes + */ + +/** \def SHA256_BLOCK_BITS + * defines the size of a SHA-256 input block in bits + */ + +/** \def SHA256_BLOCK_BYTES + * defines the size of a SHA-256 input block in bytes + */ + +#define SHA256_HASH_BITS 256 +#define SHA256_HASH_BYTES (SHA256_HASH_BITS/8) +#define SHA256_BLOCK_BITS 512 +#define SHA256_BLOCK_BYTES (SHA256_BLOCK_BITS/8) + +/** \typedef sha256_ctx_t + * \brief SHA-256 context type + * + * A variable of this type may hold the state of a SHA-256 hashing process + */ +typedef struct { + uint32_t h[8]; + uint64_t length; +} sha256_ctx_t; + +/** \typedef sha256_hash_t + * \brief SHA-256 hash value type + * + * A variable of this type may hold the hash value produced by the + * sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state) function. + */ +typedef uint8_t sha256_hash_t[SHA256_HASH_BYTES]; + +/** \fn void sha256_init(sha256_ctx_t *state) + * \brief initialise a SHA-256 context + * + * This function sets a ::sha256_ctx_t to the initial values for hashing. + * \param state pointer to the SHA-256 hashing context + */ +void sha256_init(sha256_ctx_t *state); + +/** \fn void sha256_nextBlock (sha256_ctx_t* state, const void* block) + * \brief update the context with a given block + * + * This function updates the SHA-256 hash context by processing the given block + * of fixed length. + * \param state pointer to the SHA-256 hash context + * \param block pointer to the block of fixed length (512 bit = 64 byte) + */ +void sha256_nextBlock (sha256_ctx_t* state, const void* block); + +/** \fn void sha256_lastBlock(sha256_ctx_t* state, const void* block, uint16_t length_b) + * \brief finalize the context with the given block + * + * This function finalizes the SHA-256 hash context by processing the given block + * of variable length. 
+ * \param state pointer to the SHA-256 hash context + * \param block pointer to the block of fixed length (512 bit = 64 byte) + * \param length_b the length of the block in bits + */ +void sha256_lastBlock(sha256_ctx_t* state, const void* block, uint16_t length_b); + +/** \fn void sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state) + * \brief convert the hash state into the hash value + * This function reads the context and writes the hash value to the destination + * \param dest pointer to the location where the hash value should be written + * \param state pointer to the SHA-256 hash context + */ +void sha256_ctx2hash(sha256_hash_t* dest, const sha256_ctx_t* state); + +/** \fn void sha256(sha256_hash_t* dest, const void* msg, uint32_t length_b) + * \brief simple SHA-256 hashing function for direct hashing + * + * This function automaticaly hashes a given message of arbitary length with + * the SHA-256 hashing algorithm. + * \param dest pointer to the location where the hash value is going to be written to + * \param msg pointer to the message thats going to be hashed + * \param length_b length of the message in bits + */ +void sha256(sha256_hash_t* dest, const void* msg, uint32_t length_b); + +#endif /*SHA256_H_*/ diff --git a/shacal2_enc.c b/shacal2/shacal2_enc.c similarity index 100% rename from shacal2_enc.c rename to shacal2/shacal2_enc.c diff --git a/shacal2_enc.h b/shacal2/shacal2_enc.h similarity index 100% rename from shacal2_enc.h rename to shacal2/shacal2_enc.h diff --git a/skipjack.c b/skipjack/skipjack.c similarity index 100% rename from skipjack.c rename to skipjack/skipjack.c diff --git a/skipjack.h b/skipjack/skipjack.h similarity index 100% rename from skipjack.h rename to skipjack/skipjack.h diff --git a/test_src/main-hmac-md5-test.c b/test_src/main-hmac-md5-test.c index dfbcead..f61e119 100644 --- a/test_src/main-hmac-md5-test.c +++ b/test_src/main-hmac-md5-test.c @@ -28,9 +28,10 @@ #include "md5.h" #include "hmac-md5.h" +/* #include "base64_enc.h" #include "base64_dec.h" - +*/ #include "nessie_mac_test.h" #include @@ -109,6 +110,7 @@ void strhexdump(char* dest, void* src, uint16_t length){ } } +/* void cram_md5_interactive(void){ char key[101]; char msg_b64[101]; @@ -137,9 +139,9 @@ void cram_md5_interactive(void){ cli_putstr_P(PSTR("\r\nresponse: ")); cli_hexdump(hmac, HMAC_MD5_BYTES); cli_putstr_P(PSTR("\r\nresponse (b64): ")); - cli_putstr(msg_b64); - + cli_putstr(msg_b64); } +*/ void md5_interactive(void){ @@ -164,7 +166,7 @@ const char test_str[] PROGMEM = "test"; /* const char performance_str[] PROGMEM = "performance"; */ const char echo_str[] PROGMEM = "echo"; const char hmd5i_str[] PROGMEM = "hmac-md5"; -const char crammd5i_str[] PROGMEM = "cram-md5"; +/* const char crammd5i_str[] PROGMEM = "cram-md5"; */ const char md5i_str[] PROGMEM = "md5"; @@ -172,7 +174,7 @@ cmdlist_entry_t cmdlist[] PROGMEM = { { nessie_str, NULL, testrun_nessie_hmacmd5}, { test_str, NULL, testrun_test_hmacmd5}, { hmd5i_str, NULL, hmacmd5_interactive}, - { crammd5i_str, NULL, cram_md5_interactive}, +/* { crammd5i_str, NULL, cram_md5_interactive}, */ { md5i_str, NULL, md5_interactive}, /* { performance_str, NULL, testrun_performance_hmacmd5}, */ { echo_str, (void*)1, (void_fpt)echo_ctrl}, diff --git a/trivium.c b/trivium/trivium.c similarity index 100% rename from trivium.c rename to trivium/trivium.c diff --git a/trivium.h b/trivium/trivium.h similarity index 100% rename from trivium.h rename to trivium/trivium.h diff --git a/xtea-asm.S b/xtea/xtea-asm.S similarity index 100% 
rename from xtea-asm.S rename to xtea/xtea-asm.S diff --git a/xtea.c b/xtea/xtea.c similarity index 100% rename from xtea.c rename to xtea/xtea.c diff --git a/xtea.h b/xtea/xtea.h similarity index 100% rename from xtea.h rename to xtea/xtea.h
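For completeness, a minimal usage sketch of the sha256.h interface added above; the example function and variable names are illustrative, and it assumes the library is built for AVR as in the rest of the tree:

#include <string.h>
#include "sha256.h"

/* illustrative example: hash a short message one-shot and incrementally */
void sha256_usage_example(void){
	sha256_hash_t hash;
	const char msg[] = "abc";

	/* one-shot interface; the length is given in bits */
	sha256(&hash, msg, strlen(msg) * 8);

	/* incremental interface: feed full 512-bit blocks with
	 * sha256_nextBlock(), then the trailing partial block with
	 * sha256_lastBlock(); here the whole message fits in the last block */
	sha256_ctx_t ctx;
	sha256_init(&ctx);
	sha256_lastBlock(&ctx, msg, strlen(msg) * 8);
	sha256_ctx2hash(&hash, &ctx);
}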