/* avr-crypto-lib/hmac-sha1/sha1-asm.S

887 lines
15 KiB
ArmAsm */

/* sha1-asm.S */
/*
This file is part of the AVR-Crypto-Lib.
Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Author: Daniel Otte
*
* License: GPLv3 or later
*/
; SHA1 implementation in assembler for AVR
/* Size of one SHA-1 message block and of the resulting digest, in bits. */
SHA1_BLOCK_BITS = 512
SHA1_HASH_BITS = 160
/* precall: save r0, r1, r18-r27 and r30-r31 on the stack and then clear r1.
   It is paired with postcall, which pops the same registers in reverse
   order; in this file the pair only appears inside the (disabled) body of
   dbg_hexdump, around the calls into the external uart_* routines.
   NOTE(review): r1 is presumably cleared because the called code expects
   r1 to hold zero - confirm against the toolchain's register conventions. */
.macro precall
/* pushes r0, r1, r18 - r27 and r30 - r31 (14 bytes of stack) */
push r0
push r1
push r18
push r19
push r20
push r21
push r22
push r23
push r24
push r25
push r26
push r27
push r30
push r31
clr r1
.endm
/* postcall: restore the registers saved by precall (r31, r30, r27-r18, r1,
   r0), popping them in exactly the reverse of precall's push order. */
.macro postcall
pop r31
pop r30
pop r27
pop r26
pop r25
pop r24
pop r23
pop r22
pop r21
pop r20
pop r19
pop r18
pop r1
pop r0
.endm
/* hexdump length: debug helper. Sends CR and LF through the external
   uart_putc (character passed in r24), then calls the external
   uart_hexdump with r25:r24 = X and r23:r22 = byte count. At most 16 bytes
   are dumped per call; for longer ranges X is advanced by 16 and the macro
   expands itself again with length-16.
   X (r27:r26) is saved and restored around each call except the final
   uart_hexdump; r22-r25 are overwritten.
   NOTE(review): the argument meaning of uart_putc / uart_hexdump is
   inferred from the registers loaded here - confirm against their
   definitions, which are not part of this file. */
.macro hexdump length
push r27
push r26
ldi r25, '\r'
mov r24, r25
call uart_putc
ldi r25, '\n'
mov r24, r25
call uart_putc
pop r26
pop r27
movw r24, r26
/* more than one line left: dump 16 bytes, advance X, recurse */
.if \length > 16
ldi r22, lo8(16)
ldi r23, hi8(16)
push r27
push r26
call uart_hexdump
pop r26
pop r27
adiw r26, 16
hexdump \length-16
.else
/* last (possibly short) line */
ldi r22, lo8(\length)
ldi r23, hi8(\length)
call uart_hexdump
.endif
.endm
/* delay: currently expands to nothing. The whole body below is commented
   out; when enabled it is a nested busy-wait loop counting in r0 and r1
   (both saved and restored around the loop). */
.macro delay
/*
push r0
push r1
clr r0
1: clr r1
2: dec r1
brne 2b
dec r0
brne 1b
pop r1
pop r0 // */
.endm
/* dbg_hexdump length: debug hook, X points to the bytes to dump.
   Currently expands to nothing because the body is commented out; when
   enabled it wraps "hexdump length" in precall/postcall so the dump does
   not disturb the surrounding register state. */
.macro dbg_hexdump length
/*
precall
hexdump \length
postcall
// */
.endm
.section .text
/* I/O addresses of the stack pointer (low byte, high byte) and of the
   status register; they are accessed with in/out by the routines below
   whenever stack space is allocated or released. */
SPL = 0x3D
SPH = 0x3E
SREG = 0x3F
;
;sha1_ctx_t is:
;
; [h0][h1][h2][h3][h4][length]
; hn is 32 bit large, length is 64 bit large
;###########################################################
.global sha1_ctx2hash
; === sha1_ctx2hash ===
; Writes the 20-byte digest for a context: each of the five 32-bit state
; words h0..h4 is copied to the destination with its four bytes reversed.
; param1: 16-bit destination pointer (20 bytes are written)
;         given in r25,r24 (r25 is most significant)
; param2: 16-bit pointer to the sha1_ctx structure
;         given in r23,r22
; modifies: r0, r20, X (r27:r26), Z (r31:r30), flags
sha1_ctx2hash:
	movw r30, r24           ; Z = destination
	movw r26, r22           ; X = first state word
	ldi r20, 5              ; five words to convert
1:
	adiw r26, 4             ; X = one past the end of the current word
	ld r0, -X               ; copy the word from its last byte to its first
	st Z+, r0
	ld r0, -X
	st Z+, r0
	ld r0, -X
	st Z+, r0
	ld r0, -X
	st Z+, r0
	adiw r26, 4             ; X = start of the next word
	dec r20
	brne 1b
	ret
;###########################################################
.global sha1
; === sha1 ===
; Calculates the SHA-1 hash of a complete message held in RAM.
; param1: the 16-bit hash destination pointer (20 bytes are written)
;         given in r25,r24 (r25 is most significant)
; param2: the 16-bit pointer to message
;         given in r23,r22
; param3: 32-bit length value (length of message in bits)
;         given in r21,r20,r19,r18
;
; A sha1_ctx (5*4 state bytes + 8 length bytes) is allocated on the stack,
; initialised with sha1_init, fed with every complete 512-bit block through
; sha1_nextBlock, finished with sha1_lastBlock and finally serialised into
; the destination by sha1_ctx2hash.
;
; register use inside the function:
;   r11:r8  = number of message bits not yet hashed
;   r13:r12 = pointer to the next unhashed message byte
;   r17:r16 = pointer to the context on the stack
; r1 is expected to be zero on entry (sha1_init stores it as the zero byte).
sha1:
sha1_prolog:
	push r8
	push r9
	push r10
	push r11
	push r12
	push r13
	push r16
	push r17
	/* reserve room for the context below the saved registers */
	in r16, SPL
	in r17, SPH
	subi r16, 5*4+8
	sbci r17, 0
	in r0, SREG
	cli                     /* SP must not be seen half-updated */
	out SPL, r16
	out SPH, r17
	out SREG, r0
	push r25                /* keep the destination pointer for later */
	push r24
	/* ctx = SP+1. A 16-bit increment is required here: inc leaves the
	   carry flag untouched, so inc/adc would lose the carry into the
	   high byte whenever the low byte wraps from 0xFF to 0x00. */
	subi r16, lo8(-1)
	sbci r17, hi8(-1)
	movw r8, r18            /* backup of length */
	movw r10, r20
	movw r12, r22           /* backup of msg-ptr */
	movw r24, r16
	rcall sha1_init
	/* while (length >= 512) hash one full block */
1:
	tst r11
	brne 2f                 /* any of the upper 16 bits set: >= 512 */
	tst r10
	brne 2f
	mov r19, r9             /* cpi needs an upper register */
	cpi r19, hi8(SHA1_BLOCK_BITS)
	brlo 4f                 /* fewer than 512 bits left */
2:
	movw r24, r16
	movw r22, r12
	rcall sha1_nextBlock    /* preserves r8-r13, r16, r17; leaves r1 = 0 */
	/* msg-ptr += 64, kept in r13:r12 because r23:r22 do not survive
	   the call and are reloaded from r13:r12 on every iteration */
	ldi r19, SHA1_BLOCK_BITS/8
	add r12, r19
	adc r13, r1
	/* length -= 512 */
	ldi r19, hi8(SHA1_BLOCK_BITS)
	sub r9, r19
	sbc r10, r1
	sbc r11, r1
	rjmp 1b
4:
	/* pad and hash the remaining (< 512) bits */
	movw r24, r16
	movw r22, r12
	movw r20, r8
	rcall sha1_lastBlock
	/* convert the state into the digest at the saved destination */
	pop r24
	pop r25
	movw r22, r16
	rcall sha1_ctx2hash
sha1_epilog:
	/* release the context and restore the saved registers */
	in r30, SPL
	in r31, SPH
	adiw r30, 5*4+8
	in r0, SREG
	cli
	out SPL, r30
	out SPH, r31
	out SREG, r0
	pop r17
	pop r16
	pop r13
	pop r12
	pop r11
	pop r10
	pop r9
	pop r8
	ret
;###########################################################
; the final (padded) part always covers fewer than 512 bits; any complete
; 64-byte blocks passed in are hashed first
.global sha1_lastBlock
; === sha1_lastBlock ===
; Hashes the tail of a message: appends the 1-bit, the zero padding and the
; 64-bit big-endian message length, using one or two calls to
; sha1_nextBlock.
; param1: the 16-bit pointer to sha1_ctx structure
;         given in r25,r24 (r25 is most significant)
; param2: a 16-bit pointer to the remaining message bytes
;         given in r23,r22
; param3: a 16-bit integer specifying the remaining length in bits
;         given in r21,r20
; If the length is not a multiple of 8, the used bits of the last byte are
; its most significant ones; the unused low bits are ignored.
; A 64-byte scratch block is allocated on the stack. r0 and r1 are zero on
; return.
sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)
sha1_lastBlock:
	/* while (length >= 512): hash a complete block and step over it */
	cpi r21, 0x02
	brlo sha1_lastBlock_prolog
	push r25
	push r24
	push r23
	push r22
	push r21
	push r20
	rcall sha1_nextBlock
	pop r20
	pop r21
	pop r22
	pop r23
	pop r24
	pop r25
	subi r21, 2             /* length -= 512 bits */
	subi r22, lo8(-64)      /* msg-ptr += 64 bytes (one block) */
	sbci r23, hi8(-64)
	rjmp sha1_lastBlock
sha1_lastBlock_prolog:
	/* allocate 64 bytes on the stack; r1 temporarily holds SREG */
	in r30, SPL
	in r31, SPH
	in r1, SREG
	subi r30, lo8(64)
	sbci r31, hi8(64)
	cli
	out SPL, r30
	out SPH, r31
	out SREG,r1
	adiw r30, 1             /* SP points to next free byte: Z = buffer */
	/* r18 = length / 8 = number of complete message bytes (0..63) */
	mov r18, r20
	lsr r18
	lsr r18
	lsr r18
	bst r21, 0              /* bit 8 of the bit count ... */
	bld r18, 5              /* ... becomes bit 5 of the byte count */
	movw r26, r22           /* X points to begin of msg */
	tst r18
	breq sha1_lastBlock_post_copy
	mov r1, r18             /* r1 is the loop counter and ends up as 0 */
sha1_lastBlock_copy_loop:
	ld r0, X+
	st Z+, r0
	dec r1
	brne sha1_lastBlock_copy_loop
sha1_lastBlock_post_copy:
sha1_lastBlock_insert_stuffing_bit:
	ldi r19, 0x80
	mov r0,r19              /* r0 = padding byte for byte-aligned input */
	ldi r19, 0x07
	and r19, r20            /* r19 = number of bits in a partial byte */
	breq 2f                 /* byte aligned: plain 0x80 */
1:
	lsr r0                  /* move the 1-bit behind the message bits */
	dec r19
	brne 1b
	ld r19, X               /* the partial message byte */
	/* keep only the bits above the stuffing bit: mask = -(2 * r0).
	   r22 is free here, the message pointer already lives in X. */
	mov r22, r0
	lsl r22
	neg r22
	and r19, r22
	or r0, r19
2:
	st Z+, r0
	inc r18                 /* r18 = bytes in the buffer so far */
	/* is there still room for the 8 length bytes? */
	cpi r18, 64-8+1
	brsh 0f
	rjmp sha1_lastBlock_insert_zeros
0:
	/* no room: fill this block with zeros, hash it and start a new one.
	   r1 is zero here because the copy loop above has run. */
	ldi r19, 64
	sub r19, r18
	breq 2f
1:
	st Z+, r1
	dec r19
	brne 1b
2:
	sbiw r30, 63            /* Z back to the start of the buffer */
	sbiw r30, 1
	movw r22, r30
	push r31
	push r30
	push r25
	push r24
	push r21
	push r20
	rcall sha1_nextBlock
	pop r20
	pop r21
	pop r24
	pop r25
	pop r30
	pop r31
	/* sha1_nextBlock added 512 to ctx.length; take that back, the real
	   bit count of the tail is added when the length is written below */
	movw r26, r24
	adiw r26, 4*5+1         /* we can skip the lowest byte */
	ld r19, X
	subi r19, hi8(512)
	st X+, r19
	ldi r18, 6
1:
	ld r19, X
	sbci r19, 0             /* dec/ld/st leave the borrow untouched */
	st X+, r19
	dec r18
	brne 1b
	/* r18 is 0 again and Z points to the start of the buffer */
sha1_lastBlock_insert_zeros:
	ldi r19, 64-8
	sub r19, r18
	breq sha1_lastBlock_insert_length
	clr r1
1:
	st Z+, r1
	dec r19
	brne 1b
sha1_lastBlock_insert_length:
	/* last 8 bytes = big-endian (ctx.length + length), r1 is zero */
	movw r26, r24           /* X points to state */
	adiw r26, 5*4           /* X points to (state.length) */
	adiw r30, 8             /* Z points one after the last byte of block */
	ld r0, X+
	add r0, r20
	st -Z, r0
	ld r0, X+
	adc r0, r21
	st -Z, r0
	ldi r19, 6
1:
	ld r0, X+
	adc r0, r1              /* propagate the carry through the upper bytes */
	st -Z, r0
	dec r19
	brne 1b
	sbiw r30, 64-8          /* Z back to the start of the buffer */
	movw r22, r30
	rcall sha1_nextBlock
sha1_lastBlock_epilog:
	/* release the 64-byte buffer */
	in r30, SPL
	in r31, SPH
	in r1, SREG
	adiw r30, 63            /* adiw is limited to 63, so 64 = 63 + 1 */
	adiw r30, 1
	cli
	out SPL, r30
	out SPH, r31
	out SREG,r1
	clr r1
	clr r0
	ret
/**/
;###########################################################
.global sha1_nextBlock
; === sha1_nextBlock ===
; this is the core function for calculating SHA-1 hashes
; param1: the 16-bit pointer to sha1_ctx structure
; given in r25,r24 (r25 is most significant)
; param2: an 16-bit pointer to 64 byte block to hash
; given in r23,r22
/* Effect: the five state words of the context are updated with the
   compression of the 64-byte block and 512 is added to the 64-bit length
   field that follows them.
   r10-r17, r28 and r29 are saved and restored; r0, r18-r27, r30 and r31
   are overwritten. r1 (xNULL) is used as a zero constant throughout and is
   cleared again in the epilogue, so it has to be zero on entry.
   Stack frame (sha1_nextBlock_localSpace = 88 bytes, 84 of them used),
   starting at SP+1 after the allocation:
     a[0..4]  working variables a,b,c,d,e  (20 bytes)
     w[0..15] message schedule ring buffer (64 bytes) */
sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)
/* symbolic register names */
xtmp = 0
xNULL = 1
W1 = 10
W2 = 11
T1 = 12
T2 = 13
T3 = 14
T4 = 15
LoopC = 16
S = 17
tmp1 = 18
tmp2 = 19
tmp3 = 20
tmp4 = 21
F1 = 22
F2 = 23
F3 = 24
F4 = 25
/* byte order: higher register number <--> higher significance.
   xtmp       index (0..3) of the current 20-round stage
   xNULL      constant zero
   W2:W1      pointer to w[0]
   T4..T1     32-bit accumulator T
   LoopC      round counter t
   S          byte offset of w[t mod 16]
   F4..F1     second 32-bit scratch value */
sha1_nextBlock:
; initial, let's make some space ready for local vars
/* replace push & pop by mem ops? */
push r10
push r11
push r12
push r13
push r14
push r15
push r16
push r17
push r28
push r29
in r20, SPL
in r21, SPH
movw r18, r20 ;backup SP
; movw r26, r20 ; X points to free space on stack /* maybe removeable? */
movw r30, r22 ; Z points to message
subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
sbci r21, hi8(sha1_nextBlock_localSpace)
movw r26, r20 ; X points to free space on stack
in r0, SREG
cli ; we want to be uninterrupted while updating SP
out SPL, r20
out SPH, r21
out SREG, r0
/* the next four pushes land below the local area and are popped again at
   the end: first the context pointer, then the old SP */
push r18
push r19 /* push old SP on new stack */
push r24
push r25 /* param1 will be needed later */
/* load a[] with state */
movw 28, r24 /* load pointer to state in Y */
adiw r26, 1 ; X++
ldi LoopC, 5*4
1: ld tmp1, Y+
st X+, tmp1
dec LoopC
brne 1b
movw W1, r26 /* save pointer to w[0] */
/* load w[] with endian fixed message: the four bytes of every message
   word are stored in reverse order */
/* we might also use the changeendian32() function at bottom */
movw r30, r22 /* mv param2 (pointer to msg) to Z */
ldi LoopC, 16
1:
ldd tmp1, Z+3
st X+, tmp1
ldd tmp1, Z+2
st X+, tmp1
ldd tmp1, Z+1
st X+, tmp1
ld tmp1, Z
st X+, tmp1
adiw r30, 4
dec LoopC
brne 1b
/* LoopC has counted down to zero here, so the round counter starts at 0 */
;clr LoopC /* LoopC is named t in FIPS 180-2 */
clr xtmp
/* one iteration per round, LoopC = 0..79 */
sha1_nextBlock_mainloop:
mov S, LoopC
lsl S
lsl S
andi S, 0x3C /* S = (t mod 16) * 4, a byte offset into w[] */
/* load w[s] */
movw r26, W1
add r26, S /* X points at w[s] */
adc r27, xNULL
ld T1, X+
ld T2, X+
ld T3, X+
ld T4, X+
/* debug scaffolding: T is placed on the stack so that dbg_hexdump could
   print it via X; with the dbg_hexdump body commented out this only
   saves and restores X and T */
push r26
push r27
push T4
push T3
push T2
push T1
in r26, SPL
in r27, SPH
adiw r26, 1
dbg_hexdump 4
pop T1
pop T2
pop T3
pop T4
pop r27
pop r26
/**/
/* rounds 0..15 use the message word as loaded */
cpi LoopC, 16
brlt sha1_nextBlock_mainloop_core
/* update w[s]: T ^= w[s+2] ^ w[s+8] ^ w[s+13] (indices mod 16), then
   rotate T left by one bit and store it back to w[s] */
ldi tmp1, 2*4
rcall 1f
ldi tmp1, 8*4
rcall 1f
ldi tmp1, 13*4
rcall 1f
rjmp 2f
/* local subroutine: T ^= w[] word at byte offset (tmp1 + S) mod 64 */
1: /* this might be "outsourced" to save the jump above */
add tmp1, S
andi tmp1, 0x3f
movw r26, W1
add r26, tmp1
adc r27, xNULL
ld tmp2, X+
eor T1, tmp2
ld tmp2, X+
eor T2, tmp2
ld tmp2, X+
eor T3, tmp2
ld tmp2, X+
eor T4, tmp2
ret
2: /* now we just have to do a ROTL(T) and save T back */
mov tmp2, T4
rol tmp2 /* carry = top bit of T, shifted into bit 0 by the next rol */
rol T1
rol T2
rol T3
rol T4
movw r26, W1
add r26, S
adc r27, xNULL
st X+, T1
st X+, T2
st X+, T3
st X+, T4
sha1_nextBlock_mainloop_core: /* the core function; T=ROTL5(a) ....*/
/* T already contains w[s] */
movw r26, W1
sbiw r26, 4*1 /* X points at a[4] aka e */
ld tmp1, X+
add T1, tmp1
ld tmp1, X+
adc T2, tmp1
ld tmp1, X+
adc T3, tmp1
ld tmp1, X+
adc T4, tmp1 /* T = w[s]+e */
sbiw r26, 4*5 /* X points at a[0] aka a */
ld F1, X+
ld F2, X+
ld F3, X+
ld F4, X+
mov tmp1, F4 /* X points at a[1] aka b */
/* F = ROTL(a,5): five one-bit rotations, tmp1 supplies the bits that
   wrap around from the top of F4 */
ldi tmp2, 5
1:
rol tmp1
rol F1
rol F2
rol F3
rol F4
dec tmp2
brne 1b
add T1, F1
adc T2, F2
adc T3, F3
adc T4, F4 /* T = ROTL(a,5) + e + w[s] */
/* select the stage: xTable[xtmp] holds the round number at which the
   next stage begins; when it equals LoopC, xtmp is advanced */
ldi r30, lo8(sha1_nextBlock_xTable)
ldi r31, hi8(sha1_nextBlock_xTable)
add r30, xtmp
adc r31, xNULL
lpm tmp1, Z
cp tmp1, LoopC
brne 1f
inc xtmp
/* add the stage constant KTable[xtmp] (4 bytes per entry) */
1: ldi r30, lo8(sha1_nextBlock_KTable)
ldi r31, hi8(sha1_nextBlock_KTable)
lsl xtmp
lsl xtmp
add r30, xtmp
adc r31, xNULL
lsr xtmp
lsr xtmp
lpm tmp1, Z+
add T1, tmp1
lpm tmp1, Z+
adc T2, tmp1
lpm tmp1, Z+
adc T3, tmp1
lpm tmp1, Z+
adc T4, tmp1
/* T = ROTL(a,5) + e + kt + w[s] */
/* Z now points just past kt */
movw r28, r26 /* copy X in Y */
adiw r30, 3*4 /* now Z points to the right location in our jump-vector-table */
/* icall takes a word address: halve the byte address in Z */
lsr r31
ror r30
/* every icall evaluates one byte of f_t(b,c,d) into tmp1 and advances Y
   by one byte; Z is not modified in between */
icall
mov F1, tmp1
icall
mov F2, tmp1
icall
mov F3, tmp1
icall
add T1, F1
adc T2, F2
adc T3, F3
adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
/* X points still at a[1] aka b, Y points at a[2] aka c */
/* update a[] */
sha1_nextBlock_update_a:
/*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
//adiw r28, 3*4 /* Y should point at a[4] aka e */
movw r28, W1
sbiw r28, 4
/* copy 16 bytes upwards by 4, starting with the highest byte so that
   nothing is overwritten before it has been read */
ldi tmp2, 4*4
1:
ld tmp1, -Y
std Y+4, tmp1
dec tmp2
brne 1b
/* Y points at a[0] aka a*/
movw r28, W1
sbiw r28, 5*4
/* store T in a[0] aka a */
st Y+, T1
st Y+, T2
st Y+, T3
st Y+, T4
/* Y points at a[1] aka b*/
/* rotate c (the former b) right by 2 bits, i.e. left by 30 */
ldd T1, Y+1*4
ldd T2, Y+1*4+1
ldd T3, Y+1*4+2
ldd T4, Y+1*4+3
mov tmp1, T1
ldi tmp2, 2
1: ror tmp1
ror T4
ror T3
ror T2
ror T1
dec tmp2
brne 1b
std Y+1*4+0, T1
std Y+1*4+1, T2
std Y+1*4+2, T3
std Y+1*4+3, T4
/* debug scaffolding: X = a[] for the (disabled) dump of a..e */
push r27
push r26
movw r26, W1
sbiw r26, 4*5
dbg_hexdump 4*5
pop r26
pop r27
inc LoopC
cpi LoopC, 80
brge 1f
rjmp sha1_nextBlock_mainloop
/**************************************/
1:
/* Y was left at a[1]; step back to a[0] */
sbiw r28, 4
/* add a[] to state and inc length */
pop r27
pop r26 /* now X points to state (and Y still at a[0]) */
/* five independent 32-bit additions: the carry is cleared for each word
   and survives dec/brne inside a word */
ldi tmp4, 5
1: clc
ldi tmp3, 4
2: ld tmp1, X
ld tmp2, Y+
adc tmp1, tmp2
st X+, tmp1
dec tmp3
brne 2b
dec tmp4
brne 1b
/* now length += 512 */
adiw r26, 1 /* we skip the least significant byte */
ld tmp1, X
ldi tmp2, hi8(512) /* 2 */
add tmp1, tmp2
st X+, tmp1
/* propagate the carry through the remaining six length bytes */
ldi tmp2, 6
1:
ld tmp1, X
adc tmp1, xNULL
st X+, tmp1
dec tmp2
brne 1b
; EPILOG
sha1_nextBlock_epilog:
/* now we should clean up the stack: the two pops fetch the SP value saved
   in the prologue (high byte first) */
pop r21
pop r20
in r0, SREG
cli ; we want to be uninterrupted while updating SP
out SPL, r20
out SPH, r21
out SREG, r0
clr r1
pop r29
pop r28
pop r17
pop r16
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret
/* Constant tables in program memory, read with lpm by sha1_nextBlock.
   Their order and sizes must not change: the round code reaches the jump
   table by advancing Z from the K constant it has just read (4 bytes read
   plus 3*4), which only works while the 16-byte KTable is immediately
   followed by the JumpTable and every jump table slot is 4 bytes wide. */
/* round numbers at which the next stage begins; the final 0 is never
   matched once the round counter has passed 60 */
sha1_nextBlock_xTable:
.byte 20,40,60,0
/* additive constant for each of the four stages */
sha1_nextBlock_KTable:
.int 0x5a827999
.int 0x6ed9eba1
.int 0x8f1bbcdc
.int 0xca62c1d6
/* one slot per stage: rjmp to the stage function, padded to 4 bytes with
   nop (the last slot needs no padding) */
sha1_nextBlock_JumpTable:
rjmp sha1_nextBlock_Ch
nop
rjmp sha1_nextBlock_Parity
nop
rjmp sha1_nextBlock_Maj
nop
rjmp sha1_nextBlock_Parity
/* Byte-wise stage functions, reached through sha1_nextBlock_JumpTable.
   On entry Y points at one byte of b; the matching bytes of c and d are
   4 and 8 bytes further up. Each call returns one result byte in tmp1 and
   leaves Y advanced by one, so four calls in a row cover a 32-bit word.
   tmp2..tmp4 are scratch. */
sha1_nextBlock_Ch:
	/* Ch(b,c,d) = (b & c) ^ (~b & d), computed as d ^ (b & (c ^ d)) */
	ld tmp1, Y+             /* b, Y now one byte further */
	ldd tmp2, Y+7           /* d */
	ldd tmp3, Y+3           /* c */
	eor tmp3, tmp2          /* c ^ d */
	and tmp1, tmp3          /* b & (c ^ d) */
	eor tmp1, tmp2          /* ... ^ d */
	ret
sha1_nextBlock_Maj:
	/* Maj(b,c,d) = (b & c) ^ (b & d) ^ (c & d),
	   computed as (b & c) | (d & (b | c)) */
	ld tmp1, Y+             /* b, Y now one byte further */
	ldd tmp2, Y+3           /* c */
	ldd tmp3, Y+7           /* d */
	mov tmp4, tmp1
	or tmp4, tmp2           /* b | c */
	and tmp4, tmp3          /* d & (b | c) */
	and tmp1, tmp2          /* b & c */
	or tmp1, tmp4
	ret
sha1_nextBlock_Parity:
	/* Parity(b,c,d) = b ^ c ^ d */
	ld tmp1, Y+             /* b, Y now one byte further */
	ldd tmp2, Y+7           /* d */
	eor tmp1, tmp2
	ldd tmp2, Y+3           /* c */
	eor tmp1, tmp2
	ret
/*
ch_str: .asciz "\r\nCh"
maj_str: .asciz "\r\nMaj"
parity_str: .asciz "\r\nParity"
*/
;###########################################################
.global sha1_init
; === sha1_init ===
; C equivalent:
;   void sha1_init(sha1_ctx_t *state){
;       state->h[0] = 0x67452301;
;       state->h[1] = 0xefcdab89;
;       state->h[2] = 0x98badcfe;
;       state->h[3] = 0x10325476;
;       state->h[4] = 0xc3d2e1f0;
;       state->length = 0;
;   }
; param1: 16-bit pointer to the sha1_ctx_t structure in RAM
;         given in r25,r24
; The 20 state bytes are copied from program memory; the 8 length bytes
; are written from r1, which therefore has to be zero on entry.
; modifies: r0, r22, X (r27:r26), Z (r31:r30), flags
sha1_init:
	movw r26, r24                   ; X = state
	ldi r30, lo8(sha1_init_vector)  ; Z = initial values in flash
	ldi r31, hi8(sha1_init_vector)
	ldi r22, 5*4                    ; h[0..4]: 20 bytes
1:
	lpm r0, Z+
	st X+, r0
	dec r22
	brne 1b
	ldi r22, 8                      ; length: 8 zero bytes
2:
	st X+, r1
	dec r22
	brne 2b
	ret
sha1_init_vector:
	.int 0x67452301
	.int 0xefcdab89
	.int 0x98badcfe
	.int 0x10325476
	.int 0xc3d2e1f0