avr-crypto-lib/bmw/bmw_small-asm.S

2198 lines
37 KiB
ArmAsm

/* bmw_small-asm.S */
/*
This file is part of the AVR-Crypto-Lib.
Copyright (C) 2006-2015 Daniel Otte (bg@nerilex.org)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* File: bmw_small-asm.S
* Author: Daniel Otte
* Date: 2009-11-13
* License: GPLv3 or later
* Description: implementation of BlueMidnightWish
*
*/
#include "avr-asm-macros.S"
; Rotation code table, read from program memory with lpm by rotl_addel
; (through shiftcodetable_1) and by rotl32p9 (through shiftcodetable_9).
; The entry for n describes how to rotate a 32-bit value left by n bits:
;   bit 4     : first rotate left by one byte (8 bits)
;   bit 5     : first swap the two 16-bit halves (16 bits)
;   bit 3     : direction of the remaining bit rotation (set = right, clear = left)
;   bits 2..0 : number of remaining single-bit rotation steps
shiftcodetable:
; .byte 0x00 ; 0
shiftcodetable_1:
.byte 0x01 ; 1
.byte 0x02 ; 2
.byte 0x03 ; 3
.byte 0x04 ; 4
.byte 0x1B ; 5
.byte 0x1A ; 6
.byte 0x19 ; 7
.byte 0x10 ; 8
shiftcodetable_9:
.byte 0x11 ; 9
.byte 0x12 ; 10
.byte 0x13 ; 11
.byte 0x2C ; 12
.byte 0x2B ; 13
.byte 0x2A ; 14
.byte 0x29 ; 15
.byte 0x20 ; 16
; .byte 0x21 ; 17 unused but necessary for padding
/*******************************************************************************
 * shiftl32 - logical left shift of a 32-bit value
 *   in/out:   r25:r22 = value (r22 is the least significant byte)
 *   in:       r20     = number of bits, must be 1..255
 *                       (a count of 0 would run the loop 256 times)
 *   clobbers: r20 (is 0 on return), flags
 */
shiftl32:
.Lshiftl32_step:
	lsl	r22			; the bit shifted out of each byte ...
	rol	r23			; ... enters the next byte through carry
	rol	r24
	rol	r25
	dec	r20
	brne	.Lshiftl32_step
	ret
/*******************************************************************************
 * shiftr32 - logical right shift of a 32-bit value
 *   in/out:   r25:r22 = value (r22 is the least significant byte)
 *   in:       r20     = number of bits, must be 1..255
 *                       (a count of 0 would run the loop 256 times)
 *   clobbers: r20 (is 0 on return), flags
 */
shiftr32:
.Lshiftr32_step:
	lsr	r25			; the bit shifted out of each byte ...
	ror	r24			; ... enters the next lower byte through carry
	ror	r23
	ror	r22
	dec	r20
	brne	.Lshiftr32_step
	ret
/*******************************************************************************
 * rotl32 - rotate a 32-bit value left
 *   in/out:   r25:r22 = value (r22 is the least significant byte)
 *   in:       r20     = number of bits, non-zero; r21 is loaded only once, so
 *                       the count must not exceed 8 (callers in this file
 *                       pass 1..7)
 *   clobbers: r20 (is 0 on return), r21, flags
 */
rotl32:
	mov	r21, r25		; r21 feeds the top bits back into bit 0
.Lrotl32_step:
	lsl	r21			; carry = next bit leaving the top
	rol	r22
	rol	r23
	rol	r24
	rol	r25
	dec	r20
	brne	.Lrotl32_step
	ret
/*******************************************************************************
 * rotr32 - rotate a 32-bit value right
 *   in/out:   r25:r22 = value (r22 is the least significant byte)
 *   in:       r20     = number of bits, non-zero; r21 is loaded only once, so
 *                       the count must not exceed 8 (callers in this file
 *                       pass 1..7)
 *   clobbers: r20 (is 0 on return), r21, flags
 *
 * some_ret is a bare ret that other routines branch to when there is nothing
 * left to rotate.
 */
rotr32:
	mov	r21, r22		; r21 feeds the bottom bits back into bit 31
.Lrotr32_step:
	lsr	r21			; carry = next bit leaving the bottom
	ror	r25
	ror	r24
	ror	r23
	ror	r22
	dec	r20
	brne	.Lrotr32_step
some_ret:
	ret
/*******************************************************************************
* rotl32p9
* Rotates the 32-bit value in r25:r22 left by (r20 + 9) bits.
* value: r25:r22 (in/out)
* shift: r20 = index into shiftcodetable_9 (the table holds the codes for
*        rotation amounts 9..16, so the index is expected to be 0..7)
* r1 is used as zero for the address carry, so it must be 0 on entry.
* r0, r20 and r21 are used as scratch; r30:r31 are saved and restored.
*/
rotl32p9:
push_range 30, 31
ldi r30, lo8(shiftcodetable_9)
ldi r31, hi8(shiftcodetable_9)
add r30, r20
adc r31, r1
lpm r20, Z ; r20 = rotation code for (index + 9)
pop_range 30, 31
sbrs r20, 4 ; code bit 4: rotate left by one byte
rjmp 2f
mov r0, r25
mov r25, r24
mov r24, r23
mov r23, r22
mov r22, r0
2: sbrs r20, 5 ; code bit 5: swap the two 16-bit halves
rjmp 3f
movw r0, r24
movw r24, r22
movw r22, r0
clr r1 ; r1 was overwritten by the movw above, make it zero again
3: bst r20, 3 ; T = direction of the remaining bit rotation
andi r20, 0x07 ; r20 = remaining single-bit steps
breq some_ret ; nothing left to rotate
brts rotr32 ; T set: finish with a right rotation (tail call)
rjmp rotl32 ; T clear: finish with a left rotation (tail call)
/*******************************************************************************
* uint32_t rotl_addel(uint32_t x, uint8_t v){
* uint32_t r;
* r = ROTL32(x, (v&0xf)+1);
* return r;
* }
* param x: r25:r22
* param v: r20
* result: r25:r22
* r1 is used as zero for the address carry, so it must be 0 on entry.
* r20, r21, r30 and r31 are used as scratch.
*/
.global rotl_addel
rotl_addel:
andi r20, 0x0f
ldi r30, lo8(shiftcodetable_1)
ldi r31, hi8(shiftcodetable_1)
add r30, r20
adc r31, r1
lpm r20, Z ; r20 = rotation code for ((v & 0xf) + 1)
sbrs r20, 4 ; code bit 4: rotate left by one byte
rjmp 1f
mov r21, r25
mov r25, r24
mov r24, r23
mov r23, r22
mov r22, r21
1: sbrs r20, 5 ; code bit 5: swap the two 16-bit halves (Z is free again)
rjmp 2f
movw r30, r24
movw r24, r22
movw r22, r30
2: bst r20, 3 ; T = direction of the remaining bit rotation
andi r20, 0x07 ; r20 = remaining single-bit steps
breq some_ret ; nothing left to rotate
3:
brts rotr32; 4f
rjmp rotl32
;4: rjmp rotr32
/******************************************************************************/
/* Register aliases shared by the bmw_small_s0 .. bmw_small_s5 routines.
 * preg3:preg0 is r25:r22, the register group in which those routines receive
 * their argument and return their result. areg0/areg1 are r0/r1, so any code
 * that uses the accumulator has to clear r1 again before returning. */
preg0 = 22 /* preg for processing register */
preg1 = 23
preg2 = 24
preg3 = 25
breg0 = 26 /* breg for backup register */
breg1 = 27
breg2 = 18
breg3 = 19
areg0 = 0 /* areg for accumulator register */
areg1 = 1
areg2 = 30
areg3 = 31
/*******************************************************************************
 * uint32_t bmw_small_s0(uint32_t x){
 *   return SHR32(x, 1) ^ SHL32(x, 3) ^ ROTL32(x, 4) ^ ROTR32(x, 13);
 * }
 *
 * in/out:  preg3:preg0 (r25:r22)
 * scratch: breg3:breg0 (copy of x), areg3:areg0 (running xor, uses r0/r1),
 *          r20, r21
 * The tail labels outro_1 and outro_2 are shared with the other s-routines.
 */
	.global bmw_small_s0
bmw_small_s0:
	movw	breg2, preg2		; breg = x
	movw	breg0, preg0
	ldi	r20, 1
	rcall	shiftr32		; preg = SHR32(x, 1)
	movw	areg0, preg0		; areg = SHR32(x, 1)
	movw	areg2, preg2
	movw	preg0, breg0		; preg = x
	movw	preg2, breg2
	ldi	r20, 3
	rcall	shiftl32		; preg = SHL32(x, 3)
	eor	areg0, preg0
	eor	areg1, preg1
	eor	areg2, preg2
	eor	areg3, preg3
	movw	preg0, breg0		; preg = x
	movw	preg2, breg2
	ldi	r20, 4
	rcall	rotl32			; preg = ROTL32(x, 4)
	eor	areg0, preg0
	eor	areg1, preg1
	eor	areg2, preg2
	eor	areg3, preg3
	; ROTR32(x, 13) equals ROTL32(x, 4) rotated right by 17:
	; swap the halves (16 bits), the last bit is done in outro_1
	movw	breg0, preg0
	movw	preg0, preg2
	movw	preg2, breg0
outro_1:				; shared tail: rotate preg right by one bit
	ldi	r20, 1
	rcall	rotr32
outro_2:				; shared tail: result = preg ^ areg
	eor	preg0, areg0
	eor	preg1, areg1
	eor	preg2, areg2
	eor	preg3, areg3
	clr	r1			; areg1 is r1, make it zero again
	ret
/*******************************************************************************
 * uint32_t bmw_small_s1(uint32_t x){
 *   return SHR32(x, 1) ^ SHL32(x, 2) ^ ROTL32(x, 8) ^ ROTR32(x, 9);
 * }
 *
 * in/out:  preg3:preg0 (r25:r22)
 * scratch: breg3:breg0, areg3:areg0 (r0/r1 included), r20, r21
 * Finishes through outro_1 (one more right rotation, then the final xor).
 */
	.global bmw_small_s1
bmw_small_s1:
	movw	breg2, preg2		; breg = x
	movw	breg0, preg0
	ldi	r20, 1
	rcall	shiftr32		; preg = SHR32(x, 1)
	movw	areg0, preg0		; areg = SHR32(x, 1)
	movw	areg2, preg2
	movw	preg0, breg0		; preg = x
	movw	preg2, breg2
	ldi	r20, 2
	rcall	shiftl32		; preg = SHL32(x, 2)
	eor	areg0, preg0
	eor	areg1, preg1
	eor	areg2, preg2
	eor	areg3, preg3
	; ROTL32(x, 8) is a pure byte permutation of x
	eor	areg0, breg3
	eor	areg1, breg0
	eor	areg2, breg1
	eor	areg3, breg2
	; preg = ROTR32(x, 8); outro_1 adds the ninth bit
	mov	preg3, breg0
	mov	preg2, breg3
	mov	preg1, breg2
	mov	preg0, breg1
	rjmp	outro_1
/*******************************************************************************
 * uint32_t bmw_small_s2(uint32_t x){
 *   return SHR32(x, 2) ^ SHL32(x, 1) ^ ROTL32(x, 12) ^ ROTR32(x, 7);
 * }
 *
 * in/out:  preg3:preg0 (r25:r22)
 * scratch: breg3:breg0, areg3:areg0 (r0/r1 included), r20, r21
 * Finishes through outro_2 (final xor, r1 cleared).
 */
	.global bmw_small_s2
bmw_small_s2:
	movw	breg2, preg2		; breg = x
	movw	breg0, preg0
	ldi	r20, 2
	rcall	shiftr32		; preg = SHR32(x, 2)
	movw	areg0, preg0		; areg = SHR32(x, 2)
	movw	areg2, preg2
	movw	preg0, breg0		; preg = x
	movw	preg2, breg2
	ldi	r20, 1
	rcall	shiftl32		; preg = SHL32(x, 1)
	eor	areg0, preg0
	eor	areg1, preg1
	eor	areg2, preg2
	eor	areg3, preg3
	; ROTL32(x, 12): swap the halves (16 left), then 4 to the right
	movw	preg2, breg0
	movw	preg0, breg2
	ldi	r20, 4
	rcall	rotr32
	eor	areg0, preg0
	eor	areg1, preg1
	eor	areg2, preg2
	eor	areg3, preg3
	; ROTR32(x, 7): one byte to the right, then 1 bit to the left
	mov	preg3, breg0
	mov	preg2, breg3
	mov	preg1, breg2
	mov	preg0, breg1
	ldi	r20, 1
	rcall	rotl32
	rjmp	outro_2
/*******************************************************************************
 * uint32_t bmw_small_s3(uint32_t x){
 *   return SHR32(x, 2) ^ SHL32(x, 2) ^ ROTL32(x, 15) ^ ROTR32(x, 3);
 * }
 *
 * in/out:  preg3:preg0 (r25:r22)
 * scratch: breg3:breg0, areg3:areg0 (r0/r1 included), r20, r21
 * Finishes through outro_2 (final xor, r1 cleared).
 */
	.global bmw_small_s3
bmw_small_s3:
	movw	breg2, preg2		; breg = x
	movw	breg0, preg0
	ldi	r20, 2
	rcall	shiftr32		; preg = SHR32(x, 2)
	movw	areg0, preg0		; areg = SHR32(x, 2)
	movw	areg2, preg2
	movw	preg0, breg0		; preg = x
	movw	preg2, breg2
	ldi	r20, 2
	rcall	shiftl32		; preg = SHL32(x, 2)
	eor	areg0, preg0
	eor	areg1, preg1
	eor	areg2, preg2
	eor	areg3, preg3
	; ROTL32(x, 15): swap the halves (16 left), then 1 to the right
	movw	preg2, breg0
	movw	preg0, breg2
	ldi	r20, 1
	rcall	rotr32
	eor	areg0, preg0
	eor	areg1, preg1
	eor	areg2, preg2
	eor	areg3, preg3
	; ROTR32(x, 3) is done directly on a fresh copy of x
	movw	preg2, breg2
	movw	preg0, breg0
	ldi	r20, 3
	rcall	rotr32
	rjmp	outro_2
/*******************************************************************************
 * uint32_t bmw_small_s4(uint32_t x){
 *   return SHR32(x, 1) ^ x;
 * }
 *
 * in/out:  preg3:preg0 (r25:r22)
 * scratch: areg3:areg0 (r0/r1 included), r20
 */
	.global bmw_small_s4
bmw_small_s4:
	movw	areg2, preg2		; areg = x
	movw	areg0, preg0
	ldi	r20, 1
	rcall	shiftr32		; preg = SHR32(x, 1)
	rjmp	outro_2			; preg ^= areg, r1 = 0, return
/*******************************************************************************
 * uint32_t bmw_small_s5(uint32_t x){
 *   return SHR32(x, 2) ^ x;
 * }
 *
 * in/out:  preg3:preg0 (r25:r22)
 * scratch: areg3:areg0 (r0/r1 included), r20
 */
	.global bmw_small_s5
bmw_small_s5:
	movw	areg2, preg2		; areg = x
	movw	areg0, preg0
	ldi	r20, 2
	rcall	shiftr32		; preg = SHR32(x, 2)
	rjmp	outro_2			; preg ^= areg, r1 = 0, return
/*******************************************************************************
 * uint32_t bmw_small_r1(uint32_t x){
 *   return ROTL32(x, 3);
 * }
 * in/out: r25:r22; scratch: r20, r21
 */
	.global bmw_small_r1
bmw_small_r1:
	ldi	r20, 3
	rjmp	rotl32			; tail call, rotl32 returns to our caller
/*******************************************************************************
 * uint32_t bmw_small_r2(uint32_t x){
 *   return ROTL32(x, 7);
 * }
 * in/out: r25:r22; scratch: r20, r21
 */
	.global bmw_small_r2
bmw_small_r2:
	ldi	r20, 7
	rjmp	rotl32			; tail call, rotl32 returns to our caller
/*******************************************************************************
 * uint32_t bmw_small_r3(uint32_t x){
 *   return ROTL32(x, 13);
 * }
 * in/out: r25:r22; scratch: r18, r19, r20, r21
 * Done as a swap of the 16-bit halves followed by 3 bits to the right.
 */
	.global bmw_small_r3
bmw_small_r3:
	movw	breg2, preg2		; r19:r18 = upper half
	movw	preg2, preg0		; upper half = lower half
	movw	preg0, breg2		; lower half = old upper half
	ldi	r20, 3
	rjmp	rotr32			; tail call
/*******************************************************************************
 * uint32_t bmw_small_r4(uint32_t x){
 *   return ROTL32(x, 16);
 * }
 * in/out: r25:r22; scratch: r18, r19
 * A rotation by 16 is just a swap of the two 16-bit halves.
 */
	.global bmw_small_r4
bmw_small_r4:
	movw	breg2, preg2		; r19:r18 = upper half
	movw	preg2, preg0		; upper half = lower half
	movw	preg0, breg2		; lower half = old upper half
	ret
/*******************************************************************************
 * uint32_t bmw_small_r5(uint32_t x){
 *   return ROTR32(x, 13);
 * }
 * in/out: r25:r22; scratch: r18, r19, r20, r21
 * Done as a swap of the 16-bit halves followed by 3 bits to the left.
 */
	.global bmw_small_r5
bmw_small_r5:
	movw	breg2, preg2		; r19:r18 = upper half
	movw	preg2, preg0		; upper half = lower half
	movw	preg0, breg2		; lower half = old upper half
	ldi	r20, 3
	rjmp	rotl32			; tail call
/*******************************************************************************
 * uint32_t bmw_small_r6(uint32_t x){
 *   return ROTR32(x, 9);
 * }
 * in/out: r25:r22; scratch: r18, r20, r21
 * Done as one byte to the right followed by 1 bit to the right.
 */
	.global bmw_small_r6
bmw_small_r6:
	mov	breg2, preg0		; r18 = lowest byte, it wraps to the top
	mov	preg0, preg1
	mov	preg1, preg2
	mov	preg2, preg3
	mov	preg3, breg2
	ldi	r20, 1
	rjmp	rotr32			; tail call
/*******************************************************************************
 * uint32_t bmw_small_r7(uint32_t x){
 *   return ROTR32(x, 5);
 * }
 * in/out: r25:r22; scratch: r20, r21
 */
	.global bmw_small_r7
bmw_small_r7:
	ldi	r20, 5
	rjmp	rotr32			; tail call, rotr32 returns to our caller
/******************************************************************************/
/* Constant table read by addelement with lpm (4 bytes per entry, index j).
 * Entry j has the value (16 + j) * 0x05555555. */
const_lut:
.long 0x55555550, 0x5aaaaaa5, 0x5ffffffa, 0x6555554f
.long 0x6aaaaaa4, 0x6ffffff9, 0x7555554e, 0x7aaaaaa3
.long 0x7ffffff8, 0x8555554d, 0x8aaaaaa2, 0x8ffffff7
.long 0x9555554c, 0x9aaaaaa1, 0x9ffffff6, 0xa555554b
/*******************************************************************************
* uint32_t addelement(uint8_t j, const uint32_t *m, const uint32_t *h){
* uint32_t r;
* r = pgm_read_dword(const_lut+j);
* r += rotl_addel(((uint32_t*)m)[j&0xf], j+0);
* r += rotl_addel(((uint32_t*)m)[(j+3)&0xf], j+3);
* r -= rotl_addel(((uint32_t*)m)[(j+10)&0xf], j+10);
* r ^= ((uint32_t*)h)[(j+7)&0xf];
* return r;
* }
* param j: r24
* param m: r22:r23
* param h: r20:r21
*/
; Register aliases used by addelement. All of them lie in r8..r16, which the
; routine saves on entry and restores on exit.
j = 16 ; running index, kept across the rotl_addel calls
acc2 = 8 ; acc3:acc0 = 32-bit accumulator r (acc0 is the low byte)
acc3 = 9
h0 = 10 ; h1:h0 = pointer h
h1 = 11
m0 = 12 ; m1:m0 = pointer m
m1 = 13
acc0 = 14
acc1 = 15
.global addelement
; In:  r24 = j, r23:r22 = m, r21:r20 = h
; Out: r25:r22 = result
; Scratch: r20, r21, r26, r27, r30, r31 (r8..r16 are saved and restored).
; r1 is used as zero for every address carry.
addelement:
push_range 8, 16
mov j, r24
movw h0, r20
movw m0, r22
lsl r24 ; r24 = j * 4 = byte offset into const_lut
lsl r24
ldi r30, lo8(const_lut)
ldi r31, hi8(const_lut)
add r30, r24
adc r31, r1
lpm acc0, Z+ ; acc = const_lut[j]
lpm acc1, Z+
lpm acc2, Z+
lpm acc3, Z+
; acc += rotl_addel(m[j & 0xf], j)
mov r20, j
andi r20, 0x0f
lsl r20
lsl r20
movw r26, m0
add r26, r20
adc r27, r1
ld r22, X+
ld r23, X+
ld r24, X+
ld r25, X+
mov r20, j
rcall rotl_addel
add acc0, r22
adc acc1, r23
adc acc2, r24
adc acc3, r25
; acc += rotl_addel(m[(j+3) & 0xf], j+3)
subi j, -3 ; j is now the original j + 3
mov r20, j
andi r20, 0x0f
lsl r20
lsl r20
movw r26, m0
add r26, r20
adc r27, r1
ld r22, X+
ld r23, X+
ld r24, X+
ld r25, X+
mov r20, j
rcall rotl_addel
add acc0, r22
adc acc1, r23
adc acc2, r24
adc acc3, r25
; acc -= rotl_addel(m[(j+10) & 0xf], j+10)
subi j, -7 ; j is now the original j + 10
mov r20, j
andi r20, 0x0f
lsl r20
lsl r20
movw r26, m0
add r26, r20
adc r27, r1
ld r22, X+
ld r23, X+
ld r24, X+
ld r25, X+
mov r20, j
rcall rotl_addel
sub acc0, r22
sbc acc1, r23
sbc acc2, r24
sbc acc3, r25
; result = acc ^ h[(j+7) & 0xf]
subi j, 3 ; j is now the original j + 7
mov r20, j
andi r20, 0x0f
lsl r20
lsl r20
movw r26, h0
add r26, r20
adc r27, r1
ld r22, X+
ld r23, X+
ld r24, X+
ld r25, X+
eor r22, acc0
eor r23, acc1
eor r24, acc2
eor r25, acc3
pop_range 8, 16
ret
/*******************************************************************************
* uint32_t bmw_small_expand1(uint8_t j, const void *m, const void *h, const uint32_t *q){
* uint32_t(*s[])(uint32_t) = {bmw_small_s1, bmw_small_s2, bmw_small_s3, bmw_small_s0};
* uint32_t r;
* uint8_t i;
* r = addelement(j, m, h);
* i=15;
* do{
* r += s[i%4](q[j+i]);
* }while(i--!=0);
* return r;
*
* param j: r24
* param m: r22:r23
* param h: r20:r21
* param q: r18:r19
*/
; Accumulator aliases for bmw_small_expand1 and bmw_small_expand2
; (acc3:acc0 = r5:r2, saved and restored by both routines).
acc0 = 2
acc1 = 3
acc2 = 4
acc3 = 5
.global bmw_small_expand1
; In:  r24 = j, r23:r22 = m, r21:r20 = h, r19:r18 = q
; Out: r25:r22 = result
; r2..r5, r16 and r28:r29 are saved and restored.
; r1 is used as zero for the address carry.
bmw_small_expand1:
push_range 28, 29
movw r28, r18 ; Y = q
mov r18, r24
lsl r18
lsl r18
add r28, r18 ; Y = &q[j]
adc r29, r1
rcall addelement ; r25:r22 = addelement(j, m, h)
push_range 2, 5
push r16
ldi r16, 4 ; 4 rounds of (s1, s2, s3, s0) cover q[j] .. q[j+15]
movw acc0, r22
movw acc2, r24
1:
ld r22, Y+
ld r23, Y+
ld r24, Y+
ld r25, Y+
rcall bmw_small_s1
add acc0, r22
adc acc1, r23
adc acc2, r24
adc acc3, r25
ld r22, Y+
ld r23, Y+
ld r24, Y+
ld r25, Y+
rcall bmw_small_s2
add acc0, r22
adc acc1, r23
adc acc2, r24
adc acc3, r25
ld r22, Y+
ld r23, Y+
ld r24, Y+
ld r25, Y+
rcall bmw_small_s3
add acc0, r22
adc acc1, r23
adc acc2, r24
adc acc3, r25
ld r22, Y+
ld r23, Y+
ld r24, Y+
ld r25, Y+
rcall bmw_small_s0
add acc0, r22
adc acc1, r23
adc acc2, r24
adc acc3, r25
dec r16
brne 1b
; common exit, also used by bmw_small_expand2: return acc and undo the pushes
expand1_exit:
movw r22, acc0
movw r24, acc2
pop r16
pop_range 2, 5
pop_range 28, 29
ret
/*******************************************************************************
* uint32_t bmw_small_expand2(uint8_t j, const void *m, const void *h, const uint32_t *q){
* uint32_t(*rf[])(uint32_t) = {bmw_small_r1, bmw_small_r2, bmw_small_r3,
* bmw_small_r4, bmw_small_r5, bmw_small_r6,
* bmw_small_r7};
* uint32_t r;
* uint8_t i;
* r = addelement(j, m, h);
* for(i=0; i<14; i+=2){
* r += q[j+i];
* }
* for(i=0; i<14; i+=2){
* r += rf[i/2](q[j+i+1]);
* }
* r += bmw_small_s4(q[j+14]);
* r += bmw_small_s5(q[j+15]);
* return r;
* }
*/
; Dispatch table for bmw_small_expand2, one instruction word per entry, entered
; with icall for q[j+0] .. q[j+15] in turn. A plain ret leaves the loaded word
; unchanged (even positions 0..12 are added as they are); the rjmp entries
; transform the word with r1..r7 (odd positions) or s4/s5 (positions 14, 15).
expand2_jumptable:
ret
rjmp bmw_small_r1
ret
rjmp bmw_small_r2
ret
rjmp bmw_small_r3
ret
rjmp bmw_small_r4
ret
rjmp bmw_small_r5
ret
rjmp bmw_small_r6
ret
rjmp bmw_small_r7
rjmp bmw_small_s4
rjmp bmw_small_s5
.global bmw_small_expand2
; In:  r24 = j, r23:r22 = m, r21:r20 = h, r19:r18 = q
; Out: r25:r22 = result
; r2..r5, r16 and r28:r29 are saved here and restored in expand1_exit.
; r1 is used as zero for the address carry.
bmw_small_expand2:
push_range 28, 29
movw r28, r18 ; Y = q
mov r18, r24
lsl r18
lsl r18
add r28, r18 ; Y = &q[j]
adc r29, r1
rcall addelement ; r25:r22 = addelement(j, m, h)
push_range 2, 5
push r16
ldi r16, 16 ; one table entry per word q[j] .. q[j+15]
movw acc0, r22
movw acc2, r24
ldi r30, pm_lo8(expand2_jumptable)
ldi r31, pm_hi8(expand2_jumptable)
1:
ld r22, Y+
ld r23, Y+
ld r24, Y+
ld r25, Y+
push r30 ; the called routines may overwrite Z, keep the table position
push r31
icall
pop r31
pop r30
adiw r30, 1 ; next table entry (word address)
add acc0, r22
adc acc1, r23
adc acc2, r24
adc acc3, r25
dec r16
brne 1b
rjmp expand1_exit ; shares the epilogue of bmw_small_expand1
/*******************************************************************************
* void bmw_small_f1(uint32_t *q, const void *m, const void *h){
* uint8_t i;
* q[16] = bmw_small_expand1(0, m, h, q);
* q[17] = bmw_small_expand1(1, m, h, q);
* for(i=2; i<16; ++i){
* q[16+i] = bmw_small_expand2(i, m, h, q);
* }
* }
*/
; Register aliases for bmw_small_f1.
m0 = 2 ; m1:m0 = pointer m
m1 = 3
h0 = 4 ; h1:h0 = pointer h
h1 = 5
q0 = 6 ; q1:q0 = pointer q
q1 = 7
.global bmw_small_f1
; bmw_small_f1(q, m, h): fills q[16] .. q[31].
; In: r25:r24 = q, r23:r22 = m, r21:r20 = h
; The routine is exported, so every register it uses outside the scratch set
; (r2..r7, r16, r28:r29) is saved on entry and restored on exit. The original
; code skipped this and silently destroyed those registers for any caller
; that did not save them itself.
bmw_small_f1:
push_range 2, 7
push_range 28, 29
push r16
movw q0, r24
movw m0, r22
movw h0, r20
movw r28, q0
adiw r28, 63
adiw r28, 1 ; Y = &q[16] (offset 64 needs two adiw steps)
; q[16] = bmw_small_expand1(0, m, h, q); m and h are still in r23:r22 / r21:r20
clr r24
clr r25 /* not required */
movw r18, q0
rcall bmw_small_expand1
st Y+, r22
st Y+, r23
st Y+, r24
st Y+, r25
; q[17] = bmw_small_expand1(1, m, h, q)
ldi r16, 1
mov r24, r16
clr r25 /* not required */
movw r22, m0
movw r20, h0
movw r18, q0
rcall bmw_small_expand1
st Y+, r22
st Y+, r23
st Y+, r24
st Y+, r25
inc r16
; for(i = 2; i < 16; ++i) q[16+i] = bmw_small_expand2(i, m, h, q)
1:
mov r24, r16
movw r22, m0
movw r20, h0
movw r18, q0
rcall bmw_small_expand2
st Y+, r22
st Y+, r23
st Y+, r24
st Y+, r25
inc r16
cpi r16, 16
brne 1b
pop r16
pop_range 28, 29
pop_range 2, 7
ret
/*******************************************************************************
* uint16_t hack_table[5] PROGMEM = { 0x0311, 0xDDB3, 0x2A79, 0x07AA, 0x51C2 };
* uint8_t offset_table[5] PROGMEM = { 4+16, 6+16, 9+16, 12+16, 13+16 };
*
* void bmw_small_f0(uint32_t *h, const void *m, uint32_t *q){
* uint16_t hack_reg;
* uint8_t c,i,j;
* uint32_t(*s[])(uint32_t)={ bmw_small_s0, bmw_small_s1, bmw_small_s2,
* bmw_small_s3, bmw_small_s4 };
* for(i=0; i<16; ++i){
* ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
* }
* dump_x(h, 16, 'T');
* memset(q, 0, 4*16);
* c=4;
* do{
* i=15;
* j=pgm_read_byte(offset_table+c);
* hack_reg=pgm_read_word(&(hack_table[c]));
* do{
* if(hack_reg&1){
* q[i]-= h[j&15];
* }else{
* q[i]+= h[j&15];
* }
* --j;
* hack_reg>>= 1;
* }while(i--!=0);
* }while(c--!=0);
* dump_x(q, 16, 'W');
* for(i=0; i<16; ++i){
* q[i] = s[i%5](q[i]);
* }
* for(i=0; i<16; ++i){
* ((uint32_t*)h)[i] ^= ((uint32_t*)m)[i];
* }
* for(i=0; i<16; ++i){
* q[i] += h[(i+1)&0xf];
* }
* }
*
* param h: r24:r25
* param m: r22:r23
* param q: r20:r21
*/
; Register aliases for bmw_small_f0 and f0_helper. h, m and q are simply the
; registers in which the three pointer arguments arrive.
h0 = 24 ; h1:h0 = pointer h
h1 = 25
m0 = 22 ; m1:m0 = pointer m
m1 = 23
q0 = 20 ; q1:q0 = pointer q
q1 = 21
acc0 = 4 ; acc3:acc0 = word of q being updated
acc1 = 5
acc2 = 6
acc3 = 7
bcc0 = 8 ; bcc3:bcc0 = word of h that is added or subtracted
bcc1 = 9
bcc2 = 10
bcc3 = 11
hack = 16 ; r17:r16 holds the add/subtract bit pattern
; f0_helper: for r18 consecutive 32-bit words, q-word (at Z) += or -= h-word
; (at X).
; In:  Z = first q word, X = first h word, r18 = number of words (non-zero),
;      r17:r16 = bit pattern, consumed from bit 0 upwards:
;      bit set = subtract, bit clear = add
; Out: Z and X advanced past the processed words, r17:r16 shifted right by
;      r18 bits, r18 = 0
; Clobbers acc3:acc0 and bcc3:bcc0.
f0_helper:
20:
ldd acc0, Z+0
ldd acc1, Z+1
ldd acc2, Z+2
ldd acc3, Z+3
ld bcc0, X+
ld bcc1, X+
ld bcc2, X+
ld bcc3, X+
lsr r17 ; shift the 16-bit pattern right, carry = bit for this word
ror r16
brcs l20_sub
add acc0, bcc0
adc acc1, bcc1
adc acc2, bcc2
adc acc3, bcc3
rjmp l20_post
l20_sub:
sub acc0, bcc0
sbc acc1, bcc1
sbc acc2, bcc2
sbc acc3, bcc3
l20_post:
st Z+, acc0
st Z+, acc1
st Z+, acc2
st Z+, acc3
dec r18
brne 20b
ret
; Dispatch table for the q[i] = s[i%5](q[i]) step of bmw_small_f0: entry i
; (one instruction word each, i = 0..15) jumps to bmw_small_s(i % 5).
f0_jumptable:
rjmp bmw_small_s0
rjmp bmw_small_s1
rjmp bmw_small_s2
rjmp bmw_small_s3
rjmp bmw_small_s4
rjmp bmw_small_s0
rjmp bmw_small_s1
rjmp bmw_small_s2
rjmp bmw_small_s3
rjmp bmw_small_s4
rjmp bmw_small_s0
rjmp bmw_small_s1
rjmp bmw_small_s2
rjmp bmw_small_s3
rjmp bmw_small_s4
rjmp bmw_small_s0
.global bmw_small_f0
; bmw_small_f0(h, m, q): computes q[0] .. q[15] from h and m.
; In: r25:r24 = h, r23:r22 = m, r21:r20 = q
; h is xored with m for the duration of the computation and xored back
; afterwards. r1 is used as zero when q is cleared.
; The routine is exported, so every register it uses outside the scratch set
; (r4..r11, r16, r17, r28:r29) is saved on entry and restored on exit. The
; original code skipped this and silently destroyed those registers for any
; caller that did not save them itself.
bmw_small_f0:
push_range 28, 29
push_range 4, 11
push_range 16, 17
/* h[i] ^= m[i]; q[i]= 0 */
movw r26, h0 ; h
movw r30, m0 ; m
movw r28, q0 ; q
ldi r18, 64
1: ld r0, X
ld r19, Z+
eor r0, r19
st X+, r0
st Y+, r1
dec r18
brne 1b
;------
; Five accumulation passes over q[0..15]. Each pass starts at a different
; word of h and wraps around once, hence two f0_helper calls per pass. The
; constants in r17:r16 are the hack_table words of the C reference with the
; bit order reversed, because q is walked upwards here.
ldi r17, 0x88
ldi r16, 0xC0
movw r26, h0 ; X = h
adiw r26, 5*4
ldi r18, 16-5
movw r30, q0 ; Z = q
rcall f0_helper
movw r26, h0 ; X = h
ldi r18, 5
rcall f0_helper
;---
ldi r17, 0xCD
ldi r16, 0xBB
movw r26, h0 ; X = h
adiw r26, 7*4
ldi r18, 16-7
movw r30, q0 ; Z = q
rcall f0_helper
movw r26, h0 ; X = h
ldi r18, 7
rcall f0_helper
;---
ldi r17, 0x9E
ldi r16, 0x54
movw r26, h0 ; X = h
adiw r26, 10*4
ldi r18, 16-10
movw r30, q0 ; Z = q
rcall f0_helper
movw r26, h0 ; X = h
ldi r18, 10
rcall f0_helper
;---
ldi r17, 0x55
ldi r16, 0xE0
movw r26, h0 ; X = h
adiw r26, 13*4
ldi r18, 16-13
movw r30, q0 ; Z = q
rcall f0_helper
movw r26, h0 ; X = h
ldi r18, 13
rcall f0_helper
;---
ldi r17, 0x43
ldi r16, 0x8A
movw r26, h0 ; X = h
adiw r26, 14*4
ldi r18, 16-14
movw r30, q0 ; Z = q
rcall f0_helper
movw r26, h0 ; X = h
ldi r18, 14
rcall f0_helper
;--------------- h[i] ^= m[i]
movw r26, h0 ; h
movw r30, m0 ; m
ldi r18, 64
25: ld r0, X
ld r19, Z+
eor r0, r19
st X+, r0
dec r18
brne 25b
;--------------- q[i] = s[i%5](q[i])
; The s-routines overwrite r18..r27, r30 and r31, so the pointers move into
; registers they leave alone: bcc1:bcc0 = table position, bcc3:bcc2 = h,
; acc1:acc0 = q.
ldi r16, 16
ldi r30, pm_lo8(f0_jumptable)
ldi r31, pm_hi8(f0_jumptable)
movw bcc0, r30
movw bcc2, h0 ; h
movw acc0, q0 ; q
movw r28, q0 ; Y = q
30:
ldd r22, Y+0
ldd r23, Y+1
ldd r24, Y+2
ldd r25, Y+3
icall
st Y+, r22
st Y+, r23
st Y+, r24
st Y+, r25
movw r30, bcc0
adiw r30, 1
movw bcc0, r30
dec r16
brne 30b
;--------------- q[i] += h[(i+1)%16]
movw r30, acc0 ; q
movw r26, bcc2 ; h
adiw r26, 4
ldi r18, 15
; q[0..14] += h[1..15]; the carry survives the ld/st between the additions
40:
ld acc0, Z
ld acc1, X+
add acc0, acc1
st Z+, acc0
ld acc0, Z
ld acc1, X+
adc acc0, acc1
st Z+, acc0
ld acc0, Z
ld acc1, X+
adc acc0, acc1
st Z+, acc0
ld acc0, Z
ld acc1, X+
adc acc0, acc1
st Z+, acc0
dec r18
brne 40b
; q[15] += h[0]
movw r26, bcc2 ; h
ld acc0, Z
ld acc1, X+
add acc0, acc1
st Z+, acc0
ld acc0, Z
ld acc1, X+
adc acc0, acc1
st Z+, acc0
ld acc0, Z
ld acc1, X+
adc acc0, acc1
st Z+, acc0
ld acc0, Z
ld acc1, X+
adc acc0, acc1
st Z+, acc0
pop_range 16, 17
pop_range 4, 11
pop_range 28, 29
ret
/*******************************************************************************
* void bmw_small_f2(uint32_t *h, const uint32_t *q, const void *m){
* uint32_t xl=0, xh;
* uint8_t i;
* for(i=16;i<24;++i){
* xl ^= q[i];
* }
* xh = xl;
* for(i=24;i<32;++i){
* xh ^= q[i];
* }
* memcpy(h, m, 16*4);
* h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5);
* h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6);
* h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5);
* h[4] ^= SHR32(xh, 3) ^ q[20];
* h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6);
* h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5);
* h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8);
* h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2);
* for(i=0; i<8; ++i){
* h[i] += xl ^ q[24+i] ^ q[i];
* }
* for(i=0; i<8; ++i){
* h[8+i] ^= xh ^ q[24+i];
* h[8+i] += ROTL32(h[(4+i)%8],i+9);
* }
* h[11] += SHL32(xl, 4) ^ q[18] ^ q[11];
* h[10] += SHL32(xl, 6) ^ q[17] ^ q[10];
* h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8];
* h[15] += SHR32(xl, 2) ^ q[22] ^ q[15];
* h[12] += SHR32(xl, 3) ^ q[19] ^ q[12];
* h[13] += SHR32(xl, 4) ^ q[20] ^ q[13];
* h[ 9] += SHR32(xl, 6) ^ q[16] ^ q[ 9];
* h[14] += SHR32(xl, 7) ^ q[21] ^ q[14];
* }
*
* param h: r24:r25
* param q: r22:r23
* param m: r20:r21
*/
; Register aliases for bmw_small_f2, the modify_h_2 macro and tshiftr/tshiftl.
xl0 = 2 ; xl3:xl0 = xl
xl1 = 3
xl2 = 4
xl3 = 5
xh0 = 6 ; xh3:xh0 = xh
xh1 = 7
xh2 = 8
xh3 = 9
q0 = 10 ; q1:q0 = pointer q
q1 = 11
h0 = 12 ; h1:h0 = pointer h
h1 = 13
t0 = 14 ; t3:t0 = 32-bit temporary, shifted by tshiftr/tshiftl
t1 = 15
t2 = 16
t3 = 17
; modify_h_2 addr: adds (32-bit word at Y + 4*addr) xor t3:t0 to the 32-bit
; word at Z + 4*addr. Clobbers r0 and r22..r25; t3:t0, Y and Z are unchanged.
.macro modify_h_2 addr:req
ldd r22, Y+\addr*4+0
ldd r23, Y+\addr*4+1
ldd r24, Y+\addr*4+2
ldd r25, Y+\addr*4+3
eor r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ldd r0, Z+\addr*4+0
add r0, r22
std Z+\addr*4+0, r0
ldd r0, Z+\addr*4+1
adc r0, r23
std Z+\addr*4+1, r0
ldd r0, Z+\addr*4+2
adc r0, r24
std Z+\addr*4+2, r0
ldd r0, Z+\addr*4+3
adc r0, r25
std Z+\addr*4+3, r0
.endm
; tshiftr: logical right shift of the temporary t3:t0 by r20 bits.
; r20 must be 1..255 and is 0 on return.
tshiftr:
.Ltshiftr_step:
	lsr	t3
	ror	t2
	ror	t1
	ror	t0
	dec	r20
	brne	.Ltshiftr_step
	ret
; tshiftl: logical left shift of the temporary t3:t0 by r20 bits.
; r20 must be 1..255 and is 0 on return.
tshiftl:
.Ltshiftl_step:
	lsl	t0
	rol	t1
	rol	t2
	rol	t3
	dec	r20
	brne	.Ltshiftl_step
	ret
.global bmw_small_f2
; bmw_small_f2(h, q, m): folds q[0..31] and m into the new chaining value h.
; In: r25:r24 = h, r23:r22 = q, r21:r20 = m
; q[9] .. q[15] are overwritten during the computation.
; r1 is used as zero by rotl32p9 and must be 0 on entry.
; The routine is exported, so every register it uses outside the scratch set
; (r2..r17, r28:r29) is saved after the initial copy and restored on exit.
; The original code skipped this and silently destroyed those registers for
; any caller that did not save them itself.
bmw_small_f2:
/* memcpy(h, m, 64) */
movw r26, r24
movw r30, r20
ldi r18, 64
1: ld r0, Z+
st X+, r0
dec r18
brne 1b
push_range 28, 29
push_range 2, 17
movw q0, r22
movw h0, r24
/* calc xl */
/* for(i=16;i<24;++i){
xl ^= q[i];
}
*/
movw r26, q0
adiw r26, 63
adiw r26, 1 ; X points at q[16]
ld xl0, X+
ld xl1, X+
ld xl2, X+
ld xl3, X+
ldi r18, 8-1
20: ld r0, X+
eor xl0, r0
ld r0, X+
eor xl1, r0
ld r0, X+
eor xl2, r0
ld r0, X+
eor xl3, r0
dec r18
brne 20b
/* calc xh */
/* xh = xl
for(i=24;i<32;++i){
xh ^= q[i];
}
*/
movw xh0, xl0
movw xh2, xl2
ldi r18, 8
25: ld r0, X+
eor xh0, r0
ld r0, X+
eor xh1, r0
ld r0, X+
eor xh2, r0
ld r0, X+
eor xh3, r0
dec r18
brne 25b
/* h[0]..h[7] */
; t3:t0 carries the shifted copy of xh from one step to the next, so each
; step only applies the difference to the previous shift amount.
movw r30, h0
movw r28, q0
adiw r28, 60 ; Y points at q[15]
/* h[0] ^= SHL32(xh, 5) ^ SHR32(q[16], 5); */
movw t0, xh0
movw t2, xh2
ldi r20, 5
rcall tshiftl
ldd r22, Y+4
ldd r23, Y+5
ldd r24, Y+6
ldd r25, Y+7
ldi r20, 5
rcall shiftr32
eor r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ldd r0, Z+0
eor r22, r0
ldd r0, Z+1
eor r23, r0
ldd r0, Z+2
eor r24, r0
ldd r0, Z+3
eor r25, r0
std Z+0, r22
std Z+1, r23
std Z+2, r24
std Z+3, r25
/* h[5] ^= SHL32(xh, 6) ^ SHR32(q[21], 6); */
lsl t0 ; t = SHL32(xh, 5) shifted once more
rol t1
rol t2
rol t3
ldd r22, Y+24
ldd r23, Y+25
ldd r24, Y+26
ldd r25, Y+27
ldi r20, 6
rcall shiftr32
eor r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ldd r0, Z+20
eor r22, r0
ldd r0, Z+21
eor r23, r0
ldd r0, Z+22
eor r24, r0
ldd r0, Z+23
eor r25, r0
std Z+20, r22
std Z+21, r23
std Z+22, r24
std Z+23, r25
/* h[3] ^= SHR32(xh, 1) ^ SHL32(q[19], 5); */
movw t0, xh0
movw t2, xh2
lsr t3
ror t2
ror t1
ror t0
ldd r22, Y+16
ldd r23, Y+17
ldd r24, Y+18
ldd r25, Y+19
ldi r20, 5
rcall shiftl32
eor r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ldd r0, Z+12
eor r22, r0
ldd r0, Z+13
eor r23, r0
ldd r0, Z+14
eor r24, r0
ldd r0, Z+15
eor r25, r0
std Z+12, r22
std Z+13, r23
std Z+14, r24
std Z+15, r25
/* h[4] ^= SHR32(xh, 3) ^ q[20]; */
ldi r20, 2 ; 1 + 2 = 3
rcall tshiftr
ldd r22, Y+20
ldd r23, Y+21
ldd r24, Y+22
ldd r25, Y+23
eor r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ldd r0, Z+16
eor r22, r0
ldd r0, Z+17
eor r23, r0
ldd r0, Z+18
eor r24, r0
ldd r0, Z+19
eor r25, r0
std Z+16, r22
std Z+17, r23
std Z+18, r24
std Z+19, r25
/* h[6] ^= SHR32(xh, 4) ^ SHL32(q[22], 6); */
lsr t3
ror t2
ror t1
ror t0
ldd r22, Y+28
ldd r23, Y+29
ldd r24, Y+30
ldd r25, Y+31
ldi r20, 6
rcall shiftl32
eor r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ldd r0, Z+24
eor r22, r0
ldd r0, Z+25
eor r23, r0
ldd r0, Z+26
eor r24, r0
ldd r0, Z+27
eor r25, r0
std Z+24, r22
std Z+25, r23
std Z+26, r24
std Z+27, r25
/* h[2] ^= SHR32(xh, 5) ^ SHL32(q[18], 5); */
lsr t3
ror t2
ror t1
ror t0
ldd r22, Y+12
ldd r23, Y+13
ldd r24, Y+14
ldd r25, Y+15
ldi r20, 5
rcall shiftl32
eor r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ldd r0, Z+8
eor r22, r0
ldd r0, Z+9
eor r23, r0
ldd r0, Z+10
eor r24, r0
ldd r0, Z+11
eor r25, r0
std Z+8 , r22
std Z+9 , r23
std Z+10, r24
std Z+11, r25
/* h[1] ^= SHR32(xh, 7) ^ SHL32(q[17], 8); */
ldi r20, 2 ; 5 + 2 = 7
rcall tshiftr
; SHL32(q[17], 8) is done by loading the three low bytes one position up;
; its lowest byte is zero, so r22 is just t0
ldd r23, Y+8
ldd r24, Y+9
ldd r25, Y+10
mov r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ldd r0, Z+4
eor r22, r0
ldd r0, Z+5
eor r23, r0
ldd r0, Z+6
eor r24, r0
ldd r0, Z+7
eor r25, r0
std Z+4 , r22
std Z+5 , r23
std Z+6 , r24
std Z+7 , r25
/* h[7] ^= SHR32(xh,11) ^ SHL32(q[23], 2); */
ldi r20, 4 ; 7 + 4 = 11
rcall tshiftr
ldd r22, Y+32
ldd r23, Y+33
ldd r24, Y+34
ldd r25, Y+35
ldi r20, 2
rcall shiftl32
eor r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ldd r0, Z+28
eor r22, r0
ldd r0, Z+29
eor r23, r0
ldd r0, Z+30
eor r24, r0
ldd r0, Z+31
eor r25, r0
std Z+28, r22
std Z+29, r23
std Z+30, r24
std Z+31, r25
/* for(i=0; i<8; ++i){
* h[i] += xl ^ q[24+i] ^ q[i];
* }
*/
movw r26, q0 ; X points at q[0]
movw r28, q0
adiw r28, 63
adiw r28, 24*4-63 ; Y points at q[24]
ldi r18, 8
10:
movw t0, xl0
movw t2, xl2
ld r0, X+
eor t0, r0
ld r0, X+
eor t1, r0
ld r0, X+
eor t2, r0
ld r0, X+
eor t3, r0
ld r0, Y+
eor t0, r0
ld r0, Y+
eor t1, r0
ld r0, Y+
eor t2, r0
ld r0, Y+
eor t3, r0
ldd r22, Z+0
ldd r23, Z+1
ldd r24, Z+2
ldd r25, Z+3
add r22, t0
adc r23, t1
adc r24, t2
adc r25, t3
st Z+, r22
st Z+, r23
st Z+, r24
st Z+, r25
dec r18
brne 10b
; Z points to h[8]
/* for(i=0; i<8; ++i){
h[8+i] ^= xh ^ q[24+i];
h[8+i] += ROTL32(h[(4+i)%8],i+9);
}
*/
; Z points at h[8]
; r18 is already 0 here (left by the loop above) and serves as i
; clr r18
sbiw r28, 8*4 ; Y points at q[24]
movw r26, r30
sbiw r26, 4*4 ; X points at h[4]
15:
ldd t0, Z+0
ldd t1, Z+1
ldd t2, Z+2
ldd t3, Z+3
eor t0, xh0
eor t1, xh1
eor t2, xh2
eor t3, xh3
ld r0, Y+
eor t0, r0
ld r0, Y+
eor t1, r0
ld r0, Y+
eor t2, r0
ld r0, Y+
eor t3, r0
ld r22, X+
ld r23, X+
ld r24, X+
ld r25, X+
mov r20, r18
rcall rotl32p9 ; r25:r22 = ROTL32(h[(4+i)%8], i+9)
add t0, r22
adc t1, r23
adc t2, r24
adc t3, r25
st Z+, t0
st Z+, t1
st Z+, t2
st Z+, t3
inc r18
cpi r18, 4
brne 16f
movw r26, h0 ; after h[7] the source wraps around to h[0]
16:
sbrs r18, 3 ; leave the loop once i has reached 8
rjmp 15b
sbiw r30, 4*8 ; adjust Z to point at h[8]
sbiw r28, 16*4-1
sbiw r28, 1 ; adjust Y to point at q[16]
movw r26, r28
sbiw r26, 7*4 ; adjust X to point at q[9]
ldi r18, 7*4
; q[9+k] ^= q[16+k] for k = 0..6, so that the h[9] .. h[15] updates below
; only have to read one word of q each
20: /* now we do the memxor stuff */
ld t0, X
ld t1, Y+
eor t0, t1
st X+, t0
dec r18
brne 20b
; X points at q[16]
; Y points at q[23]
sbiw r26, 4*8 ; X points at q[8]
; t = SHL32(xl, 8), built by moving the bytes one position up
clr t0
mov t1, xl0
mov t2, xl1
mov t3, xl2
/* h[ 8] += SHL32(xl, 8) ^ q[23] ^ q[ 8]; */
ld r22, X+
ld r23, X+
ld r24, X+
ld r25, X+
ld r0, Y+
eor r22, r0
ld r0, Y+
eor r23, r0
ld r0, Y+
eor r24, r0
ld r0, Y+
eor r25, r0
eor r22, t0
eor r23, t1
eor r24, t2
eor r25, t3
ld r0, Z
add r0, r22
st Z+, r0
ld r0, Z
adc r0, r23
st Z+, r0
ld r0, Z
adc r0, r24
st Z+, r0
ld r0, Z
adc r0, r25
st Z+, r0
movw r28, r26
; Z points at h[9]
; X points at q[9] but it is not needed anymore
; Y points at q[9]
; modify_h_2 n works on h[9+n] and on the already combined q[9+n]
/* h[11] += SHL32(xl, 4) ^ q[11]; */
movw t0, xl0
movw t2, xl2
ldi r20, 4
rcall tshiftl
modify_h_2 2
/* h[10] += SHL32(xl, 6) ^ q[10]; */
ldi r20, 2
rcall tshiftl
modify_h_2 1
/* h[15] += SHR32(xl, 2) ^ q[15]; */
movw t0, xl0
movw t2, xl2
ldi r20, 2
rcall tshiftr
modify_h_2 6
/* h[12] += SHR32(xl, 3) ^ q[12]; */
ldi r20, 1
rcall tshiftr
modify_h_2 3
/* h[13] += SHR32(xl, 4) ^ q[13]; */
ldi r20, 1
rcall tshiftr
modify_h_2 4
/* h[ 9] += SHR32(xl, 6) ^ q[ 9]; */
ldi r20, 2
rcall tshiftr
modify_h_2 0
/* h[14] += SHR32(xl, 7) ^ q[14]; */
ldi r20, 1
rcall tshiftr
modify_h_2 5
bmw_small_f2_exit:
pop_range 2, 17
pop_range 28, 29
ret
#if DEBUG_FUNCTIONS
; cli_putb (debug only): passes the two hexadecimal digits of r24, high nibble
; first, to the external routine cli_putc (presumably a character output
; routine; it is not defined in this file).
; r2, r18..r26, r30 and r31 are saved and restored.
; r1 is used as zero for the address carry.
cli_putb:
push r2
push_range 18, 26
push_range 30, 31
mov r2, r24 ; keep the byte for the second digit
swap r24
andi r24, 0xf ; high nibble
ldi r30, lo8(hextable)
ldi r31, hi8(hextable)
add r30, r24
adc r31, r1
lpm r24, Z
clr r25
call cli_putc
mov r24, r2
andi r24, 0xf ; low nibble
ldi r30, lo8(hextable)
ldi r31, hi8(hextable)
add r30, r24
adc r31, r1
lpm r24, Z
clr r25
call cli_putc
pop_range 30, 31
pop_range 18, 26
pop r2
ret
; Upper-case hexadecimal digits, indexed by nibble value; read with lpm by
; cli_putb.
hextable:
.byte '0', '1', '2', '3', '4', '5', '6', '7'
.byte '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
; cli_putchar (debug only): calls the external cli_putc with the arguments
; unchanged, saving and restoring r18..r31 around the call.
cli_putchar:
push_range 18, 31
call cli_putc
pop_range 18, 31
ret
#endif
/*******************************************************************************
* void bmw_small_nextBlock(bmw_small_ctx_t *ctx, const void *block){
* uint32_t q[32];
* dump_x(block, 16, 'M');
* bmw_small_f0(ctx->h, block, q);
* dump_x(q, 16, 'Q');
* bmw_small_f1(q, block, ctx->h);
* dump_x(q, 32, 'Q');
* bmw_small_f2(ctx->h, q, block);
* ctx->counter += 1;
* ctx_dump(ctx);
* }
*
* param ctx: r24:r25
* param block: r22:r23
*/
; Register aliases for bmw_small_nextBlock.
h0 = 2 ; h1:h0 = ctx (the chaining value h is at offset 0 of the context)
h1 = 3
b0 = 4 ; b1:b0 = block
b1 = 5
q0 = 6 ; q1:q0 = pointer to the q buffer on the stack
q1 = 7
.global bmw_small_nextBlock
.global bmw224_nextBlock
.global bmw256_nextBlock
; In: r25:r24 = ctx, r23:r22 = block
; r2..r17 and r28:r29 are saved here and restored before returning.
; Allocates 32*4 bytes of stack for q[32].
; r1 is used as zero when the counter is incremented.
bmw_small_nextBlock:
bmw224_nextBlock:
bmw256_nextBlock:
push_range 28, 29
push_range 2, 17
; reserve q[32] on the stack; the adiw below turns the value left in Z by
; the macro into the address of the first byte of the new area
; NOTE(review): relies on stack_alloc_large leaving the new stack pointer
; in r31:r30 - confirm against avr-asm-macros.S
stack_alloc_large 32*4, 30, 31
adiw r30, 1
movw q0, r30
movw h0, r24
movw b0, r22
/* increment counter */
; the 32-bit counter is stored at ctx + 64, directly behind h[16]
movw r30, r24
adiw r30, 60
ldd r22, Z+4
ldd r23, Z+5
ldd r24, Z+6
ldd r25, Z+7
ldi r21, 1
add r22, r21
adc r23, r1
adc r24, r1
adc r25, r1
std Z+4, r22
std Z+5, r23
std Z+6, r24
std Z+7, r25
/* call bmw_small_f0(ctx->h, block, q) */
movw r24, h0
movw r22, b0
movw r20, q0
; the three pointers are parked on the stack and popped back in the
; argument order of the next call
; NOTE(review): assumes push_ and pop_ handle their arguments from left to
; right - confirm against avr-asm-macros.S
push_ q1, q0, b1, b0, h1, h0
rcall bmw_small_f0
/* call bmw_small_f1(q, block, ctx->h) */
pop_ 20, 21, 22, 23, 24, 25,
push_ 21, 20, 25, 24, 23, 22
rcall bmw_small_f1
/* call bmw_small_f2(ctx->h, q, block) */
pop_ 20, 21, 22, 23, 24, 25,
rcall bmw_small_f2
stack_free_large3 32*4
pop_range 2, 17
pop_range 28, 29
ret
/*******************************************************************************
* void bmw224_init(bmw224_ctx_t *ctx){
* uint8_t i;
* ctx->h[0] = 0x00010203;
* for(i=1; i<16; ++i){
* ctx->h[i] = ctx->h[i-1]+ 0x04040404;
* }
* ctx->counter=0;
* }
*
* param ctx: r24:r25
*/
.global bmw224_init
bmw224_init:
movw r26, r24
ldi r22, 0x03
ldi r23, 0x02
ldi r24, 0x01
ldi r25, 0x00
bmw_small_init:
st X+, r22
st X+, r23
st X+, r24
st X+, r25
ldi r18, 16-1
ldi r20, 0x04
1:
add r22, r20
adc r23, r20
adc r24, r20
adc r25, r20
st X+, r22
st X+, r23
st X+, r24
st X+, r25
dec r18
brne 1b
st X+, r1
st X+, r1
st X+, r1
st X+, r1
ret
.global bmw256_init
bmw256_init:
movw r26, r24
ldi r22, 0x43
ldi r23, 0x42
ldi r24, 0x41
ldi r25, 0x40
rjmp bmw_small_init
/*******************************************************************************
* void bmw_small_lastBlock(bmw_small_ctx_t *ctx, const void *block, uint16_t length_b){
* struct {
* uint8_t buffer[64];
* uint32_t ctr;
* } pctx;
* while(length_b >= BMW_SMALL_BLOCKSIZE){
* bmw_small_nextBlock(ctx, block);
* length_b -= BMW_SMALL_BLOCKSIZE;
* block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
* }
* memset(pctx.buffer, 0, 64);
* memcpy(pctx.buffer, block, (length_b+7)/8);
* pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
* if(length_b+1>64*8-64){
* bmw_small_nextBlock(ctx, pctx.buffer);
* memset(pctx.buffer, 0, 64-8);
* ctx->counter -= 1;
* }
* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
* bmw_small_nextBlock(ctx, pctx.buffer);
* uint8_t i;
* memset(pctx.buffer, 0xaa, 64);
* for(i=0; i<16;++i){
* pctx.buffer[i*4] = i+0xa0;
* }
* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
* memcpy(ctx->h, pctx.buffer, 64);
* }
*
* param ctx: r24:r25
* param block: r22:r23
* param length_b: r20:r21
*/
; Register aliases for bmw_small_lastBlock.
ctx0 = 2 ; ctx1:ctx0 = ctx
ctx1 = 3
blc0 = 4 ; blc1:blc0 = block
blc1 = 5
len0 = 28 ; len1:len0 = length_b (upper registers, cpi/subi are used on them)
len1 = 29
buf0 = 6 ; buf1:buf0 = pctx (64 byte buffer plus 4 byte counter) on the stack
buf1 = 7
.global bmw_small_lastBlock
.global bmw224_lastBlock
.global bmw256_lastBlock
; In: r25:r24 = ctx, r23:r22 = block, r21:r20 = length_b (in bits)
; r2..r7 and r28:r29 are saved and restored; 68 bytes of stack are allocated
; for pctx. r1 is used as zero throughout.
bmw_small_lastBlock:
bmw224_lastBlock:
bmw256_lastBlock:
/* while(length_b >= BMW_SMALL_BLOCKSIZE){
bmw_small_nextBlock(ctx, block);
length_b -= BMW_SMALL_BLOCKSIZE;
block = (uint8_t*)block + BMW_SMALL_BLOCKSIZE_B;
}
*/
push_range 2, 7
push_range 28, 29
movw ctx0, r24
movw blc0, r22
movw len0, r20
1:
cpi len1, hi8(512) ; length_b >= 512 exactly when its high byte is >= 2
brlo 2f
movw r24, ctx0
movw r22, blc0
rcall bmw_small_nextBlock
ldi r24, 64
add blc0, r24
adc blc1, r1
subi len1, hi8(512)
rjmp 1b
2:
/* struct {
uint8_t buffer[64];
uint32_t ctr;
} pctx;
*/
; NOTE(review): relies on stack_alloc_large leaving the new stack pointer in
; r31:r30 - confirm against avr-asm-macros.S
stack_alloc_large 68
adiw r30, 1
movw buf0, r30
/* memset(pctx.buffer, 0, 64);
memcpy(pctx.buffer, block, (length_b+7)/8);
pctx.buffer[length_b>>3] |= 0x80 >> (length_b&0x07);
*/ movw r24, len0
lsr r25
ror r24
lsr r24
lsr r24 ; r24 = length_b >> 3 = number of complete bytes (0..63)
; inc r24
ldi r23, 63
sub r23, r24 ; r23 = zero bytes to append after the padding byte
movw r26, blc0
tst r24
breq 301f
30: ld r20, X+ ; copy the complete bytes
st Z+, r20
dec r24
brne 30b
301:
clr r20
mov r21, len0
ldi r24, 0x80
andi r21, 0x07 ; r21 = number of bits in the last, partial byte
breq 305f
ld r20, X+ ; partial byte present: take it from the message
303:
lsr r24 ; r24 = 0x80 >> (length_b & 7)
dec r21
brne 303b
305:
or r20, r24 ; set the padding bit
st Z+, r20
tst r23
breq 32f
31: st Z+, r1 ; zero the rest of the buffer
dec r23
brne 31b
32:
/* if(length_b+1>64*8-64){ ; = 64*7-1 = 447 max(length_b)=511
bmw_small_nextBlock(ctx, pctx.buffer);
memset(pctx.buffer, 0, 64-8);
ctx->counter -= 1;
}
*/
; length_b >= 448 exactly when the high byte is 1 and the low byte >= 192
tst len1
breq 400f
cpi len0, 192
brlo 400f
movw r24, ctx0
movw r22, buf0
rcall bmw_small_nextBlock
movw r26, buf0
ldi r20, 64-8
350:
st X+, r1
dec r20
brne 350b
; r24:r21 = ctx->counter - 1; the decremented value is only used for the
; length field below and is not written back to the context
movw r30, ctx0
adiw r30, 60
ldd r21, Z+4
ldd r22, Z+5
ldd r23, Z+6
ldd r24, Z+7
subi r21, 1
sbc r22, r1
sbc r23, r1
sbc r24, r1
rjmp 410f
/* *((uint64_t*)&(pctx.buffer[64-8])) = (uint64_t)(ctx->counter*512LL)+(uint64_t)length_b;
bmw_small_nextBlock(ctx, pctx.buffer);
*/
400:
movw r30, ctx0
adiw r30, 60
ldd r21, Z+4 ; r24:r21 = ctx->counter
ldd r22, Z+5
ldd r23, Z+6
ldd r24, Z+7
410:
; counter * 512 is counter * 2 placed one byte up: r25:r21 = counter << 1
; forms bytes 1..5 of the length, byte 0 is the low byte of length_b
clr r25
lsl r21
rol r22
rol r23
rol r24
rol r25
mov r20, len0
add r21, len1
adc r22, r1
adc r23, r1
adc r24, r1
adc r25, r1
movw r30, buf0
adiw r30, 64-8
st Z+, r20
st Z+, r21
st Z+, r22
st Z+, r23
st Z+, r24
st Z+, r25
st Z+, r1
st Z+, r1
movw r24, ctx0
movw r22, buf0
rcall bmw_small_nextBlock
/* memset(pctx.buffer, 0xaa, 64);
for(i=0; i<16;++i){
pctx.buffer[i*4] = i+0xa0;
}
*/
ldi r18, 0xa0
ldi r19, 0xaa
movw r26, buf0
500:
st X+, r18
st X+, r19
st X+, r19
st X+, r19
inc r18
sbrs r18, 4 ; bit 4 becomes set when r18 reaches 0xb0, after 16 words
rjmp 500b
/* bmw_small_nextBlock((bmw_small_ctx_t*)&pctx, ctx->h);
memcpy(ctx->h, pctx.buffer, 64);
*/
movw r24, buf0
movw r22, ctx0
rcall bmw_small_nextBlock
ldi r18, 64
movw r26, ctx0
movw r30, buf0
600:
ld r20, Z+
st X+, r20
dec r18
brne 600b
stack_free_large 68
pop_range 28, 29
pop_range 2, 7
ret
/*******************************************************************************
* void bmw224_ctx2hash(void *dest, const bmw224_ctx_t *ctx){
* memcpy(dest, &(ctx->h[9]), 224/8);
* }
*
* param dest: r24:r25
* param ctx: r22:r23
*/
.global bmw224_ctx2hash
bmw224_ctx2hash:
movw r26, r24
movw r30, r22
adiw r30, 9*4
ldi r22, 28
rjmp 1f
/*******************************************************************************
* void bmw256_ctx2hash(void *dest, const bmw256_ctx_t *ctx){
* memcpy(dest, &(ctx->h[8]), 256/8);
* }
*
* param dest: r24:r25
* param ctx: r22:r23
*/
.global bmw256_ctx2hash
bmw256_ctx2hash:
movw r26, r24
movw r30, r22
adiw r30, 8*4
ldi r22, 32
1:
ld r23, Z+
st X+, r23
dec r22
brne 1b
ret
/*******************************************************************************
* void bmw256(void *dest, const void *msg, uint32_t length_b){
* bmw_small_ctx_t ctx;
* bmw256_init(&ctx);
* while(length_b>=BMW_SMALL_BLOCKSIZE){
* bmw_small_nextBlock(&ctx, msg);
* length_b -= BMW_SMALL_BLOCKSIZE;
* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
* }
* bmw_small_lastBlock(&ctx, msg, length_b);
* bmw256_ctx2hash(dest, &ctx);
* }
*
* param dest: r24:r25
* param msg: r22:r23
* param length_b: r18:r21
*/
; Register aliases for the one-shot functions (used by bmw_small_all).
ctx0 = 2				; ctx1:ctx0 = context on the stack
ctx1 = 3
msg0 = 4				; msg1:msg0 = msg
msg1 = 5
len0 = 6				; len3:len0 = length_b
len1 = 7
len2 = 8
len3 = 9
dst0 = 10				; dst1:dst0 = dest
dst1 = 11
	.global bmw256
; In: r25:r24 = dest, r23:r22 = msg, r21:r18 = length_b
; Selects entry 1 of init_lut / c2h_lut (the 256-bit variants) and continues
; in bmw_small_all, which pops r16 again before returning.
bmw256:
	push	r16
	ldi	r16, 1
	rjmp	bmw_small_all
/*******************************************************************************
* void bmw224(void *dest, const void *msg, uint32_t length_b){
* bmw_small_ctx_t ctx;
* bmw224_init(&ctx);
* while(length_b>=BMW_SMALL_BLOCKSIZE){
* bmw_small_nextBlock(&ctx, msg);
* length_b -= BMW_SMALL_BLOCKSIZE;
* msg = (uint8_t*)msg + BMW_SMALL_BLOCKSIZE_B;
* }
* bmw_small_lastBlock(&ctx, msg, length_b);
* bmw224_ctx2hash(dest, &ctx);
* }
*
* param dest: r24:r25
* param msg: r22:r23
* param length_b: r18:r21
*/
; Register aliases for bmw224 and the shared body bmw_small_all.
ctx0 = 2 ; ctx1:ctx0 = context on the stack
ctx1 = 3
msg0 = 4 ; msg1:msg0 = msg
msg1 = 5
len0 = 6 ; len3:len0 = length_b
len1 = 7
len2 = 8
len3 = 9
dst0 = 10 ; dst1:dst0 = dest
dst1 = 11
.global bmw224
; In: r25:r24 = dest, r23:r22 = msg, r21:r18 = length_b
; r16 selects the variant for bmw_small_all: 0 = 224 bit (here), 1 = 256 bit
; (set by bmw256). r2..r11 and r16 are saved and restored; 64+4 bytes of stack
; hold the context. r1 is used as zero.
bmw224:
push r16
clr r16
bmw_small_all:
push_range 2, 11
; NOTE(review): relies on stack_alloc_large leaving the new stack pointer in
; r31:r30 - confirm against avr-asm-macros.S
stack_alloc_large 64+4
adiw r30, 1
movw ctx0, r30
movw dst0, r24
movw msg0, r22
movw len0, r18
movw len2, r20
movw r24, ctx0
; call init_lut[r16](ctx)
ldi r30, pm_lo8(init_lut)
ldi r31, pm_hi8(init_lut)
add r30, r16
adc r31, r1
icall
; Whole blocks are consumed here only while length_b does not fit into 16
; bits; bmw_small_lastBlock takes a 16-bit length and processes the remaining
; whole blocks itself.
20:
mov r18, len2
or r18, len3
breq 50f
movw r24, ctx0
movw r22, msg0
rcall bmw_small_nextBlock
ldi r20, 2 ; length_b -= 512 (subtract 2 from byte 1 upwards)
sub len1, r20
sbc len2, r1
sbc len3, r1
ldi r20, 64 ; msg += 64
add msg0, r20
adc msg1, r1
rjmp 20b
50:
movw r24, ctx0
movw r22, msg0
movw r20, len0
rcall bmw_small_lastBlock
movw r24, dst0
movw r22, ctx0
; call c2h_lut[r16](dest, ctx)
ldi r30, pm_lo8(c2h_lut)
ldi r31, pm_hi8(c2h_lut)
add r30, r16
adc r31, r1
icall
stack_free_large 64+4
pop_range 2, 11
pop r16
ret
; Init dispatch table for bmw_small_all, one instruction word per entry,
; indexed by r16 (0 = 224 bit, 1 = 256 bit) and entered with icall.
init_lut:
rjmp bmw224_init
rjmp bmw256_init
; ctx2hash dispatch table for bmw_small_all, one instruction word per entry,
; indexed by r16 (0 = 224 bit, 1 = 256 bit) and entered with icall.
c2h_lut:
rjmp bmw224_ctx2hash
rjmp bmw256_ctx2hash