/* shabal-asm.S */
/*
|
|
This file is part of the AVR-Crypto-Lib.
|
|
Copyright (C) 2009 Daniel Otte (daniel.otte@rub.de)
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
/*
|
|
* \file shabal-asm.S
|
|
* \author Daniel Otte
|
|
* \email daniel.otte@rub.de
|
|
* \date 2009-04-27
|
|
* \license GPLv3 or later
|
|
*/
|
|
|
|
#include "avr-asm-macros.S"
|
|
|
|
/******************************************************************************/
|
|
/*
void shabal_p(shabal_ctx_t* ctx, const void* m){
	uint8_t i,j;
	for(i=0;i<16;++i){
		ctx->b[i] = ROTL32(ctx->b[i],17);
	}
	for(j=0;j<SHABAL_P;++j){
		for(i=0;i<16;++i){
			ctx->a[(i+16*j)%SHABAL_R] =
				shabal_u(ctx->a[(i+16*j)%SHABAL_R]
					^ shabal_v(ROTL32(ctx->a[(i+16*j+SHABAL_R-1)%SHABAL_R],15))
					^ ctx->c[(8-i+16)%16])
				^ ctx->b[(i+SHABAL_O1)%16]
				^ ((ctx->b[(i+SHABAL_O2)%16]) & ~(ctx->b[(i+SHABAL_O3)%16]))
				^ ((uint32_t*)m)[i];
			ctx->b[i] = ROTL32(ctx->b[i], 1) ^ ~(ctx->a[(i+16*j)%SHABAL_R]);
		}
	}

	for(j=0;j<36;++j){
		ctx->a[j%SHABAL_R] += ctx->c[(j+3)%16];
	}
}
*/
/*
 * Register aliases used by shabal_p. The values are AVR register numbers.
 * Some of these names (MB0, MB1, I) are assigned again further down for the
 * other routines; each routine is written against the most recent assignment.
 */
MB0 = 2		/* MB1:MB0 = pointer to the message block m */
MB1 = 3
AB0 = 4		/* AB1:AB0 = address of ctx->a[0] */
AB1 = 5
BB0 = 6		/* BB1:BB0 = pointer read from ctx offset 8 (ctx->b) */
BB1 = 7
CB0 = 8		/* CB1:CB0 = pointer read from ctx offset 10 (ctx->c) */
CB1 = 9
AL0 = 10	/* AL3..AL0 = 32-bit word: previous/new value of the a[] element */
AL1 = 11
AL2 = 12
AL3 = 13
A0 = 14		/* A3..A0 = 32-bit scratch word */
A1 = 15
A2 = 16
A3 = 17
B0 = 18		/* B3..B0 = 32-bit word: current ctx->b[i] */
B1 = 19
B2 = 20
B3 = 21
I = 22		/* inner loop counter i (0..15) */
J = 23		/* outer loop counter j */
T0 = 26		/* T1:T0 overlay the X pointer (r27:r26); byte-sized temporaries */
T1 = 27
T2 = 28		/* T3:T2 overlay the Y pointer (r29:r28); hold the address of the a[] element being updated */
T3 = 29
/*
 * shabal_p - Shabal permutation; assembler version of the C routine
 * shabal_p() quoted in the reference comment earlier in this file.
 *
 * param ctx: r24:r25
 * param m: r22:r23
 *
 * Context layout as accessed by this routine:
 *   offset  8: 16-bit pointer to b[] (16 32-bit words)
 *   offset 10: 16-bit pointer to c[] (16 32-bit words)
 *   offset 12: a[] (12 32-bit words, stored inline)
 *
 * Steps: (1) rotate every b[i] left by 17 bits, (2) 3*16 rounds updating
 * a[(i+16*j)%12] and b[i], (3) 36 additions a[j%12] += c[(j+3)%16].
 *
 * Registers: r28/r29 are pushed and popped here; push_range/pop_range come
 * from avr-asm-macros.S (not visible here, presumably they save and restore
 * r2..r17). Written without being restored: r0, r18-r24, r26, r27, r30, r31.
 * The code uses r1 as a constant zero when propagating carries into the
 * high byte of a pointer, so r1 must be 0 on entry.
 */
; .global shabal_p
shabal_p:
	push_range 2, 17
	push r28
	push r29
	movw MB0, r22		/* keep m */
	movw r30, r24
	adiw r30, 8		/* Z = &ctx->b (pointer field) */
	ld BB0, Z+
	ld BB1, Z+
	ld CB0, Z+
	ld CB1, Z+
	movw AB0, r30		/* Z now points at a[0] */
	/* Z = b + 64 (one past b[15]); adiw takes at most 63, hence two steps */
	movw r30, BB0
	adiw r30, 16*4-1
	adiw r30, 1
	ldi r24, 16
	/*
	 * b[i] = ROTL32(b[i], 17), walking from b[15] down to b[0]:
	 * rotate left by one bit in registers, then store the bytes swapped
	 * by 16 bits.
	 */
1:
	ld A3, -Z
	ld A2, -Z
	ld A1, -Z
	ld A0, -Z
	mov r0, A3
	rol r0			/* C = bit 31, fed into bit 0 by the next rol */
	rol A0
	rol A1
	rol A2
	rol A3
	std Z+0, A2
	std Z+1, A3
	std Z+2, A0
	std Z+3, A1
	dec r24
	brne 1b
	/* the last word handled was b[0]; keep its rotated value in B */
	movw B0, A2
	movw B2, A0
	/* AL = ctx->a[11], i.e. a[(i+16*j-1)%12] for the first round */
	movw r26, AB0
	adiw r26, 4*11
	ld AL0, X+
	ld AL1, X+
	ld AL2, X+
	ld AL3, X+
	clr I
	clr J
	/*
	 * Round loop. On entry AL holds the a[] word written by the previous
	 * round and B holds the current b[I].
	 */
1:
	/* ROTL32(AL, 15): byte shuffle = rotate by 16, then rotate right by 1 */
	movw T0, AL2
	movw T2, AL0
	mov r0, T0
	ror r0			/* C = bit 0, fed into bit 31 by the next ror */
	ror T3
	ror T2
	ror T1
	ror T0
	movw AL0, T0
	movw AL2, T2
	/* apply V to AL: A = (AL << 2) + AL = 5*AL */
	movw A0, AL0
	movw A2, AL2
	lsl A0
	rol A1
	rol A2
	rol A3
	lsl A0
	rol A1
	rol A2
	rol A3
	add A0, AL0
	adc A1, AL1
	adc A2, AL2
	adc A3, AL3
	/* xor in ctx->c[(8-i+16)%16]; (24 - i) & 15 gives the same index */
	ldi T0, 24
	sub T0, I
	andi T0, 0x0f
	lsl T0
	lsl T0			/* word index -> byte offset */
	movw r30, CB0
	add r30, T0
	adc r31, r1
	ld r0, Z+
	eor A0, r0
	ld r0, Z+
	eor A1, r0
	ld r0, Z+
	eor A2, r0
	ld r0, Z+
	eor A3, r0
	/* xor in ctx->a[(i+16*j)%12]; byte offset is looked up in mod12table */
	mov T0, J
	swap T0 /* *=16 */
	add T0, I
	ldi r30, lo8(mod12table)
	ldi r31, hi8(mod12table)
	add r30, T0
	adc r31, r1
	lpm T0, Z
	movw r30, AB0
	add r30, T0
	adc r31, r1
	movw T2, r30		/* remember the address for the store below */
	ld r0, Z+
	eor A0, r0
	ld r0, Z+
	eor A1, r0
	ld r0, Z+
	eor A2, r0
	ld r0, Z+
	eor A3, r0
	/* AL = 3*A = (A << 1) + A */
	movw AL0, A0
	movw AL2, A2
	lsl AL0
	rol AL1
	rol AL2
	rol AL3
	add AL0, A0
	adc AL1, A1
	adc AL2, A2
	adc AL3, A3
	/* xor in ctx->b[(i+13)%16] */
	ldi T0, 13
	add T0, I
	andi T0, 0x0f
	lsl T0
	lsl T0
	movw r30, BB0
	add r30, T0
	adc r31, r1
	ld r0, Z+
	eor AL0, r0
	ld r0, Z+
	eor AL1, r0
	ld r0, Z+
	eor AL2, r0
	ld r0, Z+
	eor AL3, r0
	/* load ctx->b[(i+9)%16] into A */
	ldi T0, 9
	add T0, I
	andi T0, 0x0f
	lsl T0
	lsl T0
	movw r30, BB0
	add r30, T0
	adc r31, r1
	ld A0, Z+
	ld A1, Z+
	ld A2, Z+
	ld A3, Z+
	/* and in ~(ctx->b[(i+6)%16]) */
	ldi T0, 6
	add T0, I
	andi T0, 0x0f
	lsl T0
	lsl T0
	movw r30, BB0
	add r30, T0
	adc r31, r1
	ld r0, Z+
	com r0
	and A0, r0
	ld r0, Z+
	com r0
	and A1, r0
	ld r0, Z+
	com r0
	and A2, r0
	ld r0, Z+
	com r0
	and A3, r0
	/* xor A into AL */
	eor AL0, A0
	eor AL1, A1
	eor AL2, A2
	eor AL3, A3
	/* xor m[i] into AL */
	mov T0, I
	lsl T0
	lsl T0
	movw r30, MB0
	add r30, T0
	adc r31, r1
	ld r0, Z+
	eor AL0, r0
	ld r0, Z+
	eor AL1, r0
	ld r0, Z+
	eor AL2, r0
	ld r0, Z+
	eor AL3, r0
	/* A (AL) is done, now store it at the address saved in T3:T2 */
	movw r30, T2
	st Z+, AL0
	st Z+, AL1
	st Z+, AL2
	st Z+, AL3
	/* process ctx->b[i] */
	/* ROTL32(b, 1) */
	mov r0, B3
	rol r0
	rol B0
	rol B1
	rol B2
	rol B3
	/* xor in ~(ctx->a[(i+16*j)%SHABAL_R]); AL itself is kept for the next round */
	movw A0, AL0
	movw A2, AL2
	com A0
	com A1
	com A2
	com A3
	eor B0, A0
	eor B1, A1
	eor B2, A2
	eor B3, A3
	/* store B to ctx->b[i] */
	movw r30, BB0
	mov T0, I
	lsl T0
	lsl T0
	add r30, T0
	adc r31, r1
	st Z+, B0
	st Z+, B1
	st Z+, B2
	st Z+, B3
	/* advance i; after 16 values advance j; leave after j reaches 3 */
	inc I
	cpi I, 16
	brne local_reload
	inc J
	cpi J, 3
	brne global_reload
	rjmp addition
global_reload:
	clr I
local_reload:
	/* B = ctx->b[I] for the next round */
	mov T0, I
	lsl T0
	lsl T0
	movw r30, BB0
	add r30, T0
	adc r31, r1
	ld B0, Z+
	ld B1, Z+
	ld B2, Z+
	ld B3, Z+

	rjmp 1b
	/*
	 * a[j%12] += c[(j+3)%16] for j = 0..35.
	 * Z walks a[], X walks c[]; both are advanced linearly and re-based
	 * whenever the respective index wraps. ld/st leave the carry flag
	 * untouched, so each 32-bit addition is an add followed by three adc.
	 */
addition:
	clr J
	movw r30, AB0
	movw r26, CB0
	adiw r26, 3*4		/* X = &c[3] */
1:
	/* J = 0..11: a[0..11] += c[3..14] */
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	cpi J, 12
	brne 1b
	/* J = 12: a[] wraps to a[0], X is at c[15] */
	movw r30, AB0
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	/* J = 13..23: c[] wraps to c[0]; a[1..11] += c[0..10] */
	movw r26, CB0
1:
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	cpi J, 24
	brne 1b
	/* J = 24..28: a[] wraps to a[0]; a[0..4] += c[11..15] */
	movw r30, AB0
1:
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	cpi J, 29
	brne 1b

	/* J = 29..35: c[] wraps to c[0]; a[5..11] += c[0..6] */
	movw r26, CB0
1:
	ld AL0, X+
	ld A0, Z
	add A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	ld AL0, X+
	ld A0, Z
	adc A0, AL0
	st Z+, A0
	inc J
	cpi J, 36
	brne 1b
exit:
	pop r29
	pop r28
	pop_range 2, 17
	ret

/*
 * mod12table[k] = 4 * (k % 12) for k = 0..47: the byte offset of the 32-bit
 * word a[k % 12] inside ctx->a. shabal_p indexes it with k = 16*j + i and
 * reads it with lpm, so it has to stay in program memory.
 * One row per period of 12 entries.
 */
mod12table:
	.byte  0,  4,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44
	.byte  0,  4,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44
	.byte  0,  4,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44
	.byte  0,  4,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44

/******************************************************************************/
|
|
/*
void shabal_nextBlock(shabal_ctx_t* ctx, const void* block){
	uint8_t i;
	uint32_t* t;
	for(i=0;i<16;++i){
		ctx->b[i] += ((uint32_t*)block)[i];
	}
	ctx->a[0] ^= ctx->w.w32[0];
	ctx->a[1] ^= ctx->w.w32[1];
	shabal_p(ctx, block);
	for(i=0;i<16;++i){
		ctx->c[i] -= ((uint32_t*)block)[i];
	}
	ctx->w.w64++;
	t = ctx->c;
	ctx->c = ctx->b;
	ctx->b = t;
}
*/
/*
 * param ctx: r24:r25
 * param block: r22:r23
 */
/*
 * Register aliases for shabal_nextBlock. MB0/MB1 are assigned new values
 * here (shabal_p used them for r2/r3), so this group has to stay between
 * the two routines.
 */
MB0 = 14	/* MB1:MB0 = pointer to the message block */
MB1 = 15
CTX0 = 16	/* CTX1:CTX0 = pointer to the context */
CTX1 = 17
/*
 * shabal_nextBlock - absorb one 64-byte message block into the context.
 *
 * param ctx:   r24:r25
 * param block: r22:r23
 *
 * Sequence: a[0..1] ^= W and W += 1 (in one pass over the 8 bytes of W at
 * ctx offset 0), b[i] += block[i], shabal_p(ctx, block), c[i] -= block[i],
 * then the b and c pointers stored at ctx offsets 8 and 10 are exchanged.
 *
 * r14-r17 are handled by push_range/pop_range (macros from
 * avr-asm-macros.S, not visible here). r0 and r18-r27, r30, r31 are
 * overwritten. r1 is used as a constant zero.
 */
.global shabal_nextBlock
shabal_nextBlock:
	push_range 14, 17
	movw CTX0, r24
	movw MB0, r22
	/* xor W into A and increment W */
	movw r30, CTX0
	ldi r19, 8
	sec			/* carry in = 1 makes the adc chain below compute W + 1 */
1:
	ld r20, Z		/* byte of W */
	ldd r21, Z+(8+4)	/* matching byte of a[0..1] (a[] starts at offset 12) */
	eor r21, r20
	std Z+(8+4), r21
	adc r20, r1		/* r1 = 0: add only the running carry */
	st Z+, r20
	dec r19			/* dec leaves the carry flag alone */
	brne 1b
	/* add block to ctx->b; Z is now at ctx offset 8, the b pointer */
	ld r26, Z+
	ld r27, Z
	movw r30, MB0
	ldi r19, 16
1:
	ld r0, X
	ld r18, Z+
	add r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	adc r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	adc r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	adc r0, r18
	st X+, r0
	dec r19
	brne 1b
	/* call shabal_p; r24:r25 and r22:r23 still hold ctx and block */
	rcall shabal_p
	/* sub block from ctx->c */
	movw r30, CTX0
	adiw r30, 8+2		/* c pointer lives at ctx offset 10 */
	ld r26, Z+
	ld r27, Z
	movw r30, MB0
	ldi r19, 16
1:
	ld r0, X
	ld r18, Z+
	sub r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	sbc r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	sbc r0, r18
	st X+, r0
	ld r0, X
	ld r18, Z+
	sbc r0, r18
	st X+, r0
	dec r19
	brne 1b
	/* exchange ctx->b with ctx->c (swap the two pointers) */
	movw r30, CTX0
	ldd r22, Z+8
	ldd r23, Z+9
	ldd r24, Z+10
	ldd r25, Z+11
	std Z+10, r22
	std Z+11, r23
	std Z+8, r24
	std Z+9, r25
	pop_range 14, 17
	ret

/******************************************************************************/
|
|
/*
void shabal_lastBlock(shabal_ctx_t* ctx, const void* block, uint16_t length_b){
	uint8_t i,j;
	uint32_t* t;
	uint8_t buffer[64];
	while(length_b>=SHABAL_BLOCKSIZE){
		shabal_nextBlock(ctx, block);
		block = (uint8_t*)block + SHABAL_BLOCKSIZE_B;
		length_b -= SHABAL_BLOCKSIZE;
	}
	memset(buffer, 0, 64);
	memcpy(buffer, block, (length_b+7)/8);
	buffer[length_b/8] |= 0x80>>(length_b%8);
	for(i=0;i<16;++i){
		ctx->b[i] += ((uint32_t*)buffer)[i];
	}
	for(j=0; j<4;++j){
		ctx->a[0] ^= ctx->w.w32[0];
		ctx->a[1] ^= ctx->w.w32[1];
		shabal_p(ctx, buffer);
		t = ctx->c;
		ctx->c = ctx->b;
		ctx->b = t;
	}
}
*/
/*
 * Register aliases for shabal_lastBlock. I, CTX0/CTX1 and MB0/MB1 are
 * assigned new values here, so this group has to stay in front of
 * shabal_lastBlock and behind the routines that use the earlier values.
 */
I = 16		/* counter of the final loop; shares r16 with LEN0, which is no longer needed by then */
LEN0 = 16	/* LEN1:LEN0 = remaining message length in bits */
LEN1 = 17
CTX0 = 14	/* CTX1:CTX0 = pointer to the context */
CTX1 = 15
MB0 = 12	/* MB1:MB0 = pointer to the current block, later to the padded stack buffer */
MB1 = 13
/*
 * shabal_lastBlock - process the tail of a message and run the final rounds.
 *
 * param ctx: r24:r25
 * param block: r22:r23
 * param length_b: r20:r21 (length in bits)
 *
 * While at least 512 bits remain, whole blocks are handed to
 * shabal_nextBlock. The rest is copied into a 64-byte buffer on the stack,
 * a single 1 bit is appended after the last message bit and the buffer is
 * zero-filled. The buffer is added to b[], then four times:
 * a[0..1] ^= W, shabal_p(ctx, buffer), exchange the b and c pointers.
 *
 * r12-r17 are handled by push_range/pop_range; stack_alloc_large and
 * stack_free_large come from avr-asm-macros.S as well (not visible here).
 * r1 is used as a constant zero.
 */
.global shabal_lastBlock
shabal_lastBlock:
	push_range 12, 17
	movw CTX0, r24
	movw MB0, r22
	movw LEN0, r20
	/* full blocks: high byte >= 2 means length >= 512 bits */
1:
	cpi LEN1, 0x02
	brlo 2f
	movw r24, CTX0
	movw r22, MB0
	rcall shabal_nextBlock
	subi LEN1, 0x02		/* length -= 512 */
	ldi r18, 64
	add MB0, r18		/* block += 64 bytes */
	adc MB1, r1
	rjmp 1b
2:
	/* NOTE(review): assumes stack_alloc_large leaves the new stack pointer in r30:r31 - confirm in avr-asm-macros.S */
	stack_alloc_large 64
	adiw r30, 1 /* Z points at buffer */
	movw r26, MB0
	/* r24 = LEN/8 (number of complete message bytes, 0..63) */
	movw r24, LEN0
	lsr r25
	ror r24
	lsr r24
	lsr r24
	/* r25 = 63 - r24: zero bytes that follow the byte carrying the padding bit */
	ldi r25, 64-1
	sub r25, r24
	tst r24
	breq 32f
	/* copy the complete bytes */
31:
	ld r0, X+
	st Z+, r0
	dec r24
	brne 31b
32:
	ldi r18, 0x80
	andi LEN0, 0x07		/* LEN0 = number of message bits in the partial byte */
	breq append_0x80
	/* partial byte: or 0x80 >> (length % 8) into it */
	ld r0, X+
33:
	lsr r18
	dec LEN0
	brne 33b
	or r0, r18
	st Z+, r0
	rjmp append_zeros
append_0x80:
	/* message ends on a byte boundary: padding byte is 0x80 */
	st Z+, r18
append_zeros:
	tst r25
	breq 4f
34:
	st Z+, r1
	dec r25
	brne 34b
4:
	/* 64 bytes were written; step Z back to the start of the buffer */
	sbiw r30, 63
	sbiw r30, 1
	movw MB0, r30
	/* X = ctx->b (pointer stored at ctx offset 8) */
	movw r26, CTX0
	adiw r26, 8
	ld r24, X+
	ld r25, X
	movw r26, r24
	ldi r18, 16
	/* b[i] += buffer[i], 16 words of 32 bits */
41:
	ld r24, X
	ld r25, Z+
	add r24, r25
	st X+, r24
	ld r24, X
	ld r25, Z+
	adc r24, r25
	st X+, r24
	ld r24, X
	ld r25, Z+
	adc r24, r25
	st X+, r24
	ld r24, X
	ld r25, Z+
	adc r24, r25
	st X+, r24
	dec r18
	brne 41b
	/* final loop: four passes, W is not incremented here */
	ldi I, 4
5:
	/* xor W into A */
	movw r30, CTX0
	ldi r19, 8
51:
	ld r24, Z+
	ldd r25, Z+(8+4-1)	/* -1 compensates for the post-increment above */
	eor r24, r25
	std Z+(8+4-1), r24
	dec r19
	brne 51b
	movw r24, CTX0
	movw r22, MB0
	rcall shabal_p
	/* exchange ctx->b with ctx->c (swap the two pointers) */
	movw r30, CTX0
	ldd r22, Z+8
	ldd r23, Z+9
	ldd r24, Z+10
	ldd r25, Z+11
	std Z+10, r22
	std Z+11, r23
	std Z+8, r24
	std Z+9, r25
	dec I
	brne 5b

	stack_free_large 64
	pop_range 12, 17
	ret