avr-crypto-lib/shabal/shabal-asm.S

740 lines
10 KiB
ArmAsm

/* shabal-asm.S */
/*
This file is part of the AVR-Crypto-Lib.
Copyright (C) 2006-2015 Daniel Otte (bg@nerilex.org)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* \file shabal-asm.S
* \author Daniel Otte
* \email bg@nerilex.org
* \date 2009-04-27
* \license GPLv3 or later
*/
#include "avr-asm-macros.S"
/******************************************************************************/
/*
void shabal_p(shabal_ctx_t *ctx, const void *m){
uint8_t i,j;
for(i=0;i<16;++i){
ctx->b[i] = ROTL32(ctx->b[i],17);
}
for(j=0;j<SHABAL_P;++j){
for(i=0;i<16;++i){
ctx->a[(i+16*j)%SHABAL_R] =
shabal_u(ctx->a[(i+16*j)%SHABAL_R]
^ shabal_v(ROTL32(ctx->a[(i+16*j+SHABAL_R-1)%SHABAL_R],15))
^ ctx->c[(8-i+16)%16])
^ ctx->b[(i+SHABAL_O1)%16]
^ ((ctx->b[(i+SHABAL_O2)%16]) & ~(ctx->b[(i+SHABAL_O3)%16]))
^ ((uint32_t*)m)[i];
ctx->b[i] = ROTL32(ctx->b[i], 1) ^ ~(ctx->a[(i+16*j)%SHABAL_R]);
}
}
for(j=0;j<36;++j){
ctx->a[j%SHABAL_R] += ctx->c[(j+3)%16];
}
}
*/
/*
 * Register aliases used by shabal_p (plain register numbers).
 *   MB0:MB1   pointer to the message block m
 *   AB0:AB1   pointer to ctx->a[0]
 *   BB0:BB1   pointer to the b words (loaded from ctx+8)
 *   CB0:CB1   pointer to the c words (loaded from ctx+10)
 *   AL0..AL3  32-bit word; carries the most recently computed a[] word
 *             from one inner-loop iteration to the next
 *   A0..A3    32-bit scratch word
 *   B0..B3    32-bit working copy of b[i]
 *   I, J      inner (0..15) and outer (0..2) loop counters
 *   T0..T3    temporaries; T0:T1 is r26:r27 (X), T2:T3 is r28:r29 (Y)
 */
MB0 = 2
MB1 = 3
AB0 = 4
AB1 = 5
BB0 = 6
BB1 = 7
CB0 = 8
CB1 = 9
AL0 = 10
AL1 = 11
AL2 = 12
AL3 = 13
A0 = 14
A1 = 15
A2 = 16
A3 = 17
B0 = 18
B1 = 19
B2 = 20
B3 = 21
I = 22
J = 23
T0 = 26
T1 = 27
T2 = 28
T3 = 29
/*
 * shabal_p - assembly version of the C reference shabal_p() quoted above.
 *
 * param ctx: r24:r25
 * param m: r22:r23
 *
 * Context layout as accessed by this routine:
 *   ctx+8  : 16-bit pointer to the b words
 *   ctx+10 : 16-bit pointer to the c words
 *   ctx+12 : a[0..11], 32-bit little-endian words
 *
 * r28:r29 are saved and restored with push/pop. push_range/pop_range come
 * from avr-asm-macros.S (not shown here); presumably they save/restore
 * r2..r17 - confirm against the macro file.
 * r0, r18..r27 and r30:r31 are overwritten without being saved.
 * r1 is used as a constant zero (adc rX, r1).
 * mod12table is read with lpm, i.e. from program memory.
 *
 * The .global directive below is commented out in the original, so the
 * symbol is only reachable from within this file.
 */
; .global shabal_p
shabal_p:
push_range 2, 17
push r28
push r29
/* keep the message pointer */
movw MB0, r22
/* Z = ctx+8: fetch the b and c pointers; afterwards Z = ctx+12 = &a[0] */
movw r30, r24
adiw r30, 8
ld BB0, Z+
ld BB1, Z+
ld CB0, Z+
ld CB1, Z+
movw AB0, r30
/*
 * b[i] = ROTL32(b[i], 17) for all 16 words, walking from b[15] down to b[0].
 * Z = b + 64, reached in two steps of 63 and 1.
 */
movw r30, BB0
adiw r30, 16*4-1
adiw r30, 1
ldi r24, 16
1:
/* load one word with pre-decrement; Z ends up at the start of the word */
ld A3, -Z
ld A2, -Z
ld A1, -Z
ld A0, -Z
/* rotate left by one: the mov/rol on r0 only moves bit 31 into carry */
mov r0, A3
rol r0
rol A0
rol A1
rol A2
rol A3
/* store with the 16-bit halves swapped: rotate by 1 + 16 = 17 */
std Z+0, A2
std Z+1, A3
std Z+2, A0
std Z+3, A1
dec r24
brne 1b
/* the last word handled was b[0]; keep its new (half-swapped) value in B */
movw B0, A2
movw B2, A0
/* load ctx->a[(i+16*j-1)%12]*/
/* for i = j = 0 this is a[11], at byte offset 4*11 from a[0] */
movw r26, AB0
adiw r26, 4*11
ld AL0, X+
ld AL1, X+
ld AL2, X+
ld AL3, X+
clr I
clr J
/*
 * Main loop: 3 * 16 iterations. On entry to each iteration AL holds the
 * previously computed a[] word and B holds b[I].
 */
1:
/* ROTL32(AL, 15)*/
/* swap the 16-bit halves (rotate by 16) into T ... */
movw T0, AL2
movw T2, AL0
/* ... then rotate right by one; mov/ror on r0 only moves bit 0 into carry */
mov r0, T0
ror r0
ror T3
ror T2
ror T1
ror T0
movw AL0, T0
movw AL2, T2
/* apply V to AL */
/* A = (AL << 2) + AL = 5 * AL */
movw A0, AL0
movw A2, AL2
lsl A0
rol A1
rol A2
rol A3
lsl A0
rol A1
rol A2
rol A3
add A0, AL0
adc A1, AL1
adc A2, AL2
adc A3, AL3
/* xor in ctx->c[(8-i+16)%16] */
/* T0 = ((24 - I) & 15) * 4 = byte offset into c */
ldi T0, 24
sub T0, I
andi T0, 0x0f
lsl T0
lsl T0
movw r30, CB0
add r30, T0
adc r31, r1
ld r0, Z+
eor A0, r0
ld r0, Z+
eor A1, r0
ld r0, Z+
eor A2, r0
ld r0, Z+
eor A3, r0
/* xor in ctx->a[(i+16*j)%12] */
/* table index = 16*J + I; swap turns J (0..2) into J*16 */
mov T0, J
swap T0 /* *=16 */
add T0, I
ldi r30, lo8(mod12table)
ldi r31, hi8(mod12table)
add r30, T0
adc r31, r1
/* T0 = byte offset of a[(i+16*j)%12], read from program memory */
lpm T0, Z
movw r30, AB0
add r30, T0
adc r31, r1
/* remember the address of this a[] word in T2:T3 for the store below */
movw T2, r30
ld r0, Z+
eor A0, r0
ld r0, Z+
eor A1, r0
ld r0, Z+
eor A2, r0
ld r0, Z+
eor A3, r0
/* AL = 3*A */
/* this is the U function: AL = (A << 1) + A */
movw AL0, A0
movw AL2, A2
lsl AL0
rol AL1
rol AL2
rol AL3
add AL0, A0
adc AL1, A1
adc AL2, A2
adc AL3, A3
/* xor in ctx->b[(i+13)%16] */
ldi T0, 13
add T0, I
andi T0, 0x0f
lsl T0
lsl T0
movw r30, BB0
add r30, T0
adc r31, r1
ld r0, Z+
eor AL0, r0
ld r0, Z+
eor AL1, r0
ld r0, Z+
eor AL2, r0
ld r0, Z+
eor AL3, r0
/* load ctx->b[(i+9)%16] into A */
ldi T0, 9
add T0, I
andi T0, 0x0f
lsl T0
lsl T0
movw r30, BB0
add r30, T0
adc r31, r1
ld A0, Z+
ld A1, Z+
ld A2, Z+
ld A3, Z+
/* and in ~(ctx->b[(i+6)%16]) */
ldi T0, 6
add T0, I
andi T0, 0x0f
lsl T0
lsl T0
movw r30, BB0
add r30, T0
adc r31, r1
ld r0, Z+
com r0
and A0, r0
ld r0, Z+
com r0
and A1, r0
ld r0, Z+
com r0
and A2, r0
ld r0, Z+
com r0
and A3, r0
/* xor A into AL */
eor AL0, A0
eor AL1, A1
eor AL2, A2
eor AL3, A3
/* xor m[i] into AL */
mov T0, I
lsl T0
lsl T0
movw r30, MB0
add r30, T0
adc r31, r1
ld r0, Z+
eor AL0, r0
ld r0, Z+
eor AL1, r0
ld r0, Z+
eor AL2, r0
ld r0, Z+
eor AL3, r0
/* A (AL) is done, now store it */
/* T2:T3 still holds the address of a[(i+16*j)%12] saved earlier */
movw r30, T2
st Z+, AL0
st Z+, AL1
st Z+, AL2
st Z+, AL3
/* process ctx->b[i] */
/* ROTL32(b, 1)*/
/* mov/rol on r0 only moves bit 31 of B into carry */
mov r0, B3
rol r0
rol B0
rol B1
rol B2
rol B3
/* xor in ~(ctx->a[(i+16*j)%SHABAL_R]) */
/* AL itself is left intact: it is the "previous a" of the next iteration */
movw A0, AL0
movw A2, AL2
com A0
com A1
com A2
com A3
eor B0, A0
eor B1, A1
eor B2, A2
eor B3, A3
/* store B */
movw r30, BB0
mov T0, I
lsl T0
lsl T0
add r30, T0
adc r31, r1
st Z+, B0
st Z+, B1
st Z+, B2
st Z+, B3
/* advance I; after 16 inner steps advance J; leave after 3 outer rounds */
inc I
cpi I, 16
brne local_reload
inc J
cpi J, 3
brne global_reload
rjmp addition
global_reload:
/* next outer round: restart the inner counter */
clr I
local_reload:
/* fetch b[I] for the next iteration into B */
mov T0, I
lsl T0
lsl T0
movw r30, BB0
add r30, T0
adc r31, r1
ld B0, Z+
ld B1, Z+
ld B2, Z+
ld B3, Z+
rjmp 1b
/*
 * Final addition: a[j%12] += c[(j+3)%16] for j = 0..35.
 * Z walks over a[], X walks over c[]. The 36 steps are split into runs so
 * that each pointer is reset exactly where its index wraps around.
 * Each 32-bit addition is done bytewise (add, then three adc); AL0 and A0
 * serve as byte temporaries here.
 */
addition:
clr J
movw r30, AB0
movw r26, CB0
adiw r26, 3*4
1:
/* J = 0..11 */
/* a[0..11] += c[3..14] */
ld AL0, X+
ld A0, Z
add A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
inc J
cpi J, 12
brne 1b
/* J = 12 */
/* a[] wraps: a[0] += c[15] */
movw r30, AB0
ld AL0, X+
ld A0, Z
add A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
inc J
/* J= 13..23*/
/* c[] wraps: a[1..11] += c[0..10] */
movw r26, CB0
1:
ld AL0, X+
ld A0, Z
add A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
inc J
cpi J, 24
brne 1b
/* J= 24..28*/
/* a[] wraps: a[0..4] += c[11..15] */
movw r30, AB0
1:
ld AL0, X+
ld A0, Z
add A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
inc J
cpi J, 29
brne 1b
/* J= 29..35*/
/* c[] wraps: a[5..11] += c[0..6] */
movw r26, CB0
1:
ld AL0, X+
ld A0, Z
add A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
ld AL0, X+
ld A0, Z
adc A0, AL0
st Z+, A0
inc J
cpi J, 36
brne 1b
exit:
/* restore the saved registers in reverse order of the prologue */
pop r29
pop r28
pop_range 2, 17
ret
/*
 * mod12table[16*j + i] = 4 * ((i + 16*j) % 12) for j = 0..2, i = 0..15:
 * the byte offset of a[(i+16*j)%12] relative to a[0]. 48 entries.
 */
mod12table:
.byte 0, 4, 8, 12, 16, 20, 24, 28
.byte 32, 36, 40, 44, 0, 4, 8, 12
.byte 16, 20, 24, 28, 32, 36, 40, 44
.byte 0, 4, 8, 12, 16, 20, 24, 28
.byte 32, 36, 40, 44, 0, 4, 8, 12
.byte 16, 20, 24, 28, 32, 36, 40, 44
/******************************************************************************/
/*
void shabal_nextBlock(shabal_ctx_t *ctx, const void *block){
uint8_t i;
uint32_t *t;
for(i=0;i<16;++i){
ctx->b[i] += ((uint32_t*)block)[i];
}
ctx->a[0] ^= ctx->w.w32[0];
ctx->a[1] ^= ctx->w.w32[1];
shabal_p(ctx, block);
for(i=0;i<16;++i){
ctx->c[i] -= ((uint32_t*)block)[i];
}
ctx->w.w64++;
t = ctx->c;
ctx->c = ctx->b;
ctx->b = t;
}
*/
/*
 * shabal_nextBlock - absorb one full 64-byte message block.
 *
 * param ctx:   r24:r25
 * param block: r22:r23
 *
 * Context layout as accessed here:
 *   ctx+0  .. ctx+7  : 64-bit block counter W (little endian)
 *   ctx+8  / ctx+9   : pointer to the b words
 *   ctx+10 / ctx+11  : pointer to the c words
 *   ctx+12 ..        : a[] (only the first 8 bytes are touched here)
 *
 * Steps: a[0..1] ^= W and W += 1 in a single pass, b[] += block,
 * shabal_p(ctx, block), c[] -= block, exchange the b and c pointers.
 * The counter is bumped before shabal_p runs rather than after it as in
 * the C reference; shabal_p as shown in this file never reads W.
 *
 * r1 is used as a constant zero. r22..r25 are not written before the
 * call, so shabal_p receives the caller's arguments unchanged.
 * push_range/pop_range come from avr-asm-macros.S (not shown here).
 */
MB0 = 14
MB1 = 15
CTX0 = 16
CTX1 = 17
.global shabal_nextBlock
shabal_nextBlock:
    push_range 14, 17
    movw MB0, r22                   /* block pointer, needed after the call */
    movw CTX0, r24                  /* context pointer, needed after the call */

    /* ---- a[0..1] ^= W ; W += 1 (8 bytes, carry set = "+1") ---- */
    ldi r19, 8
    movw r30, CTX0
    sec
nb_w_loop:
    ld r20, Z                       /* byte of W */
    ldd r21, Z+12                   /* matching byte of a[] */
    eor r21, r20                    /* eor leaves the carry flag alone */
    std Z+12, r21
    adc r20, r1                     /* propagate the increment */
    st Z+, r20
    dec r19                         /* dec leaves the carry flag alone */
    brne nb_w_loop

    /* ---- b[k] += block[k], k = 0..15 ; Z is at ctx+8 here ---- */
    ld r26, Z+
    ld r27, Z                       /* X = pointer to b */
    movw r30, MB0                   /* Z = block */
    ldi r19, 16
nb_add_loop:
    ld r18, Z+
    ld r0, X
    add r0, r18                     /* low byte starts a fresh carry chain */
    st X+, r0
    .rept 3                         /* remaining three bytes of the word */
    ld r18, Z+
    ld r0, X
    adc r0, r18
    st X+, r0
    .endr
    dec r19
    brne nb_add_loop

    /* ---- permutation; r24:r25 / r22:r23 still hold ctx / block ---- */
    rcall shabal_p

    /* ---- c[k] -= block[k], k = 0..15 ---- */
    movw r30, CTX0
    adiw r30, 10
    ld r26, Z+
    ld r27, Z                       /* X = pointer to c */
    movw r30, MB0                   /* Z = block */
    ldi r19, 16
nb_sub_loop:
    ld r18, Z+
    ld r0, X
    sub r0, r18                     /* low byte starts a fresh borrow chain */
    st X+, r0
    .rept 3                         /* remaining three bytes of the word */
    ld r18, Z+
    ld r0, X
    sbc r0, r18
    st X+, r0
    .endr
    dec r19
    brne nb_sub_loop

    /* ---- exchange the b and c pointers stored in the context ---- */
    movw r30, CTX0
    ldd r22, Z+8
    ldd r23, Z+9                    /* r22:r23 = old b pointer */
    ldd r24, Z+10
    ldd r25, Z+11                   /* r24:r25 = old c pointer */
    std Z+8, r24
    std Z+9, r25
    std Z+10, r22
    std Z+11, r23

    pop_range 14, 17
    ret
/******************************************************************************/
/*
void shabal_lastBlock(shabal_ctx_t *ctx, const void *block, uint16_t length_b){
uint8_t i,j;
uint32_t *t;
uint8_t buffer[64];
while(length_b>=SHABAL_BLOCKSIZE){
shabal_nextBlock(ctx, block);
block = (uint8_t*)block + SHABAL_BLOCKSIZE_B;
length_b -= SHABAL_BLOCKSIZE;
}
memset(buffer, 0, 64);
memcpy(buffer, block, (length_b+7)/8);
buffer[length_b/8] |= 0x80>>(length_b%8);
for(i=0;i<16;++i){
ctx->b[i] += ((uint32_t*)buffer)[i];
}
for(j=0; j<4;++j){
ctx->a[0] ^= ctx->w.w32[0];
ctx->a[1] ^= ctx->w.w32[1];
shabal_p(ctx, buffer);
t = ctx->c;
ctx->c = ctx->b;
ctx->b = t;
}
}
*/
/*
 * shabal_lastBlock - absorb the tail of the message and run the final rounds.
 *
 * param ctx:      r24:r25
 * param block:    r22:r23
 * param length_b: r20:r21   (length in bits)
 *
 * 1. While length_b >= 512, hand 64-byte blocks to shabal_nextBlock.
 * 2. Build a 64-byte buffer on the stack: the remaining whole bytes, then
 *    either the partial byte with the marker bit (0x80 >> (length_b % 8))
 *    or-ed in, or a plain 0x80 byte, then zero bytes up to 64.
 * 3. b[] += buffer.
 * 4. Four times: a[0..1] ^= W, shabal_p(ctx, buffer), exchange the b and c
 *    pointers. W itself is not modified here.
 *
 * Register aliases: I shares r16 with LEN0; the loop counter is only used
 * once the length is no longer needed. r1 is used as a constant zero.
 * push_range/pop_range/stack_alloc_large/stack_free_large come from
 * avr-asm-macros.S (not shown here). The code assumes stack_alloc_large
 * leaves Z one byte below the new buffer - confirm against the macro file.
 */
I = 16
LEN0 = 16
LEN1 = 17
CTX0 = 14
CTX1 = 15
MB0 = 12
MB1 = 13
.global shabal_lastBlock
shabal_lastBlock:
    push_range 12, 17
    movw LEN0, r20
    movw MB0, r22
    movw CTX0, r24

    /* ---- consume complete blocks: high byte >= 2 means length_b >= 512 ---- */
lb_full_blocks:
    cpi LEN1, 0x02
    brlo lb_build_buffer
    movw r22, MB0
    movw r24, CTX0
    rcall shabal_nextBlock
    ldi r18, 64
    add MB0, r18                    /* block pointer += 64 bytes */
    adc MB1, r1
    subi LEN1, 0x02                 /* length_b -= 512 */
    rjmp lb_full_blocks

    /* ---- assemble the padded block in a 64-byte stack buffer ---- */
lb_build_buffer:
    stack_alloc_large 64
    adiw r30, 1                     /* Z points at buffer */
    movw r26, MB0                   /* X = remaining message bytes */
    /* r24 = length_b / 8 (length_b < 512 here, so the result fits a byte) */
    movw r24, LEN0
    lsr r25
    ror r24
    lsr r24
    lsr r24
    /* r25 = 63 - r24 = number of zero bytes after the marker byte */
    ldi r25, 64-1
    sub r25, r24
    tst r24
    breq lb_marker
lb_copy:
    ld r0, X+
    st Z+, r0
    dec r24
    brne lb_copy
lb_marker:
    ldi r18, 0x80
    andi LEN0, 0x07                 /* number of bits in the partial byte */
    breq append_0x80
    ld r0, X+                       /* partial byte from the message */
lb_shift:
    lsr r18                         /* r18 = 0x80 >> (length_b % 8) */
    dec LEN0
    brne lb_shift
    or r0, r18
    st Z+, r0
    rjmp append_zeros
append_0x80:
    st Z+, r18                      /* byte-aligned message: plain 0x80 */
append_zeros:
    tst r25
    breq lb_rewind
lb_zero:
    st Z+, r1
    dec r25
    brne lb_zero

    /* ---- Z is 64 bytes past the buffer start: step back to the start ---- */
lb_rewind:
    sbiw r30, 32
    sbiw r30, 32
    movw MB0, r30                   /* from now on MB points at the buffer */

    /* ---- b[k] += buffer[k], k = 0..15 ---- */
    movw r26, CTX0
    adiw r26, 8
    ld r24, X+
    ld r25, X
    movw r26, r24                   /* X = pointer to b */
    ldi r18, 16
lb_add_loop:
    ld r25, Z+
    ld r24, X
    add r24, r25                    /* low byte starts a fresh carry chain */
    st X+, r24
    .rept 3                         /* remaining three bytes of the word */
    ld r25, Z+
    ld r24, X
    adc r24, r25
    st X+, r24
    .endr
    dec r18
    brne lb_add_loop

    /* ---- four finalisation rounds ---- */
    ldi I, 4
lb_final_round:
    /* a[0..1] ^= W ; Z has already advanced, hence offset 12-1 */
    ldi r19, 8
    movw r30, CTX0
lb_xor_w:
    ld r24, Z+
    ldd r25, Z+11
    eor r24, r25
    std Z+11, r24
    dec r19
    brne lb_xor_w
    /* shabal_p(ctx, buffer) */
    movw r22, MB0
    movw r24, CTX0
    rcall shabal_p
    /* exchange the b and c pointers stored in the context */
    movw r30, CTX0
    ldd r22, Z+8
    ldd r23, Z+9                    /* r22:r23 = old b pointer */
    ldd r24, Z+10
    ldd r25, Z+11                   /* r24:r25 = old c pointer */
    std Z+8, r24
    std Z+9, r25
    std Z+10, r22
    std Z+11, r23
    dec I
    brne lb_final_round

    stack_free_large 64
    pop_range 12, 17
    ret