548 lines
9.2 KiB
ArmAsm
548 lines
9.2 KiB
ArmAsm
/* mqq160-sign_P-asm.S */
|
|
/*
|
|
This file is part of the AVR-Crypto-Lib.
|
|
Copyright (C) 2010 Daniel Otte (daniel.otte@rub.de)
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
/**
|
|
* \file mqq160-sign_P-asm.S
|
|
* \email daniel.otte@rub.de
|
|
* \author Daniel Otte
|
|
* \date 2010-03-21
|
|
* \license GPLv3 or later
|
|
*
|
|
*/
|
|
|
|
#include "avr-asm-macros.S"
|
|
|
|
#if 0
|
|
static void mqq_inv_affine_transformation(uint8_t *input_bytes, uint8_t *result, const mqq160_sign_key_t *key){
|
|
/* The matrix SInv is given as two permutations of 160 elements. */
|
|
uint8_t j, byteindex, bitindex, bitindex_d, byteindex_d, rp1, rp5;
|
|
uint8_t *r1_ptr, *r5_ptr;
|
|
uint8_t h1[20];
|
|
|
|
/* Initialize H1 and H2 = 0 */
|
|
memset(h1, 0, 20);
|
|
memset(result, 0, 20);
|
|
|
|
/*
|
|
Fill H1 with bits of InputBytes according to the RP1 permutation
|
|
and fill H2 with bits of InputBytes according to the RP5 permutation
|
|
*/
|
|
bitindex_d = 0x80;
|
|
byteindex_d = 0;
|
|
j=160;
|
|
r1_ptr = key->rp1;
|
|
r5_ptr = key->rp5;
|
|
do{
|
|
rp1 = pgm_read_byte(r1_ptr++);
|
|
rp5 = pgm_read_byte(r5_ptr++);
|
|
byteindex = rp1>>3;
|
|
bitindex = 0x80 >> (rp1&0x07);
|
|
if (input_bytes[byteindex] & bitindex){
|
|
h1[byteindex_d] ^= bitindex_d;
|
|
}
|
|
|
|
byteindex = rp5>>3;
|
|
bitindex = 0x80 >> (rp5&0x07);
|
|
if (input_bytes[byteindex] & bitindex){
|
|
result[byteindex_d] ^= bitindex_d;
|
|
}
|
|
bitindex_d >>= 1;
|
|
if(bitindex_d==0){
|
|
++byteindex_d;
|
|
bitindex_d = 0x80;
|
|
}
|
|
}while(--j);
|
|
|
|
for (j=0; j<20; j++){
|
|
result[j] ^= h1[j] ^ h1[pgm_read_byte(j+mod20_table)]
|
|
^ h1[pgm_read_byte(8+j+mod20_table)]
|
|
^ h1[pgm_read_byte(12+j+mod20_table)];
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/*
 * fetch_bit - copy one permuted input bit into the T flag
 *
 * in:   Z        -> next permutation entry in program memory (read with lpm)
 *       r25:r24  -> input bytes in RAM
 * out:  T flag   =  bit number (entry & 7), counted from the MSB, of
 *                   input[entry >> 3]
 *       Z        advanced by one entry
 * clobbers: r0, r28, r29 (Y) and the SREG arithmetic flags
 */
fetch_bit:
	lpm   r0, Z+                      /* r0 = permutation entry */
	ldi   r29, 0x80                   /* mask = 0x80 >> (entry & 7) */
	mov   r28, r0
	andi  r28, 7
	breq  .Lfetch_bit_mask_ready      /* shift count 0: mask stays 0x80 */
.Lfetch_bit_shift_mask:
	lsr   r29
	dec   r28
	brne  .Lfetch_bit_shift_mask
.Lfetch_bit_mask_ready:
	mov   r28, r0                     /* byte index = entry >> 3 */
	lsr   r28
	lsr   r28
	lsr   r28
	mov   r0, r29                     /* park the mask in r0, r29 becomes address high byte */
	clr   r29
	add   r28, r24                    /* Y = input + byte index */
	adc   r29, r25
	ld    r28, Y
	and   r28, r0                     /* Z flag <- selected bit is zero */
	clt                               /* clt only touches T, the Z flag survives */
	breq  .Lfetch_bit_done
	set
.Lfetch_bit_done:
	ret
|
|
|
|
/*
 * Register aliases used by mqq_inv_affine_transformation.
 * Several aliases share a register; each one is only used while the other
 * value living in that register is no longer needed.
 */
xres_0 = 18                               /* saved pointer to result (low/high) */
xres_1 = 19
h_0 = 20                                  /* base address of h1[] on the stack */
h_1 = 21
xrp5_0 = 22                               /* second permutation pointer (from key+8) */
xrp5_1 = 23
inp_0 = 24                                /* input_bytes pointer, read by fetch_bit */
inp_1 = 25
tmp_0 = 22                                /* mixing loop: accumulator for result[j] */
tmp_1 = 23                                /* mixing loop: byte loaded from h1[] */
tmp_2 = 24                                /* mixing loop: running pointer &h1[j] (pair r24:r25) */
tmp_3 = 25
tmp_4 = 18                                /* mixing loop: low nibble of the table entry */

/*
 * mqq_inv_affine_transformation
 *
 * param input_bytes: r24:r25   (RAM)
 * param result:      r22:r23   (RAM, 20 bytes written)
 * param key:         r20:r21   (key structure in RAM; the pointers stored at
 *                               key+6 and key+8 address permutation tables
 *                               in program memory)
 *
 * Step 1: h1[0..19]     = input bits gathered through the table at key+6
 * Step 2: result[0..19] = input bits gathered through the table at key+8
 * Step 3: result[j]    ^= h1[j] ^ three further h1 bytes chosen by
 *                         affine_mix_lut[j]
 *
 * r17 is saved and restored. r28/r29 (Y) are NOT saved here (the only
 * caller in this file, mqq160_sign_P, saves them itself). r1 is used as a
 * bit accumulator and is cleared again before the mixing step.
 * Also clobbers r0, r18..r27, r30, r31 and the T flag.
 */
;.global mqq_inv_affine_transformation
mqq_inv_affine_transformation:
	push  r17
	stack_alloc 20
	adiw  r30, 1                      /* Z points to stack space for h1 */
	movw  xres_0, r22                 /* keep result pointer, r22:r23 is reused below */
	movw  r28, r20                    /* Y points to the key struct in RAM */
	movw  r26, r30                    /* X points to h1[0] */
	movw  h_0, r30                    /* remember &h1[0] for the mixing step */
	ldd   xrp5_0, Y+8                 /* pointer to the second permutation */
	ldd   xrp5_1, Y+9
	ldd   r30, Y+6                    /* Z = pointer to the first permutation */
	ldd   r31, Y+7

	/* ---- step 1: build h1[], eight fetched bits per byte, MSB first ---- */
	ldi   r17, 20
.Linv_affine_h1_byte:
	.irp  bitno, 7, 6, 5, 4, 3, 2, 1, 0
	rcall fetch_bit
	bld   r1, \bitno
	.endr
	st    X+, r1
	dec   r17
	brne  .Linv_affine_h1_byte

	/* ---- step 2: build result[] the same way from the second table ---- */
	movw  r26, xres_0                 /* X points to result */
	movw  r30, xrp5_0
	ldi   r17, 20
.Linv_affine_res_byte:
	.irp  bitno, 7, 6, 5, 4, 3, 2, 1, 0
	rcall fetch_bit
	bld   r1, \bitno
	.endr
	st    X+, r1
	dec   r17
	brne  .Linv_affine_res_byte
	clr   r1                          /* r1 is used as a zero register below */

	/* ---- step 3: mix result[] with h1[] ---- */
	sbiw  r26, 20                     /* X back at result[0] */
	movw  tmp_2, h_0                  /* running pointer &h1[j] */
	ldi   r30, lo8(affine_mix_lut)
	ldi   r31, hi8(affine_mix_lut)
	ldi   r17, 20
.Linv_affine_mix:
	lpm   r0, Z+                      /* r0 = affine_mix_lut[j] */
	movw  r28, tmp_2
	ld    tmp_1, Y+                   /* h1[j] */
	movw  tmp_2, r28
	ld    tmp_0, X                    /* result[j] */
	eor   tmp_0, tmp_1

	movw  r28, h_0                    /* first extra byte: h1[entry & 0x0f] */
	mov   tmp_4, r0
	andi  tmp_4, 0x0f
	add   r28, tmp_4
	adc   r29, r1
	ld    tmp_1, Y
	eor   tmp_0, tmp_1

	adiw  r28, 4                      /* second: +4, or +8 when bit 7 is set */
	sbrc  r0, 7
	adiw  r28, 4
	ld    tmp_1, Y
	eor   tmp_0, tmp_1

	adiw  r28, 4                      /* third: +4, or +8 when bit 6 is set */
	sbrc  r0, 6
	adiw  r28, 4
	ld    tmp_1, Y
	eor   tmp_0, tmp_1

	st    X+, tmp_0
	dec   r17
	brne  .Linv_affine_mix

	stack_free 20
	pop   r17
	ret
|
|
|
|
/*
 * affine_mix_lut - one entry per output byte j = 0..19, read with lpm by the
 * mixing loop of mqq_inv_affine_transformation.
 *
 *   bits 3..0 : index k1 of the first extra h1 byte
 *   bit 7     : k2 = k1 + 8 when set, k1 + 4 when clear
 *   bit 6     : k3 = k2 + 8 when set, k2 + 4 when clear
 *
 * The table lives in program memory next to the code.
 */
affine_mix_lut:
	.byte 0x80 + 4, 0x80 + 5, 0x80 + 6, 0x80 + 7    /* j =  0.. 3 */
	.byte 0xC0 + 0, 0xC0 + 1, 0xC0 + 2, 0xC0 + 3    /* j =  4.. 7 */
	.byte 0x40 + 0, 0x40 + 1, 0x40 + 2, 0x40 + 3    /* j =  8..11 */
	.byte 0x40 + 4, 0x40 + 5, 0x40 + 6, 0x40 + 7    /* j = 12..15 */
	.byte 0x80 + 0, 0x80 + 1, 0x80 + 2, 0x80 + 3    /* j = 16..19 */
|
|
|
|
/******************************************************************************/
|
|
|
|
/*
 * Register aliases used by mqq_q (these redefine the tmp_* names for the
 * code that follows).
 */
xres = 20                                 /* b2, then b2 ^ e[8]; consumed MSB first */
tmp_0 = 23
tmp_1 = 22                                /* same register as b1: used only after b1 is consumed */
tmp_2 = 21
tmp_3 = 19                                /* same register as key (high): used only after key is consumed */
col = 25                                  /* column counter of the elimination */

/*
 * Helper for mqq_q: XOR the two-byte row at Z into the two-byte row at Y.
 * Clobbers tmp_0 and tmp_1.
 */
.macro mqq_q_xor_row_z_into_y
	ld    tmp_0, Y
	ld    tmp_1, Z
	eor   tmp_0, tmp_1
	st    Y, tmp_0
	ldd   tmp_0, Y+1
	ldd   tmp_1, Z+1
	eor   tmp_0, tmp_1
	std   Y+1, tmp_0
.endm

/*
 * mqq_q
 *
 * param i:   r24
 * param b1:  r22
 * param b2:  r20
 * param key: r18:r19   (key structure in RAM holding program-memory pointers)
 * X (r26:r27) must point to 25 bytes of scratch RAM supplied by the caller:
 *             e[0..8] followed by eight two-byte rows a[0..7].
 *
 * returns:   r24
 *
 * Outline:
 *   1. e[0..8] = nine bytes from the table at key+2 (i odd) or key+4 (i even).
 *   2. For every set bit of b1 (MSB first) nine bytes of the table at key+0
 *      are XORed into e[]: read with stride 9 for odd i, consecutively for
 *      even i.
 *   3. xres = b2 ^ e[8]. The bits of e[0..7] are regrouped into eight rows;
 *      row r has the matrix bits in its high byte and bit r (from the MSB)
 *      of xres in its low byte.
 *   4. Gauss-Jordan elimination over GF(2) on those rows.
 *   5. The low bits of a[0..7] are packed into r24, a[0] ending up in the MSB.
 *
 * Y is not saved (the caller in this file saves r28/r29). r1 is used as a
 * shift counter and is zero again on return. Also clobbers r0, r19, r20,
 * r22, r23, r25, r26, r30, r31 and the T flag.
 * NOTE: the pivot search in step 4 has no bound; it relies on every column
 * having a pivot row.
 */
;.global mqq_q
mqq_q:
	/* ---- step 1: load e[0..8] ---- */
	movw  r28, r18
	sbrs  r24, 0                      /* i odd: pointer at key+2, else at key+4 */
	adiw  r28, 2
	ldd   r30, Y+2
	ldd   r31, Y+3
	ldi   r28, 9
.Lmqq_q_load_e:
	lpm   r0, Z+
	st    X+, r0
	dec   r28
	brne  .Lmqq_q_load_e
	sbiw  r26, 9                      /* adjust X to point at e[0] */

	/* ---- step 2: fold the table selected by b1 into e[] ---- */
	movw  r28, r18
	ld    r30, Y+                     /* Z points to a[0] in progmem */
	ld    r31, Y
	sbrs  r24, 0
	rjmp  .Lmqq_q_even_bit

	/* odd i: bit k of b1 selects the bytes at offsets k, k+9, k+18, ... */
.Lmqq_q_odd_bit:
	sbrs  r22, 7
	rjmp  .Lmqq_q_odd_next
	ldi   r25, 9
	movw  r28, r30                    /* remember the table position */
.Lmqq_q_odd_xor:
	lpm   r0, Z
	adiw  r30, 9
	ld    r24, X                      /* i is no longer needed, r24 is scratch */
	eor   r24, r0
	st    X+, r24
	dec   r25
	brne  .Lmqq_q_odd_xor
	movw  r30, r28
	sbiw  r26, 9
.Lmqq_q_odd_next:
	adiw  r30, 1
	lsl   r22                         /* next bit; stop once no set bits remain */
	brne  .Lmqq_q_odd_bit
	rjmp  .Lmqq_q_inputs_done

	/* even i: bit k of b1 selects the nine consecutive bytes at offset 9*k */
.Lmqq_q_even_bit:
	sbrs  r22, 7
	rjmp  .Lmqq_q_even_next
	ldi   r25, 9
	movw  r28, r30
.Lmqq_q_even_xor:
	lpm   r0, Z+
	ld    r24, X
	eor   r24, r0
	st    X+, r24
	dec   r25
	brne  .Lmqq_q_even_xor
	movw  r30, r28
	sbiw  r26, 9
.Lmqq_q_even_next:
	adiw  r30, 9
	lsl   r22
	brne  .Lmqq_q_even_bit

.Lmqq_q_inputs_done:
	/* ---- step 3: all inputs are consumed, X points at e[0] ---- */
	movw  r28, r26
	ldd   r0, Y+8
	eor   xres, r0                    /* xres = b2 ^ e[8] */
	/*
	 * The bits of e[0..7] are the columns of a bit matrix. Each pass of the
	 * outer loop shifts the current top bit out of every e[k] and collects
	 * them into one row byte (e[0] ends up in bit 7). The row is stored as
	 * two bytes: low byte = next bit of xres, high byte = matrix row.
	 */
	adiw  r28, 9                      /* Y points at a[0] */
	ldi   r25, 8
.Lmqq_q_build_row:
	clr   tmp_0
	ldi   r24, 8
.Lmqq_q_take_msb:
	ld    tmp_1, X
	lsl   tmp_1
	st    X+, tmp_1                   /* st leaves the carry untouched */
	rol   tmp_0
	dec   r24
	brne  .Lmqq_q_take_msb
	clr   tmp_1
	lsl   xres
	rol   tmp_1
	st    Y+, tmp_1
	st    Y+, tmp_0
	sbiw  r26, 8                      /* X back at e[0] */
	dec   r25
	brne  .Lmqq_q_build_row

	/* ---- step 4a: forward elimination (upper triangular form) ---- */
	sbiw  r28, 16                     /* Y points at a[0] */
	movw  r30, r28                    /* Z points at a[0], the current pivot row */
	clr   col
	ldi   r24, 8
.Lmqq_q_fwd_col:
	ldi   tmp_3, 0x80                 /* tmp_3 = 0x80 >> col */
	mov   r1, col
	tst   r1
	breq  .Lmqq_q_fwd_mask_ready
.Lmqq_q_fwd_mask:
	lsr   tmp_3
	dec   r1
	brne  .Lmqq_q_fwd_mask
.Lmqq_q_fwd_mask_ready:
	clt                               /* T = 1 once the pivot is not the row at Z */
	movw  r28, r30                    /* Y points at a[row] */
.Lmqq_q_find_pivot:
	ldd   tmp_0, Y+1
	and   tmp_0, tmp_3
	brne  .Lmqq_q_pivot_found
	set
	adiw  r28, 2
	rjmp  .Lmqq_q_find_pivot
.Lmqq_q_pivot_found:
	/* Y points at the pivot row; if T is set, exchange [Y] and [Z] */
	brtc  .Lmqq_q_pivot_in_place
	ld    tmp_0, Y
	ld    tmp_1, Z
	st    Y, tmp_1
	st    Z, tmp_0
	ldd   tmp_0, Y+1
	ldd   tmp_1, Z+1
	std   Y+1, tmp_1
	std   Z+1, tmp_0
.Lmqq_q_pivot_in_place:
	ldi   r26, 7                      /* rows below the pivot: 7 - col */
	sub   r26, col
	breq  .Lmqq_q_fwd_next
	movw  r28, r30
.Lmqq_q_fwd_row:
	adiw  r28, 2
	ldd   tmp_0, Y+1
	and   tmp_0, tmp_3
	breq  .Lmqq_q_fwd_skip
	mqq_q_xor_row_z_into_y
.Lmqq_q_fwd_skip:
	dec   r26
	brne  .Lmqq_q_fwd_row
.Lmqq_q_fwd_next:
	adiw  r30, 2
	inc   col
	dec   r24
	brne  .Lmqq_q_fwd_col

	/* ---- step 4b: eliminate the 1s above the main diagonal ---- */
	ldi   col, 7
	ldi   tmp_3, 1
	sbiw  r30, 2                      /* Z points at a[7] */
.Lmqq_q_back_col:
	movw  r28, r30
	mov   r26, col                    /* number of rows above the pivot */
.Lmqq_q_back_row:
	sbiw  r28, 2
	ldd   tmp_0, Y+1
	and   tmp_0, tmp_3
	breq  .Lmqq_q_back_skip
	mqq_q_xor_row_z_into_y
.Lmqq_q_back_skip:
	dec   r26
	brne  .Lmqq_q_back_row
	sbiw  r30, 2
	lsl   tmp_3
	dec   col
	brne  .Lmqq_q_back_col

	/* ---- step 5: collect the low bits of a[0] ... a[7]; Z is at a[0] ---- */
	ldi   r25, 8
	clr   r24
.Lmqq_q_collect:
	ld    tmp_0, Z
	adiw  r30, 2
	lsr   tmp_0
	rol   r24
	dec   r25
	brne  .Lmqq_q_collect
mqq_q_exit:
	ret
|
|
|
|
/******************************************************************************/
|
|
|
|
/*
 * mqq160_sign_P
 *
 * param dest: r24:r25   (RAM, receives the 20 output bytes)
 * param hash: r22:r23   (RAM, input bytes)
 * param key:  r20:r21   (key structure in program memory, 10 bytes)
 *
 * Outline:
 *   1. Copy the 10-byte key structure from program memory to the stack.
 *   2. mqq_inv_affine_transformation(hash, dest, &key)
 *   3. r1[0] = dest[0]; for i = 1..19: r1[i] = mqq_q(i, r1[i-1], dest[i], &key)
 *   4. XOR a 20-byte constant into r1[]; byte n of the constant is built
 *      from the low nibbles of program-memory bytes 2n (high nibble) and
 *      2n+1 (low nibble) of the table addressed by the pointer at key+8.
 *   5. mqq_inv_affine_transformation(r1, dest, &key)
 *
 * r2..r11 and r28/r29 are saved and restored. The loops rely on r1 being
 * zero after each helper call.
 */

dest_0 = 2                                /* dest pointer */
dest_1 = 3
xr1_0 = 4                                 /* address of r1[0] on the stack */
xr1_1 = 5
key_0 = 6                                 /* address of the RAM copy of the key */
key_1 = 7
i = 8                                     /* loop index 1..19 */
c = 9                                     /* remaining iterations */
qstack_0 = 10                             /* 25-byte scratch area handed to mqq_q */
qstack_1 = 11

.global mqq160_sign_P
mqq160_sign_P:
	push_range 2, 11
	push_range 28, 29
	stack_alloc 10+20, r26, r27       /* r1[20] + key */
	adiw  r26, 1                      /* X points to stack memory */
	movw  key_0, r26

	/* ---- step 1: load key structure from program memory ---- */
	movw  r30, r20
	ldi   r18, 10
.Lsign_copy_key:
	lpm   r0, Z+
	st    X+, r0
	dec   r18
	brne  .Lsign_copy_key
	movw  dest_0, r24
	movw  xr1_0, r26                  /* r1[] starts right behind the key copy */

	/* ---- step 2: mqq_inv_affine_transformation(hash, dest, &key) ---- */
	movw  r20, key_0
	movw  r24, r22
	movw  r22, dest_0
	rcall mqq_inv_affine_transformation

	/* ---- step 3: r1[0] = dest[0], then chain mqq_q over i = 1..19 ---- */
	movw  r30, xr1_0
	movw  r26, dest_0
	ld    r0, X
	st    Z, r0

	ldi   r18, 19                     /* c is a low register, load it via r18 */
	mov   c, r18
	clr   i
	inc   i
	stack_alloc 25, r28, r29
	adiw  r28, 1
	movw  qstack_0, r28
.Lsign_q_round:
	movw  r26, xr1_0                  /* b1 = r1[i-1] */
	add   r26, i
	adc   r27, r1
	sbiw  r26, 1
	ld    r22, X
	movw  r26, dest_0                 /* b2 = dest[i] */
	add   r26, i
	adc   r27, r1
	ld    r20, X
	mov   r24, i
	movw  r18, key_0
	movw  r26, qstack_0               /* X = scratch area for mqq_q */
	rcall mqq_q
	movw  r26, xr1_0                  /* r1[i] = result */
	add   r26, i
	adc   r27, r1
	st    X, r24
	inc   i
	dec   c
	brne  .Lsign_q_round
	stack_free 25

	/* ---- step 4: XOR the nibble-packed constant into r1[] ---- */
	movw  r28, key_0
	ldd   r30, Y+8
	ldd   r31, Y+9
	movw  r26, xr1_0
	ldi   r18, 20
.Lsign_add_const:
	lpm   r20, Z+
	lpm   r21, Z+
	swap  r20                         /* low nibble of first byte -> high nibble */
	andi  r20, 0xF0
	andi  r21, 0x0F                   /* low nibble of second byte */
	or    r20, r21
	ld    r21, X
	eor   r21, r20
	st    X+, r21
	dec   r18
	brne  .Lsign_add_const

	/* ---- step 5: mqq_inv_affine_transformation(r1, dest, &key) ---- */
	movw  r20, key_0
	movw  r22, dest_0
	movw  r24, xr1_0
	rcall mqq_inv_affine_transformation

	stack_free 30
	pop_range 28, 29
	pop_range 2, 11
	ret
|
|
|
|
|