avr-crypto-lib/xtea-asm.S

586 lines
9.9 KiB
ArmAsm

/* xtea-asm.S */
/*
This file is part of the AVR-Crypto-Lib.
Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/* xtea-asm.S
* Author: Daniel Otte
* Date: 2006-06-06
* License: GPLv3 or later
* Implementation of XTEA for AVR
* Include xtea.h in your C project to use these functions.
*/
/* Register allocation shared by the routines in this file.
 * Every 32-bit value occupies four consecutive registers; suffix 1 is the
 * least significant byte and suffix 4 the most significant one.
 */

/* V0: first 32-bit word of the data block */
.equ V01, 2
.equ V02, 3
.equ V03, 4
.equ V04, 5

/* V1: second 32-bit word of the data block */
.equ V11, 6
.equ V12, 7
.equ V13, 8
.equ V14, 9

/* Accu: accumulator for the round function */
.equ Accu1, 14
.equ Accu2, 15
.equ Accu3, 16
.equ Accu4, 17

/* Sum: running XTEA sum. Sum3/Sum4 are r20/r21, the registers in which
 * param3 (key pointer) arrives, so the pointer has to be copied first. */
.equ Sum1, 18
.equ Sum2, 19
.equ Sum3, 20
.equ Sum4, 21

/* Func: second scratch word. It covers r22..r25, the registers in which
 * param2 and param1 arrive, so those pointers have to be copied first. */
.equ Func1, 22
.equ Func2, 23
.equ Func3, 24
.equ Func4, 25

/* C: the small counter for in-between use (shift counts, key byte offset) */
.equ C, 28
.global xtea_enc
; == xtea_enc ==
; XTEA encryption of a single 64-bit block, 32 rounds.
;  param1: 16-bit pointer to the destination for the encrypted block,
;          passed in r25:r24
;  param2: 16-bit pointer to the 64-bit block to encrypt,
;          passed in r23:r22
;  param3: 16-bit pointer to the 128-bit key,
;          passed in r21:r20
;
; One round is executed as two passes through the same loop body. The T flag
; selects the pass: T clear uses key[sum & 3], T set uses key[(sum >> 11) & 3].
; Each pass computes a value from V1, adds V0 to it and then exchanges the
; halves, so the same code serves both half-rounds. sum grows by delta
; between the first and the second pass of a round.
;
; r2-r9, r14-r17 and r28 are saved and restored. r0, r18-r27, r30, r31 and
; the T flag are overwritten. r1 is expected to contain zero.
;
xtea_enc:
        /* prologue: save the registers that are restored in the epilogue */
        push    r2
        push    r3
        push    r4
        push    r5
        push    r6
        push    r7
        push    r8
        push    r9
        push    r14
        push    r15
        push    r16
        push    r17
        push    r28

        /* read the input block: bytes 0..3 -> V0, bytes 4..7 -> V1 */
        movw    r26, r22                /* X = block */
        movw    r30, r20                /* Z = key, copied before Sum3/Sum4 reuse r20/r21 */
        ld      V01, X+
        ld      V02, X+
        ld      V03, X+
        ld      V04, X+
        ld      V11, X+
        ld      V12, X+
        ld      V13, X+
        ld      V14, X+
        movw    r26, r24                /* X = destination, copied before Func3/Func4 reuse r24/r25 */

        ldi     Func1, 32
        mov     r0, Func1               /* r0 = rounds still to do */
        clr     Sum1
        clr     Sum2
        movw    Sum3, Sum1              /* sum = 0 */
        clt                             /* start with the first pass of a round */

.Lenc_pass:
        /* Accu = V1 << 4 */
        movw    Accu1, V11
        movw    Accu3, V13
        ldi     C, 4
.Lenc_shl4:
        lsl     Accu1
        rol     Accu2
        rol     Accu3
        rol     Accu4
        dec     C
        brne    .Lenc_shl4

        /* Func = V1 >> 5 */
        movw    Func1, V11
        movw    Func3, V13
        ldi     C, 5
.Lenc_shr5:
        lsr     Func4
        ror     Func3
        ror     Func2
        ror     Func1
        dec     C
        brne    .Lenc_shr5

        /* Accu = ((V1 << 4) ^ (V1 >> 5)) + V1 */
        eor     Accu1, Func1
        eor     Accu2, Func2
        eor     Accu3, Func3
        eor     Accu4, Func4
        add     Accu1, V11
        adc     Accu2, V12
        adc     Accu3, V13
        adc     Accu4, V14

        /* C = byte offset of the key word used in this pass */
        brtc    .Lenc_key_first
        mov     C, Sum2                 /* bits 11..12 of sum are bits 3..4 of Sum2 */
        lsr     C
        andi    C, (0x03 << 2)          /* C = ((sum >> 11) & 3) * 4 */
        clt                             /* the following pass is a first pass again */
        rjmp    .Lenc_key_load
.Lenc_key_first:
        mov     C, Sum1
        andi    C, 0x03
        lsl     C
        lsl     C                       /* C = (sum & 3) * 4 */
        set                             /* the following pass is the second pass */
.Lenc_key_load:
        add     r30, C
        adc     r31, r1                 /* Z += C, r1 supplies the zero for the carry */
        ld      Func1, Z
        ldd     Func2, Z+1
        ldd     Func3, Z+2
        ldd     Func4, Z+3              /* Func = selected key word */
        sub     r30, C
        sbci    r31, 0                  /* Z -= C, back to the start of the key */

        /* Func = sum + key word */
        add     Func1, Sum1
        adc     Func2, Sum2
        adc     Func3, Sum3
        adc     Func4, Sum4

        /* Accu = (((V1 << 4) ^ (V1 >> 5)) + V1) ^ (sum + key word) */
        eor     Accu1, Func1
        eor     Accu2, Func2
        eor     Accu3, Func3
        eor     Accu4, Func4

        /* Accu += V0, then exchange the halves: V0 = old V1, V1 = Accu */
        add     Accu1, V01
        adc     Accu2, V02
        adc     Accu3, V03
        adc     Accu4, V04
        movw    V01, V11
        movw    V03, V13
        movw    V11, Accu1
        movw    V13, Accu3

        /* T already describes the following pass. T set: the second pass of
           this round comes next, so sum += delta (0x9E3779B9) first.
           T clear: the round is complete. */
        brtc    .Lenc_round_done
        ldi     C, 0xB9
        add     Sum1, C
        ldi     C, 0x79
        adc     Sum2, C
        ldi     C, 0x37
        adc     Sum3, C
        ldi     C, 0x9E
        adc     Sum4, C
        rjmp    .Lenc_pass
.Lenc_round_done:
        dec     r0
        breq    .Lenc_store
        rjmp    .Lenc_pass

.Lenc_store:
        /* write the encrypted block to the destination */
        st      X+, V01
        st      X+, V02
        st      X+, V03
        st      X+, V04
        st      X+, V11
        st      X+, V12
        st      X+, V13
        st      X+, V14

        /* epilogue: restore in reverse order of the prologue */
        pop     r28
        pop     r17
        pop     r16
        pop     r15
        pop     r14
        pop     r9
        pop     r8
        pop     r7
        pop     r6
        pop     r5
        pop     r4
        pop     r3
        pop     r2
        ret
;####################################################################
.global xtea_dec
; == xtea_dec ==
; XTEA decryption of a single 64-bit block, 32 rounds.
;  param1: 16-bit pointer to the destination for the decrypted block,
;          passed in r25:r24
;  param2: 16-bit pointer to the 64-bit block to decrypt,
;          passed in r23:r22
;  param3: 16-bit pointer to the 128-bit key,
;          passed in r21:r20
;
; One round is executed as two passes through the same loop body. The T flag
; selects the pass: T clear uses key[(sum >> 11) & 3], T set uses key[sum & 3].
; Each pass computes a value from V0, subtracts it from V1 and then exchanges
; the halves, so the same code serves both half-rounds. sum shrinks by delta
; between the first and the second pass of a round.
;
; r2-r9, r14-r17 and r28 are saved and restored. r0, r18-r27, r30, r31 and
; the T flag are overwritten. r1 is expected to contain zero.
;
/*
void xtea_dec(uint32_t* dest, uint32_t* v, uint32_t* k) {
uint32_t v0=v[0], v1=v[1], i;
uint32_t sum=0xC6EF3720, delta=0x9E3779B9;
for(i=0; i<32; i++) {
v1 -= ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
sum -= delta;
v0 -= ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
}
dest[0]=v0; dest[1]=v1;
}
*/
xtea_dec:
        /* prologue: save the registers that are restored in the epilogue */
        push    r2
        push    r3
        push    r4
        push    r5
        push    r6
        push    r7
        push    r8
        push    r9
        push    r14
        push    r15
        push    r16
        push    r17
        push    r28

        /* read the input block: bytes 0..3 -> V0, bytes 4..7 -> V1 */
        movw    r26, r22                /* X = block */
        movw    r30, r20                /* Z = key, copied before Sum3/Sum4 reuse r20/r21 */
        ld      V01, X+
        ld      V02, X+
        ld      V03, X+
        ld      V04, X+
        ld      V11, X+
        ld      V12, X+
        ld      V13, X+
        ld      V14, X+
        movw    r26, r24                /* X = destination, copied before Func3/Func4 reuse r24/r25 */

        ldi     Sum1, 32
        mov     r0, Sum1                /* r0 = rounds still to do */
        ldi     Sum1, 0x20              /* sum = 0xC6EF3720 */
        ldi     Sum2, 0x37
        ldi     Sum3, 0xEF
        ldi     Sum4, 0xC6
        clt                             /* start with the first pass of a round */

.Ldec_pass:
        /* Accu = V0 << 4 */
        movw    Accu1, V01
        movw    Accu3, V03
        ldi     C, 4
.Ldec_shl4:
        lsl     Accu1
        rol     Accu2
        rol     Accu3
        rol     Accu4
        dec     C
        brne    .Ldec_shl4

        /* Func = V0 >> 5 */
        movw    Func1, V01
        movw    Func3, V03
        ldi     C, 5
.Ldec_shr5:
        lsr     Func4
        ror     Func3
        ror     Func2
        ror     Func1
        dec     C
        brne    .Ldec_shr5

        /* Accu = ((V0 << 4) ^ (V0 >> 5)) + V0 */
        eor     Accu1, Func1
        eor     Accu2, Func2
        eor     Accu3, Func3
        eor     Accu4, Func4
        add     Accu1, V01
        adc     Accu2, V02
        adc     Accu3, V03
        adc     Accu4, V04

        /* C = byte offset of the key word used in this pass */
        brts    .Ldec_key_second
        mov     C, Sum2                 /* bits 11..12 of sum are bits 3..4 of Sum2 */
        lsr     C
        andi    C, (0x03 << 2)          /* C = ((sum >> 11) & 3) * 4 */
        set                             /* the following pass is the second pass */
        rjmp    .Ldec_key_load
.Ldec_key_second:
        mov     C, Sum1
        andi    C, 0x03
        lsl     C
        lsl     C                       /* C = (sum & 3) * 4 */
        clt                             /* the following pass is a first pass again */
.Ldec_key_load:
        add     r30, C
        adc     r31, r1                 /* Z += C, r1 supplies the zero for the carry */
        ld      Func1, Z
        ldd     Func2, Z+1
        ldd     Func3, Z+2
        ldd     Func4, Z+3              /* Func = selected key word */
        sub     r30, C
        sbci    r31, 0                  /* Z -= C, back to the start of the key */

        /* Func = sum + key word */
        add     Func1, Sum1
        adc     Func2, Sum2
        adc     Func3, Sum3
        adc     Func4, Sum4

        /* Accu = (((V0 << 4) ^ (V0 >> 5)) + V0) ^ (sum + key word) */
        eor     Accu1, Func1
        eor     Accu2, Func2
        eor     Accu3, Func3
        eor     Accu4, Func4

        /* V1 -= Accu */
        sub     V11, Accu1
        sbc     V12, Accu2
        sbc     V13, Accu3
        sbc     V14, Accu4

        /* exchange V0 and V1, using Accu as temporary storage */
        movw    Accu1, V01
        movw    Accu3, V03
        movw    V01, V11
        movw    V03, V13
        movw    V11, Accu1
        movw    V13, Accu3

        /* T already describes the following pass. T set: the second pass of
           this round comes next, so sum -= delta (0x9E3779B9) first.
           T clear: the round is complete. */
        brtc    .Ldec_round_done
        subi    Sum1, 0xB9
        sbci    Sum2, 0x79
        sbci    Sum3, 0x37
        sbci    Sum4, 0x9E
        rjmp    .Ldec_pass
.Ldec_round_done:
        dec     r0
        breq    .Ldec_store
        rjmp    .Ldec_pass

.Ldec_store:
        /* write the decrypted block to the destination */
        st      X+, V01
        st      X+, V02
        st      X+, V03
        st      X+, V04
        st      X+, V11
        st      X+, V12
        st      X+, V13
        st      X+, V14

        /* epilogue: restore in reverse order of the prologue */
        pop     r28
        pop     r17
        pop     r16
        pop     r15
        pop     r14
        pop     r9
        pop     r8
        pop     r7
        pop     r6
        pop     r5
        pop     r4
        pop     r3
        pop     r2
        ret
;####################################################################
#ifdef TWO_IN_ONE
/* Combined variant, assembled only when TWO_IN_ONE is defined. The idea is
to use one shared base structure for encryption and decryption. The operation
mode is signalled in the highest bit of param3 (the 16-bit pointer to the key).
The original author considered this acceptable because even the largest Atmel
part at the time had "only" 8k of RAM, but this feature should not be used
together with external RAM.

NOTE(review): this variant looks unfinished - confirm before enabling it:
 - there is no xtea_enc: label in front of the ori stub below, so nothing can
   enter at the stub;
 - the clt in front of the round loop overwrites the mode bit that bst copied
   into T, so the loop always runs the decryption sequence;
 - xtea_enc and xtea_dec are also defined unconditionally earlier in this
   file, so defining TWO_IN_ONE would presumably produce duplicate symbols.
*/
.global xtea_enc
/* intended encryption entry: mark the mode in bit 7 of the key pointer high byte */
ori r21, 0x80
.global xtea_dec
; == xtea_dec ==
; xtea decryption function
; param1: 16-bit pointer to destination for decrypted block
; given in r25,r24
; param2: 16-bit pointer to the block (64-bit) which is to decrypt
; given in r23,r22
; param3: 16-bit pointer to the key (128-bit)
; given in r21,r20
;
/*
void xtea_dec(uint32_t* dest, uint32_t* v, uint32_t* k) {
uint32_t v0=v[0], v1=v[1], i;
uint32_t sum=0xC6EF3720, delta=0x9E3779B9;
for(i=0; i<32; i++) {
v1 -= ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
sum -= delta;
v0 -= ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
}
dest[0]=v0; dest[1]=v1;
}
*/
xtea_dec:
/* prolog: save the registers that the epilog restores */
push r2
push r3
push r4
push r5
push r6
push r7
push r8
push r9
push r14
push r15
push r16
push r17
push r28
/* set T-bit if we are going to encrypt, clear otherwise */
bst r21, 7
andi r21, 0x7f /* fix r21:r20 to a real addr */
/* load the block */
movw r26, r22 /* X points to block */
movw r30, r20 /* Z points to key */
ld V01, X+
ld V02, X+
ld V03, X+
ld V04, X+
ld V11, X+
ld V12, X+
ld V13, X+
ld V14, X+
movw r26, r24 /* X points to destination */
ldi Sum1, 32
mov r0, Sum1 /* r0 is cycle-counter */
ldi Sum1, 0x20 /* sum = 0xC6EF3720 */
ldi Sum2, 0x37
ldi Sum3, 0xEF
ldi Sum4, 0xC6
/* NOTE(review): this clears the mode bit stored in T by bst above */
clt
/* one pass of the loop body is half a round; T tells the two passes apart */
1:
movw Accu1, V01
movw Accu3, V03
ldi C, 4
2: lsl Accu1
rol Accu2
rol Accu3
rol Accu4
dec C
brne 2b /* Accu == V0 << 4 */
movw Func1, V01
movw Func3, V03
ldi C, 5
3: lsr Func4
ror Func3
ror Func2
ror Func1
dec C
brne 3b /* Func == V0 >> 5 */
eor Accu1, Func1
eor Accu2, Func2
eor Accu3, Func3
eor Accu4, Func4
add Accu1, V01
adc Accu2, V02
adc Accu3, V03
adc Accu4, V04 /* Accu == ( (V0<<4)^(V0>>5) ) + V0 */
/* T clear: first pass, key offset from bits 11..12 of sum; T set: second pass */
brts 4f
mov C, Sum2
lsr C
andi C,(0x03 <<2) /* C = ((sum >> 11) & 3) * 4 */
set
rjmp 5f
4:
mov C, Sum1 /* calc key offset */
andi C, 0x03
lsl C
lsl C /* C = (sum & 3) * 4 */
clt
5:
/* add the offset to Z, read the key word, then take the offset off again */
add r30, C
adc r31, r1
ld Func1, Z
ldd Func2, Z+1
ldd Func3, Z+2
ldd Func4, Z+3 /* Func = key word selected by C */
sub r30, C
sbci r31, 0
add Func1, Sum1
adc Func2, Sum2
adc Func3, Sum3
adc Func4, Sum4
eor Accu1, Func1
eor Accu2, Func2
eor Accu3, Func3
eor Accu4, Func4 /* Accu = ((V0<<4 ^ V0>>5) + V0) ^ (sum + key word) */
/* V1 -= Accu */
sub V11, Accu1
sbc V12, Accu2
sbc V13, Accu3
sbc V14, Accu4
/* exchange V0 and V1, using Accu as temporary storage */
movw Accu1, V01
movw Accu3, V03
movw V01, V11
movw V03, V13
movw V11, Accu1
movw V13, Accu3
/* sum -= delta */ /* delta == 0x9E3779B9 */
/* only done when T is set, i.e. between the first and the second pass */
brtc 6f
subi Sum1, 0xB9
sbci Sum2, 0x79
sbci Sum3, 0x37
sbci Sum4, 0x9E
rjmp 1b
6:
/* both passes done: count the round in r0 */
dec r0
breq 7f
rjmp 1b
7:
/* write block back */
st X+, V01
st X+, V02
st X+, V03
st X+, V04
st X+, V11
st X+, V12
st X+, V13
st X+, V14
/* epilog: restore in reverse order of the prolog */
pop r28
pop r17
pop r16
pop r15
pop r14
pop r9
pop r8
pop r7
pop r6
pop r5
pop r4
pop r3
pop r2
ret
#endif