586 lines
9.9 KiB
ArmAsm
586 lines
9.9 KiB
ArmAsm
/* xtea-asm.S */
|
|
/*
|
|
This file is part of the Crypto-avr-lib/microcrypt-lib.
|
|
Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
/* xtea-asm.S
|
|
* Author: Daniel Otte
|
|
* Date: 2006-06-06
|
|
* License: GPLv3 or later
|
|
* Implementation of XTEA for AVR
|
|
* Include xtea.h in your C project to use these functions.
|
|
*/
|
|
|
|
/*
 * Symbolic names for the registers used by the XTEA routines.
 * In every four-register group the ...1 register holds the least
 * significant byte (the routines chain add/adc from ...1 up to ...4).
 */

/* first 32-bit half of the block (v0) */
.set V01, 2
.set V02, 3
.set V03, 4
.set V04, 5

/* second 32-bit half of the block (v1) */
.set V11, 6
.set V12, 7
.set V13, 8
.set V14, 9

/* 32-bit accumulator for the round function */
.set Accu1, 14
.set Accu2, 15
.set Accu3, 16
.set Accu4, 17

/* 32-bit running sum */
.set Sum1, 18
.set Sum2, 19
.set Sum3, 20
.set Sum4, 21

/* 32-bit scratch value (shifted half, then sum + key word) */
.set Func1, 22
.set Func2, 23
.set Func3, 24
.set Func4, 25

/* small counter / scratch register for intermediate use */
.set C, 28
|
|
|
|
.global xtea_enc
/*
 * == xtea_enc ==
 * XTEA encryption of one 64-bit block.
 *
 *   param1 (r25:r24): pointer to the destination for the encrypted block
 *   param2 (r23:r22): pointer to the 64-bit block that is to be encrypted
 *   param3 (r21:r20): pointer to the 128-bit key
 *
 * Per full round the routine computes
 *   v0  += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + key[sum & 3]);
 *   sum += 0x9E3779B9;
 *   v1  += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + key[(sum >> 11) & 3]);
 * as two passes over the same code. After each pass the two halves are
 * exchanged, so the pass always reads the V1x registers and adds onto V0x.
 *
 * Register roles:
 *   r0      number of full rounds left (starts at 32)
 *   T flag  clear: next pass is the first half of a round (key[sum & 3])
 *           set:   next pass is the second half (key[(sum >> 11) & 3])
 *   X       block pointer while loading, destination pointer afterwards
 *   Z       key pointer (restored after every key fetch)
 *   r1      added with carry as a zero constant; expected to hold 0 on entry
 */
xtea_enc:
	/* prolog: save every register the routine uses apart from
	   r0, r18-r27, r30 and r31 */
	push r2
	push r3
	push r4
	push r5
	push r6
	push r7
	push r8
	push r9
	push r14
	push r15
	push r16
	push r17
	push r28

	/* read the block through X */
	movw r26, r22
	ld V01, X+
	ld V02, X+
	ld V03, X+
	ld V04, X+
	ld V11, X+
	ld V12, X+
	ld V13, X+
	ld V14, X+
	movw r30, r20		/* Z points to the key */
	movw r26, r24		/* X points to the destination */

	ldi C, 32
	mov r0, C		/* r0 is the round counter */
	clr Sum1		/* sum = 0 */
	clr Sum2
	clr Sum3
	clr Sum4
	clt			/* start with the first half of a round */

.Lxtea_enc_pass:
	/* Accu = V1 << 4 */
	movw Accu1, V11
	movw Accu3, V13
	ldi C, 4
.Lxtea_enc_shl:
	lsl Accu1
	rol Accu2
	rol Accu3
	rol Accu4
	dec C
	brne .Lxtea_enc_shl

	/* Func = V1 >> 5 */
	movw Func1, V11
	movw Func3, V13
	ldi C, 5
.Lxtea_enc_shr:
	lsr Func4
	ror Func3
	ror Func2
	ror Func1
	dec C
	brne .Lxtea_enc_shr

	/* Accu = ((V1 << 4) ^ (V1 >> 5)) + V1 */
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4
	add Accu1, V11
	adc Accu2, V12
	adc Accu3, V13
	adc Accu4, V14

	/* C = byte offset of the key word for this pass; toggle T */
	brts .Lxtea_enc_key_hi
	mov C, Sum1
	lsl C
	lsl C
	andi C, (0x03 << 2)	/* C = (sum & 3) * 4 */
	set
	rjmp .Lxtea_enc_key_fetch
.Lxtea_enc_key_hi:
	mov C, Sum2
	lsr C
	andi C, (0x03 << 2)	/* C = ((sum >> 11) & 3) * 4 */
	clt

.Lxtea_enc_key_fetch:
	/* Func = selected key word; Z is moved forward and back again */
	add r30, C
	adc r31, r1
	ld Func1, Z
	ldd Func2, Z+1
	ldd Func3, Z+2
	ldd Func4, Z+3
	sub r30, C
	sbci r31, 0

	/* Func += sum */
	add Func1, Sum1
	adc Func2, Sum2
	adc Func3, Sum3
	adc Func4, Sum4

	/* Accu = (((V1 << 4) ^ (V1 >> 5)) + V1) ^ (sum + key word) */
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4

	/* Accu += V0 */
	add Accu1, V01
	adc Accu2, V02
	adc Accu3, V03
	adc Accu4, V04

	/* exchange the halves: V0 = V1, V1 = updated half */
	movw V01, V11
	movw V03, V13
	movw V11, Accu1
	movw V13, Accu3

	/* T set here means the first half of the round was just done */
	brts .Lxtea_enc_add_delta

	/* a full round is complete */
	dec r0
	breq .Lxtea_enc_store
	rjmp .Lxtea_enc_pass

.Lxtea_enc_add_delta:
	/* sum += delta, delta == 0x9E3779B9 */
	ldi C, 0xB9
	add Sum1, C
	ldi C, 0x79
	adc Sum2, C
	ldi C, 0x37
	adc Sum3, C
	ldi C, 0x9E
	adc Sum4, C
	rjmp .Lxtea_enc_pass

.Lxtea_enc_store:
	/* write the block to the destination */
	st X+, V01
	st X+, V02
	st X+, V03
	st X+, V04
	st X+, V11
	st X+, V12
	st X+, V13
	st X+, V14

	/* epilog: restore in reverse push order */
	pop r28
	pop r17
	pop r16
	pop r15
	pop r14
	pop r9
	pop r8
	pop r7
	pop r6
	pop r5
	pop r4
	pop r3
	pop r2
	ret
|
|
|
|
;####################################################################
|
|
|
|
/*
 * TWO_IN_ONE (a single routine shared by encryption and decryption, with
 * the mode passed in the highest bit of param3, the 16-bit key pointer) is
 * not active at this point: its preprocessor guards were commented out, so
 * xtea_enc above and xtea_dec below are always assembled as two independent
 * routines.
 *
 * Rationale of the flag, for reference: using the pointer's top bit is
 * acceptable because even the largest Atmel part at the time had "only" 8k
 * of RAM, but the feature must not be used together with external RAM.
 *
 * No instructions belong here. Anything placed between the ret of xtea_enc
 * and the xtea_dec label would be unreachable.
 */
|
|
|
|
.global xtea_dec
/*
 * == xtea_dec ==
 * XTEA decryption of one 64-bit block.
 *
 *   param1 (r25:r24): pointer to the destination for the decrypted block
 *   param2 (r23:r22): pointer to the 64-bit block that is to be decrypted
 *   param3 (r21:r20): pointer to the 128-bit key
 *
 * Reference implementation:
 *
 *   void xtea_dec(uint32_t* dest, uint32_t* v, uint32_t* k) {
 *       uint32_t v0=v[0], v1=v[1], i;
 *       uint32_t sum=0xC6EF3720, delta=0x9E3779B9;
 *       for(i=0; i<32; i++) {
 *           v1 -= ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
 *           sum -= delta;
 *           v0 -= ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
 *       }
 *       dest[0]=v0; dest[1]=v1;
 *   }
 *
 * Each round runs as two passes over the same code. After each pass the two
 * halves are exchanged, so the pass always reads the V0x registers and
 * subtracts from V1x.
 *
 * Register roles:
 *   r0      number of full rounds left (starts at 32)
 *   T flag  clear: next pass is the first half of a round (k[sum>>11 & 3])
 *           set:   next pass is the second half (k[sum & 3])
 *   X       block pointer while loading, destination pointer afterwards
 *   Z       key pointer (restored after every key fetch)
 *   r1      added with carry as a zero constant; expected to hold 0 on entry
 */
xtea_dec:
	/* prolog: save every register the routine uses apart from
	   r0, r18-r27, r30 and r31 */
	push r2
	push r3
	push r4
	push r5
	push r6
	push r7
	push r8
	push r9
	push r14
	push r15
	push r16
	push r17
	push r28

	/* read the block through X */
	movw r26, r22
	ld V01, X+
	ld V02, X+
	ld V03, X+
	ld V04, X+
	ld V11, X+
	ld V12, X+
	ld V13, X+
	ld V14, X+
	movw r30, r20		/* Z points to the key */
	movw r26, r24		/* X points to the destination */

	ldi C, 32
	mov r0, C		/* r0 is the round counter */
	ldi Sum1, 0x20		/* sum = 0xC6EF3720 */
	ldi Sum2, 0x37
	ldi Sum3, 0xEF
	ldi Sum4, 0xC6
	clt			/* start with the first half of a round */

.Lxtea_dec_pass:
	/* Accu = V0 << 4 */
	movw Accu1, V01
	movw Accu3, V03
	ldi C, 4
.Lxtea_dec_shl:
	lsl Accu1
	rol Accu2
	rol Accu3
	rol Accu4
	dec C
	brne .Lxtea_dec_shl

	/* Func = V0 >> 5 */
	movw Func1, V01
	movw Func3, V03
	ldi C, 5
.Lxtea_dec_shr:
	lsr Func4
	ror Func3
	ror Func2
	ror Func1
	dec C
	brne .Lxtea_dec_shr

	/* Accu = ((V0 << 4) ^ (V0 >> 5)) + V0 */
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4
	add Accu1, V01
	adc Accu2, V02
	adc Accu3, V03
	adc Accu4, V04

	/* C = byte offset of the key word for this pass; toggle T */
	brtc .Lxtea_dec_key_hi
	mov C, Sum1
	lsl C
	lsl C
	andi C, (0x03 << 2)	/* C = (sum & 3) * 4 */
	clt
	rjmp .Lxtea_dec_key_fetch
.Lxtea_dec_key_hi:
	mov C, Sum2
	lsr C
	andi C, (0x03 << 2)	/* C = ((sum >> 11) & 3) * 4 */
	set

.Lxtea_dec_key_fetch:
	/* Func = selected key word; Z is moved forward and back again */
	add r30, C
	adc r31, r1
	ld Func1, Z
	ldd Func2, Z+1
	ldd Func3, Z+2
	ldd Func4, Z+3
	sub r30, C
	sbci r31, 0

	/* Func += sum */
	add Func1, Sum1
	adc Func2, Sum2
	adc Func3, Sum3
	adc Func4, Sum4

	/* Accu = (((V0 << 4) ^ (V0 >> 5)) + V0) ^ (sum + key word) */
	eor Accu1, Func1
	eor Accu2, Func2
	eor Accu3, Func3
	eor Accu4, Func4

	/* V1 -= Accu */
	sub V11, Accu1
	sbc V12, Accu2
	sbc V13, Accu3
	sbc V14, Accu4

	/* exchange V0 and V1, using Accu as temporary */
	movw Accu1, V01
	movw Accu3, V03
	movw V01, V11
	movw V03, V13
	movw V11, Accu1
	movw V13, Accu3

	/* T set here means the first half of the round was just done */
	brts .Lxtea_dec_sub_delta

	/* a full round is complete */
	dec r0
	breq .Lxtea_dec_store
	rjmp .Lxtea_dec_pass

.Lxtea_dec_sub_delta:
	/* sum -= delta, delta == 0x9E3779B9 */
	subi Sum1, 0xB9
	sbci Sum2, 0x79
	sbci Sum3, 0x37
	sbci Sum4, 0x9E
	rjmp .Lxtea_dec_pass

.Lxtea_dec_store:
	/* write the block to the destination */
	st X+, V01
	st X+, V02
	st X+, V03
	st X+, V04
	st X+, V11
	st X+, V12
	st X+, V13
	st X+, V14

	/* epilog: restore in reverse push order */
	pop r28
	pop r17
	pop r16
	pop r15
	pop r14
	pop r9
	pop r8
	pop r7
	pop r6
	pop r5
	pop r4
	pop r3
	pop r2
	ret
|
|
|
|
/* #endif */
|
|
|
|
;####################################################################
|
|
|
|
#ifdef TWO_IN_ONE
/*
 * TWO_IN_ONE was intended to share one base structure between encryption
 * and decryption, signalling the operation mode in the highest bit of
 * param3 (the 16-bit key pointer in r21:r20). That is only acceptable while
 * RAM addresses stay below 0x8000, i.e. not together with external RAM.
 *
 * The variant was never completed. The code that used to be guarded by this
 * flag was a copy of the decryption routine which
 *   - defined the label xtea_dec a second time, so the file could not be
 *     assembled with TWO_IN_ONE defined,
 *   - had no xtea_enc entry label in front of its "ori r21, 0x80",
 *   - saved the mode bit in the T flag (bst r21, 7) and then cleared it
 *     again with clt, T being needed as the half-round toggle anyway,
 *   and therefore never encrypted.
 *
 * xtea_enc and xtea_dec are always assembled as separate, complete routines
 * earlier in this file, so defining TWO_IN_ONE now only produces the
 * diagnostic below instead of breaking the build.
 */
#warning "TWO_IN_ONE is not implemented; the separate xtea_enc and xtea_dec routines are used"
#endif
|
|
|