
996 lines
17 KiB
Raw Normal View History

2008-05-26 19:13:21 +00:00
/* camellia-asm.S */
/*
    This file is part of the Crypto-avr-lib/microcrypt-lib.
    Copyright (C) 2008 Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
2008-07-03 04:11:34 +00:00
/**
 * File: camellia-asm.S
 * Author: Daniel Otte
 * Date: 2006-11-10
 * License: GPLv3 or later
 * Description: Implementation of the camellia block cipher algorithm.
 */
/*
 * SWAP_R A, B -- exchange the contents of registers A and B in place with
 * three EORs, so no scratch register is needed.
 * A and B must be different registers: with A == B the first EOR clears
 * the register and its value is lost.
 * Side effect: the EORs change SREG.
 */
.macro SWAP_R A, B
	eor \A, \B
	eor \B, \A
	eor \A, \B
.endm
/*
 * precall -- save r0, r1, r18-r27, r30 and r31 on the stack (14 bytes) and
 * then clear r1, which this file uses as its constant-zero register (NULLr).
 * Must be paired with postcall, which pops the same registers in reverse
 * order.
 */
.macro precall
	push r0
	push r1
	push r18
	push r19
	push r20
	push r21
	push r22
	push r23
	push r24
	push r25
	push r26
	push r27
	push r30
	push r31
	clr r1
.endm
/*
 * postcall -- counterpart of precall: restore r31, r30, r27-r18, r1 and r0
 * from the stack, in exactly the reverse order of the pushes in precall.
 */
.macro postcall
	pop r31
	pop r30
	pop r27
	pop r26
	pop r25
	pop r24
	pop r23
	pop r22
	pop r21
	pop r20
	pop r19
	pop r18
	pop r1
	pop r0
.endm
/*
 * hexdump length -- debug helper: dump `length` bytes starting at X (r26:r27).
 *
 * Each chunk is preceded by CR and LF sent with uart_putc (character in
 * r24), then uart_hexdump is called with the pointer in r24:r25 and the
 * byte count in r22:r23. For length > 16 the macro emits one 16-byte chunk,
 * advances X by 16 and expands itself again for the remainder; the .else
 * branch ends the recursion with the final chunk of at most 16 bytes.
 *
 * `length` must be an assemble-time constant.
 * Modifies r22-r25 and X directly, plus whatever uart_putc and uart_hexdump
 * modify (both are defined outside this file).
 */
.macro hexdump length
	push r27
	push r26
	ldi r25, '\r'
	mov r24, r25
	call uart_putc
	ldi r25, '\n'
	mov r24, r25
	call uart_putc
	pop r26
	pop r27
	movw r24, r26
.if \length > 16
	ldi r22, lo8(16)
	ldi r23, hi8(16)
	push r27			/* keep X across the call */
	push r26
	call uart_hexdump
	pop r26
	pop r27
	adiw r26, 16			/* next chunk */
	hexdump \length-16
.else
	ldi r22, lo8(\length)
	ldi r23, hi8(\length)
	call uart_hexdump
.endif
.endm
/*
 * dbg_hexdump length -- register-preserving debug dump.
 * X (r26:r27) points to the block to print. The dump is bracketed by
 * precall/postcall so that the registers those macros save (r0, r1,
 * r18-r27, r30, r31) are unchanged for the surrounding code.
 */
.macro dbg_hexdump length
	precall
	hexdump \length
	postcall
.endm
/* Addresses used with `in` for the stack pointer halves and status register */
.equ SPL,   0x3D
.equ SPH,   0x3E
.equ SREG,  0x3F
/* Number of the register this file treats as constant zero (r1) */
.equ NULLr, 1
/*
 * camellia_sbox -- 256-entry byte substitution table.
 * camellia_s1..camellia_s4 index it with Z and read it with lpm, so it has
 * to be placed in program memory.
 */
camellia_sbox:
.byte 112, 130, 44, 236, 179, 39, 192, 229, 228, 133, 87, 53, 234, 12, 174, 65
.byte 35, 239, 107, 147, 69, 25, 165, 33, 237, 14, 79, 78, 29, 101, 146, 189
.byte 134, 184, 175, 143, 124, 235, 31, 206, 62, 48, 220, 95, 94, 197, 11, 26
.byte 166, 225, 57, 202, 213, 71, 93, 61, 217, 1, 90, 214, 81, 86, 108, 77
.byte 139, 13, 154, 102, 251, 204, 176, 45, 116, 18, 43, 32, 240, 177, 132, 153
.byte 223, 76, 203, 194, 52, 126, 118, 5, 109, 183, 169, 49, 209, 23, 4, 215
.byte 20, 88, 58, 97, 222, 27, 17, 28, 50, 15, 156, 22, 83, 24, 242, 34
.byte 254, 68, 207, 178, 195, 181, 122, 145, 36, 8, 232, 168, 96, 252, 105, 80
.byte 170, 208, 160, 125, 161, 137, 98, 151, 84, 91, 30, 149, 224, 255, 100, 210
.byte 16, 196, 0, 72, 163, 247, 117, 219, 138, 3, 230, 218, 9, 63, 221, 148
.byte 135, 92, 131, 2, 205, 74, 144, 51, 115, 103, 246, 243, 157, 127, 191, 226
.byte 82, 155, 216, 38, 200, 55, 198, 59, 129, 150, 111, 75, 19, 190, 99, 46
.byte 233, 121, 167, 140, 159, 110, 188, 142, 41, 245, 249, 182, 47, 253, 180, 89
.byte 120, 152, 6, 106, 231, 70, 113, 186, 212, 37, 171, 66, 136, 162, 141, 250
.byte 114, 7, 185, 85, 248, 238, 172, 10, 54, 73, 42, 104, 60, 56, 241, 164
.byte 64, 40, 211, 123, 187, 201, 67, 193, 21, 227, 173, 244, 119, 199, 128, 158
/*
 * camellia_sigma -- six 64-bit constants, indexed as camellia_sigma[0..5].
 * The init code in this file addresses entry n as camellia_sigma + n*8.
 * NOTE(review): the table directly follows camellia_sbox (read with lpm),
 * while the init helpers read it with ld through X; confirm which memory
 * space it is meant to live in.
 */
//.global camellia_sigma
camellia_sigma:
.quad 0xA09E667F3BCC908B
.quad 0xB67AE8584CAA73B2
.quad 0xC6EF372FE94F82BE
.quad 0x54FF53A5F1D36F1C
.quad 0x10E527FADE682D1D
.quad 0xB05688C2B3E6C1FD
/*
 * uint8_t camellia_s1(uint8_t b)
 * Returns camellia_sbox[b], read from program memory.
 * In:  r24 = b
 * Out: r24 = result, r25 = 0
 * Modifies: r30, r31 (Z), SREG. Requires r1 == 0 (NULLr).
 */
.global camellia_s1
camellia_s1:
	ldi r30, lo8(camellia_sbox)
	ldi r31, hi8(camellia_sbox)
	add r30, r24			; Z = camellia_sbox + b
	adc r31, NULLr
	lpm r24, Z
	clr r25
	ret
/*
 * uint8_t camellia_s2(uint8_t b)
 * Returns camellia_sbox[b] rotated left by one bit.
 * In:  r24 = b
 * Out: r24 = result, r25 = 0
 * Modifies: r30, r31 (Z), SREG. Requires r1 == 0 (NULLr).
 */
.global camellia_s2
camellia_s2:
	ldi r30, lo8(camellia_sbox)
	ldi r31, hi8(camellia_sbox)
	add r30, r24			; Z = camellia_sbox + b
	adc r31, NULLr
	lpm r24, Z
	lsl r24				; bit 7 -> carry
	adc r24, NULLr			; carry -> bit 0, completes the rotate
	clr r25
	ret
/*
 * uint8_t camellia_s3(uint8_t b)
 * Returns camellia_sbox[b] rotated right by one bit.
 * In:  r24 = b
 * Out: r24 = result, r25 = 0
 * Modifies: r30, r31 (Z), SREG including the T flag. Requires r1 == 0.
 */
.global camellia_s3
camellia_s3:
	ldi r30, lo8(camellia_sbox)
	ldi r31, hi8(camellia_sbox)
	add r30, r24			; Z = camellia_sbox + b
	adc r31, NULLr
	lpm r24, Z
	bst r24, 0			; T = bit 0
	lsr r24
	bld r24, 7			; old bit 0 -> bit 7, completes the rotate
	clr r25
	ret
/*
 * uint8_t camellia_s4(uint8_t b)
 * Returns camellia_sbox[b rotated left by one bit].
 * In:  r24 = b
 * Out: r24 = result, r25 = 0
 * Modifies: r30, r31 (Z), SREG. Requires r1 == 0 (NULLr).
 */
.global camellia_s4
camellia_s4:
	ldi r30, lo8(camellia_sbox)
	ldi r31, hi8(camellia_sbox)
	lsl r24				; rotate the index left by one:
	adc r24, NULLr			; bit 7 wraps around into bit 0
	add r30, r24			; Z = camellia_sbox + rotated index
	adc r31, NULLr
	lpm r24, Z
	clr r25
	ret
/*
 * uint64_t camellia_s(uint64_t d)
 * Byte-wise substitution of a 64-bit value. C equivalent:
 *
 *   uint64_t camellia_s(uint64_t d){
 *       D[7] = camellia_s1(D[7]); // MSB   (D = (uint8_t*)&d)
 *       D[6] = camellia_s2(D[6]);
 *       D[5] = camellia_s3(D[5]);
 *       D[4] = camellia_s4(D[4]);
 *       D[3] = camellia_s2(D[3]);
 *       D[2] = camellia_s3(D[2]);
 *       D[1] = camellia_s4(D[1]);
 *       D[0] = camellia_s1(D[0]); // LSB
 *       return d;
 *   }
 *
 * In/Out: d in r18-r25 (r18 is LSB)
 * Modifies: r26, r27 (X), r30, r31 (Z), SREG.
 */
.global camellia_s
camellia_s:
	movw r26, r24		; r24/r25 carry argument and result of the
				; s-functions, so park D[6], D[7] in X
	clr r25
	call camellia_s2
	mov r26, r24		; D[6]
	mov r24, r27
	call camellia_s1
	mov r27, r24		; D[7]
	mov r24, r23
	call camellia_s3
	mov r23, r24		; D[5]
	mov r24, r22
	call camellia_s4
	mov r22, r24		; D[4]
	mov r24, r21
	call camellia_s2
	mov r21, r24		; D[3]
	mov r24, r20
	call camellia_s3
	mov r20, r24		; D[2]
	mov r24, r19
	call camellia_s4
	mov r19, r24		; D[1]
	mov r24, r18
	call camellia_s1
	mov r18, r24		; D[0]
	movw r24, r26		; put D[6], D[7] back
	ret
/*
 * uint64_t camellia_p(uint64_t d)
 * Byte-level XOR diffusion of a 64-bit value: sixteen in-place XORs
 * between the eight bytes, followed by an exchange of the upper four
 * bytes (z1..z4) with the lower four bytes (z5..z8).
 * In/Out: d in r18-r25 (r18 is LSB); z1 is the most significant byte.
 * Modifies: r26, r27, r30, r31, SREG.
 */
z1 = 25
z2 = 24
z3 = 23
z4 = 22
z5 = 21
z6 = 20
z7 = 19
z8 = 18
.global camellia_p
camellia_p:
	eor z1, z6
	eor z2, z7
	eor z3, z8
	eor z4, z5
	eor z5, z3
	eor z6, z4
	eor z7, z1
	eor z8, z2
	eor z1, z8
	eor z2, z5
	eor z3, z6
	eor z4, z7
	eor z5, z4
	eor z6, z1
	eor z7, z2
	eor z8, z3
	; exchange (z1..z4) with (z5..z8), two bytes per movw
	movw r26, z8		; save z8, z7
	movw r30, z6		; save z6, z5
	movw z8, z4
	movw z6, z2
	movw z4, r26
	movw z2, r30
	ret
/*
 * uint64_t camellia_f(uint64_t x, uint64_t k)
 * Computes camellia_p(camellia_s(x xor k)).
 * In:  x in r18-r25, k in r10-r17 (k is only read)
 * Out: result in r18-r25
 * Modifies: r26, r27, r30, r31, SREG (through camellia_s and camellia_p).
 */
.global camellia_f
camellia_f:
	eor r18, r10
	eor r19, r11
	eor r20, r12
	eor r21, r13
	eor r22, r14
	eor r23, r15
	eor r24, r16
	eor r25, r17
	call camellia_s
	call camellia_p
	ret
/*
 * uint64_t camellia_fl(uint64_t x, uint64_t k)
 * With x = xl:xr and k = kl:kr split into 32-bit halves:
 *   xr ^= rotate_left_1(xl & kl)
 *   xl ^= (xr | kr)
 * In:  x in r18-r25 (xl: r22-r25, xr: r18-r21)
 *      k in r10-r17 (kl: r14-r17, kr: r10-r13)
 * Out: result in r18-r25
 * Modifies: r10-r17 (k is used as work space and destroyed), r26, SREG.
 */
kl1 = 14
kl2 = 15
kl3 = 16
kl4 = 17
kr1 = 10
kr2 = 11
kr3 = 12
kr4 = 13
xr1 = 18
xr2 = 19
xr3 = 20
xr4 = 21
xl1 = 22
xl2 = 23
xl3 = 24
xl4 = 25
.global camellia_fl
camellia_fl:
	and kl1, xl1
	and kl2, xl2
	and kl3, xl3
	and kl4, xl4
	mov r26, kl4
	rol r26			; carry = top bit, wraps around into bit 0
	rol kl1
	rol kl2
	rol kl3
	rol kl4
	eor xr1, kl1
	eor xr2, kl2
	eor xr3, kl3
	eor xr4, kl4
	// that was part one
	or kr1, xr1
	or kr2, xr2
	or kr3, xr3
	or kr4, xr4
	eor xl1, kr1
	eor xl2, kr2
	eor xl3, kr3
	eor xl4, kr4
	ret
/*
 * uint64_t camellia_fl_inv(uint64_t y, uint64_t k)
 * Same two steps as camellia_fl, applied in the opposite order.
 * With y = yl:yr and k = kl:kr split into 32-bit halves:
 *   yl ^= (yr | kr)
 *   yr ^= rotate_left_1(yl & kl)
 * In:  y in r18-r25 (yl: r22-r25, yr: r18-r21)
 *      k in r10-r17 (kl: r14-r17, kr: r10-r13)
 * Out: result in r18-r25
 * Modifies: r10-r17 (k is used as work space and destroyed), r26, SREG.
 */
kl1 = 14
kl2 = 15
kl3 = 16
kl4 = 17
kr1 = 10
kr2 = 11
kr3 = 12
kr4 = 13
yr1 = 18
yr2 = 19
yr3 = 20
yr4 = 21
yl1 = 22
yl2 = 23
yl3 = 24
yl4 = 25
.global camellia_fl_inv
camellia_fl_inv:
	or kr1, yr1
	or kr2, yr2
	or kr3, yr3
	or kr4, yr4
	eor yl1, kr1
	eor yl2, kr2
	eor yl3, kr3
	eor yl4, kr4
	// the first one is done
	and kl1, yl1
	and kl2, yl2
	and kl3, yl3
	and kl4, yl4
	mov r26, kl4
	rol r26			; carry = top bit, wraps around into bit 0
	rol kl1
	rol kl2
	rol kl3
	rol kl4
	eor yr1, kl1
	eor yr2, kl2
	eor yr3, kl3
	eor yr4, kl4
	ret
/*
 * camellia128_keyop_rot15 -- rotate two 128-bit values left by 15 bits.
 * The two values are the 16-byte blocks at s and s+16, each stored least
 * significant byte first. Every byte is moved two positions up (16 bits
 * left) while the whole value is shifted right by one bit.
 * In:  r24:r25 = s
 *      r22 is named as parameter q by the dispatcher but is not read here;
 *      it is overwritten with the block counter.
 * Modifies: r18-r22, r30, r31 (Z), SREG.
 */
B1 = 18
B2 = 19
.global camellia128_keyop_rot15
camellia128_keyop_rot15:
	movw r30, r24		; Z points at LSB of kl ;-- 0
	ldi r22, 2		; two 16-byte blocks
2:	adiw r30, 15		;-- 15
	ld r21, Z
	ld r20, -Z		;-- 14
	movw B1, r20		; store Backup of the 2 MSB of kl
	ror r20			; carry = bit 0 of byte 14, feeds the new top byte
	ldi r21, 14
1:	ld r20, -Z		;-- 13..0
	ror r20
	std Z+2, r20		;-- (15..2)
	dec r21			; dec leaves the carry untouched
	brne 1b
	ror B2
	ror B1
	st Z+, B1		;-- 1
	st Z, B2
	adiw r30, 15		;-- 16
	dec r22
	brne 2b
	ret
/*
 * camellia128_keyop_rot17 -- rotate two 128-bit values left by 17 bits.
 * The two values are the 16-byte blocks at s and s+16, each stored least
 * significant byte first. Each block is rotated left by one bit while it
 * is copied into r8..r23 (written through X, which addresses the register
 * file), and is then stored back two byte positions higher.
 * NOTE(review): writing registers through X assumes the register file is
 * visible at data addresses 0..31; confirm for the target device.
 * The T flag tells the first pass (T = 0) from the second (T = 1).
 * In:  r24:r25 = s
 * Saved and restored: r8-r17.
 * Modifies: r0, r18-r23, r26, r27, r30, r31, SREG (T ends up set).
 *           r1 is used as loop counter and is 0 again on return.
 */
.global camellia128_keyop_rot17
camellia128_keyop_rot17:
	push r8
	push r9
	push r10
	push r11
	push r12
	push r13
	push r14
	push r15
	push r16
	push r17
	clt			; first pass
	movw r30, r24
	clr r27
2:	ldi r26, 8		; X = 0x0008, i.e. register r8
	mov r1, r26
	lsl r1 ; r1=16
	; load 128bit value
	ldd r0, Z+15
	rol r0			; carry = top bit, wraps around into bit 0
1:	ld r0, Z+
	rol r0
	st X+, r0
	dec r1
	brne 1b
	; store back shifted up by two bytes, from byte 15 down to byte 0
	st -Z, 21
	st -Z, 20
	st -Z, 19
	st -Z, 18
	st -Z, 17
	st -Z, 16
	st -Z, 15
	st -Z, 14 ;--
	st -Z, 13
	st -Z, 12
	st -Z, 11
	st -Z, 10
	st -Z, 9
	st -Z, 8
	st -Z, 23
	st -Z, 22
	brts 3f			; second pass finished
	set
	adiw r30, 16		; continue with the block at s+16
	rjmp 2b
3:	pop r17
	pop r16
	pop r15
	pop r14
	pop r13
	pop r12
	pop r11
	pop r10
	pop r9
	pop r8
	ret
/*
 * camellia128_keyop -- dispatch to one of the two left rotations.
 * In:  r24:r25 = s
 *      r22 = q: 1 selects camellia128_keyop_rot17, any other value
 *               selects camellia128_keyop_rot15
 * Both targets are reached by a jump, so they return to the caller of
 * this function.
 */
.global camellia128_keyop
camellia128_keyop:
	cpi r22, 1
	breq camellia128_keyop_rot17
	rjmp camellia128_keyop_rot15
/*
 * camellia128_keyop_inv_rot15 -- rotate two 128-bit values right by 15 bits.
 * The two values are the 16-byte blocks at s and s+16, each stored least
 * significant byte first. Every byte is moved two positions down (16 bits
 * right) while the whole value is shifted left by one bit.
 * In:  r24:r25 = s
 *      r22 is named as parameter q by the dispatcher but is not read here;
 *      it is overwritten with the block counter.
 * Modifies: r18-r22, r26, r27 (X), r30, r31 (Z), SREG.
 */
B1 = 18
B2 = 19
.global camellia128_keyop_inv_rot15
camellia128_keyop_inv_rot15:
	movw r30, r24		; Z points at LSB of kl ;-- 0
	movw r26, r24		; X also
	ldi r22, 2		; two 16-byte blocks
2:				;-- 0
	ld r20, Z+		;-- 0/1
	ld r21, Z+		;-- 1/2
	movw B1, r20		; store Backup of the 2 LSB of kl
	rol r21			; carry = bit 7 of byte 1, feeds the new byte 0
	ldi r20, 14
1:	ld r21, Z+		;-- 2/14..3/16
	rol r21
	st X+, r21		;-- (0..13)/(1..14)
	dec r20			; dec leaves the carry untouched
	brne 1b
	rol B1
	rol B2
	st X+, B1		;-- 14/15
	st X+, B2		;-- 15/16
	dec r22
	brne 2b
	ret
/*
 * camellia128_keyop_inv_rot17 -- rotate two 128-bit values right by 17 bits.
 * The two values are the 16-byte blocks at s and s+16, each stored least
 * significant byte first. Each block is rotated right by one bit while it
 * is copied, highest byte first, into r8..r23 (written through X, which
 * addresses the register file), and is then stored back two byte positions
 * lower.
 * NOTE(review): writing registers through X assumes the register file is
 * visible at data addresses 0..31; confirm for the target device.
 * The T flag tells the first pass (T = 0) from the second (T = 1).
 * In:  r24:r25 = s
 * Saved and restored: r8-r17.
 * Modifies: r0, r18-r23, r26, r27, r30, r31, SREG (T ends up set).
 *           r1 is used as loop counter and is 0 again on return.
 */
.global camellia128_keyop_inv_rot17
camellia128_keyop_inv_rot17:
	push r8
	push r9
	push r10
	push r11
	push r12
	push r13
	push r14
	push r15
	push r16
	push r17
	clt			; first pass
	movw r30, r24
	clr r27
2:	ldi r26, 8		; X = 0x0008, i.e. register r8
	mov r1, r26
	lsl r1 ; r1=16
	; load 128bit value
	ld r0, Z
	adiw r30, 16
	ror r0			; carry = bit 0, wraps around into the top bit
1:	ld r0, -Z
	ror r0
	st X+, r0
	dec r1
	brne 1b
	; store back shifted down by two bytes, from byte 0 up to byte 15
	st Z+, 21
	st Z+, 20
	st Z+, 19
	st Z+, 18
	st Z+, 17
	st Z+, 16
	st Z+, 15
	st Z+, 14 ;--
	st Z+, 13
	st Z+, 12
	st Z+, 11
	st Z+, 10
	st Z+, 9
	st Z+, 8
	st Z+, 23
	st Z+, 22
	brts 3f			; second pass finished
	set
	; Z already points at s+16 after the sixteen post-increment stores
	rjmp 2b
3:	pop r17
	pop r16
	pop r15
	pop r14
	pop r13
	pop r12
	pop r11
	pop r10
	pop r9
	pop r8
	ret
/*
 * camellia128_keyop_inv -- dispatch to one of the two right rotations.
 * In:  r24:r25 = s
 *      r22 = q: 1 selects camellia128_keyop_inv_rot17, any other value
 *               selects camellia128_keyop_inv_rot15
 * Both targets are reached by a jump, so they return to the caller of
 * this function.
 */
.global camellia128_keyop_inv
camellia128_keyop_inv:
	cpi r22, 1
	breq camellia128_keyop_inv_rot17
	rjmp camellia128_keyop_inv_rot15
/*
 * change_endian -- reverse the byte order of a word in memory, in place.
 * In:  r24:r25 = p, pointer to the data
 *      r22     = l, length of the word in bytes
 * The first and last bytes are exchanged, then the pointers move towards
 * each other; l/2 exchanges are done, so the middle byte of an odd length
 * stays where it is. For l < 2 nothing is done (without this guard the
 * 8-bit counter would wrap and 256 exchanges would be performed).
 * Modifies: r20, r21, r22, r26, r27 (X), r30, r31 (Z), SREG.
 * Requires r1 == 0.
 */
.global change_endian
change_endian:
	movw r26, r24		; X = first byte
	movw r30, r24
	add r30, r22
	adc r31, r1		; Z = one past the last byte
	lsr r22			; number of exchanges = l / 2
	breq 2f			; nothing to exchange
1:	ld r20, X
	ld r21, -Z
	st X+, r21
	st Z, r20
	dec r22
	brne 1b
2:	ret
/* key selectors */
#define SEL_KL 0
#define SEL_KA 1

/* roundop flag bits of camellia_6rounds */
#define KEY_POSTC1   0x00
#define KEY_POSTC2   0x01
#define KEY_INC2     0x02

#define KEY_DIR      0x04
#define KEY_DIR_NORM 0x00
#define KEY_DIR_INV  KEY_DIR

#define KEY_AMMOUNT  0x08
#define KEY_ROL15    0x00
#define KEY_ROL17    KEY_AMMOUNT
/*
 C reference of camellia_6rounds:

void camellia_6rounds(camellia128_ctx_t* s, uint64_t* bl, uint64_t* br, uint8_t roundop, uint8_t keychoice){
	uint8_t i;
	uint64_t* k[4];
	k[0] = &(s->kll);
	k[1] = &(s->klr);
	k[2] = &(s->kal);
	k[3] = &(s->kar);
	for(i=0; i<3; ++i){ / * each cycle * /
		br[0] ^= camellia_f(bl[0],*(k[(keychoice&1)*2+((roundop&KEY_DIR)?1:0)]));
		keychoice >>= 1;
		if((i == 1) && (roundop&KEY_INC2)){
			((roundop&KEY_DIR)?camellia128_keyop_inv:camellia128_keyop)(s,(roundop&KEY_AMMOUNT)?1:-1);
		}
		bl[0] ^= camellia_f(br[0],*(k[(keychoice&1)*2+((roundop&KEY_DIR)?0:1)]));
		keychoice >>= 1;
		/ * check if we should do some keyop * /
		if((i == (roundop&1)) && (!(roundop&KEY_INC2)) ){
			((roundop&KEY_DIR)?camellia128_keyop_inv:camellia128_keyop)(s,(roundop&KEY_AMMOUNT)?1:-1);
			/ * selecting the function through a conditional expression is a nice thing C lets us do * /
		}
	}
}
*/
/*
 * camellia_6rounds -- six rounds with one optional key rotation.
 *
 * In:  r24:r25 = s          base pointer of the key material; also passed
 *                           on to camellia128_keyop / camellia128_keyop_inv
 *      r22:r23 = bl         pointer to a 64-bit block half
 *      r20:r21 = br         pointer to the other 64-bit block half
 *      r18     = roundop    bit 0: KEY_POSTC2, bit 1: KEY_INC2,
 *                           bit 2: KEY_DIR,    bit 3: KEY_ROL17
 *      r16     = keychoice  one bit is consumed per round, LSB first;
 *                           a set bit adds 16 to the key pointer
 *
 * Per round (loop_cnt counts 6 down to 1):
 *   - X = s (+16 if the keychoice bit is set); 8 more are added when
 *     KEY_DIR is clear and loop_cnt is even, or KEY_DIR is set and
 *     loop_cnt is odd
 *   - the 8 key bytes at X are loaded into r18-r25 and XORed with the
 *     block addressed by the current bl pointer
 *   - camellia_s and camellia_p are applied
 *   - the result is XORed into the block addressed by the current br
 *     pointer
 *   - if loop_cnt == keyop_time one key operation is called with
 *     r22 = 1 (KEY_ROL17 set) or r22 = 0xFF (clear)
 *   - the bl and br pointers are exchanged
 * keyop_time is 4 with KEY_INC2; otherwise 3 if roundop bit 0 is set and
 * 5 if it is clear.
 *
 * The result bytes r18..r25 are read through Z = 0x0012.
 * NOTE(review): this assumes the register file is visible at data
 * addresses 0..31; confirm for the target device.
 *
 * Saved and restored: r7-r17.
 * Modifies: r0, r18-r27, r30, r31, SREG. r1 is used as a counter and is
 * 0 again after each round.
 */
; param s: r24-r25
; param bl: r22-r23
; param br: r20-r21
; param roundop: r18
; param keychoice: r16
s1 = 24
s2 = 25
bl1 = 22
bl2 = 23
br1 = 20
br2 = 21
xro = 18
kc = 16
xro_sec = 17
br1_sec = 10
br2_sec = 11
bl1_sec = 12
bl2_sec = 13
s1_sec = 14
t = 9
loop_cnt = 8
keyop_time = 7
.global camellia_6rounds
camellia_6rounds:
	push r17
	push r16
	push r15
	push r14
	push r13
	push r12
	push r11
	push r10
	push r9
	push r8
	push r7
	ldi r17, 6
	mov loop_cnt, r17
	mov xro_sec, xro	; keep the arguments in registers that the
	movw br1_sec, br1	; round body does not overwrite
	movw bl1_sec, bl1
	movw s1_sec, s1
	clr keyop_time
	inc keyop_time
	sec			; rol shifts the carry in: (1 << 1) | 1
	rol keyop_time // keyop_time == 3
	SBRC xro, 1 // KEY_INC2
	rjmp 1f
	SBRS xro, 0 // KEY_POSTC1
	inc keyop_time
	SBRS xro, 0 // KEY_POSTC1
	inc keyop_time
	rjmp main_loop
1:	inc keyop_time
main_loop:
	/* now we load the key to r18-r25 */
	movw r26, s1_sec
	SBRC kc, 0 /* select between KA and KL */
	adiw r26, 16
	SBRC xro_sec, 2 // KEY_DIR
	rjmp 2f
	SBRS loop_cnt, 0 /* enc */
	adiw r26, 8
	rjmp 3f
2:	SBRC loop_cnt, 0 /* dec */
	adiw r26, 8
3:	lsr kc			; next keychoice bit for the next round
	ld r18, X+
	ld r19, X+
	ld r20, X+
	ld r21, X+
	ld r22, X+
	ld r23, X+
	ld r24, X+
	ld r25, X+
	/* now we xor bl in */
	movw r26, bl1_sec
	ld r0, X+
	eor r18, r0
	ld r0, X+
	eor r19, r0
	ld r0, X+
	eor r20, r0
	ld r0, X+
	eor r21, r0
	ld r0, X+
	eor r22, r0
	ld r0, X+
	eor r23, r0
	ld r0, X+
	eor r24, r0
	ld r0, X+
	eor r25, r0
	/* f(x,k) = p(s(x xor k)) ; xor is done */
	call camellia_s;
	call camellia_p;
	// in r26, SPL
	// in r27, SPH
	// sbiw r26, 9
	// dbg_hexdump 10
	/* now we have to xor the result into br */
	clr r31
	ldi r30, 18		; Z = 0x0012, i.e. register r18
	movw r26, br1_sec
	; ldi cannot address r1, so build the count 8 from the carry
	clr r1
	sec
	ror r1			; r1 = 0x80
	swap r1			; r1 = 0x08
1:	ld r0, X
	ld t, Z+
	eor r0, t
	st X+, r0
	dec r1
	brne 1b
	/* check for keyop */
	cp loop_cnt, keyop_time
	brne 3f
	movw s1, s1_sec
	ldi r22, 1
	SBRS xro_sec, 3 // KEY_ROL17
	neg r22
	SBRS xro_sec, 2 // KEY_DIR
	rjmp 2f
	call camellia128_keyop_inv
	rjmp 3f
2:	call camellia128_keyop
3:	/* loop back */
	SWAP_R br1_sec, bl1_sec
	SWAP_R br2_sec, bl2_sec
	dec loop_cnt
	breq 2f
	jmp main_loop
2:	pop r7
	pop r8
	pop r9
	pop r10
	pop r11
	pop r12
	pop r13
	pop r14
	pop r15
	pop r16
	pop r17
	ret
/*
 C reference of camellia128_init:

void camellia128_init(camellia128_ctx_t* s, uint8_t* key){
	uint8_t i;
	s->kll = 0; //((uint64_t*)key)[0];
	/ * load the key, endian-adjusted, to kll,klr * /
	for(i=0; i<8; ++i){
		s->kll <<= 8;
		s->kll |= *key++;
	}
	for(i=0; i<8; ++i){
		s->klr <<= 8;
		s->klr |= *key++;
	}
	s->kal = s->kll;
	s->kar = s->klr;
	s->kar ^= camellia_f(s->kal, camellia_sigma[0]);
	s->kal ^= camellia_f(s->kar, camellia_sigma[1]);
	s->kal ^= s->kll;
	s->kar ^= s->klr;
	s->kar ^= camellia_f(s->kal, camellia_sigma[2]);
	s->kal ^= camellia_f(s->kar, camellia_sigma[3]);
	/ * * /
//	uart_putstr("\n\r----------------init finished--------------------");
}
*/
/*
 * X64_xor_in -- XOR the 8 bytes at X into r18-r25.
 * In:  X (r26:r27) = pointer to 8 bytes, r18-r25 = 64-bit value
 * Out: r18-r25 = value xor bytes at X (byte 0 goes into r18),
 *      X advanced by 8
 * Modifies: r0, SREG.
 */
X64_xor_in:
	ld r0, X+
	eor r18, r0
	ld r0, X+
	eor r19, r0
	ld r0, X+
	eor r20, r0
	ld r0, X+
	eor r21, r0
	ld r0, X+
	eor r22, r0
	ld r0, X+
	eor r23, r0
	ld r0, X+
	eor r24, r0
	ld r0, X+
	eor r25, r0
	ret
/*
 * X64_load -- load the 8 bytes at X into r18-r25.
 * In:  X (r26:r27) = pointer to 8 bytes
 * Out: r18-r25 = the 8 bytes (byte 0 in r18), X advanced by 8
 */
X64_load:
	ld r18, X+
	ld r19, X+
	ld r20, X+
	ld r21, X+
	ld r22, X+
	ld r23, X+
	ld r24, X+
	ld r25, X+
	ret
/*
 * Y64_load_xor_store -- XOR r18-r25 with the 8 bytes at Y and write the
 * result to both places.
 * In:  Y (r28:r29) = pointer to 8 bytes, r18-r25 = 64-bit value
 * Out: r18-r25 and the 8 bytes at the old Y both hold value xor memory
 *      (byte 0 pairs with r18), Y advanced by 8
 * Modifies: r0, SREG.
 */
Y64_load_xor_store:
	ld r0, Y
	eor r18, r0
	st Y+, r18
	ld r0, Y
	eor r19, r0
	st Y+, r19
	ld r0, Y
	eor r20, r0
	st Y+, r20
	ld r0, Y
	eor r21, r0
	st Y+, r21
	ld r0, Y
	eor r22, r0
	st Y+, r22
	ld r0, Y
	eor r23, r0
	st Y+, r23
	ld r0, Y
	eor r24, r0
	st Y+, r24
	ld r0, Y
	eor r25, r0
	st Y+, r25
	ret
/*
 * Assembly draft of camellia128_init(s, key). The .global directive below
 * is commented out, so this code is not exported.
 * NOTE(review): no entry label and no ret are visible for this routine in
 * this part of the file; it may continue beyond what is shown here.
 * NOTE(review): r16 and r17 are modified below without being saved.
 * NOTE(review): camellia_s and camellia_p, as defined in this file,
 * overwrite r26/r27 (X) and r30/r31 (Z); the uses of X and Z after those
 * calls (sbiw r30, X64_xor_in in step 2) therefore do not see the values
 * set up before the calls. Confirm before enabling this routine.
 * NOTE(review): X64_xor_in reads camellia_sigma with ld through X, while
 * the neighbouring camellia_sbox table is read with lpm; confirm the
 * memory space of camellia_sigma.
 */
; param s: r24-r25
; param *k: r22-r23
//.global camellia128_init
push r29
push r28
movw r30, r24 ; Z is statepointer
movw r26, r22 ; X is keypointer
// Y = 0x0012: stores through Y land in r18.. if the register file is
// visible in the data address space (assumption, confirm for the target)
clr r29
ldi r28, 18
// / * load key into kl, ka and kal to r18:r25 * /
// the 16 key bytes are read from the last to the first and written in
// that reversed order to s[0..15] and, as a copy, to s[16..31]
adiw r26, 128/8 ;-- 16
ldi r16, (128/8)-1
1: ld r17, -X
std Z+(128/8), r17
st Z+, r17
// sbrs skips the store to Y while bit 3 of the counter is set (15..8)
sbrs r16, 3
st Y+, r17 ; this should only be done the last 8 rounds 0<=r16<=7
dec r16
brpl 1b
// / * step 1 * /
ldi r26, lo8(camellia_sigma)
ldi r27, hi8(camellia_sigma)
call X64_xor_in
call camellia_s
call camellia_p // / * f(x,k) is done * /
sbiw r30, 128/8
movw r28, r30 ; Z&Y point on kar now
call Y64_load_xor_store
// / * step 2 now * /
call X64_xor_in
call camellia_s
call camellia_p // / * f(x,k) is done * /
call Y64_load_xor_store
// / * now the xor part (kl and kr) * /
sbiw r30, 128/8 ; Z points to klr
ldi r16, 128/8
// for 16 bytes: the byte 16 positions above the one just read is XORed
// with it (Z has already been incremented, hence the -1 in the offset)
1: ld r0, Z+
ldd r1, Z+(128/8)-1
eor r0, r1
std Z+(128/8)-1, r0
dec r16
brne 1b
// NOTE(review): r1 holds the last loaded byte here and is not cleared
// again, although other code in this file relies on r1 == 0 (NULLr)
// / * now s->kar ^= camellia_f(s->kal, camellia_sigma[2]); * /
call X64_load ; load sigma[2]
movw r26, r28 ; X&Y point at kal
call X64_xor_in
call camellia_s
call camellia_p
sbiw r28, 128/8/2 ; Y points at kar
call Y64_load_xor_store
// / * now s->kal ^= camellia_f(s->kar, camellia_sigma[3]); * /
sbiw r26, 128/8 ;
call X64_load ; load kar
ldi r26, lo8(camellia_sigma+3*8)
ldi r27, hi8(camellia_sigma+3*8)
call X64_xor_in ; xor sigma[3] in
call camellia_s
call camellia_p
call Y64_load_xor_store
pop r28
pop r29