diff options
Diffstat (limited to 'arch/x86/crypto/cast5-avx-x86_64-asm_64.S')
-rw-r--r-- | arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 348 |
1 files changed, 265 insertions, 83 deletions
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S index a41a3aaba220..15b00ac7cbd3 100644 --- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S @@ -25,10 +25,10 @@ .file "cast5-avx-x86_64-asm_64.S" -.extern cast5_s1 -.extern cast5_s2 -.extern cast5_s3 -.extern cast5_s4 +.extern cast_s1 +.extern cast_s2 +.extern cast_s3 +.extern cast_s4 /* structure of crypto context */ #define km 0 @@ -36,10 +36,10 @@ #define rr ((16*4)+16) /* s-boxes */ -#define s1 cast5_s1 -#define s2 cast5_s2 -#define s3 cast5_s3 -#define s4 cast5_s4 +#define s1 cast_s1 +#define s2 cast_s2 +#define s3 cast_s3 +#define s4 cast_s4 /********************************************************************** 16-way AVX cast5 @@ -180,31 +180,17 @@ vpunpcklqdq t1, t0, x0; \ vpunpckhqdq t1, t0, x1; -#define inpack_blocks(in, x0, x1, t0, t1, rmask) \ - vmovdqu (0*4*4)(in), x0; \ - vmovdqu (1*4*4)(in), x1; \ +#define inpack_blocks(x0, x1, t0, t1, rmask) \ vpshufb rmask, x0, x0; \ vpshufb rmask, x1, x1; \ \ transpose_2x4(x0, x1, t0, t1) -#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \ +#define outunpack_blocks(x0, x1, t0, t1, rmask) \ transpose_2x4(x0, x1, t0, t1) \ \ vpshufb rmask, x0, x0; \ - vpshufb rmask, x1, x1; \ - vmovdqu x0, (0*4*4)(out); \ - vmovdqu x1, (1*4*4)(out); - -#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \ - transpose_2x4(x0, x1, t0, t1) \ - \ - vpshufb rmask, x0, x0; \ - vpshufb rmask, x1, x1; \ - vpxor (0*4*4)(out), x0, x0; \ - vmovdqu x0, (0*4*4)(out); \ - vpxor (1*4*4)(out), x1, x1; \ - vmovdqu x1, (1*4*4)(out); + vpshufb rmask, x1, x1; .data @@ -213,6 +199,8 @@ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +.Lbswap_iv_mask: + .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 .L16_mask: .byte 16, 16, 16, 16 .L32_mask: @@ -223,35 +211,42 @@ .text .align 16 -.global __cast5_enc_blk_16way -.type __cast5_enc_blk_16way,@function; +.type __cast5_enc_blk16,@function; -__cast5_enc_blk_16way: +__cast5_enc_blk16: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: bool, if true: xor output + * RL1: blocks 1 and 2 + * RR1: blocks 3 and 4 + * RL2: blocks 5 and 6 + * RR2: blocks 7 and 8 + * RL3: blocks 9 and 10 + * RR3: blocks 11 and 12 + * RL4: blocks 13 and 14 + * RR4: blocks 15 and 16 + * output: + * RL1: encrypted blocks 1 and 2 + * RR1: encrypted blocks 3 and 4 + * RL2: encrypted blocks 5 and 6 + * RR2: encrypted blocks 7 and 8 + * RL3: encrypted blocks 9 and 10 + * RR3: encrypted blocks 11 and 12 + * RL4: encrypted blocks 13 and 14 + * RR4: encrypted blocks 15 and 16 */ pushq %rbp; pushq %rbx; - pushq %rcx; vmovdqa .Lbswap_mask, RKM; vmovd .Lfirst_mask, R1ST; vmovd .L32_mask, R32; enc_preload_rkr(); - leaq 1*(2*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); - inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); - - movq %rsi, %r11; + inpack_blocks(RL1, RR1, RTMP, RX, RKM); + inpack_blocks(RL2, RR2, RTMP, RX, RKM); + inpack_blocks(RL3, RR3, RTMP, RX, RKM); + inpack_blocks(RL4, RR4, RTMP, RX, RKM); round(RL, RR, 0, 1); round(RR, RL, 1, 2); @@ -276,44 +271,41 @@ __cast5_enc_blk_16way: round(RR, RL, 15, 1); __skip_enc: - popq %rcx; popq %rbx; popq %rbp; vmovdqa .Lbswap_mask, RKM; - leaq 1*(2*4*4)(%r11), %rax; - testb %cl, %cl; - jnz __enc_xor16; - - outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); - outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%r11), %rax; - outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%r11), %rax; - outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); - - ret; - -__enc_xor16: - outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM); - outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%r11), %rax; - outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%r11), %rax; - outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM); + outunpack_blocks(RR1, RL1, RTMP, RX, RKM); + outunpack_blocks(RR2, RL2, RTMP, RX, RKM); + outunpack_blocks(RR3, RL3, RTMP, RX, RKM); + outunpack_blocks(RR4, RL4, RTMP, RX, RKM); ret; .align 16 -.global cast5_dec_blk_16way -.type cast5_dec_blk_16way,@function; +.type __cast5_dec_blk16,@function; -cast5_dec_blk_16way: +__cast5_dec_blk16: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src + * RL1: encrypted blocks 1 and 2 + * RR1: encrypted blocks 3 and 4 + * RL2: encrypted blocks 5 and 6 + * RR2: encrypted blocks 7 and 8 + * RL3: encrypted blocks 9 and 10 + * RR3: encrypted blocks 11 and 12 + * RL4: encrypted blocks 13 and 14 + * RR4: encrypted blocks 15 and 16 + * output: + * RL1: decrypted blocks 1 and 2 + * RR1: decrypted blocks 3 and 4 + * RL2: decrypted blocks 5 and 6 + * RR2: decrypted blocks 7 and 8 + * RL3: decrypted blocks 9 and 10 + * RR3: decrypted blocks 11 and 12 + * RL4: decrypted blocks 13 and 14 + * RR4: decrypted blocks 15 and 16 */ pushq %rbp; @@ -324,15 +316,10 @@ cast5_dec_blk_16way: vmovd .L32_mask, R32; dec_preload_rkr(); - leaq 1*(2*4*4)(%rdx), %rax; - inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); - inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%rdx), %rax; - inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); - - movq %rsi, %r11; + inpack_blocks(RL1, RR1, RTMP, RX, RKM); + inpack_blocks(RL2, RR2, RTMP, RX, RKM); + inpack_blocks(RL3, RR3, RTMP, RX, RKM); + inpack_blocks(RL4, RR4, RTMP, RX, RKM); movzbl rr(CTX), %eax; testl %eax, %eax; @@ -361,16 +348,211 @@ __dec_tail: popq %rbx; popq %rbp; - leaq 1*(2*4*4)(%r11), %rax; - outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); - outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); - leaq 2*(2*4*4)(%r11), %rax; - outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); - leaq 3*(2*4*4)(%r11), %rax; - outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); + outunpack_blocks(RR1, RL1, RTMP, RX, RKM); + outunpack_blocks(RR2, RL2, RTMP, RX, RKM); + outunpack_blocks(RR3, RL3, RTMP, RX, RKM); + outunpack_blocks(RR4, RL4, RTMP, RX, RKM); ret; __skip_dec: vpsrldq $4, RKR, RKR; jmp __dec_tail; + +.align 16 +.global cast5_ecb_enc_16way +.type cast5_ecb_enc_16way,@function; + +cast5_ecb_enc_16way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r11; + + vmovdqu (0*4*4)(%rdx), RL1; + vmovdqu (1*4*4)(%rdx), RR1; + vmovdqu (2*4*4)(%rdx), RL2; + vmovdqu (3*4*4)(%rdx), RR2; + vmovdqu (4*4*4)(%rdx), RL3; + vmovdqu (5*4*4)(%rdx), RR3; + vmovdqu (6*4*4)(%rdx), RL4; + vmovdqu (7*4*4)(%rdx), RR4; + + call __cast5_enc_blk16; + + vmovdqu RR1, (0*4*4)(%r11); + vmovdqu RL1, (1*4*4)(%r11); + vmovdqu RR2, (2*4*4)(%r11); + vmovdqu RL2, (3*4*4)(%r11); + vmovdqu RR3, (4*4*4)(%r11); + vmovdqu RL3, (5*4*4)(%r11); + vmovdqu RR4, (6*4*4)(%r11); + vmovdqu RL4, (7*4*4)(%r11); + + ret; + +.align 16 +.global cast5_ecb_dec_16way +.type cast5_ecb_dec_16way,@function; + +cast5_ecb_dec_16way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + movq %rsi, %r11; + + vmovdqu (0*4*4)(%rdx), RL1; + vmovdqu (1*4*4)(%rdx), RR1; + vmovdqu (2*4*4)(%rdx), RL2; + vmovdqu (3*4*4)(%rdx), RR2; + vmovdqu (4*4*4)(%rdx), RL3; + vmovdqu (5*4*4)(%rdx), RR3; + vmovdqu (6*4*4)(%rdx), RL4; + vmovdqu (7*4*4)(%rdx), RR4; + + call __cast5_dec_blk16; + + vmovdqu RR1, (0*4*4)(%r11); + vmovdqu RL1, (1*4*4)(%r11); + vmovdqu RR2, (2*4*4)(%r11); + vmovdqu RL2, (3*4*4)(%r11); + vmovdqu RR3, (4*4*4)(%r11); + vmovdqu RL3, (5*4*4)(%r11); + vmovdqu RR4, (6*4*4)(%r11); + vmovdqu RL4, (7*4*4)(%r11); + + ret; + +.align 16 +.global cast5_cbc_dec_16way +.type cast5_cbc_dec_16way,@function; + +cast5_cbc_dec_16way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pushq %r12; + + movq %rsi, %r11; + movq %rdx, %r12; + + vmovdqu (0*16)(%rdx), RL1; + vmovdqu (1*16)(%rdx), RR1; + vmovdqu (2*16)(%rdx), RL2; + vmovdqu (3*16)(%rdx), RR2; + vmovdqu (4*16)(%rdx), RL3; + vmovdqu (5*16)(%rdx), RR3; + vmovdqu (6*16)(%rdx), RL4; + vmovdqu (7*16)(%rdx), RR4; + + call __cast5_dec_blk16; + + /* xor with src */ + vmovq (%r12), RX; + vpshufd $0x4f, RX, RX; + vpxor RX, RR1, RR1; + vpxor 0*16+8(%r12), RL1, RL1; + vpxor 1*16+8(%r12), RR2, RR2; + vpxor 2*16+8(%r12), RL2, RL2; + vpxor 3*16+8(%r12), RR3, RR3; + vpxor 4*16+8(%r12), RL3, RL3; + vpxor 5*16+8(%r12), RR4, RR4; + vpxor 6*16+8(%r12), RL4, RL4; + + vmovdqu RR1, (0*16)(%r11); + vmovdqu RL1, (1*16)(%r11); + vmovdqu RR2, (2*16)(%r11); + vmovdqu RL2, (3*16)(%r11); + vmovdqu RR3, (4*16)(%r11); + vmovdqu RL3, (5*16)(%r11); + vmovdqu RR4, (6*16)(%r11); + vmovdqu RL4, (7*16)(%r11); + + popq %r12; + + ret; + +.align 16 +.global cast5_ctr_16way +.type cast5_ctr_16way,@function; + +cast5_ctr_16way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (big endian, 64bit) + */ + + pushq %r12; + + movq %rsi, %r11; + movq %rdx, %r12; + + vpcmpeqd RTMP, RTMP, RTMP; + vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */ + + vpcmpeqd RKR, RKR, RKR; + vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ + vmovdqa .Lbswap_iv_mask, R1ST; + vmovdqa .Lbswap128_mask, RKM; + + /* load IV and byteswap */ + vmovq (%rcx), RX; + vpshufb R1ST, RX, RX; + + /* construct IVs */ + vpsubq RTMP, RX, RX; /* le: IV1, IV0 */ + vpshufb RKM, RX, RL1; /* be: IV0, IV1 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RR1; /* be: IV2, IV3 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RL2; /* be: IV4, IV5 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RR2; /* be: IV6, IV7 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RL3; /* be: IV8, IV9 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RR3; /* be: IV10, IV11 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RL4; /* be: IV12, IV13 */ + vpsubq RKR, RX, RX; + vpshufb RKM, RX, RR4; /* be: IV14, IV15 */ + + /* store last IV */ + vpsubq RTMP, RX, RX; /* le: IV16, IV14 */ + vpshufb R1ST, RX, RX; /* be: IV16, IV16 */ + vmovq RX, (%rcx); + + call __cast5_enc_blk16; + + /* dst = src ^ iv */ + vpxor (0*16)(%r12), RR1, RR1; + vpxor (1*16)(%r12), RL1, RL1; + vpxor (2*16)(%r12), RR2, RR2; + vpxor (3*16)(%r12), RL2, RL2; + vpxor (4*16)(%r12), RR3, RR3; + vpxor (5*16)(%r12), RL3, RL3; + vpxor (6*16)(%r12), RR4, RR4; + vpxor (7*16)(%r12), RL4, RL4; + vmovdqu RR1, (0*16)(%r11); + vmovdqu RL1, (1*16)(%r11); + vmovdqu RR2, (2*16)(%r11); + vmovdqu RL2, (3*16)(%r11); + vmovdqu RR3, (4*16)(%r11); + vmovdqu RL3, (5*16)(%r11); + vmovdqu RR4, (6*16)(%r11); + vmovdqu RL4, (7*16)(%r11); + + popq %r12; + + ret; |