diff options
Diffstat (limited to 'arch/arm64/crypto/aes-modes.S')
-rw-r--r-- | arch/arm64/crypto/aes-modes.S | 355 |
1 files changed, 118 insertions, 237 deletions
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S index 2674d43d1384..a68412e1e3a4 100644 --- a/arch/arm64/crypto/aes-modes.S +++ b/arch/arm64/crypto/aes-modes.S @@ -13,127 +13,39 @@ .text .align 4 -/* - * There are several ways to instantiate this code: - * - no interleave, all inline - * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) - * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) - * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) - * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) - * - * Macros imported by this code: - * - enc_prepare - setup NEON registers for encryption - * - dec_prepare - setup NEON registers for decryption - * - enc_switch_key - change to new key after having prepared for encryption - * - encrypt_block - encrypt a single block - * - decrypt block - decrypt a single block - * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2) - * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2) - * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4) - * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4) - */ - -#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) -#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp -#define FRAME_POP ldp x29, x30, [sp],#16 - -#if INTERLEAVE == 2 - -aes_encrypt_block2x: - encrypt_block2x v0, v1, w3, x2, x6, w7 - ret -ENDPROC(aes_encrypt_block2x) - -aes_decrypt_block2x: - decrypt_block2x v0, v1, w3, x2, x6, w7 - ret -ENDPROC(aes_decrypt_block2x) - -#elif INTERLEAVE == 4 - aes_encrypt_block4x: - encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 + encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 ret ENDPROC(aes_encrypt_block4x) aes_decrypt_block4x: - decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 + decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7 ret ENDPROC(aes_decrypt_block4x) -#else -#error INTERLEAVE should equal 2 or 4 -#endif - - .macro do_encrypt_block2x - bl aes_encrypt_block2x - .endm - - .macro do_decrypt_block2x - bl aes_decrypt_block2x - .endm - - .macro do_encrypt_block4x - bl aes_encrypt_block4x - .endm - - .macro do_decrypt_block4x - bl aes_decrypt_block4x - .endm - -#else -#define FRAME_PUSH -#define FRAME_POP - - .macro do_encrypt_block2x - encrypt_block2x v0, v1, w3, x2, x6, w7 - .endm - - .macro do_decrypt_block2x - decrypt_block2x v0, v1, w3, x2, x6, w7 - .endm - - .macro do_encrypt_block4x - encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 - .endm - - .macro do_decrypt_block4x - decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 - .endm - -#endif - /* * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, int first) + * int blocks) * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, int first) + * int blocks) */ AES_ENTRY(aes_ecb_encrypt) - FRAME_PUSH - cbz w5, .LecbencloopNx + stp x29, x30, [sp, #-16]! + mov x29, sp enc_prepare w3, x2, x5 .LecbencloopNx: -#if INTERLEAVE >= 2 - subs w4, w4, #INTERLEAVE + subs w4, w4, #4 bmi .Lecbenc1x -#if INTERLEAVE == 2 - ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ - do_encrypt_block2x - st1 {v0.16b-v1.16b}, [x0], #32 -#else ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ - do_encrypt_block4x + bl aes_encrypt_block4x st1 {v0.16b-v3.16b}, [x0], #64 -#endif b .LecbencloopNx .Lecbenc1x: - adds w4, w4, #INTERLEAVE + adds w4, w4, #4 beq .Lecbencout -#endif .Lecbencloop: ld1 {v0.16b}, [x1], #16 /* get next pt block */ encrypt_block v0, w3, x2, x5, w6 @@ -141,35 +53,27 @@ AES_ENTRY(aes_ecb_encrypt) subs w4, w4, #1 bne .Lecbencloop .Lecbencout: - FRAME_POP + ldp x29, x30, [sp], #16 ret AES_ENDPROC(aes_ecb_encrypt) AES_ENTRY(aes_ecb_decrypt) - FRAME_PUSH - cbz w5, .LecbdecloopNx + stp x29, x30, [sp, #-16]! + mov x29, sp dec_prepare w3, x2, x5 .LecbdecloopNx: -#if INTERLEAVE >= 2 - subs w4, w4, #INTERLEAVE + subs w4, w4, #4 bmi .Lecbdec1x -#if INTERLEAVE == 2 - ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ - do_decrypt_block2x - st1 {v0.16b-v1.16b}, [x0], #32 -#else ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ - do_decrypt_block4x + bl aes_decrypt_block4x st1 {v0.16b-v3.16b}, [x0], #64 -#endif b .LecbdecloopNx .Lecbdec1x: - adds w4, w4, #INTERLEAVE + adds w4, w4, #4 beq .Lecbdecout -#endif .Lecbdecloop: ld1 {v0.16b}, [x1], #16 /* get next ct block */ decrypt_block v0, w3, x2, x5, w6 @@ -177,62 +81,68 @@ AES_ENTRY(aes_ecb_decrypt) subs w4, w4, #1 bne .Lecbdecloop .Lecbdecout: - FRAME_POP + ldp x29, x30, [sp], #16 ret AES_ENDPROC(aes_ecb_decrypt) /* * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, u8 iv[], int first) + * int blocks, u8 iv[]) * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, u8 iv[], int first) + * int blocks, u8 iv[]) */ AES_ENTRY(aes_cbc_encrypt) - cbz w6, .Lcbcencloop - - ld1 {v0.16b}, [x5] /* get iv */ + ld1 {v4.16b}, [x5] /* get iv */ enc_prepare w3, x2, x6 -.Lcbcencloop: - ld1 {v1.16b}, [x1], #16 /* get next pt block */ - eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */ +.Lcbcencloop4x: + subs w4, w4, #4 + bmi .Lcbcenc1x + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ + eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */ encrypt_block v0, w3, x2, x6, w7 - st1 {v0.16b}, [x0], #16 + eor v1.16b, v1.16b, v0.16b + encrypt_block v1, w3, x2, x6, w7 + eor v2.16b, v2.16b, v1.16b + encrypt_block v2, w3, x2, x6, w7 + eor v3.16b, v3.16b, v2.16b + encrypt_block v3, w3, x2, x6, w7 + st1 {v0.16b-v3.16b}, [x0], #64 + mov v4.16b, v3.16b + b .Lcbcencloop4x +.Lcbcenc1x: + adds w4, w4, #4 + beq .Lcbcencout +.Lcbcencloop: + ld1 {v0.16b}, [x1], #16 /* get next pt block */ + eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */ + encrypt_block v4, w3, x2, x6, w7 + st1 {v4.16b}, [x0], #16 subs w4, w4, #1 bne .Lcbcencloop - st1 {v0.16b}, [x5] /* return iv */ +.Lcbcencout: + st1 {v4.16b}, [x5] /* return iv */ ret AES_ENDPROC(aes_cbc_encrypt) AES_ENTRY(aes_cbc_decrypt) - FRAME_PUSH - cbz w6, .LcbcdecloopNx + stp x29, x30, [sp, #-16]! + mov x29, sp ld1 {v7.16b}, [x5] /* get iv */ dec_prepare w3, x2, x6 .LcbcdecloopNx: -#if INTERLEAVE >= 2 - subs w4, w4, #INTERLEAVE + subs w4, w4, #4 bmi .Lcbcdec1x -#if INTERLEAVE == 2 - ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ - mov v2.16b, v0.16b - mov v3.16b, v1.16b - do_decrypt_block2x - eor v0.16b, v0.16b, v7.16b - eor v1.16b, v1.16b, v2.16b - mov v7.16b, v3.16b - st1 {v0.16b-v1.16b}, [x0], #32 -#else ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ mov v4.16b, v0.16b mov v5.16b, v1.16b mov v6.16b, v2.16b - do_decrypt_block4x + bl aes_decrypt_block4x sub x1, x1, #16 eor v0.16b, v0.16b, v7.16b eor v1.16b, v1.16b, v4.16b @@ -240,12 +150,10 @@ AES_ENTRY(aes_cbc_decrypt) eor v2.16b, v2.16b, v5.16b eor v3.16b, v3.16b, v6.16b st1 {v0.16b-v3.16b}, [x0], #64 -#endif b .LcbcdecloopNx .Lcbcdec1x: - adds w4, w4, #INTERLEAVE + adds w4, w4, #4 beq .Lcbcdecout -#endif .Lcbcdecloop: ld1 {v1.16b}, [x1], #16 /* get next ct block */ mov v0.16b, v1.16b /* ...and copy to v0 */ @@ -256,49 +164,33 @@ AES_ENTRY(aes_cbc_decrypt) subs w4, w4, #1 bne .Lcbcdecloop .Lcbcdecout: - FRAME_POP st1 {v7.16b}, [x5] /* return iv */ + ldp x29, x30, [sp], #16 ret AES_ENDPROC(aes_cbc_decrypt) /* * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, - * int blocks, u8 ctr[], int first) + * int blocks, u8 ctr[]) */ AES_ENTRY(aes_ctr_encrypt) - FRAME_PUSH - cbz w6, .Lctrnotfirst /* 1st time around? */ + stp x29, x30, [sp, #-16]! + mov x29, sp + enc_prepare w3, x2, x6 ld1 {v4.16b}, [x5] -.Lctrnotfirst: - umov x8, v4.d[1] /* keep swabbed ctr in reg */ - rev x8, x8 -#if INTERLEAVE >= 2 - cmn w8, w4 /* 32 bit overflow? */ + umov x6, v4.d[1] /* keep swabbed ctr in reg */ + rev x6, x6 + cmn w6, w4 /* 32 bit overflow? */ bcs .Lctrloop .LctrloopNx: - subs w4, w4, #INTERLEAVE + subs w4, w4, #4 bmi .Lctr1x -#if INTERLEAVE == 2 - mov v0.8b, v4.8b - mov v1.8b, v4.8b - rev x7, x8 - add x8, x8, #1 - ins v0.d[1], x7 - rev x7, x8 - add x8, x8, #1 - ins v1.d[1], x7 - ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */ - do_encrypt_block2x - eor v0.16b, v0.16b, v2.16b - eor v1.16b, v1.16b, v3.16b - st1 {v0.16b-v1.16b}, [x0], #32 -#else ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ - dup v7.4s, w8 + dup v7.4s, w6 mov v0.16b, v4.16b add v7.4s, v7.4s, v8.4s mov v1.16b, v4.16b @@ -309,29 +201,27 @@ AES_ENTRY(aes_ctr_encrypt) mov v2.s[3], v8.s[1] mov v3.s[3], v8.s[2] ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ - do_encrypt_block4x + bl aes_encrypt_block4x eor v0.16b, v5.16b, v0.16b ld1 {v5.16b}, [x1], #16 /* get 1 input block */ eor v1.16b, v6.16b, v1.16b eor v2.16b, v7.16b, v2.16b eor v3.16b, v5.16b, v3.16b st1 {v0.16b-v3.16b}, [x0], #64 - add x8, x8, #INTERLEAVE -#endif - rev x7, x8 + add x6, x6, #4 + rev x7, x6 ins v4.d[1], x7 cbz w4, .Lctrout b .LctrloopNx .Lctr1x: - adds w4, w4, #INTERLEAVE + adds w4, w4, #4 beq .Lctrout -#endif .Lctrloop: mov v0.16b, v4.16b - encrypt_block v0, w3, x2, x6, w7 + encrypt_block v0, w3, x2, x8, w7 - adds x8, x8, #1 /* increment BE ctr */ - rev x7, x8 + adds x6, x6, #1 /* increment BE ctr */ + rev x7, x6 ins v4.d[1], x7 bcs .Lctrcarry /* overflow? */ @@ -345,12 +235,12 @@ AES_ENTRY(aes_ctr_encrypt) .Lctrout: st1 {v4.16b}, [x5] /* return next CTR value */ - FRAME_POP + ldp x29, x30, [sp], #16 ret .Lctrtailblock: st1 {v0.16b}, [x0] - FRAME_POP + ldp x29, x30, [sp], #16 ret .Lctrcarry: @@ -384,39 +274,26 @@ CPU_LE( .quad 1, 0x87 ) CPU_BE( .quad 0x87, 1 ) AES_ENTRY(aes_xts_encrypt) - FRAME_PUSH - cbz w7, .LxtsencloopNx + stp x29, x30, [sp, #-16]! + mov x29, sp ld1 {v4.16b}, [x6] - enc_prepare w3, x5, x6 - encrypt_block v4, w3, x5, x6, w7 /* first tweak */ - enc_switch_key w3, x2, x6 + cbz w7, .Lxtsencnotfirst + + enc_prepare w3, x5, x8 + encrypt_block v4, w3, x5, x8, w7 /* first tweak */ + enc_switch_key w3, x2, x8 ldr q7, .Lxts_mul_x b .LxtsencNx +.Lxtsencnotfirst: + enc_prepare w3, x2, x8 .LxtsencloopNx: ldr q7, .Lxts_mul_x next_tweak v4, v4, v7, v8 .LxtsencNx: -#if INTERLEAVE >= 2 - subs w4, w4, #INTERLEAVE + subs w4, w4, #4 bmi .Lxtsenc1x -#if INTERLEAVE == 2 - ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ - next_tweak v5, v4, v7, v8 - eor v0.16b, v0.16b, v4.16b - eor v1.16b, v1.16b, v5.16b - do_encrypt_block2x - eor v0.16b, v0.16b, v4.16b - eor v1.16b, v1.16b, v5.16b - st1 {v0.16b-v1.16b}, [x0], #32 - cbz w4, .LxtsencoutNx - next_tweak v4, v5, v7, v8 - b .LxtsencNx -.LxtsencoutNx: - mov v4.16b, v5.16b - b .Lxtsencout -#else ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ next_tweak v5, v4, v7, v8 eor v0.16b, v0.16b, v4.16b @@ -425,7 +302,7 @@ AES_ENTRY(aes_xts_encrypt) eor v2.16b, v2.16b, v6.16b next_tweak v7, v6, v7, v8 eor v3.16b, v3.16b, v7.16b - do_encrypt_block4x + bl aes_encrypt_block4x eor v3.16b, v3.16b, v7.16b eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b @@ -434,15 +311,13 @@ AES_ENTRY(aes_xts_encrypt) mov v4.16b, v7.16b cbz w4, .Lxtsencout b .LxtsencloopNx -#endif .Lxtsenc1x: - adds w4, w4, #INTERLEAVE + adds w4, w4, #4 beq .Lxtsencout -#endif .Lxtsencloop: ld1 {v1.16b}, [x1], #16 eor v0.16b, v1.16b, v4.16b - encrypt_block v0, w3, x2, x6, w7 + encrypt_block v0, w3, x2, x8, w7 eor v0.16b, v0.16b, v4.16b st1 {v0.16b}, [x0], #16 subs w4, w4, #1 @@ -450,45 +325,33 @@ AES_ENTRY(aes_xts_encrypt) next_tweak v4, v4, v7, v8 b .Lxtsencloop .Lxtsencout: - FRAME_POP + st1 {v4.16b}, [x6] + ldp x29, x30, [sp], #16 ret AES_ENDPROC(aes_xts_encrypt) AES_ENTRY(aes_xts_decrypt) - FRAME_PUSH - cbz w7, .LxtsdecloopNx + stp x29, x30, [sp, #-16]! + mov x29, sp ld1 {v4.16b}, [x6] - enc_prepare w3, x5, x6 - encrypt_block v4, w3, x5, x6, w7 /* first tweak */ - dec_prepare w3, x2, x6 + cbz w7, .Lxtsdecnotfirst + + enc_prepare w3, x5, x8 + encrypt_block v4, w3, x5, x8, w7 /* first tweak */ + dec_prepare w3, x2, x8 ldr q7, .Lxts_mul_x b .LxtsdecNx +.Lxtsdecnotfirst: + dec_prepare w3, x2, x8 .LxtsdecloopNx: ldr q7, .Lxts_mul_x next_tweak v4, v4, v7, v8 .LxtsdecNx: -#if INTERLEAVE >= 2 - subs w4, w4, #INTERLEAVE + subs w4, w4, #4 bmi .Lxtsdec1x -#if INTERLEAVE == 2 - ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ - next_tweak v5, v4, v7, v8 - eor v0.16b, v0.16b, v4.16b - eor v1.16b, v1.16b, v5.16b - do_decrypt_block2x - eor v0.16b, v0.16b, v4.16b - eor v1.16b, v1.16b, v5.16b - st1 {v0.16b-v1.16b}, [x0], #32 - cbz w4, .LxtsdecoutNx - next_tweak v4, v5, v7, v8 - b .LxtsdecNx -.LxtsdecoutNx: - mov v4.16b, v5.16b - b .Lxtsdecout -#else ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ next_tweak v5, v4, v7, v8 eor v0.16b, v0.16b, v4.16b @@ -497,7 +360,7 @@ AES_ENTRY(aes_xts_decrypt) eor v2.16b, v2.16b, v6.16b next_tweak v7, v6, v7, v8 eor v3.16b, v3.16b, v7.16b - do_decrypt_block4x + bl aes_decrypt_block4x eor v3.16b, v3.16b, v7.16b eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b @@ -506,15 +369,13 @@ AES_ENTRY(aes_xts_decrypt) mov v4.16b, v7.16b cbz w4, .Lxtsdecout b .LxtsdecloopNx -#endif .Lxtsdec1x: - adds w4, w4, #INTERLEAVE + adds w4, w4, #4 beq .Lxtsdecout -#endif .Lxtsdecloop: ld1 {v1.16b}, [x1], #16 eor v0.16b, v1.16b, v4.16b - decrypt_block v0, w3, x2, x6, w7 + decrypt_block v0, w3, x2, x8, w7 eor v0.16b, v0.16b, v4.16b st1 {v0.16b}, [x0], #16 subs w4, w4, #1 @@ -522,7 +383,8 @@ AES_ENTRY(aes_xts_decrypt) next_tweak v4, v4, v7, v8 b .Lxtsdecloop .Lxtsdecout: - FRAME_POP + st1 {v4.16b}, [x6] + ldp x29, x30, [sp], #16 ret AES_ENDPROC(aes_xts_decrypt) @@ -533,8 +395,28 @@ AES_ENDPROC(aes_xts_decrypt) AES_ENTRY(aes_mac_update) ld1 {v0.16b}, [x4] /* get dg */ enc_prepare w2, x1, x7 - cbnz w5, .Lmacenc + cbz w5, .Lmacloop4x + + encrypt_block v0, w2, x1, x7, w8 +.Lmacloop4x: + subs w3, w3, #4 + bmi .Lmac1x + ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */ + eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */ + encrypt_block v0, w2, x1, x7, w8 + eor v0.16b, v0.16b, v2.16b + encrypt_block v0, w2, x1, x7, w8 + eor v0.16b, v0.16b, v3.16b + encrypt_block v0, w2, x1, x7, w8 + eor v0.16b, v0.16b, v4.16b + cmp w3, wzr + csinv x5, x6, xzr, eq + cbz w5, .Lmacout + encrypt_block v0, w2, x1, x7, w8 + b .Lmacloop4x +.Lmac1x: + add w3, w3, #4 .Lmacloop: cbz w3, .Lmacout ld1 {v1.16b}, [x0], #16 /* get next pt block */ @@ -544,7 +426,6 @@ AES_ENTRY(aes_mac_update) csinv x5, x6, xzr, eq cbz w5, .Lmacout -.Lmacenc: encrypt_block v0, w2, x1, x7, w8 b .Lmacloop |