diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 14:22:45 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-01-31 14:22:45 -0800 |
commit | a103950e0dd2058df5e8a8d4a915707bdcf205f0 (patch) | |
tree | af5d091f768db4ed7a12fc3c5484d3e20ad9d514 /arch | |
parent | 2cfa1cd3da14814a1e9ec6a4fce8612637d3ee3d (diff) | |
parent | 2d55807b7f7bf62bb05a8b91247c5eb7cd19ac04 (diff) |
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto updates from Herbert Xu:
"API:
- Enforce the setting of keys for keyed aead/hash/skcipher
algorithms.
- Add multibuf speed tests in tcrypt.
Algorithms:
- Improve performance of sha3-generic.
- Add native sha512 support on arm64.
- Add v8.2 Crypto Extentions version of sha3/sm3 on arm64.
- Avoid hmac nesting by requiring underlying algorithm to be unkeyed.
- Add cryptd_max_cpu_qlen module parameter to cryptd.
Drivers:
- Add support for EIP97 engine in inside-secure.
- Add inline IPsec support to chelsio.
- Add RevB core support to crypto4xx.
- Fix AEAD ICV check in crypto4xx.
- Add stm32 crypto driver.
- Add support for BCM63xx platforms in bcm2835 and remove bcm63xx.
- Add Derived Key Protocol (DKP) support in caam.
- Add Samsung Exynos True RNG driver.
- Add support for Exynos5250+ SoCs in exynos PRNG driver"
* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (166 commits)
crypto: picoxcell - Fix error handling in spacc_probe()
crypto: arm64/sha512 - fix/improve new v8.2 Crypto Extensions code
crypto: arm64/sm3 - new v8.2 Crypto Extensions implementation
crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation
crypto: testmgr - add new testcases for sha3
crypto: sha3-generic - export init/update/final routines
crypto: sha3-generic - simplify code
crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize
crypto: sha3-generic - fixes for alignment and big endian operation
crypto: aesni - handle zero length dst buffer
crypto: artpec6 - remove select on non-existing CRYPTO_SHA384
hwrng: bcm2835 - Remove redundant dev_err call in bcm2835_rng_probe()
crypto: stm32 - remove redundant dev_err call in stm32_cryp_probe()
crypto: axis - remove unnecessary platform_get_resource() error check
crypto: testmgr - test misuse of result in ahash
crypto: inside-secure - make function safexcel_try_push_requests static
crypto: aes-generic - fix aes-generic regression on powerpc
crypto: chelsio - Fix indentation warning
crypto: arm64/sha1-ce - get rid of literal pool
crypto: arm64/sha2-ce - move the round constant table to .rodata section
...
Diffstat (limited to 'arch')
34 files changed, 1327 insertions, 716 deletions
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index 18768f330449..07e31941dc67 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -181,9 +181,8 @@ static int cbc_init(struct crypto_tfm *tfm) struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm); ctx->enc_tfm = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(ctx->enc_tfm)) - return PTR_ERR(ctx->enc_tfm); - return 0; + + return PTR_ERR_OR_ZERO(ctx->enc_tfm); } static void cbc_exit(struct crypto_tfm *tfm) @@ -258,9 +257,8 @@ static int xts_init(struct crypto_tfm *tfm) struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm); ctx->tweak_tfm = crypto_alloc_cipher("aes", 0, 0); - if (IS_ERR(ctx->tweak_tfm)) - return PTR_ERR(ctx->tweak_tfm); - return 0; + + return PTR_ERR_OR_ZERO(ctx->tweak_tfm); } static void xts_exit(struct crypto_tfm *tfm) diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c index 1b0e0e86ee9c..96e62ec105d0 100644 --- a/arch/arm/crypto/crc32-ce-glue.c +++ b/arch/arm/crypto/crc32-ce-glue.c @@ -188,6 +188,7 @@ static struct shash_alg crc32_pmull_algs[] = { { .base.cra_name = "crc32", .base.cra_driver_name = "crc32-arm-ce", .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .base.cra_blocksize = 1, .base.cra_module = THIS_MODULE, }, { @@ -203,6 +204,7 @@ static struct shash_alg crc32_pmull_algs[] = { { .base.cra_name = "crc32c", .base.cra_driver_name = "crc32c-arm-ce", .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .base.cra_blocksize = 1, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 70c517aa4501..285c36c7b408 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -29,6 +29,24 @@ config CRYPTO_SHA2_ARM64_CE select CRYPTO_HASH select CRYPTO_SHA256_ARM64 +config CRYPTO_SHA512_ARM64_CE + tristate "SHA-384/SHA-512 digest algorithm (ARMv8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SHA512_ARM64 + +config CRYPTO_SHA3_ARM64 + tristate "SHA3 digest algorithm (ARMv8.2 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SHA3 + +config CRYPTO_SM3_ARM64_CE + tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_SM3 + config CRYPTO_GHASH_ARM64_CE tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index b5edc5918c28..cee9b8d9830b 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -14,6 +14,15 @@ sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o +obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o +sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o + +obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o +sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o + +obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o +sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o + obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o @@ -24,7 +33,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S new file mode 100644 index 000000000000..8efdfdade393 --- /dev/null +++ b/arch/arm64/crypto/aes-ce-core.S @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .arch armv8-a+crypto + +ENTRY(__aes_ce_encrypt) + sub w3, w3, #2 + ld1 {v0.16b}, [x2] + ld1 {v1.4s}, [x0], #16 + cmp w3, #10 + bmi 0f + bne 3f + mov v3.16b, v1.16b + b 2f +0: mov v2.16b, v1.16b + ld1 {v3.4s}, [x0], #16 +1: aese v0.16b, v2.16b + aesmc v0.16b, v0.16b +2: ld1 {v1.4s}, [x0], #16 + aese v0.16b, v3.16b + aesmc v0.16b, v0.16b +3: ld1 {v2.4s}, [x0], #16 + subs w3, w3, #3 + aese v0.16b, v1.16b + aesmc v0.16b, v0.16b + ld1 {v3.4s}, [x0], #16 + bpl 1b + aese v0.16b, v2.16b + eor v0.16b, v0.16b, v3.16b + st1 {v0.16b}, [x1] + ret +ENDPROC(__aes_ce_encrypt) + +ENTRY(__aes_ce_decrypt) + sub w3, w3, #2 + ld1 {v0.16b}, [x2] + ld1 {v1.4s}, [x0], #16 + cmp w3, #10 + bmi 0f + bne 3f + mov v3.16b, v1.16b + b 2f +0: mov v2.16b, v1.16b + ld1 {v3.4s}, [x0], #16 +1: aesd v0.16b, v2.16b + aesimc v0.16b, v0.16b +2: ld1 {v1.4s}, [x0], #16 + aesd v0.16b, v3.16b + aesimc v0.16b, v0.16b +3: ld1 {v2.4s}, [x0], #16 + subs w3, w3, #3 + aesd v0.16b, v1.16b + aesimc v0.16b, v0.16b + ld1 {v3.4s}, [x0], #16 + bpl 1b + aesd v0.16b, v2.16b + eor v0.16b, v0.16b, v3.16b + st1 {v0.16b}, [x1] + ret +ENDPROC(__aes_ce_decrypt) + +/* + * __aes_ce_sub() - use the aese instruction to perform the AES sbox + * substitution on each byte in 'input' + */ +ENTRY(__aes_ce_sub) + dup v1.4s, w0 + movi v0.16b, #0 + aese v0.16b, v1.16b + umov w0, v0.s[0] + ret +ENDPROC(__aes_ce_sub) + +ENTRY(__aes_ce_invert) + ld1 {v0.4s}, [x1] + aesimc v1.16b, v0.16b + st1 {v1.4s}, [x0] + ret +ENDPROC(__aes_ce_invert) diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c index 6a75cd75ed11..e6b3227bbf57 100644 --- a/arch/arm64/crypto/aes-ce-cipher.c +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -29,6 +29,13 @@ struct aes_block { u8 b[AES_BLOCK_SIZE]; }; +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds); +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds); + +asmlinkage u32 __aes_ce_sub(u32 l); +asmlinkage void __aes_ce_invert(struct aes_block *out, + const struct aes_block *in); + static int num_rounds(struct crypto_aes_ctx *ctx) { /* @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx) static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - struct aes_block *out = (struct aes_block *)dst; - struct aes_block const *in = (struct aes_block *)src; - void *dummy0; - int dummy1; if (!may_use_simd()) { __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); @@ -55,49 +58,13 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) } kernel_neon_begin(); - - __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.4s}, [%[key]], #16 ;" - " cmp %w[rounds], #10 ;" - " bmi 0f ;" - " bne 3f ;" - " mov v3.16b, v1.16b ;" - " b 2f ;" - "0: mov v2.16b, v1.16b ;" - " ld1 {v3.4s}, [%[key]], #16 ;" - "1: aese v0.16b, v2.16b ;" - " aesmc v0.16b, v0.16b ;" - "2: ld1 {v1.4s}, [%[key]], #16 ;" - " aese v0.16b, v3.16b ;" - " aesmc v0.16b, v0.16b ;" - "3: ld1 {v2.4s}, [%[key]], #16 ;" - " subs %w[rounds], %w[rounds], #3 ;" - " aese v0.16b, v1.16b ;" - " aesmc v0.16b, v0.16b ;" - " ld1 {v3.4s}, [%[key]], #16 ;" - " bpl 1b ;" - " aese v0.16b, v2.16b ;" - " eor v0.16b, v0.16b, v3.16b ;" - " st1 {v0.16b}, %[out] ;" - - : [out] "=Q"(*out), - [key] "=r"(dummy0), - [rounds] "=r"(dummy1) - : [in] "Q"(*in), - "1"(ctx->key_enc), - "2"(num_rounds(ctx) - 2) - : "cc"); - + __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); kernel_neon_end(); } static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - struct aes_block *out = (struct aes_block *)dst; - struct aes_block const *in = (struct aes_block *)src; - void *dummy0; - int dummy1; if (!may_use_simd()) { __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); @@ -105,62 +72,10 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) } kernel_neon_begin(); - - __asm__(" ld1 {v0.16b}, %[in] ;" - " ld1 {v1.4s}, [%[key]], #16 ;" - " cmp %w[rounds], #10 ;" - " bmi 0f ;" - " bne 3f ;" - " mov v3.16b, v1.16b ;" - " b 2f ;" - "0: mov v2.16b, v1.16b ;" - " ld1 {v3.4s}, [%[key]], #16 ;" - "1: aesd v0.16b, v2.16b ;" - " aesimc v0.16b, v0.16b ;" - "2: ld1 {v1.4s}, [%[key]], #16 ;" - " aesd v0.16b, v3.16b ;" - " aesimc v0.16b, v0.16b ;" - "3: ld1 {v2.4s}, [%[key]], #16 ;" - " subs %w[rounds], %w[rounds], #3 ;" - " aesd v0.16b, v1.16b ;" - " aesimc v0.16b, v0.16b ;" - " ld1 {v3.4s}, [%[key]], #16 ;" - " bpl 1b ;" - " aesd v0.16b, v2.16b ;" - " eor v0.16b, v0.16b, v3.16b ;" - " st1 {v0.16b}, %[out] ;" - - : [out] "=Q"(*out), - [key] "=r"(dummy0), - [rounds] "=r"(dummy1) - : [in] "Q"(*in), - "1"(ctx->key_dec), - "2"(num_rounds(ctx) - 2) - : "cc"); - + __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); kernel_neon_end(); } -/* - * aes_sub() - use the aese instruction to perform the AES sbox substitution - * on each byte in 'input' - */ -static u32 aes_sub(u32 input) -{ - u32 ret; - - __asm__("dup v1.4s, %w[in] ;" - "movi v0.16b, #0 ;" - "aese v0.16b, v1.16b ;" - "umov %w[out], v0.4s[0] ;" - - : [out] "=r"(ret) - : [in] "r"(input) - : "v0","v1"); - - return ret; -} - int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len) { @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, u32 *rki = ctx->key_enc + (i * kwords); u32 *rko = rki + kwords; - rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; + rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0]; rko[1] = rko[0] ^ rki[1]; rko[2] = rko[1] ^ rki[2]; rko[3] = rko[2] ^ rki[3]; @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, } else if (key_len == AES_KEYSIZE_256) { if (i >= 6) break; - rko[4] = aes_sub(rko[3]) ^ rki[4]; + rko[4] = __aes_ce_sub(rko[3]) ^ rki[4]; rko[5] = rko[4] ^ rki[5]; rko[6] = rko[5] ^ rki[6]; rko[7] = rko[6] ^ rki[7]; @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, key_dec[0] = key_enc[j]; for (i = 1, j--; j > 0; i++, j--) - __asm__("ld1 {v0.4s}, %[in] ;" - "aesimc v1.16b, v0.16b ;" - "st1 {v1.4s}, %[out] ;" - - : [out] "=Q"(key_dec[i]) - : [in] "Q"(key_enc[j]) - : "v0","v1"); + __aes_ce_invert(key_dec + i, key_enc + j); key_dec[i] = key_enc[0]; kernel_neon_end(); diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S index 6d2445d603cc..3a44eada2347 100644 --- a/arch/arm64/crypto/aes-cipher-core.S +++ b/arch/arm64/crypto/aes-cipher-core.S @@ -125,6 +125,16 @@ CPU_BE( rev w7, w7 ) ret .endm +ENTRY(__aes_arm64_encrypt) + do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 +ENDPROC(__aes_arm64_encrypt) + + .align 5 +ENTRY(__aes_arm64_decrypt) + do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0 +ENDPROC(__aes_arm64_decrypt) + + .section ".rodata", "a" .align L1_CACHE_SHIFT .type __aes_arm64_inverse_sbox, %object __aes_arm64_inverse_sbox: @@ -161,12 +171,3 @@ __aes_arm64_inverse_sbox: .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d .size __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox - -ENTRY(__aes_arm64_encrypt) - do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 -ENDPROC(__aes_arm64_encrypt) - - .align 5 -ENTRY(__aes_arm64_decrypt) - do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0 -ENDPROC(__aes_arm64_decrypt) diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 998ba519a026..2fa850e86aa8 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -665,6 +665,7 @@ static int __init aes_init(void) unregister_simds: aes_exit(); + return err; unregister_ciphers: crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); return err; diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S index f1e3aa2732f9..1c7b45b7268e 100644 --- a/arch/arm64/crypto/aes-neon.S +++ b/arch/arm64/crypto/aes-neon.S @@ -32,10 +32,10 @@ /* preload the entire Sbox */ .macro prepare, sbox, shiftrows, temp - adr \temp, \sbox movi v12.16b, #0x1b - ldr q13, \shiftrows - ldr q14, .Lror32by8 + ldr_l q13, \shiftrows, \temp + ldr_l q14, .Lror32by8, \temp + adr_l \temp, \sbox ld1 {v16.16b-v19.16b}, [\temp], #64 ld1 {v20.16b-v23.16b}, [\temp], #64 ld1 {v24.16b-v27.16b}, [\temp], #64 @@ -272,7 +272,7 @@ #include "aes-modes.S" - .text + .section ".rodata", "a" .align 6 .LForward_Sbox: .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S index 18f5a8442276..16ed3c7ebd37 100644 --- a/arch/arm64/crypto/crc32-ce-core.S +++ b/arch/arm64/crypto/crc32-ce-core.S @@ -50,7 +50,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> - .text + .section ".rodata", "a" .align 6 .cpu generic+crypto+crc @@ -115,12 +115,13 @@ * uint crc32_pmull_le(unsigned char const *buffer, * size_t len, uint crc32) */ + .text ENTRY(crc32_pmull_le) - adr x3, .Lcrc32_constants + adr_l x3, .Lcrc32_constants b 0f ENTRY(crc32c_pmull_le) - adr x3, .Lcrc32c_constants + adr_l x3, .Lcrc32c_constants 0: bic LEN, LEN, #15 ld1 {v1.16b-v4.16b}, [BUF], #0x40 diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c index 624f4137918c..34b4e3d46aab 100644 --- a/arch/arm64/crypto/crc32-ce-glue.c +++ b/arch/arm64/crypto/crc32-ce-glue.c @@ -185,6 +185,7 @@ static struct shash_alg crc32_pmull_algs[] = { { .base.cra_name = "crc32", .base.cra_driver_name = "crc32-arm64-ce", .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .base.cra_blocksize = 1, .base.cra_module = THIS_MODULE, }, { @@ -200,6 +201,7 @@ static struct shash_alg crc32_pmull_algs[] = { { .base.cra_name = "crc32c", .base.cra_driver_name = "crc32c-arm64-ce", .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .base.cra_blocksize = 1, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S index d5b5a8c038c8..f179c01bd55c 100644 --- a/arch/arm64/crypto/crct10dif-ce-core.S +++ b/arch/arm64/crypto/crct10dif-ce-core.S @@ -128,7 +128,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) // XOR the initial_crc value eor v0.16b, v0.16b, v10.16b - ldr q10, rk3 // xmm10 has rk3 and rk4 + ldr_l q10, rk3, x8 // xmm10 has rk3 and rk4 // type of pmull instruction // will determine which constant to use @@ -184,13 +184,13 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) // fold the 8 vector registers to 1 vector register with different // constants - ldr q10, rk9 + ldr_l q10, rk9, x8 .macro fold16, reg, rk pmull v8.1q, \reg\().1d, v10.1d pmull2 \reg\().1q, \reg\().2d, v10.2d .ifnb \rk - ldr q10, \rk + ldr_l q10, \rk, x8 .endif eor v7.16b, v7.16b, v8.16b eor v7.16b, v7.16b, \reg\().16b @@ -251,7 +251,7 @@ CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) // get rid of the extra data that was loaded before // load the shift constant - adr x4, tbl_shf_table + 16 + adr_l x4, tbl_shf_table + 16 sub x4, x4, arg3 ld1 {v0.16b}, [x4] @@ -275,7 +275,7 @@ CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 ) _128_done: // compute crc of a 128-bit value - ldr q10, rk5 // rk5 and rk6 in xmm10 + ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10 // 64b fold ext v0.16b, vzr.16b, v7.16b, #8 @@ -291,7 +291,7 @@ _128_done: // barrett reduction _barrett: - ldr q10, rk7 + ldr_l q10, rk7, x8 mov v0.d[0], v7.d[1] pmull v0.1q, v0.1d, v10.1d @@ -321,7 +321,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) b.eq _128_done // exactly 16 left b.lt _less_than_16_left - ldr q10, rk1 // rk1 and rk2 in xmm10 + ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10 // update the counter. subtract 32 instead of 16 to save one // instruction from the loop @@ -333,7 +333,7 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 ) _less_than_16_left: // shl r9, 4 - adr x0, tbl_shf_table + 16 + adr_l x0, tbl_shf_table + 16 sub x0, x0, arg3 ld1 {v0.16b}, [x0] movi v9.16b, #0x80 @@ -345,6 +345,7 @@ ENDPROC(crc_t10dif_pmull) // precomputed constants // these constants are precomputed from the poly: // 0x8bb70000 (0x8bb7 scaled to 32 bits) + .section ".rodata", "a" .align 4 // Q = 0x18BB70000 // rk1 = 2^(32*3) mod Q << 32 diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index 8550408735a0..46049850727d 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -58,12 +58,11 @@ sha1su1 v\s0\().4s, v\s3\().4s .endm - /* - * The SHA1 round constants - */ - .align 4 -.Lsha1_rcon: - .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 + .macro loadrc, k, val, tmp + movz \tmp, :abs_g0_nc:\val + movk \tmp, :abs_g1:\val + dup \k, \tmp + .endm /* * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, @@ -71,11 +70,10 @@ */ ENTRY(sha1_ce_transform) /* load round constants */ - adr x6, .Lsha1_rcon - ld1r {k0.4s}, [x6], #4 - ld1r {k1.4s}, [x6], #4 - ld1r {k2.4s}, [x6], #4 - ld1r {k3.4s}, [x6] + loadrc k0.4s, 0x5a827999, w6 + loadrc k1.4s, 0x6ed9eba1, w6 + loadrc k2.4s, 0x8f1bbcdc, w6 + loadrc k3.4s, 0xca62c1d6, w6 /* load state */ ld1 {dgav.4s}, [x0] diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S index 679c6c002f4f..4c3c89b812ce 100644 --- a/arch/arm64/crypto/sha2-ce-core.S +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -53,6 +53,7 @@ /* * The SHA-256 round constants */ + .section ".rodata", "a" .align 4 .Lsha2_rcon: .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 @@ -76,9 +77,10 @@ * void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, * int blocks) */ + .text ENTRY(sha2_ce_transform) /* load round constants */ - adr x8, .Lsha2_rcon + adr_l x8, .Lsha2_rcon ld1 { v0.4s- v3.4s}, [x8], #64 ld1 { v4.4s- v7.4s}, [x8], #64 ld1 { v8.4s-v11.4s}, [x8], #64 diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S new file mode 100644 index 000000000000..332ad7530690 --- /dev/null +++ b/arch/arm64/crypto/sha3-ce-core.S @@ -0,0 +1,210 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 + .set .Lv\b\().2d, \b + .set .Lv\b\().16b, \b + .endr + + /* + * ARMv8.2 Crypto Extensions instructions + */ + .macro eor3, rd, rn, rm, ra + .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) + .endm + + .macro rax1, rd, rn, rm + .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro bcax, rd, rn, rm, ra + .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) + .endm + + .macro xar, rd, rn, rm, imm6 + .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16) + .endm + + /* + * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) + */ + .text +ENTRY(sha3_ce_transform) + /* load state */ + add x8, x0, #32 + ld1 { v0.1d- v3.1d}, [x0] + ld1 { v4.1d- v7.1d}, [x8], #32 + ld1 { v8.1d-v11.1d}, [x8], #32 + ld1 {v12.1d-v15.1d}, [x8], #32 + ld1 {v16.1d-v19.1d}, [x8], #32 + ld1 {v20.1d-v23.1d}, [x8], #32 + ld1 {v24.1d}, [x8] + +0: sub w2, w2, #1 + mov w8, #24 + adr_l x9, .Lsha3_rcon + + /* load input */ + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v31.8b}, [x1], #24 + eor v0.8b, v0.8b, v25.8b + eor v1.8b, v1.8b, v26.8b + eor v2.8b, v2.8b, v27.8b + eor v3.8b, v3.8b, v28.8b + eor v4.8b, v4.8b, v29.8b + eor v5.8b, v5.8b, v30.8b + eor v6.8b, v6.8b, v31.8b + + tbnz x3, #6, 2f // SHA3-512 + + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b-v30.8b}, [x1], #16 + eor v7.8b, v7.8b, v25.8b + eor v8.8b, v8.8b, v26.8b + eor v9.8b, v9.8b, v27.8b + eor v10.8b, v10.8b, v28.8b + eor v11.8b, v11.8b, v29.8b + eor v12.8b, v12.8b, v30.8b + + tbnz x3, #4, 1f // SHA3-384 or SHA3-224 + + // SHA3-256 + ld1 {v25.8b-v28.8b}, [x1], #32 + eor v13.8b, v13.8b, v25.8b + eor v14.8b, v14.8b, v26.8b + eor v15.8b, v15.8b, v27.8b + eor v16.8b, v16.8b, v28.8b + b 3f + +1: tbz x3, #2, 3f // bit 2 cleared? SHA-384 + + // SHA3-224 + ld1 {v25.8b-v28.8b}, [x1], #32 + ld1 {v29.8b}, [x1], #8 + eor v13.8b, v13.8b, v25.8b + eor v14.8b, v14.8b, v26.8b + eor v15.8b, v15.8b, v27.8b + eor v16.8b, v16.8b, v28.8b + eor v17.8b, v17.8b, v29.8b + b 3f + + // SHA3-512 +2: ld1 {v25.8b-v26.8b}, [x1], #16 + eor v7.8b, v7.8b, v25.8b + eor v8.8b, v8.8b, v26.8b + +3: sub w8, w8, #1 + + eor3 v29.16b, v4.16b, v9.16b, v14.16b + eor3 v26.16b, v1.16b, v6.16b, v11.16b + eor3 v28.16b, v3.16b, v8.16b, v13.16b + eor3 v25.16b, v0.16b, v5.16b, v10.16b + eor3 v27.16b, v2.16b, v7.16b, v12.16b + eor3 v29.16b, v29.16b, v19.16b, v24.16b + eor3 v26.16b, v26.16b, v16.16b, v21.16b + eor3 v28.16b, v28.16b, v18.16b, v23.16b + eor3 v25.16b, v25.16b, v15.16b, v20.16b + eor3 v27.16b, v27.16b, v17.16b, v22.16b + + rax1 v30.2d, v29.2d, v26.2d // bc[0] + rax1 v26.2d, v26.2d, v28.2d // bc[2] + rax1 v28.2d, v28.2d, v25.2d // bc[4] + rax1 v25.2d, v25.2d, v27.2d // bc[1] + rax1 v27.2d, v27.2d, v29.2d // bc[3] + + eor v0.16b, v0.16b, v30.16b + xar v29.2d, v1.2d, v25.2d, (64 - 1) + xar v1.2d, v6.2d, v25.2d, (64 - 44) + xar v6.2d, v9.2d, v28.2d, (64 - 20) + xar v9.2d, v22.2d, v26.2d, (64 - 61) + xar v22.2d, v14.2d, v28.2d, (64 - 39) + xar v14.2d, v20.2d, v30.2d, (64 - 18) + xar v31.2d, v2.2d, v26.2d, (64 - 62) + xar v2.2d, v12.2d, v26.2d, (64 - 43) + xar v12.2d, v13.2d, v27.2d, (64 - 25) + xar v13.2d, v19.2d, v28.2d, (64 - 8) + xar v19.2d, v23.2d, v27.2d, (64 - 56) + xar v23.2d, v15.2d, v30.2d, (64 - 41) + xar v15.2d, v4.2d, v28.2d, (64 - 27) + xar v28.2d, v24.2d, v28.2d, (64 - 14) + xar v24.2d, v21.2d, v25.2d, (64 - 2) + xar v8.2d, v8.2d, v27.2d, (64 - 55) + xar v4.2d, v16.2d, v25.2d, (64 - 45) + xar v16.2d, v5.2d, v30.2d, (64 - 36) + xar v5.2d, v3.2d, v27.2d, (64 - 28) + xar v27.2d, v18.2d, v27.2d, (64 - 21) + xar v3.2d, v17.2d, v26.2d, (64 - 15) + xar v25.2d, v11.2d, v25.2d, (64 - 10) + xar v26.2d, v7.2d, v26.2d, (64 - 6) + xar v30.2d, v10.2d, v30.2d, (64 - 3) + + bcax v20.16b, v31.16b, v22.16b, v8.16b + bcax v21.16b, v8.16b, v23.16b, v22.16b + bcax v22.16b, v22.16b, v24.16b, v23.16b + bcax v23.16b, v23.16b, v31.16b, v24.16b + bcax v24.16b, v24.16b, v8.16b, v31.16b + + ld1r {v31.2d}, [x9], #8 + + bcax v17.16b, v25.16b, v19.16b, v3.16b + bcax v18.16b, v3.16b, v15.16b, v19.16b + bcax v19.16b, v19.16b, v16.16b, v15.16b + bcax v15.16b, v15.16b, v25.16b, v16.16b + bcax v16.16b, v16.16b, v3.16b, v25.16b + + bcax v10.16b, v29.16b, v12.16b, v26.16b + bcax v11.16b, v26.16b, v13.16b, v12.16b + bcax v12.16b, v12.16b, v14.16b, v13.16b + bcax v13.16b, v13.16b, v29.16b, v14.16b + bcax v14.16b, v14.16b, v26.16b, v29.16b + + bcax v7.16b, v30.16b, v9.16b, v4.16b + bcax v8.16b, v4.16b, v5.16b, v9.16b + bcax v9.16b, v9.16b, v6.16b, v5.16b + bcax v5.16b, v5.16b, v30.16b, v6.16b + bcax v6.16b, v6.16b, v4.16b, v30.16b + + bcax v3.16b, v27.16b, v0.16b, v28.16b + bcax v4.16b, v28.16b, v1.16b, v0.16b + bcax v0.16b, v0.16b, v2.16b, v1.16b + bcax v1.16b, v1.16b, v27.16b, v2.16b + bcax v2.16b, v2.16b, v28.16b, v27.16b + + eor v0.16b, v0.16b, v31.16b + + cbnz w8, 3b + cbnz w2, 0b + + /* save state */ + st1 { v0.1d- v3.1d}, [x0], #32 + st1 { v4.1d- v7.1d}, [x0], #32 + st1 { v8.1d-v11.1d}, [x0], #32 + st1 {v12.1d-v15.1d}, [x0], #32 + st1 {v16.1d-v19.1d}, [x0], #32 + st1 {v20.1d-v23.1d}, [x0], #32 + st1 {v24.1d}, [x0] + ret +ENDPROC(sha3_ce_transform) + + .section ".rodata", "a" + .align 8 +.Lsha3_rcon: + .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a + .quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001 + .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a + .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a + .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089 + .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080 + .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081 + .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c new file mode 100644 index 000000000000..da8222e528bd --- /dev/null +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -0,0 +1,161 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/sha3.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks, + int md_len); + +static int sha3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha3_state *sctx = shash_desc_ctx(desc); + unsigned int digest_size = crypto_shash_digestsize(desc->tfm); + + if (!may_use_simd()) + return crypto_sha3_update(desc, data, len); + + if ((sctx->partial + len) >= sctx->rsiz) { + int blocks; + + if (sctx->partial) { + int p = sctx->rsiz - sctx->partial; + + memcpy(sctx->buf + sctx->partial, data, p); + kernel_neon_begin(); + sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); + kernel_neon_end(); + + data += p; + len -= p; + sctx->partial = 0; + } + + blocks = len / sctx->rsiz; + len %= sctx->rsiz; + + if (blocks) { + kernel_neon_begin(); + sha3_ce_transform(sctx->st, data, blocks, digest_size); + kernel_neon_end(); + data += blocks * sctx->rsiz; + } + } + + if (len) { + memcpy(sctx->buf + sctx->partial, data, len); + sctx->partial += len; + } + return 0; +} + +static int sha3_final(struct shash_desc *desc, u8 *out) +{ + struct sha3_state *sctx = shash_desc_ctx(desc); + unsigned int digest_size = crypto_shash_digestsize(desc->tfm); + __le64 *digest = (__le64 *)out; + int i; + + if (!may_use_simd()) + return crypto_sha3_final(desc, out); + + sctx->buf[sctx->partial++] = 0x06; + memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial); + sctx->buf[sctx->rsiz - 1] |= 0x80; + + kernel_neon_begin(); + sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); + kernel_neon_end(); + + for (i = 0; i < digest_size / 8; i++) + put_unaligned_le64(sctx->st[i], digest++); + + if (digest_size & 4) + put_unaligned_le32(sctx->st[i], (__le32 *)digest); + + *sctx = (struct sha3_state){}; + return 0; +} + +static struct shash_alg algs[] = { { + .digestsize = SHA3_224_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_update, + .final = sha3_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-224", + .base.cra_driver_name = "sha3-224-ce", + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA3_224_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}, { + .digestsize = SHA3_256_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_update, + .final = sha3_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-256", + .base.cra_driver_name = "sha3-256-ce", + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA3_256_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}, { + .digestsize = SHA3_384_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_update, + .final = sha3_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-384", + .base.cra_driver_name = "sha3-384-ce", + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA3_384_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}, { + .digestsize = SHA3_512_DIGEST_SIZE, + .init = crypto_sha3_init, + .update = sha3_update, + .final = sha3_final, + .descsize = sizeof(struct sha3_state), + .base.cra_name = "sha3-512", + .base.cra_driver_name = "sha3-512-ce", + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA3_512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +} }; + +static int __init sha3_neon_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha3_neon_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_cpu_feature_match(SHA3, sha3_neon_mod_init); +module_exit(sha3_neon_mod_fini); diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S new file mode 100644 index 000000000000..7f3bca5c59a2 --- /dev/null +++ b/arch/arm64/crypto/sha512-ce-core.S @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19 + .set .Lq\b, \b + .set .Lv\b\().2d, \b + .endr + + .macro sha512h, rd, rn, rm + .inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sha512h2, rd, rn, rm + .inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sha512su0, rd, rn + .inst 0xcec08000 | .L\rd | (.L\rn << 5) + .endm + + .macro sha512su1, rd, rn, rm + .inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + /* + * The SHA-512 round constants + */ + .section ".rodata", "a" + .align 4 +.Lsha512_rcon: + .quad 0x428a2f98d728ae22, 0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538, 0x59f111f1b605d019 + .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242, 0x12835b0145706fbe + .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235, 0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .quad 0x983e5152ee66dfab, 0xa831c66d2db43210 + .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .quad 0x06ca6351e003826f, 0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6, 0x92722c851482353b + .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .quad 0xd192e819d6ef5218, 0xd69906245565a910 + .quad 0xf40e35855771202a, 0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .quad 0x90befffa23631e28, 0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .quad 0xca273eceea26619c, 0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae, 0x1b710b35131c471b + .quad 0x28db77f523047d84, 0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 + + .macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4 + .ifnb \rc1 + ld1 {v\rc1\().2d}, [x4], #16 + .endif + add v5.2d, v\rc0\().2d, v\in0\().2d + ext v6.16b, v\i2\().16b, v\i3\().16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + ext v7.16b, v\i1\().16b, v\i2\().16b, #8 + add v\i3\().2d, v\i3\().2d, v5.2d + .ifnb \in1 + ext v5.16b, v\in3\().16b, v\in4\().16b, #8 + sha512su0 v\in0\().2d, v\in1\().2d + .endif + sha512h q\i3, q6, v7.2d + .ifnb \in1 + sha512su1 v\in0\().2d, v\in2\().2d, v5.2d + .endif + add v\i4\().2d, v\i1\().2d, v\i3\().2d + sha512h2 q\i3, q\i1, v\i0\().2d + .endm + + /* + * void sha512_ce_transform(struct sha512_state *sst, u8 const *src, + * int blocks) + */ + .text +ENTRY(sha512_ce_transform) + /* load state */ + ld1 {v8.2d-v11.2d}, [x0] + + /* load first 4 round constants */ + adr_l x3, .Lsha512_rcon + ld1 {v20.2d-v23.2d}, [x3], #64 + + /* load input */ +0: ld1 {v12.2d-v15.2d}, [x1], #64 + ld1 {v16.2d-v19.2d}, [x1], #64 + sub w2, w2, #1 + +CPU_LE( rev64 v12.16b, v12.16b ) +CPU_LE( rev64 v13.16b, v13.16b ) +CPU_LE( rev64 v14.16b, v14.16b ) +CPU_LE( rev64 v15.16b, v15.16b ) +CPU_LE( rev64 v16.16b, v16.16b ) +CPU_LE( rev64 v17.16b, v17.16b ) +CPU_LE( rev64 v18.16b, v18.16b ) +CPU_LE( rev64 v19.16b, v19.16b ) + + mov x4, x3 // rc pointer + + mov v0.16b, v8.16b + mov v1.16b, v9.16b + mov v2.16b, v10.16b + mov v3.16b, v11.16b + + // v0 ab cd -- ef gh ab + // v1 cd -- ef gh ab cd + // v2 ef gh ab cd -- ef + // v3 gh ab cd -- ef gh + // v4 -- ef gh ab cd -- + + dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17 + dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18 + dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19 + dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12 + dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13 + + dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14 + dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15 + dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16 + dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17 + dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18 + + dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19 + dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12 + dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13 + dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14 + dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15 + + dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16 + dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17 + dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18 + dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19 + dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12 + + dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13 + dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14 + dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15 + dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16 + dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17 + + dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18 + dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19 + dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12 + dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13 + dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14 + + dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15 + dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16 + dround 2, 3, 1, 4, 0, 28, 24, 12 + dround 4, 2, 0, 1, 3, 29, 25, 13 + dround 1, 4, 3, 0, 2, 30, 26, 14 + + dround 0, 1, 2, 3, 4, 31, 27, 15 + dround 3, 0, 4, 2, 1, 24, , 16 + dround 2, 3, 1, 4, 0, 25, , 17 + dround 4, 2, 0, 1, 3, 26, , 18 + dround 1, 4, 3, 0, 2, 27, , 19 + + /* update state */ + add v8.2d, v8.2d, v0.2d + add v9.2d, v9.2d, v1.2d + add v10.2d, v10.2d, v2.2d + add v11.2d, v11.2d, v3.2d + + /* handled all input blocks? */ + cbnz w2, 0b + + /* store new state */ +3: st1 {v8.2d-v11.2d}, [x0] + ret +ENDPROC(sha512_ce_transform) diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c new file mode 100644 index 000000000000..a77c8632a589 --- /dev/null +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/sha.h> +#include <crypto/sha512_base.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sha512_ce_transform(struct sha512_state *sst, u8 const *src, + int blocks); + +asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks); + +static int sha512_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + if (!may_use_simd()) + return sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_block_data_order); + + kernel_neon_begin(); + sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_ce_transform); + kernel_neon_end(); + + return 0; +} + +static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!may_use_simd()) { + if (len) + sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_block_data_order); + sha512_base_do_finalize(desc, + (sha512_block_fn *)sha512_block_data_order); + return sha512_base_finish(desc, out); + } + + kernel_neon_begin(); + sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_ce_transform); + sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_ce_transform); + kernel_neon_end(); + return sha512_base_finish(desc, out); +} + +static int sha512_ce_final(struct shash_desc *desc, u8 *out) +{ + if (!may_use_simd()) { + sha512_base_do_finalize(desc, + (sha512_block_fn *)sha512_block_data_order); + return sha512_base_finish(desc, out); + } + + kernel_neon_begin(); + sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_ce_transform); + kernel_neon_end(); + return sha512_base_finish(desc, out); +} + +static struct shash_alg algs[] = { { + .init = sha384_base_init, + .update = sha512_ce_update, + .final = sha512_ce_final, + .finup = sha512_ce_finup, + .descsize = sizeof(struct sha512_state), + .digestsize = SHA384_DIGEST_SIZE, + .base.cra_name = "sha384", + .base.cra_driver_name = "sha384-ce", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .init = sha512_base_init, + .update = sha512_ce_update, + .final = sha512_ce_final, + .finup = sha512_ce_finup, + .descsize = sizeof(struct sha512_state), + .digestsize = SHA512_DIGEST_SIZE, + .base.cra_name = "sha512", + .base.cra_driver_name = "sha512-ce", + .base.cra_priority = 200, + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SHA512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int __init sha512_ce_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha512_ce_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_cpu_feature_match(SHA512, sha512_ce_mod_init); +module_exit(sha512_ce_mod_fini); diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index aff35c9992a4..27db4851e380 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -27,6 +27,7 @@ MODULE_ALIAS_CRYPTO("sha512"); asmlinkage void sha512_block_data_order(u32 *digest, const void *data, unsigned int num_blks); +EXPORT_SYMBOL(sha512_block_data_order); static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len) diff --git a/arch/arm64/crypto/sm3-ce-core.S b/arch/arm64/crypto/sm3-ce-core.S new file mode 100644 index 000000000000..27169fe07a68 --- /dev/null +++ b/arch/arm64/crypto/sm3-ce-core.S @@ -0,0 +1,141 @@ +/* + * sm3-ce-core.S - SM3 secure hash using ARMv8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 + .set .Lv\b\().4s, \b + .endr + + .macro sm3partw1, rd, rn, rm + .inst 0xce60c000 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sm3partw2, rd, rn, rm + .inst 0xce60c400 | .L\rd | (.L\rn << 5) | (.L\rm << 16) + .endm + + .macro sm3ss1, rd, rn, rm, ra + .inst 0xce400000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16) + .endm + + .macro sm3tt1a, rd, rn, rm, imm2 + .inst 0xce408000 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16) + .endm + + .macro sm3tt1b, rd, rn, rm, imm2 + .inst 0xce408400 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16) + .endm + + .macro sm3tt2a, rd, rn, rm, imm2 + .inst 0xce408800 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16) + .endm + + .macro sm3tt2b, rd, rn, rm, imm2 + .inst 0xce408c00 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16) + .endm + + .macro round, ab, s0, t0, t1, i + sm3ss1 v5.4s, v8.4s, \t0\().4s, v9.4s + shl \t1\().4s, \t0\().4s, #1 + sri \t1\().4s, \t0\().4s, #31 + sm3tt1\ab v8.4s, v5.4s, v10.4s, \i + sm3tt2\ab v9.4s, v5.4s, \s0\().4s, \i + .endm + + .macro qround, ab, s0, s1, s2, s3, s4 + .ifnb \s4 + ext \s4\().16b, \s1\().16b, \s2\().16b, #12 + ext v6.16b, \s0\().16b, \s1\().16b, #12 + ext v7.16b, \s2\().16b, \s3\().16b, #8 + sm3partw1 \s4\().4s, \s0\().4s, \s3\().4s + .endif + + eor v10.16b, \s0\().16b, \s1\().16b + + round \ab, \s0, v11, v12, 0 + round \ab, \s0, v12, v11, 1 + round \ab, \s0, v11, v12, 2 + round \ab, \s0, v12, v11, 3 + + .ifnb \s4 + sm3partw2 \s4\().4s, v7.4s, v6.4s + .endif + .endm + + /* + * void sm3_ce_transform(struct sm3_state *sst, u8 const *src, + * int blocks) + */ + .text +ENTRY(sm3_ce_transform) + /* load state */ + ld1 {v8.4s-v9.4s}, [x0] + rev64 v8.4s, v8.4s + rev64 v9.4s, v9.4s + ext v8.16b, v8.16b, v8.16b, #8 + ext v9.16b, v9.16b, v9.16b, #8 + + adr_l x8, .Lt + ldp s13, s14, [x8] + + /* load input */ +0: ld1 {v0.16b-v3.16b}, [x1], #64 + sub w2, w2, #1 + + mov v15.16b, v8.16b + mov v16.16b, v9.16b + +CPU_LE( rev32 v0.16b, v0.16b ) +CPU_LE( rev32 v1.16b, v1.16b ) +CPU_LE( rev32 v2.16b, v2.16b ) +CPU_LE( rev32 v3.16b, v3.16b ) + + ext v11.16b, v13.16b, v13.16b, #4 + + qround a, v0, v1, v2, v3, v4 + qround a, v1, v2, v3, v4, v0 + qround a, v2, v3, v4, v0, v1 + qround a, v3, v4, v0, v1, v2 + + ext v11.16b, v14.16b, v14.16b, #4 + + qround b, v4, v0, v1, v2, v3 + qround b, v0, v1, v2, v3, v4 + qround b, v1, v2, v3, v4, v0 + qround b, v2, v3, v4, v0, v1 + qround b, v3, v4, v0, v1, v2 + qround b, v4, v0, v1, v2, v3 + qround b, v0, v1, v2, v3, v4 + qround b, v1, v2, v3, v4, v0 + qround b, v2, v3, v4, v0, v1 + qround b, v3, v4 + qround b, v4, v0 + qround b, v0, v1 + + eor v8.16b, v8.16b, v15.16b + eor v9.16b, v9.16b, v16.16b + + /* handled all input blocks? */ + cbnz w2, 0b + + /* save state */ + rev64 v8.4s, v8.4s + rev64 v9.4s, v9.4s + ext v8.16b, v8.16b, v8.16b, #8 + ext v9.16b, v9.16b, v9.16b, #8 + st1 {v8.4s-v9.4s}, [x0] + ret +ENDPROC(sm3_ce_transform) + + .section ".rodata", "a" + .align 3 +.Lt: .word 0x79cc4519, 0x9d8a7a87 diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c new file mode 100644 index 000000000000..3b4948f7e26f --- /dev/null +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -0,0 +1,92 @@ +/* + * sm3-ce-glue.c - SM3 secure hash using ARMv8.2 Crypto Extensions + * + * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/sm3.h> +#include <crypto/sm3_base.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src, + int blocks); + +static int sm3_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + if (!may_use_simd()) + return crypto_sm3_update(desc, data, len); + + kernel_neon_begin(); + sm3_base_do_update(desc, data, len, sm3_ce_transform); + kernel_neon_end(); + + return 0; +} + +static int sm3_ce_final(struct shash_desc *desc, u8 *out) +{ + if (!may_use_simd()) + return crypto_sm3_finup(desc, NULL, 0, out); + + kernel_neon_begin(); + sm3_base_do_finalize(desc, sm3_ce_transform); + kernel_neon_end(); + + return sm3_base_finish(desc, out); +} + +static int sm3_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!may_use_simd()) + return crypto_sm3_finup(desc, data, len, out); + + kernel_neon_begin(); + sm3_base_do_update(desc, data, len, sm3_ce_transform); + kernel_neon_end(); + + return sm3_ce_final(desc, out); +} + +static struct shash_alg sm3_alg = { + .digestsize = SM3_DIGEST_SIZE, + .init = sm3_base_init, + .update = sm3_ce_update, + .final = sm3_ce_final, + .finup = sm3_ce_finup, + .descsize = sizeof(struct sm3_state), + .base.cra_name = "sm3", + .base.cra_driver_name = "sm3-ce", + .base.cra_flags = CRYPTO_ALG_TYPE_SHASH, + .base.cra_blocksize = SM3_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + .base.cra_priority = 200, +}; + +static int __init sm3_ce_mod_init(void) +{ + return crypto_register_shash(&sm3_alg); +} + +static void __exit sm3_ce_mod_fini(void) +{ + crypto_unregister_shash(&sm3_alg); +} + +module_cpu_feature_match(SM3, sm3_ce_mod_init); +module_exit(sm3_ce_mod_fini); diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c index f058e0c3e4d4..fd1d6c83f0c0 100644 --- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c +++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c @@ -141,6 +141,7 @@ static struct shash_alg alg = { .cra_name = "crc32c", .cra_driver_name = "crc32c-vpmsum", .cra_priority = 200, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = THIS_MODULE, diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c index 436865926c26..423ee05887e6 100644 --- a/arch/s390/crypto/crc32-vx.c +++ b/arch/s390/crypto/crc32-vx.c @@ -239,6 +239,7 @@ static struct shash_alg crc32_vx_algs[] = { .cra_name = "crc32", .cra_driver_name = "crc32-vx", .cra_priority = 200, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CRC32_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crc_ctx), .cra_module = THIS_MODULE, @@ -259,6 +260,7 @@ static struct shash_alg crc32_vx_algs[] = { .cra_name = "crc32be", .cra_driver_name = "crc32be-vx", .cra_priority = 200, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CRC32_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crc_ctx), .cra_module = THIS_MODULE, @@ -279,6 +281,7 @@ static struct shash_alg crc32_vx_algs[] = { .cra_name = "crc32c", .cra_driver_name = "crc32c-vx", .cra_priority = 200, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CRC32_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crc_ctx), .cra_module = THIS_MODULE, diff --git a/arch/sparc/crypto/crc32c_glue.c b/arch/sparc/crypto/crc32c_glue.c index d1064e46efe8..8aa664638c3c 100644 --- a/arch/sparc/crypto/crc32c_glue.c +++ b/arch/sparc/crypto/crc32c_glue.c @@ -133,6 +133,7 @@ static struct shash_alg alg = { .cra_name = "crc32c", .cra_driver_name = "crc32c-sparc64", .cra_priority = SPARC_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_alignmask = 7, diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 3d09e3aca18d..12e8484a8ee7 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -90,30 +90,6 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ALL_F: .octa 0xffffffffffffffffffffffffffffffff .octa 0x00000000000000000000000000000000 -.section .rodata -.align 16 -.type aad_shift_arr, @object -.size aad_shift_arr, 272 -aad_shift_arr: - .octa 0xffffffffffffffffffffffffffffffff - .octa 0xffffffffffffffffffffffffffffff0C - .octa 0xffffffffffffffffffffffffffff0D0C - .octa 0xffffffffffffffffffffffffff0E0D0C - .octa 0xffffffffffffffffffffffff0F0E0D0C - .octa 0xffffffffffffffffffffff0C0B0A0908 - .octa 0xffffffffffffffffffff0D0C0B0A0908 - .octa 0xffffffffffffffffff0E0D0C0B0A0908 - .octa 0xffffffffffffffff0F0E0D0C0B0A0908 - .octa 0xffffffffffffff0C0B0A090807060504 - .octa 0xffffffffffff0D0C0B0A090807060504 - .octa 0xffffffffff0E0D0C0B0A090807060504 - .octa 0xffffffff0F0E0D0C0B0A090807060504 - .octa 0xffffff0C0B0A09080706050403020100 - .octa 0xffff0D0C0B0A09080706050403020100 - .octa 0xff0E0D0C0B0A09080706050403020100 - .octa 0x0F0E0D0C0B0A09080706050403020100 - - .text @@ -257,6 +233,37 @@ aad_shift_arr: pxor \TMP1, \GH # result is in TMP1 .endm +# Reads DLEN bytes starting at DPTR and stores in XMMDst +# where 0 < DLEN < 16 +# Clobbers %rax, DLEN and XMM1 +.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst + cmp $8, \DLEN + jl _read_lt8_\@ + mov (\DPTR), %rax + MOVQ_R64_XMM %rax, \XMMDst + sub $8, \DLEN + jz _done_read_partial_block_\@ + xor %eax, %eax +_read_next_byte_\@: + shl $8, %rax + mov 7(\DPTR, \DLEN, 1), %al + dec \DLEN + jnz _read_next_byte_\@ + MOVQ_R64_XMM %rax, \XMM1 + pslldq $8, \XMM1 + por \XMM1, \XMMDst + jmp _done_read_partial_block_\@ +_read_lt8_\@: + xor %eax, %eax +_read_next_byte_lt8_\@: + shl $8, %rax + mov -1(\DPTR, \DLEN, 1), %al + dec \DLEN + jnz _read_next_byte_lt8_\@ + MOVQ_R64_XMM %rax, \XMMDst +_done_read_partial_block_\@: +.endm + /* * if a = number of total plaintext bytes * b = floor(a/16) @@ -273,62 +280,30 @@ aad_shift_arr: XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation MOVADQ SHUF_MASK(%rip), %xmm14 mov arg7, %r10 # %r10 = AAD - mov arg8, %r12 # %r12 = aadLen - mov %r12, %r11 + mov arg8, %r11 # %r11 = aadLen pxor %xmm\i, %xmm\i pxor \XMM2, \XMM2 cmp $16, %r11 - jl _get_AAD_rest8\num_initial_blocks\operation + jl _get_AAD_rest\num_initial_blocks\operation _get_AAD_blocks\num_initial_blocks\operation: movdqu (%r10), %xmm\i PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data pxor %xmm\i, \XMM2 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 add $16, %r10 - sub $16, %r12 sub $16, %r11 cmp $16, %r11 jge _get_AAD_blocks\num_initial_blocks\operation movdqu \XMM2, %xmm\i + + /* read the last <16B of AAD */ +_get_AAD_rest\num_initial_blocks\operation: cmp $0, %r11 je _get_AAD_done\num_initial_blocks\operation - pxor %xmm\i,%xmm\i - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\num_initial_blocks\operation: - cmp $4, %r11 - jle _get_AAD_rest4\num_initial_blocks\operation - movq (%r10), \TMP1 - add $8, %r10 - sub $8, %r11 - pslldq $8, \TMP1 - psrldq $8, %xmm\i - pxor \TMP1, %xmm\i - jmp _get_AAD_rest8\num_initial_blocks\operation -_get_AAD_rest4\num_initial_blocks\operation: - cmp $0, %r11 - jle _get_AAD_rest0\num_initial_blocks\operation - mov (%r10), %eax - movq %rax, \TMP1 - add $4, %r10 - sub $4, %r10 - pslldq $12, \TMP1 - psrldq $4, %xmm\i - pxor \TMP1, %xmm\i -_get_AAD_rest0\num_initial_blocks\operation: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \TMP1 - PSHUFB_XMM \TMP1, %xmm\i -_get_AAD_rest_final\num_initial_blocks\operation: + READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data pxor \XMM2, %xmm\i GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 @@ -532,62 +507,30 @@ _initial_blocks_done\num_initial_blocks\operation: XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation MOVADQ SHUF_MASK(%rip), %xmm14 mov arg7, %r10 # %r10 = AAD - mov arg8, %r12 # %r12 = aadLen - mov %r12, %r11 + mov arg8, %r11 # %r11 = aadLen pxor %xmm\i, %xmm\i pxor \XMM2, \XMM2 cmp $16, %r11 - jl _get_AAD_rest8\num_initial_blocks\operation + jl _get_AAD_rest\num_initial_blocks\operation _get_AAD_blocks\num_initial_blocks\operation: movdqu (%r10), %xmm\i PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data pxor %xmm\i, \XMM2 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 add $16, %r10 - sub $16, %r12 sub $16, %r11 cmp $16, %r11 jge _get_AAD_blocks\num_initial_blocks\operation movdqu \XMM2, %xmm\i + + /* read the last <16B of AAD */ +_get_AAD_rest\num_initial_blocks\operation: cmp $0, %r11 je _get_AAD_done\num_initial_blocks\operation - pxor %xmm\i,%xmm\i - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some PT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\num_initial_blocks\operation: - cmp $4, %r11 - jle _get_AAD_rest4\num_initial_blocks\operation - movq (%r10), \TMP1 - add $8, %r10 - sub $8, %r11 - pslldq $8, \TMP1 - psrldq $8, %xmm\i - pxor \TMP1, %xmm\i - jmp _get_AAD_rest8\num_initial_blocks\operation -_get_AAD_rest4\num_initial_blocks\operation: - cmp $0, %r11 - jle _get_AAD_rest0\num_initial_blocks\operation - mov (%r10), %eax - movq %rax, \TMP1 - add $4, %r10 - sub $4, %r10 - pslldq $12, \TMP1 - psrldq $4, %xmm\i - pxor \TMP1, %xmm\i -_get_AAD_rest0\num_initial_blocks\operation: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \TMP1 - PSHUFB_XMM \TMP1, %xmm\i -_get_AAD_rest_final\num_initial_blocks\operation: + READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data pxor \XMM2, %xmm\i GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 @@ -1386,14 +1329,6 @@ _esb_loop_\@: * * AAD Format with 64-bit Extended Sequence Number * -* aadLen: -* from the definition of the spec, aadLen can only be 8 or 12 bytes. -* The code supports 16 too but for other sizes, the code will fail. -* -* TLen: -* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. -* For other sizes, the code will fail. -* * poly = x^128 + x^127 + x^126 + x^121 + 1 * *****************************************************************************/ @@ -1487,19 +1422,16 @@ _zero_cipher_left_decrypt: PSHUFB_XMM %xmm10, %xmm0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) - sub $16, %r11 - add %r13, %r11 - movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 -# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes -# (%r13 is the number of bytes in plaintext mod 16) - movdqu (%r12), %xmm2 # get the appropriate shuffle mask - PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes + lea (%arg3,%r11,1), %r10 + mov %r13, %r12 + READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 + + lea ALL_F+16(%rip), %r12 + sub %r13, %r12 movdqa %xmm1, %xmm2 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + movdqu (%r12), %xmm1 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 pand %xmm1, %xmm2 @@ -1508,9 +1440,6 @@ _zero_cipher_left_decrypt: pxor %xmm2, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - # GHASH computation for the last <16 byte block - sub %r13, %r11 - add $16, %r11 # output %r13 bytes MOVQ_R64_XMM %xmm0, %rax @@ -1664,14 +1593,6 @@ ENDPROC(aesni_gcm_dec) * * AAD Format with 64-bit Extended Sequence Number * -* aadLen: -* from the definition of the spec, aadLen can only be 8 or 12 bytes. -* The code supports 16 too but for other sizes, the code will fail. -* -* TLen: -* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. -* For other sizes, the code will fail. -* * poly = x^128 + x^127 + x^126 + x^121 + 1 ***************************************************************************/ ENTRY(aesni_gcm_enc) @@ -1764,19 +1685,16 @@ _zero_cipher_left_encrypt: movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10, %xmm0 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) - sub $16, %r11 - add %r13, %r11 - movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks - lea SHIFT_MASK+16(%rip), %r12 + + lea (%arg3,%r11,1), %r10 + mov %r13, %r12 + READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 + + lea ALL_F+16(%rip), %r12 sub %r13, %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (%r13 is the number of bytes in plaintext mod 16) - movdqu (%r12), %xmm2 # get the appropriate shuffle mask - PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + movdqu (%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 movdqa SHUF_MASK(%rip), %xmm10 @@ -1785,9 +1703,6 @@ _zero_cipher_left_encrypt: pxor %xmm0, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # GHASH computation for the last <16 byte block - sub %r13, %r11 - add $16, %r11 - movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10, %xmm0 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 3bf3dcf29825..34cf1c1f8c98 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -690,8 +690,8 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); } -static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, - unsigned int key_len) +static int gcmaes_wrapper_set_key(struct crypto_aead *parent, const u8 *key, + unsigned int key_len) { struct cryptd_aead **ctx = crypto_aead_ctx(parent); struct cryptd_aead *cryptd_tfm = *ctx; @@ -716,8 +716,8 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, /* This is the Integrity Check Value (aka the authentication tag length and can * be 8, 12 or 16 bytes long. */ -static int rfc4106_set_authsize(struct crypto_aead *parent, - unsigned int authsize) +static int gcmaes_wrapper_set_authsize(struct crypto_aead *parent, + unsigned int authsize) { struct cryptd_aead **ctx = crypto_aead_ctx(parent); struct cryptd_aead *cryptd_tfm = *ctx; @@ -824,7 +824,7 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, if (sg_is_last(req->src) && (!PageHighMem(sg_page(req->src)) || req->src->offset + req->src->length <= PAGE_SIZE) && - sg_is_last(req->dst) && + sg_is_last(req->dst) && req->dst->length && (!PageHighMem(sg_page(req->dst)) || req->dst->offset + req->dst->length <= PAGE_SIZE)) { one_entry_in_sg = 1; @@ -929,7 +929,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req) aes_ctx); } -static int rfc4106_encrypt(struct aead_request *req) +static int gcmaes_wrapper_encrypt(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cryptd_aead **ctx = crypto_aead_ctx(tfm); @@ -945,7 +945,7 @@ static int rfc4106_encrypt(struct aead_request *req) return crypto_aead_encrypt(req); } -static int rfc4106_decrypt(struct aead_request *req) +static int gcmaes_wrapper_decrypt(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct cryptd_aead **ctx = crypto_aead_ctx(tfm); @@ -1117,7 +1117,7 @@ static int generic_gcmaes_decrypt(struct aead_request *req) { __be32 counter = cpu_to_be32(1); struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); void *aes_ctx = &(ctx->aes_key_expanded); u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN))); @@ -1128,6 +1128,30 @@ static int generic_gcmaes_decrypt(struct aead_request *req) aes_ctx); } +static int generic_gcmaes_init(struct crypto_aead *aead) +{ + struct cryptd_aead *cryptd_tfm; + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_tfm = cryptd_alloc_aead("__driver-generic-gcm-aes-aesni", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + *ctx = cryptd_tfm; + crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); + + return 0; +} + +static void generic_gcmaes_exit(struct crypto_aead *aead) +{ + struct cryptd_aead **ctx = crypto_aead_ctx(aead); + + cryptd_free_aead(*ctx); +} + static struct aead_alg aesni_aead_algs[] = { { .setkey = common_rfc4106_set_key, .setauthsize = common_rfc4106_set_authsize, @@ -1147,10 +1171,10 @@ static struct aead_alg aesni_aead_algs[] = { { }, { .init = rfc4106_init, .exit = rfc4106_exit, - .setkey = rfc4106_set_key, - .setauthsize = rfc4106_set_authsize, - .encrypt = rfc4106_encrypt, - .decrypt = rfc4106_decrypt, + .setkey = gcmaes_wrapper_set_key, + .setauthsize = gcmaes_wrapper_set_authsize, + .encrypt = gcmaes_wrapper_encrypt, + .decrypt = gcmaes_wrapper_decrypt, .ivsize = GCM_RFC4106_IV_SIZE, .maxauthsize = 16, .base = { @@ -1170,13 +1194,31 @@ static struct aead_alg aesni_aead_algs[] = { { .ivsize = GCM_AES_IV_SIZE, .maxauthsize = 16, .base = { + .cra_name = "__generic-gcm-aes-aesni", + .cra_driver_name = "__driver-generic-gcm-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), + .cra_alignmask = AESNI_ALIGN - 1, + .cra_module = THIS_MODULE, + }, +}, { + .init = generic_gcmaes_init, + .exit = generic_gcmaes_exit, + .setkey = gcmaes_wrapper_set_key, + .setauthsize = gcmaes_wrapper_set_authsize, + .encrypt = gcmaes_wrapper_encrypt, + .decrypt = gcmaes_wrapper_decrypt, + .ivsize = GCM_AES_IV_SIZE, + .maxauthsize = 16, + .base = { .cra_name = "gcm(aes)", .cra_driver_name = "generic-gcm-aesni", .cra_priority = 400, .cra_flags = CRYPTO_ALG_ASYNC, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), - .cra_alignmask = AESNI_ALIGN - 1, + .cra_ctxsize = sizeof(struct cryptd_aead *), .cra_module = THIS_MODULE, }, } }; diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 1e6af1b35f7b..dce7c5d39c2f 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -107,7 +107,6 @@ static struct skcipher_alg alg = { .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha20_ctx), - .base.cra_alignmask = sizeof(u32) - 1, .base.cra_module = THIS_MODULE, .min_keysize = CHACHA20_KEY_SIZE, diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 27226df3f7d8..c8d9cdacbf10 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -162,6 +162,7 @@ static struct shash_alg alg = { .cra_name = "crc32", .cra_driver_name = "crc32-pclmul", .cra_priority = 200, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = THIS_MODULE, diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index c194d5717ae5..5773e1161072 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -226,6 +226,7 @@ static struct shash_alg alg = { .cra_name = "crc32c", .cra_driver_name = "crc32c-intel", .cra_priority = 200, + .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = CHKSUM_BLOCK_SIZE, .cra_ctxsize = sizeof(u32), .cra_module = THIS_MODULE, diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index e32142bc071d..790377797544 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -164,14 +164,12 @@ static struct shash_alg alg = { .init = poly1305_simd_init, .update = poly1305_simd_update, .final = crypto_poly1305_final, - .setkey = crypto_poly1305_setkey, .descsize = sizeof(struct poly1305_simd_desc_ctx), .base = { .cra_name = "poly1305", .cra_driver_name = "poly1305-simd", .cra_priority = 300, .cra_flags = CRYPTO_ALG_TYPE_SHASH, - .cra_alignmask = sizeof(u32) - 1, .cra_blocksize = POLY1305_BLOCK_SIZE, .cra_module = THIS_MODULE, }, diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S index 329452b8f794..6014b7b9e52a 100644 --- a/arch/x86/crypto/salsa20-i586-asm_32.S +++ b/arch/x86/crypto/salsa20-i586-asm_32.S @@ -1,6 +1,7 @@ -# salsa20_pm.s version 20051229 -# D. J. Bernstein -# Public domain. +# Derived from: +# salsa20_pm.s version 20051229 +# D. J. Bernstein +# Public domain. #include <linux/linkage.h> @@ -935,180 +936,3 @@ ENTRY(salsa20_encrypt_bytes) # goto bytesatleast1 jmp ._bytesatleast1 ENDPROC(salsa20_encrypt_bytes) - -# enter salsa20_keysetup -ENTRY(salsa20_keysetup) - mov %esp,%eax - and $31,%eax - add $256,%eax - sub %eax,%esp - # eax_stack = eax - movl %eax,64(%esp) - # ebx_stack = ebx - movl %ebx,68(%esp) - # esi_stack = esi - movl %esi,72(%esp) - # edi_stack = edi - movl %edi,76(%esp) - # ebp_stack = ebp - movl %ebp,80(%esp) - # k = arg2 - movl 8(%esp,%eax),%ecx - # kbits = arg3 - movl 12(%esp,%eax),%edx - # x = arg1 - movl 4(%esp,%eax),%eax - # in1 = *(uint32 *) (k + 0) - movl 0(%ecx),%ebx - # in2 = *(uint32 *) (k + 4) - movl 4(%ecx),%esi - # in3 = *(uint32 *) (k + 8) - movl 8(%ecx),%edi - # in4 = *(uint32 *) (k + 12) - movl 12(%ecx),%ebp - # *(uint32 *) (x + 4) = in1 - movl %ebx,4(%eax) - # *(uint32 *) (x + 8) = in2 - movl %esi,8(%eax) - # *(uint32 *) (x + 12) = in3 - movl %edi,12(%eax) - # *(uint32 *) (x + 16) = in4 - movl %ebp,16(%eax) - # kbits - 256 - cmp $256,%edx - # goto kbits128 if unsigned< - jb ._kbits128 -._kbits256: - # in11 = *(uint32 *) (k + 16) - movl 16(%ecx),%edx - # in12 = *(uint32 *) (k + 20) - movl 20(%ecx),%ebx - # in13 = *(uint32 *) (k + 24) - movl 24(%ecx),%esi - # in14 = *(uint32 *) (k + 28) - movl 28(%ecx),%ecx - # *(uint32 *) (x + 44) = in11 - movl %edx,44(%eax) - # *(uint32 *) (x + 48) = in12 - movl %ebx,48(%eax) - # *(uint32 *) (x + 52) = in13 - movl %esi,52(%eax) - # *(uint32 *) (x + 56) = in14 - movl %ecx,56(%eax) - # in0 = 1634760805 - mov $1634760805,%ecx - # in5 = 857760878 - mov $857760878,%edx - # in10 = 2036477234 - mov $2036477234,%ebx - # in15 = 1797285236 - mov $1797285236,%esi - # *(uint32 *) (x + 0) = in0 - movl %ecx,0(%eax) - # *(uint32 *) (x + 20) = in5 - movl %edx,20(%eax) - # *(uint32 *) (x + 40) = in10 - movl %ebx,40(%eax) - # *(uint32 *) (x + 60) = in15 - movl %esi,60(%eax) - # goto keysetupdone - jmp ._keysetupdone -._kbits128: - # in11 = *(uint32 *) (k + 0) - movl 0(%ecx),%edx - # in12 = *(uint32 *) (k + 4) - movl 4(%ecx),%ebx - # in13 = *(uint32 *) (k + 8) - movl 8(%ecx),%esi - # in14 = *(uint32 *) (k + 12) - movl 12(%ecx),%ecx - # *(uint32 *) (x + 44) = in11 - movl %edx,44(%eax) - # *(uint32 *) (x + 48) = in12 - movl %ebx,48(%eax) - # *(uint32 *) (x + 52) = in13 - movl %esi,52(%eax) - # *(uint32 *) (x + 56) = in14 - movl %ecx,56(%eax) - # in0 = 1634760805 - mov $1634760805,%ecx - # in5 = 824206446 - mov $824206446,%edx - # in10 = 2036477238 - mov $2036477238,%ebx - # in15 = 1797285236 - mov $1797285236,%esi - # *(uint32 *) (x + 0) = in0 - movl %ecx,0(%eax) - # *(uint32 *) (x + 20) = in5 - movl %edx,20(%eax) - # *(uint32 *) (x + 40) = in10 - movl %ebx,40(%eax) - # *(uint32 *) (x + 60) = in15 - movl %esi,60(%eax) -._keysetupdone: - # eax = eax_stack - movl 64(%esp),%eax - # ebx = ebx_stack - movl 68(%esp),%ebx - # esi = esi_stack - movl 72(%esp),%esi - # edi = edi_stack - movl 76(%esp),%edi - # ebp = ebp_stack - movl 80(%esp),%ebp - # leave - add %eax,%esp - ret -ENDPROC(salsa20_keysetup) - -# enter salsa20_ivsetup -ENTRY(salsa20_ivsetup) - mov %esp,%eax - and $31,%eax - add $256,%eax - sub %eax,%esp - # eax_stack = eax - movl %eax,64(%esp) - # ebx_stack = ebx - movl %ebx,68(%esp) - # esi_stack = esi - movl %esi,72(%esp) - # edi_stack = edi - movl %edi,76(%esp) - # ebp_stack = ebp - movl %ebp,80(%esp) - # iv = arg2 - movl 8(%esp,%eax),%ecx - # x = arg1 - movl 4(%esp,%eax),%eax - # in6 = *(uint32 *) (iv + 0) - movl 0(%ecx),%edx - # in7 = *(uint32 *) (iv + 4) - movl 4(%ecx),%ecx - # in8 = 0 - mov $0,%ebx - # in9 = 0 - mov $0,%esi - # *(uint32 *) (x + 24) = in6 - movl %edx,24(%eax) - # *(uint32 *) (x + 28) = in7 - movl %ecx,28(%eax) - # *(uint32 *) (x + 32) = in8 - movl %ebx,32(%eax) - # *(uint32 *) (x + 36) = in9 - movl %esi,36(%eax) - # eax = eax_stack - movl 64(%esp),%eax - # ebx = ebx_stack - movl 68(%esp),%ebx - # esi = esi_stack - movl 72(%esp),%esi - # edi = edi_stack - movl 76(%esp),%edi - # ebp = ebp_stack - movl 80(%esp),%ebp - # leave - add %eax,%esp - ret -ENDPROC(salsa20_ivsetup) diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S index 10db30d58006..03a4918f41ee 100644 --- a/arch/x86/crypto/salsa20-x86_64-asm_64.S +++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S @@ -803,117 +803,3 @@ ENTRY(salsa20_encrypt_bytes) # goto bytesatleast1 jmp ._bytesatleast1 ENDPROC(salsa20_encrypt_bytes) - -# enter salsa20_keysetup -ENTRY(salsa20_keysetup) - mov %rsp,%r11 - and $31,%r11 - add $256,%r11 - sub %r11,%rsp - # k = arg2 - mov %rsi,%rsi - # kbits = arg3 - mov %rdx,%rdx - # x = arg1 - mov %rdi,%rdi - # in0 = *(uint64 *) (k + 0) - movq 0(%rsi),%r8 - # in2 = *(uint64 *) (k + 8) - movq 8(%rsi),%r9 - # *(uint64 *) (x + 4) = in0 - movq %r8,4(%rdi) - # *(uint64 *) (x + 12) = in2 - movq %r9,12(%rdi) - # unsigned<? kbits - 256 - cmp $256,%rdx - # comment:fp stack unchanged by jump - # goto kbits128 if unsigned< - jb ._kbits128 -# kbits256: -._kbits256: - # in10 = *(uint64 *) (k + 16) - movq 16(%rsi),%rdx - # in12 = *(uint64 *) (k + 24) - movq 24(%rsi),%rsi - # *(uint64 *) (x + 44) = in10 - movq %rdx,44(%rdi) - # *(uint64 *) (x + 52) = in12 - movq %rsi,52(%rdi) - # in0 = 1634760805 - mov $1634760805,%rsi - # in4 = 857760878 - mov $857760878,%rdx - # in10 = 2036477234 - mov $2036477234,%rcx - # in14 = 1797285236 - mov $1797285236,%r8 - # *(uint32 *) (x + 0) = in0 - movl %esi,0(%rdi) - # *(uint32 *) (x + 20) = in4 - movl %edx,20(%rdi) - # *(uint32 *) (x + 40) = in10 - movl %ecx,40(%rdi) - # *(uint32 *) (x + 60) = in14 - movl %r8d,60(%rdi) - # comment:fp stack unchanged by jump - # goto keysetupdone - jmp ._keysetupdone -# kbits128: -._kbits128: - # in10 = *(uint64 *) (k + 0) - movq 0(%rsi),%rdx - # in12 = *(uint64 *) (k + 8) - movq 8(%rsi),%rsi - # *(uint64 *) (x + 44) = in10 - movq %rdx,44(%rdi) - # *(uint64 *) (x + 52) = in12 - movq %rsi,52(%rdi) - # in0 = 1634760805 - mov $1634760805,%rsi - # in4 = 824206446 - mov $824206446,%rdx - # in10 = 2036477238 - mov $2036477238,%rcx - # in14 = 1797285236 - mov $1797285236,%r8 - # *(uint32 *) (x + 0) = in0 - movl %esi,0(%rdi) - # *(uint32 *) (x + 20) = in4 - movl %edx,20(%rdi) - # *(uint32 *) (x + 40) = in10 - movl %ecx,40(%rdi) - # *(uint32 *) (x + 60) = in14 - movl %r8d,60(%rdi) -# keysetupdone: -._keysetupdone: - # leave - add %r11,%rsp - mov %rdi,%rax - mov %rsi,%rdx - ret -ENDPROC(salsa20_keysetup) - -# enter salsa20_ivsetup -ENTRY(salsa20_ivsetup) - mov %rsp,%r11 - and $31,%r11 - add $256,%r11 - sub %r11,%rsp - # iv = arg2 - mov %rsi,%rsi - # x = arg1 - mov %rdi,%rdi - # in6 = *(uint64 *) (iv + 0) - movq 0(%rsi),%rsi - # in8 = 0 - mov $0,%r8 - # *(uint64 *) (x + 24) = in6 - movq %rsi,24(%rdi) - # *(uint64 *) (x + 32) = in8 - movq %r8,32(%rdi) - # leave - add %r11,%rsp - mov %rdi,%rax - mov %rsi,%rdx - ret -ENDPROC(salsa20_ivsetup) diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c index cb91a64a99e7..b07d7d959806 100644 --- a/arch/x86/crypto/salsa20_glue.c +++ b/arch/x86/crypto/salsa20_glue.c @@ -11,6 +11,9 @@ * - x86-64 version, renamed as salsa20-x86_64-asm_64.S * available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s> * + * Also modified to set up the initial state using the generic C code rather + * than in assembly. + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) @@ -18,93 +21,65 @@ * */ -#include <crypto/algapi.h> +#include <asm/unaligned.h> +#include <crypto/internal/skcipher.h> +#include <crypto/salsa20.h> #include <linux/module.h> -#include <linux/crypto.h> - -#define SALSA20_IV_SIZE 8U -#define SALSA20_MIN_KEY_SIZE 16U -#define SALSA20_MAX_KEY_SIZE 32U - -struct salsa20_ctx -{ - u32 input[16]; -}; -asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k, - u32 keysize, u32 ivsize); -asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv); -asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx, - const u8 *src, u8 *dst, u32 bytes); +asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst, + u32 bytes); -static int setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keysize) +static int salsa20_asm_crypt(struct skcipher_request *req) { - struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm); - salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8); - return 0; -} - -static int encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, struct scatterlist *src, - unsigned int nbytes) -{ - struct blkcipher_walk walk; - struct crypto_blkcipher *tfm = desc->tfm; - struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + u32 state[16]; int err; - blkcipher_walk_init(&walk, dst, src, nbytes); - err = blkcipher_walk_virt_block(desc, &walk, 64); + err = skcipher_walk_virt(&walk, req, true); - salsa20_ivsetup(ctx, walk.iv); + crypto_salsa20_init(state, ctx, walk.iv); - while (walk.nbytes >= 64) { - salsa20_encrypt_bytes(ctx, walk.src.virt.addr, - walk.dst.virt.addr, - walk.nbytes - (walk.nbytes % 64)); - err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64); - } + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; - if (walk.nbytes) { - salsa20_encrypt_bytes(ctx, walk.src.virt.addr, - walk.dst.virt.addr, walk.nbytes); - err = blkcipher_walk_done(desc, &walk, 0); + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + salsa20_encrypt_bytes(state, walk.src.virt.addr, + walk.dst.virt.addr, nbytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } -static struct crypto_alg alg = { - .cra_name = "salsa20", - .cra_driver_name = "salsa20-asm", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_type = &crypto_blkcipher_type, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct salsa20_ctx), - .cra_alignmask = 3, - .cra_module = THIS_MODULE, - .cra_u = { - .blkcipher = { - .setkey = setkey, - .encrypt = encrypt, - .decrypt = encrypt, - .min_keysize = SALSA20_MIN_KEY_SIZE, - .max_keysize = SALSA20_MAX_KEY_SIZE, - .ivsize = SALSA20_IV_SIZE, - } - } +static struct skcipher_alg alg = { + .base.cra_name = "salsa20", + .base.cra_driver_name = "salsa20-asm", + .base.cra_priority = 200, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct salsa20_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = SALSA20_MIN_KEY_SIZE, + .max_keysize = SALSA20_MAX_KEY_SIZE, + .ivsize = SALSA20_IV_SIZE, + .chunksize = SALSA20_BLOCK_SIZE, + .setkey = crypto_salsa20_setkey, + .encrypt = salsa20_asm_crypt, + .decrypt = salsa20_asm_crypt, }; static int __init init(void) { - return crypto_register_alg(&alg); + return crypto_register_skcipher(&alg); } static void __exit fini(void) { - crypto_unregister_alg(&alg); + crypto_unregister_skcipher(&alg); } module_init(init); diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index 1c3b7ceb36d2..e7273a606a07 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -55,29 +55,31 @@ #define RAB1bl %bl #define RAB2bl %cl +#define CD0 0x0(%rsp) +#define CD1 0x8(%rsp) +#define CD2 0x10(%rsp) + +# used only before/after all rounds #define RCD0 %r8 #define RCD1 %r9 #define RCD2 %r10 -#define RCD0d %r8d -#define RCD1d %r9d -#define RCD2d %r10d - -#define RX0 %rbp -#define RX1 %r11 -#define RX2 %r12 +# used only during rounds +#define RX0 %r8 +#define RX1 %r9 +#define RX2 %r10 -#define RX0d %ebp -#define RX1d %r11d -#define RX2d %r12d +#define RX0d %r8d +#define RX1d %r9d +#define RX2d %r10d -#define RY0 %r13 -#define RY1 %r14 -#define RY2 %r15 +#define RY0 %r11 +#define RY1 %r12 +#define RY2 %r13 -#define RY0d %r13d -#define RY1d %r14d -#define RY2d %r15d +#define RY0d %r11d +#define RY1d %r12d +#define RY2d %r13d #define RT0 %rdx #define RT1 %rsi @@ -85,6 +87,8 @@ #define RT0d %edx #define RT1d %esi +#define RT1bl %sil + #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ movzbl ab ## bl, tmp2 ## d; \ movzbl ab ## bh, tmp1 ## d; \ @@ -92,6 +96,11 @@ op1##l T0(CTX, tmp2, 4), dst ## d; \ op2##l T1(CTX, tmp1, 4), dst ## d; +#define swap_ab_with_cd(ab, cd, tmp) \ + movq cd, tmp; \ + movq ab, cd; \ + movq tmp, ab; + /* * Combined G1 & G2 function. Reordered with help of rotates to have moves * at begining. @@ -110,15 +119,15 @@ /* G1,2 && G2,2 */ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ - xchgq cd ## 0, ab ## 0; \ + swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ - xchgq cd ## 1, ab ## 1; \ + swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ - xchgq cd ## 2, ab ## 2; + swap_ab_with_cd(ab ## 2, cd ## 2, RT0); #define enc_round_end(ab, x, y, n) \ addl y ## d, x ## d; \ @@ -168,6 +177,16 @@ decrypt_round3(ba, dc, (n*2)+1); \ decrypt_round3(ba, dc, (n*2)); +#define push_cd() \ + pushq RCD2; \ + pushq RCD1; \ + pushq RCD0; + +#define pop_cd() \ + popq RCD0; \ + popq RCD1; \ + popq RCD2; + #define inpack3(in, n, xy, m) \ movq 4*(n)(in), xy ## 0; \ xorq w+4*m(CTX), xy ## 0; \ @@ -223,11 +242,8 @@ ENTRY(__twofish_enc_blk_3way) * %rdx: src, RIO * %rcx: bool, if true: xor output */ - pushq %r15; - pushq %r14; pushq %r13; pushq %r12; - pushq %rbp; pushq %rbx; pushq %rcx; /* bool xor */ @@ -235,40 +251,36 @@ ENTRY(__twofish_enc_blk_3way) inpack_enc3(); - encrypt_cycle3(RAB, RCD, 0); - encrypt_cycle3(RAB, RCD, 1); - encrypt_cycle3(RAB, RCD, 2); - encrypt_cycle3(RAB, RCD, 3); - encrypt_cycle3(RAB, RCD, 4); - encrypt_cycle3(RAB, RCD, 5); - encrypt_cycle3(RAB, RCD, 6); - encrypt_cycle3(RAB, RCD, 7); + push_cd(); + encrypt_cycle3(RAB, CD, 0); + encrypt_cycle3(RAB, CD, 1); + encrypt_cycle3(RAB, CD, 2); + encrypt_cycle3(RAB, CD, 3); + encrypt_cycle3(RAB, CD, 4); + encrypt_cycle3(RAB, CD, 5); + encrypt_cycle3(RAB, CD, 6); + encrypt_cycle3(RAB, CD, 7); + pop_cd(); popq RIO; /* dst */ - popq %rbp; /* bool xor */ + popq RT1; /* bool xor */ - testb %bpl, %bpl; + testb RT1bl, RT1bl; jnz .L__enc_xor3; outunpack_enc3(mov); popq %rbx; - popq %rbp; popq %r12; popq %r13; - popq %r14; - popq %r15; ret; .L__enc_xor3: outunpack_enc3(xor); popq %rbx; - popq %rbp; popq %r12; popq %r13; - popq %r14; - popq %r15; ret; ENDPROC(__twofish_enc_blk_3way) @@ -278,35 +290,31 @@ ENTRY(twofish_dec_blk_3way) * %rsi: dst * %rdx: src, RIO */ - pushq %r15; - pushq %r14; pushq %r13; pushq %r12; - pushq %rbp; pushq %rbx; pushq %rsi; /* dst */ inpack_dec3(); - decrypt_cycle3(RAB, RCD, 7); - decrypt_cycle3(RAB, RCD, 6); - decrypt_cycle3(RAB, RCD, 5); - decrypt_cycle3(RAB, RCD, 4); - decrypt_cycle3(RAB, RCD, 3); - decrypt_cycle3(RAB, RCD, 2); - decrypt_cycle3(RAB, RCD, 1); - decrypt_cycle3(RAB, RCD, 0); + push_cd(); + decrypt_cycle3(RAB, CD, 7); + decrypt_cycle3(RAB, CD, 6); + decrypt_cycle3(RAB, CD, 5); + decrypt_cycle3(RAB, CD, 4); + decrypt_cycle3(RAB, CD, 3); + decrypt_cycle3(RAB, CD, 2); + decrypt_cycle3(RAB, CD, 1); + decrypt_cycle3(RAB, CD, 0); + pop_cd(); popq RIO; /* dst */ outunpack_dec3(); popq %rbx; - popq %rbp; popq %r12; popq %r13; - popq %r14; - popq %r15; ret; ENDPROC(twofish_dec_blk_3way) |