diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-19 08:52:58 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-07-19 08:52:58 -0700 |
commit | c434e25b62f8efcfbb6bf1f7ce55960206c1137e (patch) | |
tree | 824a68893982c718225a1821fda98c495178473d | |
parent | 720261cfc7329406a50c2a8536e0039b9dd9a4e5 (diff) | |
parent | df1e9791998a92fe9f1e7d3f031b34daaad39e2f (diff) |
Merge tag 'v6.11-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
"API:
- Test setkey in no-SIMD context
- Add skcipher speed test for user-specified algorithm
Algorithms:
- Add x25519 support on ppc64le
- Add VAES and AVX512 / AVX10 optimized AES-GCM on x86
- Remove sm2 algorithm
Drivers:
- Add Allwinner H616 support to sun8i-ce
- Use DMA in stm32
- Add Exynos850 hwrng support to exynos"
* tag 'v6.11-p1' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (81 commits)
hwrng: core - remove (un)register_miscdev()
crypto: lib/mpi - delete unnecessary condition
crypto: testmgr - generate power-of-2 lengths more often
crypto: mxs-dcp - Ensure payload is zero when using key slot
hwrng: Kconfig - Do not enable by default CN10K driver
crypto: starfive - Fix nent assignment in rsa dec
crypto: starfive - Align rsa input data to 32-bit
crypto: qat - fix unintentional re-enabling of error interrupts
crypto: qat - extend scope of lock in adf_cfg_add_key_value_param()
Documentation: qat: fix auto_reset attribute details
crypto: sun8i-ce - add Allwinner H616 support
crypto: sun8i-ce - wrap accesses to descriptor address fields
dt-bindings: crypto: sun8i-ce: Add compatible for H616
hwrng: core - Fix wrong quality calculation at hw rng registration
hwrng: exynos - Enable Exynos850 support
hwrng: exynos - Add SMC based TRNG operation
hwrng: exynos - Implement bus clock control
hwrng: exynos - Use devm_clk_get_enabled() to get the clock
hwrng: exynos - Improve coding style
dt-bindings: rng: Add Exynos850 support to exynos-trng
...
114 files changed, 5484 insertions, 5897 deletions
diff --git a/Documentation/ABI/testing/sysfs-driver-qat b/Documentation/ABI/testing/sysfs-driver-qat index 96020fb051c3..f290e77cd590 100644 --- a/Documentation/ABI/testing/sysfs-driver-qat +++ b/Documentation/ABI/testing/sysfs-driver-qat @@ -143,8 +143,8 @@ Description: This attribute is only available for qat_4xxx devices. What: /sys/bus/pci/devices/<BDF>/qat/auto_reset -Date: March 2024 -KernelVersion: 6.8 +Date: May 2024 +KernelVersion: 6.9 Contact: qat-linux@intel.com Description: (RW) Reports the current state of the autoreset feature for a QAT device diff --git a/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml b/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml index 4287678aa79f..da47b601c165 100644 --- a/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml +++ b/Documentation/devicetree/bindings/crypto/allwinner,sun8i-ce.yaml @@ -18,6 +18,7 @@ properties: - allwinner,sun50i-a64-crypto - allwinner,sun50i-h5-crypto - allwinner,sun50i-h6-crypto + - allwinner,sun50i-h616-crypto reg: maxItems: 1 @@ -49,6 +50,7 @@ if: compatible: enum: - allwinner,sun20i-d1-crypto + - allwinner,sun50i-h616-crypto then: properties: clocks: diff --git a/Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml b/Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml index afa52af442a7..f03b87e1b01c 100644 --- a/Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml +++ b/Documentation/devicetree/bindings/rng/amlogic,meson-rng.yaml @@ -26,6 +26,9 @@ properties: items: - const: core + power-domains: + maxItems: 1 + required: - compatible - reg diff --git a/Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml b/Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml index 765d9f9edd6e..1a71935d8a19 100644 --- a/Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml +++ b/Documentation/devicetree/bindings/rng/samsung,exynos5250-trng.yaml @@ -12,14 +12,17 @@ maintainers: properties: 
compatible: - const: samsung,exynos5250-trng + enum: + - samsung,exynos5250-trng + - samsung,exynos850-trng clocks: - maxItems: 1 + minItems: 1 + maxItems: 2 clock-names: - items: - - const: secss + minItems: 1 + maxItems: 2 reg: maxItems: 1 @@ -30,6 +33,35 @@ required: - clock-names - reg +allOf: + - if: + properties: + compatible: + contains: + const: samsung,exynos850-trng + + then: + properties: + clocks: + items: + - description: SSS (Security Sub System) operating clock + - description: SSS (Security Sub System) bus clock + + clock-names: + items: + - const: secss + - const: pclk + + else: + properties: + clocks: + items: + - description: SSS (Security Sub System) operating clock + + clock-names: + items: + - const: secss + additionalProperties: false examples: diff --git a/MAINTAINERS b/MAINTAINERS index 63af77fcdd32..5694c59f1fdc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -980,6 +980,12 @@ F: include/uapi/linux/psp-dbc.h F: tools/crypto/ccp/*.c F: tools/crypto/ccp/*.py +AMD CRYPTOGRAPHIC COPROCESSOR (CCP) DRIVER - HSTI SUPPORT +M: Mario Limonciello <mario.limonciello@amd.com> +L: linux-crypto@vger.kernel.org +S: Supported +F: drivers/crypto/ccp/hsti.* + AMD DISPLAY CORE M: Harry Wentland <harry.wentland@amd.com> M: Leo Li <sunpeng.li@amd.com> diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index f00f042ef357..201eb35dde37 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -17,6 +17,7 @@ #include <linux/module.h> MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ecb(aes)"); diff --git a/arch/arm/crypto/crc32-ce-core.S b/arch/arm/crypto/crc32-ce-core.S index 3f13a76b9066..88f9edf94e95 100644 --- a/arch/arm/crypto/crc32-ce-core.S +++ b/arch/arm/crypto/crc32-ce-core.S @@ -48,6 +48,7 @@ */ #include <linux/linkage.h> +#include <linux/cfi_types.h> #include 
<asm/assembler.h> .text @@ -123,11 +124,12 @@ * uint crc32_pmull_le(unsigned char const *buffer, * size_t len, uint crc32) */ -ENTRY(crc32_pmull_le) +SYM_FUNC_START(crc32_pmull_le) adr r3, .Lcrc32_constants b 0f +SYM_FUNC_END(crc32_pmull_le) -ENTRY(crc32c_pmull_le) +SYM_FUNC_START(crc32c_pmull_le) adr r3, .Lcrc32c_constants 0: bic LEN, LEN, #15 @@ -236,8 +238,7 @@ fold_64: vmov r0, s5 bx lr -ENDPROC(crc32_pmull_le) -ENDPROC(crc32c_pmull_le) +SYM_FUNC_END(crc32c_pmull_le) .macro __crc32, c subs ip, r2, #8 @@ -296,11 +297,11 @@ ARM_BE8(rev16 r3, r3 ) .endm .align 5 -ENTRY(crc32_armv8_le) +SYM_TYPED_FUNC_START(crc32_armv8_le) __crc32 -ENDPROC(crc32_armv8_le) +SYM_FUNC_END(crc32_armv8_le) .align 5 -ENTRY(crc32c_armv8_le) +SYM_TYPED_FUNC_START(crc32c_armv8_le) __crc32 c -ENDPROC(crc32c_armv8_le) +SYM_FUNC_END(crc32c_armv8_le) diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c index 2208445808d7..4ff18044af07 100644 --- a/arch/arm/crypto/crc32-ce-glue.c +++ b/arch/arm/crypto/crc32-ce-glue.c @@ -241,6 +241,7 @@ module_init(crc32_pmull_mod_init); module_exit(crc32_pmull_mod_exit); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("crc32"); MODULE_ALIAS_CRYPTO("crc32c"); diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c index e9191a8c87b9..79f3b204d8c0 100644 --- a/arch/arm/crypto/crct10dif-ce-glue.c +++ b/arch/arm/crypto/crct10dif-ce-glue.c @@ -84,5 +84,6 @@ module_init(crc_t10dif_mod_init); module_exit(crc_t10dif_mod_exit); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("crct10dif"); diff --git a/arch/arm/crypto/curve25519-glue.c b/arch/arm/crypto/curve25519-glue.c index 9bdafd57888c..e7b87e09dd99 100644 --- 
a/arch/arm/crypto/curve25519-glue.c +++ b/arch/arm/crypto/curve25519-glue.c @@ -133,4 +133,5 @@ module_exit(arm_curve25519_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-neon"); +MODULE_DESCRIPTION("Public key crypto: Curve25519 (NEON-accelerated)"); MODULE_LICENSE("GPL v2"); diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c index c31bd8f7c092..8482e302c45a 100644 --- a/arch/arm/crypto/poly1305-glue.c +++ b/arch/arm/crypto/poly1305-glue.c @@ -267,6 +267,7 @@ static void __exit arm_poly1305_mod_exit(void) module_init(arm_poly1305_mod_init); module_exit(arm_poly1305_mod_exit); +MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-arm"); diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index 467ac2f768ac..46425e7b9755 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -16,6 +16,7 @@ #include <linux/module.h> MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("Bit sliced AES using NEON instructions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ecb(aes)"); diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index 09eb1456aed4..606d25c559ed 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -98,7 +98,7 @@ static struct shash_alg crc_t10dif_alg[] = {{ .base.cra_name = "crct10dif", .base.cra_driver_name = "crct10dif-arm64-neon", - .base.cra_priority = 100, + .base.cra_priority = 150, .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { @@ -138,6 +138,7 @@ module_cpu_feature_match(ASIMD, crc_t10dif_mod_init); module_exit(crc_t10dif_mod_exit); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("CRC-T10DIF using arm64 NEON and Crypto Extensions"); MODULE_LICENSE("GPL 
v2"); MODULE_ALIAS_CRYPTO("crct10dif"); MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce"); diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 1fae18ba11ed..9c4bfd62e789 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -226,6 +226,7 @@ static void __exit neon_poly1305_mod_exit(void) module_init(neon_poly1305_mod_init); module_exit(neon_poly1305_mod_exit); +MODULE_DESCRIPTION("Poly1305 transform using NEON instructions"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("poly1305"); MODULE_ALIAS_CRYPTO("poly1305-neon"); diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig index 1e201b7ae2fc..09ebcbdfb34f 100644 --- a/arch/powerpc/crypto/Kconfig +++ b/arch/powerpc/crypto/Kconfig @@ -2,6 +2,17 @@ menu "Accelerated Cryptographic Algorithms for CPU (powerpc)" +config CRYPTO_CURVE25519_PPC64 + tristate "Public key crypto: Curve25519 (PowerPC64)" + depends on PPC64 && CPU_LITTLE_ENDIAN + select CRYPTO_LIB_CURVE25519_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CURVE25519 + help + Curve25519 algorithm + + Architecture: PowerPC64 + - Little-endian + config CRYPTO_CRC32C_VPMSUM tristate "CRC32c" depends on PPC64 && ALTIVEC diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index fca0e9739869..59808592f0a1 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_CRYPTO_AES_GCM_P10) += aes-gcm-p10-crypto.o obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o +obj-$(CONFIG_CRYPTO_CURVE25519_PPC64) += curve25519-ppc64le.o aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o md5-ppc-y := md5-asm.o md5-glue.o @@ -29,6 +30,7 @@ aes-gcm-p10-crypto-y := aes-gcm-p10-glue.o aes-gcm-p10.o ghashp10-ppc.o aesp10-p chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o 
poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o +curve25519-ppc64le-y := curve25519-ppc64le-core.o curve25519-ppc64le_asm.o ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) override flavour := linux-ppc64le diff --git a/arch/powerpc/crypto/curve25519-ppc64le-core.c b/arch/powerpc/crypto/curve25519-ppc64le-core.c new file mode 100644 index 000000000000..4e3e44ea4484 --- /dev/null +++ b/arch/powerpc/crypto/curve25519-ppc64le-core.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2024- IBM Corp. + * + * X25519 scalar multiplication with 51 bits limbs for PPC64le. + * Based on RFC7748 and AArch64 optimized implementation for X25519 + * - Algorithm 1 Scalar multiplication of a variable point + */ + +#include <crypto/curve25519.h> +#include <crypto/internal/kpp.h> + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/scatterlist.h> + +#include <linux/cpufeature.h> +#include <linux/processor.h> + +typedef uint64_t fe51[5]; + +asmlinkage void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g); +asmlinkage void x25519_fe51_sqr(fe51 h, const fe51 f); +asmlinkage void x25519_fe51_mul121666(fe51 h, fe51 f); +asmlinkage void x25519_fe51_sqr_times(fe51 h, const fe51 f, int n); +asmlinkage void x25519_fe51_frombytes(fe51 h, const uint8_t *s); +asmlinkage void x25519_fe51_tobytes(uint8_t *s, const fe51 h); +asmlinkage void x25519_cswap(fe51 p, fe51 q, unsigned int bit); + +#define fmul x25519_fe51_mul +#define fsqr x25519_fe51_sqr +#define fmul121666 x25519_fe51_mul121666 +#define fe51_tobytes x25519_fe51_tobytes + +static void fadd(fe51 h, const fe51 f, const fe51 g) +{ + h[0] = f[0] + g[0]; + h[1] = f[1] + g[1]; + h[2] = f[2] + g[2]; + h[3] = f[3] + g[3]; + h[4] = f[4] + g[4]; +} + +/* + * Prime = 2 ** 255 - 19, 255 bits + * (0x7fffffff ffffffff ffffffff ffffffff ffffffff 
ffffffff ffffffff ffffffed) + * + * Prime in 5 51-bit limbs + */ +static fe51 prime51 = { 0x7ffffffffffed, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff, 0x7ffffffffffff}; + +static void fsub(fe51 h, const fe51 f, const fe51 g) +{ + h[0] = (f[0] + ((prime51[0] * 2))) - g[0]; + h[1] = (f[1] + ((prime51[1] * 2))) - g[1]; + h[2] = (f[2] + ((prime51[2] * 2))) - g[2]; + h[3] = (f[3] + ((prime51[3] * 2))) - g[3]; + h[4] = (f[4] + ((prime51[4] * 2))) - g[4]; +} + +static void fe51_frombytes(fe51 h, const uint8_t *s) +{ + /* + * Make sure 64-bit aligned. + */ + unsigned char sbuf[32+8]; + unsigned char *sb = PTR_ALIGN((void *)sbuf, 8); + + memcpy(sb, s, 32); + x25519_fe51_frombytes(h, sb); +} + +static void finv(fe51 o, const fe51 i) +{ + fe51 a0, b, c, t00; + + fsqr(a0, i); + x25519_fe51_sqr_times(t00, a0, 2); + + fmul(b, t00, i); + fmul(a0, b, a0); + + fsqr(t00, a0); + + fmul(b, t00, b); + x25519_fe51_sqr_times(t00, b, 5); + + fmul(b, t00, b); + x25519_fe51_sqr_times(t00, b, 10); + + fmul(c, t00, b); + x25519_fe51_sqr_times(t00, c, 20); + + fmul(t00, t00, c); + x25519_fe51_sqr_times(t00, t00, 10); + + fmul(b, t00, b); + x25519_fe51_sqr_times(t00, b, 50); + + fmul(c, t00, b); + x25519_fe51_sqr_times(t00, c, 100); + + fmul(t00, t00, c); + x25519_fe51_sqr_times(t00, t00, 50); + + fmul(t00, t00, b); + x25519_fe51_sqr_times(t00, t00, 5); + + fmul(o, t00, a0); +} + +static void curve25519_fe51(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) +{ + fe51 x1, x2, z2, x3, z3; + uint8_t s[32]; + unsigned int swap = 0; + int i; + + memcpy(s, scalar, 32); + s[0] &= 0xf8; + s[31] &= 0x7f; + s[31] |= 0x40; + fe51_frombytes(x1, point); + + z2[0] = z2[1] = z2[2] = z2[3] = z2[4] = 0; + x3[0] = x1[0]; + x3[1] = x1[1]; + x3[2] = x1[2]; + x3[3] = x1[3]; + x3[4] = x1[4]; + + x2[0] = z3[0] = 1; + x2[1] = z3[1] = 0; + x2[2] = z3[2] = 0; + x2[3] = z3[3] = 0; + x2[4] = z3[4] = 0; + + for (i = 254; i >= 0; --i) { + unsigned int k_t = 1 & (s[i / 8] >> (i & 7)); + fe51 a, b, 
c, d, e; + fe51 da, cb, aa, bb; + fe51 dacb_p, dacb_m; + + swap ^= k_t; + x25519_cswap(x2, x3, swap); + x25519_cswap(z2, z3, swap); + swap = k_t; + + fsub(b, x2, z2); // B = x_2 - z_2 + fadd(a, x2, z2); // A = x_2 + z_2 + fsub(d, x3, z3); // D = x_3 - z_3 + fadd(c, x3, z3); // C = x_3 + z_3 + + fsqr(bb, b); // BB = B^2 + fsqr(aa, a); // AA = A^2 + fmul(da, d, a); // DA = D * A + fmul(cb, c, b); // CB = C * B + + fsub(e, aa, bb); // E = AA - BB + fmul(x2, aa, bb); // x2 = AA * BB + fadd(dacb_p, da, cb); // DA + CB + fsub(dacb_m, da, cb); // DA - CB + + fmul121666(z3, e); // 121666 * E + fsqr(z2, dacb_m); // (DA - CB)^2 + fsqr(x3, dacb_p); // x3 = (DA + CB)^2 + fadd(b, bb, z3); // BB + 121666 * E + fmul(z3, x1, z2); // z3 = x1 * (DA - CB)^2 + fmul(z2, e, b); // z2 = E * (BB + 121666 * E) + } + + finv(z2, z2); + fmul(x2, x2, z2); + fe51_tobytes(out, x2); +} + +void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + curve25519_fe51(mypublic, secret, basepoint); +} +EXPORT_SYMBOL(curve25519_arch); + +void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + curve25519_fe51(pub, secret, curve25519_base_point); +} +EXPORT_SYMBOL(curve25519_base_arch); + +static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, + unsigned int len) +{ + u8 *secret = kpp_tfm_ctx(tfm); + + if (!len) + curve25519_generate_secret(secret); + else if (len == CURVE25519_KEY_SIZE && + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) + memcpy(secret, buf, CURVE25519_KEY_SIZE); + else + return -EINVAL; + return 0; +} + +static int curve25519_generate_public_key(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (req->src) + return -EINVAL; + + curve25519_base_arch(buf, secret); + + /* might want less than we've got 
*/ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static int curve25519_compute_shared_secret(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 public_key[CURVE25519_KEY_SIZE]; + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (!req->src) + return -EINVAL; + + copied = sg_copy_to_buffer(req->src, + sg_nents_for_len(req->src, + CURVE25519_KEY_SIZE), + public_key, CURVE25519_KEY_SIZE); + if (copied != CURVE25519_KEY_SIZE) + return -EINVAL; + + curve25519_arch(buf, secret, public_key); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static unsigned int curve25519_max_size(struct crypto_kpp *tfm) +{ + return CURVE25519_KEY_SIZE; +} + +static struct kpp_alg curve25519_alg = { + .base.cra_name = "curve25519", + .base.cra_driver_name = "curve25519-ppc64le", + .base.cra_priority = 200, + .base.cra_module = THIS_MODULE, + .base.cra_ctxsize = CURVE25519_KEY_SIZE, + + .set_secret = curve25519_set_secret, + .generate_public_key = curve25519_generate_public_key, + .compute_shared_secret = curve25519_compute_shared_secret, + .max_size = curve25519_max_size, +}; + + +static int __init curve25519_mod_init(void) +{ + return IS_REACHABLE(CONFIG_CRYPTO_KPP) ? 
+ crypto_register_kpp(&curve25519_alg) : 0; +} + +static void __exit curve25519_mod_exit(void) +{ + if (IS_REACHABLE(CONFIG_CRYPTO_KPP)) + crypto_unregister_kpp(&curve25519_alg); +} + +module_init(curve25519_mod_init); +module_exit(curve25519_mod_exit); + +MODULE_ALIAS_CRYPTO("curve25519"); +MODULE_ALIAS_CRYPTO("curve25519-ppc64le"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Danny Tsen <dtsen@us.ibm.com>"); diff --git a/arch/powerpc/crypto/curve25519-ppc64le_asm.S b/arch/powerpc/crypto/curve25519-ppc64le_asm.S new file mode 100644 index 000000000000..06c1febe24b9 --- /dev/null +++ b/arch/powerpc/crypto/curve25519-ppc64le_asm.S @@ -0,0 +1,671 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# This code is taken from CRYPTOGAMs[1] and is included here using the option +# in the license to distribute the code under the GPL. Therefore this program +# is free software; you can redistribute it and/or modify it under the terms of +# the GNU General Public License version 2 as published by the Free Software +# Foundation. +# +# [1] https://github.com/dot-asm/cryptogams/ + +# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain copyright notices, +# this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# * Neither the name of the CRYPTOGAMS nor the names of its +# copyright holder and contributors may be used to endorse or +# promote products derived from this software without specific +# prior written permission. 
+# +# ALTERNATIVELY, provided that this notice is retained in full, this +# product may be distributed under the terms of the GNU General Public +# License (GPL), in which case the provisions of the GPL apply INSTEAD OF +# those given above. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see https://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# +# ==================================================================== +# Written and Modified by Danny Tsen <dtsen@us.ibm.com> +# - Added x25519_fe51_sqr_times, x25519_fe51_frombytes, x25519_fe51_tobytes +# and x25519_cswap +# +# Copyright 2024- IBM Corp. +# +# X25519 lower-level primitives for PPC64. 
+# + +#include <linux/linkage.h> + +.text + +.align 5 +SYM_FUNC_START(x25519_fe51_mul) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + ld 6,0(5) + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + mulld 22,7,6 + mulhdu 23,7,6 + + mulld 24,8,6 + mulhdu 25,8,6 + + mulld 30,11,6 + mulhdu 31,11,6 + ld 4,8(5) + mulli 11,11,19 + + mulld 26,9,6 + mulhdu 27,9,6 + + mulld 28,10,6 + mulhdu 29,10,6 + mulld 12,11,4 + mulhdu 21,11,4 + addc 22,22,12 + adde 23,23,21 + + mulld 12,7,4 + mulhdu 21,7,4 + addc 24,24,12 + adde 25,25,21 + + mulld 12,10,4 + mulhdu 21,10,4 + ld 6,16(5) + mulli 10,10,19 + addc 30,30,12 + adde 31,31,21 + + mulld 12,8,4 + mulhdu 21,8,4 + addc 26,26,12 + adde 27,27,21 + + mulld 12,9,4 + mulhdu 21,9,4 + addc 28,28,12 + adde 29,29,21 + mulld 12,10,6 + mulhdu 21,10,6 + addc 22,22,12 + adde 23,23,21 + + mulld 12,11,6 + mulhdu 21,11,6 + addc 24,24,12 + adde 25,25,21 + + mulld 12,9,6 + mulhdu 21,9,6 + ld 4,24(5) + mulli 9,9,19 + addc 30,30,12 + adde 31,31,21 + + mulld 12,7,6 + mulhdu 21,7,6 + addc 26,26,12 + adde 27,27,21 + + mulld 12,8,6 + mulhdu 21,8,6 + addc 28,28,12 + adde 29,29,21 + mulld 12,9,4 + mulhdu 21,9,4 + addc 22,22,12 + adde 23,23,21 + + mulld 12,10,4 + mulhdu 21,10,4 + addc 24,24,12 + adde 25,25,21 + + mulld 12,8,4 + mulhdu 21,8,4 + ld 6,32(5) + mulli 8,8,19 + addc 30,30,12 + adde 31,31,21 + + mulld 12,11,4 + mulhdu 21,11,4 + addc 26,26,12 + adde 27,27,21 + + mulld 12,7,4 + mulhdu 21,7,4 + addc 28,28,12 + adde 29,29,21 + mulld 12,8,6 + mulhdu 21,8,6 + addc 22,22,12 + adde 23,23,21 + + mulld 12,9,6 + mulhdu 21,9,6 + addc 24,24,12 + adde 25,25,21 + + mulld 12,10,6 + mulhdu 21,10,6 + addc 26,26,12 + adde 27,27,21 + + mulld 12,11,6 + mulhdu 21,11,6 + addc 28,28,12 + adde 29,29,21 + + mulld 12,7,6 + mulhdu 21,7,6 + addc 30,30,12 + adde 31,31,21 + +.Lfe51_reduce: + li 0,-1 + srdi 0,0,13 + + srdi 
12,26,51 + and 9,26,0 + insrdi 12,27,51,0 + srdi 21,22,51 + and 7,22,0 + insrdi 21,23,51,0 + addc 28,28,12 + addze 29,29 + addc 24,24,21 + addze 25,25 + + srdi 12,28,51 + and 10,28,0 + insrdi 12,29,51,0 + srdi 21,24,51 + and 8,24,0 + insrdi 21,25,51,0 + addc 30,30,12 + addze 31,31 + add 9,9,21 + + srdi 12,30,51 + and 11,30,0 + insrdi 12,31,51,0 + mulli 12,12,19 + + add 7,7,12 + + srdi 21,9,51 + and 9,9,0 + add 10,10,21 + + srdi 12,7,51 + and 7,7,0 + add 8,8,12 + + std 9,16(3) + std 10,24(3) + std 11,32(3) + std 7,0(3) + std 8,8(3) + + ld 21,56(1) + ld 22,64(1) + ld 23,72(1) + ld 24,80(1) + ld 25,88(1) + ld 26,96(1) + ld 27,104(1) + ld 28,112(1) + ld 29,120(1) + ld 30,128(1) + ld 31,136(1) + addi 1,1,144 + blr +SYM_FUNC_END(x25519_fe51_mul) + +.align 5 +SYM_FUNC_START(x25519_fe51_sqr) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + add 6,7,7 + mulli 21,11,19 + + mulld 22,7,7 + mulhdu 23,7,7 + mulld 24,8,6 + mulhdu 25,8,6 + mulld 26,9,6 + mulhdu 27,9,6 + mulld 28,10,6 + mulhdu 29,10,6 + mulld 30,11,6 + mulhdu 31,11,6 + add 6,8,8 + mulld 12,11,21 + mulhdu 11,11,21 + addc 28,28,12 + adde 29,29,11 + + mulli 5,10,19 + + mulld 12,8,8 + mulhdu 11,8,8 + addc 26,26,12 + adde 27,27,11 + mulld 12,9,6 + mulhdu 11,9,6 + addc 28,28,12 + adde 29,29,11 + mulld 12,10,6 + mulhdu 11,10,6 + addc 30,30,12 + adde 31,31,11 + mulld 12,21,6 + mulhdu 11,21,6 + add 6,10,10 + addc 22,22,12 + adde 23,23,11 + mulld 12,10,5 + mulhdu 10,10,5 + addc 24,24,12 + adde 25,25,10 + mulld 12,6,21 + mulhdu 10,6,21 + add 6,9,9 + addc 26,26,12 + adde 27,27,10 + + mulld 12,9,9 + mulhdu 10,9,9 + addc 30,30,12 + adde 31,31,10 + mulld 12,5,6 + mulhdu 10,5,6 + addc 22,22,12 + adde 23,23,10 + mulld 12,21,6 + mulhdu 10,21,6 + addc 24,24,12 + adde 25,25,10 + + b .Lfe51_reduce +SYM_FUNC_END(x25519_fe51_sqr) + 
+.align 5 +SYM_FUNC_START(x25519_fe51_mul121666) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + lis 6,1 + ori 6,6,56130 + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + mulld 22,7,6 + mulhdu 23,7,6 + mulld 24,8,6 + mulhdu 25,8,6 + mulld 26,9,6 + mulhdu 27,9,6 + mulld 28,10,6 + mulhdu 29,10,6 + mulld 30,11,6 + mulhdu 31,11,6 + + b .Lfe51_reduce +SYM_FUNC_END(x25519_fe51_mul121666) + +.align 5 +SYM_FUNC_START(x25519_fe51_sqr_times) + + stdu 1,-144(1) + std 21,56(1) + std 22,64(1) + std 23,72(1) + std 24,80(1) + std 25,88(1) + std 26,96(1) + std 27,104(1) + std 28,112(1) + std 29,120(1) + std 30,128(1) + std 31,136(1) + + ld 7,0(4) + ld 8,8(4) + ld 9,16(4) + ld 10,24(4) + ld 11,32(4) + + mtctr 5 + +.Lsqr_times_loop: + add 6,7,7 + mulli 21,11,19 + + mulld 22,7,7 + mulhdu 23,7,7 + mulld 24,8,6 + mulhdu 25,8,6 + mulld 26,9,6 + mulhdu 27,9,6 + mulld 28,10,6 + mulhdu 29,10,6 + mulld 30,11,6 + mulhdu 31,11,6 + add 6,8,8 + mulld 12,11,21 + mulhdu 11,11,21 + addc 28,28,12 + adde 29,29,11 + + mulli 5,10,19 + + mulld 12,8,8 + mulhdu 11,8,8 + addc 26,26,12 + adde 27,27,11 + mulld 12,9,6 + mulhdu 11,9,6 + addc 28,28,12 + adde 29,29,11 + mulld 12,10,6 + mulhdu 11,10,6 + addc 30,30,12 + adde 31,31,11 + mulld 12,21,6 + mulhdu 11,21,6 + add 6,10,10 + addc 22,22,12 + adde 23,23,11 + mulld 12,10,5 + mulhdu 10,10,5 + addc 24,24,12 + adde 25,25,10 + mulld 12,6,21 + mulhdu 10,6,21 + add 6,9,9 + addc 26,26,12 + adde 27,27,10 + + mulld 12,9,9 + mulhdu 10,9,9 + addc 30,30,12 + adde 31,31,10 + mulld 12,5,6 + mulhdu 10,5,6 + addc 22,22,12 + adde 23,23,10 + mulld 12,21,6 + mulhdu 10,21,6 + addc 24,24,12 + adde 25,25,10 + + # fe51_reduce + li 0,-1 + srdi 0,0,13 + + srdi 12,26,51 + and 9,26,0 + insrdi 12,27,51,0 + srdi 21,22,51 + and 7,22,0 + insrdi 21,23,51,0 + addc 28,28,12 + addze 29,29 + addc 24,24,21 + addze 25,25 + + srdi 
12,28,51 + and 10,28,0 + insrdi 12,29,51,0 + srdi 21,24,51 + and 8,24,0 + insrdi 21,25,51,0 + addc 30,30,12 + addze 31,31 + add 9,9,21 + + srdi 12,30,51 + and 11,30,0 + insrdi 12,31,51,0 + mulli 12,12,19 + + add 7,7,12 + + srdi 21,9,51 + and 9,9,0 + add 10,10,21 + + srdi 12,7,51 + and 7,7,0 + add 8,8,12 + + bdnz .Lsqr_times_loop + + std 9,16(3) + std 10,24(3) + std 11,32(3) + std 7,0(3) + std 8,8(3) + + ld 21,56(1) + ld 22,64(1) + ld 23,72(1) + ld 24,80(1) + ld 25,88(1) + ld 26,96(1) + ld 27,104(1) + ld 28,112(1) + ld 29,120(1) + ld 30,128(1) + ld 31,136(1) + addi 1,1,144 + blr +SYM_FUNC_END(x25519_fe51_sqr_times) + +.align 5 +SYM_FUNC_START(x25519_fe51_frombytes) + + li 12, -1 + srdi 12, 12, 13 # 0x7ffffffffffff + + ld 5, 0(4) + ld 6, 8(4) + ld 7, 16(4) + ld 8, 24(4) + + srdi 10, 5, 51 + and 5, 5, 12 # h0 + + sldi 11, 6, 13 + or 11, 10, 11 # h1t + srdi 10, 6, 38 + and 6, 11, 12 # h1 + + sldi 11, 7, 26 + or 10, 10, 11 # h2t + + srdi 11, 7, 25 + and 7, 10, 12 # h2 + sldi 10, 8, 39 + or 11, 11, 10 # h3t + + srdi 9, 8, 12 + and 8, 11, 12 # h3 + and 9, 9, 12 # h4 + + std 5, 0(3) + std 6, 8(3) + std 7, 16(3) + std 8, 24(3) + std 9, 32(3) + + blr +SYM_FUNC_END(x25519_fe51_frombytes) + +.align 5 +SYM_FUNC_START(x25519_fe51_tobytes) + + ld 5, 0(4) + ld 6, 8(4) + ld 7, 16(4) + ld 8, 24(4) + ld 9, 32(4) + + li 12, -1 + srdi 12, 12, 13 # 0x7ffffffffffff + + # Full reduction + addi 10, 5, 19 + srdi 10, 10, 51 + add 10, 10, 6 + srdi 10, 10, 51 + add 10, 10, 7 + srdi 10, 10, 51 + add 10, 10, 8 + srdi 10, 10, 51 + add 10, 10, 9 + srdi 10, 10, 51 + + mulli 10, 10, 19 + add 5, 5, 10 + srdi 11, 5, 51 + add 6, 6, 11 + srdi 11, 6, 51 + add 7, 7, 11 + srdi 11, 7, 51 + add 8, 8, 11 + srdi 11, 8, 51 + add 9, 9, 11 + + and 5, 5, 12 + and 6, 6, 12 + and 7, 7, 12 + and 8, 8, 12 + and 9, 9, 12 + + sldi 10, 6, 51 + or 5, 5, 10 # s0 + + srdi 11, 6, 13 + sldi 10, 7, 38 + or 6, 11, 10 # s1 + + srdi 11, 7, 26 + sldi 10, 8, 25 + or 7, 11, 10 # s2 + + srdi 11, 8, 39 + sldi 10, 9, 12 + or 8, 11, 
10 # s3 + + std 5, 0(3) + std 6, 8(3) + std 7, 16(3) + std 8, 24(3) + + blr +SYM_FUNC_END(x25519_fe51_tobytes) + +.align 5 +SYM_FUNC_START(x25519_cswap) + + li 7, 5 + neg 6, 5 + mtctr 7 + +.Lswap_loop: + ld 8, 0(3) + ld 9, 0(4) + xor 10, 8, 9 + and 10, 10, 6 + xor 11, 8, 10 + xor 12, 9, 10 + std 11, 0(3) + addi 3, 3, 8 + std 12, 0(4) + addi 4, 4, 8 + bdnz .Lswap_loop + + blr +SYM_FUNC_END(x25519_cswap) diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index c9e59589a1ce..24875e6295f2 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -18,6 +18,7 @@ config CRYPTO_AES_NI_INTEL depends on X86 select CRYPTO_AEAD select CRYPTO_LIB_AES + select CRYPTO_LIB_GF128MUL select CRYPTO_ALGAPI select CRYPTO_SKCIPHER select CRYPTO_SIMD diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 9c5ce5613738..53b4a277809e 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -48,8 +48,12 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o -aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \ - aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o +aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \ + aes-gcm-aesni-x86_64.o \ + aes-xts-avx-x86_64.o +ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) +aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o +endif obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S new file mode 100644 index 000000000000..45940e2883a0 --- /dev/null +++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S @@ -0,0 +1,1128 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// AES-NI optimized AES-GCM for x86_64 +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// 
+//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +// support the original set of AES instructions, i.e. AES-NI. Two +// implementations are provided, one that uses AVX and one that doesn't. They +// are very similar, being generated by the same macros. The only difference is +// that the AVX implementation takes advantage of VEX-coded instructions in some +// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX +// implementation does *not* use 256-bit vectors, as AES is not supported on +// 256-bit vectors until the VAES feature (which this file doesn't target). +// +// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1 +// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems +// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) +// +// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is +// more thoroughly commented. This file has the following notable changes: +// +// - The vector length is fixed at 128-bit, i.e. xmm registers. This means +// there is only one AES block (and GHASH block) per register. +// +// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of +// 32. 
We work around this by being much more careful about using +// registers, relying heavily on loads to load values as they are needed. +// +// - Masking is not available either. We work around this by implementing +// partial block loads and stores using overlapping scalar loads and stores +// combined with shifts and SSE4.1 insertion and extraction instructions. +// +// - The main loop is organized differently due to the different design +// constraints. First, with just one AES block per SIMD register, on some +// CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore +// do an 8-register wide loop. Considering that and the fact that we have +// just 16 SIMD registers to work with, it's not feasible to cache AES +// round keys and GHASH key powers in registers across loop iterations. +// That's not ideal, but also not actually that bad, since loads can run in +// parallel with other instructions. Significantly, this also makes it +// possible to roll up the inner loops, relying on hardware loop unrolling +// instead of software loop unrolling, greatly reducing code size. +// +// - We implement the GHASH multiplications in the main loop using Karatsuba +// multiplication instead of schoolbook multiplication. This saves one +// pclmulqdq instruction per block, at the cost of one 64-bit load, one +// pshufd, and 0.25 pxors per block. (This is without the three-argument +// XOR support that would be provided by AVX512 / AVX10, which would be +// more beneficial to schoolbook than Karatsuba.) +// +// As a rough approximation, we can assume that Karatsuba multiplication is +// faster than schoolbook multiplication in this context if one pshufd and +// 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit +// load is "free" due to running in parallel with arithmetic instructions.) +// This is true on AMD CPUs, including all that support pclmulqdq up to at +// least Zen 3. 
It's also true on older Intel CPUs: Westmere through +// Haswell on the Core side, and Silvermont through Goldmont Plus on the +// low-power side. On some of these CPUs, pclmulqdq is quite slow, and the +// benefit of Karatsuba should be substantial. On newer Intel CPUs, +// schoolbook multiplication should be faster, but only marginally. +// +// Not all these CPUs were available to be tested. However, benchmarks on +// available CPUs suggest that this approximation is plausible. Switching +// to Karatsuba showed negligible change (< 1%) on Intel Broadwell, +// Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%. +// Considering that and the fact that Karatsuba should be even more +// beneficial on older Intel CPUs, it seems like the right choice here. +// +// An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be +// saved by using a multiplication-less reduction method. We don't do that +// because it would require a large number of shift and xor instructions, +// making it less worthwhile and likely harmful on newer CPUs. +// +// It does make sense to sometimes use a different reduction optimization +// that saves a pclmulqdq, though: precompute the hash key times x^64, and +// multiply the low half of the data block by the hash key with the extra +// factor of x^64. This eliminates one step of the reduction. However, +// this is incompatible with Karatsuba multiplication. Therefore, for +// multi-block processing we use Karatsuba multiplication with a regular +// reduction. For single-block processing, we use the x^64 optimization. + +#include <linux/linkage.h> + +.section .rodata +.p2align 4 +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f +.Lgfpoly: + .quad 0xc200000000000000 +.Lone: + .quad 1 +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of + // 'len' 0xff bytes and the rest zeroes. 
+.Lzeropad_mask: + .octa 0xffffffffffffffffffffffffffffffff + .octa 0 + +// Offsets in struct aes_gcm_key_aesni +#define OFFSETOF_AESKEYLEN 480 +#define OFFSETOF_H_POWERS 496 +#define OFFSETOF_H_POWERS_XORED 624 +#define OFFSETOF_H_TIMES_X64 688 + +.text + +// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback +// assumes that all operands are distinct and that any mem operand is aligned. +.macro _vpclmulqdq imm, src1, src2, dst +.if USE_AVX + vpclmulqdq \imm, \src1, \src2, \dst +.else + movdqa \src2, \dst + pclmulqdq \imm, \src1, \dst +.endif +.endm + +// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes +// that all operands are distinct and that any mem operand is aligned. +.macro _vpshufb src1, src2, dst +.if USE_AVX + vpshufb \src1, \src2, \dst +.else + movdqa \src2, \dst + pshufb \src1, \dst +.endif +.endm + +// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that +// all operands are distinct. +.macro _vpand src1, src2, dst +.if USE_AVX + vpand \src1, \src2, \dst +.else + movdqu \src1, \dst + pand \src2, \dst +.endif +.endm + +// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must +// be a temporary xmm register. +.macro _xor_mem_to_reg mem, reg, tmp +.if USE_AVX + vpxor \mem, \reg, \reg +.else + movdqu \mem, \tmp + pxor \tmp, \reg +.endif +.endm + +// Test the unaligned memory operand \mem against the xmm register \reg. \tmp +// must be a temporary xmm register. +.macro _test_mem mem, reg, tmp +.if USE_AVX + vptest \mem, \reg +.else + movdqu \mem, \tmp + ptest \tmp, \reg +.endif +.endm + +// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst +// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. +.macro _load_partial_block src, dst, tmp64, tmp32 + sub $8, %ecx // LEN - 8 + jle .Lle8\@ + + // Load 9 <= LEN <= 15 bytes. 
+ movq (\src), \dst // Load first 8 bytes + mov (\src, %rcx), %rax // Load last 8 bytes + neg %ecx + shl $3, %ecx + shr %cl, %rax // Discard overlapping bytes + pinsrq $1, %rax, \dst + jmp .Ldone\@ + +.Lle8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Load 4 <= LEN <= 8 bytes. + mov (\src), %eax // Load first 4 bytes + mov (\src, %rcx), \tmp32 // Load last 4 bytes + jmp .Lcombine\@ + +.Llt4\@: + // Load 1 <= LEN <= 3 bytes. + add $2, %ecx // LEN - 2 + movzbl (\src), %eax // Load first byte + jl .Lmovq\@ + movzwl (\src, %rcx), \tmp32 // Load last 2 bytes +.Lcombine\@: + shl $3, %ecx + shl %cl, \tmp64 + or \tmp64, %rax // Combine the two parts +.Lmovq\@: + movq %rax, \dst +.Ldone\@: +.endm + +// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. +// Clobbers %rax, %rcx, and %rsi. +.macro _store_partial_block src, dst + sub $8, %ecx // LEN - 8 + jl .Llt8\@ + + // Store 8 <= LEN <= 15 bytes. + pextrq $1, \src, %rax + mov %ecx, %esi + shl $3, %ecx + ror %cl, %rax + mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes + movq \src, (\dst) // Store first 8 bytes + jmp .Ldone\@ + +.Llt8\@: + add $4, %ecx // LEN - 4 + jl .Llt4\@ + + // Store 4 <= LEN <= 7 bytes. + pextrd $1, \src, %eax + mov %ecx, %esi + shl $3, %ecx + ror %cl, %eax + mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes + movd \src, (\dst) // Store first 4 bytes + jmp .Ldone\@ + +.Llt4\@: + // Store 1 <= LEN <= 3 bytes. + pextrb $0, \src, 0(\dst) + cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? + jl .Ldone\@ + pextrb $1, \src, 1(\dst) + je .Ldone\@ + pextrb $2, \src, 2(\dst) +.Ldone\@: +.endm + +// Do one step of GHASH-multiplying \a by \b and storing the reduced product in +// \b. To complete all steps, this must be invoked with \i=0 through \i=9. +// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the +// .Lgfpoly constant, and \t0-\t1 must be temporary registers. 
+.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1 + + // MI = (a_L * b_H) + ((a*x^64)_L * b_L) +.if \i == 0 + _vpclmulqdq $0x01, \a, \b, \t0 +.elseif \i == 1 + _vpclmulqdq $0x00, \a_times_x64, \b, \t1 +.elseif \i == 2 + pxor \t1, \t0 + + // HI = (a_H * b_H) + ((a*x^64)_H * b_L) +.elseif \i == 3 + _vpclmulqdq $0x11, \a, \b, \t1 +.elseif \i == 4 + pclmulqdq $0x10, \a_times_x64, \b +.elseif \i == 5 + pxor \t1, \b +.elseif \i == 6 + + // Fold MI into HI. + pshufd $0x4e, \t0, \t1 // Swap halves of MI +.elseif \i == 7 + pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + pxor \t1, \b +.elseif \i == 9 + pxor \t0, \b +.endif +.endm + +// GHASH-multiply \a by \b and store the reduced product in \b. +// See _ghash_mul_step for details. +.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1 +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1 +.endr +.endm + +// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi. +// This does Karatsuba multiplication and must be paired with _ghash_reduce. On +// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the +// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered. +.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0 + + // LO += a_L * b_L + _vpclmulqdq $0x00, \a, \b, \t0 + pxor \t0, \lo + + // b_L + b_H + pshufd $0x4e, \b, \t0 + pxor \b, \t0 + + // HI += a_H * b_H + pclmulqdq $0x11, \a, \b + pxor \b, \hi + + // MI += (a_L + a_H) * (b_L + b_H) + pclmulqdq $0x00, \a_xored, \t0 + pxor \t0, \mi +.endm + +// Reduce the product from \lo, \mi, and \hi, and store the result in \dst. +// This assumes that _ghash_mul_noreduce was used. +.macro _ghash_reduce lo, mi, hi, dst, t0 + + movq .Lgfpoly(%rip), \t0 + + // MI += LO + HI (needed because we used Karatsuba multiplication) + pxor \lo, \mi + pxor \hi, \mi + + // Fold LO into MI. 
+ pshufd $0x4e, \lo, \dst + pclmulqdq $0x00, \t0, \lo + pxor \dst, \mi + pxor \lo, \mi + + // Fold MI into HI. + pshufd $0x4e, \mi, \dst + pclmulqdq $0x00, \t0, \mi + pxor \hi, \dst + pxor \mi, \dst +.endm + +// Do the first step of the GHASH update of a set of 8 ciphertext blocks. +// +// The whole GHASH update does: +// +// GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 + +// blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1 +// +// This macro just does the first step: it does the unreduced multiplication +// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm +// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the +// inner block counter in %rax, which is a value that counts up by 8 for each +// block in the set of 8 and is used later to index by 8*blknum and 16*blknum. +// +// To reduce the number of pclmulqdq instructions required, both this macro and +// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook +// multiplication. See the file comment for more details about this choice. +// +// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if +// encrypting, or SRC if decrypting. They also expect the precomputed hash key +// powers H^i and their XOR'd-together halves to be available in the struct +// pointed to by KEY. Both macros clobber TMP[0-2]. +.macro _ghash_update_begin_8x enc + + // Initialize the inner block counter. + xor %eax, %eax + + // Load the highest hash key power, H^8. + movdqa OFFSETOF_H_POWERS(KEY), TMP0 + + // Load the first ciphertext block and byte-reflect it. +.if \enc + movdqu (DST), TMP1 +.else + movdqu (SRC), TMP1 +.endif + pshufb BSWAP_MASK, TMP1 + + // Add the GHASH accumulator to the ciphertext block to get the block + // 'b' that needs to be multiplied with the hash key power 'a'. 
+ pxor TMP1, GHASH_ACC + + // b_L + b_H + pshufd $0x4e, GHASH_ACC, MI + pxor GHASH_ACC, MI + + // LO = a_L * b_L + _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO + + // HI = a_H * b_H + pclmulqdq $0x11, TMP0, GHASH_ACC + + // MI = (a_L + a_H) * (b_L + b_H) + pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI +.endm + +// Continue the GHASH update of 8 ciphertext blocks as described above by doing +// an unreduced multiplication of the next ciphertext block by the next lowest +// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI. +.macro _ghash_update_continue_8x enc + add $8, %eax + + // Load the next lowest key power. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0 + + // Load the next ciphertext block and byte-reflect it. +.if \enc + movdqu (DST,%rax,2), TMP1 +.else + movdqu (SRC,%rax,2), TMP1 +.endif + pshufb BSWAP_MASK, TMP1 + + // LO += a_L * b_L + _vpclmulqdq $0x00, TMP0, TMP1, TMP2 + pxor TMP2, LO + + // b_L + b_H + pshufd $0x4e, TMP1, TMP2 + pxor TMP1, TMP2 + + // HI += a_H * b_H + pclmulqdq $0x11, TMP0, TMP1 + pxor TMP1, GHASH_ACC + + // MI += (a_L + a_H) * (b_L + b_H) + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1 + pclmulqdq $0x00, TMP1, TMP2 + pxor TMP2, MI +.endm + +// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to +// _ghash_reduce, but it's hardcoded to use the registers of the main loop and +// it uses the same register for HI and the destination. It's also divided into +// two steps. TMP1 must be preserved across steps. +// +// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of +// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would +// increase the critical path length, and it seems to slightly hurt performance. 
+.macro _ghash_update_end_8x_step i +.if \i == 0 + movq .Lgfpoly(%rip), TMP1 + pxor LO, MI + pxor GHASH_ACC, MI + pshufd $0x4e, LO, TMP2 + pclmulqdq $0x00, TMP1, LO + pxor TMP2, MI + pxor LO, MI +.elseif \i == 1 + pshufd $0x4e, MI, TMP2 + pclmulqdq $0x00, TMP1, MI + pxor TMP2, GHASH_ACC + pxor MI, GHASH_ACC +.endif +.endm + +// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key); +// +// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH +// related fields in the key struct. +.macro _aes_gcm_precompute + + // Function arguments + .set KEY, %rdi + + // Additional local variables. + // %xmm0-%xmm1 and %rax are used as temporaries. + .set RNDKEYLAST_PTR, %rsi + .set H_CUR, %xmm2 + .set H_POW1, %xmm3 // H^1 + .set H_POW1_X64, %xmm4 // H^1 * x^64 + .set GFPOLY, %xmm5 + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block + lea 16(KEY), %rax +1: + aesenc (%rax), H_POW1 + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + aesenclast (RNDKEYLAST_PTR), H_POW1 + + // Preprocess the raw hash subkey as needed to operate on GHASH's + // bit-reflected values directly: reflect its bytes, then multiply it by + // x^-1 (using the backwards interpretation of polynomial coefficients + // from the GCM spec) or equivalently x^1 (using the alternative, + // natural interpretation of polynomial coefficients). + pshufb .Lbswap_mask(%rip), H_POW1 + movdqa H_POW1, %xmm0 + pshufd $0xd3, %xmm0, %xmm0 + psrad $31, %xmm0 + paddq H_POW1, H_POW1 + pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0 + pxor %xmm0, H_POW1 + + // Store H^1. + movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY) + + // Compute and store H^1 * x^64. 
+ movq .Lgfpoly(%rip), GFPOLY + pshufd $0x4e, H_POW1, %xmm0 + _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64 + pxor %xmm0, H_POW1_X64 + movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY) + + // Compute and store the halves of H^1 XOR'd together. + pxor H_POW1, %xmm0 + movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY) + + // Compute and store the remaining key powers H^2 through H^8. + movdqa H_POW1, H_CUR + mov $6*8, %eax +.Lprecompute_next\@: + // Compute H^i = H^{i-1} * H^1. + _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1 + // Store H^i. + movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2) + // Compute and store the halves of H^i XOR'd together. + pshufd $0x4e, H_CUR, %xmm0 + pxor H_CUR, %xmm0 + movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax) + sub $8, %eax + jge .Lprecompute_next\@ + + RET +.endm + +// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, +// u8 ghash_acc[16], const u8 *aad, int aadlen); +// +// This function processes the AAD (Additional Authenticated Data) in GCM. +// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all +// zeroes. |aadlen| must be a multiple of 16, except on the last call where it +// can be any length. The caller must do any buffering needed to ensure this. +.macro _aes_gcm_aad_update + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx + // Note: _load_partial_block relies on AADLEN being in %ecx. + + // Additional local variables. + // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers. 
+ .set BSWAP_MASK, %xmm2 + .set GHASH_ACC, %xmm3 + .set H_POW1, %xmm4 // H^1 + .set H_POW1_X64, %xmm5 // H^1 * x^64 + .set GFPOLY, %xmm6 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movdqu (GHASH_ACC_PTR), GHASH_ACC + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 + movq .Lgfpoly(%rip), GFPOLY + + // Process the AAD one full block at a time. + sub $16, AADLEN + jl .Laad_loop_1x_done\@ +.Laad_loop_1x\@: + movdqu (AAD), %xmm0 + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0, GHASH_ACC + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 + add $16, AAD + sub $16, AADLEN + jge .Laad_loop_1x\@ +.Laad_loop_1x_done\@: + // Check whether there is a partial block at the end. + add $16, AADLEN + jz .Laad_done\@ + + // Process a partial block of length 1 <= AADLEN <= 15. + // _load_partial_block assumes that %ecx contains AADLEN. + _load_partial_block AAD, %xmm0, %r10, %r10d + pshufb BSWAP_MASK, %xmm0 + pxor %xmm0, GHASH_ACC + _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 + +.Laad_done\@: + movdqu GHASH_ACC, (GHASH_ACC_PTR) + RET +.endm + +// Increment LE_CTR eight times to generate eight little-endian counter blocks, +// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with +// the zero-th AES round key. Clobbers TMP0 and TMP1. +.macro _ctr_begin_8x + movq .Lone(%rip), TMP0 + movdqa (KEY), TMP1 // zero-th round key +.irp i, 0,1,2,3,4,5,6,7 + _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i + pxor TMP1, AESDATA\i + paddd TMP0, LE_CTR +.endr +.endm + +// Do a non-last round of AES on AESDATA[0-7] using \round_key. +.macro _aesenc_8x round_key +.irp i, 0,1,2,3,4,5,6,7 + aesenc \round_key, AESDATA\i +.endr +.endm + +// Do the last round of AES on AESDATA[0-7] using \round_key. +.macro _aesenclast_8x round_key +.irp i, 0,1,2,3,4,5,6,7 + aesenclast \round_key, AESDATA\i +.endr +.endm + +// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and +// store the result to DST. Clobbers TMP0. 
+.macro _xor_data_8x +.irp i, 0,1,2,3,4,5,6,7 + _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0 +.endr +.irp i, 0,1,2,3,4,5,6,7 + movdqu AESDATA\i, \i*16(DST) +.endr +.endm + +// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). +// +// This function computes the next portion of the CTR keystream, XOR's it with +// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +// next |datalen| ciphertext bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. For a new +// message, the low word of the counter must be 2. This function loads the +// counter from |le_ctr| and increments the loaded counter as needed, but it +// does *not* store the updated counter back to |le_ctr|. The caller must +// update |le_ctr| if any more data segments follow. Internally, only the low +// 32-bit word of the counter is incremented, following the GCM standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + // Note: the code setting up for _load_partial_block assumes that SRC is + // in %rcx (and that DATALEN is *not* in %rcx). + + // Additional local variables + + // %rax and %rsi are used as temporary registers. 
Note: %rsi overlaps + // with LE_CTR_PTR, which is used only at the beginning. + + .set AESKEYLEN, %r10d // AES key length in bytes + .set AESKEYLEN64, %r10 + .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key + + // Put the most frequently used values in %xmm0-%xmm7 to reduce code + // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.) + .set TMP0, %xmm0 + .set TMP1, %xmm1 + .set TMP2, %xmm2 + .set LO, %xmm3 // Low part of unreduced product + .set MI, %xmm4 // Middle part of unreduced product + .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also + // the high part of unreduced product + .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes + .set LE_CTR, %xmm7 // Little-endian counter value + .set AESDATA0, %xmm8 + .set AESDATA1, %xmm9 + .set AESDATA2, %xmm10 + .set AESDATA3, %xmm11 + .set AESDATA4, %xmm12 + .set AESDATA5, %xmm13 + .set AESDATA6, %xmm14 + .set AESDATA7, %xmm15 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movdqu (GHASH_ACC_PTR), GHASH_ACC + movdqu (LE_CTR_PTR), LE_CTR + + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + + // If there are at least 8*16 bytes of data, then continue into the main + // loop, which processes 8*16 bytes of data per iteration. + // + // The main loop interleaves AES and GHASH to improve performance on + // CPUs that can execute these instructions in parallel. When + // decrypting, the GHASH input (the ciphertext) is immediately + // available. When encrypting, we instead encrypt a set of 8 blocks + // first and then GHASH those blocks while encrypting the next set of 8, + // repeat that as needed, and finally GHASH the last set of 8 blocks. + // + // Code size optimization: Prefer adding or subtracting -8*16 over 8*16, + // as this makes the immediate fit in a signed byte, saving 3 bytes. + add $-8*16, DATALEN + jl .Lcrypt_loop_8x_done\@ +.if \enc + // Encrypt the first 8 plaintext blocks. 
+ _ctr_begin_8x + lea 16(KEY), %rsi + .p2align 4 +1: + movdqa (%rsi), TMP0 + _aesenc_8x TMP0 + add $16, %rsi + cmp %rsi, RNDKEYLAST_PTR + jne 1b + movdqa (%rsi), TMP0 + _aesenclast_8x TMP0 + _xor_data_8x + // Don't increment DST until the ciphertext blocks have been hashed. + sub $-8*16, SRC + add $-8*16, DATALEN + jl .Lghash_last_ciphertext_8x\@ +.endif + + .p2align 4 +.Lcrypt_loop_8x\@: + + // Generate the next set of 8 counter blocks and start encrypting them. + _ctr_begin_8x + lea 16(KEY), %rsi + + // Do a round of AES, and start the GHASH update of 8 ciphertext blocks + // by doing the unreduced multiplication for the first ciphertext block. + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + _ghash_update_begin_8x \enc + + // Do 7 more rounds of AES, and continue the GHASH update by doing the + // unreduced multiplication for the remaining ciphertext blocks. + .p2align 4 +1: + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + _ghash_update_continue_8x \enc + cmp $7*8, %eax + jne 1b + + // Do the remaining AES rounds. + .p2align 4 +1: + movdqa (%rsi), TMP0 + add $16, %rsi + _aesenc_8x TMP0 + cmp %rsi, RNDKEYLAST_PTR + jne 1b + + // Do the GHASH reduction and the last round of AES. + movdqa (RNDKEYLAST_PTR), TMP0 + _ghash_update_end_8x_step 0 + _aesenclast_8x TMP0 + _ghash_update_end_8x_step 1 + + // XOR the data with the AES-CTR keystream blocks. +.if \enc + sub $-8*16, DST +.endif + _xor_data_8x + sub $-8*16, SRC +.if !\enc + sub $-8*16, DST +.endif + add $-8*16, DATALEN + jge .Lcrypt_loop_8x\@ + +.if \enc +.Lghash_last_ciphertext_8x\@: + // Update GHASH with the last set of 8 ciphertext blocks. + _ghash_update_begin_8x \enc + .p2align 4 +1: + _ghash_update_continue_8x \enc + cmp $7*8, %eax + jne 1b + _ghash_update_end_8x_step 0 + _ghash_update_end_8x_step 1 + sub $-8*16, DST +.endif + +.Lcrypt_loop_8x_done\@: + + sub $-8*16, DATALEN + jz .Ldone\@ + + // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. 
We keep + // things simple and keep the code size down by just going one block at + // a time, again taking advantage of hardware loop unrolling. Since + // there are enough key powers available for all remaining data, we do + // the GHASH multiplications unreduced, and only reduce at the very end. + + .set HI, TMP2 + .set H_POW, AESDATA0 + .set H_POW_XORED, AESDATA1 + .set ONE, AESDATA2 + + movq .Lone(%rip), ONE + + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. + pxor LO, LO + pxor MI, MI + pxor HI, HI + + // Set up a block counter %rax to contain 8*(8-n), where n is the number + // of blocks that remain, counting any partial block. This will be used + // to access the key powers H^n through H^1. + mov DATALEN, %eax + neg %eax + and $~15, %eax + sar $1, %eax + add $64, %eax + + sub $16, DATALEN + jl .Lcrypt_loop_1x_done\@ + + // Process the data one full block at a time. +.Lcrypt_loop_1x\@: + + // Encrypt the next counter block. + _vpshufb BSWAP_MASK, LE_CTR, TMP0 + paddd ONE, LE_CTR + pxor (KEY), TMP0 + lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + aesenc -7*16(%rsi), TMP0 + aesenc -6*16(%rsi), TMP0 +192: + aesenc -5*16(%rsi), TMP0 + aesenc -4*16(%rsi), TMP0 +128: +.irp i, -3,-2,-1,0,1,2,3,4,5 + aesenc \i*16(%rsi), TMP0 +.endr + aesenclast (RNDKEYLAST_PTR), TMP0 + + // Load the next key power H^i. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED + + // XOR the keystream block that was just generated in TMP0 with the next + // source data block and store the resulting en/decrypted data to DST. +.if \enc + _xor_mem_to_reg (SRC), TMP0, tmp=TMP1 + movdqu TMP0, (DST) +.else + movdqu (SRC), TMP1 + pxor TMP1, TMP0 + movdqu TMP0, (DST) +.endif + + // Update GHASH with the ciphertext block. 
+.if \enc + pshufb BSWAP_MASK, TMP0 + pxor TMP0, GHASH_ACC +.else + pshufb BSWAP_MASK, TMP1 + pxor TMP1, GHASH_ACC +.endif + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 + pxor GHASH_ACC, GHASH_ACC + + add $8, %eax + add $16, SRC + add $16, DST + sub $16, DATALEN + jge .Lcrypt_loop_1x\@ +.Lcrypt_loop_1x_done\@: + // Check whether there is a partial block at the end. + add $16, DATALEN + jz .Lghash_reduce\@ + + // Process a partial block of length 1 <= DATALEN <= 15. + + // Encrypt a counter block for the last time. + pshufb BSWAP_MASK, LE_CTR + pxor (KEY), LE_CTR + lea 16(KEY), %rsi +1: + aesenc (%rsi), LE_CTR + add $16, %rsi + cmp %rsi, RNDKEYLAST_PTR + jne 1b + aesenclast (RNDKEYLAST_PTR), LE_CTR + + // Load the lowest key power, H^1. + movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW + movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED + + // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is + // in %rcx, but _load_partial_block needs DATALEN in %rcx instead. + // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC. + mov SRC, RNDKEYLAST_PTR + mov DATALEN, %ecx + _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi + + // XOR the keystream block that was just generated in LE_CTR with the + // source data block and store the resulting en/decrypted data to DST. + pxor TMP0, LE_CTR + mov DATALEN, %ecx + _store_partial_block LE_CTR, DST + + // If encrypting, zero-pad the final ciphertext block for GHASH. (If + // decrypting, this was already done by _load_partial_block.) +.if \enc + lea .Lzeropad_mask+16(%rip), %rax + sub DATALEN64, %rax + _vpand (%rax), LE_CTR, TMP0 +.endif + + // Update GHASH with the final ciphertext block. + pshufb BSWAP_MASK, TMP0 + pxor TMP0, GHASH_ACC + _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 + +.Lghash_reduce\@: + // Finally, do the GHASH reduction. + _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0 + +.Ldone\@: + // Store the updated GHASH accumulator back to memory. 
+ movdqu GHASH_ACC, (GHASH_ACC_PTR) + + RET +.endm + +// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key, +// const u32 le_ctr[4], const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. +.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + .set TAGLEN64, %r10 + + // Additional local variables. + // %rax and %xmm0-%xmm2 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set BSWAP_MASK, %xmm3 + .set GHASH_ACC, %xmm4 + .set H_POW1, %xmm5 // H^1 + .set H_POW1_X64, %xmm6 // H^1 * x^64 + .set GFPOLY, %xmm7 + + movdqa .Lbswap_mask(%rip), BSWAP_MASK + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. 
+ movdqu (LE_CTR_PTR), %xmm0 + mov $1, %eax + pinsrd $0, %eax, %xmm0 + + // Build the lengths block and XOR it into the GHASH accumulator. + movq TOTAL_DATALEN, GHASH_ACC + pinsrq $1, TOTAL_AADLEN, GHASH_ACC + psllq $3, GHASH_ACC // Bytes to bits + _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1 + + movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 + movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 + movq .Lgfpoly(%rip), GFPOLY + + // Make %rax point to the 6th from last AES round key. (Using signed + // byte offsets -7*16 through 6*16 decreases code size.) + lea (KEY,AESKEYLEN64,4), %rax + + // AES-encrypt the counter block and also multiply GHASH_ACC by H^1. + // Interleave the AES and GHASH instructions to improve performance. + pshufb BSWAP_MASK, %xmm0 + pxor (KEY), %xmm0 + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + aesenc -7*16(%rax), %xmm0 + aesenc -6*16(%rax), %xmm0 +192: + aesenc -5*16(%rax), %xmm0 + aesenc -4*16(%rax), %xmm0 +128: +.irp i, 0,1,2,3,4,5,6,7,8 + aesenc (\i-3)*16(%rax), %xmm0 + _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 +.endr + aesenclast 6*16(%rax), %xmm0 + _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 + + // Undo the byte reflection of the GHASH accumulator. + pshufb BSWAP_MASK, GHASH_ACC + + // Encrypt the GHASH accumulator. + pxor %xmm0, GHASH_ACC + +.if \enc + // Return the computed auth tag. + movdqu GHASH_ACC, (GHASH_ACC_PTR) +.else + .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN! + + // Verify the auth tag in constant time by XOR'ing the transmitted and + // computed auth tags together and using the ptest instruction to check + // whether the first TAGLEN bytes of the result are zero. 
+ _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0 + movl 8(%rsp), TAGLEN + lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR + sub TAGLEN64, ZEROPAD_MASK_PTR + xor %eax, %eax + _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0 + sete %al +.endif + RET +.endm + +.set USE_AVX, 0 +SYM_FUNC_START(aes_gcm_precompute_aesni) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_aesni) +SYM_FUNC_START(aes_gcm_aad_update_aesni) + _aes_gcm_aad_update +SYM_FUNC_END(aes_gcm_aad_update_aesni) +SYM_FUNC_START(aes_gcm_enc_update_aesni) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_aesni) +SYM_FUNC_START(aes_gcm_dec_update_aesni) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_aesni) +SYM_FUNC_START(aes_gcm_enc_final_aesni) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_aesni) +SYM_FUNC_START(aes_gcm_dec_final_aesni) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_aesni) + +.set USE_AVX, 1 +SYM_FUNC_START(aes_gcm_precompute_aesni_avx) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_aesni_avx) +SYM_FUNC_START(aes_gcm_aad_update_aesni_avx) + _aes_gcm_aad_update +SYM_FUNC_END(aes_gcm_aad_update_aesni_avx) +SYM_FUNC_START(aes_gcm_enc_update_aesni_avx) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_aesni_avx) +SYM_FUNC_START(aes_gcm_dec_update_aesni_avx) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_aesni_avx) +SYM_FUNC_START(aes_gcm_enc_final_aesni_avx) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_aesni_avx) +SYM_FUNC_START(aes_gcm_dec_final_aesni_avx) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_aesni_avx) diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S new file mode 100644 index 000000000000..97e0ee515fc5 --- /dev/null +++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S @@ -0,0 +1,1222 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// VAES and VPCLMULQDQ optimized AES-GCM for x86_64 +// +// Copyright 2024 Google LLC +// +// Author: Eric Biggers <ebiggers@google.com> +// 
+//------------------------------------------------------------------------------ +// +// This file is dual-licensed, meaning that you can use it under your choice of +// either of the following two licenses: +// +// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// or +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +//------------------------------------------------------------------------------ +// +// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and +// either AVX512 or AVX10. Some of the functions, notably the encryption and +// decryption update functions which are the most performance-critical, are +// provided in two variants generated from a macro: one using 256-bit vectors +// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The +// other, "shared" functions (vaes_avx10) use at most 256-bit vectors. +// +// The functions that use 512-bit vectors are intended for CPUs that support +// 512-bit vectors *and* where using them doesn't cause significant +// downclocking. They require the following CPU features: +// +// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) +// +// The other functions require the following CPU features: +// +// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) +// +// All functions use the "System V" ABI. The Windows ABI is not supported. +// +// Note that we use "avx10" in the names of the functions as a shorthand to +// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's +// introduction of AVX512 and then its replacement by AVX10, there doesn't seem +// to be a simple way to name things that makes sense on all CPUs. 
+// +// Note that the macros that support both 256-bit and 512-bit vectors could +// fairly easily be changed to support 128-bit too. However, this would *not* +// be sufficient to allow the code to run on CPUs without AVX512 or AVX10, +// because the code heavily uses several features of these extensions other than +// the vector length: the increase in the number of SIMD registers from 16 to +// 32, masking support, and new instructions such as vpternlogd (which can do a +// three-argument XOR). These features are very useful for AES-GCM. + +#include <linux/linkage.h> + +.section .rodata +.p2align 6 + + // A shuffle mask that reflects the bytes of 16-byte blocks +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f + + // This is the GHASH reducing polynomial without its constant term, i.e. + // x^128 + x^7 + x^2 + x, represented using the backwards mapping + // between bits and polynomial coefficients. + // + // Alternatively, it can be interpreted as the naturally-ordered + // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the + // "reversed" GHASH reducing polynomial without its x^128 term. +.Lgfpoly: + .octa 0xc2000000000000000000000000000001 + + // Same as above, but with the (1 << 64) bit set. +.Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + + // The below constants are used for incrementing the counter blocks. + // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. + // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and + // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. +.Lctr_pattern: + .octa 0 + .octa 1 +.Linc_2blocks: + .octa 2 + .octa 3 +.Linc_4blocks: + .octa 4 + +// Number of powers of the hash key stored in the key struct. The powers are +// stored from highest (H^NUM_H_POWERS) to lowest (H^1). 
+#define NUM_H_POWERS 16 + +// Offset to AES key length (in bytes) in the key struct +#define OFFSETOF_AESKEYLEN 480 + +// Offset to start of hash key powers array in the key struct +#define OFFSETOF_H_POWERS 512 + +// Offset to end of hash key powers array in the key struct. +// +// This is immediately followed by three zeroized padding blocks, which are +// included so that partial vectors can be handled more easily. E.g. if VL=64 +// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most +// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. +#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) + +.text + +// Set the vector length in bytes. This sets the VL variable and defines +// register aliases V0-V31 that map to the ymm or zmm registers. +.macro _set_veclen vl + .set VL, \vl +.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if VL == 32 + .set V\i, %ymm\i +.elseif VL == 64 + .set V\i, %zmm\i +.else + .error "Unsupported vector length" +.endif +.endr +.endm + +// The _ghash_mul_step macro does one step of GHASH multiplication of the +// 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the +// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the +// same size as \a and \b. To complete all steps, this must be invoked with \i=0 +// through \i=9. The division into steps allows users of this macro to +// optionally interleave the computation with other instructions. Users of this +// macro must preserve the parameter registers across steps. +// +// The multiplications are done in GHASH's representation of the finite field +// GF(2^128). Elements of GF(2^128) are represented as binary polynomials +// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial +// G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1.
Addition is +// just XOR, while multiplication is more complex and has two parts: (a) do +// carryless multiplication of two 128-bit input polynomials to get a 256-bit +// intermediate product polynomial, and (b) reduce the intermediate product to +// 128 bits by adding multiples of G that cancel out terms in it. (Adding +// multiples of G doesn't change which field element the polynomial represents.) +// +// Unfortunately, the GCM specification maps bits to/from polynomial +// coefficients backwards from the natural order. In each byte it specifies the +// highest bit to be the lowest order polynomial coefficient, *not* the highest! +// This makes it nontrivial to work with the GHASH polynomials. We could +// reflect the bits, but x86 doesn't have an instruction that does that. +// +// Instead, we operate on the values without bit-reflecting them. This *mostly* +// just works, since XOR and carryless multiplication are symmetric with respect +// to bit order, but it has some consequences. First, due to GHASH's byte +// order, by skipping bit reflection, *byte* reflection becomes necessary to +// give the polynomial terms a consistent order. E.g., considering an N-bit +// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0 +// through N-1 of the byte-reflected value represent the coefficients of x^(N-1) +// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value +// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked +// with. Fortunately, x86's vpshufb instruction can do byte reflection. +// +// Second, forgoing the bit reflection causes an extra multiple of x (still +// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each +// multiplication. This is because an M-bit by N-bit carryless multiplication +// really produces a (M+N-1)-bit product, but in practice it's zero-extended to +// M+N bits. 
In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits +// to polynomial coefficients backwards, this zero-extension actually changes +// the product by introducing an extra factor of x. Therefore, users of this +// macro must ensure that one of the inputs has an extra factor of x^-1, i.e. +// the multiplicative inverse of x, to cancel out the extra x. +// +// Third, the backwards coefficients convention is just confusing to work with, +// since it makes "low" and "high" in the polynomial math mean the opposite of +// their normal meaning in computer programming. This can be solved by using an +// alternative interpretation: the polynomial coefficients are understood to be +// in the natural order, and the multiplication is actually \a * \b * x^-128 mod +// x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs, +// or the implementation at all; it just changes the mathematical interpretation +// of what each instruction is doing. Starting from here, we'll use this +// alternative interpretation, as it's easier to understand the code that way. +// +// Moving on to the implementation, the vpclmulqdq instruction does 64 x 64 => +// 128-bit carryless multiplication, so we break the 128 x 128 multiplication +// into parts as follows (the _L and _H suffixes denote low and high 64 bits): +// +// LO = a_L * b_L +// MI = (a_L * b_H) + (a_H * b_L) +// HI = a_H * b_H +// +// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit. +// Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and +// HI right away, since the way the reduction works makes that unnecessary. +// +// For the reduction, we cancel out the low 128 bits by adding multiples of G = +// x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of +// which cancels out the next lowest 64 bits. Consider a value x^64*A + B, +// where A and B are 128-bit.
Adding B_L*G to that value gives: +// +// x^64*A + B + B_L*G +// = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1) +// = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L +// = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L +// = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57)) +// +// So: if we sum A, B with its halves swapped, and the low half of B times x^63 +// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the +// original value x^64*A + B. I.e., the low 64 bits got canceled out. +// +// We just need to apply this twice: first to fold LO into MI, and second to +// fold the updated MI into HI. +// +// The needed three-argument XORs are done using the vpternlogd instruction with +// immediate 0x96, since this is faster than two vpxord instructions. +// +// A potential optimization, assuming that b is fixed per-key (if a is fixed +// per-key it would work the other way around), is to use one iteration of the +// reduction described above to precompute a value c such that x^64*c = b mod G, +// and then multiply a_L by c (and implicitly by x^64) instead of by b: +// +// MI = (a_L * c_L) + (a_H * b_L) +// HI = (a_L * c_H) + (a_H * b_H) +// +// This would eliminate the LO part of the intermediate product, which would +// eliminate the need to fold LO into MI. This would save two instructions, +// including a vpclmulqdq. However, we currently don't use this optimization +// because it would require twice as many per-key precomputed values. +// +// Using Karatsuba multiplication instead of "schoolbook" multiplication +// similarly would save a vpclmulqdq but does not seem to be worth it. 
+.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 +.if \i == 0 + vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L + vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H +.elseif \i == 1 + vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L +.elseif \i == 2 + vpxord \t2, \t1, \t1 // MI = MI_0 + MI_1 +.elseif \i == 3 + vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) +.elseif \i == 4 + vpshufd $0x4e, \t0, \t0 // Swap halves of LO +.elseif \i == 5 + vpternlogd $0x96, \t2, \t0, \t1 // Fold LO into MI +.elseif \i == 6 + vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H +.elseif \i == 7 + vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + vpshufd $0x4e, \t1, \t1 // Swap halves of MI +.elseif \i == 9 + vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI +.endif +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store +// the reduced products in \dst. See _ghash_mul_step for full explanation. +.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 +.endr +.endm + +// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the +// *unreduced* products to \lo, \mi, and \hi. +.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3 + vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L + vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H + vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L + vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H + vpxord \t0, \lo, \lo + vpternlogd $0x96, \t2, \t1, \mi + vpxord \t3, \hi, \hi +.endm + +// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit +// reduced products in \hi. See _ghash_mul_step for explanation of reduction. 
+.macro _ghash_reduce lo, mi, hi, gfpoly, t0 + vpclmulqdq $0x01, \lo, \gfpoly, \t0 + vpshufd $0x4e, \lo, \lo + vpternlogd $0x96, \t0, \lo, \mi + vpclmulqdq $0x01, \mi, \gfpoly, \t0 + vpshufd $0x4e, \mi, \mi + vpternlogd $0x96, \t0, \mi, \hi +.endm + +// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); +// +// Given the expanded AES key |key->aes_key|, this function derives the GHASH +// subkey and initializes |key->ghash_key_powers| with powers of it. +// +// The number of key powers initialized is NUM_H_POWERS, and they are stored in +// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key +// powers themselves are also initialized. +// +// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked +// with the desired length. In the VL=32 case, the function computes twice as +// many key powers as are actually used by the VL=32 GCM update functions. +// This is done to keep the key format the same regardless of vector length. +.macro _aes_gcm_precompute + + // Function arguments + .set KEY, %rdi + + // Additional local variables. V0-V2 and %rax are used as temporaries. + .set POWERS_PTR, %rsi + .set RNDKEYLAST_PTR, %rdx + .set H_CUR, V3 + .set H_CUR_YMM, %ymm3 + .set H_CUR_XMM, %xmm3 + .set H_INC, V4 + .set H_INC_YMM, %ymm4 + .set H_INC_XMM, %xmm4 + .set GFPOLY, V5 + .set GFPOLY_YMM, %ymm5 + .set GFPOLY_XMM, %xmm5 + + // Get pointer to lowest set of key powers (located at end of array). + lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax + lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR + vmovdqu (KEY), %xmm0 // Zero-th round key XOR all-zeroes block + add $16, KEY +1: + vaesenc (KEY), %xmm0, %xmm0 + add $16, KEY + cmp KEY, RNDKEYLAST_PTR + jne 1b + vaesenclast (RNDKEYLAST_PTR), %xmm0, %xmm0 + + // Reflect the bytes of the raw hash subkey. + vpshufb .Lbswap_mask(%rip), %xmm0, H_CUR_XMM + + // Zeroize the padding blocks.
+ vpxor %xmm0, %xmm0, %xmm0 + vmovdqu %ymm0, VL(POWERS_PTR) + vmovdqu %xmm0, VL+2*16(POWERS_PTR) + + // Finish preprocessing the first key power, H^1. Since this GHASH + // implementation operates directly on values with the backwards bit + // order specified by the GCM standard, it's necessary to preprocess the + // raw key as follows. First, reflect its bytes. Second, multiply it + // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards + // interpretation of polynomial coefficients), which can also be + // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121 + // + 1 using the alternative, natural interpretation of polynomial + // coefficients. For details, see the comment above _ghash_mul_step. + // + // Either way, for the multiplication the concrete operation performed + // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2 + // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit + // wide shift instruction, so instead double each of the two 64-bit + // halves and incorporate the internal carry bit into the value XOR'd. + vpshufd $0xd3, H_CUR_XMM, %xmm0 + vpsrad $31, %xmm0, %xmm0 + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM + vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0 + vpxor %xmm0, H_CUR_XMM, H_CUR_XMM + + // Load the gfpoly constant. + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY + + // Square H^1 to get H^2. + // + // Note that as with H^1, all higher key powers also need an extra + // factor of x^-1 (or x using the natural interpretation). Nothing + // special needs to be done to make this happen, though: H^1 * H^1 would + // end up with two factors of x^-1, but the multiplication consumes one. + // So the product H^2 ends up with the desired one factor of x^-1. + _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ + %xmm0, %xmm1, %xmm2 + + // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. 
+ vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM + vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM + +.if VL == 64 + // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. + _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ + %ymm0, %ymm1, %ymm2 + vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR + vshufi64x2 $0, H_INC, H_INC, H_INC +.endif + + // Store the lowest set of key powers. + vmovdqu8 H_CUR, (POWERS_PTR) + + // Compute and store the remaining key powers. With VL=32, repeatedly + // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. + // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by + // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. + mov $(NUM_H_POWERS*16/VL) - 1, %eax +.Lprecompute_next\@: + sub $VL, POWERS_PTR + _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 + vmovdqu8 H_CUR, (POWERS_PTR) + dec %eax + jnz .Lprecompute_next\@ + + vzeroupper // This is needed after using ymm or zmm registers. + RET +.endm + +// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store +// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. +.macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm + vextracti32x4 $1, \src, \t0_xmm +.if VL == 32 + vpxord \t0_xmm, \src_xmm, \dst_xmm +.elseif VL == 64 + vextracti32x4 $2, \src, \t1_xmm + vextracti32x4 $3, \src, \t2_xmm + vpxord \t0_xmm, \src_xmm, \dst_xmm + vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm +.else + .error "Unsupported vector length" +.endif +.endm + +// Do one step of the GHASH update of the data blocks given in the vector +// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The +// division into steps allows users of this macro to optionally interleave the +// computation with other instructions. 
This macro uses the vector register +// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered; +// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and +// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the +// data blocks. The parameter registers must be preserved across steps. +// +// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + +// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the +// operations are vectorized operations on vectors of 16-byte blocks. E.g., +// with VL=32 there are 2 blocks per vector and the vectorized terms correspond +// to the following non-vectorized terms: +// +// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) +// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 +// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 +// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 +// +// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. +// +// More concretely, this code does: +// - Do vectorized "schoolbook" multiplications to compute the intermediate +// 256-bit product of each block and its corresponding hash key power. +// There are 4*VL/16 of these intermediate products. +// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves +// VL/16 256-bit intermediate values. +// - Do a vectorized reduction of these 256-bit intermediate values to +// 128-bits each. This leaves VL/16 128-bit intermediate values. +// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. +// +// See _ghash_mul_step for the full explanation of the operations performed for +// each individual finite field multiplication and reduction. 
+.macro _ghash_step_4x i +.if \i == 0 + vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 + vpxord GHASH_ACC, GHASHDATA0, GHASHDATA0 + vpshufb BSWAP_MASK, GHASHDATA1, GHASHDATA1 + vpshufb BSWAP_MASK, GHASHDATA2, GHASHDATA2 +.elseif \i == 1 + vpshufb BSWAP_MASK, GHASHDATA3, GHASHDATA3 + vpclmulqdq $0x00, H_POW4, GHASHDATA0, GHASH_ACC // LO_0 + vpclmulqdq $0x00, H_POW3, GHASHDATA1, GHASHTMP0 // LO_1 + vpclmulqdq $0x00, H_POW2, GHASHDATA2, GHASHTMP1 // LO_2 +.elseif \i == 2 + vpxord GHASHTMP0, GHASH_ACC, GHASH_ACC // sum(LO_{1,0}) + vpclmulqdq $0x00, H_POW1, GHASHDATA3, GHASHTMP2 // LO_3 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC // LO = sum(LO_{3,2,1,0}) + vpclmulqdq $0x01, H_POW4, GHASHDATA0, GHASHTMP0 // MI_0 +.elseif \i == 3 + vpclmulqdq $0x01, H_POW3, GHASHDATA1, GHASHTMP1 // MI_1 + vpclmulqdq $0x01, H_POW2, GHASHDATA2, GHASHTMP2 // MI_2 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{2,1,0}) + vpclmulqdq $0x01, H_POW1, GHASHDATA3, GHASHTMP1 // MI_3 +.elseif \i == 4 + vpclmulqdq $0x10, H_POW4, GHASHDATA0, GHASHTMP2 // MI_4 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{4,3,2,1,0}) + vpclmulqdq $0x10, H_POW3, GHASHDATA1, GHASHTMP1 // MI_5 + vpclmulqdq $0x10, H_POW2, GHASHDATA2, GHASHTMP2 // MI_6 +.elseif \i == 5 + vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{6,5,4,3,2,1,0}) + vpclmulqdq $0x01, GHASH_ACC, GFPOLY, GHASHTMP2 // LO_L*(x^63 + x^62 + x^57) + vpclmulqdq $0x10, H_POW1, GHASHDATA3, GHASHTMP1 // MI_7 + vpxord GHASHTMP1, GHASHTMP0, GHASHTMP0 // MI = sum(MI_{7,6,5,4,3,2,1,0}) +.elseif \i == 6 + vpshufd $0x4e, GHASH_ACC, GHASH_ACC // Swap halves of LO + vpclmulqdq $0x11, H_POW4, GHASHDATA0, GHASHDATA0 // HI_0 + vpclmulqdq $0x11, H_POW3, GHASHDATA1, GHASHDATA1 // HI_1 + vpclmulqdq $0x11, H_POW2, GHASHDATA2, GHASHDATA2 // HI_2 +.elseif \i == 7 + vpternlogd $0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0 // Fold LO into MI + vpclmulqdq $0x11, H_POW1, GHASHDATA3, GHASHDATA3 // HI_3 + vpternlogd $0x96, GHASHDATA2, GHASHDATA1, 
GHASHDATA0 // sum(HI_{2,1,0}) + vpclmulqdq $0x01, GHASHTMP0, GFPOLY, GHASHTMP1 // MI_L*(x^63 + x^62 + x^57) +.elseif \i == 8 + vpxord GHASHDATA3, GHASHDATA0, GHASH_ACC // HI = sum(HI_{3,2,1,0}) + vpshufd $0x4e, GHASHTMP0, GHASHTMP0 // Swap halves of MI + vpternlogd $0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC // Fold MI into HI +.elseif \i == 9 + _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ + GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM +.endif +.endm + +// Do one non-last round of AES encryption on the counter blocks in V0-V3 using +// the round key that has been broadcast to all 128-bit lanes of \round_key. +.macro _vaesenc_4x round_key + vaesenc \round_key, V0, V0 + vaesenc \round_key, V1, V1 + vaesenc \round_key, V2, V2 + vaesenc \round_key, V3, V3 +.endm + +// Start the AES encryption of four vectors of counter blocks. +.macro _ctr_begin_4x + + // Increment LE_CTR four times to generate four vectors of little-endian + // counter blocks, swap each to big-endian, and store them in V0-V3. + vpshufb BSWAP_MASK, LE_CTR, V0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V1 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V2 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpshufb BSWAP_MASK, LE_CTR, V3 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + + // AES "round zero": XOR in the zero-th round key. + vpxord RNDKEY0, V0, V0 + vpxord RNDKEY0, V1, V1 + vpxord RNDKEY0, V2, V2 + vpxord RNDKEY0, V3, V3 +.endm + +// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// const u8 *src, u8 *dst, int datalen); +// +// This macro generates a GCM encryption or decryption update function with the +// above prototype (with \enc selecting which one). This macro supports both +// VL=32 and VL=64. _set_veclen must have been invoked with the desired length. 
+// +// This function computes the next portion of the CTR keystream, XOR's it with +// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +// next |datalen| ciphertext bytes. +// +// |datalen| must be a multiple of 16, except on the last call where it can be +// any length. The caller must do any buffering needed to ensure this. Both +// in-place and out-of-place en/decryption are supported. +// +// |le_ctr| must give the current counter in little-endian format. For a new +// message, the low word of the counter must be 2. This function loads the +// counter from |le_ctr| and increments the loaded counter as needed, but it +// does *not* store the updated counter back to |le_ctr|. The caller must +// update |le_ctr| if any more data segments follow. Internally, only the low +// 32-bit word of the counter is incremented, following the GCM standard. +.macro _aes_gcm_update enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set SRC, %rcx + .set DST, %r8 + .set DATALEN, %r9d + .set DATALEN64, %r9 // Zero-extend DATALEN before using! + + // Additional local variables + + // %rax and %k1 are used as temporary registers. LE_CTR_PTR is also + // available as a temporary register after the counter is loaded. + + // AES key length in bytes + .set AESKEYLEN, %r10d + .set AESKEYLEN64, %r10 + + // Pointer to the last AES round key for the chosen AES variant + .set RNDKEYLAST_PTR, %r11 + + // In the main loop, V0-V3 are used as AES input and output. Elsewhere + // they are used as temporary registers. + + // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. 
+ .set GHASHDATA0, V4 + .set GHASHDATA0_XMM, %xmm4 + .set GHASHDATA1, V5 + .set GHASHDATA1_XMM, %xmm5 + .set GHASHDATA2, V6 + .set GHASHDATA2_XMM, %xmm6 + .set GHASHDATA3, V7 + + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + // using vpshufb, copied to all 128-bit lanes. + .set BSWAP_MASK, V8 + + // RNDKEY temporarily holds the next AES round key. + .set RNDKEY, V9 + + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + // only the lowest 128-bit lane can be nonzero. When not fully reduced, + // more than one lane may be used, and they need to be XOR'd together. + .set GHASH_ACC, V10 + .set GHASH_ACC_XMM, %xmm10 + + // LE_CTR_INC is the vector of 32-bit words that need to be added to a + // vector of little-endian counter blocks to advance it forwards. + .set LE_CTR_INC, V11 + + // LE_CTR contains the next set of little-endian counter blocks. + .set LE_CTR, V12 + + // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys, + // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, + // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. + .set RNDKEY0, V13 + .set RNDKEYLAST, V14 + .set RNDKEY_M9, V15 + .set RNDKEY_M8, V16 + .set RNDKEY_M7, V17 + .set RNDKEY_M6, V18 + .set RNDKEY_M5, V19 + + // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with + // the corresponding block of source data. This is useful because + // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can + // be computed in parallel with the AES rounds. + .set RNDKEYLAST0, V20 + .set RNDKEYLAST1, V21 + .set RNDKEYLAST2, V22 + .set RNDKEYLAST3, V23 + + // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These + // cannot coincide with anything used for AES encryption, since for + // performance reasons GHASH and AES encryption are interleaved. 
+ .set GHASHTMP0, V24 + .set GHASHTMP1, V25 + .set GHASHTMP2, V26 + + // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The + // descending numbering reflects the order of the key powers. + .set H_POW4, V27 + .set H_POW3, V28 + .set H_POW2, V29 + .set H_POW1, V30 + + // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. + .set GFPOLY, V31 + + // Load some constants. + vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY + + // Load the GHASH accumulator and the starting counter. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + vbroadcasti32x4 (LE_CTR_PTR), LE_CTR + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Make RNDKEYLAST_PTR point to the last AES round key. This is the + // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 + // respectively. Then load the zero-th and last round keys. + lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR + vbroadcasti32x4 (KEY), RNDKEY0 + vbroadcasti32x4 (RNDKEYLAST_PTR), RNDKEYLAST + + // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR + + // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. +.if VL == 32 + vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC +.elseif VL == 64 + vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC +.else + .error "Unsupported vector length" +.endif + + // If there are at least 4*VL bytes of data, then continue into the loop + // that processes 4*VL bytes of data at a time. Otherwise skip it. + // + // Pre-subtracting 4*VL from DATALEN saves an instruction from the main + // loop and also ensures that at least one write always occurs to + // DATALEN, zero-extending it and allowing DATALEN64 to be used later. + sub $4*VL, DATALEN + jl .Lcrypt_loop_4x_done\@ + + // Load powers of the hash key. 
+ vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 + vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 + vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 + vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 + + // Main loop: en/decrypt and hash 4 vectors at a time. + // + // When possible, interleave the AES encryption of the counter blocks + // with the GHASH update of the ciphertext blocks. This improves + // performance on many CPUs because the execution ports used by the VAES + // instructions often differ from those used by vpclmulqdq and other + // instructions used in GHASH. For example, many Intel CPUs dispatch + // vaesenc to ports 0 and 1 and vpclmulqdq to port 5. + // + // The interleaving is easiest to do during decryption, since during + // decryption the ciphertext blocks are immediately available. For + // encryption, instead encrypt the first set of blocks, then hash those + // blocks while encrypting the next set of blocks, repeat that as + // needed, and finally hash the last set of blocks. + +.if \enc + // Encrypt the first 4 vectors of plaintext blocks. Leave the resulting + // ciphertext in GHASHDATA[0-3] for GHASH. + _ctr_begin_4x + lea 16(KEY), %rax +1: + vbroadcasti32x4 (%rax), RNDKEY + _vaesenc_4x RNDKEY + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 + vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 + vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 + vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 + vaesenclast RNDKEYLAST0, V0, GHASHDATA0 + vaesenclast RNDKEYLAST1, V1, GHASHDATA1 + vaesenclast RNDKEYLAST2, V2, GHASHDATA2 + vaesenclast RNDKEYLAST3, V3, GHASHDATA3 + vmovdqu8 GHASHDATA0, 0*VL(DST) + vmovdqu8 GHASHDATA1, 1*VL(DST) + vmovdqu8 GHASHDATA2, 2*VL(DST) + vmovdqu8 GHASHDATA3, 3*VL(DST) + add $4*VL, SRC + add $4*VL, DST + sub $4*VL, DATALEN + jl .Lghash_last_ciphertext_4x\@ +.endif + + // Cache as many additional AES round keys as possible. 
+.irp i, 9,8,7,6,5 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i +.endr + +.Lcrypt_loop_4x\@: + + // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If + // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. +.if !\enc + vmovdqu8 0*VL(SRC), GHASHDATA0 + vmovdqu8 1*VL(SRC), GHASHDATA1 + vmovdqu8 2*VL(SRC), GHASHDATA2 + vmovdqu8 3*VL(SRC), GHASHDATA3 +.endif + + // Start the AES encryption of the counter blocks. + _ctr_begin_4x + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vbroadcasti32x4 -13*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + vbroadcasti32x4 -12*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY +192: + vbroadcasti32x4 -11*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY +128: + + // XOR the source data with the last round key, saving the result in + // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). +.if \enc + vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 + vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 + vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 + vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 +.else + vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0 + vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1 + vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2 + vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3 +.endif + + // Finish the AES encryption of the counter blocks in V0-V3, interleaved + // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. +.irp i, 9,8,7,6,5 + _vaesenc_4x RNDKEY_M\i + _ghash_step_4x (9 - \i) +.endr +.irp i, 4,3,2,1 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY + _vaesenc_4x RNDKEY + _ghash_step_4x (9 - \i) +.endr + _ghash_step_4x 9 + + // Do the last AES round. This handles the XOR with the source data + // too, as per the optimization described above. 
+ vaesenclast RNDKEYLAST0, V0, GHASHDATA0 + vaesenclast RNDKEYLAST1, V1, GHASHDATA1 + vaesenclast RNDKEYLAST2, V2, GHASHDATA2 + vaesenclast RNDKEYLAST3, V3, GHASHDATA3 + + // Store the en/decrypted data to DST. + vmovdqu8 GHASHDATA0, 0*VL(DST) + vmovdqu8 GHASHDATA1, 1*VL(DST) + vmovdqu8 GHASHDATA2, 2*VL(DST) + vmovdqu8 GHASHDATA3, 3*VL(DST) + + add $4*VL, SRC + add $4*VL, DST + sub $4*VL, DATALEN + jge .Lcrypt_loop_4x\@ + +.if \enc +.Lghash_last_ciphertext_4x\@: + // Update GHASH with the last set of ciphertext blocks. +.irp i, 0,1,2,3,4,5,6,7,8,9 + _ghash_step_4x \i +.endr +.endif + +.Lcrypt_loop_4x_done\@: + + // Undo the extra subtraction by 4*VL and check whether data remains. + add $4*VL, DATALEN + jz .Ldone\@ + + // The data length isn't a multiple of 4*VL. Process the remaining data + // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. + // Going one vector at a time may seem inefficient compared to having + // separate code paths for each possible number of vectors remaining. + // However, using a loop keeps the code size down, and it performs + // surprisingly well; modern CPUs will start executing the next iteration + // before the previous one finishes and also predict the number of loop + // iterations. For a similar reason, we roll up the AES rounds. + // + // On the last iteration, the remaining length may be less than VL. + // Handle this using masking. + // + // Since there are enough key powers available for all remaining data, + // there is no need to do a GHASH reduction after each iteration. + // Instead, multiply each remaining block by its own key power, and only + // do a GHASH reduction at the very end. + + // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N + // is the number of blocks that remain. + .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused. 
+ mov DATALEN, %eax + neg %rax + and $~15, %rax // -round_up(DATALEN, 16) + lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR + + // Start collecting the unreduced GHASH intermediate value LO, MI, HI. + .set LO, GHASHDATA0 + .set LO_XMM, GHASHDATA0_XMM + .set MI, GHASHDATA1 + .set MI_XMM, GHASHDATA1_XMM + .set HI, GHASHDATA2 + .set HI_XMM, GHASHDATA2_XMM + vpxor LO_XMM, LO_XMM, LO_XMM + vpxor MI_XMM, MI_XMM, MI_XMM + vpxor HI_XMM, HI_XMM, HI_XMM + +.Lcrypt_loop_1x\@: + + // Select the appropriate mask for this iteration: all 1's if + // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the + // bzhi instruction from BMI2. (This relies on DATALEN <= 255.) +.if VL < 64 + mov $-1, %eax + bzhi DATALEN, %eax, %eax + kmovd %eax, %k1 +.else + mov $-1, %rax + bzhi DATALEN64, %rax, %rax + kmovq %rax, %k1 +.endif + + // Encrypt a vector of counter blocks. This does not need to be masked. + vpshufb BSWAP_MASK, LE_CTR, V0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + vpxord RNDKEY0, V0, V0 + lea 16(KEY), %rax +1: + vbroadcasti32x4 (%rax), RNDKEY + vaesenc RNDKEY, V0, V0 + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b + vaesenclast RNDKEYLAST, V0, V0 + + // XOR the data with the appropriate number of keystream bytes. + vmovdqu8 (SRC), V1{%k1}{z} + vpxord V1, V0, V0 + vmovdqu8 V0, (DST){%k1} + + // Update GHASH with the ciphertext block(s), without reducing. + // + // In the case of DATALEN < VL, the ciphertext is zero-padded to VL. + // (If decrypting, it's done by the above masked load. If encrypting, + // it's done by the below masked register-to-register move.) Note that + // if DATALEN <= VL - 16, there will be additional padding beyond the + // padding of the last block specified by GHASH itself; i.e., there may + // be whole block(s) that get processed by the GHASH multiplication and + // reduction instructions but should not actually be included in the + // GHASH. 
However, any such blocks are all-zeroes, and the values that + // they're multiplied with are also all-zeroes. Therefore they just add + // 0 * 0 = 0 to the final GHASH result, which makes no difference. + vmovdqu8 (POWERS_PTR), H_POW1 +.if \enc + vmovdqu8 V0, V1{%k1}{z} +.endif + vpshufb BSWAP_MASK, V1, V0 + vpxord GHASH_ACC, V0, V0 + _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 + vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + + add $VL, POWERS_PTR + add $VL, SRC + add $VL, DST + sub $VL, DATALEN + jg .Lcrypt_loop_1x\@ + + // Finally, do the GHASH reduction. + _ghash_reduce LO, MI, HI, GFPOLY, V0 + _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 + +.Ldone\@: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. + RET +.endm + +// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen); +// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// const u32 le_ctr[4], +// const u8 ghash_acc[16], +// u64 total_aadlen, u64 total_datalen, +// const u8 tag[16], int taglen); +// +// This macro generates one of the above two functions (with \enc selecting +// which one). Both functions finish computing the GCM authentication tag by +// updating GHASH with the lengths block and encrypting the GHASH accumulator. +// |total_aadlen| and |total_datalen| must be the total length of the additional +// authenticated data and the en/decrypted data in bytes, respectively. +// +// The encryption function then stores the full-length (16-byte) computed +// authentication tag to |ghash_acc|. 
The decryption function instead loads the +// expected authentication tag (the one that was transmitted) from the 16-byte +// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the +// computed tag in constant time, and returns true if and only if they match. +.macro _aes_gcm_final enc + + // Function arguments + .set KEY, %rdi + .set LE_CTR_PTR, %rsi + .set GHASH_ACC_PTR, %rdx + .set TOTAL_AADLEN, %rcx + .set TOTAL_DATALEN, %r8 + .set TAG, %r9 + .set TAGLEN, %r10d // Originally at 8(%rsp) + + // Additional local variables. + // %rax, %xmm0-%xmm3, and %k1 are used as temporary registers. + .set AESKEYLEN, %r11d + .set AESKEYLEN64, %r11 + .set GFPOLY, %xmm4 + .set BSWAP_MASK, %xmm5 + .set LE_CTR, %xmm6 + .set GHASH_ACC, %xmm7 + .set H_POW1, %xmm8 + + // Load some constants. + vmovdqa .Lgfpoly(%rip), GFPOLY + vmovdqa .Lbswap_mask(%rip), BSWAP_MASK + + // Load the AES key length in bytes. + movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN + + // Set up a counter block with 1 in the low 32-bit word. This is the + // counter that produces the ciphertext needed to encrypt the auth tag. + // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. + vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR + + // Build the lengths block and XOR it with the GHASH accumulator. + // Although the lengths block is defined as the AAD length followed by + // the en/decrypted data length, both in big-endian byte order, a byte + // reflection of the full block is needed because of the way we compute + // GHASH (see _ghash_mul_step). By using little-endian values in the + // opposite order, we avoid having to reflect any bytes here. + vmovq TOTAL_DATALEN, %xmm0 + vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 + vpsllq $3, %xmm0, %xmm0 // Bytes to bits + vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC + + // Load the first hash key power (H^1), which is stored last. + vmovdqu8 OFFSETOFEND_H_POWERS-16(KEY), H_POW1 + +.if !\enc + // Prepare a mask of TAGLEN one bits. 
+ movl 8(%rsp), TAGLEN + mov $-1, %eax + bzhi TAGLEN, %eax, %eax + kmovd %eax, %k1 +.endif + + // Make %rax point to the last AES round key for the chosen AES variant. + lea 6*16(KEY,AESKEYLEN64,4), %rax + + // Start the AES encryption of the counter block by swapping the counter + // block to big-endian and XOR-ing it with the zero-th AES round key. + vpshufb BSWAP_MASK, LE_CTR, %xmm0 + vpxor (KEY), %xmm0, %xmm0 + + // Complete the AES encryption and multiply GHASH_ACC by H^1. + // Interleave the AES and GHASH instructions to improve performance. + cmp $24, AESKEYLEN + jl 128f // AES-128? + je 192f // AES-192? + // AES-256 + vaesenc -13*16(%rax), %xmm0, %xmm0 + vaesenc -12*16(%rax), %xmm0, %xmm0 +192: + vaesenc -11*16(%rax), %xmm0, %xmm0 + vaesenc -10*16(%rax), %xmm0, %xmm0 +128: +.irp i, 0,1,2,3,4,5,6,7,8 + _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %xmm1, %xmm2, %xmm3 + vaesenc (\i-9)*16(%rax), %xmm0, %xmm0 +.endr + _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %xmm1, %xmm2, %xmm3 + + // Undo the byte reflection of the GHASH accumulator. + vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC + + // Do the last AES round and XOR the resulting keystream block with the + // GHASH accumulator to produce the full computed authentication tag. + // + // Reduce latency by taking advantage of the property vaesenclast(key, + // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last + // round key, instead of XOR'ing the final AES output with GHASH_ACC. + // + // enc_final then returns the computed auth tag, while dec_final + // compares it with the transmitted one and returns a bool. To compare + // the tags, dec_final XORs them together and uses vptest to check + // whether the result is all-zeroes. This should be constant-time. 
+ // dec_final applies the vaesenclast optimization to this additional + // value XOR'd too, using vpternlogd to XOR the last round key, GHASH + // accumulator, and transmitted auth tag together in one instruction. +.if \enc + vpxor (%rax), GHASH_ACC, %xmm1 + vaesenclast %xmm1, %xmm0, GHASH_ACC + vmovdqu GHASH_ACC, (GHASH_ACC_PTR) +.else + vmovdqu (TAG), %xmm1 + vpternlogd $0x96, (%rax), GHASH_ACC, %xmm1 + vaesenclast %xmm1, %xmm0, %xmm0 + xor %eax, %eax + vmovdqu8 %xmm0, %xmm0{%k1}{z} // Truncate to TAGLEN bytes + vptest %xmm0, %xmm0 + sete %al +.endif + // No need for vzeroupper here, since only xmm registers were used. + RET +.endm + +_set_veclen 32 +SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256) +SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256) +SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256) + +_set_veclen 64 +SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512) + _aes_gcm_precompute +SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512) +SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512) + _aes_gcm_update 1 +SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512) +SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512) + _aes_gcm_update 0 +SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512) + +// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, +// u8 ghash_acc[16], +// const u8 *aad, int aadlen); +// +// This function processes the AAD (Additional Authenticated Data) in GCM. +// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been +// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| +// must be a multiple of 16, except on the last call where it can be any length. +// The caller must do any buffering needed to ensure this. 
+// +// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. +// Therefore, for AAD processing we currently only provide this implementation +// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This +// keeps the code size down, and it enables some micro-optimizations, e.g. using +// VEX-coded instructions instead of EVEX-coded to save some instruction bytes. +// To optimize for large amounts of AAD, we could implement a 4x-wide loop and +// provide a version using 512-bit vectors, but that doesn't seem to be useful. +SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) + + // Function arguments + .set KEY, %rdi + .set GHASH_ACC_PTR, %rsi + .set AAD, %rdx + .set AADLEN, %ecx + .set AADLEN64, %rcx // Zero-extend AADLEN before using! + + // Additional local variables. + // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. + .set BSWAP_MASK, %ymm4 + .set GFPOLY, %ymm5 + .set GHASH_ACC, %ymm6 + .set GHASH_ACC_XMM, %xmm6 + .set H_POW1, %ymm7 + + // Load some constants. + vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK + vbroadcasti128 .Lgfpoly(%rip), GFPOLY + + // Load the GHASH accumulator. + vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM + + // Update GHASH with 32 bytes of AAD at a time. + // + // Pre-subtracting 32 from AADLEN saves an instruction from the loop and + // also ensures that at least one write always occurs to AADLEN, + // zero-extending it and allowing AADLEN64 to be used later. 
+ sub $32, AADLEN + jl .Laad_loop_1x_done + vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] +.Laad_loop_1x: + vmovdqu (AAD), %ymm0 + vpshufb BSWAP_MASK, %ymm0, %ymm0 + vpxor %ymm0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %ymm0, %ymm1, %ymm2 + vextracti128 $1, GHASH_ACC, %xmm0 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM + add $32, AAD + sub $32, AADLEN + jge .Laad_loop_1x +.Laad_loop_1x_done: + add $32, AADLEN + jz .Laad_done + + // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. + mov $-1, %eax + bzhi AADLEN, %eax, %eax + kmovd %eax, %k1 + vmovdqu8 (AAD), %ymm0{%k1}{z} + neg AADLEN64 + and $~15, AADLEN64 // -round_up(AADLEN, 16) + vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 + vpshufb BSWAP_MASK, %ymm0, %ymm0 + vpxor %ymm0, GHASH_ACC, GHASH_ACC + _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ + %ymm0, %ymm1, %ymm2 + vextracti128 $1, GHASH_ACC, %xmm0 + vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM + +.Laad_done: + // Store the updated GHASH accumulator back to memory. + vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) + + vzeroupper // This is needed after using ymm or zmm registers. + RET +SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) + +SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) + _aes_gcm_final 1 +SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) +SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) + _aes_gcm_final 0 +SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 39066b57a70e..eb153eff9331 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -10,16 +10,7 @@ * Vinodh Gopal <vinodh.gopal@intel.com> * Kahraman Akdemir * - * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD - * interface for 64-bit kernels. 
- * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) - * Aidan O'Mahony (aidan.o.mahony@intel.com) - * Adrian Hoban <adrian.hoban@intel.com> - * James Guilford (james.guilford@intel.com) - * Gabriele Paoloni <gabriele.paoloni@intel.com> - * Tadeusz Struk (tadeusz.struk@intel.com) - * Wajdi Feghali (wajdi.k.feghali@intel.com) - * Copyright (c) 2010, Intel Corporation. + * Copyright (c) 2010, Intel Corporation. * * Ported x86_64 version to x86: * Author: Mathias Krause <minipli@googlemail.com> @@ -27,95 +18,6 @@ #include <linux/linkage.h> #include <asm/frame.h> -#include <asm/nospec-branch.h> - -/* - * The following macros are used to move an (un)aligned 16 byte value to/from - * an XMM register. This can done for either FP or integer values, for FP use - * movaps (move aligned packed single) or integer use movdqa (move double quad - * aligned). It doesn't make a performance difference which instruction is used - * since Nehalem (original Core i7) was released. However, the movaps is a byte - * shorter, so that is the one we'll use for now. (same for unaligned). 
- */ -#define MOVADQ movaps -#define MOVUDQ movups - -#ifdef __x86_64__ - -# constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.POLY, "aM", @progbits, 16 -.align 16 -POLY: .octa 0xC2000000000000000000000000000001 -.section .rodata.cst16.TWOONE, "aM", @progbits, 16 -.align 16 -TWOONE: .octa 0x00000001000000000000000000000001 - -.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 -.align 16 -SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F -.section .rodata.cst16.MASK1, "aM", @progbits, 16 -.align 16 -MASK1: .octa 0x0000000000000000ffffffffffffffff -.section .rodata.cst16.MASK2, "aM", @progbits, 16 -.align 16 -MASK2: .octa 0xffffffffffffffff0000000000000000 -.section .rodata.cst16.ONE, "aM", @progbits, 16 -.align 16 -ONE: .octa 0x00000000000000000000000000000001 -.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 -.align 16 -F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 -.section .rodata.cst16.dec, "aM", @progbits, 16 -.align 16 -dec: .octa 0x1 -.section .rodata.cst16.enc, "aM", @progbits, 16 -.align 16 -enc: .octa 0x2 - -# order of these constants should not change. 
-# more specifically, ALL_F should follow SHIFT_MASK, -# and zero should follow ALL_F -.section .rodata, "a", @progbits -.align 16 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff - .octa 0x00000000000000000000000000000000 - -.text - -#define AadHash 16*0 -#define AadLen 16*1 -#define InLen (16*1)+8 -#define PBlockEncKey 16*2 -#define OrigIV 16*3 -#define CurCount 16*4 -#define PBlockLen 16*5 -#define HashKey 16*6 // store HashKey <<1 mod poly here -#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here -#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here -#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here -#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 - // bits of HashKey <<1 mod poly here - //(for Karatsuba purposes) -#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 - // bits of HashKey^2 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 - // bits of HashKey^3 <<1 mod poly here - // (for Karatsuba purposes) -#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 - // bits of HashKey^4 <<1 mod poly here - // (for Karatsuba purposes) - -#define arg1 rdi -#define arg2 rsi -#define arg3 rdx -#define arg4 rcx -#define arg5 r8 -#define arg6 r9 -#define keysize 2*15*16(%arg1) -#endif - #define STATE1 %xmm0 #define STATE2 %xmm4 @@ -162,1409 +64,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff #define TKEYP T1 #endif -.macro FUNC_SAVE - push %r12 - push %r13 - push %r14 -# -# states of %xmm registers %xmm6:%xmm15 not saved -# all %xmm registers are clobbered -# -.endm - - -.macro FUNC_RESTORE - pop %r14 - pop %r13 - pop %r12 -.endm - -# Precompute hashkeys. -# Input: Hash subkey. -# Output: HashKeys stored in gcm_context_data. Only needs to be called -# once per key. -# clobbers r12, and tmp xmm registers. 
-.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 - mov \SUBKEY, %r12 - movdqu (%r12), \TMP3 - movdqa SHUF_MASK(%rip), \TMP2 - pshufb \TMP2, \TMP3 - - # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) - - movdqa \TMP3, \TMP2 - psllq $1, \TMP3 - psrlq $63, \TMP2 - movdqa \TMP2, \TMP1 - pslldq $8, \TMP2 - psrldq $8, \TMP1 - por \TMP2, \TMP3 - - # reduce HashKey<<1 - - pshufd $0x24, \TMP1, \TMP2 - pcmpeqd TWOONE(%rip), \TMP2 - pand POLY(%rip), \TMP2 - pxor \TMP2, \TMP3 - movdqu \TMP3, HashKey(%arg2) - - movdqa \TMP3, \TMP5 - pshufd $78, \TMP3, \TMP1 - pxor \TMP3, \TMP1 - movdqu \TMP1, HashKey_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^2<<1 (mod poly) - movdqu \TMP5, HashKey_2(%arg2) -# HashKey_2 = HashKey^2<<1 (mod poly) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_2_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_3(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_3_k(%arg2) - - GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 -# TMP5 = HashKey^3<<1 (mod poly) - movdqu \TMP5, HashKey_4(%arg2) - pshufd $78, \TMP5, \TMP1 - pxor \TMP5, \TMP1 - movdqu \TMP1, HashKey_4_k(%arg2) -.endm - -# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. 
-# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 -.macro GCM_INIT Iv SUBKEY AAD AADLEN - mov \AADLEN, %r11 - mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length - xor %r11d, %r11d - mov %r11, InLen(%arg2) # ctx_data.in_length = 0 - mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 - mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 - mov \Iv, %rax - movdqu (%rax), %xmm0 - movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv - - movdqa SHUF_MASK(%rip), %xmm2 - pshufb %xmm2, %xmm0 - movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv - - PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 - movdqu HashKey(%arg2), %xmm13 - - CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ - %xmm4, %xmm5, %xmm6 -.endm - -# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context -# struct has been initialized by GCM_INIT. -# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK -# Clobbers rax, r10-r13, and xmm0-xmm15 -.macro GCM_ENC_DEC operation - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - add %arg5, InLen(%arg2) - - xor %r11d, %r11d # initialise the data pointer offset as zero - PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation - - sub %r11, %arg5 # sub partial block data used - mov %arg5, %r13 # save the number of bytes - - and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) - mov %r13, %r12 - # Encrypt/Decrypt first few blocks - - and $(3<<4), %r12 - jz .L_initial_num_blocks_is_0_\@ - cmp $(2<<4), %r12 - jb .L_initial_num_blocks_is_1_\@ - je .L_initial_num_blocks_is_2_\@ -.L_initial_num_blocks_is_3_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation - sub $48, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_2_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, 
\operation - sub $32, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_1_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation - sub $16, %r13 - jmp .L_initial_blocks_\@ -.L_initial_num_blocks_is_0_\@: - INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ -%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation -.L_initial_blocks_\@: - - # Main loop - Encrypt/Decrypt remaining blocks - - test %r13, %r13 - je .L_zero_cipher_left_\@ - sub $64, %r13 - je .L_four_cipher_left_\@ -.L_crypt_by_4_\@: - GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ - %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ - %xmm7, %xmm8, enc - add $64, %r11 - sub $64, %r13 - jne .L_crypt_by_4_\@ -.L_four_cipher_left_\@: - GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ -%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 -.L_zero_cipher_left_\@: - movdqu %xmm8, AadHash(%arg2) - movdqu %xmm0, CurCount(%arg2) - - mov %arg5, %r13 - and $15, %r13 # %r13 = arg5 (mod 16) - je .L_multiple_of_16_bytes_\@ - - mov %r13, PBlockLen(%arg2) - - # Handle the last <16 Byte block separately - paddd ONE(%rip), %xmm0 # INCR CNT to get Yn - movdqu %xmm0, CurCount(%arg2) - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm0 - - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) - movdqu %xmm0, PBlockEncKey(%arg2) - - cmp $16, %arg5 - jge .L_large_enough_update_\@ - - lea (%arg4,%r11,1), %r10 - mov %r13, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 - jmp .L_data_read_\@ - -.L_large_enough_update_\@: - sub $16, %r11 - add %r13, %r11 - - # receive the last <16 Byte block - movdqu (%arg4, %r11, 1), %xmm1 - - sub %r13, %r11 - add $16, %r11 - - lea SHIFT_MASK+16(%rip), %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - sub %r13, %r12 - # get the appropriate shuffle mask 
- movdqu (%r12), %xmm2 - # shift right 16-r13 bytes - pshufb %xmm2, %xmm1 - -.L_data_read_\@: - lea ALL_F+16(%rip), %r12 - sub %r13, %r12 - -.ifc \operation, dec - movdqa %xmm1, %xmm2 -.endif - pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) - movdqu (%r12), %xmm1 - # get the appropriate mask to mask out top 16-r13 bytes of xmm0 - pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 -.ifc \operation, dec - pand %xmm1, %xmm2 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10 ,%xmm2 - - pxor %xmm2, %xmm8 -.else - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10,%xmm0 - - pxor %xmm0, %xmm8 -.endif - - movdqu %xmm8, AadHash(%arg2) -.ifc \operation, enc - # GHASH computation for the last <16 byte block - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm0 back to output as ciphertext - pshufb %xmm10, %xmm0 -.endif - - # Output %r13 bytes - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - mov %rax, (%arg3 , %r11, 1) - add $8, %r11 - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - mov %al, (%arg3, %r11, 1) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_multiple_of_16_bytes_\@: -.endm - -# GCM_COMPLETE Finishes update of tag of last partial block -# Output: Authorization Tag (AUTH_TAG) -# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN - movdqu AadHash(%arg2), %xmm8 - movdqu HashKey(%arg2), %xmm13 - - mov PBlockLen(%arg2), %r12 - - test %r12, %r12 - je .L_partial_done\@ - - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 - -.L_partial_done\@: - mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - movd %r12d, %xmm15 # len(A) in %xmm15 - mov InLen(%arg2), %r12 - shl $3, %r12 # len(C) in bits (*128) - movq %r12, %xmm1 - - pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 - pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) - pxor %xmm15, %xmm8 - GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, 
%xmm5, %xmm6 - # final GHASH computation - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm8 - - movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 - ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) - pxor %xmm8, %xmm0 -.L_return_T_\@: - mov \AUTHTAG, %r10 # %r10 = authTag - mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len - cmp $16, %r11 - je .L_T_16_\@ - cmp $8, %r11 - jl .L_T_4_\@ -.L_T_8_\@: - movq %xmm0, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - psrldq $8, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_4_\@: - movd %xmm0, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - psrldq $4, %xmm0 - test %r11, %r11 - je .L_return_T_done_\@ -.L_T_123_\@: - movd %xmm0, %eax - cmp $2, %r11 - jl .L_T_1_\@ - mov %ax, (%r10) - cmp $2, %r11 - je .L_return_T_done_\@ - add $2, %r10 - sar $16, %eax -.L_T_1_\@: - mov %al, (%r10) - jmp .L_return_T_done_\@ -.L_T_16_\@: - movdqu %xmm0, (%r10) -.L_return_T_done_\@: -.endm - -#ifdef __x86_64__ -/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -* -* -* Input: A and B (128-bits each, bit-reflected) -* Output: C = A*B*x mod poly, (i.e. >>1 ) -* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
-* -*/ -.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 - movdqa \GH, \TMP1 - pshufd $78, \GH, \TMP2 - pshufd $78, \HK, \TMP3 - pxor \GH, \TMP2 # TMP2 = a1+a0 - pxor \HK, \TMP3 # TMP3 = b1+b0 - pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \HK, \GH # GH = a0*b0 - pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) - pxor \GH, \TMP2 - pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \GH - pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK - - # first phase of the reduction - - movdqa \GH, \TMP2 - movdqa \GH, \TMP3 - movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - pslld $31, \TMP2 # packed right shift <<31 - pslld $30, \TMP3 # packed right shift <<30 - pslld $25, \TMP4 # packed right shift <<25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift TMP5 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \GH - - # second phase of the reduction - - movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 - # in in order to perform - # independent shifts - movdqa \GH,\TMP3 - movdqa \GH,\TMP4 - psrld $1,\TMP2 # packed left shift >>1 - psrld $2,\TMP3 # packed left shift >>2 - psrld $7,\TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \GH - pxor \TMP1, \GH # result is in TMP1 -.endm - -# Reads DLEN bytes starting at DPTR and stores in XMMDst -# where 0 < DLEN < 16 -# Clobbers %rax, DLEN and XMM1 -.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst - cmp $8, \DLEN - jl .L_read_lt8_\@ - mov (\DPTR), %rax - movq %rax, \XMMDst - sub $8, \DLEN - jz .L_done_read_partial_block_\@ - xor %eax, %eax -.L_read_next_byte_\@: - shl $8, %rax - mov 7(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_\@ - movq %rax, \XMM1 - pslldq $8, \XMM1 - 
por \XMM1, \XMMDst - jmp .L_done_read_partial_block_\@ -.L_read_lt8_\@: - xor %eax, %eax -.L_read_next_byte_lt8_\@: - shl $8, %rax - mov -1(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_lt8_\@ - movq %rax, \XMMDst -.L_done_read_partial_block_\@: -.endm - -# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. -# clobbers r10-11, xmm14 -.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ - TMP6 TMP7 - MOVADQ SHUF_MASK(%rip), %xmm14 - mov \AAD, %r10 # %r10 = AAD - mov \AADLEN, %r11 # %r11 = aadLen - pxor \TMP7, \TMP7 - pxor \TMP6, \TMP6 - - cmp $16, %r11 - jl .L_get_AAD_rest\@ -.L_get_AAD_blocks\@: - movdqu (%r10), \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP7, \TMP6 - GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - add $16, %r10 - sub $16, %r11 - cmp $16, %r11 - jge .L_get_AAD_blocks\@ - - movdqu \TMP6, \TMP7 - - /* read the last <16B of AAD */ -.L_get_AAD_rest\@: - test %r11, %r11 - je .L_get_AAD_done\@ - - READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 - pshufb %xmm14, \TMP7 # byte-reflect the AAD data - pxor \TMP6, \TMP7 - GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 - movdqu \TMP7, \TMP6 - -.L_get_AAD_done\@: - movdqu \TMP6, AadHash(%arg2) -.endm - -# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks -# between update calls. 
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK -# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context -# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 -.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ - AAD_HASH operation - mov PBlockLen(%arg2), %r13 - test %r13, %r13 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks - # Read in input data without over reading - cmp $16, \PLAIN_CYPH_LEN - jl .L_fewer_than_16_bytes_\@ - movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp .L_data_read_\@ - -.L_fewer_than_16_bytes_\@: - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 - mov \PLAIN_CYPH_LEN, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 - - mov PBlockLen(%arg2), %r13 - -.L_data_read_\@: # Finished reading in data - - movdqu PBlockEncKey(%arg2), %xmm9 - movdqu HashKey(%arg2), %xmm13 - - lea SHIFT_MASK(%rip), %r12 - - # adjust the shuffle mask pointer to be able to shift r13 bytes - # r16-r13 is the number of bytes in plaintext mod 16) - add %r13, %r12 - movdqu (%r12), %xmm2 # get the appropriate shuffle mask - pshufb %xmm2, %xmm9 # shift right r13 bytes - -.ifc \operation, dec - movdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_1_\@ - sub %r10, %r12 -.L_no_extra_mask_1_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 - - pand %xmm1, %xmm3 - movdqa SHUF_MASK(%rip), %xmm10 - pshufb %xmm10, %xmm3 - pshufb %xmm2, %xmm3 - pxor %xmm3, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_1_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, 
%xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_dec_done_\@ -.L_partial_incomplete_1_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_dec_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) -.else - pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_2_\@ - sub %r10, %r12 -.L_no_extra_mask_2_\@: - - movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - pand %xmm1, %xmm9 - - movdqa SHUF_MASK(%rip), %xmm1 - pshufb %xmm1, %xmm9 - pshufb %xmm2, %xmm9 - pxor %xmm9, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_2_\@ - - # GHASH computation for the last <16 Byte block - GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax, %eax - - mov %rax, PBlockLen(%arg2) - jmp .L_encode_done_\@ -.L_partial_incomplete_2_\@: - add \PLAIN_CYPH_LEN, PBlockLen(%arg2) -.L_encode_done_\@: - movdqu \AAD_HASH, AadHash(%arg2) - - movdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm9 back to output as ciphertext - pshufb %xmm10, %xmm9 - pshufb %xmm2, %xmm9 -.endif - # output encrypted Bytes - test %r10, %r10 - jl .L_partial_fill_\@ - mov %r13, %r12 - mov $16, %r13 - # Set r13 to be the number of bytes to write out - sub %r12, %r13 - jmp .L_count_set_\@ -.L_partial_fill_\@: - mov \PLAIN_CYPH_LEN, %r13 -.L_count_set_\@: - movdqa %xmm9, %xmm0 - movq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $8, \DATA_OFFSET - psrldq $8, %xmm0 - movq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $1, \DATA_OFFSET - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_partial_block_done_\@: -.endm # PARTIAL_BLOCK - -/* -* if a = 
number of total plaintext bytes -* b = floor(a/16) -* num_initial_blocks = b mod 4 -* encrypt the initial num_initial_blocks blocks and apply ghash on -* the ciphertext -* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers -* are clobbered -* arg1, %arg2, %arg3 are used as a pointer only, not modified -*/ - - -.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ - XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation - MOVADQ SHUF_MASK(%rip), %xmm14 - - movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 - - # start AES for num_initial_blocks blocks - - movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 - -.if (\i == 5) || (\i == 6) || (\i == 7) - - MOVADQ ONE(%RIP),\TMP1 - MOVADQ 0(%arg1),\TMP2 -.irpc index, \i_seq - paddd \TMP1, \XMM0 # INCR Y0 -.ifc \operation, dec - movdqa \XMM0, %xmm\index -.else - MOVADQ \XMM0, %xmm\index -.endif - pshufb %xmm14, %xmm\index # perform a 16 byte swap - pxor \TMP2, %xmm\index -.endr - lea 0x10(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - -.Laes_loop_initial_\@: - MOVADQ (%r10),\TMP1 -.irpc index, \i_seq - aesenc \TMP1, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_initial_\@ - - MOVADQ (%r10), \TMP1 -.irpc index, \i_seq - aesenclast \TMP1, %xmm\index # Last Round -.endr -.irpc index, \i_seq - movdqu (%arg4 , %r11, 1), \TMP1 - pxor \TMP1, %xmm\index - movdqu %xmm\index, (%arg3 , %r11, 1) - # write back plaintext/ciphertext for num_initial_blocks - add $16, %r11 - -.ifc \operation, dec - movdqa \TMP1, %xmm\index -.endif - pshufb %xmm14, %xmm\index - - # prepare plaintext/ciphertext for GHASH computation -.endr -.endif - - # apply GHASH on num_initial_blocks blocks - -.if \i == 5 - pxor %xmm5, %xmm6 - GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 6 - pxor 
%xmm6, %xmm7 - GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.elseif \i == 7 - pxor %xmm7, %xmm8 - GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 -.endif - cmp $64, %r13 - jl .L_initial_blocks_done\@ - # no need for precomputed values -/* -* -* Precomputations for HashKey parallel with encryption of first 4 blocks. -* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i -*/ - MOVADQ ONE(%RIP),\TMP1 - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM1 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM2 - pshufb %xmm14, \XMM2 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM3 - pshufb %xmm14, \XMM3 # perform a 16 byte swap - - paddd \TMP1, \XMM0 # INCR Y0 - MOVADQ \XMM0, \XMM4 - pshufb %xmm14, \XMM4 # perform a 16 byte swap - - MOVADQ 0(%arg1),\TMP1 - pxor \TMP1, \XMM1 - pxor \TMP1, \XMM2 - pxor \TMP1, \XMM3 - pxor \TMP1, \XMM4 -.irpc index, 1234 # do 4 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr -.irpc index, 56789 # do next 5 rounds - movaps 0x10*\index(%arg1), \TMP1 - aesenc \TMP1, \XMM1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 -.endr - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_pre_done\@ - -.Laes_loop_pre_\@: - MOVADQ (%r10),\TMP2 -.irpc index, 1234 - aesenc \TMP2, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_pre_\@ - -.Laes_loop_pre_done\@: - MOVADQ (%r10), \TMP2 - aesenclast \TMP2, \XMM1 - aesenclast \TMP2, \XMM2 - aesenclast \TMP2, \XMM3 - aesenclast \TMP2, \XMM4 - movdqu 16*0(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM1 -.ifc \operation, dec - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM1 -.endif - movdqu 16*1(%arg4 , %r11 
, 1), \TMP1 - pxor \TMP1, \XMM2 -.ifc \operation, dec - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM2 -.endif - movdqu 16*2(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM3 -.ifc \operation, dec - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM3 -.endif - movdqu 16*3(%arg4 , %r11 , 1), \TMP1 - pxor \TMP1, \XMM4 -.ifc \operation, dec - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) - movdqa \TMP1, \XMM4 -.else - movdqu \XMM1, 16*0(%arg3 , %r11 , 1) - movdqu \XMM2, 16*1(%arg3 , %r11 , 1) - movdqu \XMM3, 16*2(%arg3 , %r11 , 1) - movdqu \XMM4, 16*3(%arg3 , %r11 , 1) -.endif - - add $64, %r11 - pshufb %xmm14, \XMM1 # perform a 16 byte swap - pxor \XMMDst, \XMM1 -# combine GHASHed value with the corresponding ciphertext - pshufb %xmm14, \XMM2 # perform a 16 byte swap - pshufb %xmm14, \XMM3 # perform a 16 byte swap - pshufb %xmm14, \XMM4 # perform a 16 byte swap - -.L_initial_blocks_done\@: - -.endm - -/* -* encrypt 4 blocks at a time -* ghash the 4 previously encrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # 
perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate 
the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_enc_done\@ - -.Laes_loop_par_enc\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_enc\@ - -.Laes_loop_par_enc_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # Round 10 - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer - movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left 
shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* -* decrypt 4 blocks at a time -* ghash the 4 previously decrypted ciphertext blocks -* arg1, %arg3, %arg4 are used as pointers only, not modified -* %r11 is the data offset value -*/ -.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ -TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation - - movdqa \XMM1, \XMM5 - movdqa \XMM2, \XMM6 - movdqa \XMM3, \XMM7 - movdqa \XMM4, \XMM8 - - movdqa SHUF_MASK(%rip), %xmm15 - # multiply TMP5 * HashKey using karatsuba - - movdqa \XMM5, \TMP4 - pshufd $78, \XMM5, \TMP6 - pxor \XMM5, \TMP6 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 - movdqa \XMM0, \XMM1 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM2 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM3 - paddd ONE(%rip), \XMM0 # INCR CNT - movdqa \XMM0, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte 
swap - pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb %xmm15, \XMM2 # perform a 16 byte swap - pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor (%arg1), \XMM1 - pxor (%arg1), \XMM2 - pxor (%arg1), \XMM3 - pxor (%arg1), \XMM4 - movdqu HashKey_4_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) - movaps 0x10(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 1 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movaps 0x20(%arg1), \TMP1 - aesenc \TMP1, \XMM1 # Round 2 - aesenc \TMP1, \XMM2 - aesenc \TMP1, \XMM3 - aesenc \TMP1, \XMM4 - movdqa \XMM6, \TMP1 - pshufd $78, \XMM6, \TMP2 - pxor \XMM6, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 - movaps 0x30(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 3 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 - movaps 0x40(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 4 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_3_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movaps 0x50(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 5 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM6, \XMM5 - pxor \TMP2, \TMP6 - movdqa \XMM7, \TMP1 - pshufd $78, \XMM7, \TMP2 - pxor \XMM7, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - - # Multiply TMP5 * HashKey using karatsuba - - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x60(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 6 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 - movaps 0x70(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 7 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - movdqu HashKey_2_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = 
(a1+a0)*(b1+b0) - movaps 0x80(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 8 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pxor \TMP1, \TMP4 -# accumulate the results in TMP4:XMM5, TMP6 holds the middle part - pxor \XMM7, \XMM5 - pxor \TMP2, \TMP6 - - # Multiply XMM8 * HashKey - # XMM8 and TMP5 hold the values for the two operands - - movdqa \XMM8, \TMP1 - pshufd $78, \XMM8, \TMP2 - pxor \XMM8, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - movaps 0x90(%arg1), \TMP3 - aesenc \TMP3, \XMM1 # Round 9 - aesenc \TMP3, \XMM2 - aesenc \TMP3, \XMM3 - aesenc \TMP3, \XMM4 - pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 - lea 0xa0(%arg1),%r10 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - sub $4,%eax # 128->0, 192->2, 256->4 - jz .Laes_loop_par_dec_done\@ - -.Laes_loop_par_dec\@: - MOVADQ (%r10),\TMP3 -.irpc index, 1234 - aesenc \TMP3, %xmm\index -.endr - add $16,%r10 - sub $1,%eax - jnz .Laes_loop_par_dec\@ - -.Laes_loop_par_dec_done\@: - MOVADQ (%r10), \TMP3 - aesenclast \TMP3, \XMM1 # last round - aesenclast \TMP3, \XMM2 - aesenclast \TMP3, \XMM3 - aesenclast \TMP3, \XMM4 - movdqu HashKey_k(%arg2), \TMP5 - pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqu (%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK - movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM1 - movdqu 16(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK - movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM2 - movdqu 32(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK - movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM3 - movdqu 48(%arg4,%r11,1), \TMP3 - pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK - movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer - movdqa \TMP3, \XMM4 - pshufb %xmm15, \XMM1 # perform a 16 byte swap - pshufb %xmm15, \XMM2 # perform a 16 byte swap 
- pshufb %xmm15, \XMM3 # perform a 16 byte swap - pshufb %xmm15, \XMM4 # perform a 16 byte swap - - pxor \TMP4, \TMP1 - pxor \XMM8, \XMM5 - pxor \TMP6, \TMP2 - pxor \TMP1, \TMP2 - pxor \XMM5, \TMP2 - movdqa \TMP2, \TMP3 - pslldq $8, \TMP3 # left shift TMP3 2 DWs - psrldq $8, \TMP2 # right shift TMP2 2 DWs - pxor \TMP3, \XMM5 - pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 - - # first phase of reduction - - movdqa \XMM5, \TMP2 - movdqa \XMM5, \TMP3 - movdqa \XMM5, \TMP4 -# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently - pslld $31, \TMP2 # packed right shift << 31 - pslld $30, \TMP3 # packed right shift << 30 - pslld $25, \TMP4 # packed right shift << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP5 - psrldq $4, \TMP5 # right shift T5 1 DW - pslldq $12, \TMP2 # left shift T2 3 DWs - pxor \TMP2, \XMM5 - - # second phase of reduction - - movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 - movdqa \XMM5,\TMP3 - movdqa \XMM5,\TMP4 - psrld $1, \TMP2 # packed left shift >>1 - psrld $2, \TMP3 # packed left shift >>2 - psrld $7, \TMP4 # packed left shift >>7 - pxor \TMP3,\TMP2 # xor the shifted versions - pxor \TMP4,\TMP2 - pxor \TMP5, \TMP2 - pxor \TMP2, \XMM5 - pxor \TMP1, \XMM5 # result is in TMP1 - - pxor \XMM5, \XMM1 -.endm - -/* GHASH the last 4 ciphertext blocks. 
*/ -.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ -TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst - - # Multiply TMP6 * HashKey (using Karatsuba) - - movdqa \XMM1, \TMP6 - pshufd $78, \XMM1, \TMP2 - pxor \XMM1, \TMP2 - movdqu HashKey_4(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 - movdqu HashKey_4_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - movdqa \XMM1, \XMMDst - movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM2, \TMP1 - pshufd $78, \XMM2, \TMP2 - pxor \XMM2, \TMP2 - movdqu HashKey_3(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 - movdqu HashKey_3_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM2, \XMMDst - pxor \TMP2, \XMM1 -# results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - - movdqa \XMM3, \TMP1 - pshufd $78, \XMM3, \TMP2 - pxor \XMM3, \TMP2 - movdqu HashKey_2(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 - movdqu HashKey_2_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM3, \XMMDst - pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 - - # Multiply TMP1 * HashKey (using Karatsuba) - movdqa \XMM4, \TMP1 - pshufd $78, \XMM4, \TMP2 - pxor \XMM4, \TMP2 - movdqu HashKey(%arg2), \TMP5 - pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 - movdqu HashKey_k(%arg2), \TMP4 - pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) - pxor \TMP1, \TMP6 - pxor \XMM4, \XMMDst - pxor \XMM1, \TMP2 - pxor \TMP6, \TMP2 - pxor \XMMDst, \TMP2 - # middle section of the temp results combined as in karatsuba algorithm - movdqa \TMP2, \TMP4 - pslldq $8, \TMP4 # left shift TMP4 2 DWs - psrldq $8, \TMP2 # right 
shift TMP2 2 DWs - pxor \TMP4, \XMMDst - pxor \TMP2, \TMP6 -# TMP6:XMMDst holds the result of the accumulated carry-less multiplications - # first phase of the reduction - movdqa \XMMDst, \TMP2 - movdqa \XMMDst, \TMP3 - movdqa \XMMDst, \TMP4 -# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently - pslld $31, \TMP2 # packed right shifting << 31 - pslld $30, \TMP3 # packed right shifting << 30 - pslld $25, \TMP4 # packed right shifting << 25 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - movdqa \TMP2, \TMP7 - psrldq $4, \TMP7 # right shift TMP7 1 DW - pslldq $12, \TMP2 # left shift TMP2 3 DWs - pxor \TMP2, \XMMDst - - # second phase of the reduction - movdqa \XMMDst, \TMP2 - # make 3 copies of XMMDst for doing 3 shift operations - movdqa \XMMDst, \TMP3 - movdqa \XMMDst, \TMP4 - psrld $1, \TMP2 # packed left shift >> 1 - psrld $2, \TMP3 # packed left shift >> 2 - psrld $7, \TMP4 # packed left shift >> 7 - pxor \TMP3, \TMP2 # xor the shifted versions - pxor \TMP4, \TMP2 - pxor \TMP7, \TMP2 - pxor \TMP2, \XMMDst - pxor \TMP6, \XMMDst # reduced result is in XMMDst -.endm - - -/* Encryption of a single block -* uses eax & r10 -*/ - -.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 - - pxor (%arg1), \XMM0 - mov keysize,%eax - shr $2,%eax # 128->4, 192->6, 256->8 - add $5,%eax # 128->9, 192->11, 256->13 - lea 16(%arg1), %r10 # get first expanded key address - -_esb_loop_\@: - MOVADQ (%r10),\TMP1 - aesenc \TMP1,\XMM0 - add $16,%r10 - sub $1,%eax - jnz _esb_loop_\@ - - MOVADQ (%r10),\TMP1 - aesenclast \TMP1,\XMM0 -.endm - -/***************************************************************************** -* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) -* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) -* // concatenated with 0x00000001. 
16-byte aligned pointer. -* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. -* const u8 *aad, // Additional Authentication Data (AAD) -* u64 aad_len) // Length of AAD in bytes. -*/ -SYM_FUNC_START(aesni_gcm_init) - FUNC_SAVE - GCM_INIT %arg3, %arg4,%arg5, %arg6 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init) - -/***************************************************************************** -* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_enc_update) - FUNC_SAVE - GCM_ENC_DEC enc - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update) - -/***************************************************************************** -* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *out, // Ciphertext output. Encrypt in-place is allowed. -* const u8 *in, // Plaintext input -* u64 plaintext_len, // Length of data in bytes for encryption. -*/ -SYM_FUNC_START(aesni_gcm_dec_update) - FUNC_SAVE - GCM_ENC_DEC dec - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update) - -/***************************************************************************** -* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. -* struct gcm_context_data *data, -* // context data -* u8 *auth_tag, // Authenticated Tag output. -* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), -* // 12 or 8. 
-*/ -SYM_FUNC_START(aesni_gcm_finalize) - FUNC_SAVE - GCM_COMPLETE %arg3 %arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize) - -#endif - SYM_FUNC_START_LOCAL(_key_expansion_256a) pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S deleted file mode 100644 index 8c9749ed0651..000000000000 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ /dev/null @@ -1,2804 +0,0 @@ -######################################################################## -# Copyright (c) 2013, Intel Corporation -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# * Neither the name of the Intel Corporation nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# -# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL CORPORATION OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR -# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -######################################################################## -## -## Authors: -## Erdinc Ozturk <erdinc.ozturk@intel.com> -## Vinodh Gopal <vinodh.gopal@intel.com> -## James Guilford <james.guilford@intel.com> -## Tim Chen <tim.c.chen@linux.intel.com> -## -## References: -## This code was derived and highly optimized from the code described in paper: -## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation -## on Intel Architecture Processors. August, 2010 -## The details of the implementation is explained in: -## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode -## on Intel Architecture Processors. October, 2012. 
-## -## Assumptions: -## -## -## -## iv: -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | Salt (From the SA) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | Initialization Vector | -## | (This is the sequence number from IPSec header) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x1 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## -## -## AAD: -## AAD padded to 128 bits with 0 -## for example, assume AAD is a u32 vector -## -## if AAD is 8 bytes: -## AAD[3] = {A0, A1}# -## padded AAD in xmm register = {A1 A0 0 0} -## -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | SPI (A1) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 32-bit Sequence Number (A0) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x0 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## AAD Format with 32-bit Sequence Number -## -## if AAD is 12 bytes: -## AAD[3] = {A0, A1, A2}# -## padded AAD in xmm register = {A2 A1 A0 0} -## -## 0 1 2 3 -## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | SPI (A2) | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 64-bit Extended Sequence Number {A1,A0} | -## | | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## | 0x0 | -## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -## -## AAD Format with 64-bit Extended Sequence Number -## -## -## aadLen: -## from the definition of the spec, aadLen can only be 8 or 12 bytes. -## The code additionally supports aadLen of length 16 bytes. 
-## -## TLen: -## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. -## -## poly = x^128 + x^127 + x^126 + x^121 + 1 -## throughout the code, one tab and two tab indentations are used. one tab is -## for GHASH part, two tabs is for AES part. -## - -#include <linux/linkage.h> - -# constants in mergeable sections, linker can reorder and merge -.section .rodata.cst16.POLY, "aM", @progbits, 16 -.align 16 -POLY: .octa 0xC2000000000000000000000000000001 - -.section .rodata.cst16.POLY2, "aM", @progbits, 16 -.align 16 -POLY2: .octa 0xC20000000000000000000001C2000000 - -.section .rodata.cst16.TWOONE, "aM", @progbits, 16 -.align 16 -TWOONE: .octa 0x00000001000000000000000000000001 - -.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 -.align 16 -SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F - -.section .rodata.cst16.ONE, "aM", @progbits, 16 -.align 16 -ONE: .octa 0x00000000000000000000000000000001 - -.section .rodata.cst16.ONEf, "aM", @progbits, 16 -.align 16 -ONEf: .octa 0x01000000000000000000000000000000 - -# order of these constants should not change. 
-# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F -.section .rodata, "a", @progbits -.align 16 -SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 -ALL_F: .octa 0xffffffffffffffffffffffffffffffff - .octa 0x00000000000000000000000000000000 - -.text - - -#define AadHash 16*0 -#define AadLen 16*1 -#define InLen (16*1)+8 -#define PBlockEncKey 16*2 -#define OrigIV 16*3 -#define CurCount 16*4 -#define PBlockLen 16*5 - -HashKey = 16*6 # store HashKey <<1 mod poly here -HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here -HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here -HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here -HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here -HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here -HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here -HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here -HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) -HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) -HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) -HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) -HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) -HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) -HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) -HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) - -#define arg1 %rdi -#define arg2 %rsi -#define arg3 %rdx -#define arg4 %rcx -#define arg5 %r8 -#define arg6 %r9 -#define keysize 2*15*16(arg1) - -i = 0 -j = 0 - -out_order = 0 -in_order = 1 -DEC = 0 -ENC = 1 - -.macro define_reg r n -reg_\r = %xmm\n -.endm - -.macro setreg -.altmacro -define_reg i %i -define_reg j %j -.noaltmacro -.endm - -TMP1 = 16*0 # Temporary storage for AAD -TMP2 = 16*1 # Temporary 
storage for AES State 2 (State 1 is stored in an XMM register) -TMP3 = 16*2 # Temporary storage for AES State 3 -TMP4 = 16*3 # Temporary storage for AES State 4 -TMP5 = 16*4 # Temporary storage for AES State 5 -TMP6 = 16*5 # Temporary storage for AES State 6 -TMP7 = 16*6 # Temporary storage for AES State 7 -TMP8 = 16*7 # Temporary storage for AES State 8 - -VARIABLE_OFFSET = 16*8 - -################################ -# Utility Macros -################################ - -.macro FUNC_SAVE - push %r12 - push %r13 - push %r15 - - push %rbp - mov %rsp, %rbp - - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes -.endm - -.macro FUNC_RESTORE - mov %rbp, %rsp - pop %rbp - - pop %r15 - pop %r13 - pop %r12 -.endm - -# Encryption of a single block -.macro ENCRYPT_SINGLE_BLOCK REP XMM0 - vpxor (arg1), \XMM0, \XMM0 - i = 1 - setreg -.rep \REP - vaesenc 16*i(arg1), \XMM0, \XMM0 - i = (i+1) - setreg -.endr - vaesenclast 16*i(arg1), \XMM0, \XMM0 -.endm - -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r15, rax -.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP - vmovdqu AadHash(arg2), %xmm8 - vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey - add arg5, InLen(arg2) - - # initialize the data pointer offset as zero - xor %r11d, %r11d - - PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC - sub %r11, arg5 - - mov arg5, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - (r13 mod 16) - - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz .L_initial_num_blocks_is_0\@ - - cmp $7, %r12 - je .L_initial_num_blocks_is_7\@ - cmp $6, %r12 - je .L_initial_num_blocks_is_6\@ - cmp $5, %r12 - je .L_initial_num_blocks_is_5\@ - cmp $4, %r12 - je .L_initial_num_blocks_is_4\@ - cmp $3, %r12 - je .L_initial_num_blocks_is_3\@ - cmp $2, %r12 - je .L_initial_num_blocks_is_2\@ - - jmp .L_initial_num_blocks_is_1\@ - 
-.L_initial_num_blocks_is_7\@: - \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_6\@: - \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_5\@: - \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_4\@: - \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_3\@: - \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_2\@: - \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_1\@: - \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp .L_initial_blocks_encrypted\@ - -.L_initial_num_blocks_is_0\@: - \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - - -.L_initial_blocks_encrypted\@: - test %r13, %r13 - je .L_zero_cipher_left\@ - - sub $128, %r13 - je .L_eight_cipher_left\@ 
- - - - - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - -.L_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg .L_encrypt_by_8\@ - - - - add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne .L_encrypt_by_8_new\@ - - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp .L_eight_cipher_left\@ - -.L_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne .L_encrypt_by_8_new\@ - - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - - - -.L_eight_cipher_left\@: - \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 - - -.L_zero_cipher_left\@: - vmovdqu %xmm14, AadHash(arg2) - vmovdqu %xmm9, CurCount(arg2) - - # check for 0 length - mov arg5, %r13 - and $15, %r13 # r13 = (arg5 mod 16) - - je .L_multiple_of_16_bytes\@ - - # handle the last <16 Byte block separately - - mov %r13, PBlockLen(arg2) - - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vmovdqu %xmm9, CurCount(arg2) - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) - vmovdqu %xmm9, PBlockEncKey(arg2) - - cmp $16, arg5 - jge .L_large_enough_update\@ - - lea (arg4,%r11,1), %r10 - mov %r13, %r12 - - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 - - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) - - jmp .L_final_ghash_mul\@ - -.L_large_enough_update\@: - sub $16, %r11 - add %r13, %r11 - - # receive the last <16 Byte block - vmovdqu (arg4, %r11, 
1), %xmm1 - - sub %r13, %r11 - add $16, %r11 - - lea SHIFT_MASK+16(%rip), %r12 - # adjust the shuffle mask pointer to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - sub %r13, %r12 - # get the appropriate shuffle mask - vmovdqu (%r12), %xmm2 - # shift right 16-r13 bytes - vpshufb %xmm2, %xmm1, %xmm1 - -.L_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - - vmovdqu %xmm14, AadHash(arg2) - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - - vmovdqu %xmm14, AadHash(arg2) - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif - - - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left\@ - - mov %rax, (arg3 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, %rax - sub $8, %r13 - -.L_less_than_8_bytes_left\@: - movb %al, (arg3 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne .L_less_than_8_bytes_left\@ - ############################# - -.L_multiple_of_16_bytes\@: -.endm - - -# GCM_COMPLETE Finishes update of tag of last partial block -# Output: Authorization Tag (AUTH_TAG) -# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 -.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN - vmovdqu AadHash(arg2), %xmm14 - vmovdqu HashKey(arg2), %xmm13 - - mov PBlockLen(arg2), %r12 - test %r12, %r12 - je .L_partial_done\@ - - #GHASH 
computation for the last <16 Byte block - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - -.L_partial_done\@: - mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # len(A) in xmm15 - - mov InLen(arg2), %r12 - shl $3, %r12 # len(C) in bits (*128) - vmovq %r12, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) - - vpxor %xmm15, %xmm14, %xmm14 - \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - - vmovdqu OrigIV(arg2), %xmm9 - - ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) - - vpxor %xmm14, %xmm9, %xmm9 - - - -.L_return_T\@: - mov \AUTH_TAG, %r10 # r10 = authTag - mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len - - cmp $16, %r11 - je .L_T_16\@ - - cmp $8, %r11 - jl .L_T_4\@ - -.L_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - test %r11, %r11 - je .L_return_T_done\@ -.L_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - test %r11, %r11 - je .L_return_T_done\@ -.L_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl .L_T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je .L_return_T_done\@ - add $2, %r10 - sar $16, %eax -.L_T_1\@: - mov %al, (%r10) - jmp .L_return_T_done\@ - -.L_T_16\@: - vmovdqu %xmm9, (%r10) - -.L_return_T_done\@: -.endm - -.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 - - mov \AAD, %r10 # r10 = AAD - mov \AADLEN, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor \T8, \T8, \T8 - vpxor \T7, \T7, \T7 - cmp $16, %r11 - jl .L_get_AAD_rest8\@ -.L_get_AAD_blocks\@: - vmovdqu (%r10), \T7 - vpshufb SHUF_MASK(%rip), \T7, \T7 - vpxor \T7, \T8, \T8 - \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge 
.L_get_AAD_blocks\@ - vmovdqu \T8, \T7 - test %r11, %r11 - je .L_get_AAD_done\@ - - vpxor \T7, \T7, \T7 - - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -.L_get_AAD_rest8\@: - cmp $4, %r11 - jle .L_get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, \T7, \T7 - vpxor \T1, \T7, \T7 - jmp .L_get_AAD_rest8\@ -.L_get_AAD_rest4\@: - test %r11, %r11 - jle .L_get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, \T7, \T7 - vpxor \T1, \T7, \T7 -.L_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and a pair of shuffle masks */ - leaq ALL_F(%rip), %r11 - subq %r12, %r11 - vmovdqu 16(%r11), \T1 - andq $~3, %r11 - vpshufb (%r11), \T7, \T7 - vpand \T1, \T7, \T7 -.L_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), \T7, \T7 - vpxor \T8, \T7, \T7 - \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 - -.L_get_AAD_done\@: - vmovdqu \T7, AadHash(arg2) -.endm - -.macro INIT GHASH_MUL PRECOMPUTE - mov arg6, %r11 - mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length - xor %r11d, %r11d - mov %r11, InLen(arg2) # ctx_data.in_length = 0 - - mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 - mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 - mov arg3, %rax - movdqu (%rax), %xmm0 - movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv - - vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 - movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv - - vmovdqu (arg4), %xmm6 # xmm6 = HashKey - - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, 
%xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly - ####################################################################### - vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly - - CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 - - \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 -.endm - - -# Reads DLEN bytes starting at DPTR and stores in XMMDst -# where 0 < DLEN < 16 -# Clobbers %rax, DLEN -.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst - vpxor \XMMDst, \XMMDst, \XMMDst - - cmp $8, \DLEN - jl .L_read_lt8_\@ - mov (\DPTR), %rax - vpinsrq $0, %rax, \XMMDst, \XMMDst - sub $8, \DLEN - jz .L_done_read_partial_block_\@ - xor %eax, %eax -.L_read_next_byte_\@: - shl $8, %rax - mov 7(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_\@ - vpinsrq $1, %rax, \XMMDst, \XMMDst - jmp .L_done_read_partial_block_\@ -.L_read_lt8_\@: - xor %eax, %eax -.L_read_next_byte_lt8_\@: - shl $8, %rax - mov -1(\DPTR, \DLEN, 1), %al - dec \DLEN - jnz .L_read_next_byte_lt8_\@ - vpinsrq $0, %rax, \XMMDst, \XMMDst -.L_done_read_partial_block_\@: -.endm - -# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks -# between update calls. 
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK -# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context -# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 -.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ - AAD_HASH ENC_DEC - mov PBlockLen(arg2), %r13 - test %r13, %r13 - je .L_partial_block_done_\@ # Leave Macro if no partial blocks - # Read in input data without over reading - cmp $16, \PLAIN_CYPH_LEN - jl .L_fewer_than_16_bytes_\@ - vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm - jmp .L_data_read_\@ - -.L_fewer_than_16_bytes_\@: - lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 - mov \PLAIN_CYPH_LEN, %r12 - READ_PARTIAL_BLOCK %r10 %r12 %xmm1 - - mov PBlockLen(arg2), %r13 - -.L_data_read_\@: # Finished reading in data - - vmovdqu PBlockEncKey(arg2), %xmm9 - vmovdqu HashKey(arg2), %xmm13 - - lea SHIFT_MASK(%rip), %r12 - - # adjust the shuffle mask pointer to be able to shift r13 bytes - # r16-r13 is the number of bytes in plaintext mod 16) - add %r13, %r12 - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes - -.if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm3 - pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_1_\@ - sub %r10, %r12 -.L_no_extra_mask_1_\@: - - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 - - vpand %xmm1, %xmm3, %xmm3 - vmovdqa SHUF_MASK(%rip), %xmm10 - vpshufb %xmm10, %xmm3, %xmm3 - vpshufb %xmm2, %xmm3, %xmm3 - vpxor %xmm3, \AAD_HASH, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_1_\@ - - # GHASH computation for the last 
<16 Byte block - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax,%eax - - mov %rax, PBlockLen(arg2) - jmp .L_dec_done_\@ -.L_partial_incomplete_1_\@: - add \PLAIN_CYPH_LEN, PBlockLen(arg2) -.L_dec_done_\@: - vmovdqu \AAD_HASH, AadHash(arg2) -.else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - - mov \PLAIN_CYPH_LEN, %r10 - add %r13, %r10 - # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling - sub $16, %r10 - # Determine if partial block is not being filled and - # shift mask accordingly - jge .L_no_extra_mask_2_\@ - sub %r10, %r12 -.L_no_extra_mask_2_\@: - - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 - # get the appropriate mask to mask out bottom r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 - - vmovdqa SHUF_MASK(%rip), %xmm1 - vpshufb %xmm1, %xmm9, %xmm9 - vpshufb %xmm2, %xmm9, %xmm9 - vpxor %xmm9, \AAD_HASH, \AAD_HASH - - test %r10, %r10 - jl .L_partial_incomplete_2_\@ - - # GHASH computation for the last <16 Byte block - \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %eax,%eax - - mov %rax, PBlockLen(arg2) - jmp .L_encode_done_\@ -.L_partial_incomplete_2_\@: - add \PLAIN_CYPH_LEN, PBlockLen(arg2) -.L_encode_done_\@: - vmovdqu \AAD_HASH, AadHash(arg2) - - vmovdqa SHUF_MASK(%rip), %xmm10 - # shuffle xmm9 back to output as ciphertext - vpshufb %xmm10, %xmm9, %xmm9 - vpshufb %xmm2, %xmm9, %xmm9 -.endif - # output encrypted Bytes - test %r10, %r10 - jl .L_partial_fill_\@ - mov %r13, %r12 - mov $16, %r13 - # Set r13 to be the number of bytes to write out - sub %r12, %r13 - jmp .L_count_set_\@ -.L_partial_fill_\@: - mov \PLAIN_CYPH_LEN, %r13 -.L_count_set_\@: - vmovdqa %xmm9, %xmm0 - vmovq %xmm0, %rax - cmp $8, %r13 - jle .L_less_than_8_bytes_left_\@ - - mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $8, \DATA_OFFSET - psrldq $8, %xmm0 - vmovq %xmm0, %rax - sub $8, %r13 -.L_less_than_8_bytes_left_\@: - movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) - add $1, \DATA_OFFSET - shr $8, %rax - sub 
$1, %r13 - jne .L_less_than_8_bytes_left_\@ -.L_partial_block_done_\@: -.endm # PARTIAL_BLOCK - -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -############################################################################### -.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 - - vpshufd $0b01001110, \GH, \T2 - vpshufd $0b01001110, \HK, \T3 - vpxor \GH , \T2, \T2 # T2 = (a1+a0) - vpxor \HK , \T3, \T3 # T3 = (b1+b0) - - vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 - vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 - vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) - vpxor \GH, \T2,\T2 - vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 - - vpslldq $8, \T2,\T3 # shift-L T3 2 DWs - vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs - vpxor \T3, \GH, \GH - vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK - - #first phase of the reduction - vpslld $31, \GH, \T2 # packed right shifting << 31 - vpslld $30, \GH, \T3 # packed right shifting shift << 30 - vpslld $25, \GH, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T5 # shift-R T5 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \GH, \GH # first phase of the reduction complete - - #second phase of the reduction - - vpsrld $1,\GH, \T2 # packed left shifting >> 1 - vpsrld $2,\GH, \T3 # packed left shifting >> 2 - vpsrld $7,\GH, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T5, \T2, \T2 - vpxor \T2, \GH, \GH - vpxor \T1, \GH, \GH # the result is in GH - - -.endm - -.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 - - # Haskey_i_k holds XORed values of the low and 
high parts of the Haskey_i - vmovdqa \HK, \T5 - - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_2_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqu \T5, HashKey_3(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_3_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqu \T5, HashKey_4(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_4_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqu \T5, HashKey_5(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_5_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqu \T5, HashKey_6(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_6_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqu \T5, HashKey_7(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_7_k(arg2) - - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqu \T5, HashKey_8(arg2) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqu \T1, HashKey_8_k(arg2) - -.endm - -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, arg4 are used as pointers only, not modified - -.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC - i = 
(8-\num_initial_blocks) - setreg - vmovdqu AadHash(arg2), reg_i - - # start AES for num_initial_blocks blocks - vmovdqu CurCount(arg2), \CTR - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr - - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = 1 - setreg -.rep \REP - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = (j+1) - setreg -.endr - - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg4, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr - - - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg - -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here - - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 - - cmp $128, %r13 - jl .L_initial_blocks_done\@ # no need for precomputed constants - -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - - 
vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, \XMM8, \XMM8 - - i = 1 - setreg -.rep \REP # do REP rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, \XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr - - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 - - vmovdqu (arg4, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif - - vmovdqu 16*1(arg4, %r11), \T1 - 
vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif - - vmovdqu 16*2(arg4, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif - - vmovdqu 16*3(arg4, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - - vmovdqu 16*4(arg4, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif - - vmovdqu 16*5(arg4, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif - - vmovdqu 16*6(arg4, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif - - vmovdqu 16*7(arg4, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif - - add $128, %r11 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - -############################################################################### - -.L_initial_blocks_done\@: - -.endm - -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3, arg4 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 
XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) - -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif - - - ####################################################################### - - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 - - ####################################################################### - - - - - - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc 
\T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - ####################################################################### - - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - - vpshufd $0b01001110, \T2, \T6 - vpxor \T2, \T6, \T6 - - vmovdqu HashKey_8_k(arg2), \T5 - vpclmulqdq $0x00, \T5, \T6, \T6 - - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_7_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - ####################################################################### - - vmovdqa TMP3(%rsp), \T1 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_6_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, 
\XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP4(%rsp), \T1 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_5_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - vmovdqa TMP5(%rsp), \T1 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_4_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP6(%rsp), \T1 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_3_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP7(%rsp), \T1 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, 
\T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_2_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - ####################################################################### - - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqa TMP8(%rsp), \T1 - vmovdqu HashKey(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqu HashKey_k(arg2), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vpxor \T4, \T6, \T6 - vpxor \T7, \T6, \T6 - - vmovdqu 16*10(arg1), \T5 - - i = 11 - setreg -.rep (\REP-9) - - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqu 16*i(arg1), \T5 - i = i + 1 - setreg -.endr - - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg4, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg4, %r11), reg_j - vmovdqu \T3, 16*i(arg3, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### - - - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 - - - - ####################################################################### - #first phase of the reduction - ####################################################################### - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - 
vpslld $25, \T7, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer - .endif - - ####################################################################### - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - ####################################################################### - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - - vpxor \T6, \XMM1, \XMM1 - - - -.endm - - -# GHASH the last 4 ciphertext 
blocks. -.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - - ## Karatsuba Method - - - vpshufd $0b01001110, \XMM1, \T2 - vpxor \XMM1, \T2, \T2 - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 - - vmovdqu HashKey_8_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM2, \T2 - vpxor \XMM2, \T2, \T2 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_7_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM3, \T2 - vpxor \XMM3, \T2, \T2 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_6_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM4, \T2 - vpxor \XMM4, \T2, \T2 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_5_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM5, \T2 - vpxor \XMM5, \T2, \T2 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_4_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM6, \T2 - vpxor \XMM6, \T2, \T2 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_3_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, 
\XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM7, \T2 - vpxor \XMM7, \T2, \T2 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_2_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vpshufd $0b01001110, \XMM8, \T2 - vpxor \XMM8, \T2, \T2 - vmovdqu HashKey(arg2), \T5 - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vmovdqu HashKey_k(arg2), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of - # the accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 - - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - -.endm - -############################################################# -#void aesni_gcm_precomp_avx_gen2 -# (gcm_data *my_ctx_data, -# gcm_context_data *data, -# u8 
*hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -############################################################# -SYM_FUNC_START(aesni_gcm_init_avx_gen2) - FUNC_SAVE - INIT GHASH_MUL_AVX, PRECOMPUTE_AVX - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init_avx_gen2) - -############################################################################### -#void aesni_gcm_enc_update_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ -# const u8 *in, /* Plaintext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) - FUNC_SAVE - mov keysize, %eax - cmp $32, %eax - je key_256_enc_update - cmp $16, %eax - je key_128_enc_update - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 - FUNC_RESTORE - RET -key_128_enc_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 - FUNC_RESTORE - RET -key_256_enc_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) - -############################################################################### -#void aesni_gcm_dec_update_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Plaintext output. 
Decrypt in-place is allowed. */ -# const u8 *in, /* Ciphertext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_dec_update - cmp $16, %eax - je key_128_dec_update - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 - FUNC_RESTORE - RET -key_128_dec_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 - FUNC_RESTORE - RET -key_256_dec_update: - GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) - -############################################################################### -#void aesni_gcm_finalize_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_finalize - cmp $16, %eax - je key_128_finalize - # must be 192 - GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 - FUNC_RESTORE - RET -key_128_finalize: - GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 - FUNC_RESTORE - RET -key_256_finalize: - GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) - -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. 
>>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. -############################################################################### -.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 - - vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 - vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 - vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 - vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 - vpxor \T3, \GH, \GH - - - vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs - vpslldq $8 , \GH, \GH # shift-L GH 2 DWs - - vpxor \T3, \T1, \T1 - vpxor \T2, \GH, \GH - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \GH, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L T2 2 DWs - - vpxor \T2, \GH, \GH # first phase of the reduction complete - ####################################################################### - #second phase of the reduction - vpclmulqdq $0x00, \GH, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \GH, \T3, \GH - vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \GH, \GH # second phase of the reduction complete - ####################################################################### - vpxor \T1, \GH, \GH # the result is in GH - - -.endm - -.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 - - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqu \T5, HashKey_3(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqu \T5, HashKey_4(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, 
\T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqu \T5, HashKey_5(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqu \T5, HashKey_6(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqu \T5, HashKey_7(arg2) - - GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqu \T5, HashKey_8(arg2) - -.endm - -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, arg4 are used as pointers only, not modified - -.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER - i = (8-\num_initial_blocks) - setreg - vmovdqu AadHash(arg2), reg_i - - # start AES for num_initial_blocks blocks - vmovdqu CurCount(arg2), \CTR - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr - - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = 1 - setreg -.rep \REP - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - j = (j+1) - setreg -.endr - - - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr - - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg4, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for - # num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, 
reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations - i = (i+1) - setreg -.endr - - - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg - -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here - - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 - - cmp $128, %r13 - jl .L_initial_blocks_done\@ # no need for precomputed constants - -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, \XMM7 - vpxor \T_key, 
\XMM8, \XMM8 - - i = 1 - setreg -.rep \REP # do REP rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, \XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr - - - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 - - vmovdqu (arg4, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif - - vmovdqu 16*1(arg4, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif - - vmovdqu 16*2(arg4, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif - - vmovdqu 16*3(arg4, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif - - vmovdqu 16*4(arg4, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif - - vmovdqu 16*5(arg4, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif - - vmovdqu 16*6(arg4, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif - - vmovdqu 16*7(arg4, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg3 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif - - add $128, %r11 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with - # the 
corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - -############################################################################### - -.L_initial_blocks_done\@: - - -.endm - - - -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3, arg4 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC - - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) - -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), 
\CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif - - - ####################################################################### - - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 - - ####################################################################### - - - - - - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - ####################################################################### - - vmovdqu HashKey_8(arg2), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 - vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 - vpxor \T5, \T6, \T6 - - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqu HashKey_7(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - 
vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - ####################################################################### - - vmovdqa TMP3(%rsp), \T1 - vmovdqu HashKey_6(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP4(%rsp), \T1 - vmovdqu HashKey_5(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - vmovdqa TMP5(%rsp), \T1 - vmovdqu HashKey_4(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, 
\XMM8, \XMM8 - - vmovdqa TMP6(%rsp), \T1 - vmovdqu HashKey_3(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP7(%rsp), \T1 - vmovdqu HashKey_2(arg2), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - - ####################################################################### - - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqa TMP8(%rsp), \T1 - vmovdqu HashKey(arg2), \T5 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T1 - - - vmovdqu 16*10(arg1), \T5 - - i = 11 - setreg -.rep (\REP-9) - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqu 16*i(arg1), \T5 - i = i + 1 - setreg -.endr - - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg4, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg4, %r11), 
reg_j - vmovdqu \T3, 16*i(arg3, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### - - - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 - - - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer - .endif - - ####################################################################### - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T1, \T1 # the result is in T1 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb 
SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - - vpxor \T1, \XMM1, \XMM1 - - - -.endm - - -# GHASH the last 4 ciphertext blocks. -.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - - ## Karatsuba Method - - vmovdqu HashKey_8(arg2), \T5 - - vpshufd $0b01001110, \XMM1, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM1, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 - - vpclmulqdq $0x00, \T3, \T2, \XMM1 - - ###################### - - vmovdqu HashKey_7(arg2), \T5 - vpshufd $0b01001110, \XMM2, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM2, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_6(arg2), \T5 - vpshufd $0b01001110, \XMM3, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM3, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_5(arg2), \T5 - vpshufd $0b01001110, \XMM4, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM4, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_4(arg2), \T5 - vpshufd $0b01001110, \XMM5, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM5, \T2, \T2 - vpxor \T5, 
\T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_3(arg2), \T5 - vpshufd $0b01001110, \XMM6, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM6, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey_2(arg2), \T5 - vpshufd $0b01001110, \XMM7, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM7, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqu HashKey(arg2), \T5 - vpshufd $0b01001110, \XMM8, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM8, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 - - - - - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 - - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the - # accumulated carry-less multiplications - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - - - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - 
vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T6, \T6 # the result is in T6 -.endm - - - -############################################################# -#void aesni_gcm_init_avx_gen4 -# (gcm_data *my_ctx_data, -# gcm_context_data *data, -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -############################################################# -SYM_FUNC_START(aesni_gcm_init_avx_gen4) - FUNC_SAVE - INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_init_avx_gen4) - -############################################################################### -#void aesni_gcm_enc_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ -# const u8 *in, /* Plaintext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ -############################################################################### -SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_enc_update4 - cmp $16, %eax - je key_128_enc_update4 - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 - FUNC_RESTORE - RET -key_128_enc_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 - FUNC_RESTORE - RET -key_256_enc_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) - -############################################################################### -#void aesni_gcm_dec_update_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ -# const u8 *in, /* Ciphertext input */ -# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ -############################################################################### -SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_dec_update4 - cmp $16, %eax - je key_128_dec_update4 - # must be 192 - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 - FUNC_RESTORE - RET -key_128_dec_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 - FUNC_RESTORE - RET -key_256_dec_update4: - GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) - -############################################################################### -#void aesni_gcm_finalize_avx_gen4( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# gcm_context_data *data, -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ -############################################################################### -SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) - FUNC_SAVE - mov keysize,%eax - cmp $32, %eax - je key_256_finalize4 - cmp $16, %eax - je key_128_finalize4 - # must be 192 - GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 - FUNC_RESTORE - RET -key_128_finalize4: - GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 - FUNC_RESTORE - RET -key_256_finalize4: - GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 - FUNC_RESTORE - RET -SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index ef031655b2d3..cd37de5ec404 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Support for Intel AES-NI instructions. 
This file contains glue - * code, the real AES implementation is in intel-aes_asm.S. + * Support for AES-NI and VAES instructions. This file contains glue code. + * The real AES implementations are in aesni-intel_asm.S and other .S files. * * Copyright (C) 2008, Intel Corp. * Author: Huang Ying <ying.huang@intel.com> @@ -13,6 +13,8 @@ * Tadeusz Struk (tadeusz.struk@intel.com) * Aidan O'Mahony (aidan.o.mahony@intel.com) * Copyright (c) 2010, Intel Corporation. + * + * Copyright 2024 Google LLC */ #include <linux/hardirq.h> @@ -44,41 +46,11 @@ #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) -/* This data is stored at the end of the crypto_tfm struct. - * It's a type of per "session" data storage location. - * This needs to be 16 byte aligned. - */ -struct aesni_rfc4106_gcm_ctx { - u8 hash_subkey[16] AESNI_ALIGN_ATTR; - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; - u8 nonce[4]; -}; - -struct generic_gcmaes_ctx { - u8 hash_subkey[16] AESNI_ALIGN_ATTR; - struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; -}; - struct aesni_xts_ctx { struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR; struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR; }; -#define GCM_BLOCK_LEN 16 - -struct gcm_context_data { - /* init, update and finalize context data */ - u8 aad_hash[GCM_BLOCK_LEN]; - u64 aad_length; - u64 in_length; - u8 partial_block_enc_key[GCM_BLOCK_LEN]; - u8 orig_IV[GCM_BLOCK_LEN]; - u8 current_counter[GCM_BLOCK_LEN]; - u64 partial_block_len; - u64 unused; - u8 hash_keys[GCM_BLOCK_LEN * 16]; -}; - static inline void *aes_align_addr(void *addr) { if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN) @@ -103,9 +75,6 @@ asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -#define AVX_GEN2_OPTSIZE 640 -#define AVX_GEN4_OPTSIZE 4096 - 
asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); @@ -118,23 +87,6 @@ asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); -/* Scatter / Gather routines, with args similar to above */ -asmlinkage void aesni_gcm_init(void *ctx, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, const u8 *aad, - unsigned long aad_len); -asmlinkage void aesni_gcm_enc_update(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, @@ -154,67 +106,6 @@ asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys, u8 *out, unsigned int num_bytes, unsigned int byte_ctr); - -/* - * asmlinkage void aesni_gcm_init_avx_gen2() - * gcm_data *my_ctx_data, context data - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
- */ -asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, - const u8 *aad, - unsigned long aad_len); - -asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -/* - * asmlinkage void aesni_gcm_init_avx_gen4() - * gcm_data *my_ctx_data, context data - * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. - */ -asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data, - struct gcm_context_data *gdata, - u8 *iv, - u8 *hash_subkey, - const u8 *aad, - unsigned long aad_len); - -asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, unsigned long plaintext_len); -asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx, - struct gcm_context_data *gdata, u8 *out, - const u8 *in, - unsigned long ciphertext_len); -asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, - struct gcm_context_data *gdata, - u8 *auth_tag, unsigned long auth_tag_len); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); - -static inline struct -aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) -{ - return aes_align_addr(crypto_aead_ctx(tfm)); -} - -static inline struct -generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) -{ - return aes_align_addr(crypto_aead_ctx(tfm)); -} #endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) @@ -588,280 +479,6 @@ static int xctr_crypt(struct skcipher_request *req) } return err; } - -static int 
aes_gcm_derive_hash_subkey(const struct crypto_aes_ctx *aes_key, - u8 hash_subkey[AES_BLOCK_SIZE]) -{ - static const u8 zeroes[AES_BLOCK_SIZE]; - - aes_encrypt(aes_key, hash_subkey, zeroes); - return 0; -} - -static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, - unsigned int key_len) -{ - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); - - if (key_len < 4) - return -EINVAL; - - /*Account for 4 byte nonce at the end.*/ - key_len -= 4; - - memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); - - return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, - ctx->hash_subkey); -} - -/* This is the Integrity Check Value (aka the authentication tag) length and can - * be 8, 12 or 16 bytes long. */ -static int common_rfc4106_set_authsize(struct crypto_aead *aead, - unsigned int authsize) -{ - switch (authsize) { - case 8: - case 12: - case 16: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, - unsigned int authsize) -{ - switch (authsize) { - case 4: - case 8: - case 12: - case 13: - case 14: - case 15: - case 16: - break; - default: - return -EINVAL; - } - - return 0; -} - -static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, - unsigned int assoclen, u8 *hash_subkey, - u8 *iv, void *aes_ctx, u8 *auth_tag, - unsigned long auth_tag_len) -{ - u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); - struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); - unsigned long left = req->cryptlen; - struct scatter_walk assoc_sg_walk; - struct skcipher_walk walk; - bool do_avx, do_avx2; - u8 *assocmem = NULL; - u8 *assoc; - int err; - - if (!enc) - left -= auth_tag_len; - - do_avx = (left >= AVX_GEN2_OPTSIZE); - do_avx2 = (left >= AVX_GEN4_OPTSIZE); - - /* Linearize assoc, if not already linear */ - if (req->src->length >= assoclen && 
req->src->length) { - scatterwalk_start(&assoc_sg_walk, req->src); - assoc = scatterwalk_map(&assoc_sg_walk); - } else { - gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? - GFP_KERNEL : GFP_ATOMIC; - - /* assoc can be any length, so must be on heap */ - assocmem = kmalloc(assoclen, flags); - if (unlikely(!assocmem)) - return -ENOMEM; - assoc = assocmem; - - scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); - } - - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) - aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc, - assoclen); - else if (static_branch_likely(&gcm_use_avx) && do_avx) - aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc, - assoclen); - else - aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); - kernel_fpu_end(); - - if (!assocmem) - scatterwalk_unmap(assoc); - else - kfree(assocmem); - - err = enc ? skcipher_walk_aead_encrypt(&walk, req, false) - : skcipher_walk_aead_decrypt(&walk, req, false); - - while (walk.nbytes > 0) { - kernel_fpu_begin(); - if (static_branch_likely(&gcm_use_avx2) && do_avx2) { - if (enc) - aesni_gcm_enc_update_avx_gen4(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - else - aesni_gcm_dec_update_avx_gen4(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - } else if (static_branch_likely(&gcm_use_avx) && do_avx) { - if (enc) - aesni_gcm_enc_update_avx_gen2(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - else - aesni_gcm_dec_update_avx_gen2(aes_ctx, data, - walk.dst.virt.addr, - walk.src.virt.addr, - walk.nbytes); - } else if (enc) { - aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr, - walk.src.virt.addr, walk.nbytes); - } else { - aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr, - walk.src.virt.addr, walk.nbytes); - } - kernel_fpu_end(); - - err = skcipher_walk_done(&walk, 0); - } - - if (err) - return err; - - kernel_fpu_begin(); - if 
(static_branch_likely(&gcm_use_avx2) && do_avx2) - aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag, - auth_tag_len); - else if (static_branch_likely(&gcm_use_avx) && do_avx) - aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, - auth_tag_len); - else - aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len); - kernel_fpu_end(); - - return 0; -} - -static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, - u8 *hash_subkey, u8 *iv, void *aes_ctx) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 auth_tag[16]; - int err; - - err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx, - auth_tag, auth_tag_len); - if (err) - return err; - - scatterwalk_map_and_copy(auth_tag, req->dst, - req->assoclen + req->cryptlen, - auth_tag_len, 1); - return 0; -} - -static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, - u8 *hash_subkey, u8 *iv, void *aes_ctx) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 auth_tag_msg[16]; - u8 auth_tag[16]; - int err; - - err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx, - auth_tag, auth_tag_len); - if (err) - return err; - - /* Copy out original auth_tag */ - scatterwalk_map_and_copy(auth_tag_msg, req->src, - req->assoclen + req->cryptlen - auth_tag_len, - auth_tag_len, 0); - - /* Compare generated tag with passed in tag. 
*/ - if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) { - memzero_explicit(auth_tag, sizeof(auth_tag)); - return -EBADMSG; - } - return 0; -} - -static int helper_rfc4106_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - unsigned int i; - __be32 counter = cpu_to_be32(1); - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length equal */ - /* to 16 or 20 bytes */ - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; - - return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, - aes_ctx); -} - -static int helper_rfc4106_decrypt(struct aead_request *req) -{ - __be32 counter = cpu_to_be32(1); - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - unsigned int i; - - if (unlikely(req->assoclen != 16 && req->assoclen != 20)) - return -EINVAL; - - /* Assuming we are supporting rfc4106 64-bit extended */ - /* sequence numbers We need to have the AAD length */ - /* equal to 16 or 20 bytes */ - - /* IV below built */ - for (i = 0; i < 4; i++) - *(iv+i) = ctx->nonce[i]; - for (i = 0; i < 8; i++) - *(iv+4+i) = req->iv[i]; - *((__be32 *)(iv+12)) = counter; - - return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, - aes_ctx); -} #endif static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, @@ -1216,11 +833,717 @@ 
DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); #endif +/* The common part of the x86_64 AES-GCM key struct */ +struct aes_gcm_key { + /* Expanded AES key and the AES key length in bytes */ + struct crypto_aes_ctx aes_key; + + /* RFC4106 nonce (used only by the rfc4106 algorithms) */ + u32 rfc4106_nonce; +}; + +/* Key struct used by the AES-NI implementations of AES-GCM */ +struct aes_gcm_key_aesni { + /* + * Common part of the key. The assembly code requires 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 16-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^8 through H^1. These are 128-bit values. + * They all have an extra factor of x^-1 and are byte-reversed. 16-byte + * alignment is required by the assembly code. + */ + u64 h_powers[8][2] __aligned(16); + + /* + * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd + * together. It's used for Karatsuba multiplication. 16-byte alignment + * is required by the assembly code. + */ + u64 h_powers_xored[8] __aligned(16); + + /* + * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte + * alignment is required by the assembly code. + */ + u64 h_times_x64[2] __aligned(16); +}; +#define AES_GCM_KEY_AESNI(key) \ + container_of((key), struct aes_gcm_key_aesni, base) +#define AES_GCM_KEY_AESNI_SIZE \ + (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) + +/* Key struct used by the VAES + AVX10 implementations of AES-GCM */ +struct aes_gcm_key_avx10 { + /* + * Common part of the key. The assembly code prefers 16-byte alignment + * for the round keys; we get this by them being located at the start of + * the struct and the whole struct being 64-byte aligned. + */ + struct aes_gcm_key base; + + /* + * Powers of the hash key H^16 through H^1. These are 128-bit values. 
+ * They all have an extra factor of x^-1 and are byte-reversed. This + * array is aligned to a 64-byte boundary to make it naturally aligned + * for 512-bit loads, which can improve performance. (The assembly code + * doesn't *need* the alignment; this is just an optimization.) + */ + u64 h_powers[16][2] __aligned(64); + + /* Three padding blocks required by the assembly code */ + u64 padding[3][2]; +}; +#define AES_GCM_KEY_AVX10(key) \ + container_of((key), struct aes_gcm_key_avx10, base) +#define AES_GCM_KEY_AVX10_SIZE \ + (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) + +/* + * These flags are passed to the AES-GCM helper functions to specify the + * specific version of AES-GCM (RFC4106 or not), whether it's encryption or + * decryption, and which assembly functions should be called. Assembly + * functions are selected using flags instead of function pointers to avoid + * indirect calls (which are very expensive on x86) regardless of inlining. + */ +#define FLAG_RFC4106 BIT(0) +#define FLAG_ENC BIT(1) +#define FLAG_AVX BIT(2) +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +# define FLAG_AVX10_256 BIT(3) +# define FLAG_AVX10_512 BIT(4) +#else + /* + * This should cause all calls to the AVX10 assembly functions to be + * optimized out, avoiding the need to ifdef each call individually. 
+ */ +# define FLAG_AVX10_256 0 +# define FLAG_AVX10_512 0 +#endif + +static inline struct aes_gcm_key * +aes_gcm_key_get(struct crypto_aead *tfm, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + return PTR_ALIGN(crypto_aead_ctx(tfm), 64); + else + return PTR_ALIGN(crypto_aead_ctx(tfm), 16); +} + +asmlinkage void +aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); +asmlinkage void +aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); +asmlinkage void +aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); +asmlinkage void +aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); + +static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) +{ + /* + * To make things a bit easier on the assembly side, the AVX10 + * implementations use the same key format. Therefore, a single + * function using 256-bit vectors would suffice here. However, it's + * straightforward to provide a 512-bit one because of how the assembly + * code is structured, and it works nicely because the total size of the + * key powers is a multiple of 512 bits. So we take advantage of that. + * + * A similar situation applies to the AES-NI implementations. 
+ */ + if (flags & FLAG_AVX10_512) + aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); + else if (flags & FLAG_AVX10_256) + aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); + else if (flags & FLAG_AVX) + aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); + else + aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key)); +} + +asmlinkage void +aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); +asmlinkage void +aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); +asmlinkage void +aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); + +static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], + const u8 *aad, int aadlen, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, + aad, aadlen); + else if (flags & FLAG_AVX) + aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); + else + aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); +} + +asmlinkage void +aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + +asmlinkage void +aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, 
u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); +asmlinkage void +aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline void +aes_gcm_update(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen, int flags) +{ + if (flags & FLAG_ENC) { + if (flags & FLAG_AVX10_512) + aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX10_256) + aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, + ghash_acc, src, dst, datalen); + } else { + if (flags & FLAG_AVX10_512) + aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX10_256) + aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + else + aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + src, dst, datalen); + } +} + +asmlinkage void +aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); +asmlinkage void 
+aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); +asmlinkage void +aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline void +aes_gcm_enc_final(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); + else + aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen); +} + +asmlinkage bool __must_check +aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); +asmlinkage bool __must_check +aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); +asmlinkage bool __must_check +aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, + const u32 le_ctr[4], const u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline bool __must_check +aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], + u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, + u8 tag[16], int taglen, int flags) +{ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) + return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), + 
le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else if (flags & FLAG_AVX) + return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); + else + return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, + total_aadlen, total_datalen, + tag, taglen); +} + +/* + * This is the Integrity Check Value (aka the authentication tag) length and can + * be 8, 12 or 16 bytes long. + */ +static int common_rfc4106_set_authsize(struct crypto_aead *aead, + unsigned int authsize) +{ + switch (authsize) { + case 8: + case 12: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + switch (authsize) { + case 4: + case 8: + case 12: + case 13: + case 14: + case 15: + case 16: + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * This is the setkey function for the x86_64 implementations of AES-GCM. It + * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes + * powers of the hash key. + * + * To comply with the crypto_aead API, this has to be usable in no-SIMD context. + * For that reason, this function includes a portable C implementation of the + * needed logic. However, the portable C implementation is very slow, taking + * about the same time as encrypting 37 KB of data. To be ready for users that + * may set a key even somewhat frequently, we therefore also include a SIMD + * assembly implementation, expanding the AES key using AES-NI and precomputing + * the hash key powers using PCLMULQDQ or VPCLMULQDQ. 
+ */ +static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + unsigned int keylen, int flags) +{ + struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); + int err; + + if (flags & FLAG_RFC4106) { + if (keylen < 4) + return -EINVAL; + keylen -= 4; + key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen); + } + + /* The assembly code assumes the following offsets. */ + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); + + if (likely(crypto_simd_usable())) { + err = aes_check_keylen(keylen); + if (err) + return err; + kernel_fpu_begin(); + aesni_set_key(&key->aes_key, raw_key, keylen); + aes_gcm_precompute(key, flags); + kernel_fpu_end(); + } else { + static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = { + [0] = 0xc2, [15] = 1 + }; + static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = { + [7] = 1, + }; + be128 h1 = {}; + be128 h; + int i; + + err = aes_expandkey(&key->aes_key, raw_key, keylen); + if (err) + return err; + + /* Encrypt the all-zeroes block to get the hash key H^1 */ + aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1); + + /* Compute H^1 * x^-1 */ + h = h1; + gf128mul_lle(&h, (const be128 *)x_to_the_minus1); + + /* Compute the needed key powers */ + if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { + struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i 
>= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + k->h_powers[i][1] = be64_to_cpu(h.a); + gf128mul_lle(&h, &h1); + } + memset(k->padding, 0, sizeof(k->padding)); + } else { + struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); + k->h_powers[i][1] = be64_to_cpu(h.a); + k->h_powers_xored[i] = k->h_powers[i][0] ^ + k->h_powers[i][1]; + gf128mul_lle(&h, &h1); + } + gf128mul_lle(&h1, (const be128 *)x_to_the_63); + k->h_times_x64[0] = be64_to_cpu(h1.b); + k->h_times_x64[1] = be64_to_cpu(h1.a); + } + } + return 0; +} + +/* + * Initialize @ghash_acc, then pass all @assoclen bytes of associated data + * (a.k.a. additional authenticated data) from @sg_src through the GHASH update + * assembly function. kernel_fpu_begin() must have already been called. + */ +static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16], + struct scatterlist *sg_src, unsigned int assoclen, + int flags) +{ + struct scatter_walk walk; + /* + * The assembly function requires that the length of any non-last + * segment of associated data be a multiple of 16 bytes, so this + * function does the buffering needed to achieve that. + */ + unsigned int pos = 0; + u8 buf[16]; + + memset(ghash_acc, 0, 16); + scatterwalk_start(&walk, sg_src); + + while (assoclen) { + unsigned int len_this_page = scatterwalk_clamp(&walk, assoclen); + void *mapped = scatterwalk_map(&walk); + const void *src = mapped; + unsigned int len; + + assoclen -= len_this_page; + scatterwalk_advance(&walk, len_this_page); + if (unlikely(pos)) { + len = min(len_this_page, 16 - pos); + memcpy(&buf[pos], src, len); + pos += len; + src += len; + len_this_page -= len; + if (pos < 16) + goto next; + aes_gcm_aad_update(key, ghash_acc, buf, 16, flags); + pos = 0; + } + len = len_this_page; + if (unlikely(assoclen)) /* Not the last segment yet? 
*/ + len = round_down(len, 16); + aes_gcm_aad_update(key, ghash_acc, src, len, flags); + src += len; + len_this_page -= len; + if (unlikely(len_this_page)) { + memcpy(buf, src, len_this_page); + pos = len_this_page; + } +next: + scatterwalk_unmap(mapped); + scatterwalk_pagedone(&walk, 0, assoclen); + if (need_resched()) { + kernel_fpu_end(); + kernel_fpu_begin(); + } + } + if (unlikely(pos)) + aes_gcm_aad_update(key, ghash_acc, buf, pos, flags); +} + + +/* __always_inline to optimize out the branches based on @flags */ +static __always_inline int +gcm_crypt(struct aead_request *req, int flags) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); + unsigned int assoclen = req->assoclen; + struct skcipher_walk walk; + unsigned int nbytes; + u8 ghash_acc[16]; /* GHASH accumulator */ + u32 le_ctr[4]; /* Counter in little-endian format */ + int taglen; + int err; + + /* Initialize the counter and determine the associated data length. */ + le_ctr[0] = 2; + if (flags & FLAG_RFC4106) { + if (unlikely(assoclen != 16 && assoclen != 20)) + return -EINVAL; + assoclen -= 8; + le_ctr[1] = get_unaligned_be32(req->iv + 4); + le_ctr[2] = get_unaligned_be32(req->iv + 0); + le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */ + } else { + le_ctr[1] = get_unaligned_be32(req->iv + 8); + le_ctr[2] = get_unaligned_be32(req->iv + 4); + le_ctr[3] = get_unaligned_be32(req->iv + 0); + } + + /* Begin walking through the plaintext or ciphertext. */ + if (flags & FLAG_ENC) + err = skcipher_walk_aead_encrypt(&walk, req, false); + else + err = skcipher_walk_aead_decrypt(&walk, req, false); + + /* + * Since the AES-GCM assembly code requires that at least three assembly + * functions be called to process any message (this is needed to support + * incremental updates cleanly), to reduce overhead we try to do all + * three calls in the same kernel FPU section if possible. 
We close the + * section and start a new one if there are multiple data segments or if + * rescheduling is needed while processing the associated data. + */ + kernel_fpu_begin(); + + /* Pass the associated data through GHASH. */ + gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); + + /* En/decrypt the data and pass the ciphertext through GHASH. */ + while ((nbytes = walk.nbytes) != 0) { + if (unlikely(nbytes < walk.total)) { + /* + * Non-last segment. In this case, the assembly + * function requires that the length be a multiple of 16 + * (AES_BLOCK_SIZE) bytes. The needed buffering of up + * to 16 bytes is handled by the skcipher_walk. Here we + * just need to round down to a multiple of 16. + */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + aes_gcm_update(key, le_ctr, ghash_acc, + walk.src.virt.addr, walk.dst.virt.addr, + nbytes, flags); + le_ctr[0] += nbytes / AES_BLOCK_SIZE; + kernel_fpu_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + kernel_fpu_begin(); + } else { + /* Last segment: process all remaining data. */ + aes_gcm_update(key, le_ctr, ghash_acc, + walk.src.virt.addr, walk.dst.virt.addr, + nbytes, flags); + err = skcipher_walk_done(&walk, 0); + /* + * The low word of the counter isn't used by the + * finalize, so there's no need to increment it here. + */ + } + } + if (err) + goto out; + + /* Finalize */ + taglen = crypto_aead_authsize(tfm); + if (flags & FLAG_ENC) { + /* Finish computing the auth tag. */ + aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen, + req->cryptlen, flags); + + /* Store the computed auth tag in the dst scatterlist. */ + scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen + + req->cryptlen, taglen, 1); + } else { + unsigned int datalen = req->cryptlen - taglen; + u8 tag[16]; + + /* Get the transmitted auth tag from the src scatterlist. 
*/ + scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen, + taglen, 0); + /* + * Finish computing the auth tag and compare it to the + * transmitted one. The assembly function does the actual tag + * comparison. Here, just check the boolean result. + */ + if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen, + datalen, tag, taglen, flags)) + err = -EBADMSG; + } +out: + kernel_fpu_end(); + return err; +} + +#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \ + ctxsize, priority) \ + \ +static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ + unsigned int keylen) \ +{ \ + return gcm_setkey(tfm, raw_key, keylen, (flags)); \ +} \ + \ +static int gcm_encrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_ENC); \ +} \ + \ +static int gcm_decrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags)); \ +} \ + \ +static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ + unsigned int keylen) \ +{ \ + return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \ +} \ + \ +static int rfc4106_encrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \ +} \ + \ +static int rfc4106_decrypt_##suffix(struct aead_request *req) \ +{ \ + return gcm_crypt(req, (flags) | FLAG_RFC4106); \ +} \ + \ +static struct aead_alg aes_gcm_algs_##suffix[] = { { \ + .setkey = gcm_setkey_##suffix, \ + .setauthsize = generic_gcmaes_set_authsize, \ + .encrypt = gcm_encrypt_##suffix, \ + .decrypt = gcm_decrypt_##suffix, \ + .ivsize = GCM_AES_IV_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .maxauthsize = 16, \ + .base = { \ + .cra_name = "__gcm(aes)", \ + .cra_driver_name = "__" generic_driver_name, \ + .cra_priority = (priority), \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = (ctxsize), \ + .cra_module = THIS_MODULE, \ + }, \ +}, { \ + .setkey = rfc4106_setkey_##suffix, \ + 
.setauthsize = common_rfc4106_set_authsize, \ + .encrypt = rfc4106_encrypt_##suffix, \ + .decrypt = rfc4106_decrypt_##suffix, \ + .ivsize = GCM_RFC4106_IV_SIZE, \ + .chunksize = AES_BLOCK_SIZE, \ + .maxauthsize = 16, \ + .base = { \ + .cra_name = "__rfc4106(gcm(aes))", \ + .cra_driver_name = "__" rfc_driver_name, \ + .cra_priority = (priority), \ + .cra_flags = CRYPTO_ALG_INTERNAL, \ + .cra_blocksize = 1, \ + .cra_ctxsize = (ctxsize), \ + .cra_module = THIS_MODULE, \ + }, \ +} }; \ + \ +static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \ + +/* aes_gcm_algs_aesni */ +DEFINE_GCM_ALGS(aesni, /* no flags */ 0, + "generic-gcm-aesni", "rfc4106-gcm-aesni", + AES_GCM_KEY_AESNI_SIZE, 400); + +/* aes_gcm_algs_aesni_avx */ +DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, + "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", + AES_GCM_KEY_AESNI_SIZE, 500); + +#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) +/* aes_gcm_algs_vaes_avx10_256 */ +DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, + "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", + AES_GCM_KEY_AVX10_SIZE, 700); + +/* aes_gcm_algs_vaes_avx10_512 */ +DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, + "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", + AES_GCM_KEY_AVX10_SIZE, 800); +#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ + /* * This is a list of CPU models that are known to suffer from downclocking when - * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS - * implementation with zmm registers won't be used by default. An - * implementation with ymm registers (256-bit vectors) will be used instead. + * zmm registers (512-bit vectors) are used. On these CPUs, the AES mode + * implementations with zmm registers won't be used by default. Implementations + * with ymm registers (256-bit vectors) will be used by default instead. 
*/ static const struct x86_cpu_id zmm_exclusion_list[] = { X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), @@ -1236,7 +1559,7 @@ static const struct x86_cpu_id zmm_exclusion_list[] = { {}, }; -static int __init register_xts_algs(void) +static int __init register_avx_algs(void) { int err; @@ -1246,6 +1569,11 @@ static int __init register_xts_algs(void) &aes_xts_simdalg_aesni_avx); if (err) return err; + err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx), + aes_gcm_simdalgs_aesni_avx); + if (err) + return err; #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_VAES) || @@ -1269,23 +1597,42 @@ static int __init register_xts_algs(void) &aes_xts_simdalg_vaes_avx10_256); if (err) return err; + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), + aes_gcm_simdalgs_vaes_avx10_256); + if (err) + return err; + + if (x86_match_cpu(zmm_exclusion_list)) { + int i; - if (x86_match_cpu(zmm_exclusion_list)) aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) + aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; + } err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, &aes_xts_simdalg_vaes_avx10_512); if (err) return err; + err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), + aes_gcm_simdalgs_vaes_avx10_512); + if (err) + return err; #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ return 0; } -static void unregister_xts_algs(void) +static void unregister_avx_algs(void) { if (aes_xts_simdalg_aesni_avx) simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, &aes_xts_simdalg_aesni_avx); + if (aes_gcm_simdalgs_aesni_avx[0]) + simd_unregister_aeads(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx), + aes_gcm_simdalgs_aesni_avx); #if defined(CONFIG_AS_VAES) && 
defined(CONFIG_AS_VPCLMULQDQ) if (aes_xts_simdalg_vaes_avx2) simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, @@ -1293,106 +1640,33 @@ static void unregister_xts_algs(void) if (aes_xts_simdalg_vaes_avx10_256) simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, &aes_xts_simdalg_vaes_avx10_256); + if (aes_gcm_simdalgs_vaes_avx10_256[0]) + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), + aes_gcm_simdalgs_vaes_avx10_256); if (aes_xts_simdalg_vaes_avx10_512) simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, &aes_xts_simdalg_vaes_avx10_512); + if (aes_gcm_simdalgs_vaes_avx10_512[0]) + simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), + aes_gcm_simdalgs_vaes_avx10_512); #endif } #else /* CONFIG_X86_64 */ -static int __init register_xts_algs(void) +static struct aead_alg aes_gcm_algs_aesni[0]; +static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0]; + +static int __init register_avx_algs(void) { return 0; } -static void unregister_xts_algs(void) +static void unregister_avx_algs(void) { } #endif /* !CONFIG_X86_64 */ -#ifdef CONFIG_X86_64 -static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, - unsigned int key_len) -{ - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); - - return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: - aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, - ctx->hash_subkey); -} - -static int generic_gcmaes_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - __be32 counter = cpu_to_be32(1); - - memcpy(iv, req->iv, 12); - *((__be32 *)(iv+12)) = counter; - - return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, - aes_ctx); -} - -static int 
generic_gcmaes_decrypt(struct aead_request *req) -{ - __be32 counter = cpu_to_be32(1); - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); - void *aes_ctx = &(ctx->aes_key_expanded); - u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); - u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); - - memcpy(iv, req->iv, 12); - *((__be32 *)(iv+12)) = counter; - - return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, - aes_ctx); -} - -static struct aead_alg aesni_aeads[] = { { - .setkey = common_rfc4106_set_key, - .setauthsize = common_rfc4106_set_authsize, - .encrypt = helper_rfc4106_encrypt, - .decrypt = helper_rfc4106_decrypt, - .ivsize = GCM_RFC4106_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "__rfc4106(gcm(aes))", - .cra_driver_name = "__rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, -}, { - .setkey = generic_gcmaes_set_key, - .setauthsize = generic_gcmaes_set_authsize, - .encrypt = generic_gcmaes_encrypt, - .decrypt = generic_gcmaes_decrypt, - .ivsize = GCM_AES_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "__gcm(aes)", - .cra_driver_name = "__generic-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - }, -} }; -#else -static struct aead_alg aesni_aeads[0]; -#endif - -static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; - static const struct x86_cpu_id aesni_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), {} @@ -1406,17 +1680,6 @@ static int __init aesni_init(void) if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; #ifdef CONFIG_X86_64 - if (boot_cpu_has(X86_FEATURE_AVX2)) { - pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - 
static_branch_enable(&gcm_use_avx); - static_branch_enable(&gcm_use_avx2); - } else - if (boot_cpu_has(X86_FEATURE_AVX)) { - pr_info("AVX version of gcm_enc/dec engaged.\n"); - static_branch_enable(&gcm_use_avx); - } else { - pr_info("SSE version of gcm_enc/dec engaged.\n"); - } if (boot_cpu_has(X86_FEATURE_AVX)) { /* optimize performance of ctr mode encryption transform */ static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); @@ -1434,8 +1697,9 @@ static int __init aesni_init(void) if (err) goto unregister_cipher; - err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); + err = simd_register_aeads_compat(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); if (err) goto unregister_skciphers; @@ -1447,22 +1711,22 @@ static int __init aesni_init(void) goto unregister_aeads; #endif /* CONFIG_X86_64 */ - err = register_xts_algs(); + err = register_avx_algs(); if (err) - goto unregister_xts; + goto unregister_avx; return 0; -unregister_xts: - unregister_xts_algs(); +unregister_avx: + unregister_avx_algs(); #ifdef CONFIG_X86_64 if (aesni_simd_xctr) simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); unregister_aeads: #endif /* CONFIG_X86_64 */ - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); - + simd_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); @@ -1473,8 +1737,9 @@ unregister_cipher: static void __exit aesni_exit(void) { - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); + simd_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni), + aes_gcm_simdalgs_aesni); simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); crypto_unregister_alg(&aesni_cipher_alg); @@ -1482,7 +1747,7 @@ static void __exit 
aesni_exit(void) if (boot_cpu_has(X86_FEATURE_AVX)) simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); #endif /* CONFIG_X86_64 */ - unregister_xts_algs(); + unregister_avx_algs(); } late_initcall(aesni_init); diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 98cf3b4e4c9f..9f5e342b9845 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -195,6 +195,7 @@ module_init(crc32_pclmul_mod_init); module_exit(crc32_pclmul_mod_fini); MODULE_AUTHOR("Alexander Boyko <alexander_boyko@xyratex.com>"); +MODULE_DESCRIPTION("CRC32 algorithm (IEEE 802.3) accelerated with PCLMULQDQ"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("crc32"); diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index d55fa9e9b9e6..dcfc0de333de 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -1720,5 +1720,6 @@ module_exit(curve25519_mod_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-x86"); +MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 1dfb8af48a3c..08ff4b489f7e 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -12,7 +12,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/sizes.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/simd.h> asmlinkage void poly1305_init_x86_64(void *ctx, @@ -269,7 +269,7 @@ static int __init poly1305_simd_mod_init(void) boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. 
*/ - boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X) + boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) static_branch_enable(&poly1305_use_avx512); return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; } diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 90454cf18e0d..1a1ecfa7f72a 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -5,6 +5,7 @@ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ +#include <asm/cpu_device_id.h> #include <crypto/algapi.h> #include <crypto/twofish.h> #include <linux/crypto.h> @@ -107,10 +108,10 @@ static bool is_blacklisted_cpu(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return false; - if (boot_cpu_data.x86 == 0x06 && - (boot_cpu_data.x86_model == 0x1c || - boot_cpu_data.x86_model == 0x26 || - boot_cpu_data.x86_model == 0x36)) { + switch (boot_cpu_data.x86_vfm) { + case INTEL_ATOM_BONNELL: + case INTEL_ATOM_BONNELL_MID: + case INTEL_ATOM_SALTWELL: /* * On Atom, twofish-3way is slower than original assembler * implementation. Twofish-3way trades off some performance in diff --git a/crypto/Kconfig b/crypto/Kconfig index 5688d42a59c2..72e2decb8c6a 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -313,24 +313,6 @@ config CRYPTO_ECRDSA One of the Russian cryptographic standard algorithms (called GOST algorithms). Only signature verification is implemented. -config CRYPTO_SM2 - tristate "SM2 (ShangMi 2)" - select CRYPTO_SM3 - select CRYPTO_AKCIPHER - select CRYPTO_MANAGER - select MPILIB - select ASN1 - help - SM2 (ShangMi 2) public key algorithm - - Published by State Encryption Management Bureau, China, - as specified by OSCCA GM/T 0003.1-2012 -- 0003.5-2012. 
- - References: - https://datatracker.ietf.org/doc/draft-shen-sm2-ecdsa/ - http://www.oscca.gov.cn/sca/xxgk/2010-12/17/content_1002386.shtml - http://www.gmbz.org.cn/main/bzlb.html - config CRYPTO_CURVE25519 tristate "Curve25519" select CRYPTO_KPP diff --git a/crypto/Makefile b/crypto/Makefile index edbbaa3ffef5..4c99e5d376f6 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -50,14 +50,6 @@ rsa_generic-y += rsa_helper.o rsa_generic-y += rsa-pkcs1pad.o obj-$(CONFIG_CRYPTO_RSA) += rsa_generic.o -$(obj)/sm2signature.asn1.o: $(obj)/sm2signature.asn1.c $(obj)/sm2signature.asn1.h -$(obj)/sm2.o: $(obj)/sm2signature.asn1.h - -sm2_generic-y += sm2signature.asn1.o -sm2_generic-y += sm2.o - -obj-$(CONFIG_CRYPTO_SM2) += sm2_generic.o - $(obj)/ecdsasignature.asn1.o: $(obj)/ecdsasignature.asn1.c $(obj)/ecdsasignature.asn1.h $(obj)/ecdsa.o: $(obj)/ecdsasignature.asn1.h ecdsa_generic-y += ecdsa.o diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 18cfead0081d..0da7c1ac778a 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -1317,5 +1317,6 @@ static void __exit af_alg_exit(void) module_init(af_alg_init); module_exit(af_alg_exit); +MODULE_DESCRIPTION("Crypto userspace interface"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_ALG); diff --git a/crypto/algapi.c b/crypto/algapi.c index 85bc279b4233..122cd910c4e1 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -1056,6 +1056,9 @@ EXPORT_SYMBOL_GPL(crypto_type_has_alg); static void __init crypto_start_tests(void) { + if (!IS_BUILTIN(CONFIG_CRYPTO_ALGAPI)) + return; + if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) return; diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 7c7394d46a23..5498a87249d3 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -471,4 +471,5 @@ static void __exit algif_hash_exit(void) module_init(algif_hash_init); module_exit(algif_hash_exit); +MODULE_DESCRIPTION("Userspace interface for hash algorithms"); MODULE_LICENSE("GPL"); diff --git a/crypto/algif_skcipher.c 
b/crypto/algif_skcipher.c index 02cea2149504..125d395c5e00 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -437,4 +437,5 @@ static void __exit algif_skcipher_exit(void) module_init(algif_skcipher_init); module_exit(algif_skcipher_exit); +MODULE_DESCRIPTION("Userspace interface for skcipher algorithms"); MODULE_LICENSE("GPL"); diff --git a/crypto/api.c b/crypto/api.c index 6aa5a3b4ed5e..22556907b3bc 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -31,9 +31,9 @@ EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); -#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +#if IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) && \ + !IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); -EXPORT_SYMBOL_GPL(__crypto_boot_test_finished); #endif static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 231ad7b3789d..423d13c47545 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -292,10 +292,6 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, ctx->sinfo->sig->pkey_algo = "ecdsa"; ctx->sinfo->sig->encoding = "x962"; break; - case OID_SM2_with_SM3: - ctx->sinfo->sig->pkey_algo = "sm2"; - ctx->sinfo->sig->encoding = "raw"; - break; case OID_gost2012PKey256: case OID_gost2012PKey512: ctx->sinfo->sig->pkey_algo = "ecrdsa"; diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 3474fb34ded9..422940a6706a 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -124,13 +124,6 @@ software_key_determine_akcipher(const struct public_key *pkey, strcmp(hash_algo, "sha3-384") != 0 && strcmp(hash_algo, "sha3-512") != 0) return -EINVAL; - } else if (strcmp(pkey->pkey_algo, "sm2") == 0) { - if (strcmp(encoding, "raw") != 0) - return -EINVAL; - if 
(!hash_algo) - return -EINVAL; - if (strcmp(hash_algo, "sm3") != 0) - return -EINVAL; } else if (strcmp(pkey->pkey_algo, "ecrdsa") == 0) { if (strcmp(encoding, "raw") != 0) return -EINVAL; diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index 25cc4273472f..ee2fdab42334 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -257,10 +257,6 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag, case OID_gost2012Signature512: ctx->cert->sig->hash_algo = "streebog512"; goto ecrdsa; - - case OID_SM2_with_SM3: - ctx->cert->sig->hash_algo = "sm3"; - goto sm2; } rsa_pkcs1: @@ -273,11 +269,6 @@ ecrdsa: ctx->cert->sig->encoding = "raw"; ctx->sig_algo = ctx->last_oid; return 0; -sm2: - ctx->cert->sig->pkey_algo = "sm2"; - ctx->cert->sig->encoding = "raw"; - ctx->sig_algo = ctx->last_oid; - return 0; ecdsa: ctx->cert->sig->pkey_algo = "ecdsa"; ctx->cert->sig->encoding = "x962"; @@ -309,7 +300,6 @@ int x509_note_signature(void *context, size_t hdrlen, if (strcmp(ctx->cert->sig->pkey_algo, "rsa") == 0 || strcmp(ctx->cert->sig->pkey_algo, "ecrdsa") == 0 || - strcmp(ctx->cert->sig->pkey_algo, "sm2") == 0 || strcmp(ctx->cert->sig->pkey_algo, "ecdsa") == 0) { /* Discard the BIT STRING metadata */ if (vlen < 1 || *(const u8 *)value != 0) @@ -514,17 +504,11 @@ int x509_extract_key_data(void *context, size_t hdrlen, case OID_gost2012PKey512: ctx->cert->pub->pkey_algo = "ecrdsa"; break; - case OID_sm2: - ctx->cert->pub->pkey_algo = "sm2"; - break; case OID_id_ecPublicKey: if (parse_OID(ctx->params, ctx->params_size, &oid) != 0) return -EBADMSG; switch (oid) { - case OID_sm2: - ctx->cert->pub->pkey_algo = "sm2"; - break; case OID_id_prime192v1: ctx->cert->pub->pkey_algo = "ecdsa-nist-p192"; break; diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 00ac7159fba2..8409d7d36cb4 100644 --- 
a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -7,7 +7,6 @@ #define pr_fmt(fmt) "X.509: "fmt #include <crypto/hash.h> -#include <crypto/sm2.h> #include <keys/asymmetric-parser.h> #include <keys/asymmetric-subtype.h> #include <keys/system_keyring.h> @@ -64,20 +63,8 @@ int x509_get_sig_params(struct x509_certificate *cert) desc->tfm = tfm; - if (strcmp(cert->pub->pkey_algo, "sm2") == 0) { - ret = strcmp(sig->hash_algo, "sm3") != 0 ? -EINVAL : - crypto_shash_init(desc) ?: - sm2_compute_z_digest(desc, cert->pub->key, - cert->pub->keylen, sig->digest) ?: - crypto_shash_init(desc) ?: - crypto_shash_update(desc, sig->digest, - sig->digest_size) ?: - crypto_shash_finup(desc, cert->tbs, cert->tbs_size, - sig->digest); - } else { - ret = crypto_shash_digest(desc, cert->tbs, cert->tbs_size, - sig->digest); - } + ret = crypto_shash_digest(desc, cert->tbs, cert->tbs_size, + sig->digest); if (ret < 0) goto error_2; diff --git a/crypto/cast_common.c b/crypto/cast_common.c index 9b2f60fd4cef..fec1f6609a40 100644 --- a/crypto/cast_common.c +++ b/crypto/cast_common.c @@ -282,4 +282,5 @@ __visible const u32 cast_s4[256] = { }; EXPORT_SYMBOL_GPL(cast_s4); +MODULE_DESCRIPTION("Common lookup tables for CAST-128 (cast5) and CAST-256 (cast6)"); MODULE_LICENSE("GPL"); diff --git a/crypto/curve25519-generic.c b/crypto/curve25519-generic.c index d055b0784c77..68a673262e04 100644 --- a/crypto/curve25519-generic.c +++ b/crypto/curve25519-generic.c @@ -87,4 +87,5 @@ module_exit(curve25519_exit); MODULE_ALIAS_CRYPTO("curve25519"); MODULE_ALIAS_CRYPTO("curve25519-generic"); +MODULE_DESCRIPTION("Curve25519 elliptic curve (RFC7748)"); MODULE_LICENSE("GPL"); diff --git a/crypto/deflate.c b/crypto/deflate.c index 6e31e0db0e86..98e8bcb81a6a 100644 --- a/crypto/deflate.c +++ b/crypto/deflate.c @@ -311,3 +311,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Deflate Compression Algorithm for IPCOMP"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 
MODULE_ALIAS_CRYPTO("deflate"); +MODULE_ALIAS_CRYPTO("deflate-generic"); diff --git a/crypto/ecc.c b/crypto/ecc.c index fe761256e335..420decdad7d9 100644 --- a/crypto/ecc.c +++ b/crypto/ecc.c @@ -78,7 +78,7 @@ void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes, /* diff > 0: not enough input bytes: set most significant digits to 0 */ if (diff > 0) { ndigits -= diff; - memset(&out[ndigits - 1], 0, diff * sizeof(u64)); + memset(&out[ndigits], 0, diff * sizeof(u64)); } if (o) { @@ -1715,4 +1715,5 @@ out: } EXPORT_SYMBOL(crypto_ecdh_shared_secret); +MODULE_DESCRIPTION("core elliptic curve module"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/crypto/ecdsa.c b/crypto/ecdsa.c index 258fffbf623d..d5a10959ec28 100644 --- a/crypto/ecdsa.c +++ b/crypto/ecdsa.c @@ -38,7 +38,6 @@ static int ecdsa_get_signature_rs(u64 *dest, size_t hdrlen, unsigned char tag, size_t bufsize = ndigits * sizeof(u64); ssize_t diff = vlen - bufsize; const char *d = value; - u8 rs[ECC_MAX_BYTES]; if (!value || !vlen) return -EINVAL; @@ -46,7 +45,7 @@ static int ecdsa_get_signature_rs(u64 *dest, size_t hdrlen, unsigned char tag, /* diff = 0: 'value' has exacly the right size * diff > 0: 'value' has too many bytes; one leading zero is allowed that * makes the value a positive integer; error on more - * diff < 0: 'value' is missing leading zeros, which we add + * diff < 0: 'value' is missing leading zeros */ if (diff > 0) { /* skip over leading zeros that make 'value' a positive int */ @@ -61,14 +60,7 @@ static int ecdsa_get_signature_rs(u64 *dest, size_t hdrlen, unsigned char tag, if (-diff >= bufsize) return -EINVAL; - if (diff) { - /* leading zeros not given in 'value' */ - memset(rs, 0, -diff); - } - - memcpy(&rs[-diff], d, vlen); - - ecc_swap_digits((u64 *)rs, dest, ndigits); + ecc_digits_from_bytes(d, vlen, dest, ndigits); return 0; } @@ -142,10 +134,8 @@ static int ecdsa_verify(struct akcipher_request *req) struct ecdsa_signature_ctx sig_ctx = { .curve = ctx->curve, }; - u8 
rawhash[ECC_MAX_BYTES]; u64 hash[ECC_MAX_DIGITS]; unsigned char *buffer; - ssize_t diff; int ret; if (unlikely(!ctx->pub_key_set)) @@ -164,18 +154,11 @@ static int ecdsa_verify(struct akcipher_request *req) if (ret < 0) goto error; - /* if the hash is shorter then we will add leading zeros to fit to ndigits */ - diff = bufsize - req->dst_len; - if (diff >= 0) { - if (diff) - memset(rawhash, 0, diff); - memcpy(&rawhash[diff], buffer + req->src_len, req->dst_len); - } else if (diff < 0) { - /* given hash is longer, we take the left-most bytes */ - memcpy(&rawhash, buffer + req->src_len, bufsize); - } + if (bufsize > req->dst_len) + bufsize = req->dst_len; - ecc_swap_digits((u64 *)rawhash, hash, ctx->curve->g.ndigits); + ecc_digits_from_bytes(buffer + req->src_len, bufsize, + hash, ctx->curve->g.ndigits); ret = _ecdsa_verify(ctx, hash, sig_ctx.r, sig_ctx.s); @@ -215,9 +198,8 @@ static int ecdsa_ecc_ctx_reset(struct ecc_ctx *ctx) } /* - * Set the public key given the raw uncompressed key data from an X509 - * certificate. The key data contain the concatenated X and Y coordinates of - * the public key. + * Set the public ECC key as defined by RFC5480 section 2.2 "Subject Public + * Key". Only the uncompressed format is supported. 
*/ static int ecdsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen) { diff --git a/crypto/internal.h b/crypto/internal.h index 63e59240d5fb..aee31319be2e 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -66,7 +66,8 @@ extern struct blocking_notifier_head crypto_chain; int alg_test(const char *driver, const char *alg, u32 type, u32 mask); -#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS +#if !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) || \ + IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) static inline bool crypto_boot_test_finished(void) { return true; @@ -84,7 +85,9 @@ static inline void set_crypto_boot_test_finished(void) { static_branch_enable(&__crypto_boot_test_finished); } -#endif /* !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */ +#endif /* !IS_BUILTIN(CONFIG_CRYPTO_ALGAPI) || + * IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) + */ #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); diff --git a/crypto/simd.c b/crypto/simd.c index edaa479a1ec5..2aa4f72e224f 100644 --- a/crypto/simd.c +++ b/crypto/simd.c @@ -523,4 +523,5 @@ void simd_unregister_aeads(struct aead_alg *algs, int count, } EXPORT_SYMBOL_GPL(simd_unregister_aeads); +MODULE_DESCRIPTION("Shared crypto SIMD helpers"); MODULE_LICENSE("GPL"); diff --git a/crypto/sm2.c b/crypto/sm2.c deleted file mode 100644 index 5ab120d74c59..000000000000 --- a/crypto/sm2.c +++ /dev/null @@ -1,498 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SM2 asymmetric public-key algorithm - * as specified by OSCCA GM/T 0003.1-2012 -- 0003.5-2012 SM2 and - * described at https://tools.ietf.org/html/draft-shen-sm2-ecdsa-02 - * - * Copyright (c) 2020, Alibaba Group. 
- * Authors: Tianjia Zhang <tianjia.zhang@linux.alibaba.com> - */ - -#include <linux/module.h> -#include <linux/mpi.h> -#include <crypto/internal/akcipher.h> -#include <crypto/akcipher.h> -#include <crypto/hash.h> -#include <crypto/rng.h> -#include <crypto/sm2.h> -#include "sm2signature.asn1.h" - -/* The default user id as specified in GM/T 0009-2012 */ -#define SM2_DEFAULT_USERID "1234567812345678" -#define SM2_DEFAULT_USERID_LEN 16 - -#define MPI_NBYTES(m) ((mpi_get_nbits(m) + 7) / 8) - -struct ecc_domain_parms { - const char *desc; /* Description of the curve. */ - unsigned int nbits; /* Number of bits. */ - unsigned int fips:1; /* True if this is a FIPS140-2 approved curve */ - - /* The model describing this curve. This is mainly used to select - * the group equation. - */ - enum gcry_mpi_ec_models model; - - /* The actual ECC dialect used. This is used for curve specific - * optimizations and to select encodings etc. - */ - enum ecc_dialects dialect; - - const char *p; /* The prime defining the field. */ - const char *a, *b; /* The coefficients. For Twisted Edwards - * Curves b is used for d. For Montgomery - * Curves (a,b) has ((A-2)/4,B^-1). - */ - const char *n; /* The order of the base point. */ - const char *g_x, *g_y; /* Base point. */ - unsigned int h; /* Cofactor. 
*/ -}; - -static const struct ecc_domain_parms sm2_ecp = { - .desc = "sm2p256v1", - .nbits = 256, - .fips = 0, - .model = MPI_EC_WEIERSTRASS, - .dialect = ECC_DIALECT_STANDARD, - .p = "0xfffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff", - .a = "0xfffffffeffffffffffffffffffffffffffffffff00000000fffffffffffffffc", - .b = "0x28e9fa9e9d9f5e344d5a9e4bcf6509a7f39789f515ab8f92ddbcbd414d940e93", - .n = "0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123", - .g_x = "0x32c4ae2c1f1981195f9904466a39c9948fe30bbff2660be1715a4589334c74c7", - .g_y = "0xbc3736a2f4f6779c59bdcee36b692153d0a9877cc62a474002df32e52139f0a0", - .h = 1 -}; - -static int __sm2_set_pub_key(struct mpi_ec_ctx *ec, - const void *key, unsigned int keylen); - -static int sm2_ec_ctx_init(struct mpi_ec_ctx *ec) -{ - const struct ecc_domain_parms *ecp = &sm2_ecp; - MPI p, a, b; - MPI x, y; - int rc = -EINVAL; - - p = mpi_scanval(ecp->p); - a = mpi_scanval(ecp->a); - b = mpi_scanval(ecp->b); - if (!p || !a || !b) - goto free_p; - - x = mpi_scanval(ecp->g_x); - y = mpi_scanval(ecp->g_y); - if (!x || !y) - goto free; - - rc = -ENOMEM; - - ec->Q = mpi_point_new(0); - if (!ec->Q) - goto free; - - /* mpi_ec_setup_elliptic_curve */ - ec->G = mpi_point_new(0); - if (!ec->G) { - mpi_point_release(ec->Q); - goto free; - } - - mpi_set(ec->G->x, x); - mpi_set(ec->G->y, y); - mpi_set_ui(ec->G->z, 1); - - rc = -EINVAL; - ec->n = mpi_scanval(ecp->n); - if (!ec->n) { - mpi_point_release(ec->Q); - mpi_point_release(ec->G); - goto free; - } - - ec->h = ecp->h; - ec->name = ecp->desc; - mpi_ec_init(ec, ecp->model, ecp->dialect, 0, p, a, b); - - rc = 0; - -free: - mpi_free(x); - mpi_free(y); -free_p: - mpi_free(p); - mpi_free(a); - mpi_free(b); - - return rc; -} - -static void sm2_ec_ctx_deinit(struct mpi_ec_ctx *ec) -{ - mpi_ec_deinit(ec); - - memset(ec, 0, sizeof(*ec)); -} - -/* RESULT must have been initialized and is set on success to the - * point given by VALUE. 
- */ -static int sm2_ecc_os2ec(MPI_POINT result, MPI value) -{ - int rc; - size_t n; - unsigned char *buf; - MPI x, y; - - n = MPI_NBYTES(value); - buf = kmalloc(n, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - rc = mpi_print(GCRYMPI_FMT_USG, buf, n, &n, value); - if (rc) - goto err_freebuf; - - rc = -EINVAL; - if (n < 1 || ((n - 1) % 2)) - goto err_freebuf; - /* No support for point compression */ - if (*buf != 0x4) - goto err_freebuf; - - rc = -ENOMEM; - n = (n - 1) / 2; - x = mpi_read_raw_data(buf + 1, n); - if (!x) - goto err_freebuf; - y = mpi_read_raw_data(buf + 1 + n, n); - if (!y) - goto err_freex; - - mpi_normalize(x); - mpi_normalize(y); - mpi_set(result->x, x); - mpi_set(result->y, y); - mpi_set_ui(result->z, 1); - - rc = 0; - - mpi_free(y); -err_freex: - mpi_free(x); -err_freebuf: - kfree(buf); - return rc; -} - -struct sm2_signature_ctx { - MPI sig_r; - MPI sig_s; -}; - -int sm2_get_signature_r(void *context, size_t hdrlen, unsigned char tag, - const void *value, size_t vlen) -{ - struct sm2_signature_ctx *sig = context; - - if (!value || !vlen) - return -EINVAL; - - sig->sig_r = mpi_read_raw_data(value, vlen); - if (!sig->sig_r) - return -ENOMEM; - - return 0; -} - -int sm2_get_signature_s(void *context, size_t hdrlen, unsigned char tag, - const void *value, size_t vlen) -{ - struct sm2_signature_ctx *sig = context; - - if (!value || !vlen) - return -EINVAL; - - sig->sig_s = mpi_read_raw_data(value, vlen); - if (!sig->sig_s) - return -ENOMEM; - - return 0; -} - -static int sm2_z_digest_update(struct shash_desc *desc, - MPI m, unsigned int pbytes) -{ - static const unsigned char zero[32]; - unsigned char *in; - unsigned int inlen; - int err; - - in = mpi_get_buffer(m, &inlen, NULL); - if (!in) - return -EINVAL; - - if (inlen < pbytes) { - /* padding with zero */ - err = crypto_shash_update(desc, zero, pbytes - inlen) ?: - crypto_shash_update(desc, in, inlen); - } else if (inlen > pbytes) { - /* skip the starting zero */ - err = 
crypto_shash_update(desc, in + inlen - pbytes, pbytes); - } else { - err = crypto_shash_update(desc, in, inlen); - } - - kfree(in); - return err; -} - -static int sm2_z_digest_update_point(struct shash_desc *desc, - MPI_POINT point, struct mpi_ec_ctx *ec, - unsigned int pbytes) -{ - MPI x, y; - int ret = -EINVAL; - - x = mpi_new(0); - y = mpi_new(0); - - ret = mpi_ec_get_affine(x, y, point, ec) ? -EINVAL : - sm2_z_digest_update(desc, x, pbytes) ?: - sm2_z_digest_update(desc, y, pbytes); - - mpi_free(x); - mpi_free(y); - return ret; -} - -int sm2_compute_z_digest(struct shash_desc *desc, - const void *key, unsigned int keylen, void *dgst) -{ - struct mpi_ec_ctx *ec; - unsigned int bits_len; - unsigned int pbytes; - u8 entl[2]; - int err; - - ec = kmalloc(sizeof(*ec), GFP_KERNEL); - if (!ec) - return -ENOMEM; - - err = sm2_ec_ctx_init(ec); - if (err) - goto out_free_ec; - - err = __sm2_set_pub_key(ec, key, keylen); - if (err) - goto out_deinit_ec; - - bits_len = SM2_DEFAULT_USERID_LEN * 8; - entl[0] = bits_len >> 8; - entl[1] = bits_len & 0xff; - - pbytes = MPI_NBYTES(ec->p); - - /* ZA = H256(ENTLA | IDA | a | b | xG | yG | xA | yA) */ - err = crypto_shash_init(desc); - if (err) - goto out_deinit_ec; - - err = crypto_shash_update(desc, entl, 2); - if (err) - goto out_deinit_ec; - - err = crypto_shash_update(desc, SM2_DEFAULT_USERID, - SM2_DEFAULT_USERID_LEN); - if (err) - goto out_deinit_ec; - - err = sm2_z_digest_update(desc, ec->a, pbytes) ?: - sm2_z_digest_update(desc, ec->b, pbytes) ?: - sm2_z_digest_update_point(desc, ec->G, ec, pbytes) ?: - sm2_z_digest_update_point(desc, ec->Q, ec, pbytes); - if (err) - goto out_deinit_ec; - - err = crypto_shash_final(desc, dgst); - -out_deinit_ec: - sm2_ec_ctx_deinit(ec); -out_free_ec: - kfree(ec); - return err; -} -EXPORT_SYMBOL_GPL(sm2_compute_z_digest); - -static int _sm2_verify(struct mpi_ec_ctx *ec, MPI hash, MPI sig_r, MPI sig_s) -{ - int rc = -EINVAL; - struct gcry_mpi_point sG, tP; - MPI t = NULL; - MPI x1 = NULL, y1 
= NULL; - - mpi_point_init(&sG); - mpi_point_init(&tP); - x1 = mpi_new(0); - y1 = mpi_new(0); - t = mpi_new(0); - - /* r, s in [1, n-1] */ - if (mpi_cmp_ui(sig_r, 1) < 0 || mpi_cmp(sig_r, ec->n) > 0 || - mpi_cmp_ui(sig_s, 1) < 0 || mpi_cmp(sig_s, ec->n) > 0) { - goto leave; - } - - /* t = (r + s) % n, t == 0 */ - mpi_addm(t, sig_r, sig_s, ec->n); - if (mpi_cmp_ui(t, 0) == 0) - goto leave; - - /* sG + tP = (x1, y1) */ - rc = -EBADMSG; - mpi_ec_mul_point(&sG, sig_s, ec->G, ec); - mpi_ec_mul_point(&tP, t, ec->Q, ec); - mpi_ec_add_points(&sG, &sG, &tP, ec); - if (mpi_ec_get_affine(x1, y1, &sG, ec)) - goto leave; - - /* R = (e + x1) % n */ - mpi_addm(t, hash, x1, ec->n); - - /* check R == r */ - rc = -EKEYREJECTED; - if (mpi_cmp(t, sig_r)) - goto leave; - - rc = 0; - -leave: - mpi_point_free_parts(&sG); - mpi_point_free_parts(&tP); - mpi_free(x1); - mpi_free(y1); - mpi_free(t); - - return rc; -} - -static int sm2_verify(struct akcipher_request *req) -{ - struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req); - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); - unsigned char *buffer; - struct sm2_signature_ctx sig; - MPI hash; - int ret; - - if (unlikely(!ec->Q)) - return -EINVAL; - - buffer = kmalloc(req->src_len + req->dst_len, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - sg_pcopy_to_buffer(req->src, - sg_nents_for_len(req->src, req->src_len + req->dst_len), - buffer, req->src_len + req->dst_len, 0); - - sig.sig_r = NULL; - sig.sig_s = NULL; - ret = asn1_ber_decoder(&sm2signature_decoder, &sig, - buffer, req->src_len); - if (ret) - goto error; - - ret = -ENOMEM; - hash = mpi_read_raw_data(buffer + req->src_len, req->dst_len); - if (!hash) - goto error; - - ret = _sm2_verify(ec, hash, sig.sig_r, sig.sig_s); - - mpi_free(hash); -error: - mpi_free(sig.sig_r); - mpi_free(sig.sig_s); - kfree(buffer); - return ret; -} - -static int sm2_set_pub_key(struct crypto_akcipher *tfm, - const void *key, unsigned int keylen) -{ - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); - 
- return __sm2_set_pub_key(ec, key, keylen); - -} - -static int __sm2_set_pub_key(struct mpi_ec_ctx *ec, - const void *key, unsigned int keylen) -{ - MPI a; - int rc; - - /* include the uncompressed flag '0x04' */ - a = mpi_read_raw_data(key, keylen); - if (!a) - return -ENOMEM; - - mpi_normalize(a); - rc = sm2_ecc_os2ec(ec->Q, a); - mpi_free(a); - - return rc; -} - -static unsigned int sm2_max_size(struct crypto_akcipher *tfm) -{ - /* Unlimited max size */ - return PAGE_SIZE; -} - -static int sm2_init_tfm(struct crypto_akcipher *tfm) -{ - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); - - return sm2_ec_ctx_init(ec); -} - -static void sm2_exit_tfm(struct crypto_akcipher *tfm) -{ - struct mpi_ec_ctx *ec = akcipher_tfm_ctx(tfm); - - sm2_ec_ctx_deinit(ec); -} - -static struct akcipher_alg sm2 = { - .verify = sm2_verify, - .set_pub_key = sm2_set_pub_key, - .max_size = sm2_max_size, - .init = sm2_init_tfm, - .exit = sm2_exit_tfm, - .base = { - .cra_name = "sm2", - .cra_driver_name = "sm2-generic", - .cra_priority = 100, - .cra_module = THIS_MODULE, - .cra_ctxsize = sizeof(struct mpi_ec_ctx), - }, -}; - -static int __init sm2_init(void) -{ - return crypto_register_akcipher(&sm2); -} - -static void __exit sm2_exit(void) -{ - crypto_unregister_akcipher(&sm2); -} - -subsys_initcall(sm2_init); -module_exit(sm2_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>"); -MODULE_DESCRIPTION("SM2 generic algorithm"); -MODULE_ALIAS_CRYPTO("sm2-generic"); diff --git a/crypto/sm2signature.asn1 b/crypto/sm2signature.asn1 deleted file mode 100644 index ab8c0b754d21..000000000000 --- a/crypto/sm2signature.asn1 +++ /dev/null @@ -1,4 +0,0 @@ -Sm2Signature ::= SEQUENCE { - sig_r INTEGER ({ sm2_get_signature_r }), - sig_s INTEGER ({ sm2_get_signature_s }) -} diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 8aea416f6480..e9e7dceb606e 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -2613,6 +2613,15 @@ static int do_test(const char *alg, u32 
type, u32 mask, int m, u32 num_mb) break; case 600: + if (alg) { + u8 speed_template[2] = {klen, 0}; + test_mb_skcipher_speed(alg, ENCRYPT, sec, NULL, 0, + speed_template, num_mb); + test_mb_skcipher_speed(alg, DECRYPT, sec, NULL, 0, + speed_template, num_mb); + break; + } + test_mb_skcipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0, speed_template_16_24_32, num_mb); test_mb_skcipher_speed("ecb(aes)", DECRYPT, sec, NULL, 0, diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 00f5a6cf341a..f02cb075bd68 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -293,6 +293,10 @@ struct test_sg_division { * the @key_offset * @finalization_type: what finalization function to use for hashes * @nosimd: execute with SIMD disabled? Requires !CRYPTO_TFM_REQ_MAY_SLEEP. + * This applies to the parts of the operation that aren't controlled + * individually by @nosimd_setkey or @src_divs[].nosimd. + * @nosimd_setkey: set the key (if applicable) with SIMD disabled? Requires + * !CRYPTO_TFM_REQ_MAY_SLEEP. */ struct testvec_config { const char *name; @@ -306,6 +310,7 @@ struct testvec_config { bool key_offset_relative_to_alignmask; enum finalization_type finalization_type; bool nosimd; + bool nosimd_setkey; }; #define TESTVEC_CONFIG_NAMELEN 192 @@ -533,7 +538,8 @@ static bool valid_testvec_config(const struct testvec_config *cfg) cfg->finalization_type == FINALIZATION_TYPE_DIGEST) return false; - if ((cfg->nosimd || (flags & SGDIVS_HAVE_NOSIMD)) && + if ((cfg->nosimd || cfg->nosimd_setkey || + (flags & SGDIVS_HAVE_NOSIMD)) && (cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP)) return false; @@ -841,7 +847,10 @@ static int prepare_keybuf(const u8 *key, unsigned int ksize, return 0; } -/* Like setkey_f(tfm, key, ksize), but sometimes misalign the key */ +/* + * Like setkey_f(tfm, key, ksize), but sometimes misalign the key. + * In addition, run the setkey function in no-SIMD context if requested. 
+ */ #define do_setkey(setkey_f, tfm, key, ksize, cfg, alignmask) \ ({ \ const u8 *keybuf, *keyptr; \ @@ -850,7 +859,11 @@ static int prepare_keybuf(const u8 *key, unsigned int ksize, err = prepare_keybuf((key), (ksize), (cfg), (alignmask), \ &keybuf, &keyptr); \ if (err == 0) { \ + if ((cfg)->nosimd_setkey) \ + crypto_disable_simd_for_test(); \ err = setkey_f((tfm), keyptr, (ksize)); \ + if ((cfg)->nosimd_setkey) \ + crypto_reenable_simd_for_test(); \ kfree(keybuf); \ } \ err; \ @@ -903,14 +916,20 @@ static unsigned int generate_random_length(struct rnd_state *rng, switch (prandom_u32_below(rng, 4)) { case 0: - return len % 64; + len %= 64; + break; case 1: - return len % 256; + len %= 256; + break; case 2: - return len % 1024; + len %= 1024; + break; default: - return len; + break; } + if (len && prandom_u32_below(rng, 4) == 0) + len = rounddown_pow_of_two(len); + return len; } /* Flip a random bit in the given nonempty data buffer */ @@ -1006,6 +1025,8 @@ static char *generate_random_sgl_divisions(struct rnd_state *rng, if (div == &divs[max_divs - 1] || prandom_bool(rng)) this_len = remaining; + else if (prandom_u32_below(rng, 4) == 0) + this_len = (remaining + 1) / 2; else this_len = prandom_u32_inclusive(rng, 1, remaining); div->proportion_of_total = this_len; @@ -1118,9 +1139,15 @@ static void generate_random_testvec_config(struct rnd_state *rng, break; } - if (!(cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP) && prandom_bool(rng)) { - cfg->nosimd = true; - p += scnprintf(p, end - p, " nosimd"); + if (!(cfg->req_flags & CRYPTO_TFM_REQ_MAY_SLEEP)) { + if (prandom_bool(rng)) { + cfg->nosimd = true; + p += scnprintf(p, end - p, " nosimd"); + } + if (prandom_bool(rng)) { + cfg->nosimd_setkey = true; + p += scnprintf(p, end - p, " nosimd_setkey"); + } } p += scnprintf(p, end - p, " src_divs=["); @@ -5590,12 +5617,6 @@ static const struct alg_test_desc alg_test_descs[] = { .hash = __VECS(sha512_tv_template) } }, { - .alg = "sm2", - .test = alg_test_akcipher, - .suite = 
{ - .akcipher = __VECS(sm2_tv_template) - } - }, { .alg = "sm3", .test = alg_test_hash, .suite = { diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 5350cfd9d325..9b38501a17b2 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -5774,65 +5774,6 @@ static const struct hash_testvec hmac_streebog512_tv_template[] = { }, }; -/* - * SM2 test vectors. - */ -static const struct akcipher_testvec sm2_tv_template[] = { - { /* Generated from openssl */ - .key = - "\x04" - "\x8e\xa0\x33\x69\x91\x7e\x3d\xec\xad\x8e\xf0\x45\x5e\x13\x3e\x68" - "\x5b\x8c\xab\x5c\xc6\xc8\x50\xdf\x91\x00\xe0\x24\x73\x4d\x31\xf2" - "\x2e\xc0\xd5\x6b\xee\xda\x98\x93\xec\xd8\x36\xaa\xb9\xcf\x63\x82" - "\xef\xa7\x1a\x03\xed\x16\xba\x74\xb8\x8b\xf9\xe5\x70\x39\xa4\x70", - .key_len = 65, - .param_len = 0, - .c = - "\x30\x45" - "\x02\x20" - "\x70\xab\xb6\x7d\xd6\x54\x80\x64\x42\x7e\x2d\x05\x08\x36\xc9\x96" - "\x25\xc2\xbb\xff\x08\xe5\x43\x15\x5e\xf3\x06\xd9\x2b\x2f\x0a\x9f" - "\x02\x21" - "\x00" - "\xbf\x21\x5f\x7e\x5d\x3f\x1a\x4d\x8f\x84\xc2\xe9\xa6\x4c\xa4\x18" - "\xb2\xb8\x46\xf4\x32\x96\xfa\x57\xc6\x29\xd4\x89\xae\xcc\xda\xdb", - .c_size = 71, - .algo = OID_SM2_with_SM3, - .m = - "\x47\xa7\xbf\xd3\xda\xc4\x79\xee\xda\x8b\x4f\xe8\x40\x94\xd4\x32" - "\x8f\xf1\xcd\x68\x4d\xbd\x9b\x1d\xe0\xd8\x9a\x5d\xad\x85\x47\x5c", - .m_size = 32, - .public_key_vec = true, - .siggen_sigver_test = true, - }, - { /* From libgcrypt */ - .key = - "\x04" - "\x87\x59\x38\x9a\x34\xaa\xad\x07\xec\xf4\xe0\xc8\xc2\x65\x0a\x44" - "\x59\xc8\xd9\x26\xee\x23\x78\x32\x4e\x02\x61\xc5\x25\x38\xcb\x47" - "\x75\x28\x10\x6b\x1e\x0b\x7c\x8d\xd5\xff\x29\xa9\xc8\x6a\x89\x06" - "\x56\x56\xeb\x33\x15\x4b\xc0\x55\x60\x91\xef\x8a\xc9\xd1\x7d\x78", - .key_len = 65, - .param_len = 0, - .c = - "\x30\x44" - "\x02\x20" - "\xd9\xec\xef\xe8\x5f\xee\x3c\x59\x57\x8e\x5b\xab\xb3\x02\xe1\x42" - "\x4b\x67\x2c\x0b\x26\xb6\x51\x2c\x3e\xfc\xc6\x49\xec\xfe\x89\xe5" - "\x02\x20" - "\x43\x45\xd0\xa5\xff\xe5\x13\x27\x26\xd0\xec\x37\xad\x24\x1e\x9a" - 
"\x71\x9a\xa4\x89\xb0\x7e\x0f\xc4\xbb\x2d\x50\xd0\xe5\x7f\x7a\x68", - .c_size = 70, - .algo = OID_SM2_with_SM3, - .m = - "\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\xdd\xee\xff\x00" - "\x12\x34\x56\x78\x9a\xbc\xde\xf0\x12\x34\x56\x78\x9a\xbc\xde\xf0", - .m_size = 32, - .public_key_vec = true, - .siggen_sigver_test = true, - }, -}; - /* Example vectors below taken from * http://www.oscca.gov.cn/UpFile/20101222141857786.pdf * diff --git a/crypto/xor.c b/crypto/xor.c index 8e72e5d5db0d..a1363162978c 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -165,6 +165,7 @@ out: static __exit void xor_exit(void) { } +MODULE_DESCRIPTION("RAID-5 checksumming functions"); MODULE_LICENSE("GPL"); #ifndef MODULE diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index 442c40efb200..01e2e1ef82cf 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -555,7 +555,6 @@ config HW_RANDOM_ARM_SMCCC_TRNG config HW_RANDOM_CN10K tristate "Marvell CN10K Random Number Generator support" depends on HW_RANDOM && PCI && (ARM64 || (64BIT && COMPILE_TEST)) - default HW_RANDOM help This driver provides support for the True Random Number generator available in Marvell CN10K SoCs. 
diff --git a/drivers/char/hw_random/amd-rng.c b/drivers/char/hw_random/amd-rng.c index 86162a13681e..9a24d19236dc 100644 --- a/drivers/char/hw_random/amd-rng.c +++ b/drivers/char/hw_random/amd-rng.c @@ -143,8 +143,10 @@ static int __init amd_rng_mod_init(void) found: err = pci_read_config_dword(pdev, 0x58, &pmbase); - if (err) + if (err) { + err = pcibios_err_to_errno(err); goto put_dev; + } pmbase &= 0x0000FF00; if (pmbase == 0) { diff --git a/drivers/char/hw_random/arm_smccc_trng.c b/drivers/char/hw_random/arm_smccc_trng.c index 7e954341b09f..dcb8e7f37f25 100644 --- a/drivers/char/hw_random/arm_smccc_trng.c +++ b/drivers/char/hw_random/arm_smccc_trng.c @@ -118,4 +118,5 @@ module_platform_driver(smccc_trng_driver); MODULE_ALIAS("platform:smccc_trng"); MODULE_AUTHOR("Andre Przywara"); +MODULE_DESCRIPTION("Arm SMCCC TRNG firmware interface support"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/cavium-rng-vf.c b/drivers/char/hw_random/cavium-rng-vf.c index c99c54cd99c6..c1b8918b2292 100644 --- a/drivers/char/hw_random/cavium-rng-vf.c +++ b/drivers/char/hw_random/cavium-rng-vf.c @@ -266,4 +266,5 @@ static struct pci_driver cavium_rng_vf_driver = { module_pci_driver(cavium_rng_vf_driver); MODULE_AUTHOR("Omer Khaliq <okhaliq@caviumnetworks.com>"); +MODULE_DESCRIPTION("Cavium ThunderX Random Number Generator VF support"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/char/hw_random/cavium-rng.c b/drivers/char/hw_random/cavium-rng.c index b96579222408..d9d7b6038c06 100644 --- a/drivers/char/hw_random/cavium-rng.c +++ b/drivers/char/hw_random/cavium-rng.c @@ -88,4 +88,5 @@ static struct pci_driver cavium_rng_pf_driver = { module_pci_driver(cavium_rng_pf_driver); MODULE_AUTHOR("Omer Khaliq <okhaliq@caviumnetworks.com>"); +MODULE_DESCRIPTION("Cavium ThunderX Random Number Generator support"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 4084df65c9fa..57c51efa5613 100644 --- 
a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -161,7 +161,6 @@ static int hwrng_init(struct hwrng *rng) reinit_completion(&rng->cleanup_done); skip_init: - rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); current_quality = rng->quality; /* obsolete */ return 0; @@ -470,16 +469,6 @@ static struct attribute *rng_dev_attrs[] = { ATTRIBUTE_GROUPS(rng_dev); -static void __exit unregister_miscdev(void) -{ - misc_deregister(&rng_miscdev); -} - -static int __init register_miscdev(void) -{ - return misc_register(&rng_miscdev); -} - static int hwrng_fillfn(void *unused) { size_t entropy, entropy_credit = 0; /* in 1/1024 of a bit */ @@ -545,6 +534,9 @@ int hwrng_register(struct hwrng *rng) complete(&rng->cleanup_done); init_completion(&rng->dying); + /* Adjust quality field to always have a proper value */ + rng->quality = min_t(u16, min_t(u16, default_quality, 1024), rng->quality ?: 1024); + if (!current_rng || (!cur_rng_set_by_user && rng->quality > current_rng->quality)) { /* @@ -668,7 +660,7 @@ static int __init hwrng_modinit(void) return -ENOMEM; } - ret = register_miscdev(); + ret = misc_register(&rng_miscdev); if (ret) { kfree(rng_fillbuf); kfree(rng_buffer); @@ -685,7 +677,7 @@ static void __exit hwrng_modexit(void) kfree(rng_fillbuf); mutex_unlock(&rng_mutex); - unregister_miscdev(); + misc_deregister(&rng_miscdev); } fs_initcall(hwrng_modinit); /* depends on misc_register() */ diff --git a/drivers/char/hw_random/exynos-trng.c b/drivers/char/hw_random/exynos-trng.c index 0ed5d22fe667..9f039fddaee3 100644 --- a/drivers/char/hw_random/exynos-trng.c +++ b/drivers/char/hw_random/exynos-trng.c @@ -10,6 +10,7 @@ * Krzysztof Kozłowski <krzk@kernel.org> */ +#include <linux/arm-smccc.h> #include <linux/clk.h> #include <linux/crypto.h> #include <linux/delay.h> @@ -22,46 +23,69 @@ #include <linux/mod_devicetable.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> - -#define EXYNOS_TRNG_CLKDIV (0x0) - 
-#define EXYNOS_TRNG_CTRL (0x20) -#define EXYNOS_TRNG_CTRL_RNGEN BIT(31) - -#define EXYNOS_TRNG_POST_CTRL (0x30) -#define EXYNOS_TRNG_ONLINE_CTRL (0x40) -#define EXYNOS_TRNG_ONLINE_STAT (0x44) -#define EXYNOS_TRNG_ONLINE_MAXCHI2 (0x48) -#define EXYNOS_TRNG_FIFO_CTRL (0x50) -#define EXYNOS_TRNG_FIFO_0 (0x80) -#define EXYNOS_TRNG_FIFO_1 (0x84) -#define EXYNOS_TRNG_FIFO_2 (0x88) -#define EXYNOS_TRNG_FIFO_3 (0x8c) -#define EXYNOS_TRNG_FIFO_4 (0x90) -#define EXYNOS_TRNG_FIFO_5 (0x94) -#define EXYNOS_TRNG_FIFO_6 (0x98) -#define EXYNOS_TRNG_FIFO_7 (0x9c) -#define EXYNOS_TRNG_FIFO_LEN (8) -#define EXYNOS_TRNG_CLOCK_RATE (500000) - +#include <linux/property.h> + +#define EXYNOS_TRNG_CLKDIV 0x0 + +#define EXYNOS_TRNG_CTRL 0x20 +#define EXYNOS_TRNG_CTRL_RNGEN BIT(31) + +#define EXYNOS_TRNG_POST_CTRL 0x30 +#define EXYNOS_TRNG_ONLINE_CTRL 0x40 +#define EXYNOS_TRNG_ONLINE_STAT 0x44 +#define EXYNOS_TRNG_ONLINE_MAXCHI2 0x48 +#define EXYNOS_TRNG_FIFO_CTRL 0x50 +#define EXYNOS_TRNG_FIFO_0 0x80 +#define EXYNOS_TRNG_FIFO_1 0x84 +#define EXYNOS_TRNG_FIFO_2 0x88 +#define EXYNOS_TRNG_FIFO_3 0x8c +#define EXYNOS_TRNG_FIFO_4 0x90 +#define EXYNOS_TRNG_FIFO_5 0x94 +#define EXYNOS_TRNG_FIFO_6 0x98 +#define EXYNOS_TRNG_FIFO_7 0x9c +#define EXYNOS_TRNG_FIFO_LEN 8 +#define EXYNOS_TRNG_CLOCK_RATE 500000 + +/* Driver feature flags */ +#define EXYNOS_SMC BIT(0) + +#define EXYNOS_SMC_CALL_VAL(func_num) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + ARM_SMCCC_OWNER_SIP, \ + func_num) + +/* SMC command for DTRNG access */ +#define SMC_CMD_RANDOM EXYNOS_SMC_CALL_VAL(0x1012) + +/* SMC_CMD_RANDOM: arguments */ +#define HWRNG_INIT 0x0 +#define HWRNG_EXIT 0x1 +#define HWRNG_GET_DATA 0x2 +#define HWRNG_RESUME 0x3 + +/* SMC_CMD_RANDOM: return values */ +#define HWRNG_RET_OK 0x0 +#define HWRNG_RET_RETRY_ERROR 0x2 + +#define HWRNG_MAX_TRIES 100 struct exynos_trng_dev { - struct device *dev; - void __iomem *mem; - struct clk *clk; - struct hwrng rng; + struct device *dev; + void __iomem 
*mem; + struct clk *clk; /* operating clock */ + struct clk *pclk; /* bus clock */ + struct hwrng rng; + unsigned long flags; }; -static int exynos_trng_do_read(struct hwrng *rng, void *data, size_t max, - bool wait) +static int exynos_trng_do_read_reg(struct hwrng *rng, void *data, size_t max, + bool wait) { - struct exynos_trng_dev *trng; + struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; int val; max = min_t(size_t, max, (EXYNOS_TRNG_FIFO_LEN * 4)); - - trng = (struct exynos_trng_dev *)rng->priv; - writel_relaxed(max * 8, trng->mem + EXYNOS_TRNG_FIFO_CTRL); val = readl_poll_timeout(trng->mem + EXYNOS_TRNG_FIFO_CTRL, val, val == 0, 200, 1000000); @@ -73,7 +97,40 @@ static int exynos_trng_do_read(struct hwrng *rng, void *data, size_t max, return max; } -static int exynos_trng_init(struct hwrng *rng) +static int exynos_trng_do_read_smc(struct hwrng *rng, void *data, size_t max, + bool wait) +{ + struct arm_smccc_res res; + unsigned int copied = 0; + u32 *buf = data; + int tries = 0; + + while (copied < max) { + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_GET_DATA, 0, 0, 0, 0, 0, 0, + &res); + switch (res.a0) { + case HWRNG_RET_OK: + *buf++ = res.a2; + *buf++ = res.a3; + copied += 8; + tries = 0; + break; + case HWRNG_RET_RETRY_ERROR: + if (!wait) + return copied; + if (++tries >= HWRNG_MAX_TRIES) + return copied; + cond_resched(); + break; + default: + return -EIO; + } + } + + return copied; +} + +static int exynos_trng_init_reg(struct hwrng *rng) { struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; unsigned long sss_rate; @@ -87,7 +144,7 @@ static int exynos_trng_init(struct hwrng *rng) */ val = sss_rate / (EXYNOS_TRNG_CLOCK_RATE * 2); if (val > 0x7fff) { - dev_err(trng->dev, "clock divider too large: %d", val); + dev_err(trng->dev, "clock divider too large: %d\n", val); return -ERANGE; } val = val << 1; @@ -106,6 +163,24 @@ static int exynos_trng_init(struct hwrng *rng) return 0; } +static int exynos_trng_init_smc(struct hwrng *rng) +{ 
+ struct exynos_trng_dev *trng = (struct exynos_trng_dev *)rng->priv; + struct arm_smccc_res res; + int ret = 0; + + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_INIT, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != HWRNG_RET_OK) { + dev_err(trng->dev, "SMC command for TRNG init failed (%d)\n", + (int)res.a0); + ret = -EIO; + } + if ((int)res.a0 == -1) + dev_info(trng->dev, "Make sure LDFW is loaded by your BL\n"); + + return ret; +} + static int exynos_trng_probe(struct platform_device *pdev) { struct exynos_trng_dev *trng; @@ -115,21 +190,29 @@ static int exynos_trng_probe(struct platform_device *pdev) if (!trng) return ret; + platform_set_drvdata(pdev, trng); + trng->dev = &pdev->dev; + + trng->flags = (unsigned long)device_get_match_data(&pdev->dev); + trng->rng.name = devm_kstrdup(&pdev->dev, dev_name(&pdev->dev), GFP_KERNEL); if (!trng->rng.name) return ret; - trng->rng.init = exynos_trng_init; - trng->rng.read = exynos_trng_do_read; - trng->rng.priv = (unsigned long) trng; + trng->rng.priv = (unsigned long)trng; - platform_set_drvdata(pdev, trng); - trng->dev = &pdev->dev; + if (trng->flags & EXYNOS_SMC) { + trng->rng.init = exynos_trng_init_smc; + trng->rng.read = exynos_trng_do_read_smc; + } else { + trng->rng.init = exynos_trng_init_reg; + trng->rng.read = exynos_trng_do_read_reg; - trng->mem = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(trng->mem)) - return PTR_ERR(trng->mem); + trng->mem = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(trng->mem)) + return PTR_ERR(trng->mem); + } pm_runtime_enable(&pdev->dev); ret = pm_runtime_resume_and_get(&pdev->dev); @@ -138,32 +221,30 @@ static int exynos_trng_probe(struct platform_device *pdev) goto err_pm_get; } - trng->clk = devm_clk_get(&pdev->dev, "secss"); + trng->clk = devm_clk_get_enabled(&pdev->dev, "secss"); if (IS_ERR(trng->clk)) { - ret = PTR_ERR(trng->clk); - dev_err(&pdev->dev, "Could not get clock.\n"); + ret = dev_err_probe(&pdev->dev, PTR_ERR(trng->clk), + "Could not get clock\n"); goto err_clock; 
} - ret = clk_prepare_enable(trng->clk); - if (ret) { - dev_err(&pdev->dev, "Could not enable the clk.\n"); + trng->pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk"); + if (IS_ERR(trng->pclk)) { + ret = dev_err_probe(&pdev->dev, PTR_ERR(trng->pclk), + "Could not get pclk\n"); goto err_clock; } ret = devm_hwrng_register(&pdev->dev, &trng->rng); if (ret) { dev_err(&pdev->dev, "Could not register hwrng device.\n"); - goto err_register; + goto err_clock; } dev_info(&pdev->dev, "Exynos True Random Number Generator.\n"); return 0; -err_register: - clk_disable_unprepare(trng->clk); - err_clock: pm_runtime_put_noidle(&pdev->dev); @@ -175,9 +256,14 @@ err_pm_get: static void exynos_trng_remove(struct platform_device *pdev) { - struct exynos_trng_dev *trng = platform_get_drvdata(pdev); + struct exynos_trng_dev *trng = platform_get_drvdata(pdev); - clk_disable_unprepare(trng->clk); + if (trng->flags & EXYNOS_SMC) { + struct arm_smccc_res res; + + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_EXIT, 0, 0, 0, 0, 0, 0, + &res); + } pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); @@ -185,6 +271,16 @@ static void exynos_trng_remove(struct platform_device *pdev) static int exynos_trng_suspend(struct device *dev) { + struct exynos_trng_dev *trng = dev_get_drvdata(dev); + struct arm_smccc_res res; + + if (trng->flags & EXYNOS_SMC) { + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_EXIT, 0, 0, 0, 0, 0, 0, + &res); + if (res.a0 != HWRNG_RET_OK) + return -EIO; + } + pm_runtime_put_sync(dev); return 0; @@ -192,6 +288,7 @@ static int exynos_trng_suspend(struct device *dev) static int exynos_trng_resume(struct device *dev) { + struct exynos_trng_dev *trng = dev_get_drvdata(dev); int ret; ret = pm_runtime_resume_and_get(dev); @@ -200,15 +297,32 @@ static int exynos_trng_resume(struct device *dev) return ret; } + if (trng->flags & EXYNOS_SMC) { + struct arm_smccc_res res; + + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_RESUME, 0, 0, 0, 0, 0, 0, + &res); + if (res.a0 != HWRNG_RET_OK) + return -EIO; 
+ + arm_smccc_smc(SMC_CMD_RANDOM, HWRNG_INIT, 0, 0, 0, 0, 0, 0, + &res); + if (res.a0 != HWRNG_RET_OK) + return -EIO; + } + return 0; } static DEFINE_SIMPLE_DEV_PM_OPS(exynos_trng_pm_ops, exynos_trng_suspend, - exynos_trng_resume); + exynos_trng_resume); static const struct of_device_id exynos_trng_dt_match[] = { { .compatible = "samsung,exynos5250-trng", + }, { + .compatible = "samsung,exynos850-trng", + .data = (void *)EXYNOS_SMC, }, { }, }; @@ -225,6 +339,7 @@ static struct platform_driver exynos_trng_driver = { }; module_platform_driver(exynos_trng_driver); + MODULE_AUTHOR("Łukasz Stelmach"); MODULE_DESCRIPTION("H/W TRNG driver for Exynos chips"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/char/hw_random/omap-rng.c b/drivers/char/hw_random/omap-rng.c index d4c02e900466..4914a8720e58 100644 --- a/drivers/char/hw_random/omap-rng.c +++ b/drivers/char/hw_random/omap-rng.c @@ -564,4 +564,5 @@ static struct platform_driver omap_rng_driver = { module_platform_driver(omap_rng_driver); MODULE_ALIAS("platform:omap_rng"); MODULE_AUTHOR("Deepak Saxena (and others)"); +MODULE_DESCRIPTION("RNG driver for TI OMAP CPU family"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/omap3-rom-rng.c b/drivers/char/hw_random/omap3-rom-rng.c index 18dc46b1b58e..8064c792caf0 100644 --- a/drivers/char/hw_random/omap3-rom-rng.c +++ b/drivers/char/hw_random/omap3-rom-rng.c @@ -178,4 +178,5 @@ module_platform_driver(omap3_rom_rng_driver); MODULE_ALIAS("platform:omap3-rom-rng"); MODULE_AUTHOR("Juha Yrjola"); MODULE_AUTHOR("Pali Rohár <pali@kernel.org>"); +MODULE_DESCRIPTION("RNG driver for TI OMAP3 CPU family"); MODULE_LICENSE("GPL"); diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c index 0e903d6e22e3..9d041a67c295 100644 --- a/drivers/char/hw_random/stm32-rng.c +++ b/drivers/char/hw_random/stm32-rng.c @@ -70,6 +70,7 @@ struct stm32_rng_config { struct stm32_rng_private { struct hwrng rng; + struct device *dev; void __iomem *base; struct clk 
*clk; struct reset_control *rst; @@ -99,7 +100,7 @@ struct stm32_rng_private { */ static int stm32_rng_conceal_seed_error_cond_reset(struct stm32_rng_private *priv) { - struct device *dev = (struct device *)priv->rng.priv; + struct device *dev = priv->dev; u32 sr = readl_relaxed(priv->base + RNG_SR); u32 cr = readl_relaxed(priv->base + RNG_CR); int err; @@ -171,7 +172,7 @@ static int stm32_rng_conceal_seed_error(struct hwrng *rng) { struct stm32_rng_private *priv = container_of(rng, struct stm32_rng_private, rng); - dev_dbg((struct device *)priv->rng.priv, "Concealing seed error\n"); + dev_dbg(priv->dev, "Concealing seed error\n"); if (priv->data->has_cond_reset) return stm32_rng_conceal_seed_error_cond_reset(priv); @@ -187,7 +188,9 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) int retval = 0, err = 0; u32 sr; - pm_runtime_get_sync((struct device *) priv->rng.priv); + retval = pm_runtime_resume_and_get(priv->dev); + if (retval) + return retval; if (readl_relaxed(priv->base + RNG_SR) & RNG_SR_SEIS) stm32_rng_conceal_seed_error(rng); @@ -204,8 +207,7 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) sr, sr, 10, 50000); if (err) { - dev_err((struct device *)priv->rng.priv, - "%s: timeout %x!\n", __func__, sr); + dev_err(priv->dev, "%s: timeout %x!\n", __func__, sr); break; } } else if (!sr) { @@ -218,8 +220,7 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) err = stm32_rng_conceal_seed_error(rng); i++; if (err && i > RNG_NB_RECOVER_TRIES) { - dev_err((struct device *)priv->rng.priv, - "Couldn't recover from seed error\n"); + dev_err(priv->dev, "Couldn't recover from seed error\n"); retval = -ENOTRECOVERABLE; goto exit_rpm; } @@ -237,8 +238,7 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) err = stm32_rng_conceal_seed_error(rng); i++; if (err && i > RNG_NB_RECOVER_TRIES) { - dev_err((struct device *)priv->rng.priv, - "Couldn't 
recover from seed error"); + dev_err(priv->dev, "Couldn't recover from seed error"); retval = -ENOTRECOVERABLE; goto exit_rpm; } @@ -253,8 +253,8 @@ static int stm32_rng_read(struct hwrng *rng, void *data, size_t max, bool wait) } exit_rpm: - pm_runtime_mark_last_busy((struct device *) priv->rng.priv); - pm_runtime_put_sync_autosuspend((struct device *) priv->rng.priv); + pm_runtime_mark_last_busy(priv->dev); + pm_runtime_put_sync_autosuspend(priv->dev); return retval || !wait ? retval : -EIO; } @@ -329,8 +329,7 @@ static int stm32_rng_init(struct hwrng *rng) 10, 50000); if (err) { clk_disable_unprepare(priv->clk); - dev_err((struct device *)priv->rng.priv, - "%s: timeout %x!\n", __func__, reg); + dev_err(priv->dev, "%s: timeout %x!\n", __func__, reg); return -EINVAL; } } else { @@ -358,8 +357,7 @@ static int stm32_rng_init(struct hwrng *rng) 10, 100000); if (err || (reg & ~RNG_SR_DRDY)) { clk_disable_unprepare(priv->clk); - dev_err((struct device *)priv->rng.priv, - "%s: timeout:%x SR: %x!\n", __func__, err, reg); + dev_err(priv->dev, "%s: timeout:%x SR: %x!\n", __func__, err, reg); return -EINVAL; } @@ -465,8 +463,7 @@ static int __maybe_unused stm32_rng_resume(struct device *dev) if (err) { clk_disable_unprepare(priv->clk); - dev_err((struct device *)priv->rng.priv, - "%s: timeout:%x CR: %x!\n", __func__, err, reg); + dev_err(priv->dev, "%s: timeout:%x CR: %x!\n", __func__, err, reg); return -EINVAL; } } else { @@ -520,7 +517,7 @@ static int stm32_rng_probe(struct platform_device *ofdev) struct stm32_rng_private *priv; struct resource *res; - priv = devm_kzalloc(dev, sizeof(struct stm32_rng_private), GFP_KERNEL); + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -541,6 +538,7 @@ static int stm32_rng_probe(struct platform_device *ofdev) priv->ced = of_property_read_bool(np, "clock-error-detect"); priv->lock_conf = of_property_read_bool(np, "st,rng-lock-conf"); + priv->dev = dev; priv->data = of_device_get_match_data(dev); if 
(!priv->data) @@ -551,7 +549,6 @@ static int stm32_rng_probe(struct platform_device *ofdev) priv->rng.name = dev_driver_string(dev); priv->rng.init = stm32_rng_init; priv->rng.read = stm32_rng_read; - priv->rng.priv = (unsigned long) dev; priv->rng.quality = 900; pm_runtime_set_autosuspend_delay(dev, 100); diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c index de50c00ba218..19b7fb4a93e8 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-cipher.c @@ -190,7 +190,7 @@ static int sun8i_ce_cipher_prepare(struct crypto_engine *engine, void *async_req err = -EFAULT; goto theend; } - cet->t_key = cpu_to_le32(rctx->addr_key); + cet->t_key = desc_addr_val_le32(ce, rctx->addr_key); ivsize = crypto_skcipher_ivsize(tfm); if (areq->iv && crypto_skcipher_ivsize(tfm) > 0) { @@ -208,7 +208,7 @@ static int sun8i_ce_cipher_prepare(struct crypto_engine *engine, void *async_req err = -ENOMEM; goto theend_iv; } - cet->t_iv = cpu_to_le32(rctx->addr_iv); + cet->t_iv = desc_addr_val_le32(ce, rctx->addr_iv); } if (areq->src == areq->dst) { @@ -236,7 +236,7 @@ static int sun8i_ce_cipher_prepare(struct crypto_engine *engine, void *async_req len = areq->cryptlen; for_each_sg(areq->src, sg, nr_sgs, i) { - cet->t_src[i].addr = cpu_to_le32(sg_dma_address(sg)); + cet->t_src[i].addr = desc_addr_val_le32(ce, sg_dma_address(sg)); todo = min(len, sg_dma_len(sg)); cet->t_src[i].len = cpu_to_le32(todo / 4); dev_dbg(ce->dev, "%s total=%u SG(%d %u off=%d) todo=%u\n", __func__, @@ -251,7 +251,7 @@ static int sun8i_ce_cipher_prepare(struct crypto_engine *engine, void *async_req len = areq->cryptlen; for_each_sg(areq->dst, sg, nr_sgd, i) { - cet->t_dst[i].addr = cpu_to_le32(sg_dma_address(sg)); + cet->t_dst[i].addr = desc_addr_val_le32(ce, sg_dma_address(sg)); todo = min(len, sg_dma_len(sg)); cet->t_dst[i].len = cpu_to_le32(todo / 4); dev_dbg(ce->dev, "%s total=%u SG(%d %u 
off=%d) todo=%u\n", __func__, diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c index 0408b2d5d533..e55e58e164db 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c @@ -92,6 +92,30 @@ static const struct ce_variant ce_h6_variant = { .trng = CE_ALG_TRNG_V2, }; +static const struct ce_variant ce_h616_variant = { + .alg_cipher = { CE_ALG_AES, CE_ALG_DES, CE_ALG_3DES, + }, + .alg_hash = { CE_ALG_MD5, CE_ALG_SHA1, CE_ALG_SHA224, CE_ALG_SHA256, + CE_ALG_SHA384, CE_ALG_SHA512 + }, + .op_mode = { CE_OP_ECB, CE_OP_CBC + }, + .cipher_t_dlen_in_bytes = true, + .hash_t_dlen_in_bits = true, + .prng_t_dlen_in_bytes = true, + .trng_t_dlen_in_bytes = true, + .needs_word_addresses = true, + .ce_clks = { + { "bus", 0, 200000000 }, + { "mod", 300000000, 0 }, + { "ram", 0, 400000000 }, + { "trng", 0, 0 }, + }, + .esr = ESR_H6, + .prng = CE_ALG_PRNG_V2, + .trng = CE_ALG_TRNG_V2, +}; + static const struct ce_variant ce_a64_variant = { .alg_cipher = { CE_ALG_AES, CE_ALG_DES, CE_ALG_3DES, }, @@ -172,7 +196,7 @@ int sun8i_ce_run_task(struct sun8i_ce_dev *ce, int flow, const char *name) writel(v, ce->base + CE_ICR); reinit_completion(&ce->chanlist[flow].complete); - writel(ce->chanlist[flow].t_phy, ce->base + CE_TDQ); + writel(desc_addr_val(ce, ce->chanlist[flow].t_phy), ce->base + CE_TDQ); ce->chanlist[flow].status = 0; /* Be sure all data is written before enabling the task */ @@ -1097,6 +1121,8 @@ static const struct of_device_id sun8i_ce_crypto_of_match_table[] = { .data = &ce_h5_variant }, { .compatible = "allwinner,sun50i-h6-crypto", .data = &ce_h6_variant }, + { .compatible = "allwinner,sun50i-h616-crypto", + .data = &ce_h616_variant }, {} }; MODULE_DEVICE_TABLE(of, sun8i_ce_crypto_of_match_table); diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c index ee2a28c906ed..6072dd9f390b 100644 --- 
a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c @@ -403,7 +403,7 @@ int sun8i_ce_hash_run(struct crypto_engine *engine, void *breq) len = areq->nbytes; for_each_sg(areq->src, sg, nr_sgs, i) { - cet->t_src[i].addr = cpu_to_le32(sg_dma_address(sg)); + cet->t_src[i].addr = desc_addr_val_le32(ce, sg_dma_address(sg)); todo = min(len, sg_dma_len(sg)); cet->t_src[i].len = cpu_to_le32(todo / 4); len -= todo; @@ -414,7 +414,7 @@ int sun8i_ce_hash_run(struct crypto_engine *engine, void *breq) goto theend; } addr_res = dma_map_single(ce->dev, result, digestsize, DMA_FROM_DEVICE); - cet->t_dst[0].addr = cpu_to_le32(addr_res); + cet->t_dst[0].addr = desc_addr_val_le32(ce, addr_res); cet->t_dst[0].len = cpu_to_le32(digestsize / 4); if (dma_mapping_error(ce->dev, addr_res)) { dev_err(ce->dev, "DMA map dest\n"); @@ -445,7 +445,7 @@ int sun8i_ce_hash_run(struct crypto_engine *engine, void *breq) } addr_pad = dma_map_single(ce->dev, buf, j * 4, DMA_TO_DEVICE); - cet->t_src[i].addr = cpu_to_le32(addr_pad); + cet->t_src[i].addr = desc_addr_val_le32(ce, addr_pad); cet->t_src[i].len = cpu_to_le32(j); if (dma_mapping_error(ce->dev, addr_pad)) { dev_err(ce->dev, "DMA error on padding SG\n"); diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c index 80815379f6fc..762459867b6c 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-prng.c @@ -132,10 +132,10 @@ int sun8i_ce_prng_generate(struct crypto_rng *tfm, const u8 *src, cet->t_sym_ctl = cpu_to_le32(sym); cet->t_asym_ctl = 0; - cet->t_key = cpu_to_le32(dma_iv); - cet->t_iv = cpu_to_le32(dma_iv); + cet->t_key = desc_addr_val_le32(ce, dma_iv); + cet->t_iv = desc_addr_val_le32(ce, dma_iv); - cet->t_dst[0].addr = cpu_to_le32(dma_dst); + cet->t_dst[0].addr = desc_addr_val_le32(ce, dma_dst); cet->t_dst[0].len = cpu_to_le32(todo / 4); ce->chanlist[flow].timeout = 
2000; diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c index 9c35f2a83eda..e1e8bc15202e 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-trng.c @@ -77,7 +77,7 @@ static int sun8i_ce_trng_read(struct hwrng *rng, void *data, size_t max, bool wa cet->t_sym_ctl = 0; cet->t_asym_ctl = 0; - cet->t_dst[0].addr = cpu_to_le32(dma_dst); + cet->t_dst[0].addr = desc_addr_val_le32(ce, dma_dst); cet->t_dst[0].len = cpu_to_le32(todo / 4); ce->chanlist[flow].timeout = todo; diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h index 93d4985def87..3b5c2af013d0 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h @@ -149,6 +149,7 @@ struct ce_variant { bool hash_t_dlen_in_bits; bool prng_t_dlen_in_bytes; bool trng_t_dlen_in_bytes; + bool needs_word_addresses; struct ce_clock ce_clks[CE_MAX_CLOCKS]; int esr; unsigned char prng; @@ -241,6 +242,20 @@ struct sun8i_ce_dev { #endif }; +static inline u32 desc_addr_val(struct sun8i_ce_dev *dev, dma_addr_t addr) +{ + if (dev->variant->needs_word_addresses) + return addr / 4; + + return addr; +} + +static inline __le32 desc_addr_val_le32(struct sun8i_ce_dev *dev, + dma_addr_t addr) +{ + return cpu_to_le32(desc_addr_val(dev, addr)); +} + /* * struct sun8i_cipher_req_ctx - context for a skcipher request * @op_dir: direction (encrypt vs decrypt) for this request diff --git a/drivers/crypto/atmel-sha204a.c b/drivers/crypto/atmel-sha204a.c index 24ffdf505023..a02d496f4c41 100644 --- a/drivers/crypto/atmel-sha204a.c +++ b/drivers/crypto/atmel-sha204a.c @@ -106,7 +106,7 @@ static int atmel_sha204a_otp_read(struct i2c_client *client, u16 addr, u8 *otp) if (cmd.data[0] == 0xff) { dev_err(&client->dev, "failed, device not ready\n"); - return -ret; + return -EINVAL; } memcpy(otp, cmd.data+1, 4); @@ -232,4 +232,5 @@ 
module_init(atmel_sha204a_init); module_exit(atmel_sha204a_exit); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_DESCRIPTION("Microchip / Atmel SHA204A (I2C) driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index dbc1d483f2af..75440ea6206e 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -2811,13 +2811,6 @@ static struct aead_alg aead_algos[] = { #ifdef CONFIG_DEBUG_FS -struct dbgfs_u32 { - char *name; - mode_t mode; - u32 *flag; - char *desc; -}; - static struct dentry *dbgfs_root; static void artpec6_crypto_init_debugfs(void) diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index aa0ba2d17e1e..394484929dae 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -12,7 +12,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ sev-dev.o \ tee-dev.o \ platform-access.o \ - dbc.o + dbc.o \ + hsti.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/dbc.c b/drivers/crypto/ccp/dbc.c index d373caab52f8..5b105a23f699 100644 --- a/drivers/crypto/ccp/dbc.c +++ b/drivers/crypto/ccp/dbc.c @@ -223,7 +223,7 @@ int dbc_dev_init(struct psp_device *psp) dbc_dev->dev = dev; dbc_dev->psp = psp; - if (PSP_CAPABILITY(psp, DBC_THRU_EXT)) { + if (psp->capability.dbc_thru_ext) { dbc_dev->use_ext = true; dbc_dev->payload_size = &dbc_dev->mbox->ext_req.header.payload_size; dbc_dev->result = &dbc_dev->mbox->ext_req.header.status; diff --git a/drivers/crypto/ccp/hsti.c b/drivers/crypto/ccp/hsti.c new file mode 100644 index 000000000000..1b39a4fb55c0 --- /dev/null +++ b/drivers/crypto/ccp/hsti.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Processor device driver, security attributes + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
+ * + * Author: Mario Limonciello <mario.limonciello@amd.com> + */ + +#include <linux/device.h> + +#include "psp-dev.h" +#include "hsti.h" + +#define PSP_CAPABILITY_PSP_SECURITY_OFFSET 8 + +struct hsti_request { + struct psp_req_buffer_hdr header; + u32 hsti; +} __packed; + +#define security_attribute_show(name) \ +static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ + char *buf) \ +{ \ + struct sp_device *sp = dev_get_drvdata(d); \ + struct psp_device *psp = sp->psp_data; \ + return sysfs_emit(buf, "%d\n", psp->capability.name); \ +} + +security_attribute_show(fused_part) +static DEVICE_ATTR_RO(fused_part); +security_attribute_show(debug_lock_on) +static DEVICE_ATTR_RO(debug_lock_on); +security_attribute_show(tsme_status) +static DEVICE_ATTR_RO(tsme_status); +security_attribute_show(anti_rollback_status) +static DEVICE_ATTR_RO(anti_rollback_status); +security_attribute_show(rpmc_production_enabled) +static DEVICE_ATTR_RO(rpmc_production_enabled); +security_attribute_show(rpmc_spirom_available) +static DEVICE_ATTR_RO(rpmc_spirom_available); +security_attribute_show(hsp_tpm_available) +static DEVICE_ATTR_RO(hsp_tpm_available); +security_attribute_show(rom_armor_enforced) +static DEVICE_ATTR_RO(rom_armor_enforced); + +static struct attribute *psp_security_attrs[] = { + &dev_attr_fused_part.attr, + &dev_attr_debug_lock_on.attr, + &dev_attr_tsme_status.attr, + &dev_attr_anti_rollback_status.attr, + &dev_attr_rpmc_production_enabled.attr, + &dev_attr_rpmc_spirom_available.attr, + &dev_attr_hsp_tpm_available.attr, + &dev_attr_rom_armor_enforced.attr, + NULL +}; + +static umode_t psp_security_is_visible(struct kobject *kobj, struct attribute *attr, int idx) +{ + struct device *dev = kobj_to_dev(kobj); + struct sp_device *sp = dev_get_drvdata(dev); + struct psp_device *psp = sp->psp_data; + + if (psp && psp->capability.security_reporting) + return 0444; + + return 0; +} + +struct attribute_group psp_security_attr_group = { + .attrs = 
psp_security_attrs, + .is_visible = psp_security_is_visible, +}; + +static int psp_poulate_hsti(struct psp_device *psp) +{ + struct hsti_request *req; + int ret; + + /* Are the security attributes already reported? */ + if (psp->capability.security_reporting) + return 0; + + /* Allocate command-response buffer */ + req = kzalloc(sizeof(*req), GFP_KERNEL | __GFP_ZERO); + if (!req) + return -ENOMEM; + + req->header.payload_size = sizeof(req); + + ret = psp_send_platform_access_msg(PSP_CMD_HSTI_QUERY, (struct psp_request *)req); + if (ret) + goto out; + + if (req->header.status != 0) { + dev_dbg(psp->dev, "failed to populate HSTI state: %d\n", req->header.status); + ret = -EINVAL; + goto out; + } + + psp->capability.security_reporting = 1; + psp->capability.raw |= req->hsti << PSP_CAPABILITY_PSP_SECURITY_OFFSET; + +out: + kfree(req); + + return ret; +} + +int psp_init_hsti(struct psp_device *psp) +{ + int ret; + + if (PSP_FEATURE(psp, HSTI)) { + ret = psp_poulate_hsti(psp); + if (ret) + return ret; + } + + /* + * At this stage, if security information hasn't been populated by + * either the PSP or by the driver through the platform command, + * then there is nothing more to do. + */ + if (!psp->capability.security_reporting) + return 0; + + if (psp->capability.tsme_status) { + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + dev_notice(psp->dev, "psp: Both TSME and SME are active, SME is unnecessary when TSME is active.\n"); + else + dev_notice(psp->dev, "psp: TSME enabled\n"); + } + + return 0; +} diff --git a/drivers/crypto/ccp/hsti.h b/drivers/crypto/ccp/hsti.h new file mode 100644 index 000000000000..6a70f922d2c4 --- /dev/null +++ b/drivers/crypto/ccp/hsti.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Secure Processor device driver, security attributes + * + * Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
+ * + * Author: Mario Limonciello <mario.limonciello@amd.com> + */ + +#ifndef __HSTI_H +#define __HSTI_H + +extern struct attribute_group psp_security_attr_group; + +int psp_init_hsti(struct psp_device *psp); + +#endif /* __HSTI_H */ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 56bf832c2947..1c5a7189631e 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -19,6 +19,7 @@ #include "tee-dev.h" #include "platform-access.h" #include "dbc.h" +#include "hsti.h" struct psp_device *psp_master; @@ -154,16 +155,7 @@ static unsigned int psp_get_capability(struct psp_device *psp) dev_notice(psp->dev, "psp: unable to access the device: you might be running a broken BIOS.\n"); return -ENODEV; } - psp->capability = val; - - /* Detect TSME and/or SME status */ - if (PSP_CAPABILITY(psp, PSP_SECURITY_REPORTING) && - psp->capability & (PSP_SECURITY_TSME_STATUS << PSP_CAPABILITY_PSP_SECURITY_OFFSET)) { - if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) - dev_notice(psp->dev, "psp: Both TSME and SME are active, SME is unnecessary when TSME is active.\n"); - else - dev_notice(psp->dev, "psp: TSME enabled\n"); - } + psp->capability.raw = val; return 0; } @@ -171,7 +163,7 @@ static unsigned int psp_get_capability(struct psp_device *psp) static int psp_check_sev_support(struct psp_device *psp) { /* Check if device supports SEV feature */ - if (!PSP_CAPABILITY(psp, SEV)) { + if (!psp->capability.sev) { dev_dbg(psp->dev, "psp does not support SEV\n"); return -ENODEV; } @@ -182,7 +174,7 @@ static int psp_check_sev_support(struct psp_device *psp) static int psp_check_tee_support(struct psp_device *psp) { /* Check if device supports TEE feature */ - if (!PSP_CAPABILITY(psp, TEE)) { + if (!psp->capability.tee) { dev_dbg(psp->dev, "psp does not support TEE\n"); return -ENODEV; } @@ -214,12 +206,17 @@ static int psp_init(struct psp_device *psp) /* dbc must come after platform access as it tests the feature */ if (PSP_FEATURE(psp, DBC) || - 
PSP_CAPABILITY(psp, DBC_THRU_EXT)) { + psp->capability.dbc_thru_ext) { ret = dbc_dev_init(psp); if (ret) return ret; } + /* HSTI uses platform access on some systems. */ + ret = psp_init_hsti(psp); + if (ret) + return ret; + return 0; } diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index ae582ba63729..e43ce87ede76 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -26,6 +26,29 @@ extern struct psp_device *psp_master; typedef void (*psp_irq_handler_t)(int, void *, unsigned int); +union psp_cap_register { + unsigned int raw; + struct { + unsigned int sev :1, + tee :1, + dbc_thru_ext :1, + rsvd1 :4, + security_reporting :1, + fused_part :1, + rsvd2 :1, + debug_lock_on :1, + rsvd3 :2, + tsme_status :1, + rsvd4 :1, + anti_rollback_status :1, + rpmc_production_enabled :1, + rpmc_spirom_available :1, + hsp_tpm_available :1, + rom_armor_enforced :1, + rsvd5 :12; + }; +}; + struct psp_device { struct list_head entry; @@ -46,7 +69,7 @@ struct psp_device { void *platform_access_data; void *dbc_data; - unsigned int capability; + union psp_cap_register capability; }; void psp_set_sev_irq_handler(struct psp_device *psp, psp_irq_handler_t handler, @@ -55,27 +78,6 @@ void psp_clear_sev_irq_handler(struct psp_device *psp); struct psp_device *psp_get_master_device(void); -#define PSP_CAPABILITY_SEV BIT(0) -#define PSP_CAPABILITY_TEE BIT(1) -#define PSP_CAPABILITY_DBC_THRU_EXT BIT(2) -#define PSP_CAPABILITY_PSP_SECURITY_REPORTING BIT(7) - -#define PSP_CAPABILITY_PSP_SECURITY_OFFSET 8 -/* - * The PSP doesn't directly store these bits in the capability register - * but instead copies them from the results of query command. - * - * The offsets from the query command are below, and shifted when used. 
- */ -#define PSP_SECURITY_FUSED_PART BIT(0) -#define PSP_SECURITY_DEBUG_LOCK_ON BIT(2) -#define PSP_SECURITY_TSME_STATUS BIT(5) -#define PSP_SECURITY_ANTI_ROLLBACK_STATUS BIT(7) -#define PSP_SECURITY_RPMC_PRODUCTION_ENABLED BIT(8) -#define PSP_SECURITY_RPMC_SPIROM_AVAILABLE BIT(9) -#define PSP_SECURITY_HSP_TPM_AVAILABLE BIT(10) -#define PSP_SECURITY_ROM_ARMOR_ENFORCED BIT(11) - /** * enum psp_cmd - PSP mailbox commands * @PSP_CMD_TEE_RING_INIT: Initialize TEE ring buffer diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 2102377f727b..1912bee22dd4 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1642,10 +1642,16 @@ fw_err: static int __sev_snp_shutdown_locked(int *error, bool panic) { - struct sev_device *sev = psp_master->sev_data; + struct psp_device *psp = psp_master; + struct sev_device *sev; struct sev_data_snp_shutdown_ex data; int ret; + if (!psp || !psp->sev_data) + return 0; + + sev = psp->sev_data; + if (!sev->snp_initialized) return 0; diff --git a/drivers/crypto/ccp/sp-dev.h b/drivers/crypto/ccp/sp-dev.h index 03d5b9e04084..0895de823674 100644 --- a/drivers/crypto/ccp/sp-dev.h +++ b/drivers/crypto/ccp/sp-dev.h @@ -29,8 +29,8 @@ #define CACHE_WB_NO_ALLOC 0xb7 #define PLATFORM_FEATURE_DBC 0x1 +#define PLATFORM_FEATURE_HSTI 0x2 -#define PSP_CAPABILITY(psp, cap) (psp->capability & PSP_CAPABILITY_##cap) #define PSP_FEATURE(psp, feat) (psp->vdata && psp->vdata->platform_features & PLATFORM_FEATURE_##feat) /* Structure to hold CCP device data */ diff --git a/drivers/crypto/ccp/sp-pci.c b/drivers/crypto/ccp/sp-pci.c index 300dda14182b..248d98fd8c48 100644 --- a/drivers/crypto/ccp/sp-pci.c +++ b/drivers/crypto/ccp/sp-pci.c @@ -24,6 +24,7 @@ #include "ccp-dev.h" #include "psp-dev.h" +#include "hsti.h" /* used for version string AA.BB.CC.DD */ #define AA GENMASK(31, 24) @@ -39,62 +40,6 @@ struct sp_pci { }; static struct sp_device *sp_dev_master; -#define security_attribute_show(name, def) \ -static 
ssize_t name##_show(struct device *d, struct device_attribute *attr, \ - char *buf) \ -{ \ - struct sp_device *sp = dev_get_drvdata(d); \ - struct psp_device *psp = sp->psp_data; \ - int bit = PSP_SECURITY_##def << PSP_CAPABILITY_PSP_SECURITY_OFFSET; \ - return sysfs_emit(buf, "%d\n", (psp->capability & bit) > 0); \ -} - -security_attribute_show(fused_part, FUSED_PART) -static DEVICE_ATTR_RO(fused_part); -security_attribute_show(debug_lock_on, DEBUG_LOCK_ON) -static DEVICE_ATTR_RO(debug_lock_on); -security_attribute_show(tsme_status, TSME_STATUS) -static DEVICE_ATTR_RO(tsme_status); -security_attribute_show(anti_rollback_status, ANTI_ROLLBACK_STATUS) -static DEVICE_ATTR_RO(anti_rollback_status); -security_attribute_show(rpmc_production_enabled, RPMC_PRODUCTION_ENABLED) -static DEVICE_ATTR_RO(rpmc_production_enabled); -security_attribute_show(rpmc_spirom_available, RPMC_SPIROM_AVAILABLE) -static DEVICE_ATTR_RO(rpmc_spirom_available); -security_attribute_show(hsp_tpm_available, HSP_TPM_AVAILABLE) -static DEVICE_ATTR_RO(hsp_tpm_available); -security_attribute_show(rom_armor_enforced, ROM_ARMOR_ENFORCED) -static DEVICE_ATTR_RO(rom_armor_enforced); - -static struct attribute *psp_security_attrs[] = { - &dev_attr_fused_part.attr, - &dev_attr_debug_lock_on.attr, - &dev_attr_tsme_status.attr, - &dev_attr_anti_rollback_status.attr, - &dev_attr_rpmc_production_enabled.attr, - &dev_attr_rpmc_spirom_available.attr, - &dev_attr_hsp_tpm_available.attr, - &dev_attr_rom_armor_enforced.attr, - NULL -}; - -static umode_t psp_security_is_visible(struct kobject *kobj, struct attribute *attr, int idx) -{ - struct device *dev = kobj_to_dev(kobj); - struct sp_device *sp = dev_get_drvdata(dev); - struct psp_device *psp = sp->psp_data; - - if (psp && PSP_CAPABILITY(psp, PSP_SECURITY_REPORTING)) - return 0444; - - return 0; -} - -static struct attribute_group psp_security_attr_group = { - .attrs = psp_security_attrs, - .is_visible = psp_security_is_visible, -}; - #define 
version_attribute_show(name, _offset) \ static ssize_t name##_show(struct device *d, struct device_attribute *attr, \ char *buf) \ @@ -134,8 +79,7 @@ static umode_t psp_firmware_is_visible(struct kobject *kobj, struct attribute *a psp->vdata->bootloader_info_reg) val = ioread32(psp->io_regs + psp->vdata->bootloader_info_reg); - if (attr == &dev_attr_tee_version.attr && - PSP_CAPABILITY(psp, TEE) && + if (attr == &dev_attr_tee_version.attr && psp->capability.tee && psp->vdata->tee->info_reg) val = ioread32(psp->io_regs + psp->vdata->tee->info_reg); @@ -152,7 +96,9 @@ static struct attribute_group psp_firmware_attr_group = { }; static const struct attribute_group *psp_groups[] = { +#ifdef CONFIG_CRYPTO_DEV_SP_PSP &psp_security_attr_group, +#endif &psp_firmware_attr_group, NULL, }; @@ -451,10 +397,12 @@ static const struct psp_vdata pspv1 = { static const struct psp_vdata pspv2 = { .sev = &sevv2, + .platform_access = &pa_v1, .bootloader_info_reg = 0x109ec, /* C2PMSG_59 */ .feature_reg = 0x109fc, /* C2PMSG_63 */ .inten_reg = 0x10690, /* P2CMSG_INTEN */ .intsts_reg = 0x10694, /* P2CMSG_INTSTS */ + .platform_features = PLATFORM_FEATURE_HSTI, }; static const struct psp_vdata pspv3 = { @@ -467,7 +415,8 @@ static const struct psp_vdata pspv3 = { .feature_reg = 0x109fc, /* C2PMSG_63 */ .inten_reg = 0x10690, /* P2CMSG_INTEN */ .intsts_reg = 0x10694, /* P2CMSG_INTSTS */ - .platform_features = PLATFORM_FEATURE_DBC, + .platform_features = PLATFORM_FEATURE_DBC | + PLATFORM_FEATURE_HSTI, }; static const struct psp_vdata pspv4 = { diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c index cd66a580e8b6..3fb667a17bbb 100644 --- a/drivers/crypto/ccree/cc_cipher.c +++ b/drivers/crypto/ccree/cc_cipher.c @@ -261,12 +261,6 @@ static void cc_cipher_exit(struct crypto_tfm *tfm) kfree_sensitive(ctx_p->user.key); } -struct tdes_keys { - u8 key1[DES_KEY_SIZE]; - u8 key2[DES_KEY_SIZE]; - u8 key3[DES_KEY_SIZE]; -}; - static enum cc_hw_crypto_key cc_slot_to_hw_key(u8 
slot_num) { switch (slot_num) { diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index b4a4ec35bce0..925991526745 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -495,16 +495,6 @@ struct hifn_crypt_command { #define HIFN_CRYPT_CMD_SRCLEN_M 0xc000 #define HIFN_CRYPT_CMD_SRCLEN_S 14 -/* - * Structure to help build up the command data structure. - */ -struct hifn_mac_command { - volatile __le16 masks; - volatile __le16 header_skip; - volatile __le16 source_count; - volatile __le16 reserved; -}; - #define HIFN_MAC_CMD_ALG_MASK 0x0001 #define HIFN_MAC_CMD_ALG_SHA1 0x0000 #define HIFN_MAC_CMD_ALG_MD5 0x0001 @@ -526,13 +516,6 @@ struct hifn_mac_command { #define HIFN_MAC_CMD_POS_IPSEC 0x0200 #define HIFN_MAC_CMD_NEW_KEY 0x0800 -struct hifn_comp_command { - volatile __le16 masks; - volatile __le16 header_skip; - volatile __le16 source_count; - volatile __le16 reserved; -}; - #define HIFN_COMP_CMD_SRCLEN_M 0xc000 #define HIFN_COMP_CMD_SRCLEN_S 14 #define HIFN_COMP_CMD_ONE 0x0100 /* must be one */ diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 3dac8d8e8568..f614fd228b56 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3793,14 +3793,13 @@ int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs) goto err_put_sync; } - qm->vfs_num = num_vfs; - ret = pci_enable_sriov(pdev, num_vfs); if (ret) { pci_err(pdev, "Can't enable VF!\n"); qm_clear_vft_config(qm); goto err_put_sync; } + qm->vfs_num = num_vfs; pci_info(pdev, "VF enabled, vfs_num(=%d)!\n", num_vfs); @@ -3822,7 +3821,6 @@ EXPORT_SYMBOL_GPL(hisi_qm_sriov_enable); int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen) { struct hisi_qm *qm = pci_get_drvdata(pdev); - int ret; if (pci_vfs_assigned(pdev)) { pci_err(pdev, "Failed to disable VFs as VFs are assigned!\n"); @@ -3837,13 +3835,10 @@ int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen) pci_disable_sriov(pdev); - ret = 
qm_clear_vft_config(qm); - if (ret) - return ret; - + qm->vfs_num = 0; qm_pm_put_sync(qm); - return 0; + return qm_clear_vft_config(qm); } EXPORT_SYMBOL_GPL(hisi_qm_sriov_disable); diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index c94a7b20d07e..7c2d803886fd 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -37,7 +37,7 @@ #define HZIP_QM_IDEL_STATUS 0x3040e4 #define HZIP_CORE_DFX_BASE 0x301000 -#define HZIP_CLOCK_GATED_CONTL 0X301004 +#define HZIP_CORE_DFX_DECOMP_BASE 0x304000 #define HZIP_CORE_DFX_COMP_0 0x302000 #define HZIP_CORE_DFX_COMP_1 0x303000 #define HZIP_CORE_DFX_DECOMP_0 0x304000 @@ -48,6 +48,7 @@ #define HZIP_CORE_DFX_DECOMP_5 0x309000 #define HZIP_CORE_REGS_BASE_LEN 0xB0 #define HZIP_CORE_REGS_DFX_LEN 0x28 +#define HZIP_CORE_ADDR_INTRVL 0x1000 #define HZIP_CORE_INT_SOURCE 0x3010A0 #define HZIP_CORE_INT_MASK_REG 0x3010A4 @@ -269,28 +270,6 @@ static const u32 zip_pre_store_caps[] = { ZIP_DEV_ALG_BITMAP, }; -enum { - HZIP_COMP_CORE0, - HZIP_COMP_CORE1, - HZIP_DECOMP_CORE0, - HZIP_DECOMP_CORE1, - HZIP_DECOMP_CORE2, - HZIP_DECOMP_CORE3, - HZIP_DECOMP_CORE4, - HZIP_DECOMP_CORE5, -}; - -static const u64 core_offsets[] = { - [HZIP_COMP_CORE0] = 0x302000, - [HZIP_COMP_CORE1] = 0x303000, - [HZIP_DECOMP_CORE0] = 0x304000, - [HZIP_DECOMP_CORE1] = 0x305000, - [HZIP_DECOMP_CORE2] = 0x306000, - [HZIP_DECOMP_CORE3] = 0x307000, - [HZIP_DECOMP_CORE4] = 0x308000, - [HZIP_DECOMP_CORE5] = 0x309000, -}; - static const struct debugfs_reg32 hzip_dfx_regs[] = { {"HZIP_GET_BD_NUM ", 0x00}, {"HZIP_GET_RIGHT_BD ", 0x04}, @@ -807,6 +786,18 @@ static int hisi_zip_regs_show(struct seq_file *s, void *unused) DEFINE_SHOW_ATTRIBUTE(hisi_zip_regs); +static void __iomem *get_zip_core_addr(struct hisi_qm *qm, int core_num) +{ + u32 zip_comp_core_num = qm->cap_tables.dev_cap_table[ZIP_CLUSTER_COMP_NUM_CAP_IDX].cap_val; + + if (core_num < zip_comp_core_num) + return qm->io_base + 
HZIP_CORE_DFX_BASE + + (core_num + 1) * HZIP_CORE_ADDR_INTRVL; + + return qm->io_base + HZIP_CORE_DFX_DECOMP_BASE + + (core_num - zip_comp_core_num) * HZIP_CORE_ADDR_INTRVL; +} + static int hisi_zip_core_debug_init(struct hisi_qm *qm) { u32 zip_core_num, zip_comp_core_num; @@ -832,7 +823,7 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm) regset->regs = hzip_dfx_regs; regset->nregs = ARRAY_SIZE(hzip_dfx_regs); - regset->base = qm->io_base + core_offsets[i]; + regset->base = get_zip_core_addr(qm, i); regset->dev = dev; tmp_d = debugfs_create_dir(buf, qm->debug.debug_root); @@ -921,13 +912,14 @@ debugfs_remove: /* hisi_zip_debug_regs_clear() - clear the zip debug regs */ static void hisi_zip_debug_regs_clear(struct hisi_qm *qm) { + u32 zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val; int i, j; /* enable register read_clear bit */ writel(HZIP_RD_CNT_CLR_CE_EN, qm->io_base + HZIP_SOFT_CTRL_CNT_CLR_CE); - for (i = 0; i < ARRAY_SIZE(core_offsets); i++) + for (i = 0; i < zip_core_num; i++) for (j = 0; j < ARRAY_SIZE(hzip_dfx_regs); j++) - readl(qm->io_base + core_offsets[i] + + readl(get_zip_core_addr(qm, i) + hzip_dfx_regs[j].offset); /* disable register read_clear bit */ @@ -970,7 +962,7 @@ static int hisi_zip_show_last_regs_init(struct hisi_qm *qm) } for (i = 0; i < zip_core_num; i++) { - io_base = qm->io_base + core_offsets[i]; + io_base = get_zip_core_addr(qm, i); for (j = 0; j < core_dfx_regs_num; j++) { idx = com_dfx_regs_num + i * core_dfx_regs_num + j; debug->last_words[idx] = readl_relaxed( @@ -1022,7 +1014,7 @@ static void hisi_zip_show_last_dfx_regs(struct hisi_qm *qm) else scnprintf(buf, sizeof(buf), "Decomp_core-%d", i - zip_comp_core_num); - base = qm->io_base + core_offsets[i]; + base = get_zip_core_addr(qm, i); pci_info(qm->pdev, "==>%s:\n", buf); /* dump last word for dfx regs during control resetting */ diff --git a/drivers/crypto/intel/keembay/ocs-hcu.c b/drivers/crypto/intel/keembay/ocs-hcu.c index 
deb9bd460ee6..55a41e6ab103 100644 --- a/drivers/crypto/intel/keembay/ocs-hcu.c +++ b/drivers/crypto/intel/keembay/ocs-hcu.c @@ -837,4 +837,5 @@ complete: return IRQ_HANDLED; } +MODULE_DESCRIPTION("Intel Keem Bay OCS HCU Crypto Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/crypto/intel/qat/qat_common/adf_cfg.c b/drivers/crypto/intel/qat/qat_common/adf_cfg.c index 8836f015c39c..2cf102ad4ca8 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_cfg.c +++ b/drivers/crypto/intel/qat/qat_common/adf_cfg.c @@ -290,17 +290,19 @@ int adf_cfg_add_key_value_param(struct adf_accel_dev *accel_dev, * 3. if the key exists with the same value, then return without doing * anything (the newly created key_val is freed). */ + down_write(&cfg->lock); if (!adf_cfg_key_val_get(accel_dev, section_name, key, temp_val)) { if (strncmp(temp_val, key_val->val, sizeof(temp_val))) { adf_cfg_keyval_remove(key, section); } else { kfree(key_val); - return 0; + goto out; } } - down_write(&cfg->lock); adf_cfg_keyval_add(key_val, section); + +out: up_write(&cfg->lock); return 0; } diff --git a/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c b/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c index 29c4422f243c..26a1662fafbb 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c +++ b/drivers/crypto/intel/qat/qat_common/adf_ctl_drv.c @@ -31,19 +31,22 @@ static const struct file_operations adf_ctl_ops = { .compat_ioctl = compat_ptr_ioctl, }; +static const struct class adf_ctl_class = { + .name = DEVICE_NAME, +}; + struct adf_ctl_drv_info { unsigned int major; struct cdev drv_cdev; - struct class *drv_class; }; static struct adf_ctl_drv_info adf_ctl_drv; static void adf_chr_drv_destroy(void) { - device_destroy(adf_ctl_drv.drv_class, MKDEV(adf_ctl_drv.major, 0)); + device_destroy(&adf_ctl_class, MKDEV(adf_ctl_drv.major, 0)); cdev_del(&adf_ctl_drv.drv_cdev); - class_destroy(adf_ctl_drv.drv_class); + class_unregister(&adf_ctl_class); unregister_chrdev_region(MKDEV(adf_ctl_drv.major, 0), 1); 
} @@ -51,17 +54,17 @@ static int adf_chr_drv_create(void) { dev_t dev_id; struct device *drv_device; + int ret; if (alloc_chrdev_region(&dev_id, 0, 1, DEVICE_NAME)) { pr_err("QAT: unable to allocate chrdev region\n"); return -EFAULT; } - adf_ctl_drv.drv_class = class_create(DEVICE_NAME); - if (IS_ERR(adf_ctl_drv.drv_class)) { - pr_err("QAT: class_create failed for adf_ctl\n"); + ret = class_register(&adf_ctl_class); + if (ret) goto err_chrdev_unreg; - } + adf_ctl_drv.major = MAJOR(dev_id); cdev_init(&adf_ctl_drv.drv_cdev, &adf_ctl_ops); if (cdev_add(&adf_ctl_drv.drv_cdev, dev_id, 1)) { @@ -69,7 +72,7 @@ static int adf_chr_drv_create(void) goto err_class_destr; } - drv_device = device_create(adf_ctl_drv.drv_class, NULL, + drv_device = device_create(&adf_ctl_class, NULL, MKDEV(adf_ctl_drv.major, 0), NULL, DEVICE_NAME); if (IS_ERR(drv_device)) { @@ -80,7 +83,7 @@ static int adf_chr_drv_create(void) err_cdev_del: cdev_del(&adf_ctl_drv.drv_cdev); err_class_destr: - class_destroy(adf_ctl_drv.drv_class); + class_unregister(&adf_ctl_class); err_chrdev_unreg: unregister_chrdev_region(dev_id, 1); return -EFAULT; diff --git a/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c b/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c index f07b748795f7..96ddd1c419c4 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c +++ b/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c @@ -59,7 +59,7 @@ static int adf_get_vf_real_id(u32 fake) } /** - * adf_clean_vf_map() - Cleans VF id mapings + * adf_clean_vf_map() - Cleans VF id mappings * @vf: flag indicating whether mappings is cleaned * for vfs only or for vfs and pfs * diff --git a/drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c b/drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c index 70ef11963938..43af81fcab86 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c +++ b/drivers/crypto/intel/qat/qat_common/adf_gen2_pfvf.c @@ -100,7 +100,9 @@ static u32 adf_gen2_disable_pending_vf2pf_interrupts(void __iomem 
*pmisc_addr) errmsk3 |= ADF_GEN2_ERR_MSK_VF2PF(ADF_GEN2_VF_MSK); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); - errmsk3 &= ADF_GEN2_ERR_MSK_VF2PF(sources | disabled); + /* Update only section of errmsk3 related to VF2PF */ + errmsk3 &= ~ADF_GEN2_ERR_MSK_VF2PF(ADF_GEN2_VF_MSK); + errmsk3 |= ADF_GEN2_ERR_MSK_VF2PF(sources | disabled); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); /* Return the sources of the (new) interrupt(s) */ diff --git a/drivers/crypto/intel/qat/qat_common/adf_rl.c b/drivers/crypto/intel/qat/qat_common/adf_rl.c index 346ef8bee99d..e782c23fc1bf 100644 --- a/drivers/crypto/intel/qat/qat_common/adf_rl.c +++ b/drivers/crypto/intel/qat/qat_common/adf_rl.c @@ -1106,6 +1106,7 @@ int adf_rl_init(struct adf_accel_dev *accel_dev) mutex_init(&rl->rl_lock); rl->device_data = &accel_dev->hw_device->rl_data; rl->accel_dev = accel_dev; + init_rwsem(&rl->user_input.lock); accel_dev->rate_limiting = rl; err_ret: diff --git a/drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c b/drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c index 6e24d57e6b98..c0661ff5e929 100644 --- a/drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c +++ b/drivers/crypto/intel/qat/qat_dh895xcc/adf_dh895xcc_hw_data.c @@ -193,8 +193,12 @@ static u32 disable_pending_vf2pf_interrupts(void __iomem *pmisc_addr) ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK5, errmsk5); - errmsk3 &= ADF_DH895XCC_ERR_MSK_VF2PF_L(sources | disabled); - errmsk5 &= ADF_DH895XCC_ERR_MSK_VF2PF_U(sources | disabled); + /* Update only section of errmsk3 and errmsk5 related to VF2PF */ + errmsk3 &= ~ADF_DH895XCC_ERR_MSK_VF2PF_L(ADF_DH895XCC_VF_MSK); + errmsk5 &= ~ADF_DH895XCC_ERR_MSK_VF2PF_U(ADF_DH895XCC_VF_MSK); + + errmsk3 |= ADF_DH895XCC_ERR_MSK_VF2PF_L(sources | disabled); + errmsk5 |= ADF_DH895XCC_ERR_MSK_VF2PF_U(sources | disabled); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK3, errmsk3); ADF_CSR_WR(pmisc_addr, ADF_GEN2_ERRMSK5, errmsk5); 
diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index 057d73c370b7..c82775dbb557 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -225,7 +225,8 @@ static int mxs_dcp_start_dma(struct dcp_async_ctx *actx) static int mxs_dcp_run_aes(struct dcp_async_ctx *actx, struct skcipher_request *req, int init) { - dma_addr_t key_phys, src_phys, dst_phys; + dma_addr_t key_phys = 0; + dma_addr_t src_phys, dst_phys; struct dcp *sdcp = global_sdcp; struct dcp_dma_desc *desc = &sdcp->coh->desc[actx->chan]; struct dcp_aes_req_ctx *rctx = skcipher_request_ctx(req); diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index 59d472cb11e7..251e088a53df 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -720,10 +720,6 @@ static inline struct n2_skcipher_alg *n2_skcipher_alg(struct crypto_skcipher *tf return container_of(alg, struct n2_skcipher_alg, skcipher); } -struct n2_skcipher_request_context { - struct skcipher_walk walk; -}; - static int n2_aes_setkey(struct crypto_skcipher *skcipher, const u8 *key, unsigned int keylen) { diff --git a/drivers/crypto/sa2ul.c b/drivers/crypto/sa2ul.c index 78a4930c6480..461eca40e878 100644 --- a/drivers/crypto/sa2ul.c +++ b/drivers/crypto/sa2ul.c @@ -2496,4 +2496,5 @@ static struct platform_driver sa_ul_driver = { }, }; module_platform_driver(sa_ul_driver); +MODULE_DESCRIPTION("K3 SA2UL crypto accelerator driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/crypto/starfive/jh7110-cryp.h b/drivers/crypto/starfive/jh7110-cryp.h index 494a74f52706..5ed4ba5da7f9 100644 --- a/drivers/crypto/starfive/jh7110-cryp.h +++ b/drivers/crypto/starfive/jh7110-cryp.h @@ -30,6 +30,7 @@ #define MAX_KEY_SIZE SHA512_BLOCK_SIZE #define STARFIVE_AES_IV_LEN AES_BLOCK_SIZE #define STARFIVE_AES_CTR_LEN AES_BLOCK_SIZE +#define STARFIVE_RSA_MAX_KEYSZ 256 union starfive_aes_csr { u32 v; @@ -217,12 +218,11 @@ struct starfive_cryp_request_ctx { struct scatterlist *out_sg; struct ahash_request 
ahash_fbk_req; size_t total; - size_t nents; unsigned int blksize; unsigned int digsize; unsigned long in_sg_len; unsigned char *adata; - u8 rsa_data[] __aligned(sizeof(u32)); + u8 rsa_data[STARFIVE_RSA_MAX_KEYSZ] __aligned(sizeof(u32)); }; struct starfive_cryp_dev *starfive_cryp_find_dev(struct starfive_cryp_ctx *ctx); diff --git a/drivers/crypto/starfive/jh7110-rsa.c b/drivers/crypto/starfive/jh7110-rsa.c index 33093ba4b13a..a778c4846025 100644 --- a/drivers/crypto/starfive/jh7110-rsa.c +++ b/drivers/crypto/starfive/jh7110-rsa.c @@ -31,7 +31,6 @@ /* A * A * R mod N ==> A */ #define CRYPTO_CMD_AARN 0x7 -#define STARFIVE_RSA_MAX_KEYSZ 256 #define STARFIVE_RSA_RESET 0x2 static inline int starfive_pka_wait_done(struct starfive_cryp_ctx *ctx) @@ -74,7 +73,7 @@ static int starfive_rsa_montgomery_form(struct starfive_cryp_ctx *ctx, { struct starfive_cryp_dev *cryp = ctx->cryp; struct starfive_cryp_request_ctx *rctx = ctx->rctx; - int count = rctx->total / sizeof(u32) - 1; + int count = (ALIGN(rctx->total, 4) / 4) - 1; int loop; u32 temp; u8 opsize; @@ -251,12 +250,17 @@ static int starfive_rsa_enc_core(struct starfive_cryp_ctx *ctx, int enc) struct starfive_cryp_dev *cryp = ctx->cryp; struct starfive_cryp_request_ctx *rctx = ctx->rctx; struct starfive_rsa_key *key = &ctx->rsa_key; - int ret = 0; + int ret = 0, shift = 0; writel(STARFIVE_RSA_RESET, cryp->base + STARFIVE_PKA_CACR_OFFSET); - rctx->total = sg_copy_to_buffer(rctx->in_sg, rctx->nents, - rctx->rsa_data, rctx->total); + if (!IS_ALIGNED(rctx->total, sizeof(u32))) { + shift = sizeof(u32) - (rctx->total & 0x3); + memset(rctx->rsa_data, 0, shift); + } + + rctx->total = sg_copy_to_buffer(rctx->in_sg, sg_nents(rctx->in_sg), + rctx->rsa_data + shift, rctx->total); if (enc) { key->bitlen = key->e_bitlen; @@ -305,7 +309,6 @@ static int starfive_rsa_enc(struct akcipher_request *req) rctx->in_sg = req->src; rctx->out_sg = req->dst; rctx->total = req->src_len; - rctx->nents = sg_nents(rctx->in_sg); ctx->rctx = rctx; return 
starfive_rsa_enc_core(ctx, 1); diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c index 11ad4ffdce0d..937f6dab8955 100644 --- a/drivers/crypto/stm32/stm32-cryp.c +++ b/drivers/crypto/stm32/stm32-cryp.c @@ -11,8 +11,11 @@ #include <crypto/internal/des.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> +#include <linux/bottom_half.h> #include <linux/clk.h> #include <linux/delay.h> +#include <linux/dma-mapping.h> +#include <linux/dmaengine.h> #include <linux/err.h> #include <linux/iopoll.h> #include <linux/interrupt.h> @@ -40,6 +43,8 @@ /* Mode mask = bits [15..0] */ #define FLG_MODE_MASK GENMASK(15, 0) /* Bit [31..16] status */ +#define FLG_IN_OUT_DMA BIT(16) +#define FLG_HEADER_DMA BIT(17) /* Registers */ #define CRYP_CR 0x00000000 @@ -121,8 +126,12 @@ #define CR_PH_MASK 0x00030000 #define CR_NBPBL_SHIFT 20 -#define SR_BUSY 0x00000010 -#define SR_OFNE 0x00000004 +#define SR_IFNF BIT(1) +#define SR_OFNE BIT(2) +#define SR_BUSY BIT(8) + +#define DMACR_DIEN BIT(0) +#define DMACR_DOEN BIT(1) #define IMSCR_IN BIT(0) #define IMSCR_OUT BIT(1) @@ -133,7 +142,15 @@ /* Misc */ #define AES_BLOCK_32 (AES_BLOCK_SIZE / sizeof(u32)) #define GCM_CTR_INIT 2 -#define CRYP_AUTOSUSPEND_DELAY 50 +#define CRYP_AUTOSUSPEND_DELAY 50 + +#define CRYP_DMA_BURST_REG 4 + +enum stm32_dma_mode { + NO_DMA, + DMA_PLAIN_SG, + DMA_NEED_SG_TRUNC +}; struct stm32_cryp_caps { bool aeads_support; @@ -146,6 +163,7 @@ struct stm32_cryp_caps { u32 sr; u32 din; u32 dout; + u32 dmacr; u32 imsc; u32 mis; u32 k1l; @@ -172,6 +190,7 @@ struct stm32_cryp { struct list_head list; struct device *dev; void __iomem *regs; + phys_addr_t phys_base; struct clk *clk; unsigned long flags; u32 irq_status; @@ -190,8 +209,20 @@ struct stm32_cryp { size_t header_in; size_t payload_out; + /* DMA process fields */ + struct scatterlist *in_sg; + struct scatterlist *header_sg; struct scatterlist *out_sg; + size_t in_sg_len; + size_t header_sg_len; + size_t out_sg_len; + struct 
completion dma_completion; + + struct dma_chan *dma_lch_in; + struct dma_chan *dma_lch_out; + enum stm32_dma_mode dma_mode; + /* IT process fields */ struct scatter_walk in_walk; struct scatter_walk out_walk; @@ -291,12 +322,20 @@ static inline int stm32_cryp_wait_enable(struct stm32_cryp *cryp) !(status & CR_CRYPEN), 10, 100000); } +static inline int stm32_cryp_wait_input(struct stm32_cryp *cryp) +{ + u32 status; + + return readl_relaxed_poll_timeout_atomic(cryp->regs + cryp->caps->sr, status, + status & SR_IFNF, 1, 10); +} + static inline int stm32_cryp_wait_output(struct stm32_cryp *cryp) { u32 status; - return readl_relaxed_poll_timeout(cryp->regs + cryp->caps->sr, status, - status & SR_OFNE, 10, 100000); + return readl_relaxed_poll_timeout_atomic(cryp->regs + cryp->caps->sr, status, + status & SR_OFNE, 1, 10); } static inline void stm32_cryp_key_read_enable(struct stm32_cryp *cryp) @@ -311,8 +350,13 @@ static inline void stm32_cryp_key_read_disable(struct stm32_cryp *cryp) cryp->regs + cryp->caps->cr); } +static void stm32_cryp_irq_read_data(struct stm32_cryp *cryp); +static void stm32_cryp_irq_write_data(struct stm32_cryp *cryp); +static void stm32_cryp_irq_write_gcmccm_header(struct stm32_cryp *cryp); static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp); static void stm32_cryp_finish_req(struct stm32_cryp *cryp, int err); +static int stm32_cryp_dma_start(struct stm32_cryp *cryp); +static int stm32_cryp_it_start(struct stm32_cryp *cryp); static struct stm32_cryp *stm32_cryp_find_dev(struct stm32_cryp_ctx *ctx) { @@ -813,11 +857,238 @@ static void stm32_cryp_finish_req(struct stm32_cryp *cryp, int err) if (is_gcm(cryp) || is_ccm(cryp)) crypto_finalize_aead_request(cryp->engine, cryp->areq, err); else - crypto_finalize_skcipher_request(cryp->engine, cryp->req, - err); + crypto_finalize_skcipher_request(cryp->engine, cryp->req, err); +} + +static void stm32_cryp_header_dma_callback(void *param) +{ + struct stm32_cryp *cryp = (struct stm32_cryp *)param; + 
int ret; + u32 reg; + + dma_unmap_sg(cryp->dev, cryp->header_sg, cryp->header_sg_len, DMA_TO_DEVICE); + + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); + stm32_cryp_write(cryp, cryp->caps->dmacr, reg & ~(DMACR_DOEN | DMACR_DIEN)); + + kfree(cryp->header_sg); + + reg = stm32_cryp_read(cryp, cryp->caps->cr); + + if (cryp->header_in) { + stm32_cryp_write(cryp, cryp->caps->cr, reg | CR_CRYPEN); + + ret = stm32_cryp_wait_input(cryp); + if (ret) { + dev_err(cryp->dev, "input header ready timeout after dma\n"); + stm32_cryp_finish_req(cryp, ret); + return; + } + stm32_cryp_irq_write_gcmccm_header(cryp); + WARN_ON(cryp->header_in); + } + + if (stm32_cryp_get_input_text_len(cryp)) { + /* Phase 3 : payload */ + reg = stm32_cryp_read(cryp, cryp->caps->cr); + stm32_cryp_write(cryp, cryp->caps->cr, reg & ~CR_CRYPEN); + + reg &= ~CR_PH_MASK; + reg |= CR_PH_PAYLOAD | CR_CRYPEN; + stm32_cryp_write(cryp, cryp->caps->cr, reg); + + if (cryp->flags & FLG_IN_OUT_DMA) { + ret = stm32_cryp_dma_start(cryp); + if (ret) + stm32_cryp_finish_req(cryp, ret); + } else { + stm32_cryp_it_start(cryp); + } + } else { + /* + * Phase 4 : tag. 
+ * Nothing to read, nothing to write => end request + */ + stm32_cryp_finish_req(cryp, 0); + } +} + +static void stm32_cryp_dma_callback(void *param) +{ + struct stm32_cryp *cryp = (struct stm32_cryp *)param; + int ret; + u32 reg; + + complete(&cryp->dma_completion); /* completion to indicate no timeout */ + + dma_sync_sg_for_device(cryp->dev, cryp->out_sg, cryp->out_sg_len, DMA_FROM_DEVICE); + + if (cryp->in_sg != cryp->out_sg) + dma_unmap_sg(cryp->dev, cryp->in_sg, cryp->in_sg_len, DMA_TO_DEVICE); + + dma_unmap_sg(cryp->dev, cryp->out_sg, cryp->out_sg_len, DMA_FROM_DEVICE); + + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); + stm32_cryp_write(cryp, cryp->caps->dmacr, reg & ~(DMACR_DOEN | DMACR_DIEN)); + + reg = stm32_cryp_read(cryp, cryp->caps->cr); + + if (is_gcm(cryp) || is_ccm(cryp)) { + kfree(cryp->in_sg); + kfree(cryp->out_sg); + } else { + if (cryp->in_sg != cryp->req->src) + kfree(cryp->in_sg); + if (cryp->out_sg != cryp->req->dst) + kfree(cryp->out_sg); + } + + if (cryp->payload_in) { + stm32_cryp_write(cryp, cryp->caps->cr, reg | CR_CRYPEN); + + ret = stm32_cryp_wait_input(cryp); + if (ret) { + dev_err(cryp->dev, "input ready timeout after dma\n"); + stm32_cryp_finish_req(cryp, ret); + return; + } + stm32_cryp_irq_write_data(cryp); + + ret = stm32_cryp_wait_output(cryp); + if (ret) { + dev_err(cryp->dev, "output ready timeout after dma\n"); + stm32_cryp_finish_req(cryp, ret); + return; + } + stm32_cryp_irq_read_data(cryp); + } + + stm32_cryp_finish_req(cryp, 0); +} + +static int stm32_cryp_header_dma_start(struct stm32_cryp *cryp) +{ + int ret; + struct dma_async_tx_descriptor *tx_in; + u32 reg; + size_t align_size; + + ret = dma_map_sg(cryp->dev, cryp->header_sg, cryp->header_sg_len, DMA_TO_DEVICE); + if (!ret) { + dev_err(cryp->dev, "dma_map_sg() error\n"); + return -ENOMEM; + } + + dma_sync_sg_for_device(cryp->dev, cryp->header_sg, cryp->header_sg_len, DMA_TO_DEVICE); + + tx_in = dmaengine_prep_slave_sg(cryp->dma_lch_in, cryp->header_sg, 
cryp->header_sg_len, + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + if (!tx_in) { + dev_err(cryp->dev, "IN prep_slave_sg() failed\n"); + return -EINVAL; + } + + tx_in->callback_param = cryp; + tx_in->callback = stm32_cryp_header_dma_callback; + + /* Advance scatterwalk to not DMA'ed data */ + align_size = ALIGN_DOWN(cryp->header_in, cryp->hw_blocksize); + scatterwalk_copychunks(NULL, &cryp->in_walk, align_size, 2); + cryp->header_in -= align_size; + + ret = dma_submit_error(dmaengine_submit(tx_in)); + if (ret < 0) { + dev_err(cryp->dev, "DMA in submit failed\n"); + return ret; + } + dma_async_issue_pending(cryp->dma_lch_in); + + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); + stm32_cryp_write(cryp, cryp->caps->dmacr, reg | DMACR_DIEN); + + return 0; +} + +static int stm32_cryp_dma_start(struct stm32_cryp *cryp) +{ + int ret; + size_t align_size; + struct dma_async_tx_descriptor *tx_in, *tx_out; + u32 reg; + + if (cryp->in_sg != cryp->out_sg) { + ret = dma_map_sg(cryp->dev, cryp->in_sg, cryp->in_sg_len, DMA_TO_DEVICE); + if (!ret) { + dev_err(cryp->dev, "dma_map_sg() error\n"); + return -ENOMEM; + } + } + + ret = dma_map_sg(cryp->dev, cryp->out_sg, cryp->out_sg_len, DMA_FROM_DEVICE); + if (!ret) { + dev_err(cryp->dev, "dma_map_sg() error\n"); + return -ENOMEM; + } + + dma_sync_sg_for_device(cryp->dev, cryp->in_sg, cryp->in_sg_len, DMA_TO_DEVICE); + + tx_in = dmaengine_prep_slave_sg(cryp->dma_lch_in, cryp->in_sg, cryp->in_sg_len, + DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + if (!tx_in) { + dev_err(cryp->dev, "IN prep_slave_sg() failed\n"); + return -EINVAL; + } + + /* No callback necessary */ + tx_in->callback_param = cryp; + tx_in->callback = NULL; + + tx_out = dmaengine_prep_slave_sg(cryp->dma_lch_out, cryp->out_sg, cryp->out_sg_len, + DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + if (!tx_out) { + dev_err(cryp->dev, "OUT prep_slave_sg() failed\n"); + return -EINVAL; + } + + reinit_completion(&cryp->dma_completion); + tx_out->callback = 
stm32_cryp_dma_callback; + tx_out->callback_param = cryp; + + /* Advance scatterwalk to not DMA'ed data */ + align_size = ALIGN_DOWN(cryp->payload_in, cryp->hw_blocksize); + scatterwalk_copychunks(NULL, &cryp->in_walk, align_size, 2); + cryp->payload_in -= align_size; + + ret = dma_submit_error(dmaengine_submit(tx_in)); + if (ret < 0) { + dev_err(cryp->dev, "DMA in submit failed\n"); + return ret; + } + dma_async_issue_pending(cryp->dma_lch_in); + + /* Advance scatterwalk to not DMA'ed data */ + scatterwalk_copychunks(NULL, &cryp->out_walk, align_size, 2); + cryp->payload_out -= align_size; + ret = dma_submit_error(dmaengine_submit(tx_out)); + if (ret < 0) { + dev_err(cryp->dev, "DMA out submit failed\n"); + return ret; + } + dma_async_issue_pending(cryp->dma_lch_out); + + reg = stm32_cryp_read(cryp, cryp->caps->dmacr); + stm32_cryp_write(cryp, cryp->caps->dmacr, reg | DMACR_DOEN | DMACR_DIEN); + + if (!wait_for_completion_timeout(&cryp->dma_completion, msecs_to_jiffies(1000))) { + dev_err(cryp->dev, "DMA out timed out\n"); + dmaengine_terminate_sync(cryp->dma_lch_out); + return -ETIMEDOUT; + } + + return 0; } -static int stm32_cryp_cpu_start(struct stm32_cryp *cryp) +static int stm32_cryp_it_start(struct stm32_cryp *cryp) { /* Enable interrupt and let the IRQ handler do everything */ stm32_cryp_write(cryp, cryp->caps->imsc, IMSCR_IN | IMSCR_OUT); @@ -1149,13 +1420,256 @@ static int stm32_cryp_tdes_cbc_decrypt(struct skcipher_request *req) return stm32_cryp_crypt(req, FLG_TDES | FLG_CBC); } +static enum stm32_dma_mode stm32_cryp_dma_check_sg(struct scatterlist *test_sg, size_t len, + size_t block_size) +{ + struct scatterlist *sg; + int i; + + if (len <= 16) + return NO_DMA; /* Faster */ + + for_each_sg(test_sg, sg, sg_nents(test_sg), i) { + if (!IS_ALIGNED(sg->length, block_size) && !sg_is_last(sg)) + return NO_DMA; + + if (sg->offset % sizeof(u32)) + return NO_DMA; + + if (sg_is_last(sg) && !IS_ALIGNED(sg->length, AES_BLOCK_SIZE)) + return DMA_NEED_SG_TRUNC; + } 
+ + return DMA_PLAIN_SG; +} + +static enum stm32_dma_mode stm32_cryp_dma_check(struct stm32_cryp *cryp, struct scatterlist *in_sg, + struct scatterlist *out_sg) +{ + enum stm32_dma_mode ret = DMA_PLAIN_SG; + + if (!is_aes(cryp)) + return NO_DMA; + + if (!cryp->dma_lch_in || !cryp->dma_lch_out) + return NO_DMA; + + ret = stm32_cryp_dma_check_sg(in_sg, cryp->payload_in, AES_BLOCK_SIZE); + if (ret == NO_DMA) + return ret; + + ret = stm32_cryp_dma_check_sg(out_sg, cryp->payload_out, AES_BLOCK_SIZE); + if (ret == NO_DMA) + return ret; + + /* Check CTR counter overflow */ + if (is_aes(cryp) && is_ctr(cryp)) { + u32 c; + __be32 iv3; + + memcpy(&iv3, &cryp->req->iv[3 * sizeof(u32)], sizeof(iv3)); + c = be32_to_cpu(iv3); + if ((c + cryp->payload_in) < cryp->payload_in) + return NO_DMA; + } + + /* Workaround */ + if (is_aes(cryp) && is_ctr(cryp) && ret == DMA_NEED_SG_TRUNC) + return NO_DMA; + + return ret; +} + +static int stm32_cryp_truncate_sg(struct scatterlist **new_sg, size_t *new_sg_len, + struct scatterlist *sg, off_t skip, size_t size) +{ + struct scatterlist *cur; + int alloc_sg_len; + + *new_sg_len = 0; + + if (!sg || !size) { + *new_sg = NULL; + return 0; + } + + alloc_sg_len = sg_nents_for_len(sg, skip + size); + if (alloc_sg_len < 0) + return alloc_sg_len; + + /* We allocate to much sg entry, but it is easier */ + *new_sg = kmalloc_array((size_t)alloc_sg_len, sizeof(struct scatterlist), GFP_KERNEL); + if (!*new_sg) + return -ENOMEM; + + sg_init_table(*new_sg, (unsigned int)alloc_sg_len); + + cur = *new_sg; + while (sg && size) { + unsigned int len = sg->length; + unsigned int offset = sg->offset; + + if (skip > len) { + skip -= len; + sg = sg_next(sg); + continue; + } + + if (skip) { + len -= skip; + offset += skip; + skip = 0; + } + + if (size < len) + len = size; + + if (len > 0) { + (*new_sg_len)++; + size -= len; + sg_set_page(cur, sg_page(sg), len, offset); + if (size == 0) + sg_mark_end(cur); + cur = sg_next(cur); + } + + sg = sg_next(sg); + } + + return 
0; +} + +static int stm32_cryp_cipher_prepare(struct stm32_cryp *cryp, struct scatterlist *in_sg, + struct scatterlist *out_sg) +{ + size_t align_size; + int ret; + + cryp->dma_mode = stm32_cryp_dma_check(cryp, in_sg, out_sg); + + scatterwalk_start(&cryp->in_walk, in_sg); + scatterwalk_start(&cryp->out_walk, out_sg); + + if (cryp->dma_mode == NO_DMA) { + cryp->flags &= ~FLG_IN_OUT_DMA; + + if (is_ctr(cryp)) + memset(cryp->last_ctr, 0, sizeof(cryp->last_ctr)); + + } else if (cryp->dma_mode == DMA_NEED_SG_TRUNC) { + + cryp->flags |= FLG_IN_OUT_DMA; + + align_size = ALIGN_DOWN(cryp->payload_in, cryp->hw_blocksize); + ret = stm32_cryp_truncate_sg(&cryp->in_sg, &cryp->in_sg_len, in_sg, 0, align_size); + if (ret) + return ret; + + ret = stm32_cryp_truncate_sg(&cryp->out_sg, &cryp->out_sg_len, out_sg, 0, + align_size); + if (ret) { + kfree(cryp->in_sg); + return ret; + } + } else { + cryp->flags |= FLG_IN_OUT_DMA; + + cryp->in_sg = in_sg; + cryp->out_sg = out_sg; + + ret = sg_nents_for_len(cryp->in_sg, cryp->payload_in); + if (ret < 0) + return ret; + cryp->in_sg_len = (size_t)ret; + + ret = sg_nents_for_len(out_sg, cryp->payload_out); + if (ret < 0) + return ret; + cryp->out_sg_len = (size_t)ret; + } + + return 0; +} + +static int stm32_cryp_aead_prepare(struct stm32_cryp *cryp, struct scatterlist *in_sg, + struct scatterlist *out_sg) +{ + size_t align_size; + off_t skip; + int ret, ret2; + + cryp->header_sg = NULL; + cryp->in_sg = NULL; + cryp->out_sg = NULL; + + if (!cryp->dma_lch_in || !cryp->dma_lch_out) { + cryp->dma_mode = NO_DMA; + cryp->flags &= ~(FLG_IN_OUT_DMA | FLG_HEADER_DMA); + + return 0; + } + + /* CCM hw_init may have advanced in header */ + skip = cryp->areq->assoclen - cryp->header_in; + + align_size = ALIGN_DOWN(cryp->header_in, cryp->hw_blocksize); + ret = stm32_cryp_truncate_sg(&cryp->header_sg, &cryp->header_sg_len, in_sg, skip, + align_size); + if (ret) + return ret; + + ret = stm32_cryp_dma_check_sg(cryp->header_sg, align_size, AES_BLOCK_SIZE); + 
if (ret == NO_DMA) { + /* We cannot DMA the header */ + kfree(cryp->header_sg); + cryp->header_sg = NULL; + + cryp->flags &= ~FLG_HEADER_DMA; + } else { + cryp->flags |= FLG_HEADER_DMA; + } + + /* Now skip all header to be at payload start */ + skip = cryp->areq->assoclen; + align_size = ALIGN_DOWN(cryp->payload_in, cryp->hw_blocksize); + ret = stm32_cryp_truncate_sg(&cryp->in_sg, &cryp->in_sg_len, in_sg, skip, align_size); + if (ret) { + kfree(cryp->header_sg); + return ret; + } + + /* For out buffer align_size is same as in buffer */ + ret = stm32_cryp_truncate_sg(&cryp->out_sg, &cryp->out_sg_len, out_sg, skip, align_size); + if (ret) { + kfree(cryp->header_sg); + kfree(cryp->in_sg); + return ret; + } + + ret = stm32_cryp_dma_check_sg(cryp->in_sg, align_size, AES_BLOCK_SIZE); + ret2 = stm32_cryp_dma_check_sg(cryp->out_sg, align_size, AES_BLOCK_SIZE); + if (ret == NO_DMA || ret2 == NO_DMA) { + kfree(cryp->in_sg); + cryp->in_sg = NULL; + + kfree(cryp->out_sg); + cryp->out_sg = NULL; + + cryp->flags &= ~FLG_IN_OUT_DMA; + } else { + cryp->flags |= FLG_IN_OUT_DMA; + } + + return 0; +} + static int stm32_cryp_prepare_req(struct skcipher_request *req, struct aead_request *areq) { struct stm32_cryp_ctx *ctx; struct stm32_cryp *cryp; struct stm32_cryp_reqctx *rctx; - struct scatterlist *in_sg; + struct scatterlist *in_sg, *out_sg; int ret; if (!req && !areq) @@ -1169,8 +1683,6 @@ static int stm32_cryp_prepare_req(struct skcipher_request *req, rctx = req ? skcipher_request_ctx(req) : aead_request_ctx(areq); rctx->mode &= FLG_MODE_MASK; - ctx->cryp = cryp; - cryp->flags = (cryp->flags & ~FLG_MODE_MASK) | rctx->mode; cryp->hw_blocksize = is_aes(cryp) ? 
AES_BLOCK_SIZE : DES_BLOCK_SIZE; cryp->ctx = ctx; @@ -1182,6 +1694,15 @@ static int stm32_cryp_prepare_req(struct skcipher_request *req, cryp->payload_in = req->cryptlen; cryp->payload_out = req->cryptlen; cryp->authsize = 0; + + in_sg = req->src; + out_sg = req->dst; + + ret = stm32_cryp_cipher_prepare(cryp, in_sg, out_sg); + if (ret) + return ret; + + ret = stm32_cryp_hw_init(cryp); } else { /* * Length of input and output data: @@ -1211,23 +1732,22 @@ static int stm32_cryp_prepare_req(struct skcipher_request *req, cryp->header_in = areq->assoclen; cryp->payload_out = cryp->payload_in; } - } - in_sg = req ? req->src : areq->src; - scatterwalk_start(&cryp->in_walk, in_sg); - - cryp->out_sg = req ? req->dst : areq->dst; - scatterwalk_start(&cryp->out_walk, cryp->out_sg); + in_sg = areq->src; + out_sg = areq->dst; - if (is_gcm(cryp) || is_ccm(cryp)) { + scatterwalk_start(&cryp->in_walk, in_sg); + scatterwalk_start(&cryp->out_walk, out_sg); /* In output, jump after assoc data */ scatterwalk_copychunks(NULL, &cryp->out_walk, cryp->areq->assoclen, 2); - } - if (is_ctr(cryp)) - memset(cryp->last_ctr, 0, sizeof(cryp->last_ctr)); + ret = stm32_cryp_hw_init(cryp); + if (ret) + return ret; + + ret = stm32_cryp_aead_prepare(cryp, in_sg, out_sg); + } - ret = stm32_cryp_hw_init(cryp); return ret; } @@ -1239,12 +1759,24 @@ static int stm32_cryp_cipher_one_req(struct crypto_engine *engine, void *areq) struct stm32_cryp_ctx *ctx = crypto_skcipher_ctx( crypto_skcipher_reqtfm(req)); struct stm32_cryp *cryp = ctx->cryp; + int ret; if (!cryp) return -ENODEV; - return stm32_cryp_prepare_req(req, NULL) ?: - stm32_cryp_cpu_start(cryp); + ret = stm32_cryp_prepare_req(req, NULL); + if (ret) + return ret; + + if (cryp->flags & FLG_IN_OUT_DMA) + ret = stm32_cryp_dma_start(cryp); + else + ret = stm32_cryp_it_start(cryp); + + if (ret == -ETIMEDOUT) + stm32_cryp_finish_req(cryp, ret); + + return ret; } static int stm32_cryp_aead_one_req(struct crypto_engine *engine, void *areq) @@ -1262,13 
+1794,20 @@ static int stm32_cryp_aead_one_req(struct crypto_engine *engine, void *areq) if (err) return err; - if (unlikely(!cryp->payload_in && !cryp->header_in)) { + if (!stm32_cryp_get_input_text_len(cryp) && !cryp->header_in && + !(cryp->flags & FLG_HEADER_DMA)) { /* No input data to process: get tag and finish */ stm32_cryp_finish_req(cryp, 0); return 0; } - return stm32_cryp_cpu_start(cryp); + if (cryp->flags & FLG_HEADER_DMA) + return stm32_cryp_header_dma_start(cryp); + + if (!cryp->header_in && cryp->flags & FLG_IN_OUT_DMA) + return stm32_cryp_dma_start(cryp); + + return stm32_cryp_it_start(cryp); } static int stm32_cryp_read_auth_tag(struct stm32_cryp *cryp) @@ -1665,8 +2204,11 @@ static irqreturn_t stm32_cryp_irq_thread(int irq, void *arg) it_mask &= ~IMSCR_OUT; stm32_cryp_write(cryp, cryp->caps->imsc, it_mask); - if (!cryp->payload_in && !cryp->header_in && !cryp->payload_out) + if (!cryp->payload_in && !cryp->header_in && !cryp->payload_out) { + local_bh_disable(); stm32_cryp_finish_req(cryp, 0); + local_bh_enable(); + } return IRQ_HANDLED; } @@ -1680,13 +2222,72 @@ static irqreturn_t stm32_cryp_irq(int irq, void *arg) return IRQ_WAKE_THREAD; } +static int stm32_cryp_dma_init(struct stm32_cryp *cryp) +{ + struct dma_slave_config dma_conf; + struct dma_chan *chan; + int ret; + + memset(&dma_conf, 0, sizeof(dma_conf)); + + dma_conf.direction = DMA_MEM_TO_DEV; + dma_conf.dst_addr = cryp->phys_base + cryp->caps->din; + dma_conf.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; + dma_conf.dst_maxburst = CRYP_DMA_BURST_REG; + dma_conf.device_fc = false; + + chan = dma_request_chan(cryp->dev, "in"); + if (IS_ERR(chan)) + return PTR_ERR(chan); + + cryp->dma_lch_in = chan; + ret = dmaengine_slave_config(cryp->dma_lch_in, &dma_conf); + if (ret) { + dma_release_channel(cryp->dma_lch_in); + cryp->dma_lch_in = NULL; + dev_err(cryp->dev, "Couldn't configure DMA in slave.\n"); + return ret; + } + + memset(&dma_conf, 0, sizeof(dma_conf)); + + dma_conf.direction = 
DMA_DEV_TO_MEM; + dma_conf.src_addr = cryp->phys_base + cryp->caps->dout; + dma_conf.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES; + dma_conf.src_maxburst = CRYP_DMA_BURST_REG; + dma_conf.device_fc = false; + + chan = dma_request_chan(cryp->dev, "out"); + if (IS_ERR(chan)) { + dma_release_channel(cryp->dma_lch_in); + cryp->dma_lch_in = NULL; + return PTR_ERR(chan); + } + + cryp->dma_lch_out = chan; + + ret = dmaengine_slave_config(cryp->dma_lch_out, &dma_conf); + if (ret) { + dma_release_channel(cryp->dma_lch_out); + cryp->dma_lch_out = NULL; + dev_err(cryp->dev, "Couldn't configure DMA out slave.\n"); + dma_release_channel(cryp->dma_lch_in); + cryp->dma_lch_in = NULL; + return ret; + } + + init_completion(&cryp->dma_completion); + + return 0; +} + static struct skcipher_engine_alg crypto_algs[] = { { .base = { .base.cra_name = "ecb(aes)", .base.cra_driver_name = "stm32-ecb-aes", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_priority = 300, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -1707,8 +2308,8 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "cbc(aes)", .base.cra_driver_name = "stm32-cbc-aes", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_priority = 300, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -1730,8 +2331,8 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "ctr(aes)", .base.cra_driver_name = "stm32-ctr-aes", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_priority = 300, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), 
.base.cra_alignmask = 0, @@ -1753,8 +2354,8 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "ecb(des)", .base.cra_driver_name = "stm32-ecb-des", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_priority = 300, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -1775,8 +2376,8 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "cbc(des)", .base.cra_driver_name = "stm32-cbc-des", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_priority = 300, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -1798,8 +2399,8 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "ecb(des3_ede)", .base.cra_driver_name = "stm32-ecb-des3", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_priority = 300, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -1820,8 +2421,8 @@ static struct skcipher_engine_alg crypto_algs[] = { .base = { .base.cra_name = "cbc(des3_ede)", .base.cra_driver_name = "stm32-cbc-des3", - .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_ASYNC, + .base.cra_priority = 300, + .base.cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .base.cra_blocksize = DES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct stm32_cryp_ctx), .base.cra_alignmask = 0, @@ -1854,8 +2455,8 @@ static struct aead_engine_alg aead_algs[] = { .base.base = { .cra_name = "gcm(aes)", .cra_driver_name = "stm32-gcm-aes", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_ASYNC, + .cra_priority = 300, + .cra_flags = 
CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct stm32_cryp_ctx), .cra_alignmask = 0, @@ -1877,8 +2478,8 @@ static struct aead_engine_alg aead_algs[] = { .base.base = { .cra_name = "ccm(aes)", .cra_driver_name = "stm32-ccm-aes", - .cra_priority = 200, - .cra_flags = CRYPTO_ALG_ASYNC, + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_KERN_DRIVER_ONLY, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct stm32_cryp_ctx), .cra_alignmask = 0, @@ -1901,6 +2502,7 @@ static const struct stm32_cryp_caps ux500_data = { .sr = UX500_CRYP_SR, .din = UX500_CRYP_DIN, .dout = UX500_CRYP_DOUT, + .dmacr = UX500_CRYP_DMACR, .imsc = UX500_CRYP_IMSC, .mis = UX500_CRYP_MIS, .k1l = UX500_CRYP_K1L, @@ -1923,6 +2525,7 @@ static const struct stm32_cryp_caps f7_data = { .sr = CRYP_SR, .din = CRYP_DIN, .dout = CRYP_DOUT, + .dmacr = CRYP_DMACR, .imsc = CRYP_IMSCR, .mis = CRYP_MISR, .k1l = CRYP_K1LR, @@ -1945,6 +2548,7 @@ static const struct stm32_cryp_caps mp1_data = { .sr = CRYP_SR, .din = CRYP_DIN, .dout = CRYP_DOUT, + .dmacr = CRYP_DMACR, .imsc = CRYP_IMSCR, .mis = CRYP_MISR, .k1l = CRYP_K1LR, @@ -1985,6 +2589,8 @@ static int stm32_cryp_probe(struct platform_device *pdev) if (IS_ERR(cryp->regs)) return PTR_ERR(cryp->regs); + cryp->phys_base = platform_get_resource(pdev, IORESOURCE_MEM, 0)->start; + irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; @@ -2030,6 +2636,17 @@ static int stm32_cryp_probe(struct platform_device *pdev) platform_set_drvdata(pdev, cryp); + ret = stm32_cryp_dma_init(cryp); + switch (ret) { + case 0: + break; + case -ENODEV: + dev_dbg(dev, "DMA mode not available\n"); + break; + default: + goto err_dma; + } + spin_lock(&cryp_list.lock); list_add(&cryp->list, &cryp_list.dev_list); spin_unlock(&cryp_list.lock); @@ -2075,6 +2692,12 @@ err_engine1: spin_lock(&cryp_list.lock); list_del(&cryp->list); spin_unlock(&cryp_list.lock); + + if (cryp->dma_lch_in) + dma_release_channel(cryp->dma_lch_in); + if 
(cryp->dma_lch_out) + dma_release_channel(cryp->dma_lch_out); +err_dma: err_rst: pm_runtime_disable(dev); pm_runtime_put_noidle(dev); @@ -2101,6 +2724,12 @@ static void stm32_cryp_remove(struct platform_device *pdev) list_del(&cryp->list); spin_unlock(&cryp_list.lock); + if (cryp->dma_lch_in) + dma_release_channel(cryp->dma_lch_in); + + if (cryp->dma_lch_out) + dma_release_channel(cryp->dma_lch_out); + pm_runtime_disable(cryp->dev); pm_runtime_put_noidle(cryp->dev); diff --git a/drivers/crypto/tegra/tegra-se-main.c b/drivers/crypto/tegra/tegra-se-main.c index 9955874b3dc3..f94c0331b148 100644 --- a/drivers/crypto/tegra/tegra-se-main.c +++ b/drivers/crypto/tegra/tegra-se-main.c @@ -326,7 +326,6 @@ static void tegra_se_remove(struct platform_device *pdev) crypto_engine_stop(se->engine); crypto_engine_exit(se->engine); - iommu_fwspec_free(se->dev); host1x_client_unregister(&se->client); } diff --git a/drivers/crypto/xilinx/zynqmp-aes-gcm.c b/drivers/crypto/xilinx/zynqmp-aes-gcm.c index e61405718840..7f0ec6887a39 100644 --- a/drivers/crypto/xilinx/zynqmp-aes-gcm.c +++ b/drivers/crypto/xilinx/zynqmp-aes-gcm.c @@ -446,4 +446,5 @@ static struct platform_driver zynqmp_aes_driver = { }; module_platform_driver(zynqmp_aes_driver); +MODULE_DESCRIPTION("Xilinx ZynqMP AES Driver"); MODULE_LICENSE("GPL"); diff --git a/include/crypto/internal/ecc.h b/include/crypto/internal/ecc.h index f7e75e1e71f3..0717a53ae732 100644 --- a/include/crypto/internal/ecc.h +++ b/include/crypto/internal/ecc.h @@ -63,6 +63,9 @@ static inline void ecc_swap_digits(const void *in, u64 *out, unsigned int ndigit * @nbytes Size of input byte array * @out Output digits array * @ndigits: Number of digits to create from byte array + * + * The first byte in the input byte array is expected to hold the most + * significant bits of the large integer. 
*/ void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes, u64 *out, unsigned int ndigits); diff --git a/include/crypto/sm2.h b/include/crypto/sm2.h deleted file mode 100644 index 04a92c1013c8..000000000000 --- a/include/crypto/sm2.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * sm2.h - SM2 asymmetric public-key algorithm - * as specified by OSCCA GM/T 0003.1-2012 -- 0003.5-2012 SM2 and - * described at https://tools.ietf.org/html/draft-shen-sm2-ecdsa-02 - * - * Copyright (c) 2020, Alibaba Group. - * Written by Tianjia Zhang <tianjia.zhang@linux.alibaba.com> - */ - -#ifndef _CRYPTO_SM2_H -#define _CRYPTO_SM2_H - -struct shash_desc; - -#if IS_REACHABLE(CONFIG_CRYPTO_SM2) -int sm2_compute_z_digest(struct shash_desc *desc, - const void *key, unsigned int keylen, void *dgst); -#else -static inline int sm2_compute_z_digest(struct shash_desc *desc, - const void *key, unsigned int keylen, - void *dgst) -{ - return -ENOTSUPP; -} -#endif - -#endif /* _CRYPTO_SM2_H */ diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index 136e9842120e..b424555753b1 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -13,9 +13,8 @@ #define LINUX_HWRANDOM_H_ #include <linux/completion.h> -#include <linux/types.h> -#include <linux/list.h> #include <linux/kref.h> +#include <linux/types.h> /** * struct hwrng - Hardware Random Number Generator driver diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index c1dc87fc536b..1504fb012c05 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -6,8 +6,9 @@ #include <linux/psp.h> enum psp_platform_access_msg { - PSP_CMD_NONE = 0x0, - PSP_I2C_REQ_BUS_CMD = 0x64, + PSP_CMD_NONE = 0x0, + PSP_CMD_HSTI_QUERY = 0x14, + PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, PSP_DYNAMIC_BOOST_SET_UID, PSP_DYNAMIC_BOOST_GET_PARAMETER, diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c index 
c2020f19c652..838812d18216 100644 --- a/lib/crypto/arc4.c +++ b/lib/crypto/arc4.c @@ -71,4 +71,5 @@ void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len) } EXPORT_SYMBOL(arc4_crypt); +MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/des.c b/lib/crypto/des.c index ef5bb8822aba..9518658b97cf 100644 --- a/lib/crypto/des.c +++ b/lib/crypto/des.c @@ -899,4 +899,5 @@ void des3_ede_decrypt(const struct des3_ede_ctx *dctx, u8 *dst, const u8 *src) } EXPORT_SYMBOL_GPL(des3_ede_decrypt); +MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/libchacha.c b/lib/crypto/libchacha.c index dabc3accae05..cc1be0496eb9 100644 --- a/lib/crypto/libchacha.c +++ b/lib/crypto/libchacha.c @@ -32,4 +32,5 @@ void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src, } EXPORT_SYMBOL(chacha_crypt_generic); +MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/mpi/ec.c b/lib/crypto/mpi/ec.c index e16dca1e23d5..4781f00982ef 100644 --- a/lib/crypto/mpi/ec.c +++ b/lib/crypto/mpi/ec.c @@ -1285,14 +1285,12 @@ void mpi_ec_mul_point(MPI_POINT result, sum = &p2_; for (j = nbits-1; j >= 0; j--) { - MPI_POINT t; - sw = mpi_test_bit(scalar, j); point_swap_cond(q1, q2, sw, ctx); montgomery_ladder(prd, sum, q1, q2, point->x, ctx); point_swap_cond(prd, sum, sw, ctx); - t = q1; q1 = prd; prd = t; - t = q2; q2 = sum; sum = t; + swap(q1, prd); + swap(q2, sum); } mpi_clear(result->y); diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c index 070ba784c9f1..e08fc202ea5c 100644 --- a/lib/crypto/mpi/mpi-bit.c +++ b/lib/crypto/mpi/mpi-bit.c @@ -212,12 +212,10 @@ void mpi_rshift(MPI x, MPI a, unsigned int n) return; } - if (nlimbs) { - for (i = 0; i < x->nlimbs - nlimbs; i++) - x->d[i] = x->d[i+nlimbs]; - x->d[i] = 0; - x->nlimbs -= nlimbs; - } + for (i = 0; i < x->nlimbs - nlimbs; i++) + x->d[i] = x->d[i+nlimbs]; + x->d[i] = 0; + 
x->nlimbs -= nlimbs; if (x->nlimbs && nbits) mpihelp_rshift(x->d, x->d, x->nlimbs, nbits); diff --git a/lib/crypto/mpi/mpi-pow.c b/lib/crypto/mpi/mpi-pow.c index 2fd7a46d55ec..67fbd4c2503d 100644 --- a/lib/crypto/mpi/mpi-pow.c +++ b/lib/crypto/mpi/mpi-pow.c @@ -176,7 +176,6 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) for (;;) { while (c) { - mpi_ptr_t tp; mpi_size_t xsize; /*if (mpihelp_mul_n(xp, rp, rp, rsize) < 0) goto enomem */ @@ -207,9 +206,7 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) xsize = msize; } - tp = rp; - rp = xp; - xp = tp; + swap(rp, xp); rsize = xsize; if ((mpi_limb_signed_t) e < 0) { @@ -235,9 +232,7 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) xsize = msize; } - tp = rp; - rp = xp; - xp = tp; + swap(rp, xp); rsize = xsize; } e <<= 1; diff --git a/lib/crypto/poly1305.c b/lib/crypto/poly1305.c index 26d87fc3823e..5d8378d23e95 100644 --- a/lib/crypto/poly1305.c +++ b/lib/crypto/poly1305.c @@ -76,3 +76,4 @@ EXPORT_SYMBOL_GPL(poly1305_final_generic); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); +MODULE_DESCRIPTION("Poly1305 authenticator algorithm, RFC7539"); diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c index 1aebe7be9401..6d2922747cab 100644 --- a/lib/crypto/sha1.c +++ b/lib/crypto/sha1.c @@ -137,4 +137,5 @@ void sha1_init(__u32 *buf) } EXPORT_SYMBOL(sha1_init); +MODULE_DESCRIPTION("SHA-1 Algorithm"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 3ac1ef8677db..3f42d203c7bc 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -165,4 +165,5 @@ void sha256(const u8 *data, unsigned int len, u8 *out) } EXPORT_SYMBOL(sha256); +MODULE_DESCRIPTION("SHA-256 Algorithm"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/utils.c b/lib/crypto/utils.c index c852c7151b0a..373364141408 100644 --- a/lib/crypto/utils.c +++ b/lib/crypto/utils.c @@ -85,4 +85,5 @@ void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len) } 
EXPORT_SYMBOL_GPL(__crypto_xor); +MODULE_DESCRIPTION("Crypto library utility functions"); MODULE_LICENSE("GPL"); diff --git a/security/integrity/digsig_asymmetric.c b/security/integrity/digsig_asymmetric.c index de603cf42ac7..457c0a396caf 100644 --- a/security/integrity/digsig_asymmetric.c +++ b/security/integrity/digsig_asymmetric.c @@ -114,8 +114,7 @@ int asymmetric_verify(struct key *keyring, const char *sig, } else if (!strncmp(pk->pkey_algo, "ecdsa-", 6)) { /* edcsa-nist-p192 etc. */ pks.encoding = "x962"; - } else if (!strcmp(pk->pkey_algo, "ecrdsa") || - !strcmp(pk->pkey_algo, "sm2")) { + } else if (!strcmp(pk->pkey_algo, "ecrdsa")) { pks.encoding = "raw"; } else { ret = -ENOPKG; |