Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu: "API: - Enforce the setting of keys for keyed aead/hash/skcipher algorithms. - Add multibuf speed tests in tcrypt. Algorithms: - Improve performance of sha3-generic. - Add native sha512 support on arm64. - Add v8.2 Crypto Extentions version of sha3/sm3 on arm64. - Avoid hmac nesting by requiring underlying algorithm to be unkeyed. - Add cryptd_max_cpu_qlen module parameter to cryptd. Drivers: - Add support for EIP97 engine in inside-secure. - Add inline IPsec support to chelsio. - Add RevB core support to crypto4xx. - Fix AEAD ICV check in crypto4xx. - Add stm32 crypto driver. - Add support for BCM63xx platforms in bcm2835 and remove bcm63xx. - Add Derived Key Protocol (DKP) support in caam. - Add Samsung Exynos True RNG driver. - Add support for Exynos5250+ SoCs in exynos PRNG driver" * 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (166 commits) crypto: picoxcell - Fix error handling in spacc_probe() crypto: arm64/sha512 - fix/improve new v8.2 Crypto Extensions code crypto: arm64/sm3 - new v8.2 Crypto Extensions implementation crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation crypto: testmgr - add new testcases for sha3 crypto: sha3-generic - export init/update/final routines crypto: sha3-generic - simplify code crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize crypto: sha3-generic - fixes for alignment and big endian operation crypto: aesni - handle zero length dst buffer crypto: artpec6 - remove select on non-existing CRYPTO_SHA384 hwrng: bcm2835 - Remove redundant dev_err call in bcm2835_rng_probe() crypto: stm32 - remove redundant dev_err call in stm32_cryp_probe() crypto: axis - remove unnecessary platform_get_resource() error check crypto: testmgr - test misuse of result in ahash crypto: inside-secure - make function safexcel_try_push_requests static crypto: aes-generic - fix aes-generic regression on powerpc crypto: chelsio - Fix indentation warning crypto: arm64/sha1-ce - get rid of literal pool crypto: arm64/sha2-ce - move the round constant table to .rodata section ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2018-01-31 14:22:45 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-01-31 14:22:45 -0800
commit: a103950e0dd2058df5e8a8d4a915707bdcf205f0 (patch)
tree: af5d091f768db4ed7a12fc3c5484d3e20ad9d514 /arch
parent: 2cfa1cd3da14814a1e9ec6a4fce8612637d3ee3d (diff)
parent: 2d55807b7f7bf62bb05a8b91247c5eb7cd19ac04 (diff)
34 files changed, 1327 insertions, 716 deletions
diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c
index 18768f330449..07e31941dc67 100644
--- a/arch/arm/crypto/aes-neonbs-glue.c
+++ b/arch/arm/crypto/aes-neonbs-glue.c
@@ -181,9 +181,8 @@ static int cbc_init(struct crypto_tfm *tfm)
 	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	ctx->enc_tfm = crypto_alloc_cipher("aes", 0, 0);
-	if (IS_ERR(ctx->enc_tfm))
-		return PTR_ERR(ctx->enc_tfm);
-	return 0;
+
+	return PTR_ERR_OR_ZERO(ctx->enc_tfm);
 }
 
 static void cbc_exit(struct crypto_tfm *tfm)
@@ -258,9 +257,8 @@ static int xts_init(struct crypto_tfm *tfm)
 	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	ctx->tweak_tfm = crypto_alloc_cipher("aes", 0, 0);
-	if (IS_ERR(ctx->tweak_tfm))
-		return PTR_ERR(ctx->tweak_tfm);
-	return 0;
+
+	return PTR_ERR_OR_ZERO(ctx->tweak_tfm);
 }
 
 static void xts_exit(struct crypto_tfm *tfm)
diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c
index 1b0e0e86ee9c..96e62ec105d0 100644
--- a/arch/arm/crypto/crc32-ce-glue.c
+++ b/arch/arm/crypto/crc32-ce-glue.c
@@ -188,6 +188,7 @@ static struct shash_alg crc32_pmull_algs[] = { {
 	.base.cra_name		= "crc32",
 	.base.cra_driver_name	= "crc32-arm-ce",
 	.base.cra_priority	= 200,
+	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
 	.base.cra_blocksize	= 1,
 	.base.cra_module	= THIS_MODULE,
 }, {
@@ -203,6 +204,7 @@ static struct shash_alg crc32_pmull_algs[] = { {
 	.base.cra_name		= "crc32c",
 	.base.cra_driver_name	= "crc32c-arm-ce",
 	.base.cra_priority	= 200,
+	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
 	.base.cra_blocksize	= 1,
 	.base.cra_module	= THIS_MODULE,
 } };
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 70c517aa4501..285c36c7b408 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -29,6 +29,24 @@ config CRYPTO_SHA2_ARM64_CE
 	select CRYPTO_HASH
 	select CRYPTO_SHA256_ARM64
 
+config CRYPTO_SHA512_ARM64_CE
+	tristate "SHA-384/SHA-512 digest algorithm (ARMv8 Crypto Extensions)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_HASH
+	select CRYPTO_SHA512_ARM64
+
+config CRYPTO_SHA3_ARM64
+	tristate "SHA3 digest algorithm (ARMv8.2 Crypto Extensions)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_HASH
+	select CRYPTO_SHA3
+
+config CRYPTO_SM3_ARM64_CE
+	tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_HASH
+	select CRYPTO_SM3
+
 config CRYPTO_GHASH_ARM64_CE
 	tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
 	depends on KERNEL_MODE_NEON
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index b5edc5918c28..cee9b8d9830b 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -14,6 +14,15 @@ sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
 obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
 sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
 
+obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
+sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
+
+obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o
+sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
+
+obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o
+sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
+
 obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
 ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
 
@@ -24,7 +33,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
 crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
-CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
+aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
 aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S
new file mode 100644
index 000000000000..8efdfdade393
--- /dev/null
+++ b/arch/arm64/crypto/aes-ce-core.S
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.arch		armv8-a+crypto
+
+ENTRY(__aes_ce_encrypt)
+	sub		w3, w3, #2
+	ld1		{v0.16b}, [x2]
+	ld1		{v1.4s}, [x0], #16
+	cmp		w3, #10
+	bmi		0f
+	bne		3f
+	mov		v3.16b, v1.16b
+	b		2f
+0:	mov		v2.16b, v1.16b
+	ld1		{v3.4s}, [x0], #16
+1:	aese		v0.16b, v2.16b
+	aesmc		v0.16b, v0.16b
+2:	ld1		{v1.4s}, [x0], #16
+	aese		v0.16b, v3.16b
+	aesmc		v0.16b, v0.16b
+3:	ld1		{v2.4s}, [x0], #16
+	subs		w3, w3, #3
+	aese		v0.16b, v1.16b
+	aesmc		v0.16b, v0.16b
+	ld1		{v3.4s}, [x0], #16
+	bpl		1b
+	aese		v0.16b, v2.16b
+	eor		v0.16b, v0.16b, v3.16b
+	st1		{v0.16b}, [x1]
+	ret
+ENDPROC(__aes_ce_encrypt)
+
+ENTRY(__aes_ce_decrypt)
+	sub		w3, w3, #2
+	ld1		{v0.16b}, [x2]
+	ld1		{v1.4s}, [x0], #16
+	cmp		w3, #10
+	bmi		0f
+	bne		3f
+	mov		v3.16b, v1.16b
+	b		2f
+0:	mov		v2.16b, v1.16b
+	ld1		{v3.4s}, [x0], #16
+1:	aesd		v0.16b, v2.16b
+	aesimc		v0.16b, v0.16b
+2:	ld1		{v1.4s}, [x0], #16
+	aesd		v0.16b, v3.16b
+	aesimc		v0.16b, v0.16b
+3:	ld1		{v2.4s}, [x0], #16
+	subs		w3, w3, #3
+	aesd		v0.16b, v1.16b
+	aesimc		v0.16b, v0.16b
+	ld1		{v3.4s}, [x0], #16
+	bpl		1b
+	aesd		v0.16b, v2.16b
+	eor		v0.16b, v0.16b, v3.16b
+	st1		{v0.16b}, [x1]
+	ret
+ENDPROC(__aes_ce_decrypt)
+
+/*
+ * __aes_ce_sub() - use the aese instruction to perform the AES sbox
+ *                  substitution on each byte in 'input'
+ */
+ENTRY(__aes_ce_sub)
+	dup		v1.4s, w0
+	movi		v0.16b, #0
+	aese		v0.16b, v1.16b
+	umov		w0, v0.s[0]
+	ret
+ENDPROC(__aes_ce_sub)
+
+ENTRY(__aes_ce_invert)
+	ld1		{v0.4s}, [x1]
+	aesimc		v1.16b, v0.16b
+	st1		{v1.4s}, [x0]
+	ret
+ENDPROC(__aes_ce_invert)
diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c
index 6a75cd75ed11..e6b3227bbf57 100644
--- a/arch/arm64/crypto/aes-ce-cipher.c
+++ b/arch/arm64/crypto/aes-ce-glue.c
@@ -29,6 +29,13 @@ struct aes_block {
 	u8 b[AES_BLOCK_SIZE];
 };
 
+asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+
+asmlinkage u32 __aes_ce_sub(u32 l);
+asmlinkage void __aes_ce_invert(struct aes_block *out,
+				const struct aes_block *in);
+
 static int num_rounds(struct crypto_aes_ctx *ctx)
 {
 	/*
@@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
 static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct aes_block *out = (struct aes_block *)dst;
-	struct aes_block const *in = (struct aes_block *)src;
-	void *dummy0;
-	int dummy1;
 
 	if (!may_use_simd()) {
 		__aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
@@ -55,49 +58,13 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 	}
 
 	kernel_neon_begin();
-
-	__asm__("	ld1	{v0.16b}, %[in]			;"
-		"	ld1	{v1.4s}, [%[key]], #16		;"
-		"	cmp	%w[rounds], #10			;"
-		"	bmi	0f				;"
-		"	bne	3f				;"
-		"	mov	v3.16b, v1.16b			;"
-		"	b	2f				;"
-		"0:	mov	v2.16b, v1.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"1:	aese	v0.16b, v2.16b			;"
-		"	aesmc	v0.16b, v0.16b			;"
-		"2:	ld1	{v1.4s}, [%[key]], #16		;"
-		"	aese	v0.16b, v3.16b			;"
-		"	aesmc	v0.16b, v0.16b			;"
-		"3:	ld1	{v2.4s}, [%[key]], #16		;"
-		"	subs	%w[rounds], %w[rounds], #3	;"
-		"	aese	v0.16b, v1.16b			;"
-		"	aesmc	v0.16b, v0.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"	bpl	1b				;"
-		"	aese	v0.16b, v2.16b			;"
-		"	eor	v0.16b, v0.16b, v3.16b		;"
-		"	st1	{v0.16b}, %[out]		;"
-
-	:	[out]		"=Q"(*out),
-		[key]		"=r"(dummy0),
-		[rounds]	"=r"(dummy1)
-	:	[in]		"Q"(*in),
-				"1"(ctx->key_enc),
-				"2"(num_rounds(ctx) - 2)
-	:	"cc");
-
+	__aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
 	kernel_neon_end();
 }
 
 static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 {
 	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	struct aes_block *out = (struct aes_block *)dst;
-	struct aes_block const *in = (struct aes_block *)src;
-	void *dummy0;
-	int dummy1;
 
 	if (!may_use_simd()) {
 		__aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
@@ -105,62 +72,10 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
 	}
 
 	kernel_neon_begin();
-
-	__asm__("	ld1	{v0.16b}, %[in]			;"
-		"	ld1	{v1.4s}, [%[key]], #16		;"
-		"	cmp	%w[rounds], #10			;"
-		"	bmi	0f				;"
-		"	bne	3f				;"
-		"	mov	v3.16b, v1.16b			;"
-		"	b	2f				;"
-		"0:	mov	v2.16b, v1.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"1:	aesd	v0.16b, v2.16b			;"
-		"	aesimc	v0.16b, v0.16b			;"
-		"2:	ld1	{v1.4s}, [%[key]], #16		;"
-		"	aesd	v0.16b, v3.16b			;"
-		"	aesimc	v0.16b, v0.16b			;"
-		"3:	ld1	{v2.4s}, [%[key]], #16		;"
-		"	subs	%w[rounds], %w[rounds], #3	;"
-		"	aesd	v0.16b, v1.16b			;"
-		"	aesimc	v0.16b, v0.16b			;"
-		"	ld1	{v3.4s}, [%[key]], #16		;"
-		"	bpl	1b				;"
-		"	aesd	v0.16b, v2.16b			;"
-		"	eor	v0.16b, v0.16b, v3.16b		;"
-		"	st1	{v0.16b}, %[out]		;"
-
-	:	[out]		"=Q"(*out),
-		[key]		"=r"(dummy0),
-		[rounds]	"=r"(dummy1)
-	:	[in]		"Q"(*in),
-				"1"(ctx->key_dec),
-				"2"(num_rounds(ctx) - 2)
-	:	"cc");
-
+	__aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
 	kernel_neon_end();
 }
 
-/*
- * aes_sub() - use the aese instruction to perform the AES sbox substitution
- *             on each byte in 'input'
- */
-static u32 aes_sub(u32 input)
-{
-	u32 ret;
-
-	__asm__("dup	v1.4s, %w[in]		;"
-		"movi	v0.16b, #0		;"
-		"aese	v0.16b, v1.16b		;"
-		"umov	%w[out], v0.4s[0]	;"
-
-	:	[out]	"=r"(ret)
-	:	[in]	"r"(input)
-	:		"v0","v1");
-
-	return ret;
-}
-
 int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		     unsigned int key_len)
 {
@@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		u32 *rki = ctx->key_enc + (i * kwords);
 		u32 *rko = rki + kwords;
 
-		rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
+		rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
 		rko[1] = rko[0] ^ rki[1];
 		rko[2] = rko[1] ^ rki[2];
 		rko[3] = rko[2] ^ rki[3];
@@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		} else if (key_len == AES_KEYSIZE_256) {
 			if (i >= 6)
 				break;
-			rko[4] = aes_sub(rko[3]) ^ rki[4];
+			rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
 			rko[5] = rko[4] ^ rki[5];
 			rko[6] = rko[5] ^ rki[6];
 			rko[7] = rko[6] ^ rki[7];
@@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
 
 	key_dec[0] = key_enc[j];
 	for (i = 1, j--; j > 0; i++, j--)
-		__asm__("ld1	{v0.4s}, %[in]		;"
-			"aesimc	v1.16b, v0.16b		;"
-			"st1	{v1.4s}, %[out]	;"
-
-		:	[out]	"=Q"(key_dec[i])
-		:	[in]	"Q"(key_enc[j])
-		:		"v0","v1");
+		__aes_ce_invert(key_dec + i, key_enc + j);
 	key_dec[i] = key_enc[0];
 
 	kernel_neon_end();
diff --git a/arch/arm64/crypto/aes-cipher-core.S b/arch/arm64/crypto/aes-cipher-core.S
index 6d2445d603cc..3a44eada2347 100644
--- a/arch/arm64/crypto/aes-cipher-core.S
+++ b/arch/arm64/crypto/aes-cipher-core.S
@@ -125,6 +125,16 @@ CPU_BE(	rev		w7, w7		)
 	ret
 	.endm
 
+ENTRY(__aes_arm64_encrypt)
+	do_crypt	fround, crypto_ft_tab, crypto_ft_tab + 1, 2
+ENDPROC(__aes_arm64_encrypt)
+
+	.align		5
+ENTRY(__aes_arm64_decrypt)
+	do_crypt	iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0
+ENDPROC(__aes_arm64_decrypt)
+
+	.section	".rodata", "a"
 	.align		L1_CACHE_SHIFT
 	.type		__aes_arm64_inverse_sbox, %object
 __aes_arm64_inverse_sbox:
@@ -161,12 +171,3 @@ __aes_arm64_inverse_sbox:
 	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
 	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
 	.size		__aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox
-
-ENTRY(__aes_arm64_encrypt)
-	do_crypt	fround, crypto_ft_tab, crypto_ft_tab + 1, 2
-ENDPROC(__aes_arm64_encrypt)
-
-	.align		5
-ENTRY(__aes_arm64_decrypt)
-	do_crypt	iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0
-ENDPROC(__aes_arm64_decrypt)
diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
index 998ba519a026..2fa850e86aa8 100644
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -665,6 +665,7 @@ static int __init aes_init(void)
 
 unregister_simds:
 	aes_exit();
+	return err;
 unregister_ciphers:
 	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
 	return err;
diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S
index f1e3aa2732f9..1c7b45b7268e 100644
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -32,10 +32,10 @@
 
 	/* preload the entire Sbox */
 	.macro		prepare, sbox, shiftrows, temp
-	adr		\temp, \sbox
 	movi		v12.16b, #0x1b
-	ldr		q13, \shiftrows
-	ldr		q14, .Lror32by8
+	ldr_l		q13, \shiftrows, \temp
+	ldr_l		q14, .Lror32by8, \temp
+	adr_l		\temp, \sbox
 	ld1		{v16.16b-v19.16b}, [\temp], #64
 	ld1		{v20.16b-v23.16b}, [\temp], #64
 	ld1		{v24.16b-v27.16b}, [\temp], #64
@@ -272,7 +272,7 @@
 
 #include "aes-modes.S"
 
-	.text
+	.section	".rodata", "a"
 	.align		6
 .LForward_Sbox:
 	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
diff --git a/arch/arm64/crypto/crc32-ce-core.S b/arch/arm64/crypto/crc32-ce-core.S
index 18f5a8442276..16ed3c7ebd37 100644
--- a/arch/arm64/crypto/crc32-ce-core.S
+++ b/arch/arm64/crypto/crc32-ce-core.S
@@ -50,7 +50,7 @@
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-	.text
+	.section	".rodata", "a"
 	.align		6
 	.cpu		generic+crypto+crc
 
@@ -115,12 +115,13 @@
 	 * uint crc32_pmull_le(unsigned char const *buffer,
 	 *                     size_t len, uint crc32)
 	 */
+	.text
 ENTRY(crc32_pmull_le)
-	adr		x3, .Lcrc32_constants
+	adr_l		x3, .Lcrc32_constants
 	b		0f
 
 ENTRY(crc32c_pmull_le)
-	adr		x3, .Lcrc32c_constants
+	adr_l		x3, .Lcrc32c_constants
 
 0:	bic		LEN, LEN, #15
 	ld1		{v1.16b-v4.16b}, [BUF], #0x40
diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c
index 624f4137918c..34b4e3d46aab 100644
--- a/arch/arm64/crypto/crc32-ce-glue.c
+++ b/arch/arm64/crypto/crc32-ce-glue.c
@@ -185,6 +185,7 @@ static struct shash_alg crc32_pmull_algs[] = { {
 	.base.cra_name		= "crc32",
 	.base.cra_driver_name	= "crc32-arm64-ce",
 	.base.cra_priority	= 200,
+	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
 	.base.cra_blocksize	= 1,
 	.base.cra_module	= THIS_MODULE,
 }, {
@@ -200,6 +201,7 @@ static struct shash_alg crc32_pmull_algs[] = { {
 	.base.cra_name		= "crc32c",
 	.base.cra_driver_name	= "crc32c-arm64-ce",
 	.base.cra_priority	= 200,
+	.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
 	.base.cra_blocksize	= 1,
 	.base.cra_module	= THIS_MODULE,
 } };
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index d5b5a8c038c8..f179c01bd55c 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -128,7 +128,7 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	// XOR the initial_crc value
 	eor		v0.16b, v0.16b, v10.16b
 
-	ldr		q10, rk3	// xmm10 has rk3 and rk4
+	ldr_l		q10, rk3, x8	// xmm10 has rk3 and rk4
 					// type of pmull instruction
 					// will determine which constant to use
 
@@ -184,13 +184,13 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	// fold the 8 vector registers to 1 vector register with different
 	// constants
 
-	ldr		q10, rk9
+	ldr_l		q10, rk9, x8
 
 	.macro		fold16, reg, rk
 	pmull		v8.1q, \reg\().1d, v10.1d
 	pmull2		\reg\().1q, \reg\().2d, v10.2d
 	.ifnb		\rk
-	ldr		q10, \rk
+	ldr_l		q10, \rk, x8
 	.endif
 	eor		v7.16b, v7.16b, v8.16b
 	eor		v7.16b, v7.16b, \reg\().16b
@@ -251,7 +251,7 @@ CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
 
 	// get rid of the extra data that was loaded before
 	// load the shift constant
-	adr		x4, tbl_shf_table + 16
+	adr_l		x4, tbl_shf_table + 16
 	sub		x4, x4, arg3
 	ld1		{v0.16b}, [x4]
 
@@ -275,7 +275,7 @@ CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
 
 _128_done:
 	// compute crc of a 128-bit value
-	ldr		q10, rk5		// rk5 and rk6 in xmm10
+	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10
 
 	// 64b fold
 	ext		v0.16b, vzr.16b, v7.16b, #8
@@ -291,7 +291,7 @@ _128_done:
 
 	// barrett reduction
 _barrett:
-	ldr		q10, rk7
+	ldr_l		q10, rk7, x8
 	mov		v0.d[0], v7.d[1]
 
 	pmull		v0.1q, v0.1d, v10.1d
@@ -321,7 +321,7 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	b.eq		_128_done		// exactly 16 left
 	b.lt		_less_than_16_left
 
-	ldr		q10, rk1		// rk1 and rk2 in xmm10
+	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10
 
 	// update the counter. subtract 32 instead of 16 to save one
 	// instruction from the loop
@@ -333,7 +333,7 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 
 _less_than_16_left:
 	// shl r9, 4
-	adr		x0, tbl_shf_table + 16
+	adr_l		x0, tbl_shf_table + 16
 	sub		x0, x0, arg3
 	ld1		{v0.16b}, [x0]
 	movi		v9.16b, #0x80
@@ -345,6 +345,7 @@ ENDPROC(crc_t10dif_pmull)
 // precomputed constants
 // these constants are precomputed from the poly:
 // 0x8bb70000 (0x8bb7 scaled to 32 bits)
+	.section	".rodata", "a"
 	.align		4
 // Q = 0x18BB70000
 // rk1 = 2^(32*3) mod Q << 32
diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
index 8550408735a0..46049850727d 100644
--- a/arch/arm64/crypto/sha1-ce-core.S
+++ b/arch/arm64/crypto/sha1-ce-core.S
@@ -58,12 +58,11 @@
 	sha1su1		v\s0\().4s, v\s3\().4s
 	.endm
 
-	/*
-	 * The SHA1 round constants
-	 */
-	.align		4
-.Lsha1_rcon:
-	.word		0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
+	.macro		loadrc, k, val, tmp
+	movz		\tmp, :abs_g0_nc:\val
+	movk		\tmp, :abs_g1:\val
+	dup		\k, \tmp
+	.endm
 
 	/*
 	 * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
@@ -71,11 +70,10 @@
 	 */
 ENTRY(sha1_ce_transform)
 	/* load round constants */
-	adr		x6, .Lsha1_rcon
-	ld1r		{k0.4s}, [x6], #4
-	ld1r		{k1.4s}, [x6], #4
-	ld1r		{k2.4s}, [x6], #4
-	ld1r		{k3.4s}, [x6]
+	loadrc		k0.4s, 0x5a827999, w6
+	loadrc		k1.4s, 0x6ed9eba1, w6
+	loadrc		k2.4s, 0x8f1bbcdc, w6
+	loadrc		k3.4s, 0xca62c1d6, w6
 
 	/* load state */
 	ld1		{dgav.4s}, [x0]
diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
index 679c6c002f4f..4c3c89b812ce 100644
--- a/arch/arm64/crypto/sha2-ce-core.S
+++ b/arch/arm64/crypto/sha2-ce-core.S
@@ -53,6 +53,7 @@
 	/*
 	 * The SHA-256 round constants
 	 */
+	.section	".rodata", "a"
 	.align		4
 .Lsha2_rcon:
 	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
@@ -76,9 +77,10 @@
 	 * void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
 	 *			  int blocks)
 	 */
+	.text
 ENTRY(sha2_ce_transform)
 	/* load round constants */
-	adr		x8, .Lsha2_rcon
+	adr_l		x8, .Lsha2_rcon
 	ld1		{ v0.4s- v3.4s}, [x8], #64
 	ld1		{ v4.4s- v7.4s}, [x8], #64
 	ld1		{ v8.4s-v11.4s}, [x8], #64
diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
new file mode 100644
index 000000000000..332ad7530690
--- /dev/null
+++ b/arch/arm64/crypto/sha3-ce-core.S
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.irp	b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+	.set	.Lv\b\().2d, \b
+	.set	.Lv\b\().16b, \b
+	.endr
+
+	/*
+	 * ARMv8.2 Crypto Extensions instructions
+	 */
+	.macro	eor3, rd, rn, rm, ra
+	.inst	0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+	.endm
+
+	.macro	rax1, rd, rn, rm
+	.inst	0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+	.endm
+
+	.macro	bcax, rd, rn, rm, ra
+	.inst	0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+	.endm
+
+	.macro	xar, rd, rn, rm, imm6
+	.inst	0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
+	.endm
+
+	/*
+	 * sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
+	 */
+	.text
+ENTRY(sha3_ce_transform)
+	/* load state */
+	add	x8, x0, #32
+	ld1	{ v0.1d- v3.1d}, [x0]
+	ld1	{ v4.1d- v7.1d}, [x8], #32
+	ld1	{ v8.1d-v11.1d}, [x8], #32
+	ld1	{v12.1d-v15.1d}, [x8], #32
+	ld1	{v16.1d-v19.1d}, [x8], #32
+	ld1	{v20.1d-v23.1d}, [x8], #32
+	ld1	{v24.1d}, [x8]
+
+0:	sub	w2, w2, #1
+	mov	w8, #24
+	adr_l	x9, .Lsha3_rcon
+
+	/* load input */
+	ld1	{v25.8b-v28.8b}, [x1], #32
+	ld1	{v29.8b-v31.8b}, [x1], #24
+	eor	v0.8b, v0.8b, v25.8b
+	eor	v1.8b, v1.8b, v26.8b
+	eor	v2.8b, v2.8b, v27.8b
+	eor	v3.8b, v3.8b, v28.8b
+	eor	v4.8b, v4.8b, v29.8b
+	eor	v5.8b, v5.8b, v30.8b
+	eor	v6.8b, v6.8b, v31.8b
+
+	tbnz	x3, #6, 2f		// SHA3-512
+
+	ld1	{v25.8b-v28.8b}, [x1], #32
+	ld1	{v29.8b-v30.8b}, [x1], #16
+	eor	 v7.8b,  v7.8b, v25.8b
+	eor	 v8.8b,  v8.8b, v26.8b
+	eor	 v9.8b,  v9.8b, v27.8b
+	eor	v10.8b, v10.8b, v28.8b
+	eor	v11.8b, v11.8b, v29.8b
+	eor	v12.8b, v12.8b, v30.8b
+
+	tbnz	x3, #4, 1f		// SHA3-384 or SHA3-224
+
+	// SHA3-256
+	ld1	{v25.8b-v28.8b}, [x1], #32
+	eor	v13.8b, v13.8b, v25.8b
+	eor	v14.8b, v14.8b, v26.8b
+	eor	v15.8b, v15.8b, v27.8b
+	eor	v16.8b, v16.8b, v28.8b
+	b	3f
+
+1:	tbz	x3, #2, 3f		// bit 2 cleared? SHA-384
+
+	// SHA3-224
+	ld1	{v25.8b-v28.8b}, [x1], #32
+	ld1	{v29.8b}, [x1], #8
+	eor	v13.8b, v13.8b, v25.8b
+	eor	v14.8b, v14.8b, v26.8b
+	eor	v15.8b, v15.8b, v27.8b
+	eor	v16.8b, v16.8b, v28.8b
+	eor	v17.8b, v17.8b, v29.8b
+	b	3f
+
+	// SHA3-512
+2:	ld1	{v25.8b-v26.8b}, [x1], #16
+	eor	 v7.8b,  v7.8b, v25.8b
+	eor	 v8.8b,  v8.8b, v26.8b
+
+3:	sub	w8, w8, #1
+
+	eor3	v29.16b,  v4.16b,  v9.16b, v14.16b
+	eor3	v26.16b,  v1.16b,  v6.16b, v11.16b
+	eor3	v28.16b,  v3.16b,  v8.16b, v13.16b
+	eor3	v25.16b,  v0.16b,  v5.16b, v10.16b
+	eor3	v27.16b,  v2.16b,  v7.16b, v12.16b
+	eor3	v29.16b, v29.16b, v19.16b, v24.16b
+	eor3	v26.16b, v26.16b, v16.16b, v21.16b
+	eor3	v28.16b, v28.16b, v18.16b, v23.16b
+	eor3	v25.16b, v25.16b, v15.16b, v20.16b
+	eor3	v27.16b, v27.16b, v17.16b, v22.16b
+
+	rax1	v30.2d, v29.2d, v26.2d	// bc[0]
+	rax1	v26.2d, v26.2d, v28.2d	// bc[2]
+	rax1	v28.2d, v28.2d, v25.2d	// bc[4]
+	rax1	v25.2d, v25.2d, v27.2d	// bc[1]
+	rax1	v27.2d, v27.2d, v29.2d	// bc[3]
+
+	eor	 v0.16b,  v0.16b, v30.16b
+	xar	 v29.2d,   v1.2d,  v25.2d, (64 - 1)
+	xar	  v1.2d,   v6.2d,  v25.2d, (64 - 44)
+	xar	  v6.2d,   v9.2d,  v28.2d, (64 - 20)
+	xar	  v9.2d,  v22.2d,  v26.2d, (64 - 61)
+	xar	 v22.2d,  v14.2d,  v28.2d, (64 - 39)
+	xar	 v14.2d,  v20.2d,  v30.2d, (64 - 18)
+	xar	 v31.2d,   v2.2d,  v26.2d, (64 - 62)
+	xar	  v2.2d,  v12.2d,  v26.2d, (64 - 43)
+	xar	 v12.2d,  v13.2d,  v27.2d, (64 - 25)
+	xar	 v13.2d,  v19.2d,  v28.2d, (64 - 8)
+	xar	 v19.2d,  v23.2d,  v27.2d, (64 - 56)
+	xar	 v23.2d,  v15.2d,  v30.2d, (64 - 41)
+	xar	 v15.2d,   v4.2d,  v28.2d, (64 - 27)
+	xar	 v28.2d,  v24.2d,  v28.2d, (64 - 14)
+	xar	 v24.2d,  v21.2d,  v25.2d, (64 - 2)
+	xar	  v8.2d,   v8.2d,  v27.2d, (64 - 55)
+	xar	  v4.2d,  v16.2d,  v25.2d, (64 - 45)
+	xar	 v16.2d,   v5.2d,  v30.2d, (64 - 36)
+	xar	  v5.2d,   v3.2d,  v27.2d, (64 - 28)
+	xar	 v27.2d,  v18.2d,  v27.2d, (64 - 21)
+	xar	  v3.2d,  v17.2d,  v26.2d, (64 - 15)
+	xar	 v25.2d,  v11.2d,  v25.2d, (64 - 10)
+	xar	 v26.2d,   v7.2d,  v26.2d, (64 - 6)
+	xar	 v30.2d,  v10.2d,  v30.2d, (64 - 3)
+
+	bcax	v20.16b, v31.16b, v22.16b,  v8.16b
+	bcax	v21.16b,  v8.16b, v23.16b, v22.16b
+	bcax	v22.16b, v22.16b, v24.16b, v23.16b
+	bcax	v23.16b, v23.16b, v31.16b, v24.16b
+	bcax	v24.16b, v24.16b,  v8.16b, v31.16b
+
+	ld1r	{v31.2d}, [x9], #8
+
+	bcax	v17.16b, v25.16b, v19.16b,  v3.16b
+	bcax	v18.16b,  v3.16b, v15.16b, v19.16b
+	bcax	v19.16b, v19.16b, v16.16b, v15.16b
+	bcax	v15.16b, v15.16b, v25.16b, v16.16b
+	bcax	v16.16b, v16.16b,  v3.16b, v25.16b
+
+	bcax	v10.16b, v29.16b, v12.16b, v26.16b
+	bcax	v11.16b, v26.16b, v13.16b, v12.16b
+	bcax	v12.16b, v12.16b, v14.16b, v13.16b
+	bcax	v13.16b, v13.16b, v29.16b, v14.16b
+	bcax	v14.16b, v14.16b, v26.16b, v29.16b
+
+	bcax	 v7.16b, v30.16b,  v9.16b,  v4.16b
+	bcax	 v8.16b,  v4.16b,  v5.16b,  v9.16b
+	bcax	 v9.16b,  v9.16b,  v6.16b,  v5.16b
+	bcax	 v5.16b,  v5.16b, v30.16b,  v6.16b
+	bcax	 v6.16b,  v6.16b,  v4.16b, v30.16b
+
+	bcax	 v3.16b, v27.16b,  v0.16b, v28.16b
+	bcax	 v4.16b, v28.16b,  v1.16b,  v0.16b
+	bcax	 v0.16b,  v0.16b,  v2.16b,  v1.16b
+	bcax	 v1.16b,  v1.16b, v27.16b,  v2.16b
+	bcax	 v2.16b,  v2.16b, v28.16b, v27.16b
+
+	eor	 v0.16b,  v0.16b, v31.16b
+
+	cbnz	w8, 3b
+	cbnz	w2, 0b
+
+	/* save state */
+	st1	{ v0.1d- v3.1d}, [x0], #32
+	st1	{ v4.1d- v7.1d}, [x0], #32
+	st1	{ v8.1d-v11.1d}, [x0], #32
+	st1	{v12.1d-v15.1d}, [x0], #32
+	st1	{v16.1d-v19.1d}, [x0], #32
+	st1	{v20.1d-v23.1d}, [x0], #32
+	st1	{v24.1d}, [x0]
+	ret
+ENDPROC(sha3_ce_transform)
+
+	.section	".rodata", "a"
+	.align		8
+.Lsha3_rcon:
+	.quad	0x0000000000000001, 0x0000000000008082, 0x800000000000808a
+	.quad	0x8000000080008000, 0x000000000000808b, 0x0000000080000001
+	.quad	0x8000000080008081, 0x8000000000008009, 0x000000000000008a
+	.quad	0x0000000000000088, 0x0000000080008009, 0x000000008000000a
+	.quad	0x000000008000808b, 0x800000000000008b, 0x8000000000008089
+	.quad	0x8000000000008003, 0x8000000000008002, 0x8000000000000080
+	.quad	0x000000000000800a, 0x800000008000000a, 0x8000000080008081
+	.quad	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
new file mode 100644
index 000000000000..da8222e528bd
--- /dev/null
+++ b/arch/arm64/crypto/sha3-ce-glue.c
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/sha3.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks,
+				  int md_len);
+
+static int sha3_update(struct shash_desc *desc, const u8 *data,
+		       unsigned int len)
+{
+	struct sha3_state *sctx = shash_desc_ctx(desc);
+	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
+
+	if (!may_use_simd())
+		return crypto_sha3_update(desc, data, len);
+
+	if ((sctx->partial + len) >= sctx->rsiz) {
+		int blocks;
+
+		if (sctx->partial) {
+			int p = sctx->rsiz - sctx->partial;
+
+			memcpy(sctx->buf + sctx->partial, data, p);
+			kernel_neon_begin();
+			sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
+			kernel_neon_end();
+
+			data += p;
+			len -= p;
+			sctx->partial = 0;
+		}
+
+		blocks = len / sctx->rsiz;
+		len %= sctx->rsiz;
+
+		if (blocks) {
+			kernel_neon_begin();
+			sha3_ce_transform(sctx->st, data, blocks, digest_size);
+			kernel_neon_end();
+			data += blocks * sctx->rsiz;
+		}
+	}
+
+	if (len) {
+		memcpy(sctx->buf + sctx->partial, data, len);
+		sctx->partial += len;
+	}
+	return 0;
+}
+
+static int sha3_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha3_state *sctx = shash_desc_ctx(desc);
+	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
+	__le64 *digest = (__le64 *)out;
+	int i;
+
+	if (!may_use_simd())
+		return crypto_sha3_final(desc, out);
+
+	sctx->buf[sctx->partial++] = 0x06;
+	memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial);
+	sctx->buf[sctx->rsiz - 1] |= 0x80;
+
+	kernel_neon_begin();
+	sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
+	kernel_neon_end();
+
+	for (i = 0; i < digest_size / 8; i++)
+		put_unaligned_le64(sctx->st[i], digest++);
+
+	if (digest_size & 4)
+		put_unaligned_le32(sctx->st[i], (__le32 *)digest);
+
+	*sctx = (struct sha3_state){};
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
+	.digestsize		= SHA3_224_DIGEST_SIZE,
+	.init			= crypto_sha3_init,
+	.update			= sha3_update,
+	.final			= sha3_final,
+	.descsize		= sizeof(struct sha3_state),
+	.base.cra_name		= "sha3-224",
+	.base.cra_driver_name	= "sha3-224-ce",
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= SHA3_224_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_priority	= 200,
+}, {
+	.digestsize		= SHA3_256_DIGEST_SIZE,
+	.init			= crypto_sha3_init,
+	.update			= sha3_update,
+	.final			= sha3_final,
+	.descsize		= sizeof(struct sha3_state),
+	.base.cra_name		= "sha3-256",
+	.base.cra_driver_name	= "sha3-256-ce",
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= SHA3_256_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_priority	= 200,
+}, {
+	.digestsize		= SHA3_384_DIGEST_SIZE,
+	.init			= crypto_sha3_init,
+	.update			= sha3_update,
+	.final			= sha3_final,
+	.descsize		= sizeof(struct sha3_state),
+	.base.cra_name		= "sha3-384",
+	.base.cra_driver_name	= "sha3-384-ce",
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= SHA3_384_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_priority	= 200,
+}, {
+	.digestsize		= SHA3_512_DIGEST_SIZE,
+	.init			= crypto_sha3_init,
+	.update			= sha3_update,
+	.final			= sha3_final,
+	.descsize		= sizeof(struct sha3_state),
+	.base.cra_name		= "sha3-512",
+	.base.cra_driver_name	= "sha3-512-ce",
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= SHA3_512_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_priority	= 200,
+} };
+
+static int __init sha3_neon_mod_init(void)
+{
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha3_neon_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_cpu_feature_match(SHA3, sha3_neon_mod_init);
+module_exit(sha3_neon_mod_fini);
diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
new file mode 100644
index 000000000000..7f3bca5c59a2
--- /dev/null
+++ b/arch/arm64/crypto/sha512-ce-core.S
@@ -0,0 +1,204 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.irp		b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+	.set		.Lq\b, \b
+	.set		.Lv\b\().2d, \b
+	.endr
+
+	.macro		sha512h, rd, rn, rm
+	.inst		0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+	.endm
+
+	.macro		sha512h2, rd, rn, rm
+	.inst		0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+	.endm
+
+	.macro		sha512su0, rd, rn
+	.inst		0xcec08000 | .L\rd | (.L\rn << 5)
+	.endm
+
+	.macro		sha512su1, rd, rn, rm
+	.inst		0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+	.endm
+
+	/*
+	 * The SHA-512 round constants
+	 */
+	.section	".rodata", "a"
+	.align		4
+.Lsha512_rcon:
+	.quad		0x428a2f98d728ae22, 0x7137449123ef65cd
+	.quad		0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+	.quad		0x3956c25bf348b538, 0x59f111f1b605d019
+	.quad		0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+	.quad		0xd807aa98a3030242, 0x12835b0145706fbe
+	.quad		0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+	.quad		0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+	.quad		0x9bdc06a725c71235, 0xc19bf174cf692694
+	.quad		0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+	.quad		0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+	.quad		0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+	.quad		0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+	.quad		0x983e5152ee66dfab, 0xa831c66d2db43210
+	.quad		0xb00327c898fb213f, 0xbf597fc7beef0ee4
+	.quad		0xc6e00bf33da88fc2, 0xd5a79147930aa725
+	.quad		0x06ca6351e003826f, 0x142929670a0e6e70
+	.quad		0x27b70a8546d22ffc, 0x2e1b21385c26c926
+	.quad		0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+	.quad		0x650a73548baf63de, 0x766a0abb3c77b2a8
+	.quad		0x81c2c92e47edaee6, 0x92722c851482353b
+	.quad		0xa2bfe8a14cf10364, 0xa81a664bbc423001
+	.quad		0xc24b8b70d0f89791, 0xc76c51a30654be30
+	.quad		0xd192e819d6ef5218, 0xd69906245565a910
+	.quad		0xf40e35855771202a, 0x106aa07032bbd1b8
+	.quad		0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+	.quad		0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+	.quad		0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+	.quad		0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+	.quad		0x748f82ee5defb2fc, 0x78a5636f43172f60
+	.quad		0x84c87814a1f0ab72, 0x8cc702081a6439ec
+	.quad		0x90befffa23631e28, 0xa4506cebde82bde9
+	.quad		0xbef9a3f7b2c67915, 0xc67178f2e372532b
+	.quad		0xca273eceea26619c, 0xd186b8c721c0c207
+	.quad		0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+	.quad		0x06f067aa72176fba, 0x0a637dc5a2c898a6
+	.quad		0x113f9804bef90dae, 0x1b710b35131c471b
+	.quad		0x28db77f523047d84, 0x32caab7b40c72493
+	.quad		0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+	.quad		0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+	.quad		0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+
+	.macro		dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
+	.ifnb		\rc1
+	ld1		{v\rc1\().2d}, [x4], #16
+	.endif
+	add		v5.2d, v\rc0\().2d, v\in0\().2d
+	ext		v6.16b, v\i2\().16b, v\i3\().16b, #8
+	ext		v5.16b, v5.16b, v5.16b, #8
+	ext		v7.16b, v\i1\().16b, v\i2\().16b, #8
+	add		v\i3\().2d, v\i3\().2d, v5.2d
+	.ifnb		\in1
+	ext		v5.16b, v\in3\().16b, v\in4\().16b, #8
+	sha512su0	v\in0\().2d, v\in1\().2d
+	.endif
+	sha512h		q\i3, q6, v7.2d
+	.ifnb		\in1
+	sha512su1	v\in0\().2d, v\in2\().2d, v5.2d
+	.endif
+	add		v\i4\().2d, v\i1\().2d, v\i3\().2d
+	sha512h2	q\i3, q\i1, v\i0\().2d
+	.endm
+
+	/*
+	 * void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+	 *			  int blocks)
+	 */
+	.text
+ENTRY(sha512_ce_transform)
+	/* load state */
+	ld1		{v8.2d-v11.2d}, [x0]
+
+	/* load first 4 round constants */
+	adr_l		x3, .Lsha512_rcon
+	ld1		{v20.2d-v23.2d}, [x3], #64
+
+	/* load input */
+0:	ld1		{v12.2d-v15.2d}, [x1], #64
+	ld1		{v16.2d-v19.2d}, [x1], #64
+	sub		w2, w2, #1
+
+CPU_LE(	rev64		v12.16b, v12.16b	)
+CPU_LE(	rev64		v13.16b, v13.16b	)
+CPU_LE(	rev64		v14.16b, v14.16b	)
+CPU_LE(	rev64		v15.16b, v15.16b	)
+CPU_LE(	rev64		v16.16b, v16.16b	)
+CPU_LE(	rev64		v17.16b, v17.16b	)
+CPU_LE(	rev64		v18.16b, v18.16b	)
+CPU_LE(	rev64		v19.16b, v19.16b	)
+
+	mov		x4, x3				// rc pointer
+
+	mov		v0.16b, v8.16b
+	mov		v1.16b, v9.16b
+	mov		v2.16b, v10.16b
+	mov		v3.16b, v11.16b
+
+	// v0  ab  cd  --  ef  gh  ab
+	// v1  cd  --  ef  gh  ab  cd
+	// v2  ef  gh  ab  cd  --  ef
+	// v3  gh  ab  cd  --  ef  gh
+	// v4  --  ef  gh  ab  cd  --
+
+	dround		0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
+	dround		3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
+	dround		2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
+	dround		4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
+	dround		1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
+
+	dround		0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
+	dround		3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
+	dround		2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
+	dround		4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
+	dround		1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
+
+	dround		0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
+	dround		3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
+	dround		2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
+	dround		4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
+	dround		1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
+
+	dround		0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
+	dround		3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
+	dround		2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
+	dround		4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
+	dround		1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
+
+	dround		0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
+	dround		3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
+	dround		2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
+	dround		4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
+	dround		1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
+
+	dround		0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
+	dround		3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
+	dround		2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
+	dround		4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
+	dround		1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
+
+	dround		0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
+	dround		3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
+	dround		2, 3, 1, 4, 0, 28, 24, 12
+	dround		4, 2, 0, 1, 3, 29, 25, 13
+	dround		1, 4, 3, 0, 2, 30, 26, 14
+
+	dround		0, 1, 2, 3, 4, 31, 27, 15
+	dround		3, 0, 4, 2, 1, 24,   , 16
+	dround		2, 3, 1, 4, 0, 25,   , 17
+	dround		4, 2, 0, 1, 3, 26,   , 18
+	dround		1, 4, 3, 0, 2, 27,   , 19
+
+	/* update state */
+	add		v8.2d, v8.2d, v0.2d
+	add		v9.2d, v9.2d, v1.2d
+	add		v10.2d, v10.2d, v2.2d
+	add		v11.2d, v11.2d, v3.2d
+
+	/* handled all input blocks? */
+	cbnz		w2, 0b
+
+	/* store new state */
+3:	st1		{v8.2d-v11.2d}, [x0]
+	ret
+ENDPROC(sha512_ce_transform)
diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
new file mode 100644
index 000000000000..a77c8632a589
--- /dev/null
+++ b/arch/arm64/crypto/sha512-ce-glue.c
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <crypto/sha512_base.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
+				    int blocks);
+
+asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
+
+static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
+			    unsigned int len)
+{
+	if (!may_use_simd())
+		return sha512_base_do_update(desc, data, len,
+				(sha512_block_fn *)sha512_block_data_order);
+
+	kernel_neon_begin();
+	sha512_base_do_update(desc, data, len,
+			      (sha512_block_fn *)sha512_ce_transform);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
+			   unsigned int len, u8 *out)
+{
+	if (!may_use_simd()) {
+		if (len)
+			sha512_base_do_update(desc, data, len,
+				(sha512_block_fn *)sha512_block_data_order);
+		sha512_base_do_finalize(desc,
+				(sha512_block_fn *)sha512_block_data_order);
+		return sha512_base_finish(desc, out);
+	}
+
+	kernel_neon_begin();
+	sha512_base_do_update(desc, data, len,
+			      (sha512_block_fn *)sha512_ce_transform);
+	sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_ce_transform);
+	kernel_neon_end();
+	return sha512_base_finish(desc, out);
+}
+
+static int sha512_ce_final(struct shash_desc *desc, u8 *out)
+{
+	if (!may_use_simd()) {
+		sha512_base_do_finalize(desc,
+				(sha512_block_fn *)sha512_block_data_order);
+		return sha512_base_finish(desc, out);
+	}
+
+	kernel_neon_begin();
+	sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_ce_transform);
+	kernel_neon_end();
+	return sha512_base_finish(desc, out);
+}
+
+static struct shash_alg algs[] = { {
+	.init			= sha384_base_init,
+	.update			= sha512_ce_update,
+	.final			= sha512_ce_final,
+	.finup			= sha512_ce_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA384_DIGEST_SIZE,
+	.base.cra_name		= "sha384",
+	.base.cra_driver_name	= "sha384-ce",
+	.base.cra_priority	= 200,
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= SHA512_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+}, {
+	.init			= sha512_base_init,
+	.update			= sha512_ce_update,
+	.final			= sha512_ce_final,
+	.finup			= sha512_ce_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA512_DIGEST_SIZE,
+	.base.cra_name		= "sha512",
+	.base.cra_driver_name	= "sha512-ce",
+	.base.cra_priority	= 200,
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= SHA512_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+} };
+
+static int __init sha512_ce_mod_init(void)
+{
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha512_ce_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_cpu_feature_match(SHA512, sha512_ce_mod_init);
+module_exit(sha512_ce_mod_fini);
diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c
index aff35c9992a4..27db4851e380 100644
--- a/arch/arm64/crypto/sha512-glue.c
+++ b/arch/arm64/crypto/sha512-glue.c
@@ -27,6 +27,7 @@ MODULE_ALIAS_CRYPTO("sha512");
 
 asmlinkage void sha512_block_data_order(u32 *digest, const void *data,
 					unsigned int num_blks);
+EXPORT_SYMBOL(sha512_block_data_order);
 
 static int sha512_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int len)
diff --git a/arch/arm64/crypto/sm3-ce-core.S b/arch/arm64/crypto/sm3-ce-core.S
new file mode 100644
index 000000000000..27169fe07a68
--- /dev/null
+++ b/arch/arm64/crypto/sm3-ce-core.S
@@ -0,0 +1,141 @@
+/*
+ * sm3-ce-core.S - SM3 secure hash using ARMv8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.irp		b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+	.set		.Lv\b\().4s, \b
+	.endr
+
+	.macro		sm3partw1, rd, rn, rm
+	.inst		0xce60c000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+	.endm
+
+	.macro		sm3partw2, rd, rn, rm
+	.inst		0xce60c400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+	.endm
+
+	.macro		sm3ss1, rd, rn, rm, ra
+	.inst		0xce400000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+	.endm
+
+	.macro		sm3tt1a, rd, rn, rm, imm2
+	.inst		0xce408000 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
+	.endm
+
+	.macro		sm3tt1b, rd, rn, rm, imm2
+	.inst		0xce408400 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
+	.endm
+
+	.macro		sm3tt2a, rd, rn, rm, imm2
+	.inst		0xce408800 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
+	.endm
+
+	.macro		sm3tt2b, rd, rn, rm, imm2
+	.inst		0xce408c00 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
+	.endm
+
+	.macro		round, ab, s0, t0, t1, i
+	sm3ss1		v5.4s, v8.4s, \t0\().4s, v9.4s
+	shl		\t1\().4s, \t0\().4s, #1
+	sri		\t1\().4s, \t0\().4s, #31
+	sm3tt1\ab	v8.4s, v5.4s, v10.4s, \i
+	sm3tt2\ab	v9.4s, v5.4s, \s0\().4s, \i
+	.endm
+
+	.macro		qround, ab, s0, s1, s2, s3, s4
+	.ifnb		\s4
+	ext		\s4\().16b, \s1\().16b, \s2\().16b, #12
+	ext		v6.16b, \s0\().16b, \s1\().16b, #12
+	ext		v7.16b, \s2\().16b, \s3\().16b, #8
+	sm3partw1	\s4\().4s, \s0\().4s, \s3\().4s
+	.endif
+
+	eor		v10.16b, \s0\().16b, \s1\().16b
+
+	round		\ab, \s0, v11, v12, 0
+	round		\ab, \s0, v12, v11, 1
+	round		\ab, \s0, v11, v12, 2
+	round		\ab, \s0, v12, v11, 3
+
+	.ifnb		\s4
+	sm3partw2	\s4\().4s, v7.4s, v6.4s
+	.endif
+	.endm
+
+	/*
+	 * void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
+	 *                       int blocks)
+	 */
+	.text
+ENTRY(sm3_ce_transform)
+	/* load state */
+	ld1		{v8.4s-v9.4s}, [x0]
+	rev64		v8.4s, v8.4s
+	rev64		v9.4s, v9.4s
+	ext		v8.16b, v8.16b, v8.16b, #8
+	ext		v9.16b, v9.16b, v9.16b, #8
+
+	adr_l		x8, .Lt
+	ldp		s13, s14, [x8]
+
+	/* load input */
+0:	ld1		{v0.16b-v3.16b}, [x1], #64
+	sub		w2, w2, #1
+
+	mov		v15.16b, v8.16b
+	mov		v16.16b, v9.16b
+
+CPU_LE(	rev32		v0.16b, v0.16b		)
+CPU_LE(	rev32		v1.16b, v1.16b		)
+CPU_LE(	rev32		v2.16b, v2.16b		)
+CPU_LE(	rev32		v3.16b, v3.16b		)
+
+	ext		v11.16b, v13.16b, v13.16b, #4
+
+	qround		a, v0, v1, v2, v3, v4
+	qround		a, v1, v2, v3, v4, v0
+	qround		a, v2, v3, v4, v0, v1
+	qround		a, v3, v4, v0, v1, v2
+
+	ext		v11.16b, v14.16b, v14.16b, #4
+
+	qround		b, v4, v0, v1, v2, v3
+	qround		b, v0, v1, v2, v3, v4
+	qround		b, v1, v2, v3, v4, v0
+	qround		b, v2, v3, v4, v0, v1
+	qround		b, v3, v4, v0, v1, v2
+	qround		b, v4, v0, v1, v2, v3
+	qround		b, v0, v1, v2, v3, v4
+	qround		b, v1, v2, v3, v4, v0
+	qround		b, v2, v3, v4, v0, v1
+	qround		b, v3, v4
+	qround		b, v4, v0
+	qround		b, v0, v1
+
+	eor		v8.16b, v8.16b, v15.16b
+	eor		v9.16b, v9.16b, v16.16b
+
+	/* handled all input blocks? */
+	cbnz		w2, 0b
+
+	/* save state */
+	rev64		v8.4s, v8.4s
+	rev64		v9.4s, v9.4s
+	ext		v8.16b, v8.16b, v8.16b, #8
+	ext		v9.16b, v9.16b, v9.16b, #8
+	st1		{v8.4s-v9.4s}, [x0]
+	ret
+ENDPROC(sm3_ce_transform)
+
+	.section	".rodata", "a"
+	.align		3
+.Lt:	.word		0x79cc4519, 0x9d8a7a87
diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c
new file mode 100644
index 000000000000..3b4948f7e26f
--- /dev/null
+++ b/arch/arm64/crypto/sm3-ce-glue.c
@@ -0,0 +1,92 @@
+/*
+ * sm3-ce-glue.c - SM3 secure hash using ARMv8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/sm3.h>
+#include <crypto/sm3_base.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
+				 int blocks);
+
+static int sm3_ce_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int len)
+{
+	if (!may_use_simd())
+		return crypto_sm3_update(desc, data, len);
+
+	kernel_neon_begin();
+	sm3_base_do_update(desc, data, len, sm3_ce_transform);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int sm3_ce_final(struct shash_desc *desc, u8 *out)
+{
+	if (!may_use_simd())
+		return crypto_sm3_finup(desc, NULL, 0, out);
+
+	kernel_neon_begin();
+	sm3_base_do_finalize(desc, sm3_ce_transform);
+	kernel_neon_end();
+
+	return sm3_base_finish(desc, out);
+}
+
+static int sm3_ce_finup(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{
+	if (!may_use_simd())
+		return crypto_sm3_finup(desc, data, len, out);
+
+	kernel_neon_begin();
+	sm3_base_do_update(desc, data, len, sm3_ce_transform);
+	kernel_neon_end();
+
+	return sm3_ce_final(desc, out);
+}
+
+static struct shash_alg sm3_alg = {
+	.digestsize		= SM3_DIGEST_SIZE,
+	.init			= sm3_base_init,
+	.update			= sm3_ce_update,
+	.final			= sm3_ce_final,
+	.finup			= sm3_ce_finup,
+	.descsize		= sizeof(struct sm3_state),
+	.base.cra_name		= "sm3",
+	.base.cra_driver_name	= "sm3-ce",
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= SM3_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_priority	= 200,
+};
+
+static int __init sm3_ce_mod_init(void)
+{
+	return crypto_register_shash(&sm3_alg);
+}
+
+static void __exit sm3_ce_mod_fini(void)
+{
+	crypto_unregister_shash(&sm3_alg);
+}
+
+module_cpu_feature_match(SM3, sm3_ce_mod_init);
+module_exit(sm3_ce_mod_fini);
diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
index f058e0c3e4d4..fd1d6c83f0c0 100644
--- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c
+++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
@@ -141,6 +141,7 @@ static struct shash_alg alg = {
 		.cra_name		= "crc32c",
 		.cra_driver_name	= "crc32c-vpmsum",
 		.cra_priority		= 200,
+		.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
 		.cra_blocksize		= CHKSUM_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(u32),
 		.cra_module		= THIS_MODULE,
diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c
index 436865926c26..423ee05887e6 100644
--- a/arch/s390/crypto/crc32-vx.c
+++ b/arch/s390/crypto/crc32-vx.c
@@ -239,6 +239,7 @@ static struct shash_alg crc32_vx_algs[] = {
 			.cra_name	 = "crc32",
 			.cra_driver_name = "crc32-vx",
 			.cra_priority	 = 200,
+			.cra_flags	 = CRYPTO_ALG_OPTIONAL_KEY,
 			.cra_blocksize	 = CRC32_BLOCK_SIZE,
 			.cra_ctxsize	 = sizeof(struct crc_ctx),
 			.cra_module	 = THIS_MODULE,
@@ -259,6 +260,7 @@ static struct shash_alg crc32_vx_algs[] = {
 			.cra_name	 = "crc32be",
 			.cra_driver_name = "crc32be-vx",
 			.cra_priority	 = 200,
+			.cra_flags	 = CRYPTO_ALG_OPTIONAL_KEY,
 			.cra_blocksize	 = CRC32_BLOCK_SIZE,
 			.cra_ctxsize	 = sizeof(struct crc_ctx),
 			.cra_module	 = THIS_MODULE,
@@ -279,6 +281,7 @@ static struct shash_alg crc32_vx_algs[] = {
 			.cra_name	 = "crc32c",
 			.cra_driver_name = "crc32c-vx",
 			.cra_priority	 = 200,
+			.cra_flags	 = CRYPTO_ALG_OPTIONAL_KEY,
 			.cra_blocksize	 = CRC32_BLOCK_SIZE,
 			.cra_ctxsize	 = sizeof(struct crc_ctx),
 			.cra_module	 = THIS_MODULE,
diff --git a/arch/sparc/crypto/crc32c_glue.c b/arch/sparc/crypto/crc32c_glue.c
index d1064e46efe8..8aa664638c3c 100644
--- a/arch/sparc/crypto/crc32c_glue.c
+++ b/arch/sparc/crypto/crc32c_glue.c
@@ -133,6 +133,7 @@ static struct shash_alg alg = {
 		.cra_name		=	"crc32c",
 		.cra_driver_name	=	"crc32c-sparc64",
 		.cra_priority		=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags		=	CRYPTO_ALG_OPTIONAL_KEY,
 		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
 		.cra_ctxsize		=	sizeof(u32),
 		.cra_alignmask		=	7,
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 3d09e3aca18d..12e8484a8ee7 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -90,30 +90,6 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
             .octa 0x00000000000000000000000000000000
 
-.section .rodata
-.align 16
-.type aad_shift_arr, @object
-.size aad_shift_arr, 272
-aad_shift_arr:
-        .octa     0xffffffffffffffffffffffffffffffff
-        .octa     0xffffffffffffffffffffffffffffff0C
-        .octa     0xffffffffffffffffffffffffffff0D0C
-        .octa     0xffffffffffffffffffffffffff0E0D0C
-        .octa     0xffffffffffffffffffffffff0F0E0D0C
-        .octa     0xffffffffffffffffffffff0C0B0A0908
-        .octa     0xffffffffffffffffffff0D0C0B0A0908
-        .octa     0xffffffffffffffffff0E0D0C0B0A0908
-        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
-        .octa     0xffffffffffffff0C0B0A090807060504
-        .octa     0xffffffffffff0D0C0B0A090807060504
-        .octa     0xffffffffff0E0D0C0B0A090807060504
-        .octa     0xffffffff0F0E0D0C0B0A090807060504
-        .octa     0xffffff0C0B0A09080706050403020100
-        .octa     0xffff0D0C0B0A09080706050403020100
-        .octa     0xff0E0D0C0B0A09080706050403020100
-        .octa     0x0F0E0D0C0B0A09080706050403020100
-
-
 .text
 
 
@@ -257,6 +233,37 @@ aad_shift_arr:
 	pxor      \TMP1, \GH            # result is in TMP1
 .endm
 
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
+# where 0 < DLEN < 16
+# Clobbers %rax, DLEN and XMM1
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
+        cmp $8, \DLEN
+        jl _read_lt8_\@
+        mov (\DPTR), %rax
+        MOVQ_R64_XMM %rax, \XMMDst
+        sub $8, \DLEN
+        jz _done_read_partial_block_\@
+	xor %eax, %eax
+_read_next_byte_\@:
+        shl $8, %rax
+        mov 7(\DPTR, \DLEN, 1), %al
+        dec \DLEN
+        jnz _read_next_byte_\@
+        MOVQ_R64_XMM %rax, \XMM1
+	pslldq $8, \XMM1
+        por \XMM1, \XMMDst
+	jmp _done_read_partial_block_\@
+_read_lt8_\@:
+	xor %eax, %eax
+_read_next_byte_lt8_\@:
+        shl $8, %rax
+        mov -1(\DPTR, \DLEN, 1), %al
+        dec \DLEN
+        jnz _read_next_byte_lt8_\@
+        MOVQ_R64_XMM %rax, \XMMDst
+_done_read_partial_block_\@:
+.endm
+
 /*
 * if a = number of total plaintext bytes
 * b = floor(a/16)
@@ -273,62 +280,30 @@ aad_shift_arr:
 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
         MOVADQ     SHUF_MASK(%rip), %xmm14
 	mov	   arg7, %r10           # %r10 = AAD
-	mov	   arg8, %r12           # %r12 = aadLen
-	mov	   %r12, %r11
+	mov	   arg8, %r11           # %r11 = aadLen
 	pxor	   %xmm\i, %xmm\i
 	pxor       \XMM2, \XMM2
 
 	cmp	   $16, %r11
-	jl	   _get_AAD_rest8\num_initial_blocks\operation
+	jl	   _get_AAD_rest\num_initial_blocks\operation
 _get_AAD_blocks\num_initial_blocks\operation:
 	movdqu	   (%r10), %xmm\i
 	PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
 	pxor	   %xmm\i, \XMM2
 	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 	add	   $16, %r10
-	sub	   $16, %r12
 	sub	   $16, %r11
 	cmp	   $16, %r11
 	jge	   _get_AAD_blocks\num_initial_blocks\operation
 
 	movdqu	   \XMM2, %xmm\i
+
+	/* read the last <16B of AAD */
+_get_AAD_rest\num_initial_blocks\operation:
 	cmp	   $0, %r11
 	je	   _get_AAD_done\num_initial_blocks\operation
 
-	pxor	   %xmm\i,%xmm\i
-
-	/* read the last <16B of AAD. since we have at least 4B of
-	data right after the AAD (the ICV, and maybe some CT), we can
-	read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\num_initial_blocks\operation:
-	cmp	   $4, %r11
-	jle	   _get_AAD_rest4\num_initial_blocks\operation
-	movq	   (%r10), \TMP1
-	add	   $8, %r10
-	sub	   $8, %r11
-	pslldq	   $8, \TMP1
-	psrldq	   $8, %xmm\i
-	pxor	   \TMP1, %xmm\i
-	jmp	   _get_AAD_rest8\num_initial_blocks\operation
-_get_AAD_rest4\num_initial_blocks\operation:
-	cmp	   $0, %r11
-	jle	   _get_AAD_rest0\num_initial_blocks\operation
-	mov	   (%r10), %eax
-	movq	   %rax, \TMP1
-	add	   $4, %r10
-	sub	   $4, %r10
-	pslldq	   $12, \TMP1
-	psrldq	   $4, %xmm\i
-	pxor	   \TMP1, %xmm\i
-_get_AAD_rest0\num_initial_blocks\operation:
-	/* finalize: shift out the extra bytes we read, and align
-	left. since pslldq can only shift by an immediate, we use
-	vpshufb and an array of shuffle masks */
-	movq	   %r12, %r11
-	salq	   $4, %r11
-	movdqu	   aad_shift_arr(%r11), \TMP1
-	PSHUFB_XMM \TMP1, %xmm\i
-_get_AAD_rest_final\num_initial_blocks\operation:
+	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
 	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 	pxor	   \XMM2, %xmm\i
 	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -532,62 +507,30 @@ _initial_blocks_done\num_initial_blocks\operation:
 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
         MOVADQ     SHUF_MASK(%rip), %xmm14
 	mov	   arg7, %r10           # %r10 = AAD
-	mov	   arg8, %r12           # %r12 = aadLen
-	mov	   %r12, %r11
+	mov	   arg8, %r11           # %r11 = aadLen
 	pxor	   %xmm\i, %xmm\i
 	pxor	   \XMM2, \XMM2
 
 	cmp	   $16, %r11
-	jl	   _get_AAD_rest8\num_initial_blocks\operation
+	jl	   _get_AAD_rest\num_initial_blocks\operation
 _get_AAD_blocks\num_initial_blocks\operation:
 	movdqu	   (%r10), %xmm\i
 	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 	pxor	   %xmm\i, \XMM2
 	GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 	add	   $16, %r10
-	sub	   $16, %r12
 	sub	   $16, %r11
 	cmp	   $16, %r11
 	jge	   _get_AAD_blocks\num_initial_blocks\operation
 
 	movdqu	   \XMM2, %xmm\i
+
+	/* read the last <16B of AAD */
+_get_AAD_rest\num_initial_blocks\operation:
 	cmp	   $0, %r11
 	je	   _get_AAD_done\num_initial_blocks\operation
 
-	pxor	   %xmm\i,%xmm\i
-
-	/* read the last <16B of AAD. since we have at least 4B of
-	data right after the AAD (the ICV, and maybe some PT), we can
-	read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\num_initial_blocks\operation:
-	cmp	   $4, %r11
-	jle	   _get_AAD_rest4\num_initial_blocks\operation
-	movq	   (%r10), \TMP1
-	add	   $8, %r10
-	sub	   $8, %r11
-	pslldq	   $8, \TMP1
-	psrldq	   $8, %xmm\i
-	pxor	   \TMP1, %xmm\i
-	jmp	   _get_AAD_rest8\num_initial_blocks\operation
-_get_AAD_rest4\num_initial_blocks\operation:
-	cmp	   $0, %r11
-	jle	   _get_AAD_rest0\num_initial_blocks\operation
-	mov	   (%r10), %eax
-	movq	   %rax, \TMP1
-	add	   $4, %r10
-	sub	   $4, %r10
-	pslldq	   $12, \TMP1
-	psrldq	   $4, %xmm\i
-	pxor	   \TMP1, %xmm\i
-_get_AAD_rest0\num_initial_blocks\operation:
-	/* finalize: shift out the extra bytes we read, and align
-	left. since pslldq can only shift by an immediate, we use
-	vpshufb and an array of shuffle masks */
-	movq	   %r12, %r11
-	salq	   $4, %r11
-	movdqu	   aad_shift_arr(%r11), \TMP1
-	PSHUFB_XMM \TMP1, %xmm\i
-_get_AAD_rest_final\num_initial_blocks\operation:
+	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
 	PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
 	pxor	   \XMM2, %xmm\i
 	GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -1386,14 +1329,6 @@ _esb_loop_\@:
 *
 *                        AAD Format with 64-bit Extended Sequence Number
 *
-* aadLen:
-*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
-*       The code supports 16 too but for other sizes, the code will fail.
-*
-* TLen:
-*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-*       For other sizes, the code will fail.
-*
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 *
 *****************************************************************************/
@@ -1487,19 +1422,16 @@ _zero_cipher_left_decrypt:
 	PSHUFB_XMM %xmm10, %xmm0
 
 	ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
-	sub $16, %r11
-	add %r13, %r11
-	movdqu (%arg3,%r11,1), %xmm1   # receive the last <16 byte block
-	lea SHIFT_MASK+16(%rip), %r12
-	sub %r13, %r12
-# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
-# (%r13 is the number of bytes in plaintext mod 16)
-	movdqu (%r12), %xmm2           # get the appropriate shuffle mask
-	PSHUFB_XMM %xmm2, %xmm1            # right shift 16-%r13 butes
 
+	lea (%arg3,%r11,1), %r10
+	mov %r13, %r12
+	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+
+	lea ALL_F+16(%rip), %r12
+	sub %r13, %r12
 	movdqa  %xmm1, %xmm2
 	pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
-	movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+	movdqu (%r12), %xmm1
 	# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
 	pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
 	pand    %xmm1, %xmm2
@@ -1508,9 +1440,6 @@ _zero_cipher_left_decrypt:
 
 	pxor %xmm2, %xmm8
 	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-	          # GHASH computation for the last <16 byte block
-	sub %r13, %r11
-	add $16, %r11
 
         # output %r13 bytes
 	MOVQ_R64_XMM	%xmm0, %rax
@@ -1664,14 +1593,6 @@ ENDPROC(aesni_gcm_dec)
 *
 *                         AAD Format with 64-bit Extended Sequence Number
 *
-* aadLen:
-*       from the definition of the spec, aadLen can only be 8 or 12 bytes.
-*       The code supports 16 too but for other sizes, the code will fail.
-*
-* TLen:
-*       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-*       For other sizes, the code will fail.
-*
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 ***************************************************************************/
 ENTRY(aesni_gcm_enc)
@@ -1764,19 +1685,16 @@ _zero_cipher_left_encrypt:
         movdqa SHUF_MASK(%rip), %xmm10
 	PSHUFB_XMM %xmm10, %xmm0
 
-
 	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
-	sub $16, %r11
-	add %r13, %r11
-	movdqu (%arg3,%r11,1), %xmm1     # receive the last <16 byte blocks
-	lea SHIFT_MASK+16(%rip), %r12
+
+	lea (%arg3,%r11,1), %r10
+	mov %r13, %r12
+	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+
+	lea ALL_F+16(%rip), %r12
 	sub %r13, %r12
-	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
-	# (%r13 is the number of bytes in plaintext mod 16)
-	movdqu	(%r12), %xmm2           # get the appropriate shuffle mask
-	PSHUFB_XMM	%xmm2, %xmm1            # shift right 16-r13 byte
 	pxor	%xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
-	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
+	movdqu	(%r12), %xmm1
 	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
 	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
         movdqa SHUF_MASK(%rip), %xmm10
@@ -1785,9 +1703,6 @@ _zero_cipher_left_encrypt:
 	pxor	%xmm0, %xmm8
 	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 	# GHASH computation for the last <16 byte block
-	sub	%r13, %r11
-	add	$16, %r11
-
 	movdqa SHUF_MASK(%rip), %xmm10
 	PSHUFB_XMM %xmm10, %xmm0
 
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 3bf3dcf29825..34cf1c1f8c98 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -690,8 +690,8 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
 	       rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
 }
 
-static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
-			   unsigned int key_len)
+static int gcmaes_wrapper_set_key(struct crypto_aead *parent, const u8 *key,
+				  unsigned int key_len)
 {
 	struct cryptd_aead **ctx = crypto_aead_ctx(parent);
 	struct cryptd_aead *cryptd_tfm = *ctx;
@@ -716,8 +716,8 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead,
 
 /* This is the Integrity Check Value (aka the authentication tag length and can
  * be 8, 12 or 16 bytes long. */
-static int rfc4106_set_authsize(struct crypto_aead *parent,
-				unsigned int authsize)
+static int gcmaes_wrapper_set_authsize(struct crypto_aead *parent,
+				       unsigned int authsize)
 {
 	struct cryptd_aead **ctx = crypto_aead_ctx(parent);
 	struct cryptd_aead *cryptd_tfm = *ctx;
@@ -824,7 +824,7 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
 	if (sg_is_last(req->src) &&
 	    (!PageHighMem(sg_page(req->src)) ||
 	    req->src->offset + req->src->length <= PAGE_SIZE) &&
-	    sg_is_last(req->dst) &&
+	    sg_is_last(req->dst) && req->dst->length &&
 	    (!PageHighMem(sg_page(req->dst)) ||
 	    req->dst->offset + req->dst->length <= PAGE_SIZE)) {
 		one_entry_in_sg = 1;
@@ -929,7 +929,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
 			      aes_ctx);
 }
 
-static int rfc4106_encrypt(struct aead_request *req)
+static int gcmaes_wrapper_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
@@ -945,7 +945,7 @@ static int rfc4106_encrypt(struct aead_request *req)
 	return crypto_aead_encrypt(req);
 }
 
-static int rfc4106_decrypt(struct aead_request *req)
+static int gcmaes_wrapper_decrypt(struct aead_request *req)
 {
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
@@ -1117,7 +1117,7 @@ static int generic_gcmaes_decrypt(struct aead_request *req)
 {
 	__be32 counter = cpu_to_be32(1);
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
 	void *aes_ctx = &(ctx->aes_key_expanded);
 	u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
 
@@ -1128,6 +1128,30 @@ static int generic_gcmaes_decrypt(struct aead_request *req)
 			      aes_ctx);
 }
 
+static int generic_gcmaes_init(struct crypto_aead *aead)
+{
+	struct cryptd_aead *cryptd_tfm;
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_tfm = cryptd_alloc_aead("__driver-generic-gcm-aes-aesni",
+				       CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+
+	*ctx = cryptd_tfm;
+	crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base));
+
+	return 0;
+}
+
+static void generic_gcmaes_exit(struct crypto_aead *aead)
+{
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_free_aead(*ctx);
+}
+
 static struct aead_alg aesni_aead_algs[] = { {
 	.setkey			= common_rfc4106_set_key,
 	.setauthsize		= common_rfc4106_set_authsize,
@@ -1147,10 +1171,10 @@ static struct aead_alg aesni_aead_algs[] = { {
 }, {
 	.init			= rfc4106_init,
 	.exit			= rfc4106_exit,
-	.setkey			= rfc4106_set_key,
-	.setauthsize		= rfc4106_set_authsize,
-	.encrypt		= rfc4106_encrypt,
-	.decrypt		= rfc4106_decrypt,
+	.setkey			= gcmaes_wrapper_set_key,
+	.setauthsize		= gcmaes_wrapper_set_authsize,
+	.encrypt		= gcmaes_wrapper_encrypt,
+	.decrypt		= gcmaes_wrapper_decrypt,
 	.ivsize			= GCM_RFC4106_IV_SIZE,
 	.maxauthsize		= 16,
 	.base = {
@@ -1170,13 +1194,31 @@ static struct aead_alg aesni_aead_algs[] = { {
 	.ivsize			= GCM_AES_IV_SIZE,
 	.maxauthsize		= 16,
 	.base = {
+		.cra_name		= "__generic-gcm-aes-aesni",
+		.cra_driver_name	= "__driver-generic-gcm-aes-aesni",
+		.cra_priority		= 0,
+		.cra_flags		= CRYPTO_ALG_INTERNAL,
+		.cra_blocksize		= 1,
+		.cra_ctxsize		= sizeof(struct generic_gcmaes_ctx),
+		.cra_alignmask		= AESNI_ALIGN - 1,
+		.cra_module		= THIS_MODULE,
+	},
+}, {
+	.init			= generic_gcmaes_init,
+	.exit			= generic_gcmaes_exit,
+	.setkey			= gcmaes_wrapper_set_key,
+	.setauthsize		= gcmaes_wrapper_set_authsize,
+	.encrypt		= gcmaes_wrapper_encrypt,
+	.decrypt		= gcmaes_wrapper_decrypt,
+	.ivsize			= GCM_AES_IV_SIZE,
+	.maxauthsize		= 16,
+	.base = {
 		.cra_name		= "gcm(aes)",
 		.cra_driver_name	= "generic-gcm-aesni",
 		.cra_priority		= 400,
 		.cra_flags		= CRYPTO_ALG_ASYNC,
 		.cra_blocksize		= 1,
-		.cra_ctxsize		= sizeof(struct generic_gcmaes_ctx),
-		.cra_alignmask		= AESNI_ALIGN - 1,
+		.cra_ctxsize		= sizeof(struct cryptd_aead *),
 		.cra_module		= THIS_MODULE,
 	},
 } };
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
index 1e6af1b35f7b..dce7c5d39c2f 100644
--- a/arch/x86/crypto/chacha20_glue.c
+++ b/arch/x86/crypto/chacha20_glue.c
@@ -107,7 +107,6 @@ static struct skcipher_alg alg = {
 	.base.cra_priority	= 300,
 	.base.cra_blocksize	= 1,
 	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
-	.base.cra_alignmask	= sizeof(u32) - 1,
 	.base.cra_module	= THIS_MODULE,
 
 	.min_keysize		= CHACHA20_KEY_SIZE,
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 27226df3f7d8..c8d9cdacbf10 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -162,6 +162,7 @@ static struct shash_alg alg = {
 			.cra_name		= "crc32",
 			.cra_driver_name	= "crc32-pclmul",
 			.cra_priority		= 200,
+			.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY,
 			.cra_blocksize		= CHKSUM_BLOCK_SIZE,
 			.cra_ctxsize		= sizeof(u32),
 			.cra_module		= THIS_MODULE,
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index c194d5717ae5..5773e1161072 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -226,6 +226,7 @@ static struct shash_alg alg = {
 		.cra_name		=	"crc32c",
 		.cra_driver_name	=	"crc32c-intel",
 		.cra_priority		=	200,
+		.cra_flags		=	CRYPTO_ALG_OPTIONAL_KEY,
 		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
 		.cra_ctxsize		=	sizeof(u32),
 		.cra_module		=	THIS_MODULE,
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index e32142bc071d..790377797544 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -164,14 +164,12 @@ static struct shash_alg alg = {
 	.init		= poly1305_simd_init,
 	.update		= poly1305_simd_update,
 	.final		= crypto_poly1305_final,
-	.setkey		= crypto_poly1305_setkey,
 	.descsize	= sizeof(struct poly1305_simd_desc_ctx),
 	.base		= {
 		.cra_name		= "poly1305",
 		.cra_driver_name	= "poly1305-simd",
 		.cra_priority		= 300,
 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
-		.cra_alignmask		= sizeof(u32) - 1,
 		.cra_blocksize		= POLY1305_BLOCK_SIZE,
 		.cra_module		= THIS_MODULE,
 	},
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
index 329452b8f794..6014b7b9e52a 100644
--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ b/arch/x86/crypto/salsa20-i586-asm_32.S
@@ -1,6 +1,7 @@
-# salsa20_pm.s version 20051229
-# D. J. Bernstein
-# Public domain.
+# Derived from:
+#	salsa20_pm.s version 20051229
+#	D. J. Bernstein
+#	Public domain.
 
 #include <linux/linkage.h>
 
@@ -935,180 +936,3 @@ ENTRY(salsa20_encrypt_bytes)
 	# goto bytesatleast1
 	jmp	._bytesatleast1
 ENDPROC(salsa20_encrypt_bytes)
-
-# enter salsa20_keysetup
-ENTRY(salsa20_keysetup)
-	mov	%esp,%eax
-	and	$31,%eax
-	add	$256,%eax
-	sub	%eax,%esp
-	#   eax_stack = eax
-	movl	%eax,64(%esp)
-	#   ebx_stack = ebx
-	movl	%ebx,68(%esp)
-	#   esi_stack = esi
-	movl	%esi,72(%esp)
-	#   edi_stack = edi
-	movl	%edi,76(%esp)
-	#   ebp_stack = ebp
-	movl	%ebp,80(%esp)
-	#   k = arg2
-	movl	8(%esp,%eax),%ecx
-	#   kbits = arg3
-	movl	12(%esp,%eax),%edx
-	#   x = arg1
-	movl	4(%esp,%eax),%eax
-	#   in1 = *(uint32 *) (k + 0)
-	movl	0(%ecx),%ebx
-	#   in2 = *(uint32 *) (k + 4)
-	movl	4(%ecx),%esi
-	#   in3 = *(uint32 *) (k + 8)
-	movl	8(%ecx),%edi
-	#   in4 = *(uint32 *) (k + 12)
-	movl	12(%ecx),%ebp
-	#   *(uint32 *) (x + 4) = in1
-	movl	%ebx,4(%eax)
-	#   *(uint32 *) (x + 8) = in2
-	movl	%esi,8(%eax)
-	#   *(uint32 *) (x + 12) = in3
-	movl	%edi,12(%eax)
-	#   *(uint32 *) (x + 16) = in4
-	movl	%ebp,16(%eax)
-	#   kbits - 256
-	cmp	$256,%edx
-	#   goto kbits128 if unsigned<
-	jb	._kbits128
-._kbits256:
-	#     in11 = *(uint32 *) (k + 16)
-	movl	16(%ecx),%edx
-	#     in12 = *(uint32 *) (k + 20)
-	movl	20(%ecx),%ebx
-	#     in13 = *(uint32 *) (k + 24)
-	movl	24(%ecx),%esi
-	#     in14 = *(uint32 *) (k + 28)
-	movl	28(%ecx),%ecx
-	#     *(uint32 *) (x + 44) = in11
-	movl	%edx,44(%eax)
-	#     *(uint32 *) (x + 48) = in12
-	movl	%ebx,48(%eax)
-	#     *(uint32 *) (x + 52) = in13
-	movl	%esi,52(%eax)
-	#     *(uint32 *) (x + 56) = in14
-	movl	%ecx,56(%eax)
-	#     in0 = 1634760805
-	mov	$1634760805,%ecx
-	#     in5 = 857760878
-	mov	$857760878,%edx
-	#     in10 = 2036477234
-	mov	$2036477234,%ebx
-	#     in15 = 1797285236
-	mov	$1797285236,%esi
-	#     *(uint32 *) (x + 0) = in0
-	movl	%ecx,0(%eax)
-	#     *(uint32 *) (x + 20) = in5
-	movl	%edx,20(%eax)
-	#     *(uint32 *) (x + 40) = in10
-	movl	%ebx,40(%eax)
-	#     *(uint32 *) (x + 60) = in15
-	movl	%esi,60(%eax)
-	#   goto keysetupdone
-	jmp	._keysetupdone
-._kbits128:
-	#     in11 = *(uint32 *) (k + 0)
-	movl	0(%ecx),%edx
-	#     in12 = *(uint32 *) (k + 4)
-	movl	4(%ecx),%ebx
-	#     in13 = *(uint32 *) (k + 8)
-	movl	8(%ecx),%esi
-	#     in14 = *(uint32 *) (k + 12)
-	movl	12(%ecx),%ecx
-	#     *(uint32 *) (x + 44) = in11
-	movl	%edx,44(%eax)
-	#     *(uint32 *) (x + 48) = in12
-	movl	%ebx,48(%eax)
-	#     *(uint32 *) (x + 52) = in13
-	movl	%esi,52(%eax)
-	#     *(uint32 *) (x + 56) = in14
-	movl	%ecx,56(%eax)
-	#     in0 = 1634760805
-	mov	$1634760805,%ecx
-	#     in5 = 824206446
-	mov	$824206446,%edx
-	#     in10 = 2036477238
-	mov	$2036477238,%ebx
-	#     in15 = 1797285236
-	mov	$1797285236,%esi
-	#     *(uint32 *) (x + 0) = in0
-	movl	%ecx,0(%eax)
-	#     *(uint32 *) (x + 20) = in5
-	movl	%edx,20(%eax)
-	#     *(uint32 *) (x + 40) = in10
-	movl	%ebx,40(%eax)
-	#     *(uint32 *) (x + 60) = in15
-	movl	%esi,60(%eax)
-._keysetupdone:
-	#   eax = eax_stack
-	movl	64(%esp),%eax
-	#   ebx = ebx_stack
-	movl	68(%esp),%ebx
-	#   esi = esi_stack
-	movl	72(%esp),%esi
-	#   edi = edi_stack
-	movl	76(%esp),%edi
-	#   ebp = ebp_stack
-	movl	80(%esp),%ebp
-	# leave
-	add	%eax,%esp
-	ret
-ENDPROC(salsa20_keysetup)
-
-# enter salsa20_ivsetup
-ENTRY(salsa20_ivsetup)
-	mov	%esp,%eax
-	and	$31,%eax
-	add	$256,%eax
-	sub	%eax,%esp
-	#   eax_stack = eax
-	movl	%eax,64(%esp)
-	#   ebx_stack = ebx
-	movl	%ebx,68(%esp)
-	#   esi_stack = esi
-	movl	%esi,72(%esp)
-	#   edi_stack = edi
-	movl	%edi,76(%esp)
-	#   ebp_stack = ebp
-	movl	%ebp,80(%esp)
-	#   iv = arg2
-	movl	8(%esp,%eax),%ecx
-	#   x = arg1
-	movl	4(%esp,%eax),%eax
-	#   in6 = *(uint32 *) (iv + 0)
-	movl	0(%ecx),%edx
-	#   in7 = *(uint32 *) (iv + 4)
-	movl	4(%ecx),%ecx
-	#   in8 = 0
-	mov	$0,%ebx
-	#   in9 = 0
-	mov	$0,%esi
-	#   *(uint32 *) (x + 24) = in6
-	movl	%edx,24(%eax)
-	#   *(uint32 *) (x + 28) = in7
-	movl	%ecx,28(%eax)
-	#   *(uint32 *) (x + 32) = in8
-	movl	%ebx,32(%eax)
-	#   *(uint32 *) (x + 36) = in9
-	movl	%esi,36(%eax)
-	#   eax = eax_stack
-	movl	64(%esp),%eax
-	#   ebx = ebx_stack
-	movl	68(%esp),%ebx
-	#   esi = esi_stack
-	movl	72(%esp),%esi
-	#   edi = edi_stack
-	movl	76(%esp),%edi
-	#   ebp = ebp_stack
-	movl	80(%esp),%ebp
-	# leave
-	add	%eax,%esp
-	ret
-ENDPROC(salsa20_ivsetup)
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
index 10db30d58006..03a4918f41ee 100644
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@@ -803,117 +803,3 @@ ENTRY(salsa20_encrypt_bytes)
 	# goto bytesatleast1
 	jmp	._bytesatleast1
 ENDPROC(salsa20_encrypt_bytes)
-
-# enter salsa20_keysetup
-ENTRY(salsa20_keysetup)
-	mov	%rsp,%r11
-	and	$31,%r11
-	add	$256,%r11
-	sub	%r11,%rsp
-	#   k = arg2
-	mov	%rsi,%rsi
-	#   kbits = arg3
-	mov	%rdx,%rdx
-	#   x = arg1
-	mov	%rdi,%rdi
-	#   in0 = *(uint64 *) (k + 0)
-	movq	0(%rsi),%r8
-	#   in2 = *(uint64 *) (k + 8)
-	movq	8(%rsi),%r9
-	#   *(uint64 *) (x + 4) = in0
-	movq	%r8,4(%rdi)
-	#   *(uint64 *) (x + 12) = in2
-	movq	%r9,12(%rdi)
-	#                    unsigned<? kbits - 256
-	cmp	$256,%rdx
-	# comment:fp stack unchanged by jump
-	#   goto kbits128 if unsigned<
-	jb	._kbits128
-#   kbits256:
-._kbits256:
-	#     in10 = *(uint64 *) (k + 16)
-	movq	16(%rsi),%rdx
-	#     in12 = *(uint64 *) (k + 24)
-	movq	24(%rsi),%rsi
-	#     *(uint64 *) (x + 44) = in10
-	movq	%rdx,44(%rdi)
-	#     *(uint64 *) (x + 52) = in12
-	movq	%rsi,52(%rdi)
-	#     in0 = 1634760805
-	mov	$1634760805,%rsi
-	#     in4 = 857760878
-	mov	$857760878,%rdx
-	#     in10 = 2036477234
-	mov	$2036477234,%rcx
-	#     in14 = 1797285236
-	mov	$1797285236,%r8
-	#     *(uint32 *) (x + 0) = in0
-	movl	%esi,0(%rdi)
-	#     *(uint32 *) (x + 20) = in4
-	movl	%edx,20(%rdi)
-	#     *(uint32 *) (x + 40) = in10
-	movl	%ecx,40(%rdi)
-	#     *(uint32 *) (x + 60) = in14
-	movl	%r8d,60(%rdi)
-	# comment:fp stack unchanged by jump
-	#   goto keysetupdone
-	jmp	._keysetupdone
-#   kbits128:
-._kbits128:
-	#     in10 = *(uint64 *) (k + 0)
-	movq	0(%rsi),%rdx
-	#     in12 = *(uint64 *) (k + 8)
-	movq	8(%rsi),%rsi
-	#     *(uint64 *) (x + 44) = in10
-	movq	%rdx,44(%rdi)
-	#     *(uint64 *) (x + 52) = in12
-	movq	%rsi,52(%rdi)
-	#     in0 = 1634760805
-	mov	$1634760805,%rsi
-	#     in4 = 824206446
-	mov	$824206446,%rdx
-	#     in10 = 2036477238
-	mov	$2036477238,%rcx
-	#     in14 = 1797285236
-	mov	$1797285236,%r8
-	#     *(uint32 *) (x + 0) = in0
-	movl	%esi,0(%rdi)
-	#     *(uint32 *) (x + 20) = in4
-	movl	%edx,20(%rdi)
-	#     *(uint32 *) (x + 40) = in10
-	movl	%ecx,40(%rdi)
-	#     *(uint32 *) (x + 60) = in14
-	movl	%r8d,60(%rdi)
-#   keysetupdone:
-._keysetupdone:
-	# leave
-	add	%r11,%rsp
-	mov	%rdi,%rax
-	mov	%rsi,%rdx
-	ret
-ENDPROC(salsa20_keysetup)
-
-# enter salsa20_ivsetup
-ENTRY(salsa20_ivsetup)
-	mov	%rsp,%r11
-	and	$31,%r11
-	add	$256,%r11
-	sub	%r11,%rsp
-	#   iv = arg2
-	mov	%rsi,%rsi
-	#   x = arg1
-	mov	%rdi,%rdi
-	#   in6 = *(uint64 *) (iv + 0)
-	movq	0(%rsi),%rsi
-	#   in8 = 0
-	mov	$0,%r8
-	#   *(uint64 *) (x + 24) = in6
-	movq	%rsi,24(%rdi)
-	#   *(uint64 *) (x + 32) = in8
-	movq	%r8,32(%rdi)
-	# leave
-	add	%r11,%rsp
-	mov	%rdi,%rax
-	mov	%rsi,%rdx
-	ret
-ENDPROC(salsa20_ivsetup)
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index cb91a64a99e7..b07d7d959806 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -11,6 +11,9 @@
  * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
  *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
  *
+ * Also modified to set up the initial state using the generic C code rather
+ * than in assembly.
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
  * Software Foundation; either version 2 of the License, or (at your option)
@@ -18,93 +21,65 @@
  *
  */
 
-#include <crypto/algapi.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/salsa20.h>
 #include <linux/module.h>
-#include <linux/crypto.h>
-
-#define SALSA20_IV_SIZE        8U
-#define SALSA20_MIN_KEY_SIZE  16U
-#define SALSA20_MAX_KEY_SIZE  32U
-
-struct salsa20_ctx
-{
-	u32 input[16];
-};
 
-asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
-				 u32 keysize, u32 ivsize);
-asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
-asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
-				      const u8 *src, u8 *dst, u32 bytes);
+asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst,
+				      u32 bytes);
 
-static int setkey(struct crypto_tfm *tfm, const u8 *key,
-		  unsigned int keysize)
+static int salsa20_asm_crypt(struct skcipher_request *req)
 {
-	struct salsa20_ctx *ctx = crypto_tfm_ctx(tfm);
-	salsa20_keysetup(ctx, key, keysize*8, SALSA20_IV_SIZE*8);
-	return 0;
-}
-
-static int encrypt(struct blkcipher_desc *desc,
-		   struct scatterlist *dst, struct scatterlist *src,
-		   unsigned int nbytes)
-{
-	struct blkcipher_walk walk;
-	struct crypto_blkcipher *tfm = desc->tfm;
-	struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	u32 state[16];
 	int err;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, 64);
+	err = skcipher_walk_virt(&walk, req, true);
 
-	salsa20_ivsetup(ctx, walk.iv);
+	crypto_salsa20_init(state, ctx, walk.iv);
 
-	while (walk.nbytes >= 64) {
-		salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
-				      walk.dst.virt.addr,
-				      walk.nbytes - (walk.nbytes % 64));
-		err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
-	}
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
 
-	if (walk.nbytes) {
-		salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
-				      walk.dst.virt.addr, walk.nbytes);
-		err = blkcipher_walk_done(desc, &walk, 0);
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		salsa20_encrypt_bytes(state, walk.src.virt.addr,
+				      walk.dst.virt.addr, nbytes);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
 	}
 
 	return err;
 }
 
-static struct crypto_alg alg = {
-	.cra_name           =   "salsa20",
-	.cra_driver_name    =   "salsa20-asm",
-	.cra_priority       =   200,
-	.cra_flags          =   CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_type           =   &crypto_blkcipher_type,
-	.cra_blocksize      =   1,
-	.cra_ctxsize        =   sizeof(struct salsa20_ctx),
-	.cra_alignmask      =	3,
-	.cra_module         =   THIS_MODULE,
-	.cra_u              =   {
-		.blkcipher = {
-			.setkey         =   setkey,
-			.encrypt        =   encrypt,
-			.decrypt        =   encrypt,
-			.min_keysize    =   SALSA20_MIN_KEY_SIZE,
-			.max_keysize    =   SALSA20_MAX_KEY_SIZE,
-			.ivsize         =   SALSA20_IV_SIZE,
-		}
-	}
+static struct skcipher_alg alg = {
+	.base.cra_name		= "salsa20",
+	.base.cra_driver_name	= "salsa20-asm",
+	.base.cra_priority	= 200,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct salsa20_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= SALSA20_MIN_KEY_SIZE,
+	.max_keysize		= SALSA20_MAX_KEY_SIZE,
+	.ivsize			= SALSA20_IV_SIZE,
+	.chunksize		= SALSA20_BLOCK_SIZE,
+	.setkey			= crypto_salsa20_setkey,
+	.encrypt		= salsa20_asm_crypt,
+	.decrypt		= salsa20_asm_crypt,
 };
 
 static int __init init(void)
 {
-	return crypto_register_alg(&alg);
+	return crypto_register_skcipher(&alg);
 }
 
 static void __exit fini(void)
 {
-	crypto_unregister_alg(&alg);
+	crypto_unregister_skcipher(&alg);
 }
 
 module_init(init);
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index 1c3b7ceb36d2..e7273a606a07 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -55,29 +55,31 @@
 #define RAB1bl %bl
 #define RAB2bl %cl
 
+#define CD0 0x0(%rsp)
+#define CD1 0x8(%rsp)
+#define CD2 0x10(%rsp)
+
+# used only before/after all rounds
 #define RCD0 %r8
 #define RCD1 %r9
 #define RCD2 %r10
 
-#define RCD0d %r8d
-#define RCD1d %r9d
-#define RCD2d %r10d
-
-#define RX0 %rbp
-#define RX1 %r11
-#define RX2 %r12
+# used only during rounds
+#define RX0 %r8
+#define RX1 %r9
+#define RX2 %r10
 
-#define RX0d %ebp
-#define RX1d %r11d
-#define RX2d %r12d
+#define RX0d %r8d
+#define RX1d %r9d
+#define RX2d %r10d
 
-#define RY0 %r13
-#define RY1 %r14
-#define RY2 %r15
+#define RY0 %r11
+#define RY1 %r12
+#define RY2 %r13
 
-#define RY0d %r13d
-#define RY1d %r14d
-#define RY2d %r15d
+#define RY0d %r11d
+#define RY1d %r12d
+#define RY2d %r13d
 
 #define RT0 %rdx
 #define RT1 %rsi
@@ -85,6 +87,8 @@
 #define RT0d %edx
 #define RT1d %esi
 
+#define RT1bl %sil
+
 #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
 	movzbl ab ## bl,		tmp2 ## d; \
 	movzbl ab ## bh,		tmp1 ## d; \
@@ -92,6 +96,11 @@
 	op1##l T0(CTX, tmp2, 4),	dst ## d; \
 	op2##l T1(CTX, tmp1, 4),	dst ## d;
 
+#define swap_ab_with_cd(ab, cd, tmp)	\
+	movq cd, tmp;			\
+	movq ab, cd;			\
+	movq tmp, ab;
+
 /*
  * Combined G1 & G2 function. Reordered with help of rotates to have moves
  * at begining.
@@ -110,15 +119,15 @@
 	/* G1,2 && G2,2 */ \
 	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
 	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
-	xchgq cd ## 0, ab ## 0; \
+	swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
 	\
 	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
 	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
-	xchgq cd ## 1, ab ## 1; \
+	swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
 	\
 	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
 	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
-	xchgq cd ## 2, ab ## 2;
+	swap_ab_with_cd(ab ## 2, cd ## 2, RT0);
 
 #define enc_round_end(ab, x, y, n) \
 	addl y ## d,			x ## d; \
@@ -168,6 +177,16 @@
 	decrypt_round3(ba, dc, (n*2)+1); \
 	decrypt_round3(ba, dc, (n*2));
 
+#define push_cd()	\
+	pushq RCD2;	\
+	pushq RCD1;	\
+	pushq RCD0;
+
+#define pop_cd()	\
+	popq RCD0;	\
+	popq RCD1;	\
+	popq RCD2;
+
 #define inpack3(in, n, xy, m) \
 	movq 4*(n)(in),			xy ## 0; \
 	xorq w+4*m(CTX),		xy ## 0; \
@@ -223,11 +242,8 @@ ENTRY(__twofish_enc_blk_3way)
 	 *	%rdx: src, RIO
 	 *	%rcx: bool, if true: xor output
 	 */
-	pushq %r15;
-	pushq %r14;
 	pushq %r13;
 	pushq %r12;
-	pushq %rbp;
 	pushq %rbx;
 
 	pushq %rcx; /* bool xor */
@@ -235,40 +251,36 @@ ENTRY(__twofish_enc_blk_3way)
 
 	inpack_enc3();
 
-	encrypt_cycle3(RAB, RCD, 0);
-	encrypt_cycle3(RAB, RCD, 1);
-	encrypt_cycle3(RAB, RCD, 2);
-	encrypt_cycle3(RAB, RCD, 3);
-	encrypt_cycle3(RAB, RCD, 4);
-	encrypt_cycle3(RAB, RCD, 5);
-	encrypt_cycle3(RAB, RCD, 6);
-	encrypt_cycle3(RAB, RCD, 7);
+	push_cd();
+	encrypt_cycle3(RAB, CD, 0);
+	encrypt_cycle3(RAB, CD, 1);
+	encrypt_cycle3(RAB, CD, 2);
+	encrypt_cycle3(RAB, CD, 3);
+	encrypt_cycle3(RAB, CD, 4);
+	encrypt_cycle3(RAB, CD, 5);
+	encrypt_cycle3(RAB, CD, 6);
+	encrypt_cycle3(RAB, CD, 7);
+	pop_cd();
 
 	popq RIO; /* dst */
-	popq %rbp; /* bool xor */
+	popq RT1; /* bool xor */
 
-	testb %bpl, %bpl;
+	testb RT1bl, RT1bl;
 	jnz .L__enc_xor3;
 
 	outunpack_enc3(mov);
 
 	popq %rbx;
-	popq %rbp;
 	popq %r12;
 	popq %r13;
-	popq %r14;
-	popq %r15;
 	ret;
 
 .L__enc_xor3:
 	outunpack_enc3(xor);
 
 	popq %rbx;
-	popq %rbp;
 	popq %r12;
 	popq %r13;
-	popq %r14;
-	popq %r15;
 	ret;
 ENDPROC(__twofish_enc_blk_3way)
 
@@ -278,35 +290,31 @@ ENTRY(twofish_dec_blk_3way)
 	 *	%rsi: dst
 	 *	%rdx: src, RIO
 	 */
-	pushq %r15;
-	pushq %r14;
 	pushq %r13;
 	pushq %r12;
-	pushq %rbp;
 	pushq %rbx;
 
 	pushq %rsi; /* dst */
 
 	inpack_dec3();
 
-	decrypt_cycle3(RAB, RCD, 7);
-	decrypt_cycle3(RAB, RCD, 6);
-	decrypt_cycle3(RAB, RCD, 5);
-	decrypt_cycle3(RAB, RCD, 4);
-	decrypt_cycle3(RAB, RCD, 3);
-	decrypt_cycle3(RAB, RCD, 2);
-	decrypt_cycle3(RAB, RCD, 1);
-	decrypt_cycle3(RAB, RCD, 0);
+	push_cd();
+	decrypt_cycle3(RAB, CD, 7);
+	decrypt_cycle3(RAB, CD, 6);
+	decrypt_cycle3(RAB, CD, 5);
+	decrypt_cycle3(RAB, CD, 4);
+	decrypt_cycle3(RAB, CD, 3);
+	decrypt_cycle3(RAB, CD, 2);
+	decrypt_cycle3(RAB, CD, 1);
+	decrypt_cycle3(RAB, CD, 0);
+	pop_cd();
 
 	popq RIO; /* dst */
 
 	outunpack_dec3();
 
 	popq %rbx;
-	popq %rbp;
 	popq %r12;
 	popq %r13;
-	popq %r14;
-	popq %r15;
 	ret;
 ENDPROC(twofish_dec_blk_3way)
author	Linus Torvalds <torvalds@linux-foundation.org>	2018-01-31 14:22:45 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-01-31 14:22:45 -0800
commit	a103950e0dd2058df5e8a8d4a915707bdcf205f0 (patch)
tree	af5d091f768db4ed7a12fc3c5484d3e20ad9d514 /arch
parent	2cfa1cd3da14814a1e9ec6a4fce8612637d3ee3d (diff)
parent	2d55807b7f7bf62bb05a8b91247c5eb7cd19ac04 (diff)