From 35b80920d4f0253fed03a1c3a345df8578dbd057 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Thu, 27 Sep 2012 15:44:17 -0700
Subject: crypto: crc32c - Rename crc32c-intel.c to crc32c-intel_glue.c

This patch renames the crc32c-intel.c file to crc32c-intel_glue.c file
in preparation for linking with the new crc32c-pcl-intel-asm.S file,
which contains optimized crc32c calculation based on PCLMULQDQ
instruction.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile            |   1 +
 arch/x86/crypto/crc32c-intel.c      | 203 ------------------------------------
 arch/x86/crypto/crc32c-intel_glue.c | 203 ++++++++++++++++++++++++++++++++++++
 3 files changed, 204 insertions(+), 203 deletions(-)
 delete mode 100644 arch/x86/crypto/crc32c-intel.c
 create mode 100644 arch/x86/crypto/crc32c-intel_glue.c

(limited to 'arch')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5bacb4a226ac..b95778311178 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -47,3 +47,4 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+crc32c-intel-y := crc32c-intel_glue.o
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
deleted file mode 100644
index 493f959261f7..000000000000
--- a/arch/x86/crypto/crc32c-intel.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
- * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
- * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2A: Instruction Set Reference, A-M
- *
- * Copyright (C) 2008 Intel Corporation
- * Authors: Austin Zhang <austin_zhang@linux.intel.com>
- *          Kent Liu <kent.liu@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-#include <crypto/internal/hash.h>
-
-#include <asm/cpufeature.h>
-#include <asm/cpu_device_id.h>
-
-#define CHKSUM_BLOCK_SIZE	1
-#define CHKSUM_DIGEST_SIZE	4
-
-#define SCALE_F	sizeof(unsigned long)
-
-#ifdef CONFIG_X86_64
-#define REX_PRE "0x48, "
-#else
-#define REX_PRE
-#endif
-
-static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
-{
-	while (length--) {
-		__asm__ __volatile__(
-			".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
-			:"=S"(crc)
-			:"0"(crc), "c"(*data)
-		);
-		data++;
-	}
-
-	return crc;
-}
-
-static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
-{
-	unsigned int iquotient = len / SCALE_F;
-	unsigned int iremainder = len % SCALE_F;
-	unsigned long *ptmp = (unsigned long *)p;
-
-	while (iquotient--) {
-		__asm__ __volatile__(
-			".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
-			:"=S"(crc)
-			:"0"(crc), "c"(*ptmp)
-		);
-		ptmp++;
-	}
-
-	if (iremainder)
-		crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
-				 iremainder);
-
-	return crc;
-}
-
-/*
- * Setting the seed allows arbitrary accumulators and flexible XOR policy
- * If your algorithm starts with ~0, then XOR with ~0 before you set
- * the seed.
- */
-static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
-			unsigned int keylen)
-{
-	u32 *mctx = crypto_shash_ctx(hash);
-
-	if (keylen != sizeof(u32)) {
-		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	*mctx = le32_to_cpup((__le32 *)key);
-	return 0;
-}
-
-static int crc32c_intel_init(struct shash_desc *desc)
-{
-	u32 *mctx = crypto_shash_ctx(desc->tfm);
-	u32 *crcp = shash_desc_ctx(desc);
-
-	*crcp = *mctx;
-
-	return 0;
-}
-
-static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len)
-{
-	u32 *crcp = shash_desc_ctx(desc);
-
-	*crcp = crc32c_intel_le_hw(*crcp, data, len);
-	return 0;
-}
-
-static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
-				u8 *out)
-{
-	*(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
-	return 0;
-}
-
-static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
-			      unsigned int len, u8 *out)
-{
-	return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
-}
-
-static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
-{
-	u32 *crcp = shash_desc_ctx(desc);
-
-	*(__le32 *)out = ~cpu_to_le32p(crcp);
-	return 0;
-}
-
-static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, u8 *out)
-{
-	return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
-				    out);
-}
-
-static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
-{
-	u32 *key = crypto_tfm_ctx(tfm);
-
-	*key = ~0;
-
-	return 0;
-}
-
-static struct shash_alg alg = {
-	.setkey			=	crc32c_intel_setkey,
-	.init			=	crc32c_intel_init,
-	.update			=	crc32c_intel_update,
-	.final			=	crc32c_intel_final,
-	.finup			=	crc32c_intel_finup,
-	.digest			=	crc32c_intel_digest,
-	.descsize		=	sizeof(u32),
-	.digestsize		=	CHKSUM_DIGEST_SIZE,
-	.base			=	{
-		.cra_name		=	"crc32c",
-		.cra_driver_name	=	"crc32c-intel",
-		.cra_priority		=	200,
-		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
-		.cra_ctxsize		=	sizeof(u32),
-		.cra_module		=	THIS_MODULE,
-		.cra_init		=	crc32c_intel_cra_init,
-	}
-};
-
-static const struct x86_cpu_id crc32c_cpu_id[] = {
-	X86_FEATURE_MATCH(X86_FEATURE_XMM4_2),
-	{}
-};
-MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);
-
-static int __init crc32c_intel_mod_init(void)
-{
-	if (!x86_match_cpu(crc32c_cpu_id))
-		return -ENODEV;
-	return crypto_register_shash(&alg);
-}
-
-static void __exit crc32c_intel_mod_fini(void)
-{
-	crypto_unregister_shash(&alg);
-}
-
-module_init(crc32c_intel_mod_init);
-module_exit(crc32c_intel_mod_fini);
-
-MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
-MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
-MODULE_LICENSE("GPL");
-
-MODULE_ALIAS("crc32c");
-MODULE_ALIAS("crc32c-intel");
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
new file mode 100644
index 000000000000..493f959261f7
--- /dev/null
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -0,0 +1,203 @@
+/*
+ * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
+ * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
+ * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2A: Instruction Set Reference, A-M
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Authors: Austin Zhang <austin_zhang@linux.intel.com>
+ *          Kent Liu <kent.liu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <crypto/internal/hash.h>
+
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+
+#define CHKSUM_BLOCK_SIZE	1
+#define CHKSUM_DIGEST_SIZE	4
+
+#define SCALE_F	sizeof(unsigned long)
+
+#ifdef CONFIG_X86_64
+#define REX_PRE "0x48, "
+#else
+#define REX_PRE
+#endif
+
+static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
+{
+	while (length--) {
+		__asm__ __volatile__(
+			".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
+			:"=S"(crc)
+			:"0"(crc), "c"(*data)
+		);
+		data++;
+	}
+
+	return crc;
+}
+
+static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
+{
+	unsigned int iquotient = len / SCALE_F;
+	unsigned int iremainder = len % SCALE_F;
+	unsigned long *ptmp = (unsigned long *)p;
+
+	while (iquotient--) {
+		__asm__ __volatile__(
+			".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
+			:"=S"(crc)
+			:"0"(crc), "c"(*ptmp)
+		);
+		ptmp++;
+	}
+
+	if (iremainder)
+		crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
+				 iremainder);
+
+	return crc;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
+			unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*mctx = le32_to_cpup((__le32 *)key);
+	return 0;
+}
+
+static int crc32c_intel_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *crcp = shash_desc_ctx(desc);
+
+	*crcp = *mctx;
+
+	return 0;
+}
+
+static int crc32c_intel_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
+{
+	u32 *crcp = shash_desc_ctx(desc);
+
+	*crcp = crc32c_intel_le_hw(*crcp, data, len);
+	return 0;
+}
+
+static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
+				u8 *out)
+{
+	*(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
+	return 0;
+}
+
+static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
+{
+	return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32c_intel_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crcp = shash_desc_ctx(desc);
+
+	*(__le32 *)out = ~cpu_to_le32p(crcp);
+	return 0;
+}
+
+static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, u8 *out)
+{
+	return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
+				    out);
+}
+
+static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = ~0;
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.setkey			=	crc32c_intel_setkey,
+	.init			=	crc32c_intel_init,
+	.update			=	crc32c_intel_update,
+	.final			=	crc32c_intel_final,
+	.finup			=	crc32c_intel_finup,
+	.digest			=	crc32c_intel_digest,
+	.descsize		=	sizeof(u32),
+	.digestsize		=	CHKSUM_DIGEST_SIZE,
+	.base			=	{
+		.cra_name		=	"crc32c",
+		.cra_driver_name	=	"crc32c-intel",
+		.cra_priority		=	200,
+		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
+		.cra_ctxsize		=	sizeof(u32),
+		.cra_module		=	THIS_MODULE,
+		.cra_init		=	crc32c_intel_cra_init,
+	}
+};
+
+static const struct x86_cpu_id crc32c_cpu_id[] = {
+	X86_FEATURE_MATCH(X86_FEATURE_XMM4_2),
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, crc32c_cpu_id);
+
+static int __init crc32c_intel_mod_init(void)
+{
+	if (!x86_match_cpu(crc32c_cpu_id))
+		return -ENODEV;
+	return crypto_register_shash(&alg);
+}
+
+static void __exit crc32c_intel_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(crc32c_intel_mod_init);
+module_exit(crc32c_intel_mod_fini);
+
+MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
+MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS("crc32c");
+MODULE_ALIAS("crc32c-intel");
-- 
cgit v1.2.3-58-ga151


From 6a8ce1ef3940e0cab5ff5f11e1cff5301f83fef6 Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Thu, 27 Sep 2012 15:44:22 -0700
Subject: crypto: crc32c - Optimize CRC32C calculation with PCLMULQDQ
 instruction

This patch adds the crc_pcl function that calculates CRC32C checksum using the
PCLMULQDQ instruction on processors that support this feature. This will
provide speedup over using CRC32 instruction only.
The usage of PCLMULQDQ necessitate the invocation of kernel_fpu_begin and
kernel_fpu_end and incur some overhead.  So the new crc_pcl function is only
invoked for buffer size of 512 bytes or more.  Larger sized
buffers will expect to see greater speedup.  This feature is best used coupled
with eager_fpu which reduces the kernel_fpu_begin/end overhead.  For
buffer size of 1K the speedup is around 1.6x and for buffer size greater than
4K, the speedup is around 3x compared to original implementation in crc32c-intel
module. Test was performed on Sandy Bridge based platform with constant frequency
set for cpu.

A white paper detailing the algorithm can be found here:
http://download.intel.com/design/intarch/papers/323405.pdf

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                  |   1 +
 arch/x86/crypto/crc32c-intel_glue.c       |  81 ++++++
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 460 ++++++++++++++++++++++++++++++
 crypto/Kconfig                            |  10 +
 4 files changed, 552 insertions(+)
 create mode 100644 arch/x86/crypto/crc32c-pcl-intel-asm_64.S

(limited to 'arch')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b95778311178..84d7dbaba26e 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,3 +48,4 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
+crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 493f959261f7..6812ad98355c 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -32,6 +32,8 @@
 
 #include <asm/cpufeature.h>
 #include <asm/cpu_device_id.h>
+#include <asm/i387.h>
+#include <asm/fpu-internal.h>
 
 #define CHKSUM_BLOCK_SIZE	1
 #define CHKSUM_DIGEST_SIZE	4
@@ -44,6 +46,31 @@
 #define REX_PRE
 #endif
 
+#ifdef CONFIG_X86_64
+/*
+ * use carryless multiply version of crc32c when buffer
+ * size is >= 512 (when eager fpu is enabled) or
+ * >= 1024 (when eager fpu is disabled) to account
+ * for fpu state save/restore overhead.
+ */
+#define CRC32C_PCL_BREAKEVEN_EAGERFPU	512
+#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU	1024
+
+asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
+				unsigned int crc_init);
+static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU;
+#if defined(X86_FEATURE_EAGER_FPU)
+#define set_pcl_breakeven_point()					\
+do {									\
+	if (!use_eager_fpu())						\
+		crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU;	\
+} while (0)
+#else
+#define set_pcl_breakeven_point()					\
+	(crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU)
+#endif
+#endif /* CONFIG_X86_64 */
+
 static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
 {
 	while (length--) {
@@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
 	return 0;
 }
 
+#ifdef CONFIG_X86_64
+static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
+{
+	u32 *crcp = shash_desc_ctx(desc);
+
+	/*
+	 * use faster PCL version if datasize is large enough to
+	 * overcome kernel fpu state save/restore overhead
+	 */
+	if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
+		kernel_fpu_begin();
+		*crcp = crc_pcl(data, len, *crcp);
+		kernel_fpu_end();
+	} else
+		*crcp = crc32c_intel_le_hw(*crcp, data, len);
+	return 0;
+}
+
+static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
+				u8 *out)
+{
+	if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
+		kernel_fpu_begin();
+		*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
+		kernel_fpu_end();
+	} else
+		*(__le32 *)out =
+			~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
+	return 0;
+}
+
+static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
+{
+	return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, u8 *out)
+{
+	return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
+				    out);
+}
+#endif /* CONFIG_X86_64 */
+
 static struct shash_alg alg = {
 	.setkey			=	crc32c_intel_setkey,
 	.init			=	crc32c_intel_init,
@@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void)
 {
 	if (!x86_match_cpu(crc32c_cpu_id))
 		return -ENODEV;
+#ifdef CONFIG_X86_64
+	if (cpu_has_pclmulqdq) {
+		alg.update = crc32c_pcl_intel_update;
+		alg.finup = crc32c_pcl_intel_finup;
+		alg.digest = crc32c_pcl_intel_digest;
+		set_pcl_breakeven_point();
+	}
+#endif
 	return crypto_register_shash(&alg);
 }
 
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
new file mode 100644
index 000000000000..93c6d39237ac
--- /dev/null
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -0,0 +1,460 @@
+/*
+ * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
+ *
+ * The white paper on CRC32C calculations with PCLMULQDQ instruction can be
+ * downloaded from:
+ * http://download.intel.com/design/intarch/papers/323405.pdf
+ *
+ * Copyright (C) 2012 Intel Corporation.
+ *
+ * Authors:
+ *	Wajdi Feghali <wajdi.k.feghali@intel.com>
+ *	James Guilford <james.guilford@intel.com>
+ *	David Cote <david.m.cote@intel.com>
+ *	Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JMPTBL_ENTRY i
+.word crc_\i - crc_array
+.endm
+
+.macro JNC_LESS_THAN j
+	jnc less_than_\j
+.endm
+
+# Define threshold where buffers are considered "small" and routed to more
+# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
+# SMALL_SIZE can be no larger than 255.
+
+#define SMALL_SIZE 200
+
+.if (SMALL_SIZE > 255)
+.error "SMALL_ SIZE must be < 256"
+.endif
+
+# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+
+.global crc_pcl
+crc_pcl:
+#define    bufp		%rdi
+#define    bufp_dw	%edi
+#define    bufp_w	%di
+#define    bufp_b	%dil
+#define    bufptmp	%rcx
+#define    block_0	%rcx
+#define    block_1	%rdx
+#define    block_2	%r11
+#define    len		%rsi
+#define    len_dw	%esi
+#define    len_w	%si
+#define    len_b	%sil
+#define    crc_init_arg %rdx
+#define    tmp		%rbx
+#define    crc_init	%r8
+#define    crc_init_dw	%r8d
+#define    crc1		%r9
+#define    crc2		%r10
+
+	pushq   %rbx
+	pushq   %rdi
+	pushq   %rsi
+
+	## Move crc_init for Linux to a different
+	mov     crc_init_arg, crc_init
+
+	################################################################
+	## 1) ALIGN:
+	################################################################
+
+	mov     bufp, bufptmp		# rdi = *buf
+	neg     bufp
+	and     $7, bufp		# calculate the unalignment amount of
+					# the address
+	je      proc_block		# Skip if aligned
+
+	## If len is less than 8 and we're unaligned, we need to jump
+	## to special code to avoid reading beyond the end of the buffer
+	cmp     $8, len
+	jae     do_align
+	# less_than_8 expects length in upper 3 bits of len_dw
+	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
+	shl     $32-3+1, len_dw
+	jmp     less_than_8_post_shl1
+
+do_align:
+	#### Calculate CRC of unaligned bytes of the buffer (if any)
+	movq    (bufptmp), tmp		# load a quadward from the buffer
+	add     bufp, bufptmp		# align buffer pointer for quadword
+					# processing
+	sub     bufp, len		# update buffer length
+align_loop:
+	crc32b  %bl, crc_init_dw 	# compute crc32 of 1-byte
+	shr     $8, tmp			# get next byte
+	dec     bufp
+	jne     align_loop
+
+proc_block:
+
+	################################################################
+	## 2) PROCESS  BLOCKS:
+	################################################################
+
+	## compute num of bytes to be processed
+	movq    len, tmp		# save num bytes in tmp
+
+	cmpq    $128*24, len
+	jae     full_block
+
+continue_block:
+	cmpq    $SMALL_SIZE, len
+	jb      small
+
+	## len < 128*24
+	movq    $2731, %rax		# 2731 = ceil(2^16 / 24)
+	mul     len_dw
+	shrq    $16, %rax
+
+	## eax contains floor(bytes / 24) = num 24-byte chunks to do
+
+	## process rax 24-byte chunks (128 >= rax >= 0)
+
+	## compute end address of each block
+	## block 0 (base addr + RAX * 8)
+	## block 1 (base addr + RAX * 16)
+	## block 2 (base addr + RAX * 24)
+	lea     (bufptmp, %rax, 8), block_0
+	lea     (block_0, %rax, 8), block_1
+	lea     (block_1, %rax, 8), block_2
+
+	xor     crc1, crc1
+	xor     crc2, crc2
+
+	## branch into array
+	lea	jump_table(%rip), bufp
+	movzxw  (bufp, %rax, 2), len
+	offset=crc_array-jump_table
+	lea     offset(bufp, len, 1), bufp
+	jmp     *bufp
+
+	################################################################
+	## 2a) PROCESS FULL BLOCKS:
+	################################################################
+full_block:
+	movq    $128,%rax
+	lea     128*8*2(block_0), block_1
+	lea     128*8*3(block_0), block_2
+	add     $128*8*1, block_0
+
+	xor     crc1,crc1
+	xor     crc2,crc2
+
+	# Fall thruogh into top of crc array (crc_128)
+
+	################################################################
+	## 3) CRC Array:
+	################################################################
+
+crc_array:
+	i=128
+.rept 128-1
+.altmacro
+LABEL crc_ %i
+.noaltmacro
+	crc32q   -i*8(block_0), crc_init
+	crc32q   -i*8(block_1), crc1
+	crc32q   -i*8(block_2), crc2
+	i=(i-1)
+.endr
+
+.altmacro
+LABEL crc_ %i
+.noaltmacro
+	crc32q   -i*8(block_0), crc_init
+	crc32q   -i*8(block_1), crc1
+# SKIP  crc32  -i*8(block_2), crc2 ; Don't do this one yet
+
+	mov     block_2, block_0
+
+	################################################################
+	## 4) Combine three results:
+	################################################################
+
+	lea	(K_table-16)(%rip), bufp	# first entry is for idx 1
+	shlq    $3, %rax			# rax *= 8
+	subq    %rax, tmp			# tmp -= rax*8
+	shlq    $1, %rax
+	subq    %rax, tmp			# tmp -= rax*16
+						# (total tmp -= rax*24)
+	addq    %rax, bufp
+
+	movdqa  (bufp), %xmm0			# 2 consts: K1:K2
+
+	movq    crc_init, %xmm1			# CRC for block 1
+	pclmulqdq $0x00,%xmm0,%xmm1		# Multiply by K2
+
+	movq    crc1, %xmm2			# CRC for block 2
+	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1
+
+	pxor    %xmm2,%xmm1
+	movq    %xmm1, %rax
+	xor     -i*8(block_2), %rax
+	mov     crc2, crc_init
+	crc32   %rax, crc_init
+
+################################################################
+## 5) Check for end:
+################################################################
+
+LABEL crc_ 0
+	mov     tmp, len
+	cmp     $128*24, tmp
+	jae     full_block
+	cmp     $24, tmp
+	jae     continue_block
+
+less_than_24:
+	shl     $32-4, len_dw			# less_than_16 expects length
+						# in upper 4 bits of len_dw
+	jnc     less_than_16
+	crc32q  (bufptmp), crc_init
+	crc32q  8(bufptmp), crc_init
+	jz      do_return
+	add     $16, bufptmp
+	# len is less than 8 if we got here
+	# less_than_8 expects length in upper 3 bits of len_dw
+	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
+	shl     $2, len_dw
+	jmp     less_than_8_post_shl1
+
+	#######################################################################
+	## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+	#######################################################################
+small:
+	shl $32-8, len_dw		# Prepare len_dw for less_than_256
+	j=256
+.rept 5					# j = {256, 128, 64, 32, 16}
+.altmacro
+LABEL less_than_ %j			# less_than_j: Length should be in
+					# upper lg(j) bits of len_dw
+	j=(j/2)
+	shl     $1, len_dw		# Get next MSB
+	JNC_LESS_THAN %j
+.noaltmacro
+	i=0
+.rept (j/8)
+	crc32q  i(bufptmp), crc_init	# Compute crc32 of 8-byte data
+	i=i+8
+.endr
+	jz      do_return		# Return if remaining length is zero
+	add     $j, bufptmp		# Advance buf
+.endr
+
+less_than_8:				# Length should be stored in
+					# upper 3 bits of len_dw
+	shl     $1, len_dw
+less_than_8_post_shl1:
+	jnc     less_than_4
+	crc32l  (bufptmp), crc_init_dw	# CRC of 4 bytes
+	jz      do_return		# return if remaining data is zero
+	add     $4, bufptmp
+less_than_4:				# Length should be stored in
+					# upper 2 bits of len_dw
+	shl     $1, len_dw
+	jnc     less_than_2
+	crc32w  (bufptmp), crc_init_dw	# CRC of 2 bytes
+	jz      do_return		# return if remaining data is zero
+	add     $2, bufptmp
+less_than_2:				# Length should be stored in the MSB
+					# of len_dw
+	shl     $1, len_dw
+	jnc     less_than_1
+	crc32b  (bufptmp), crc_init_dw	# CRC of 1 byte
+less_than_1:				# Length should be zero
+do_return:
+	movq    crc_init, %rax
+	popq    %rsi
+	popq    %rdi
+	popq    %rbx
+        ret
+
+        ################################################################
+        ## jump table        Table is 129 entries x 2 bytes each
+        ################################################################
+.align 4
+jump_table:
+	i=0
+.rept 129
+.altmacro
+JMPTBL_ENTRY %i
+.noaltmacro
+	i=i+1
+.endr
+	################################################################
+	## PCLMULQDQ tables
+	## Table is 128 entries x 2 quad words each
+	################################################################
+.data
+.align 64
+K_table:
+        .quad 0x14cd00bd6,0x105ec76f0
+        .quad 0x0ba4fc28e,0x14cd00bd6
+        .quad 0x1d82c63da,0x0f20c0dfe
+        .quad 0x09e4addf8,0x0ba4fc28e
+        .quad 0x039d3b296,0x1384aa63a
+        .quad 0x102f9b8a2,0x1d82c63da
+        .quad 0x14237f5e6,0x01c291d04
+        .quad 0x00d3b6092,0x09e4addf8
+        .quad 0x0c96cfdc0,0x0740eef02
+        .quad 0x18266e456,0x039d3b296
+        .quad 0x0daece73e,0x0083a6eec
+        .quad 0x0ab7aff2a,0x102f9b8a2
+        .quad 0x1248ea574,0x1c1733996
+        .quad 0x083348832,0x14237f5e6
+        .quad 0x12c743124,0x02ad91c30
+        .quad 0x0b9e02b86,0x00d3b6092
+        .quad 0x018b33a4e,0x06992cea2
+        .quad 0x1b331e26a,0x0c96cfdc0
+        .quad 0x17d35ba46,0x07e908048
+        .quad 0x1bf2e8b8a,0x18266e456
+        .quad 0x1a3e0968a,0x11ed1f9d8
+        .quad 0x0ce7f39f4,0x0daece73e
+        .quad 0x061d82e56,0x0f1d0f55e
+        .quad 0x0d270f1a2,0x0ab7aff2a
+        .quad 0x1c3f5f66c,0x0a87ab8a8
+        .quad 0x12ed0daac,0x1248ea574
+        .quad 0x065863b64,0x08462d800
+        .quad 0x11eef4f8e,0x083348832
+        .quad 0x1ee54f54c,0x071d111a8
+        .quad 0x0b3e32c28,0x12c743124
+        .quad 0x0064f7f26,0x0ffd852c6
+        .quad 0x0dd7e3b0c,0x0b9e02b86
+        .quad 0x0f285651c,0x0dcb17aa4
+        .quad 0x010746f3c,0x018b33a4e
+        .quad 0x1c24afea4,0x0f37c5aee
+        .quad 0x0271d9844,0x1b331e26a
+        .quad 0x08e766a0c,0x06051d5a2
+        .quad 0x093a5f730,0x17d35ba46
+        .quad 0x06cb08e5c,0x11d5ca20e
+        .quad 0x06b749fb2,0x1bf2e8b8a
+        .quad 0x1167f94f2,0x021f3d99c
+        .quad 0x0cec3662e,0x1a3e0968a
+        .quad 0x19329634a,0x08f158014
+        .quad 0x0e6fc4e6a,0x0ce7f39f4
+        .quad 0x08227bb8a,0x1a5e82106
+        .quad 0x0b0cd4768,0x061d82e56
+        .quad 0x13c2b89c4,0x188815ab2
+        .quad 0x0d7a4825c,0x0d270f1a2
+        .quad 0x10f5ff2ba,0x105405f3e
+        .quad 0x00167d312,0x1c3f5f66c
+        .quad 0x0f6076544,0x0e9adf796
+        .quad 0x026f6a60a,0x12ed0daac
+        .quad 0x1a2adb74e,0x096638b34
+        .quad 0x19d34af3a,0x065863b64
+        .quad 0x049c3cc9c,0x1e50585a0
+        .quad 0x068bce87a,0x11eef4f8e
+        .quad 0x1524fa6c6,0x19f1c69dc
+        .quad 0x16cba8aca,0x1ee54f54c
+        .quad 0x042d98888,0x12913343e
+        .quad 0x1329d9f7e,0x0b3e32c28
+        .quad 0x1b1c69528,0x088f25a3a
+        .quad 0x02178513a,0x0064f7f26
+        .quad 0x0e0ac139e,0x04e36f0b0
+        .quad 0x0170076fa,0x0dd7e3b0c
+        .quad 0x141a1a2e2,0x0bd6f81f8
+        .quad 0x16ad828b4,0x0f285651c
+        .quad 0x041d17b64,0x19425cbba
+        .quad 0x1fae1cc66,0x010746f3c
+        .quad 0x1a75b4b00,0x18db37e8a
+        .quad 0x0f872e54c,0x1c24afea4
+        .quad 0x01e41e9fc,0x04c144932
+        .quad 0x086d8e4d2,0x0271d9844
+        .quad 0x160f7af7a,0x052148f02
+        .quad 0x05bb8f1bc,0x08e766a0c
+        .quad 0x0a90fd27a,0x0a3c6f37a
+        .quad 0x0b3af077a,0x093a5f730
+        .quad 0x04984d782,0x1d22c238e
+        .quad 0x0ca6ef3ac,0x06cb08e5c
+        .quad 0x0234e0b26,0x063ded06a
+        .quad 0x1d88abd4a,0x06b749fb2
+        .quad 0x04597456a,0x04d56973c
+        .quad 0x0e9e28eb4,0x1167f94f2
+        .quad 0x07b3ff57a,0x19385bf2e
+        .quad 0x0c9c8b782,0x0cec3662e
+        .quad 0x13a9cba9e,0x0e417f38a
+        .quad 0x093e106a4,0x19329634a
+        .quad 0x167001a9c,0x14e727980
+        .quad 0x1ddffc5d4,0x0e6fc4e6a
+        .quad 0x00df04680,0x0d104b8fc
+        .quad 0x02342001e,0x08227bb8a
+        .quad 0x00a2a8d7e,0x05b397730
+        .quad 0x168763fa6,0x0b0cd4768
+        .quad 0x1ed5a407a,0x0e78eb416
+        .quad 0x0d2c3ed1a,0x13c2b89c4
+        .quad 0x0995a5724,0x1641378f0
+        .quad 0x19b1afbc4,0x0d7a4825c
+        .quad 0x109ffedc0,0x08d96551c
+        .quad 0x0f2271e60,0x10f5ff2ba
+        .quad 0x00b0bf8ca,0x00bf80dd2
+        .quad 0x123888b7a,0x00167d312
+        .quad 0x1e888f7dc,0x18dcddd1c
+        .quad 0x002ee03b2,0x0f6076544
+        .quad 0x183e8d8fe,0x06a45d2b2
+        .quad 0x133d7a042,0x026f6a60a
+        .quad 0x116b0f50c,0x1dd3e10e8
+        .quad 0x05fabe670,0x1a2adb74e
+        .quad 0x130004488,0x0de87806c
+        .quad 0x000bcf5f6,0x19d34af3a
+        .quad 0x18f0c7078,0x014338754
+        .quad 0x017f27698,0x049c3cc9c
+        .quad 0x058ca5f00,0x15e3e77ee
+        .quad 0x1af900c24,0x068bce87a
+        .quad 0x0b5cfca28,0x0dd07448e
+        .quad 0x0ded288f8,0x1524fa6c6
+        .quad 0x059f229bc,0x1d8048348
+        .quad 0x06d390dec,0x16cba8aca
+        .quad 0x037170390,0x0a3e3e02c
+        .quad 0x06353c1cc,0x042d98888
+        .quad 0x0c4584f5c,0x0d73c7bea
+        .quad 0x1f16a3418,0x1329d9f7e
+        .quad 0x0531377e2,0x185137662
+        .quad 0x1d8d9ca7c,0x1b1c69528
+        .quad 0x0b25b29f2,0x18a08b5bc
+        .quad 0x19fb2a8b0,0x02178513a
+        .quad 0x1a08fe6ac,0x1da758ae0
+        .quad 0x045cddf4e,0x0e0ac139e
+        .quad 0x1a91647f2,0x169cf9eb0
+        .quad 0x1a0f717c4,0x0170076fa
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 6563366bae80..b901b590635f 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -324,9 +324,19 @@ config CRYPTO_CRC32C
 	  by iSCSI for header and data digests and by others.
 	  See Castagnoli93.  Module will be crc32c.
 
+config CRYPTO_CRC32C_X86_64
+	bool
+	depends on X86 && 64BIT
+	select CRYPTO_HASH
+	help
+	  In Intel processor with SSE4.2 supported, the processor will
+	  support CRC32C calculation using hardware accelerated CRC32
+	  instruction optimized with PCLMULQDQ instruction when available.
+
 config CRYPTO_CRC32C_INTEL
 	tristate "CRC32c INTEL hardware acceleration"
 	depends on X86
+	select CRYPTO_CRC32C_X86_64 if 64BIT
 	select CRYPTO_HASH
 	help
 	  In Intel processor with SSE4.2 supported, the processor will
-- 
cgit v1.2.3-58-ga151


From 58990986f1cba40c23c0c10592ace08616de3ffa Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Sat, 20 Oct 2012 15:06:36 +0300
Subject: crypto: x86/glue_helper - use le128 instead of u128 for CTR mode

'u128' currently used for CTR mode is on little-endian 'long long' swapped
and would require extra swap operations by SSE/AVX code. Use of le128
instead of u128 allows IV calculations to be done with vector registers
easier.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/camellia_glue.c           | 16 ++++++++--------
 arch/x86/crypto/cast6_avx_glue.c          | 12 ++++++------
 arch/x86/crypto/glue_helper.c             | 12 ++++++------
 arch/x86/crypto/serpent_avx_glue.c        | 12 ++++++------
 arch/x86/crypto/serpent_sse2_glue.c       | 12 ++++++------
 arch/x86/crypto/twofish_avx_glue.c        |  6 +++---
 arch/x86/crypto/twofish_glue_3way.c       | 20 ++++++++++----------
 arch/x86/include/asm/crypto/glue_helper.h | 28 +++++++++++++++++-----------
 arch/x86/include/asm/crypto/twofish.h     |  4 ++--
 9 files changed, 64 insertions(+), 58 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 42ffd2bbab5b..021a0086186b 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1317,21 +1317,21 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
 	u128_xor(&dst[1], &dst[1], &iv);
 }
 
-static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
 
 	if (dst != src)
 		*dst = *src;
 
-	u128_to_be128(&ctrblk, iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblk, iv);
+	le128_inc(iv);
 
 	camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
 }
 
 static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
-				    u128 *iv)
+				    le128 *iv)
 {
 	be128 ctrblks[2];
 
@@ -1340,10 +1340,10 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
 		dst[1] = src[1];
 	}
 
-	u128_to_be128(&ctrblks[0], iv);
-	u128_inc(iv);
-	u128_to_be128(&ctrblks[1], iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblks[0], iv);
+	le128_inc(iv);
+	le128_to_be128(&ctrblks[1], iv);
+	le128_inc(iv);
 
 	camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
 }
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 15e5f85a5011..1dfd33b5b4fb 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -78,19 +78,19 @@ static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
 		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
 }
 
-static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
 
-	u128_to_be128(&ctrblk, iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblk, iv);
+	le128_inc(iv);
 
 	__cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
 
 static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				   u128 *iv)
+				 le128 *iv)
 {
 	be128 ctrblks[CAST6_PARALLEL_BLOCKS];
 	unsigned int i;
@@ -99,8 +99,8 @@ static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
 		if (dst != src)
 			dst[i] = src[i];
 
-		u128_to_be128(&ctrblks[i], iv);
-		u128_inc(iv);
+		le128_to_be128(&ctrblks[i], iv);
+		le128_inc(iv);
 	}
 
 	cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 30b3927bd733..22ce4f683e55 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
 	u8 *src = (u8 *)walk->src.virt.addr;
 	u8 *dst = (u8 *)walk->dst.virt.addr;
 	unsigned int nbytes = walk->nbytes;
-	u128 ctrblk;
+	le128 ctrblk;
 	u128 tmp;
 
-	be128_to_u128(&ctrblk, (be128 *)walk->iv);
+	be128_to_le128(&ctrblk, (be128 *)walk->iv);
 
 	memcpy(&tmp, src, nbytes);
 	fn_ctr(ctx, &tmp, &tmp, &ctrblk);
 	memcpy(dst, &tmp, nbytes);
 
-	u128_to_be128((be128 *)walk->iv, &ctrblk);
+	le128_to_be128((be128 *)walk->iv, &ctrblk);
 }
 EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
 
@@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
 	unsigned int nbytes = walk->nbytes;
 	u128 *src = (u128 *)walk->src.virt.addr;
 	u128 *dst = (u128 *)walk->dst.virt.addr;
-	u128 ctrblk;
+	le128 ctrblk;
 	unsigned int num_blocks, func_bytes;
 	unsigned int i;
 
-	be128_to_u128(&ctrblk, (be128 *)walk->iv);
+	be128_to_le128(&ctrblk, (be128 *)walk->iv);
 
 	/* Process multi-block batch */
 	for (i = 0; i < gctx->num_funcs; i++) {
@@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
 	}
 
 done:
-	u128_to_be128((be128 *)walk->iv, &ctrblk);
+	le128_to_be128((be128 *)walk->iv, &ctrblk);
 	return nbytes;
 }
 
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 3f543a04cf1e..2aa31ade1e68 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -56,19 +56,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
 		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
 }
 
-static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
 
-	u128_to_be128(&ctrblk, iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblk, iv);
+	le128_inc(iv);
 
 	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
 
 static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				   u128 *iv)
+				   le128 *iv)
 {
 	be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
 	unsigned int i;
@@ -77,8 +77,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
 		if (dst != src)
 			dst[i] = src[i];
 
-		u128_to_be128(&ctrblks[i], iv);
-		u128_inc(iv);
+		le128_to_be128(&ctrblks[i], iv);
+		le128_inc(iv);
 	}
 
 	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 9107a9908c41..97a356ece24d 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
 		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
 }
 
-static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
 
-	u128_to_be128(&ctrblk, iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblk, iv);
+	le128_inc(iv);
 
 	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
 
 static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				   u128 *iv)
+				   le128 *iv)
 {
 	be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
 	unsigned int i;
@@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
 		if (dst != src)
 			dst[i] = src[i];
 
-		u128_to_be128(&ctrblks[i], iv);
-		u128_inc(iv);
+		le128_to_be128(&ctrblks[i], iv);
+		le128_inc(iv);
 	}
 
 	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index e7708b5442e0..810e45d51186 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -90,7 +90,7 @@ static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
 }
 
 static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				     u128 *iv)
+				     le128 *iv)
 {
 	be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
 	unsigned int i;
@@ -99,8 +99,8 @@ static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
 		if (dst != src)
 			dst[i] = src[i];
 
-		u128_to_be128(&ctrblks[i], iv);
-		u128_inc(iv);
+		le128_to_be128(&ctrblks[i], iv);
+		le128_inc(iv);
 	}
 
 	twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index aa3eb358b7e8..13e63b3e1dfb 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
 }
 EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
 
-void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
+void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
 
 	if (dst != src)
 		*dst = *src;
 
-	u128_to_be128(&ctrblk, iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblk, iv);
+	le128_inc(iv);
 
 	twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
 	u128_xor(dst, dst, (u128 *)&ctrblk);
@@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
 EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
 
 void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
-				     u128 *iv)
+			      le128 *iv)
 {
 	be128 ctrblks[3];
 
@@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
 		dst[2] = src[2];
 	}
 
-	u128_to_be128(&ctrblks[0], iv);
-	u128_inc(iv);
-	u128_to_be128(&ctrblks[1], iv);
-	u128_inc(iv);
-	u128_to_be128(&ctrblks[2], iv);
-	u128_inc(iv);
+	le128_to_be128(&ctrblks[0], iv);
+	le128_inc(iv);
+	le128_to_be128(&ctrblks[1], iv);
+	le128_inc(iv);
+	le128_to_be128(&ctrblks[2], iv);
+	le128_inc(iv);
 
 	twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
 }
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 3e408bddc96f..e2d65b061d27 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -13,7 +13,7 @@
 typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
 typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
 typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
-				       u128 *iv);
+				       le128 *iv);
 
 #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
 #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
@@ -71,23 +71,29 @@ static inline void glue_fpu_end(bool fpu_enabled)
 		kernel_fpu_end();
 }
 
-static inline void u128_to_be128(be128 *dst, const u128 *src)
+static inline void le128_to_be128(be128 *dst, const le128 *src)
 {
-	dst->a = cpu_to_be64(src->a);
-	dst->b = cpu_to_be64(src->b);
+	dst->a = cpu_to_be64(le64_to_cpu(src->a));
+	dst->b = cpu_to_be64(le64_to_cpu(src->b));
 }
 
-static inline void be128_to_u128(u128 *dst, const be128 *src)
+static inline void be128_to_le128(le128 *dst, const be128 *src)
 {
-	dst->a = be64_to_cpu(src->a);
-	dst->b = be64_to_cpu(src->b);
+	dst->a = cpu_to_le64(be64_to_cpu(src->a));
+	dst->b = cpu_to_le64(be64_to_cpu(src->b));
 }
 
-static inline void u128_inc(u128 *i)
+static inline void le128_inc(le128 *i)
 {
-	i->b++;
-	if (!i->b)
-		i->a++;
+	u64 a = le64_to_cpu(i->a);
+	u64 b = le64_to_cpu(i->b);
+
+	b++;
+	if (!b)
+		a++;
+
+	i->a = cpu_to_le64(a);
+	i->b = cpu_to_le64(b);
 }
 
 extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
index 9d2c514bd5f9..878c51ceebb5 100644
--- a/arch/x86/include/asm/crypto/twofish.h
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -31,9 +31,9 @@ asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 /* helpers from twofish_x86_64-3way module */
 extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
 extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
-				u128 *iv);
+				le128 *iv);
 extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
-				     u128 *iv);
+				     le128 *iv);
 
 extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 			      unsigned int keylen);
-- 
cgit v1.2.3-58-ga151


From cba1cce05498d55f363c28cd2512368e95605518 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Sat, 20 Oct 2012 15:06:41 +0300
Subject: crypto: cast6/avx - avoid using temporary stack buffers

Introduce new assembler functions to avoid use temporary stack buffers in
glue code. This also allows use of vector instructions for xoring output
in CTR and CBC modes and construction of IVs for CTR mode.

ECB mode sees ~0.5% decrease in speed because added one extra function
call. CBC mode decryption and CTR mode benefit from vector operations
and gain ~2%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 190 +++++++++++++++++++-----------
 arch/x86/crypto/cast6_avx_glue.c          |  71 ++---------
 arch/x86/crypto/glue_helper-asm-avx.S     |  91 ++++++++++++++
 3 files changed, 227 insertions(+), 125 deletions(-)
 create mode 100644 arch/x86/crypto/glue_helper-asm-avx.S

(limited to 'arch')

diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 218d283772f4..83a53818f0a5 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,6 +23,8 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "cast6-avx-x86_64-asm_64.S"
 
 .extern cast6_s1
@@ -205,11 +207,7 @@
 	vpunpcklqdq		x3, t2, x2; \
 	vpunpckhqdq		x3, t2, x3;
 
-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	vmovdqu (0*4*4)(in),	x0; \
-	vmovdqu (1*4*4)(in),	x1; \
-	vmovdqu (2*4*4)(in),	x2; \
-	vmovdqu (3*4*4)(in),	x3; \
+#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	vpshufb rmask, x0,	x0; \
 	vpshufb rmask, x1,	x1; \
 	vpshufb rmask, x2,	x2; \
@@ -217,39 +215,21 @@
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
+#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
 	vpshufb rmask,		x0, x0;       \
 	vpshufb rmask,		x1, x1;       \
 	vpshufb rmask,		x2, x2;       \
-	vpshufb rmask,		x3, x3;       \
-	vmovdqu x0,		(0*4*4)(out); \
-	vmovdqu	x1,		(1*4*4)(out); \
-	vmovdqu	x2,		(2*4*4)(out); \
-	vmovdqu	x3,		(3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpshufb rmask,		x0, x0;       \
-	vpshufb rmask,		x1, x1;       \
-	vpshufb rmask,		x2, x2;       \
-	vpshufb rmask,		x3, x3;       \
-	vpxor (0*4*4)(out),	x0, x0;       \
-	vmovdqu	x0,		(0*4*4)(out); \
-	vpxor (1*4*4)(out),	x1, x1;       \
-	vmovdqu	x1,		(1*4*4)(out); \
-	vpxor (2*4*4)(out),	x2, x2;       \
-	vmovdqu x2,		(2*4*4)(out); \
-	vpxor (3*4*4)(out),	x3, x3;       \
-	vmovdqu x3,		(3*4*4)(out);
+	vpshufb rmask,		x3, x3;
 
 .data
 
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 .Lrkr_enc_Q_Q_QBAR_QBAR:
 	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@
 
 .text
 
-.align 16
-.global __cast6_enc_blk_8way
-.type   __cast6_enc_blk_8way,@function;
+.align 8
+.type   __cast6_enc_blk8,@function;
 
-__cast6_enc_blk_8way:
+__cast6_enc_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;
 
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(0, dummy, none);
 	Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
 	QBAR(10);
 	QBAR(11);
 
-	popq %rcx;
 	popq %rbx;
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	ret;
 
-__enc_xor8:
-	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
 
-.align 16
-.global cast6_dec_blk_8way
-.type   cast6_dec_blk_8way,@function;
+.align 8
+.type   __cast6_dec_blk8,@function;
 
-cast6_dec_blk_8way:
+__cast6_dec_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
 	pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
 	Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
+
+	ret;
+
+.align 8
+.global cast6_ecb_enc_8way
+.type   cast6_ecb_enc_8way,@function;
+
+cast6_ecb_enc_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_enc_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_ecb_dec_8way
+.type   cast6_ecb_dec_8way,@function;
+
+cast6_ecb_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global cast6_cbc_dec_8way
+.type   cast6_cbc_dec_8way,@function;
+
+cast6_cbc_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __cast6_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
+
+.align 8
+.global cast6_ctr_8way
+.type   cast6_ctr_8way,@function;
+
+cast6_ctr_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX, RKR, RKM);
+
+	call __cast6_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
 
 	ret;
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 1dfd33b5b4fb..92f7ca24790a 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -40,43 +40,15 @@
 
 #define CAST6_PARALLEL_BLOCKS 8
 
-asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
-				     const u8 *src, bool xor);
-asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
+asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src);
+asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
 				   const u8 *src);
 
-static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__cast6_enc_blk_8way(ctx, dst, src, false);
-}
-
-static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
-					  const u8 *src)
-{
-	__cast6_enc_blk_8way(ctx, dst, src, true);
-}
-
-static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	cast6_dec_blk_8way(ctx, dst, src);
-}
-
-
-static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
-	u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
-	unsigned int j;
-
-	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
-		ivs[j] = src[j];
-
-	cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-	for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
-		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
+asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src);
+asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
+			       le128 *iv);
 
 static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
@@ -89,30 +61,13 @@ static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
 
-static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				 le128 *iv)
-{
-	be128 ctrblks[CAST6_PARALLEL_BLOCKS];
-	unsigned int i;
-
-	for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
-		if (dst != src)
-			dst[i] = src[i];
-
-		le128_to_be128(&ctrblks[i], iv);
-		le128_inc(iv);
-	}
-
-	cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-
 static const struct common_glue_ctx cast6_enc = {
 	.num_funcs = 2,
 	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {
 
 	.funcs = { {
 		.num_blocks = CAST6_PARALLEL_BLOCKS,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) }
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
-		cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
-		cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
new file mode 100644
index 000000000000..f7b6ea2ddfdb
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -0,0 +1,91 @@
+/*
+ * Shared glue code for 128bit block ciphers, AVX assembler macros
+ *
+ * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu (0*16)(src), x0; \
+	vmovdqu (1*16)(src), x1; \
+	vmovdqu (2*16)(src), x2; \
+	vmovdqu (3*16)(src), x3; \
+	vmovdqu (4*16)(src), x4; \
+	vmovdqu (5*16)(src), x5; \
+	vmovdqu (6*16)(src), x6; \
+	vmovdqu (7*16)(src), x7;
+
+#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu x0, (0*16)(dst); \
+	vmovdqu x1, (1*16)(dst); \
+	vmovdqu x2, (2*16)(dst); \
+	vmovdqu x3, (3*16)(dst); \
+	vmovdqu x4, (4*16)(dst); \
+	vmovdqu x5, (5*16)(dst); \
+	vmovdqu x6, (6*16)(dst); \
+	vmovdqu x7, (7*16)(dst);
+
+#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*16)(src), x1, x1; \
+	vpxor (1*16)(src), x2, x2; \
+	vpxor (2*16)(src), x3, x3; \
+	vpxor (3*16)(src), x4, x4; \
+	vpxor (4*16)(src), x5, x5; \
+	vpxor (5*16)(src), x6, x6; \
+	vpxor (6*16)(src), x7, x7; \
+	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
+	vpcmpeqd t0, t0, t0; \
+	vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
+	vmovdqa bswap, t1; \
+	\
+	/* load IV and byteswap */ \
+	vmovdqu (iv), x7; \
+	vpshufb t1, x7, x0; \
+	\
+	/* construct IVs */ \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x1; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x2; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x3; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x4; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x5; \
+	inc_le128(x7, t0, t2); \
+	vpshufb t1, x7, x6; \
+	inc_le128(x7, t0, t2); \
+	vmovdqa x7, t2; \
+	vpshufb t1, x7, x7; \
+	inc_le128(t2, t0, t1); \
+	vmovdqu t2, (iv);
+
+#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*16)(src), x0, x0; \
+	vpxor (1*16)(src), x1, x1; \
+	vpxor (2*16)(src), x2, x2; \
+	vpxor (3*16)(src), x3, x3; \
+	vpxor (4*16)(src), x4, x4; \
+	vpxor (5*16)(src), x5, x5; \
+	vpxor (6*16)(src), x6, x6; \
+	vpxor (7*16)(src), x7, x7; \
+	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
-- 
cgit v1.2.3-58-ga151


From 8435a3c3003c00c43f1b267368bbe1d8dada35d1 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Sat, 20 Oct 2012 15:06:46 +0300
Subject: crypto: twofish/avx - avoid using temporary stack buffers

Introduce new assembler functions to avoid use temporary stack buffers in glue
code. This also allows use of vector instructions for xoring output in CTR and
CBC modes and construction of IVs for CTR mode.

ECB mode sees ~0.2% decrease in speed because added one extra function
call. CBC mode decryption and CTR mode benefit from vector operations
and gain ~3%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S | 208 ++++++++++++++++++----------
 arch/x86/crypto/twofish_avx_glue.c          |  73 ++--------
 2 files changed, 152 insertions(+), 129 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 1585abb13dde..ebac16bfa830 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -23,7 +23,16 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "twofish-avx-x86_64-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
 .text
 
 /* structure of crypto context */
@@ -217,69 +226,45 @@
 	vpunpcklqdq		x3, t2, x2; \
 	vpunpckhqdq		x3, t2, x3;
 
-#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
-	vpxor (0*4*4)(in),	wkey, x0; \
-	vpxor (1*4*4)(in),	wkey, x1; \
-	vpxor (2*4*4)(in),	wkey, x2; \
-	vpxor (3*4*4)(in),	wkey, x3; \
+#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+	vpxor		x0, wkey, x0; \
+	vpxor		x1, wkey, x1; \
+	vpxor		x2, wkey, x2; \
+	vpxor		x3, wkey, x3; \
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpxor		x0, wkey, x0;     \
-	vmovdqu 	x0, (0*4*4)(out); \
-	vpxor		x1, wkey, x1;     \
-	vmovdqu		x1, (1*4*4)(out); \
-	vpxor		x2, wkey, x2;     \
-	vmovdqu		x2, (2*4*4)(out); \
-	vpxor		x3, wkey, x3;     \
-	vmovdqu		x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
+#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
-	vpxor		x0, wkey, x0;         \
-	vpxor		(0*4*4)(out), x0, x0; \
-	vmovdqu 	x0, (0*4*4)(out);     \
-	vpxor		x1, wkey, x1;         \
-	vpxor		(1*4*4)(out), x1, x1; \
-	vmovdqu	        x1, (1*4*4)(out);     \
-	vpxor		x2, wkey, x2;         \
-	vpxor           (2*4*4)(out), x2, x2; \
-	vmovdqu		x2, (2*4*4)(out);     \
-	vpxor		x3, wkey, x3;         \
-	vpxor           (3*4*4)(out), x3, x3; \
-	vmovdqu		x3, (3*4*4)(out);
+	vpxor		x0, wkey, x0; \
+	vpxor		x1, wkey, x1; \
+	vpxor		x2, wkey, x2; \
+	vpxor		x3, wkey, x3;
 
 .align 8
-.global __twofish_enc_blk_8way
-.type   __twofish_enc_blk_8way,@function;
+.type	__twofish_enc_blk8,@function;
 
-__twofish_enc_blk_8way:
+__twofish_enc_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
 	 */
 
+	vmovdqu w(CTX), RK1;
+
 	pushq %rbp;
 	pushq %rbx;
 	pushq %rcx;
 
-	vmovdqu w(CTX), RK1;
-
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
 	preload_rgi(RA1);
 	rotate_1l(RD1);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
 	rotate_1l(RD2);
 
-	movq %rsi, %r11;
-
 	encrypt_cycle(0);
 	encrypt_cycle(1);
 	encrypt_cycle(2);
@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
 	popq %rbx;
 	popq %rbp;
 
-	leaq (4*4*4)(%r11), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
-	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
-
-	ret;
-
-__enc_xor8:
-	outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
-	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 
 	ret;
 
 .align 8
-.global twofish_dec_blk_8way
-.type   twofish_dec_blk_8way,@function;
+.type	__twofish_dec_blk8,@function;
 
-twofish_dec_blk_8way:
+__twofish_dec_blk8:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
 	 */
 
+	vmovdqu (w+4*4)(CTX), RK1;
+
 	pushq %rbp;
 	pushq %rbx;
 
-	vmovdqu (w+4*4)(CTX), RK1;
-
-	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
 	preload_rgi(RC1);
 	rotate_1l(RA1);
-	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 	rotate_1l(RA2);
 
-	movq %rsi, %r11;
-
 	decrypt_cycle(7);
 	decrypt_cycle(6);
 	decrypt_cycle(5);
@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
 	popq %rbx;
 	popq %rbp;
 
-	leaq (4*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+
+	ret;
+
+.align 8
+.global twofish_ecb_enc_8way
+.type   twofish_ecb_enc_8way,@function;
+
+twofish_ecb_enc_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __twofish_enc_blk8;
+
+	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	ret;
+
+.align 8
+.global twofish_ecb_dec_8way
+.type   twofish_ecb_dec_8way,@function;
+
+twofish_ecb_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	call __twofish_dec_blk8;
+
+	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global twofish_cbc_dec_8way
+.type   twofish_cbc_dec_8way,@function;
+
+twofish_cbc_dec_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	call __twofish_dec_blk8;
+
+	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	popq %r12;
+
+	ret;
+
+.align 8
+.global twofish_ctr_8way
+.type   twofish_ctr_8way,@function;
+
+twofish_ctr_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RX0, RX1, RY0);
+
+	call __twofish_enc_blk8;
+
+	store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	popq %r12;
 
 	ret;
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 810e45d51186..94ac91d26e47 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -45,66 +45,23 @@
 
 #define TWOFISH_PARALLEL_BLOCKS 8
 
-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__twofish_enc_blk_3way(ctx, dst, src, false);
-}
-
 /* 8-way parallel cipher functions */
-asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
-				       const u8 *src, bool xor);
-asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
+asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
 
-static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__twofish_enc_blk_8way(ctx, dst, src, false);
-}
-
-static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
-					    const u8 *src)
-{
-	__twofish_enc_blk_8way(ctx, dst, src, true);
-}
+asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
+				 const u8 *src, le128 *iv);
 
-static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
 {
-	twofish_dec_blk_8way(ctx, dst, src);
-}
-
-static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
-	u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
-	unsigned int j;
-
-	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
-		ivs[j] = src[j];
-
-	twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-	for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
-		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+	__twofish_enc_blk_3way(ctx, dst, src, false);
 }
 
-static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				     le128 *iv)
-{
-	be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
-	unsigned int i;
-
-	for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
-		if (dst != src)
-			dst[i] = src[i];
-
-		le128_to_be128(&ctrblks[i], iv);
-		le128_inc(iv);
-	}
-
-	twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
 
 static const struct common_glue_ctx twofish_enc = {
 	.num_funcs = 3,
@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {
 
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {
 
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {
 
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {
 
 	.funcs = { {
 		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
 	}, {
 		.num_blocks = 3,
 		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
-		twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
-		twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
-- 
cgit v1.2.3-58-ga151


From facd416fbc1cdee357730909a414898934f16ae1 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Sat, 20 Oct 2012 15:06:51 +0300
Subject: crypto: serpent/avx - avoid using temporary stack buffers

Introduce new assembler functions to avoid use temporary stack buffers in glue
code. This also allows use of vector instructions for xoring output in CTR and
CBC modes and construction of IVs for CTR mode.

ECB mode sees ~0.5% decrease in speed because added one extra function
call. CBC mode decryption and CTR mode benefit from vector operations
and gain ~3%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 166 ++++++++++++++++++----------
 arch/x86/crypto/serpent_avx_glue.c          |  43 +------
 arch/x86/include/asm/crypto/serpent-avx.h   |  27 ++---
 3 files changed, 121 insertions(+), 115 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 504106bf04a2..02b0e9fe997c 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -24,7 +24,16 @@
  *
  */
 
+#include "glue_helper-asm-avx.S"
+
 .file "serpent-avx-x86_64-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
 .text
 
 #define CTX %rdi
@@ -550,51 +559,27 @@
 	vpunpcklqdq		x3, t2, x2; \
 	vpunpckhqdq		x3, t2, x3;
 
-#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
-	vmovdqu (0*4*4)(in),	x0; \
-	vmovdqu (1*4*4)(in),	x1; \
-	vmovdqu (2*4*4)(in),	x2; \
-	vmovdqu (3*4*4)(in),	x3; \
-	\
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vmovdqu x0,		(0*4*4)(out); \
-	vmovdqu x1,		(1*4*4)(out); \
-	vmovdqu x2,		(2*4*4)(out); \
-	vmovdqu x3,		(3*4*4)(out);
-
-#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
-	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-	\
-	vpxor (0*4*4)(out),	x0, x0;       \
-	vmovdqu x0,		(0*4*4)(out); \
-	vpxor (1*4*4)(out),	x1, x1;       \
-	vmovdqu x1,		(1*4*4)(out); \
-	vpxor (2*4*4)(out),	x2, x2;       \
-	vmovdqu x2,		(2*4*4)(out); \
-	vpxor (3*4*4)(out),	x3, x3;       \
-	vmovdqu x3,		(3*4*4)(out);
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
 .align 8
-.global __serpent_enc_blk_8way_avx
-.type   __serpent_enc_blk_8way_avx,@function;
+.type   __serpent_enc_blk8_avx,@function;
 
-__serpent_enc_blk_8way_avx:
+__serpent_enc_blk8_avx:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 	 */
 
 	vpcmpeqd RNOT, RNOT, RNOT;
 
-	leaq (4*4*4)(%rdx), %rax;
-	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 
 						 K2(RA, RB, RC, RD, RE, 0);
 	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:
 	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
 	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
 
-	leaq (4*4*4)(%rsi), %rax;
-
-	testb %cl, %cl;
-	jnz __enc_xor8;
-
-	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
-
-	ret;
-
-__enc_xor8:
-	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 
 	ret;
 
 .align 8
-.global serpent_dec_blk_8way_avx
-.type   serpent_dec_blk_8way_avx,@function;
+.type   __serpent_dec_blk8_avx,@function;
 
-serpent_dec_blk_8way_avx:
+__serpent_dec_blk8_avx:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
+	 * output:
+	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
 	 */
 
 	vpcmpeqd RNOT, RNOT, RNOT;
 
-	leaq (4*4*4)(%rdx), %rax;
-	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
-	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 
 						 K2(RA, RB, RC, RD, RE, 32);
 	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:
 	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
 	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
 
-	leaq (4*4*4)(%rsi), %rax;
-	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
-	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
+
+.align 8
+.global serpent_ecb_enc_8way_avx
+.type   serpent_ecb_enc_8way_avx,@function;
+
+serpent_ecb_enc_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_enc_blk8_avx;
+
+	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+
+.align 8
+.global serpent_ecb_dec_8way_avx
+.type   serpent_ecb_dec_8way_avx,@function;
+
+serpent_ecb_dec_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk8_avx;
+
+	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	ret;
+
+.align 8
+.global serpent_cbc_dec_8way_avx
+.type   serpent_cbc_dec_8way_avx,@function;
+
+serpent_cbc_dec_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk8_avx;
+
+	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	ret;
+
+.align 8
+.global serpent_ctr_8way_avx
+.type   serpent_ctr_8way_avx,@function;
+
+serpent_ctr_8way_avx:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		      RD2, RK0, RK1, RK2);
+
+	call __serpent_enc_blk8_avx;
+
+	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 
 	ret;
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 2aa31ade1e68..52abaaf28e7f 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -42,20 +42,6 @@
 #include <asm/crypto/ablk_helper.h>
 #include <asm/crypto/glue_helper.h>
 
-static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
-	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
-	unsigned int j;
-
-	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
-		ivs[j] = src[j];
-
-	serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-	for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
-		u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
-
 static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
@@ -67,30 +53,13 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 	u128_xor(dst, src, (u128 *)&ctrblk);
 }
 
-static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-				   le128 *iv)
-{
-	be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
-	unsigned int i;
-
-	for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
-		if (dst != src)
-			dst[i] = src[i];
-
-		le128_to_be128(&ctrblks[i], iv);
-		le128_inc(iv);
-	}
-
-	serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-
 static const struct common_glue_ctx serpent_enc = {
 	.num_funcs = 2,
 	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
 
 	.funcs = { {
 		.num_blocks = SERPENT_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = {
 
 	.funcs = { {
 		.num_blocks = SERPENT_PARALLEL_BLOCKS,
-		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = {
 
 	.funcs = { {
 		.num_blocks = SERPENT_PARALLEL_BLOCKS,
-		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = {
 
 	.funcs = { {
 		.num_blocks = SERPENT_PARALLEL_BLOCKS,
-		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
 	}, {
 		.num_blocks = 1,
 		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
-		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
 
 	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
-		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
 		return;
 	}
 
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
index 432deedd2945..0da1d3e2a55c 100644
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -6,27 +6,14 @@
 
 #define SERPENT_PARALLEL_BLOCKS 8
 
-asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
-					   const u8 *src, bool xor);
-asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 					 const u8 *src);
 
-static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
-				   const u8 *src)
-{
-	__serpent_enc_blk_8way_avx(ctx, dst, src, false);
-}
-
-static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
-				       const u8 *src)
-{
-	__serpent_enc_blk_8way_avx(ctx, dst, src, true);
-}
-
-static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
-				   const u8 *src)
-{
-	serpent_dec_blk_8way_avx(ctx, dst, src);
-}
+asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src);
+asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
 
 #endif
-- 
cgit v1.2.3-58-ga151


From c12ab20b162c9414acadc18c6da6cfd3eea54b7b Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Sat, 20 Oct 2012 15:06:56 +0300
Subject: crypto: cast5/avx - avoid using temporary stack buffers

Introduce new assembler functions to avoid use temporary stack buffers in glue
code. This also allows use of vector instructions for xoring output in CTR and
CBC modes and construction of IVs for CTR mode.

ECB mode sees ~0.5% decrease in speed because added one extra function
call. CBC mode decryption and CTR mode benefit from vector operations
and gain ~5%.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 332 +++++++++++++++++++++++-------
 arch/x86/crypto/cast5_avx_glue.c          |  79 +++----
 2 files changed, 280 insertions(+), 131 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index a41a3aaba220..12478e472368 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -180,31 +180,17 @@
 	vpunpcklqdq		t1, t0, x0; \
 	vpunpckhqdq		t1, t0, x1;
 
-#define inpack_blocks(in, x0, x1, t0, t1, rmask) \
-	vmovdqu (0*4*4)(in),	x0; \
-	vmovdqu (1*4*4)(in),	x1; \
+#define inpack_blocks(x0, x1, t0, t1, rmask) \
 	vpshufb rmask, 	x0,	x0; \
 	vpshufb rmask, 	x1,	x1; \
 	\
 	transpose_2x4(x0, x1, t0, t1)
 
-#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \
+#define outunpack_blocks(x0, x1, t0, t1, rmask) \
 	transpose_2x4(x0, x1, t0, t1) \
 	\
 	vpshufb rmask,	x0, x0;           \
-	vpshufb rmask,	x1, x1;           \
-	vmovdqu 	x0, (0*4*4)(out); \
-	vmovdqu		x1, (1*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
-	transpose_2x4(x0, x1, t0, t1) \
-	\
-	vpshufb rmask,	x0, x0;               \
-	vpshufb rmask,	x1, x1;               \
-	vpxor		(0*4*4)(out), x0, x0; \
-	vmovdqu 	x0, (0*4*4)(out);     \
-	vpxor		(1*4*4)(out), x1, x1; \
-	vmovdqu	        x1, (1*4*4)(out);
+	vpshufb rmask,	x1, x1;
 
 .data
 
@@ -213,6 +199,8 @@
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap_iv_mask:
+	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
 .L16_mask:
 	.byte 16, 16, 16, 16
 .L32_mask:
@@ -223,35 +211,42 @@
 .text
 
 .align 16
-.global __cast5_enc_blk_16way
-.type   __cast5_enc_blk_16way,@function;
+.type   __cast5_enc_blk16,@function;
 
-__cast5_enc_blk_16way:
+__cast5_enc_blk16:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
-	 *	%rcx: bool, if true: xor output
+	 *	RL1: blocks 1 and 2
+	 *	RR1: blocks 3 and 4
+	 *	RL2: blocks 5 and 6
+	 *	RR2: blocks 7 and 8
+	 *	RL3: blocks 9 and 10
+	 *	RR3: blocks 11 and 12
+	 *	RL4: blocks 13 and 14
+	 *	RR4: blocks 15 and 16
+	 * output:
+	 *	RL1: encrypted blocks 1 and 2
+	 *	RR1: encrypted blocks 3 and 4
+	 *	RL2: encrypted blocks 5 and 6
+	 *	RR2: encrypted blocks 7 and 8
+	 *	RL3: encrypted blocks 9 and 10
+	 *	RR3: encrypted blocks 11 and 12
+	 *	RL4: encrypted blocks 13 and 14
+	 *	RR4: encrypted blocks 15 and 16
 	 */
 
 	pushq %rbp;
 	pushq %rbx;
-	pushq %rcx;
 
 	vmovdqa .Lbswap_mask, RKM;
 	vmovd .Lfirst_mask, R1ST;
 	vmovd .L32_mask, R32;
 	enc_preload_rkr();
 
-	leaq 1*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+	inpack_blocks(RL4, RR4, RTMP, RX, RKM);
 
 	round(RL, RR, 0, 1);
 	round(RR, RL, 1, 2);
@@ -276,44 +271,41 @@ __cast5_enc_blk_16way:
 	round(RR, RL, 15, 1);
 
 __skip_enc:
-	popq %rcx;
 	popq %rbx;
 	popq %rbp;
 
 	vmovdqa .Lbswap_mask, RKM;
-	leaq 1*(2*4*4)(%r11), %rax;
 
-	testb %cl, %cl;
-	jnz __enc_xor16;
-
-	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
-
-	ret;
-
-__enc_xor16:
-	outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
-	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%r11), %rax;
-	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%r11), %rax;
-	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
 
 	ret;
 
 .align 16
-.global cast5_dec_blk_16way
-.type   cast5_dec_blk_16way,@function;
+.type   __cast5_dec_blk16,@function;
 
-cast5_dec_blk_16way:
+__cast5_dec_blk16:
 	/* input:
 	 *	%rdi: ctx, CTX
-	 *	%rsi: dst
-	 *	%rdx: src
+	 *	RL1: encrypted blocks 1 and 2
+	 *	RR1: encrypted blocks 3 and 4
+	 *	RL2: encrypted blocks 5 and 6
+	 *	RR2: encrypted blocks 7 and 8
+	 *	RL3: encrypted blocks 9 and 10
+	 *	RR3: encrypted blocks 11 and 12
+	 *	RL4: encrypted blocks 13 and 14
+	 *	RR4: encrypted blocks 15 and 16
+	 * output:
+	 *	RL1: decrypted blocks 1 and 2
+	 *	RR1: decrypted blocks 3 and 4
+	 *	RL2: decrypted blocks 5 and 6
+	 *	RR2: decrypted blocks 7 and 8
+	 *	RL3: decrypted blocks 9 and 10
+	 *	RR3: decrypted blocks 11 and 12
+	 *	RL4: decrypted blocks 13 and 14
+	 *	RR4: decrypted blocks 15 and 16
 	 */
 
 	pushq %rbp;
@@ -324,15 +316,10 @@ cast5_dec_blk_16way:
 	vmovd .L32_mask, R32;
 	dec_preload_rkr();
 
-	leaq 1*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
-
-	movq %rsi, %r11;
+	inpack_blocks(RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(RL2, RR2, RTMP, RX, RKM);
+	inpack_blocks(RL3, RR3, RTMP, RX, RKM);
+	inpack_blocks(RL4, RR4, RTMP, RX, RKM);
 
 	movzbl rr(CTX), %eax;
 	testl %eax, %eax;
@@ -361,16 +348,211 @@ __dec_tail:
 	popq %rbx;
 	popq %rbp;
 
-	leaq 1*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
-	leaq 2*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
-	leaq 3*(2*4*4)(%r11), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
+	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
+	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
+	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
 
 	ret;
 
 __skip_dec:
 	vpsrldq $4, RKR, RKR;
 	jmp __dec_tail;
+
+.align 16
+.global cast5_ecb_enc_16way
+.type   cast5_ecb_enc_16way,@function;
+
+cast5_ecb_enc_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	vmovdqu (0*4*4)(%rdx), RL1;
+	vmovdqu (1*4*4)(%rdx), RR1;
+	vmovdqu (2*4*4)(%rdx), RL2;
+	vmovdqu (3*4*4)(%rdx), RR2;
+	vmovdqu (4*4*4)(%rdx), RL3;
+	vmovdqu (5*4*4)(%rdx), RR3;
+	vmovdqu (6*4*4)(%rdx), RL4;
+	vmovdqu (7*4*4)(%rdx), RR4;
+
+	call __cast5_enc_blk16;
+
+	vmovdqu RR1, (0*4*4)(%r11);
+	vmovdqu RL1, (1*4*4)(%r11);
+	vmovdqu RR2, (2*4*4)(%r11);
+	vmovdqu RL2, (3*4*4)(%r11);
+	vmovdqu RR3, (4*4*4)(%r11);
+	vmovdqu RL3, (5*4*4)(%r11);
+	vmovdqu RR4, (6*4*4)(%r11);
+	vmovdqu RL4, (7*4*4)(%r11);
+
+	ret;
+
+.align 16
+.global cast5_ecb_dec_16way
+.type   cast5_ecb_dec_16way,@function;
+
+cast5_ecb_dec_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	movq %rsi, %r11;
+
+	vmovdqu (0*4*4)(%rdx), RL1;
+	vmovdqu (1*4*4)(%rdx), RR1;
+	vmovdqu (2*4*4)(%rdx), RL2;
+	vmovdqu (3*4*4)(%rdx), RR2;
+	vmovdqu (4*4*4)(%rdx), RL3;
+	vmovdqu (5*4*4)(%rdx), RR3;
+	vmovdqu (6*4*4)(%rdx), RL4;
+	vmovdqu (7*4*4)(%rdx), RR4;
+
+	call __cast5_dec_blk16;
+
+	vmovdqu RR1, (0*4*4)(%r11);
+	vmovdqu RL1, (1*4*4)(%r11);
+	vmovdqu RR2, (2*4*4)(%r11);
+	vmovdqu RL2, (3*4*4)(%r11);
+	vmovdqu RR3, (4*4*4)(%r11);
+	vmovdqu RL3, (5*4*4)(%r11);
+	vmovdqu RR4, (6*4*4)(%r11);
+	vmovdqu RL4, (7*4*4)(%r11);
+
+	ret;
+
+.align 16
+.global cast5_cbc_dec_16way
+.type   cast5_cbc_dec_16way,@function;
+
+cast5_cbc_dec_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	vmovdqu (0*16)(%rdx), RL1;
+	vmovdqu (1*16)(%rdx), RR1;
+	vmovdqu (2*16)(%rdx), RL2;
+	vmovdqu (3*16)(%rdx), RR2;
+	vmovdqu (4*16)(%rdx), RL3;
+	vmovdqu (5*16)(%rdx), RR3;
+	vmovdqu (6*16)(%rdx), RL4;
+	vmovdqu (7*16)(%rdx), RR4;
+
+	call __cast5_dec_blk16;
+
+	/* xor with src */
+	vmovq (%r12), RX;
+	vpshufd $0x4f, RX, RX;
+	vpxor RX, RR1, RR1;
+	vpxor 0*16+8(%r12), RL1, RL1;
+	vpxor 1*16+8(%r12), RR2, RR2;
+	vpxor 2*16+8(%r12), RL2, RL2;
+	vpxor 3*16+8(%r12), RR3, RR3;
+	vpxor 4*16+8(%r12), RL3, RL3;
+	vpxor 5*16+8(%r12), RR4, RR4;
+	vpxor 6*16+8(%r12), RL4, RL4;
+
+	vmovdqu RR1, (0*16)(%r11);
+	vmovdqu RL1, (1*16)(%r11);
+	vmovdqu RR2, (2*16)(%r11);
+	vmovdqu RL2, (3*16)(%r11);
+	vmovdqu RR3, (4*16)(%r11);
+	vmovdqu RL3, (5*16)(%r11);
+	vmovdqu RR4, (6*16)(%r11);
+	vmovdqu RL4, (7*16)(%r11);
+
+	popq %r12;
+
+	ret;
+
+.align 16
+.global cast5_ctr_16way
+.type   cast5_ctr_16way,@function;
+
+cast5_ctr_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (big endian, 64bit)
+	 */
+
+	pushq %r12;
+
+	movq %rsi, %r11;
+	movq %rdx, %r12;
+
+	vpcmpeqd RTMP, RTMP, RTMP;
+	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
+
+	vpcmpeqd RKR, RKR, RKR;
+	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
+	vmovdqa .Lbswap_iv_mask, R1ST;
+	vmovdqa .Lbswap128_mask, RKM;
+
+	/* load IV and byteswap */
+	vmovq (%rcx), RX;
+	vpshufb R1ST, RX, RX;
+
+	/* construct IVs */
+	vpsubq RTMP, RX, RX;  /* le: IV1, IV0 */
+	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
+	vpsubq RKR, RX, RX;
+	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
+
+	/* store last IV */
+	vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
+	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
+	vmovq RX, (%rcx);
+
+	call __cast5_enc_blk16;
+
+	/* dst = src ^ iv */
+	vpxor (0*16)(%r12), RR1, RR1;
+	vpxor (1*16)(%r12), RL1, RL1;
+	vpxor (2*16)(%r12), RR2, RR2;
+	vpxor (3*16)(%r12), RL2, RL2;
+	vpxor (4*16)(%r12), RR3, RR3;
+	vpxor (5*16)(%r12), RL3, RL3;
+	vpxor (6*16)(%r12), RR4, RR4;
+	vpxor (7*16)(%r12), RL4, RL4;
+	vmovdqu RR1, (0*16)(%r11);
+	vmovdqu RL1, (1*16)(%r11);
+	vmovdqu RR2, (2*16)(%r11);
+	vmovdqu RL2, (3*16)(%r11);
+	vmovdqu RR3, (4*16)(%r11);
+	vmovdqu RL3, (5*16)(%r11);
+	vmovdqu RR4, (6*16)(%r11);
+	vmovdqu RL4, (7*16)(%r11);
+
+	popq %r12;
+
+	ret;
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index e0ea14f9547f..c6631813dc11 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -37,29 +37,14 @@
 
 #define CAST5_PARALLEL_BLOCKS 16
 
-asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src, bool xor);
-asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
+asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
 				    const u8 *src);
-
-static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__cast5_enc_blk_16way(ctx, dst, src, false);
-}
-
-static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
-					  const u8 *src)
-{
-	__cast5_enc_blk_16way(ctx, dst, src, true);
-}
-
-static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	cast5_dec_blk_16way(ctx, dst, src);
-}
-
+asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+				    const u8 *src);
+asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
+				    const u8 *src);
+asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
+				__be64 *iv);
 
 static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
 {
@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 	struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 	const unsigned int bsize = CAST5_BLOCK_SIZE;
 	unsigned int nbytes;
+	void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
 	int err;
 
+	fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
+
 	err = blkcipher_walk_virt(desc, walk);
 	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 
@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 		/* Process multi-block batch */
 		if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
 			do {
-				if (enc)
-					cast5_enc_blk_xway(ctx, wdst, wsrc);
-				else
-					cast5_dec_blk_xway(ctx, wdst, wsrc);
+				fn(ctx, wdst, wsrc);
 
 				wsrc += bsize * CAST5_PARALLEL_BLOCKS;
 				wdst += bsize * CAST5_PARALLEL_BLOCKS;
@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
 				goto done;
 		}
 
+		fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
+
 		/* Handle leftovers */
 		do {
-			if (enc)
-				__cast5_encrypt(ctx, wdst, wsrc);
-			else
-				__cast5_decrypt(ctx, wdst, wsrc);
+			fn(ctx, wdst, wsrc);
 
 			wsrc += bsize;
 			wdst += bsize;
@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
 	unsigned int nbytes = walk->nbytes;
 	u64 *src = (u64 *)walk->src.virt.addr;
 	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
 	u64 last_iv;
-	int i;
 
 	/* Start of the last block. */
 	src += nbytes / bsize - 1;
@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
 			src -= CAST5_PARALLEL_BLOCKS - 1;
 			dst -= CAST5_PARALLEL_BLOCKS - 1;
 
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
-				ivs[i] = src[i];
-
-			cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
-				*(dst + (i + 1)) ^= *(ivs + i);
+			cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
 
 			nbytes -= bsize;
 			if (nbytes < bsize)
@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 	unsigned int nbytes = walk->nbytes;
 	u64 *src = (u64 *)walk->src.virt.addr;
 	u64 *dst = (u64 *)walk->dst.virt.addr;
-	u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
-	__be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
-	int i;
 
 	/* Process multi-block batch */
 	if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
 		do {
-			/* create ctrblks for parallel encrypt */
-			for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) {
-				if (dst != src)
-					dst[i] = src[i];
-
-				ctrblocks[i] = cpu_to_be64(ctrblk++);
-			}
-
-			cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
-					       (u8 *)ctrblocks);
+			cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
+					(__be64 *)walk->iv);
 
 			src += CAST5_PARALLEL_BLOCKS;
 			dst += CAST5_PARALLEL_BLOCKS;
@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 
 	/* Handle leftovers */
 	do {
+		u64 ctrblk;
+
 		if (dst != src)
 			*dst = *src;
 
-		ctrblocks[0] = cpu_to_be64(ctrblk++);
+		ctrblk = *(u64 *)walk->iv;
+		be64_add_cpu((__be64 *)walk->iv, 1);
 
-		__cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-		*dst ^= ctrblocks[0];
+		__cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
+		*dst ^= ctrblk;
 
 		src += 1;
 		dst += 1;
@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 	} while (nbytes >= bsize);
 
 done:
-	*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
 	return nbytes;
 }
 
-- 
cgit v1.2.3-58-ga151


From cf582ccedad02eb9bfdcdb25adfc800dd117b428 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 26 Oct 2012 14:48:56 +0300
Subject: crypto: camellia-x86_64 - share common functions and move structures
 and function definitions to header file

Prepare camellia-x86_64 functions to be reused from AVX/AESNI implementation
module.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/camellia_glue.c        | 80 ++++++++++-----------------------
 arch/x86/include/asm/crypto/camellia.h | 82 ++++++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+), 57 deletions(-)
 create mode 100644 arch/x86/include/asm/crypto/camellia.h

(limited to 'arch')

diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 021a0086186b..5cb86ccd4acb 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -32,53 +32,24 @@
 #include <crypto/algapi.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
+#include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
 
-#define CAMELLIA_MIN_KEY_SIZE	16
-#define CAMELLIA_MAX_KEY_SIZE	32
-#define CAMELLIA_BLOCK_SIZE	16
-#define CAMELLIA_TABLE_BYTE_LEN	272
-
-struct camellia_ctx {
-	u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
-	u32 key_length;
-};
-
 /* regular block cipher functions */
 asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
 				   const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk);
 asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
 				 const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk);
 
 /* 2-way parallel cipher functions */
 asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
 asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 				      const u8 *src);
-
-static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
-				    const u8 *src)
-{
-	__camellia_enc_blk(ctx, dst, src, false);
-}
-
-static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__camellia_enc_blk(ctx, dst, src, true);
-}
-
-static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__camellia_enc_blk_2way(ctx, dst, src, false);
-}
-
-static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
-					     const u8 *src)
-{
-	__camellia_enc_blk_2way(ctx, dst, src, true);
-}
+EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
 
 static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -1275,9 +1246,8 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey)
 	camellia_setup256(kk, subkey);
 }
 
-static int __camellia_setkey(struct camellia_ctx *cctx,
-			     const unsigned char *key,
-			     unsigned int key_len, u32 *flags)
+int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
+		      unsigned int key_len, u32 *flags)
 {
 	if (key_len != 16 && key_len != 24 && key_len != 32) {
 		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
@@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__camellia_setkey);
 
 static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
 			   unsigned int key_len)
@@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
 				 &tfm->crt_flags);
 }
 
-static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
+void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
 {
 	u128 iv = *src;
 
@@ -1316,8 +1287,9 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
 
 	u128_xor(&dst[1], &dst[1], &iv);
 }
+EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
 
-static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
 
@@ -1329,9 +1301,9 @@ static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 
 	camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
 }
+EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
 
-static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
-				    le128 *iv)
+void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblks[2];
 
@@ -1347,6 +1319,7 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
 
 	camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
 }
+EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
 
 static const struct common_glue_ctx camellia_enc = {
 	.num_funcs = 2,
@@ -1464,13 +1437,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		camellia_dec_blk(ctx, srcdst, srcdst);
 }
 
-struct camellia_lrw_ctx {
-	struct lrw_table_ctx lrw_table;
-	struct camellia_ctx camellia_ctx;
-};
-
-static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+			unsigned int keylen)
 {
 	struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 	int err;
@@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
 	return lrw_init_table(&ctx->lrw_table,
 			      key + keylen - CAMELLIA_BLOCK_SIZE);
 }
+EXPORT_SYMBOL_GPL(lrw_camellia_setkey);
 
 static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -1519,20 +1488,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return lrw_crypt(desc, dst, src, nbytes, &req);
 }
 
-static void lrw_exit_tfm(struct crypto_tfm *tfm)
+void lrw_camellia_exit_tfm(struct crypto_tfm *tfm)
 {
 	struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	lrw_free_table(&ctx->lrw_table);
 }
+EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm);
 
-struct camellia_xts_ctx {
-	struct camellia_ctx tweak_ctx;
-	struct camellia_ctx crypt_ctx;
-};
-
-static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
-			      unsigned int keylen)
+int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+			unsigned int keylen)
 {
 	struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);
 	u32 *flags = &tfm->crt_flags;
@@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
 	return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
 				flags);
 }
+EXPORT_SYMBOL_GPL(xts_camellia_setkey);
 
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
@@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { {
 	.cra_alignmask		= 0,
 	.cra_type		= &crypto_blkcipher_type,
 	.cra_module		= THIS_MODULE,
-	.cra_exit		= lrw_exit_tfm,
+	.cra_exit		= lrw_camellia_exit_tfm,
 	.cra_u = {
 		.blkcipher = {
 			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h
new file mode 100644
index 000000000000..98038add801e
--- /dev/null
+++ b/arch/x86/include/asm/crypto/camellia.h
@@ -0,0 +1,82 @@
+#ifndef ASM_X86_CAMELLIA_H
+#define ASM_X86_CAMELLIA_H
+
+#include <linux/kernel.h>
+#include <linux/crypto.h>
+
+#define CAMELLIA_MIN_KEY_SIZE	16
+#define CAMELLIA_MAX_KEY_SIZE	32
+#define CAMELLIA_BLOCK_SIZE	16
+#define CAMELLIA_TABLE_BYTE_LEN	272
+#define CAMELLIA_PARALLEL_BLOCKS 2
+
+struct camellia_ctx {
+	u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
+	u32 key_length;
+};
+
+struct camellia_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct camellia_ctx camellia_ctx;
+};
+
+struct camellia_xts_ctx {
+	struct camellia_ctx tweak_ctx;
+	struct camellia_ctx crypt_ctx;
+};
+
+extern int __camellia_setkey(struct camellia_ctx *cctx,
+			     const unsigned char *key,
+			     unsigned int key_len, u32 *flags);
+
+extern int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+			       unsigned int keylen);
+extern void lrw_camellia_exit_tfm(struct crypto_tfm *tfm);
+
+extern int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
+			       unsigned int keylen);
+
+/* regular block cipher functions */
+asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, bool xor);
+asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
+				 const u8 *src);
+
+/* 2-way parallel cipher functions */
+asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+					const u8 *src, bool xor);
+asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+				      const u8 *src);
+
+static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
+				    const u8 *src)
+{
+	__camellia_enc_blk(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__camellia_enc_blk(ctx, dst, src, true);
+}
+
+static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	__camellia_enc_blk_2way(ctx, dst, src, false);
+}
+
+static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
+					     const u8 *src)
+{
+	__camellia_enc_blk_2way(ctx, dst, src, true);
+}
+
+/* glue helpers */
+extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src);
+extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
+			       le128 *iv);
+extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
+				    le128 *iv);
+
+#endif /* ASM_X86_CAMELLIA_H */
-- 
cgit v1.2.3-58-ga151


From d9b1d2e7e10d2e926775b1d3da39da0f51491e54 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Fri, 26 Oct 2012 14:49:01 +0300
Subject: crypto: camellia - add AES-NI/AVX/x86_64 assembler implementation of
 camellia cipher

This patch adds AES-NI/AVX/x86_64 assembler implementation of Camellia block
cipher. Implementation process data in sixteen block chunks, which are
byte-sliced and AES SubBytes is reused for Camellia s-box with help of pre-
and post-filtering.

Patch has been tested with tcrypt and automated filesystem tests.

tcrypt test results:

Intel Core i5-2450M:

camellia-aesni-avx vs camellia-asm-x86_64-2way:
128bit key:                                             (lrw:256bit)    (xts:256bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.98x   0.96x   0.99x   0.96x   0.96x   0.95x   0.95x   0.94x   0.97x   0.98x
64B     0.99x   0.98x   1.00x   0.98x   0.98x   0.99x   0.98x   0.93x   0.99x   0.98x
256B    2.28x   2.28x   1.01x   2.29x   2.25x   2.24x   1.96x   1.97x   1.91x   1.90x
1024B   2.57x   2.56x   1.00x   2.57x   2.51x   2.53x   2.19x   2.17x   2.19x   2.22x
8192B   2.49x   2.49x   1.00x   2.53x   2.48x   2.49x   2.17x   2.17x   2.22x   2.22x

256bit key:                                             (lrw:384bit)    (xts:512bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.97x   0.98x   0.99x   0.97x   0.97x   0.96x   0.97x   0.98x   0.98x   0.99x
64B     1.00x   1.00x   1.01x   0.99x   0.98x   0.99x   0.99x   0.99x   0.99x   0.99x
256B    2.37x   2.37x   1.01x   2.39x   2.35x   2.33x   2.10x   2.11x   1.99x   2.02x
1024B   2.58x   2.60x   1.00x   2.58x   2.56x   2.56x   2.28x   2.29x   2.28x   2.29x
8192B   2.50x   2.52x   1.00x   2.56x   2.51x   2.51x   2.24x   2.25x   2.26x   2.29x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                    |    3 +
 arch/x86/crypto/camellia-aesni-avx-asm_64.S | 1102 +++++++++++++++++++++++++++
 arch/x86/crypto/camellia_aesni_avx_glue.c   |  558 ++++++++++++++
 crypto/Kconfig                              |   22 +
 crypto/testmgr.c                            |   62 ++
 5 files changed, 1747 insertions(+)
 create mode 100644 arch/x86/crypto/camellia-aesni-avx-asm_64.S
 create mode 100644 arch/x86/crypto/camellia_aesni_avx_glue.c

(limited to 'arch')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 84d7dbaba26e..e0ca7c9ac383 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+			       camellia_aesni_avx_glue.o
 cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
 cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..2306d2e4816f
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -0,0 +1,1102 @@
+/*
+ * x86_64/AVX/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+/*
+ * Version licensed under 2-clause BSD License is available at:
+ *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
+ */
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+
+/**********************************************************************
+  16-way camellia
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+	vpand x, mask4bit, tmp0; \
+	vpandn x, mask4bit, x; \
+	vpsrld $4, x, x; \
+	\
+	vpshufb tmp0, lo_t, tmp0; \
+	vpshufb x, hi_t, x; \
+	vpxor tmp0, x, x;
+
+/*
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+		  t7, mem_cd, key) \
+	/* \
+	 * S-function with AES subbytes \
+	 */ \
+	vmovdqa .Linv_shift_row, t4; \
+	vbroadcastss .L0f0f0f0f, t7; \
+	vmovdqa .Lpre_tf_lo_s1, t0; \
+	vmovdqa .Lpre_tf_hi_s1, t1; \
+	\
+	/* AES inverse shift rows */ \
+	vpshufb t4, x0, x0; \
+	vpshufb t4, x7, x7; \
+	vpshufb t4, x1, x1; \
+	vpshufb t4, x4, x4; \
+	vpshufb t4, x2, x2; \
+	vpshufb t4, x5, x5; \
+	vpshufb t4, x3, x3; \
+	vpshufb t4, x6, x6; \
+	\
+	/* prefilter sboxes 1, 2 and 3 */ \
+	vmovdqa .Lpre_tf_lo_s4, t2; \
+	vmovdqa .Lpre_tf_hi_s4, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x1, t0, t1, t7, t6); \
+	filter_8bit(x4, t0, t1, t7, t6); \
+	filter_8bit(x2, t0, t1, t7, t6); \
+	filter_8bit(x5, t0, t1, t7, t6); \
+	\
+	/* prefilter sbox 4 */ \
+	vpxor t4, t4, t4; \
+	filter_8bit(x3, t2, t3, t7, t6); \
+	filter_8bit(x6, t2, t3, t7, t6); \
+	\
+	/* AES subbytes + AES shift rows */ \
+	vmovdqa .Lpost_tf_lo_s1, t0; \
+	vmovdqa .Lpost_tf_hi_s1, t1; \
+	vaesenclast t4, x0, x0; \
+	vaesenclast t4, x7, x7; \
+	vaesenclast t4, x1, x1; \
+	vaesenclast t4, x4, x4; \
+	vaesenclast t4, x2, x2; \
+	vaesenclast t4, x5, x5; \
+	vaesenclast t4, x3, x3; \
+	vaesenclast t4, x6, x6; \
+	\
+	/* postfilter sboxes 1 and 4 */ \
+	vmovdqa .Lpost_tf_lo_s3, t2; \
+	vmovdqa .Lpost_tf_hi_s3, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x3, t0, t1, t7, t6); \
+	filter_8bit(x6, t0, t1, t7, t6); \
+	\
+	/* postfilter sbox 3 */ \
+	vmovdqa .Lpost_tf_lo_s2, t4; \
+	vmovdqa .Lpost_tf_hi_s2, t5; \
+	filter_8bit(x2, t2, t3, t7, t6); \
+	filter_8bit(x5, t2, t3, t7, t6); \
+	\
+	vpxor t6, t6, t6; \
+	vmovq key, t0; \
+	\
+	/* postfilter sbox 2 */ \
+	filter_8bit(x1, t4, t5, t7, t2); \
+	filter_8bit(x4, t4, t5, t7, t2); \
+	\
+	vpsrldq $5, t0, t5; \
+	vpsrldq $1, t0, t1; \
+	vpsrldq $2, t0, t2; \
+	vpsrldq $3, t0, t3; \
+	vpsrldq $4, t0, t4; \
+	vpshufb t6, t0, t0; \
+	vpshufb t6, t1, t1; \
+	vpshufb t6, t2, t2; \
+	vpshufb t6, t3, t3; \
+	vpshufb t6, t4, t4; \
+	vpsrldq $2, t5, t7; \
+	vpshufb t6, t7, t7; \
+	\
+	/* \
+	 * P-function \
+	 */ \
+	vpxor x5, x0, x0; \
+	vpxor x6, x1, x1; \
+	vpxor x7, x2, x2; \
+	vpxor x4, x3, x3; \
+	\
+	vpxor x2, x4, x4; \
+	vpxor x3, x5, x5; \
+	vpxor x0, x6, x6; \
+	vpxor x1, x7, x7; \
+	\
+	vpxor x7, x0, x0; \
+	vpxor x4, x1, x1; \
+	vpxor x5, x2, x2; \
+	vpxor x6, x3, x3; \
+	\
+	vpxor x3, x4, x4; \
+	vpxor x0, x5, x5; \
+	vpxor x1, x6, x6; \
+	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+	\
+	/* \
+	 * Add key material and result to CD (x becomes new CD) \
+	 */ \
+	\
+	vpxor t3, x4, x4; \
+	vpxor 0 * 16(mem_cd), x4, x4; \
+	\
+	vpxor t2, x5, x5; \
+	vpxor 1 * 16(mem_cd), x5, x5; \
+	\
+	vpsrldq $1, t5, t3; \
+	vpshufb t6, t5, t5; \
+	vpshufb t6, t3, t6; \
+	\
+	vpxor t1, x6, x6; \
+	vpxor 2 * 16(mem_cd), x6, x6; \
+	\
+	vpxor t0, x7, x7; \
+	vpxor 3 * 16(mem_cd), x7, x7; \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 16(mem_cd), x0, x0; \
+	\
+	vpxor t6, x1, x1; \
+	vpxor 5 * 16(mem_cd), x1, x1; \
+	\
+	vpxor t5, x2, x2; \
+	vpxor 6 * 16(mem_cd), x2, x2; \
+	\
+	vpxor t4, x3, x3; \
+	vpxor 7 * 16(mem_cd), x3, x3;
+
+/*
+ * Size optimization... with inlined roundsm16, binary would be over 5 times
+ * larger and would only be 0.5% faster (on sandy-bridge).
+ */
+.align 8
+roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
+	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
+		  %rcx, (%r9));
+	ret;
+
+.align 8
+roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
+	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
+		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
+		  %rax, (%r9));
+	ret;
+
+/*
+ * IN/OUT:
+ *  x0..x7: byte-sliced AB state preloaded
+ *  mem_ab: byte-sliced AB state in memory
+ *  mem_cb: byte-sliced CD state in memory
+ */
+#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+	leaq (key_table + (i) * 8)(CTX), %r9; \
+	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+	\
+	vmovdqu x4, 0 * 16(mem_cd); \
+	vmovdqu x5, 1 * 16(mem_cd); \
+	vmovdqu x6, 2 * 16(mem_cd); \
+	vmovdqu x7, 3 * 16(mem_cd); \
+	vmovdqu x0, 4 * 16(mem_cd); \
+	vmovdqu x1, 5 * 16(mem_cd); \
+	vmovdqu x2, 6 * 16(mem_cd); \
+	vmovdqu x3, 7 * 16(mem_cd); \
+	\
+	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+	\
+	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+	/* Store new AB state */ \
+	vmovdqu x0, 0 * 16(mem_ab); \
+	vmovdqu x1, 1 * 16(mem_ab); \
+	vmovdqu x2, 2 * 16(mem_ab); \
+	vmovdqu x3, 3 * 16(mem_ab); \
+	vmovdqu x4, 4 * 16(mem_ab); \
+	vmovdqu x5, 5 * 16(mem_ab); \
+	vmovdqu x6, 6 * 16(mem_ab); \
+	vmovdqu x7, 7 * 16(mem_ab);
+
+#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ *  v0..3: byte-sliced 32-bit integers
+ * OUT:
+ *  v0..3: (IN <<< 1)
+ */
+#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
+	vpcmpgtb v0, zero, t0; \
+	vpaddb v0, v0, v0; \
+	vpabsb t0, t0; \
+	\
+	vpcmpgtb v1, zero, t1; \
+	vpaddb v1, v1, v1; \
+	vpabsb t1, t1; \
+	\
+	vpcmpgtb v2, zero, t2; \
+	vpaddb v2, v2, v2; \
+	vpabsb t2, t2; \
+	\
+	vpor t0, v1, v1; \
+	\
+	vpcmpgtb v3, zero, t0; \
+	vpaddb v3, v3, v3; \
+	vpabsb t0, t0; \
+	\
+	vpor t1, v2, v2; \
+	vpor t2, v3, v3; \
+	vpor t0, v0, v0;
+
+/*
+ * IN:
+ *   r: byte-sliced AB state in memory
+ *   l: byte-sliced CD state in memory
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+	      tt1, tt2, tt3, kll, klr, krl, krr) \
+	/* \
+	 * t0 = kll; \
+	 * t0 &= ll; \
+	 * lr ^= rol32(t0, 1); \
+	 */ \
+	vpxor tt0, tt0, tt0; \
+	vmovd kll, t0; \
+	vpshufb tt0, t0, t3; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t2; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t1; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t0; \
+	\
+	vpand l0, t0, t0; \
+	vpand l1, t1, t1; \
+	vpand l2, t2, t2; \
+	vpand l3, t3, t3; \
+	\
+	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor l4, t0, l4; \
+	vmovdqu l4, 4 * 16(l); \
+	vpxor l5, t1, l5; \
+	vmovdqu l5, 5 * 16(l); \
+	vpxor l6, t2, l6; \
+	vmovdqu l6, 6 * 16(l); \
+	vpxor l7, t3, l7; \
+	vmovdqu l7, 7 * 16(l); \
+	\
+	/* \
+	 * t2 = krr; \
+	 * t2 |= rr; \
+	 * rl ^= t2; \
+	 */ \
+	\
+	vmovd krr, t0; \
+	vpshufb tt0, t0, t3; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t2; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t1; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t0; \
+	\
+	vpor 4 * 16(r), t0, t0; \
+	vpor 5 * 16(r), t1, t1; \
+	vpor 6 * 16(r), t2, t2; \
+	vpor 7 * 16(r), t3, t3; \
+	\
+	vpxor 0 * 16(r), t0, t0; \
+	vpxor 1 * 16(r), t1, t1; \
+	vpxor 2 * 16(r), t2, t2; \
+	vpxor 3 * 16(r), t3, t3; \
+	vmovdqu t0, 0 * 16(r); \
+	vmovdqu t1, 1 * 16(r); \
+	vmovdqu t2, 2 * 16(r); \
+	vmovdqu t3, 3 * 16(r); \
+	\
+	/* \
+	 * t2 = krl; \
+	 * t2 &= rl; \
+	 * rr ^= rol32(t2, 1); \
+	 */ \
+	vmovd krl, t0; \
+	vpshufb tt0, t0, t3; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t2; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t1; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t0; \
+	\
+	vpand 0 * 16(r), t0, t0; \
+	vpand 1 * 16(r), t1, t1; \
+	vpand 2 * 16(r), t2, t2; \
+	vpand 3 * 16(r), t3, t3; \
+	\
+	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor 4 * 16(r), t0, t0; \
+	vpxor 5 * 16(r), t1, t1; \
+	vpxor 6 * 16(r), t2, t2; \
+	vpxor 7 * 16(r), t3, t3; \
+	vmovdqu t0, 4 * 16(r); \
+	vmovdqu t1, 5 * 16(r); \
+	vmovdqu t2, 6 * 16(r); \
+	vmovdqu t3, 7 * 16(r); \
+	\
+	/* \
+	 * t0 = klr; \
+	 * t0 |= lr; \
+	 * ll ^= t0; \
+	 */ \
+	\
+	vmovd klr, t0; \
+	vpshufb tt0, t0, t3; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t2; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t1; \
+	vpsrldq $1, t0, t0; \
+	vpshufb tt0, t0, t0; \
+	\
+	vpor l4, t0, t0; \
+	vpor l5, t1, t1; \
+	vpor l6, t2, t2; \
+	vpor l7, t3, t3; \
+	\
+	vpxor l0, t0, l0; \
+	vmovdqu l0, 0 * 16(l); \
+	vpxor l1, t1, l1; \
+	vmovdqu l1, 1 * 16(l); \
+	vpxor l2, t2, l2; \
+	vmovdqu l2, 2 * 16(l); \
+	vpxor l3, t3, l3; \
+	vmovdqu l3, 3 * 16(l);
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1, x0, x1; \
+	vpunpcklqdq t1, x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
+			 b3, c3, d3, st0, st1) \
+	vmovdqu d2, st0; \
+	vmovdqu d3, st1; \
+	transpose_4x4(a0, a1, a2, a3, d2, d3); \
+	transpose_4x4(b0, b1, b2, b3, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu a0, st0; \
+	vmovdqu a1, st1; \
+	transpose_4x4(c0, c1, c2, c3, a0, a1); \
+	transpose_4x4(d0, d1, d2, d3, a0, a1); \
+	\
+	vmovdqu .Lshufb_16x16b, a0; \
+	vmovdqu st1, a1; \
+	vpshufb a0, a2, a2; \
+	vpshufb a0, a3, a3; \
+	vpshufb a0, b0, b0; \
+	vpshufb a0, b1, b1; \
+	vpshufb a0, b2, b2; \
+	vpshufb a0, b3, b3; \
+	vpshufb a0, a1, a1; \
+	vpshufb a0, c0, c0; \
+	vpshufb a0, c1, c1; \
+	vpshufb a0, c2, c2; \
+	vpshufb a0, c3, c3; \
+	vpshufb a0, d0, d0; \
+	vpshufb a0, d1, d1; \
+	vpshufb a0, d2, d2; \
+	vpshufb a0, d3, d3; \
+	vmovdqu d3, st1; \
+	vmovdqu st0, d3; \
+	vpshufb a0, d3, a0; \
+	vmovdqu d2, st0; \
+	\
+	transpose_4x4(a0, b0, c0, d0, d2, d3); \
+	transpose_4x4(a1, b1, c1, d1, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu b0, st0; \
+	vmovdqu b1, st1; \
+	transpose_4x4(a2, b2, c2, d2, b0, b1); \
+	transpose_4x4(a3, b3, c3, d3, b0, b1); \
+	vmovdqu st0, b0; \
+	vmovdqu st1, b1; \
+	/* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio, key) \
+	vmovq key, x0; \
+	vpshufb .Lpack_bswap, x0, x0; \
+	\
+	vpxor 0 * 16(rio), x0, y7; \
+	vpxor 1 * 16(rio), x0, y6; \
+	vpxor 2 * 16(rio), x0, y5; \
+	vpxor 3 * 16(rio), x0, y4; \
+	vpxor 4 * 16(rio), x0, y3; \
+	vpxor 5 * 16(rio), x0, y2; \
+	vpxor 6 * 16(rio), x0, y1; \
+	vpxor 7 * 16(rio), x0, y0; \
+	vpxor 8 * 16(rio), x0, x7; \
+	vpxor 9 * 16(rio), x0, x6; \
+	vpxor 10 * 16(rio), x0, x5; \
+	vpxor 11 * 16(rio), x0, x4; \
+	vpxor 12 * 16(rio), x0, x3; \
+	vpxor 13 * 16(rio), x0, x2; \
+	vpxor 14 * 16(rio), x0, x1; \
+	vpxor 15 * 16(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd) \
+	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+			 y5, y6, y7, (mem_ab), (mem_cd)); \
+	\
+	vmovdqu x0, 0 * 16(mem_ab); \
+	vmovdqu x1, 1 * 16(mem_ab); \
+	vmovdqu x2, 2 * 16(mem_ab); \
+	vmovdqu x3, 3 * 16(mem_ab); \
+	vmovdqu x4, 4 * 16(mem_ab); \
+	vmovdqu x5, 5 * 16(mem_ab); \
+	vmovdqu x6, 6 * 16(mem_ab); \
+	vmovdqu x7, 7 * 16(mem_ab); \
+	vmovdqu y0, 0 * 16(mem_cd); \
+	vmovdqu y1, 1 * 16(mem_cd); \
+	vmovdqu y2, 2 * 16(mem_cd); \
+	vmovdqu y3, 3 * 16(mem_cd); \
+	vmovdqu y4, 4 * 16(mem_cd); \
+	vmovdqu y5, 5 * 16(mem_cd); \
+	vmovdqu y6, 6 * 16(mem_cd); \
+	vmovdqu y7, 7 * 16(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
+			 y7, x3, x7, stack_tmp0, stack_tmp1); \
+	\
+	vmovdqu x0, stack_tmp0; \
+	\
+	vmovq key, x0; \
+	vpshufb .Lpack_bswap, x0, x0; \
+	\
+	vpxor x0, y7, y7; \
+	vpxor x0, y6, y6; \
+	vpxor x0, y5, y5; \
+	vpxor x0, y4, y4; \
+	vpxor x0, y3, y3; \
+	vpxor x0, y2, y2; \
+	vpxor x0, y1, y1; \
+	vpxor x0, y0, y0; \
+	vpxor x0, x7, x7; \
+	vpxor x0, x6, x6; \
+	vpxor x0, x5, x5; \
+	vpxor x0, x4, x4; \
+	vpxor x0, x3, x3; \
+	vpxor x0, x2, x2; \
+	vpxor x0, x1, x1; \
+	vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio) \
+	vmovdqu x0, 0 * 16(rio); \
+	vmovdqu x1, 1 * 16(rio); \
+	vmovdqu x2, 2 * 16(rio); \
+	vmovdqu x3, 3 * 16(rio); \
+	vmovdqu x4, 4 * 16(rio); \
+	vmovdqu x5, 5 * 16(rio); \
+	vmovdqu x6, 6 * 16(rio); \
+	vmovdqu x7, 7 * 16(rio); \
+	vmovdqu y0, 8 * 16(rio); \
+	vmovdqu y1, 9 * 16(rio); \
+	vmovdqu y2, 10 * 16(rio); \
+	vmovdqu y3, 11 * 16(rio); \
+	vmovdqu y4, 12 * 16(rio); \
+	vmovdqu y5, 13 * 16(rio); \
+	vmovdqu y6, 14 * 16(rio); \
+	vmovdqu y7, 15 * 16(rio);
+
+.data
+.align 16
+
+#define SHUFB_BYTES(idx) \
+	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
+
+.Lpack_bswap:
+	.long 0x00010203
+	.long 0x04050607
+	.long 0x80808080
+	.long 0x80808080
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianess(in)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianess(in <<< 1)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+/* 4-bit mask */
+.align 4
+.L0f0f0f0f:
+	.long 0x0f0f0f0f
+
+.text
+
+.align 8
+.type   __camellia_enc_blk16,@function;
+
+__camellia_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 256 bytes
+	 *	%xmm0..%xmm15: 16 plaintext blocks
+	 * output:
+	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
+	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 16(%rax), %rcx;
+
+	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		      %xmm15, %rax, %rcx);
+
+	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 0);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX),
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX));
+
+	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 8);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX),
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX));
+
+	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 16);
+
+	movl $24, %r8d;
+	cmpl $16, key_length(CTX);
+	jne .Lenc_max32;
+
+.Lenc_done:
+	/* load CD for output */
+	vmovdqu 0 * 16(%rcx), %xmm8;
+	vmovdqu 1 * 16(%rcx), %xmm9;
+	vmovdqu 2 * 16(%rcx), %xmm10;
+	vmovdqu 3 * 16(%rcx), %xmm11;
+	vmovdqu 4 * 16(%rcx), %xmm12;
+	vmovdqu 5 * 16(%rcx), %xmm13;
+	vmovdqu 6 * 16(%rcx), %xmm14;
+	vmovdqu 7 * 16(%rcx), %xmm15;
+
+	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
+
+	ret;
+
+.align 8
+.Lenc_max32:
+	movl $32, %r8d;
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX),
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX));
+
+	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 24);
+
+	jmp .Lenc_done;
+
+.align 8
+.type   __camellia_dec_blk16,@function;
+
+__camellia_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 256 bytes
+	 *	%r8d: 24 for 16 byte key, 32 for larger
+	 *	%xmm0..%xmm15: 16 encrypted blocks
+	 * output:
+	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
+	 *       7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 16(%rax), %rcx;
+
+	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		      %xmm15, %rax, %rcx);
+
+	cmpl $32, %r8d;
+	je .Ldec_max32;
+
+.Ldec_max24:
+	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 16);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX),
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX));
+
+	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 8);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX),
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX));
+
+	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 0);
+
+	/* load CD for output */
+	vmovdqu 0 * 16(%rcx), %xmm8;
+	vmovdqu 1 * 16(%rcx), %xmm9;
+	vmovdqu 2 * 16(%rcx), %xmm10;
+	vmovdqu 3 * 16(%rcx), %xmm11;
+	vmovdqu 4 * 16(%rcx), %xmm12;
+	vmovdqu 5 * 16(%rcx), %xmm13;
+	vmovdqu 6 * 16(%rcx), %xmm14;
+	vmovdqu 7 * 16(%rcx), %xmm15;
+
+	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
+
+	ret;
+
+.align 8
+.Ldec_max32:
+	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rax, %rcx, 24);
+
+	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+	      %xmm15,
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX),
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX));
+
+	jmp .Ldec_max24;
+
+.align 8
+.global camellia_ecb_enc_16way
+.type   camellia_ecb_enc_16way,@function;
+
+camellia_ecb_enc_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 */
+
+	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rdx, (key_table)(CTX));
+
+	/* now dst can be used as temporary buffer (even in src == dst case) */
+	movq	%rsi, %rax;
+
+	call __camellia_enc_blk16;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+
+.align 8
+.global camellia_ecb_dec_16way
+.type   camellia_ecb_dec_16way,@function;
+
+camellia_ecb_dec_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	/* now dst can be used as temporary buffer (even in src == dst case) */
+	movq	%rsi, %rax;
+
+	call __camellia_dec_blk16;
+
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+
+.align 8
+.global camellia_cbc_dec_16way
+.type   camellia_cbc_dec_16way,@function;
+
+camellia_cbc_dec_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
+		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
+		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	/*
+	 * dst might still be in-use (in case dst == src), so use stack for
+	 * temporary storage.
+	 */
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	call __camellia_dec_blk16;
+
+	addq $(16 * 16), %rsp;
+
+	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
+	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
+	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
+	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
+	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
+	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
+	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
+	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
+	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
+	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
+	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
+	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
+	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
+	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
+	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+.align 8
+.global camellia_ctr_16way
+.type   camellia_ctr_16way,@function;
+
+camellia_ctr_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	vmovdqa .Lbswap128_mask, %xmm14;
+
+	/* load IV and byteswap */
+	vmovdqu (%rcx), %xmm0;
+	vpshufb %xmm14, %xmm0, %xmm15;
+	vmovdqu %xmm15, 15 * 16(%rax);
+
+	vpcmpeqd %xmm15, %xmm15, %xmm15;
+	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
+
+	/* construct IVs */
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm13;
+	vmovdqu %xmm13, 14 * 16(%rax);
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm13;
+	vmovdqu %xmm13, 13 * 16(%rax);
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm12;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm11;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm10;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm9;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm8;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm7;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm6;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm5;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm4;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm3;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm2;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vpshufb %xmm14, %xmm0, %xmm1;
+	inc_le128(%xmm0, %xmm15, %xmm13);
+	vmovdqa %xmm0, %xmm13;
+	vpshufb %xmm14, %xmm0, %xmm0;
+	inc_le128(%xmm13, %xmm15, %xmm14);
+	vmovdqu %xmm13, (%rcx);
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX), %xmm15;
+	vpshufb .Lpack_bswap, %xmm15, %xmm15;
+	vpxor %xmm0, %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor 13 * 16(%rax), %xmm15, %xmm13;
+	vpxor 14 * 16(%rax), %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call __camellia_enc_blk16;
+
+	addq $(16 * 16), %rsp;
+
+	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
new file mode 100644
index 000000000000..96cbb6068fce
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -0,0 +1,558 @@
+/*
+ * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+
+/* 16-way AES-NI parallel cipher functions */
+asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+
+asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+static const struct common_glue_ctx camellia_enc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_ctr = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
+			      CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
+			      nbytes);
+}
+
+static inline void camellia_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+			   unsigned int key_len)
+{
+	return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
+				 &tfm->crt_flags);
+}
+
+struct crypt_priv {
+	struct camellia_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static struct crypto_alg cmll_algs[10] = { {
+	.cra_name		= "__ecb-camellia-aesni",
+	.cra_driver_name	= "__driver-ecb-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-camellia-aesni",
+	.cra_driver_name	= "__driver-cbc-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-camellia-aesni",
+	.cra_driver_name	= "__driver-ctr-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-camellia-aesni",
+	.cra_driver_name	= "__driver-lrw-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_exit		= lrw_camellia_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= lrw_camellia_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-camellia-aesni",
+	.cra_driver_name	= "__driver-xts-camellia-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= xts_camellia_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(camellia)",
+	.cra_driver_name	= "ecb-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(camellia)",
+	.cra_driver_name	= "cbc-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(camellia)",
+	.cra_driver_name	= "ctr-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(camellia)",
+	.cra_driver_name	= "lrw-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(camellia)",
+	.cra_driver_name	= "xts-camellia-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init camellia_aesni_init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
+		pr_info("AVX or AES-NI instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+	crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
+MODULE_ALIAS("camellia");
+MODULE_ALIAS("camellia-asm");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index b901b590635f..c226b2c7e47c 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -803,6 +803,28 @@ config CRYPTO_CAMELLIA_X86_64
 	  See also:
 	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
 
+config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
+	tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX)"
+	depends on X86 && 64BIT
+	depends on CRYPTO
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_CAMELLIA_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Camellia cipher algorithm module (x86_64/AES-NI/AVX).
+
+	  Camellia is a symmetric key block cipher developed jointly
+	  at NTT and Mitsubishi Electric Corporation.
+
+	  The Camellia specifies three key sizes: 128, 192 and 256 bits.
+
+	  See also:
+	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
+
 config CRYPTO_CAMELLIA_SPARC64
 	tristate "Camellia cipher algorithm (SPARC64)"
 	depends on SPARC64
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 879b61d436e9..3933241708c2 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1726,6 +1726,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "__driver-cbc-camellia-aesni",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
 	}, {
 		.alg = "__driver-cbc-cast5-avx",
 		.test = alg_test_null,
@@ -1817,6 +1832,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "__driver-ecb-camellia-aesni",
+		.test = alg_test_null,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
 	}, {
 		.alg = "__driver-ecb-cast5-avx",
 		.test = alg_test_null,
@@ -2142,6 +2172,22 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "cryptd(__driver-cbc-camellia-aesni)",
+		.test = alg_test_null,
+		.fips_allowed = 1,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
 	}, {
 		.alg = "cryptd(__driver-ecb-aes-aesni)",
 		.test = alg_test_null,
@@ -2158,6 +2204,22 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "cryptd(__driver-ecb-camellia-aesni)",
+		.test = alg_test_null,
+		.fips_allowed = 1,
+		.suite = {
+			.cipher = {
+				.enc = {
+					.vecs = NULL,
+					.count = 0
+				},
+				.dec = {
+					.vecs = NULL,
+					.count = 0
+				}
+			}
+		}
 	}, {
 		.alg = "cryptd(__driver-ecb-cast5-avx)",
 		.test = alg_test_null,
-- 
cgit v1.2.3-58-ga151


From 044ab5257806310a0150146df3b74b8adaa4ebcf Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Tue, 13 Nov 2012 11:43:14 +0200
Subject: crypto: cast5/cast6 - move lookup tables to shared module

CAST5 and CAST6 both use same lookup tables, which can be moved shared module
'cast_common'.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/cast5-avx-x86_64-asm_64.S |  16 +-
 arch/x86/crypto/cast6-avx-x86_64-asm_64.S |  16 +-
 crypto/Kconfig                            |  10 ++
 crypto/Makefile                           |   1 +
 crypto/cast5_generic.c                    | 277 +---------------------------
 crypto/cast6_generic.c                    | 280 +----------------------------
 crypto/cast_common.c                      | 290 ++++++++++++++++++++++++++++++
 include/crypto/cast5.h                    |   6 +-
 include/crypto/cast6.h                    |   6 +-
 include/crypto/cast_common.h              |   9 +
 10 files changed, 336 insertions(+), 575 deletions(-)
 create mode 100644 crypto/cast_common.c
 create mode 100644 include/crypto/cast_common.h

(limited to 'arch')

diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 12478e472368..15b00ac7cbd3 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -25,10 +25,10 @@
 
 .file "cast5-avx-x86_64-asm_64.S"
 
-.extern cast5_s1
-.extern cast5_s2
-.extern cast5_s3
-.extern cast5_s4
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
 
 /* structure of crypto context */
 #define km	0
@@ -36,10 +36,10 @@
 #define rr	((16*4)+16)
 
 /* s-boxes */
-#define s1	cast5_s1
-#define s2	cast5_s2
-#define s3	cast5_s3
-#define s4	cast5_s4
+#define s1	cast_s1
+#define s2	cast_s2
+#define s3	cast_s3
+#define s4	cast_s4
 
 /**********************************************************************
   16-way AVX cast5
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 83a53818f0a5..2569d0da841f 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -27,20 +27,20 @@
 
 .file "cast6-avx-x86_64-asm_64.S"
 
-.extern cast6_s1
-.extern cast6_s2
-.extern cast6_s3
-.extern cast6_s4
+.extern cast_s1
+.extern cast_s2
+.extern cast_s3
+.extern cast_s4
 
 /* structure of crypto context */
 #define km	0
 #define kr	(12*4*4)
 
 /* s-boxes */
-#define s1	cast6_s1
-#define s2	cast6_s2
-#define s3	cast6_s3
-#define s4	cast6_s4
+#define s1	cast_s1
+#define s2	cast_s2
+#define s3	cast_s3
+#define s4	cast_s4
 
 /**********************************************************************
   8-way AVX cast6
diff --git a/crypto/Kconfig b/crypto/Kconfig
index c226b2c7e47c..4641d95651d3 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -841,9 +841,16 @@ config CRYPTO_CAMELLIA_SPARC64
 	  See also:
 	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
 
+config CRYPTO_CAST_COMMON
+	tristate
+	help
+	  Common parts of the CAST cipher algorithms shared by the
+	  generic c and the assembler implementations.
+
 config CRYPTO_CAST5
 	tristate "CAST5 (CAST-128) cipher algorithm"
 	select CRYPTO_ALGAPI
+	select CRYPTO_CAST_COMMON
 	help
 	  The CAST5 encryption algorithm (synonymous with CAST-128) is
 	  described in RFC2144.
@@ -854,6 +861,7 @@ config CRYPTO_CAST5_AVX_X86_64
 	select CRYPTO_ALGAPI
 	select CRYPTO_CRYPTD
 	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_CAST_COMMON
 	select CRYPTO_CAST5
 	help
 	  The CAST5 encryption algorithm (synonymous with CAST-128) is
@@ -865,6 +873,7 @@ config CRYPTO_CAST5_AVX_X86_64
 config CRYPTO_CAST6
 	tristate "CAST6 (CAST-256) cipher algorithm"
 	select CRYPTO_ALGAPI
+	select CRYPTO_CAST_COMMON
 	help
 	  The CAST6 encryption algorithm (synonymous with CAST-256) is
 	  described in RFC2612.
@@ -876,6 +885,7 @@ config CRYPTO_CAST6_AVX_X86_64
 	select CRYPTO_CRYPTD
 	select CRYPTO_ABLK_HELPER_X86
 	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_CAST_COMMON
 	select CRYPTO_CAST6
 	select CRYPTO_LRW
 	select CRYPTO_XTS
diff --git a/crypto/Makefile b/crypto/Makefile
index 8cf61ffe3513..d59dec749804 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_CRYPTO_TWOFISH_COMMON) += twofish_common.o
 obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
 obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
 obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia_generic.o
+obj-$(CONFIG_CRYPTO_CAST_COMMON) += cast_common.o
 obj-$(CONFIG_CRYPTO_CAST5) += cast5_generic.o
 obj-$(CONFIG_CRYPTO_CAST6) += cast6_generic.o
 obj-$(CONFIG_CRYPTO_ARC4) += arc4.o
diff --git a/crypto/cast5_generic.c b/crypto/cast5_generic.c
index bc525dbd8a4b..5558f630a0eb 100644
--- a/crypto/cast5_generic.c
+++ b/crypto/cast5_generic.c
@@ -30,275 +30,6 @@
 #include <linux/types.h>
 #include <crypto/cast5.h>
 
-
-const u32 cast5_s1[256] = {
-	0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f,
-	0x9c004dd3, 0x6003e540, 0xcf9fc949,
-	0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0,
-	0x15c361d2, 0xc2e7661d, 0x22d4ff8e,
-	0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3,
-	0xdf2f8656, 0x887ca41a, 0xa2d2bd2d,
-	0xa1c9e0d6, 0x346c4819, 0x61b76d87, 0x22540f2f, 0x2abe32e1,
-	0xaa54166b, 0x22568e3a, 0xa2d341d0,
-	0x66db40c8, 0xa784392f, 0x004dff2f, 0x2db9d2de, 0x97943fac,
-	0x4a97c1d8, 0x527644b7, 0xb5f437a7,
-	0xb82cbaef, 0xd751d159, 0x6ff7f0ed, 0x5a097a1f, 0x827b68d0,
-	0x90ecf52e, 0x22b0c054, 0xbc8e5935,
-	0x4b6d2f7f, 0x50bb64a2, 0xd2664910, 0xbee5812d, 0xb7332290,
-	0xe93b159f, 0xb48ee411, 0x4bff345d,
-	0xfd45c240, 0xad31973f, 0xc4f6d02e, 0x55fc8165, 0xd5b1caad,
-	0xa1ac2dae, 0xa2d4b76d, 0xc19b0c50,
-	0x882240f2, 0x0c6e4f38, 0xa4e4bfd7, 0x4f5ba272, 0x564c1d2f,
-	0xc59c5319, 0xb949e354, 0xb04669fe,
-	0xb1b6ab8a, 0xc71358dd, 0x6385c545, 0x110f935d, 0x57538ad5,
-	0x6a390493, 0xe63d37e0, 0x2a54f6b3,
-	0x3a787d5f, 0x6276a0b5, 0x19a6fcdf, 0x7a42206a, 0x29f9d4d5,
-	0xf61b1891, 0xbb72275e, 0xaa508167,
-	0x38901091, 0xc6b505eb, 0x84c7cb8c, 0x2ad75a0f, 0x874a1427,
-	0xa2d1936b, 0x2ad286af, 0xaa56d291,
-	0xd7894360, 0x425c750d, 0x93b39e26, 0x187184c9, 0x6c00b32d,
-	0x73e2bb14, 0xa0bebc3c, 0x54623779,
-	0x64459eab, 0x3f328b82, 0x7718cf82, 0x59a2cea6, 0x04ee002e,
-	0x89fe78e6, 0x3fab0950, 0x325ff6c2,
-	0x81383f05, 0x6963c5c8, 0x76cb5ad6, 0xd49974c9, 0xca180dcf,
-	0x380782d5, 0xc7fa5cf6, 0x8ac31511,
-	0x35e79e13, 0x47da91d0, 0xf40f9086, 0xa7e2419e, 0x31366241,
-	0x051ef495, 0xaa573b04, 0x4a805d8d,
-	0x548300d0, 0x00322a3c, 0xbf64cddf, 0xba57a68e, 0x75c6372b,
-	0x50afd341, 0xa7c13275, 0x915a0bf5,
-	0x6b54bfab, 0x2b0b1426, 0xab4cc9d7, 0x449ccd82, 0xf7fbf265,
-	0xab85c5f3, 0x1b55db94, 0xaad4e324,
-	0xcfa4bd3f, 0x2deaa3e2, 0x9e204d02, 0xc8bd25ac, 0xeadf55b3,
-	0xd5bd9e98, 0xe31231b2, 0x2ad5ad6c,
-	0x954329de, 0xadbe4528, 0xd8710f69, 0xaa51c90f, 0xaa786bf6,
-	0x22513f1e, 0xaa51a79b, 0x2ad344cc,
-	0x7b5a41f0, 0xd37cfbad, 0x1b069505, 0x41ece491, 0xb4c332e6,
-	0x032268d4, 0xc9600acc, 0xce387e6d,
-	0xbf6bb16c, 0x6a70fb78, 0x0d03d9c9, 0xd4df39de, 0xe01063da,
-	0x4736f464, 0x5ad328d8, 0xb347cc96,
-	0x75bb0fc3, 0x98511bfb, 0x4ffbcc35, 0xb58bcf6a, 0xe11f0abc,
-	0xbfc5fe4a, 0xa70aec10, 0xac39570a,
-	0x3f04442f, 0x6188b153, 0xe0397a2e, 0x5727cb79, 0x9ceb418f,
-	0x1cacd68d, 0x2ad37c96, 0x0175cb9d,
-	0xc69dff09, 0xc75b65f0, 0xd9db40d8, 0xec0e7779, 0x4744ead4,
-	0xb11c3274, 0xdd24cb9e, 0x7e1c54bd,
-	0xf01144f9, 0xd2240eb1, 0x9675b3fd, 0xa3ac3755, 0xd47c27af,
-	0x51c85f4d, 0x56907596, 0xa5bb15e6,
-	0x580304f0, 0xca042cf1, 0x011a37ea, 0x8dbfaadb, 0x35ba3e4a,
-	0x3526ffa0, 0xc37b4d09, 0xbc306ed9,
-	0x98a52666, 0x5648f725, 0xff5e569d, 0x0ced63d0, 0x7c63b2cf,
-	0x700b45e1, 0xd5ea50f1, 0x85a92872,
-	0xaf1fbda7, 0xd4234870, 0xa7870bf3, 0x2d3b4d79, 0x42e04198,
-	0x0cd0ede7, 0x26470db8, 0xf881814c,
-	0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db,
-	0xab838653, 0x6e2f1e23, 0x83719c9e,
-	0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c,
-	0xe1e696ff, 0xb141ab08, 0x7cca89b9,
-	0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c,
-	0x5ac9f049, 0xdd8f0f00, 0x5c8165bf
-};
-EXPORT_SYMBOL_GPL(cast5_s1);
-const u32 cast5_s2[256] = {
-	0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a,
-	0xeec5207a, 0x55889c94, 0x72fc0651,
-	0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef,
-	0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3,
-	0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086,
-	0xef944459, 0xba83ccb3, 0xe0c3cdfb,
-	0xd1da4181, 0x3b092ab1, 0xf997f1c1, 0xa5e6cf7b, 0x01420ddb,
-	0xe4e7ef5b, 0x25a1ff41, 0xe180f806,
-	0x1fc41080, 0x179bee7a, 0xd37ac6a9, 0xfe5830a4, 0x98de8b7f,
-	0x77e83f4e, 0x79929269, 0x24fa9f7b,
-	0xe113c85b, 0xacc40083, 0xd7503525, 0xf7ea615f, 0x62143154,
-	0x0d554b63, 0x5d681121, 0xc866c359,
-	0x3d63cf73, 0xcee234c0, 0xd4d87e87, 0x5c672b21, 0x071f6181,
-	0x39f7627f, 0x361e3084, 0xe4eb573b,
-	0x602f64a4, 0xd63acd9c, 0x1bbc4635, 0x9e81032d, 0x2701f50c,
-	0x99847ab4, 0xa0e3df79, 0xba6cf38c,
-	0x10843094, 0x2537a95e, 0xf46f6ffe, 0xa1ff3b1f, 0x208cfb6a,
-	0x8f458c74, 0xd9e0a227, 0x4ec73a34,
-	0xfc884f69, 0x3e4de8df, 0xef0e0088, 0x3559648d, 0x8a45388c,
-	0x1d804366, 0x721d9bfd, 0xa58684bb,
-	0xe8256333, 0x844e8212, 0x128d8098, 0xfed33fb4, 0xce280ae1,
-	0x27e19ba5, 0xd5a6c252, 0xe49754bd,
-	0xc5d655dd, 0xeb667064, 0x77840b4d, 0xa1b6a801, 0x84db26a9,
-	0xe0b56714, 0x21f043b7, 0xe5d05860,
-	0x54f03084, 0x066ff472, 0xa31aa153, 0xdadc4755, 0xb5625dbf,
-	0x68561be6, 0x83ca6b94, 0x2d6ed23b,
-	0xeccf01db, 0xa6d3d0ba, 0xb6803d5c, 0xaf77a709, 0x33b4a34c,
-	0x397bc8d6, 0x5ee22b95, 0x5f0e5304,
-	0x81ed6f61, 0x20e74364, 0xb45e1378, 0xde18639b, 0x881ca122,
-	0xb96726d1, 0x8049a7e8, 0x22b7da7b,
-	0x5e552d25, 0x5272d237, 0x79d2951c, 0xc60d894c, 0x488cb402,
-	0x1ba4fe5b, 0xa4b09f6b, 0x1ca815cf,
-	0xa20c3005, 0x8871df63, 0xb9de2fcb, 0x0cc6c9e9, 0x0beeff53,
-	0xe3214517, 0xb4542835, 0x9f63293c,
-	0xee41e729, 0x6e1d2d7c, 0x50045286, 0x1e6685f3, 0xf33401c6,
-	0x30a22c95, 0x31a70850, 0x60930f13,
-	0x73f98417, 0xa1269859, 0xec645c44, 0x52c877a9, 0xcdff33a6,
-	0xa02b1741, 0x7cbad9a2, 0x2180036f,
-	0x50d99c08, 0xcb3f4861, 0xc26bd765, 0x64a3f6ab, 0x80342676,
-	0x25a75e7b, 0xe4e6d1fc, 0x20c710e6,
-	0xcdf0b680, 0x17844d3b, 0x31eef84d, 0x7e0824e4, 0x2ccb49eb,
-	0x846a3bae, 0x8ff77888, 0xee5d60f6,
-	0x7af75673, 0x2fdd5cdb, 0xa11631c1, 0x30f66f43, 0xb3faec54,
-	0x157fd7fa, 0xef8579cc, 0xd152de58,
-	0xdb2ffd5e, 0x8f32ce19, 0x306af97a, 0x02f03ef8, 0x99319ad5,
-	0xc242fa0f, 0xa7e3ebb0, 0xc68e4906,
-	0xb8da230c, 0x80823028, 0xdcdef3c8, 0xd35fb171, 0x088a1bc8,
-	0xbec0c560, 0x61a3c9e8, 0xbca8f54d,
-	0xc72feffa, 0x22822e99, 0x82c570b4, 0xd8d94e89, 0x8b1c34bc,
-	0x301e16e6, 0x273be979, 0xb0ffeaa6,
-	0x61d9b8c6, 0x00b24869, 0xb7ffce3f, 0x08dc283b, 0x43daf65a,
-	0xf7e19798, 0x7619b72f, 0x8f1c9ba4,
-	0xdc8637a0, 0x16a7d3b1, 0x9fc393b7, 0xa7136eeb, 0xc6bcc63e,
-	0x1a513742, 0xef6828bc, 0x520365d6,
-	0x2d6a77ab, 0x3527ed4b, 0x821fd216, 0x095c6e2e, 0xdb92f2fb,
-	0x5eea29cb, 0x145892f5, 0x91584f7f,
-	0x5483697b, 0x2667a8cc, 0x85196048, 0x8c4bacea, 0x833860d4,
-	0x0d23e0f9, 0x6c387e8a, 0x0ae6d249,
-	0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 0x3ebd81b3,
-	0x230eabb0, 0x6438bc87, 0xf0b5b1fa,
-	0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589,
-	0xa345415e, 0x5c038323, 0x3e5d3bb9,
-	0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539,
-	0x73bfbe70, 0x83877605, 0x4523ecf1
-};
-EXPORT_SYMBOL_GPL(cast5_s2);
-const u32 cast5_s3[256] = {
-	0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff,
-	0x369fe44b, 0x8c1fc644, 0xaececa90,
-	0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806,
-	0xf0ad0548, 0xe13c8d83, 0x927010d5,
-	0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820,
-	0xfade82e0, 0xa067268b, 0x8272792e,
-	0x553fb2c0, 0x489ae22b, 0xd4ef9794, 0x125e3fbc, 0x21fffcee,
-	0x825b1bfd, 0x9255c5ed, 0x1257a240,
-	0x4e1a8302, 0xbae07fff, 0x528246e7, 0x8e57140e, 0x3373f7bf,
-	0x8c9f8188, 0xa6fc4ee8, 0xc982b5a5,
-	0xa8c01db7, 0x579fc264, 0x67094f31, 0xf2bd3f5f, 0x40fff7c1,
-	0x1fb78dfc, 0x8e6bd2c1, 0x437be59b,
-	0x99b03dbf, 0xb5dbc64b, 0x638dc0e6, 0x55819d99, 0xa197c81c,
-	0x4a012d6e, 0xc5884a28, 0xccc36f71,
-	0xb843c213, 0x6c0743f1, 0x8309893c, 0x0feddd5f, 0x2f7fe850,
-	0xd7c07f7e, 0x02507fbf, 0x5afb9a04,
-	0xa747d2d0, 0x1651192e, 0xaf70bf3e, 0x58c31380, 0x5f98302e,
-	0x727cc3c4, 0x0a0fb402, 0x0f7fef82,
-	0x8c96fdad, 0x5d2c2aae, 0x8ee99a49, 0x50da88b8, 0x8427f4a0,
-	0x1eac5790, 0x796fb449, 0x8252dc15,
-	0xefbd7d9b, 0xa672597d, 0xada840d8, 0x45f54504, 0xfa5d7403,
-	0xe83ec305, 0x4f91751a, 0x925669c2,
-	0x23efe941, 0xa903f12e, 0x60270df2, 0x0276e4b6, 0x94fd6574,
-	0x927985b2, 0x8276dbcb, 0x02778176,
-	0xf8af918d, 0x4e48f79e, 0x8f616ddf, 0xe29d840e, 0x842f7d83,
-	0x340ce5c8, 0x96bbb682, 0x93b4b148,
-	0xef303cab, 0x984faf28, 0x779faf9b, 0x92dc560d, 0x224d1e20,
-	0x8437aa88, 0x7d29dc96, 0x2756d3dc,
-	0x8b907cee, 0xb51fd240, 0xe7c07ce3, 0xe566b4a1, 0xc3e9615e,
-	0x3cf8209d, 0x6094d1e3, 0xcd9ca341,
-	0x5c76460e, 0x00ea983b, 0xd4d67881, 0xfd47572c, 0xf76cedd9,
-	0xbda8229c, 0x127dadaa, 0x438a074e,
-	0x1f97c090, 0x081bdb8a, 0x93a07ebe, 0xb938ca15, 0x97b03cff,
-	0x3dc2c0f8, 0x8d1ab2ec, 0x64380e51,
-	0x68cc7bfb, 0xd90f2788, 0x12490181, 0x5de5ffd4, 0xdd7ef86a,
-	0x76a2e214, 0xb9a40368, 0x925d958f,
-	0x4b39fffa, 0xba39aee9, 0xa4ffd30b, 0xfaf7933b, 0x6d498623,
-	0x193cbcfa, 0x27627545, 0x825cf47a,
-	0x61bd8ba0, 0xd11e42d1, 0xcead04f4, 0x127ea392, 0x10428db7,
-	0x8272a972, 0x9270c4a8, 0x127de50b,
-	0x285ba1c8, 0x3c62f44f, 0x35c0eaa5, 0xe805d231, 0x428929fb,
-	0xb4fcdf82, 0x4fb66a53, 0x0e7dc15b,
-	0x1f081fab, 0x108618ae, 0xfcfd086d, 0xf9ff2889, 0x694bcc11,
-	0x236a5cae, 0x12deca4d, 0x2c3f8cc5,
-	0xd2d02dfe, 0xf8ef5896, 0xe4cf52da, 0x95155b67, 0x494a488c,
-	0xb9b6a80c, 0x5c8f82bc, 0x89d36b45,
-	0x3a609437, 0xec00c9a9, 0x44715253, 0x0a874b49, 0xd773bc40,
-	0x7c34671c, 0x02717ef6, 0x4feb5536,
-	0xa2d02fff, 0xd2bf60c4, 0xd43f03c0, 0x50b4ef6d, 0x07478cd1,
-	0x006e1888, 0xa2e53f55, 0xb9e6d4bc,
-	0xa2048016, 0x97573833, 0xd7207d67, 0xde0f8f3d, 0x72f87b33,
-	0xabcc4f33, 0x7688c55d, 0x7b00a6b0,
-	0x947b0001, 0x570075d2, 0xf9bb88f8, 0x8942019e, 0x4264a5ff,
-	0x856302e0, 0x72dbd92b, 0xee971b69,
-	0x6ea22fde, 0x5f08ae2b, 0xaf7a616d, 0xe5c98767, 0xcf1febd2,
-	0x61efc8c2, 0xf1ac2571, 0xcc8239c2,
-	0x67214cb8, 0xb1e583d1, 0xb7dc3e62, 0x7f10bdce, 0xf90a5c38,
-	0x0ff0443d, 0x606e6dc6, 0x60543a49,
-	0x5727c148, 0x2be98a1d, 0x8ab41738, 0x20e1be24, 0xaf96da0f,
-	0x68458425, 0x99833be5, 0x600d457d,
-	0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31,
-	0x9c305a00, 0x52bce688, 0x1b03588a,
-	0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636,
-	0xa133c501, 0xe9d3531c, 0xee353783
-};
-EXPORT_SYMBOL_GPL(cast5_s3);
-const u32 cast5_s4[256] = {
-	0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb,
-	0x64ad8c57, 0x85510443, 0xfa020ed1,
-	0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 0xfd059d43,
-	0x6497b7b1, 0xf3641f63, 0x241e4adf,
-	0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30,
-	0xc0a5374f, 0x1d2d00d9, 0x24147b15,
-	0xee4d111a, 0x0fca5167, 0x71ff904c, 0x2d195ffe, 0x1a05645f,
-	0x0c13fefe, 0x081b08ca, 0x05170121,
-	0x80530100, 0xe83e5efe, 0xac9af4f8, 0x7fe72701, 0xd2b8ee5f,
-	0x06df4261, 0xbb9e9b8a, 0x7293ea25,
-	0xce84ffdf, 0xf5718801, 0x3dd64b04, 0xa26f263b, 0x7ed48400,
-	0x547eebe6, 0x446d4ca0, 0x6cf3d6f5,
-	0x2649abdf, 0xaea0c7f5, 0x36338cc1, 0x503f7e93, 0xd3772061,
-	0x11b638e1, 0x72500e03, 0xf80eb2bb,
-	0xabe0502e, 0xec8d77de, 0x57971e81, 0xe14f6746, 0xc9335400,
-	0x6920318f, 0x081dbb99, 0xffc304a5,
-	0x4d351805, 0x7f3d5ce3, 0xa6c866c6, 0x5d5bcca9, 0xdaec6fea,
-	0x9f926f91, 0x9f46222f, 0x3991467d,
-	0xa5bf6d8e, 0x1143c44f, 0x43958302, 0xd0214eeb, 0x022083b8,
-	0x3fb6180c, 0x18f8931e, 0x281658e6,
-	0x26486e3e, 0x8bd78a70, 0x7477e4c1, 0xb506e07c, 0xf32d0a25,
-	0x79098b02, 0xe4eabb81, 0x28123b23,
-	0x69dead38, 0x1574ca16, 0xdf871b62, 0x211c40b7, 0xa51a9ef9,
-	0x0014377b, 0x041e8ac8, 0x09114003,
-	0xbd59e4d2, 0xe3d156d5, 0x4fe876d5, 0x2f91a340, 0x557be8de,
-	0x00eae4a7, 0x0ce5c2ec, 0x4db4bba6,
-	0xe756bdff, 0xdd3369ac, 0xec17b035, 0x06572327, 0x99afc8b0,
-	0x56c8c391, 0x6b65811c, 0x5e146119,
-	0x6e85cb75, 0xbe07c002, 0xc2325577, 0x893ff4ec, 0x5bbfc92d,
-	0xd0ec3b25, 0xb7801ab7, 0x8d6d3b24,
-	0x20c763ef, 0xc366a5fc, 0x9c382880, 0x0ace3205, 0xaac9548a,
-	0xeca1d7c7, 0x041afa32, 0x1d16625a,
-	0x6701902c, 0x9b757a54, 0x31d477f7, 0x9126b031, 0x36cc6fdb,
-	0xc70b8b46, 0xd9e66a48, 0x56e55a79,
-	0x026a4ceb, 0x52437eff, 0x2f8f76b4, 0x0df980a5, 0x8674cde3,
-	0xedda04eb, 0x17a9be04, 0x2c18f4df,
-	0xb7747f9d, 0xab2af7b4, 0xefc34d20, 0x2e096b7c, 0x1741a254,
-	0xe5b6a035, 0x213d42f6, 0x2c1c7c26,
-	0x61c2f50f, 0x6552daf9, 0xd2c231f8, 0x25130f69, 0xd8167fa2,
-	0x0418f2c8, 0x001a96a6, 0x0d1526ab,
-	0x63315c21, 0x5e0a72ec, 0x49bafefd, 0x187908d9, 0x8d0dbd86,
-	0x311170a7, 0x3e9b640c, 0xcc3e10d7,
-	0xd5cad3b6, 0x0caec388, 0xf73001e1, 0x6c728aff, 0x71eae2a1,
-	0x1f9af36e, 0xcfcbd12f, 0xc1de8417,
-	0xac07be6b, 0xcb44a1d8, 0x8b9b0f56, 0x013988c3, 0xb1c52fca,
-	0xb4be31cd, 0xd8782806, 0x12a3a4e2,
-	0x6f7de532, 0x58fd7eb6, 0xd01ee900, 0x24adffc2, 0xf4990fc5,
-	0x9711aac5, 0x001d7b95, 0x82e5e7d2,
-	0x109873f6, 0x00613096, 0xc32d9521, 0xada121ff, 0x29908415,
-	0x7fbb977f, 0xaf9eb3db, 0x29c9ed2a,
-	0x5ce2a465, 0xa730f32c, 0xd0aa3fe8, 0x8a5cc091, 0xd49e2ce7,
-	0x0ce454a9, 0xd60acd86, 0x015f1919,
-	0x77079103, 0xdea03af6, 0x78a8565e, 0xdee356df, 0x21f05cbe,
-	0x8b75e387, 0xb3c50651, 0xb8a5c3ef,
-	0xd8eeb6d2, 0xe523be77, 0xc2154529, 0x2f69efdf, 0xafe67afb,
-	0xf470c4b2, 0xf3e0eb5b, 0xd6cc9876,
-	0x39e4460c, 0x1fda8538, 0x1987832f, 0xca007367, 0xa99144f8,
-	0x296b299e, 0x492fc295, 0x9266beab,
-	0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee,
-	0xf65324e6, 0x6afce36c, 0x0316cc04,
-	0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979,
-	0x932bcdf6, 0xb657c34d, 0x4edfd282,
-	0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0,
-	0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2
-};
-EXPORT_SYMBOL_GPL(cast5_s4);
 static const u32 s5[256] = {
 	0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff,
 	0x1dd358f5, 0x44dd9d44, 0x1731167f,
@@ -564,10 +295,10 @@ static const u32 sb8[256] = {
 	0xeaee6801, 0x8db2a283, 0xea8bf59e
 };
 
-#define s1 cast5_s1
-#define s2 cast5_s2
-#define s3 cast5_s3
-#define s4 cast5_s4
+#define s1 cast_s1
+#define s2 cast_s2
+#define s3 cast_s3
+#define s4 cast_s4
 
 #define F1(D, m, r)  ((I = ((m) + (D))), (I = rol32(I, (r))),   \
 	(((s1[I >> 24] ^ s2[(I>>16)&0xff]) - s3[(I>>8)&0xff]) + s4[I&0xff]))
diff --git a/crypto/cast6_generic.c b/crypto/cast6_generic.c
index 1acd2f1c48fc..de732528a430 100644
--- a/crypto/cast6_generic.c
+++ b/crypto/cast6_generic.c
@@ -27,10 +27,10 @@
 #include <linux/types.h>
 #include <crypto/cast6.h>
 
-#define s1 cast6_s1
-#define s2 cast6_s2
-#define s3 cast6_s3
-#define s4 cast6_s4
+#define s1 cast_s1
+#define s2 cast_s2
+#define s3 cast_s3
+#define s4 cast_s4
 
 #define F1(D, r, m)  ((I = ((m) + (D))), (I = rol32(I, (r))),   \
 	(((s1[I >> 24] ^ s2[(I>>16)&0xff]) - s3[(I>>8)&0xff]) + s4[I&0xff]))
@@ -39,278 +39,6 @@
 #define F3(D, r, m)  ((I = ((m) - (D))), (I = rol32(I, (r))),   \
 	(((s1[I >> 24] + s2[(I>>16)&0xff]) ^ s3[(I>>8)&0xff]) - s4[I&0xff]))
 
-const u32 cast6_s1[256] = {
-	0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f,
-	0x9c004dd3, 0x6003e540, 0xcf9fc949,
-	0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0,
-	0x15c361d2, 0xc2e7661d, 0x22d4ff8e,
-	0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3,
-	0xdf2f8656, 0x887ca41a, 0xa2d2bd2d,
-	0xa1c9e0d6, 0x346c4819, 0x61b76d87, 0x22540f2f, 0x2abe32e1,
-	0xaa54166b, 0x22568e3a, 0xa2d341d0,
-	0x66db40c8, 0xa784392f, 0x004dff2f, 0x2db9d2de, 0x97943fac,
-	0x4a97c1d8, 0x527644b7, 0xb5f437a7,
-	0xb82cbaef, 0xd751d159, 0x6ff7f0ed, 0x5a097a1f, 0x827b68d0,
-	0x90ecf52e, 0x22b0c054, 0xbc8e5935,
-	0x4b6d2f7f, 0x50bb64a2, 0xd2664910, 0xbee5812d, 0xb7332290,
-	0xe93b159f, 0xb48ee411, 0x4bff345d,
-	0xfd45c240, 0xad31973f, 0xc4f6d02e, 0x55fc8165, 0xd5b1caad,
-	0xa1ac2dae, 0xa2d4b76d, 0xc19b0c50,
-	0x882240f2, 0x0c6e4f38, 0xa4e4bfd7, 0x4f5ba272, 0x564c1d2f,
-	0xc59c5319, 0xb949e354, 0xb04669fe,
-	0xb1b6ab8a, 0xc71358dd, 0x6385c545, 0x110f935d, 0x57538ad5,
-	0x6a390493, 0xe63d37e0, 0x2a54f6b3,
-	0x3a787d5f, 0x6276a0b5, 0x19a6fcdf, 0x7a42206a, 0x29f9d4d5,
-	0xf61b1891, 0xbb72275e, 0xaa508167,
-	0x38901091, 0xc6b505eb, 0x84c7cb8c, 0x2ad75a0f, 0x874a1427,
-	0xa2d1936b, 0x2ad286af, 0xaa56d291,
-	0xd7894360, 0x425c750d, 0x93b39e26, 0x187184c9, 0x6c00b32d,
-	0x73e2bb14, 0xa0bebc3c, 0x54623779,
-	0x64459eab, 0x3f328b82, 0x7718cf82, 0x59a2cea6, 0x04ee002e,
-	0x89fe78e6, 0x3fab0950, 0x325ff6c2,
-	0x81383f05, 0x6963c5c8, 0x76cb5ad6, 0xd49974c9, 0xca180dcf,
-	0x380782d5, 0xc7fa5cf6, 0x8ac31511,
-	0x35e79e13, 0x47da91d0, 0xf40f9086, 0xa7e2419e, 0x31366241,
-	0x051ef495, 0xaa573b04, 0x4a805d8d,
-	0x548300d0, 0x00322a3c, 0xbf64cddf, 0xba57a68e, 0x75c6372b,
-	0x50afd341, 0xa7c13275, 0x915a0bf5,
-	0x6b54bfab, 0x2b0b1426, 0xab4cc9d7, 0x449ccd82, 0xf7fbf265,
-	0xab85c5f3, 0x1b55db94, 0xaad4e324,
-	0xcfa4bd3f, 0x2deaa3e2, 0x9e204d02, 0xc8bd25ac, 0xeadf55b3,
-	0xd5bd9e98, 0xe31231b2, 0x2ad5ad6c,
-	0x954329de, 0xadbe4528, 0xd8710f69, 0xaa51c90f, 0xaa786bf6,
-	0x22513f1e, 0xaa51a79b, 0x2ad344cc,
-	0x7b5a41f0, 0xd37cfbad, 0x1b069505, 0x41ece491, 0xb4c332e6,
-	0x032268d4, 0xc9600acc, 0xce387e6d,
-	0xbf6bb16c, 0x6a70fb78, 0x0d03d9c9, 0xd4df39de, 0xe01063da,
-	0x4736f464, 0x5ad328d8, 0xb347cc96,
-	0x75bb0fc3, 0x98511bfb, 0x4ffbcc35, 0xb58bcf6a, 0xe11f0abc,
-	0xbfc5fe4a, 0xa70aec10, 0xac39570a,
-	0x3f04442f, 0x6188b153, 0xe0397a2e, 0x5727cb79, 0x9ceb418f,
-	0x1cacd68d, 0x2ad37c96, 0x0175cb9d,
-	0xc69dff09, 0xc75b65f0, 0xd9db40d8, 0xec0e7779, 0x4744ead4,
-	0xb11c3274, 0xdd24cb9e, 0x7e1c54bd,
-	0xf01144f9, 0xd2240eb1, 0x9675b3fd, 0xa3ac3755, 0xd47c27af,
-	0x51c85f4d, 0x56907596, 0xa5bb15e6,
-	0x580304f0, 0xca042cf1, 0x011a37ea, 0x8dbfaadb, 0x35ba3e4a,
-	0x3526ffa0, 0xc37b4d09, 0xbc306ed9,
-	0x98a52666, 0x5648f725, 0xff5e569d, 0x0ced63d0, 0x7c63b2cf,
-	0x700b45e1, 0xd5ea50f1, 0x85a92872,
-	0xaf1fbda7, 0xd4234870, 0xa7870bf3, 0x2d3b4d79, 0x42e04198,
-	0x0cd0ede7, 0x26470db8, 0xf881814c,
-	0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db,
-	0xab838653, 0x6e2f1e23, 0x83719c9e,
-	0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c,
-	0xe1e696ff, 0xb141ab08, 0x7cca89b9,
-	0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c,
-	0x5ac9f049, 0xdd8f0f00, 0x5c8165bf
-};
-EXPORT_SYMBOL_GPL(cast6_s1);
-
-const u32 cast6_s2[256] = {
-	0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a,
-	0xeec5207a, 0x55889c94, 0x72fc0651,
-	0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef,
-	0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3,
-	0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086,
-	0xef944459, 0xba83ccb3, 0xe0c3cdfb,
-	0xd1da4181, 0x3b092ab1, 0xf997f1c1, 0xa5e6cf7b, 0x01420ddb,
-	0xe4e7ef5b, 0x25a1ff41, 0xe180f806,
-	0x1fc41080, 0x179bee7a, 0xd37ac6a9, 0xfe5830a4, 0x98de8b7f,
-	0x77e83f4e, 0x79929269, 0x24fa9f7b,
-	0xe113c85b, 0xacc40083, 0xd7503525, 0xf7ea615f, 0x62143154,
-	0x0d554b63, 0x5d681121, 0xc866c359,
-	0x3d63cf73, 0xcee234c0, 0xd4d87e87, 0x5c672b21, 0x071f6181,
-	0x39f7627f, 0x361e3084, 0xe4eb573b,
-	0x602f64a4, 0xd63acd9c, 0x1bbc4635, 0x9e81032d, 0x2701f50c,
-	0x99847ab4, 0xa0e3df79, 0xba6cf38c,
-	0x10843094, 0x2537a95e, 0xf46f6ffe, 0xa1ff3b1f, 0x208cfb6a,
-	0x8f458c74, 0xd9e0a227, 0x4ec73a34,
-	0xfc884f69, 0x3e4de8df, 0xef0e0088, 0x3559648d, 0x8a45388c,
-	0x1d804366, 0x721d9bfd, 0xa58684bb,
-	0xe8256333, 0x844e8212, 0x128d8098, 0xfed33fb4, 0xce280ae1,
-	0x27e19ba5, 0xd5a6c252, 0xe49754bd,
-	0xc5d655dd, 0xeb667064, 0x77840b4d, 0xa1b6a801, 0x84db26a9,
-	0xe0b56714, 0x21f043b7, 0xe5d05860,
-	0x54f03084, 0x066ff472, 0xa31aa153, 0xdadc4755, 0xb5625dbf,
-	0x68561be6, 0x83ca6b94, 0x2d6ed23b,
-	0xeccf01db, 0xa6d3d0ba, 0xb6803d5c, 0xaf77a709, 0x33b4a34c,
-	0x397bc8d6, 0x5ee22b95, 0x5f0e5304,
-	0x81ed6f61, 0x20e74364, 0xb45e1378, 0xde18639b, 0x881ca122,
-	0xb96726d1, 0x8049a7e8, 0x22b7da7b,
-	0x5e552d25, 0x5272d237, 0x79d2951c, 0xc60d894c, 0x488cb402,
-	0x1ba4fe5b, 0xa4b09f6b, 0x1ca815cf,
-	0xa20c3005, 0x8871df63, 0xb9de2fcb, 0x0cc6c9e9, 0x0beeff53,
-	0xe3214517, 0xb4542835, 0x9f63293c,
-	0xee41e729, 0x6e1d2d7c, 0x50045286, 0x1e6685f3, 0xf33401c6,
-	0x30a22c95, 0x31a70850, 0x60930f13,
-	0x73f98417, 0xa1269859, 0xec645c44, 0x52c877a9, 0xcdff33a6,
-	0xa02b1741, 0x7cbad9a2, 0x2180036f,
-	0x50d99c08, 0xcb3f4861, 0xc26bd765, 0x64a3f6ab, 0x80342676,
-	0x25a75e7b, 0xe4e6d1fc, 0x20c710e6,
-	0xcdf0b680, 0x17844d3b, 0x31eef84d, 0x7e0824e4, 0x2ccb49eb,
-	0x846a3bae, 0x8ff77888, 0xee5d60f6,
-	0x7af75673, 0x2fdd5cdb, 0xa11631c1, 0x30f66f43, 0xb3faec54,
-	0x157fd7fa, 0xef8579cc, 0xd152de58,
-	0xdb2ffd5e, 0x8f32ce19, 0x306af97a, 0x02f03ef8, 0x99319ad5,
-	0xc242fa0f, 0xa7e3ebb0, 0xc68e4906,
-	0xb8da230c, 0x80823028, 0xdcdef3c8, 0xd35fb171, 0x088a1bc8,
-	0xbec0c560, 0x61a3c9e8, 0xbca8f54d,
-	0xc72feffa, 0x22822e99, 0x82c570b4, 0xd8d94e89, 0x8b1c34bc,
-	0x301e16e6, 0x273be979, 0xb0ffeaa6,
-	0x61d9b8c6, 0x00b24869, 0xb7ffce3f, 0x08dc283b, 0x43daf65a,
-	0xf7e19798, 0x7619b72f, 0x8f1c9ba4,
-	0xdc8637a0, 0x16a7d3b1, 0x9fc393b7, 0xa7136eeb, 0xc6bcc63e,
-	0x1a513742, 0xef6828bc, 0x520365d6,
-	0x2d6a77ab, 0x3527ed4b, 0x821fd216, 0x095c6e2e, 0xdb92f2fb,
-	0x5eea29cb, 0x145892f5, 0x91584f7f,
-	0x5483697b, 0x2667a8cc, 0x85196048, 0x8c4bacea, 0x833860d4,
-	0x0d23e0f9, 0x6c387e8a, 0x0ae6d249,
-	0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 0x3ebd81b3,
-	0x230eabb0, 0x6438bc87, 0xf0b5b1fa,
-	0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589,
-	0xa345415e, 0x5c038323, 0x3e5d3bb9,
-	0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539,
-	0x73bfbe70, 0x83877605, 0x4523ecf1
-};
-EXPORT_SYMBOL_GPL(cast6_s2);
-
-const u32 cast6_s3[256] = {
-	0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff,
-	0x369fe44b, 0x8c1fc644, 0xaececa90,
-	0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806,
-	0xf0ad0548, 0xe13c8d83, 0x927010d5,
-	0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820,
-	0xfade82e0, 0xa067268b, 0x8272792e,
-	0x553fb2c0, 0x489ae22b, 0xd4ef9794, 0x125e3fbc, 0x21fffcee,
-	0x825b1bfd, 0x9255c5ed, 0x1257a240,
-	0x4e1a8302, 0xbae07fff, 0x528246e7, 0x8e57140e, 0x3373f7bf,
-	0x8c9f8188, 0xa6fc4ee8, 0xc982b5a5,
-	0xa8c01db7, 0x579fc264, 0x67094f31, 0xf2bd3f5f, 0x40fff7c1,
-	0x1fb78dfc, 0x8e6bd2c1, 0x437be59b,
-	0x99b03dbf, 0xb5dbc64b, 0x638dc0e6, 0x55819d99, 0xa197c81c,
-	0x4a012d6e, 0xc5884a28, 0xccc36f71,
-	0xb843c213, 0x6c0743f1, 0x8309893c, 0x0feddd5f, 0x2f7fe850,
-	0xd7c07f7e, 0x02507fbf, 0x5afb9a04,
-	0xa747d2d0, 0x1651192e, 0xaf70bf3e, 0x58c31380, 0x5f98302e,
-	0x727cc3c4, 0x0a0fb402, 0x0f7fef82,
-	0x8c96fdad, 0x5d2c2aae, 0x8ee99a49, 0x50da88b8, 0x8427f4a0,
-	0x1eac5790, 0x796fb449, 0x8252dc15,
-	0xefbd7d9b, 0xa672597d, 0xada840d8, 0x45f54504, 0xfa5d7403,
-	0xe83ec305, 0x4f91751a, 0x925669c2,
-	0x23efe941, 0xa903f12e, 0x60270df2, 0x0276e4b6, 0x94fd6574,
-	0x927985b2, 0x8276dbcb, 0x02778176,
-	0xf8af918d, 0x4e48f79e, 0x8f616ddf, 0xe29d840e, 0x842f7d83,
-	0x340ce5c8, 0x96bbb682, 0x93b4b148,
-	0xef303cab, 0x984faf28, 0x779faf9b, 0x92dc560d, 0x224d1e20,
-	0x8437aa88, 0x7d29dc96, 0x2756d3dc,
-	0x8b907cee, 0xb51fd240, 0xe7c07ce3, 0xe566b4a1, 0xc3e9615e,
-	0x3cf8209d, 0x6094d1e3, 0xcd9ca341,
-	0x5c76460e, 0x00ea983b, 0xd4d67881, 0xfd47572c, 0xf76cedd9,
-	0xbda8229c, 0x127dadaa, 0x438a074e,
-	0x1f97c090, 0x081bdb8a, 0x93a07ebe, 0xb938ca15, 0x97b03cff,
-	0x3dc2c0f8, 0x8d1ab2ec, 0x64380e51,
-	0x68cc7bfb, 0xd90f2788, 0x12490181, 0x5de5ffd4, 0xdd7ef86a,
-	0x76a2e214, 0xb9a40368, 0x925d958f,
-	0x4b39fffa, 0xba39aee9, 0xa4ffd30b, 0xfaf7933b, 0x6d498623,
-	0x193cbcfa, 0x27627545, 0x825cf47a,
-	0x61bd8ba0, 0xd11e42d1, 0xcead04f4, 0x127ea392, 0x10428db7,
-	0x8272a972, 0x9270c4a8, 0x127de50b,
-	0x285ba1c8, 0x3c62f44f, 0x35c0eaa5, 0xe805d231, 0x428929fb,
-	0xb4fcdf82, 0x4fb66a53, 0x0e7dc15b,
-	0x1f081fab, 0x108618ae, 0xfcfd086d, 0xf9ff2889, 0x694bcc11,
-	0x236a5cae, 0x12deca4d, 0x2c3f8cc5,
-	0xd2d02dfe, 0xf8ef5896, 0xe4cf52da, 0x95155b67, 0x494a488c,
-	0xb9b6a80c, 0x5c8f82bc, 0x89d36b45,
-	0x3a609437, 0xec00c9a9, 0x44715253, 0x0a874b49, 0xd773bc40,
-	0x7c34671c, 0x02717ef6, 0x4feb5536,
-	0xa2d02fff, 0xd2bf60c4, 0xd43f03c0, 0x50b4ef6d, 0x07478cd1,
-	0x006e1888, 0xa2e53f55, 0xb9e6d4bc,
-	0xa2048016, 0x97573833, 0xd7207d67, 0xde0f8f3d, 0x72f87b33,
-	0xabcc4f33, 0x7688c55d, 0x7b00a6b0,
-	0x947b0001, 0x570075d2, 0xf9bb88f8, 0x8942019e, 0x4264a5ff,
-	0x856302e0, 0x72dbd92b, 0xee971b69,
-	0x6ea22fde, 0x5f08ae2b, 0xaf7a616d, 0xe5c98767, 0xcf1febd2,
-	0x61efc8c2, 0xf1ac2571, 0xcc8239c2,
-	0x67214cb8, 0xb1e583d1, 0xb7dc3e62, 0x7f10bdce, 0xf90a5c38,
-	0x0ff0443d, 0x606e6dc6, 0x60543a49,
-	0x5727c148, 0x2be98a1d, 0x8ab41738, 0x20e1be24, 0xaf96da0f,
-	0x68458425, 0x99833be5, 0x600d457d,
-	0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31,
-	0x9c305a00, 0x52bce688, 0x1b03588a,
-	0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636,
-	0xa133c501, 0xe9d3531c, 0xee353783
-};
-EXPORT_SYMBOL_GPL(cast6_s3);
-
-const u32 cast6_s4[256] = {
-	0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb,
-	0x64ad8c57, 0x85510443, 0xfa020ed1,
-	0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 0xfd059d43,
-	0x6497b7b1, 0xf3641f63, 0x241e4adf,
-	0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30,
-	0xc0a5374f, 0x1d2d00d9, 0x24147b15,
-	0xee4d111a, 0x0fca5167, 0x71ff904c, 0x2d195ffe, 0x1a05645f,
-	0x0c13fefe, 0x081b08ca, 0x05170121,
-	0x80530100, 0xe83e5efe, 0xac9af4f8, 0x7fe72701, 0xd2b8ee5f,
-	0x06df4261, 0xbb9e9b8a, 0x7293ea25,
-	0xce84ffdf, 0xf5718801, 0x3dd64b04, 0xa26f263b, 0x7ed48400,
-	0x547eebe6, 0x446d4ca0, 0x6cf3d6f5,
-	0x2649abdf, 0xaea0c7f5, 0x36338cc1, 0x503f7e93, 0xd3772061,
-	0x11b638e1, 0x72500e03, 0xf80eb2bb,
-	0xabe0502e, 0xec8d77de, 0x57971e81, 0xe14f6746, 0xc9335400,
-	0x6920318f, 0x081dbb99, 0xffc304a5,
-	0x4d351805, 0x7f3d5ce3, 0xa6c866c6, 0x5d5bcca9, 0xdaec6fea,
-	0x9f926f91, 0x9f46222f, 0x3991467d,
-	0xa5bf6d8e, 0x1143c44f, 0x43958302, 0xd0214eeb, 0x022083b8,
-	0x3fb6180c, 0x18f8931e, 0x281658e6,
-	0x26486e3e, 0x8bd78a70, 0x7477e4c1, 0xb506e07c, 0xf32d0a25,
-	0x79098b02, 0xe4eabb81, 0x28123b23,
-	0x69dead38, 0x1574ca16, 0xdf871b62, 0x211c40b7, 0xa51a9ef9,
-	0x0014377b, 0x041e8ac8, 0x09114003,
-	0xbd59e4d2, 0xe3d156d5, 0x4fe876d5, 0x2f91a340, 0x557be8de,
-	0x00eae4a7, 0x0ce5c2ec, 0x4db4bba6,
-	0xe756bdff, 0xdd3369ac, 0xec17b035, 0x06572327, 0x99afc8b0,
-	0x56c8c391, 0x6b65811c, 0x5e146119,
-	0x6e85cb75, 0xbe07c002, 0xc2325577, 0x893ff4ec, 0x5bbfc92d,
-	0xd0ec3b25, 0xb7801ab7, 0x8d6d3b24,
-	0x20c763ef, 0xc366a5fc, 0x9c382880, 0x0ace3205, 0xaac9548a,
-	0xeca1d7c7, 0x041afa32, 0x1d16625a,
-	0x6701902c, 0x9b757a54, 0x31d477f7, 0x9126b031, 0x36cc6fdb,
-	0xc70b8b46, 0xd9e66a48, 0x56e55a79,
-	0x026a4ceb, 0x52437eff, 0x2f8f76b4, 0x0df980a5, 0x8674cde3,
-	0xedda04eb, 0x17a9be04, 0x2c18f4df,
-	0xb7747f9d, 0xab2af7b4, 0xefc34d20, 0x2e096b7c, 0x1741a254,
-	0xe5b6a035, 0x213d42f6, 0x2c1c7c26,
-	0x61c2f50f, 0x6552daf9, 0xd2c231f8, 0x25130f69, 0xd8167fa2,
-	0x0418f2c8, 0x001a96a6, 0x0d1526ab,
-	0x63315c21, 0x5e0a72ec, 0x49bafefd, 0x187908d9, 0x8d0dbd86,
-	0x311170a7, 0x3e9b640c, 0xcc3e10d7,
-	0xd5cad3b6, 0x0caec388, 0xf73001e1, 0x6c728aff, 0x71eae2a1,
-	0x1f9af36e, 0xcfcbd12f, 0xc1de8417,
-	0xac07be6b, 0xcb44a1d8, 0x8b9b0f56, 0x013988c3, 0xb1c52fca,
-	0xb4be31cd, 0xd8782806, 0x12a3a4e2,
-	0x6f7de532, 0x58fd7eb6, 0xd01ee900, 0x24adffc2, 0xf4990fc5,
-	0x9711aac5, 0x001d7b95, 0x82e5e7d2,
-	0x109873f6, 0x00613096, 0xc32d9521, 0xada121ff, 0x29908415,
-	0x7fbb977f, 0xaf9eb3db, 0x29c9ed2a,
-	0x5ce2a465, 0xa730f32c, 0xd0aa3fe8, 0x8a5cc091, 0xd49e2ce7,
-	0x0ce454a9, 0xd60acd86, 0x015f1919,
-	0x77079103, 0xdea03af6, 0x78a8565e, 0xdee356df, 0x21f05cbe,
-	0x8b75e387, 0xb3c50651, 0xb8a5c3ef,
-	0xd8eeb6d2, 0xe523be77, 0xc2154529, 0x2f69efdf, 0xafe67afb,
-	0xf470c4b2, 0xf3e0eb5b, 0xd6cc9876,
-	0x39e4460c, 0x1fda8538, 0x1987832f, 0xca007367, 0xa99144f8,
-	0x296b299e, 0x492fc295, 0x9266beab,
-	0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee,
-	0xf65324e6, 0x6afce36c, 0x0316cc04,
-	0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979,
-	0x932bcdf6, 0xb657c34d, 0x4edfd282,
-	0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0,
-	0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2
-};
-EXPORT_SYMBOL_GPL(cast6_s4);
-
 static const u32 Tm[24][8] = {
 	{ 0x5a827999, 0xc95c653a, 0x383650db, 0xa7103c7c, 0x15ea281d,
 		0x84c413be, 0xf39dff5f, 0x6277eb00 } ,
diff --git a/crypto/cast_common.c b/crypto/cast_common.c
new file mode 100644
index 000000000000..a15f523d5f56
--- /dev/null
+++ b/crypto/cast_common.c
@@ -0,0 +1,290 @@
+/*
+ * Common lookup tables for CAST-128 (cast5) and CAST-256 (cast6)
+ *
+ * Copyright © 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ * Copyright © 2003 Kartikey Mahendra Bhatt <kartik_me@hotmail.com>
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <crypto/cast_common.h>
+
+const u32 cast_s1[256] = {
+	0x30fb40d4, 0x9fa0ff0b, 0x6beccd2f, 0x3f258c7a, 0x1e213f2f,
+	0x9c004dd3, 0x6003e540, 0xcf9fc949,
+	0xbfd4af27, 0x88bbbdb5, 0xe2034090, 0x98d09675, 0x6e63a0e0,
+	0x15c361d2, 0xc2e7661d, 0x22d4ff8e,
+	0x28683b6f, 0xc07fd059, 0xff2379c8, 0x775f50e2, 0x43c340d3,
+	0xdf2f8656, 0x887ca41a, 0xa2d2bd2d,
+	0xa1c9e0d6, 0x346c4819, 0x61b76d87, 0x22540f2f, 0x2abe32e1,
+	0xaa54166b, 0x22568e3a, 0xa2d341d0,
+	0x66db40c8, 0xa784392f, 0x004dff2f, 0x2db9d2de, 0x97943fac,
+	0x4a97c1d8, 0x527644b7, 0xb5f437a7,
+	0xb82cbaef, 0xd751d159, 0x6ff7f0ed, 0x5a097a1f, 0x827b68d0,
+	0x90ecf52e, 0x22b0c054, 0xbc8e5935,
+	0x4b6d2f7f, 0x50bb64a2, 0xd2664910, 0xbee5812d, 0xb7332290,
+	0xe93b159f, 0xb48ee411, 0x4bff345d,
+	0xfd45c240, 0xad31973f, 0xc4f6d02e, 0x55fc8165, 0xd5b1caad,
+	0xa1ac2dae, 0xa2d4b76d, 0xc19b0c50,
+	0x882240f2, 0x0c6e4f38, 0xa4e4bfd7, 0x4f5ba272, 0x564c1d2f,
+	0xc59c5319, 0xb949e354, 0xb04669fe,
+	0xb1b6ab8a, 0xc71358dd, 0x6385c545, 0x110f935d, 0x57538ad5,
+	0x6a390493, 0xe63d37e0, 0x2a54f6b3,
+	0x3a787d5f, 0x6276a0b5, 0x19a6fcdf, 0x7a42206a, 0x29f9d4d5,
+	0xf61b1891, 0xbb72275e, 0xaa508167,
+	0x38901091, 0xc6b505eb, 0x84c7cb8c, 0x2ad75a0f, 0x874a1427,
+	0xa2d1936b, 0x2ad286af, 0xaa56d291,
+	0xd7894360, 0x425c750d, 0x93b39e26, 0x187184c9, 0x6c00b32d,
+	0x73e2bb14, 0xa0bebc3c, 0x54623779,
+	0x64459eab, 0x3f328b82, 0x7718cf82, 0x59a2cea6, 0x04ee002e,
+	0x89fe78e6, 0x3fab0950, 0x325ff6c2,
+	0x81383f05, 0x6963c5c8, 0x76cb5ad6, 0xd49974c9, 0xca180dcf,
+	0x380782d5, 0xc7fa5cf6, 0x8ac31511,
+	0x35e79e13, 0x47da91d0, 0xf40f9086, 0xa7e2419e, 0x31366241,
+	0x051ef495, 0xaa573b04, 0x4a805d8d,
+	0x548300d0, 0x00322a3c, 0xbf64cddf, 0xba57a68e, 0x75c6372b,
+	0x50afd341, 0xa7c13275, 0x915a0bf5,
+	0x6b54bfab, 0x2b0b1426, 0xab4cc9d7, 0x449ccd82, 0xf7fbf265,
+	0xab85c5f3, 0x1b55db94, 0xaad4e324,
+	0xcfa4bd3f, 0x2deaa3e2, 0x9e204d02, 0xc8bd25ac, 0xeadf55b3,
+	0xd5bd9e98, 0xe31231b2, 0x2ad5ad6c,
+	0x954329de, 0xadbe4528, 0xd8710f69, 0xaa51c90f, 0xaa786bf6,
+	0x22513f1e, 0xaa51a79b, 0x2ad344cc,
+	0x7b5a41f0, 0xd37cfbad, 0x1b069505, 0x41ece491, 0xb4c332e6,
+	0x032268d4, 0xc9600acc, 0xce387e6d,
+	0xbf6bb16c, 0x6a70fb78, 0x0d03d9c9, 0xd4df39de, 0xe01063da,
+	0x4736f464, 0x5ad328d8, 0xb347cc96,
+	0x75bb0fc3, 0x98511bfb, 0x4ffbcc35, 0xb58bcf6a, 0xe11f0abc,
+	0xbfc5fe4a, 0xa70aec10, 0xac39570a,
+	0x3f04442f, 0x6188b153, 0xe0397a2e, 0x5727cb79, 0x9ceb418f,
+	0x1cacd68d, 0x2ad37c96, 0x0175cb9d,
+	0xc69dff09, 0xc75b65f0, 0xd9db40d8, 0xec0e7779, 0x4744ead4,
+	0xb11c3274, 0xdd24cb9e, 0x7e1c54bd,
+	0xf01144f9, 0xd2240eb1, 0x9675b3fd, 0xa3ac3755, 0xd47c27af,
+	0x51c85f4d, 0x56907596, 0xa5bb15e6,
+	0x580304f0, 0xca042cf1, 0x011a37ea, 0x8dbfaadb, 0x35ba3e4a,
+	0x3526ffa0, 0xc37b4d09, 0xbc306ed9,
+	0x98a52666, 0x5648f725, 0xff5e569d, 0x0ced63d0, 0x7c63b2cf,
+	0x700b45e1, 0xd5ea50f1, 0x85a92872,
+	0xaf1fbda7, 0xd4234870, 0xa7870bf3, 0x2d3b4d79, 0x42e04198,
+	0x0cd0ede7, 0x26470db8, 0xf881814c,
+	0x474d6ad7, 0x7c0c5e5c, 0xd1231959, 0x381b7298, 0xf5d2f4db,
+	0xab838653, 0x6e2f1e23, 0x83719c9e,
+	0xbd91e046, 0x9a56456e, 0xdc39200c, 0x20c8c571, 0x962bda1c,
+	0xe1e696ff, 0xb141ab08, 0x7cca89b9,
+	0x1a69e783, 0x02cc4843, 0xa2f7c579, 0x429ef47d, 0x427b169c,
+	0x5ac9f049, 0xdd8f0f00, 0x5c8165bf
+};
+EXPORT_SYMBOL_GPL(cast_s1);
+
+const u32 cast_s2[256] = {
+	0x1f201094, 0xef0ba75b, 0x69e3cf7e, 0x393f4380, 0xfe61cf7a,
+	0xeec5207a, 0x55889c94, 0x72fc0651,
+	0xada7ef79, 0x4e1d7235, 0xd55a63ce, 0xde0436ba, 0x99c430ef,
+	0x5f0c0794, 0x18dcdb7d, 0xa1d6eff3,
+	0xa0b52f7b, 0x59e83605, 0xee15b094, 0xe9ffd909, 0xdc440086,
+	0xef944459, 0xba83ccb3, 0xe0c3cdfb,
+	0xd1da4181, 0x3b092ab1, 0xf997f1c1, 0xa5e6cf7b, 0x01420ddb,
+	0xe4e7ef5b, 0x25a1ff41, 0xe180f806,
+	0x1fc41080, 0x179bee7a, 0xd37ac6a9, 0xfe5830a4, 0x98de8b7f,
+	0x77e83f4e, 0x79929269, 0x24fa9f7b,
+	0xe113c85b, 0xacc40083, 0xd7503525, 0xf7ea615f, 0x62143154,
+	0x0d554b63, 0x5d681121, 0xc866c359,
+	0x3d63cf73, 0xcee234c0, 0xd4d87e87, 0x5c672b21, 0x071f6181,
+	0x39f7627f, 0x361e3084, 0xe4eb573b,
+	0x602f64a4, 0xd63acd9c, 0x1bbc4635, 0x9e81032d, 0x2701f50c,
+	0x99847ab4, 0xa0e3df79, 0xba6cf38c,
+	0x10843094, 0x2537a95e, 0xf46f6ffe, 0xa1ff3b1f, 0x208cfb6a,
+	0x8f458c74, 0xd9e0a227, 0x4ec73a34,
+	0xfc884f69, 0x3e4de8df, 0xef0e0088, 0x3559648d, 0x8a45388c,
+	0x1d804366, 0x721d9bfd, 0xa58684bb,
+	0xe8256333, 0x844e8212, 0x128d8098, 0xfed33fb4, 0xce280ae1,
+	0x27e19ba5, 0xd5a6c252, 0xe49754bd,
+	0xc5d655dd, 0xeb667064, 0x77840b4d, 0xa1b6a801, 0x84db26a9,
+	0xe0b56714, 0x21f043b7, 0xe5d05860,
+	0x54f03084, 0x066ff472, 0xa31aa153, 0xdadc4755, 0xb5625dbf,
+	0x68561be6, 0x83ca6b94, 0x2d6ed23b,
+	0xeccf01db, 0xa6d3d0ba, 0xb6803d5c, 0xaf77a709, 0x33b4a34c,
+	0x397bc8d6, 0x5ee22b95, 0x5f0e5304,
+	0x81ed6f61, 0x20e74364, 0xb45e1378, 0xde18639b, 0x881ca122,
+	0xb96726d1, 0x8049a7e8, 0x22b7da7b,
+	0x5e552d25, 0x5272d237, 0x79d2951c, 0xc60d894c, 0x488cb402,
+	0x1ba4fe5b, 0xa4b09f6b, 0x1ca815cf,
+	0xa20c3005, 0x8871df63, 0xb9de2fcb, 0x0cc6c9e9, 0x0beeff53,
+	0xe3214517, 0xb4542835, 0x9f63293c,
+	0xee41e729, 0x6e1d2d7c, 0x50045286, 0x1e6685f3, 0xf33401c6,
+	0x30a22c95, 0x31a70850, 0x60930f13,
+	0x73f98417, 0xa1269859, 0xec645c44, 0x52c877a9, 0xcdff33a6,
+	0xa02b1741, 0x7cbad9a2, 0x2180036f,
+	0x50d99c08, 0xcb3f4861, 0xc26bd765, 0x64a3f6ab, 0x80342676,
+	0x25a75e7b, 0xe4e6d1fc, 0x20c710e6,
+	0xcdf0b680, 0x17844d3b, 0x31eef84d, 0x7e0824e4, 0x2ccb49eb,
+	0x846a3bae, 0x8ff77888, 0xee5d60f6,
+	0x7af75673, 0x2fdd5cdb, 0xa11631c1, 0x30f66f43, 0xb3faec54,
+	0x157fd7fa, 0xef8579cc, 0xd152de58,
+	0xdb2ffd5e, 0x8f32ce19, 0x306af97a, 0x02f03ef8, 0x99319ad5,
+	0xc242fa0f, 0xa7e3ebb0, 0xc68e4906,
+	0xb8da230c, 0x80823028, 0xdcdef3c8, 0xd35fb171, 0x088a1bc8,
+	0xbec0c560, 0x61a3c9e8, 0xbca8f54d,
+	0xc72feffa, 0x22822e99, 0x82c570b4, 0xd8d94e89, 0x8b1c34bc,
+	0x301e16e6, 0x273be979, 0xb0ffeaa6,
+	0x61d9b8c6, 0x00b24869, 0xb7ffce3f, 0x08dc283b, 0x43daf65a,
+	0xf7e19798, 0x7619b72f, 0x8f1c9ba4,
+	0xdc8637a0, 0x16a7d3b1, 0x9fc393b7, 0xa7136eeb, 0xc6bcc63e,
+	0x1a513742, 0xef6828bc, 0x520365d6,
+	0x2d6a77ab, 0x3527ed4b, 0x821fd216, 0x095c6e2e, 0xdb92f2fb,
+	0x5eea29cb, 0x145892f5, 0x91584f7f,
+	0x5483697b, 0x2667a8cc, 0x85196048, 0x8c4bacea, 0x833860d4,
+	0x0d23e0f9, 0x6c387e8a, 0x0ae6d249,
+	0xb284600c, 0xd835731d, 0xdcb1c647, 0xac4c56ea, 0x3ebd81b3,
+	0x230eabb0, 0x6438bc87, 0xf0b5b1fa,
+	0x8f5ea2b3, 0xfc184642, 0x0a036b7a, 0x4fb089bd, 0x649da589,
+	0xa345415e, 0x5c038323, 0x3e5d3bb9,
+	0x43d79572, 0x7e6dd07c, 0x06dfdf1e, 0x6c6cc4ef, 0x7160a539,
+	0x73bfbe70, 0x83877605, 0x4523ecf1
+};
+EXPORT_SYMBOL_GPL(cast_s2);
+
+const u32 cast_s3[256] = {
+	0x8defc240, 0x25fa5d9f, 0xeb903dbf, 0xe810c907, 0x47607fff,
+	0x369fe44b, 0x8c1fc644, 0xaececa90,
+	0xbeb1f9bf, 0xeefbcaea, 0xe8cf1950, 0x51df07ae, 0x920e8806,
+	0xf0ad0548, 0xe13c8d83, 0x927010d5,
+	0x11107d9f, 0x07647db9, 0xb2e3e4d4, 0x3d4f285e, 0xb9afa820,
+	0xfade82e0, 0xa067268b, 0x8272792e,
+	0x553fb2c0, 0x489ae22b, 0xd4ef9794, 0x125e3fbc, 0x21fffcee,
+	0x825b1bfd, 0x9255c5ed, 0x1257a240,
+	0x4e1a8302, 0xbae07fff, 0x528246e7, 0x8e57140e, 0x3373f7bf,
+	0x8c9f8188, 0xa6fc4ee8, 0xc982b5a5,
+	0xa8c01db7, 0x579fc264, 0x67094f31, 0xf2bd3f5f, 0x40fff7c1,
+	0x1fb78dfc, 0x8e6bd2c1, 0x437be59b,
+	0x99b03dbf, 0xb5dbc64b, 0x638dc0e6, 0x55819d99, 0xa197c81c,
+	0x4a012d6e, 0xc5884a28, 0xccc36f71,
+	0xb843c213, 0x6c0743f1, 0x8309893c, 0x0feddd5f, 0x2f7fe850,
+	0xd7c07f7e, 0x02507fbf, 0x5afb9a04,
+	0xa747d2d0, 0x1651192e, 0xaf70bf3e, 0x58c31380, 0x5f98302e,
+	0x727cc3c4, 0x0a0fb402, 0x0f7fef82,
+	0x8c96fdad, 0x5d2c2aae, 0x8ee99a49, 0x50da88b8, 0x8427f4a0,
+	0x1eac5790, 0x796fb449, 0x8252dc15,
+	0xefbd7d9b, 0xa672597d, 0xada840d8, 0x45f54504, 0xfa5d7403,
+	0xe83ec305, 0x4f91751a, 0x925669c2,
+	0x23efe941, 0xa903f12e, 0x60270df2, 0x0276e4b6, 0x94fd6574,
+	0x927985b2, 0x8276dbcb, 0x02778176,
+	0xf8af918d, 0x4e48f79e, 0x8f616ddf, 0xe29d840e, 0x842f7d83,
+	0x340ce5c8, 0x96bbb682, 0x93b4b148,
+	0xef303cab, 0x984faf28, 0x779faf9b, 0x92dc560d, 0x224d1e20,
+	0x8437aa88, 0x7d29dc96, 0x2756d3dc,
+	0x8b907cee, 0xb51fd240, 0xe7c07ce3, 0xe566b4a1, 0xc3e9615e,
+	0x3cf8209d, 0x6094d1e3, 0xcd9ca341,
+	0x5c76460e, 0x00ea983b, 0xd4d67881, 0xfd47572c, 0xf76cedd9,
+	0xbda8229c, 0x127dadaa, 0x438a074e,
+	0x1f97c090, 0x081bdb8a, 0x93a07ebe, 0xb938ca15, 0x97b03cff,
+	0x3dc2c0f8, 0x8d1ab2ec, 0x64380e51,
+	0x68cc7bfb, 0xd90f2788, 0x12490181, 0x5de5ffd4, 0xdd7ef86a,
+	0x76a2e214, 0xb9a40368, 0x925d958f,
+	0x4b39fffa, 0xba39aee9, 0xa4ffd30b, 0xfaf7933b, 0x6d498623,
+	0x193cbcfa, 0x27627545, 0x825cf47a,
+	0x61bd8ba0, 0xd11e42d1, 0xcead04f4, 0x127ea392, 0x10428db7,
+	0x8272a972, 0x9270c4a8, 0x127de50b,
+	0x285ba1c8, 0x3c62f44f, 0x35c0eaa5, 0xe805d231, 0x428929fb,
+	0xb4fcdf82, 0x4fb66a53, 0x0e7dc15b,
+	0x1f081fab, 0x108618ae, 0xfcfd086d, 0xf9ff2889, 0x694bcc11,
+	0x236a5cae, 0x12deca4d, 0x2c3f8cc5,
+	0xd2d02dfe, 0xf8ef5896, 0xe4cf52da, 0x95155b67, 0x494a488c,
+	0xb9b6a80c, 0x5c8f82bc, 0x89d36b45,
+	0x3a609437, 0xec00c9a9, 0x44715253, 0x0a874b49, 0xd773bc40,
+	0x7c34671c, 0x02717ef6, 0x4feb5536,
+	0xa2d02fff, 0xd2bf60c4, 0xd43f03c0, 0x50b4ef6d, 0x07478cd1,
+	0x006e1888, 0xa2e53f55, 0xb9e6d4bc,
+	0xa2048016, 0x97573833, 0xd7207d67, 0xde0f8f3d, 0x72f87b33,
+	0xabcc4f33, 0x7688c55d, 0x7b00a6b0,
+	0x947b0001, 0x570075d2, 0xf9bb88f8, 0x8942019e, 0x4264a5ff,
+	0x856302e0, 0x72dbd92b, 0xee971b69,
+	0x6ea22fde, 0x5f08ae2b, 0xaf7a616d, 0xe5c98767, 0xcf1febd2,
+	0x61efc8c2, 0xf1ac2571, 0xcc8239c2,
+	0x67214cb8, 0xb1e583d1, 0xb7dc3e62, 0x7f10bdce, 0xf90a5c38,
+	0x0ff0443d, 0x606e6dc6, 0x60543a49,
+	0x5727c148, 0x2be98a1d, 0x8ab41738, 0x20e1be24, 0xaf96da0f,
+	0x68458425, 0x99833be5, 0x600d457d,
+	0x282f9350, 0x8334b362, 0xd91d1120, 0x2b6d8da0, 0x642b1e31,
+	0x9c305a00, 0x52bce688, 0x1b03588a,
+	0xf7baefd5, 0x4142ed9c, 0xa4315c11, 0x83323ec5, 0xdfef4636,
+	0xa133c501, 0xe9d3531c, 0xee353783
+};
+EXPORT_SYMBOL_GPL(cast_s3);
+
+const u32 cast_s4[256] = {
+	0x9db30420, 0x1fb6e9de, 0xa7be7bef, 0xd273a298, 0x4a4f7bdb,
+	0x64ad8c57, 0x85510443, 0xfa020ed1,
+	0x7e287aff, 0xe60fb663, 0x095f35a1, 0x79ebf120, 0xfd059d43,
+	0x6497b7b1, 0xf3641f63, 0x241e4adf,
+	0x28147f5f, 0x4fa2b8cd, 0xc9430040, 0x0cc32220, 0xfdd30b30,
+	0xc0a5374f, 0x1d2d00d9, 0x24147b15,
+	0xee4d111a, 0x0fca5167, 0x71ff904c, 0x2d195ffe, 0x1a05645f,
+	0x0c13fefe, 0x081b08ca, 0x05170121,
+	0x80530100, 0xe83e5efe, 0xac9af4f8, 0x7fe72701, 0xd2b8ee5f,
+	0x06df4261, 0xbb9e9b8a, 0x7293ea25,
+	0xce84ffdf, 0xf5718801, 0x3dd64b04, 0xa26f263b, 0x7ed48400,
+	0x547eebe6, 0x446d4ca0, 0x6cf3d6f5,
+	0x2649abdf, 0xaea0c7f5, 0x36338cc1, 0x503f7e93, 0xd3772061,
+	0x11b638e1, 0x72500e03, 0xf80eb2bb,
+	0xabe0502e, 0xec8d77de, 0x57971e81, 0xe14f6746, 0xc9335400,
+	0x6920318f, 0x081dbb99, 0xffc304a5,
+	0x4d351805, 0x7f3d5ce3, 0xa6c866c6, 0x5d5bcca9, 0xdaec6fea,
+	0x9f926f91, 0x9f46222f, 0x3991467d,
+	0xa5bf6d8e, 0x1143c44f, 0x43958302, 0xd0214eeb, 0x022083b8,
+	0x3fb6180c, 0x18f8931e, 0x281658e6,
+	0x26486e3e, 0x8bd78a70, 0x7477e4c1, 0xb506e07c, 0xf32d0a25,
+	0x79098b02, 0xe4eabb81, 0x28123b23,
+	0x69dead38, 0x1574ca16, 0xdf871b62, 0x211c40b7, 0xa51a9ef9,
+	0x0014377b, 0x041e8ac8, 0x09114003,
+	0xbd59e4d2, 0xe3d156d5, 0x4fe876d5, 0x2f91a340, 0x557be8de,
+	0x00eae4a7, 0x0ce5c2ec, 0x4db4bba6,
+	0xe756bdff, 0xdd3369ac, 0xec17b035, 0x06572327, 0x99afc8b0,
+	0x56c8c391, 0x6b65811c, 0x5e146119,
+	0x6e85cb75, 0xbe07c002, 0xc2325577, 0x893ff4ec, 0x5bbfc92d,
+	0xd0ec3b25, 0xb7801ab7, 0x8d6d3b24,
+	0x20c763ef, 0xc366a5fc, 0x9c382880, 0x0ace3205, 0xaac9548a,
+	0xeca1d7c7, 0x041afa32, 0x1d16625a,
+	0x6701902c, 0x9b757a54, 0x31d477f7, 0x9126b031, 0x36cc6fdb,
+	0xc70b8b46, 0xd9e66a48, 0x56e55a79,
+	0x026a4ceb, 0x52437eff, 0x2f8f76b4, 0x0df980a5, 0x8674cde3,
+	0xedda04eb, 0x17a9be04, 0x2c18f4df,
+	0xb7747f9d, 0xab2af7b4, 0xefc34d20, 0x2e096b7c, 0x1741a254,
+	0xe5b6a035, 0x213d42f6, 0x2c1c7c26,
+	0x61c2f50f, 0x6552daf9, 0xd2c231f8, 0x25130f69, 0xd8167fa2,
+	0x0418f2c8, 0x001a96a6, 0x0d1526ab,
+	0x63315c21, 0x5e0a72ec, 0x49bafefd, 0x187908d9, 0x8d0dbd86,
+	0x311170a7, 0x3e9b640c, 0xcc3e10d7,
+	0xd5cad3b6, 0x0caec388, 0xf73001e1, 0x6c728aff, 0x71eae2a1,
+	0x1f9af36e, 0xcfcbd12f, 0xc1de8417,
+	0xac07be6b, 0xcb44a1d8, 0x8b9b0f56, 0x013988c3, 0xb1c52fca,
+	0xb4be31cd, 0xd8782806, 0x12a3a4e2,
+	0x6f7de532, 0x58fd7eb6, 0xd01ee900, 0x24adffc2, 0xf4990fc5,
+	0x9711aac5, 0x001d7b95, 0x82e5e7d2,
+	0x109873f6, 0x00613096, 0xc32d9521, 0xada121ff, 0x29908415,
+	0x7fbb977f, 0xaf9eb3db, 0x29c9ed2a,
+	0x5ce2a465, 0xa730f32c, 0xd0aa3fe8, 0x8a5cc091, 0xd49e2ce7,
+	0x0ce454a9, 0xd60acd86, 0x015f1919,
+	0x77079103, 0xdea03af6, 0x78a8565e, 0xdee356df, 0x21f05cbe,
+	0x8b75e387, 0xb3c50651, 0xb8a5c3ef,
+	0xd8eeb6d2, 0xe523be77, 0xc2154529, 0x2f69efdf, 0xafe67afb,
+	0xf470c4b2, 0xf3e0eb5b, 0xd6cc9876,
+	0x39e4460c, 0x1fda8538, 0x1987832f, 0xca007367, 0xa99144f8,
+	0x296b299e, 0x492fc295, 0x9266beab,
+	0xb5676e69, 0x9bd3ddda, 0xdf7e052f, 0xdb25701c, 0x1b5e51ee,
+	0xf65324e6, 0x6afce36c, 0x0316cc04,
+	0x8644213e, 0xb7dc59d0, 0x7965291f, 0xccd6fd43, 0x41823979,
+	0x932bcdf6, 0xb657c34d, 0x4edfd282,
+	0x7ae5290c, 0x3cb9536b, 0x851e20fe, 0x9833557e, 0x13ecf0b0,
+	0xd3ffb372, 0x3f85c5c1, 0x0aef7ed2
+};
+EXPORT_SYMBOL_GPL(cast_s4);
+
+MODULE_LICENSE("GPL");
diff --git a/include/crypto/cast5.h b/include/crypto/cast5.h
index 586183a0406e..14fbf39d6380 100644
--- a/include/crypto/cast5.h
+++ b/include/crypto/cast5.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/crypto.h>
+#include <crypto/cast_common.h>
 
 #define CAST5_BLOCK_SIZE 8
 #define CAST5_MIN_KEY_SIZE 5
@@ -19,9 +20,4 @@ int cast5_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen);
 void __cast5_encrypt(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
 void __cast5_decrypt(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
 
-extern const u32 cast5_s1[256];
-extern const u32 cast5_s2[256];
-extern const u32 cast5_s3[256];
-extern const u32 cast5_s4[256];
-
 #endif
diff --git a/include/crypto/cast6.h b/include/crypto/cast6.h
index 157af6f342c8..32b60eb8bd24 100644
--- a/include/crypto/cast6.h
+++ b/include/crypto/cast6.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/crypto.h>
+#include <crypto/cast_common.h>
 
 #define CAST6_BLOCK_SIZE 16
 #define CAST6_MIN_KEY_SIZE 16
@@ -20,9 +21,4 @@ int cast6_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen);
 void __cast6_encrypt(struct cast6_ctx *ctx, u8 *dst, const u8 *src);
 void __cast6_decrypt(struct cast6_ctx *ctx, u8 *dst, const u8 *src);
 
-extern const u32 cast6_s1[256];
-extern const u32 cast6_s2[256];
-extern const u32 cast6_s3[256];
-extern const u32 cast6_s4[256];
-
 #endif
diff --git a/include/crypto/cast_common.h b/include/crypto/cast_common.h
new file mode 100644
index 000000000000..b7df35cd9f0a
--- /dev/null
+++ b/include/crypto/cast_common.h
@@ -0,0 +1,9 @@
+#ifndef _CRYPTO_CAST_COMMON_H
+#define _CRYPTO_CAST_COMMON_H
+
+extern const u32 cast_s1[256];
+extern const u32 cast_s2[256];
+extern const u32 cast_s3[256];
+extern const u32 cast_s4[256];
+
+#endif
-- 
cgit v1.2.3-58-ga151