From 473946e674eb797940c0a1aea8a999bc8fa9fcff Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@horizon.com>
Date: Fri, 6 Jun 2014 23:08:58 -0400
Subject: crypto: crc32c-pclmul - Shrink K_table to 32-bit words

There's no need for the K_table to be made of 64-bit words.  For some
reason, the original authors didn't fully reduce the values modulo the
CRC32C polynomial, and so had some 33-bit values in there.  They can
all be reduced to 32 bits.

Doing that cuts the table size in half.  Since the code depends on both
pclmulq and crc32, SSE 4.1 is obviously present, so we can use pmovzxdq
to fetch it in the correct format.

This adds (measured on Ivy Bridge) 1 cycle per main loop iteration
(CRC of up to 3K bytes), less than 0.2%.  The hope is that the reduced
D-cache footprint will make up the loss in other code.

Two other related fixes:
* K_table is read-only, so belongs in .rodata, and
* There's no need for more than 8-byte alignment

Acked-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 281 +++++++++++++++---------------
 1 file changed, 139 insertions(+), 142 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index dbc4339b5417..26d49ebae040 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -72,6 +72,7 @@
 
 # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
 
+.text
 ENTRY(crc_pcl)
 #define    bufp		%rdi
 #define    bufp_dw	%edi
@@ -216,15 +217,11 @@ LABEL crc_ %i
 	## 4) Combine three results:
 	################################################################
 
-	lea	(K_table-16)(%rip), bufp	# first entry is for idx 1
+	lea	(K_table-8)(%rip), bufp		# first entry is for idx 1
 	shlq    $3, %rax			# rax *= 8
-	subq    %rax, tmp			# tmp -= rax*8
-	shlq    $1, %rax
-	subq    %rax, tmp			# tmp -= rax*16
-						# (total tmp -= rax*24)
-	addq    %rax, bufp
-
-	movdqa  (bufp), %xmm0			# 2 consts: K1:K2
+	pmovzxdq (bufp,%rax), %xmm0		# 2 consts: K1:K2
+	leal	(%eax,%eax,2), %eax		# rax *= 3 (total *24)
+	subq    %rax, tmp			# tmp -= rax*24
 
 	movq    crc_init, %xmm1			# CRC for block 1
 	PCLMULQDQ 0x00,%xmm0,%xmm1		# Multiply by K2
@@ -238,9 +235,9 @@ LABEL crc_ %i
 	mov     crc2, crc_init
 	crc32   %rax, crc_init
 
-################################################################
-## 5) Check for end:
-################################################################
+	################################################################
+	## 5) Check for end:
+	################################################################
 
 LABEL crc_ 0
 	mov     tmp, len
@@ -331,136 +328,136 @@ ENDPROC(crc_pcl)
 
 	################################################################
 	## PCLMULQDQ tables
-	## Table is 128 entries x 2 quad words each
+	## Table is 128 entries x 2 words (8 bytes) each
 	################################################################
-.data
-.align 64
+.section	.rodata, "a", %progbits
+.align 8
 K_table:
-        .quad 0x14cd00bd6,0x105ec76f0
-        .quad 0x0ba4fc28e,0x14cd00bd6
-        .quad 0x1d82c63da,0x0f20c0dfe
-        .quad 0x09e4addf8,0x0ba4fc28e
-        .quad 0x039d3b296,0x1384aa63a
-        .quad 0x102f9b8a2,0x1d82c63da
-        .quad 0x14237f5e6,0x01c291d04
-        .quad 0x00d3b6092,0x09e4addf8
-        .quad 0x0c96cfdc0,0x0740eef02
-        .quad 0x18266e456,0x039d3b296
-        .quad 0x0daece73e,0x0083a6eec
-        .quad 0x0ab7aff2a,0x102f9b8a2
-        .quad 0x1248ea574,0x1c1733996
-        .quad 0x083348832,0x14237f5e6
-        .quad 0x12c743124,0x02ad91c30
-        .quad 0x0b9e02b86,0x00d3b6092
-        .quad 0x018b33a4e,0x06992cea2
-        .quad 0x1b331e26a,0x0c96cfdc0
-        .quad 0x17d35ba46,0x07e908048
-        .quad 0x1bf2e8b8a,0x18266e456
-        .quad 0x1a3e0968a,0x11ed1f9d8
-        .quad 0x0ce7f39f4,0x0daece73e
-        .quad 0x061d82e56,0x0f1d0f55e
-        .quad 0x0d270f1a2,0x0ab7aff2a
-        .quad 0x1c3f5f66c,0x0a87ab8a8
-        .quad 0x12ed0daac,0x1248ea574
-        .quad 0x065863b64,0x08462d800
-        .quad 0x11eef4f8e,0x083348832
-        .quad 0x1ee54f54c,0x071d111a8
-        .quad 0x0b3e32c28,0x12c743124
-        .quad 0x0064f7f26,0x0ffd852c6
-        .quad 0x0dd7e3b0c,0x0b9e02b86
-        .quad 0x0f285651c,0x0dcb17aa4
-        .quad 0x010746f3c,0x018b33a4e
-        .quad 0x1c24afea4,0x0f37c5aee
-        .quad 0x0271d9844,0x1b331e26a
-        .quad 0x08e766a0c,0x06051d5a2
-        .quad 0x093a5f730,0x17d35ba46
-        .quad 0x06cb08e5c,0x11d5ca20e
-        .quad 0x06b749fb2,0x1bf2e8b8a
-        .quad 0x1167f94f2,0x021f3d99c
-        .quad 0x0cec3662e,0x1a3e0968a
-        .quad 0x19329634a,0x08f158014
-        .quad 0x0e6fc4e6a,0x0ce7f39f4
-        .quad 0x08227bb8a,0x1a5e82106
-        .quad 0x0b0cd4768,0x061d82e56
-        .quad 0x13c2b89c4,0x188815ab2
-        .quad 0x0d7a4825c,0x0d270f1a2
-        .quad 0x10f5ff2ba,0x105405f3e
-        .quad 0x00167d312,0x1c3f5f66c
-        .quad 0x0f6076544,0x0e9adf796
-        .quad 0x026f6a60a,0x12ed0daac
-        .quad 0x1a2adb74e,0x096638b34
-        .quad 0x19d34af3a,0x065863b64
-        .quad 0x049c3cc9c,0x1e50585a0
-        .quad 0x068bce87a,0x11eef4f8e
-        .quad 0x1524fa6c6,0x19f1c69dc
-        .quad 0x16cba8aca,0x1ee54f54c
-        .quad 0x042d98888,0x12913343e
-        .quad 0x1329d9f7e,0x0b3e32c28
-        .quad 0x1b1c69528,0x088f25a3a
-        .quad 0x02178513a,0x0064f7f26
-        .quad 0x0e0ac139e,0x04e36f0b0
-        .quad 0x0170076fa,0x0dd7e3b0c
-        .quad 0x141a1a2e2,0x0bd6f81f8
-        .quad 0x16ad828b4,0x0f285651c
-        .quad 0x041d17b64,0x19425cbba
-        .quad 0x1fae1cc66,0x010746f3c
-        .quad 0x1a75b4b00,0x18db37e8a
-        .quad 0x0f872e54c,0x1c24afea4
-        .quad 0x01e41e9fc,0x04c144932
-        .quad 0x086d8e4d2,0x0271d9844
-        .quad 0x160f7af7a,0x052148f02
-        .quad 0x05bb8f1bc,0x08e766a0c
-        .quad 0x0a90fd27a,0x0a3c6f37a
-        .quad 0x0b3af077a,0x093a5f730
-        .quad 0x04984d782,0x1d22c238e
-        .quad 0x0ca6ef3ac,0x06cb08e5c
-        .quad 0x0234e0b26,0x063ded06a
-        .quad 0x1d88abd4a,0x06b749fb2
-        .quad 0x04597456a,0x04d56973c
-        .quad 0x0e9e28eb4,0x1167f94f2
-        .quad 0x07b3ff57a,0x19385bf2e
-        .quad 0x0c9c8b782,0x0cec3662e
-        .quad 0x13a9cba9e,0x0e417f38a
-        .quad 0x093e106a4,0x19329634a
-        .quad 0x167001a9c,0x14e727980
-        .quad 0x1ddffc5d4,0x0e6fc4e6a
-        .quad 0x00df04680,0x0d104b8fc
-        .quad 0x02342001e,0x08227bb8a
-        .quad 0x00a2a8d7e,0x05b397730
-        .quad 0x168763fa6,0x0b0cd4768
-        .quad 0x1ed5a407a,0x0e78eb416
-        .quad 0x0d2c3ed1a,0x13c2b89c4
-        .quad 0x0995a5724,0x1641378f0
-        .quad 0x19b1afbc4,0x0d7a4825c
-        .quad 0x109ffedc0,0x08d96551c
-        .quad 0x0f2271e60,0x10f5ff2ba
-        .quad 0x00b0bf8ca,0x00bf80dd2
-        .quad 0x123888b7a,0x00167d312
-        .quad 0x1e888f7dc,0x18dcddd1c
-        .quad 0x002ee03b2,0x0f6076544
-        .quad 0x183e8d8fe,0x06a45d2b2
-        .quad 0x133d7a042,0x026f6a60a
-        .quad 0x116b0f50c,0x1dd3e10e8
-        .quad 0x05fabe670,0x1a2adb74e
-        .quad 0x130004488,0x0de87806c
-        .quad 0x000bcf5f6,0x19d34af3a
-        .quad 0x18f0c7078,0x014338754
-        .quad 0x017f27698,0x049c3cc9c
-        .quad 0x058ca5f00,0x15e3e77ee
-        .quad 0x1af900c24,0x068bce87a
-        .quad 0x0b5cfca28,0x0dd07448e
-        .quad 0x0ded288f8,0x1524fa6c6
-        .quad 0x059f229bc,0x1d8048348
-        .quad 0x06d390dec,0x16cba8aca
-        .quad 0x037170390,0x0a3e3e02c
-        .quad 0x06353c1cc,0x042d98888
-        .quad 0x0c4584f5c,0x0d73c7bea
-        .quad 0x1f16a3418,0x1329d9f7e
-        .quad 0x0531377e2,0x185137662
-        .quad 0x1d8d9ca7c,0x1b1c69528
-        .quad 0x0b25b29f2,0x18a08b5bc
-        .quad 0x19fb2a8b0,0x02178513a
-        .quad 0x1a08fe6ac,0x1da758ae0
-        .quad 0x045cddf4e,0x0e0ac139e
-        .quad 0x1a91647f2,0x169cf9eb0
-        .quad 0x1a0f717c4,0x0170076fa
+	.long 0x493c7d27, 0x00000001
+	.long 0xba4fc28e, 0x493c7d27
+	.long 0xddc0152b, 0xf20c0dfe
+	.long 0x9e4addf8, 0xba4fc28e
+	.long 0x39d3b296, 0x3da6d0cb
+	.long 0x0715ce53, 0xddc0152b
+	.long 0x47db8317, 0x1c291d04
+	.long 0x0d3b6092, 0x9e4addf8
+	.long 0xc96cfdc0, 0x740eef02
+	.long 0x878a92a7, 0x39d3b296
+	.long 0xdaece73e, 0x083a6eec
+	.long 0xab7aff2a, 0x0715ce53
+	.long 0x2162d385, 0xc49f4f67
+	.long 0x83348832, 0x47db8317
+	.long 0x299847d5, 0x2ad91c30
+	.long 0xb9e02b86, 0x0d3b6092
+	.long 0x18b33a4e, 0x6992cea2
+	.long 0xb6dd949b, 0xc96cfdc0
+	.long 0x78d9ccb7, 0x7e908048
+	.long 0xbac2fd7b, 0x878a92a7
+	.long 0xa60ce07b, 0x1b3d8f29
+	.long 0xce7f39f4, 0xdaece73e
+	.long 0x61d82e56, 0xf1d0f55e
+	.long 0xd270f1a2, 0xab7aff2a
+	.long 0xc619809d, 0xa87ab8a8
+	.long 0x2b3cac5d, 0x2162d385
+	.long 0x65863b64, 0x8462d800
+	.long 0x1b03397f, 0x83348832
+	.long 0xebb883bd, 0x71d111a8
+	.long 0xb3e32c28, 0x299847d5
+	.long 0x064f7f26, 0xffd852c6
+	.long 0xdd7e3b0c, 0xb9e02b86
+	.long 0xf285651c, 0xdcb17aa4
+	.long 0x10746f3c, 0x18b33a4e
+	.long 0xc7a68855, 0xf37c5aee
+	.long 0x271d9844, 0xb6dd949b
+	.long 0x8e766a0c, 0x6051d5a2
+	.long 0x93a5f730, 0x78d9ccb7
+	.long 0x6cb08e5c, 0x18b0d4ff
+	.long 0x6b749fb2, 0xbac2fd7b
+	.long 0x1393e203, 0x21f3d99c
+	.long 0xcec3662e, 0xa60ce07b
+	.long 0x96c515bb, 0x8f158014
+	.long 0xe6fc4e6a, 0xce7f39f4
+	.long 0x8227bb8a, 0xa00457f7
+	.long 0xb0cd4768, 0x61d82e56
+	.long 0x39c7ff35, 0x8d6d2c43
+	.long 0xd7a4825c, 0xd270f1a2
+	.long 0x0ab3844b, 0x00ac29cf
+	.long 0x0167d312, 0xc619809d
+	.long 0xf6076544, 0xe9adf796
+	.long 0x26f6a60a, 0x2b3cac5d
+	.long 0xa741c1bf, 0x96638b34
+	.long 0x98d8d9cb, 0x65863b64
+	.long 0x49c3cc9c, 0xe0e9f351
+	.long 0x68bce87a, 0x1b03397f
+	.long 0x57a3d037, 0x9af01f2d
+	.long 0x6956fc3b, 0xebb883bd
+	.long 0x42d98888, 0x2cff42cf
+	.long 0x3771e98f, 0xb3e32c28
+	.long 0xb42ae3d9, 0x88f25a3a
+	.long 0x2178513a, 0x064f7f26
+	.long 0xe0ac139e, 0x4e36f0b0
+	.long 0x170076fa, 0xdd7e3b0c
+	.long 0x444dd413, 0xbd6f81f8
+	.long 0x6f345e45, 0xf285651c
+	.long 0x41d17b64, 0x91c9bd4b
+	.long 0xff0dba97, 0x10746f3c
+	.long 0xa2b73df1, 0x885f087b
+	.long 0xf872e54c, 0xc7a68855
+	.long 0x1e41e9fc, 0x4c144932
+	.long 0x86d8e4d2, 0x271d9844
+	.long 0x651bd98b, 0x52148f02
+	.long 0x5bb8f1bc, 0x8e766a0c
+	.long 0xa90fd27a, 0xa3c6f37a
+	.long 0xb3af077a, 0x93a5f730
+	.long 0x4984d782, 0xd7c0557f
+	.long 0xca6ef3ac, 0x6cb08e5c
+	.long 0x234e0b26, 0x63ded06a
+	.long 0xdd66cbbb, 0x6b749fb2
+	.long 0x4597456a, 0x4d56973c
+	.long 0xe9e28eb4, 0x1393e203
+	.long 0x7b3ff57a, 0x9669c9df
+	.long 0xc9c8b782, 0xcec3662e
+	.long 0x3f70cc6f, 0xe417f38a
+	.long 0x93e106a4, 0x96c515bb
+	.long 0x62ec6c6d, 0x4b9e0f71
+	.long 0xd813b325, 0xe6fc4e6a
+	.long 0x0df04680, 0xd104b8fc
+	.long 0x2342001e, 0x8227bb8a
+	.long 0x0a2a8d7e, 0x5b397730
+	.long 0x6d9a4957, 0xb0cd4768
+	.long 0xe8b6368b, 0xe78eb416
+	.long 0xd2c3ed1a, 0x39c7ff35
+	.long 0x995a5724, 0x61ff0e01
+	.long 0x9ef68d35, 0xd7a4825c
+	.long 0x0c139b31, 0x8d96551c
+	.long 0xf2271e60, 0x0ab3844b
+	.long 0x0b0bf8ca, 0x0bf80dd2
+	.long 0x2664fd8b, 0x0167d312
+	.long 0xed64812d, 0x8821abed
+	.long 0x02ee03b2, 0xf6076544
+	.long 0x8604ae0f, 0x6a45d2b2
+	.long 0x363bd6b3, 0x26f6a60a
+	.long 0x135c83fd, 0xd8d26619
+	.long 0x5fabe670, 0xa741c1bf
+	.long 0x35ec3279, 0xde87806c
+	.long 0x00bcf5f6, 0x98d8d9cb
+	.long 0x8ae00689, 0x14338754
+	.long 0x17f27698, 0x49c3cc9c
+	.long 0x58ca5f00, 0x5bd2011f
+	.long 0xaa7c7ad5, 0x68bce87a
+	.long 0xb5cfca28, 0xdd07448e
+	.long 0xded288f8, 0x57a3d037
+	.long 0x59f229bc, 0xdde8f5b9
+	.long 0x6d390dec, 0x6956fc3b
+	.long 0x37170390, 0xa3e3e02c
+	.long 0x6353c1cc, 0x42d98888
+	.long 0xc4584f5c, 0xd73c7bea
+	.long 0xf48642e9, 0x3771e98f
+	.long 0x531377e2, 0x80ff0093
+	.long 0xdd35bc8d, 0xb42ae3d9
+	.long 0xb25b29f2, 0x8fe4c34d
+	.long 0x9a5ede41, 0x2178513a
+	.long 0xa563905d, 0xdf99fc11
+	.long 0x45cddf4e, 0xe0ac139e
+	.long 0xacfa3103, 0x6c23e841
+	.long 0xa51b6135, 0x170076fa
-- 
cgit v1.2.3-58-ga151


From 6574e6c64e971c9adb629e81e497afdb52b1c9df Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Mon, 9 Jun 2014 20:59:54 +0300
Subject: crypto: des_3des - add x86-64 assembly implementation

This patch adds an x86_64 assembly implementation of the Triple DES EDE cipher
algorithm. Two assembly implementations are provided. The first is a regular
'one block at a time' encrypt/decrypt function. The second is a 'three blocks
at a time' function that gains a performance increase on out-of-order CPUs.

tcrypt test results:

Intel Core i5-4570:

des3_ede-asm vs des3_ede-generic:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     1.21x   1.22x   1.27x   1.36x   1.25x   1.25x
64B     1.98x   1.96x   1.23x   2.04x   2.01x   2.00x
256B    2.34x   2.37x   1.21x   2.40x   2.38x   2.39x
1024B   2.50x   2.47x   1.22x   2.51x   2.52x   2.51x
8192B   2.51x   2.53x   1.21x   2.56x   2.54x   2.55x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile          |   2 +
 arch/x86/crypto/des3_ede-asm_64.S | 805 ++++++++++++++++++++++++++++++++++++++
 arch/x86/crypto/des3_ede_glue.c   | 509 ++++++++++++++++++++++++
 crypto/Kconfig                    |  13 +
 crypto/des_generic.c              |  22 +-
 include/crypto/des.h              |   3 +
 6 files changed, 1349 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/crypto/des3_ede-asm_64.S
 create mode 100644 arch/x86/crypto/des3_ede_glue.c

(limited to 'arch')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 61d6e281898b..a470de25570f 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
@@ -52,6 +53,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
+des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
new file mode 100644
index 000000000000..038f6ae87c5e
--- /dev/null
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -0,0 +1,805 @@
+/*
+ * des3_ede-asm_64.S  -  x86-64 assembly implementation of 3DES cipher
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/linkage.h>
+
+.file "des3_ede-asm_64.S"
+.text
+
+#define s1 .L_s1
+#define s2 ((s1) + (64*8))
+#define s3 ((s2) + (64*8))
+#define s4 ((s3) + (64*8))
+#define s5 ((s4) + (64*8))
+#define s6 ((s5) + (64*8))
+#define s7 ((s6) + (64*8))
+#define s8 ((s7) + (64*8))
+
+/* register macros */
+#define CTX %rdi
+
+#define RL0 %r8
+#define RL1 %r9
+#define RL2 %r10
+
+#define RL0d %r8d
+#define RL1d %r9d
+#define RL2d %r10d
+
+#define RR0 %r11
+#define RR1 %r12
+#define RR2 %r13
+
+#define RR0d %r11d
+#define RR1d %r12d
+#define RR2d %r13d
+
+#define RW0 %rax
+#define RW1 %rbx
+#define RW2 %rcx
+
+#define RW0d %eax
+#define RW1d %ebx
+#define RW2d %ecx
+
+#define RW0bl %al
+#define RW1bl %bl
+#define RW2bl %cl
+
+#define RW0bh %ah
+#define RW1bh %bh
+#define RW2bh %ch
+
+#define RT0 %r15
+#define RT1 %rbp
+#define RT2 %r14
+#define RT3 %rdx
+
+#define RT0d %r15d
+#define RT1d %ebp
+#define RT2d %r14d
+#define RT3d %edx
+
+/***********************************************************************
+ * 1-way 3DES
+ ***********************************************************************/
+#define do_permutation(a, b, offset, mask) /* masked bit swap between (a >> offset) and b; clobbers RT0 */ \
+	movl a, RT0d; \
+	shrl $(offset), RT0d; /* t = a >> offset */ \
+	xorl b, RT0d; \
+	andl $(mask), RT0d; /* t = ((a >> offset) ^ b) & mask */ \
+	xorl RT0d, b; /* b ^= t */ \
+	shll $(offset), RT0d; \
+	xorl RT0d, a; /* a ^= t << offset */
+
+#define expand_to_64bits(val, mask) /* val = ((ror32(val##d, 4) << 32) | val) & mask; clobbers RT0 */ \
+	movl val##d, RT0d; \
+	rorl $4, RT0d; \
+	shlq $32, RT0; /* rotated copy moves to bits 63:32 */ \
+	orq RT0, val; /* callers last wrote val with 32-bit ops, so its upper half is zero here */ \
+	andq mask, val;
+
+#define compress_to_64bits(val) /* val##d |= rol32(val >> 32, 4); despite the name the result is 32-bit */ \
+	movq val, RT0; \
+	shrq $32, RT0; \
+	roll $4, RT0d; /* inverse of the rorl $4 in expand_to_64bits */ \
+	orl RT0d, val##d; /* 32-bit destination: bits 63:32 of val end up zero */
+
+#define initial_permutation(left, right) \
+	do_permutation(left##d, right##d,  4, 0x0f0f0f0f); \
+	do_permutation(left##d, right##d, 16, 0x0000ffff); \
+	do_permutation(right##d, left##d,  2, 0x33333333); \
+	do_permutation(right##d, left##d,  8, 0x00ff00ff); \
+	movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+	movl left##d, RW0d; \
+	roll $1, right##d; \
+	xorl right##d, RW0d; \
+	andl $0xaaaaaaaa, RW0d; \
+	xorl RW0d, left##d; \
+	xorl RW0d, right##d; \
+	roll $1, left##d; \
+	expand_to_64bits(right, RT3); \
+	expand_to_64bits(left, RT3);
+
+#define final_permutation(left, right) \
+	compress_to_64bits(right); \
+	compress_to_64bits(left); \
+	movl right##d, RW0d; \
+	rorl $1, left##d; \
+	xorl left##d, RW0d; \
+	andl $0xaaaaaaaa, RW0d; \
+	xorl RW0d, right##d; \
+	xorl RW0d, left##d; \
+	rorl $1, right##d; \
+	do_permutation(right##d, left##d,  8, 0x00ff00ff); \
+	do_permutation(right##d, left##d,  2, 0x33333333); \
+	do_permutation(left##d, right##d, 16, 0x0000ffff); \
+	do_permutation(left##d, right##d,  4, 0x0f0f0f0f);
+
+#define round1(n, from, to, load_next_key) \
+	xorq from, RW0; /* RW0 holds round key n on entry; each byte of from ^ key indexes one table */ \
+	\
+	movzbl RW0bl, RT0d; /* byte 0 -> index into s8 */ \
+	movzbl RW0bh, RT1d; /* byte 1 -> index into s6 */ \
+	shrq $16, RW0; \
+	movzbl RW0bl, RT2d; /* byte 2 -> index into s4 */ \
+	movzbl RW0bh, RT3d; /* byte 3 -> index into s2 */ \
+	shrq $16, RW0; \
+	movq s8(, RT0, 8), RT0; /* RT0 becomes a second XOR accumulator next to 'to' */ \
+	xorq s6(, RT1, 8), to; \
+	movzbl RW0bl, RL1d; /* byte 4 -> index into s7; RL1 is used as scratch here */ \
+	movzbl RW0bh, RT1d; /* byte 5 -> index into s5 */ \
+	shrl $16, RW0d; \
+	xorq s4(, RT2, 8), RT0; \
+	xorq s2(, RT3, 8), to; \
+	movzbl RW0bl, RT2d; /* byte 6 -> index into s3 */ \
+	movzbl RW0bh, RT3d; /* byte 7 -> index into s1 */ \
+	xorq s7(, RL1, 8), RT0; \
+	xorq s5(, RT1, 8), to; \
+	xorq s3(, RT2, 8), RT0; \
+	load_next_key(n, RW0); /* all bytes extracted: RW0 may be reloaded with key n + 1 */ \
+	xorq RT0, to; /* fold the second accumulator into 'to' */ \
+	xorq s1(, RT3, 8), to; \
+
+#define load_next_key(n, RWx) /* RWx = 8-byte key slot (n + 1) of the array at CTX */ \
+	movq (((n) + 1) * 8)(CTX), RWx;
+
+#define dummy2(a, b) /* expands to nothing; passed as a macro argument on the final round to skip the key load/copy */
+
+#define read_block(io, left, right) /* load the 8-byte block at (io) as two 32-bit words */ \
+	movl    (io), left##d; \
+	movl   4(io), right##d; \
+	bswapl left##d; /* byte-swap: the block words are treated as big-endian */ \
+	bswapl right##d;
+
+#define write_block(io, left, right) /* store two 32-bit words to the 8-byte block at (io) */ \
+	bswapl left##d; /* byte-swap back to the in-memory (big-endian) word order */ \
+	bswapl right##d; \
+	movl   left##d,   (io); \
+	movl   right##d, 4(io);
+
+ENTRY(des3_ede_x86_64_crypt_blk)
+	/* input:
+	 *	%rdi: round keys, CTX (8-byte key slots 0..47 are read)
+	 *	%rsi: dst (one 8-byte block is written)
+	 *	%rdx: src (one 8-byte block is read)
+	 */
+	pushq %rbp; /* rbp, rbx, r12-r15 are saved here and restored before ret */
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+	pushq %r15;
+
+	read_block(%rdx, RL0, RR0); /* src must be consumed first: %rdx doubles as RT3 */
+	initial_permutation(RL0, RR0);
+
+	movq (CTX), RW0; /* key slot 0; each round1 then fetches the slot for the next round */
+
+	round1(0, RR0, RL0, load_next_key); /* rounds 0..15: from = RR0, to = RL0 on even rounds */
+	round1(1, RL0, RR0, load_next_key);
+	round1(2, RR0, RL0, load_next_key);
+	round1(3, RL0, RR0, load_next_key);
+	round1(4, RR0, RL0, load_next_key);
+	round1(5, RL0, RR0, load_next_key);
+	round1(6, RR0, RL0, load_next_key);
+	round1(7, RL0, RR0, load_next_key);
+	round1(8, RR0, RL0, load_next_key);
+	round1(9, RL0, RR0, load_next_key);
+	round1(10, RR0, RL0, load_next_key);
+	round1(11, RL0, RR0, load_next_key);
+	round1(12, RR0, RL0, load_next_key);
+	round1(13, RL0, RR0, load_next_key);
+	round1(14, RR0, RL0, load_next_key);
+	round1(15, RL0, RR0, load_next_key);
+
+	round1(16+0, RL0, RR0, load_next_key); /* rounds 16..31: left/right roles swapped vs. the first group */
+	round1(16+1, RR0, RL0, load_next_key);
+	round1(16+2, RL0, RR0, load_next_key);
+	round1(16+3, RR0, RL0, load_next_key);
+	round1(16+4, RL0, RR0, load_next_key);
+	round1(16+5, RR0, RL0, load_next_key);
+	round1(16+6, RL0, RR0, load_next_key);
+	round1(16+7, RR0, RL0, load_next_key);
+	round1(16+8, RL0, RR0, load_next_key);
+	round1(16+9, RR0, RL0, load_next_key);
+	round1(16+10, RL0, RR0, load_next_key);
+	round1(16+11, RR0, RL0, load_next_key);
+	round1(16+12, RL0, RR0, load_next_key);
+	round1(16+13, RR0, RL0, load_next_key);
+	round1(16+14, RL0, RR0, load_next_key);
+	round1(16+15, RR0, RL0, load_next_key);
+
+	round1(32+0, RR0, RL0, load_next_key); /* rounds 32..47: same left/right order as the first group */
+	round1(32+1, RL0, RR0, load_next_key);
+	round1(32+2, RR0, RL0, load_next_key);
+	round1(32+3, RL0, RR0, load_next_key);
+	round1(32+4, RR0, RL0, load_next_key);
+	round1(32+5, RL0, RR0, load_next_key);
+	round1(32+6, RR0, RL0, load_next_key);
+	round1(32+7, RL0, RR0, load_next_key);
+	round1(32+8, RR0, RL0, load_next_key);
+	round1(32+9, RL0, RR0, load_next_key);
+	round1(32+10, RR0, RL0, load_next_key);
+	round1(32+11, RL0, RR0, load_next_key);
+	round1(32+12, RR0, RL0, load_next_key);
+	round1(32+13, RL0, RR0, load_next_key);
+	round1(32+14, RR0, RL0, load_next_key);
+	round1(32+15, RL0, RR0, dummy2); /* last round: dummy2 suppresses the load of key slot 48 */
+
+	final_permutation(RR0, RL0); /* halves are passed in swapped order relative to the input */
+	write_block(%rsi, RR0, RL0);
+
+	popq %r15;
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+
+	ret;
+ENDPROC(des3_ede_x86_64_crypt_blk)
+
+/***********************************************************************
+ * 3-way 3DES
+ ***********************************************************************/
+#define expand_to_64bits(val, mask) \
+	movl val##d, RT0d; \
+	rorl $4, RT0d; \
+	shlq $32, RT0; \
+	orq RT0, val; \
+	andq mask, val;
+
+#define compress_to_64bits(val) \
+	movq val, RT0; \
+	shrq $32, RT0; \
+	roll $4, RT0d; \
+	orl RT0d, val##d;
+
+#define initial_permutation3(left, right) \
+	do_permutation(left##0d, right##0d,  4, 0x0f0f0f0f); \
+	do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+	  do_permutation(left##1d, right##1d,  4, 0x0f0f0f0f); \
+	  do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+	    do_permutation(left##2d, right##2d,  4, 0x0f0f0f0f); \
+	    do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+	    \
+	do_permutation(right##0d, left##0d,  2, 0x33333333); \
+	do_permutation(right##0d, left##0d,  8, 0x00ff00ff); \
+	  do_permutation(right##1d, left##1d,  2, 0x33333333); \
+	  do_permutation(right##1d, left##1d,  8, 0x00ff00ff); \
+	    do_permutation(right##2d, left##2d,  2, 0x33333333); \
+	    do_permutation(right##2d, left##2d,  8, 0x00ff00ff); \
+	    \
+	movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+	    \
+	movl left##0d, RW0d; \
+	roll $1, right##0d; \
+	xorl right##0d, RW0d; \
+	andl $0xaaaaaaaa, RW0d; \
+	xorl RW0d, left##0d; \
+	xorl RW0d, right##0d; \
+	roll $1, left##0d; \
+	expand_to_64bits(right##0, RT3); \
+	expand_to_64bits(left##0, RT3); \
+	  movl left##1d, RW1d; \
+	  roll $1, right##1d; \
+	  xorl right##1d, RW1d; \
+	  andl $0xaaaaaaaa, RW1d; \
+	  xorl RW1d, left##1d; \
+	  xorl RW1d, right##1d; \
+	  roll $1, left##1d; \
+	  expand_to_64bits(right##1, RT3); \
+	  expand_to_64bits(left##1, RT3); \
+	    movl left##2d, RW2d; \
+	    roll $1, right##2d; \
+	    xorl right##2d, RW2d; \
+	    andl $0xaaaaaaaa, RW2d; \
+	    xorl RW2d, left##2d; \
+	    xorl RW2d, right##2d; \
+	    roll $1, left##2d; \
+	    expand_to_64bits(right##2, RT3); \
+	    expand_to_64bits(left##2, RT3);
+
+#define final_permutation3(left, right) \
+	compress_to_64bits(right##0); \
+	compress_to_64bits(left##0); \
+	movl right##0d, RW0d; \
+	rorl $1, left##0d; \
+	xorl left##0d, RW0d; \
+	andl $0xaaaaaaaa, RW0d; \
+	xorl RW0d, right##0d; \
+	xorl RW0d, left##0d; \
+	rorl $1, right##0d; \
+	  compress_to_64bits(right##1); \
+	  compress_to_64bits(left##1); \
+	  movl right##1d, RW1d; \
+	  rorl $1, left##1d; \
+	  xorl left##1d, RW1d; \
+	  andl $0xaaaaaaaa, RW1d; \
+	  xorl RW1d, right##1d; \
+	  xorl RW1d, left##1d; \
+	  rorl $1, right##1d; \
+	    compress_to_64bits(right##2); \
+	    compress_to_64bits(left##2); \
+	    movl right##2d, RW2d; \
+	    rorl $1, left##2d; \
+	    xorl left##2d, RW2d; \
+	    andl $0xaaaaaaaa, RW2d; \
+	    xorl RW2d, right##2d; \
+	    xorl RW2d, left##2d; \
+	    rorl $1, right##2d; \
+	    \
+	do_permutation(right##0d, left##0d,  8, 0x00ff00ff); \
+	do_permutation(right##0d, left##0d,  2, 0x33333333); \
+	  do_permutation(right##1d, left##1d,  8, 0x00ff00ff); \
+	  do_permutation(right##1d, left##1d,  2, 0x33333333); \
+	    do_permutation(right##2d, left##2d,  8, 0x00ff00ff); \
+	    do_permutation(right##2d, left##2d,  2, 0x33333333); \
+	    \
+	do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+	do_permutation(left##0d, right##0d,  4, 0x0f0f0f0f); \
+	  do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+	  do_permutation(left##1d, right##1d,  4, 0x0f0f0f0f); \
+	    do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+	    do_permutation(left##2d, right##2d,  4, 0x0f0f0f0f);
+
+#define round3(n, from, to, load_next_key, do_movq) \
+	xorq from##0, RW0; \
+	movzbl RW0bl, RT3d; \
+	movzbl RW0bh, RT1d; \
+	shrq $16, RW0; \
+	xorq s8(, RT3, 8), to##0; \
+	xorq s6(, RT1, 8), to##0; \
+	movzbl RW0bl, RT3d; \
+	movzbl RW0bh, RT1d; \
+	shrq $16, RW0; \
+	xorq s4(, RT3, 8), to##0; \
+	xorq s2(, RT1, 8), to##0; \
+	movzbl RW0bl, RT3d; \
+	movzbl RW0bh, RT1d; \
+	shrl $16, RW0d; \
+	xorq s7(, RT3, 8), to##0; \
+	xorq s5(, RT1, 8), to##0; \
+	movzbl RW0bl, RT3d; \
+	movzbl RW0bh, RT1d; \
+	load_next_key(n, RW0); \
+	xorq s3(, RT3, 8), to##0; \
+	xorq s1(, RT1, 8), to##0; \
+		xorq from##1, RW1; \
+		movzbl RW1bl, RT3d; \
+		movzbl RW1bh, RT1d; \
+		shrq $16, RW1; \
+		xorq s8(, RT3, 8), to##1; \
+		xorq s6(, RT1, 8), to##1; \
+		movzbl RW1bl, RT3d; \
+		movzbl RW1bh, RT1d; \
+		shrq $16, RW1; \
+		xorq s4(, RT3, 8), to##1; \
+		xorq s2(, RT1, 8), to##1; \
+		movzbl RW1bl, RT3d; \
+		movzbl RW1bh, RT1d; \
+		shrl $16, RW1d; \
+		xorq s7(, RT3, 8), to##1; \
+		xorq s5(, RT1, 8), to##1; \
+		movzbl RW1bl, RT3d; \
+		movzbl RW1bh, RT1d; \
+		do_movq(RW0, RW1); \
+		xorq s3(, RT3, 8), to##1; \
+		xorq s1(, RT1, 8), to##1; \
+			xorq from##2, RW2; \
+			movzbl RW2bl, RT3d; \
+			movzbl RW2bh, RT1d; \
+			shrq $16, RW2; \
+			xorq s8(, RT3, 8), to##2; \
+			xorq s6(, RT1, 8), to##2; \
+			movzbl RW2bl, RT3d; \
+			movzbl RW2bh, RT1d; \
+			shrq $16, RW2; \
+			xorq s4(, RT3, 8), to##2; \
+			xorq s2(, RT1, 8), to##2; \
+			movzbl RW2bl, RT3d; \
+			movzbl RW2bh, RT1d; \
+			shrl $16, RW2d; \
+			xorq s7(, RT3, 8), to##2; \
+			xorq s5(, RT1, 8), to##2; \
+			movzbl RW2bl, RT3d; \
+			movzbl RW2bh, RT1d; \
+			do_movq(RW0, RW2); \
+			xorq s3(, RT3, 8), to##2; \
+			xorq s1(, RT1, 8), to##2;
+
+#define __movq(src, dst) \
+	movq src, dst;
+
+ENTRY(des3_ede_x86_64_crypt_blk_3way)
+	/* input:
+	 *	%rdi: ctx, round keys
+	 *	%rsi: dst (3 blocks)
+	 *	%rdx: src (3 blocks)
+	 */
+
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+	pushq %r15;
+
+	/* load input */
+	movl 0 * 4(%rdx), RL0d;
+	movl 1 * 4(%rdx), RR0d;
+	movl 2 * 4(%rdx), RL1d;
+	movl 3 * 4(%rdx), RR1d;
+	movl 4 * 4(%rdx), RL2d;
+	movl 5 * 4(%rdx), RR2d;
+
+	bswapl RL0d;
+	bswapl RR0d;
+	bswapl RL1d;
+	bswapl RR1d;
+	bswapl RL2d;
+	bswapl RR2d;
+
+	initial_permutation3(RL, RR);
+
+	movq 0(CTX), RW0;
+	movq RW0, RW1;
+	movq RW0, RW2;
+
+	round3(0, RR, RL, load_next_key, __movq);
+	round3(1, RL, RR, load_next_key, __movq);
+	round3(2, RR, RL, load_next_key, __movq);
+	round3(3, RL, RR, load_next_key, __movq);
+	round3(4, RR, RL, load_next_key, __movq);
+	round3(5, RL, RR, load_next_key, __movq);
+	round3(6, RR, RL, load_next_key, __movq);
+	round3(7, RL, RR, load_next_key, __movq);
+	round3(8, RR, RL, load_next_key, __movq);
+	round3(9, RL, RR, load_next_key, __movq);
+	round3(10, RR, RL, load_next_key, __movq);
+	round3(11, RL, RR, load_next_key, __movq);
+	round3(12, RR, RL, load_next_key, __movq);
+	round3(13, RL, RR, load_next_key, __movq);
+	round3(14, RR, RL, load_next_key, __movq);
+	round3(15, RL, RR, load_next_key, __movq);
+
+	round3(16+0, RL, RR, load_next_key, __movq);
+	round3(16+1, RR, RL, load_next_key, __movq);
+	round3(16+2, RL, RR, load_next_key, __movq);
+	round3(16+3, RR, RL, load_next_key, __movq);
+	round3(16+4, RL, RR, load_next_key, __movq);
+	round3(16+5, RR, RL, load_next_key, __movq);
+	round3(16+6, RL, RR, load_next_key, __movq);
+	round3(16+7, RR, RL, load_next_key, __movq);
+	round3(16+8, RL, RR, load_next_key, __movq);
+	round3(16+9, RR, RL, load_next_key, __movq);
+	round3(16+10, RL, RR, load_next_key, __movq);
+	round3(16+11, RR, RL, load_next_key, __movq);
+	round3(16+12, RL, RR, load_next_key, __movq);
+	round3(16+13, RR, RL, load_next_key, __movq);
+	round3(16+14, RL, RR, load_next_key, __movq);
+	round3(16+15, RR, RL, load_next_key, __movq);
+
+	round3(32+0, RR, RL, load_next_key, __movq);
+	round3(32+1, RL, RR, load_next_key, __movq);
+	round3(32+2, RR, RL, load_next_key, __movq);
+	round3(32+3, RL, RR, load_next_key, __movq);
+	round3(32+4, RR, RL, load_next_key, __movq);
+	round3(32+5, RL, RR, load_next_key, __movq);
+	round3(32+6, RR, RL, load_next_key, __movq);
+	round3(32+7, RL, RR, load_next_key, __movq);
+	round3(32+8, RR, RL, load_next_key, __movq);
+	round3(32+9, RL, RR, load_next_key, __movq);
+	round3(32+10, RR, RL, load_next_key, __movq);
+	round3(32+11, RL, RR, load_next_key, __movq);
+	round3(32+12, RR, RL, load_next_key, __movq);
+	round3(32+13, RL, RR, load_next_key, __movq);
+	round3(32+14, RR, RL, load_next_key, __movq);
+	round3(32+15, RL, RR, dummy2, dummy2);
+
+	final_permutation3(RR, RL);
+
+	bswapl RR0d;
+	bswapl RL0d;
+	bswapl RR1d;
+	bswapl RL1d;
+	bswapl RR2d;
+	bswapl RL2d;
+
+	movl RR0d, 0 * 4(%rsi);
+	movl RL0d, 1 * 4(%rsi);
+	movl RR1d, 2 * 4(%rsi);
+	movl RL1d, 3 * 4(%rsi);
+	movl RR2d, 4 * 4(%rsi);
+	movl RL2d, 5 * 4(%rsi);
+
+	popq %r15;
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+
+	ret;
+ENDPROC(des3_ede_x86_64_crypt_blk_3way)
+
+.section	.rodata, "a", @progbits	/* S-box tables are only read, via file-local .L_ labels */
+.align 16
+.L_s1:
+	.quad 0x0010100001010400, 0x0000000000000000
+	.quad 0x0000100000010000, 0x0010100001010404
+	.quad 0x0010100001010004, 0x0000100000010404
+	.quad 0x0000000000000004, 0x0000100000010000
+	.quad 0x0000000000000400, 0x0010100001010400
+	.quad 0x0010100001010404, 0x0000000000000400
+	.quad 0x0010000001000404, 0x0010100001010004
+	.quad 0x0010000001000000, 0x0000000000000004
+	.quad 0x0000000000000404, 0x0010000001000400
+	.quad 0x0010000001000400, 0x0000100000010400
+	.quad 0x0000100000010400, 0x0010100001010000
+	.quad 0x0010100001010000, 0x0010000001000404
+	.quad 0x0000100000010004, 0x0010000001000004
+	.quad 0x0010000001000004, 0x0000100000010004
+	.quad 0x0000000000000000, 0x0000000000000404
+	.quad 0x0000100000010404, 0x0010000001000000
+	.quad 0x0000100000010000, 0x0010100001010404
+	.quad 0x0000000000000004, 0x0010100001010000
+	.quad 0x0010100001010400, 0x0010000001000000
+	.quad 0x0010000001000000, 0x0000000000000400
+	.quad 0x0010100001010004, 0x0000100000010000
+	.quad 0x0000100000010400, 0x0010000001000004
+	.quad 0x0000000000000400, 0x0000000000000004
+	.quad 0x0010000001000404, 0x0000100000010404
+	.quad 0x0010100001010404, 0x0000100000010004
+	.quad 0x0010100001010000, 0x0010000001000404
+	.quad 0x0010000001000004, 0x0000000000000404
+	.quad 0x0000100000010404, 0x0010100001010400
+	.quad 0x0000000000000404, 0x0010000001000400
+	.quad 0x0010000001000400, 0x0000000000000000
+	.quad 0x0000100000010004, 0x0000100000010400
+	.quad 0x0000000000000000, 0x0010100001010004
+.L_s2:
+	.quad 0x0801080200100020, 0x0800080000000000
+	.quad 0x0000080000000000, 0x0001080200100020
+	.quad 0x0001000000100000, 0x0000000200000020
+	.quad 0x0801000200100020, 0x0800080200000020
+	.quad 0x0800000200000020, 0x0801080200100020
+	.quad 0x0801080000100000, 0x0800000000000000
+	.quad 0x0800080000000000, 0x0001000000100000
+	.quad 0x0000000200000020, 0x0801000200100020
+	.quad 0x0001080000100000, 0x0001000200100020
+	.quad 0x0800080200000020, 0x0000000000000000
+	.quad 0x0800000000000000, 0x0000080000000000
+	.quad 0x0001080200100020, 0x0801000000100000
+	.quad 0x0001000200100020, 0x0800000200000020
+	.quad 0x0000000000000000, 0x0001080000100000
+	.quad 0x0000080200000020, 0x0801080000100000
+	.quad 0x0801000000100000, 0x0000080200000020
+	.quad 0x0000000000000000, 0x0001080200100020
+	.quad 0x0801000200100020, 0x0001000000100000
+	.quad 0x0800080200000020, 0x0801000000100000
+	.quad 0x0801080000100000, 0x0000080000000000
+	.quad 0x0801000000100000, 0x0800080000000000
+	.quad 0x0000000200000020, 0x0801080200100020
+	.quad 0x0001080200100020, 0x0000000200000020
+	.quad 0x0000080000000000, 0x0800000000000000
+	.quad 0x0000080200000020, 0x0801080000100000
+	.quad 0x0001000000100000, 0x0800000200000020
+	.quad 0x0001000200100020, 0x0800080200000020
+	.quad 0x0800000200000020, 0x0001000200100020
+	.quad 0x0001080000100000, 0x0000000000000000
+	.quad 0x0800080000000000, 0x0000080200000020
+	.quad 0x0800000000000000, 0x0801000200100020
+	.quad 0x0801080200100020, 0x0001080000100000
+.L_s3:
+	.quad 0x0000002000000208, 0x0000202008020200
+	.quad 0x0000000000000000, 0x0000200008020008
+	.quad 0x0000002008000200, 0x0000000000000000
+	.quad 0x0000202000020208, 0x0000002008000200
+	.quad 0x0000200000020008, 0x0000000008000008
+	.quad 0x0000000008000008, 0x0000200000020000
+	.quad 0x0000202008020208, 0x0000200000020008
+	.quad 0x0000200008020000, 0x0000002000000208
+	.quad 0x0000000008000000, 0x0000000000000008
+	.quad 0x0000202008020200, 0x0000002000000200
+	.quad 0x0000202000020200, 0x0000200008020000
+	.quad 0x0000200008020008, 0x0000202000020208
+	.quad 0x0000002008000208, 0x0000202000020200
+	.quad 0x0000200000020000, 0x0000002008000208
+	.quad 0x0000000000000008, 0x0000202008020208
+	.quad 0x0000002000000200, 0x0000000008000000
+	.quad 0x0000202008020200, 0x0000000008000000
+	.quad 0x0000200000020008, 0x0000002000000208
+	.quad 0x0000200000020000, 0x0000202008020200
+	.quad 0x0000002008000200, 0x0000000000000000
+	.quad 0x0000002000000200, 0x0000200000020008
+	.quad 0x0000202008020208, 0x0000002008000200
+	.quad 0x0000000008000008, 0x0000002000000200
+	.quad 0x0000000000000000, 0x0000200008020008
+	.quad 0x0000002008000208, 0x0000200000020000
+	.quad 0x0000000008000000, 0x0000202008020208
+	.quad 0x0000000000000008, 0x0000202000020208
+	.quad 0x0000202000020200, 0x0000000008000008
+	.quad 0x0000200008020000, 0x0000002008000208
+	.quad 0x0000002000000208, 0x0000200008020000
+	.quad 0x0000202000020208, 0x0000000000000008
+	.quad 0x0000200008020008, 0x0000202000020200
+.L_s4:
+	.quad 0x1008020000002001, 0x1000020800002001
+	.quad 0x1000020800002001, 0x0000000800000000
+	.quad 0x0008020800002000, 0x1008000800000001
+	.quad 0x1008000000000001, 0x1000020000002001
+	.quad 0x0000000000000000, 0x0008020000002000
+	.quad 0x0008020000002000, 0x1008020800002001
+	.quad 0x1000000800000001, 0x0000000000000000
+	.quad 0x0008000800000000, 0x1008000000000001
+	.quad 0x1000000000000001, 0x0000020000002000
+	.quad 0x0008000000000000, 0x1008020000002001
+	.quad 0x0000000800000000, 0x0008000000000000
+	.quad 0x1000020000002001, 0x0000020800002000
+	.quad 0x1008000800000001, 0x1000000000000001
+	.quad 0x0000020800002000, 0x0008000800000000
+	.quad 0x0000020000002000, 0x0008020800002000
+	.quad 0x1008020800002001, 0x1000000800000001
+	.quad 0x0008000800000000, 0x1008000000000001
+	.quad 0x0008020000002000, 0x1008020800002001
+	.quad 0x1000000800000001, 0x0000000000000000
+	.quad 0x0000000000000000, 0x0008020000002000
+	.quad 0x0000020800002000, 0x0008000800000000
+	.quad 0x1008000800000001, 0x1000000000000001
+	.quad 0x1008020000002001, 0x1000020800002001
+	.quad 0x1000020800002001, 0x0000000800000000
+	.quad 0x1008020800002001, 0x1000000800000001
+	.quad 0x1000000000000001, 0x0000020000002000
+	.quad 0x1008000000000001, 0x1000020000002001
+	.quad 0x0008020800002000, 0x1008000800000001
+	.quad 0x1000020000002001, 0x0000020800002000
+	.quad 0x0008000000000000, 0x1008020000002001
+	.quad 0x0000000800000000, 0x0008000000000000
+	.quad 0x0000020000002000, 0x0008020800002000
+.L_s5:
+	.quad 0x0000001000000100, 0x0020001002080100
+	.quad 0x0020000002080000, 0x0420001002000100
+	.quad 0x0000000000080000, 0x0000001000000100
+	.quad 0x0400000000000000, 0x0020000002080000
+	.quad 0x0400001000080100, 0x0000000000080000
+	.quad 0x0020001002000100, 0x0400001000080100
+	.quad 0x0420001002000100, 0x0420000002080000
+	.quad 0x0000001000080100, 0x0400000000000000
+	.quad 0x0020000002000000, 0x0400000000080000
+	.quad 0x0400000000080000, 0x0000000000000000
+	.quad 0x0400001000000100, 0x0420001002080100
+	.quad 0x0420001002080100, 0x0020001002000100
+	.quad 0x0420000002080000, 0x0400001000000100
+	.quad 0x0000000000000000, 0x0420000002000000
+	.quad 0x0020001002080100, 0x0020000002000000
+	.quad 0x0420000002000000, 0x0000001000080100
+	.quad 0x0000000000080000, 0x0420001002000100
+	.quad 0x0000001000000100, 0x0020000002000000
+	.quad 0x0400000000000000, 0x0020000002080000
+	.quad 0x0420001002000100, 0x0400001000080100
+	.quad 0x0020001002000100, 0x0400000000000000
+	.quad 0x0420000002080000, 0x0020001002080100
+	.quad 0x0400001000080100, 0x0000001000000100
+	.quad 0x0020000002000000, 0x0420000002080000
+	.quad 0x0420001002080100, 0x0000001000080100
+	.quad 0x0420000002000000, 0x0420001002080100
+	.quad 0x0020000002080000, 0x0000000000000000
+	.quad 0x0400000000080000, 0x0420000002000000
+	.quad 0x0000001000080100, 0x0020001002000100
+	.quad 0x0400001000000100, 0x0000000000080000
+	.quad 0x0000000000000000, 0x0400000000080000
+	.quad 0x0020001002080100, 0x0400001000000100
+.L_s6:
+	.quad 0x0200000120000010, 0x0204000020000000
+	.quad 0x0000040000000000, 0x0204040120000010
+	.quad 0x0204000020000000, 0x0000000100000010
+	.quad 0x0204040120000010, 0x0004000000000000
+	.quad 0x0200040020000000, 0x0004040100000010
+	.quad 0x0004000000000000, 0x0200000120000010
+	.quad 0x0004000100000010, 0x0200040020000000
+	.quad 0x0200000020000000, 0x0000040100000010
+	.quad 0x0000000000000000, 0x0004000100000010
+	.quad 0x0200040120000010, 0x0000040000000000
+	.quad 0x0004040000000000, 0x0200040120000010
+	.quad 0x0000000100000010, 0x0204000120000010
+	.quad 0x0204000120000010, 0x0000000000000000
+	.quad 0x0004040100000010, 0x0204040020000000
+	.quad 0x0000040100000010, 0x0004040000000000
+	.quad 0x0204040020000000, 0x0200000020000000
+	.quad 0x0200040020000000, 0x0000000100000010
+	.quad 0x0204000120000010, 0x0004040000000000
+	.quad 0x0204040120000010, 0x0004000000000000
+	.quad 0x0000040100000010, 0x0200000120000010
+	.quad 0x0004000000000000, 0x0200040020000000
+	.quad 0x0200000020000000, 0x0000040100000010
+	.quad 0x0200000120000010, 0x0204040120000010
+	.quad 0x0004040000000000, 0x0204000020000000
+	.quad 0x0004040100000010, 0x0204040020000000
+	.quad 0x0000000000000000, 0x0204000120000010
+	.quad 0x0000000100000010, 0x0000040000000000
+	.quad 0x0204000020000000, 0x0004040100000010
+	.quad 0x0000040000000000, 0x0004000100000010
+	.quad 0x0200040120000010, 0x0000000000000000
+	.quad 0x0204040020000000, 0x0200000020000000
+	.quad 0x0004000100000010, 0x0200040120000010
+.L_s7:
+	.quad 0x0002000000200000, 0x2002000004200002
+	.quad 0x2000000004000802, 0x0000000000000000
+	.quad 0x0000000000000800, 0x2000000004000802
+	.quad 0x2002000000200802, 0x0002000004200800
+	.quad 0x2002000004200802, 0x0002000000200000
+	.quad 0x0000000000000000, 0x2000000004000002
+	.quad 0x2000000000000002, 0x0000000004000000
+	.quad 0x2002000004200002, 0x2000000000000802
+	.quad 0x0000000004000800, 0x2002000000200802
+	.quad 0x2002000000200002, 0x0000000004000800
+	.quad 0x2000000004000002, 0x0002000004200000
+	.quad 0x0002000004200800, 0x2002000000200002
+	.quad 0x0002000004200000, 0x0000000000000800
+	.quad 0x2000000000000802, 0x2002000004200802
+	.quad 0x0002000000200800, 0x2000000000000002
+	.quad 0x0000000004000000, 0x0002000000200800
+	.quad 0x0000000004000000, 0x0002000000200800
+	.quad 0x0002000000200000, 0x2000000004000802
+	.quad 0x2000000004000802, 0x2002000004200002
+	.quad 0x2002000004200002, 0x2000000000000002
+	.quad 0x2002000000200002, 0x0000000004000000
+	.quad 0x0000000004000800, 0x0002000000200000
+	.quad 0x0002000004200800, 0x2000000000000802
+	.quad 0x2002000000200802, 0x0002000004200800
+	.quad 0x2000000000000802, 0x2000000004000002
+	.quad 0x2002000004200802, 0x0002000004200000
+	.quad 0x0002000000200800, 0x0000000000000000
+	.quad 0x2000000000000002, 0x2002000004200802
+	.quad 0x0000000000000000, 0x2002000000200802
+	.quad 0x0002000004200000, 0x0000000000000800
+	.quad 0x2000000004000002, 0x0000000004000800
+	.quad 0x0000000000000800, 0x2002000000200002
+.L_s8:
+	.quad 0x0100010410001000, 0x0000010000001000
+	.quad 0x0000000000040000, 0x0100010410041000
+	.quad 0x0100000010000000, 0x0100010410001000
+	.quad 0x0000000400000000, 0x0100000010000000
+	.quad 0x0000000400040000, 0x0100000010040000
+	.quad 0x0100010410041000, 0x0000010000041000
+	.quad 0x0100010010041000, 0x0000010400041000
+	.quad 0x0000010000001000, 0x0000000400000000
+	.quad 0x0100000010040000, 0x0100000410000000
+	.quad 0x0100010010001000, 0x0000010400001000
+	.quad 0x0000010000041000, 0x0000000400040000
+	.quad 0x0100000410040000, 0x0100010010041000
+	.quad 0x0000010400001000, 0x0000000000000000
+	.quad 0x0000000000000000, 0x0100000410040000
+	.quad 0x0100000410000000, 0x0100010010001000
+	.quad 0x0000010400041000, 0x0000000000040000
+	.quad 0x0000010400041000, 0x0000000000040000
+	.quad 0x0100010010041000, 0x0000010000001000
+	.quad 0x0000000400000000, 0x0100000410040000
+	.quad 0x0000010000001000, 0x0000010400041000
+	.quad 0x0100010010001000, 0x0000000400000000
+	.quad 0x0100000410000000, 0x0100000010040000
+	.quad 0x0100000410040000, 0x0100000010000000
+	.quad 0x0000000000040000, 0x0100010410001000
+	.quad 0x0000000000000000, 0x0100010410041000
+	.quad 0x0000000400040000, 0x0100000410000000
+	.quad 0x0100000010040000, 0x0100010010001000
+	.quad 0x0100010410001000, 0x0000000000000000
+	.quad 0x0100010410041000, 0x0000010000041000
+	.quad 0x0000010000041000, 0x0000010400001000
+	.quad 0x0000010400001000, 0x0000000400040000
+	.quad 0x0100000010000000, 0x0100010010041000
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
new file mode 100644
index 000000000000..ebc421543e44
--- /dev/null
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -0,0 +1,509 @@
+/*
+ * Glue Code for assembler optimized version of 3DES
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <asm/processor.h>
+#include <crypto/des.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+
+struct des3_ede_x86_ctx {
+	u32 enc_expkey[DES3_EDE_EXPKEY_WORDS];	/* encryption key schedule */
+	u32 dec_expkey[DES3_EDE_EXPKEY_WORDS];	/* same pairs, reverse order */
+};
+
+/* one block per call; the expkey passed selects encryption or decryption */
+asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst,
+					  const u8 *src);
+
+/* 3-way parallel variant: three consecutive blocks per call */
+asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst,
+					       const u8 *src);
+
+/* Run the single-block routine with the encryption schedule held in ctx. */
+static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+				    const u8 *src)
+{
+	/* Same entry point as decryption; only the schedule differs. */
+	des3_ede_x86_64_crypt_blk(ctx->enc_expkey, dst, src);
+}
+
+/* Run the single-block routine with the decryption schedule held in ctx. */
+static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+				    const u8 *src)
+{
+	/* Same entry point as encryption; only the schedule differs. */
+	des3_ede_x86_64_crypt_blk(ctx->dec_expkey, dst, src);
+}
+
+/* Run the 3-way routine with the encryption schedule held in ctx. */
+static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	/* Same entry point as 3-way decryption; only the schedule differs. */
+	des3_ede_x86_64_crypt_blk_3way(ctx->enc_expkey, dst, src);
+}
+
+/* Run the 3-way routine with the decryption schedule held in ctx. */
+static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	/* Same entry point as 3-way encryption; only the schedule differs. */
+	des3_ede_x86_64_crypt_blk_3way(ctx->dec_expkey, dst, src);
+}
+
+static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src); /* .cia_encrypt hook */
+}
+
+static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src); /* .cia_decrypt hook */
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+		     const u32 *expkey)
+{
+	unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+	unsigned int nbytes;
+	int err;
+
+	err = blkcipher_walk_virt(desc, walk);
+
+	while ((nbytes = walk->nbytes)) {
+		u8 *wsrc = walk->src.virt.addr;
+		u8 *wdst = walk->dst.virt.addr;
+
+		/* Process three block batch */
+		if (nbytes >= bsize * 3) {
+			do {
+				des3_ede_x86_64_crypt_blk_3way(expkey, wdst,
+							       wsrc);
+
+				wsrc += bsize * 3;
+				wdst += bsize * 3;
+				nbytes -= bsize * 3;
+			} while (nbytes >= bsize * 3);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Handle leftovers, one block at a time */
+		do {
+			des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc);
+
+			wsrc += bsize;
+			wdst += bsize;
+			nbytes -= bsize;
+		} while (nbytes >= bsize);
+
+done:
+		err = blkcipher_walk_done(desc, walk, nbytes);	/* hand back remainder */
+	}
+
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk ecb_walk;
+	struct des3_ede_x86_ctx *keys = crypto_blkcipher_ctx(desc->tfm);
+
+	blkcipher_walk_init(&ecb_walk, dst, src, nbytes);
+	return ecb_crypt(desc, &ecb_walk, keys->enc_expkey); /* enc schedule */
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk ecb_walk;
+	struct des3_ede_x86_ctx *keys = crypto_blkcipher_ctx(desc->tfm);
+
+	blkcipher_walk_init(&ecb_walk, dst, src, nbytes);
+	return ecb_crypt(desc, &ecb_walk, keys->dec_expkey); /* dec schedule */
+}
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	u64 *iv = (u64 *)walk->iv;
+
+	do {
+		*dst = *src ^ *iv;	/* chain with IV / previous ciphertext */
+		des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+		iv = dst;		/* this output chains into the next block */
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+	*(u64 *)walk->iv = *iv;	/* last output block is stored back as the IV */
+	return nbytes;		/* bytes left unprocessed */
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while (walk.nbytes != 0) {	/* until the walk has nothing left */
+		err = blkcipher_walk_done(desc, &walk,
+					  __cbc_encrypt(desc, &walk));
+	}
+
+	return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	u64 ivs[3 - 1];
+	u64 last_iv;
+
+	/* Start of the last block; blocks are processed back to front. */
+	src += nbytes / bsize - 1;
+	dst += nbytes / bsize - 1;
+
+	last_iv = *src;	/* last input block is stored as the IV on exit */
+
+	/* Process three block batch */
+	if (nbytes >= bsize * 3) {
+		do {
+			nbytes -= bsize * 3 - bsize;
+			src -= 3 - 1;
+			dst -= 3 - 1;
+
+			ivs[0] = src[0];	/* copied before the 3-way call; */
+			ivs[1] = src[1];	/* presumably dst may alias src */
+
+			des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+
+			dst[1] ^= ivs[0];
+			dst[2] ^= ivs[1];
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			*dst ^= *(src - 1);	/* chain with preceding input block */
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * 3);
+	}
+
+	/* Handle leftovers, one block at a time */
+	for (;;) {
+		des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+		nbytes -= bsize;
+		if (nbytes < bsize)
+			break;
+
+		*dst ^= *(src - 1);	/* chain with preceding input block */
+		src -= 1;
+		dst -= 1;
+	}
+
+done:
+	*dst ^= *(u64 *)walk->iv;	/* first block chains with the incoming IV */
+	*(u64 *)walk->iv = last_iv;
+
+	return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while (walk.nbytes != 0) {	/* until the walk has nothing left */
+		err = blkcipher_walk_done(desc, &walk,
+					  __cbc_decrypt(desc, &walk));
+	}
+
+	return err;
+}
+
+static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
+			    struct blkcipher_walk *walk)
+{
+	unsigned int tail = walk->nbytes;
+	u8 ks[DES3_EDE_BLOCK_SIZE];
+	u8 *ctr = walk->iv;
+
+	/* Encrypt the counter block, then XOR only the tail bytes with it. */
+	des3_ede_enc_blk(ctx, ks, ctr);
+	crypto_xor(ks, walk->src.virt.addr, tail);
+	memcpy(walk->dst.virt.addr, ks, tail);
+
+	/* Bump the counter block stored in walk->iv. */
+	crypto_inc(ctr, DES3_EDE_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+				struct blkcipher_walk *walk)
+{
+	struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	__be64 *src = (__be64 *)walk->src.virt.addr; /* typed like ctrblocks */
+	__be64 *dst = (__be64 *)walk->dst.virt.addr;
+	u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);	/* counter, CPU order */
+	__be64 ctrblocks[3];	/* counters in, keystream out */
+
+	/* Process three block batch */
+	if (nbytes >= bsize * 3) {
+		do {
+			/* create ctrblks for parallel encrypt */
+			ctrblocks[0] = cpu_to_be64(ctrblk++);
+			ctrblocks[1] = cpu_to_be64(ctrblk++);
+			ctrblocks[2] = cpu_to_be64(ctrblk++);
+
+			des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks,
+					      (u8 *)ctrblocks);
+
+			dst[0] = src[0] ^ ctrblocks[0];
+			dst[1] = src[1] ^ ctrblocks[1];
+			dst[2] = src[2] ^ ctrblocks[2];
+
+			src += 3;
+			dst += 3;
+		} while ((nbytes -= bsize * 3) >= bsize * 3);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers, one block at a time */
+	do {
+		ctrblocks[0] = cpu_to_be64(ctrblk++);
+
+		des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+
+		dst[0] = src[0] ^ ctrblocks[0];
+
+		src += 1;
+		dst += 1;
+	} while ((nbytes -= bsize) >= bsize);
+
+done:
+	*(__be64 *)walk->iv = cpu_to_be64(ctrblk);	/* store next counter */
+	return nbytes;	/* less than one block remains */
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, DES3_EDE_BLOCK_SIZE);
+
+	while (walk.nbytes >= DES3_EDE_BLOCK_SIZE) {
+		err = blkcipher_walk_done(desc, &walk,
+					  __ctr_crypt(desc, &walk));
+	}
+
+	if (walk.nbytes == 0)
+		return err;
+
+	/* Fewer than DES3_EDE_BLOCK_SIZE bytes remain: finish them off. */
+	ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk);
+	return blkcipher_walk_done(desc, &walk, 0);
+}
+
+static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
+			       unsigned int keylen)
+{
+	struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 i;
+	int err;
+
+	/* Let the generic code expand the key into the encryption schedule. */
+	err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen);
+	if (err < 0)
+		return err;
+
+	/* Rotate the second word of each pair right by 4 bits in place and
+	 * copy the pairs into the decryption schedule in reverse order. */
+	for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2) {
+		const u32 j = DES3_EDE_EXPKEY_WORDS - 2 - i;
+		const u32 rotated = ror32(ctx->enc_expkey[i + 1], 4);
+
+		ctx->enc_expkey[i + 1] = rotated;
+		ctx->dec_expkey[j] = ctx->enc_expkey[i];
+		ctx->dec_expkey[j + 1] = rotated;
+	}
+
+	return 0;
+}
+
+static struct crypto_alg des3_ede_algs[4] = { {
+	.cra_name		= "des3_ede",
+	.cra_driver_name	= "des3_ede-asm",
+	.cra_priority		= 200,	/* des3_ede-generic uses 100 */
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des3_ede_x86_ctx),
+	.cra_alignmask		= 0,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.cipher = {
+			.cia_min_keysize	= DES3_EDE_KEY_SIZE,
+			.cia_max_keysize	= DES3_EDE_KEY_SIZE,
+			.cia_setkey		= des3_ede_x86_setkey,
+			.cia_encrypt		= des3_ede_x86_encrypt,
+			.cia_decrypt		= des3_ede_x86_decrypt,
+		}
+	}
+}, {
+	.cra_name		= "ecb(des3_ede)",
+	.cra_driver_name	= "ecb-des3_ede-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des3_ede_x86_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES3_EDE_KEY_SIZE,
+			.max_keysize	= DES3_EDE_KEY_SIZE,
+			.setkey		= des3_ede_x86_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(des3_ede)",
+	.cra_driver_name	= "cbc-des3_ede-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des3_ede_x86_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES3_EDE_KEY_SIZE,
+			.max_keysize	= DES3_EDE_KEY_SIZE,
+			.ivsize		= DES3_EDE_BLOCK_SIZE,
+			.setkey		= des3_ede_x86_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(des3_ede)",
+	.cra_driver_name	= "ctr-des3_ede-asm",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,	/* ctr_crypt_final() handles a partial block */
+	.cra_ctxsize		= sizeof(struct des3_ede_x86_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES3_EDE_KEY_SIZE,
+			.max_keysize	= DES3_EDE_KEY_SIZE,
+			.ivsize		= DES3_EDE_BLOCK_SIZE,
+			.setkey		= des3_ede_x86_setkey,
+			.encrypt	= ctr_crypt,	/* one routine serves both */
+			.decrypt	= ctr_crypt,	/* directions in CTR mode */
+		},
+	},
+} };
+
+static bool is_blacklisted_cpu(void)
+{
+	const bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
+	const bool is_family_0f = boot_cpu_data.x86 == 0x0f;
+
+	/*
+	 * Intel family 0x0f is the Pentium 4. des3_ede-x86_64 relies on
+	 * 64bit rotates, which are really slow on P4, making it slower
+	 * than the generic C implementation there.
+	 * Only that vendor/family pair is blacklisted.
+	 */
+	if (is_intel && is_family_0f)
+		return true;
+
+	return false;
+}
+
+static int force;	/* nonzero: skip the CPU blacklist check at init */
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init des3_ede_x86_init(void)
+{
+	if (force || !is_blacklisted_cpu())	/* "force" bypasses the check */
+		return crypto_register_algs(des3_ede_algs,
+					    ARRAY_SIZE(des3_ede_algs));
+
+	pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n");
+	return -ENODEV;
+}
+
+static void __exit des3_ede_x86_fini(void)	/* unregisters every entry */
+{
+	crypto_unregister_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs));
+}
+
+module_init(des3_ede_x86_init);
+module_exit(des3_ede_x86_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
+/* Only des3_ede variants are registered by this module, so it must not
+ * claim the single-DES "des"/"des-asm" aliases. */
+MODULE_ALIAS("des3_ede");
+MODULE_ALIAS("des3_ede-asm");
+MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index c9c1cd91031c..025c5108442e 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1019,6 +1019,19 @@ config CRYPTO_DES_SPARC64
 	  DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3),
 	  optimized using SPARC64 crypto opcodes.
 
+config CRYPTO_DES3_EDE_X86_64
+	tristate "Triple DES EDE cipher algorithm (x86-64)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_DES
+	help
+	  Triple DES EDE (FIPS 46-3) algorithm.
+
+	  This module provides an implementation of the Triple DES EDE cipher
+	  algorithm that is optimized for x86-64 processors. Two versions of
+	  the algorithm are provided: a regular one that processes one input
+	  block, and one that processes three blocks in parallel.
+
 config CRYPTO_FCRYPT
 	tristate "FCrypt cipher algorithm"
 	select CRYPTO_ALGAPI
diff --git a/crypto/des_generic.c b/crypto/des_generic.c
index f6cf63f88468..298d464ab7d2 100644
--- a/crypto/des_generic.c
+++ b/crypto/des_generic.c
@@ -859,13 +859,10 @@ static void des_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
  *   property.
  *
  */
-static int des3_ede_setkey(struct crypto_tfm *tfm, const u8 *key,
-			   unsigned int keylen)
+int __des3_ede_setkey(u32 *expkey, u32 *flags, const u8 *key,
+		      unsigned int keylen)
 {
 	const u32 *K = (const u32 *)key;
-	struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm);
-	u32 *expkey = dctx->expkey;
-	u32 *flags = &tfm->crt_flags;
 
 	if (unlikely(!((K[0] ^ K[2]) | (K[1] ^ K[3])) ||
 		     !((K[2] ^ K[4]) | (K[3] ^ K[5]))) &&
@@ -880,6 +877,17 @@ static int des3_ede_setkey(struct crypto_tfm *tfm, const u8 *key,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__des3_ede_setkey);
+
+static int des3_ede_setkey(struct crypto_tfm *tfm, const u8 *key,
+			   unsigned int keylen)
+{
+	struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	u32 *expkey = dctx->expkey;
+
+	return __des3_ede_setkey(expkey, flags, key, keylen);
+}
 
 static void des3_ede_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -945,6 +953,8 @@ static void des3_ede_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 
 static struct crypto_alg des_algs[2] = { {
 	.cra_name		=	"des",
+	.cra_driver_name	=	"des-generic",
+	.cra_priority		=	100,
 	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		=	DES_BLOCK_SIZE,
 	.cra_ctxsize		=	sizeof(struct des_ctx),
@@ -958,6 +968,8 @@ static struct crypto_alg des_algs[2] = { {
 	.cia_decrypt		=	des_decrypt } }
 }, {
 	.cra_name		=	"des3_ede",
+	.cra_driver_name	=	"des3_ede-generic",
+	.cra_priority		=	100,
 	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		=	DES3_EDE_BLOCK_SIZE,
 	.cra_ctxsize		=	sizeof(struct des3_ede_ctx),
diff --git a/include/crypto/des.h b/include/crypto/des.h
index 2971c6304ade..fc6274c6bb26 100644
--- a/include/crypto/des.h
+++ b/include/crypto/des.h
@@ -16,4 +16,7 @@
 
 extern unsigned long des_ekey(u32 *pe, const u8 *k);
 
+extern int __des3_ede_setkey(u32 *expkey, u32 *flags, const u8 *key,
+			     unsigned int keylen);
+
 #endif /* __CRYPTO_DES_H */
-- 
cgit v1.2.3-58-ga151


From 22cddcc7df8fd35d52646ee220658d26ef09da17 Mon Sep 17 00:00:00 2001
From: chandramouli narayanan <mouli@linux.intel.com>
Date: Tue, 10 Jun 2014 09:22:47 -0700
Subject: crypto: aes - AES CTR x86_64 "by8" AVX optimization

This patch introduces "by8" AES CTR mode AVX optimization inspired by
Intel Optimized IPSEC Cryptographic library. For additional information,
please see:
http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972

The functions aes_ctr_enc_128_avx_by8(), aes_ctr_enc_192_avx_by8() and
aes_ctr_enc_256_avx_by8() are adapted from
Intel Optimized IPSEC Cryptographic library. When both AES and AVX features
are enabled on a platform, the glue code in the AESNI module overrides the
existing "by4" CTR mode en/decryption with the "by8"
AES CTR mode en/decryption.

On a Haswell desktop, with turbo disabled and all cpus running
at maximum frequency, the "by8" CTR mode optimization
shows better performance results across data & key sizes
as measured by tcrypt.

The average performance improvement of the "by8" version over the "by4"
version is as follows:

For 128 bit key and data sizes >= 256 bytes, there is a 10-16% improvement.
For 192 bit key and data sizes >= 256 bytes, there is a 20-22% improvement.
For 256 bit key and data sizes >= 256 bytes, there is a 20-25% improvement.

A typical run of tcrypt with AES CTR mode encryption of the "by4" and "by8"
optimization shows the following results:

tcrypt with "by4" AES CTR mode encryption optimization on a Haswell Desktop:
---------------------------------------------------------------------------

testing speed of __ctr-aes-aesni encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 343 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 336 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 491 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1130 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 7309 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 346 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 361 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 543 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1321 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 9649 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 369 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 366 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 595 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1531 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 10522 cycles (8192 bytes)

testing speed of __ctr-aes-aesni decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 336 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 350 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 487 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1129 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 7287 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 350 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 359 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 635 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1324 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 9595 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 364 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 377 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 604 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1527 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 10549 cycles (8192 bytes)

tcrypt with "by8" AES CTR mode encryption optimization on a Haswell Desktop:
---------------------------------------------------------------------------

testing speed of __ctr-aes-aesni encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 340 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 330 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 450 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1043 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 6597 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 339 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 352 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 539 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1153 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 8458 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 353 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 360 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 512 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1277 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 8745 cycles (8192 bytes)

testing speed of __ctr-aes-aesni decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 348 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 335 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 451 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1030 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 6611 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 354 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 346 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 488 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1154 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 8390 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 357 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 362 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 515 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1284 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 8681 cycles (8192 bytes)

crypto: Incorporate feedback to AES CTR mode optimization patch

Specifically, the following:
a) alignment around main loop in aes_ctrby8_avx_x86_64.S
b) .rodata around data constants used in the assembly code.
c) the use of CONFIG_AVX in the glue code.
d) fix up white space.
e) informational message for "by8" AES CTR mode optimization
f) "by8" AES CTR mode optimization can be simply enabled
if the platform supports both AES and AVX features. The
optimization works superbly on Sandybridge as well.

Testing on Haswell shows no performance change since the last.

Testing on Sandybridge shows that the "by8" AES CTR mode optimization
greatly improves performance.

tcrypt log with "by4" AES CTR mode optimization on Sandybridge
--------------------------------------------------------------

testing speed of __ctr-aes-aesni encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 383 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 408 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 707 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1864 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 12813 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 395 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 432 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 780 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 2132 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 15765 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 416 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 438 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 842 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 2383 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 16945 cycles (8192 bytes)

testing speed of __ctr-aes-aesni decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 389 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 409 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 704 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1865 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 12783 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 409 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 434 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 792 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 2151 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 15804 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 421 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 444 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 840 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 2394 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 16928 cycles (8192 bytes)

tcrypt log with "by8" AES CTR mode optimization on Sandybridge
--------------------------------------------------------------

testing speed of __ctr-aes-aesni encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 383 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 401 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 522 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1136 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 7046 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 394 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 418 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 559 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1263 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 9072 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 408 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 428 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 595 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1385 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 9224 cycles (8192 bytes)

testing speed of __ctr-aes-aesni decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 390 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 402 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 530 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 1135 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 7079 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 414 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 417 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 572 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 1312 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 9073 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 415 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 454 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 598 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 1407 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 9288 cycles (8192 bytes)

crypto: Fix redundant checks

a) Fix the redundant check for cpu_has_aes
b) Fix the key length check when invoking the CTR mode "by8"
encryptor/decryptor.

crypto: fix typo in AES ctr mode transform

Signed-off-by: Chandramouli Narayanan <mouli@linux.intel.com>
Reviewed-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                |   2 +-
 arch/x86/crypto/aes_ctrby8_avx-x86_64.S | 546 ++++++++++++++++++++++++++++++++
 arch/x86/crypto/aesni-intel_glue.c      |  40 ++-
 3 files changed, 585 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/crypto/aes_ctrby8_avx-x86_64.S

(limited to 'arch')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a470de25570f..d551165a3159 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -78,7 +78,7 @@ ifeq ($(avx2_supported),yes)
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
new file mode 100644
index 000000000000..f091f122ed24
--- /dev/null
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -0,0 +1,546 @@
+/*
+ *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
+ *
+ * This is AES128/192/256 CTR mode optimization implementation. It requires
+ * the support of Intel(R) AESNI and AVX instructions.
+ *
+ * This work was inspired by the AES CTR mode optimization published
+ * in Intel Optimized IPSEC Cryptographic library.
+ * Additional information on it can be found at:
+ *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+#define CONCAT(a,b)	a##b
+#define VMOVDQ		vmovdqu
+
+#define xdata0		%xmm0
+#define xdata1		%xmm1
+#define xdata2		%xmm2
+#define xdata3		%xmm3
+#define xdata4		%xmm4
+#define xdata5		%xmm5
+#define xdata6		%xmm6
+#define xdata7		%xmm7
+#define xcounter	%xmm8
+#define xbyteswap	%xmm9
+#define xkey0		%xmm10
+#define xkey3		%xmm11
+#define xkey6		%xmm12
+#define xkey9		%xmm13
+#define xkey4		%xmm11
+#define xkey8		%xmm12
+#define xkey12		%xmm13
+#define xkeyA		%xmm14
+#define xkeyB		%xmm15
+
+#define p_in		%rdi
+#define p_iv		%rsi
+#define p_keys		%rdx
+#define p_out		%rcx
+#define num_bytes	%r8
+
+#define tmp		%r10
+#define	DDQ(i)		CONCAT(ddq_add_,i)
+#define	XMM(i)		CONCAT(%xmm, i)
+#define	DDQ_DATA	0
+#define	XDATA		1
+#define KEY_128		1
+#define KEY_192		2
+#define KEY_256		3
+
+.section .rodata
+.align 16
+
+byteswap_const:
+	.octa 0x000102030405060708090A0B0C0D0E0F
+ddq_add_1:
+	.octa 0x00000000000000000000000000000001
+ddq_add_2:
+	.octa 0x00000000000000000000000000000002
+ddq_add_3:
+	.octa 0x00000000000000000000000000000003
+ddq_add_4:
+	.octa 0x00000000000000000000000000000004
+ddq_add_5:
+	.octa 0x00000000000000000000000000000005
+ddq_add_6:
+	.octa 0x00000000000000000000000000000006
+ddq_add_7:
+	.octa 0x00000000000000000000000000000007
+ddq_add_8:
+	.octa 0x00000000000000000000000000000008
+
+.text
+
+/* setddq n: make var_ddq_add an alias for the ddq_add_<n> constant */
+
+.macro setddq n
+	var_ddq_add = DDQ(\n)
+.endm
+
+/* setxdata n: make var_xdata an alias for register %xmm<n> */
+.macro setxdata n
+	var_xdata = XMM(\n)
+.endm
+
+/* club name, id: bind the value of 'id' via setddq (DDQ_DATA) or setxdata (XDATA) */
+
+.macro club name, id
+.altmacro
+	.if \name == DDQ_DATA
+		setddq %\id
+	.elseif \name == XDATA
+		setxdata %\id
+	.endif
+.noaltmacro
+.endm
+
+/*
+ * do_aes num_in_par load_keys key_len: crypt num_in_par blocks, p_in to p_out;
+ * load_keys=0 reuses preloaded key regs. Increments p_in, but not p_out.
+ */
+.macro do_aes b, k, key_len
+	.set by, \b
+	.set load_keys, \k
+	.set klen, \key_len
+
+	.if (load_keys)
+		vmovdqa	0*16(p_keys), xkey0
+	.endif
+
+	vpshufb	xbyteswap, xcounter, xdata0
+
+	.set i, 1
+	.rept (by - 1)
+		club DDQ_DATA, i
+		club XDATA, i
+		vpaddd	var_ddq_add(%rip), xcounter, var_xdata
+		vpshufb	xbyteswap, var_xdata, var_xdata
+		.set i, (i +1)
+	.endr
+
+	vmovdqa	1*16(p_keys), xkeyA
+
+	vpxor	xkey0, xdata0, xdata0
+	club DDQ_DATA, by
+	vpaddd	var_ddq_add(%rip), xcounter, xcounter
+
+	.set i, 1
+	.rept (by - 1)
+		club XDATA, i
+		vpxor	xkey0, var_xdata, var_xdata
+		.set i, (i +1)
+	.endr
+
+	vmovdqa	2*16(p_keys), xkeyB
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
+		.set i, (i +1)
+	.endr
+
+	/*
+	 * Round key 3 is consumed from xkeyA below for every key length,
+	 * so it must be (re)loaded on every expansion: xkeyA still holds
+	 * key 1 here, and whatever xmm11 held is overwritten by key 4 for
+	 * KEY_128 before round 3 runs.
+	 */
+	vmovdqa	3*16(p_keys), xkeyA
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
+		.set i, (i +1)
+	.endr
+
+	add	$(16*by), p_in
+
+	.if (klen == KEY_128)
+		vmovdqa	4*16(p_keys), xkey4
+	.else
+		.if (load_keys)
+			vmovdqa	4*16(p_keys), xkey4
+		.endif
+	.endif
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		vaesenc	xkeyA, var_xdata, var_xdata		/* key 3 */
+		.set i, (i +1)
+	.endr
+
+	vmovdqa	5*16(p_keys), xkeyA
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		vaesenc	xkey4, var_xdata, var_xdata		/* key 4 */
+		.set i, (i +1)
+	.endr
+
+	/*
+	 * Round key 6 is consumed from xkeyB below for every key length,
+	 * so load it unconditionally: skipping the load on the
+	 * load_keys == 0 path for KEY_128 would leave key 2 in xkeyB and
+	 * round 6 would use the wrong key.
+	 */
+	vmovdqa	6*16(p_keys), xkeyB
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
+		.set i, (i +1)
+	.endr
+
+	vmovdqa	7*16(p_keys), xkeyA
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		vaesenc	xkeyB, var_xdata, var_xdata		/* key 6 */
+		.set i, (i +1)
+	.endr
+
+	.if (klen == KEY_128)
+		vmovdqa	8*16(p_keys), xkey8
+	.else
+		.if (load_keys)
+			vmovdqa	8*16(p_keys), xkey8
+		.endif
+	.endif
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
+		.set i, (i +1)
+	.endr
+
+	/*
+	 * Round key 9 is consumed from xkeyA below for every key length,
+	 * so load it unconditionally: skipping the load on the
+	 * load_keys == 0 path for KEY_128 would leave key 7 in xkeyA and
+	 * round 9 would use the wrong key.
+	 */
+	vmovdqa	9*16(p_keys), xkeyA
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		vaesenc	xkey8, var_xdata, var_xdata		/* key 8 */
+		.set i, (i +1)
+	.endr
+
+	vmovdqa	10*16(p_keys), xkeyB
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		vaesenc	xkeyA, var_xdata, var_xdata		/* key 9 */
+		.set i, (i +1)
+	.endr
+
+	.if (klen != KEY_128)
+		vmovdqa	11*16(p_keys), xkeyA
+	.endif
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		/* key 10 */
+		.if (klen == KEY_128)
+			vaesenclast	xkeyB, var_xdata, var_xdata
+		.else
+			vaesenc	xkeyB, var_xdata, var_xdata
+		.endif
+		.set i, (i +1)
+	.endr
+
+	.if (klen != KEY_128)
+		.if (load_keys)
+			vmovdqa	12*16(p_keys), xkey12
+		.endif
+
+		.set i, 0
+		.rept by
+			club XDATA, i
+			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
+			.set i, (i +1)
+		.endr
+
+		.if (klen == KEY_256)
+			vmovdqa	13*16(p_keys), xkeyA
+		.endif
+
+		.set i, 0
+		.rept by
+			club XDATA, i
+			.if (klen == KEY_256)
+				/* key 12 */
+				vaesenc	xkey12, var_xdata, var_xdata
+			.else
+				vaesenclast xkey12, var_xdata, var_xdata
+			.endif
+			.set i, (i +1)
+		.endr
+
+		.if (klen == KEY_256)
+			vmovdqa	14*16(p_keys), xkeyB
+
+			.set i, 0
+			.rept by
+				club XDATA, i
+				/* key 13 */
+				vaesenc	xkeyA, var_xdata, var_xdata
+				.set i, (i +1)
+			.endr
+
+			.set i, 0
+			.rept by
+				club XDATA, i
+				/* key 14 */
+				vaesenclast	xkeyB, var_xdata, var_xdata
+				.set i, (i +1)
+			.endr
+		.endif
+	.endif
+
+	.set i, 0
+	.rept (by / 2)
+		.set j, (i+1)
+		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
+		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
+		club XDATA, i
+		vpxor	xkeyA, var_xdata, var_xdata
+		club XDATA, j
+		vpxor	xkeyB, var_xdata, var_xdata
+		.set i, (i+2)
+	.endr
+
+	.if (i < by)
+		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
+		club XDATA, i
+		vpxor	xkeyA, var_xdata, var_xdata
+	.endif
+
+	.set i, 0
+	.rept by
+		club XDATA, i
+		VMOVDQ	var_xdata, i*16(p_out)
+		.set i, (i+1)
+	.endr
+.endm
+
+.macro do_aes_load val, key_len
+	do_aes \val, 1, \key_len
+.endm
+
+.macro do_aes_noload val, key_len
+	do_aes \val, 0, \key_len
+.endm
+
+/* main body: crypt num_bytes (expected to be a multiple of 16) from p_in to p_out */
+/* with the round keys at p_keys; stores the advanced counter block back to (p_iv) */
+.macro do_aes_ctrmain key_len
+	/* load the counter first: .Ldo_return2 always swaps and stores it back */
+	vmovdqa	byteswap_const(%rip), xbyteswap
+	vmovdqu	(p_iv), xcounter
+	vpshufb	xbyteswap, xcounter, xcounter
+
+	cmp	$16, num_bytes
+	jb	.Ldo_return2\key_len
+
+	mov	num_bytes, tmp
+	and	$(7*16), tmp
+	jz	.Lmult_of_8_blks\key_len
+
+	/* 1 <= tmp/16 <= 7: handle the leading 1..7 blocks first */
+	cmp	$(4*16), tmp
+	jg	.Lgt4\key_len
+	je	.Leq4\key_len
+
+.Llt4\key_len:
+	cmp	$(2*16), tmp
+	jg	.Leq3\key_len
+	je	.Leq2\key_len
+
+.Leq1\key_len:
+	do_aes_load	1, \key_len
+	add	$(1*16), p_out
+	and	$(~7*16), num_bytes
+	jz	.Ldo_return2\key_len
+	jmp	.Lmain_loop2\key_len
+
+.Leq2\key_len:
+	do_aes_load	2, \key_len
+	add	$(2*16), p_out
+	and	$(~7*16), num_bytes
+	jz	.Ldo_return2\key_len
+	jmp	.Lmain_loop2\key_len
+
+
+.Leq3\key_len:
+	do_aes_load	3, \key_len
+	add	$(3*16), p_out
+	and	$(~7*16), num_bytes
+	jz	.Ldo_return2\key_len
+	jmp	.Lmain_loop2\key_len
+
+.Leq4\key_len:
+	do_aes_load	4, \key_len
+	add	$(4*16), p_out
+	and	$(~7*16), num_bytes
+	jz	.Ldo_return2\key_len
+	jmp	.Lmain_loop2\key_len
+
+.Lgt4\key_len:
+	cmp	$(6*16), tmp
+	jg	.Leq7\key_len
+	je	.Leq6\key_len
+
+.Leq5\key_len:
+	do_aes_load	5, \key_len
+	add	$(5*16), p_out
+	and	$(~7*16), num_bytes
+	jz	.Ldo_return2\key_len
+	jmp	.Lmain_loop2\key_len
+
+.Leq6\key_len:
+	do_aes_load	6, \key_len
+	add	$(6*16), p_out
+	and	$(~7*16), num_bytes
+	jz	.Ldo_return2\key_len
+	jmp	.Lmain_loop2\key_len
+
+.Leq7\key_len:
+	do_aes_load	7, \key_len
+	add	$(7*16), p_out
+	and	$(~7*16), num_bytes
+	jz	.Ldo_return2\key_len
+	jmp	.Lmain_loop2\key_len
+
+.Lmult_of_8_blks\key_len:
+	.if (\key_len != KEY_128)
+		vmovdqa	0*16(p_keys), xkey0
+		vmovdqa	4*16(p_keys), xkey4
+		vmovdqa	8*16(p_keys), xkey8
+		vmovdqa	12*16(p_keys), xkey12
+	.else
+		vmovdqa	0*16(p_keys), xkey0
+		vmovdqa	3*16(p_keys), xkey4
+		vmovdqa	6*16(p_keys), xkey8
+		vmovdqa	9*16(p_keys), xkey12
+	.endif
+.align 16
+.Lmain_loop2\key_len:
+	/* num_bytes is a non-zero multiple of 8*16 here if the caller passed whole blocks */
+	do_aes_noload	8, \key_len
+	add	$(8*16), p_out
+	sub	$(8*16), num_bytes
+	jne	.Lmain_loop2\key_len
+
+.Ldo_return2\key_len:
+	/* return updated IV */
+	vpshufb	xbyteswap, xcounter, xcounter
+	vmovdqu	xcounter, (p_iv)
+	ret
+.endm
+
+/*
+ * routine to do AES128 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
+ *			unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_128_avx_by8)
+	/* expand the shared CTR body for 128 bit keys; the macro ends in ret */
+	do_aes_ctrmain KEY_128
+
+ENDPROC(aes_ctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
+ *			unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_192_avx_by8)
+	/* expand the shared CTR body for 192 bit keys; the macro ends in ret */
+	do_aes_ctrmain KEY_192
+
+ENDPROC(aes_ctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
+ *			unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_256_avx_by8)
+	/* expand the shared CTR body for 256 bit keys; the macro ends in ret */
+	do_aes_ctrmain KEY_256
+
+ENDPROC(aes_ctr_enc_256_avx_by8)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 948ad0e77741..888950f29fd9 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -105,6 +105,9 @@ void crypto_fpu_exit(void);
 #define AVX_GEN4_OPTSIZE 4096
 
 #ifdef CONFIG_X86_64
+
+static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
@@ -155,6 +158,12 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
 
 
 #ifdef CONFIG_AS_AVX
+asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
+		void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
+		void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
+		void *keys, u8 *out, unsigned int num_bytes);
 /*
  * asmlinkage void aesni_gcm_precomp_avx_gen2()
  * gcm_data *my_ctx_data, context data
@@ -472,6 +481,25 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
 	crypto_inc(ctrblk, AES_BLOCK_SIZE);
 }
 
+#ifdef CONFIG_AS_AVX
+static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv)
+{
+	/*
+	 * Dispatch on key length to the matching "by8" AVX CTR routine;
+	 * note the asm argument order is (in, iv, keys, out, len).
+	 * aes_set_key_common() ensures that key length is one of
+	 * {128,192,256}, so the final else is the 256 bit case.
+	 */
+	if (ctx->key_length == AES_KEYSIZE_128)
+		aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len);
+	else if (ctx->key_length == AES_KEYSIZE_192)
+		aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len);
+	else
+		aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
+}
+#endif
+
 static int ctr_crypt(struct blkcipher_desc *desc,
 		     struct scatterlist *dst, struct scatterlist *src,
 		     unsigned int nbytes)
@@ -486,8 +514,8 @@ static int ctr_crypt(struct blkcipher_desc *desc,
 
 	kernel_fpu_begin();
 	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
-		aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
-			      nbytes & AES_BLOCK_MASK, walk.iv);
+		aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+				  nbytes & AES_BLOCK_MASK, walk.iv);
 		nbytes &= AES_BLOCK_SIZE - 1;
 		err = blkcipher_walk_done(desc, &walk, nbytes);
 	}
@@ -1493,6 +1521,14 @@ static int __init aesni_init(void)
 		aesni_gcm_enc_tfm = aesni_gcm_enc;
 		aesni_gcm_dec_tfm = aesni_gcm_dec;
 	}
+	aesni_ctr_enc_tfm = aesni_ctr_enc;
+#ifdef CONFIG_AS_AVX
+	if (cpu_has_avx) {
+		/* optimize performance of ctr mode encryption transform */
+		aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
+		pr_info("AES CTR mode by8 optimization enabled\n");
+	}
+#endif
 #endif
 
 	err = crypto_fpu_init();
-- 
cgit v1.2.3-58-ga151


From 5e50d43d65d4190fb9bb183cc086e356a065cc23 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Date: Mon, 23 Jun 2014 19:39:17 +0300
Subject: crypto: des3_ede-x86_64 - fix parse warning

This patch fixes the following sparse warnings:

  CHECK   arch/x86/crypto/des3_ede_glue.c
arch/x86/crypto/des3_ede_glue.c:308:52: warning: restricted __be64 degrades to integer
arch/x86/crypto/des3_ede_glue.c:309:52: warning: restricted __be64 degrades to integer
arch/x86/crypto/des3_ede_glue.c:310:52: warning: restricted __be64 degrades to integer
arch/x86/crypto/des3_ede_glue.c:326:44: warning: restricted __be64 degrades to integer

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/des3_ede_glue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index ebc421543e44..0e9c0668fe4e 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -289,8 +289,8 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
 	struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 	unsigned int bsize = DES3_EDE_BLOCK_SIZE;
 	unsigned int nbytes = walk->nbytes;
-	u64 *src = (u64 *)walk->src.virt.addr;
-	u64 *dst = (u64 *)walk->dst.virt.addr;
+	__be64 *src = (__be64 *)walk->src.virt.addr;
+	__be64 *dst = (__be64 *)walk->dst.virt.addr;
 	u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
 	__be64 ctrblocks[3];
 
-- 
cgit v1.2.3-58-ga151


From 6f39da1cadac76fedc16d629a8f7c0d0eef814c6 Mon Sep 17 00:00:00 2001
From: Nitesh Narayan Lal <b44382@freescale.com>
Date: Tue, 1 Jul 2014 19:37:23 +0530
Subject: crypto: dts - Addition of missing SEC compatibile property in c29x
 device tree

The driver is compatible with SEC version 4.0, but that compatible string
was missing from the device tree, so the caam driver was never probed.
Since SEC is backward compatible with older versions, this patch adds the
missing versions to the c29x device tree.

Signed-off-by: Nitesh Narayan Lal <b44382@freescale.com>
Signed-off-by: Vakul Garg <b16394@freescale.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/powerpc/boot/dts/fsl/qoriq-sec6.0-0.dtsi | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/boot/dts/fsl/qoriq-sec6.0-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-sec6.0-0.dtsi
index f75b4f820c3c..7d4a6a2354f4 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-sec6.0-0.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-sec6.0-0.dtsi
@@ -32,7 +32,8 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-	compatible = "fsl,sec-v6.0";
+	compatible = "fsl,sec-v6.0", "fsl,sec-v5.0",
+		     "fsl,sec-v4.0";
 	fsl,sec-era = <6>;
 	#address-cells = <1>;
 	#size-cells = <1>;
-- 
cgit v1.2.3-58-ga151