crypto: x86/poly1305 - fix overflow during partial reduction

The x86_64 implementation of Poly1305 produces the wrong result on some inputs because poly1305_4block_avx2() incorrectly assumes that when partially reducing the accumulator, the bits carried from limb 'd4' to limb 'h0' fit in a 32-bit integer. This is true for poly1305-generic which processes only one block at a time. However, it's not true for the AVX2 implementation, which processes 4 blocks at a time and therefore can produce intermediate limbs about 4x larger. Fix it by making the relevant calculations use 64-bit arithmetic rather than 32-bit. Note that most of the carries already used 64-bit arithmetic, but the d4 -> h0 carry was different for some reason. To be safe I also made the same change to the corresponding SSE2 code, though that only operates on 1 or 2 blocks at a time. I don't think it's really needed for poly1305_block_sse2(), but it doesn't hurt because it's already x86_64 code. It *might* be needed for poly1305_2block_sse2(), but overflows aren't easy to reproduce there. This bug was originally detected by my patches that improve testmgr to fuzz algorithms against their generic implementation. But also add a test vector which reproduces it directly (in the AVX2 case). Fixes: b1ccc8f4b631 ("crypto: poly1305 - Add a four block AVX2 variant for x86_64") Fixes: c70f4abef07a ("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64") Cc: <stable@vger.kernel.org> # v4.3+ Cc: Martin Willi <martin@strongswan.org> Cc: Jason A. Donenfeld <Jason@zx2c4.com> Signed-off-by: Eric Biggers <ebiggers@google.com> Reviewed-by: Martin Willi <martin@strongswan.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author: Eric Biggers <ebiggers@google.com> 2019-03-31 13:04:11 -0700
committer: Herbert Xu <herbert@gondor.apana.org.au> 2019-04-08 14:43:06 +0800
commit: 678cce4019d746da6c680c48ba9e6d417803e127 (patch)
tree: f1fb27e128d51a60ceb4d961dd7baf11fa1525fa /arch/x86/crypto
parent: b4e9e931e9bb2f5b302ce66640832f5a3e57e8c4 (diff)
2 files changed, 24 insertions, 12 deletions
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
index 3b6e70d085da..8457cdd47f75 100644
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2)
 	vpaddq		t2,t1,t1
 	vmovq		t1x,d4
 
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers.  It's true in a single-block implementation, but not here.
+
 	# d1 += d0 >> 26
 	mov		d0,%rax
 	shr		$26,%rax
@@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
index e6add74d78a5..6f0be7a86964 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
@@ -524,6 +524,12 @@ ENTRY(poly1305_2block_sse2)
 	paddq		t2,t1
 	movq		t1,d4
 
+	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+	# integers.  It's true in a single-block implementation, but not here.
+
 	# d1 += d0 >> 26
 	mov		d0,%rax
 	shr		$26,%rax
@@ -562,16 +568,16 @@ ENTRY(poly1305_2block_sse2)
 	# h0 += (d4 >> 26) * 5
 	mov		d4,%rax
 	shr		$26,%rax
-	lea		(%eax,%eax,4),%eax
-	add		%eax,%ebx
+	lea		(%rax,%rax,4),%rax
+	add		%rax,%rbx
 	# h4 = d4 & 0x3ffffff
 	mov		d4,%rax
 	and		$0x3ffffff,%eax
 	mov		%eax,h4
 
 	# h1 += h0 >> 26
-	mov		%ebx,%eax
-	shr		$26,%eax
+	mov		%rbx,%rax
+	shr		$26,%rax
 	add		%eax,h1
 	# h0 = h0 & 0x3ffffff
 	andl		$0x3ffffff,%ebx
author	Eric Biggers <ebiggers@google.com>	2019-03-31 13:04:11 -0700
committer	Herbert Xu <herbert@gondor.apana.org.au>	2019-04-08 14:43:06 +0800
commit	678cce4019d746da6c680c48ba9e6d417803e127 (patch)
tree	f1fb27e128d51a60ceb4d961dd7baf11fa1525fa /arch/x86/crypto
parent	b4e9e931e9bb2f5b302ce66640832f5a3e57e8c4 (diff)