author     Noah Goldstein <goldstein.w.n@gmail.com>    2023-05-10 20:10:02 -0500
committer  Dave Hansen <dave.hansen@linux.intel.com>   2023-05-25 10:55:18 -0700
commit     688eb8191b475db5acfd48634600b04fd3dda9ad (patch)
tree       13159d466e73958d620c03aee7fd6a84a71f52c7 /arch/x86/lib
parent     b2ad431f6469b58914ee7254302c6dc97f688e54 (diff)
x86/csum: Improve performance of `csum_partial`
1) Add a special case for len == 40, as that is the hottest value. This
   nets a ~8-9% latency improvement and a ~30% throughput improvement
   for the len == 40 case.
2) Use multiple accumulators in the 64-byte loop. This dramatically
   improves instruction-level parallelism (ILP) and results in up to a
   40% latency/throughput improvement (better for more iterations); a
   rough C sketch of the idea follows.
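For illustration only, here is a minimal C sketch of the split-accumulator
idea. It is not the kernel code (the actual change is the inline assembly in
the diff below); the 128-bit accumulators and the function name are
assumptions of this sketch.

#include <stdint.h>
#include <stddef.h>

/*
 * Illustrative only: sum 64-byte blocks with two independent accumulators
 * so the CPU can overlap two add/adc dependency chains instead of
 * serializing one long adcq chain. The 128-bit accumulators stand in for
 * the separate carry counter used by the real asm.
 */
static uint64_t sum_blocks_two_accum(const uint64_t *p, size_t len)
{
        unsigned __int128 a = 0, b = 0;

        for (; len >= 64; len -= 64, p += 8) {
                a += p[0]; a += p[1]; a += p[2]; a += p[3];     /* chain 1 */
                b += p[4]; b += p[5]; b += p[6]; b += p[7];     /* chain 2 */
        }
        a += b;
        /* fold the accumulated carries (high 64 bits) back in, end-around */
        a = (unsigned __int128)(uint64_t)a + (uint64_t)(a >> 64);
        a = (unsigned __int128)(uint64_t)a + (uint64_t)(a >> 64);
        return (uint64_t)a;
}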
Results from benchmarking on Ice Lake. Times measured with rdtsc(); the
'r' columns are the new/old ratio, so lower is better.
len lat_new lat_old r tput_new tput_old r
8 3.58 3.47 1.032 3.58 3.51 1.021
16 4.14 4.02 1.028 3.96 3.78 1.046
24 4.99 5.03 0.992 4.23 4.03 1.050
32 5.09 5.08 1.001 4.68 4.47 1.048
40 5.57 6.08 0.916 3.05 4.43 0.690
48 6.65 6.63 1.003 4.97 4.69 1.059
56 7.74 7.72 1.003 5.22 4.95 1.055
64 6.65 7.22 0.921 6.38 6.42 0.994
96 9.43 9.96 0.946 7.46 7.54 0.990
128 9.39 12.15 0.773 8.90 8.79 1.012
200 12.65 18.08 0.699 11.63 11.60 1.002
272 15.82 23.37 0.677 14.43 14.35 1.005
440 24.12 36.43 0.662 21.57 22.69 0.951
952 46.20 74.01 0.624 42.98 53.12 0.809
1024 47.12 78.24 0.602 46.36 58.83 0.788
1552 72.01 117.30 0.614 71.92 96.78 0.743
2048 93.07 153.25 0.607 93.28 137.20 0.680
2600 114.73 194.30 0.590 114.28 179.32 0.637
3608 156.34 268.41 0.582 154.97 254.02 0.610
4096 175.01 304.03 0.576 175.89 292.08 0.602
There is no such thing as a free lunch, however, and the special case
for len == 40 does add overhead to the len != 40 cases. This appears to
cost roughly 5% in throughput and slightly less in latency.
Testing:
Part of this change is a new kunit test. The tests check all
alignment X length pairs in [0, 64) X [0, 512).
There are three cases; a rough sketch of the test loop follows the list.
1) Precomputed random inputs/seed. The expected results were generated
   using the generic implementation (which is assumed to be non-buggy).
2) An input of all 1s. The goal of this test is to catch any case where
   a carry is missed.
3) An input that never carries. The goal of this test is to catch any
   case of incorrectly carrying.
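As a rough sketch of the shape of such a test (illustrative only; this is
not the kunit code added by this series, and reference_csum() and
fill_random() are hypothetical helpers):

#include <kunit/test.h>
#include <asm/checksum.h>

/* Hypothetical helpers for this sketch: a trivially correct generic
 * implementation and a deterministic pseudo-random fill. */
__wsum reference_csum(const void *buf, int len, __wsum sum);
void fill_random(u8 *buf, int len);

#define MAX_ALIGN       64
#define MAX_LEN         512

static void csum_all_alignments_test(struct kunit *test)
{
        /* room for the largest alignment offset plus the largest length */
        static u8 buf[MAX_ALIGN + MAX_LEN] __aligned(64);
        int align, len;

        for (align = 0; align < MAX_ALIGN; align++) {
                for (len = 0; len < MAX_LEN; len++) {
                        fill_random(buf + align, len);
                        KUNIT_EXPECT_EQ(test,
                                        (__force u32)csum_partial(buf + align, len, 0),
                                        (__force u32)reference_csum(buf + align, len, 0));
                }
        }
}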
More exhaustive tests covering all alignment X length pairs in
[0, 8192) X [0, 8192] on random data are also available here:
https://github.com/goldsteinn/csum-reproduction
The repository also has the code for reproducing the above benchmark
numbers.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lore.kernel.org/all/20230511011002.935690-1-goldstein.w.n%40gmail.com
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--  arch/x86/lib/csum-partial_64.c | 97
1 file changed, 65 insertions, 32 deletions
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 50734a23034c..fe5861951b15 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -5,22 +5,32 @@
  * This file contains network checksum routines that are better done
  * in an architecture-specific manner due to speed.
  */
-
+
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <asm/checksum.h>
 #include <asm/word-at-a-time.h>
 
-static inline unsigned short from32to16(unsigned a)
+static inline unsigned short from32to16(unsigned a)
 {
-        unsigned short b = a >> 16;
+        unsigned short b = a >> 16;
         asm("addw %w2,%w0\n\t"
-            "adcw $0,%w0\n"
+            "adcw $0,%w0\n"
             : "=r" (b)
             : "0" (b), "r" (a));
         return b;
 }
 
+static inline __wsum csum_tail(unsigned int result, u64 temp64, int odd)
+{
+        result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+        if (unlikely(odd)) {
+                result = from32to16(result);
+                result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+        }
+        return (__force __wsum)result;
+}
+
 /*
  * Do a checksum on an arbitrary memory area.
  * Returns a 32bit checksum.
@@ -47,21 +57,52 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
                 buff++;
         }
 
-        while (unlikely(len >= 64)) {
+        /*
+         * len == 40 is the hot case due to IPv6 headers, but annotating it likely()
+         * has noticeable negative affect on codegen for all other cases with
+         * minimal performance benefit here.
+         */
+        if (len == 40) {
                 asm("addq 0*8(%[src]),%[res]\n\t"
                     "adcq 1*8(%[src]),%[res]\n\t"
                     "adcq 2*8(%[src]),%[res]\n\t"
                     "adcq 3*8(%[src]),%[res]\n\t"
                     "adcq 4*8(%[src]),%[res]\n\t"
-                    "adcq 5*8(%[src]),%[res]\n\t"
-                    "adcq 6*8(%[src]),%[res]\n\t"
-                    "adcq 7*8(%[src]),%[res]\n\t"
                     "adcq $0,%[res]"
-                    : [res] "+r" (temp64)
-                    : [src] "r" (buff)
-                    : "memory");
-                buff += 64;
-                len -= 64;
+                    : [res] "+r"(temp64)
+                    : [src] "r"(buff), "m"(*(const char(*)[40])buff));
+                return csum_tail(result, temp64, odd);
+        }
+        if (unlikely(len >= 64)) {
+                /*
+                 * Extra accumulators for better ILP in the loop.
+                 */
+                u64 tmp_accum, tmp_carries;
+
+                asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
+                    "xorl %k[tmp_carries],%k[tmp_carries]\n\t"
+                    "subl $64, %[len]\n\t"
+                    "1:\n\t"
+                    "addq 0*8(%[src]),%[res]\n\t"
+                    "adcq 1*8(%[src]),%[res]\n\t"
+                    "adcq 2*8(%[src]),%[res]\n\t"
+                    "adcq 3*8(%[src]),%[res]\n\t"
+                    "adcl $0,%k[tmp_carries]\n\t"
+                    "addq 4*8(%[src]),%[tmp_accum]\n\t"
+                    "adcq 5*8(%[src]),%[tmp_accum]\n\t"
+                    "adcq 6*8(%[src]),%[tmp_accum]\n\t"
+                    "adcq 7*8(%[src]),%[tmp_accum]\n\t"
+                    "adcl $0,%k[tmp_carries]\n\t"
+                    "addq $64, %[src]\n\t"
+                    "subl $64, %[len]\n\t"
+                    "jge 1b\n\t"
+                    "addq %[tmp_accum],%[res]\n\t"
+                    "adcq %[tmp_carries],%[res]\n\t"
+                    "adcq $0,%[res]"
+                    : [tmp_accum] "=&r"(tmp_accum),
+                      [tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
+                      [len] "+r"(len), [src] "+r"(buff)
+                    : "m"(*(const char *)buff));
         }
 
         if (len & 32) {
@@ -70,45 +111,37 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
                     "adcq 2*8(%[src]),%[res]\n\t"
                     "adcq 3*8(%[src]),%[res]\n\t"
                     "adcq $0,%[res]"
-                    : [res] "+r" (temp64)
-                    : [src] "r" (buff)
-                    : "memory");
+                    : [res] "+r"(temp64)
+                    : [src] "r"(buff), "m"(*(const char(*)[32])buff));
                 buff += 32;
         }
         if (len & 16) {
                 asm("addq 0*8(%[src]),%[res]\n\t"
                     "adcq 1*8(%[src]),%[res]\n\t"
                     "adcq $0,%[res]"
-                    : [res] "+r" (temp64)
-                    : [src] "r" (buff)
-                    : "memory");
+                    : [res] "+r"(temp64)
+                    : [src] "r"(buff), "m"(*(const char(*)[16])buff));
                 buff += 16;
         }
         if (len & 8) {
                 asm("addq 0*8(%[src]),%[res]\n\t"
                     "adcq $0,%[res]"
-                    : [res] "+r" (temp64)
-                    : [src] "r" (buff)
-                    : "memory");
+                    : [res] "+r"(temp64)
+                    : [src] "r"(buff), "m"(*(const char(*)[8])buff));
                 buff += 8;
         }
         if (len & 7) {
-                unsigned int shift = (8 - (len & 7)) * 8;
+                unsigned int shift = (-len << 3) & 63;
                 unsigned long trail;
 
                 trail = (load_unaligned_zeropad(buff) << shift) >> shift;
 
                 asm("addq %[trail],%[res]\n\t"
                     "adcq $0,%[res]"
-                    : [res] "+r" (temp64)
-                    : [trail] "r" (trail));
+                    : [res] "+r"(temp64)
+                    : [trail] "r"(trail));
         }
-        result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
-        if (unlikely(odd)) {
-                result = from32to16(result);
-                result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
-        }
-        return (__force __wsum)result;
+        return csum_tail(result, temp64, odd);
 }
 EXPORT_SYMBOL(csum_partial);
 
@@ -118,6 +151,6 @@ EXPORT_SYMBOL(csum_partial);
  */
 __sum16 ip_compute_csum(const void *buff, int len)
 {
-        return csum_fold(csum_partial(buff,len,0));
+        return csum_fold(csum_partial(buff, len, 0));
 }
 EXPORT_SYMBOL(ip_compute_csum);
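One tail-handling detail worth calling out: within the "len & 7" branch the
new shift = (-len << 3) & 63 computes the same value as the old
(8 - (len & 7)) * 8, namely 64 - 8 * (len & 7). A quick stand-alone check of
that equivalence (illustrative only, not part of the patch):

#include <assert.h>

int main(void)
{
        /*
         * The branch only runs when len & 7 != 0, so that is the only
         * range in which the two formulas have to agree.
         */
        for (unsigned int len = 1; len < 4096; len++) {
                if (!(len & 7))
                        continue;
                assert(((-len << 3) & 63) == (8 - (len & 7)) * 8);
        }
        return 0;
}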