Merge branch 'word-at-a-time'

Merge minor word-at-a-time instruction choice improvements for x86 and arm64. This is the second of four branches that came out of me looking at the code generation for path lookup on arm64. The word-at-a-time infrastructure is used to do string operations in chunks of one word both when copying the pathname from user space (in strncpy_from_user()), and when parsing and hashing the individual path components (in link_path_walk()). In particular, the "find the first zero byte" uses various bit tricks to figure out the end of the string or path component, and get the length without having to do things one byte at a time. Both x86-64 and arm64 had less than optimal code choices for that. The commit message for the arm64 change in particular tries to explain the exact code flow for the zero byte finding for people who care. It's made a bit more complicated by the fact that we support big-endian hardware too, and so we have some extra abstraction layers to allow different models for finding the zero byte, quite apart from the issue of picking specialized instructions. * word-at-a-time: arm64: word-at-a-time: improve byte count calculations for LE x86-64: word-at-a-time: improve byte count calculations
author: Linus Torvalds <torvalds@linux-foundation.org> 2024-07-15 08:55:10 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2024-07-15 08:55:10 -0700
commit: 6a31ffdfed10dc48e6fd1775d50c22429382ab98 (patch)
tree: ab78c04fb92e200a8d0182d953bd588cc08e8a58 /arch
parent: a5819099f601c1af5b86b1f5921a56859e45b19a (diff)
parent: f915a3e5b0182dd7376f11337e231500a157e1f4 (diff)
2 files changed, 26 insertions, 42 deletions
diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h
index 14251abee23c..824ca6987a51 100644
--- a/arch/arm64/include/asm/word-at-a-time.h
+++ b/arch/arm64/include/asm/word-at-a-time.h
@@ -27,20 +27,15 @@ static inline unsigned long has_zero(unsigned long a, unsigned long *bits,
 }
 
 #define prep_zero_mask(a, bits, c) (bits)
+#define create_zero_mask(bits) (bits)
+#define find_zero(bits) (__ffs(bits) >> 3)
 
-static inline unsigned long create_zero_mask(unsigned long bits)
+static inline unsigned long zero_bytemask(unsigned long bits)
 {
 	bits = (bits - 1) & ~bits;
 	return bits >> 7;
 }
 
-static inline unsigned long find_zero(unsigned long mask)
-{
-	return fls64(mask) >> 3;
-}
-
-#define zero_bytemask(mask) (mask)
-
 #else	/* __AARCH64EB__ */
 #include <asm-generic/word-at-a-time.h>
 #endif
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index e8d7d4941c4c..422a47746657 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -5,45 +5,12 @@
 #include <linux/bitops.h>
 #include <linux/wordpart.h>
 
-/*
- * This is largely generic for little-endian machines, but the
- * optimal byte mask counting is probably going to be something
- * that is architecture-specific. If you have a reliably fast
- * bit count instruction, that might be better than the multiply
- * and shift, for example.
- */
 struct word_at_a_time {
 	const unsigned long one_bits, high_bits;
 };
 
 #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
 
-#ifdef CONFIG_64BIT
-
-/*
- * Jan Achrenius on G+: microoptimized version of
- * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
- * that works for the bytemasks without having to
- * mask them first.
- */
-static inline long count_masked_bytes(unsigned long mask)
-{
-	return mask*0x0001020304050608ul >> 56;
-}
-
-#else	/* 32-bit case */
-
-/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
-static inline long count_masked_bytes(long mask)
-{
-	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
-	long a = (0x0ff0001+mask) >> 23;
-	/* Fix the 1 for 00 case */
-	return a & mask;
-}
-
-#endif
-
 /* Return nonzero if it has a zero */
 static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
 {
@@ -57,6 +24,22 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits,
 	return bits;
 }
 
+#ifdef CONFIG_64BIT
+
+/* Keep the initial has_zero() value for both bitmask and size calc */
+#define create_zero_mask(bits) (bits)
+
+static inline unsigned long zero_bytemask(unsigned long bits)
+{
+	bits = (bits - 1) & ~bits;
+	return bits >> 7;
+}
+
+#define find_zero(bits) (__ffs(bits) >> 3)
+
+#else
+
+/* Create the final mask for both bytemask and size */
 static inline unsigned long create_zero_mask(unsigned long bits)
 {
 	bits = (bits - 1) & ~bits;
@@ -66,11 +49,17 @@ static inline unsigned long create_zero_mask(unsigned long bits)
 /* The mask we created is directly usable as a bytemask */
 #define zero_bytemask(mask) (mask)
 
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
 static inline unsigned long find_zero(unsigned long mask)
 {
-	return count_masked_bytes(mask);
+	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+	long a = (0x0ff0001+mask) >> 23;
+	/* Fix the 1 for 00 case */
+	return a & mask;
 }
 
+#endif
+
 /*
  * Load an unaligned word from kernel space.
  *
author	Linus Torvalds <torvalds@linux-foundation.org>	2024-07-15 08:55:10 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2024-07-15 08:55:10 -0700
commit	6a31ffdfed10dc48e6fd1775d50c22429382ab98 (patch)
tree	ab78c04fb92e200a8d0182d953bd588cc08e8a58 /arch
parent	a5819099f601c1af5b86b1f5921a56859e45b19a (diff)
parent	f915a3e5b0182dd7376f11337e231500a157e1f4 (diff)