summaryrefslogtreecommitdiff
path: root/arch/arm/crypto/crct10dif-ce-core.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/arm/crypto/crct10dif-ce-core.S')
-rw-r--r--arch/arm/crypto/crct10dif-ce-core.S568
1 files changed, 261 insertions, 307 deletions
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
index ce45ba0c0687..86be258a803f 100644
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -2,12 +2,14 @@
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
+// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
@@ -54,19 +56,11 @@
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
-// Function API:
-// UINT16 crc_t10dif_pcl(
-// UINT16 init_crc, //initial CRC value, 16 bits
-// const unsigned char *buf, //buffer pointer to calculate CRC on
-// UINT64 len //buffer length in bytes (64-bit data)
-// );
-//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
-//
#include <linux/linkage.h>
#include <asm/assembler.h>
@@ -78,13 +72,14 @@
#endif
.text
+ .arch armv7-a
.fpu crypto-neon-fp-armv8
- arg1_low32 .req r0
- arg2 .req r1
- arg3 .req r2
+ init_crc .req r0
+ buf .req r1
+ len .req r2
- qzr .req q13
+ fold_consts_ptr .req ip
q0l .req d0
q0h .req d1
@@ -102,82 +97,35 @@
q6h .req d13
q7l .req d14
q7h .req d15
-
-ENTRY(crc_t10dif_pmull)
- vmov.i8 qzr, #0 // init zero register
-
- // adjust the 16-bit initial_crc value, scale it to 32 bits
- lsl arg1_low32, arg1_low32, #16
-
- // check if smaller than 256
- cmp arg3, #256
-
- // for sizes less than 128, we can't fold 64B at a time...
- blt _less_than_128
-
- // load the initial crc value
- // crc value does not need to be byte-reflected, but it needs
- // to be moved to the high part of the register.
- // because data will be byte-reflected and will align with
- // initial crc at correct place.
- vmov s0, arg1_low32 // initial crc
- vext.8 q10, qzr, q0, #4
-
- // receive the initial 64B data, xor the initial crc value
- vld1.64 {q0-q1}, [arg2, :128]!
- vld1.64 {q2-q3}, [arg2, :128]!
- vld1.64 {q4-q5}, [arg2, :128]!
- vld1.64 {q6-q7}, [arg2, :128]!
-CPU_LE( vrev64.8 q0, q0 )
-CPU_LE( vrev64.8 q1, q1 )
-CPU_LE( vrev64.8 q2, q2 )
-CPU_LE( vrev64.8 q3, q3 )
-CPU_LE( vrev64.8 q4, q4 )
-CPU_LE( vrev64.8 q5, q5 )
-CPU_LE( vrev64.8 q6, q6 )
-CPU_LE( vrev64.8 q7, q7 )
-
- vswp d0, d1
- vswp d2, d3
- vswp d4, d5
- vswp d6, d7
- vswp d8, d9
- vswp d10, d11
- vswp d12, d13
- vswp d14, d15
-
- // XOR the initial_crc value
- veor.8 q0, q0, q10
-
- adr ip, rk3
- vld1.64 {q10}, [ip, :128] // xmm10 has rk3 and rk4
-
- //
- // we subtract 256 instead of 128 to save one instruction from the loop
- //
- sub arg3, arg3, #256
-
- // at this section of the code, there is 64*x+y (0<=y<64) bytes of
- // buffer. The _fold_64_B_loop will fold 64B at a time
- // until we have 64+y Bytes of buffer
-
-
- // fold 64B at a time. This section of the code folds 4 vector
- // registers in parallel
-_fold_64_B_loop:
-
- .macro fold64, reg1, reg2
- vld1.64 {q11-q12}, [arg2, :128]!
-
- vmull.p64 q8, \reg1\()h, d21
- vmull.p64 \reg1, \reg1\()l, d20
- vmull.p64 q9, \reg2\()h, d21
- vmull.p64 \reg2, \reg2\()l, d20
-
-CPU_LE( vrev64.8 q11, q11 )
-CPU_LE( vrev64.8 q12, q12 )
- vswp d22, d23
- vswp d24, d25
+ q8l .req d16
+ q8h .req d17
+ q9l .req d18
+ q9h .req d19
+ q10l .req d20
+ q10h .req d21
+ q11l .req d22
+ q11h .req d23
+ q12l .req d24
+ q12h .req d25
+
+ FOLD_CONSTS .req q10
+ FOLD_CONST_L .req q10l
+ FOLD_CONST_H .req q10h
+
+ // Fold reg1, reg2 into the next 32 data bytes, storing the result back
+ // into reg1, reg2.
+ .macro fold_32_bytes, reg1, reg2
+ vld1.64 {q11-q12}, [buf]!
+
+ vmull.p64 q8, \reg1\()h, FOLD_CONST_H
+ vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L
+ vmull.p64 q9, \reg2\()h, FOLD_CONST_H
+ vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L
+
+CPU_LE( vrev64.8 q11, q11 )
+CPU_LE( vrev64.8 q12, q12 )
+ vswp q11l, q11h
+ vswp q12l, q12h
veor.8 \reg1, \reg1, q8
veor.8 \reg2, \reg2, q9
@@ -185,242 +133,248 @@ CPU_LE( vrev64.8 q12, q12 )
veor.8 \reg2, \reg2, q12
.endm
- fold64 q0, q1
- fold64 q2, q3
- fold64 q4, q5
- fold64 q6, q7
-
- subs arg3, arg3, #128
-
- // check if there is another 64B in the buffer to be able to fold
- bge _fold_64_B_loop
-
- // at this point, the buffer pointer is pointing at the last y Bytes
- // of the buffer the 64B of folded data is in 4 of the vector
- // registers: v0, v1, v2, v3
-
- // fold the 8 vector registers to 1 vector register with different
- // constants
-
- adr ip, rk9
- vld1.64 {q10}, [ip, :128]!
-
- .macro fold16, reg, rk
- vmull.p64 q8, \reg\()l, d20
- vmull.p64 \reg, \reg\()h, d21
- .ifnb \rk
- vld1.64 {q10}, [ip, :128]!
+ // Fold src_reg into dst_reg, optionally loading the next fold constants
+ .macro fold_16_bytes, src_reg, dst_reg, load_next_consts
+ vmull.p64 q8, \src_reg\()l, FOLD_CONST_L
+ vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H
+ .ifnb \load_next_consts
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
.endif
- veor.8 q7, q7, q8
- veor.8 q7, q7, \reg
+ veor.8 \dst_reg, \dst_reg, q8
+ veor.8 \dst_reg, \dst_reg, \src_reg
.endm
- fold16 q0, rk11
- fold16 q1, rk13
- fold16 q2, rk15
- fold16 q3, rk17
- fold16 q4, rk19
- fold16 q5, rk1
- fold16 q6
-
- // instead of 64, we add 48 to the loop counter to save 1 instruction
- // from the loop instead of a cmp instruction, we use the negative
- // flag with the jl instruction
- adds arg3, arg3, #(128-16)
- blt _final_reduction_for_128
-
- // now we have 16+y bytes left to reduce. 16 Bytes is in register v7
- // and the rest is in memory. We can fold 16 bytes at a time if y>=16
- // continue folding 16B at a time
-
-_16B_reduction_loop:
- vmull.p64 q8, d14, d20
- vmull.p64 q7, d15, d21
- veor.8 q7, q7, q8
+ .macro __adrl, out, sym
+ movw \out, #:lower16:\sym
+ movt \out, #:upper16:\sym
+ .endm
- vld1.64 {q0}, [arg2, :128]!
-CPU_LE( vrev64.8 q0, q0 )
- vswp d0, d1
- veor.8 q7, q7, q0
- subs arg3, arg3, #16
-
- // instead of a cmp instruction, we utilize the flags with the
- // jge instruction equivalent of: cmp arg3, 16-16
- // check if there is any more 16B in the buffer to be able to fold
- bge _16B_reduction_loop
-
- // now we have 16+z bytes left to reduce, where 0<= z < 16.
- // first, we reduce the data in the xmm7 register
-
-_final_reduction_for_128:
- // check if any more data to fold. If not, compute the CRC of
- // the final 128 bits
- adds arg3, arg3, #16
- beq _128_done
-
- // here we are getting data that is less than 16 bytes.
- // since we know that there was data before the pointer, we can
- // offset the input pointer before the actual point, to receive
- // exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
- add arg2, arg2, arg3
- sub arg2, arg2, #16
- vld1.64 {q1}, [arg2]
-CPU_LE( vrev64.8 q1, q1 )
- vswp d2, d3
-
- // get rid of the extra data that was loaded before
- // load the shift constant
- adr ip, tbl_shf_table + 16
- sub ip, ip, arg3
- vld1.8 {q0}, [ip]
-
- // shift v2 to the left by arg3 bytes
- vtbl.8 d4, {d14-d15}, d0
- vtbl.8 d5, {d14-d15}, d1
-
- // shift v7 to the right by 16-arg3 bytes
- vmov.i8 q9, #0x80
- veor.8 q0, q0, q9
- vtbl.8 d18, {d14-d15}, d0
- vtbl.8 d19, {d14-d15}, d1
-
- // blend
- vshr.s8 q0, q0, #7 // convert to 8-bit mask
- vbsl.8 q0, q2, q1
-
- // fold 16 Bytes
- vmull.p64 q8, d18, d20
- vmull.p64 q7, d19, d21
+//
+// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+ENTRY(crc_t10dif_pmull)
+
+ // For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp len, #256
+ blt .Lless_than_256_bytes
+
+ __adrl fold_consts_ptr, .Lfold_across_128_bytes_consts
+
+ // Load the first 128 data bytes. Byte swapping is necessary to make
+ // the bit order match the polynomial coefficient order.
+ vld1.64 {q0-q1}, [buf]!
+ vld1.64 {q2-q3}, [buf]!
+ vld1.64 {q4-q5}, [buf]!
+ vld1.64 {q6-q7}, [buf]!
+CPU_LE( vrev64.8 q0, q0 )
+CPU_LE( vrev64.8 q1, q1 )
+CPU_LE( vrev64.8 q2, q2 )
+CPU_LE( vrev64.8 q3, q3 )
+CPU_LE( vrev64.8 q4, q4 )
+CPU_LE( vrev64.8 q5, q5 )
+CPU_LE( vrev64.8 q6, q6 )
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q0l, q0h
+ vswp q1l, q1h
+ vswp q2l, q2h
+ vswp q3l, q3h
+ vswp q4l, q4h
+ vswp q5l, q5h
+ vswp q6l, q6h
+ vswp q7l, q7h
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q8h, #0
+ vmov.u16 q8h[3], init_crc
+ veor q0h, q0h, q8h
+
+ // Load the constants for folding across 128 bytes.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ // Subtract 128 for the 128 data bytes just consumed. Subtract another
+ // 128 to simplify the termination condition of the following loop.
+ sub len, len, #256
+
+ // While >= 128 data bytes remain (not counting q0-q7), fold the 128
+ // bytes q0-q7 into them, storing the result back into q0-q7.
+.Lfold_128_bytes_loop:
+ fold_32_bytes q0, q1
+ fold_32_bytes q2, q3
+ fold_32_bytes q4, q5
+ fold_32_bytes q6, q7
+ subs len, len, #128
+ bge .Lfold_128_bytes_loop
+
+ // Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
+
+ // Fold across 64 bytes.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+ fold_16_bytes q0, q4
+ fold_16_bytes q1, q5
+ fold_16_bytes q2, q6
+ fold_16_bytes q3, q7, 1
+ // Fold across 32 bytes.
+ fold_16_bytes q4, q6
+ fold_16_bytes q5, q7, 1
+ // Fold across 16 bytes.
+ fold_16_bytes q6, q7
+
+ // Add 128 to get the correct number of data bytes remaining in 0...127
+ // (not counting q7), following the previous extra subtraction by 128.
+ // Then subtract 16 to simplify the termination condition of the
+ // following loop.
+ adds len, len, #(128-16)
+
+ // While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
+ // into them, storing the result back into q7.
+ blt .Lfold_16_bytes_loop_done
+.Lfold_16_bytes_loop:
+ vmull.p64 q8, q7l, FOLD_CONST_L
+ vmull.p64 q7, q7h, FOLD_CONST_H
veor.8 q7, q7, q8
+ vld1.64 {q0}, [buf]!
+CPU_LE( vrev64.8 q0, q0 )
+ vswp q0l, q0h
veor.8 q7, q7, q0
-
-_128_done:
- // compute crc of a 128-bit value
- vldr d20, rk5
- vldr d21, rk6 // rk5 and rk6 in xmm10
-
- // 64b fold
- vext.8 q0, qzr, q7, #8
- vmull.p64 q7, d15, d20
+ subs len, len, #16
+ bge .Lfold_16_bytes_loop
+
+.Lfold_16_bytes_loop_done:
+ // Add 16 to get the correct number of data bytes remaining in 0...15
+ // (not counting q7), following the previous extra subtraction by 16.
+ adds len, len, #16
+ beq .Lreduce_final_16_bytes
+
+.Lhandle_partial_segment:
+ // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
+ // 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
+ // do this without needing a fold constant for each possible 'len',
+ // redivide the bytes into a first chunk of 'len' bytes and a second
+ // chunk of 16 bytes, then fold the first chunk into the second.
+
+ // q0 = last 16 original data bytes
+ add buf, buf, len
+ sub buf, buf, #16
+ vld1.64 {q0}, [buf]
+CPU_LE( vrev64.8 q0, q0 )
+ vswp q0l, q0h
+
+ // q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
+ __adrl r3, .Lbyteshift_table + 16
+ sub r3, r3, len
+ vld1.8 {q2}, [r3]
+ vtbl.8 q1l, {q7l-q7h}, q2l
+ vtbl.8 q1h, {q7l-q7h}, q2h
+
+ // q3 = first chunk: q7 right-shifted by '16-len' bytes.
+ vmov.i8 q3, #0x80
+ veor.8 q2, q2, q3
+ vtbl.8 q3l, {q7l-q7h}, q2l
+ vtbl.8 q3h, {q7l-q7h}, q2h
+
+ // Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
+ vshr.s8 q2, q2, #7
+
+ // q2 = second chunk: 'len' bytes from q0 (low-order bytes),
+ // then '16-len' bytes from q1 (high-order bytes).
+ vbsl.8 q2, q1, q0
+
+ // Fold the first chunk into the second chunk, storing the result in q7.
+ vmull.p64 q0, q3l, FOLD_CONST_L
+ vmull.p64 q7, q3h, FOLD_CONST_H
veor.8 q7, q7, q0
+ veor.8 q7, q7, q2
+
+.Lreduce_final_16_bytes:
+ // Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ vmull.p64 q0, q7h, FOLD_CONST_H // high bits * x^48 * (x^80 mod G(x))
+ veor.8 q0h, q0h, q7l // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ vmov.i8 q1, #0
+ vmov s4, s3 // extract high 32 bits
+ vmov s3, s5 // zero high 32 bits
+ vmull.p64 q1, q1l, FOLD_CONST_L // high 32 bits * x^48 * (x^48 mod G(x))
+ veor.8 q0, q0, q1 // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]
+
+ // Use Barrett reduction to compute the final CRC value.
+ vmull.p64 q1, q0h, FOLD_CONST_H // high 32 bits * floor(x^48 / G(x))
+ vshr.u64 q1l, q1l, #32 // /= x^32
+ vmull.p64 q1, q1l, FOLD_CONST_L // *= G(x)
+ vshr.u64 q0l, q0l, #48
+ veor.8 q0l, q0l, q1l // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of q0.
+
+ vmov.u16 r0, q0l[0]
+ bx lr
- // 32b fold
- vext.8 q0, q7, qzr, #12
- vmov s31, s3
- vmull.p64 q0, d0, d21
- veor.8 q7, q0, q7
-
- // barrett reduction
-_barrett:
- vldr d20, rk7
- vldr d21, rk8
-
- vmull.p64 q0, d15, d20
- vext.8 q0, qzr, q0, #12
- vmull.p64 q0, d1, d21
- vext.8 q0, qzr, q0, #12
- veor.8 q7, q7, q0
- vmov r0, s29
+.Lless_than_256_bytes:
+ // Checksumming a buffer of length 16...255 bytes
-_cleanup:
- // scale the result back to 16 bits
- lsr r0, r0, #16
- bx lr
+ __adrl fold_consts_ptr, .Lfold_across_16_bytes_consts
-_less_than_128:
- teq arg3, #0
- beq _cleanup
+ // Load the first 16 data bytes.
+ vld1.64 {q7}, [buf]!
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
- vmov.i8 q0, #0
- vmov s3, arg1_low32 // get the initial crc value
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q0h, #0
+ vmov.u16 q0h[3], init_crc
+ veor.8 q7h, q7h, q0h
- vld1.64 {q7}, [arg2, :128]!
-CPU_LE( vrev64.8 q7, q7 )
- vswp d14, d15
- veor.8 q7, q7, q0
+ // Load the fold-across-16-bytes constants.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- cmp arg3, #16
- beq _128_done // exactly 16 left
- blt _less_than_16_left
-
- // now if there is, load the constants
- vldr d20, rk1
- vldr d21, rk2 // rk1 and rk2 in xmm10
-
- // check if there is enough buffer to be able to fold 16B at a time
- subs arg3, arg3, #32
- addlt arg3, arg3, #16
- blt _get_last_two_regs
- b _16B_reduction_loop
-
-_less_than_16_left:
- // shl r9, 4
- adr ip, tbl_shf_table + 16
- sub ip, ip, arg3
- vld1.8 {q0}, [ip]
- vmov.i8 q9, #0x80
- veor.8 q0, q0, q9
- vtbl.8 d18, {d14-d15}, d0
- vtbl.8 d15, {d14-d15}, d1
- vmov d14, d18
- b _128_done
+ cmp len, #16
+ beq .Lreduce_final_16_bytes // len == 16
+ subs len, len, #32
+ addlt len, len, #16
+ blt .Lhandle_partial_segment // 17 <= len <= 31
+ b .Lfold_16_bytes_loop // 32 <= len <= 255
ENDPROC(crc_t10dif_pmull)
-// precomputed constants
-// these constants are precomputed from the poly:
-// 0x8bb70000 (0x8bb7 scaled to 32 bits)
+ .section ".rodata", "a"
.align 4
-// Q = 0x18BB70000
-// rk1 = 2^(32*3) mod Q << 32
-// rk2 = 2^(32*5) mod Q << 32
-// rk3 = 2^(32*15) mod Q << 32
-// rk4 = 2^(32*17) mod Q << 32
-// rk5 = 2^(32*3) mod Q << 32
-// rk6 = 2^(32*2) mod Q << 32
-// rk7 = floor(2^64/Q)
-// rk8 = Q
-
-rk3: .quad 0x9d9d000000000000
-rk4: .quad 0x7cf5000000000000
-rk5: .quad 0x2d56000000000000
-rk6: .quad 0x1368000000000000
-rk7: .quad 0x00000001f65a57f8
-rk8: .quad 0x000000018bb70000
-rk9: .quad 0xceae000000000000
-rk10: .quad 0xbfd6000000000000
-rk11: .quad 0x1e16000000000000
-rk12: .quad 0x713c000000000000
-rk13: .quad 0xf7f9000000000000
-rk14: .quad 0x80a6000000000000
-rk15: .quad 0x044c000000000000
-rk16: .quad 0xe658000000000000
-rk17: .quad 0xad18000000000000
-rk18: .quad 0xa497000000000000
-rk19: .quad 0x6ee3000000000000
-rk20: .quad 0xe7b5000000000000
-rk1: .quad 0x2d56000000000000
-rk2: .quad 0x06df000000000000
-
-tbl_shf_table:
-// use these values for shift constants for the tbl/tbx instruction
-// different alignments result in values as shown:
-// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
+// Fold constants precomputed from the polynomial 0x18bb7
+// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 // x^(8*128) mod G(x)
+ .quad 0x0000000000002295 // x^(8*128+64) mod G(x)
+// .Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 // x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
+// .Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d // x^(2*128) mod G(x)
+ .quad 0x0000000000007acc // x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 // x^(1*128) mod G(x)
+ .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
+// .Lfinal_fold_consts:
+ .quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
+// .Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 // G(x)
+ .quad 0x00000001f65a57f8 // floor(x^48 / G(x))
+
+// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
+// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
+// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
.byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7