diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-27 12:32:06 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-27 12:32:06 -0700 |
commit | 34e1a5d43c5deec563b94f3330b690dde9d1de53 (patch) | |
tree | 304dbe97fd7e31b5d64f888ea51efabbc7bb9fbb /arch | |
parent | 9c44575c78dbcdf89bd9f9bc3869ce8ab5cc1272 (diff) | |
parent | 9805f39d423a30a7189158905ec3d71774fe98a1 (diff) |
Merge tag 'random-6.12-rc1-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random
Pull more random number generator updates from Jason Donenfeld:
- Christophe realized that the LoongArch64 instructions could be
scheduled more similar to how GCC generates code, which Ruoyao
implemented, for a 5% speedup from basically some rearrangements
- An update to MAINTAINERS to match the right files
* tag 'random-6.12-rc1-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/crng/random:
LoongArch: vDSO: Tune chacha implementation
MAINTAINERS: make vDSO getrandom matches more generic
Diffstat (limited to 'arch')
-rw-r--r-- | arch/loongarch/vdso/vgetrandom-chacha.S | 92 |
1 files changed, 55 insertions, 37 deletions
diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S index 7e86a50f6e85..c2733e6c3a8d 100644 --- a/arch/loongarch/vdso/vgetrandom-chacha.S +++ b/arch/loongarch/vdso/vgetrandom-chacha.S @@ -9,23 +9,11 @@ .text -/* Salsa20 quarter-round */ -.macro QR a b c d - add.w \a, \a, \b - xor \d, \d, \a - rotri.w \d, \d, 16 - - add.w \c, \c, \d - xor \b, \b, \c - rotri.w \b, \b, 20 - - add.w \a, \a, \b - xor \d, \d, \a - rotri.w \d, \d, 24 - - add.w \c, \c, \d - xor \b, \b, \c - rotri.w \b, \b, 25 +.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3 + \op \d0, \d0, \s0 + \op \d1, \d1, \s1 + \op \d2, \d2, \s2 + \op \d3, \d3, \s3 .endm /* @@ -74,6 +62,23 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) /* Reuse i as copy3 */ #define copy3 i +/* Packs to be used with OP_4REG */ +#define line0 state0, state1, state2, state3 +#define line1 state4, state5, state6, state7 +#define line2 state8, state9, state10, state11 +#define line3 state12, state13, state14, state15 + +#define line1_perm state5, state6, state7, state4 +#define line2_perm state10, state11, state8, state9 +#define line3_perm state15, state12, state13, state14 + +#define copy copy0, copy1, copy2, copy3 + +#define _16 16, 16, 16, 16 +#define _20 20, 20, 20, 20 +#define _24 24, 24, 24, 24 +#define _25 25, 25, 25, 25 + /* * The ABI requires s0-s9 saved, and sp aligned to 16-byte. * This does not violate the stack-less requirement: no sensitive data @@ -126,16 +131,38 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) li.w i, 10 .Lpermute: /* odd round */ - QR state0, state4, state8, state12 - QR state1, state5, state9, state13 - QR state2, state6, state10, state14 - QR state3, state7, state11, state15 + OP_4REG add.w line0, line1 + OP_4REG xor line3, line0 + OP_4REG rotri.w line3, _16 + + OP_4REG add.w line2, line3 + OP_4REG xor line1, line2 + OP_4REG rotri.w line1, _20 + + OP_4REG add.w line0, line1 + OP_4REG xor line3, line0 + OP_4REG rotri.w line3, _24 + + OP_4REG add.w line2, line3 + OP_4REG xor line1, line2 + OP_4REG rotri.w line1, _25 /* even round */ - QR state0, state5, state10, state15 - QR state1, state6, state11, state12 - QR state2, state7, state8, state13 - QR state3, state4, state9, state14 + OP_4REG add.w line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG rotri.w line3_perm, _16 + + OP_4REG add.w line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG rotri.w line1_perm, _20 + + OP_4REG add.w line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG rotri.w line3_perm, _24 + + OP_4REG add.w line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG rotri.w line1_perm, _25 addi.w i, i, -1 bnez i, .Lpermute @@ -147,10 +174,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) li.w copy3, 0x6b206574 /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */ - add.w state0, state0, copy0 - add.w state1, state1, copy1 - add.w state2, state2, copy2 - add.w state3, state3, copy3 + OP_4REG add.w line0, copy st.w state0, output, 0 st.w state1, output, 4 st.w state2, output, 8 @@ -165,10 +189,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) ld.w state3, key, 12 /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */ - add.w state4, state4, state0 - add.w state5, state5, state1 - add.w state6, state6, state2 - add.w state7, state7, state3 + OP_4REG add.w line1, line0 st.w state4, output, 16 st.w state5, output, 20 st.w state6, output, 24 @@ -181,10 +202,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack) ld.w state3, key, 28 /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */ - add.w state8, state8, state0 - add.w state9, state9, state1 - add.w state10, state10, state2 - add.w state11, state11, state3 + OP_4REG add.w line2, line0 st.w state8, output, 32 st.w state9, output, 36 st.w state10, output, 40 |