mirror of
https://github.com/openwrt/openwrt.git
synced 2025-01-31 00:24:12 +00:00
kernel-5.10: backport chacha non block size optimizations
These make a big difference when running WireGuard on small armv7 routers, and the 5.4 backport already has them. Suggested-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com> Cc: David Bauer <mail@david-bauer.net> Cc: Petr Štetiar <ynezz@true.cz> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
This commit is contained in:
parent
d540725871
commit
1265dbafcd
@ -0,0 +1,272 @@
|
||||
From 03662fcd41f4b764857f17b95f9a2a63c24bddd4 Mon Sep 17 00:00:00 2001
|
||||
From: Ard Biesheuvel <ardb@kernel.org>
|
||||
Date: Tue, 3 Nov 2020 17:28:09 +0100
|
||||
Subject: [PATCH 1/2] crypto: arm/chacha-neon - optimize for non-block size
|
||||
multiples
|
||||
|
||||
commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream.
|
||||
|
||||
The current NEON based ChaCha implementation for ARM is optimized for
|
||||
multiples of 4x the ChaCha block size (64 bytes). This makes sense for
|
||||
block encryption, but given that ChaCha is also often used in the
|
||||
context of networking, it makes sense to consider arbitrary length
|
||||
inputs as well.
|
||||
|
||||
For example, WireGuard typically uses 1420 byte packets, and performing
|
||||
ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
|
||||
and 3 invocations of chacha_block_xor_neon(), where the last one also
|
||||
involves a memcpy() using a buffer on the stack to process the final
|
||||
chunk of 1420 % 64 == 12 bytes.
|
||||
|
||||
Let's optimize for this case as well, by letting chacha_4block_xor_neon()
|
||||
deal with any input size between 64 and 256 bytes, using NEON permutation
|
||||
instructions and overlapping loads and stores. This way, the 140 byte
|
||||
tail of a 1420 byte input buffer can simply be processed in one go.
|
||||
|
||||
This results in the following performance improvements for 1420 byte
|
||||
blocks, without significant impact on power-of-2 input sizes. (Note
|
||||
that Raspberry Pi is widely used in combination with a 32-bit kernel,
|
||||
even though the core is 64-bit capable)
|
||||
|
||||
Cortex-A8 (BeagleBone) : 7%
|
||||
Cortex-A15 (Calxeda Midway) : 21%
|
||||
Cortex-A53 (Raspberry Pi 3) : 3%
|
||||
Cortex-A72 (Raspberry Pi 4) : 19%
|
||||
|
||||
Cc: Eric Biggers <ebiggers@google.com>
|
||||
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
|
||||
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
|
||||
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
|
||||
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
|
||||
---
|
||||
arch/arm/crypto/chacha-glue.c | 34 +++++------
|
||||
arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++---
|
||||
2 files changed, 107 insertions(+), 24 deletions(-)
|
||||
|
||||
--- a/arch/arm/crypto/chacha-glue.c
|
||||
+++ b/arch/arm/crypto/chacha-glue.c
|
||||
@@ -23,7 +23,7 @@
|
||||
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||
int nrounds);
|
||||
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||
- int nrounds);
|
||||
+ int nrounds, unsigned int nbytes);
|
||||
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
|
||||
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
|
||||
|
||||
@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8
|
||||
{
|
||||
u8 buf[CHACHA_BLOCK_SIZE];
|
||||
|
||||
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
||||
- chacha_4block_xor_neon(state, dst, src, nrounds);
|
||||
- bytes -= CHACHA_BLOCK_SIZE * 4;
|
||||
- src += CHACHA_BLOCK_SIZE * 4;
|
||||
- dst += CHACHA_BLOCK_SIZE * 4;
|
||||
- state[12] += 4;
|
||||
- }
|
||||
- while (bytes >= CHACHA_BLOCK_SIZE) {
|
||||
- chacha_block_xor_neon(state, dst, src, nrounds);
|
||||
- bytes -= CHACHA_BLOCK_SIZE;
|
||||
- src += CHACHA_BLOCK_SIZE;
|
||||
- dst += CHACHA_BLOCK_SIZE;
|
||||
- state[12]++;
|
||||
+ while (bytes > CHACHA_BLOCK_SIZE) {
|
||||
+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
|
||||
+
|
||||
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
|
||||
+ bytes -= l;
|
||||
+ src += l;
|
||||
+ dst += l;
|
||||
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
|
||||
}
|
||||
if (bytes) {
|
||||
- memcpy(buf, src, bytes);
|
||||
- chacha_block_xor_neon(state, buf, buf, nrounds);
|
||||
- memcpy(dst, buf, bytes);
|
||||
+ const u8 *s = src;
|
||||
+ u8 *d = dst;
|
||||
+
|
||||
+ if (bytes != CHACHA_BLOCK_SIZE)
|
||||
+ s = d = memcpy(buf, src, bytes);
|
||||
+ chacha_block_xor_neon(state, d, s, nrounds);
|
||||
+ if (d != dst)
|
||||
+ memcpy(dst, buf, bytes);
|
||||
}
|
||||
}
|
||||
|
||||
--- a/arch/arm/crypto/chacha-neon-core.S
|
||||
+++ b/arch/arm/crypto/chacha-neon-core.S
|
||||
@@ -47,6 +47,7 @@
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
+#include <asm/cache.h>
|
||||
|
||||
.text
|
||||
.fpu neon
|
||||
@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
|
||||
|
||||
.align 5
|
||||
ENTRY(chacha_4block_xor_neon)
|
||||
- push {r4-r5}
|
||||
+ push {r4, lr}
|
||||
mov r4, sp // preserve the stack pointer
|
||||
sub ip, sp, #0x20 // allocate a 32 byte buffer
|
||||
bic ip, ip, #0x1f // aligned to 32 bytes
|
||||
@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
|
||||
vld1.32 {q0-q1}, [r0]
|
||||
vld1.32 {q2-q3}, [ip]
|
||||
|
||||
- adr r5, .Lctrinc
|
||||
+ adr lr, .Lctrinc
|
||||
vdup.32 q15, d7[1]
|
||||
vdup.32 q14, d7[0]
|
||||
- vld1.32 {q4}, [r5, :128]
|
||||
+ vld1.32 {q4}, [lr, :128]
|
||||
vdup.32 q13, d6[1]
|
||||
vdup.32 q12, d6[0]
|
||||
vdup.32 q11, d5[1]
|
||||
@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
|
||||
|
||||
// Re-interleave the words in the first two rows of each block (x0..7).
|
||||
// Also add the counter values 0-3 to x12[0-3].
|
||||
- vld1.32 {q8}, [r5, :128] // load counter values 0-3
|
||||
+ vld1.32 {q8}, [lr, :128] // load counter values 0-3
|
||||
vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
|
||||
vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
|
||||
vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
|
||||
@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
|
||||
|
||||
// Re-interleave the words in the last two rows of each block (x8..15).
|
||||
vld1.32 {q8-q9}, [sp, :256]
|
||||
+ mov sp, r4 // restore original stack pointer
|
||||
+ ldr r4, [r4, #8] // load number of bytes
|
||||
vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
|
||||
vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
|
||||
vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
|
||||
@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
|
||||
// XOR the rest of the data with the keystream
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
+ subs r4, r4, #96
|
||||
veor q0, q0, q8
|
||||
veor q1, q1, q12
|
||||
+ ble .Lle96
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
+ subs r4, r4, #32
|
||||
veor q0, q0, q2
|
||||
veor q1, q1, q6
|
||||
+ ble .Lle128
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
+ subs r4, r4, #32
|
||||
veor q0, q0, q10
|
||||
veor q1, q1, q14
|
||||
+ ble .Lle160
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
+ subs r4, r4, #32
|
||||
veor q0, q0, q4
|
||||
veor q1, q1, q5
|
||||
+ ble .Lle192
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
+ subs r4, r4, #32
|
||||
veor q0, q0, q9
|
||||
veor q1, q1, q13
|
||||
+ ble .Lle224
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]!
|
||||
+ subs r4, r4, #32
|
||||
veor q0, q0, q3
|
||||
veor q1, q1, q7
|
||||
+ blt .Llt256
|
||||
+.Lout:
|
||||
vst1.8 {q0-q1}, [r1]!
|
||||
|
||||
vld1.8 {q0-q1}, [r2]
|
||||
- mov sp, r4 // restore original stack pointer
|
||||
veor q0, q0, q11
|
||||
veor q1, q1, q15
|
||||
vst1.8 {q0-q1}, [r1]
|
||||
|
||||
- pop {r4-r5}
|
||||
- bx lr
|
||||
+ pop {r4, pc}
|
||||
+
|
||||
+.Lle192:
|
||||
+ vmov q4, q9
|
||||
+ vmov q5, q13
|
||||
+
|
||||
+.Lle160:
|
||||
+ // nothing to do
|
||||
+
|
||||
+.Lfinalblock:
|
||||
+ // Process the final block if processing less than 4 full blocks.
|
||||
+ // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
|
||||
+ // previous 32 byte output block that still needs to be written at
|
||||
+ // [r1] in q0-q1.
|
||||
+ beq .Lfullblock
|
||||
+
|
||||
+.Lpartialblock:
|
||||
+ adr lr, .Lpermute + 32
|
||||
+ add r2, r2, r4
|
||||
+ add lr, lr, r4
|
||||
+ add r4, r4, r1
|
||||
+
|
||||
+ vld1.8 {q2-q3}, [lr]
|
||||
+ vld1.8 {q6-q7}, [r2]
|
||||
+
|
||||
+ add r4, r4, #32
|
||||
+
|
||||
+ vtbl.8 d4, {q4-q5}, d4
|
||||
+ vtbl.8 d5, {q4-q5}, d5
|
||||
+ vtbl.8 d6, {q4-q5}, d6
|
||||
+ vtbl.8 d7, {q4-q5}, d7
|
||||
+
|
||||
+ veor q6, q6, q2
|
||||
+ veor q7, q7, q3
|
||||
+
|
||||
+ vst1.8 {q6-q7}, [r4] // overlapping stores
|
||||
+ vst1.8 {q0-q1}, [r1]
|
||||
+ pop {r4, pc}
|
||||
+
|
||||
+.Lfullblock:
|
||||
+ vmov q11, q4
|
||||
+ vmov q15, q5
|
||||
+ b .Lout
|
||||
+.Lle96:
|
||||
+ vmov q4, q2
|
||||
+ vmov q5, q6
|
||||
+ b .Lfinalblock
|
||||
+.Lle128:
|
||||
+ vmov q4, q10
|
||||
+ vmov q5, q14
|
||||
+ b .Lfinalblock
|
||||
+.Lle224:
|
||||
+ vmov q4, q3
|
||||
+ vmov q5, q7
|
||||
+ b .Lfinalblock
|
||||
+.Llt256:
|
||||
+ vmov q4, q11
|
||||
+ vmov q5, q15
|
||||
+ b .Lpartialblock
|
||||
ENDPROC(chacha_4block_xor_neon)
|
||||
+
|
||||
+ .align L1_CACHE_SHIFT
|
||||
+.Lpermute:
|
||||
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
|
||||
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
|
||||
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
|
||||
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
|
||||
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
|
||||
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
|
||||
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
|
||||
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
|
@ -0,0 +1,38 @@
|
||||
From 7f63462faf9eab69132bea9abd48c2c05a93145b Mon Sep 17 00:00:00 2001
|
||||
From: Ard Biesheuvel <ardb@kernel.org>
|
||||
Date: Sun, 13 Dec 2020 15:39:29 +0100
|
||||
Subject: [PATCH 2/2] crypto: arm/chacha-neon - add missing counter increment
|
||||
|
||||
commit fd16931a2f518a32753920ff20895e5cf04c8ff1 upstream.
|
||||
|
||||
Commit 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block
|
||||
size multiples") refactored the chacha block handling in the glue code in
|
||||
a way that may result in the counter increment being omitted when calling
|
||||
chacha_block_xor_neon() to process a full block. This violates the skcipher
|
||||
API, which requires that the output IV is suitable for handling more input
|
||||
as long as the preceding input has been presented in round multiples of the
|
||||
block size. Also, the same code is exposed via the chacha library interface
|
||||
whose callers may actually rely on this increment to occur even for final
|
||||
blocks that are smaller than the chacha block size.
|
||||
|
||||
So increment the counter after calling chacha_block_xor_neon().
|
||||
|
||||
Fixes: 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block size multiples")
|
||||
Reported-by: Eric Biggers <ebiggers@kernel.org>
|
||||
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
|
||||
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
|
||||
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
|
||||
---
|
||||
arch/arm/crypto/chacha-glue.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
--- a/arch/arm/crypto/chacha-glue.c
|
||||
+++ b/arch/arm/crypto/chacha-glue.c
|
||||
@@ -60,6 +60,7 @@ static void chacha_doneon(u32 *state, u8
|
||||
chacha_block_xor_neon(state, d, s, nrounds);
|
||||
if (d != dst)
|
||||
memcpy(dst, buf, bytes);
|
||||
+ state[12]++;
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user