commit d540725871

Without this patch, the chacha block counter is not incremented on neon
rounds, resulting in incorrect calculations and corrupt packets.

This also switches to using `--no-numbered --zero-commit` so that future
diffs are smaller.

Reported-by: Hans Geiblinger <cybrnook2002@yahoo.com>
Reviewed-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Cc: David Bauer <mail@david-bauer.net>
Cc: Petr Štetiar <ynezz@true.cz>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
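As an illustration of why a stuck block counter corrupts packets, here is a
minimal C sketch of counter-mode keystream use; chacha_block() is a
hypothetical stand-in for the keystream primitive, not the kernel's API:

	#include <stddef.h>
	#include <stdint.h>

	/* Assumed primitive: one 64-byte keystream block from
	 * (key, nonce, counter). */
	void chacha_block(const uint8_t key[32], const uint8_t nonce[12],
			  uint32_t counter, uint8_t out[64]);

	static void chacha_xor(uint8_t *buf, size_t len,
			       const uint8_t key[32],
			       const uint8_t nonce[12], uint32_t counter)
	{
		uint8_t ks[64];

		while (len) {
			size_t n = len < 64 ? len : 64;

			/* The bug class fixed here: without counter++,
			 * every block reuses block 0's keystream. */
			chacha_block(key, nonce, counter++, ks);
			for (size_t i = 0; i < n; i++)
				buf[i] ^= ks[i];
			buf += n;
			len -= n;
		}
	}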
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 6 Nov 2020 17:39:38 +0100
Subject: [PATCH] crypto: arm64/chacha - simplify tail block handling

commit c4fc6328d6c67690a7e6e03f43a5a976a13120ef upstream.

Based on lessons learnt from optimizing the 32-bit version of this driver,
we can simplify the arm64 version considerably, by reordering the final
two stores when the last block is not a multiple of 64 bytes. This removes
the need to use permutation instructions to calculate the elements that are
clobbered by the final overlapping store, given that the store of the
penultimate block now follows it, and that one carries the correct values
for those elements already.

While at it, simplify the overlapping loads as well, by calculating the
address of the final overlapping load upfront, and switching to this
address for every load that would otherwise extend past the end of the
source buffer.

There is no impact on performance, but the resulting code is substantially
smaller and easier to follow.

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/arm64/crypto/chacha-neon-core.S | 193 ++++++++++-----------------
 1 file changed, 69 insertions(+), 124 deletions(-)

--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon)
 	adr_l		x10, .Lpermute
 	and		x5, x4, #63
 	add		x10, x10, x5
-	add		x11, x10, #64
 
 	//
 	// This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE(	  rev		a15, a15	)
 	zip2		v31.4s, v14.4s, v15.4s
 	  eor		a15, a15, w9
 
-	mov		x3, #64
+	add		x3, x2, x4
+	sub		x3, x3, #128		// start of last block
+
 	subs		x5, x4, #128
-	add		x6, x5, x2
-	csel		x3, x3, xzr, ge
-	csel		x2, x2, x6, ge
+	csel		x2, x2, x3, ge
 
 	// interleave 64-bit words in state n, n+2
 	zip1		v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v8.2d, v17.2d, v19.2d
 	zip2		v12.2d, v17.2d, v19.2d
 	  stp		a2, a3, [x1, #-56]
-	ld1		{v16.16b-v19.16b}, [x2], x3
 
 	subs		x6, x4, #192
-	ccmp		x3, xzr, #4, lt
-	add		x7, x6, x2
-	csel		x3, x3, xzr, eq
-	csel		x2, x2, x7, eq
+	ld1		{v16.16b-v19.16b}, [x2], #64
+	csel		x2, x2, x3, ge
 
 	zip1		v1.2d, v20.2d, v22.2d
 	zip2		v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v9.2d, v21.2d, v23.2d
 	zip2		v13.2d, v21.2d, v23.2d
 	  stp		a6, a7, [x1, #-40]
-	ld1		{v20.16b-v23.16b}, [x2], x3
 
 	subs		x7, x4, #256
-	ccmp		x3, xzr, #4, lt
-	add		x8, x7, x2
-	csel		x3, x3, xzr, eq
-	csel		x2, x2, x8, eq
+	ld1		{v20.16b-v23.16b}, [x2], #64
+	csel		x2, x2, x3, ge
 
 	zip1		v2.2d, v24.2d, v26.2d
 	zip2		v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v10.2d, v25.2d, v27.2d
 	zip2		v14.2d, v25.2d, v27.2d
 	  stp		a10, a11, [x1, #-24]
-	ld1		{v24.16b-v27.16b}, [x2], x3
 
 	subs		x8, x4, #320
-	ccmp		x3, xzr, #4, lt
-	add		x9, x8, x2
-	csel		x2, x2, x9, eq
+	ld1		{v24.16b-v27.16b}, [x2], #64
+	csel		x2, x2, x3, ge
 
 	zip1		v3.2d, v28.2d, v30.2d
 	zip2		v7.2d, v28.2d, v30.2d
@@ -699,151 +690,105 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v11.2d, v29.2d, v31.2d
 	zip2		v15.2d, v29.2d, v31.2d
 	  stp		a14, a15, [x1, #-8]
+
+	tbnz		x5, #63, .Lt128
 	ld1		{v28.16b-v31.16b}, [x2]
 
 	// xor with corresponding input, write to output
-	tbnz		x5, #63, 0f
 	eor		v16.16b, v16.16b, v0.16b
 	eor		v17.16b, v17.16b, v1.16b
 	eor		v18.16b, v18.16b, v2.16b
 	eor		v19.16b, v19.16b, v3.16b
-	st1		{v16.16b-v19.16b}, [x1], #64
-	cbz		x5, .Lout
 
-	tbnz		x6, #63, 1f
+	tbnz		x6, #63, .Lt192
+
 	eor		v20.16b, v20.16b, v4.16b
 	eor		v21.16b, v21.16b, v5.16b
 	eor		v22.16b, v22.16b, v6.16b
 	eor		v23.16b, v23.16b, v7.16b
-	st1		{v20.16b-v23.16b}, [x1], #64
-	cbz		x6, .Lout
 
-	tbnz		x7, #63, 2f
+	st1		{v16.16b-v19.16b}, [x1], #64
+	tbnz		x7, #63, .Lt256
+
 	eor		v24.16b, v24.16b, v8.16b
 	eor		v25.16b, v25.16b, v9.16b
 	eor		v26.16b, v26.16b, v10.16b
 	eor		v27.16b, v27.16b, v11.16b
-	st1		{v24.16b-v27.16b}, [x1], #64
-	cbz		x7, .Lout
 
-	tbnz		x8, #63, 3f
+	st1		{v20.16b-v23.16b}, [x1], #64
+	tbnz		x8, #63, .Lt320
+
 	eor		v28.16b, v28.16b, v12.16b
 	eor		v29.16b, v29.16b, v13.16b
 	eor		v30.16b, v30.16b, v14.16b
 	eor		v31.16b, v31.16b, v15.16b
+
+	st1		{v24.16b-v27.16b}, [x1], #64
 	st1		{v28.16b-v31.16b}, [x1]
 
 .Lout:	frame_pop
 	ret
 
-	// fewer than 128 bytes of in/output
-0:	ld1		{v8.16b}, [x10]
-	ld1		{v9.16b}, [x11]
-	movi		v10.16b, #16
-	sub		x2, x1, #64
-	add		x1, x1, x5
-	ld1		{v16.16b-v19.16b}, [x2]
-	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
-
-	eor		v20.16b, v20.16b, v4.16b
-	eor		v21.16b, v21.16b, v5.16b
-	eor		v22.16b, v22.16b, v6.16b
-	eor		v23.16b, v23.16b, v7.16b
-	st1		{v20.16b-v23.16b}, [x1]
-	b		.Lout
-
 	// fewer than 192 bytes of in/output
-1:	ld1		{v8.16b}, [x10]
-	ld1		{v9.16b}, [x11]
-	movi		v10.16b, #16
-	add		x1, x1, x6
-	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
-
-	eor		v20.16b, v20.16b, v0.16b
-	eor		v21.16b, v21.16b, v1.16b
-	eor		v22.16b, v22.16b, v2.16b
-	eor		v23.16b, v23.16b, v3.16b
-	st1		{v20.16b-v23.16b}, [x1]
+.Lt192:	cbz		x5, 1f			// exactly 128 bytes?
+	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
+	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
+	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
+	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0:	eor		v20.16b, v20.16b, v28.16b
+	eor		v21.16b, v21.16b, v29.16b
+	eor		v22.16b, v22.16b, v30.16b
+	eor		v23.16b, v23.16b, v31.16b
+	st1		{v20.16b-v23.16b}, [x5]	// overlapping stores
+1:	st1		{v16.16b-v19.16b}, [x1]
 	b		.Lout
 
+	// fewer than 128 bytes of in/output
+.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	sub		x1, x1, #64
+	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
+	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
+	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
+	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
+	ld1		{v16.16b-v19.16b}, [x1]	// reload first output block
+	b		0b
+
 	// fewer than 256 bytes of in/output
-2:	ld1		{v4.16b}, [x10]
-	ld1		{v5.16b}, [x11]
-	movi		v6.16b, #16
-	add		x1, x1, x7
+.Lt256:	cbz		x6, 2f			// exactly 192 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x6, x6, x1
 	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b
-
-	eor		v24.16b, v24.16b, v0.16b
-	eor		v25.16b, v25.16b, v1.16b
-	eor		v26.16b, v26.16b, v2.16b
-	eor		v27.16b, v27.16b, v3.16b
-	st1		{v24.16b-v27.16b}, [x1]
+	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
+	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
+	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x6]	// overlapping stores
+2:	st1		{v20.16b-v23.16b}, [x1]
 	b		.Lout
 
 	// fewer than 320 bytes of in/output
-3:	ld1		{v4.16b}, [x10]
-	ld1		{v5.16b}, [x11]
-	movi		v6.16b, #16
-	add		x1, x1, x8
+.Lt320:	cbz		x7, 3f			// exactly 256 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x7, x7, x1
 	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b
+	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
+	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
+	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b
 
 	eor		v28.16b, v28.16b, v0.16b
 	eor		v29.16b, v29.16b, v1.16b
 	eor		v30.16b, v30.16b, v2.16b
 	eor		v31.16b, v31.16b, v3.16b
-	st1		{v28.16b-v31.16b}, [x1]
+	st1		{v28.16b-v31.16b}, [x7]	// overlapping stores
+3:	st1		{v24.16b-v27.16b}, [x1]
 	b		.Lout
 ENDPROC(chacha_4block_xor_neon)
 
@@ -851,7 +796,7 @@ ENDPROC(chacha_4block_xor_neon)
 	.align		L1_CACHE_SHIFT
 .Lpermute:
 	.set		.Li, 0
-	.rept		192
+	.rept		128
 	.byte		(.Li - 64)
 	.set		.Li, .Li + 1
 	.endr
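As an aside, the overlapping-store tail trick this patch relies on is easier
to see in scalar terms. The C sketch below illustrates the idea only; BLOCK,
xor_block() and xor_tail() are hypothetical names, not the kernel's API. The
final partial block is written as one full-width store at dst + len - BLOCK,
and the penultimate block is stored after it, so the overlapped bytes end up
holding the penultimate block's correct values without any permutation work:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define BLOCK 64

	/* XOR one full block of src with keystream ks into dst. */
	static void xor_block(uint8_t *dst, const uint8_t *src,
			      const uint8_t *ks)
	{
		for (size_t i = 0; i < BLOCK; i++)
			dst[i] = src[i] ^ ks[i];
	}

	/* Handle a tail of BLOCK < len < 2 * BLOCK bytes. */
	static void xor_tail(uint8_t *dst, const uint8_t *src, size_t len,
			     const uint8_t ks[2 * BLOCK])
	{
		size_t off = len - BLOCK; /* start of the last block */
		uint8_t last[BLOCK];

		/* Final block, using keystream at the matching offset. */
		for (size_t i = 0; i < BLOCK; i++)
			last[i] = src[off + i] ^ ks[off + i];

		memcpy(dst + off, last, BLOCK);	/* overlapping store */
		/*
		 * Penultimate block stored last: it rewrites the bytes in
		 * [off, BLOCK) that the overlapping store clobbered with
		 * the same correct values, so no masking is needed.
		 */
		xor_block(dst, src, ks);
	}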