mirror of
https://github.com/openwrt/openwrt.git
synced 2025-01-06 22:08:54 +00:00
d540725871
Without this patch, the chacha block counter is not incremented on neon rounds, resulting in incorrect calculations and corrupt packets. This also switches to using `--no-numbered --zero-commit` so that future diffs are smaller. Reported-by: Hans Geiblinger <cybrnook2002@yahoo.com> Reviewed-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com> Cc: David Bauer <mail@david-bauer.net> Cc: Petr Štetiar <ynezz@true.cz> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
452 lines
10 KiB
Diff
452 lines
10 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
|
|
Date: Fri, 8 Nov 2019 13:22:16 +0100
|
|
Subject: [PATCH] crypto: mips/chacha - import 32r2 ChaCha code from Zinc
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
commit 49aa7c00eddf8d8f462b0256bd82e81762d7b0c6 upstream.
|
|
|
|
This imports the accelerated MIPS 32r2 ChaCha20 implementation from the
|
|
Zinc patch set.
|
|
|
|
Co-developed-by: René van Dorst <opensource@vdorst.com>
|
|
Signed-off-by: René van Dorst <opensource@vdorst.com>
|
|
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
|
|
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
|
|
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
|
|
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
|
|
---
|
|
arch/mips/crypto/chacha-core.S | 424 +++++++++++++++++++++++++++++++++
|
|
1 file changed, 424 insertions(+)
|
|
create mode 100644 arch/mips/crypto/chacha-core.S
|
|
|
|
--- /dev/null
|
|
+++ b/arch/mips/crypto/chacha-core.S
|
|
@@ -0,0 +1,424 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
|
|
+/*
|
|
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
|
|
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
|
+ */
|
|
+
|
|
+#define MASK_U32 0x3c
|
|
+#define CHACHA20_BLOCK_SIZE 64
|
|
+#define STACK_SIZE 32
|
|
+
|
|
+#define X0 $t0
|
|
+#define X1 $t1
|
|
+#define X2 $t2
|
|
+#define X3 $t3
|
|
+#define X4 $t4
|
|
+#define X5 $t5
|
|
+#define X6 $t6
|
|
+#define X7 $t7
|
|
+#define X8 $t8
|
|
+#define X9 $t9
|
|
+#define X10 $v1
|
|
+#define X11 $s6
|
|
+#define X12 $s5
|
|
+#define X13 $s4
|
|
+#define X14 $s3
|
|
+#define X15 $s2
|
|
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
|
|
+#define T0 $s1
|
|
+#define T1 $s0
|
|
+#define T(n) T ## n
|
|
+#define X(n) X ## n
|
|
+
|
|
+/* Input arguments */
|
|
+#define STATE $a0
|
|
+#define OUT $a1
|
|
+#define IN $a2
|
|
+#define BYTES $a3
|
|
+
|
|
+/* Output argument */
|
|
+/* NONCE[0] is kept in a register and not in memory.
|
|
+ * We don't want to touch original value in memory.
|
|
+ * Must be incremented every loop iteration.
|
|
+ */
|
|
+#define NONCE_0 $v0
|
|
+
|
|
+/* SAVED_X and SAVED_CA are set in the jump table.
|
|
+ * Use regs which are overwritten on exit so we don't leak clear data.
|
|
+ * They are used to handle the last bytes, which are not a multiple of 4.
|
|
+ */
|
|
+#define SAVED_X X15
|
|
+#define SAVED_CA $s7
|
|
+
|
|
+#define IS_UNALIGNED $s7
|
|
+
|
|
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
+#define MSB 0
|
|
+#define LSB 3
|
|
+#define ROTx rotl
|
|
+#define ROTR(n) rotr n, 24
|
|
+#define CPU_TO_LE32(n) \
|
|
+ wsbh n; \
|
|
+ rotr n, 16;
|
|
+#else
|
|
+#define MSB 3
|
|
+#define LSB 0
|
|
+#define ROTx rotr
|
|
+#define CPU_TO_LE32(n)
|
|
+#define ROTR(n)
|
|
+#endif
|
|
+
|
|
+#define FOR_EACH_WORD(x) \
|
|
+ x( 0); \
|
|
+ x( 1); \
|
|
+ x( 2); \
|
|
+ x( 3); \
|
|
+ x( 4); \
|
|
+ x( 5); \
|
|
+ x( 6); \
|
|
+ x( 7); \
|
|
+ x( 8); \
|
|
+ x( 9); \
|
|
+ x(10); \
|
|
+ x(11); \
|
|
+ x(12); \
|
|
+ x(13); \
|
|
+ x(14); \
|
|
+ x(15);
|
|
+
|
|
+#define FOR_EACH_WORD_REV(x) \
|
|
+ x(15); \
|
|
+ x(14); \
|
|
+ x(13); \
|
|
+ x(12); \
|
|
+ x(11); \
|
|
+ x(10); \
|
|
+ x( 9); \
|
|
+ x( 8); \
|
|
+ x( 7); \
|
|
+ x( 6); \
|
|
+ x( 5); \
|
|
+ x( 4); \
|
|
+ x( 3); \
|
|
+ x( 2); \
|
|
+ x( 1); \
|
|
+ x( 0);
|
|
+
|
|
+#define PLUS_ONE_0 1
|
|
+#define PLUS_ONE_1 2
|
|
+#define PLUS_ONE_2 3
|
|
+#define PLUS_ONE_3 4
|
|
+#define PLUS_ONE_4 5
|
|
+#define PLUS_ONE_5 6
|
|
+#define PLUS_ONE_6 7
|
|
+#define PLUS_ONE_7 8
|
|
+#define PLUS_ONE_8 9
|
|
+#define PLUS_ONE_9 10
|
|
+#define PLUS_ONE_10 11
|
|
+#define PLUS_ONE_11 12
|
|
+#define PLUS_ONE_12 13
|
|
+#define PLUS_ONE_13 14
|
|
+#define PLUS_ONE_14 15
|
|
+#define PLUS_ONE_15 16
|
|
+#define PLUS_ONE(x) PLUS_ONE_ ## x
|
|
+#define _CONCAT3(a,b,c) a ## b ## c
|
|
+#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
|
|
+
|
|
+#define STORE_UNALIGNED(x) /* OUT word x = IN word x ^ (X(x) + input word x), for IN/OUT that may be unaligned */ \
|
|
+CONCAT3(.Lchacha20_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) /* label is numbered x+1 and sits in front of the code for word x */ \
|
|
+ .if (x != 12); /* word 12 is taken from the NONCE_0 register, not from STATE memory */ \
|
|
+ lw T0, (x*4)(STATE); /* T0 = STATE word x */ \
|
|
+ .endif; \
|
|
+ lwl T1, (x*4)+MSB ## (IN); /* lwl/lwr pair: T1 = IN word x; MSB/LSB byte offsets are chosen per endianness */ \
|
|
+ lwr T1, (x*4)+LSB ## (IN); \
|
|
+ .if (x == 12); \
|
|
+ addu X ## x, NONCE_0; /* X12 += NONCE_0 */ \
|
|
+ .else; \
|
|
+ addu X ## x, T0; /* X(x) += STATE word x */ \
|
|
+ .endif; \
|
|
+ CPU_TO_LE32(X ## x); /* wsbh + rotr 16 on big-endian builds; expands to nothing otherwise */ \
|
|
+ xor X ## x, T1; /* X(x) ^= IN word x */ \
|
|
+ swl X ## x, (x*4)+MSB ## (OUT); /* swl/swr pair: write the result to OUT word x */ \
|
|
+ swr X ## x, (x*4)+LSB ## (OUT);
|
|
+
|
|
+#define STORE_ALIGNED(x) /* OUT word x = IN word x ^ (X(x) + input word x), using plain lw/sw */ \
|
|
+CONCAT3(.Lchacha20_mips_xor_aligned_, PLUS_ONE(x), _b: ;) /* label is numbered x+1 and sits in front of the code for word x */ \
|
|
+ .if (x != 12); /* word 12 is taken from the NONCE_0 register, not from STATE memory */ \
|
|
+ lw T0, (x*4)(STATE); /* T0 = STATE word x */ \
|
|
+ .endif; \
|
|
+ lw T1, (x*4) ## (IN); /* T1 = IN word x */ \
|
|
+ .if (x == 12); \
|
|
+ addu X ## x, NONCE_0; /* X12 += NONCE_0 */ \
|
|
+ .else; \
|
|
+ addu X ## x, T0; /* X(x) += STATE word x */ \
|
|
+ .endif; \
|
|
+ CPU_TO_LE32(X ## x); /* wsbh + rotr 16 on big-endian builds; expands to nothing otherwise */ \
|
|
+ xor X ## x, T1; /* X(x) ^= IN word x */ \
|
|
+ sw X ## x, (x*4) ## (OUT); /* write the result to OUT word x */
|
|
+
|
|
+/* Jump table macro.
|
|
+ * Used for setup and handling the last bytes, which are not a multiple of 4.
|
|
+ * X15 is free to store Xn
|
|
+ * Every jumptable entry must be equal in size.
|
|
+ */
|
|
+#define JMPTBL_ALIGNED(x) \
|
|
+.Lchacha20_mips_jmptbl_aligned_ ## x: ; \
|
|
+ .set noreorder; \
|
|
+ b .Lchacha20_mips_xor_aligned_ ## x ## _b; \
|
|
+ .if (x == 12); \
|
|
+ addu SAVED_X, X ## x, NONCE_0; \
|
|
+ .else; \
|
|
+ addu SAVED_X, X ## x, SAVED_CA; \
|
|
+ .endif; \
|
|
+ .set reorder
|
|
+
|
|
+#define JMPTBL_UNALIGNED(x) \
|
|
+.Lchacha20_mips_jmptbl_unaligned_ ## x: ; \
|
|
+ .set noreorder; \
|
|
+ b .Lchacha20_mips_xor_unaligned_ ## x ## _b; \
|
|
+ .if (x == 12); \
|
|
+ addu SAVED_X, X ## x, NONCE_0; \
|
|
+ .else; \
|
|
+ addu SAVED_X, X ## x, SAVED_CA; \
|
|
+ .endif; \
|
|
+ .set reorder
|
|
+
|
|
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) /* four independent add / xor / rotate-left steps on the X(n) registers */ \
|
|
+ addu X(A), X(K); /* X(A,B,C,D) += X(K,L,M,N) */ \
|
|
+ addu X(B), X(L); \
|
|
+ addu X(C), X(M); \
|
|
+ addu X(D), X(N); \
|
|
+ xor X(V), X(A); /* X(V,W,Y,Z) ^= X(A,B,C,D), using the sums just computed */ \
|
|
+ xor X(W), X(B); \
|
|
+ xor X(Y), X(C); \
|
|
+ xor X(Z), X(D); \
|
|
+ rotl X(V), S; /* rotate X(V,W,Y,Z) left by S bits */ \
|
|
+ rotl X(W), S; \
|
|
+ rotl X(Y), S; \
|
|
+ rotl X(Z), S;
|
|
+
|
|
+.text
|
|
+.set reorder
|
|
+.set noat
|
|
+.globl chacha20_mips
|
|
+.ent chacha20_mips
|
|
+chacha20_mips:
|
|
+ .frame $sp, STACK_SIZE, $ra
|
|
+
|
|
+ addiu $sp, -STACK_SIZE
|
|
+
|
|
+ /* Return bytes = 0. */
|
|
+ beqz BYTES, .Lchacha20_mips_end
|
|
+
|
|
+ lw NONCE_0, 48(STATE)
|
|
+
|
|
+ /* Save s0-s7 */
|
|
+ sw $s0, 0($sp)
|
|
+ sw $s1, 4($sp)
|
|
+ sw $s2, 8($sp)
|
|
+ sw $s3, 12($sp)
|
|
+ sw $s4, 16($sp)
|
|
+ sw $s5, 20($sp)
|
|
+ sw $s6, 24($sp)
|
|
+ sw $s7, 28($sp)
|
|
+
|
|
+ /* Test whether IN or OUT is unaligned.
|
|
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
|
|
+ */
|
|
+ or IS_UNALIGNED, IN, OUT
|
|
+ andi IS_UNALIGNED, 0x3
|
|
+
|
|
+ /* Set number of rounds */
|
|
+ li $at, 20
|
|
+
|
|
+ b .Lchacha20_rounds_start
|
|
+
|
|
+.align 4
|
|
+.Loop_chacha20_rounds:
|
|
+ addiu IN, CHACHA20_BLOCK_SIZE
|
|
+ addiu OUT, CHACHA20_BLOCK_SIZE
|
|
+ addiu NONCE_0, 1
|
|
+
|
|
+.Lchacha20_rounds_start:
|
|
+ lw X0, 0(STATE)
|
|
+ lw X1, 4(STATE)
|
|
+ lw X2, 8(STATE)
|
|
+ lw X3, 12(STATE)
|
|
+
|
|
+ lw X4, 16(STATE)
|
|
+ lw X5, 20(STATE)
|
|
+ lw X6, 24(STATE)
|
|
+ lw X7, 28(STATE)
|
|
+ lw X8, 32(STATE)
|
|
+ lw X9, 36(STATE)
|
|
+ lw X10, 40(STATE)
|
|
+ lw X11, 44(STATE)
|
|
+
|
|
+ move X12, NONCE_0
|
|
+ lw X13, 52(STATE)
|
|
+ lw X14, 56(STATE)
|
|
+ lw X15, 60(STATE)
|
|
+
|
|
+.Loop_chacha20_xor_rounds:
|
|
+ addiu $at, -2
|
|
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
|
|
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
|
|
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
|
|
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
|
|
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
|
|
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
|
|
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
|
|
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
|
|
+ bnez $at, .Loop_chacha20_xor_rounds
|
|
+
|
|
+ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
|
|
+
|
|
+ /* Is data src/dst unaligned? Jump */
|
|
+ bnez IS_UNALIGNED, .Loop_chacha20_unaligned
|
|
+
|
|
+ /* Set number rounds here to fill delayslot. */
|
|
+ li $at, 20
|
|
+
|
|
+ /* BYTES < 0, it has no full block. */
|
|
+ bltz BYTES, .Lchacha20_mips_no_full_block_aligned
|
|
+
|
|
+ FOR_EACH_WORD_REV(STORE_ALIGNED)
|
|
+
|
|
+ /* BYTES > 0? Loop again. */
|
|
+ bgtz BYTES, .Loop_chacha20_rounds
|
|
+
|
|
+ /* Place this here to fill delay slot */
|
|
+ addiu NONCE_0, 1
|
|
+
|
|
+ /* BYTES < 0? Handle last bytes */
|
|
+ bltz BYTES, .Lchacha20_mips_xor_bytes
|
|
+
|
|
+.Lchacha20_mips_xor_done:
|
|
+ /* Restore used registers */
|
|
+ lw $s0, 0($sp)
|
|
+ lw $s1, 4($sp)
|
|
+ lw $s2, 8($sp)
|
|
+ lw $s3, 12($sp)
|
|
+ lw $s4, 16($sp)
|
|
+ lw $s5, 20($sp)
|
|
+ lw $s6, 24($sp)
|
|
+ lw $s7, 28($sp)
|
|
+
|
|
+ /* Write NONCE_0 back to right location in state */
|
|
+ sw NONCE_0, 48(STATE)
|
|
+
|
|
+.Lchacha20_mips_end:
|
|
+ addiu $sp, STACK_SIZE
|
|
+ jr $ra
|
|
+
|
|
+.Lchacha20_mips_no_full_block_aligned:
|
|
+ /* Restore the offset on BYTES */
|
|
+ addiu BYTES, CHACHA20_BLOCK_SIZE
|
|
+
|
|
+ /* Get number of full WORDS */
|
|
+ andi $at, BYTES, MASK_U32
|
|
+
|
|
+ /* Load upper half of jump table addr */
|
|
+ lui T0, %hi(.Lchacha20_mips_jmptbl_aligned_0)
|
|
+
|
|
+ /* Calculate lower half jump table offset */
|
|
+ ins T0, $at, 1, 6
|
|
+
|
|
+ /* Add offset to STATE */
|
|
+ addu T1, STATE, $at
|
|
+
|
|
+ /* Add lower half jump table addr */
|
|
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_aligned_0)
|
|
+
|
|
+ /* Read value from STATE */
|
|
+ lw SAVED_CA, 0(T1)
|
|
+
|
|
+ /* Store remaining bytecounter as negative value */
|
|
+ subu BYTES, $at, BYTES
|
|
+
|
|
+ jr T0
|
|
+
|
|
+ /* Jump table */
|
|
+ FOR_EACH_WORD(JMPTBL_ALIGNED)
|
|
+
|
|
+
|
|
+.Loop_chacha20_unaligned:
|
|
+ /* Set number rounds here to fill delayslot. */
|
|
+ li $at, 20
|
|
+
|
|
+ /* BYTES < 0, it has no full block. */
|
|
+ bltz BYTES, .Lchacha20_mips_no_full_block_unaligned
|
|
+
|
|
+ FOR_EACH_WORD_REV(STORE_UNALIGNED)
|
|
+
|
|
+ /* BYTES > 0? Loop again. */
|
|
+ bgtz BYTES, .Loop_chacha20_rounds
|
|
+
|
|
+ /* Write NONCE_0 back to right location in state */
|
|
+ sw NONCE_0, 48(STATE)
|
|
+
|
|
+ .set noreorder
|
|
+ /* Fall through to byte handling */
|
|
+ bgez BYTES, .Lchacha20_mips_xor_done
|
|
+.Lchacha20_mips_xor_unaligned_0_b:
|
|
+.Lchacha20_mips_xor_aligned_0_b:
|
|
+ /* Place this here to fill delay slot */
|
|
+ addiu NONCE_0, 1
|
|
+ .set reorder
|
|
+
|
|
+.Lchacha20_mips_xor_bytes: /* XOR the last one to three bytes with SAVED_X, one byte at a time */
|
|
+ addu IN, $at /* $at = size in bytes of the full words, set by the no_full_block code; step past them */
|
|
+ addu OUT, $at
|
|
+ /* First byte */
|
|
+ lbu T1, 0(IN)
|
|
+ addiu $at, BYTES, 1 /* BYTES holds the negated leftover count, so $at == 0 means exactly one byte was left */
|
|
+ CPU_TO_LE32(SAVED_X) /* both macros expand to nothing on little-endian builds */
|
|
+ ROTR(SAVED_X) /* big-endian only: rotr 24 */
|
|
+ xor T1, SAVED_X
|
|
+ sb T1, 0(OUT) /* sb writes only the low byte of T1 */
|
|
+ beqz $at, .Lchacha20_mips_xor_done
|
|
+ /* Second byte */
|
|
+ lbu T1, 1(IN)
|
|
+ addiu $at, BYTES, 2 /* $at == 0 means exactly two bytes were left */
|
|
+ ROTx SAVED_X, 8 /* ROTx is rotl on big-endian, rotr on little-endian: move the next byte of SAVED_X into the low 8 bits */
|
|
+ xor T1, SAVED_X
|
|
+ sb T1, 1(OUT)
|
|
+ beqz $at, .Lchacha20_mips_xor_done
|
|
+ /* Third byte */
|
|
+ lbu T1, 2(IN)
|
|
+ ROTx SAVED_X, 8
|
|
+ xor T1, SAVED_X
|
|
+ sb T1, 2(OUT)
|
|
+ b .Lchacha20_mips_xor_done /* at most three leftover bytes are handled here */
|
|
+
|
|
+.Lchacha20_mips_no_full_block_unaligned:
|
|
+ /* Restore the offset on BYTES */
|
|
+ addiu BYTES, CHACHA20_BLOCK_SIZE
|
|
+
|
|
+ /* Get number of full WORDS */
|
|
+ andi $at, BYTES, MASK_U32
|
|
+
|
|
+ /* Load upper half of jump table addr */
|
|
+ lui T0, %hi(.Lchacha20_mips_jmptbl_unaligned_0)
|
|
+
|
|
+ /* Calculate lower half jump table offset */
|
|
+ ins T0, $at, 1, 6
|
|
+
|
|
+ /* Add offset to STATE */
|
|
+ addu T1, STATE, $at
|
|
+
|
|
+ /* Add lower half jump table addr */
|
|
+ addiu T0, %lo(.Lchacha20_mips_jmptbl_unaligned_0)
|
|
+
|
|
+ /* Read value from STATE */
|
|
+ lw SAVED_CA, 0(T1)
|
|
+
|
|
+ /* Store remaining bytecounter as negative value */
|
|
+ subu BYTES, $at, BYTES
|
|
+
|
|
+ jr T0
|
|
+
|
|
+ /* Jump table */
|
|
+ FOR_EACH_WORD(JMPTBL_UNALIGNED)
|
|
+.end chacha20_mips
|
|
+.set at
|