mirror of
https://github.com/openwrt/openwrt.git
synced 2025-01-10 23:12:48 +00:00
d540725871
Without this patch, the chacha block counter is not incremented on neon rounds, resulting in incorrect calculations and corrupt packets. This also switches to using `--no-numbered --zero-commit` so that future diffs are smaller. Reported-by: Hans Geiblinger <cybrnook2002@yahoo.com> Reviewed-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com> Cc: David Bauer <mail@david-bauer.net> Cc: Petr Štetiar <ynezz@true.cz> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
558 lines
17 KiB
Diff
558 lines
17 KiB
Diff
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
|
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
|
|
Date: Fri, 8 Nov 2019 13:22:31 +0100
|
|
Subject: [PATCH] crypto: blake2s - x86_64 SIMD implementation
|
|
|
|
commit ed0356eda153f6a95649e11feb7b07083caf9e20 upstream.
|
|
|
|
These implementations from Samuel Neves support AVX and AVX-512VL.
|
|
Originally this used AVX-512F, but Skylake thermal throttling made
|
|
AVX-512VL more attractive and possible to do with negligible difference.
|
|
|
|
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
|
|
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
|
|
Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
|
|
[ardb: move to arch/x86/crypto, wire into lib/crypto framework]
|
|
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
|
|
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
|
|
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
|
|
---
|
|
arch/x86/crypto/Makefile | 2 +
|
|
arch/x86/crypto/blake2s-core.S | 258 +++++++++++++++++++++++++++++++++
|
|
arch/x86/crypto/blake2s-glue.c | 233 +++++++++++++++++++++++++++++
|
|
crypto/Kconfig | 6 +
|
|
4 files changed, 499 insertions(+)
|
|
create mode 100644 arch/x86/crypto/blake2s-core.S
|
|
create mode 100644 arch/x86/crypto/blake2s-glue.c
|
|
|
|
--- a/arch/x86/crypto/Makefile
|
|
+++ b/arch/x86/crypto/Makefile
|
|
@@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
|
|
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
|
|
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
|
|
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
|
|
+ obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
|
|
endif
|
|
|
|
# These modules require assembler to support AVX2.
|
|
@@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x8
|
|
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
|
|
|
|
nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
|
|
+blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
|
|
|
|
ifeq ($(avx_supported),yes)
|
|
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
|
|
--- /dev/null
|
|
+++ b/arch/x86/crypto/blake2s-core.S
|
|
@@ -0,0 +1,258 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
|
|
+/*
|
|
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
|
+ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
|
|
+ */
|
|
+
|
|
+#include <linux/linkage.h>
|
|
+
|
|
+.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
|
|
+.align 32
|
|
+IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 /* BLAKE2s IV words 0..3; loaded as IV(%rip) by both compress routines */
|
|
+ .octa 0x5BE0CD191F83D9AB9B05688C510E527F /* IV words 4..7; referenced as IV+0x10 / IV+16 */
|
|
+.section .rodata.cst16.ROT16, "aM", @progbits, 16
|
|
+.align 16
|
|
+ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 /* pshufb mask: swaps the 16-bit halves of each dword (rotate by 16) */
|
|
+.section .rodata.cst16.ROR328, "aM", @progbits, 16
|
|
+.align 16
|
|
+ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 /* pshufb mask: rotates each dword right by 8 bits */
|
|
+.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
|
|
+.align 64
|
|
+SIGMA: /* 10 rows x 16 byte indices (0xa0 bytes); the SSSE3 round loop reads one row per round */
|
|
+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
|
|
+.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
|
|
+.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
|
|
+.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
|
|
+.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
|
|
+.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
|
|
+.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
|
|
+.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
|
|
+.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
|
|
+.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
|
|
+#ifdef CONFIG_AS_AVX512 /* SIGMA2 is only referenced by blake2s_compress_avx512 */
|
|
+.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
|
|
+.align 64
|
|
+SIGMA2: /* 10 rows x 16 dword indices (640 bytes), used as vpermi2d index vectors, one 64-byte row per round */
|
|
+.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
|
|
+.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
|
|
+.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
|
|
+.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
|
|
+.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
|
|
+.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
|
|
+.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
|
|
+.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
|
|
+.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
|
|
+.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
|
|
+#endif /* CONFIG_AS_AVX512 */
|
|
+
|
|
+.text
|
|
+#ifdef CONFIG_AS_SSSE3
|
|
+ENTRY(blake2s_compress_ssse3) /* SSSE3 BLAKE2s compression; assuming SysV x86-64 argument order for the C prototype: %rdi=state, %rsi=block, %rdx=nblocks, %rcx=inc */
|
|
+ testq %rdx,%rdx /* zero blocks requested: return without touching the state */
|
|
+ je .Lendofloop
|
|
+ movdqu (%rdi),%xmm0 /* xmm0/xmm1 = first 32 bytes of *state (written back at the end) */
|
|
+ movdqu 0x10(%rdi),%xmm1
|
|
+ movdqa ROT16(%rip),%xmm12 /* xmm12 = rotate-by-16 shuffle mask */
|
|
+ movdqa ROR328(%rip),%xmm13 /* xmm13 = rotate-right-by-8 shuffle mask */
|
|
+ movdqu 0x20(%rdi),%xmm14 /* xmm14 = 16 bytes at state+0x20 (presumably the t[]/f[] counter and flag words - confirm against struct blake2s_state) */
|
|
+ movq %rcx,%xmm15 /* xmm15 = inc in the low qword, upper qword zero */
|
|
+ leaq SIGMA+0xa0(%rip),%r8 /* r8 = one past the last SIGMA row (round-loop end marker) */
|
|
+ jmp .Lbeginofloop
|
|
+ .align 32
|
|
+.Lbeginofloop: /* per-block loop: one iteration per 64-byte input block */
|
|
+ movdqa %xmm0,%xmm10 /* save the incoming state halves for the final xor */
|
|
+ movdqa %xmm1,%xmm11
|
|
+ paddq %xmm15,%xmm14 /* add inc to the low 64 bits held in xmm14 */
|
|
+ movdqa IV(%rip),%xmm2 /* third row of the working vector = IV words 0..3 */
|
|
+ movdqa %xmm14,%xmm3
|
|
+ pxor IV+0x10(%rip),%xmm3 /* fourth row = xmm14 ^ IV words 4..7 */
|
|
+ leaq SIGMA(%rip),%rcx /* rcx = current SIGMA row (inc was already copied to xmm15) */
|
|
+.Lroundloop: /* one iteration per 16-byte SIGMA row */
|
|
+ movzbl (%rcx),%eax /* gather four 32-bit message words selected by SIGMA bytes 0..3 */
|
|
+ movd (%rsi,%rax,4),%xmm4
|
|
+ movzbl 0x1(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm5
|
|
+ movzbl 0x2(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm6
|
|
+ movzbl 0x3(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm7
|
|
+ punpckldq %xmm5,%xmm4 /* pack the four gathered words into xmm4 */
|
|
+ punpckldq %xmm7,%xmm6
|
|
+ punpcklqdq %xmm6,%xmm4
|
|
+ paddd %xmm4,%xmm0
|
|
+ paddd %xmm1,%xmm0
|
|
+ pxor %xmm0,%xmm3
|
|
+ pshufb %xmm12,%xmm3 /* rotate each dword by 16 bits */
|
|
+ paddd %xmm3,%xmm2
|
|
+ pxor %xmm2,%xmm1
|
|
+ movdqa %xmm1,%xmm8 /* rotate each dword right by 12: (x >> 12) | (x << 20) */
|
|
+ psrld $0xc,%xmm1
|
|
+ pslld $0x14,%xmm8
|
|
+ por %xmm8,%xmm1
|
|
+ movzbl 0x4(%rcx),%eax /* gather message words for SIGMA bytes 4..7 */
|
|
+ movd (%rsi,%rax,4),%xmm5
|
|
+ movzbl 0x5(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm6
|
|
+ movzbl 0x6(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm7
|
|
+ movzbl 0x7(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm4
|
|
+ punpckldq %xmm6,%xmm5
|
|
+ punpckldq %xmm4,%xmm7
|
|
+ punpcklqdq %xmm7,%xmm5
|
|
+ paddd %xmm5,%xmm0
|
|
+ paddd %xmm1,%xmm0
|
|
+ pxor %xmm0,%xmm3
|
|
+ pshufb %xmm13,%xmm3 /* rotate each dword right by 8 bits */
|
|
+ paddd %xmm3,%xmm2
|
|
+ pxor %xmm2,%xmm1
|
|
+ movdqa %xmm1,%xmm8 /* rotate each dword right by 7: (x >> 7) | (x << 25) */
|
|
+ psrld $0x7,%xmm1
|
|
+ pslld $0x19,%xmm8
|
|
+ por %xmm8,%xmm1
|
|
+ pshufd $0x93,%xmm0,%xmm0 /* permute dword lanes of xmm0/xmm3/xmm2 for the second half of the round */
|
|
+ pshufd $0x4e,%xmm3,%xmm3
|
|
+ pshufd $0x39,%xmm2,%xmm2
|
|
+ movzbl 0x8(%rcx),%eax /* gather message words for SIGMA bytes 8..11 */
|
|
+ movd (%rsi,%rax,4),%xmm6
|
|
+ movzbl 0x9(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm7
|
|
+ movzbl 0xa(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm4
|
|
+ movzbl 0xb(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm5
|
|
+ punpckldq %xmm7,%xmm6
|
|
+ punpckldq %xmm5,%xmm4
|
|
+ punpcklqdq %xmm4,%xmm6
|
|
+ paddd %xmm6,%xmm0
|
|
+ paddd %xmm1,%xmm0
|
|
+ pxor %xmm0,%xmm3
|
|
+ pshufb %xmm12,%xmm3 /* rotate by 16 */
|
|
+ paddd %xmm3,%xmm2
|
|
+ pxor %xmm2,%xmm1
|
|
+ movdqa %xmm1,%xmm8 /* rotate right by 12 */
|
|
+ psrld $0xc,%xmm1
|
|
+ pslld $0x14,%xmm8
|
|
+ por %xmm8,%xmm1
|
|
+ movzbl 0xc(%rcx),%eax /* gather message words for SIGMA bytes 12..15 */
|
|
+ movd (%rsi,%rax,4),%xmm7
|
|
+ movzbl 0xd(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm4
|
|
+ movzbl 0xe(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm5
|
|
+ movzbl 0xf(%rcx),%eax
|
|
+ movd (%rsi,%rax,4),%xmm6
|
|
+ punpckldq %xmm4,%xmm7
|
|
+ punpckldq %xmm6,%xmm5
|
|
+ punpcklqdq %xmm5,%xmm7
|
|
+ paddd %xmm7,%xmm0
|
|
+ paddd %xmm1,%xmm0
|
|
+ pxor %xmm0,%xmm3
|
|
+ pshufb %xmm13,%xmm3 /* rotate right by 8 */
|
|
+ paddd %xmm3,%xmm2
|
|
+ pxor %xmm2,%xmm1
|
|
+ movdqa %xmm1,%xmm8 /* rotate right by 7 */
|
|
+ psrld $0x7,%xmm1
|
|
+ pslld $0x19,%xmm8
|
|
+ por %xmm8,%xmm1
|
|
+ pshufd $0x39,%xmm0,%xmm0 /* inverse of the lane permutation applied mid-round */
|
|
+ pshufd $0x4e,%xmm3,%xmm3
|
|
+ pshufd $0x93,%xmm2,%xmm2
|
|
+ addq $0x10,%rcx /* advance to the next SIGMA row; stop after the tenth */
|
|
+ cmpq %r8,%rcx
|
|
+ jnz .Lroundloop
|
|
+ pxor %xmm2,%xmm0 /* new state = (row0 ^ row2 ^ saved low half), (row1 ^ row3 ^ saved high half) */
|
|
+ pxor %xmm3,%xmm1
|
|
+ pxor %xmm10,%xmm0
|
|
+ pxor %xmm11,%xmm1
|
|
+ addq $0x40,%rsi /* step to the next 64-byte input block */
|
|
+ decq %rdx
|
|
+ jnz .Lbeginofloop
|
|
+ movdqu %xmm0,(%rdi) /* write the updated 32 bytes of state back */
|
|
+ movdqu %xmm1,0x10(%rdi)
|
|
+ movdqu %xmm14,0x20(%rdi) /* write back the 16 bytes at +0x20 with the accumulated inc */
|
|
+.Lendofloop:
|
|
+ ret
|
|
+ENDPROC(blake2s_compress_ssse3)
|
|
+#endif /* CONFIG_AS_SSSE3 */
|
|
+
|
|
+#ifdef CONFIG_AS_AVX512
|
|
+ENTRY(blake2s_compress_avx512) /* AVX-512VL BLAKE2s compression; assuming SysV x86-64 argument order: %rdi=state, %rsi=block, %rdx=nblocks, %rcx=inc. NOTE(review): no zero check on %rdx here, unlike the SSSE3 routine - callers must pass nblocks >= 1 */
|
|
+ vmovdqu (%rdi),%xmm0 /* xmm0/xmm1 = first 32 bytes of *state */
|
|
+ vmovdqu 0x10(%rdi),%xmm1
|
|
+ vmovdqu 0x20(%rdi),%xmm4 /* xmm4 = 16 bytes at state+0x20 (presumably counter/flag words - confirm against struct blake2s_state) */
|
|
+ vmovq %rcx,%xmm5 /* xmm5 = inc in the low qword, upper qword zero */
|
|
+ vmovdqa IV(%rip),%xmm14 /* keep both IV halves in registers across blocks */
|
|
+ vmovdqa IV+16(%rip),%xmm15
|
|
+ jmp .Lblake2s_compress_avx512_mainloop
|
|
+.align 32
|
|
+.Lblake2s_compress_avx512_mainloop: /* per-block loop */
|
|
+ vmovdqa %xmm0,%xmm10 /* save the incoming state halves for the final xor */
|
|
+ vmovdqa %xmm1,%xmm11
|
|
+ vpaddq %xmm5,%xmm4,%xmm4 /* add inc to the low 64 bits held in xmm4 */
|
|
+ vmovdqa %xmm14,%xmm2 /* third row = IV words 0..3 */
|
|
+ vpxor %xmm15,%xmm4,%xmm3 /* fourth row = xmm4 ^ IV words 4..7 */
|
|
+ vmovdqu (%rsi),%ymm6 /* load the whole 64-byte message block into ymm6:ymm7 */
|
|
+ vmovdqu 0x20(%rsi),%ymm7
|
|
+ addq $0x40,%rsi /* advance the input pointer to the next block */
|
|
+ leaq SIGMA2(%rip),%rax
|
|
+ movb $0xa,%cl /* 10 rounds (inc in %rcx was already copied to xmm5) */
|
|
+.Lblake2s_compress_avx512_roundloop:
|
|
+ addq $0x40,%rax /* step to the next 64-byte SIGMA2 row; the loads below use negative offsets */
|
|
+ vmovdqa -0x40(%rax),%ymm8 /* ymm8/ymm9 = the row's 16 dword indices */
|
|
+ vmovdqa -0x20(%rax),%ymm9
|
|
+ vpermi2d %ymm7,%ymm6,%ymm8 /* replace indices with the message words they select from ymm6:ymm7 */
|
|
+ vpermi2d %ymm7,%ymm6,%ymm9
|
|
+ vmovdqa %ymm8,%ymm6 /* permuted words become next round's source, so each SIGMA2 row applies to the previous round's ordering */
|
|
+ vmovdqa %ymm9,%ymm7
|
|
+ vpaddd %xmm8,%xmm0,%xmm0
|
|
+ vpaddd %xmm1,%xmm0,%xmm0
|
|
+ vpxor %xmm0,%xmm3,%xmm3
|
|
+ vprord $0x10,%xmm3,%xmm3 /* rotate right by 16 */
|
|
+ vpaddd %xmm3,%xmm2,%xmm2
|
|
+ vpxor %xmm2,%xmm1,%xmm1
|
|
+ vprord $0xc,%xmm1,%xmm1 /* rotate right by 12 */
|
|
+ vextracti128 $0x1,%ymm8,%xmm8 /* switch to the upper four words of ymm8 */
|
|
+ vpaddd %xmm8,%xmm0,%xmm0
|
|
+ vpaddd %xmm1,%xmm0,%xmm0
|
|
+ vpxor %xmm0,%xmm3,%xmm3
|
|
+ vprord $0x8,%xmm3,%xmm3 /* rotate right by 8 */
|
|
+ vpaddd %xmm3,%xmm2,%xmm2
|
|
+ vpxor %xmm2,%xmm1,%xmm1
|
|
+ vprord $0x7,%xmm1,%xmm1 /* rotate right by 7 */
|
|
+ vpshufd $0x93,%xmm0,%xmm0 /* permute dword lanes of xmm0/xmm3/xmm2 for the second half of the round */
|
|
+ vpshufd $0x4e,%xmm3,%xmm3
|
|
+ vpshufd $0x39,%xmm2,%xmm2
|
|
+ vpaddd %xmm9,%xmm0,%xmm0
|
|
+ vpaddd %xmm1,%xmm0,%xmm0
|
|
+ vpxor %xmm0,%xmm3,%xmm3
|
|
+ vprord $0x10,%xmm3,%xmm3
|
|
+ vpaddd %xmm3,%xmm2,%xmm2
|
|
+ vpxor %xmm2,%xmm1,%xmm1
|
|
+ vprord $0xc,%xmm1,%xmm1
|
|
+ vextracti128 $0x1,%ymm9,%xmm9 /* switch to the upper four words of ymm9 */
|
|
+ vpaddd %xmm9,%xmm0,%xmm0
|
|
+ vpaddd %xmm1,%xmm0,%xmm0
|
|
+ vpxor %xmm0,%xmm3,%xmm3
|
|
+ vprord $0x8,%xmm3,%xmm3
|
|
+ vpaddd %xmm3,%xmm2,%xmm2
|
|
+ vpxor %xmm2,%xmm1,%xmm1
|
|
+ vprord $0x7,%xmm1,%xmm1
|
|
+ vpshufd $0x39,%xmm0,%xmm0 /* inverse of the lane permutation applied mid-round */
|
|
+ vpshufd $0x4e,%xmm3,%xmm3
|
|
+ vpshufd $0x93,%xmm2,%xmm2
|
|
+ decb %cl
|
|
+ jne .Lblake2s_compress_avx512_roundloop
|
|
+ vpxor %xmm10,%xmm0,%xmm0 /* new state = saved state ^ row0/row1 ^ row2/row3 */
|
|
+ vpxor %xmm11,%xmm1,%xmm1
|
|
+ vpxor %xmm2,%xmm0,%xmm0
|
|
+ vpxor %xmm3,%xmm1,%xmm1
|
|
+ decq %rdx /* a count of 0 on entry would wrap here instead of terminating */
|
|
+ jne .Lblake2s_compress_avx512_mainloop
|
|
+ vmovdqu %xmm0,(%rdi) /* write the updated 32 bytes of state back */
|
|
+ vmovdqu %xmm1,0x10(%rdi)
|
|
+ vmovdqu %xmm4,0x20(%rdi) /* write back the 16 bytes at +0x20 with the accumulated inc */
|
|
+ vzeroupper /* clear upper ymm halves before returning to non-AVX code */
|
|
+ retq
|
|
+ENDPROC(blake2s_compress_avx512)
|
|
+#endif /* CONFIG_AS_AVX512 */
|
|
--- /dev/null
|
|
+++ b/arch/x86/crypto/blake2s-glue.c
|
|
@@ -0,0 +1,233 @@
|
|
+// SPDX-License-Identifier: GPL-2.0 OR MIT
|
|
+/*
|
|
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
|
+ */
|
|
+
|
|
+#include <crypto/internal/blake2s.h>
|
|
+#include <crypto/internal/simd.h>
|
|
+#include <crypto/internal/hash.h>
|
|
+
|
|
+#include <linux/types.h>
|
|
+#include <linux/jump_label.h>
|
|
+#include <linux/kernel.h>
|
|
+#include <linux/module.h>
|
|
+
|
|
+#include <asm/cpufeature.h>
|
|
+#include <asm/fpu/api.h>
|
|
+#include <asm/processor.h>
|
|
+#include <asm/simd.h>
|
|
+
|
|
+asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, /* SSSE3 routine from blake2s-core.S */
|
|
+ const u8 *block, const size_t nblocks,
|
|
+ const u32 inc);
|
|
+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, /* AVX-512VL routine from blake2s-core.S */
|
|
+ const u8 *block, const size_t nblocks,
|
|
+ const u32 inc);
|
|
+ /* Both keys default to false and are only enabled from blake2s_mod_init(). */
|
|
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
|
|
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
|
|
+
|
|
+void blake2s_compress_arch(struct blake2s_state *state, /* compress nblocks blocks via AVX-512, SSSE3 or the generic C code */
|
|
+ const u8 *block, size_t nblocks,
|
|
+ const u32 inc)
|
|
+{
|
|
+ /* SIMD disables preemption, so relax after processing each page. */
|
|
+ BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
|
|
+ /* Use the generic code when SSSE3 was not enabled at init or SIMD is not usable in this context. */
|
|
+ if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
|
|
+ blake2s_compress_generic(state, block, nblocks, inc);
|
|
+ return;
|
|
+ }
|
|
+ /* NOTE(review): the loop body runs at least once, so nblocks == 0 still calls the asm routine with blocks == 0 - confirm callers never pass 0. */
|
|
+ for (;;) {
|
|
+ const size_t blocks = min_t(size_t, nblocks, /* at most one page of input per FPU section */
|
|
+ PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
|
|
+
|
|
+ kernel_fpu_begin();
|
|
+ if (IS_ENABLED(CONFIG_AS_AVX512) && /* compile-time gate: the AVX-512 routine is only built when the assembler supports it */
|
|
+ static_branch_likely(&blake2s_use_avx512))
|
|
+ blake2s_compress_avx512(state, block, blocks, inc);
|
|
+ else
|
|
+ blake2s_compress_ssse3(state, block, blocks, inc);
|
|
+ kernel_fpu_end();
|
|
+
|
|
+ nblocks -= blocks;
|
|
+ if (!nblocks)
|
|
+ break;
|
|
+ block += blocks * BLAKE2S_BLOCK_SIZE; /* advance past the chunk just processed */
|
|
+ }
|
|
+}
|
|
+EXPORT_SYMBOL(blake2s_compress_arch);
|
|
+
|
|
+static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, /* shash .setkey: copy the key into the tfm context; returns 0 or -EINVAL */
|
|
+ unsigned int keylen)
|
|
+{
|
|
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
|
|
+ /* An empty key or one longer than BLAKE2S_KEY_SIZE is rejected and flagged on the tfm. */
|
|
+ if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
|
|
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ memcpy(tctx->key, key, keylen);
|
|
+ tctx->keylen = keylen; /* non-zero keylen later selects keyed init in crypto_blake2s_init() */
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int crypto_blake2s_init(struct shash_desc *desc) /* shash .init: set up the per-request BLAKE2s state; always returns 0 */
|
|
+{
|
|
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
|
+ struct blake2s_state *state = shash_desc_ctx(desc);
|
|
+ const int outlen = crypto_shash_digestsize(desc->tfm); /* digest size of the registered algorithm variant */
|
|
+ /* Keyed initialisation when a key was stored in the tfm context, plain initialisation otherwise. */
|
|
+ if (tctx->keylen)
|
|
+ blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
|
|
+ else
|
|
+ blake2s_init(state, outlen);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, /* shash .update: absorb inlen bytes, always leaving 1..BLAKE2S_BLOCK_SIZE bytes buffered once any input has been seen */
|
|
+ unsigned int inlen)
|
|
+{
|
|
+ struct blake2s_state *state = shash_desc_ctx(desc);
|
|
+ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; /* free space left in state->buf */
|
|
+
|
|
+ if (unlikely(!inlen))
|
|
+ return 0;
|
|
+ if (inlen > fill) { /* strictly greater: input that exactly fills the buffer is only buffered, not compressed */
|
|
+ memcpy(state->buf + state->buflen, in, fill);
|
|
+ blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
|
|
+ state->buflen = 0;
|
|
+ in += fill;
|
|
+ inlen -= fill;
|
|
+ }
|
|
+ if (inlen > BLAKE2S_BLOCK_SIZE) { /* only reachable with an empty buffer: otherwise inlen <= fill <= BLAKE2S_BLOCK_SIZE */
|
|
+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
|
|
+ /* Hash one less (full) block than strictly possible */
|
|
+ blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
|
|
+ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
|
|
+ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); /* leaves between 1 and BLAKE2S_BLOCK_SIZE bytes */
|
|
+ }
|
|
+ memcpy(state->buf + state->buflen, in, inlen); /* stash the remainder for a later update or final */
|
|
+ state->buflen += inlen;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) /* shash .final: compress the buffered tail, emit the digest, wipe the state; always returns 0 */
|
|
+{
|
|
+ struct blake2s_state *state = shash_desc_ctx(desc);
|
|
+
|
|
+ blake2s_set_lastblock(state);
|
|
+ memset(state->buf + state->buflen, 0,
|
|
+ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
|
|
+ blake2s_compress_arch(state, state->buf, 1, state->buflen); /* inc is the number of real bytes in the last block, not the full block size */
|
|
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); /* convert the hash words to little-endian byte order in place */
|
|
+ memcpy(out, state->h, state->outlen); /* emit only the first outlen bytes */
|
|
+ memzero_explicit(state, sizeof(*state)); /* wipe the whole state, including buffered input */
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static struct shash_alg blake2s_algs[] = {{ /* four shash variants that differ only in name, driver name and digest size */
|
|
+ .base.cra_name = "blake2s-128",
|
|
+ .base.cra_driver_name = "blake2s-128-x86",
|
|
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
|
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), /* per-tfm context holds the key set by .setkey */
|
|
+ .base.cra_priority = 200,
|
|
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
|
+ .base.cra_module = THIS_MODULE,
|
|
+
|
|
+ .digestsize = BLAKE2S_128_HASH_SIZE,
|
|
+ .setkey = crypto_blake2s_setkey,
|
|
+ .init = crypto_blake2s_init,
|
|
+ .update = crypto_blake2s_update,
|
|
+ .final = crypto_blake2s_final,
|
|
+ .descsize = sizeof(struct blake2s_state), /* per-request state used by init/update/final */
|
|
+}, {
|
|
+ .base.cra_name = "blake2s-160",
|
|
+ .base.cra_driver_name = "blake2s-160-x86",
|
|
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
|
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
|
+ .base.cra_priority = 200,
|
|
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
|
+ .base.cra_module = THIS_MODULE,
|
|
+
|
|
+ .digestsize = BLAKE2S_160_HASH_SIZE,
|
|
+ .setkey = crypto_blake2s_setkey,
|
|
+ .init = crypto_blake2s_init,
|
|
+ .update = crypto_blake2s_update,
|
|
+ .final = crypto_blake2s_final,
|
|
+ .descsize = sizeof(struct blake2s_state),
|
|
+}, {
|
|
+ .base.cra_name = "blake2s-224",
|
|
+ .base.cra_driver_name = "blake2s-224-x86",
|
|
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
|
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
|
+ .base.cra_priority = 200,
|
|
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
|
+ .base.cra_module = THIS_MODULE,
|
|
+
|
|
+ .digestsize = BLAKE2S_224_HASH_SIZE,
|
|
+ .setkey = crypto_blake2s_setkey,
|
|
+ .init = crypto_blake2s_init,
|
|
+ .update = crypto_blake2s_update,
|
|
+ .final = crypto_blake2s_final,
|
|
+ .descsize = sizeof(struct blake2s_state),
|
|
+}, {
|
|
+ .base.cra_name = "blake2s-256",
|
|
+ .base.cra_driver_name = "blake2s-256-x86",
|
|
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
|
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
|
+ .base.cra_priority = 200,
|
|
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
|
+ .base.cra_module = THIS_MODULE,
|
|
+
|
|
+ .digestsize = BLAKE2S_256_HASH_SIZE,
|
|
+ .setkey = crypto_blake2s_setkey,
|
|
+ .init = crypto_blake2s_init,
|
|
+ .update = crypto_blake2s_update,
|
|
+ .final = crypto_blake2s_final,
|
|
+ .descsize = sizeof(struct blake2s_state),
|
|
+}};
|
|
+
|
|
+static int __init blake2s_mod_init(void) /* probe CPU features, enable the matching static keys, register the shash algorithms */
|
|
+{
|
|
+ if (!boot_cpu_has(X86_FEATURE_SSSE3)) /* no SSSE3: load successfully but register nothing; both static keys stay off */
|
|
+ return 0;
|
|
+
|
|
+ static_branch_enable(&blake2s_use_ssse3);
|
|
+ /* AVX-512 path: needs assembler support, the AVX/AVX2/AVX512F/AVX512VL CPU bits, and the SSE/YMM/AVX512 xfeatures. */
|
|
+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
|
+ boot_cpu_has(X86_FEATURE_AVX) &&
|
|
+ boot_cpu_has(X86_FEATURE_AVX2) &&
|
|
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
|
|
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
|
|
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
|
|
+ XFEATURE_MASK_AVX512, NULL))
|
|
+ static_branch_enable(&blake2s_use_avx512);
|
|
+
|
|
+ return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); /* registration result is the module init result */
|
|
+}
|
|
+
|
|
+static void __exit blake2s_mod_exit(void) /* undo blake2s_mod_init() */
|
|
+{
|
|
+ if (boot_cpu_has(X86_FEATURE_SSSE3)) /* same condition as init: the algorithms were only registered when SSSE3 is present */
|
|
+ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
|
|
+}
|
|
+
|
|
+module_init(blake2s_mod_init);
|
|
+module_exit(blake2s_mod_exit);
|
|
+ /* One alias per cra_name and per cra_driver_name listed in blake2s_algs. */
|
|
+MODULE_ALIAS_CRYPTO("blake2s-128");
|
|
+MODULE_ALIAS_CRYPTO("blake2s-128-x86");
|
|
+MODULE_ALIAS_CRYPTO("blake2s-160");
|
|
+MODULE_ALIAS_CRYPTO("blake2s-160-x86");
|
|
+MODULE_ALIAS_CRYPTO("blake2s-224");
|
|
+MODULE_ALIAS_CRYPTO("blake2s-224-x86");
|
|
+MODULE_ALIAS_CRYPTO("blake2s-256");
|
|
+MODULE_ALIAS_CRYPTO("blake2s-256-x86");
|
|
+MODULE_LICENSE("GPL v2"); /* NOTE(review): the file's SPDX tag says "GPL-2.0 OR MIT" - confirm the intended module licence string */
|
|
--- a/crypto/Kconfig
|
|
+++ b/crypto/Kconfig
|
|
@@ -657,6 +657,12 @@ config CRYPTO_BLAKE2S
|
|
|
|
See https://blake2.net for further information.
|
|
|
|
+config CRYPTO_BLAKE2S_X86
|
|
+ tristate "BLAKE2s digest algorithm (x86 accelerated version)"
|
|
+ depends on X86 && 64BIT
|
|
+ select CRYPTO_LIB_BLAKE2S_GENERIC
|
|
+ select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
|
|
+
|
|
config CRYPTO_CRCT10DIF
|
|
tristate "CRCT10DIF algorithm"
|
|
select CRYPTO_HASH
|