From 70f37962cfea04272c49371ee1b68e212e086b67 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Tue, 20 Oct 2020 18:50:28 -0400 Subject: [PATCH] Backport AES fixes for compiler, arch, and splitting into separate files. --- node/AES.cpp | 1020 +--------------------------------------- node/AES.hpp | 57 ++- node/AES_aesni.cpp | 651 +++++++++++++++++++++++++ node/AES_armcrypto.cpp | 388 +++++++++++++++ objects.mk | 2 + 5 files changed, 1100 insertions(+), 1018 deletions(-) create mode 100644 node/AES_aesni.cpp create mode 100644 node/AES_armcrypto.cpp diff --git a/node/AES.cpp b/node/AES.cpp index 00402146f..8402fc908 100644 --- a/node/AES.cpp +++ b/node/AES.cpp @@ -31,37 +31,6 @@ namespace ZeroTier { namespace { -#ifdef ZT_AES_NEON - -ZT_INLINE uint8x16_t s_clmul_armneon_crypto(uint8x16_t h, uint8x16_t y, const uint8_t b[16]) noexcept -{ - uint8x16_t r0, r1, t0, t1; - r0 = vld1q_u8(b); - const uint8x16_t z = veorq_u8(h, h); - y = veorq_u8(r0, y); - y = vrbitq_u8(y); - const uint8x16_t p = vreinterpretq_u8_u64(vdupq_n_u64(0x0000000000000087)); - t0 = vextq_u8(y, y, 8); - __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t" : "=w" (r0) : "w" (h), "w" (y)); - __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t" :"=w" (r1) : "w" (h), "w" (y)); - __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t" : "=w" (t1) : "w" (h), "w" (t0)); - __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t" :"=w" (t0) : "w" (h), "w" (t0)); - t0 = veorq_u8(t0, t1); - t1 = vextq_u8(z, t0, 8); - r0 = veorq_u8(r0, t1); - t1 = vextq_u8(t0, z, 8); - r1 = veorq_u8(r1, t1); - __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t" :"=w" (t0) : "w" (r1), "w" (p)); - t1 = vextq_u8(t0, z, 8); - r1 = veorq_u8(r1, t1); - t1 = vextq_u8(z, t0, 8); - r0 = veorq_u8(r0, t1); - __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t" : "=w" (t0) : "w" (r1), "w" (p)); - return vrbitq_u8(veorq_u8(r0, t0)); -} - -#endif // ZT_AES_NEON - #define s_bmul32(N, x, y, rh, rl) \ uint32_t x0t_##N = (x) & 0x11111111U; \ uint32_t x1t_##N = (x) & 0x22222222U; \ @@ -141,36 +110,6 @@ void s_gfmul(const uint64_t hh, const uint64_t hl, uint64_t &y0, uint64_t &y1) n } // anonymous namespace -#ifdef ZT_AES_AESNI - -// SSE shuffle parameter to reverse bytes in a 128-bit vector. 
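// Illustrative note: _mm_shuffle_epi8 with this mask performs a full 16-byte reversal. For
// example, shuffling {0x00, 0x01, ..., 0x0e, 0x0f} yields {0x0f, 0x0e, ..., 0x01, 0x00}.
// GHASH is specified on big-endian blocks, so inputs and outputs of the PCLMUL math below
// are passed through this shuffle to convert between wire byte order and the lane order the
// carry-less multiply code works in.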
-static const __m128i s_sseSwapBytes = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - -__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2"))) -static __m128i p_gmacPCLMUL128(const __m128i h, __m128i y) noexcept -{ - y = _mm_shuffle_epi8(y, s_sseSwapBytes); - __m128i t1 = _mm_clmulepi64_si128(h, y, 0x00); - __m128i t2 = _mm_clmulepi64_si128(h, y, 0x01); - __m128i t3 = _mm_clmulepi64_si128(h, y, 0x10); - __m128i t4 = _mm_clmulepi64_si128(h, y, 0x11); - t2 = _mm_xor_si128(t2, t3); - t3 = _mm_slli_si128(t2, 8); - t2 = _mm_srli_si128(t2, 8); - t1 = _mm_xor_si128(t1, t3); - t4 = _mm_xor_si128(t4, t2); - __m128i t5 = _mm_srli_epi32(t1, 31); - t1 = _mm_or_si128(_mm_slli_epi32(t1, 1), _mm_slli_si128(t5, 4)); - t4 = _mm_or_si128(_mm_or_si128(_mm_slli_epi32(t4, 1), _mm_slli_si128(_mm_srli_epi32(t4, 31), 4)), _mm_srli_si128(t5, 12)); - t5 = _mm_xor_si128(_mm_xor_si128(_mm_slli_epi32(t1, 31), _mm_slli_epi32(t1, 30)), _mm_slli_epi32(t1, 25)); - t1 = _mm_xor_si128(t1, _mm_slli_si128(t5, 12)); - t4 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(t4, _mm_srli_si128(t5, 4)), t1), _mm_srli_epi32(t1, 2)), _mm_srli_epi32(t1, 7)), _mm_srli_epi32(t1, 1)); - return _mm_shuffle_epi8(t4, s_sseSwapBytes); -} - -#endif - -__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2"))) void AES::GMAC::update(const void *const data, unsigned int len) noexcept { const uint8_t *in = reinterpret_cast(data); @@ -178,108 +117,20 @@ void AES::GMAC::update(const void *const data, unsigned int len) noexcept #ifdef ZT_AES_AESNI if (likely(Utils::CPUID.aes)) { - __m128i y = _mm_loadu_si128(reinterpret_cast(_y)); - - // Handle anything left over from a previous run that wasn't a multiple of 16 bytes. - if (_rp) { - for (;;) { - if (!len) - return; - --len; - _r[_rp++] = *(in++); - if (_rp == 16) { - y = p_gmacPCLMUL128(_aes._k.ni.h[0], _mm_xor_si128(y, _mm_loadu_si128(reinterpret_cast<__m128i *>(_r)))); - break; - } - } - } - - if (likely(len >= 64)) { - const __m128i sb = s_sseSwapBytes; - const __m128i h = _aes._k.ni.h[0]; - const __m128i hh = _aes._k.ni.h[1]; - const __m128i hhh = _aes._k.ni.h[2]; - const __m128i hhhh = _aes._k.ni.h[3]; - const __m128i h2 = _aes._k.ni.h2[0]; - const __m128i hh2 = _aes._k.ni.h2[1]; - const __m128i hhh2 = _aes._k.ni.h2[2]; - const __m128i hhhh2 = _aes._k.ni.h2[3]; - const uint8_t *const end64 = in + (len & ~((unsigned int)63)); - len &= 63; - do { - __m128i d1 = _mm_shuffle_epi8(_mm_xor_si128(y, _mm_loadu_si128(reinterpret_cast(in))), sb); - __m128i d2 = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast(in + 16)), sb); - __m128i d3 = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast(in + 32)), sb); - __m128i d4 = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast(in + 48)), sb); - in += 64; - __m128i a = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(hhhh, d1, 0x00), _mm_clmulepi64_si128(hhh, d2, 0x00)), _mm_xor_si128(_mm_clmulepi64_si128(hh, d3, 0x00), _mm_clmulepi64_si128(h, d4, 0x00))); - __m128i b = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(hhhh, d1, 0x11), _mm_clmulepi64_si128(hhh, d2, 0x11)), _mm_xor_si128(_mm_clmulepi64_si128(hh, d3, 0x11), _mm_clmulepi64_si128(h, d4, 0x11))); - __m128i c = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(hhhh2, _mm_xor_si128(_mm_shuffle_epi32(d1, 78), d1), 0x00), _mm_clmulepi64_si128(hhh2, _mm_xor_si128(_mm_shuffle_epi32(d2, 78), d2), 0x00)), _mm_xor_si128(_mm_clmulepi64_si128(hh2, _mm_xor_si128(_mm_shuffle_epi32(d3, 78), d3), 0x00), _mm_clmulepi64_si128(h2, _mm_xor_si128(_mm_shuffle_epi32(d4, 
78), d4), 0x00))), _mm_xor_si128(a, b)); - a = _mm_xor_si128(_mm_slli_si128(c, 8), a); - b = _mm_xor_si128(_mm_srli_si128(c, 8), b); - c = _mm_srli_epi32(a, 31); - a = _mm_or_si128(_mm_slli_epi32(a, 1), _mm_slli_si128(c, 4)); - b = _mm_or_si128(_mm_or_si128(_mm_slli_epi32(b, 1), _mm_slli_si128(_mm_srli_epi32(b, 31), 4)), _mm_srli_si128(c, 12)); - c = _mm_xor_si128(_mm_slli_epi32(a, 31), _mm_xor_si128(_mm_slli_epi32(a, 30), _mm_slli_epi32(a, 25))); - a = _mm_xor_si128(a, _mm_slli_si128(c, 12)); - b = _mm_xor_si128(b, _mm_xor_si128(a, _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(a, 1), _mm_srli_si128(c, 4)), _mm_xor_si128(_mm_srli_epi32(a, 2), _mm_srli_epi32(a, 7))))); - y = _mm_shuffle_epi8(b, sb); - } while (likely(in != end64)); - } - - while (len >= 16) { - y = p_gmacPCLMUL128(_aes._k.ni.h[0], _mm_xor_si128(y, _mm_loadu_si128(reinterpret_cast(in)))); - in += 16; - len -= 16; - } - - _mm_storeu_si128(reinterpret_cast<__m128i *>(_y), y); - - // Any overflow is cached for a later run or finish(). - for (unsigned int i = 0; i < len; ++i) - _r[i] = in[i]; - _rp = len; // len is always less than 16 here - + p_aesNIUpdate(in, len); return; } #endif // ZT_AES_AESNI #ifdef ZT_AES_NEON if (Utils::ARMCAP.pmull) { - uint8x16_t y = vld1q_u8(reinterpret_cast(_y)); - const uint8x16_t h = _aes._k.neon.h; - - if (_rp) { - for(;;) { - if (!len) - return; - --len; - _r[_rp++] = *(in++); - if (_rp == 16) { - y = s_clmul_armneon_crypto(h, y, _r); - break; - } - } - } - - while (len >= 16) { - y = s_clmul_armneon_crypto(h, y, in); - in += 16; - len -= 16; - } - - vst1q_u8(reinterpret_cast(_y), y); - - for (unsigned int i = 0; i < len; ++i) - _r[i] = in[i]; - _rp = len; // len is always less than 16 here - + p_armUpdate(in, len); return; } #endif // ZT_AES_NEON - const uint64_t h0 = _aes._k.sw.h[0]; - const uint64_t h1 = _aes._k.sw.h[1]; + const uint64_t h0 = _aes.p_k.sw.h[0]; + const uint64_t h1 = _aes.p_k.sw.h[1]; uint64_t y0 = _y[0]; uint64_t y1 = _y[1]; @@ -324,116 +175,24 @@ void AES::GMAC::update(const void *const data, unsigned int len) noexcept _rp = len; // len is always less than 16 here } -__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2"))) void AES::GMAC::finish(uint8_t tag[16]) noexcept { #ifdef ZT_AES_AESNI if (likely(Utils::CPUID.aes)) { - __m128i y = _mm_loadu_si128(reinterpret_cast(_y)); - - // Handle any remaining bytes, padding the last block with zeroes. - if (_rp) { - while (_rp < 16) - _r[_rp++] = 0; - y = p_gmacPCLMUL128(_aes._k.ni.h[0], _mm_xor_si128(y, _mm_loadu_si128(reinterpret_cast<__m128i *>(_r)))); - } - - // Interleave encryption of IV with the final GHASH of y XOR (length * 8). - // Then XOR these together to get the final tag. 
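// Illustrative sketch: in standard GCM/GMAC terms, the two interleaved dependency chains
// below compute
//
//     S   = GHASH_H( data padded to 16 bytes || [bitlen(data)]_64 || [0]_64 )
//     tag = AES256_K( J0 ) XOR S
//
// where J0 is the 96-bit IV followed by a 32-bit block counter of 1 (the ARM path further
// below builds that block explicitly). The AES rounds on encIV and the PCLMUL reduction on
// y are independent, so they are interleaved to overlap their latencies; the final XOR
// combines them into the tag.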
- const __m128i *const k = _aes._k.ni.k; - const __m128i h = _aes._k.ni.h[0]; - y = _mm_xor_si128(y, _mm_set_epi64x(0LL, (long long)Utils::hton((uint64_t)_len << 3U))); - y = _mm_shuffle_epi8(y, s_sseSwapBytes); - __m128i encIV = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast(_iv)), k[0]); - __m128i t1 = _mm_clmulepi64_si128(h, y, 0x00); - __m128i t2 = _mm_clmulepi64_si128(h, y, 0x01); - __m128i t3 = _mm_clmulepi64_si128(h, y, 0x10); - __m128i t4 = _mm_clmulepi64_si128(h, y, 0x11); - encIV = _mm_aesenc_si128(encIV, k[1]); - t2 = _mm_xor_si128(t2, t3); - t3 = _mm_slli_si128(t2, 8); - encIV = _mm_aesenc_si128(encIV, k[2]); - t2 = _mm_srli_si128(t2, 8); - t1 = _mm_xor_si128(t1, t3); - encIV = _mm_aesenc_si128(encIV, k[3]); - t4 = _mm_xor_si128(t4, t2); - __m128i t5 = _mm_srli_epi32(t1, 31); - t1 = _mm_slli_epi32(t1, 1); - __m128i t6 = _mm_srli_epi32(t4, 31); - encIV = _mm_aesenc_si128(encIV, k[4]); - t4 = _mm_slli_epi32(t4, 1); - t3 = _mm_srli_si128(t5, 12); - encIV = _mm_aesenc_si128(encIV, k[5]); - t6 = _mm_slli_si128(t6, 4); - t5 = _mm_slli_si128(t5, 4); - encIV = _mm_aesenc_si128(encIV, k[6]); - t1 = _mm_or_si128(t1, t5); - t4 = _mm_or_si128(t4, t6); - encIV = _mm_aesenc_si128(encIV, k[7]); - t4 = _mm_or_si128(t4, t3); - t5 = _mm_slli_epi32(t1, 31); - encIV = _mm_aesenc_si128(encIV, k[8]); - t6 = _mm_slli_epi32(t1, 30); - t3 = _mm_slli_epi32(t1, 25); - encIV = _mm_aesenc_si128(encIV, k[9]); - t5 = _mm_xor_si128(t5, t6); - t5 = _mm_xor_si128(t5, t3); - encIV = _mm_aesenc_si128(encIV, k[10]); - t6 = _mm_srli_si128(t5, 4); - t4 = _mm_xor_si128(t4, t6); - encIV = _mm_aesenc_si128(encIV, k[11]); - t5 = _mm_slli_si128(t5, 12); - t1 = _mm_xor_si128(t1, t5); - t4 = _mm_xor_si128(t4, t1); - t5 = _mm_srli_epi32(t1, 1); - encIV = _mm_aesenc_si128(encIV, k[12]); - t2 = _mm_srli_epi32(t1, 2); - t3 = _mm_srli_epi32(t1, 7); - encIV = _mm_aesenc_si128(encIV, k[13]); - t4 = _mm_xor_si128(t4, t2); - t4 = _mm_xor_si128(t4, t3); - encIV = _mm_aesenclast_si128(encIV, k[14]); - t4 = _mm_xor_si128(t4, t5); - _mm_storeu_si128(reinterpret_cast<__m128i *>(tag), _mm_xor_si128(_mm_shuffle_epi8(t4, s_sseSwapBytes), encIV)); - + p_aesNIFinish(tag); return; } #endif // ZT_AES_AESNI #ifdef ZT_AES_NEON if (Utils::ARMCAP.pmull) { - uint64_t tmp[2]; - uint8x16_t y = vld1q_u8(reinterpret_cast(_y)); - const uint8x16_t h = _aes._k.neon.h; - - if (_rp) { - while (_rp < 16) - _r[_rp++] = 0; - y = s_clmul_armneon_crypto(h, y, _r); - } - - tmp[0] = Utils::hton((uint64_t)_len << 3U); - tmp[1] = 0; - y = s_clmul_armneon_crypto(h, y, reinterpret_cast(tmp)); - - Utils::copy< 12 >(tmp, _iv); -#if __BYTE_ORDER == __BIG_ENDIAN - reinterpret_cast(tmp)[3] = 0x00000001; -#else - reinterpret_cast(tmp)[3] = 0x01000000; -#endif - _aes.encrypt(tmp, tmp); - - uint8x16_t yy = y; - Utils::storeMachineEndian< uint64_t >(tag, tmp[0] ^ reinterpret_cast(&yy)[0]); - Utils::storeMachineEndian< uint64_t >(tag + 8, tmp[1] ^ reinterpret_cast(&yy)[1]); - + p_armFinish(tag); return; } #endif // ZT_AES_NEON - const uint64_t h0 = _aes._k.sw.h[0]; - const uint64_t h1 = _aes._k.sw.h[1]; + const uint64_t h0 = _aes.p_k.sw.h[0]; + const uint64_t h1 = _aes.p_k.sw.h[1]; uint64_t y0 = _y[0]; uint64_t y1 = _y[1]; @@ -463,140 +222,6 @@ void AES::GMAC::finish(uint8_t tag[16]) noexcept // AES-CTR ------------------------------------------------------------------------------------------------------------ -#ifdef ZT_AES_AESNI - -/* Disable VAES stuff on compilers too old to compile these intrinsics, - * and MinGW64 also seems not to support them so disable on Windows. 
- * The performance gain can be significant but regular SSE is already so - * fast it's highly unlikely to be a rate limiting factor except on massive - * servers and network infrastructure stuff. */ -#if !defined(__WINDOWS__) && ((__GNUC__ >= 8) || (__clang_major__ >= 7)) - -#define ZT_AES_VAES512 1 - -static -__attribute__((__target__("sse4,avx,avx2,vaes,avx512f,avx512bw"))) -void p_aesCtrInnerVAES512(unsigned int &len, const uint64_t c0, uint64_t &c1, const uint8_t *&in, uint8_t *&out, const __m128i *const k) noexcept -{ - const __m512i kk0 = _mm512_broadcast_i32x4(k[0]); - const __m512i kk1 = _mm512_broadcast_i32x4(k[1]); - const __m512i kk2 = _mm512_broadcast_i32x4(k[2]); - const __m512i kk3 = _mm512_broadcast_i32x4(k[3]); - const __m512i kk4 = _mm512_broadcast_i32x4(k[4]); - const __m512i kk5 = _mm512_broadcast_i32x4(k[5]); - const __m512i kk6 = _mm512_broadcast_i32x4(k[6]); - const __m512i kk7 = _mm512_broadcast_i32x4(k[7]); - const __m512i kk8 = _mm512_broadcast_i32x4(k[8]); - const __m512i kk9 = _mm512_broadcast_i32x4(k[9]); - const __m512i kk10 = _mm512_broadcast_i32x4(k[10]); - const __m512i kk11 = _mm512_broadcast_i32x4(k[11]); - const __m512i kk12 = _mm512_broadcast_i32x4(k[12]); - const __m512i kk13 = _mm512_broadcast_i32x4(k[13]); - const __m512i kk14 = _mm512_broadcast_i32x4(k[14]); - do { - __m512i p0 = _mm512_loadu_si512(reinterpret_cast(in)); - __m512i d0 = _mm512_set_epi64( - (long long)Utils::hton(c1 + 3ULL), (long long)c0, - (long long)Utils::hton(c1 + 2ULL), (long long)c0, - (long long)Utils::hton(c1 + 1ULL), (long long)c0, - (long long)Utils::hton(c1), (long long)c0); - c1 += 4; - in += 64; - len -= 64; - d0 = _mm512_xor_si512(d0, kk0); - d0 = _mm512_aesenc_epi128(d0, kk1); - d0 = _mm512_aesenc_epi128(d0, kk2); - d0 = _mm512_aesenc_epi128(d0, kk3); - d0 = _mm512_aesenc_epi128(d0, kk4); - d0 = _mm512_aesenc_epi128(d0, kk5); - d0 = _mm512_aesenc_epi128(d0, kk6); - d0 = _mm512_aesenc_epi128(d0, kk7); - d0 = _mm512_aesenc_epi128(d0, kk8); - d0 = _mm512_aesenc_epi128(d0, kk9); - d0 = _mm512_aesenc_epi128(d0, kk10); - d0 = _mm512_aesenc_epi128(d0, kk11); - d0 = _mm512_aesenc_epi128(d0, kk12); - d0 = _mm512_aesenc_epi128(d0, kk13); - d0 = _mm512_aesenclast_epi128(d0, kk14); - _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), _mm512_xor_si512(p0, d0)); - out += 64; - } while (likely(len >= 64)); -} - -#define ZT_AES_VAES256 1 - -static -__attribute__((__target__("sse4,avx,avx2,vaes"))) -void p_aesCtrInnerVAES256(unsigned int &len, const uint64_t c0, uint64_t &c1, const uint8_t *&in, uint8_t *&out, const __m128i *const k) noexcept -{ - const __m256i kk0 = _mm256_broadcastsi128_si256(k[0]); - const __m256i kk1 = _mm256_broadcastsi128_si256(k[1]); - const __m256i kk2 = _mm256_broadcastsi128_si256(k[2]); - const __m256i kk3 = _mm256_broadcastsi128_si256(k[3]); - const __m256i kk4 = _mm256_broadcastsi128_si256(k[4]); - const __m256i kk5 = _mm256_broadcastsi128_si256(k[5]); - const __m256i kk6 = _mm256_broadcastsi128_si256(k[6]); - const __m256i kk7 = _mm256_broadcastsi128_si256(k[7]); - const __m256i kk8 = _mm256_broadcastsi128_si256(k[8]); - const __m256i kk9 = _mm256_broadcastsi128_si256(k[9]); - const __m256i kk10 = _mm256_broadcastsi128_si256(k[10]); - const __m256i kk11 = _mm256_broadcastsi128_si256(k[11]); - const __m256i kk12 = _mm256_broadcastsi128_si256(k[12]); - const __m256i kk13 = _mm256_broadcastsi128_si256(k[13]); - const __m256i kk14 = _mm256_broadcastsi128_si256(k[14]); - do { - __m256i p0 = _mm256_loadu_si256(reinterpret_cast(in)); - __m256i p1 = 
_mm256_loadu_si256(reinterpret_cast(in + 32)); - __m256i d0 = _mm256_set_epi64x( - (long long)Utils::hton(c1 + 1ULL), (long long)c0, - (long long)Utils::hton(c1), (long long)c0); - __m256i d1 = _mm256_set_epi64x( - (long long)Utils::hton(c1 + 3ULL), (long long)c0, - (long long)Utils::hton(c1 + 2ULL), (long long)c0); - c1 += 4; - in += 64; - len -= 64; - d0 = _mm256_xor_si256(d0, kk0); - d1 = _mm256_xor_si256(d1, kk0); - d0 = _mm256_aesenc_epi128(d0, kk1); - d1 = _mm256_aesenc_epi128(d1, kk1); - d0 = _mm256_aesenc_epi128(d0, kk2); - d1 = _mm256_aesenc_epi128(d1, kk2); - d0 = _mm256_aesenc_epi128(d0, kk3); - d1 = _mm256_aesenc_epi128(d1, kk3); - d0 = _mm256_aesenc_epi128(d0, kk4); - d1 = _mm256_aesenc_epi128(d1, kk4); - d0 = _mm256_aesenc_epi128(d0, kk5); - d1 = _mm256_aesenc_epi128(d1, kk5); - d0 = _mm256_aesenc_epi128(d0, kk6); - d1 = _mm256_aesenc_epi128(d1, kk6); - d0 = _mm256_aesenc_epi128(d0, kk7); - d1 = _mm256_aesenc_epi128(d1, kk7); - d0 = _mm256_aesenc_epi128(d0, kk8); - d1 = _mm256_aesenc_epi128(d1, kk8); - d0 = _mm256_aesenc_epi128(d0, kk9); - d1 = _mm256_aesenc_epi128(d1, kk9); - d0 = _mm256_aesenc_epi128(d0, kk10); - d1 = _mm256_aesenc_epi128(d1, kk10); - d0 = _mm256_aesenc_epi128(d0, kk11); - d1 = _mm256_aesenc_epi128(d1, kk11); - d0 = _mm256_aesenc_epi128(d0, kk12); - d1 = _mm256_aesenc_epi128(d1, kk12); - d0 = _mm256_aesenc_epi128(d0, kk13); - d1 = _mm256_aesenc_epi128(d1, kk13); - d0 = _mm256_aesenclast_epi128(d0, kk14); - d1 = _mm256_aesenclast_epi128(d1, kk14); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), _mm256_xor_si256(d0, p0)); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32), _mm256_xor_si256(d1, p1)); - out += 64; - } while (likely(len >= 64)); -} - -#endif // does compiler support AVX2 and AVX512 AES intrinsics? - -#endif // ZT_AES_AESNI - -__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2"))) void AES::CTR::crypt(const void *const input, unsigned int len) noexcept { const uint8_t *in = reinterpret_cast(input); @@ -604,388 +229,14 @@ void AES::CTR::crypt(const void *const input, unsigned int len) noexcept #ifdef ZT_AES_AESNI if (likely(Utils::CPUID.aes)) { - const __m128i dd = _mm_set_epi64x(0, (long long)_ctr[0]); - uint64_t c1 = Utils::ntoh(_ctr[1]); - - const __m128i *const k = _aes._k.ni.k; - const __m128i k0 = k[0]; - const __m128i k1 = k[1]; - const __m128i k2 = k[2]; - const __m128i k3 = k[3]; - const __m128i k4 = k[4]; - const __m128i k5 = k[5]; - const __m128i k6 = k[6]; - const __m128i k7 = k[7]; - const __m128i k8 = k[8]; - const __m128i k9 = k[9]; - const __m128i k10 = k[10]; - const __m128i k11 = k[11]; - const __m128i k12 = k[12]; - const __m128i k13 = k[13]; - const __m128i k14 = k[14]; - - // Complete any unfinished blocks from previous calls to crypt(). 
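// Illustrative sketch of the carry logic that follows, in scalar form. nextKeystreamBlock
// is a hypothetical callback standing in for "encrypt the current counter block and advance
// the counter"; the real code below works directly on the member fields.
static inline unsigned int ctrResumePartialBlock(uint8_t *out, unsigned int totalLen,
                                                 const uint8_t *&in, unsigned int &len,
                                                 void (*nextKeystreamBlock)(uint8_t ks[16]))
{
	while ((totalLen & 15U) != 0) {
		if (len == 0)
			return totalLen;                      // still mid-block; the next call resumes here
		out[totalLen++] = *(in++);                // stage plaintext in the output buffer
		--len;
		if ((totalLen & 15U) == 0) {              // a whole block is now staged...
			uint8_t ks[16];
			nextKeystreamBlock(ks);               // ...so overwrite it with plaintext XOR keystream
			for (unsigned int i = 0; i < 16; ++i)
				out[totalLen - 16U + i] ^= ks[i];
		}
	}
	return totalLen;
}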
- unsigned int totalLen = _len; - if ((totalLen & 15U)) { - for (;;) { - if (unlikely(!len)) { - _ctr[1] = Utils::hton(c1); - _len = totalLen; - return; - } - --len; - out[totalLen++] = *(in++); - if (!(totalLen & 15U)) { - __m128i d0 = _mm_insert_epi64(dd, (long long)Utils::hton(c1++), 1); - d0 = _mm_xor_si128(d0, k0); - d0 = _mm_aesenc_si128(d0, k1); - d0 = _mm_aesenc_si128(d0, k2); - d0 = _mm_aesenc_si128(d0, k3); - d0 = _mm_aesenc_si128(d0, k4); - d0 = _mm_aesenc_si128(d0, k5); - d0 = _mm_aesenc_si128(d0, k6); - d0 = _mm_aesenc_si128(d0, k7); - d0 = _mm_aesenc_si128(d0, k8); - d0 = _mm_aesenc_si128(d0, k9); - d0 = _mm_aesenc_si128(d0, k10); - __m128i *const outblk = reinterpret_cast<__m128i *>(out + (totalLen - 16)); - d0 = _mm_aesenc_si128(d0, k11); - const __m128i p0 = _mm_loadu_si128(outblk); - d0 = _mm_aesenc_si128(d0, k12); - d0 = _mm_aesenc_si128(d0, k13); - d0 = _mm_aesenclast_si128(d0, k14); - _mm_storeu_si128(outblk, _mm_xor_si128(p0, d0)); - break; - } - } - } - - out += totalLen; - _len = totalLen + len; - - if (likely(len >= 64)) { - -#if defined(ZT_AES_VAES512) && defined(ZT_AES_VAES256) - if (Utils::CPUID.vaes && (len >= 256)) { - if (Utils::CPUID.avx512f) { - p_aesCtrInnerVAES512(len, _ctr[0], c1, in, out, k); - } else { - p_aesCtrInnerVAES256(len, _ctr[0], c1, in, out, k); - } - goto skip_conventional_aesni_64; - } -#endif - -#if !defined(ZT_AES_VAES512) && defined(ZT_AES_VAES256) - if (Utils::CPUID.vaes && (len >= 256)) { - p_aesCtrInnerVAES256(len, _ctr[0], c1, in, out, k); - goto skip_conventional_aesni_64; - } -#endif - - const uint8_t *const eof64 = in + (len & ~((unsigned int)63)); - len &= 63; - __m128i d0, d1, d2, d3; - do { - const uint64_t c10 = Utils::hton(c1); - const uint64_t c11 = Utils::hton(c1 + 1ULL); - const uint64_t c12 = Utils::hton(c1 + 2ULL); - const uint64_t c13 = Utils::hton(c1 + 3ULL); - d0 = _mm_insert_epi64(dd, (long long)c10, 1); - d1 = _mm_insert_epi64(dd, (long long)c11, 1); - d2 = _mm_insert_epi64(dd, (long long)c12, 1); - d3 = _mm_insert_epi64(dd, (long long)c13, 1); - c1 += 4; - d0 = _mm_xor_si128(d0, k0); - d1 = _mm_xor_si128(d1, k0); - d2 = _mm_xor_si128(d2, k0); - d3 = _mm_xor_si128(d3, k0); - d0 = _mm_aesenc_si128(d0, k1); - d1 = _mm_aesenc_si128(d1, k1); - d2 = _mm_aesenc_si128(d2, k1); - d3 = _mm_aesenc_si128(d3, k1); - d0 = _mm_aesenc_si128(d0, k2); - d1 = _mm_aesenc_si128(d1, k2); - d2 = _mm_aesenc_si128(d2, k2); - d3 = _mm_aesenc_si128(d3, k2); - d0 = _mm_aesenc_si128(d0, k3); - d1 = _mm_aesenc_si128(d1, k3); - d2 = _mm_aesenc_si128(d2, k3); - d3 = _mm_aesenc_si128(d3, k3); - d0 = _mm_aesenc_si128(d0, k4); - d1 = _mm_aesenc_si128(d1, k4); - d2 = _mm_aesenc_si128(d2, k4); - d3 = _mm_aesenc_si128(d3, k4); - d0 = _mm_aesenc_si128(d0, k5); - d1 = _mm_aesenc_si128(d1, k5); - d2 = _mm_aesenc_si128(d2, k5); - d3 = _mm_aesenc_si128(d3, k5); - d0 = _mm_aesenc_si128(d0, k6); - d1 = _mm_aesenc_si128(d1, k6); - d2 = _mm_aesenc_si128(d2, k6); - d3 = _mm_aesenc_si128(d3, k6); - d0 = _mm_aesenc_si128(d0, k7); - d1 = _mm_aesenc_si128(d1, k7); - d2 = _mm_aesenc_si128(d2, k7); - d3 = _mm_aesenc_si128(d3, k7); - d0 = _mm_aesenc_si128(d0, k8); - d1 = _mm_aesenc_si128(d1, k8); - d2 = _mm_aesenc_si128(d2, k8); - d3 = _mm_aesenc_si128(d3, k8); - d0 = _mm_aesenc_si128(d0, k9); - d1 = _mm_aesenc_si128(d1, k9); - d2 = _mm_aesenc_si128(d2, k9); - d3 = _mm_aesenc_si128(d3, k9); - d0 = _mm_aesenc_si128(d0, k10); - d1 = _mm_aesenc_si128(d1, k10); - d2 = _mm_aesenc_si128(d2, k10); - d3 = _mm_aesenc_si128(d3, k10); - d0 = _mm_aesenc_si128(d0, k11); - d1 = 
_mm_aesenc_si128(d1, k11); - d2 = _mm_aesenc_si128(d2, k11); - d3 = _mm_aesenc_si128(d3, k11); - d0 = _mm_aesenc_si128(d0, k12); - d1 = _mm_aesenc_si128(d1, k12); - d2 = _mm_aesenc_si128(d2, k12); - d3 = _mm_aesenc_si128(d3, k12); - d0 = _mm_aesenc_si128(d0, k13); - d1 = _mm_aesenc_si128(d1, k13); - d2 = _mm_aesenc_si128(d2, k13); - d3 = _mm_aesenc_si128(d3, k13); - d0 = _mm_xor_si128(_mm_aesenclast_si128(d0, k14), _mm_loadu_si128(reinterpret_cast(in))); - d1 = _mm_xor_si128(_mm_aesenclast_si128(d1, k14), _mm_loadu_si128(reinterpret_cast(in + 16))); - d2 = _mm_xor_si128(_mm_aesenclast_si128(d2, k14), _mm_loadu_si128(reinterpret_cast(in + 32))); - d3 = _mm_xor_si128(_mm_aesenclast_si128(d3, k14), _mm_loadu_si128(reinterpret_cast(in + 48))); - in += 64; - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), d0); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16), d1); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32), d2); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48), d3); - out += 64; - } while (likely(in != eof64)); - - } - - skip_conventional_aesni_64: - while (len >= 16) { - __m128i d0 = _mm_insert_epi64(dd, (long long)Utils::hton(c1++), 1); - d0 = _mm_xor_si128(d0, k0); - d0 = _mm_aesenc_si128(d0, k1); - d0 = _mm_aesenc_si128(d0, k2); - d0 = _mm_aesenc_si128(d0, k3); - d0 = _mm_aesenc_si128(d0, k4); - d0 = _mm_aesenc_si128(d0, k5); - d0 = _mm_aesenc_si128(d0, k6); - d0 = _mm_aesenc_si128(d0, k7); - d0 = _mm_aesenc_si128(d0, k8); - d0 = _mm_aesenc_si128(d0, k9); - d0 = _mm_aesenc_si128(d0, k10); - d0 = _mm_aesenc_si128(d0, k11); - d0 = _mm_aesenc_si128(d0, k12); - d0 = _mm_aesenc_si128(d0, k13); - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), _mm_xor_si128(_mm_aesenclast_si128(d0, k14), _mm_loadu_si128(reinterpret_cast(in)))); - in += 16; - len -= 16; - out += 16; - } - - // Any remaining input is placed in _out. This will be picked up and crypted - // on subsequent calls to crypt() or finish() as it'll mean _len will not be - // an even multiple of 16. 
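// Illustrative usage-level view of this contract, assuming a CTR object writing into an
// output buffer supplied when it was set up:
//
//     ctr.crypt(msg, 20);       // 16 bytes become ciphertext, 4 plaintext bytes are staged
//     ctr.crypt(msg + 20, 12);  // the staged block is completed and encrypted (4 + 12 = 16)
//     ctr.finish();             // any bytes still staged at the very end are handled here
//
// Staging works because _len is then not a multiple of 16, which the next crypt() call's
// prologue, or finish(), detects and completes.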
- for (unsigned int i = 0; i < len; ++i) - out[i] = in[i]; - - _ctr[1] = Utils::hton(c1); + p_aesNICrypt(in, out, len); return; } #endif // ZT_AES_AESNI #ifdef ZT_AES_NEON if (Utils::ARMCAP.aes) { - uint8x16_t dd = vrev32q_u8(vld1q_u8(reinterpret_cast(_ctr))); - const uint32x4_t one = {0,0,0,1}; - - uint8x16_t k0 = _aes._k.neon.ek[0]; - uint8x16_t k1 = _aes._k.neon.ek[1]; - uint8x16_t k2 = _aes._k.neon.ek[2]; - uint8x16_t k3 = _aes._k.neon.ek[3]; - uint8x16_t k4 = _aes._k.neon.ek[4]; - uint8x16_t k5 = _aes._k.neon.ek[5]; - uint8x16_t k6 = _aes._k.neon.ek[6]; - uint8x16_t k7 = _aes._k.neon.ek[7]; - uint8x16_t k8 = _aes._k.neon.ek[8]; - uint8x16_t k9 = _aes._k.neon.ek[9]; - uint8x16_t k10 = _aes._k.neon.ek[10]; - uint8x16_t k11 = _aes._k.neon.ek[11]; - uint8x16_t k12 = _aes._k.neon.ek[12]; - uint8x16_t k13 = _aes._k.neon.ek[13]; - uint8x16_t k14 = _aes._k.neon.ek[14]; - - unsigned int totalLen = _len; - if ((totalLen & 15U)) { - for (;;) { - if (unlikely(!len)) { - vst1q_u8(reinterpret_cast(_ctr), vrev32q_u8(dd)); - _len = totalLen; - return; - } - --len; - out[totalLen++] = *(in++); - if (!(totalLen & 15U)) { - uint8_t *const otmp = out + (totalLen - 16); - uint8x16_t d0 = vrev32q_u8(dd); - uint8x16_t pt = vld1q_u8(otmp); - d0 = vaesmcq_u8(vaeseq_u8(d0, k0)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k1)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k2)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k3)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k4)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k5)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k6)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k7)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k8)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k9)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k10)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k11)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k12)); - d0 = veorq_u8(vaeseq_u8(d0, k13), k14); - vst1q_u8(otmp, veorq_u8(pt, d0)); - dd = (uint8x16_t)vaddq_u32((uint32x4_t)dd, one); - break; - } - } - } - - out += totalLen; - _len = totalLen + len; - - if (likely(len >= 64)) { - const uint32x4_t four = vshlq_n_u32(one, 2); - uint8x16_t dd1 = (uint8x16_t)vaddq_u32((uint32x4_t)dd, one); - uint8x16_t dd2 = (uint8x16_t)vaddq_u32((uint32x4_t)dd1, one); - uint8x16_t dd3 = (uint8x16_t)vaddq_u32((uint32x4_t)dd2, one); - for (;;) { - len -= 64; - uint8x16_t d0 = vrev32q_u8(dd); - uint8x16_t d1 = vrev32q_u8(dd1); - uint8x16_t d2 = vrev32q_u8(dd2); - uint8x16_t d3 = vrev32q_u8(dd3); - uint8x16_t pt0 = vld1q_u8(in); - in += 16; - d0 = vaesmcq_u8(vaeseq_u8(d0, k0)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k0)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k0)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k0)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k1)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k1)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k1)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k1)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k2)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k2)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k2)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k2)); - uint8x16_t pt1 = vld1q_u8(in); - in += 16; - d0 = vaesmcq_u8(vaeseq_u8(d0, k3)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k3)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k3)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k3)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k4)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k4)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k4)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k4)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k5)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k5)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k5)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k5)); - uint8x16_t pt2 = vld1q_u8(in); - in += 16; - d0 = vaesmcq_u8(vaeseq_u8(d0, k6)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k6)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k6)); - d3 = 
vaesmcq_u8(vaeseq_u8(d3, k6)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k7)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k7)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k7)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k7)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k8)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k8)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k8)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k8)); - uint8x16_t pt3 = vld1q_u8(in); - in += 16; - d0 = vaesmcq_u8(vaeseq_u8(d0, k9)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k9)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k9)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k9)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k10)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k10)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k10)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k10)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k11)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k11)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k11)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k11)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k12)); - d1 = vaesmcq_u8(vaeseq_u8(d1, k12)); - d2 = vaesmcq_u8(vaeseq_u8(d2, k12)); - d3 = vaesmcq_u8(vaeseq_u8(d3, k12)); - d0 = veorq_u8(vaeseq_u8(d0, k13), k14); - d1 = veorq_u8(vaeseq_u8(d1, k13), k14); - d2 = veorq_u8(vaeseq_u8(d2, k13), k14); - d3 = veorq_u8(vaeseq_u8(d3, k13), k14); - - d0 = veorq_u8(pt0, d0); - d1 = veorq_u8(pt1, d1); - d2 = veorq_u8(pt2, d2); - d3 = veorq_u8(pt3, d3); - - vst1q_u8(out, d0); - vst1q_u8(out + 16, d1); - vst1q_u8(out + 32, d2); - vst1q_u8(out + 48, d3); - out += 64; - - dd = (uint8x16_t)vaddq_u32((uint32x4_t)dd, four); - if (unlikely(len < 64)) - break; - dd1 = (uint8x16_t)vaddq_u32((uint32x4_t)dd1, four); - dd2 = (uint8x16_t)vaddq_u32((uint32x4_t)dd2, four); - dd3 = (uint8x16_t)vaddq_u32((uint32x4_t)dd3, four); - } - } - - while (len >= 16) { - len -= 16; - uint8x16_t d0 = vrev32q_u8(dd); - uint8x16_t pt = vld1q_u8(in); - in += 16; - dd = (uint8x16_t)vaddq_u32((uint32x4_t)dd, one); - d0 = vaesmcq_u8(vaeseq_u8(d0, k0)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k1)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k2)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k3)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k4)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k5)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k6)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k7)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k8)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k9)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k10)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k11)); - d0 = vaesmcq_u8(vaeseq_u8(d0, k12)); - d0 = veorq_u8(vaeseq_u8(d0, k13), k14); - vst1q_u8(out, veorq_u8(pt, d0)); - out += 16; - } - - // Any remaining input is placed in _out. This will be picked up and crypted - // on subsequent calls to crypt() or finish() as it'll mean _len will not be - // an even multiple of 16. 
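// Illustrative note on the counter handling used throughout this NEON branch: dd holds _ctr
// with the bytes of each 32-bit word reversed (vrev32q_u8), so bumping the big-endian 32-bit
// block counter is a plain lane add of {0,0,0,1}. The equivalent scalar update, as performed
// by the software fallback further below, is:
//
//     uint32_t *const ctr32 = reinterpret_cast<uint32_t *>(_ctr);
//     ctr32[3] = Utils::hton(Utils::ntoh(ctr32[3]) + 1U);   // advance the block counter
//
// Each keystream block is then vrev32q_u8(dd) again, restoring wire byte order before the
// AES rounds.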
- for (unsigned int i = 0; i < len; ++i) - out[i] = in[i]; - - vst1q_u8(reinterpret_cast(_ctr), vrev32q_u8(dd)); + p_armCrypt(in, out, len); return; } #endif // ZT_AES_NEON @@ -1003,7 +254,7 @@ void AES::CTR::crypt(const void *const input, unsigned int len) noexcept --len; out[totalLen++] = *(in++); if (!(totalLen & 15U)) { - _aes._encryptSW(reinterpret_cast(_ctr), reinterpret_cast(keyStream)); + _aes.p_encryptSW(reinterpret_cast(_ctr), reinterpret_cast(keyStream)); reinterpret_cast(_ctr)[3] = Utils::hton(++ctr); uint8_t *outblk = out + (totalLen - 16); for (int i = 0; i < 16; ++i) @@ -1017,7 +268,7 @@ void AES::CTR::crypt(const void *const input, unsigned int len) noexcept _len = (totalLen + len); if (likely(len >= 16)) { - const uint32_t *const restrict rk = _aes._k.sw.ek; + const uint32_t *const restrict rk = _aes.p_k.sw.ek; const uint32_t ctr0rk0 = Utils::ntoh(reinterpret_cast(_ctr)[0]) ^rk[0]; const uint32_t ctr1rk1 = Utils::ntoh(reinterpret_cast(_ctr)[1]) ^rk[1]; const uint32_t ctr2rk2 = Utils::ntoh(reinterpret_cast(_ctr)[2]) ^rk[2]; @@ -1238,9 +489,9 @@ const uint8_t AES::Td4[256] = {0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0 0xef, 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d}; const uint32_t AES::rcon[15] = {0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000, 0x6c000000, 0xd8000000, 0xab000000, 0x4d000000, 0x9a000000}; -void AES::_initSW(const uint8_t key[32]) noexcept +void AES::p_initSW(const uint8_t *key) noexcept { - uint32_t *rk = _k.sw.ek; + uint32_t *rk = p_k.sw.ek; rk[0] = Utils::loadBigEndian< uint32_t >(key); rk[1] = Utils::loadBigEndian< uint32_t >(key + 4); @@ -1266,13 +517,13 @@ void AES::_initSW(const uint8_t key[32]) noexcept rk += 8; } - _encryptSW((const uint8_t *)Utils::ZERO256, (uint8_t *)_k.sw.h); - _k.sw.h[0] = Utils::ntoh(_k.sw.h[0]); - _k.sw.h[1] = Utils::ntoh(_k.sw.h[1]); + p_encryptSW((const uint8_t *)Utils::ZERO256, (uint8_t *)p_k.sw.h); + p_k.sw.h[0] = Utils::ntoh(p_k.sw.h[0]); + p_k.sw.h[1] = Utils::ntoh(p_k.sw.h[1]); for (int i = 0; i < 60; ++i) - _k.sw.dk[i] = _k.sw.ek[i]; - rk = _k.sw.dk; + p_k.sw.dk[i] = p_k.sw.ek[i]; + rk = p_k.sw.dk; for (int i = 0, j = 56; i < j; i += 4, j -= 4) { uint32_t temp = rk[i]; @@ -1297,9 +548,9 @@ void AES::_initSW(const uint8_t key[32]) noexcept } } -void AES::_encryptSW(const uint8_t in[16], uint8_t out[16]) const noexcept +void AES::p_encryptSW(const uint8_t *in, uint8_t *out) const noexcept { - const uint32_t *const restrict rk = _k.sw.ek; + const uint32_t *const restrict rk = p_k.sw.ek; const uint32_t m8 = 0x000000ff; const uint32_t m8_8 = 0x0000ff00; const uint32_t m8_16 = 0x00ff0000; @@ -1373,9 +624,9 @@ void AES::_encryptSW(const uint8_t in[16], uint8_t out[16]) const noexcept Utils::storeBigEndian< uint32_t >(out + 12, s3); } -void AES::_decryptSW(const uint8_t in[16], uint8_t out[16]) const noexcept +void AES::p_decryptSW(const uint8_t *in, uint8_t *out) const noexcept { - const uint32_t *restrict rk = _k.sw.dk; + const uint32_t *restrict rk = p_k.sw.dk; const uint32_t m8 = 0x000000ff; uint32_t s0 = Utils::loadBigEndian< uint32_t >(in) ^rk[0]; uint32_t s1 = Utils::loadBigEndian< uint32_t >(in + 4) ^rk[1]; @@ -1446,229 +697,4 @@ void AES::_decryptSW(const uint8_t in[16], uint8_t out[16]) const noexcept Utils::storeBigEndian< uint32_t >(out + 12, s3); } -#ifdef ZT_AES_AESNI - -static __m128i 
_init256_1_aesni(__m128i a, __m128i b) noexcept -{ - __m128i x, y; - b = _mm_shuffle_epi32(b, 0xff); - y = _mm_slli_si128(a, 0x04); - x = _mm_xor_si128(a, y); - y = _mm_slli_si128(y, 0x04); - x = _mm_xor_si128(x, y); - y = _mm_slli_si128(y, 0x04); - x = _mm_xor_si128(x, y); - x = _mm_xor_si128(x, b); - return x; -} - -static __m128i _init256_2_aesni(__m128i a, __m128i b) noexcept -{ - __m128i x, y, z; - y = _mm_aeskeygenassist_si128(a, 0x00); - z = _mm_shuffle_epi32(y, 0xaa); - y = _mm_slli_si128(b, 0x04); - x = _mm_xor_si128(b, y); - y = _mm_slli_si128(y, 0x04); - x = _mm_xor_si128(x, y); - y = _mm_slli_si128(y, 0x04); - x = _mm_xor_si128(x, y); - x = _mm_xor_si128(x, z); - return x; -} - -__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2"))) -void AES::_init_aesni(const uint8_t key[32]) noexcept -{ - __m128i t1, t2, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13; - _k.ni.k[0] = t1 = _mm_loadu_si128((const __m128i *)key); - _k.ni.k[1] = k1 = t2 = _mm_loadu_si128((const __m128i *)(key + 16)); - _k.ni.k[2] = k2 = t1 = _init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x01)); - _k.ni.k[3] = k3 = t2 = _init256_2_aesni(t1, t2); - _k.ni.k[4] = k4 = t1 = _init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x02)); - _k.ni.k[5] = k5 = t2 = _init256_2_aesni(t1, t2); - _k.ni.k[6] = k6 = t1 = _init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x04)); - _k.ni.k[7] = k7 = t2 = _init256_2_aesni(t1, t2); - _k.ni.k[8] = k8 = t1 = _init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x08)); - _k.ni.k[9] = k9 = t2 = _init256_2_aesni(t1, t2); - _k.ni.k[10] = k10 = t1 = _init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x10)); - _k.ni.k[11] = k11 = t2 = _init256_2_aesni(t1, t2); - _k.ni.k[12] = k12 = t1 = _init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x20)); - _k.ni.k[13] = k13 = t2 = _init256_2_aesni(t1, t2); - _k.ni.k[14] = _init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x40)); - _k.ni.k[15] = _mm_aesimc_si128(k13); - _k.ni.k[16] = _mm_aesimc_si128(k12); - _k.ni.k[17] = _mm_aesimc_si128(k11); - _k.ni.k[18] = _mm_aesimc_si128(k10); - _k.ni.k[19] = _mm_aesimc_si128(k9); - _k.ni.k[20] = _mm_aesimc_si128(k8); - _k.ni.k[21] = _mm_aesimc_si128(k7); - _k.ni.k[22] = _mm_aesimc_si128(k6); - _k.ni.k[23] = _mm_aesimc_si128(k5); - _k.ni.k[24] = _mm_aesimc_si128(k4); - _k.ni.k[25] = _mm_aesimc_si128(k3); - _k.ni.k[26] = _mm_aesimc_si128(k2); - _k.ni.k[27] = _mm_aesimc_si128(k1); - - __m128i h = _k.ni.k[0]; // _mm_xor_si128(_mm_setzero_si128(),_k.ni.k[0]); - h = _mm_aesenc_si128(h, k1); - h = _mm_aesenc_si128(h, k2); - h = _mm_aesenc_si128(h, k3); - h = _mm_aesenc_si128(h, k4); - h = _mm_aesenc_si128(h, k5); - h = _mm_aesenc_si128(h, k6); - h = _mm_aesenc_si128(h, k7); - h = _mm_aesenc_si128(h, k8); - h = _mm_aesenc_si128(h, k9); - h = _mm_aesenc_si128(h, k10); - h = _mm_aesenc_si128(h, k11); - h = _mm_aesenc_si128(h, k12); - h = _mm_aesenc_si128(h, k13); - h = _mm_aesenclast_si128(h, _k.ni.k[14]); - __m128i hswap = _mm_shuffle_epi8(h, s_sseSwapBytes); - __m128i hh = p_gmacPCLMUL128(hswap, h); - __m128i hhh = p_gmacPCLMUL128(hswap, hh); - __m128i hhhh = p_gmacPCLMUL128(hswap, hhh); - _k.ni.h[0] = hswap; - _k.ni.h[1] = hh = _mm_shuffle_epi8(hh, s_sseSwapBytes); - _k.ni.h[2] = hhh = _mm_shuffle_epi8(hhh, s_sseSwapBytes); - _k.ni.h[3] = hhhh = _mm_shuffle_epi8(hhhh, s_sseSwapBytes); - _k.ni.h2[0] = _mm_xor_si128(_mm_shuffle_epi32(hswap, 78), hswap); - _k.ni.h2[1] = _mm_xor_si128(_mm_shuffle_epi32(hh, 78), hh); - _k.ni.h2[2] = _mm_xor_si128(_mm_shuffle_epi32(hhh, 78), hhh); - _k.ni.h2[3] = 
_mm_xor_si128(_mm_shuffle_epi32(hhhh, 78), hhhh); -} - -void AES::_encrypt_aesni(const void *const in, void *const out) const noexcept -{ - __m128i tmp = _mm_loadu_si128((const __m128i *)in); - tmp = _mm_xor_si128(tmp, _k.ni.k[0]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[1]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[2]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[3]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[4]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[5]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[6]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[7]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[8]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[9]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[10]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[11]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[12]); - tmp = _mm_aesenc_si128(tmp, _k.ni.k[13]); - _mm_storeu_si128((__m128i *)out, _mm_aesenclast_si128(tmp, _k.ni.k[14])); -} - -void AES::_decrypt_aesni(const void *in, void *out) const noexcept -{ - __m128i tmp = _mm_loadu_si128((const __m128i *)in); - tmp = _mm_xor_si128(tmp, _k.ni.k[14]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[15]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[16]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[17]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[18]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[19]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[20]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[21]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[22]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[23]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[24]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[25]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[26]); - tmp = _mm_aesdec_si128(tmp, _k.ni.k[27]); - _mm_storeu_si128((__m128i *)out, _mm_aesdeclast_si128(tmp, _k.ni.k[0])); -} - -#endif // ZT_AES_AESNI - -#ifdef ZT_AES_NEON - -#define ZT_INIT_ARMNEON_CRYPTO_SUBWORD(w) ((uint32_t)s_sbox[w & 0xffU] + ((uint32_t)s_sbox[(w >> 8U) & 0xffU] << 8U) + ((uint32_t)s_sbox[(w >> 16U) & 0xffU] << 16U) + ((uint32_t)s_sbox[(w >> 24U) & 0xffU] << 24U)) -#define ZT_INIT_ARMNEON_CRYPTO_ROTWORD(w) (((w) << 8U) | ((w) >> 24U)) -#define ZT_INIT_ARMNEON_CRYPTO_NK 8 -#define ZT_INIT_ARMNEON_CRYPTO_NB 4 -#define ZT_INIT_ARMNEON_CRYPTO_NR 14 - -void AES::_init_armneon_crypto(const uint8_t key[32]) noexcept -{ - static const uint8_t s_sbox[256] = {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, - 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, - 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 
0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; - - uint64_t h[2]; - uint32_t *const w = reinterpret_cast(_k.neon.ek); - - for (unsigned int i=0;i(&(_k.neon.h), h); - _k.neon.h = vrbitq_u8(_k.neon.h); - _k.sw.h[0] = Utils::ntoh(h[0]); - _k.sw.h[1] = Utils::ntoh(h[1]); -} - -void AES::_encrypt_armneon_crypto(const void *const in, void *const out) const noexcept -{ - uint8x16_t tmp = vld1q_u8(reinterpret_cast(in)); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[0])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[1])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[2])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[3])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[4])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[5])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[6])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[7])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[8])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[9])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[10])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[11])); - tmp = vaesmcq_u8(vaeseq_u8(tmp, _k.neon.ek[12])); - tmp = veorq_u8(vaeseq_u8(tmp, _k.neon.ek[13]), _k.neon.ek[14]); - vst1q_u8(reinterpret_cast(out), tmp); -} - -void AES::_decrypt_armneon_crypto(const void *const in, void *const out) const noexcept -{ - uint8x16_t tmp = vld1q_u8(reinterpret_cast(in)); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[0])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[1])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[2])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[3])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[4])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[5])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[6])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[7])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[8])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[9])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[10])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[11])); - tmp = vaesimcq_u8(vaesdq_u8(tmp, _k.neon.dk[12])); - tmp = veorq_u8(vaesdq_u8(tmp, _k.neon.dk[13]), _k.neon.dk[14]); - vst1q_u8(reinterpret_cast(out), tmp); -} - -#endif // ZT_AES_NEON - } // namespace ZeroTier diff --git a/node/AES.hpp b/node/AES.hpp index ba9e07497..dc0130adf 100644 --- a/node/AES.hpp +++ b/node/AES.hpp @@ -18,6 +18,7 @@ #include "Utils.hpp" #include "SHA512.hpp" +// Uncomment to disable all hardware acceleration (usually for testing) //#define ZT_AES_NO_ACCEL #if !defined(ZT_AES_NO_ACCEL) && defined(ZT_ARCH_X64) @@ -73,7 +74,7 @@ public: { this->init(key); } ZT_INLINE ~AES() - { Utils::burn(&_k, sizeof(_k)); } + { Utils::burn(&p_k, sizeof(p_k)); } /** * Set (or re-set) this AES256 cipher's key @@ -84,17 +85,17 @@ public: { #ifdef ZT_AES_AESNI if (likely(Utils::CPUID.aes)) { - _init_aesni(reinterpret_cast(key)); + p_init_aesni(reinterpret_cast(key)); return; } #endif #ifdef ZT_AES_NEON if (Utils::ARMCAP.aes) { - _init_armneon_crypto(reinterpret_cast(key)); + p_init_armneon_crypto(reinterpret_cast(key)); return; } #endif - _initSW(reinterpret_cast(key)); + p_initSW(reinterpret_cast(key)); } /** @@ -107,17 +108,17 @@ public: { #ifdef ZT_AES_AESNI if (likely(Utils::CPUID.aes)) { - _encrypt_aesni(in, out); + p_encrypt_aesni(in, out); return; } #endif #ifdef ZT_AES_NEON if (Utils::ARMCAP.aes) { - _encrypt_armneon_crypto(in, out); + p_encrypt_armneon_crypto(in, out); 
return; } #endif - _encryptSW(reinterpret_cast(in), reinterpret_cast(out)); + p_encryptSW(reinterpret_cast(in), reinterpret_cast(out)); } /** @@ -130,17 +131,17 @@ public: { #ifdef ZT_AES_AESNI if (likely(Utils::CPUID.aes)) { - _decrypt_aesni(in, out); + p_decrypt_aesni(in, out); return; } #endif #ifdef ZT_AES_NEON if (Utils::ARMCAP.aes) { - _decrypt_armneon_crypto(in, out); + p_decrypt_armneon_crypto(in, out); return; } #endif - _decryptSW(reinterpret_cast(in), reinterpret_cast(out)); + p_decryptSW(reinterpret_cast(in), reinterpret_cast(out)); } class GMACSIVEncryptor; @@ -225,6 +226,14 @@ public: void finish(uint8_t tag[16]) noexcept; private: +#ifdef ZT_AES_AESNI + void p_aesNIUpdate(const uint8_t *in, unsigned int len) noexcept; + void p_aesNIFinish(uint8_t tag[16]) noexcept; +#endif +#ifdef ZT_AES_NEON + void p_armUpdate(const uint8_t *in, unsigned int len) noexcept; + void p_armFinish(uint8_t tag[16]) noexcept; +#endif const AES &_aes; unsigned int _rp; unsigned int _len; @@ -292,6 +301,12 @@ public: void finish() noexcept; private: +#ifdef ZT_AES_AESNI + void p_aesNICrypt(const uint8_t *in, uint8_t *out, unsigned int len) noexcept; +#endif +#ifdef ZT_AES_NEON + void p_armCrypt(const uint8_t *in, uint8_t *out, unsigned int len) noexcept; +#endif const AES &_aes; uint64_t _ctr[2]; uint8_t *_out; @@ -318,7 +333,7 @@ public: * @param k0 First of two AES instances keyed with K0 * @param k1 Second of two AES instances keyed with K1 */ - ZT_INLINE GMACSIVEncryptor(const AES &k0, const AES &k1) noexcept: + ZT_INLINE GMACSIVEncryptor(const AES &k0, const AES &k1) noexcept : _gmac(k0), _ctr(k1) {} @@ -528,9 +543,9 @@ private: static const uint8_t Td4[256]; static const uint32_t rcon[15]; - void _initSW(const uint8_t key[32]) noexcept; - void _encryptSW(const uint8_t in[16], uint8_t out[16]) const noexcept; - void _decryptSW(const uint8_t in[16], uint8_t out[16]) const noexcept; + void p_initSW(const uint8_t *key) noexcept; + void p_encryptSW(const uint8_t *in, uint8_t *out) const noexcept; + void p_decryptSW(const uint8_t *in, uint8_t *out) const noexcept; union { @@ -559,18 +574,18 @@ private: uint32_t ek[60]; uint32_t dk[60]; } sw; - } _k; + } p_k; #ifdef ZT_AES_AESNI - void _init_aesni(const uint8_t key[32]) noexcept; - void _encrypt_aesni(const void *in, void *out) const noexcept; - void _decrypt_aesni(const void *in, void *out) const noexcept; + void p_init_aesni(const uint8_t *key) noexcept; + void p_encrypt_aesni(const void *in, void *out) const noexcept; + void p_decrypt_aesni(const void *in, void *out) const noexcept; #endif #ifdef ZT_AES_NEON - void _init_armneon_crypto(const uint8_t key[32]) noexcept; - void _encrypt_armneon_crypto(const void *in, void *out) const noexcept; - void _decrypt_armneon_crypto(const void *in, void *out) const noexcept; + void p_init_armneon_crypto(const uint8_t *key) noexcept; + void p_encrypt_armneon_crypto(const void *in, void *out) const noexcept; + void p_decrypt_armneon_crypto(const void *in, void *out) const noexcept; #endif }; diff --git a/node/AES_aesni.cpp b/node/AES_aesni.cpp new file mode 100644 index 000000000..6990b6786 --- /dev/null +++ b/node/AES_aesni.cpp @@ -0,0 +1,651 @@ +/* + * Copyright (c)2013-2020 ZeroTier, Inc. + * + * Use of this software is governed by the Business Source License included + * in the LICENSE.TXT file in the project's root directory. 
+ * + * Change Date: 2025-01-01 + * + * On the date above, in accordance with the Business Source License, use + * of this software will be governed by version 2.0 of the Apache License. + */ +/****/ + +#include "Constants.hpp" +#include "AES.hpp" + +#ifdef ZT_AES_AESNI + +#ifdef __GNUC__ +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif + +namespace ZeroTier { + +namespace { + +const __m128i s_sseSwapBytes = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + +__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,pclmul"))) +__m128i p_gmacPCLMUL128(const __m128i h, __m128i y) noexcept +{ + y = _mm_shuffle_epi8(y, s_sseSwapBytes); + __m128i t1 = _mm_clmulepi64_si128(h, y, 0x00); + __m128i t2 = _mm_clmulepi64_si128(h, y, 0x01); + __m128i t3 = _mm_clmulepi64_si128(h, y, 0x10); + __m128i t4 = _mm_clmulepi64_si128(h, y, 0x11); + t2 = _mm_xor_si128(t2, t3); + t3 = _mm_slli_si128(t2, 8); + t2 = _mm_srli_si128(t2, 8); + t1 = _mm_xor_si128(t1, t3); + t4 = _mm_xor_si128(t4, t2); + __m128i t5 = _mm_srli_epi32(t1, 31); + t1 = _mm_or_si128(_mm_slli_epi32(t1, 1), _mm_slli_si128(t5, 4)); + t4 = _mm_or_si128(_mm_or_si128(_mm_slli_epi32(t4, 1), _mm_slli_si128(_mm_srli_epi32(t4, 31), 4)), _mm_srli_si128(t5, 12)); + t5 = _mm_xor_si128(_mm_xor_si128(_mm_slli_epi32(t1, 31), _mm_slli_epi32(t1, 30)), _mm_slli_epi32(t1, 25)); + t1 = _mm_xor_si128(t1, _mm_slli_si128(t5, 12)); + t4 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(t4, _mm_srli_si128(t5, 4)), t1), _mm_srli_epi32(t1, 2)), _mm_srli_epi32(t1, 7)), _mm_srli_epi32(t1, 1)); + return _mm_shuffle_epi8(t4, s_sseSwapBytes); +} + +/* Disable VAES stuff on compilers too old to compile these intrinsics, + * and MinGW64 also seems not to support them so disable on Windows. + * The performance gain can be significant but regular SSE is already so + * fast it's highly unlikely to be a rate limiting factor except on massive + * servers and network infrastructure stuff. 
*/ +#if !defined(__WINDOWS__) && ((__GNUC__ >= 8) || (__clang_major__ >= 7)) + +#define ZT_AES_VAES512 1 + +__attribute__((__target__("sse4,aes,avx,avx2,vaes,avx512f,avx512bw"))) +void p_aesCtrInnerVAES512(unsigned int &len, const uint64_t c0, uint64_t &c1, const uint8_t *&in, uint8_t *&out, const __m128i *const k) noexcept +{ + const __m512i kk0 = _mm512_broadcast_i32x4(k[0]); + const __m512i kk1 = _mm512_broadcast_i32x4(k[1]); + const __m512i kk2 = _mm512_broadcast_i32x4(k[2]); + const __m512i kk3 = _mm512_broadcast_i32x4(k[3]); + const __m512i kk4 = _mm512_broadcast_i32x4(k[4]); + const __m512i kk5 = _mm512_broadcast_i32x4(k[5]); + const __m512i kk6 = _mm512_broadcast_i32x4(k[6]); + const __m512i kk7 = _mm512_broadcast_i32x4(k[7]); + const __m512i kk8 = _mm512_broadcast_i32x4(k[8]); + const __m512i kk9 = _mm512_broadcast_i32x4(k[9]); + const __m512i kk10 = _mm512_broadcast_i32x4(k[10]); + const __m512i kk11 = _mm512_broadcast_i32x4(k[11]); + const __m512i kk12 = _mm512_broadcast_i32x4(k[12]); + const __m512i kk13 = _mm512_broadcast_i32x4(k[13]); + const __m512i kk14 = _mm512_broadcast_i32x4(k[14]); + do { + __m512i p0 = _mm512_loadu_si512(reinterpret_cast(in)); + __m512i d0 = _mm512_set_epi64( + (long long)Utils::hton(c1 + 3ULL), (long long)c0, + (long long)Utils::hton(c1 + 2ULL), (long long)c0, + (long long)Utils::hton(c1 + 1ULL), (long long)c0, + (long long)Utils::hton(c1), (long long)c0); + c1 += 4; + in += 64; + len -= 64; + d0 = _mm512_xor_si512(d0, kk0); + d0 = _mm512_aesenc_epi128(d0, kk1); + d0 = _mm512_aesenc_epi128(d0, kk2); + d0 = _mm512_aesenc_epi128(d0, kk3); + d0 = _mm512_aesenc_epi128(d0, kk4); + d0 = _mm512_aesenc_epi128(d0, kk5); + d0 = _mm512_aesenc_epi128(d0, kk6); + d0 = _mm512_aesenc_epi128(d0, kk7); + d0 = _mm512_aesenc_epi128(d0, kk8); + d0 = _mm512_aesenc_epi128(d0, kk9); + d0 = _mm512_aesenc_epi128(d0, kk10); + d0 = _mm512_aesenc_epi128(d0, kk11); + d0 = _mm512_aesenc_epi128(d0, kk12); + d0 = _mm512_aesenc_epi128(d0, kk13); + d0 = _mm512_aesenclast_epi128(d0, kk14); + _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), _mm512_xor_si512(p0, d0)); + out += 64; + } while (likely(len >= 64)); +} + +#define ZT_AES_VAES256 1 + +__attribute__((__target__("sse4,aes,avx,avx2,vaes"))) +void p_aesCtrInnerVAES256(unsigned int &len, const uint64_t c0, uint64_t &c1, const uint8_t *&in, uint8_t *&out, const __m128i *const k) noexcept +{ + const __m256i kk0 = _mm256_broadcastsi128_si256(k[0]); + const __m256i kk1 = _mm256_broadcastsi128_si256(k[1]); + const __m256i kk2 = _mm256_broadcastsi128_si256(k[2]); + const __m256i kk3 = _mm256_broadcastsi128_si256(k[3]); + const __m256i kk4 = _mm256_broadcastsi128_si256(k[4]); + const __m256i kk5 = _mm256_broadcastsi128_si256(k[5]); + const __m256i kk6 = _mm256_broadcastsi128_si256(k[6]); + const __m256i kk7 = _mm256_broadcastsi128_si256(k[7]); + const __m256i kk8 = _mm256_broadcastsi128_si256(k[8]); + const __m256i kk9 = _mm256_broadcastsi128_si256(k[9]); + const __m256i kk10 = _mm256_broadcastsi128_si256(k[10]); + const __m256i kk11 = _mm256_broadcastsi128_si256(k[11]); + const __m256i kk12 = _mm256_broadcastsi128_si256(k[12]); + const __m256i kk13 = _mm256_broadcastsi128_si256(k[13]); + const __m256i kk14 = _mm256_broadcastsi128_si256(k[14]); + do { + __m256i p0 = _mm256_loadu_si256(reinterpret_cast(in)); + __m256i p1 = _mm256_loadu_si256(reinterpret_cast(in + 32)); + __m256i d0 = _mm256_set_epi64x( + (long long)Utils::hton(c1 + 1ULL), (long long)c0, + (long long)Utils::hton(c1), (long long)c0); + __m256i d1 = _mm256_set_epi64x( + 
(long long)Utils::hton(c1 + 3ULL), (long long)c0, + (long long)Utils::hton(c1 + 2ULL), (long long)c0); + c1 += 4; + in += 64; + len -= 64; + d0 = _mm256_xor_si256(d0, kk0); + d1 = _mm256_xor_si256(d1, kk0); + d0 = _mm256_aesenc_epi128(d0, kk1); + d1 = _mm256_aesenc_epi128(d1, kk1); + d0 = _mm256_aesenc_epi128(d0, kk2); + d1 = _mm256_aesenc_epi128(d1, kk2); + d0 = _mm256_aesenc_epi128(d0, kk3); + d1 = _mm256_aesenc_epi128(d1, kk3); + d0 = _mm256_aesenc_epi128(d0, kk4); + d1 = _mm256_aesenc_epi128(d1, kk4); + d0 = _mm256_aesenc_epi128(d0, kk5); + d1 = _mm256_aesenc_epi128(d1, kk5); + d0 = _mm256_aesenc_epi128(d0, kk6); + d1 = _mm256_aesenc_epi128(d1, kk6); + d0 = _mm256_aesenc_epi128(d0, kk7); + d1 = _mm256_aesenc_epi128(d1, kk7); + d0 = _mm256_aesenc_epi128(d0, kk8); + d1 = _mm256_aesenc_epi128(d1, kk8); + d0 = _mm256_aesenc_epi128(d0, kk9); + d1 = _mm256_aesenc_epi128(d1, kk9); + d0 = _mm256_aesenc_epi128(d0, kk10); + d1 = _mm256_aesenc_epi128(d1, kk10); + d0 = _mm256_aesenc_epi128(d0, kk11); + d1 = _mm256_aesenc_epi128(d1, kk11); + d0 = _mm256_aesenc_epi128(d0, kk12); + d1 = _mm256_aesenc_epi128(d1, kk12); + d0 = _mm256_aesenc_epi128(d0, kk13); + d1 = _mm256_aesenc_epi128(d1, kk13); + d0 = _mm256_aesenclast_epi128(d0, kk14); + d1 = _mm256_aesenclast_epi128(d1, kk14); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), _mm256_xor_si256(d0, p0)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out + 32), _mm256_xor_si256(d1, p1)); + out += 64; + } while (likely(len >= 64)); +} + +#endif // does compiler support AVX2 and AVX512 AES intrinsics? + +__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +__m128i p_init256_1_aesni(__m128i a, __m128i b) noexcept +{ + __m128i x, y; + b = _mm_shuffle_epi32(b, 0xff); + y = _mm_slli_si128(a, 0x04); + x = _mm_xor_si128(a, y); + y = _mm_slli_si128(y, 0x04); + x = _mm_xor_si128(x, y); + y = _mm_slli_si128(y, 0x04); + x = _mm_xor_si128(x, y); + x = _mm_xor_si128(x, b); + return x; +} + +__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +__m128i p_init256_2_aesni(__m128i a, __m128i b) noexcept +{ + __m128i x, y, z; + y = _mm_aeskeygenassist_si128(a, 0x00); + z = _mm_shuffle_epi32(y, 0xaa); + y = _mm_slli_si128(b, 0x04); + x = _mm_xor_si128(b, y); + y = _mm_slli_si128(y, 0x04); + x = _mm_xor_si128(x, y); + y = _mm_slli_si128(y, 0x04); + x = _mm_xor_si128(x, y); + x = _mm_xor_si128(x, z); + return x; +} + +} // anonymous namespace + +__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,pclmul"))) +void AES::GMAC::p_aesNIUpdate(const uint8_t *in, unsigned int len) noexcept +{ + __m128i y = _mm_loadu_si128(reinterpret_cast(_y)); + + // Handle anything left over from a previous run that wasn't a multiple of 16 bytes. 
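// Illustrative stand-alone sketch of the residue-buffer contract used below; ghashBlock() is
// a hypothetical stand-in for "fold one 16-byte block into the GHASH accumulator".
static inline void gmacBufferedUpdate(uint8_t r[16], unsigned int &rp,
                                      const uint8_t *in, unsigned int len,
                                      void (*ghashBlock)(const uint8_t block[16]))
{
	// Top up a partially filled residue block first.
	while ((rp != 0) && (len != 0)) {
		r[rp++] = *(in++);
		--len;
		if (rp == 16) {
			ghashBlock(r);
			rp = 0;
		}
	}
	// Absorb whole blocks directly from the input.
	while (len >= 16) {
		ghashBlock(in);
		in += 16;
		len -= 16;
	}
	// Stash the tail (always < 16 bytes) for the next update() or finish().
	for (unsigned int i = 0; i < len; ++i)
		r[rp++] = in[i];
}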
+ if (_rp) { + for (;;) { + if (!len) + return; + --len; + _r[_rp++] = *(in++); + if (_rp == 16) { + y = p_gmacPCLMUL128(_aes.p_k.ni.h[0], _mm_xor_si128(y, _mm_loadu_si128(reinterpret_cast<__m128i *>(_r)))); + break; + } + } + } + + if (likely(len >= 64)) { + const __m128i sb = s_sseSwapBytes; + const __m128i h = _aes.p_k.ni.h[0]; + const __m128i hh = _aes.p_k.ni.h[1]; + const __m128i hhh = _aes.p_k.ni.h[2]; + const __m128i hhhh = _aes.p_k.ni.h[3]; + const __m128i h2 = _aes.p_k.ni.h2[0]; + const __m128i hh2 = _aes.p_k.ni.h2[1]; + const __m128i hhh2 = _aes.p_k.ni.h2[2]; + const __m128i hhhh2 = _aes.p_k.ni.h2[3]; + const uint8_t *const end64 = in + (len & ~((unsigned int)63)); + len &= 63U; + do { + __m128i d1 = _mm_shuffle_epi8(_mm_xor_si128(y, _mm_loadu_si128(reinterpret_cast(in))), sb); + __m128i d2 = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast(in + 16)), sb); + __m128i d3 = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast(in + 32)), sb); + __m128i d4 = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast(in + 48)), sb); + in += 64; + __m128i a = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(hhhh, d1, 0x00), _mm_clmulepi64_si128(hhh, d2, 0x00)), _mm_xor_si128(_mm_clmulepi64_si128(hh, d3, 0x00), _mm_clmulepi64_si128(h, d4, 0x00))); + __m128i b = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(hhhh, d1, 0x11), _mm_clmulepi64_si128(hhh, d2, 0x11)), _mm_xor_si128(_mm_clmulepi64_si128(hh, d3, 0x11), _mm_clmulepi64_si128(h, d4, 0x11))); + __m128i c = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(hhhh2, _mm_xor_si128(_mm_shuffle_epi32(d1, 78), d1), 0x00), _mm_clmulepi64_si128(hhh2, _mm_xor_si128(_mm_shuffle_epi32(d2, 78), d2), 0x00)), _mm_xor_si128(_mm_clmulepi64_si128(hh2, _mm_xor_si128(_mm_shuffle_epi32(d3, 78), d3), 0x00), _mm_clmulepi64_si128(h2, _mm_xor_si128(_mm_shuffle_epi32(d4, 78), d4), 0x00))), _mm_xor_si128(a, b)); + a = _mm_xor_si128(_mm_slli_si128(c, 8), a); + b = _mm_xor_si128(_mm_srli_si128(c, 8), b); + c = _mm_srli_epi32(a, 31); + a = _mm_or_si128(_mm_slli_epi32(a, 1), _mm_slli_si128(c, 4)); + b = _mm_or_si128(_mm_or_si128(_mm_slli_epi32(b, 1), _mm_slli_si128(_mm_srli_epi32(b, 31), 4)), _mm_srli_si128(c, 12)); + c = _mm_xor_si128(_mm_slli_epi32(a, 31), _mm_xor_si128(_mm_slli_epi32(a, 30), _mm_slli_epi32(a, 25))); + a = _mm_xor_si128(a, _mm_slli_si128(c, 12)); + b = _mm_xor_si128(b, _mm_xor_si128(a, _mm_xor_si128(_mm_xor_si128(_mm_srli_epi32(a, 1), _mm_srli_si128(c, 4)), _mm_xor_si128(_mm_srli_epi32(a, 2), _mm_srli_epi32(a, 7))))); + y = _mm_shuffle_epi8(b, sb); + } while (likely(in != end64)); + } + + while (len >= 16) { + y = p_gmacPCLMUL128(_aes.p_k.ni.h[0], _mm_xor_si128(y, _mm_loadu_si128(reinterpret_cast(in)))); + in += 16; + len -= 16; + } + + _mm_storeu_si128(reinterpret_cast<__m128i *>(_y), y); + + // Any overflow is cached for a later run or finish(). + for (unsigned int i = 0; i < len; ++i) + _r[i] = in[i]; + _rp = len; // len is always less than 16 here +} + +__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,pclmul"))) +void AES::GMAC::p_aesNIFinish(uint8_t tag[16]) noexcept +{ + __m128i y = _mm_loadu_si128(reinterpret_cast(_y)); + + // Handle any remaining bytes, padding the last block with zeroes. + if (_rp) { + while (_rp < 16) + _r[_rp++] = 0; + y = p_gmacPCLMUL128(_aes.p_k.ni.h[0], _mm_xor_si128(y, _mm_loadu_si128(reinterpret_cast<__m128i *>(_r)))); + } + + // Interleave encryption of IV with the final GHASH of y XOR (length * 8). + // Then XOR these together to get the final tag. 
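// Illustrative reference formulation of what the interleaved code below computes. The
// helpers ghashMultiply() and aesEncryptBlock() are hypothetical stand-ins for the GHASH
// multiply-by-H and AES-256 block-encrypt primitives.
static inline void gmacTagReference(uint8_t y[16],                 // GHASH accumulator so far
                                    uint64_t dataBitLen,           // total authenticated bits
                                    const uint8_t iv[12],          // 96-bit nonce
                                    uint8_t tag[16],
                                    void (*ghashMultiply)(uint8_t y[16]),
                                    void (*aesEncryptBlock)(const uint8_t in[16], uint8_t out[16]))
{
	// Fold in the GHASH length block: [bitlen(data)]_64 || [0]_64, big-endian.
	for (unsigned int i = 0; i < 8; ++i)
		y[i] ^= (uint8_t)(dataBitLen >> (56U - (i * 8U)));
	ghashMultiply(y);                                              // y = y * H in GF(2^128)

	// Encrypt J0 = IV || 0x00000001 (the usual 96-bit-IV GCM layout).
	uint8_t j0[16], ek[16];
	for (unsigned int i = 0; i < 12; ++i) j0[i] = iv[i];
	j0[12] = 0; j0[13] = 0; j0[14] = 0; j0[15] = 1;
	aesEncryptBlock(j0, ek);

	// tag = E_K(J0) XOR GHASH result.
	for (unsigned int i = 0; i < 16; ++i)
		tag[i] = y[i] ^ ek[i];
}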
+ const __m128i *const k = _aes.p_k.ni.k; + const __m128i h = _aes.p_k.ni.h[0]; + y = _mm_xor_si128(y, _mm_set_epi64x(0LL, (long long)Utils::hton((uint64_t)_len << 3U))); + y = _mm_shuffle_epi8(y, s_sseSwapBytes); + __m128i encIV = _mm_xor_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(_iv)), k[0]); + __m128i t1 = _mm_clmulepi64_si128(h, y, 0x00); + __m128i t2 = _mm_clmulepi64_si128(h, y, 0x01); + __m128i t3 = _mm_clmulepi64_si128(h, y, 0x10); + __m128i t4 = _mm_clmulepi64_si128(h, y, 0x11); + encIV = _mm_aesenc_si128(encIV, k[1]); + t2 = _mm_xor_si128(t2, t3); + t3 = _mm_slli_si128(t2, 8); + encIV = _mm_aesenc_si128(encIV, k[2]); + t2 = _mm_srli_si128(t2, 8); + t1 = _mm_xor_si128(t1, t3); + encIV = _mm_aesenc_si128(encIV, k[3]); + t4 = _mm_xor_si128(t4, t2); + __m128i t5 = _mm_srli_epi32(t1, 31); + t1 = _mm_slli_epi32(t1, 1); + __m128i t6 = _mm_srli_epi32(t4, 31); + encIV = _mm_aesenc_si128(encIV, k[4]); + t4 = _mm_slli_epi32(t4, 1); + t3 = _mm_srli_si128(t5, 12); + encIV = _mm_aesenc_si128(encIV, k[5]); + t6 = _mm_slli_si128(t6, 4); + t5 = _mm_slli_si128(t5, 4); + encIV = _mm_aesenc_si128(encIV, k[6]); + t1 = _mm_or_si128(t1, t5); + t4 = _mm_or_si128(t4, t6); + encIV = _mm_aesenc_si128(encIV, k[7]); + t4 = _mm_or_si128(t4, t3); + t5 = _mm_slli_epi32(t1, 31); + encIV = _mm_aesenc_si128(encIV, k[8]); + t6 = _mm_slli_epi32(t1, 30); + t3 = _mm_slli_epi32(t1, 25); + encIV = _mm_aesenc_si128(encIV, k[9]); + t5 = _mm_xor_si128(t5, t6); + t5 = _mm_xor_si128(t5, t3); + encIV = _mm_aesenc_si128(encIV, k[10]); + t6 = _mm_srli_si128(t5, 4); + t4 = _mm_xor_si128(t4, t6); + encIV = _mm_aesenc_si128(encIV, k[11]); + t5 = _mm_slli_si128(t5, 12); + t1 = _mm_xor_si128(t1, t5); + t4 = _mm_xor_si128(t4, t1); + t5 = _mm_srli_epi32(t1, 1); + encIV = _mm_aesenc_si128(encIV, k[12]); + t2 = _mm_srli_epi32(t1, 2); + t3 = _mm_srli_epi32(t1, 7); + encIV = _mm_aesenc_si128(encIV, k[13]); + t4 = _mm_xor_si128(t4, t2); + t4 = _mm_xor_si128(t4, t3); + encIV = _mm_aesenclast_si128(encIV, k[14]); + t4 = _mm_xor_si128(t4, t5); + _mm_storeu_si128(reinterpret_cast<__m128i *>(tag), _mm_xor_si128(_mm_shuffle_epi8(t4, s_sseSwapBytes), encIV)); +} + +__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes"))) +void AES::CTR::p_aesNICrypt(const uint8_t *in, uint8_t *out, unsigned int len) noexcept +{ + const __m128i dd = _mm_set_epi64x(0, (long long)_ctr[0]); + uint64_t c1 = Utils::ntoh(_ctr[1]); + + const __m128i *const k = _aes.p_k.ni.k; + const __m128i k0 = k[0]; + const __m128i k1 = k[1]; + const __m128i k2 = k[2]; + const __m128i k3 = k[3]; + const __m128i k4 = k[4]; + const __m128i k5 = k[5]; + const __m128i k6 = k[6]; + const __m128i k7 = k[7]; + const __m128i k8 = k[8]; + const __m128i k9 = k[9]; + const __m128i k10 = k[10]; + const __m128i k11 = k[11]; + const __m128i k12 = k[12]; + const __m128i k13 = k[13]; + const __m128i k14 = k[14]; + + // Complete any unfinished blocks from previous calls to crypt().
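+ // Keystream layout assumed here: _ctr[0] stays fixed for the whole message while c1 is a big-endian per-block counter carried in the second 64-bit lane and bumped once per 16-byte block; a partially filled block left over from an earlier call is finished first so the bulk loops below only ever see whole blocks.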
+ unsigned int totalLen = _len; + if ((totalLen & 15U)) { + for (;;) { + if (unlikely(!len)) { + _ctr[1] = Utils::hton(c1); + _len = totalLen; + return; + } + --len; + out[totalLen++] = *(in++); + if (!(totalLen & 15U)) { + __m128i d0 = _mm_insert_epi64(dd, (long long)Utils::hton(c1++), 1); + d0 = _mm_xor_si128(d0, k0); + d0 = _mm_aesenc_si128(d0, k1); + d0 = _mm_aesenc_si128(d0, k2); + d0 = _mm_aesenc_si128(d0, k3); + d0 = _mm_aesenc_si128(d0, k4); + d0 = _mm_aesenc_si128(d0, k5); + d0 = _mm_aesenc_si128(d0, k6); + d0 = _mm_aesenc_si128(d0, k7); + d0 = _mm_aesenc_si128(d0, k8); + d0 = _mm_aesenc_si128(d0, k9); + d0 = _mm_aesenc_si128(d0, k10); + __m128i *const outblk = reinterpret_cast<__m128i *>(out + (totalLen - 16)); + d0 = _mm_aesenc_si128(d0, k11); + const __m128i p0 = _mm_loadu_si128(outblk); + d0 = _mm_aesenc_si128(d0, k12); + d0 = _mm_aesenc_si128(d0, k13); + d0 = _mm_aesenclast_si128(d0, k14); + _mm_storeu_si128(outblk, _mm_xor_si128(p0, d0)); + break; + } + } + } + + out += totalLen; + _len = totalLen + len; + + if (likely(len >= 64)) { + +#if defined(ZT_AES_VAES512) && defined(ZT_AES_VAES256) + if (Utils::CPUID.vaes && (len >= 256)) { + if (Utils::CPUID.avx512f) { + p_aesCtrInnerVAES512(len, _ctr[0], c1, in, out, k); + } else { + p_aesCtrInnerVAES256(len, _ctr[0], c1, in, out, k); + } + goto skip_conventional_aesni_64; + } +#endif + +#if !defined(ZT_AES_VAES512) && defined(ZT_AES_VAES256) + if (Utils::CPUID.vaes && (len >= 256)) { + p_aesCtrInnerVAES256(len, _ctr[0], c1, in, out, k); + goto skip_conventional_aesni_64; + } +#endif + + const uint8_t *const eof64 = in + (len & ~((unsigned int)63)); + len &= 63; + __m128i d0, d1, d2, d3; + do { + const uint64_t c10 = Utils::hton(c1); + const uint64_t c11 = Utils::hton(c1 + 1ULL); + const uint64_t c12 = Utils::hton(c1 + 2ULL); + const uint64_t c13 = Utils::hton(c1 + 3ULL); + d0 = _mm_insert_epi64(dd, (long long)c10, 1); + d1 = _mm_insert_epi64(dd, (long long)c11, 1); + d2 = _mm_insert_epi64(dd, (long long)c12, 1); + d3 = _mm_insert_epi64(dd, (long long)c13, 1); + c1 += 4; + d0 = _mm_xor_si128(d0, k0); + d1 = _mm_xor_si128(d1, k0); + d2 = _mm_xor_si128(d2, k0); + d3 = _mm_xor_si128(d3, k0); + d0 = _mm_aesenc_si128(d0, k1); + d1 = _mm_aesenc_si128(d1, k1); + d2 = _mm_aesenc_si128(d2, k1); + d3 = _mm_aesenc_si128(d3, k1); + d0 = _mm_aesenc_si128(d0, k2); + d1 = _mm_aesenc_si128(d1, k2); + d2 = _mm_aesenc_si128(d2, k2); + d3 = _mm_aesenc_si128(d3, k2); + d0 = _mm_aesenc_si128(d0, k3); + d1 = _mm_aesenc_si128(d1, k3); + d2 = _mm_aesenc_si128(d2, k3); + d3 = _mm_aesenc_si128(d3, k3); + d0 = _mm_aesenc_si128(d0, k4); + d1 = _mm_aesenc_si128(d1, k4); + d2 = _mm_aesenc_si128(d2, k4); + d3 = _mm_aesenc_si128(d3, k4); + d0 = _mm_aesenc_si128(d0, k5); + d1 = _mm_aesenc_si128(d1, k5); + d2 = _mm_aesenc_si128(d2, k5); + d3 = _mm_aesenc_si128(d3, k5); + d0 = _mm_aesenc_si128(d0, k6); + d1 = _mm_aesenc_si128(d1, k6); + d2 = _mm_aesenc_si128(d2, k6); + d3 = _mm_aesenc_si128(d3, k6); + d0 = _mm_aesenc_si128(d0, k7); + d1 = _mm_aesenc_si128(d1, k7); + d2 = _mm_aesenc_si128(d2, k7); + d3 = _mm_aesenc_si128(d3, k7); + d0 = _mm_aesenc_si128(d0, k8); + d1 = _mm_aesenc_si128(d1, k8); + d2 = _mm_aesenc_si128(d2, k8); + d3 = _mm_aesenc_si128(d3, k8); + d0 = _mm_aesenc_si128(d0, k9); + d1 = _mm_aesenc_si128(d1, k9); + d2 = _mm_aesenc_si128(d2, k9); + d3 = _mm_aesenc_si128(d3, k9); + d0 = _mm_aesenc_si128(d0, k10); + d1 = _mm_aesenc_si128(d1, k10); + d2 = _mm_aesenc_si128(d2, k10); + d3 = _mm_aesenc_si128(d3, k10); + d0 = _mm_aesenc_si128(d0, k11); + d1 = 
_mm_aesenc_si128(d1, k11); + d2 = _mm_aesenc_si128(d2, k11); + d3 = _mm_aesenc_si128(d3, k11); + d0 = _mm_aesenc_si128(d0, k12); + d1 = _mm_aesenc_si128(d1, k12); + d2 = _mm_aesenc_si128(d2, k12); + d3 = _mm_aesenc_si128(d3, k12); + d0 = _mm_aesenc_si128(d0, k13); + d1 = _mm_aesenc_si128(d1, k13); + d2 = _mm_aesenc_si128(d2, k13); + d3 = _mm_aesenc_si128(d3, k13); + d0 = _mm_xor_si128(_mm_aesenclast_si128(d0, k14), _mm_loadu_si128(reinterpret_cast<const __m128i *>(in))); + d1 = _mm_xor_si128(_mm_aesenclast_si128(d1, k14), _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 16))); + d2 = _mm_xor_si128(_mm_aesenclast_si128(d2, k14), _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 32))); + d3 = _mm_xor_si128(_mm_aesenclast_si128(d3, k14), _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 48))); + in += 64; + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), d0); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 16), d1); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 32), d2); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + 48), d3); + out += 64; + } while (likely(in != eof64)); + + } + + skip_conventional_aesni_64: + while (len >= 16) { + __m128i d0 = _mm_insert_epi64(dd, (long long)Utils::hton(c1++), 1); + d0 = _mm_xor_si128(d0, k0); + d0 = _mm_aesenc_si128(d0, k1); + d0 = _mm_aesenc_si128(d0, k2); + d0 = _mm_aesenc_si128(d0, k3); + d0 = _mm_aesenc_si128(d0, k4); + d0 = _mm_aesenc_si128(d0, k5); + d0 = _mm_aesenc_si128(d0, k6); + d0 = _mm_aesenc_si128(d0, k7); + d0 = _mm_aesenc_si128(d0, k8); + d0 = _mm_aesenc_si128(d0, k9); + d0 = _mm_aesenc_si128(d0, k10); + d0 = _mm_aesenc_si128(d0, k11); + d0 = _mm_aesenc_si128(d0, k12); + d0 = _mm_aesenc_si128(d0, k13); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), _mm_xor_si128(_mm_aesenclast_si128(d0, k14), _mm_loadu_si128(reinterpret_cast<const __m128i *>(in)))); + in += 16; + len -= 16; + out += 16; + } + + // Any remaining input is placed in _out. This will be picked up and crypted + // on subsequent calls to crypt() or finish() as it'll mean _len will not be + // an even multiple of 16.
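+ // For example (hypothetical call pattern): a crypt() of 7 bytes stages all 7 here unencrypted, and a following crypt() of 13 bytes first completes that block above (encrypting output bytes 0..15 in place) and then stages the remaining 4 bytes for the next call or finish().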
+ for (unsigned int i = 0; i < len; ++i) + out[i] = in[i]; + + _ctr[1] = Utils::hton(c1); +} + +__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +void AES::p_init_aesni(const uint8_t *key) noexcept +{ + __m128i t1, t2, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13; + p_k.ni.k[0] = t1 = _mm_loadu_si128((const __m128i *)key); + p_k.ni.k[1] = k1 = t2 = _mm_loadu_si128((const __m128i *)(key + 16)); + p_k.ni.k[2] = k2 = t1 = p_init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x01)); + p_k.ni.k[3] = k3 = t2 = p_init256_2_aesni(t1, t2); + p_k.ni.k[4] = k4 = t1 = p_init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x02)); + p_k.ni.k[5] = k5 = t2 = p_init256_2_aesni(t1, t2); + p_k.ni.k[6] = k6 = t1 = p_init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x04)); + p_k.ni.k[7] = k7 = t2 = p_init256_2_aesni(t1, t2); + p_k.ni.k[8] = k8 = t1 = p_init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x08)); + p_k.ni.k[9] = k9 = t2 = p_init256_2_aesni(t1, t2); + p_k.ni.k[10] = k10 = t1 = p_init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x10)); + p_k.ni.k[11] = k11 = t2 = p_init256_2_aesni(t1, t2); + p_k.ni.k[12] = k12 = t1 = p_init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x20)); + p_k.ni.k[13] = k13 = t2 = p_init256_2_aesni(t1, t2); + p_k.ni.k[14] = p_init256_1_aesni(t1, _mm_aeskeygenassist_si128(t2, 0x40)); + p_k.ni.k[15] = _mm_aesimc_si128(k13); + p_k.ni.k[16] = _mm_aesimc_si128(k12); + p_k.ni.k[17] = _mm_aesimc_si128(k11); + p_k.ni.k[18] = _mm_aesimc_si128(k10); + p_k.ni.k[19] = _mm_aesimc_si128(k9); + p_k.ni.k[20] = _mm_aesimc_si128(k8); + p_k.ni.k[21] = _mm_aesimc_si128(k7); + p_k.ni.k[22] = _mm_aesimc_si128(k6); + p_k.ni.k[23] = _mm_aesimc_si128(k5); + p_k.ni.k[24] = _mm_aesimc_si128(k4); + p_k.ni.k[25] = _mm_aesimc_si128(k3); + p_k.ni.k[26] = _mm_aesimc_si128(k2); + p_k.ni.k[27] = _mm_aesimc_si128(k1); + + __m128i h = p_k.ni.k[0]; // _mm_xor_si128(_mm_setzero_si128(),_k.ni.k[0]); + h = _mm_aesenc_si128(h, k1); + h = _mm_aesenc_si128(h, k2); + h = _mm_aesenc_si128(h, k3); + h = _mm_aesenc_si128(h, k4); + h = _mm_aesenc_si128(h, k5); + h = _mm_aesenc_si128(h, k6); + h = _mm_aesenc_si128(h, k7); + h = _mm_aesenc_si128(h, k8); + h = _mm_aesenc_si128(h, k9); + h = _mm_aesenc_si128(h, k10); + h = _mm_aesenc_si128(h, k11); + h = _mm_aesenc_si128(h, k12); + h = _mm_aesenc_si128(h, k13); + h = _mm_aesenclast_si128(h, p_k.ni.k[14]); + __m128i hswap = _mm_shuffle_epi8(h, s_sseSwapBytes); + __m128i hh = p_gmacPCLMUL128(hswap, h); + __m128i hhh = p_gmacPCLMUL128(hswap, hh); + __m128i hhhh = p_gmacPCLMUL128(hswap, hhh); + p_k.ni.h[0] = hswap; + p_k.ni.h[1] = hh = _mm_shuffle_epi8(hh, s_sseSwapBytes); + p_k.ni.h[2] = hhh = _mm_shuffle_epi8(hhh, s_sseSwapBytes); + p_k.ni.h[3] = hhhh = _mm_shuffle_epi8(hhhh, s_sseSwapBytes); + p_k.ni.h2[0] = _mm_xor_si128(_mm_shuffle_epi32(hswap, 78), hswap); + p_k.ni.h2[1] = _mm_xor_si128(_mm_shuffle_epi32(hh, 78), hh); + p_k.ni.h2[2] = _mm_xor_si128(_mm_shuffle_epi32(hhh, 78), hhh); + p_k.ni.h2[3] = _mm_xor_si128(_mm_shuffle_epi32(hhhh, 78), hhhh); +} + +__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +void AES::p_encrypt_aesni(const void *const in, void *const out) const noexcept +{ + __m128i tmp = _mm_loadu_si128((const __m128i *)in); + tmp = _mm_xor_si128(tmp, p_k.ni.k[0]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[1]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[2]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[3]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[4]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[5]); + tmp = _mm_aesenc_si128(tmp, 
p_k.ni.k[6]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[7]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[8]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[9]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[10]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[11]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[12]); + tmp = _mm_aesenc_si128(tmp, p_k.ni.k[13]); + _mm_storeu_si128((__m128i *)out, _mm_aesenclast_si128(tmp, p_k.ni.k[14])); +} + +__attribute__((__target__("ssse3,sse4,sse4.1,sse4.2,aes,pclmul"))) +void AES::p_decrypt_aesni(const void *in, void *out) const noexcept +{ + __m128i tmp = _mm_loadu_si128((const __m128i *)in); + tmp = _mm_xor_si128(tmp, p_k.ni.k[14]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[15]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[16]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[17]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[18]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[19]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[20]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[21]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[22]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[23]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[24]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[25]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[26]); + tmp = _mm_aesdec_si128(tmp, p_k.ni.k[27]); + _mm_storeu_si128((__m128i *)out, _mm_aesdeclast_si128(tmp, p_k.ni.k[0])); +} + +} // namespace ZeroTier + +#endif // ZT_AES_AESNI diff --git a/node/AES_armcrypto.cpp b/node/AES_armcrypto.cpp new file mode 100644 index 000000000..30a7ec35a --- /dev/null +++ b/node/AES_armcrypto.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c)2013-2020 ZeroTier, Inc. + * + * Use of this software is governed by the Business Source License included + * in the LICENSE.TXT file in the project's root directory. + * + * Change Date: 2025-01-01 + * + * On the date above, in accordance with the Business Source License, use + * of this software will be governed by version 2.0 of the Apache License. 
+ */ +/****/ + +#include "Constants.hpp" +#include "AES.hpp" + +#ifdef ZT_AES_NEON + +namespace ZeroTier { + +namespace { + +ZT_INLINE uint8x16_t s_clmul_armneon_crypto(uint8x16_t h, uint8x16_t y, const uint8_t b[16]) noexcept +{ + uint8x16_t r0, r1, t0, t1; + r0 = vld1q_u8(b); + const uint8x16_t z = veorq_u8(h, h); + y = veorq_u8(r0, y); + y = vrbitq_u8(y); + const uint8x16_t p = vreinterpretq_u8_u64(vdupq_n_u64(0x0000000000000087)); + t0 = vextq_u8(y, y, 8); + __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t" : "=w" (r0) : "w" (h), "w" (y)); + __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t" :"=w" (r1) : "w" (h), "w" (y)); + __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t" : "=w" (t1) : "w" (h), "w" (t0)); + __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t" :"=w" (t0) : "w" (h), "w" (t0)); + t0 = veorq_u8(t0, t1); + t1 = vextq_u8(z, t0, 8); + r0 = veorq_u8(r0, t1); + t1 = vextq_u8(t0, z, 8); + r1 = veorq_u8(r1, t1); + __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t" :"=w" (t0) : "w" (r1), "w" (p)); + t1 = vextq_u8(t0, z, 8); + r1 = veorq_u8(r1, t1); + t1 = vextq_u8(z, t0, 8); + r0 = veorq_u8(r0, t1); + __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t" : "=w" (t0) : "w" (r1), "w" (p)); + return vrbitq_u8(veorq_u8(r0, t0)); +} + +} // anonymous namespace + +void AES::GMAC::p_armUpdate(const uint8_t *in, unsigned int len) noexcept +{ + uint8x16_t y = vld1q_u8(reinterpret_cast<const uint8_t *>(_y)); + const uint8x16_t h = _aes.p_k.neon.h; + + if (_rp) { + for(;;) { + if (!len) + return; + --len; + _r[_rp++] = *(in++); + if (_rp == 16) { + y = s_clmul_armneon_crypto(h, y, _r); + break; + } + } + } + + while (len >= 16) { + y = s_clmul_armneon_crypto(h, y, in); + in += 16; + len -= 16; + } + + vst1q_u8(reinterpret_cast<uint8_t *>(_y), y); + + for (unsigned int i = 0; i < len; ++i) + _r[i] = in[i]; + _rp = len; // len is always less than 16 here +} + +void AES::GMAC::p_armFinish(uint8_t tag[16]) noexcept +{ + uint64_t tmp[2]; + uint8x16_t y = vld1q_u8(reinterpret_cast<const uint8_t *>(_y)); + const uint8x16_t h = _aes.p_k.neon.h; + + if (_rp) { + while (_rp < 16) + _r[_rp++] = 0; + y = s_clmul_armneon_crypto(h, y, _r); + } + + tmp[0] = Utils::hton((uint64_t)_len << 3U); + tmp[1] = 0; + y = s_clmul_armneon_crypto(h, y, reinterpret_cast<const uint8_t *>(tmp)); + + Utils::copy< 12 >(tmp, _iv); +#if __BYTE_ORDER == __BIG_ENDIAN + reinterpret_cast<uint32_t *>(tmp)[3] = 0x00000001; +#else + reinterpret_cast<uint32_t *>(tmp)[3] = 0x01000000; +#endif + _aes.encrypt(tmp, tmp); + + uint8x16_t yy = y; + Utils::storeMachineEndian< uint64_t >(tag, tmp[0] ^ reinterpret_cast<const uint64_t *>(&yy)[0]); + Utils::storeMachineEndian< uint64_t >(tag + 8, tmp[1] ^ reinterpret_cast<const uint64_t *>(&yy)[1]); +} + +void AES::CTR::p_armCrypt(const uint8_t *in, uint8_t *out, unsigned int len) noexcept +{ + uint8x16_t dd = vrev32q_u8(vld1q_u8(reinterpret_cast<const uint8_t *>(_ctr))); + const uint32x4_t one = {0,0,0,1}; + + uint8x16_t k0 = _aes.p_k.neon.ek[0]; + uint8x16_t k1 = _aes.p_k.neon.ek[1]; + uint8x16_t k2 = _aes.p_k.neon.ek[2]; + uint8x16_t k3 = _aes.p_k.neon.ek[3]; + uint8x16_t k4 = _aes.p_k.neon.ek[4]; + uint8x16_t k5 = _aes.p_k.neon.ek[5]; + uint8x16_t k6 = _aes.p_k.neon.ek[6]; + uint8x16_t k7 = _aes.p_k.neon.ek[7]; + uint8x16_t k8 = _aes.p_k.neon.ek[8]; + uint8x16_t k9 = _aes.p_k.neon.ek[9]; + uint8x16_t k10 = _aes.p_k.neon.ek[10]; + uint8x16_t k11 = _aes.p_k.neon.ek[11]; + uint8x16_t k12 = _aes.p_k.neon.ek[12]; + uint8x16_t k13 = _aes.p_k.neon.ek[13]; + uint8x16_t k14 = _aes.p_k.neon.ek[14]; + + unsigned int totalLen = _len; + if ((totalLen & 15U)) { + for (;;) { + if (unlikely(!len)) {
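+ // Out of input mid-block: persist the (re-byte-swapped) counter and the running byte count so the next crypt() call can resume filling this block.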
+ vst1q_u8(reinterpret_cast<uint8_t *>(_ctr), vrev32q_u8(dd)); + _len = totalLen; + return; + } + --len; + out[totalLen++] = *(in++); + if (!(totalLen & 15U)) { + uint8_t *const otmp = out + (totalLen - 16); + uint8x16_t d0 = vrev32q_u8(dd); + uint8x16_t pt = vld1q_u8(otmp); + d0 = vaesmcq_u8(vaeseq_u8(d0, k0)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k1)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k2)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k3)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k4)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k5)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k6)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k7)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k8)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k9)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k10)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k11)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k12)); + d0 = veorq_u8(vaeseq_u8(d0, k13), k14); + vst1q_u8(otmp, veorq_u8(pt, d0)); + dd = (uint8x16_t)vaddq_u32((uint32x4_t)dd, one); + break; + } + } + } + + out += totalLen; + _len = totalLen + len; + + if (likely(len >= 64)) { + const uint32x4_t four = vshlq_n_u32(one, 2); + uint8x16_t dd1 = (uint8x16_t)vaddq_u32((uint32x4_t)dd, one); + uint8x16_t dd2 = (uint8x16_t)vaddq_u32((uint32x4_t)dd1, one); + uint8x16_t dd3 = (uint8x16_t)vaddq_u32((uint32x4_t)dd2, one); + for (;;) { + len -= 64; + uint8x16_t d0 = vrev32q_u8(dd); + uint8x16_t d1 = vrev32q_u8(dd1); + uint8x16_t d2 = vrev32q_u8(dd2); + uint8x16_t d3 = vrev32q_u8(dd3); + uint8x16_t pt0 = vld1q_u8(in); + in += 16; + d0 = vaesmcq_u8(vaeseq_u8(d0, k0)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k0)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k0)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k0)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k1)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k1)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k1)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k1)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k2)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k2)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k2)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k2)); + uint8x16_t pt1 = vld1q_u8(in); + in += 16; + d0 = vaesmcq_u8(vaeseq_u8(d0, k3)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k3)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k3)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k3)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k4)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k4)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k4)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k4)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k5)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k5)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k5)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k5)); + uint8x16_t pt2 = vld1q_u8(in); + in += 16; + d0 = vaesmcq_u8(vaeseq_u8(d0, k6)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k6)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k6)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k6)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k7)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k7)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k7)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k7)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k8)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k8)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k8)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k8)); + uint8x16_t pt3 = vld1q_u8(in); + in += 16; + d0 = vaesmcq_u8(vaeseq_u8(d0, k9)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k9)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k9)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k9)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k10)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k10)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k10)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k10)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k11)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k11)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k11)); + d3 = vaesmcq_u8(vaeseq_u8(d3, k11)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k12)); + d1 = vaesmcq_u8(vaeseq_u8(d1, k12)); + d2 = vaesmcq_u8(vaeseq_u8(d2, k12)); + d3 = vaesmcq_u8(vaeseq_u8(d3, 
k12)); + d0 = veorq_u8(vaeseq_u8(d0, k13), k14); + d1 = veorq_u8(vaeseq_u8(d1, k13), k14); + d2 = veorq_u8(vaeseq_u8(d2, k13), k14); + d3 = veorq_u8(vaeseq_u8(d3, k13), k14); + + d0 = veorq_u8(pt0, d0); + d1 = veorq_u8(pt1, d1); + d2 = veorq_u8(pt2, d2); + d3 = veorq_u8(pt3, d3); + + vst1q_u8(out, d0); + vst1q_u8(out + 16, d1); + vst1q_u8(out + 32, d2); + vst1q_u8(out + 48, d3); + out += 64; + + dd = (uint8x16_t)vaddq_u32((uint32x4_t)dd, four); + if (unlikely(len < 64)) + break; + dd1 = (uint8x16_t)vaddq_u32((uint32x4_t)dd1, four); + dd2 = (uint8x16_t)vaddq_u32((uint32x4_t)dd2, four); + dd3 = (uint8x16_t)vaddq_u32((uint32x4_t)dd3, four); + } + } + + while (len >= 16) { + len -= 16; + uint8x16_t d0 = vrev32q_u8(dd); + uint8x16_t pt = vld1q_u8(in); + in += 16; + dd = (uint8x16_t)vaddq_u32((uint32x4_t)dd, one); + d0 = vaesmcq_u8(vaeseq_u8(d0, k0)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k1)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k2)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k3)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k4)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k5)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k6)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k7)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k8)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k9)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k10)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k11)); + d0 = vaesmcq_u8(vaeseq_u8(d0, k12)); + d0 = veorq_u8(vaeseq_u8(d0, k13), k14); + vst1q_u8(out, veorq_u8(pt, d0)); + out += 16; + } + + // Any remaining input is placed in _out. This will be picked up and crypted + // on subsequent calls to crypt() or finish() as it'll mean _len will not be + // an even multiple of 16. + for (unsigned int i = 0; i < len; ++i) + out[i] = in[i]; + + vst1q_u8(reinterpret_cast<uint8_t *>(_ctr), vrev32q_u8(dd)); +} + +#define ZT_INIT_ARMNEON_CRYPTO_SUBWORD(w) ((uint32_t)s_sbox[w & 0xffU] + ((uint32_t)s_sbox[(w >> 8U) & 0xffU] << 8U) + ((uint32_t)s_sbox[(w >> 16U) & 0xffU] << 16U) + ((uint32_t)s_sbox[(w >> 24U) & 0xffU] << 24U)) +#define ZT_INIT_ARMNEON_CRYPTO_ROTWORD(w) (((w) << 8U) | ((w) >> 24U)) +#define ZT_INIT_ARMNEON_CRYPTO_NK 8 +#define ZT_INIT_ARMNEON_CRYPTO_NB 4 +#define ZT_INIT_ARMNEON_CRYPTO_NR 14 + +void AES::p_init_armneon_crypto(const uint8_t *key) noexcept +{ + static const uint8_t s_sbox[256] = {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, + 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, + 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 
0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; + static const uint32_t s_rcon[7] = {0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000}; + + uint64_t h[2]; + uint32_t *const w = reinterpret_cast<uint32_t *>(p_k.neon.ek); + + for (unsigned int i=0;i<ZT_INIT_ARMNEON_CRYPTO_NK;++i) { + const unsigned int j = i * 4; + w[i] = ((uint32_t)key[j] << 24U) | ((uint32_t)key[j + 1] << 16U) | ((uint32_t)key[j + 2] << 8U) | (uint32_t)key[j + 3]; + } + for (unsigned int i=ZT_INIT_ARMNEON_CRYPTO_NK;i<(ZT_INIT_ARMNEON_CRYPTO_NB * (ZT_INIT_ARMNEON_CRYPTO_NR + 1));++i) { + uint32_t t = w[i - 1]; + const unsigned int imod = i & (ZT_INIT_ARMNEON_CRYPTO_NK - 1); + if (imod == 0) { + t = ZT_INIT_ARMNEON_CRYPTO_SUBWORD(ZT_INIT_ARMNEON_CRYPTO_ROTWORD(t)) ^ s_rcon[(i / ZT_INIT_ARMNEON_CRYPTO_NK) - 1]; + } else if (imod == 4) { + t = ZT_INIT_ARMNEON_CRYPTO_SUBWORD(t); + } + w[i] = w[i - ZT_INIT_ARMNEON_CRYPTO_NK] ^ t; + } + for (unsigned int i=0;i<(ZT_INIT_ARMNEON_CRYPTO_NB * (ZT_INIT_ARMNEON_CRYPTO_NR + 1));++i) + w[i] = Utils::hton(w[i]); + + p_k.neon.dk[0] = p_k.neon.ek[14]; + for (unsigned int i=1;i<14;++i) + p_k.neon.dk[i] = vaesimcq_u8(p_k.neon.ek[14 - i]); + p_k.neon.dk[14] = p_k.neon.ek[0]; + + h[0] = 0; + h[1] = 0; + p_encrypt_armneon_crypto(h, h); + Utils::copy< 16 >(&(p_k.neon.h), h); + p_k.neon.h = vrbitq_u8(p_k.neon.h); + p_k.sw.h[0] = Utils::ntoh(h[0]); + p_k.sw.h[1] = Utils::ntoh(h[1]); +} + +void AES::p_encrypt_armneon_crypto(const void *const in, void *const out) const noexcept +{ + uint8x16_t tmp = vld1q_u8(reinterpret_cast<const uint8_t *>(in)); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[0])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[1])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[2])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[3])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[4])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[5])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[6])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[7])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[8])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[9])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[10])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[11])); + tmp = vaesmcq_u8(vaeseq_u8(tmp, p_k.neon.ek[12])); + tmp = veorq_u8(vaeseq_u8(tmp, p_k.neon.ek[13]), p_k.neon.ek[14]); + vst1q_u8(reinterpret_cast<uint8_t *>(out), tmp); +} + +void AES::p_decrypt_armneon_crypto(const void *const in, void *const out) const noexcept +{ + uint8x16_t tmp = vld1q_u8(reinterpret_cast<const uint8_t *>(in)); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[0])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[1])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[2])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[3])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[4])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[5])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[6])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[7])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[8])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[9])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[10])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[11])); + tmp = vaesimcq_u8(vaesdq_u8(tmp, p_k.neon.dk[12])); + tmp = veorq_u8(vaesdq_u8(tmp, p_k.neon.dk[13]), p_k.neon.dk[14]); + vst1q_u8(reinterpret_cast<uint8_t *>(out), tmp); +} + +} // namespace ZeroTier + +#endif // ZT_AES_NEON diff --git a/objects.mk b/objects.mk index abed41af7..cc6f96ee2 100644 --- a/objects.mk +++ b/objects.mk @@ -1,5 +1,7 @@ CORE_OBJS=\ node/AES.o \ + node/AES_aesni.o \ + node/AES_armcrypto.o \ node/C25519.o \ node/Capability.o \ node/CertificateOfMembership.o \