From c0e92d06a5de92d9c0a9bb23c8bed9b0490d2296 Mon Sep 17 00:00:00 2001
From: Adam Ierymenko
Date: Thu, 5 Sep 2019 17:31:12 -0700
Subject: [PATCH] faster without const variable second-guessing of the compiler

---
 node/AES.hpp | 190 ++++++++++++++++++++++++---------------------------
 1 file changed, 90 insertions(+), 100 deletions(-)

diff --git a/node/AES.hpp b/node/AES.hpp
index 2178a1c66..be5abe799 100644
--- a/node/AES.hpp
+++ b/node/AES.hpp
@@ -167,6 +167,9 @@ public:
 	 * to use makes the IV itself a secret. This is not strictly necessary
 	 * but comes at little cost.
 	 *
+	 * This code is ZeroTier-specific in a few ways, like the way the IV
+	 * is specified, but would not be hard to generalize.
+	 *
 	 * @param k1 GMAC key
 	 * @param k2 GMAC auth tag keyed hash key
 	 * @param k3 CTR IV keyed hash key
@@ -199,7 +202,7 @@ public:
 		miv[10] = (uint8_t)(len >> 8);
 		miv[11] = (uint8_t)len;
 
-		// Compute auth TAG: AES-ECB[k2](GMAC[k1](miv,plaintext))[0:8]
+		// Compute auth tag: AES-ECB[k2](GMAC[k1](miv,plaintext))[0:8]
 		k1.gmac(miv,in,len,ctrIv);
 		k2.encrypt(ctrIv,ctrIv); // ECB mode encrypt step is because GMAC is not a PRF
 #ifdef ZT_NO_TYPE_PUNNING
@@ -525,22 +528,6 @@ private:
 		const __m64 iv0 = (__m64)(*((const uint64_t *)iv));
 		uint64_t ctr = Utils::ntoh(*((const uint64_t *)(iv+8)));
 
-		const __m128i k0 = _k.ni.k[0];
-		const __m128i k1 = _k.ni.k[1];
-		const __m128i k2 = _k.ni.k[2];
-		const __m128i k3 = _k.ni.k[3];
-		const __m128i k4 = _k.ni.k[4];
-		const __m128i k5 = _k.ni.k[5];
-		const __m128i k6 = _k.ni.k[6];
-		const __m128i k7 = _k.ni.k[7];
-		const __m128i k8 = _k.ni.k[8];
-		const __m128i k9 = _k.ni.k[9];
-		const __m128i k10 = _k.ni.k[10];
-		const __m128i k11 = _k.ni.k[11];
-		const __m128i k12 = _k.ni.k[12];
-		const __m128i k13 = _k.ni.k[13];
-		const __m128i k14 = _k.ni.k[14];
-
 #define ZT_AES_CTR_AESNI_ROUND(k) \
 	c0 = _mm_aesenc_si128(c0,k); \
 	c1 = _mm_aesenc_si128(c1,k); \
@@ -552,36 +539,41 @@ private:
 	c7 = _mm_aesenc_si128(c7,k)
 
 		while (len >= 128) {
-			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr),iv0),k0);
-			__m128i c1 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+1ULL)),iv0),k0);
-			__m128i c2 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+2ULL)),iv0),k0);
-			__m128i c3 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+3ULL)),iv0),k0);
-			__m128i c4 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+4ULL)),iv0),k0);
-			__m128i c5 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+5ULL)),iv0),k0);
-			__m128i c6 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+6ULL)),iv0),k0);
-			__m128i c7 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+7ULL)),iv0),k0);
+			_mm_prefetch(in,_MM_HINT_T0);
+			_mm_prefetch(in + 32,_MM_HINT_T0);
+			_mm_prefetch(in + 64,_MM_HINT_T0);
+			_mm_prefetch(in + 96,_MM_HINT_T0);
+			_mm_prefetch(in + 128,_MM_HINT_T0);
+			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr),iv0),_k.ni.k[0]);
+			__m128i c1 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+1ULL)),iv0),_k.ni.k[0]);
+			__m128i c2 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+2ULL)),iv0),_k.ni.k[0]);
+			__m128i c3 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+3ULL)),iv0),_k.ni.k[0]);
+			__m128i c4 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+4ULL)),iv0),_k.ni.k[0]);
+			__m128i c5 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+5ULL)),iv0),_k.ni.k[0]);
+			__m128i c6 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+6ULL)),iv0),_k.ni.k[0]);
+			__m128i c7 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+7ULL)),iv0),_k.ni.k[0]);
 			ctr += 8;
-			ZT_AES_CTR_AESNI_ROUND(k1);
-			ZT_AES_CTR_AESNI_ROUND(k2);
-			ZT_AES_CTR_AESNI_ROUND(k3);
-			ZT_AES_CTR_AESNI_ROUND(k4);
-			ZT_AES_CTR_AESNI_ROUND(k5);
-			ZT_AES_CTR_AESNI_ROUND(k6);
-			ZT_AES_CTR_AESNI_ROUND(k7);
-			ZT_AES_CTR_AESNI_ROUND(k8);
-			ZT_AES_CTR_AESNI_ROUND(k9);
-			ZT_AES_CTR_AESNI_ROUND(k10);
-			ZT_AES_CTR_AESNI_ROUND(k11);
-			ZT_AES_CTR_AESNI_ROUND(k12);
-			ZT_AES_CTR_AESNI_ROUND(k13);
-			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,k14)));
-			_mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,k14)));
-			_mm_storeu_si128((__m128i *)(out + 32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,k14)));
-			_mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,k14)));
-			_mm_storeu_si128((__m128i *)(out + 64),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 64)),_mm_aesenclast_si128(c4,k14)));
-			_mm_storeu_si128((__m128i *)(out + 80),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 80)),_mm_aesenclast_si128(c5,k14)));
-			_mm_storeu_si128((__m128i *)(out + 96),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 96)),_mm_aesenclast_si128(c6,k14)));
-			_mm_storeu_si128((__m128i *)(out + 112),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 112)),_mm_aesenclast_si128(c7,k14)));
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[1]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[2]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[3]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[4]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[5]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[6]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[7]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[8]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[9]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[10]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[11]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[12]);
+			ZT_AES_CTR_AESNI_ROUND(_k.ni.k[13]);
+			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,_k.ni.k[14])));
+			_mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,_k.ni.k[14])));
+			_mm_storeu_si128((__m128i *)(out + 32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,_k.ni.k[14])));
+			_mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,_k.ni.k[14])));
+			_mm_storeu_si128((__m128i *)(out + 64),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 64)),_mm_aesenclast_si128(c4,_k.ni.k[14])));
+			_mm_storeu_si128((__m128i *)(out + 80),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 80)),_mm_aesenclast_si128(c5,_k.ni.k[14])));
+			_mm_storeu_si128((__m128i *)(out + 96),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 96)),_mm_aesenclast_si128(c6,_k.ni.k[14])));
+			_mm_storeu_si128((__m128i *)(out + 112),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 112)),_mm_aesenclast_si128(c7,_k.ni.k[14])));
 			in += 128;
 			out += 128;
 			len -= 128;
@@ -590,42 +582,42 @@ private:
 #undef ZT_AES_CTR_AESNI_ROUND
 
 		while (len >= 16) {
-			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),k0);
-			c0 = _mm_aesenc_si128(c0,k1);
-			c0 = _mm_aesenc_si128(c0,k2);
-			c0 = _mm_aesenc_si128(c0,k3);
-			c0 = _mm_aesenc_si128(c0,k4);
-			c0 = _mm_aesenc_si128(c0,k5);
-			c0 = _mm_aesenc_si128(c0,k6);
-			c0 = _mm_aesenc_si128(c0,k7);
-			c0 = _mm_aesenc_si128(c0,k8);
-			c0 = _mm_aesenc_si128(c0,k9);
-			c0 = _mm_aesenc_si128(c0,k10);
-			c0 = _mm_aesenc_si128(c0,k11);
-			c0 = _mm_aesenc_si128(c0,k12);
-			c0 = _mm_aesenc_si128(c0,k13);
-			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,k14)));
+			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),_k.ni.k[0]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[1]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[2]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[3]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[4]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[5]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[6]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[7]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[8]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[9]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[10]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[11]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[12]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[13]);
+			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,_k.ni.k[14])));
 			in += 16;
 			out += 16;
 			len -= 16;
 		}
 
 		if (len) {
-			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),k0);
-			c0 = _mm_aesenc_si128(c0,k1);
-			c0 = _mm_aesenc_si128(c0,k2);
-			c0 = _mm_aesenc_si128(c0,k3);
-			c0 = _mm_aesenc_si128(c0,k4);
-			c0 = _mm_aesenc_si128(c0,k5);
-			c0 = _mm_aesenc_si128(c0,k6);
-			c0 = _mm_aesenc_si128(c0,k7);
-			c0 = _mm_aesenc_si128(c0,k8);
-			c0 = _mm_aesenc_si128(c0,k9);
-			c0 = _mm_aesenc_si128(c0,k10);
-			c0 = _mm_aesenc_si128(c0,k11);
-			c0 = _mm_aesenc_si128(c0,k12);
-			c0 = _mm_aesenc_si128(c0,k13);
-			c0 = _mm_aesenclast_si128(c0,k14);
+			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),_k.ni.k[0]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[1]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[2]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[3]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[4]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[5]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[6]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[7]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[8]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[9]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[10]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[11]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[12]);
+			c0 = _mm_aesenc_si128(c0,_k.ni.k[13]);
+			c0 = _mm_aesenclast_si128(c0,_k.ni.k[14]);
 			for(unsigned int i=0;i
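
Note (illustration, not part of the patch): the change the subject line describes is to stop copying each expanded round key into a const __m128i local before the loops and to index the key schedule array (_k.ni.k[0..14] in AES.hpp) directly, leaving register allocation to the compiler. A minimal sketch of that pattern for a single AES-256 block, using hypothetical names (Aes256KeySchedule, aes256_encrypt_block) and assuming an already-expanded schedule:

#include <immintrin.h>

struct Aes256KeySchedule { __m128i k[15]; }; // 15 expanded round keys for AES-256

static inline void aes256_encrypt_block(const Aes256KeySchedule *ks,const void *in,void *out)
{
	// Whitening: XOR the plaintext block with round key 0.
	__m128i b = _mm_xor_si128(_mm_loadu_si128((const __m128i *)in),ks->k[0]);
	// The pre-patch code hoisted ks->k[1..14] into const locals first; the
	// patch indexes the schedule directly and lets the compiler decide what
	// stays in registers.
	for(int r=1;r<14;++r)
		b = _mm_aesenc_si128(b,ks->k[r]);  // 13 full rounds
	b = _mm_aesenclast_si128(b,ks->k[14]); // final round
	_mm_storeu_si128((__m128i *)out,b);
}

The round keys are loop-invariant either way; the explicit const copies mostly second-guess the register allocator, which appears to be what the commit title refers to.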
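
Note (illustration, not part of the patch): the loops being rewritten implement AES-256 CTR. Each 16-byte counter block is the 8-byte IV half followed by a big-endian 64-bit block counter; the encrypted block is XORed with the input, and a final partial block is XORed byte by byte, which is what the truncated "for(unsigned int i=0;i" line at the end of the last hunk begins. A simplified, non-unrolled sketch using the hypothetical aes256_encrypt_block above (store_be64 stands in for Utils::hton; the patch's 8-block unrolling and prefetching are omitted):

#include <cstdint>
#include <cstring>

// Store a 64-bit value big-endian (stand-in for Utils::hton in the patch).
static inline void store_be64(uint8_t *p,uint64_t v)
{
	for(int i=7;i>=0;--i) { p[i] = (uint8_t)v; v >>= 8; }
}

static void aes256_ctr_xor(const Aes256KeySchedule *ks,const uint8_t iv[8],uint64_t ctr,const uint8_t *in,uint8_t *out,unsigned int len)
{
	uint8_t block[16],stream[16];
	while (len >= 16) {
		memcpy(block,iv,8);          // first half of the counter block: IV
		store_be64(block + 8,ctr++); // second half: big-endian block counter
		aes256_encrypt_block(ks,block,stream);
		for(unsigned int i=0;i<16;++i) out[i] = in[i] ^ stream[i];
		in += 16; out += 16; len -= 16;
	}
	if (len) { // partial final block: XOR only the remaining len bytes
		memcpy(block,iv,8);
		store_be64(block + 8,ctr);
		aes256_encrypt_block(ks,block,stream);
		for(unsigned int i=0;i<len;++i) out[i] = in[i] ^ stream[i];
	}
}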