From d02d3f72fef9c98ff4de5183844728fa1b197978 Mon Sep 17 00:00:00 2001
From: Adam Ierymenko
Date: Mon, 12 Aug 2019 12:51:32 -0700
Subject: [PATCH] AES-GCM code

---
 node/AES.cpp |   5 +-
 node/AES.hpp | 643 +++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 603 insertions(+), 45 deletions(-)

diff --git a/node/AES.cpp b/node/AES.cpp
index 9ed2db29c..c36bb19fd 100644
--- a/node/AES.cpp
+++ b/node/AES.cpp
@@ -293,10 +293,7 @@ static const uint32_t Te3[256] = {
 	0x2d2d775a,0x0f0f111e,0xb0b0cb7b,0x5454fca8,0xbbbbd66d,
 	0x16163a2c };
 
-static const uint32_t rcon[] = {
-	0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000,
-	0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000,
-};
+static const uint32_t rcon[10] = { 0x01000000,0x02000000,0x04000000,0x08000000,0x10000000,0x20000000,0x40000000,0x80000000,0x1B000000,0x36000000 };
 
 } // anonymous namespace
 
diff --git a/node/AES.hpp b/node/AES.hpp
index c8f3887df..b80e337dd 100644
--- a/node/AES.hpp
+++ b/node/AES.hpp
@@ -59,33 +59,51 @@ public:
 	inline ~AES()
 	{
-		Utils::burn(&_k,sizeof(_k)); // ensure that expanded key memory is zeroed on object destruction
+		Utils::burn(&_k,sizeof(_k));
 	}
 
 	inline void init(const uint8_t key[32])
 	{
-		if (HW_ACCEL) {
 #ifdef ZT_AES_AESNI
+		if (HW_ACCEL) {
 			_init_aesni(key);
-#endif
-		} else {
-			_initSW(key);
+			return;
 		}
+#endif
+		_initSW(key);
 	}
 
 	inline void encrypt(const uint8_t in[16],uint8_t out[16]) const
 	{
-		if (HW_ACCEL) {
 #ifdef ZT_AES_AESNI
+		if (HW_ACCEL) {
 			_encrypt_aesni(in,out);
-#endif
-		} else {
-			_encryptSW(in,out);
+			return;
 		}
+#endif
+		_encryptSW(in,out);
+	}
+
+	inline void gcmEncrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,uint8_t *tag,unsigned int taglen)
+	{
+#ifdef ZT_AES_AESNI
+		if (HW_ACCEL) {
+			_encrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tag,taglen);
+			return;
+		}
+#endif
+		abort(); // TODO: software
+	}
+
+	inline bool gcmDecrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,const uint8_t *tag,unsigned int taglen)
+	{
+		abort(); // TODO: software
+		return false;
 	}
 
 	// These are public so the software mode can always be tested in self-test.
-	// Normally init(), encrypt(), etc. should be used.
+	// Normally init(), encrypt(), etc. should be used and will choose accelerated
+	// or software mode depending on hardware capability.
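A rough sketch of how the new GCM interface above might be driven, e.g. from self-test code. The helper name, buffer sizes, 16-byte tag length, and ZeroTier namespace qualification are illustrative assumptions and not part of this patch; note that in this revision the non-AES-NI path and gcmDecrypt() are still abort()ing stubs, so this would only run where hardware acceleration is available and gcmDecrypt() is implemented.

	#include <cstring>
	#include <cstdint>
	#include "AES.hpp"

	// Hypothetical round-trip check; not part of this patch.
	static bool testGcmRoundTrip(const uint8_t key[32],const uint8_t iv[12])
	{
		uint8_t plain[64],cipher[64],decrypted[64],tag[16];
		memset(plain,0x42,sizeof(plain)); // arbitrary payload

		ZeroTier::AES aes;
		aes.init(key); // expands the 256-bit key and precomputes GHASH constants

		// Encrypt with no additional authenticated data (assoc == nullptr, assoclen == 0)
		// and a full 16-byte (128-bit) authentication tag.
		aes.gcmEncrypt(iv,plain,(unsigned int)sizeof(plain),nullptr,0,cipher,tag,(unsigned int)sizeof(tag));

		// gcmDecrypt() is expected to return false when the tag does not verify.
		if (!aes.gcmDecrypt(iv,cipher,(unsigned int)sizeof(cipher),nullptr,0,decrypted,tag,(unsigned int)sizeof(tag)))
			return false;
		return (memcmp(plain,decrypted,sizeof(plain)) == 0);
	}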
 
 	void _initSW(const uint8_t key[32]);
 	void _encryptSW(const uint8_t in[16],uint8_t out[16]) const;
 
@@ -121,47 +139,590 @@ private:
 	inline void _init_aesni(const uint8_t key[32])
 	{
 		__m128i t1,t2;
-		_k.ni[0] = t1 = _mm_loadu_si128((const __m128i *)key);
-		_k.ni[1] = t2 = _mm_loadu_si128((const __m128i *)(key+16));
-		_k.ni[2] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x01));
-		_k.ni[3] = t2 = _init256_2(t1,t2);
-		_k.ni[4] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x02));
-		_k.ni[5] = t2 = _init256_2(t1,t2);
-		_k.ni[6] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x04));
-		_k.ni[7] = t2 = _init256_2(t1,t2);
-		_k.ni[8] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x08));
-		_k.ni[9] = t2 = _init256_2(t1,t2);
-		_k.ni[10] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x10));
-		_k.ni[11] = t2 = _init256_2(t1,t2);
-		_k.ni[12] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x20));
-		_k.ni[13] = t2 = _init256_2(t1,t2);
-		_k.ni[14] = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x40));
+		_k.ni.k[0] = t1 = _mm_loadu_si128((const __m128i *)key);
+		_k.ni.k[1] = t2 = _mm_loadu_si128((const __m128i *)(key+16));
+		_k.ni.k[2] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x01));
+		_k.ni.k[3] = t2 = _init256_2(t1,t2);
+		_k.ni.k[4] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x02));
+		_k.ni.k[5] = t2 = _init256_2(t1,t2);
+		_k.ni.k[6] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x04));
+		_k.ni.k[7] = t2 = _init256_2(t1,t2);
+		_k.ni.k[8] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x08));
+		_k.ni.k[9] = t2 = _init256_2(t1,t2);
+		_k.ni.k[10] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x10));
+		_k.ni.k[11] = t2 = _init256_2(t1,t2);
+		_k.ni.k[12] = t1 = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x20));
+		_k.ni.k[13] = t2 = _init256_2(t1,t2);
+		_k.ni.k[14] = _init256_1(t1,_mm_aeskeygenassist_si128(t2,0x40));
+
+		__m128i h = _mm_xor_si128(_mm_setzero_si128(),_k.ni.k[0]);
+		h = _mm_aesenc_si128(h,_k.ni.k[1]);
+		h = _mm_aesenc_si128(h,_k.ni.k[2]);
+		h = _mm_aesenc_si128(h,_k.ni.k[3]);
+		h = _mm_aesenc_si128(h,_k.ni.k[4]);
+		h = _mm_aesenc_si128(h,_k.ni.k[5]);
+		h = _mm_aesenc_si128(h,_k.ni.k[6]);
+		h = _mm_aesenc_si128(h,_k.ni.k[7]);
+		h = _mm_aesenc_si128(h,_k.ni.k[8]);
+		h = _mm_aesenc_si128(h,_k.ni.k[9]);
+		h = _mm_aesenc_si128(h,_k.ni.k[10]);
+		h = _mm_aesenc_si128(h,_k.ni.k[11]);
+		h = _mm_aesenc_si128(h,_k.ni.k[12]);
+		h = _mm_aesenc_si128(h,_k.ni.k[13]);
+		h = _mm_aesenclast_si128(h,_k.ni.k[14]);
+		__m128i hswap = _swap128_aesni(h);
+		__m128i hh = _mult_block_aesni(hswap,h);
+		__m128i hhh = _mult_block_aesni(hswap,hh);
+		__m128i hhhh = _mult_block_aesni(hswap,hhh);
+		_k.ni.h = hswap;
+		_k.ni.hh = _swap128_aesni(hh);
+		_k.ni.hhh = _swap128_aesni(hhh);
+		_k.ni.hhhh = _swap128_aesni(hhhh);
+		/*
+		this->h = h;
+		h = swap128(h);
+		this->hh = mult_block(h, this->h);
+		this->hhh = mult_block(h, this->hh);
+		this->hhhh = mult_block(h, this->hhh);
+		this->h = swap128(this->h);
+		this->hh = swap128(this->hh);
+		this->hhh = swap128(this->hhh);
+		this->hhhh = swap128(this->hhhh);
+		*/
 	}
+
 	inline void _encrypt_aesni(const void *in,void *out) const
 	{
 		__m128i tmp;
 		tmp = _mm_loadu_si128((const __m128i *)in);
-		tmp = _mm_xor_si128(tmp,_k.ni[0]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[1]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[2]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[3]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[4]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[5]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[6]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[7]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[8]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[9]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[10]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[11]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[12]);
-		tmp = _mm_aesenc_si128(tmp,_k.ni[13]);
-		_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni[14]));
+		tmp = _mm_xor_si128(tmp,_k.ni.k[0]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[1]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[2]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[3]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[4]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[5]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[6]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[7]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[8]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[9]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[10]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[11]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[12]);
+		tmp = _mm_aesenc_si128(tmp,_k.ni.k[13]);
+		_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni.k[14]));
+	}
+
+	static inline __m128i _swap128_aesni(__m128i x) { return _mm_shuffle_epi8(x,_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)); }
+	static inline __m128i _mult_block_aesni(__m128i h,__m128i y)
+	{
+		__m128i t1,t2,t3,t4,t5,t6;
+		y = _swap128_aesni(y);
+		t1 = _mm_clmulepi64_si128(h, y, 0x00);
+		t2 = _mm_clmulepi64_si128(h, y, 0x01);
+		t3 = _mm_clmulepi64_si128(h, y, 0x10);
+		t4 = _mm_clmulepi64_si128(h, y, 0x11);
+		t2 = _mm_xor_si128(t2, t3);
+		t3 = _mm_slli_si128(t2, 8);
+		t2 = _mm_srli_si128(t2, 8);
+		t1 = _mm_xor_si128(t1, t3);
+		t4 = _mm_xor_si128(t4, t2);
+		t5 = _mm_srli_epi32(t1, 31);
+		t1 = _mm_slli_epi32(t1, 1);
+		t6 = _mm_srli_epi32(t4, 31);
+		t4 = _mm_slli_epi32(t4, 1);
+		t3 = _mm_srli_si128(t5, 12);
+		t6 = _mm_slli_si128(t6, 4);
+		t5 = _mm_slli_si128(t5, 4);
+		t1 = _mm_or_si128(t1, t5);
+		t4 = _mm_or_si128(t4, t6);
+		t4 = _mm_or_si128(t4, t3);
+		t5 = _mm_slli_epi32(t1, 31);
+		t6 = _mm_slli_epi32(t1, 30);
+		t3 = _mm_slli_epi32(t1, 25);
+		t5 = _mm_xor_si128(t5, t6);
+		t5 = _mm_xor_si128(t5, t3);
+		t6 = _mm_srli_si128(t5, 4);
+		t4 = _mm_xor_si128(t4, t6);
+		t5 = _mm_slli_si128(t5, 12);
+		t1 = _mm_xor_si128(t1, t5);
+		t4 = _mm_xor_si128(t4, t1);
+		t5 = _mm_srli_epi32(t1, 1);
+		t2 = _mm_srli_epi32(t1, 2);
+		t3 = _mm_srli_epi32(t1, 7);
+		t4 = _mm_xor_si128(t4, t2);
+		t4 = _mm_xor_si128(t4, t3);
+		t4 = _mm_xor_si128(t4, t5);
+		return _swap128_aesni(t4);
+	}
+	static inline __m128i _mult4xor_aesni(__m128i h1,__m128i h2,__m128i h3,__m128i h4,__m128i d1,__m128i d2,__m128i d3,__m128i d4)
+	{
+		__m128i t0,t1,t2,t3,t4,t5,t6,t7,t8,t9;
+		d1 = _swap128_aesni(d1);
+		d2 = _swap128_aesni(d2);
+		d3 = _swap128_aesni(d3);
+		d4 = _swap128_aesni(d4);
+		t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
+		t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
+		t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
+		t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
+		t8 = _mm_xor_si128(t0, t1);
+		t8 = _mm_xor_si128(t8, t2);
+		t8 = _mm_xor_si128(t8, t3);
+		t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
+		t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
+		t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
+		t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
+		t9 = _mm_xor_si128(t4, t5);
+		t9 = _mm_xor_si128(t9, t6);
+		t9 = _mm_xor_si128(t9, t7);
+		t0 = _mm_shuffle_epi32(h1, 78);
+		t4 = _mm_shuffle_epi32(d1, 78);
+		t0 = _mm_xor_si128(t0, h1);
+		t4 = _mm_xor_si128(t4, d1);
+		t1 = _mm_shuffle_epi32(h2, 78);
+		t5 = _mm_shuffle_epi32(d2, 78);
+		t1 = _mm_xor_si128(t1, h2);
+		t5 = _mm_xor_si128(t5, d2);
+		t2 = _mm_shuffle_epi32(h3, 78);
+		t6 = _mm_shuffle_epi32(d3, 78);
+		t2 = _mm_xor_si128(t2, h3);
+		t6 = _mm_xor_si128(t6, d3);
+		t3 = _mm_shuffle_epi32(h4, 78);
+		t7 = _mm_shuffle_epi32(d4, 78);
+		t3 = _mm_xor_si128(t3, h4);
+		t7 = _mm_xor_si128(t7, d4);
+		t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
+		t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
+		t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
+		t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
+		t0 = _mm_xor_si128(t0, t8);
+		t0 = _mm_xor_si128(t0, t9);
+		t0 = _mm_xor_si128(t1, t0);
+		t0 = _mm_xor_si128(t2, t0);
+		t0 = _mm_xor_si128(t3, t0);
+		t4 = _mm_slli_si128(t0, 8);
+		t0 = _mm_srli_si128(t0, 8);
+		t3 = _mm_xor_si128(t4, t8);
+		t6 = _mm_xor_si128(t0, t9);
+		t7 = _mm_srli_epi32(t3, 31);
+		t8 = _mm_srli_epi32(t6, 31);
+		t3 = _mm_slli_epi32(t3, 1);
+		t6 = _mm_slli_epi32(t6, 1);
+		t9 = _mm_srli_si128(t7, 12);
+		t8 = _mm_slli_si128(t8, 4);
+		t7 = _mm_slli_si128(t7, 4);
+		t3 = _mm_or_si128(t3, t7);
+		t6 = _mm_or_si128(t6, t8);
+		t6 = _mm_or_si128(t6, t9);
+		t7 = _mm_slli_epi32(t3, 31);
+		t8 = _mm_slli_epi32(t3, 30);
+		t9 = _mm_slli_epi32(t3, 25);
+		t7 = _mm_xor_si128(t7, t8);
+		t7 = _mm_xor_si128(t7, t9);
+		t8 = _mm_srli_si128(t7, 4);
+		t7 = _mm_slli_si128(t7, 12);
+		t3 = _mm_xor_si128(t3, t7);
+		t2 = _mm_srli_epi32(t3, 1);
+		t4 = _mm_srli_epi32(t3, 2);
+		t5 = _mm_srli_epi32(t3, 7);
+		t2 = _mm_xor_si128(t2, t4);
+		t2 = _mm_xor_si128(t2, t5);
+		t2 = _mm_xor_si128(t2, t8);
+		t3 = _mm_xor_si128(t3, t2);
+		t6 = _mm_xor_si128(t6, t3);
+		return _swap128_aesni(t6);
+	}
+	static inline __m128i _ghash_aesni(__m128i h,__m128i y,__m128i x) { return _mult_block_aesni(h,_mm_xor_si128(y,x)); }
+	static inline __m128i _increment_be_aesni(__m128i x)
+	{
+		x = _swap128_aesni(x);
+		x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
+		x = _swap128_aesni(x);
+		return x;
+	}
+	static inline void _htoun64_aesni(void *network,const uint64_t host) { *((uint64_t *)network) = Utils::hton(host); }
+	static inline void _htoun32_aesni(void *network,const uint64_t host) { *((uint32_t *)network) = Utils::hton(host); }
+
+	inline __m128i _create_j_aesni(const uint8_t *iv) const
+	{
+		uint8_t j[16];
+		*((uint64_t *)j) = *((const uint64_t *)iv);
+		*((uint32_t *)(j+8)) = *((const uint32_t *)(iv+8));
+		j[12] = 0;
+		j[13] = 0;
+		j[14] = 0;
+		j[15] = 1;
+		return _mm_loadu_si128((__m128i *)j);
+	}
+	inline __m128i _icv_header_aesni(const void *assoc,unsigned int alen) const
+	{
+		unsigned int blocks,pblocks,rem,i;
+		__m128i h1,h2,h3,h4,d1,d2,d3,d4;
+		__m128i y,last;
+		const __m128i *ab;
+		h1 = _k.ni.hhhh;
+		h2 = _k.ni.hhh;
+		h3 = _k.ni.hh;
+		h4 = _k.ni.h;
+		y = _mm_setzero_si128();
+		ab = (const __m128i *)assoc;
+		blocks = alen / 16;
+		pblocks = blocks - (blocks % 4);
+		rem = alen % 16;
+		for (i=0;i