/*
 * Copyright (c) 2019 ZeroTier, Inc.
 *
 * Use of this software is governed by the Business Source License included
 * in the LICENSE.TXT file in the project's root directory.
 *
 * Change Date: 2023-01-01
 *
 * On the date above, in accordance with the Business Source License, use
 * of this software will be governed by version 2.0 of the Apache License.
 */
/****/

#ifndef ZT_AES_HPP
#define ZT_AES_HPP

#include "Constants.hpp"
#include "Utils.hpp"

#if (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || defined(__AMD64) || defined(__AMD64__) || defined(_M_X64))
#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#define ZT_AES_AESNI 1
#endif

#define ZT_AES_KEY_SIZE 32
#define ZT_AES_BLOCK_SIZE 16

namespace ZeroTier {

/**
 * AES-256 and AES-GCM AEAD
 */
class AES
{
public:
	/**
	 * This will be true if your platform's type of AES acceleration is supported on this machine
	 */
	static const bool HW_ACCEL;

	inline AES() {}
	inline AES(const uint8_t key[32]) { this->init(key); }
	inline ~AES() { Utils::burn(&_k,sizeof(_k)); }

	/**
	 * Set (or re-set) this AES256 cipher's key
	 */
	inline void init(const uint8_t key[32])
	{
#ifdef ZT_AES_AESNI
		if (likely(HW_ACCEL)) {
			_init_aesni(key);
			return;
		}
#endif
		_initSW(key);
	}

	/**
	 * Encrypt a single AES block (ECB mode)
	 *
	 * @param in Input block
	 * @param out Output block (can be same as input)
	 */
	inline void encrypt(const uint8_t in[16],uint8_t out[16]) const
	{
#ifdef ZT_AES_AESNI
		if (likely(HW_ACCEL)) {
			_encrypt_aesni(in,out);
			return;
		}
#endif
		_encryptSW(in,out);
	}

	/**
	 * Compute GMAC-AES256 (GCM without ciphertext)
	 *
	 * @param iv 96-bit IV
	 * @param in Input data
	 * @param len Length of input
	 * @param out 128-bit authentication tag from GMAC
	 */
	inline void gmac(const uint8_t iv[12],const void *in,const unsigned int len,uint8_t out[16]) const
	{
#ifdef ZT_AES_AESNI
		if (likely(HW_ACCEL)) {
			_gmac_aesni(iv,(const uint8_t *)in,len,out);
			return;
		}
#endif
		_gmacSW(iv,(const uint8_t *)in,len,out);
	}

	/**
	 * Encrypt or decrypt (they're the same) using AES256-CTR
	 *
	 * The counter here is a 128-bit big-endian counter that starts at the IV. The code
	 * only increments the least significant 64 bits, making it only safe to use for a
	 * maximum of 2^64-1 bytes (much larger than we ever do).
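	 *
	 * A minimal usage sketch (illustrative only; key, iv, and the buffers below are
	 * hypothetical caller-supplied variables, not part of this class):
	 *
	 *   AES aes(key);                            // key: 32-byte (256-bit) secret key
	 *   aes.ctr(iv,plaintext,len,ciphertext);    // iv: 16-byte big-endian counter block
	 *   aes.ctr(iv,ciphertext,len,plaintext);    // decryption is the identical operation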
	 *
	 * @param iv 128-bit CTR IV
	 * @param in Input plaintext or ciphertext
	 * @param len Length of input
	 * @param out Output plaintext or ciphertext
	 */
	inline void ctr(const uint8_t iv[16],const void *in,unsigned int len,void *out) const
	{
#ifdef ZT_AES_AESNI
		if (likely(HW_ACCEL)) {
			_crypt_ctr_aesni(iv,(const uint8_t *)in,len,(uint8_t *)out);
			return;
		}
#endif

		/* Software fallback: encrypt the counter block, XOR the keystream with the
		 * input, then increment the low 64 bits of the big-endian counter. */
		uint64_t ctr[2],cenc[2];
		memcpy(ctr,iv,16);
		uint64_t bctr = Utils::ntoh(ctr[1]);
		const uint8_t *i = (const uint8_t *)in;
		uint8_t *o = (uint8_t *)out;
		while (len >= 16) {
			_encryptSW((const uint8_t *)ctr,(uint8_t *)cenc);
			ctr[1] = Utils::hton(++bctr);
#ifdef ZT_NO_TYPE_PUNNING
			for(unsigned int k=0;k<16;++k)
				*(o++) = *(i++) ^ ((uint8_t *)cenc)[k];
#else
			*((uint64_t *)o) = *((const uint64_t *)i) ^ cenc[0];
			o += 8; i += 8;
			*((uint64_t *)o) = *((const uint64_t *)i) ^ cenc[1];
			o += 8; i += 8;
#endif
			len -= 16;
		}
		if (len) {
			_encryptSW((const uint8_t *)ctr,(uint8_t *)cenc);
			for(unsigned int k=0;k<len;++k)
				*(o++) = *(i++) ^ ((uint8_t *)cenc)[k];
		}
	}

private:
	void _initSW(const uint8_t key[32]);
	void _encryptSW(const uint8_t in[16],uint8_t out[16]) const;
	void _gmacSW(const uint8_t iv[12],const uint8_t *in,unsigned int len,uint8_t out[16]) const;

#ifdef ZT_AES_AESNI
	void _init_aesni(const uint8_t key[32]);
	void _encrypt_aesni(const uint8_t in[16],uint8_t out[16]) const;
	void _gmac_aesni(const uint8_t iv[12],const uint8_t *in,unsigned int len,uint8_t out[16]) const;

	inline void _crypt_ctr_aesni(const uint8_t iv[16],const uint8_t *in,unsigned int len,uint8_t *out) const
	{
		/* The key-schedule member is not visible in this excerpt; "_k.ni.k[]" is an
		 * assumed name for the array of expanded AES-256 round keys (k0 .. k14). */
		const __m128i k0 = _k.ni.k[0],k1 = _k.ni.k[1],k2 = _k.ni.k[2],k3 = _k.ni.k[3];
		const __m128i k4 = _k.ni.k[4],k5 = _k.ni.k[5],k6 = _k.ni.k[6],k7 = _k.ni.k[7];
		const __m128i k8 = _k.ni.k[8],k9 = _k.ni.k[9],k10 = _k.ni.k[10],k11 = _k.ni.k[11];
		const __m128i k12 = _k.ni.k[12],k13 = _k.ni.k[13],k14 = _k.ni.k[14];

		/* Split the 128-bit big-endian counter block: the first 8 IV bytes (iv0) never
		 * change, the last 8 bytes are the 64-bit counter that gets incremented. */
		const __m64 iv0 = (__m64)(*((const uint64_t *)iv));
		uint64_t ctr = Utils::ntoh(*((const uint64_t *)(iv + 8)));

		/* Four blocks per iteration: four counter blocks run through the AES-256
		 * rounds in parallel, then the keystream is XORed with the input. */
		while (len >= 64) {
			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr),iv0),k0);
			__m128i c1 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+1ULL)),iv0),k0);
			__m128i c2 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+2ULL)),iv0),k0);
			__m128i c3 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton((uint64_t)(ctr+3ULL)),iv0),k0);
			ctr += 4;
			c0 = _mm_aesenc_si128(c0,k1); c1 = _mm_aesenc_si128(c1,k1); c2 = _mm_aesenc_si128(c2,k1); c3 = _mm_aesenc_si128(c3,k1);
			c0 = _mm_aesenc_si128(c0,k2); c1 = _mm_aesenc_si128(c1,k2); c2 = _mm_aesenc_si128(c2,k2); c3 = _mm_aesenc_si128(c3,k2);
			c0 = _mm_aesenc_si128(c0,k3); c1 = _mm_aesenc_si128(c1,k3); c2 = _mm_aesenc_si128(c2,k3); c3 = _mm_aesenc_si128(c3,k3);
			c0 = _mm_aesenc_si128(c0,k4); c1 = _mm_aesenc_si128(c1,k4); c2 = _mm_aesenc_si128(c2,k4); c3 = _mm_aesenc_si128(c3,k4);
			c0 = _mm_aesenc_si128(c0,k5); c1 = _mm_aesenc_si128(c1,k5); c2 = _mm_aesenc_si128(c2,k5); c3 = _mm_aesenc_si128(c3,k5);
			c0 = _mm_aesenc_si128(c0,k6); c1 = _mm_aesenc_si128(c1,k6); c2 = _mm_aesenc_si128(c2,k6); c3 = _mm_aesenc_si128(c3,k6);
			c0 = _mm_aesenc_si128(c0,k7); c1 = _mm_aesenc_si128(c1,k7); c2 = _mm_aesenc_si128(c2,k7); c3 = _mm_aesenc_si128(c3,k7);
			c0 = _mm_aesenc_si128(c0,k8); c1 = _mm_aesenc_si128(c1,k8); c2 = _mm_aesenc_si128(c2,k8); c3 = _mm_aesenc_si128(c3,k8);
			c0 = _mm_aesenc_si128(c0,k9); c1 = _mm_aesenc_si128(c1,k9); c2 = _mm_aesenc_si128(c2,k9); c3 = _mm_aesenc_si128(c3,k9);
			c0 = _mm_aesenc_si128(c0,k10); c1 = _mm_aesenc_si128(c1,k10); c2 = _mm_aesenc_si128(c2,k10); c3 = _mm_aesenc_si128(c3,k10);
			c0 = _mm_aesenc_si128(c0,k11); c1 = _mm_aesenc_si128(c1,k11); c2 = _mm_aesenc_si128(c2,k11); c3 = _mm_aesenc_si128(c3,k11);
			c0 = _mm_aesenc_si128(c0,k12); c1 = _mm_aesenc_si128(c1,k12); c2 = _mm_aesenc_si128(c2,k12); c3 = _mm_aesenc_si128(c3,k12);
			c0 = _mm_aesenc_si128(c0,k13); c1 = _mm_aesenc_si128(c1,k13); c2 = _mm_aesenc_si128(c2,k13); c3 = _mm_aesenc_si128(c3,k13);
			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,k14)));
			_mm_storeu_si128((__m128i *)(out + 16),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 16)),_mm_aesenclast_si128(c1,k14)));
			_mm_storeu_si128((__m128i *)(out + 32),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 32)),_mm_aesenclast_si128(c2,k14)));
			_mm_storeu_si128((__m128i *)(out + 48),_mm_xor_si128(_mm_loadu_si128((const __m128i *)(in + 48)),_mm_aesenclast_si128(c3,k14)));
			in += 64;
			out += 64;
			len -= 64;
		}

		while (len >= 16) {
			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),k0);
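			/* Single blocks: the counter block (already XORed with k0) goes through
			 * rounds k1..k13 here and the final round (k14) below, producing one
			 * 16-byte keystream block that is XORed with the input. */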
			c0 = _mm_aesenc_si128(c0,k1);
			c0 = _mm_aesenc_si128(c0,k2);
			c0 = _mm_aesenc_si128(c0,k3);
			c0 = _mm_aesenc_si128(c0,k4);
			c0 = _mm_aesenc_si128(c0,k5);
			c0 = _mm_aesenc_si128(c0,k6);
			c0 = _mm_aesenc_si128(c0,k7);
			c0 = _mm_aesenc_si128(c0,k8);
			c0 = _mm_aesenc_si128(c0,k9);
			c0 = _mm_aesenc_si128(c0,k10);
			c0 = _mm_aesenc_si128(c0,k11);
			c0 = _mm_aesenc_si128(c0,k12);
			c0 = _mm_aesenc_si128(c0,k13);
			_mm_storeu_si128((__m128i *)out,_mm_xor_si128(_mm_loadu_si128((const __m128i *)in),_mm_aesenclast_si128(c0,k14)));
			in += 16;
			out += 16;
			len -= 16;
		}

		if (len) {
			__m128i c0 = _mm_xor_si128(_mm_set_epi64((__m64)Utils::hton(ctr++),(__m64)iv0),k0);
			c0 = _mm_aesenc_si128(c0,k1);
			c0 = _mm_aesenc_si128(c0,k2);
			c0 = _mm_aesenc_si128(c0,k3);
			c0 = _mm_aesenc_si128(c0,k4);
			c0 = _mm_aesenc_si128(c0,k5);
			c0 = _mm_aesenc_si128(c0,k6);
			c0 = _mm_aesenc_si128(c0,k7);
			c0 = _mm_aesenc_si128(c0,k8);
			c0 = _mm_aesenc_si128(c0,k9);
			c0 = _mm_aesenc_si128(c0,k10);
			c0 = _mm_aesenc_si128(c0,k11);
			c0 = _mm_aesenc_si128(c0,k12);
			c0 = _mm_aesenc_si128(c0,k13);
			c0 = _mm_aesenclast_si128(c0,k14);
			for(unsigned int i=0;i