diff --git a/Makefile b/Makefile index d7dba4231..71a86e51d 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,15 @@ # Common makefile -- loads make rules for each platform BUILDDIR := build +CMAKE_OPTS := -DCMAKE_BUILD_TYPE=Release .PHONY: all all: - mkdir -p ${BUILDDIR} && cd ${BUILDDIR} && cmake .. && $(MAKE) + mkdir -p ${BUILDDIR} && cd ${BUILDDIR} && cmake .. ${CMAKE_OPTS} && $(MAKE) clean: rm -rf ${BUILDDIR} + +distclean: + rm -rf ${BUILDDIR} diff --git a/node/AES.hpp b/node/AES.hpp index 71979a961..1d7be9d53 100644 --- a/node/AES.hpp +++ b/node/AES.hpp @@ -57,15 +57,12 @@ public: inline AES() {} inline AES(const uint8_t key[32]) { this->init(key); } - inline ~AES() - { - Utils::burn(&_k,sizeof(_k)); - } + inline ~AES() { Utils::burn(&_k,sizeof(_k)); } inline void init(const uint8_t key[32]) { #ifdef ZT_AES_AESNI - if (HW_ACCEL) { + if (likely(HW_ACCEL)) { _init_aesni(key); return; } @@ -76,7 +73,7 @@ public: inline void encrypt(const uint8_t in[16],uint8_t out[16]) const { #ifdef ZT_AES_AESNI - if (HW_ACCEL) { + if (likely(HW_ACCEL)) { _encrypt_aesni(in,out); return; } @@ -84,10 +81,53 @@ public: _encryptSW(in,out); } + inline void ecbEncrypt(const void *in,unsigned int inlen,void *out) + { + if (inlen < 16) + return; +#ifdef ZT_AES_AESNI + if (likely(HW_ACCEL)) { + const uint8_t *i = (const uint8_t *)in; + uint8_t *o = (uint8_t *)out; + while (inlen >= 128) { + _encrypt_8xecb_aesni(i,o); + i += 128; + o += 128; + inlen -= 128; + } + while (inlen >= 16) { + _encrypt_aesni(i,o); + i += 16; + o += 16; + inlen -= 16; + } + if (inlen != 0) { + i -= (16 - inlen); + o -= (16 - inlen); + _encrypt_aesni(i,o); + } + return; + } +#endif + const uint8_t *i = (const uint8_t *)in; + uint8_t *o = (uint8_t *)out; + while (inlen >= 16) { + _encryptSW(i,o); + i += 16; + o += 16; + inlen -= 16; + } + if (inlen != 0) { + i -= (16 - inlen); + o -= (16 - inlen); + _encryptSW(i,o); + } + } + inline void gcmEncrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,uint8_t *tag,unsigned int taglen) { #ifdef ZT_AES_AESNI - if (HW_ACCEL) { + if (likely(HW_ACCEL)) { _encrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tag,taglen); return; } @@ -98,7 +138,7 @@ public: inline bool gcmDecrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,const uint8_t *tag,unsigned int taglen) { #ifdef ZT_AES_AESNI - if (HW_ACCEL) { + if (likely(HW_ACCEL)) { uint8_t tagbuf[16]; _decrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tagbuf,taglen); return Utils::secureEq(tagbuf,tag,taglen); @@ -218,6 +258,160 @@ private: tmp = _mm_aesenc_si128(tmp,_k.ni.k[13]); _mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni.k[14])); } + inline void _encrypt_8xecb_aesni(const void *in,void *out) const + { + __m128i tmp0 = _mm_loadu_si128((const __m128i *)in); + __m128i tmp1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 16)); + __m128i tmp2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 32)); + __m128i tmp3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 48)); + __m128i tmp4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 64)); + __m128i tmp5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 80)); + __m128i tmp6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 96)); + __m128i tmp7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 112)); + { + __m128i k0 = _k.ni.k[0]; + __m128i k1 = _k.ni.k[1]; + __m128i k2 = _k.ni.k[2]; + __m128i k3 = _k.ni.k[3]; + tmp0 = _mm_xor_si128(tmp0,k0); + tmp1 = _mm_xor_si128(tmp1,k0); + tmp2 = _mm_xor_si128(tmp2,k0); + tmp3 = _mm_xor_si128(tmp3,k0); + tmp4 = _mm_xor_si128(tmp4,k0); + tmp5 = _mm_xor_si128(tmp5,k0); + tmp6 = _mm_xor_si128(tmp6,k0); + tmp7 = _mm_xor_si128(tmp7,k0); + tmp0 = _mm_aesenc_si128(tmp0,k1); + tmp1 = _mm_aesenc_si128(tmp1,k1); + tmp2 = _mm_aesenc_si128(tmp2,k1); + tmp3 = _mm_aesenc_si128(tmp3,k1); + tmp4 = _mm_aesenc_si128(tmp4,k1); + tmp5 = _mm_aesenc_si128(tmp5,k1); + tmp6 = _mm_aesenc_si128(tmp6,k1); + tmp7 = _mm_aesenc_si128(tmp7,k1); + tmp0 = _mm_aesenc_si128(tmp0,k2); + tmp1 = _mm_aesenc_si128(tmp1,k2); + tmp2 = _mm_aesenc_si128(tmp2,k2); + tmp3 = _mm_aesenc_si128(tmp3,k2); + tmp4 = _mm_aesenc_si128(tmp4,k2); + tmp5 = _mm_aesenc_si128(tmp5,k2); + tmp6 = _mm_aesenc_si128(tmp6,k2); + tmp7 = _mm_aesenc_si128(tmp7,k2); + tmp0 = _mm_aesenc_si128(tmp0,k3); + tmp1 = _mm_aesenc_si128(tmp1,k3); + tmp2 = _mm_aesenc_si128(tmp2,k3); + tmp3 = _mm_aesenc_si128(tmp3,k3); + tmp4 = _mm_aesenc_si128(tmp4,k3); + tmp5 = _mm_aesenc_si128(tmp5,k3); + tmp6 = _mm_aesenc_si128(tmp6,k3); + tmp7 = _mm_aesenc_si128(tmp7,k3); + } + { + __m128i k4 = _k.ni.k[4]; + __m128i k5 = _k.ni.k[5]; + __m128i k6 = _k.ni.k[6]; + __m128i k7 = _k.ni.k[7]; + tmp0 = _mm_aesenc_si128(tmp0,k4); + tmp1 = _mm_aesenc_si128(tmp1,k4); + tmp2 = _mm_aesenc_si128(tmp2,k4); + tmp3 = _mm_aesenc_si128(tmp3,k4); + tmp4 = _mm_aesenc_si128(tmp4,k4); + tmp5 = _mm_aesenc_si128(tmp5,k4); + tmp6 = _mm_aesenc_si128(tmp6,k4); + tmp7 = _mm_aesenc_si128(tmp7,k4); + tmp0 = _mm_aesenc_si128(tmp0,k5); + tmp1 = _mm_aesenc_si128(tmp1,k5); + tmp2 = _mm_aesenc_si128(tmp2,k5); + tmp3 = _mm_aesenc_si128(tmp3,k5); + tmp4 = _mm_aesenc_si128(tmp4,k5); + tmp5 = _mm_aesenc_si128(tmp5,k5); + tmp6 = _mm_aesenc_si128(tmp6,k5); + tmp7 = _mm_aesenc_si128(tmp7,k5); + tmp0 = _mm_aesenc_si128(tmp0,k6); + tmp1 = _mm_aesenc_si128(tmp1,k6); + tmp2 = _mm_aesenc_si128(tmp2,k6); + tmp3 = _mm_aesenc_si128(tmp3,k6); + tmp4 = _mm_aesenc_si128(tmp4,k6); + tmp5 = _mm_aesenc_si128(tmp5,k6); + tmp6 = _mm_aesenc_si128(tmp6,k6); + tmp7 = _mm_aesenc_si128(tmp7,k6); + tmp0 = _mm_aesenc_si128(tmp0,k7); + tmp1 = _mm_aesenc_si128(tmp1,k7); + tmp2 = _mm_aesenc_si128(tmp2,k7); + tmp3 = _mm_aesenc_si128(tmp3,k7); + tmp4 = _mm_aesenc_si128(tmp4,k7); + tmp5 = _mm_aesenc_si128(tmp5,k7); + tmp6 = _mm_aesenc_si128(tmp6,k7); + tmp7 = _mm_aesenc_si128(tmp7,k7); + } + { + __m128i k8 = _k.ni.k[8]; + __m128i k9 = _k.ni.k[9]; + __m128i k10 = _k.ni.k[10]; + __m128i k11 = _k.ni.k[11]; + tmp0 = _mm_aesenc_si128(tmp0,k8); + tmp1 = _mm_aesenc_si128(tmp1,k8); + tmp2 = _mm_aesenc_si128(tmp2,k8); + tmp3 = _mm_aesenc_si128(tmp3,k8); + tmp4 = _mm_aesenc_si128(tmp4,k8); + tmp5 = _mm_aesenc_si128(tmp5,k8); + tmp6 = _mm_aesenc_si128(tmp6,k8); + tmp7 = _mm_aesenc_si128(tmp7,k8); + tmp0 = _mm_aesenc_si128(tmp0,k9); + tmp1 = _mm_aesenc_si128(tmp1,k9); + tmp2 = _mm_aesenc_si128(tmp2,k9); + tmp3 = _mm_aesenc_si128(tmp3,k9); + tmp4 = _mm_aesenc_si128(tmp4,k9); + tmp5 = _mm_aesenc_si128(tmp5,k9); + tmp6 = _mm_aesenc_si128(tmp6,k9); + tmp7 = _mm_aesenc_si128(tmp7,k9); + tmp0 = _mm_aesenc_si128(tmp0,k10); + tmp1 = _mm_aesenc_si128(tmp1,k10); + tmp2 = _mm_aesenc_si128(tmp2,k10); + tmp3 = _mm_aesenc_si128(tmp3,k10); + tmp4 = _mm_aesenc_si128(tmp4,k10); + tmp5 = _mm_aesenc_si128(tmp5,k10); + tmp6 = _mm_aesenc_si128(tmp6,k10); + tmp7 = _mm_aesenc_si128(tmp7,k10); + tmp0 = _mm_aesenc_si128(tmp0,k11); + tmp1 = _mm_aesenc_si128(tmp1,k11); + tmp2 = _mm_aesenc_si128(tmp2,k11); + tmp3 = _mm_aesenc_si128(tmp3,k11); + tmp4 = _mm_aesenc_si128(tmp4,k11); + tmp5 = _mm_aesenc_si128(tmp5,k11); + tmp6 = _mm_aesenc_si128(tmp6,k11); + tmp7 = _mm_aesenc_si128(tmp7,k11); + } + { + __m128i k12 = _k.ni.k[12]; + __m128i k13 = _k.ni.k[13]; + __m128i k14 = _k.ni.k[14]; + tmp0 = _mm_aesenc_si128(tmp0,k12); + tmp1 = _mm_aesenc_si128(tmp1,k12); + tmp2 = _mm_aesenc_si128(tmp2,k12); + tmp3 = _mm_aesenc_si128(tmp3,k12); + tmp4 = _mm_aesenc_si128(tmp4,k12); + tmp5 = _mm_aesenc_si128(tmp5,k12); + tmp6 = _mm_aesenc_si128(tmp6,k12); + tmp7 = _mm_aesenc_si128(tmp7,k12); + tmp0 = _mm_aesenc_si128(tmp0,k13); + tmp1 = _mm_aesenc_si128(tmp1,k13); + tmp2 = _mm_aesenc_si128(tmp2,k13); + tmp3 = _mm_aesenc_si128(tmp3,k13); + tmp4 = _mm_aesenc_si128(tmp4,k13); + tmp5 = _mm_aesenc_si128(tmp5,k13); + tmp6 = _mm_aesenc_si128(tmp6,k13); + tmp7 = _mm_aesenc_si128(tmp7,k13); + _mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp0,k14)); + _mm_storeu_si128((__m128i *)((uint8_t *)out + 16),_mm_aesenclast_si128(tmp1,k14)); + _mm_storeu_si128((__m128i *)((uint8_t *)out + 32),_mm_aesenclast_si128(tmp2,k14)); + _mm_storeu_si128((__m128i *)((uint8_t *)out + 48),_mm_aesenclast_si128(tmp3,k14)); + _mm_storeu_si128((__m128i *)((uint8_t *)out + 64),_mm_aesenclast_si128(tmp4,k14)); + _mm_storeu_si128((__m128i *)((uint8_t *)out + 80),_mm_aesenclast_si128(tmp5,k14)); + _mm_storeu_si128((__m128i *)((uint8_t *)out + 96),_mm_aesenclast_si128(tmp6,k14)); + _mm_storeu_si128((__m128i *)((uint8_t *)out + 112),_mm_aesenclast_si128(tmp7,k14)); + } + } static inline __m128i _swap128_aesni(__m128i x) { return _mm_shuffle_epi8(x,_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)); } static inline __m128i _mult_block_aesni(__m128i h,__m128i y) @@ -458,28 +652,16 @@ private: __m128i *bi = (__m128i *)in; __m128i *bo = (__m128i *)out; - __m128i k0 = _k.ni.k[0]; - __m128i k1 = _k.ni.k[1]; - __m128i k2 = _k.ni.k[2]; - __m128i k3 = _k.ni.k[3]; - __m128i k4 = _k.ni.k[4]; - __m128i k5 = _k.ni.k[5]; - __m128i k6 = _k.ni.k[6]; - __m128i k7 = _k.ni.k[7]; - __m128i k8 = _k.ni.k[8]; - __m128i k9 = _k.ni.k[9]; - __m128i k10 = _k.ni.k[10]; - __m128i k11 = _k.ni.k[11]; - __m128i k12 = _k.ni.k[12]; - __m128i k13 = _k.ni.k[13]; - __m128i k14 = _k.ni.k[14]; - unsigned int i; for (i=0;i @@ -50,37 +43,6 @@ namespace ZeroTier { -/************************************************************************** */ - -/* Set up macros for fast single-pass ASM Salsa20/12 crypto, if we have it */ - -// x64 SSE crypto -#ifdef ZT_USE_X64_ASM_SALSA2012 -#define ZT_HAS_FAST_CRYPTO() (true) -#define ZT_FAST_SINGLE_PASS_SALSA2012(b,l,n,k) zt_salsa2012_amd64_xmm6(reinterpret_cast(b),(l),reinterpret_cast(n),reinterpret_cast(k)) -#endif - -// ARM (32-bit) NEON crypto (must be detected) -#ifdef ZT_USE_ARM32_NEON_ASM_SALSA2012 -class _FastCryptoChecker -{ -public: - _FastCryptoChecker() : canHas(zt_arm_has_neon()) {} - bool canHas; -}; -static const _FastCryptoChecker _ZT_FAST_CRYPTO_CHECK; -#define ZT_HAS_FAST_CRYPTO() (_ZT_FAST_CRYPTO_CHECK.canHas) -#define ZT_FAST_SINGLE_PASS_SALSA2012(b,l,n,k) zt_salsa2012_armneon3_xor(reinterpret_cast(b),(const unsigned char *)0,(l),reinterpret_cast(n),reinterpret_cast(k)) -#endif - -// No fast crypto available -#ifndef ZT_HAS_FAST_CRYPTO -#define ZT_HAS_FAST_CRYPTO() (false) -#define ZT_FAST_SINGLE_PASS_SALSA2012(b,l,n,k) {} -#endif - -/************************************************************************** */ - /* LZ4 is shipped encapsulated into Packet in an anonymous namespace. * * We're doing this as a deliberate workaround for various Linux distribution @@ -899,30 +861,16 @@ void Packet::armor(const void *key,bool encryptPayload) _salsa20MangleKey((const unsigned char *)key,mangledKey); - if (ZT_HAS_FAST_CRYPTO()) { - const unsigned int encryptLen = (encryptPayload) ? (size() - ZT_PACKET_IDX_VERB) : 0; - uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8]; - ZT_FAST_SINGLE_PASS_SALSA2012(keyStream,encryptLen + 64,(data + ZT_PACKET_IDX_IV),mangledKey); - Salsa20::memxor(data + ZT_PACKET_IDX_VERB,reinterpret_cast(keyStream + 8),encryptLen); - uint64_t mac[2]; - poly1305(mac,data + ZT_PACKET_IDX_VERB,size() - ZT_PACKET_IDX_VERB,keyStream); -#ifdef ZT_NO_TYPE_PUNNING - memcpy(data + ZT_PACKET_IDX_MAC,mac,8); -#else - (*reinterpret_cast(data + ZT_PACKET_IDX_MAC)) = mac[0]; -#endif - } else { - Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV); - uint64_t macKey[4]; - s20.crypt12(ZERO_KEY,macKey,sizeof(macKey)); - uint8_t *const payload = data + ZT_PACKET_IDX_VERB; - const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB; - if (encryptPayload) - s20.crypt12(payload,payload,payloadLen); - uint64_t mac[2]; - poly1305(mac,payload,payloadLen,macKey); - memcpy(data + ZT_PACKET_IDX_MAC,mac,8); - } + Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV); + uint64_t macKey[4]; + s20.crypt12(ZERO_KEY,macKey,sizeof(macKey)); + uint8_t *const payload = data + ZT_PACKET_IDX_VERB; + const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB; + if (encryptPayload) + s20.crypt12(payload,payload,payloadLen); + uint64_t mac[2]; + poly1305(mac,payload,payloadLen,macKey); + memcpy(data + ZT_PACKET_IDX_MAC,mac,8); } bool Packet::dearmor(const void *key) @@ -935,37 +883,20 @@ bool Packet::dearmor(const void *key) if ((cs == ZT_PROTO_CIPHER_SUITE__POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012)) { _salsa20MangleKey((const unsigned char *)key,mangledKey); - if (ZT_HAS_FAST_CRYPTO()) { - uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8]; - ZT_FAST_SINGLE_PASS_SALSA2012(keyStream,((cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012) ? (payloadLen + 64) : 64),(data + ZT_PACKET_IDX_IV),mangledKey); - uint64_t mac[2]; - poly1305(mac,payload,payloadLen,keyStream); + Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV); + uint64_t macKey[4]; + s20.crypt12(ZERO_KEY,macKey,sizeof(macKey)); + uint64_t mac[2]; + poly1305(mac,payload,payloadLen,macKey); #ifdef ZT_NO_TYPE_PUNNING - if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8)) - return false; + if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8)) + return false; #else - if ((*reinterpret_cast(data + ZT_PACKET_IDX_MAC)) != mac[0]) // also secure, constant time - return false; + if ((*reinterpret_cast(data + ZT_PACKET_IDX_MAC)) != mac[0]) // also secure, constant time + return false; #endif - if (cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012) - Salsa20::memxor(data + ZT_PACKET_IDX_VERB,reinterpret_cast(keyStream + 8),payloadLen); - } else { - Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV); - uint64_t macKey[4]; - s20.crypt12(ZERO_KEY,macKey,sizeof(macKey)); - uint64_t mac[2]; - poly1305(mac,payload,payloadLen,macKey); -#ifdef ZT_NO_TYPE_PUNNING - if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8)) - return false; -#else - if ((*reinterpret_cast(data + ZT_PACKET_IDX_MAC)) != mac[0]) // also secure, constant time - return false; -#endif - if (cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012) - s20.crypt12(payload,payload,payloadLen); - } - + if (cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012) + s20.crypt12(payload,payload,payloadLen); return true; } else { return false; // unrecognized cipher suite diff --git a/node/Salsa20.hpp b/node/Salsa20.hpp index 26413d840..b80d11427 100644 --- a/node/Salsa20.hpp +++ b/node/Salsa20.hpp @@ -34,82 +34,6 @@ public: inline Salsa20() {} inline ~Salsa20() { Utils::burn(&_state,sizeof(_state)); } - /** - * XOR d with s - * - * This is done efficiently using e.g. SSE if available. It's used when - * alternative Salsa20 implementations are used in Packet and is here - * since this is where all the SSE stuff is already included. - * - * @param d Destination to XOR - * @param s Source bytes to XOR with destination - * @param len Length of s and d - */ - static inline void memxor(uint8_t *d,const uint8_t *s,unsigned int len) - { -#ifdef ZT_SALSA20_SSE - while (len >= 128) { - __m128i s0 = _mm_loadu_si128(reinterpret_cast(s)); - __m128i s1 = _mm_loadu_si128(reinterpret_cast(s + 16)); - __m128i s2 = _mm_loadu_si128(reinterpret_cast(s + 32)); - __m128i s3 = _mm_loadu_si128(reinterpret_cast(s + 48)); - __m128i s4 = _mm_loadu_si128(reinterpret_cast(s + 64)); - __m128i s5 = _mm_loadu_si128(reinterpret_cast(s + 80)); - __m128i s6 = _mm_loadu_si128(reinterpret_cast(s + 96)); - __m128i s7 = _mm_loadu_si128(reinterpret_cast(s + 112)); - __m128i d0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d)); - __m128i d1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 16)); - __m128i d2 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 32)); - __m128i d3 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 48)); - __m128i d4 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 64)); - __m128i d5 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 80)); - __m128i d6 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 96)); - __m128i d7 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 112)); - d0 = _mm_xor_si128(d0,s0); - d1 = _mm_xor_si128(d1,s1); - d2 = _mm_xor_si128(d2,s2); - d3 = _mm_xor_si128(d3,s3); - d4 = _mm_xor_si128(d4,s4); - d5 = _mm_xor_si128(d5,s5); - d6 = _mm_xor_si128(d6,s6); - d7 = _mm_xor_si128(d7,s7); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d),d0); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 16),d1); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 32),d2); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 48),d3); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 64),d4); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 80),d5); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 96),d6); - _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 112),d7); - s += 128; - d += 128; - len -= 128; - } - while (len >= 16) { - _mm_storeu_si128(reinterpret_cast<__m128i *>(d),_mm_xor_si128(_mm_loadu_si128(reinterpret_cast<__m128i *>(d)),_mm_loadu_si128(reinterpret_cast(s)))); - s += 16; - d += 16; - len -= 16; - } -#else -#ifndef ZT_NO_TYPE_PUNNING - while (len >= 16) { - (*reinterpret_cast(d)) ^= (*reinterpret_cast(s)); - s += 8; - d += 8; - (*reinterpret_cast(d)) ^= (*reinterpret_cast(s)); - s += 8; - d += 8; - len -= 16; - } -#endif -#endif - while (len) { - --len; - *(d++) ^= *(s++); - } - } - /** * @param key 256-bit (32 byte) key * @param iv 64-bit initialization vector diff --git a/selftest.cpp b/selftest.cpp index f9c7a7502..876ce7c0a 100644 --- a/selftest.cpp +++ b/selftest.cpp @@ -209,13 +209,36 @@ static int testCrypto() } double gcmBytes = 0.0; int64_t start = OSUtils::now(); - for(unsigned long i=0;i<150000;++i) { + for(unsigned long i=0;i<100000;++i) { tv.gcmEncrypt((const uint8_t *)hexbuf,buf1,sizeof(buf1),nullptr,0,buf2,(uint8_t *)(hexbuf + 32),16); - gcmBytes += (double)sizeof(buf1); + tv.gcmEncrypt((const uint8_t *)hexbuf,buf2,sizeof(buf2),nullptr,0,buf1,(uint8_t *)(hexbuf + 32),16); + gcmBytes += (double)(sizeof(buf1) * 2); } int64_t end = OSUtils::now(); *dummy = buf1[0]; - std::cout << ((gcmBytes / 1048576.0) / ((long double)(end - start) / 1000.0)) << " MiB/second" << std::endl; + std::cout << ((gcmBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl << " AES-256 ECB scramble (benchmark): "; std::cout.flush(); + double ecbBytes = 0.0; + start = OSUtils::now(); + for(unsigned long i=0;i<100000;++i) { + tv.ecbEncrypt(buf1,sizeof(buf1),buf2); + tv.ecbEncrypt(buf2,sizeof(buf1),buf1); + ecbBytes += (double)(sizeof(buf1) * 2); + } + end = OSUtils::now(); + *dummy = buf1[0]; + std::cout << ((ecbBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl << " AES-256 GCM + ECB scramble (benchmark): "; std::cout.flush(); + ecbBytes = 0.0; + start = OSUtils::now(); + for(unsigned long i=0;i<100000;++i) { + tv.gcmEncrypt((const uint8_t *)hexbuf,buf1,sizeof(buf1),nullptr,0,buf2,(uint8_t *)(hexbuf + 32),16); + tv.ecbEncrypt(buf1,sizeof(buf1),buf2); + tv.gcmEncrypt((const uint8_t *)hexbuf,buf2,sizeof(buf2),nullptr,0,buf1,(uint8_t *)(hexbuf + 32),16); + tv.ecbEncrypt(buf2,sizeof(buf1),buf1); + ecbBytes += (double)(sizeof(buf1) * 2); + } + end = OSUtils::now(); + *dummy = buf1[0]; + std::cout << ((ecbBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl; std::cout << "[crypto] Testing Salsa20... "; std::cout.flush(); for(unsigned int i=0;i<4;++i) { @@ -275,42 +298,6 @@ static int testCrypto() ::free((void *)bb); } -#ifdef ZT_USE_X64_ASM_SALSA2012 - std::cout << "[crypto] Benchmarking Salsa20/12 fast x64 ASM... "; std::cout.flush(); - { - unsigned char *bb = (unsigned char *)::malloc(1234567); - double bytes = 0.0; - uint64_t start = OSUtils::now(); - for(unsigned int i=0;i<200;++i) { - zt_salsa2012_amd64_xmm6(bb,1234567,s20TV0Iv,s20TV0Key); - bytes += 1234567.0; - } - uint64_t end = OSUtils::now(); - *dummy = bb[0]; - std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl; - ::free((void *)bb); - } -#endif - -#ifdef ZT_USE_ARM32_NEON_ASM_SALSA2012 - if (zt_arm_has_neon()) { - std::cout << "[crypto] Benchmarking Salsa20/12 fast arm32/neon ASM... "; std::cout.flush(); - { - unsigned char *bb = (unsigned char *)::malloc(1234567); - double bytes = 0.0; - uint64_t start = OSUtils::now(); - for(unsigned int i=0;i<200;++i) { - zt_salsa2012_armneon3_xor(bb,(const unsigned char *)0,1234567,s20TV0Iv,s20TV0Key); - bytes += 1234567.0; - } - uint64_t end = OSUtils::now(); - *dummy = bb[0]; - std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl; - ::free((void *)bb); - } - } -#endif - std::cout << "[crypto] Benchmarking Salsa20/20... "; std::cout.flush(); { unsigned char *bb = (unsigned char *)::malloc(1234567);