mirror of
https://github.com/zerotier/ZeroTierOne.git
synced 2025-01-19 03:06:26 +00:00
Remove ASM Salsa20 since it will not be the default in 2.x any more... reduce build complexity.
This commit is contained in:
parent
2b681c37ac
commit
51a25fdec9
6
Makefile
6
Makefile
@ -1,11 +1,15 @@
|
||||
# Common makefile -- loads make rules for each platform
|
||||
|
||||
BUILDDIR := build
|
||||
CMAKE_OPTS := -DCMAKE_BUILD_TYPE=Release
|
||||
|
||||
.PHONY: all
|
||||
|
||||
all:
|
||||
mkdir -p ${BUILDDIR} && cd ${BUILDDIR} && cmake .. && $(MAKE)
|
||||
mkdir -p ${BUILDDIR} && cd ${BUILDDIR} && cmake .. ${CMAKE_OPTS} && $(MAKE)
|
||||
|
||||
clean:
|
||||
rm -rf ${BUILDDIR}
|
||||
|
||||
distclean:
|
||||
rm -rf ${BUILDDIR}
|
||||
|
256
node/AES.hpp
256
node/AES.hpp
@ -57,15 +57,12 @@ public:
|
||||
inline AES() {}
|
||||
inline AES(const uint8_t key[32]) { this->init(key); }
|
||||
|
||||
inline ~AES()
|
||||
{
|
||||
Utils::burn(&_k,sizeof(_k));
|
||||
}
|
||||
inline ~AES() { Utils::burn(&_k,sizeof(_k)); }
|
||||
|
||||
inline void init(const uint8_t key[32])
|
||||
{
|
||||
#ifdef ZT_AES_AESNI
|
||||
if (HW_ACCEL) {
|
||||
if (likely(HW_ACCEL)) {
|
||||
_init_aesni(key);
|
||||
return;
|
||||
}
|
||||
@ -76,7 +73,7 @@ public:
|
||||
inline void encrypt(const uint8_t in[16],uint8_t out[16]) const
|
||||
{
|
||||
#ifdef ZT_AES_AESNI
|
||||
if (HW_ACCEL) {
|
||||
if (likely(HW_ACCEL)) {
|
||||
_encrypt_aesni(in,out);
|
||||
return;
|
||||
}
|
||||
@ -84,10 +81,53 @@ public:
|
||||
_encryptSW(in,out);
|
||||
}
|
||||
|
||||
inline void ecbEncrypt(const void *in,unsigned int inlen,void *out)
|
||||
{
|
||||
if (inlen < 16)
|
||||
return;
|
||||
#ifdef ZT_AES_AESNI
|
||||
if (likely(HW_ACCEL)) {
|
||||
const uint8_t *i = (const uint8_t *)in;
|
||||
uint8_t *o = (uint8_t *)out;
|
||||
while (inlen >= 128) {
|
||||
_encrypt_8xecb_aesni(i,o);
|
||||
i += 128;
|
||||
o += 128;
|
||||
inlen -= 128;
|
||||
}
|
||||
while (inlen >= 16) {
|
||||
_encrypt_aesni(i,o);
|
||||
i += 16;
|
||||
o += 16;
|
||||
inlen -= 16;
|
||||
}
|
||||
if (inlen != 0) {
|
||||
i -= (16 - inlen);
|
||||
o -= (16 - inlen);
|
||||
_encrypt_aesni(i,o);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
const uint8_t *i = (const uint8_t *)in;
|
||||
uint8_t *o = (uint8_t *)out;
|
||||
while (inlen >= 16) {
|
||||
_encryptSW(i,o);
|
||||
i += 16;
|
||||
o += 16;
|
||||
inlen -= 16;
|
||||
}
|
||||
if (inlen != 0) {
|
||||
i -= (16 - inlen);
|
||||
o -= (16 - inlen);
|
||||
_encryptSW(i,o);
|
||||
}
|
||||
}
|
||||
|
||||
inline void gcmEncrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,uint8_t *tag,unsigned int taglen)
|
||||
{
|
||||
#ifdef ZT_AES_AESNI
|
||||
if (HW_ACCEL) {
|
||||
if (likely(HW_ACCEL)) {
|
||||
_encrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tag,taglen);
|
||||
return;
|
||||
}
|
||||
@ -98,7 +138,7 @@ public:
|
||||
inline bool gcmDecrypt(const uint8_t iv[12],const void *in,unsigned int inlen,const void *assoc,unsigned int assoclen,void *out,const uint8_t *tag,unsigned int taglen)
|
||||
{
|
||||
#ifdef ZT_AES_AESNI
|
||||
if (HW_ACCEL) {
|
||||
if (likely(HW_ACCEL)) {
|
||||
uint8_t tagbuf[16];
|
||||
_decrypt_gcm256_aesni(inlen,(const uint8_t *)in,(uint8_t *)out,iv,assoclen,(const uint8_t *)assoc,tagbuf,taglen);
|
||||
return Utils::secureEq(tagbuf,tag,taglen);
|
||||
@ -218,6 +258,160 @@ private:
|
||||
tmp = _mm_aesenc_si128(tmp,_k.ni.k[13]);
|
||||
_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp,_k.ni.k[14]));
|
||||
}
|
||||
inline void _encrypt_8xecb_aesni(const void *in,void *out) const
|
||||
{
|
||||
__m128i tmp0 = _mm_loadu_si128((const __m128i *)in);
|
||||
__m128i tmp1 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 16));
|
||||
__m128i tmp2 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 32));
|
||||
__m128i tmp3 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 48));
|
||||
__m128i tmp4 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 64));
|
||||
__m128i tmp5 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 80));
|
||||
__m128i tmp6 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 96));
|
||||
__m128i tmp7 = _mm_loadu_si128((const __m128i *)((const uint8_t *)in + 112));
|
||||
{
|
||||
__m128i k0 = _k.ni.k[0];
|
||||
__m128i k1 = _k.ni.k[1];
|
||||
__m128i k2 = _k.ni.k[2];
|
||||
__m128i k3 = _k.ni.k[3];
|
||||
tmp0 = _mm_xor_si128(tmp0,k0);
|
||||
tmp1 = _mm_xor_si128(tmp1,k0);
|
||||
tmp2 = _mm_xor_si128(tmp2,k0);
|
||||
tmp3 = _mm_xor_si128(tmp3,k0);
|
||||
tmp4 = _mm_xor_si128(tmp4,k0);
|
||||
tmp5 = _mm_xor_si128(tmp5,k0);
|
||||
tmp6 = _mm_xor_si128(tmp6,k0);
|
||||
tmp7 = _mm_xor_si128(tmp7,k0);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k1);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k1);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k1);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k1);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k1);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k1);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k1);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k1);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k2);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k2);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k2);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k2);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k2);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k2);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k2);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k2);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k3);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k3);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k3);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k3);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k3);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k3);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k3);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k3);
|
||||
}
|
||||
{
|
||||
__m128i k4 = _k.ni.k[4];
|
||||
__m128i k5 = _k.ni.k[5];
|
||||
__m128i k6 = _k.ni.k[6];
|
||||
__m128i k7 = _k.ni.k[7];
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k4);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k4);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k4);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k4);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k4);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k4);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k4);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k4);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k5);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k5);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k5);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k5);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k5);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k5);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k5);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k5);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k6);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k6);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k6);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k6);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k6);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k6);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k6);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k6);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k7);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k7);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k7);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k7);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k7);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k7);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k7);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k7);
|
||||
}
|
||||
{
|
||||
__m128i k8 = _k.ni.k[8];
|
||||
__m128i k9 = _k.ni.k[9];
|
||||
__m128i k10 = _k.ni.k[10];
|
||||
__m128i k11 = _k.ni.k[11];
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k8);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k8);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k8);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k8);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k8);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k8);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k8);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k8);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k9);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k9);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k9);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k9);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k9);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k9);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k9);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k9);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k10);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k10);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k10);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k10);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k10);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k10);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k10);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k10);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k11);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k11);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k11);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k11);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k11);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k11);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k11);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k11);
|
||||
}
|
||||
{
|
||||
__m128i k12 = _k.ni.k[12];
|
||||
__m128i k13 = _k.ni.k[13];
|
||||
__m128i k14 = _k.ni.k[14];
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k12);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k12);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k12);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k12);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k12);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k12);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k12);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k12);
|
||||
tmp0 = _mm_aesenc_si128(tmp0,k13);
|
||||
tmp1 = _mm_aesenc_si128(tmp1,k13);
|
||||
tmp2 = _mm_aesenc_si128(tmp2,k13);
|
||||
tmp3 = _mm_aesenc_si128(tmp3,k13);
|
||||
tmp4 = _mm_aesenc_si128(tmp4,k13);
|
||||
tmp5 = _mm_aesenc_si128(tmp5,k13);
|
||||
tmp6 = _mm_aesenc_si128(tmp6,k13);
|
||||
tmp7 = _mm_aesenc_si128(tmp7,k13);
|
||||
_mm_storeu_si128((__m128i *)out,_mm_aesenclast_si128(tmp0,k14));
|
||||
_mm_storeu_si128((__m128i *)((uint8_t *)out + 16),_mm_aesenclast_si128(tmp1,k14));
|
||||
_mm_storeu_si128((__m128i *)((uint8_t *)out + 32),_mm_aesenclast_si128(tmp2,k14));
|
||||
_mm_storeu_si128((__m128i *)((uint8_t *)out + 48),_mm_aesenclast_si128(tmp3,k14));
|
||||
_mm_storeu_si128((__m128i *)((uint8_t *)out + 64),_mm_aesenclast_si128(tmp4,k14));
|
||||
_mm_storeu_si128((__m128i *)((uint8_t *)out + 80),_mm_aesenclast_si128(tmp5,k14));
|
||||
_mm_storeu_si128((__m128i *)((uint8_t *)out + 96),_mm_aesenclast_si128(tmp6,k14));
|
||||
_mm_storeu_si128((__m128i *)((uint8_t *)out + 112),_mm_aesenclast_si128(tmp7,k14));
|
||||
}
|
||||
}
|
||||
|
||||
static inline __m128i _swap128_aesni(__m128i x) { return _mm_shuffle_epi8(x,_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)); }
|
||||
static inline __m128i _mult_block_aesni(__m128i h,__m128i y)
|
||||
@ -458,28 +652,16 @@ private:
|
||||
__m128i *bi = (__m128i *)in;
|
||||
__m128i *bo = (__m128i *)out;
|
||||
|
||||
__m128i k0 = _k.ni.k[0];
|
||||
__m128i k1 = _k.ni.k[1];
|
||||
__m128i k2 = _k.ni.k[2];
|
||||
__m128i k3 = _k.ni.k[3];
|
||||
__m128i k4 = _k.ni.k[4];
|
||||
__m128i k5 = _k.ni.k[5];
|
||||
__m128i k6 = _k.ni.k[6];
|
||||
__m128i k7 = _k.ni.k[7];
|
||||
__m128i k8 = _k.ni.k[8];
|
||||
__m128i k9 = _k.ni.k[9];
|
||||
__m128i k10 = _k.ni.k[10];
|
||||
__m128i k11 = _k.ni.k[11];
|
||||
__m128i k12 = _k.ni.k[12];
|
||||
__m128i k13 = _k.ni.k[13];
|
||||
__m128i k14 = _k.ni.k[14];
|
||||
|
||||
unsigned int i;
|
||||
for (i=0;i<pblocks;i+=4) {
|
||||
__m128i d1 = _mm_loadu_si128(bi + i + 0);
|
||||
__m128i d2 = _mm_loadu_si128(bi + i + 1);
|
||||
__m128i d3 = _mm_loadu_si128(bi + i + 2);
|
||||
__m128i d4 = _mm_loadu_si128(bi + i + 3);
|
||||
__m128i k0 = _k.ni.k[0];
|
||||
__m128i k1 = _k.ni.k[1];
|
||||
__m128i k2 = _k.ni.k[2];
|
||||
__m128i k3 = _k.ni.k[3];
|
||||
__m128i t1 = _mm_xor_si128(cb,k0);
|
||||
cb = _increment_be_aesni(cb);
|
||||
__m128i t2 = _mm_xor_si128(cb,k0);
|
||||
@ -500,6 +682,10 @@ private:
|
||||
t2 = _mm_aesenc_si128(t2,k3);
|
||||
t3 = _mm_aesenc_si128(t3,k3);
|
||||
t4 = _mm_aesenc_si128(t4,k3);
|
||||
__m128i k4 = _k.ni.k[4];
|
||||
__m128i k5 = _k.ni.k[5];
|
||||
__m128i k6 = _k.ni.k[6];
|
||||
__m128i k7 = _k.ni.k[7];
|
||||
t1 = _mm_aesenc_si128(t1,k4);
|
||||
t2 = _mm_aesenc_si128(t2,k4);
|
||||
t3 = _mm_aesenc_si128(t3,k4);
|
||||
@ -516,6 +702,10 @@ private:
|
||||
t2 = _mm_aesenc_si128(t2,k7);
|
||||
t3 = _mm_aesenc_si128(t3,k7);
|
||||
t4 = _mm_aesenc_si128(t4,k7);
|
||||
__m128i k8 = _k.ni.k[8];
|
||||
__m128i k9 = _k.ni.k[9];
|
||||
__m128i k10 = _k.ni.k[10];
|
||||
__m128i k11 = _k.ni.k[11];
|
||||
t1 = _mm_aesenc_si128(t1,k8);
|
||||
t2 = _mm_aesenc_si128(t2,k8);
|
||||
t3 = _mm_aesenc_si128(t3,k8);
|
||||
@ -532,6 +722,9 @@ private:
|
||||
t2 = _mm_aesenc_si128(t2,k11);
|
||||
t3 = _mm_aesenc_si128(t3,k11);
|
||||
t4 = _mm_aesenc_si128(t4,k11);
|
||||
__m128i k12 = _k.ni.k[12];
|
||||
__m128i k13 = _k.ni.k[13];
|
||||
__m128i k14 = _k.ni.k[14];
|
||||
t1 = _mm_aesenc_si128(t1,k12);
|
||||
t2 = _mm_aesenc_si128(t2,k12);
|
||||
t3 = _mm_aesenc_si128(t3,k12);
|
||||
@ -558,18 +751,33 @@ private:
|
||||
|
||||
for (i=pblocks;i<blocks;++i) {
|
||||
__m128i d1 = _mm_loadu_si128(bi + i);
|
||||
__m128i k0 = _k.ni.k[0];
|
||||
__m128i k1 = _k.ni.k[1];
|
||||
__m128i k2 = _k.ni.k[2];
|
||||
__m128i k3 = _k.ni.k[3];
|
||||
__m128i t1 = _mm_xor_si128(cb,k0);
|
||||
t1 = _mm_aesenc_si128(t1,k1);
|
||||
t1 = _mm_aesenc_si128(t1,k2);
|
||||
t1 = _mm_aesenc_si128(t1,k3);
|
||||
__m128i k4 = _k.ni.k[4];
|
||||
__m128i k5 = _k.ni.k[5];
|
||||
__m128i k6 = _k.ni.k[6];
|
||||
__m128i k7 = _k.ni.k[7];
|
||||
t1 = _mm_aesenc_si128(t1,k4);
|
||||
t1 = _mm_aesenc_si128(t1,k5);
|
||||
t1 = _mm_aesenc_si128(t1,k6);
|
||||
t1 = _mm_aesenc_si128(t1,k7);
|
||||
__m128i k8 = _k.ni.k[8];
|
||||
__m128i k9 = _k.ni.k[9];
|
||||
__m128i k10 = _k.ni.k[10];
|
||||
__m128i k11 = _k.ni.k[11];
|
||||
t1 = _mm_aesenc_si128(t1,k8);
|
||||
t1 = _mm_aesenc_si128(t1,k9);
|
||||
t1 = _mm_aesenc_si128(t1,k10);
|
||||
t1 = _mm_aesenc_si128(t1,k11);
|
||||
__m128i k12 = _k.ni.k[12];
|
||||
__m128i k13 = _k.ni.k[13];
|
||||
__m128i k14 = _k.ni.k[14];
|
||||
t1 = _mm_aesenc_si128(t1,k12);
|
||||
t1 = _mm_aesenc_si128(t1,k13);
|
||||
t1 = _mm_aesenclast_si128(t1,k14);
|
||||
|
@ -32,13 +32,6 @@
|
||||
|
||||
#include "Packet.hpp"
|
||||
|
||||
#ifdef ZT_USE_X64_ASM_SALSA2012
|
||||
#include "../ext/x64-salsa2012-asm/salsa2012.h"
|
||||
#endif
|
||||
#ifdef ZT_USE_ARM32_NEON_ASM_SALSA2012
|
||||
#include "../ext/arm32-neon-salsa2012-asm/salsa2012.h"
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define FORCE_INLINE static __forceinline
|
||||
#include <intrin.h>
|
||||
@ -50,37 +43,6 @@
|
||||
|
||||
namespace ZeroTier {
|
||||
|
||||
/************************************************************************** */
|
||||
|
||||
/* Set up macros for fast single-pass ASM Salsa20/12 crypto, if we have it */
|
||||
|
||||
// x64 SSE crypto
|
||||
#ifdef ZT_USE_X64_ASM_SALSA2012
|
||||
#define ZT_HAS_FAST_CRYPTO() (true)
|
||||
#define ZT_FAST_SINGLE_PASS_SALSA2012(b,l,n,k) zt_salsa2012_amd64_xmm6(reinterpret_cast<unsigned char *>(b),(l),reinterpret_cast<const unsigned char *>(n),reinterpret_cast<const unsigned char *>(k))
|
||||
#endif
|
||||
|
||||
// ARM (32-bit) NEON crypto (must be detected)
|
||||
#ifdef ZT_USE_ARM32_NEON_ASM_SALSA2012
|
||||
class _FastCryptoChecker
|
||||
{
|
||||
public:
|
||||
_FastCryptoChecker() : canHas(zt_arm_has_neon()) {}
|
||||
bool canHas;
|
||||
};
|
||||
static const _FastCryptoChecker _ZT_FAST_CRYPTO_CHECK;
|
||||
#define ZT_HAS_FAST_CRYPTO() (_ZT_FAST_CRYPTO_CHECK.canHas)
|
||||
#define ZT_FAST_SINGLE_PASS_SALSA2012(b,l,n,k) zt_salsa2012_armneon3_xor(reinterpret_cast<unsigned char *>(b),(const unsigned char *)0,(l),reinterpret_cast<const unsigned char *>(n),reinterpret_cast<const unsigned char *>(k))
|
||||
#endif
|
||||
|
||||
// No fast crypto available
|
||||
#ifndef ZT_HAS_FAST_CRYPTO
|
||||
#define ZT_HAS_FAST_CRYPTO() (false)
|
||||
#define ZT_FAST_SINGLE_PASS_SALSA2012(b,l,n,k) {}
|
||||
#endif
|
||||
|
||||
/************************************************************************** */
|
||||
|
||||
/* LZ4 is shipped encapsulated into Packet in an anonymous namespace.
|
||||
*
|
||||
* We're doing this as a deliberate workaround for various Linux distribution
|
||||
@ -899,19 +861,6 @@ void Packet::armor(const void *key,bool encryptPayload)
|
||||
|
||||
_salsa20MangleKey((const unsigned char *)key,mangledKey);
|
||||
|
||||
if (ZT_HAS_FAST_CRYPTO()) {
|
||||
const unsigned int encryptLen = (encryptPayload) ? (size() - ZT_PACKET_IDX_VERB) : 0;
|
||||
uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8];
|
||||
ZT_FAST_SINGLE_PASS_SALSA2012(keyStream,encryptLen + 64,(data + ZT_PACKET_IDX_IV),mangledKey);
|
||||
Salsa20::memxor(data + ZT_PACKET_IDX_VERB,reinterpret_cast<const uint8_t *>(keyStream + 8),encryptLen);
|
||||
uint64_t mac[2];
|
||||
poly1305(mac,data + ZT_PACKET_IDX_VERB,size() - ZT_PACKET_IDX_VERB,keyStream);
|
||||
#ifdef ZT_NO_TYPE_PUNNING
|
||||
memcpy(data + ZT_PACKET_IDX_MAC,mac,8);
|
||||
#else
|
||||
(*reinterpret_cast<uint64_t *>(data + ZT_PACKET_IDX_MAC)) = mac[0];
|
||||
#endif
|
||||
} else {
|
||||
Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV);
|
||||
uint64_t macKey[4];
|
||||
s20.crypt12(ZERO_KEY,macKey,sizeof(macKey));
|
||||
@ -923,7 +872,6 @@ void Packet::armor(const void *key,bool encryptPayload)
|
||||
poly1305(mac,payload,payloadLen,macKey);
|
||||
memcpy(data + ZT_PACKET_IDX_MAC,mac,8);
|
||||
}
|
||||
}
|
||||
|
||||
bool Packet::dearmor(const void *key)
|
||||
{
|
||||
@ -935,21 +883,6 @@ bool Packet::dearmor(const void *key)
|
||||
|
||||
if ((cs == ZT_PROTO_CIPHER_SUITE__POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012)) {
|
||||
_salsa20MangleKey((const unsigned char *)key,mangledKey);
|
||||
if (ZT_HAS_FAST_CRYPTO()) {
|
||||
uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8];
|
||||
ZT_FAST_SINGLE_PASS_SALSA2012(keyStream,((cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012) ? (payloadLen + 64) : 64),(data + ZT_PACKET_IDX_IV),mangledKey);
|
||||
uint64_t mac[2];
|
||||
poly1305(mac,payload,payloadLen,keyStream);
|
||||
#ifdef ZT_NO_TYPE_PUNNING
|
||||
if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8))
|
||||
return false;
|
||||
#else
|
||||
if ((*reinterpret_cast<const uint64_t *>(data + ZT_PACKET_IDX_MAC)) != mac[0]) // also secure, constant time
|
||||
return false;
|
||||
#endif
|
||||
if (cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012)
|
||||
Salsa20::memxor(data + ZT_PACKET_IDX_VERB,reinterpret_cast<const uint8_t *>(keyStream + 8),payloadLen);
|
||||
} else {
|
||||
Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV);
|
||||
uint64_t macKey[4];
|
||||
s20.crypt12(ZERO_KEY,macKey,sizeof(macKey));
|
||||
@ -964,8 +897,6 @@ bool Packet::dearmor(const void *key)
|
||||
#endif
|
||||
if (cs == ZT_PROTO_CIPHER_SUITE__POLY1305_SALSA2012)
|
||||
s20.crypt12(payload,payload,payloadLen);
|
||||
}
|
||||
|
||||
return true;
|
||||
} else {
|
||||
return false; // unrecognized cipher suite
|
||||
|
@ -34,82 +34,6 @@ public:
|
||||
inline Salsa20() {}
|
||||
inline ~Salsa20() { Utils::burn(&_state,sizeof(_state)); }
|
||||
|
||||
/**
 * XOR a source buffer into a destination buffer (d[i] ^= s[i])
 *
 * With ZT_SALSA20_SSE defined, data is handled in 128-byte and then
 * 16-byte chunks using unaligned SSE loads and stores. Otherwise, unless
 * ZT_NO_TYPE_PUNNING is defined, 16 bytes at a time are handled as two
 * 64-bit words. Whatever is left over is XORed byte by byte.
 *
 * @param d Destination buffer, modified in place
 * @param s Source bytes to XOR into the destination
 * @param len Number of bytes to process in both s and d
 */
static inline void memxor(uint8_t *d,const uint8_t *s,unsigned int len)
{
#ifdef ZT_SALSA20_SSE
	for (;len >= 128;len -= 128,s += 128,d += 128) {
		__m128i *const dv = reinterpret_cast<__m128i *>(d);
		const __m128i *const sv = reinterpret_cast<const __m128i *>(s);

		// Load and combine all eight lanes before storing any of them
		const __m128i x0 = _mm_xor_si128(_mm_loadu_si128(dv),_mm_loadu_si128(sv));
		const __m128i x1 = _mm_xor_si128(_mm_loadu_si128(dv + 1),_mm_loadu_si128(sv + 1));
		const __m128i x2 = _mm_xor_si128(_mm_loadu_si128(dv + 2),_mm_loadu_si128(sv + 2));
		const __m128i x3 = _mm_xor_si128(_mm_loadu_si128(dv + 3),_mm_loadu_si128(sv + 3));
		const __m128i x4 = _mm_xor_si128(_mm_loadu_si128(dv + 4),_mm_loadu_si128(sv + 4));
		const __m128i x5 = _mm_xor_si128(_mm_loadu_si128(dv + 5),_mm_loadu_si128(sv + 5));
		const __m128i x6 = _mm_xor_si128(_mm_loadu_si128(dv + 6),_mm_loadu_si128(sv + 6));
		const __m128i x7 = _mm_xor_si128(_mm_loadu_si128(dv + 7),_mm_loadu_si128(sv + 7));

		_mm_storeu_si128(dv,x0);
		_mm_storeu_si128(dv + 1,x1);
		_mm_storeu_si128(dv + 2,x2);
		_mm_storeu_si128(dv + 3,x3);
		_mm_storeu_si128(dv + 4,x4);
		_mm_storeu_si128(dv + 5,x5);
		_mm_storeu_si128(dv + 6,x6);
		_mm_storeu_si128(dv + 7,x7);
	}
	for (;len >= 16;len -= 16,s += 16,d += 16) {
		__m128i *const dv = reinterpret_cast<__m128i *>(d);
		const __m128i *const sv = reinterpret_cast<const __m128i *>(s);
		_mm_storeu_si128(dv,_mm_xor_si128(_mm_loadu_si128(dv),_mm_loadu_si128(sv)));
	}
#elif !defined(ZT_NO_TYPE_PUNNING)
	// Two 64-bit words per iteration, low word first
	for (;len >= 16;len -= 16,s += 16,d += 16) {
		uint64_t *const dq = reinterpret_cast<uint64_t *>(d);
		const uint64_t *const sq = reinterpret_cast<const uint64_t *>(s);
		dq[0] ^= sq[0];
		dq[1] ^= sq[1];
	}
#endif
	// Remaining bytes (everything, if neither wide path is compiled in)
	for (unsigned int j = 0;j < len;++j)
		d[j] ^= s[j];
}
|
||||
|
||||
/**
|
||||
* @param key 256-bit (32 byte) key
|
||||
* @param iv 64-bit initialization vector
|
||||
|
65
selftest.cpp
65
selftest.cpp
@ -209,13 +209,36 @@ static int testCrypto()
|
||||
}
|
||||
double gcmBytes = 0.0;
|
||||
int64_t start = OSUtils::now();
|
||||
for(unsigned long i=0;i<150000;++i) {
|
||||
for(unsigned long i=0;i<100000;++i) {
|
||||
tv.gcmEncrypt((const uint8_t *)hexbuf,buf1,sizeof(buf1),nullptr,0,buf2,(uint8_t *)(hexbuf + 32),16);
|
||||
gcmBytes += (double)sizeof(buf1);
|
||||
tv.gcmEncrypt((const uint8_t *)hexbuf,buf2,sizeof(buf2),nullptr,0,buf1,(uint8_t *)(hexbuf + 32),16);
|
||||
gcmBytes += (double)(sizeof(buf1) * 2);
|
||||
}
|
||||
int64_t end = OSUtils::now();
|
||||
*dummy = buf1[0];
|
||||
std::cout << ((gcmBytes / 1048576.0) / ((long double)(end - start) / 1000.0)) << " MiB/second" << std::endl;
|
||||
std::cout << ((gcmBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl << " AES-256 ECB scramble (benchmark): "; std::cout.flush();
|
||||
double ecbBytes = 0.0;
|
||||
start = OSUtils::now();
|
||||
for(unsigned long i=0;i<100000;++i) {
|
||||
tv.ecbEncrypt(buf1,sizeof(buf1),buf2);
|
||||
tv.ecbEncrypt(buf2,sizeof(buf1),buf1);
|
||||
ecbBytes += (double)(sizeof(buf1) * 2);
|
||||
}
|
||||
end = OSUtils::now();
|
||||
*dummy = buf1[0];
|
||||
std::cout << ((ecbBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl << " AES-256 GCM + ECB scramble (benchmark): "; std::cout.flush();
|
||||
ecbBytes = 0.0;
|
||||
start = OSUtils::now();
|
||||
for(unsigned long i=0;i<100000;++i) {
|
||||
tv.gcmEncrypt((const uint8_t *)hexbuf,buf1,sizeof(buf1),nullptr,0,buf2,(uint8_t *)(hexbuf + 32),16);
|
||||
tv.ecbEncrypt(buf1,sizeof(buf1),buf2);
|
||||
tv.gcmEncrypt((const uint8_t *)hexbuf,buf2,sizeof(buf2),nullptr,0,buf1,(uint8_t *)(hexbuf + 32),16);
|
||||
tv.ecbEncrypt(buf2,sizeof(buf1),buf1);
|
||||
ecbBytes += (double)(sizeof(buf1) * 2);
|
||||
}
|
||||
end = OSUtils::now();
|
||||
*dummy = buf1[0];
|
||||
std::cout << ((ecbBytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl;
|
||||
|
||||
std::cout << "[crypto] Testing Salsa20... "; std::cout.flush();
|
||||
for(unsigned int i=0;i<4;++i) {
|
||||
@ -275,42 +298,6 @@ static int testCrypto()
|
||||
::free((void *)bb);
|
||||
}
|
||||
|
||||
#ifdef ZT_USE_X64_ASM_SALSA2012
|
||||
std::cout << "[crypto] Benchmarking Salsa20/12 fast x64 ASM... "; std::cout.flush();
|
||||
{
|
||||
unsigned char *bb = (unsigned char *)::malloc(1234567);
|
||||
double bytes = 0.0;
|
||||
uint64_t start = OSUtils::now();
|
||||
for(unsigned int i=0;i<200;++i) {
|
||||
zt_salsa2012_amd64_xmm6(bb,1234567,s20TV0Iv,s20TV0Key);
|
||||
bytes += 1234567.0;
|
||||
}
|
||||
uint64_t end = OSUtils::now();
|
||||
*dummy = bb[0];
|
||||
std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl;
|
||||
::free((void *)bb);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ZT_USE_ARM32_NEON_ASM_SALSA2012
|
||||
if (zt_arm_has_neon()) {
|
||||
std::cout << "[crypto] Benchmarking Salsa20/12 fast arm32/neon ASM... "; std::cout.flush();
|
||||
{
|
||||
unsigned char *bb = (unsigned char *)::malloc(1234567);
|
||||
double bytes = 0.0;
|
||||
uint64_t start = OSUtils::now();
|
||||
for(unsigned int i=0;i<200;++i) {
|
||||
zt_salsa2012_armneon3_xor(bb,(const unsigned char *)0,1234567,s20TV0Iv,s20TV0Key);
|
||||
bytes += 1234567.0;
|
||||
}
|
||||
uint64_t end = OSUtils::now();
|
||||
*dummy = bb[0];
|
||||
std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl;
|
||||
::free((void *)bb);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
std::cout << "[crypto] Benchmarking Salsa20/20... "; std::cout.flush();
|
||||
{
|
||||
unsigned char *bb = (unsigned char *)::malloc(1234567);
|
||||
|
Loading…
Reference in New Issue
Block a user