Mirror of https://github.com/zerotier/ZeroTierOne.git (synced 2024-12-20 13:33:07 +00:00)
A bit more optimization

commit f753519729 (parent f39693f97e)
@@ -63,7 +63,7 @@ ifeq ($(ZT_DEBUG),1)
 node/Salsa20.o node/SHA512.o node/C25519.o node/Poly1305.o node/AES.o: CFLAGS = -Wall -O2 -g -maes -mpclmul $(INCLUDES) $(DEFS)
 else
 CFLAGS?=-Ofast -fstack-protector-strong
-CFLAGS+=$(ARCH_FLAGS) -Wall -flto -fPIE -maes -msse -msse2 -mpclmul -mmacosx-version-min=10.9 -DNDEBUG -Wno-unused-private-field $(INCLUDES) $(DEFS)
+CFLAGS+=$(ARCH_FLAGS) -Wall -flto -fPIE -maes -msse -msse2 -msse3 -mpclmul -mmacosx-version-min=10.9 -DNDEBUG -Wno-unused-private-field $(INCLUDES) $(DEFS)
 STRIP=strip
 endif

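The only functional change in this hunk is the added -msse3 flag; the rest is context. On GCC and Clang, -msse3 defines __SSE3__ and makes the <pmmintrin.h> intrinsics available to the translation units built with these CFLAGS. A minimal illustration of what that unlocks (load_unaligned is a hypothetical helper for this note, not ZeroTier code):

#ifdef __SSE3__
#include <pmmintrin.h>

/* SSE3's lddqu is an unaligned 16-byte load that can beat movdqu when the
 * load straddles a cache line on some older microarchitectures. */
static inline __m128i load_unaligned(const void *p)
{
	return _mm_lddqu_si128((const __m128i *)p);
}
#endif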
node/AES.hpp (202 lines changed)
@@ -196,17 +196,6 @@ private:
 		_k.ni.hh = _swap128_aesni(hh);
 		_k.ni.hhh = _swap128_aesni(hhh);
 		_k.ni.hhhh = _swap128_aesni(hhhh);
-		/*
-		this->h = h;
-		h = swap128(h);
-		this->hh = mult_block(h, this->h);
-		this->hhh = mult_block(h, this->hh);
-		this->hhhh = mult_block(h, this->hhh);
-		this->h = swap128(this->h);
-		this->hh = swap128(this->hh);
-		this->hhh = swap128(this->hhh);
-		this->hhhh = swap128(this->hhhh);
-		*/
 	}

 	inline void _encrypt_aesni(const void *in,void *out) const
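The deleted block was already commented out: portable reference code for deriving the GHASH key powers H^2, H^3 and H^4 that the multi-block GHASH path folds with. As a hedged sketch of what that dead code computed, assuming a hypothetical gfmul() helper for carry-less GF(2^128) multiplication (the real path derives these values with _mm_clmulepi64_si128 plus a reduction step):

#include <wmmintrin.h>

/* Hypothetical helper: multiply in GF(2^128) under the GHASH reduction
 * polynomial. Declared only so the precompute below is concrete. */
__m128i gfmul(__m128i a, __m128i b);

struct GhashKeys { __m128i h, hh, hhh, hhhh; };

/* Precompute H^1..H^4 once per key so the bulk GHASH loop can fold four
 * ciphertext blocks per reduction instead of one. */
static GhashKeys precompute_ghash_keys(__m128i h)
{
	GhashKeys k;
	k.h = h;
	k.hh = gfmul(h, h);       /* H^2 */
	k.hhh = gfmul(h, k.hh);   /* H^3 */
	k.hhhh = gfmul(h, k.hhh); /* H^4 */
	return k;
}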
@@ -354,7 +343,7 @@ private:
 	static inline __m128i _increment_be_aesni(__m128i x)
 	{
 		x = _swap128_aesni(x);
-		x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
+		x = _mm_add_epi64(x,_mm_set_epi32(0,0,0,1));
 		x = _swap128_aesni(x);
 		return x;
 	}
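Only whitespace changed here, but the swap-add-swap shape is the point of the function: GCM stores its counter block big-endian while _mm_add_epi64 adds little-endian lanes. A self-contained sketch of the same pattern, with swap128/increment_be as stand-ins for the member functions and assuming SSSE3 for _mm_shuffle_epi8:

#include <emmintrin.h>
#include <tmmintrin.h>

/* Reverse all 16 bytes of a vector; stand-in for _swap128_aesni. */
static inline __m128i swap128(__m128i x)
{
	return _mm_shuffle_epi8(x,
		_mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15));
}

/* Increment a big-endian counter: convert to little-endian, add 1 to the
 * low 64-bit lane, convert back. (Carry out of the low lane is not
 * propagated, matching the original's behavior.) */
static inline __m128i increment_be(__m128i x)
{
	x = swap128(x);
	x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
	return swap128(x);
}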
@@ -460,88 +449,101 @@ private:
 	}

 	inline void _encrypt_gcm256_aesni(unsigned int len,const uint8_t *in,uint8_t *out,const uint8_t *iv,unsigned int alen,const uint8_t *assoc,uint8_t *icv,unsigned int icvsize) const
 	{
-		__m128i d1,d2,d3,d4,t1,t2,t3,t4,k;
-		__m128i y,j,cb,*bi,*bo;
-		j = _create_j_aesni(iv);
-		cb = _increment_be_aesni(j);
-		y = _icv_header_aesni(assoc,alen);
+		__m128i j = _create_j_aesni(iv);
+		__m128i cb = _increment_be_aesni(j);
+		__m128i y = _icv_header_aesni(assoc,alen);
 		unsigned int blocks = len / 16;
 		unsigned int pblocks = blocks - (blocks % 4);
 		unsigned int rem = len % 16;
-		bi = (__m128i *)in;
-		bo = (__m128i *)out;
+		__m128i *bi = (__m128i *)in;
+		__m128i *bo = (__m128i *)out;

+		__m128i k0 = _k.ni.k[0];
+		__m128i k1 = _k.ni.k[1];
+		__m128i k2 = _k.ni.k[2];
+		__m128i k3 = _k.ni.k[3];
+		__m128i k4 = _k.ni.k[4];
+		__m128i k5 = _k.ni.k[5];
+		__m128i k6 = _k.ni.k[6];
+		__m128i k7 = _k.ni.k[7];
+		__m128i k8 = _k.ni.k[8];
+		__m128i k9 = _k.ni.k[9];
+		__m128i k10 = _k.ni.k[10];
+		__m128i k11 = _k.ni.k[11];
+		__m128i k12 = _k.ni.k[12];
+		__m128i k13 = _k.ni.k[13];
+		__m128i k14 = _k.ni.k[14];

 		unsigned int i;
 		for (i=0;i<pblocks;i+=4) {
-			d1 = _mm_loadu_si128(bi + i + 0);
-			d2 = _mm_loadu_si128(bi + i + 1);
-			d3 = _mm_loadu_si128(bi + i + 2);
-			d4 = _mm_loadu_si128(bi + i + 3);
-			t1 = _mm_xor_si128(cb,k = _k.ni.k[0]);
+			__m128i d1 = _mm_loadu_si128(bi + i + 0);
+			__m128i d2 = _mm_loadu_si128(bi + i + 1);
+			__m128i d3 = _mm_loadu_si128(bi + i + 2);
+			__m128i d4 = _mm_loadu_si128(bi + i + 3);
+			__m128i t1 = _mm_xor_si128(cb,k0);
 			cb = _increment_be_aesni(cb);
-			t2 = _mm_xor_si128(cb,k);
+			__m128i t2 = _mm_xor_si128(cb,k0);
 			cb = _increment_be_aesni(cb);
-			t3 = _mm_xor_si128(cb,k);
+			__m128i t3 = _mm_xor_si128(cb,k0);
 			cb = _increment_be_aesni(cb);
-			t4 = _mm_xor_si128(cb,k);
+			__m128i t4 = _mm_xor_si128(cb,k0);
 			cb = _increment_be_aesni(cb);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[1]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k1);
+			t2 = _mm_aesenc_si128(t2,k1);
+			t3 = _mm_aesenc_si128(t3,k1);
+			t4 = _mm_aesenc_si128(t4,k1);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[2]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k2);
+			t2 = _mm_aesenc_si128(t2,k2);
+			t3 = _mm_aesenc_si128(t3,k2);
+			t4 = _mm_aesenc_si128(t4,k2);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[3]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k3);
+			t2 = _mm_aesenc_si128(t2,k3);
+			t3 = _mm_aesenc_si128(t3,k3);
+			t4 = _mm_aesenc_si128(t4,k3);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[4]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k4);
+			t2 = _mm_aesenc_si128(t2,k4);
+			t3 = _mm_aesenc_si128(t3,k4);
+			t4 = _mm_aesenc_si128(t4,k4);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[5]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k5);
+			t2 = _mm_aesenc_si128(t2,k5);
+			t3 = _mm_aesenc_si128(t3,k5);
+			t4 = _mm_aesenc_si128(t4,k5);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[6]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k6);
+			t2 = _mm_aesenc_si128(t2,k6);
+			t3 = _mm_aesenc_si128(t3,k6);
+			t4 = _mm_aesenc_si128(t4,k6);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[7]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k7);
+			t2 = _mm_aesenc_si128(t2,k7);
+			t3 = _mm_aesenc_si128(t3,k7);
+			t4 = _mm_aesenc_si128(t4,k7);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[8]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k8);
+			t2 = _mm_aesenc_si128(t2,k8);
+			t3 = _mm_aesenc_si128(t3,k8);
+			t4 = _mm_aesenc_si128(t4,k8);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[9]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k9);
+			t2 = _mm_aesenc_si128(t2,k9);
+			t3 = _mm_aesenc_si128(t3,k9);
+			t4 = _mm_aesenc_si128(t4,k9);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[10]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k10);
+			t2 = _mm_aesenc_si128(t2,k10);
+			t3 = _mm_aesenc_si128(t3,k10);
+			t4 = _mm_aesenc_si128(t4,k10);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[11]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k11);
+			t2 = _mm_aesenc_si128(t2,k11);
+			t3 = _mm_aesenc_si128(t3,k11);
+			t4 = _mm_aesenc_si128(t4,k11);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[12]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k12);
+			t2 = _mm_aesenc_si128(t2,k12);
+			t3 = _mm_aesenc_si128(t3,k12);
+			t4 = _mm_aesenc_si128(t4,k12);
-			t1 = _mm_aesenc_si128(t1,k = _k.ni.k[13]);
-			t2 = _mm_aesenc_si128(t2,k);
-			t3 = _mm_aesenc_si128(t3,k);
-			t4 = _mm_aesenc_si128(t4,k);
+			t1 = _mm_aesenc_si128(t1,k13);
+			t2 = _mm_aesenc_si128(t2,k13);
+			t3 = _mm_aesenc_si128(t3,k13);
+			t4 = _mm_aesenc_si128(t4,k13);
-			t1 = _mm_aesenclast_si128(t1,k = _k.ni.k[14]);
-			t2 = _mm_aesenclast_si128(t2,k);
-			t3 = _mm_aesenclast_si128(t3,k);
-			t4 = _mm_aesenclast_si128(t4,k);
+			t1 = _mm_aesenclast_si128(t1,k14);
+			t2 = _mm_aesenclast_si128(t2,k14);
+			t3 = _mm_aesenclast_si128(t3,k14);
+			t4 = _mm_aesenclast_si128(t4,k14);
 			t1 = _mm_xor_si128(t1,d1);
 			t2 = _mm_xor_si128(t2,d2);
 			t3 = _mm_xor_si128(t3,d3);
@@ -555,22 +557,22 @@ private:
 		}

 		for (i=pblocks;i<blocks;++i) {
-			d1 = _mm_loadu_si128(bi + i);
-			t1 = _mm_xor_si128(cb,_k.ni.k[0]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[1]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[2]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[3]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[4]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[5]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[6]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[7]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[8]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[9]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[10]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[11]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[12]);
-			t1 = _mm_aesenc_si128(t1,_k.ni.k[13]);
-			t1 = _mm_aesenclast_si128(t1,_k.ni.k[14]);
+			__m128i d1 = _mm_loadu_si128(bi + i);
+			__m128i t1 = _mm_xor_si128(cb,k0);
+			t1 = _mm_aesenc_si128(t1,k1);
+			t1 = _mm_aesenc_si128(t1,k2);
+			t1 = _mm_aesenc_si128(t1,k3);
+			t1 = _mm_aesenc_si128(t1,k4);
+			t1 = _mm_aesenc_si128(t1,k5);
+			t1 = _mm_aesenc_si128(t1,k6);
+			t1 = _mm_aesenc_si128(t1,k7);
+			t1 = _mm_aesenc_si128(t1,k8);
+			t1 = _mm_aesenc_si128(t1,k9);
+			t1 = _mm_aesenc_si128(t1,k10);
+			t1 = _mm_aesenc_si128(t1,k11);
+			t1 = _mm_aesenc_si128(t1,k12);
+			t1 = _mm_aesenc_si128(t1,k13);
+			t1 = _mm_aesenclast_si128(t1,k14);
 			t1 = _mm_xor_si128(t1,d1);
 			_mm_storeu_si128(bo + i,t1);
 			y = _ghash_aesni(_k.ni.h,y,t1);
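The common thread in these hunks: instead of copying each round key through the temporary k in the four-block loop, or re-reading _k.ni.k[n] per block in the tail loop, all fifteen AES-256 round keys are hoisted into the locals k0..k14 once, so the compiler can keep the key schedule in xmm registers across both loops. A condensed sketch of the resulting shape, assuming rk[] is a 15-entry expanded AES-256 key schedule (illustrative only, not the actual ZeroTier code):

#include <emmintrin.h>
#include <wmmintrin.h>

/* Encrypt four independent AES-256 blocks in lockstep. Each round key is
 * read once per round and shared by all four lanes. */
static void aes256_encrypt4(__m128i b[4], const __m128i rk[15])
{
	for (int lane = 0; lane < 4; ++lane)
		b[lane] = _mm_xor_si128(b[lane], rk[0]);         /* initial whitening */
	for (int r = 1; r < 14; ++r) {                       /* 13 middle rounds */
		__m128i kr = rk[r];                              /* one load per round */
		for (int lane = 0; lane < 4; ++lane)
			b[lane] = _mm_aesenc_si128(b[lane], kr);
	}
	for (int lane = 0; lane < 4; ++lane)
		b[lane] = _mm_aesenclast_si128(b[lane], rk[14]); /* final round */
}

Running four counter blocks per iteration, as the main loop above does, lets the multi-cycle latency of each aesenc overlap across independent lanes; an optimizing compiler should unroll the lane loops in this sketch into an interleaved sequence much like the diff's.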