Incorporate the x64 ASM version of Salsa20/12 on x64 platforms. This gives us, for example, 1.5 GB/sec encryption on a 2.8 GHz Core i5.

This commit is contained in:
Adam Ierymenko 2017-04-18 08:45:37 -07:00
parent 4938e82795
commit a1e94154be
6 changed files with 4597 additions and 4 deletions

View File

@ -0,0 +1,6 @@
Blazingly fast X64 ASM implementation of Salsa20/12
======
This is taken from the [cnacl](https://github.com/cjdelisle/cnacl) source. The actual code is by Daniel J. Bernstein and is in the public domain.
This is included on Linux and Mac 64-bit builds and is significantly faster than the SSE intrinsics or C versions. It is used for packet encode/decode only, since its usage differs a bit from the regular Salsa20 C++ class. Specifically, it cannot be called incrementally on multiple blocks; instead it takes a key and a single stream to encrypt, and that's it.

View File

@ -0,0 +1,13 @@
#ifdef __cplusplus
extern "C" {
#endif

// Salsa20/12 key stream generator.
//   out    - output buffer that receives the key stream
//   outlen - number of bytes to write to out
//   nonce  - nonce / IV (length not stated here; presumably the 8-byte Salsa20 nonce -- confirm)
//   key    - 256-bit / 32-byte key
// NOTE(review): the meaning of the int return value is not documented in this header; confirm against the assembly source.
extern int zt_salsa2012_amd64_xmm6(unsigned char *out,unsigned long long outlen,const unsigned char *nonce,const unsigned char *key);

// Salsa20/12 XOR of a message with the key stream.
//   ciphertext - output buffer
//   message    - input buffer
//   mlen       - message length in bytes
//   nonce      - nonce / IV (same caveat as above)
//   key        - key (presumably 32 bytes like the function above -- confirm)
extern int zt_salsa2012_amd64_xmm6_xor(unsigned char *ciphertext,const unsigned char *message,unsigned long long mlen,const unsigned char *nonce,const unsigned char *key);

#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -37,6 +37,10 @@ ifeq ($(ZT_ENABLE_CLUSTER),1)
DEFS+=-DZT_ENABLE_CLUSTER DEFS+=-DZT_ENABLE_CLUSTER
endif endif
# Use fast ASM Salsa20/12 for x64 processors
DEFS+=-DZT_USE_X64_ASM_SALSA2012
OBJS+=ext/x64-salsa2012-asm/salsa2012.o
# Build miniupnpc and nat-pmp as included libraries -- extra defs are required for these sources # Build miniupnpc and nat-pmp as included libraries -- extra defs are required for these sources
DEFS+=-DMACOSX -DZT_USE_MINIUPNPC -DMINIUPNP_STATICLIB -D_DARWIN_C_SOURCE -DMINIUPNPC_SET_SOCKET_TIMEOUT -DMINIUPNPC_GET_SRC_ADDR -D_BSD_SOURCE -D_DEFAULT_SOURCE -DOS_STRING=\"Darwin/15.0.0\" -DMINIUPNPC_VERSION_STRING=\"2.0\" -DUPNP_VERSION_STRING=\"UPnP/1.1\" -DENABLE_STRNATPMPERR DEFS+=-DMACOSX -DZT_USE_MINIUPNPC -DMINIUPNP_STATICLIB -D_DARWIN_C_SOURCE -DMINIUPNPC_SET_SOCKET_TIMEOUT -DMINIUPNPC_GET_SRC_ADDR -D_BSD_SOURCE -D_DEFAULT_SOURCE -DOS_STRING=\"Darwin/15.0.0\" -DMINIUPNPC_VERSION_STRING=\"2.0\" -DUPNP_VERSION_STRING=\"UPnP/1.1\" -DENABLE_STRNATPMPERR
OBJS+=ext/libnatpmp/natpmp.o ext/libnatpmp/getgateway.o ext/miniupnpc/connecthostport.o ext/miniupnpc/igd_desc_parse.o ext/miniupnpc/minisoap.o ext/miniupnpc/minissdpc.o ext/miniupnpc/miniupnpc.o ext/miniupnpc/miniwget.o ext/miniupnpc/minixml.o ext/miniupnpc/portlistingparse.o ext/miniupnpc/receivedata.o ext/miniupnpc/upnpcommands.o ext/miniupnpc/upnpdev.o ext/miniupnpc/upnperrors.o ext/miniupnpc/upnpreplyparse.o osdep/PortMapper.o OBJS+=ext/libnatpmp/natpmp.o ext/libnatpmp/getgateway.o ext/miniupnpc/connecthostport.o ext/miniupnpc/igd_desc_parse.o ext/miniupnpc/minisoap.o ext/miniupnpc/minissdpc.o ext/miniupnpc/miniupnpc.o ext/miniupnpc/miniwget.o ext/miniupnpc/minixml.o ext/miniupnpc/portlistingparse.o ext/miniupnpc/receivedata.o ext/miniupnpc/upnpcommands.o ext/miniupnpc/upnpdev.o ext/miniupnpc/upnperrors.o ext/miniupnpc/upnpreplyparse.o osdep/PortMapper.o
@ -57,6 +61,9 @@ endif
CXXFLAGS=$(CFLAGS) -std=c++11 -stdlib=libc++ CXXFLAGS=$(CFLAGS) -std=c++11 -stdlib=libc++
ext/x64-salsa2012-asm/salsa2012.o:
$(CC) $(CFLAGS) -c ext/x64-salsa2012-asm/salsa2012.s -o ext/x64-salsa2012-asm/salsa2012.o
all: one macui all: one macui
one: $(OBJS) service/OneService.o one.o one: $(OBJS) service/OneService.o one.o

View File

@ -24,6 +24,10 @@
#include "Packet.hpp" #include "Packet.hpp"
#ifdef ZT_USE_X64_ASM_SALSA2012
#include "../ext/x64-salsa2012-asm/salsa2012.h"
#endif
#ifdef _MSC_VER #ifdef _MSC_VER
#define FORCE_INLINE static __forceinline #define FORCE_INLINE static __forceinline
#include <intrin.h> #include <intrin.h>
@ -1064,7 +1068,7 @@ const char *Packet::errorString(ErrorCode e)
void Packet::armor(const void *key,bool encryptPayload,unsigned int counter) void Packet::armor(const void *key,bool encryptPayload,unsigned int counter)
{ {
uint8_t mangledKey[32],macKey[32],mac[16]; uint8_t mangledKey[32];
uint8_t *const data = reinterpret_cast<uint8_t *>(unsafeData()); uint8_t *const data = reinterpret_cast<uint8_t *>(unsafeData());
// Mask least significant 3 bits of packet ID with counter to embed packet send counter for QoS use // Mask least significant 3 bits of packet ID with counter to embed packet send counter for QoS use
@ -1074,23 +1078,47 @@ void Packet::armor(const void *key,bool encryptPayload,unsigned int counter)
setCipher(encryptPayload ? ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012 : ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE); setCipher(encryptPayload ? ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012 : ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE);
_salsa20MangleKey((const unsigned char *)key,mangledKey); _salsa20MangleKey((const unsigned char *)key,mangledKey);
#ifdef ZT_USE_X64_ASM_SALSA2012
// Fast path: generate the key stream with a single call to the x64 ASM routine and apply it by hand below.
// When encryptPayload is false, payloadLen is 0, so only the leading 64 bytes of key stream are requested and the XOR loops do nothing.
const unsigned int payloadLen = (encryptPayload) ? (size() - ZT_PACKET_IDX_VERB) : 0;
// Room for a maximum-length packet plus the leading 64-byte block; the extra + 8 keeps the integer division from truncating the buffer short
uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8];
zt_salsa2012_amd64_xmm6(reinterpret_cast<unsigned char *>(keyStream),payloadLen + 64,reinterpret_cast<const unsigned char *>(data + ZT_PACKET_IDX_IV),reinterpret_cast<const unsigned char *>(mangledKey));
// Skip 8 x uint64_t = 64 bytes: the start of the key stream is handed to Poly1305 as its key further down
uint64_t *ksptr = keyStream + 8; // encryption starts after first Salsa20 block
uint8_t *dptr = data + ZT_PACKET_IDX_VERB;
unsigned int ksrem = payloadLen;
// XOR the payload with the key stream 8 bytes at a time
// NOTE(review): dptr is cast to uint64_t * without an alignment check; presumably acceptable because this path is x64-only -- confirm
while (ksrem >= 8) {
ksrem -= 8;
*(reinterpret_cast<uint64_t *>(dptr)) ^= *(ksptr++);
dptr += 8;
}
// XOR the remaining 0-7 tail bytes one at a time
for(unsigned int i=0;i<ksrem;++i) {
dptr[i] ^= reinterpret_cast<const uint8_t *>(ksptr)[i];
}
// The MAC is computed after the XOR above, over everything from ZT_PACKET_IDX_VERB to the end of the packet; only its first 8 bytes are stored
uint64_t mac[2];
Poly1305::compute(mac,data + ZT_PACKET_IDX_VERB,size() - ZT_PACKET_IDX_VERB,keyStream);
memcpy(data + ZT_PACKET_IDX_MAC,mac,8);
#else
Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV); Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV);
// MAC key is always the first 32 bytes of the Salsa20 key stream uint64_t macKey[4];
// This is the same construction DJB's NaCl library uses
s20.crypt12(ZERO_KEY,macKey,sizeof(macKey)); s20.crypt12(ZERO_KEY,macKey,sizeof(macKey));
uint8_t *const payload = data + ZT_PACKET_IDX_VERB; uint8_t *const payload = data + ZT_PACKET_IDX_VERB;
const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB; const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB;
if (encryptPayload) if (encryptPayload)
s20.crypt12(payload,payload,payloadLen); s20.crypt12(payload,payload,payloadLen);
uint64_t mac[2];
Poly1305::compute(mac,payload,payloadLen,macKey); Poly1305::compute(mac,payload,payloadLen,macKey);
memcpy(data + ZT_PACKET_IDX_MAC,mac,8); memcpy(data + ZT_PACKET_IDX_MAC,mac,8);
#endif
} }
bool Packet::dearmor(const void *key) bool Packet::dearmor(const void *key)
{ {
uint8_t mangledKey[32],macKey[32],mac[16]; uint8_t mangledKey[32];
uint8_t *const data = reinterpret_cast<uint8_t *>(unsafeData()); uint8_t *const data = reinterpret_cast<uint8_t *>(unsafeData());
const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB; const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB;
unsigned char *const payload = data + ZT_PACKET_IDX_VERB; unsigned char *const payload = data + ZT_PACKET_IDX_VERB;
@ -1098,9 +1126,37 @@ bool Packet::dearmor(const void *key)
if ((cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)) { if ((cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)) {
_salsa20MangleKey((const unsigned char *)key,mangledKey); _salsa20MangleKey((const unsigned char *)key,mangledKey);
#ifdef ZT_USE_X64_ASM_SALSA2012
// Fast path: generate the key stream with a single call to the x64 ASM routine.
// For the unencrypted cipher suite only the leading 64 bytes are requested, since they are used solely as the Poly1305 key below.
// Room for a maximum-length packet plus the leading 64-byte block; the extra + 8 keeps the integer division from truncating the buffer short
uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8];
zt_salsa2012_amd64_xmm6(reinterpret_cast<unsigned char *>(keyStream),((cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012) ? (payloadLen + 64) : 64),reinterpret_cast<const unsigned char *>(data + ZT_PACKET_IDX_IV),reinterpret_cast<const unsigned char *>(mangledKey));
// Authenticate before decrypting: the MAC is computed over the payload as received and its first 8 bytes are compared with the packet's MAC field
uint64_t mac[2];
Poly1305::compute(mac,payload,payloadLen,keyStream);
if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8))
return false; // MAC failed, packet is corrupt, modified, or is not from the sender
// Only the encrypted cipher suite needs the key stream XORed back out of the payload
if (cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012) {
// Skip 8 x uint64_t = 64 bytes, the part of the key stream handed to Poly1305 above
uint64_t *ksptr = keyStream + 8; // encryption starts after first Salsa20 block
uint8_t *dptr = data + ZT_PACKET_IDX_VERB;
unsigned int ksrem = payloadLen;
// XOR 8 bytes at a time
// NOTE(review): dptr is cast to uint64_t * without an alignment check; presumably acceptable because this path is x64-only -- confirm
while (ksrem >= 8) {
ksrem -= 8;
*(reinterpret_cast<uint64_t *>(dptr)) ^= *(ksptr++);
dptr += 8;
}
// XOR the remaining 0-7 tail bytes one at a time
for(unsigned int i=0;i<ksrem;++i) {
dptr[i] ^= reinterpret_cast<const uint8_t *>(ksptr)[i];
}
}
return true;
#else
Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV); Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV);
uint64_t macKey[4];
s20.crypt12(ZERO_KEY,macKey,sizeof(macKey)); s20.crypt12(ZERO_KEY,macKey,sizeof(macKey));
uint64_t mac[2];
Poly1305::compute(mac,payload,payloadLen,macKey); Poly1305::compute(mac,payload,payloadLen,macKey);
if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8)) if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8))
return false; // MAC failed, packet is corrupt, modified, or is not from the sender return false; // MAC failed, packet is corrupt, modified, or is not from the sender
@ -1109,6 +1165,7 @@ bool Packet::dearmor(const void *key)
s20.crypt12(payload,payload,payloadLen); s20.crypt12(payload,payload,payloadLen);
return true; return true;
#endif
} else { } else {
return false; // unrecognized cipher suite return false; // unrecognized cipher suite
} }

View File

@ -54,6 +54,10 @@
#include "controller/JSONDB.hpp" #include "controller/JSONDB.hpp"
#ifdef ZT_USE_X64_ASM_SALSA2012
#include "ext/x64-salsa2012-asm/salsa2012.h"
#endif
#ifdef __WINDOWS__ #ifdef __WINDOWS__
#include <tchar.h> #include <tchar.h>
#endif #endif
@ -204,6 +208,24 @@ static int testCrypto()
::free((void *)bb); ::free((void *)bb);
} }
#ifdef ZT_USE_X64_ASM_SALSA2012
// Throughput benchmark for the x64 ASM Salsa20/12 routine: runs the XOR function over a 1234567-byte buffer 200 times and prints the rate in MiB/second
std::cout << "[crypto] Benchmarking Salsa20/12 fast x64 ASM... "; std::cout.flush();
{
// NOTE(review): the malloc() result is used without a NULL check
unsigned char *bb = (unsigned char *)::malloc(1234567);
// Fill the buffer with the low byte of each index
for(unsigned int i=0;i<1234567;++i)
bb[i] = (unsigned char)i;
double bytes = 0.0;
uint64_t start = OSUtils::now();
for(unsigned int i=0;i<200;++i) {
// The same buffer is passed as both output and input
zt_salsa2012_amd64_xmm6_xor(bb,bb,1234567,s20TV0Iv,s20TV0Key);
bytes += 1234567.0;
}
uint64_t end = OSUtils::now();
// The / 1000.0 treats the OSUtils::now() difference as milliseconds -- presumably that is its unit; confirm
std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl;
::free((void *)bb);
}
#endif
std::cout << "[crypto] Benchmarking Salsa20/20... "; std::cout.flush(); std::cout << "[crypto] Benchmarking Salsa20/20... "; std::cout.flush();
{ {
unsigned char *bb = (unsigned char *)::malloc(1234567); unsigned char *bb = (unsigned char *)::malloc(1234567);