Unroll Salsa20 fully for a little more speed (non-SSE now almost as fast as SSE)

This commit is contained in:
Adam Ierymenko 2015-10-09 09:39:27 -07:00
parent 3fa6dd377f
commit 0c498556d5
7 changed files with 1131 additions and 178 deletions

View File

@ -41,7 +41,6 @@
#define ZT_IDENTITY_GEN_HASHCASH_FIRST_BYTE_LESS_THAN 17
#define ZT_IDENTITY_GEN_MEMORY 2097152
#define ZT_IDENTITY_GEN_SALSA20_ROUNDS 20
namespace ZeroTier {
@ -55,8 +54,8 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub
// ordinary Salsa20 is randomly seekable. This is good for a cipher
// but is not what we want for sequential memory-hardness.
memset(genmem,0,ZT_IDENTITY_GEN_MEMORY);
Salsa20 s20(digest,256,(char *)digest + 32,ZT_IDENTITY_GEN_SALSA20_ROUNDS);
s20.encrypt((char *)genmem,(char *)genmem,64);
Salsa20 s20(digest,256,(char *)digest + 32);
s20.encrypt20((char *)genmem,(char *)genmem,64);
for(unsigned long i=64;i<ZT_IDENTITY_GEN_MEMORY;i+=64) {
unsigned long k = i - 64;
*((uint64_t *)((char *)genmem + i)) = *((uint64_t *)((char *)genmem + k));
@ -67,7 +66,7 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub
*((uint64_t *)((char *)genmem + i + 40)) = *((uint64_t *)((char *)genmem + k + 40));
*((uint64_t *)((char *)genmem + i + 48)) = *((uint64_t *)((char *)genmem + k + 48));
*((uint64_t *)((char *)genmem + i + 56)) = *((uint64_t *)((char *)genmem + k + 56));
s20.encrypt((char *)genmem + i,(char *)genmem + i,64);
s20.encrypt20((char *)genmem + i,(char *)genmem + i,64);
}
// Render final digest using genmem as a lookup table
@ -77,7 +76,7 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub
uint64_t tmp = ((uint64_t *)genmem)[idx2];
((uint64_t *)genmem)[idx2] = ((uint64_t *)digest)[idx1];
((uint64_t *)digest)[idx1] = tmp;
s20.encrypt(digest,digest,64);
s20.encrypt20(digest,digest,64);
}
}

View File

@ -1149,9 +1149,9 @@ try_salsa2012sha512_again:
++*(reinterpret_cast<volatile uint64_t *>(candidate));
SHA512::hash(shabuf,candidate,16 + challengeLength);
s20.init(shabuf,256,&s20iv,12);
s20.init(shabuf,256,&s20iv);
memset(salsabuf,0,sizeof(salsabuf));
s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf));
s20.encrypt12(salsabuf,salsabuf,sizeof(salsabuf));
SHA512::hash(shabuf,salsabuf,sizeof(salsabuf));
d = difficulty;
@ -1186,9 +1186,9 @@ bool IncomingPacket::testSalsa2012Sha512ProofOfWorkResult(unsigned int difficult
memcpy(candidate + 16,challenge,challengeLength);
SHA512::hash(shabuf,candidate,16 + challengeLength);
s20.init(shabuf,256,&s20iv,12);
s20.init(shabuf,256,&s20iv);
memset(salsabuf,0,sizeof(salsabuf));
s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf));
s20.encrypt12(salsabuf,salsabuf,sizeof(salsabuf));
SHA512::hash(shabuf,salsabuf,sizeof(salsabuf));
d = difficulty;

View File

@ -88,9 +88,9 @@ Node::Node(
{
char foo[32];
Utils::getSecureRandom(foo,32);
_prng.init(foo,256,foo,8);
_prng.init(foo,256,foo);
memset(_prngStream,0,sizeof(_prngStream));
_prng.encrypt(_prngStream,_prngStream,sizeof(_prngStream));
_prng.encrypt12(_prngStream,_prngStream,sizeof(_prngStream));
}
std::string idtmp(dataStoreGet("identity.secret"));
@ -574,7 +574,7 @@ uint64_t Node::prng()
{
unsigned int p = (++_prngStreamPtr % (sizeof(_prngStream) / sizeof(uint64_t)));
if (!p)
_prng.encrypt(_prngStream,_prngStream,sizeof(_prngStream));
_prng.encrypt12(_prngStream,_prngStream,sizeof(_prngStream));
return _prngStream[p];
}

View File

@ -92,14 +92,14 @@ void Packet::armor(const void *key,bool encryptPayload)
setCipher(encryptPayload ? ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012 : ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE);
_salsa20MangleKey((const unsigned char *)key,mangledKey);
Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8),ZT_PROTO_SALSA20_ROUNDS);
Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8)/*,ZT_PROTO_SALSA20_ROUNDS*/);
// MAC key is always the first 32 bytes of the Salsa20 key stream
// This is the same construction DJB's NaCl library uses
s20.encrypt(ZERO_KEY,macKey,sizeof(macKey));
s20.encrypt12(ZERO_KEY,macKey,sizeof(macKey));
if (encryptPayload)
s20.encrypt(payload,payload,payloadLen);
s20.encrypt12(payload,payload,payloadLen);
Poly1305::compute(mac,payload,payloadLen,macKey);
memcpy(field(ZT_PACKET_IDX_MAC,8),mac,8);
@ -116,15 +116,15 @@ bool Packet::dearmor(const void *key)
if ((cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)) {
_salsa20MangleKey((const unsigned char *)key,mangledKey);
Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8),ZT_PROTO_SALSA20_ROUNDS);
Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8)/*,ZT_PROTO_SALSA20_ROUNDS*/);
s20.encrypt(ZERO_KEY,macKey,sizeof(macKey));
s20.encrypt12(ZERO_KEY,macKey,sizeof(macKey));
Poly1305::compute(mac,payload,payloadLen,macKey);
if (!Utils::secureEq(mac,field(ZT_PACKET_IDX_MAC,8),8))
return false;
if (cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)
s20.decrypt(payload,payload,payloadLen);
s20.decrypt12(payload,payload,payloadLen);
return true;
} else return false; // unrecognized cipher suite

File diff suppressed because it is too large Load Diff

View File

@ -35,12 +35,11 @@ public:
* @param key Key bits
* @param kbits Number of key bits: 128 or 256 (recommended)
* @param iv 64-bit initialization vector
* @param rounds Number of rounds: 8, 12, or 20
*/
Salsa20(const void *key,unsigned int kbits,const void *iv,unsigned int rounds)
Salsa20(const void *key,unsigned int kbits,const void *iv)
throw()
{
init(key,kbits,iv,rounds);
init(key,kbits,iv);
}
/**
@ -49,19 +48,28 @@ public:
* @param key Key bits
* @param kbits Number of key bits: 128 or 256 (recommended)
* @param iv 64-bit initialization vector
* @param rounds Number of rounds: 8, 12, or 20
*/
void init(const void *key,unsigned int kbits,const void *iv,unsigned int rounds)
void init(const void *key,unsigned int kbits,const void *iv)
throw();
/**
* Encrypt data
* Encrypt data using Salsa20/12
*
* @param in Input data
* @param out Output buffer
* @param bytes Length of data
*/
void encrypt(const void *in,void *out,unsigned int bytes)
void encrypt12(const void *in,void *out,unsigned int bytes)
throw();
/**
* Encrypt data using Salsa20/20
*
* NOTE(review): callers visible in this change pass the same buffer as
* both in and out (in-place use); confirm the implementation supports
* aliasing before relying on it elsewhere.
*
* @param in Input data
* @param out Output buffer
* @param bytes Length of data
*/
void encrypt20(const void *in,void *out,unsigned int bytes)
throw();
/**
@ -71,10 +79,23 @@ public:
* @param out Output buffer
* @param bytes Length of data
*/
inline void decrypt(const void *in,void *out,unsigned int bytes)
inline void decrypt12(const void *in,void *out,unsigned int bytes)
throw()
{
encrypt(in,out,bytes);
encrypt12(in,out,bytes);
}
/**
* Decrypt data using Salsa20/20
*
* This simply forwards to encrypt20() with the same arguments.
*
* @param in Input data
* @param out Output buffer
* @param bytes Length of data
*/
inline void decrypt20(const void *in,void *out,unsigned int bytes)
throw()
{
encrypt20(in,out,bytes);
}
private:
@ -84,7 +105,6 @@ private:
#endif // ZT_SALSA20_SSE
uint32_t i[16];
} _state;
unsigned int _roundsDiv4;
};
} // namespace ZeroTier

View File

@ -162,27 +162,27 @@ static int testCrypto()
memset(buf2,0,sizeof(buf2));
memset(buf3,0,sizeof(buf3));
Salsa20 s20;
s20.init("12345678123456781234567812345678",256,"12345678",20);
s20.encrypt(buf1,buf2,sizeof(buf1));
s20.init("12345678123456781234567812345678",256,"12345678",20);
s20.decrypt(buf2,buf3,sizeof(buf2));
s20.init("12345678123456781234567812345678",256,"12345678");
s20.encrypt20(buf1,buf2,sizeof(buf1));
s20.init("12345678123456781234567812345678",256,"12345678");
s20.decrypt20(buf2,buf3,sizeof(buf2));
if (memcmp(buf1,buf3,sizeof(buf1))) {
std::cout << "FAIL (encrypt/decrypt test)" << std::endl;
return -1;
}
}
Salsa20 s20(s20TV0Key,256,s20TV0Iv,20);
Salsa20 s20(s20TV0Key,256,s20TV0Iv);
memset(buf1,0,sizeof(buf1));
memset(buf2,0,sizeof(buf2));
s20.encrypt(buf1,buf2,64);
s20.encrypt20(buf1,buf2,64);
if (memcmp(buf2,s20TV0Ks,64)) {
std::cout << "FAIL (test vector 0)" << std::endl;
return -1;
}
s20.init(s2012TV0Key,256,s2012TV0Iv,12);
s20.init(s2012TV0Key,256,s2012TV0Iv);
memset(buf1,0,sizeof(buf1));
memset(buf2,0,sizeof(buf2));
s20.encrypt(buf1,buf2,64);
s20.encrypt12(buf1,buf2,64);
if (memcmp(buf2,s2012TV0Ks,64)) {
std::cout << "FAIL (test vector 1)" << std::endl;
return -1;
@ -195,34 +195,16 @@ static int testCrypto()
std::cout << "[crypto] Salsa20 SSE: DISABLED" << std::endl;
#endif
std::cout << "[crypto] Benchmarking Salsa20/8... "; std::cout.flush();
{
unsigned char *bb = (unsigned char *)::malloc(1234567);
for(unsigned int i=0;i<1234567;++i)
bb[i] = (unsigned char)i;
Salsa20 s20(s20TV0Key,256,s20TV0Iv,8);
double bytes = 0.0;
uint64_t start = OSUtils::now();
for(unsigned int i=0;i<200;++i) {
s20.encrypt(bb,bb,1234567);
bytes += 1234567.0;
}
uint64_t end = OSUtils::now();
SHA512::hash(buf1,bb,1234567);
std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (" << Utils::hex(buf1,16) << ')' << std::endl;
::free((void *)bb);
}
std::cout << "[crypto] Benchmarking Salsa20/12... "; std::cout.flush();
{
unsigned char *bb = (unsigned char *)::malloc(1234567);
for(unsigned int i=0;i<1234567;++i)
bb[i] = (unsigned char)i;
Salsa20 s20(s20TV0Key,256,s20TV0Iv,12);
Salsa20 s20(s20TV0Key,256,s20TV0Iv);
double bytes = 0.0;
uint64_t start = OSUtils::now();
for(unsigned int i=0;i<200;++i) {
s20.encrypt(bb,bb,1234567);
s20.encrypt12(bb,bb,1234567);
bytes += 1234567.0;
}
uint64_t end = OSUtils::now();
@ -236,11 +218,11 @@ static int testCrypto()
unsigned char *bb = (unsigned char *)::malloc(1234567);
for(unsigned int i=0;i<1234567;++i)
bb[i] = (unsigned char)i;
Salsa20 s20(s20TV0Key,256,s20TV0Iv,20);
Salsa20 s20(s20TV0Key,256,s20TV0Iv);
double bytes = 0.0;
uint64_t start = OSUtils::now();
for(unsigned int i=0;i<200;++i) {
s20.encrypt(bb,bb,1234567);
s20.encrypt20(bb,bb,1234567);
bytes += 1234567.0;
}
uint64_t end = OSUtils::now();