diff --git a/node/Salsa20.cpp b/node/Salsa20.cpp index cbe908c61..1ec6a2721 100644 --- a/node/Salsa20.cpp +++ b/node/Salsa20.cpp @@ -144,10 +144,14 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes) } #ifdef ZT_SALSA20_SSE - __m128i X0 = _state.v[0]; - __m128i X1 = _state.v[1]; - __m128i X2 = _state.v[2]; - __m128i X3 = _state.v[3]; + __m128i X0 = _mm_load_si128((const __m128i *)&(_state.v[0])); + __m128i X1 = _mm_load_si128((const __m128i *)&(_state.v[1])); + __m128i X2 = _mm_load_si128((const __m128i *)&(_state.v[2])); + __m128i X3 = _mm_load_si128((const __m128i *)&(_state.v[3])); + __m128i X0s = X0; + __m128i X1s = X1; + __m128i X2s = X2; + __m128i X3s = X3; for (i=0;i<_roundsDiv2;++i) { __m128i T = _mm_add_epi32(X0, X3); @@ -185,10 +189,10 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes) X3 = _mm_shuffle_epi32(X3, 0x93); } - X0 = _mm_add_epi32(_state.v[0],X0); - X1 = _mm_add_epi32(_state.v[1],X1); - X2 = _mm_add_epi32(_state.v[2],X2); - X3 = _mm_add_epi32(_state.v[3],X3); + X0 = _mm_add_epi32(X0s,X0); + X1 = _mm_add_epi32(X1s,X1); + X2 = _mm_add_epi32(X2s,X2); + X3 = _mm_add_epi32(X3s,X3); { __m128i k02 = _mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)); @@ -201,10 +205,10 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes) const float *const mv = (const float *)m; float *const cv = (float *)c; - _mm_storeu_ps(cv,_mm_xor_si128(_mm_unpackhi_epi64(k02,k20),_mm_loadu_ps(mv))); - _mm_storeu_ps(cv + 4,_mm_xor_si128(_mm_unpackhi_epi64(k13,k31),_mm_loadu_ps(mv + 4))); - _mm_storeu_ps(cv + 8,_mm_xor_si128(_mm_unpacklo_epi64(k20,k02),_mm_loadu_ps(mv + 8))); - _mm_storeu_ps(cv + 12,_mm_xor_si128(_mm_unpacklo_epi64(k31,k13),_mm_loadu_ps(mv + 12))); + _mm_storeu_ps(cv,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k02,k20),_mm_castps_si128(_mm_loadu_ps(mv))))); + _mm_storeu_ps(cv + 4,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k13,k31),_mm_castps_si128(_mm_loadu_ps(mv + 4))))); + _mm_storeu_ps(cv + 8,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k20,k02),_mm_castps_si128(_mm_loadu_ps(mv + 8))))); + _mm_storeu_ps(cv + 12,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k31,k13),_mm_castps_si128(_mm_loadu_ps(mv + 12))))); } if (!(++_state.i[8])) { diff --git a/windows/ZeroTierOne/ZeroTierOne.vcxproj b/windows/ZeroTierOne/ZeroTierOne.vcxproj index d564b57ee..fb67cec6d 100644 --- a/windows/ZeroTierOne/ZeroTierOne.vcxproj +++ b/windows/ZeroTierOne/ZeroTierOne.vcxproj @@ -40,6 +40,7 @@ + @@ -51,7 +52,6 @@ - @@ -93,6 +93,7 @@ + @@ -108,7 +109,6 @@ - @@ -221,7 +221,7 @@ true true $(SolutionDir)\ext\bin\libcrypto\include - ZT_OFFICIAL_RELEASE;ZT_AUTO_UPDATE;%(PreprocessorDefinitions) + ZT_OFFICIAL_RELEASE;ZT_AUTO_UPDATE;ZT_SALSA20_SSE;%(PreprocessorDefinitions) MultiThreaded NoExtensions true @@ -245,7 +245,7 @@ true true $(SolutionDir)\ext\bin\libcrypto\include - ZT_OFFICIAL_RELEASE;ZT_AUTO_UPDATE;%(PreprocessorDefinitions) + ZT_OFFICIAL_RELEASE;ZT_AUTO_UPDATE;ZT_SALSA20_SSE;%(PreprocessorDefinitions) MultiThreaded NotSet true diff --git a/windows/ZeroTierOne/ZeroTierOne.vcxproj.filters b/windows/ZeroTierOne/ZeroTierOne.vcxproj.filters index f7b5921f0..4002bcec0 100644 --- a/windows/ZeroTierOne/ZeroTierOne.vcxproj.filters +++ b/windows/ZeroTierOne/ZeroTierOne.vcxproj.filters @@ -120,7 +120,7 @@ Source Files - + Source Files @@ -287,15 +287,15 @@ Header Files - - Header Files - Header Files Header Files + + Header Files +