diff --git a/node/AES.cpp b/node/AES.cpp
index baee7063a..6dd5ae040 100644
--- a/node/AES.cpp
+++ b/node/AES.cpp
@@ -66,7 +66,7 @@ static bool _zt_aesni_supported()
 	return ((ecx & (1 << 25)) != 0);
 #endif
 }
-const bool AES::HW_ACCEL = _zt_aesni_supported();
+const bool AES::HW_ACCEL = false; //_zt_aesni_supported();
 
 #else
 
@@ -107,6 +107,11 @@
 		rk[15] = rk[7] ^ rk[14];
 		rk += 8;
 	}
+
+	uint64_t zero[2]; zero[0] = 0; zero[1] = 0;
+	_encryptSW((const uint8_t *)zero,(uint8_t *)_k.sw.h);
+	_k.sw.h[0] = Utils::ntoh(_k.sw.h[0]);
+	_k.sw.h[1] = Utils::ntoh(_k.sw.h[1]);
 }
 
 void AES::_encryptSW(const uint8_t in[16],uint8_t out[16]) const
@@ -183,4 +188,192 @@
 	PUTU32(out + 12, s3);
 }
 
+#if (defined(__GNUC__) || defined(__clang__)) && (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || defined(__AMD64) || defined(__AMD64__) || defined(_M_X64) || defined(__aarch64__))
+
+#if defined(__SIZEOF_INT128__)
+typedef unsigned __int128 uint128_t;
+#else
+typedef unsigned uint128_t __attribute__((mode(TI)));
+#endif
+
+// Carry-less 64x64 -> 128 bit multiply without PCLMULQDQ: each operand is
+// split into five interleaved bit groups (every fifth bit), so ordinary
+// integer multiplies cannot carry between groups; masking each partial
+// product back out yields the GF(2) product.
+static ZT_ALWAYS_INLINE void s_bmul64(const uint64_t x,const uint64_t y,uint64_t &r_high,uint64_t &r_low)
+{
+	static uint128_t m1 = (uint128_t)0x2108421084210842ULL << 64 | 0x1084210842108421ULL;
+	static uint128_t m2 = (uint128_t)0x4210842108421084ULL << 64 | 0x2108421084210842ULL;
+	static uint128_t m3 = (uint128_t)0x8421084210842108ULL << 64 | 0x4210842108421084ULL;
+	static uint128_t m4 = (uint128_t)0x0842108421084210ULL << 64 | 0x8421084210842108ULL;
+	static uint128_t m5 = (uint128_t)0x1084210842108421ULL << 64 | 0x0842108421084210ULL;
+	uint128_t x1 = x & m1;
+	uint128_t y1 = y & m1;
+	uint128_t x2 = x & m2;
+	uint128_t y2 = y & m2;
+	uint128_t x3 = x & m3;
+	uint128_t y3 = y & m3;
+	uint128_t x4 = x & m4;
+	uint128_t y4 = y & m4;
+	uint128_t x5 = x & m5;
+	uint128_t y5 = y & m5;
+	uint128_t z = (x1 * y1) ^ (x2 * y5) ^ (x3 * y4) ^ (x4 * y3) ^ (x5 * y2);
+	uint128_t r = z & m1;
+	z = (x1 * y2) ^ (x2 * y1) ^ (x3 * y5) ^ (x4 * y4) ^ (x5 * y3);
+	r |= z & m2;
+	z = (x1 * y3) ^ (x2 * y2) ^ (x3 * y1) ^ (x4 * y5) ^ (x5 * y4);
+	r |= z & m3;
+	z = (x1 * y4) ^ (x2 * y3) ^ (x3 * y2) ^ (x4 * y1) ^ (x5 * y5);
+	r |= z & m4;
+	z = (x1 * y5) ^ (x2 * y4) ^ (x3 * y3) ^ (x4 * y2) ^ (x5 * y1);
+	r |= z & m5;
+	r_high = (uint64_t)(r >> 64);
+	r_low = (uint64_t)r;
+}
+
+// One GHASH step: y <- y * h in GF(2^128). Karatsuba (three s_bmul64 calls
+// instead of four) forms the 256-bit carry-less product, which is then
+// reduced modulo the GCM polynomial. y0/y1 are kept in network byte order;
+// h_high/h_low are host order (see _initSW).
+static ZT_ALWAYS_INLINE void s_gfmul(const uint64_t h_high,const uint64_t h_low,uint64_t &y0,uint64_t &y1)
+{
+	uint64_t z2_low,z2_high,z0_low,z0_high,z1a_low,z1a_high;
+	uint64_t y_high = Utils::ntoh(y0);
+	uint64_t y_low = Utils::ntoh(y1);
+	s_bmul64(y_high,h_high,z2_high,z2_low);
+	s_bmul64(y_low,h_low,z0_high,z0_low);
+	s_bmul64(y_high ^ y_low,h_high ^ h_low,z1a_high,z1a_low);
+	z1a_high ^= z2_high ^ z0_high;
+	z1a_low ^= z2_low ^ z0_low;
+	uint128_t z_high = ((uint128_t)z2_high << 64) | (z2_low ^ z1a_high);
+	uint128_t z_low = (((uint128_t)z0_high << 64) | z0_low) ^ (((uint128_t)z1a_low) << 64);
+	z_high = (z_high << 1) | (z_low >> 127);
+	z_low <<= 1;
+	z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121);
+	z_high ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7);
+	y1 = Utils::hton((uint64_t)z_high);
+	y0 = Utils::hton((uint64_t)(z_high >> 64));
+}
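+
+// GCM's GF(2^128) uses a bit-reflected representation relative to the
+// integer view above, which is why the 255-bit product is shifted left
+// once before reduction; the <<127/<<126/<<121 and >>1/>>2/>>7 terms
+// apply the reduction polynomial x^128 + x^7 + x^2 + x + 1 to each half.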
+
+#else
+
+// Portable 32-bit fallback: the same masked-multiply trick as s_bmul64,
+// but with four interleaved bit groups per 32-bit word.
+static ZT_ALWAYS_INLINE void s_bmul32(uint32_t x,uint32_t y,uint32_t &r_high,uint32_t &r_low)
+{
+	const uint32_t m1 = (uint32_t)0x11111111;
+	const uint32_t m2 = (uint32_t)0x22222222;
+	const uint32_t m4 = (uint32_t)0x44444444;
+	const uint32_t m8 = (uint32_t)0x88888888;
+	uint32_t x0 = x & m1;
+	uint32_t x1 = x & m2;
+	uint32_t x2 = x & m4;
+	uint32_t x3 = x & m8;
+	uint32_t y0 = y & m1;
+	uint32_t y1 = y & m2;
+	uint32_t y2 = y & m4;
+	uint32_t y3 = y & m8;
+	uint64_t z0 = ((uint64_t)x0 * y0) ^ ((uint64_t)x1 * y3) ^ ((uint64_t)x2 * y2) ^ ((uint64_t)x3 * y1);
+	uint64_t z1 = ((uint64_t)x0 * y1) ^ ((uint64_t)x1 * y0) ^ ((uint64_t)x2 * y3) ^ ((uint64_t)x3 * y2);
+	uint64_t z2 = ((uint64_t)x0 * y2) ^ ((uint64_t)x1 * y1) ^ ((uint64_t)x2 * y0) ^ ((uint64_t)x3 * y3);
+	uint64_t z3 = ((uint64_t)x0 * y3) ^ ((uint64_t)x1 * y2) ^ ((uint64_t)x2 * y1) ^ ((uint64_t)x3 * y0);
+	z0 &= ((uint64_t)m1 << 32) | m1;
+	z1 &= ((uint64_t)m2 << 32) | m2;
+	z2 &= ((uint64_t)m4 << 32) | m4;
+	z3 &= ((uint64_t)m8 << 32) | m8;
+	uint64_t z = z0 | z1 | z2 | z3;
+	r_high = (uint32_t)(z >> 32);
+	r_low = (uint32_t)z;
+}
+
+// Same GHASH step as the 64-bit path, built from 32-bit carry-less
+// multiplies with two levels of Karatsuba, for targets without a
+// 128-bit integer type.
+static ZT_ALWAYS_INLINE void s_gfmul(const uint64_t h_high,const uint64_t h_low,uint64_t &y0,uint64_t &y1)
+{
+	uint32_t h_high_h = (uint32_t)(h_high >> 32);
+	uint32_t h_high_l = (uint32_t)h_high;
+	uint32_t h_low_h = (uint32_t)(h_low >> 32);
+	uint32_t h_low_l = (uint32_t)h_low;
+	uint32_t h_highXlow_h = h_high_h ^ h_low_h;
+	uint32_t h_highXlow_l = h_high_l ^ h_low_l;
+	uint64_t y_low = Utils::ntoh(y0);
+	uint64_t y_high = Utils::ntoh(y1);
+	uint32_t ci_low_h = (uint32_t)(y_high >> 32);
+	uint32_t ci_low_l = (uint32_t)y_high;
+	uint32_t ci_high_h = (uint32_t)(y_low >> 32);
+	uint32_t ci_high_l = (uint32_t)y_low;
+	uint32_t ci_highXlow_h = ci_high_h ^ ci_low_h;
+	uint32_t ci_highXlow_l = ci_high_l ^ ci_low_l;
+	uint32_t a_a_h,a_a_l,a_b_h,a_b_l,a_c_h,a_c_l;
+	s_bmul32(ci_high_h,h_high_h,a_a_h,a_a_l);
+	s_bmul32(ci_high_l,h_high_l,a_b_h,a_b_l);
+	s_bmul32(ci_high_h ^ ci_high_l,h_high_h ^ h_high_l,a_c_h,a_c_l);
+	a_c_h ^= a_a_h ^ a_b_h;
+	a_c_l ^= a_a_l ^ a_b_l;
+	a_a_l ^= a_c_h;
+	a_b_h ^= a_c_l;
+	uint32_t b_a_h,b_a_l,b_b_h,b_b_l,b_c_h,b_c_l;
+	s_bmul32(ci_low_h,h_low_h,b_a_h,b_a_l);
+	s_bmul32(ci_low_l,h_low_l,b_b_h,b_b_l);
+	s_bmul32(ci_low_h ^ ci_low_l,h_low_h ^ h_low_l,b_c_h,b_c_l);
+	b_c_h ^= b_a_h ^ b_b_h;
+	b_c_l ^= b_a_l ^ b_b_l;
+	b_a_l ^= b_c_h;
+	b_b_h ^= b_c_l;
+	uint32_t c_a_h,c_a_l,c_b_h,c_b_l,c_c_h,c_c_l;
+	s_bmul32(ci_highXlow_h,h_highXlow_h,c_a_h,c_a_l);
+	s_bmul32(ci_highXlow_l,h_highXlow_l,c_b_h,c_b_l);
+	s_bmul32(ci_highXlow_h ^ ci_highXlow_l,h_highXlow_h ^ h_highXlow_l,c_c_h,c_c_l);
+	c_c_h ^= c_a_h ^ c_b_h;
+	c_c_l ^= c_a_l ^ c_b_l;
+	c_a_l ^= c_c_h;
+	c_b_h ^= c_c_l;
+	c_a_h ^= b_a_h ^ a_a_h;
+	c_a_l ^= b_a_l ^ a_a_l;
+	c_b_h ^= b_b_h ^ a_b_h;
+	c_b_l ^= b_b_l ^ a_b_l;
+	uint64_t z_high_h = ((uint64_t)a_a_h << 32) | a_a_l;
+	uint64_t z_high_l = (((uint64_t)a_b_h << 32) | a_b_l) ^ (((uint64_t)c_a_h << 32) | c_a_l);
+	uint64_t z_low_h = (((uint64_t)b_a_h << 32) | b_a_l) ^ (((uint64_t)c_b_h << 32) | c_b_l);
+	uint64_t z_low_l = ((uint64_t)b_b_h << 32) | b_b_l;
+	z_high_h = z_high_h << 1 | z_high_l >> 63;
+	z_high_l = z_high_l << 1 | z_low_h >> 63;
+	z_low_h = z_low_h << 1 | z_low_l >> 63;
+	z_low_l <<= 1;
+	z_low_h ^= (z_low_l << 63) ^ (z_low_l << 62) ^ (z_low_l << 57);
+	z_high_h ^= z_low_h ^ (z_low_h >> 1) ^ (z_low_h >> 2) ^ (z_low_h >> 7);
+	z_high_l ^= z_low_l ^ (z_low_l >> 1) ^ (z_low_l >> 2) ^ (z_low_l >> 7) ^ (z_low_h << 63) ^ (z_low_h << 62) ^ (z_low_h << 57);
+	y0 = Utils::hton(z_high_h);
+	y1 = Utils::hton(z_high_l);
+}
+
+#endif
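+
+// Software GMAC: GHASH the input in 16-byte blocks, fold in the 64-bit
+// bit-length block, then XOR with the encryption of the initial counter
+// block (the 96-bit IV padded with a big-endian 32-bit counter of 1).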
+void AES::_gmacSW(const uint8_t iv[12],const uint8_t *in,unsigned int len,uint8_t out[16]) const
+{
+	const uint64_t h0 = _k.sw.h[0];
+	const uint64_t h1 = _k.sw.h[1];
+	const uint64_t lpad = Utils::hton((uint64_t)len * 8);
+	uint64_t y0 = 0,y1 = 0;
+
+	while (len >= 16) {
+		y0 ^= *((const uint64_t *)in);
+		in += 8;
+		y1 ^= *((const uint64_t *)in);
+		in += 8;
+		s_gfmul(h0,h1,y0,y1);
+		len -= 16;
+	}
+
+	// Zero-pad and fold in any remaining partial block.
+	if (len) {
+		uint64_t last[2] = { 0,0 };
+		for(unsigned int i=0;i<len;++i)
+			((uint8_t *)last)[i] = in[i];
+		y0 ^= last[0];
+		y1 ^= last[1];
+		s_gfmul(h0,h1,y0,y1);
+	}
+
+	// Fold in the length block, then mask with the encrypted counter block.
+	y0 ^= lpad;
+	s_gfmul(h0,h1,y0,y1);
+
+	uint64_t icb[2];
+	uint8_t *const icbb = (uint8_t *)icb;
+	for(unsigned int i=0;i<12;++i)
+		icbb[i] = iv[i];
+	icbb[12] = 0;
+	icbb[13] = 0;
+	icbb[14] = 0;
+	icbb[15] = 1;
+	_encryptSW(icbb,icbb);
+
+	*((uint64_t *)out) = y0 ^ icb[0];
+	*(((uint64_t *)out) + 1) = y1 ^ icb[1];
+}
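
Not part of the patch: the masked-multiply path above can be cross-checked against the bit-serial "right shift" GF(2^128) multiply that NIST SP 800-38D defines GHASH with. Below is a minimal reference sketch; ghash_mul_ref is a name invented here. It operates on the big-endian 128-bit values GHASH sees, held as pairs of 64-bit words, so when comparing against s_gfmul remember that y0/y1 are network byte order and h0/h1 host order (per _initSW) and convert with Utils::ntoh/hton accordingly.

#include <cstdint>

// Bit-serial GF(2^128) multiply per NIST SP 800-38D: Z starts at zero and
// V at y; for each bit of x (most significant first) V is conditionally
// XORed into Z, then V is shifted right one bit and reduced by
// R = 0xe1 || 0^120 whenever a bit falls off the low end.
static void ghash_mul_ref(const uint64_t x[2],const uint64_t y[2],uint64_t z[2])
{
	uint64_t v0 = y[0],v1 = y[1];
	uint64_t z0 = 0,z1 = 0;
	for(int i=0;i<128;++i) {
		if (((i < 64) ? (x[0] >> (63 - i)) : (x[1] >> (127 - i))) & 1) {
			z0 ^= v0;
			z1 ^= v1;
		}
		const uint64_t lsb = v1 & 1;
		v1 = (v1 >> 1) | (v0 << 63);
		v0 >>= 1;
		if (lsb) v0 ^= 0xe100000000000000ULL;
	}
	z[0] = z0;
	z[1] = z1;
}

Feeding random operands through both multiplies and comparing the 128-bit results exercises s_bmul64/s_bmul32 and the reduction. As a second quick check, _gmacSW over empty input must equal a bare AES-256 encryption of the padded counter block, since GHASH of nothing (including the all-zero length block) is zero.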