From a1e94154bebe17a24d2eed43be7d866e93c061fe Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Tue, 18 Apr 2017 08:45:37 -0700 Subject: [PATCH] Just incorporate the X64 ASM version of Salsa20/12 for X64 platforms. This gives us (for example) 1.5gb/sec encryption on a Core i5 2.8ghz. --- ext/x64-salsa2012-asm/README.md | 6 + ext/x64-salsa2012-asm/salsa2012.h | 13 + ext/x64-salsa2012-asm/salsa2012.s | 4488 +++++++++++++++++++++++++++++ make-mac.mk | 7 + node/Packet.cpp | 65 +- selftest.cpp | 22 + 6 files changed, 4597 insertions(+), 4 deletions(-) create mode 100644 ext/x64-salsa2012-asm/README.md create mode 100644 ext/x64-salsa2012-asm/salsa2012.h create mode 100644 ext/x64-salsa2012-asm/salsa2012.s diff --git a/ext/x64-salsa2012-asm/README.md b/ext/x64-salsa2012-asm/README.md new file mode 100644 index 000000000..a69a1a671 --- /dev/null +++ b/ext/x64-salsa2012-asm/README.md @@ -0,0 +1,6 @@ +Blazingly fast X64 ASM implementation of Salsa20/12 +====== + +This is ripped from the [cnacl](https://github.com/cjdelisle/cnacl) source. The actual code is by Daniel J. Bernstein and is in the public domain. + +This is included on Linux and Mac 64-bit builds and is significantly faster than the SSE intrinsics or C versions. It's used for packet encode/decode only since its use differs a bit from the regular Salsa20 C++ class. Specifically it lacks the ability to be called on multiple blocks, preferring instead to take a key and a single stream to encrypt and that's it. 
diff --git a/ext/x64-salsa2012-asm/salsa2012.h b/ext/x64-salsa2012-asm/salsa2012.h new file mode 100644 index 000000000..d47059b42 --- /dev/null +++ b/ext/x64-salsa2012-asm/salsa2012.h @@ -0,0 +1,13 @@ +#ifdef __cplusplus +extern "C" { +#endif + +// output, outlen, nonce, key (256-bit / 32-byte) +extern int zt_salsa2012_amd64_xmm6(unsigned char *,unsigned long long,const unsigned char *,const unsigned char *); + +// ciphertext, message, mlen, nonce, key +extern int zt_salsa2012_amd64_xmm6_xor(unsigned char *,const unsigned char *,unsigned long long,const unsigned char *,const unsigned char *); + +#ifdef __cplusplus +} +#endif diff --git a/ext/x64-salsa2012-asm/salsa2012.s b/ext/x64-salsa2012-asm/salsa2012.s new file mode 100644 index 000000000..699c89ac6 --- /dev/null +++ b/ext/x64-salsa2012-asm/salsa2012.s @@ -0,0 +1,4488 @@ +# qhasm: enter zt_salsa2012_amd64_xmm6 +.text +.p2align 5 +.globl _zt_salsa2012_amd64_xmm6 +.globl zt_salsa2012_amd64_xmm6 +_zt_salsa2012_amd64_xmm6: +zt_salsa2012_amd64_xmm6: +mov %rsp,%r11 +and $31,%r11 +add $480,%r11 +sub %r11,%rsp + +# qhasm: r11_stack = r11_caller +# asm 1: movq r11_stack=stack64#1 +# asm 2: movq r11_stack=352(%rsp) +movq %r11,352(%rsp) + +# qhasm: r12_stack = r12_caller +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=360(%rsp) +movq %r12,360(%rsp) + +# qhasm: r13_stack = r13_caller +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=368(%rsp) +movq %r13,368(%rsp) + +# qhasm: r14_stack = r14_caller +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=376(%rsp) +movq %r14,376(%rsp) + +# qhasm: r15_stack = r15_caller +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=384(%rsp) +movq %r15,384(%rsp) + +# qhasm: rbx_stack = rbx_caller +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=392(%rsp) +movq %rbx,392(%rsp) + +# qhasm: rbp_stack = rbp_caller +# asm 1: movq rbp_stack=stack64#7 +# asm 2: movq rbp_stack=400(%rsp) +movq %rbp,400(%rsp) + +# qhasm: bytes = arg2 +# asm 1: mov 
bytes=int64#6 +# asm 2: mov bytes=%r9 +mov %rsi,%r9 + +# qhasm: out = arg1 +# asm 1: mov out=int64#1 +# asm 2: mov out=%rdi +mov %rdi,%rdi + +# qhasm: m = out +# asm 1: mov m=int64#2 +# asm 2: mov m=%rsi +mov %rdi,%rsi + +# qhasm: iv = arg3 +# asm 1: mov iv=int64#3 +# asm 2: mov iv=%rdx +mov %rdx,%rdx + +# qhasm: k = arg4 +# asm 1: mov k=int64#8 +# asm 2: mov k=%r10 +mov %rcx,%r10 + +# qhasm: unsigned>? bytes - 0 +# asm 1: cmp $0, +jbe ._done + +# qhasm: a = 0 +# asm 1: mov $0,>a=int64#7 +# asm 2: mov $0,>a=%rax +mov $0,%rax + +# qhasm: i = bytes +# asm 1: mov i=int64#4 +# asm 2: mov i=%rcx +mov %r9,%rcx + +# qhasm: while (i) { *out++ = a; --i } +rep stosb + +# qhasm: out -= bytes +# asm 1: sub r11_stack=stack64#1 +# asm 2: movq r11_stack=352(%rsp) +movq %r11,352(%rsp) + +# qhasm: r12_stack = r12_caller +# asm 1: movq r12_stack=stack64#2 +# asm 2: movq r12_stack=360(%rsp) +movq %r12,360(%rsp) + +# qhasm: r13_stack = r13_caller +# asm 1: movq r13_stack=stack64#3 +# asm 2: movq r13_stack=368(%rsp) +movq %r13,368(%rsp) + +# qhasm: r14_stack = r14_caller +# asm 1: movq r14_stack=stack64#4 +# asm 2: movq r14_stack=376(%rsp) +movq %r14,376(%rsp) + +# qhasm: r15_stack = r15_caller +# asm 1: movq r15_stack=stack64#5 +# asm 2: movq r15_stack=384(%rsp) +movq %r15,384(%rsp) + +# qhasm: rbx_stack = rbx_caller +# asm 1: movq rbx_stack=stack64#6 +# asm 2: movq rbx_stack=392(%rsp) +movq %rbx,392(%rsp) + +# qhasm: rbp_stack = rbp_caller +# asm 1: movq rbp_stack=stack64#7 +# asm 2: movq rbp_stack=400(%rsp) +movq %rbp,400(%rsp) + +# qhasm: out = arg1 +# asm 1: mov out=int64#1 +# asm 2: mov out=%rdi +mov %rdi,%rdi + +# qhasm: m = arg2 +# asm 1: mov m=int64#2 +# asm 2: mov m=%rsi +mov %rsi,%rsi + +# qhasm: bytes = arg3 +# asm 1: mov bytes=int64#6 +# asm 2: mov bytes=%r9 +mov %rdx,%r9 + +# qhasm: iv = arg4 +# asm 1: mov iv=int64#3 +# asm 2: mov iv=%rdx +mov %rcx,%rdx + +# qhasm: k = arg5 +# asm 1: mov k=int64#8 +# asm 2: mov k=%r10 +mov %r8,%r10 + +# qhasm: unsigned>? 
bytes - 0 +# asm 1: cmp $0, +jbe ._done +# comment:fp stack unchanged by fallthrough + +# qhasm: start: +._start: + +# qhasm: in12 = *(uint32 *) (k + 20) +# asm 1: movl 20(in12=int64#4d +# asm 2: movl 20(in12=%ecx +movl 20(%r10),%ecx + +# qhasm: in1 = *(uint32 *) (k + 0) +# asm 1: movl 0(in1=int64#5d +# asm 2: movl 0(in1=%r8d +movl 0(%r10),%r8d + +# qhasm: in6 = *(uint32 *) (iv + 0) +# asm 1: movl 0(in6=int64#7d +# asm 2: movl 0(in6=%eax +movl 0(%rdx),%eax + +# qhasm: in11 = *(uint32 *) (k + 16) +# asm 1: movl 16(in11=int64#9d +# asm 2: movl 16(in11=%r11d +movl 16(%r10),%r11d + +# qhasm: ((uint32 *)&x1)[0] = in12 +# asm 1: movl x1=stack128#1 +# asm 2: movl x1=0(%rsp) +movl %ecx,0(%rsp) + +# qhasm: ((uint32 *)&x1)[1] = in1 +# asm 1: movl in8=int64#4 +# asm 2: mov $0,>in8=%rcx +mov $0,%rcx + +# qhasm: in13 = *(uint32 *) (k + 24) +# asm 1: movl 24(in13=int64#5d +# asm 2: movl 24(in13=%r8d +movl 24(%r10),%r8d + +# qhasm: in2 = *(uint32 *) (k + 4) +# asm 1: movl 4(in2=int64#7d +# asm 2: movl 4(in2=%eax +movl 4(%r10),%eax + +# qhasm: in7 = *(uint32 *) (iv + 4) +# asm 1: movl 4(in7=int64#3d +# asm 2: movl 4(in7=%edx +movl 4(%rdx),%edx + +# qhasm: ((uint32 *)&x2)[0] = in8 +# asm 1: movl x2=stack128#2 +# asm 2: movl x2=16(%rsp) +movl %ecx,16(%rsp) + +# qhasm: ((uint32 *)&x2)[1] = in13 +# asm 1: movl in4=int64#3d +# asm 2: movl 12(in4=%edx +movl 12(%r10),%edx + +# qhasm: in9 = 0 +# asm 1: mov $0,>in9=int64#4 +# asm 2: mov $0,>in9=%rcx +mov $0,%rcx + +# qhasm: in14 = *(uint32 *) (k + 28) +# asm 1: movl 28(in14=int64#5d +# asm 2: movl 28(in14=%r8d +movl 28(%r10),%r8d + +# qhasm: in3 = *(uint32 *) (k + 8) +# asm 1: movl 8(in3=int64#7d +# asm 2: movl 8(in3=%eax +movl 8(%r10),%eax + +# qhasm: ((uint32 *)&x3)[0] = in4 +# asm 1: movl x3=stack128#3 +# asm 2: movl x3=32(%rsp) +movl %edx,32(%rsp) + +# qhasm: ((uint32 *)&x3)[1] = in9 +# asm 1: movl in0=int64#3 +# asm 2: mov $1634760805,>in0=%rdx +mov $1634760805,%rdx + +# qhasm: in5 = 857760878 +# asm 1: mov $857760878,>in5=int64#4 +# 
asm 2: mov $857760878,>in5=%rcx +mov $857760878,%rcx + +# qhasm: in10 = 2036477234 +# asm 1: mov $2036477234,>in10=int64#5 +# asm 2: mov $2036477234,>in10=%r8 +mov $2036477234,%r8 + +# qhasm: in15 = 1797285236 +# asm 1: mov $1797285236,>in15=int64#7 +# asm 2: mov $1797285236,>in15=%rax +mov $1797285236,%rax + +# qhasm: ((uint32 *)&x0)[0] = in0 +# asm 1: movl x0=stack128#4 +# asm 2: movl x0=48(%rsp) +movl %edx,48(%rsp) + +# qhasm: ((uint32 *)&x0)[1] = in5 +# asm 1: movl z0=int6464#1 +# asm 2: movdqa z0=%xmm0 +movdqa 48(%rsp),%xmm0 + +# qhasm: z5 = z0[1,1,1,1] +# asm 1: pshufd $0x55,z5=int6464#2 +# asm 2: pshufd $0x55,z5=%xmm1 +pshufd $0x55,%xmm0,%xmm1 + +# qhasm: z10 = z0[2,2,2,2] +# asm 1: pshufd $0xaa,z10=int6464#3 +# asm 2: pshufd $0xaa,z10=%xmm2 +pshufd $0xaa,%xmm0,%xmm2 + +# qhasm: z15 = z0[3,3,3,3] +# asm 1: pshufd $0xff,z15=int6464#4 +# asm 2: pshufd $0xff,z15=%xmm3 +pshufd $0xff,%xmm0,%xmm3 + +# qhasm: z0 = z0[0,0,0,0] +# asm 1: pshufd $0x00,z0=int6464#1 +# asm 2: pshufd $0x00,z0=%xmm0 +pshufd $0x00,%xmm0,%xmm0 + +# qhasm: orig5 = z5 +# asm 1: movdqa orig5=stack128#5 +# asm 2: movdqa orig5=64(%rsp) +movdqa %xmm1,64(%rsp) + +# qhasm: orig10 = z10 +# asm 1: movdqa orig10=stack128#6 +# asm 2: movdqa orig10=80(%rsp) +movdqa %xmm2,80(%rsp) + +# qhasm: orig15 = z15 +# asm 1: movdqa orig15=stack128#7 +# asm 2: movdqa orig15=96(%rsp) +movdqa %xmm3,96(%rsp) + +# qhasm: orig0 = z0 +# asm 1: movdqa orig0=stack128#8 +# asm 2: movdqa orig0=112(%rsp) +movdqa %xmm0,112(%rsp) + +# qhasm: z1 = x1 +# asm 1: movdqa z1=int6464#1 +# asm 2: movdqa z1=%xmm0 +movdqa 0(%rsp),%xmm0 + +# qhasm: z6 = z1[2,2,2,2] +# asm 1: pshufd $0xaa,z6=int6464#2 +# asm 2: pshufd $0xaa,z6=%xmm1 +pshufd $0xaa,%xmm0,%xmm1 + +# qhasm: z11 = z1[3,3,3,3] +# asm 1: pshufd $0xff,z11=int6464#3 +# asm 2: pshufd $0xff,z11=%xmm2 +pshufd $0xff,%xmm0,%xmm2 + +# qhasm: z12 = z1[0,0,0,0] +# asm 1: pshufd $0x00,z12=int6464#4 +# asm 2: pshufd $0x00,z12=%xmm3 +pshufd $0x00,%xmm0,%xmm3 + +# qhasm: z1 = z1[1,1,1,1] +# 
asm 1: pshufd $0x55,z1=int6464#1 +# asm 2: pshufd $0x55,z1=%xmm0 +pshufd $0x55,%xmm0,%xmm0 + +# qhasm: orig6 = z6 +# asm 1: movdqa orig6=stack128#9 +# asm 2: movdqa orig6=128(%rsp) +movdqa %xmm1,128(%rsp) + +# qhasm: orig11 = z11 +# asm 1: movdqa orig11=stack128#10 +# asm 2: movdqa orig11=144(%rsp) +movdqa %xmm2,144(%rsp) + +# qhasm: orig12 = z12 +# asm 1: movdqa orig12=stack128#11 +# asm 2: movdqa orig12=160(%rsp) +movdqa %xmm3,160(%rsp) + +# qhasm: orig1 = z1 +# asm 1: movdqa orig1=stack128#12 +# asm 2: movdqa orig1=176(%rsp) +movdqa %xmm0,176(%rsp) + +# qhasm: z2 = x2 +# asm 1: movdqa z2=int6464#1 +# asm 2: movdqa z2=%xmm0 +movdqa 16(%rsp),%xmm0 + +# qhasm: z7 = z2[3,3,3,3] +# asm 1: pshufd $0xff,z7=int6464#2 +# asm 2: pshufd $0xff,z7=%xmm1 +pshufd $0xff,%xmm0,%xmm1 + +# qhasm: z13 = z2[1,1,1,1] +# asm 1: pshufd $0x55,z13=int6464#3 +# asm 2: pshufd $0x55,z13=%xmm2 +pshufd $0x55,%xmm0,%xmm2 + +# qhasm: z2 = z2[2,2,2,2] +# asm 1: pshufd $0xaa,z2=int6464#1 +# asm 2: pshufd $0xaa,z2=%xmm0 +pshufd $0xaa,%xmm0,%xmm0 + +# qhasm: orig7 = z7 +# asm 1: movdqa orig7=stack128#13 +# asm 2: movdqa orig7=192(%rsp) +movdqa %xmm1,192(%rsp) + +# qhasm: orig13 = z13 +# asm 1: movdqa orig13=stack128#14 +# asm 2: movdqa orig13=208(%rsp) +movdqa %xmm2,208(%rsp) + +# qhasm: orig2 = z2 +# asm 1: movdqa orig2=stack128#15 +# asm 2: movdqa orig2=224(%rsp) +movdqa %xmm0,224(%rsp) + +# qhasm: z3 = x3 +# asm 1: movdqa z3=int6464#1 +# asm 2: movdqa z3=%xmm0 +movdqa 32(%rsp),%xmm0 + +# qhasm: z4 = z3[0,0,0,0] +# asm 1: pshufd $0x00,z4=int6464#2 +# asm 2: pshufd $0x00,z4=%xmm1 +pshufd $0x00,%xmm0,%xmm1 + +# qhasm: z14 = z3[2,2,2,2] +# asm 1: pshufd $0xaa,z14=int6464#3 +# asm 2: pshufd $0xaa,z14=%xmm2 +pshufd $0xaa,%xmm0,%xmm2 + +# qhasm: z3 = z3[3,3,3,3] +# asm 1: pshufd $0xff,z3=int6464#1 +# asm 2: pshufd $0xff,z3=%xmm0 +pshufd $0xff,%xmm0,%xmm0 + +# qhasm: orig4 = z4 +# asm 1: movdqa orig4=stack128#16 +# asm 2: movdqa orig4=240(%rsp) +movdqa %xmm1,240(%rsp) + +# qhasm: orig14 = z14 +# asm 1: 
movdqa orig14=stack128#17 +# asm 2: movdqa orig14=256(%rsp) +movdqa %xmm2,256(%rsp) + +# qhasm: orig3 = z3 +# asm 1: movdqa orig3=stack128#18 +# asm 2: movdqa orig3=272(%rsp) +movdqa %xmm0,272(%rsp) + +# qhasm: bytesatleast256: +._bytesatleast256: + +# qhasm: in8 = ((uint32 *)&x2)[0] +# asm 1: movl in8=int64#3d +# asm 2: movl in8=%edx +movl 16(%rsp),%edx + +# qhasm: in9 = ((uint32 *)&x3)[1] +# asm 1: movl 4+in9=int64#4d +# asm 2: movl 4+in9=%ecx +movl 4+32(%rsp),%ecx + +# qhasm: ((uint32 *) &orig8)[0] = in8 +# asm 1: movl orig8=stack128#19 +# asm 2: movl orig8=288(%rsp) +movl %edx,288(%rsp) + +# qhasm: ((uint32 *) &orig9)[0] = in9 +# asm 1: movl orig9=stack128#20 +# asm 2: movl orig9=304(%rsp) +movl %ecx,304(%rsp) + +# qhasm: in8 += 1 +# asm 1: add $1,in9=int64#4 +# asm 2: mov in9=%rcx +mov %rdx,%rcx + +# qhasm: (uint64) in9 >>= 32 +# asm 1: shr $32,in9=int64#4 +# asm 2: mov in9=%rcx +mov %rdx,%rcx + +# qhasm: (uint64) in9 >>= 32 +# asm 1: shr $32,in9=int64#4 +# asm 2: mov in9=%rcx +mov %rdx,%rcx + +# qhasm: (uint64) in9 >>= 32 +# asm 1: shr $32,in9=int64#4 +# asm 2: mov in9=%rcx +mov %rdx,%rcx + +# qhasm: (uint64) in9 >>= 32 +# asm 1: shr $32,x2=stack128#2 +# asm 2: movl x2=16(%rsp) +movl %edx,16(%rsp) + +# qhasm: ((uint32 *)&x3)[1] = in9 +# asm 1: movl bytes_backup=stack64#8 +# asm 2: movq bytes_backup=408(%rsp) +movq %r9,408(%rsp) + +# qhasm: i = 12 +# asm 1: mov $12,>i=int64#3 +# asm 2: mov $12,>i=%rdx +mov $12,%rdx + +# qhasm: z5 = orig5 +# asm 1: movdqa z5=int6464#1 +# asm 2: movdqa z5=%xmm0 +movdqa 64(%rsp),%xmm0 + +# qhasm: z10 = orig10 +# asm 1: movdqa z10=int6464#2 +# asm 2: movdqa z10=%xmm1 +movdqa 80(%rsp),%xmm1 + +# qhasm: z15 = orig15 +# asm 1: movdqa z15=int6464#3 +# asm 2: movdqa z15=%xmm2 +movdqa 96(%rsp),%xmm2 + +# qhasm: z14 = orig14 +# asm 1: movdqa z14=int6464#4 +# asm 2: movdqa z14=%xmm3 +movdqa 256(%rsp),%xmm3 + +# qhasm: z3 = orig3 +# asm 1: movdqa z3=int6464#5 +# asm 2: movdqa z3=%xmm4 +movdqa 272(%rsp),%xmm4 + +# qhasm: z6 = orig6 +# asm 
1: movdqa z6=int6464#6 +# asm 2: movdqa z6=%xmm5 +movdqa 128(%rsp),%xmm5 + +# qhasm: z11 = orig11 +# asm 1: movdqa z11=int6464#7 +# asm 2: movdqa z11=%xmm6 +movdqa 144(%rsp),%xmm6 + +# qhasm: z1 = orig1 +# asm 1: movdqa z1=int6464#8 +# asm 2: movdqa z1=%xmm7 +movdqa 176(%rsp),%xmm7 + +# qhasm: z7 = orig7 +# asm 1: movdqa z7=int6464#9 +# asm 2: movdqa z7=%xmm8 +movdqa 192(%rsp),%xmm8 + +# qhasm: z13 = orig13 +# asm 1: movdqa z13=int6464#10 +# asm 2: movdqa z13=%xmm9 +movdqa 208(%rsp),%xmm9 + +# qhasm: z2 = orig2 +# asm 1: movdqa z2=int6464#11 +# asm 2: movdqa z2=%xmm10 +movdqa 224(%rsp),%xmm10 + +# qhasm: z9 = orig9 +# asm 1: movdqa z9=int6464#12 +# asm 2: movdqa z9=%xmm11 +movdqa 304(%rsp),%xmm11 + +# qhasm: z0 = orig0 +# asm 1: movdqa z0=int6464#13 +# asm 2: movdqa z0=%xmm12 +movdqa 112(%rsp),%xmm12 + +# qhasm: z12 = orig12 +# asm 1: movdqa z12=int6464#14 +# asm 2: movdqa z12=%xmm13 +movdqa 160(%rsp),%xmm13 + +# qhasm: z4 = orig4 +# asm 1: movdqa z4=int6464#15 +# asm 2: movdqa z4=%xmm14 +movdqa 240(%rsp),%xmm14 + +# qhasm: z8 = orig8 +# asm 1: movdqa z8=int6464#16 +# asm 2: movdqa z8=%xmm15 +movdqa 288(%rsp),%xmm15 + +# qhasm: mainloop1: +._mainloop1: + +# qhasm: z10_stack = z10 +# asm 1: movdqa z10_stack=stack128#21 +# asm 2: movdqa z10_stack=320(%rsp) +movdqa %xmm1,320(%rsp) + +# qhasm: z15_stack = z15 +# asm 1: movdqa z15_stack=stack128#22 +# asm 2: movdqa z15_stack=336(%rsp) +movdqa %xmm2,336(%rsp) + +# qhasm: y4 = z12 +# asm 1: movdqa y4=int6464#2 +# asm 2: movdqa y4=%xmm1 +movdqa %xmm13,%xmm1 + +# qhasm: uint32323232 y4 += z0 +# asm 1: paddd r4=int6464#3 +# asm 2: movdqa r4=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y4 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,y9=int6464#2 +# asm 2: movdqa y9=%xmm1 +movdqa %xmm7,%xmm1 + +# qhasm: uint32323232 y9 += z5 +# asm 1: paddd r9=int6464#3 +# asm 2: movdqa r9=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y9 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,y8=int6464#2 +# asm 2: movdqa y8=%xmm1 
+movdqa %xmm12,%xmm1 + +# qhasm: uint32323232 y8 += z4 +# asm 1: paddd r8=int6464#3 +# asm 2: movdqa r8=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y8 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,y13=int6464#2 +# asm 2: movdqa y13=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: uint32323232 y13 += z9 +# asm 1: paddd r13=int6464#3 +# asm 2: movdqa r13=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y13 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,y12=int6464#2 +# asm 2: movdqa y12=%xmm1 +movdqa %xmm14,%xmm1 + +# qhasm: uint32323232 y12 += z8 +# asm 1: paddd r12=int6464#3 +# asm 2: movdqa r12=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y12 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,y1=int6464#2 +# asm 2: movdqa y1=%xmm1 +movdqa %xmm11,%xmm1 + +# qhasm: uint32323232 y1 += z13 +# asm 1: paddd r1=int6464#3 +# asm 2: movdqa r1=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y1 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,y0=int6464#2 +# asm 2: movdqa y0=%xmm1 +movdqa %xmm15,%xmm1 + +# qhasm: uint32323232 y0 += z12 +# asm 1: paddd r0=int6464#3 +# asm 2: movdqa r0=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y0 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,z10=int6464#2 +# asm 2: movdqa z10=%xmm1 +movdqa 320(%rsp),%xmm1 + +# qhasm: z0_stack = z0 +# asm 1: movdqa z0_stack=stack128#21 +# asm 2: movdqa z0_stack=320(%rsp) +movdqa %xmm12,320(%rsp) + +# qhasm: y5 = z13 +# asm 1: movdqa y5=int6464#3 +# asm 2: movdqa y5=%xmm2 +movdqa %xmm9,%xmm2 + +# qhasm: uint32323232 y5 += z1 +# asm 1: paddd r5=int6464#13 +# asm 2: movdqa r5=%xmm12 +movdqa %xmm2,%xmm12 + +# qhasm: uint32323232 y5 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,y14=int6464#3 +# asm 2: movdqa y14=%xmm2 +movdqa %xmm5,%xmm2 + +# qhasm: uint32323232 y14 += z10 +# asm 1: paddd r14=int6464#13 +# asm 2: movdqa r14=%xmm12 +movdqa %xmm2,%xmm12 + +# qhasm: uint32323232 y14 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,z15=int6464#3 +# asm 2: movdqa z15=%xmm2 
+movdqa 336(%rsp),%xmm2 + +# qhasm: z5_stack = z5 +# asm 1: movdqa z5_stack=stack128#22 +# asm 2: movdqa z5_stack=336(%rsp) +movdqa %xmm0,336(%rsp) + +# qhasm: y3 = z11 +# asm 1: movdqa y3=int6464#1 +# asm 2: movdqa y3=%xmm0 +movdqa %xmm6,%xmm0 + +# qhasm: uint32323232 y3 += z15 +# asm 1: paddd r3=int6464#13 +# asm 2: movdqa r3=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y3 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,y2=int6464#1 +# asm 2: movdqa y2=%xmm0 +movdqa %xmm1,%xmm0 + +# qhasm: uint32323232 y2 += z14 +# asm 1: paddd r2=int6464#13 +# asm 2: movdqa r2=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y2 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,y7=int6464#1 +# asm 2: movdqa y7=%xmm0 +movdqa %xmm2,%xmm0 + +# qhasm: uint32323232 y7 += z3 +# asm 1: paddd r7=int6464#13 +# asm 2: movdqa r7=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y7 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,y6=int6464#1 +# asm 2: movdqa y6=%xmm0 +movdqa %xmm3,%xmm0 + +# qhasm: uint32323232 y6 += z2 +# asm 1: paddd r6=int6464#13 +# asm 2: movdqa r6=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y6 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,y11=int6464#1 +# asm 2: movdqa y11=%xmm0 +movdqa %xmm4,%xmm0 + +# qhasm: uint32323232 y11 += z7 +# asm 1: paddd r11=int6464#13 +# asm 2: movdqa r11=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y11 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,y10=int6464#1 +# asm 2: movdqa y10=%xmm0 +movdqa %xmm10,%xmm0 + +# qhasm: uint32323232 y10 += z6 +# asm 1: paddd r10=int6464#13 +# asm 2: movdqa r10=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y10 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,z0=int6464#1 +# asm 2: movdqa z0=%xmm0 +movdqa 320(%rsp),%xmm0 + +# qhasm: z10_stack = z10 +# asm 1: movdqa z10_stack=stack128#21 +# asm 2: movdqa z10_stack=320(%rsp) +movdqa %xmm1,320(%rsp) + +# qhasm: y1 = z3 +# asm 1: movdqa y1=int6464#2 +# asm 2: movdqa y1=%xmm1 +movdqa %xmm4,%xmm1 + +# 
qhasm: uint32323232 y1 += z0 +# asm 1: paddd r1=int6464#13 +# asm 2: movdqa r1=%xmm12 +movdqa %xmm1,%xmm12 + +# qhasm: uint32323232 y1 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,y15=int6464#2 +# asm 2: movdqa y15=%xmm1 +movdqa %xmm8,%xmm1 + +# qhasm: uint32323232 y15 += z11 +# asm 1: paddd r15=int6464#13 +# asm 2: movdqa r15=%xmm12 +movdqa %xmm1,%xmm12 + +# qhasm: uint32323232 y15 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,z5=int6464#13 +# asm 2: movdqa z5=%xmm12 +movdqa 336(%rsp),%xmm12 + +# qhasm: z15_stack = z15 +# asm 1: movdqa z15_stack=stack128#22 +# asm 2: movdqa z15_stack=336(%rsp) +movdqa %xmm2,336(%rsp) + +# qhasm: y6 = z4 +# asm 1: movdqa y6=int6464#2 +# asm 2: movdqa y6=%xmm1 +movdqa %xmm14,%xmm1 + +# qhasm: uint32323232 y6 += z5 +# asm 1: paddd r6=int6464#3 +# asm 2: movdqa r6=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y6 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,y2=int6464#2 +# asm 2: movdqa y2=%xmm1 +movdqa %xmm0,%xmm1 + +# qhasm: uint32323232 y2 += z1 +# asm 1: paddd r2=int6464#3 +# asm 2: movdqa r2=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y2 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,y7=int6464#2 +# asm 2: movdqa y7=%xmm1 +movdqa %xmm12,%xmm1 + +# qhasm: uint32323232 y7 += z6 +# asm 1: paddd r7=int6464#3 +# asm 2: movdqa r7=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y7 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,y3=int6464#2 +# asm 2: movdqa y3=%xmm1 +movdqa %xmm7,%xmm1 + +# qhasm: uint32323232 y3 += z2 +# asm 1: paddd r3=int6464#3 +# asm 2: movdqa r3=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y3 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,y4=int6464#2 +# asm 2: movdqa y4=%xmm1 +movdqa %xmm5,%xmm1 + +# qhasm: uint32323232 y4 += z7 +# asm 1: paddd r4=int6464#3 +# asm 2: movdqa r4=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y4 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,y0=int6464#2 +# asm 2: movdqa y0=%xmm1 +movdqa %xmm10,%xmm1 + +# qhasm: 
uint32323232 y0 += z3 +# asm 1: paddd r0=int6464#3 +# asm 2: movdqa r0=%xmm2 +movdqa %xmm1,%xmm2 + +# qhasm: uint32323232 y0 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,z10=int6464#2 +# asm 2: movdqa z10=%xmm1 +movdqa 320(%rsp),%xmm1 + +# qhasm: z0_stack = z0 +# asm 1: movdqa z0_stack=stack128#21 +# asm 2: movdqa z0_stack=320(%rsp) +movdqa %xmm0,320(%rsp) + +# qhasm: y5 = z7 +# asm 1: movdqa y5=int6464#1 +# asm 2: movdqa y5=%xmm0 +movdqa %xmm8,%xmm0 + +# qhasm: uint32323232 y5 += z4 +# asm 1: paddd r5=int6464#3 +# asm 2: movdqa r5=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: uint32323232 y5 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,y11=int6464#1 +# asm 2: movdqa y11=%xmm0 +movdqa %xmm11,%xmm0 + +# qhasm: uint32323232 y11 += z10 +# asm 1: paddd r11=int6464#3 +# asm 2: movdqa r11=%xmm2 +movdqa %xmm0,%xmm2 + +# qhasm: uint32323232 y11 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,z15=int6464#3 +# asm 2: movdqa z15=%xmm2 +movdqa 336(%rsp),%xmm2 + +# qhasm: z5_stack = z5 +# asm 1: movdqa z5_stack=stack128#22 +# asm 2: movdqa z5_stack=336(%rsp) +movdqa %xmm12,336(%rsp) + +# qhasm: y12 = z14 +# asm 1: movdqa y12=int6464#1 +# asm 2: movdqa y12=%xmm0 +movdqa %xmm3,%xmm0 + +# qhasm: uint32323232 y12 += z15 +# asm 1: paddd r12=int6464#13 +# asm 2: movdqa r12=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y12 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,y8=int6464#1 +# asm 2: movdqa y8=%xmm0 +movdqa %xmm1,%xmm0 + +# qhasm: uint32323232 y8 += z11 +# asm 1: paddd r8=int6464#13 +# asm 2: movdqa r8=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y8 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,y13=int6464#1 +# asm 2: movdqa y13=%xmm0 +movdqa %xmm2,%xmm0 + +# qhasm: uint32323232 y13 += z12 +# asm 1: paddd r13=int6464#13 +# asm 2: movdqa r13=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y13 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,y9=int6464#1 +# asm 2: movdqa y9=%xmm0 +movdqa %xmm6,%xmm0 + +# qhasm: uint32323232 y9 += z8 +# 
asm 1: paddd r9=int6464#13 +# asm 2: movdqa r9=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y9 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,y14=int6464#1 +# asm 2: movdqa y14=%xmm0 +movdqa %xmm13,%xmm0 + +# qhasm: uint32323232 y14 += z13 +# asm 1: paddd r14=int6464#13 +# asm 2: movdqa r14=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y14 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,y10=int6464#1 +# asm 2: movdqa y10=%xmm0 +movdqa %xmm15,%xmm0 + +# qhasm: uint32323232 y10 += z9 +# asm 1: paddd r10=int6464#13 +# asm 2: movdqa r10=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y10 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,y15=int6464#1 +# asm 2: movdqa y15=%xmm0 +movdqa %xmm9,%xmm0 + +# qhasm: uint32323232 y15 += z14 +# asm 1: paddd r15=int6464#13 +# asm 2: movdqa r15=%xmm12 +movdqa %xmm0,%xmm12 + +# qhasm: uint32323232 y15 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,z0=int6464#13 +# asm 2: movdqa z0=%xmm12 +movdqa 320(%rsp),%xmm12 + +# qhasm: z5 = z5_stack +# asm 1: movdqa z5=int6464#1 +# asm 2: movdqa z5=%xmm0 +movdqa 336(%rsp),%xmm0 + +# qhasm: unsigned>? 
i -= 2 +# asm 1: sub $2, +ja ._mainloop1 + +# qhasm: uint32323232 z0 += orig0 +# asm 1: paddd in0=int64#3 +# asm 2: movd in0=%rdx +movd %xmm12,%rdx + +# qhasm: in1 = z1 +# asm 1: movd in1=int64#4 +# asm 2: movd in1=%rcx +movd %xmm7,%rcx + +# qhasm: in2 = z2 +# asm 1: movd in2=int64#5 +# asm 2: movd in2=%r8 +movd %xmm10,%r8 + +# qhasm: in3 = z3 +# asm 1: movd in3=int64#6 +# asm 2: movd in3=%r9 +movd %xmm4,%r9 + +# qhasm: z0 <<<= 96 +# asm 1: pshufd $0x39,in0=int64#3 +# asm 2: movd in0=%rdx +movd %xmm12,%rdx + +# qhasm: in1 = z1 +# asm 1: movd in1=int64#4 +# asm 2: movd in1=%rcx +movd %xmm7,%rcx + +# qhasm: in2 = z2 +# asm 1: movd in2=int64#5 +# asm 2: movd in2=%r8 +movd %xmm10,%r8 + +# qhasm: in3 = z3 +# asm 1: movd in3=int64#6 +# asm 2: movd in3=%r9 +movd %xmm4,%r9 + +# qhasm: z0 <<<= 96 +# asm 1: pshufd $0x39,in0=int64#3 +# asm 2: movd in0=%rdx +movd %xmm12,%rdx + +# qhasm: in1 = z1 +# asm 1: movd in1=int64#4 +# asm 2: movd in1=%rcx +movd %xmm7,%rcx + +# qhasm: in2 = z2 +# asm 1: movd in2=int64#5 +# asm 2: movd in2=%r8 +movd %xmm10,%r8 + +# qhasm: in3 = z3 +# asm 1: movd in3=int64#6 +# asm 2: movd in3=%r9 +movd %xmm4,%r9 + +# qhasm: z0 <<<= 96 +# asm 1: pshufd $0x39,in0=int64#3 +# asm 2: movd in0=%rdx +movd %xmm12,%rdx + +# qhasm: in1 = z1 +# asm 1: movd in1=int64#4 +# asm 2: movd in1=%rcx +movd %xmm7,%rcx + +# qhasm: in2 = z2 +# asm 1: movd in2=int64#5 +# asm 2: movd in2=%r8 +movd %xmm10,%r8 + +# qhasm: in3 = z3 +# asm 1: movd in3=int64#6 +# asm 2: movd in3=%r9 +movd %xmm4,%r9 + +# qhasm: (uint32) in0 ^= *(uint32 *) (m + 192) +# asm 1: xorl 192(in4=int64#3 +# asm 2: movd in4=%rdx +movd %xmm14,%rdx + +# qhasm: in5 = z5 +# asm 1: movd in5=int64#4 +# asm 2: movd in5=%rcx +movd %xmm0,%rcx + +# qhasm: in6 = z6 +# asm 1: movd in6=int64#5 +# asm 2: movd in6=%r8 +movd %xmm5,%r8 + +# qhasm: in7 = z7 +# asm 1: movd in7=int64#6 +# asm 2: movd in7=%r9 +movd %xmm8,%r9 + +# qhasm: z4 <<<= 96 +# asm 1: pshufd $0x39,in4=int64#3 +# asm 2: movd in4=%rdx +movd %xmm14,%rdx + +# 
qhasm: in5 = z5 +# asm 1: movd in5=int64#4 +# asm 2: movd in5=%rcx +movd %xmm0,%rcx + +# qhasm: in6 = z6 +# asm 1: movd in6=int64#5 +# asm 2: movd in6=%r8 +movd %xmm5,%r8 + +# qhasm: in7 = z7 +# asm 1: movd in7=int64#6 +# asm 2: movd in7=%r9 +movd %xmm8,%r9 + +# qhasm: z4 <<<= 96 +# asm 1: pshufd $0x39,in4=int64#3 +# asm 2: movd in4=%rdx +movd %xmm14,%rdx + +# qhasm: in5 = z5 +# asm 1: movd in5=int64#4 +# asm 2: movd in5=%rcx +movd %xmm0,%rcx + +# qhasm: in6 = z6 +# asm 1: movd in6=int64#5 +# asm 2: movd in6=%r8 +movd %xmm5,%r8 + +# qhasm: in7 = z7 +# asm 1: movd in7=int64#6 +# asm 2: movd in7=%r9 +movd %xmm8,%r9 + +# qhasm: z4 <<<= 96 +# asm 1: pshufd $0x39,in4=int64#3 +# asm 2: movd in4=%rdx +movd %xmm14,%rdx + +# qhasm: in5 = z5 +# asm 1: movd in5=int64#4 +# asm 2: movd in5=%rcx +movd %xmm0,%rcx + +# qhasm: in6 = z6 +# asm 1: movd in6=int64#5 +# asm 2: movd in6=%r8 +movd %xmm5,%r8 + +# qhasm: in7 = z7 +# asm 1: movd in7=int64#6 +# asm 2: movd in7=%r9 +movd %xmm8,%r9 + +# qhasm: (uint32) in4 ^= *(uint32 *) (m + 208) +# asm 1: xorl 208(in8=int64#3 +# asm 2: movd in8=%rdx +movd %xmm15,%rdx + +# qhasm: in9 = z9 +# asm 1: movd in9=int64#4 +# asm 2: movd in9=%rcx +movd %xmm11,%rcx + +# qhasm: in10 = z10 +# asm 1: movd in10=int64#5 +# asm 2: movd in10=%r8 +movd %xmm1,%r8 + +# qhasm: in11 = z11 +# asm 1: movd in11=int64#6 +# asm 2: movd in11=%r9 +movd %xmm6,%r9 + +# qhasm: z8 <<<= 96 +# asm 1: pshufd $0x39,in8=int64#3 +# asm 2: movd in8=%rdx +movd %xmm15,%rdx + +# qhasm: in9 = z9 +# asm 1: movd in9=int64#4 +# asm 2: movd in9=%rcx +movd %xmm11,%rcx + +# qhasm: in10 = z10 +# asm 1: movd in10=int64#5 +# asm 2: movd in10=%r8 +movd %xmm1,%r8 + +# qhasm: in11 = z11 +# asm 1: movd in11=int64#6 +# asm 2: movd in11=%r9 +movd %xmm6,%r9 + +# qhasm: z8 <<<= 96 +# asm 1: pshufd $0x39,in8=int64#3 +# asm 2: movd in8=%rdx +movd %xmm15,%rdx + +# qhasm: in9 = z9 +# asm 1: movd in9=int64#4 +# asm 2: movd in9=%rcx +movd %xmm11,%rcx + +# qhasm: in10 = z10 +# asm 1: movd in10=int64#5 +# asm 
2: movd in10=%r8 +movd %xmm1,%r8 + +# qhasm: in11 = z11 +# asm 1: movd in11=int64#6 +# asm 2: movd in11=%r9 +movd %xmm6,%r9 + +# qhasm: z8 <<<= 96 +# asm 1: pshufd $0x39,in8=int64#3 +# asm 2: movd in8=%rdx +movd %xmm15,%rdx + +# qhasm: in9 = z9 +# asm 1: movd in9=int64#4 +# asm 2: movd in9=%rcx +movd %xmm11,%rcx + +# qhasm: in10 = z10 +# asm 1: movd in10=int64#5 +# asm 2: movd in10=%r8 +movd %xmm1,%r8 + +# qhasm: in11 = z11 +# asm 1: movd in11=int64#6 +# asm 2: movd in11=%r9 +movd %xmm6,%r9 + +# qhasm: (uint32) in8 ^= *(uint32 *) (m + 224) +# asm 1: xorl 224(in12=int64#3 +# asm 2: movd in12=%rdx +movd %xmm13,%rdx + +# qhasm: in13 = z13 +# asm 1: movd in13=int64#4 +# asm 2: movd in13=%rcx +movd %xmm9,%rcx + +# qhasm: in14 = z14 +# asm 1: movd in14=int64#5 +# asm 2: movd in14=%r8 +movd %xmm3,%r8 + +# qhasm: in15 = z15 +# asm 1: movd in15=int64#6 +# asm 2: movd in15=%r9 +movd %xmm2,%r9 + +# qhasm: z12 <<<= 96 +# asm 1: pshufd $0x39,in12=int64#3 +# asm 2: movd in12=%rdx +movd %xmm13,%rdx + +# qhasm: in13 = z13 +# asm 1: movd in13=int64#4 +# asm 2: movd in13=%rcx +movd %xmm9,%rcx + +# qhasm: in14 = z14 +# asm 1: movd in14=int64#5 +# asm 2: movd in14=%r8 +movd %xmm3,%r8 + +# qhasm: in15 = z15 +# asm 1: movd in15=int64#6 +# asm 2: movd in15=%r9 +movd %xmm2,%r9 + +# qhasm: z12 <<<= 96 +# asm 1: pshufd $0x39,in12=int64#3 +# asm 2: movd in12=%rdx +movd %xmm13,%rdx + +# qhasm: in13 = z13 +# asm 1: movd in13=int64#4 +# asm 2: movd in13=%rcx +movd %xmm9,%rcx + +# qhasm: in14 = z14 +# asm 1: movd in14=int64#5 +# asm 2: movd in14=%r8 +movd %xmm3,%r8 + +# qhasm: in15 = z15 +# asm 1: movd in15=int64#6 +# asm 2: movd in15=%r9 +movd %xmm2,%r9 + +# qhasm: z12 <<<= 96 +# asm 1: pshufd $0x39,in12=int64#3 +# asm 2: movd in12=%rdx +movd %xmm13,%rdx + +# qhasm: in13 = z13 +# asm 1: movd in13=int64#4 +# asm 2: movd in13=%rcx +movd %xmm9,%rcx + +# qhasm: in14 = z14 +# asm 1: movd in14=int64#5 +# asm 2: movd in14=%r8 +movd %xmm3,%r8 + +# qhasm: in15 = z15 +# asm 1: movd in15=int64#6 +# asm 2: 
movd in15=%r9 +movd %xmm2,%r9 + +# qhasm: (uint32) in12 ^= *(uint32 *) (m + 240) +# asm 1: xorl 240(bytes=int64#6 +# asm 2: movq bytes=%r9 +movq 408(%rsp),%r9 + +# qhasm: bytes -= 256 +# asm 1: sub $256,? bytes - 0 +# asm 1: cmp $0, +jbe ._done +# comment:fp stack unchanged by fallthrough + +# qhasm: bytesbetween1and255: +._bytesbetween1and255: + +# qhasm: unsignedctarget=int64#3 +# asm 2: mov ctarget=%rdx +mov %rdi,%rdx + +# qhasm: out = &tmp +# asm 1: leaq out=int64#1 +# asm 2: leaq out=%rdi +leaq 416(%rsp),%rdi + +# qhasm: i = bytes +# asm 1: mov i=int64#4 +# asm 2: mov i=%rcx +mov %r9,%rcx + +# qhasm: while (i) { *out++ = *m++; --i } +rep movsb + +# qhasm: out = &tmp +# asm 1: leaq out=int64#1 +# asm 2: leaq out=%rdi +leaq 416(%rsp),%rdi + +# qhasm: m = &tmp +# asm 1: leaq m=int64#2 +# asm 2: leaq m=%rsi +leaq 416(%rsp),%rsi +# comment:fp stack unchanged by fallthrough + +# qhasm: nocopy: +._nocopy: + +# qhasm: bytes_backup = bytes +# asm 1: movq bytes_backup=stack64#8 +# asm 2: movq bytes_backup=408(%rsp) +movq %r9,408(%rsp) + +# qhasm: diag0 = x0 +# asm 1: movdqa diag0=int6464#1 +# asm 2: movdqa diag0=%xmm0 +movdqa 48(%rsp),%xmm0 + +# qhasm: diag1 = x1 +# asm 1: movdqa diag1=int6464#2 +# asm 2: movdqa diag1=%xmm1 +movdqa 0(%rsp),%xmm1 + +# qhasm: diag2 = x2 +# asm 1: movdqa diag2=int6464#3 +# asm 2: movdqa diag2=%xmm2 +movdqa 16(%rsp),%xmm2 + +# qhasm: diag3 = x3 +# asm 1: movdqa diag3=int6464#4 +# asm 2: movdqa diag3=%xmm3 +movdqa 32(%rsp),%xmm3 + +# qhasm: a0 = diag1 +# asm 1: movdqa a0=int6464#5 +# asm 2: movdqa a0=%xmm4 +movdqa %xmm1,%xmm4 + +# qhasm: i = 12 +# asm 1: mov $12,>i=int64#4 +# asm 2: mov $12,>i=%rcx +mov $12,%rcx + +# qhasm: mainloop2: +._mainloop2: + +# qhasm: uint32323232 a0 += diag0 +# asm 1: paddd a1=int6464#6 +# asm 2: movdqa a1=%xmm5 +movdqa %xmm0,%xmm5 + +# qhasm: b0 = a0 +# asm 1: movdqa b0=int6464#7 +# asm 2: movdqa b0=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: uint32323232 a0 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld 
$25,a2=int6464#5 +# asm 2: movdqa a2=%xmm4 +movdqa %xmm3,%xmm4 + +# qhasm: b1 = a1 +# asm 1: movdqa b1=int6464#7 +# asm 2: movdqa b1=%xmm6 +movdqa %xmm5,%xmm6 + +# qhasm: uint32323232 a1 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,a3=int6464#6 +# asm 2: movdqa a3=%xmm5 +movdqa %xmm2,%xmm5 + +# qhasm: b2 = a2 +# asm 1: movdqa b2=int6464#7 +# asm 2: movdqa b2=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: uint32323232 a2 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,a4=int6464#5 +# asm 2: movdqa a4=%xmm4 +movdqa %xmm3,%xmm4 + +# qhasm: b3 = a3 +# asm 1: movdqa b3=int6464#7 +# asm 2: movdqa b3=%xmm6 +movdqa %xmm5,%xmm6 + +# qhasm: uint32323232 a3 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,a5=int6464#6 +# asm 2: movdqa a5=%xmm5 +movdqa %xmm0,%xmm5 + +# qhasm: b4 = a4 +# asm 1: movdqa b4=int6464#7 +# asm 2: movdqa b4=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: uint32323232 a4 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,a6=int6464#5 +# asm 2: movdqa a6=%xmm4 +movdqa %xmm1,%xmm4 + +# qhasm: b5 = a5 +# asm 1: movdqa b5=int6464#7 +# asm 2: movdqa b5=%xmm6 +movdqa %xmm5,%xmm6 + +# qhasm: uint32323232 a5 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,a7=int6464#6 +# asm 2: movdqa a7=%xmm5 +movdqa %xmm2,%xmm5 + +# qhasm: b6 = a6 +# asm 1: movdqa b6=int6464#7 +# asm 2: movdqa b6=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: uint32323232 a6 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,a0=int6464#5 +# asm 2: movdqa a0=%xmm4 +movdqa %xmm1,%xmm4 + +# qhasm: b7 = a7 +# asm 1: movdqa b7=int6464#7 +# asm 2: movdqa b7=%xmm6 +movdqa %xmm5,%xmm6 + +# qhasm: uint32323232 a7 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,a1=int6464#6 +# asm 2: movdqa a1=%xmm5 +movdqa %xmm0,%xmm5 + +# qhasm: b0 = a0 +# asm 1: movdqa b0=int6464#7 +# asm 2: movdqa b0=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: uint32323232 a0 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,a2=int6464#5 +# asm 2: movdqa a2=%xmm4 +movdqa %xmm3,%xmm4 + +# qhasm: b1 = a1 +# asm 1: movdqa b1=int6464#7 +# asm 
2: movdqa b1=%xmm6 +movdqa %xmm5,%xmm6 + +# qhasm: uint32323232 a1 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,a3=int6464#6 +# asm 2: movdqa a3=%xmm5 +movdqa %xmm2,%xmm5 + +# qhasm: b2 = a2 +# asm 1: movdqa b2=int6464#7 +# asm 2: movdqa b2=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: uint32323232 a2 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,a4=int6464#5 +# asm 2: movdqa a4=%xmm4 +movdqa %xmm3,%xmm4 + +# qhasm: b3 = a3 +# asm 1: movdqa b3=int6464#7 +# asm 2: movdqa b3=%xmm6 +movdqa %xmm5,%xmm6 + +# qhasm: uint32323232 a3 <<= 18 +# asm 1: pslld $18,>= 14 +# asm 1: psrld $14,a5=int6464#6 +# asm 2: movdqa a5=%xmm5 +movdqa %xmm0,%xmm5 + +# qhasm: b4 = a4 +# asm 1: movdqa b4=int6464#7 +# asm 2: movdqa b4=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: uint32323232 a4 <<= 7 +# asm 1: pslld $7,>= 25 +# asm 1: psrld $25,a6=int6464#5 +# asm 2: movdqa a6=%xmm4 +movdqa %xmm1,%xmm4 + +# qhasm: b5 = a5 +# asm 1: movdqa b5=int6464#7 +# asm 2: movdqa b5=%xmm6 +movdqa %xmm5,%xmm6 + +# qhasm: uint32323232 a5 <<= 9 +# asm 1: pslld $9,>= 23 +# asm 1: psrld $23,a7=int6464#6 +# asm 2: movdqa a7=%xmm5 +movdqa %xmm2,%xmm5 + +# qhasm: b6 = a6 +# asm 1: movdqa b6=int6464#7 +# asm 2: movdqa b6=%xmm6 +movdqa %xmm4,%xmm6 + +# qhasm: uint32323232 a6 <<= 13 +# asm 1: pslld $13,>= 19 +# asm 1: psrld $19,? 
i -= 4 +# asm 1: sub $4,a0=int6464#5 +# asm 2: movdqa a0=%xmm4 +movdqa %xmm1,%xmm4 + +# qhasm: b7 = a7 +# asm 1: movdqa b7=int6464#7 +# asm 2: movdqa b7=%xmm6 +movdqa %xmm5,%xmm6 + +# qhasm: uint32323232 a7 <<= 18 +# asm 1: pslld $18,b0=int6464#8,>b0=int6464#8 +# asm 2: pxor >b0=%xmm7,>b0=%xmm7 +pxor %xmm7,%xmm7 + +# qhasm: uint32323232 b7 >>= 14 +# asm 1: psrld $14, +ja ._mainloop2 + +# qhasm: uint32323232 diag0 += x0 +# asm 1: paddd in0=int64#4 +# asm 2: movd in0=%rcx +movd %xmm0,%rcx + +# qhasm: in12 = diag1 +# asm 1: movd in12=int64#5 +# asm 2: movd in12=%r8 +movd %xmm1,%r8 + +# qhasm: in8 = diag2 +# asm 1: movd in8=int64#6 +# asm 2: movd in8=%r9 +movd %xmm2,%r9 + +# qhasm: in4 = diag3 +# asm 1: movd in4=int64#7 +# asm 2: movd in4=%rax +movd %xmm3,%rax + +# qhasm: diag0 <<<= 96 +# asm 1: pshufd $0x39,in5=int64#4 +# asm 2: movd in5=%rcx +movd %xmm0,%rcx + +# qhasm: in1 = diag1 +# asm 1: movd in1=int64#5 +# asm 2: movd in1=%r8 +movd %xmm1,%r8 + +# qhasm: in13 = diag2 +# asm 1: movd in13=int64#6 +# asm 2: movd in13=%r9 +movd %xmm2,%r9 + +# qhasm: in9 = diag3 +# asm 1: movd in9=int64#7 +# asm 2: movd in9=%rax +movd %xmm3,%rax + +# qhasm: diag0 <<<= 96 +# asm 1: pshufd $0x39,in10=int64#4 +# asm 2: movd in10=%rcx +movd %xmm0,%rcx + +# qhasm: in6 = diag1 +# asm 1: movd in6=int64#5 +# asm 2: movd in6=%r8 +movd %xmm1,%r8 + +# qhasm: in2 = diag2 +# asm 1: movd in2=int64#6 +# asm 2: movd in2=%r9 +movd %xmm2,%r9 + +# qhasm: in14 = diag3 +# asm 1: movd in14=int64#7 +# asm 2: movd in14=%rax +movd %xmm3,%rax + +# qhasm: diag0 <<<= 96 +# asm 1: pshufd $0x39,in15=int64#4 +# asm 2: movd in15=%rcx +movd %xmm0,%rcx + +# qhasm: in11 = diag1 +# asm 1: movd in11=int64#5 +# asm 2: movd in11=%r8 +movd %xmm1,%r8 + +# qhasm: in7 = diag2 +# asm 1: movd in7=int64#6 +# asm 2: movd in7=%r9 +movd %xmm2,%r9 + +# qhasm: in3 = diag3 +# asm 1: movd in3=int64#7 +# asm 2: movd in3=%rax +movd %xmm3,%rax + +# qhasm: (uint32) in15 ^= *(uint32 *) (m + 60) +# asm 1: xorl 60(bytes=int64#6 +# asm 2: movq 
bytes=%r9 +movq 408(%rsp),%r9 + +# qhasm: in8 = ((uint32 *)&x2)[0] +# asm 1: movl in8=int64#4d +# asm 2: movl in8=%ecx +movl 16(%rsp),%ecx + +# qhasm: in9 = ((uint32 *)&x3)[1] +# asm 1: movl 4+in9=int64#5d +# asm 2: movl 4+in9=%r8d +movl 4+32(%rsp),%r8d + +# qhasm: in8 += 1 +# asm 1: add $1,in9=int64#5 +# asm 2: mov in9=%r8 +mov %rcx,%r8 + +# qhasm: (uint64) in9 >>= 32 +# asm 1: shr $32,x2=stack128#2 +# asm 2: movl x2=16(%rsp) +movl %ecx,16(%rsp) + +# qhasm: ((uint32 *)&x3)[1] = in9 +# asm 1: movl ? unsigned +ja ._bytesatleast65 +# comment:fp stack unchanged by jump + +# qhasm: goto bytesatleast64 if !unsigned< +jae ._bytesatleast64 + +# qhasm: m = out +# asm 1: mov m=int64#2 +# asm 2: mov m=%rsi +mov %rdi,%rsi + +# qhasm: out = ctarget +# asm 1: mov out=int64#1 +# asm 2: mov out=%rdi +mov %rdx,%rdi + +# qhasm: i = bytes +# asm 1: mov i=int64#4 +# asm 2: mov i=%rcx +mov %r9,%rcx + +# qhasm: while (i) { *out++ = *m++; --i } +rep movsb +# comment:fp stack unchanged by fallthrough + +# qhasm: bytesatleast64: +._bytesatleast64: +# comment:fp stack unchanged by fallthrough + +# qhasm: done: +._done: + +# qhasm: r11_caller = r11_stack +# asm 1: movq r11_caller=int64#9 +# asm 2: movq r11_caller=%r11 +movq 352(%rsp),%r11 + +# qhasm: r12_caller = r12_stack +# asm 1: movq r12_caller=int64#10 +# asm 2: movq r12_caller=%r12 +movq 360(%rsp),%r12 + +# qhasm: r13_caller = r13_stack +# asm 1: movq r13_caller=int64#11 +# asm 2: movq r13_caller=%r13 +movq 368(%rsp),%r13 + +# qhasm: r14_caller = r14_stack +# asm 1: movq r14_caller=int64#12 +# asm 2: movq r14_caller=%r14 +movq 376(%rsp),%r14 + +# qhasm: r15_caller = r15_stack +# asm 1: movq r15_caller=int64#13 +# asm 2: movq r15_caller=%r15 +movq 384(%rsp),%r15 + +# qhasm: rbx_caller = rbx_stack +# asm 1: movq rbx_caller=int64#14 +# asm 2: movq rbx_caller=%rbx +movq 392(%rsp),%rbx + +# qhasm: rbp_caller = rbp_stack +# asm 1: movq rbp_caller=int64#15 +# asm 2: movq rbp_caller=%rbp +movq 400(%rsp),%rbp + +# qhasm: leave +add %r11,%rsp 
+xor %rax,%rax +xor %rdx,%rdx +ret + +# qhasm: bytesatleast65: +._bytesatleast65: + +# qhasm: bytes -= 64 +# asm 1: sub $64, @@ -1064,7 +1068,7 @@ const char *Packet::errorString(ErrorCode e) void Packet::armor(const void *key,bool encryptPayload,unsigned int counter) { - uint8_t mangledKey[32],macKey[32],mac[16]; + uint8_t mangledKey[32]; uint8_t *const data = reinterpret_cast(unsafeData()); // Mask least significant 3 bits of packet ID with counter to embed packet send counter for QoS use @@ -1074,23 +1078,47 @@ void Packet::armor(const void *key,bool encryptPayload,unsigned int counter) setCipher(encryptPayload ? ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012 : ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE); _salsa20MangleKey((const unsigned char *)key,mangledKey); + +#ifdef ZT_USE_X64_ASM_SALSA2012 + const unsigned int payloadLen = (encryptPayload) ? (size() - ZT_PACKET_IDX_VERB) : 0; + uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8]; + zt_salsa2012_amd64_xmm6(reinterpret_cast(keyStream),payloadLen + 64,reinterpret_cast(data + ZT_PACKET_IDX_IV),reinterpret_cast(mangledKey)); + + uint64_t *ksptr = keyStream + 8; // encryption starts after first Salsa20 block + uint8_t *dptr = data + ZT_PACKET_IDX_VERB; + unsigned int ksrem = payloadLen; + while (ksrem >= 8) { + ksrem -= 8; + *(reinterpret_cast(dptr)) ^= *(ksptr++); + dptr += 8; + } + for(unsigned int i=0;i(ksptr)[i]; + } + + uint64_t mac[2]; + Poly1305::compute(mac,data + ZT_PACKET_IDX_VERB,size() - ZT_PACKET_IDX_VERB,keyStream); + memcpy(data + ZT_PACKET_IDX_MAC,mac,8); +#else Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV); - // MAC key is always the first 32 bytes of the Salsa20 key stream - // This is the same construction DJB's NaCl library uses + uint64_t macKey[4]; s20.crypt12(ZERO_KEY,macKey,sizeof(macKey)); uint8_t *const payload = data + ZT_PACKET_IDX_VERB; const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB; if (encryptPayload) s20.crypt12(payload,payload,payloadLen); + + 
uint64_t mac[2]; Poly1305::compute(mac,payload,payloadLen,macKey); memcpy(data + ZT_PACKET_IDX_MAC,mac,8); +#endif } bool Packet::dearmor(const void *key) { - uint8_t mangledKey[32],macKey[32],mac[16]; + uint8_t mangledKey[32]; uint8_t *const data = reinterpret_cast(unsafeData()); const unsigned int payloadLen = size() - ZT_PACKET_IDX_VERB; unsigned char *const payload = data + ZT_PACKET_IDX_VERB; @@ -1098,9 +1126,37 @@ bool Packet::dearmor(const void *key) if ((cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)) { _salsa20MangleKey((const unsigned char *)key,mangledKey); + +#ifdef ZT_USE_X64_ASM_SALSA2012 + uint64_t keyStream[(ZT_PROTO_MAX_PACKET_LENGTH + 64 + 8) / 8]; + zt_salsa2012_amd64_xmm6(reinterpret_cast(keyStream),((cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012) ? (payloadLen + 64) : 64),reinterpret_cast(data + ZT_PACKET_IDX_IV),reinterpret_cast(mangledKey)); + + uint64_t mac[2]; + Poly1305::compute(mac,payload,payloadLen,keyStream); + if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8)) + return false; // MAC failed, packet is corrupt, modified, or is not from the sender + + if (cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012) { + uint64_t *ksptr = keyStream + 8; // encryption starts after first Salsa20 block + uint8_t *dptr = data + ZT_PACKET_IDX_VERB; + unsigned int ksrem = payloadLen; + while (ksrem >= 8) { + ksrem -= 8; + *(reinterpret_cast(dptr)) ^= *(ksptr++); + dptr += 8; + } + for(unsigned int i=0;i(ksptr)[i]; + } + } + + return true; +#else Salsa20 s20(mangledKey,data + ZT_PACKET_IDX_IV); + uint64_t macKey[4]; s20.crypt12(ZERO_KEY,macKey,sizeof(macKey)); + uint64_t mac[2]; Poly1305::compute(mac,payload,payloadLen,macKey); if (!Utils::secureEq(mac,data + ZT_PACKET_IDX_MAC,8)) return false; // MAC failed, packet is corrupt, modified, or is not from the sender @@ -1109,6 +1165,7 @@ bool Packet::dearmor(const void *key) s20.crypt12(payload,payload,payloadLen); return true; 
+#endif } else { return false; // unrecognized cipher suite } diff --git a/selftest.cpp b/selftest.cpp index fe0aa933b..b7a1cc4d4 100644 --- a/selftest.cpp +++ b/selftest.cpp @@ -54,6 +54,10 @@ #include "controller/JSONDB.hpp" +#ifdef ZT_USE_X64_ASM_SALSA2012 +#include "ext/x64-salsa2012-asm/salsa2012.h" +#endif + #ifdef __WINDOWS__ #include #endif @@ -204,6 +208,24 @@ static int testCrypto() ::free((void *)bb); } +#ifdef ZT_USE_X64_ASM_SALSA2012 + std::cout << "[crypto] Benchmarking Salsa20/12 fast x64 ASM... "; std::cout.flush(); + { + unsigned char *bb = (unsigned char *)::malloc(1234567); + for(unsigned int i=0;i<1234567;++i) + bb[i] = (unsigned char)i; + double bytes = 0.0; + uint64_t start = OSUtils::now(); + for(unsigned int i=0;i<200;++i) { + zt_salsa2012_amd64_xmm6_xor(bb,bb,1234567,s20TV0Iv,s20TV0Key); + bytes += 1234567.0; + } + uint64_t end = OSUtils::now(); + std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second" << std::endl; + ::free((void *)bb); + } +#endif + std::cout << "[crypto] Benchmarking Salsa20/20... "; std::cout.flush(); { unsigned char *bb = (unsigned char *)::malloc(1234567);