diff --git a/.gitignore b/.gitignore index 2a5d32757..f8d4e7e44 100755 --- a/.gitignore +++ b/.gitignore @@ -125,7 +125,6 @@ attic/world/mkworld workspace/ workspace2/ zeroidc/target/ -tmp/ #snapcraft specifics /parts/ diff --git a/zeroidc/vendor/ring/pregenerated/tmp/aesni-gcm-x86_64-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/aesni-gcm-x86_64-nasm.asm new file mode 100644 index 000000000..d975309f7 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/aesni-gcm-x86_64-nasm.asm @@ -0,0 +1,1025 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + + +ALIGN 32 +_aesni_ctr32_ghash_6x: + + vmovdqu xmm2,XMMWORD[32+r11] + sub rdx,6 + vpxor xmm4,xmm4,xmm4 + vmovdqu xmm15,XMMWORD[((0-128))+rcx] + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpaddb xmm12,xmm11,xmm2 + vpaddb xmm13,xmm12,xmm2 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm9,xmm1,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm4 + jmp NEAR $L$oop6x + +ALIGN 32 +$L$oop6x: + add ebx,100663296 + jc NEAR $L$handle_ctr32 + vmovdqu xmm3,XMMWORD[((0-32))+r9] + vpaddb xmm1,xmm14,xmm2 + vpxor xmm10,xmm10,xmm15 + vpxor xmm11,xmm11,xmm15 + +$L$resume_ctr32: + vmovdqu XMMWORD[r8],xmm1 + vpclmulqdq xmm5,xmm7,xmm3,0x10 + vpxor xmm12,xmm12,xmm15 + vmovups xmm2,XMMWORD[((16-128))+rcx] + vpclmulqdq xmm6,xmm7,xmm3,0x01 + + + + + + + + + + + + + + + + + + xor r12,r12 + cmp r15,r14 + + vaesenc xmm9,xmm9,xmm2 + vmovdqu xmm0,XMMWORD[((48+8))+rsp] + vpxor xmm13,xmm13,xmm15 + vpclmulqdq xmm1,xmm7,xmm3,0x00 + vaesenc xmm10,xmm10,xmm2 + vpxor xmm14,xmm14,xmm15 + setnc r12b + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vaesenc xmm11,xmm11,xmm2 + vmovdqu xmm3,XMMWORD[((16-32))+r9] + neg r12 + vaesenc xmm12,xmm12,xmm2 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm3,0x00 + vpxor xmm8,xmm8,xmm4 + vaesenc xmm13,xmm13,xmm2 + vpxor xmm4,xmm1,xmm5 + and r12,0x60 + vmovups 
xmm15,XMMWORD[((32-128))+rcx] + vpclmulqdq xmm1,xmm0,xmm3,0x10 + vaesenc xmm14,xmm14,xmm2 + + vpclmulqdq xmm2,xmm0,xmm3,0x01 + lea r14,[r12*1+r14] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpclmulqdq xmm3,xmm0,xmm3,0x11 + vmovdqu xmm0,XMMWORD[((64+8))+rsp] + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[88+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[80+r14] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((32+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((40+8))+rsp],r12 + vmovdqu xmm5,XMMWORD[((48-32))+r9] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((48-128))+rcx] + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm0,xmm5,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm5,0x10 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm7,xmm7,xmm3 + vpclmulqdq xmm3,xmm0,xmm5,0x01 + vaesenc xmm11,xmm11,xmm15 + vpclmulqdq xmm5,xmm0,xmm5,0x11 + vmovdqu xmm0,XMMWORD[((80+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vpxor xmm4,xmm4,xmm1 + vmovdqu xmm1,XMMWORD[((64-32))+r9] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((64-128))+rcx] + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm1,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm1,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[72+r14] + vpxor xmm7,xmm7,xmm5 + vpclmulqdq xmm5,xmm0,xmm1,0x01 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[64+r14] + vpclmulqdq xmm1,xmm0,xmm1,0x11 + vmovdqu xmm0,XMMWORD[((96+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((48+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((56+8))+rsp],r12 + vpxor xmm4,xmm4,xmm2 + vmovdqu xmm2,XMMWORD[((96-32))+r9] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((80-128))+rcx] + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm2,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm2,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[56+r14] + vpxor xmm7,xmm7,xmm1 + vpclmulqdq xmm1,xmm0,xmm2,0x01 + vpxor 
xmm8,xmm8,XMMWORD[((112+8))+rsp] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[48+r14] + vpclmulqdq xmm2,xmm0,xmm2,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((64+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((72+8))+rsp],r12 + vpxor xmm4,xmm4,xmm3 + vmovdqu xmm3,XMMWORD[((112-32))+r9] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((96-128))+rcx] + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm8,xmm3,0x10 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm8,xmm3,0x01 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[40+r14] + vpxor xmm7,xmm7,xmm2 + vpclmulqdq xmm2,xmm8,xmm3,0x00 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[32+r14] + vpclmulqdq xmm8,xmm8,xmm3,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((80+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((88+8))+rsp],r12 + vpxor xmm6,xmm6,xmm5 + vaesenc xmm14,xmm14,xmm15 + vpxor xmm6,xmm6,xmm1 + + vmovups xmm15,XMMWORD[((112-128))+rcx] + vpslldq xmm5,xmm6,8 + vpxor xmm4,xmm4,xmm2 + vmovdqu xmm3,XMMWORD[16+r11] + + vaesenc xmm9,xmm9,xmm15 + vpxor xmm7,xmm7,xmm8 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm4,xmm4,xmm5 + movbe r13,QWORD[24+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[16+r14] + vpalignr xmm0,xmm4,xmm4,8 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + mov QWORD[((96+8))+rsp],r13 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((104+8))+rsp],r12 + vaesenc xmm13,xmm13,xmm15 + vmovups xmm1,XMMWORD[((128-128))+rcx] + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vmovups xmm15,XMMWORD[((144-128))+rcx] + vaesenc xmm10,xmm10,xmm1 + vpsrldq xmm6,xmm6,8 + vaesenc xmm11,xmm11,xmm1 + vpxor xmm7,xmm7,xmm6 + vaesenc xmm12,xmm12,xmm1 + vpxor xmm4,xmm4,xmm0 + movbe r13,QWORD[8+r14] + vaesenc xmm13,xmm13,xmm1 + movbe r12,QWORD[r14] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((160-128))+rcx] + cmp ebp,11 + jb NEAR $L$enc_tail + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc 
xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((176-128))+rcx] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((192-128))+rcx] + + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((208-128))+rcx] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((224-128))+rcx] + jmp NEAR $L$enc_tail + +ALIGN 32 +$L$handle_ctr32: + vmovdqu xmm0,XMMWORD[r11] + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vmovdqu xmm3,XMMWORD[((0-32))+r9] + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm15 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm15 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpshufb xmm14,xmm14,xmm0 + vpshufb xmm1,xmm1,xmm0 + jmp NEAR $L$resume_ctr32 + +ALIGN 32 +$L$enc_tail: + vaesenc xmm9,xmm9,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm7 + vpalignr xmm8,xmm4,xmm4,8 + vaesenc xmm10,xmm10,xmm15 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + vpxor xmm2,xmm1,XMMWORD[rdi] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm0,xmm1,XMMWORD[16+rdi] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm5,xmm1,XMMWORD[32+rdi] + vaesenc xmm13,xmm13,xmm15 + vpxor xmm6,xmm1,XMMWORD[48+rdi] + vaesenc xmm14,xmm14,xmm15 + vpxor xmm7,xmm1,XMMWORD[64+rdi] + vpxor xmm3,xmm1,XMMWORD[80+rdi] + vmovdqu xmm1,XMMWORD[r8] + + vaesenclast xmm9,xmm9,xmm2 + vmovdqu xmm2,XMMWORD[32+r11] + vaesenclast xmm10,xmm10,xmm0 + vpaddb xmm0,xmm1,xmm2 + mov QWORD[((112+8))+rsp],r13 + lea rdi,[96+rdi] + vaesenclast xmm11,xmm11,xmm5 + vpaddb xmm5,xmm0,xmm2 + mov 
QWORD[((120+8))+rsp],r12 + lea rsi,[96+rsi] + vmovdqu xmm15,XMMWORD[((0-128))+rcx] + vaesenclast xmm12,xmm12,xmm6 + vpaddb xmm6,xmm5,xmm2 + vaesenclast xmm13,xmm13,xmm7 + vpaddb xmm7,xmm6,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vpaddb xmm3,xmm7,xmm2 + + add r10,0x60 + sub rdx,0x6 + jc NEAR $L$6x_done + + vmovups XMMWORD[(-96)+rsi],xmm9 + vpxor xmm9,xmm1,xmm15 + vmovups XMMWORD[(-80)+rsi],xmm10 + vmovdqa xmm10,xmm0 + vmovups XMMWORD[(-64)+rsi],xmm11 + vmovdqa xmm11,xmm5 + vmovups XMMWORD[(-48)+rsi],xmm12 + vmovdqa xmm12,xmm6 + vmovups XMMWORD[(-32)+rsi],xmm13 + vmovdqa xmm13,xmm7 + vmovups XMMWORD[(-16)+rsi],xmm14 + vmovdqa xmm14,xmm3 + vmovdqu xmm7,XMMWORD[((32+8))+rsp] + jmp NEAR $L$oop6x + +$L$6x_done: + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpxor xmm8,xmm8,xmm4 + + DB 0F3h,0C3h ;repret + + +global GFp_aesni_gcm_decrypt + +ALIGN 32 +GFp_aesni_gcm_decrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_aesni_gcm_decrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + xor r10,r10 + + + + cmp rdx,0x60 + jb NEAR $L$gcm_dec_abort + + lea rax,[rsp] + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-168))+rsp] + movaps XMMWORD[(-216)+rax],xmm6 + movaps XMMWORD[(-200)+rax],xmm7 + movaps XMMWORD[(-184)+rax],xmm8 + movaps XMMWORD[(-168)+rax],xmm9 + movaps XMMWORD[(-152)+rax],xmm10 + movaps XMMWORD[(-136)+rax],xmm11 + movaps XMMWORD[(-120)+rax],xmm12 + movaps XMMWORD[(-104)+rax],xmm13 + movaps XMMWORD[(-88)+rax],xmm14 + movaps XMMWORD[(-72)+rax],xmm15 +$L$gcm_dec_body: + vzeroupper + + vmovdqu xmm1,XMMWORD[r8] + add rsp,-128 + mov ebx,DWORD[12+r8] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+rcx] + mov r15,0xf80 + vmovdqu xmm8,XMMWORD[r9] + and rsp,-128 + vmovdqu xmm0,XMMWORD[r11] + lea rcx,[128+rcx] + lea r9,[((32+32))+r9] + mov ebp,DWORD[((240-128))+rcx] + vpshufb xmm8,xmm8,xmm0 + + and r14,r15 + and r15,rsp + sub r15,r14 
+ jc NEAR $L$dec_no_key_aliasing + cmp r15,768 + jnc NEAR $L$dec_no_key_aliasing + sub rsp,r15 +$L$dec_no_key_aliasing: + + vmovdqu xmm7,XMMWORD[80+rdi] + lea r14,[rdi] + vmovdqu xmm4,XMMWORD[64+rdi] + + + + + + + + lea r15,[((-192))+rdx*1+rdi] + + vmovdqu xmm5,XMMWORD[48+rdi] + shr rdx,4 + xor r10,r10 + vmovdqu xmm6,XMMWORD[32+rdi] + vpshufb xmm7,xmm7,xmm0 + vmovdqu xmm2,XMMWORD[16+rdi] + vpshufb xmm4,xmm4,xmm0 + vmovdqu xmm3,XMMWORD[rdi] + vpshufb xmm5,xmm5,xmm0 + vmovdqu XMMWORD[48+rsp],xmm4 + vpshufb xmm6,xmm6,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm2,xmm2,xmm0 + vmovdqu XMMWORD[80+rsp],xmm6 + vpshufb xmm3,xmm3,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + vmovdqu XMMWORD[112+rsp],xmm3 + + call _aesni_ctr32_ghash_6x + + vmovups XMMWORD[(-96)+rsi],xmm9 + vmovups XMMWORD[(-80)+rsi],xmm10 + vmovups XMMWORD[(-64)+rsi],xmm11 + vmovups XMMWORD[(-48)+rsi],xmm12 + vmovups XMMWORD[(-32)+rsi],xmm13 + vmovups XMMWORD[(-16)+rsi],xmm14 + + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[(-64)+r9],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-216))+rax] + movaps xmm7,XMMWORD[((-200))+rax] + movaps xmm8,XMMWORD[((-184))+rax] + movaps xmm9,XMMWORD[((-168))+rax] + movaps xmm10,XMMWORD[((-152))+rax] + movaps xmm11,XMMWORD[((-136))+rax] + movaps xmm12,XMMWORD[((-120))+rax] + movaps xmm13,XMMWORD[((-104))+rax] + movaps xmm14,XMMWORD[((-88))+rax] + movaps xmm15,XMMWORD[((-72))+rax] + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$gcm_dec_abort: + mov rax,r10 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_aesni_gcm_decrypt: + +ALIGN 32 +_aesni_ctr32_6x: + + vmovdqu xmm4,XMMWORD[((0-128))+rcx] + vmovdqu xmm2,XMMWORD[32+r11] + lea r13,[((-1))+rbp] + vmovups xmm15,XMMWORD[((16-128))+rcx] + lea r12,[((32-128))+rcx] + vpxor xmm9,xmm1,xmm4 + add ebx,100663296 + jc NEAR 
$L$handle_ctr32_2 + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpxor xmm10,xmm10,xmm4 + vpaddb xmm12,xmm11,xmm2 + vpxor xmm11,xmm11,xmm4 + vpaddb xmm13,xmm12,xmm2 + vpxor xmm12,xmm12,xmm4 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm13,xmm13,xmm4 + vpaddb xmm1,xmm14,xmm2 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + +ALIGN 16 +$L$oop_ctr32: + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + vmovups xmm15,XMMWORD[r12] + lea r12,[16+r12] + dec r13d + jnz NEAR $L$oop_ctr32 + + vmovdqu xmm3,XMMWORD[r12] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm4,xmm3,XMMWORD[rdi] + vaesenc xmm10,xmm10,xmm15 + vpxor xmm5,xmm3,XMMWORD[16+rdi] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm6,xmm3,XMMWORD[32+rdi] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm8,xmm3,XMMWORD[48+rdi] + vaesenc xmm13,xmm13,xmm15 + vpxor xmm2,xmm3,XMMWORD[64+rdi] + vaesenc xmm14,xmm14,xmm15 + vpxor xmm3,xmm3,XMMWORD[80+rdi] + lea rdi,[96+rdi] + + vaesenclast xmm9,xmm9,xmm4 + vaesenclast xmm10,xmm10,xmm5 + vaesenclast xmm11,xmm11,xmm6 + vaesenclast xmm12,xmm12,xmm8 + vaesenclast xmm13,xmm13,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vmovups XMMWORD[rsi],xmm9 + vmovups XMMWORD[16+rsi],xmm10 + vmovups XMMWORD[32+rsi],xmm11 + vmovups XMMWORD[48+rsi],xmm12 + vmovups XMMWORD[64+rsi],xmm13 + vmovups XMMWORD[80+rsi],xmm14 + lea rsi,[96+rsi] + + DB 0F3h,0C3h ;repret +ALIGN 32 +$L$handle_ctr32_2: + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm4 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm4 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpxor xmm12,xmm12,xmm4 + vpshufb xmm14,xmm14,xmm0 + vpxor xmm13,xmm13,xmm4 + vpshufb xmm1,xmm1,xmm0 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + + + 
+global GFp_aesni_gcm_encrypt + +ALIGN 32 +GFp_aesni_gcm_encrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_aesni_gcm_encrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + xor r10,r10 + + + + + cmp rdx,0x60*3 + jb NEAR $L$gcm_enc_abort + + lea rax,[rsp] + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-168))+rsp] + movaps XMMWORD[(-216)+rax],xmm6 + movaps XMMWORD[(-200)+rax],xmm7 + movaps XMMWORD[(-184)+rax],xmm8 + movaps XMMWORD[(-168)+rax],xmm9 + movaps XMMWORD[(-152)+rax],xmm10 + movaps XMMWORD[(-136)+rax],xmm11 + movaps XMMWORD[(-120)+rax],xmm12 + movaps XMMWORD[(-104)+rax],xmm13 + movaps XMMWORD[(-88)+rax],xmm14 + movaps XMMWORD[(-72)+rax],xmm15 +$L$gcm_enc_body: + vzeroupper + + vmovdqu xmm1,XMMWORD[r8] + add rsp,-128 + mov ebx,DWORD[12+r8] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+rcx] + mov r15,0xf80 + lea rcx,[128+rcx] + vmovdqu xmm0,XMMWORD[r11] + and rsp,-128 + mov ebp,DWORD[((240-128))+rcx] + + and r14,r15 + and r15,rsp + sub r15,r14 + jc NEAR $L$enc_no_key_aliasing + cmp r15,768 + jnc NEAR $L$enc_no_key_aliasing + sub rsp,r15 +$L$enc_no_key_aliasing: + + lea r14,[rsi] + + + + + + + + + lea r15,[((-192))+rdx*1+rsi] + + shr rdx,4 + + call _aesni_ctr32_6x + vpshufb xmm8,xmm9,xmm0 + vpshufb xmm2,xmm10,xmm0 + vmovdqu XMMWORD[112+rsp],xmm8 + vpshufb xmm4,xmm11,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + vpshufb xmm5,xmm12,xmm0 + vmovdqu XMMWORD[80+rsp],xmm4 + vpshufb xmm6,xmm13,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm7,xmm14,xmm0 + vmovdqu XMMWORD[48+rsp],xmm6 + + call _aesni_ctr32_6x + + vmovdqu xmm8,XMMWORD[r9] + lea r9,[((32+32))+r9] + sub rdx,12 + mov r10,0x60*2 + vpshufb xmm8,xmm8,xmm0 + + call _aesni_ctr32_ghash_6x + vmovdqu xmm7,XMMWORD[32+rsp] + vmovdqu xmm0,XMMWORD[r11] + vmovdqu xmm3,XMMWORD[((0-32))+r9] + vpunpckhqdq xmm1,xmm7,xmm7 + vmovdqu xmm15,XMMWORD[((32-32))+r9] + vmovups 
XMMWORD[(-96)+rsi],xmm9 + vpshufb xmm9,xmm9,xmm0 + vpxor xmm1,xmm1,xmm7 + vmovups XMMWORD[(-80)+rsi],xmm10 + vpshufb xmm10,xmm10,xmm0 + vmovups XMMWORD[(-64)+rsi],xmm11 + vpshufb xmm11,xmm11,xmm0 + vmovups XMMWORD[(-48)+rsi],xmm12 + vpshufb xmm12,xmm12,xmm0 + vmovups XMMWORD[(-32)+rsi],xmm13 + vpshufb xmm13,xmm13,xmm0 + vmovups XMMWORD[(-16)+rsi],xmm14 + vpshufb xmm14,xmm14,xmm0 + vmovdqu XMMWORD[16+rsp],xmm9 + vmovdqu xmm6,XMMWORD[48+rsp] + vmovdqu xmm0,XMMWORD[((16-32))+r9] + vpunpckhqdq xmm2,xmm6,xmm6 + vpclmulqdq xmm5,xmm7,xmm3,0x00 + vpxor xmm2,xmm2,xmm6 + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + + vmovdqu xmm9,XMMWORD[64+rsp] + vpclmulqdq xmm4,xmm6,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+r9] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm5,xmm9,xmm9 + vpclmulqdq xmm6,xmm6,xmm0,0x11 + vpxor xmm5,xmm5,xmm9 + vpxor xmm6,xmm6,xmm7 + vpclmulqdq xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+r9] + vpxor xmm2,xmm2,xmm1 + + vmovdqu xmm1,XMMWORD[80+rsp] + vpclmulqdq xmm7,xmm9,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+r9] + vpxor xmm7,xmm7,xmm4 + vpunpckhqdq xmm4,xmm1,xmm1 + vpclmulqdq xmm9,xmm9,xmm3,0x11 + vpxor xmm4,xmm4,xmm1 + vpxor xmm9,xmm9,xmm6 + vpclmulqdq xmm5,xmm5,xmm15,0x00 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm2,XMMWORD[96+rsp] + vpclmulqdq xmm6,xmm1,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+r9] + vpxor xmm6,xmm6,xmm7 + vpunpckhqdq xmm7,xmm2,xmm2 + vpclmulqdq xmm1,xmm1,xmm0,0x11 + vpxor xmm7,xmm7,xmm2 + vpxor xmm1,xmm1,xmm9 + vpclmulqdq xmm4,xmm4,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+r9] + vpxor xmm4,xmm4,xmm5 + + vpxor xmm8,xmm8,XMMWORD[112+rsp] + vpclmulqdq xmm5,xmm2,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+r9] + vpunpckhqdq xmm9,xmm8,xmm8 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm2,xmm2,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm2,xmm2,xmm1 + vpclmulqdq xmm7,xmm7,xmm15,0x00 + vpxor xmm4,xmm7,xmm4 + + vpclmulqdq xmm6,xmm8,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((0-32))+r9] + vpunpckhqdq xmm1,xmm14,xmm14 + vpclmulqdq 
xmm8,xmm8,xmm0,0x11 + vpxor xmm1,xmm1,xmm14 + vpxor xmm5,xmm6,xmm5 + vpclmulqdq xmm9,xmm9,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((32-32))+r9] + vpxor xmm7,xmm8,xmm2 + vpxor xmm6,xmm9,xmm4 + + vmovdqu xmm0,XMMWORD[((16-32))+r9] + vpxor xmm9,xmm7,xmm5 + vpclmulqdq xmm4,xmm14,xmm3,0x00 + vpxor xmm6,xmm6,xmm9 + vpunpckhqdq xmm2,xmm13,xmm13 + vpclmulqdq xmm14,xmm14,xmm3,0x11 + vpxor xmm2,xmm2,xmm13 + vpslldq xmm9,xmm6,8 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + vpxor xmm8,xmm5,xmm9 + vpsrldq xmm6,xmm6,8 + vpxor xmm7,xmm7,xmm6 + + vpclmulqdq xmm5,xmm13,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+r9] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm9,xmm12,xmm12 + vpclmulqdq xmm13,xmm13,xmm0,0x11 + vpxor xmm9,xmm9,xmm12 + vpxor xmm13,xmm13,xmm14 + vpalignr xmm14,xmm8,xmm8,8 + vpclmulqdq xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+r9] + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm4,xmm12,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+r9] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm1,xmm11,xmm11 + vpclmulqdq xmm12,xmm12,xmm3,0x11 + vpxor xmm1,xmm1,xmm11 + vpxor xmm12,xmm12,xmm13 + vxorps xmm7,xmm7,XMMWORD[16+rsp] + vpclmulqdq xmm9,xmm9,xmm15,0x00 + vpxor xmm9,xmm9,xmm2 + + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm5,xmm11,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+r9] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm2,xmm10,xmm10 + vpclmulqdq xmm11,xmm11,xmm0,0x11 + vpxor xmm2,xmm2,xmm10 + vpalignr xmm14,xmm8,xmm8,8 + vpxor xmm11,xmm11,xmm12 + vpclmulqdq xmm1,xmm1,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+r9] + vpxor xmm1,xmm1,xmm9 + + vxorps xmm14,xmm14,xmm7 + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm4,xmm10,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+r9] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm9,xmm8,xmm8 + vpclmulqdq xmm10,xmm10,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm10,xmm10,xmm11 + vpclmulqdq xmm2,xmm2,xmm15,0x00 + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm5,xmm8,xmm0,0x00 + vpclmulqdq xmm7,xmm8,xmm0,0x11 + 
vpxor xmm5,xmm5,xmm4 + vpclmulqdq xmm6,xmm9,xmm15,0x10 + vpxor xmm7,xmm7,xmm10 + vpxor xmm6,xmm6,xmm2 + + vpxor xmm4,xmm7,xmm5 + vpxor xmm6,xmm6,xmm4 + vpslldq xmm1,xmm6,8 + vmovdqu xmm3,XMMWORD[16+r11] + vpsrldq xmm6,xmm6,8 + vpxor xmm8,xmm5,xmm1 + vpxor xmm7,xmm7,xmm6 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + vpxor xmm8,xmm8,xmm2 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + vpxor xmm2,xmm2,xmm7 + vpxor xmm8,xmm8,xmm2 + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[(-64)+r9],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-216))+rax] + movaps xmm7,XMMWORD[((-200))+rax] + movaps xmm8,XMMWORD[((-184))+rax] + movaps xmm9,XMMWORD[((-168))+rax] + movaps xmm10,XMMWORD[((-152))+rax] + movaps xmm11,XMMWORD[((-136))+rax] + movaps xmm12,XMMWORD[((-120))+rax] + movaps xmm13,XMMWORD[((-104))+rax] + movaps xmm14,XMMWORD[((-88))+rax] + movaps xmm15,XMMWORD[((-72))+rax] + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$gcm_enc_abort: + mov rax,r10 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_aesni_gcm_encrypt: +ALIGN 64 +$L$bswap_mask: +DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$poly: +DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +$L$one_msb: +DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$two_lsb: +DB 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +$L$one_lsb: +DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +DB 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108 +DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 +DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 +DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +ALIGN 64 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +gcm_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + 
mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[120+r8] + + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + mov QWORD[240+r8],r15 + mov QWORD[232+r8],r14 + mov QWORD[224+r8],r13 + mov QWORD[216+r8],r12 + mov QWORD[160+r8],rbp + mov QWORD[144+r8],rbx + + lea rsi,[((-216))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_GFp_aesni_gcm_decrypt wrt ..imagebase + DD $L$SEH_end_GFp_aesni_gcm_decrypt wrt ..imagebase + DD $L$SEH_gcm_dec_info wrt ..imagebase + + DD $L$SEH_begin_GFp_aesni_gcm_encrypt wrt ..imagebase + DD $L$SEH_end_GFp_aesni_gcm_encrypt wrt ..imagebase + DD $L$SEH_GFp_gcm_enc_info wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_gcm_dec_info: +DB 9,0,0,0 + DD gcm_se_handler wrt ..imagebase + DD $L$gcm_dec_body wrt ..imagebase,$L$gcm_dec_abort wrt ..imagebase +$L$SEH_GFp_gcm_enc_info: +DB 9,0,0,0 + DD gcm_se_handler wrt ..imagebase + DD $L$gcm_enc_body wrt ..imagebase,$L$gcm_enc_abort wrt ..imagebase diff --git 
a/zeroidc/vendor/ring/pregenerated/tmp/aesni-x86-win32n.asm b/zeroidc/vendor/ring/pregenerated/tmp/aesni-x86-win32n.asm new file mode 100644 index 000000000..ad9b72953 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/aesni-x86-win32n.asm @@ -0,0 +1,682 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +;extern _GFp_ia32cap_P +global _GFp_aes_hw_encrypt +align 16 +_GFp_aes_hw_encrypt: +L$_GFp_aes_hw_encrypt_begin: + mov eax,DWORD [4+esp] + mov edx,DWORD [12+esp] + movups xmm2,[eax] + mov ecx,DWORD [240+edx] + mov eax,DWORD [8+esp] + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$000enc1_loop_1: +db 102,15,56,220,209 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$000enc1_loop_1 +db 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movups [eax],xmm2 + pxor xmm2,xmm2 + ret +align 16 +__aesni_encrypt2: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$001enc2_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$001enc2_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,221,208 +db 102,15,56,221,216 + ret +align 16 +__aesni_encrypt3: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$002enc3_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 + movups xmm1,[ecx*1+edx] + add ecx,32 +db 
102,15,56,220,208 +db 102,15,56,220,216 +db 102,15,56,220,224 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$002enc3_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,221,208 +db 102,15,56,221,216 +db 102,15,56,221,224 + ret +align 16 +__aesni_encrypt4: + movups xmm0,[edx] + movups xmm1,[16+edx] + shl ecx,4 + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx +db 15,31,64,0 + add ecx,16 +L$003enc4_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,220,233 + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 +db 102,15,56,220,224 +db 102,15,56,220,232 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$003enc4_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,220,233 +db 102,15,56,221,208 +db 102,15,56,221,216 +db 102,15,56,221,224 +db 102,15,56,221,232 + ret +align 16 +__aesni_encrypt6: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 +db 102,15,56,220,209 + pxor xmm5,xmm0 + pxor xmm6,xmm0 +db 102,15,56,220,217 + lea edx,[32+ecx*1+edx] + neg ecx +db 102,15,56,220,225 + pxor xmm7,xmm0 + movups xmm0,[ecx*1+edx] + add ecx,16 + jmp NEAR L$004_aesni_encrypt6_inner +align 16 +L$005enc6_loop: +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +L$004_aesni_encrypt6_inner: +db 102,15,56,220,233 +db 102,15,56,220,241 +db 102,15,56,220,249 +L$_aesni_encrypt6_enter: + movups xmm1,[ecx*1+edx] + add ecx,32 +db 102,15,56,220,208 +db 102,15,56,220,216 +db 102,15,56,220,224 +db 102,15,56,220,232 +db 102,15,56,220,240 +db 102,15,56,220,248 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$005enc6_loop +db 102,15,56,220,209 +db 102,15,56,220,217 +db 102,15,56,220,225 +db 102,15,56,220,233 +db 102,15,56,220,241 +db 102,15,56,220,249 +db 102,15,56,221,208 +db 102,15,56,221,216 +db 102,15,56,221,224 +db 102,15,56,221,232 +db 
102,15,56,221,240 +db 102,15,56,221,248 + ret +global _GFp_aes_hw_ctr32_encrypt_blocks +align 16 +_GFp_aes_hw_ctr32_encrypt_blocks: +L$_GFp_aes_hw_ctr32_encrypt_blocks_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ebp,esp + sub esp,88 + and esp,-16 + mov DWORD [80+esp],ebp + cmp eax,1 + je NEAR L$006ctr32_one_shortcut + movdqu xmm7,[ebx] + mov DWORD [esp],202182159 + mov DWORD [4+esp],134810123 + mov DWORD [8+esp],67438087 + mov DWORD [12+esp],66051 + mov ecx,6 + xor ebp,ebp + mov DWORD [16+esp],ecx + mov DWORD [20+esp],ecx + mov DWORD [24+esp],ecx + mov DWORD [28+esp],ebp +db 102,15,58,22,251,3 +db 102,15,58,34,253,3 + mov ecx,DWORD [240+edx] + bswap ebx + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movdqa xmm2,[esp] +db 102,15,58,34,195,0 + lea ebp,[3+ebx] +db 102,15,58,34,205,0 + inc ebx +db 102,15,58,34,195,1 + inc ebp +db 102,15,58,34,205,1 + inc ebx +db 102,15,58,34,195,2 + inc ebp +db 102,15,58,34,205,2 + movdqa [48+esp],xmm0 +db 102,15,56,0,194 + movdqu xmm6,[edx] + movdqa [64+esp],xmm1 +db 102,15,56,0,202 + pshufd xmm2,xmm0,192 + pshufd xmm3,xmm0,128 + cmp eax,6 + jb NEAR L$007ctr32_tail + pxor xmm7,xmm6 + shl ecx,4 + mov ebx,16 + movdqa [32+esp],xmm7 + mov ebp,edx + sub ebx,ecx + lea edx,[32+ecx*1+edx] + sub eax,6 + jmp NEAR L$008ctr32_loop6 +align 16 +L$008ctr32_loop6: + pshufd xmm4,xmm0,64 + movdqa xmm0,[32+esp] + pshufd xmm5,xmm1,192 + pxor xmm2,xmm0 + pshufd xmm6,xmm1,128 + pxor xmm3,xmm0 + pshufd xmm7,xmm1,64 + movups xmm1,[16+ebp] + pxor xmm4,xmm0 + pxor xmm5,xmm0 +db 102,15,56,220,209 + pxor xmm6,xmm0 + pxor xmm7,xmm0 +db 102,15,56,220,217 + movups xmm0,[32+ebp] + mov ecx,ebx +db 102,15,56,220,225 +db 102,15,56,220,233 +db 102,15,56,220,241 +db 102,15,56,220,249 + call L$_aesni_encrypt6_enter + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps xmm2,xmm1 + movups xmm1,[32+esi] + xorps xmm3,xmm0 + movups [edi],xmm2 + movdqa 
xmm0,[16+esp] + xorps xmm4,xmm1 + movdqa xmm1,[64+esp] + movups [16+edi],xmm3 + movups [32+edi],xmm4 + paddd xmm1,xmm0 + paddd xmm0,[48+esp] + movdqa xmm2,[esp] + movups xmm3,[48+esi] + movups xmm4,[64+esi] + xorps xmm5,xmm3 + movups xmm3,[80+esi] + lea esi,[96+esi] + movdqa [48+esp],xmm0 +db 102,15,56,0,194 + xorps xmm6,xmm4 + movups [48+edi],xmm5 + xorps xmm7,xmm3 + movdqa [64+esp],xmm1 +db 102,15,56,0,202 + movups [64+edi],xmm6 + pshufd xmm2,xmm0,192 + movups [80+edi],xmm7 + lea edi,[96+edi] + pshufd xmm3,xmm0,128 + sub eax,6 + jnc NEAR L$008ctr32_loop6 + add eax,6 + jz NEAR L$009ctr32_ret + movdqu xmm7,[ebp] + mov edx,ebp + pxor xmm7,[32+esp] + mov ecx,DWORD [240+ebp] +L$007ctr32_tail: + por xmm2,xmm7 + cmp eax,2 + jb NEAR L$010ctr32_one + pshufd xmm4,xmm0,64 + por xmm3,xmm7 + je NEAR L$011ctr32_two + pshufd xmm5,xmm1,192 + por xmm4,xmm7 + cmp eax,4 + jb NEAR L$012ctr32_three + pshufd xmm6,xmm1,128 + por xmm5,xmm7 + je NEAR L$013ctr32_four + por xmm6,xmm7 + call __aesni_encrypt6 + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps xmm2,xmm1 + movups xmm1,[32+esi] + xorps xmm3,xmm0 + movups xmm0,[48+esi] + xorps xmm4,xmm1 + movups xmm1,[64+esi] + xorps xmm5,xmm0 + movups [edi],xmm2 + xorps xmm6,xmm1 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + jmp NEAR L$009ctr32_ret +align 16 +L$006ctr32_one_shortcut: + movups xmm2,[ebx] + mov ecx,DWORD [240+edx] +L$010ctr32_one: + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$014enc1_loop_2: +db 102,15,56,220,209 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$014enc1_loop_2 +db 102,15,56,221,209 + movups xmm6,[esi] + xorps xmm6,xmm2 + movups [edi],xmm6 + jmp NEAR L$009ctr32_ret +align 16 +L$011ctr32_two: + call __aesni_encrypt2 + movups xmm5,[esi] + movups xmm6,[16+esi] + xorps xmm2,xmm5 + xorps xmm3,xmm6 + movups [edi],xmm2 + movups [16+edi],xmm3 + jmp NEAR L$009ctr32_ret +align 16 +L$012ctr32_three: + call __aesni_encrypt3 + 
movups xmm5,[esi] + movups xmm6,[16+esi] + xorps xmm2,xmm5 + movups xmm7,[32+esi] + xorps xmm3,xmm6 + movups [edi],xmm2 + xorps xmm4,xmm7 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + jmp NEAR L$009ctr32_ret +align 16 +L$013ctr32_four: + call __aesni_encrypt4 + movups xmm6,[esi] + movups xmm7,[16+esi] + movups xmm1,[32+esi] + xorps xmm2,xmm6 + movups xmm0,[48+esi] + xorps xmm3,xmm7 + movups [edi],xmm2 + xorps xmm4,xmm1 + movups [16+edi],xmm3 + xorps xmm5,xmm0 + movups [32+edi],xmm4 + movups [48+edi],xmm5 +L$009ctr32_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + movdqa [32+esp],xmm0 + pxor xmm5,xmm5 + movdqa [48+esp],xmm0 + pxor xmm6,xmm6 + movdqa [64+esp],xmm0 + pxor xmm7,xmm7 + mov esp,DWORD [80+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +align 16 +__aesni_set_encrypt_key: + push ebp + push ebx + test eax,eax + jz NEAR L$015bad_pointer + test edx,edx + jz NEAR L$015bad_pointer + call L$016pic +L$016pic: + pop ebx + lea ebx,[(L$key_const-L$016pic)+ebx] + lea ebp,[_GFp_ia32cap_P] + movups xmm0,[eax] + xorps xmm4,xmm4 + mov ebp,DWORD [4+ebp] + lea edx,[16+edx] + and ebp,268437504 + cmp ecx,256 + je NEAR L$01714rounds + cmp ecx,128 + jne NEAR L$018bad_keybits +align 16 +L$01910rounds: + cmp ebp,268435456 + je NEAR L$02010rounds_alt + mov ecx,9 + movups [edx-16],xmm0 +db 102,15,58,223,200,1 + call L$021key_128_cold +db 102,15,58,223,200,2 + call L$022key_128 +db 102,15,58,223,200,4 + call L$022key_128 +db 102,15,58,223,200,8 + call L$022key_128 +db 102,15,58,223,200,16 + call L$022key_128 +db 102,15,58,223,200,32 + call L$022key_128 +db 102,15,58,223,200,64 + call L$022key_128 +db 102,15,58,223,200,128 + call L$022key_128 +db 102,15,58,223,200,27 + call L$022key_128 +db 102,15,58,223,200,54 + call L$022key_128 + movups [edx],xmm0 + mov DWORD [80+edx],ecx + jmp NEAR L$023good_key +align 16 +L$022key_128: + movups [edx],xmm0 + lea edx,[16+edx] +L$021key_128_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps 
xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret +align 16 +L$02010rounds_alt: + movdqa xmm5,[ebx] + mov ecx,8 + movdqa xmm4,[32+ebx] + movdqa xmm2,xmm0 + movdqu [edx-16],xmm0 +L$024loop_key128: +db 102,15,56,0,197 +db 102,15,56,221,196 + pslld xmm4,1 + lea edx,[16+edx] + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [edx-16],xmm0 + movdqa xmm2,xmm0 + dec ecx + jnz NEAR L$024loop_key128 + movdqa xmm4,[48+ebx] +db 102,15,56,0,197 +db 102,15,56,221,196 + pslld xmm4,1 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [edx],xmm0 + movdqa xmm2,xmm0 +db 102,15,56,0,197 +db 102,15,56,221,196 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [16+edx],xmm0 + mov ecx,9 + mov DWORD [96+edx],ecx + jmp NEAR L$023good_key +align 16 +L$01714rounds: + movups xmm2,[16+eax] + lea edx,[16+edx] + cmp ebp,268435456 + je NEAR L$02514rounds_alt + mov ecx,13 + movups [edx-32],xmm0 + movups [edx-16],xmm2 +db 102,15,58,223,202,1 + call L$026key_256a_cold +db 102,15,58,223,200,1 + call L$027key_256b +db 102,15,58,223,202,2 + call L$028key_256a +db 102,15,58,223,200,2 + call L$027key_256b +db 102,15,58,223,202,4 + call L$028key_256a +db 102,15,58,223,200,4 + call L$027key_256b +db 102,15,58,223,202,8 + call L$028key_256a +db 102,15,58,223,200,8 + call L$027key_256b +db 102,15,58,223,202,16 + call L$028key_256a +db 102,15,58,223,200,16 + call L$027key_256b +db 102,15,58,223,202,32 + call L$028key_256a +db 102,15,58,223,200,32 + call L$027key_256b +db 102,15,58,223,202,64 + call L$028key_256a + movups [edx],xmm0 + mov DWORD [16+edx],ecx + xor eax,eax + jmp NEAR L$023good_key +align 16 +L$028key_256a: + movups [edx],xmm2 + lea edx,[16+edx] +L$026key_256a_cold: + shufps xmm4,xmm0,16 + 
xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret +align 16 +L$027key_256b: + movups [edx],xmm0 + lea edx,[16+edx] + shufps xmm4,xmm2,16 + xorps xmm2,xmm4 + shufps xmm4,xmm2,140 + xorps xmm2,xmm4 + shufps xmm1,xmm1,170 + xorps xmm2,xmm1 + ret +align 16 +L$02514rounds_alt: + movdqa xmm5,[ebx] + movdqa xmm4,[32+ebx] + mov ecx,7 + movdqu [edx-32],xmm0 + movdqa xmm1,xmm2 + movdqu [edx-16],xmm2 +L$029loop_key256: +db 102,15,56,0,213 +db 102,15,56,221,212 + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + pxor xmm0,xmm2 + movdqu [edx],xmm0 + dec ecx + jz NEAR L$030done_key256 + pshufd xmm2,xmm0,255 + pxor xmm3,xmm3 +db 102,15,56,221,211 + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + pxor xmm2,xmm1 + movdqu [16+edx],xmm2 + lea edx,[32+edx] + movdqa xmm1,xmm2 + jmp NEAR L$029loop_key256 +L$030done_key256: + mov ecx,13 + mov DWORD [16+edx],ecx +L$023good_key: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor eax,eax + pop ebx + pop ebp + ret +align 4 +L$015bad_pointer: + mov eax,-1 + pop ebx + pop ebp + ret +align 4 +L$018bad_keybits: + pxor xmm0,xmm0 + mov eax,-2 + pop ebx + pop ebp + ret +global _GFp_aes_hw_set_encrypt_key +align 16 +_GFp_aes_hw_set_encrypt_key: +L$_GFp_aes_hw_set_encrypt_key_begin: + mov eax,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov edx,DWORD [12+esp] + call __aesni_set_encrypt_key + ret +align 64 +L$key_const: +dd 202313229,202313229,202313229,202313229 +dd 67569157,67569157,67569157,67569157 +dd 1,1,1,1 +dd 27,27,27,27 +db 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +db 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +db 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +db 115,108,46,111,114,103,62,0 +segment .bss +common _GFp_ia32cap_P 16 diff --git 
a/zeroidc/vendor/ring/pregenerated/tmp/aesni-x86_64-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/aesni-x86_64-nasm.asm new file mode 100644 index 000000000..62d318a0b --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/aesni-x86_64-nasm.asm @@ -0,0 +1,1311 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + +EXTERN GFp_ia32cap_P +global GFp_aes_hw_encrypt + +ALIGN 16 +GFp_aes_hw_encrypt: + + movups xmm2,XMMWORD[rcx] + mov eax,DWORD[240+r8] + movups xmm0,XMMWORD[r8] + movups xmm1,XMMWORD[16+r8] + lea r8,[32+r8] + xorps xmm2,xmm0 +$L$oop_enc1_1: +DB 102,15,56,220,209 + dec eax + movups xmm1,XMMWORD[r8] + lea r8,[16+r8] + jnz NEAR $L$oop_enc1_1 +DB 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movups XMMWORD[rdx],xmm2 + pxor xmm2,xmm2 + DB 0F3h,0C3h ;repret + + + +ALIGN 16 +_aesni_encrypt2: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$enc_loop2: +DB 102,15,56,220,209 +DB 102,15,56,220,217 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 +DB 102,15,56,220,208 +DB 102,15,56,220,216 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop2 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,221,208 +DB 102,15,56,221,216 + DB 0F3h,0C3h ;repret + + + +ALIGN 16 +_aesni_encrypt3: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$enc_loop3: +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop3 + +DB 
102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,221,208 +DB 102,15,56,221,216 +DB 102,15,56,221,224 + DB 0F3h,0C3h ;repret + + + +ALIGN 16 +_aesni_encrypt4: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + xorps xmm5,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax +DB 0x0f,0x1f,0x00 + add rax,16 + +$L$enc_loop4: +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop4 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,221,208 +DB 102,15,56,221,216 +DB 102,15,56,221,224 +DB 102,15,56,221,232 + DB 0F3h,0C3h ;repret + + + +ALIGN 16 +_aesni_encrypt6: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 +DB 102,15,56,220,209 + lea rcx,[32+rax*1+rcx] + neg rax +DB 102,15,56,220,217 + pxor xmm5,xmm0 + pxor xmm6,xmm0 +DB 102,15,56,220,225 + pxor xmm7,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + jmp NEAR $L$enc_loop6_enter +ALIGN 16 +$L$enc_loop6: +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +$L$enc_loop6_enter: +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 +DB 102,15,56,220,240 +DB 102,15,56,220,248 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop6 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,15,56,221,208 +DB 102,15,56,221,216 +DB 102,15,56,221,224 +DB 102,15,56,221,232 +DB 102,15,56,221,240 +DB 
102,15,56,221,248 + DB 0F3h,0C3h ;repret + + + +ALIGN 16 +_aesni_encrypt8: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + lea rcx,[32+rax*1+rcx] + neg rax +DB 102,15,56,220,209 + pxor xmm7,xmm0 + pxor xmm8,xmm0 +DB 102,15,56,220,217 + pxor xmm9,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + jmp NEAR $L$enc_loop8_inner +ALIGN 16 +$L$enc_loop8: +DB 102,15,56,220,209 +DB 102,15,56,220,217 +$L$enc_loop8_inner: +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 +DB 102,68,15,56,220,201 +$L$enc_loop8_enter: + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 +DB 102,15,56,220,240 +DB 102,15,56,220,248 +DB 102,68,15,56,220,192 +DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop8 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 +DB 102,68,15,56,220,201 +DB 102,15,56,221,208 +DB 102,15,56,221,216 +DB 102,15,56,221,224 +DB 102,15,56,221,232 +DB 102,15,56,221,240 +DB 102,15,56,221,248 +DB 102,68,15,56,221,192 +DB 102,68,15,56,221,200 + DB 0F3h,0C3h ;repret + + +global GFp_aes_hw_ctr32_encrypt_blocks + +ALIGN 16 +GFp_aes_hw_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_aes_hw_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + cmp rdx,1 + jne NEAR $L$ctr32_bulk + + + + movups xmm2,XMMWORD[r8] + movups xmm3,XMMWORD[rdi] + mov edx,DWORD[240+rcx] + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[16+rcx] + lea rcx,[32+rcx] + xorps xmm2,xmm0 +$L$oop_enc1_2: +DB 102,15,56,220,209 + dec edx + movups xmm1,XMMWORD[rcx] + lea rcx,[16+rcx] + jnz NEAR 
$L$oop_enc1_2 +DB 102,15,56,221,209 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + xorps xmm2,xmm3 + pxor xmm3,xmm3 + movups XMMWORD[rsi],xmm2 + xorps xmm2,xmm2 + jmp NEAR $L$ctr32_epilogue + +ALIGN 16 +$L$ctr32_bulk: + lea r11,[rsp] + + push rbp + + sub rsp,288 + and rsp,-16 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 +$L$ctr32_body: + + + + + movdqu xmm2,XMMWORD[r8] + movdqu xmm0,XMMWORD[rcx] + mov r8d,DWORD[12+r8] + pxor xmm2,xmm0 + mov ebp,DWORD[12+rcx] + movdqa XMMWORD[rsp],xmm2 + bswap r8d + movdqa xmm3,xmm2 + movdqa xmm4,xmm2 + movdqa xmm5,xmm2 + movdqa XMMWORD[64+rsp],xmm2 + movdqa XMMWORD[80+rsp],xmm2 + movdqa XMMWORD[96+rsp],xmm2 + mov r10,rdx + movdqa XMMWORD[112+rsp],xmm2 + + lea rax,[1+r8] + lea rdx,[2+r8] + bswap eax + bswap edx + xor eax,ebp + xor edx,ebp +DB 102,15,58,34,216,3 + lea rax,[3+r8] + movdqa XMMWORD[16+rsp],xmm3 +DB 102,15,58,34,226,3 + bswap eax + mov rdx,r10 + lea r10,[4+r8] + movdqa XMMWORD[32+rsp],xmm4 + xor eax,ebp + bswap r10d +DB 102,15,58,34,232,3 + xor r10d,ebp + movdqa XMMWORD[48+rsp],xmm5 + lea r9,[5+r8] + mov DWORD[((64+12))+rsp],r10d + bswap r9d + lea r10,[6+r8] + mov eax,DWORD[240+rcx] + xor r9d,ebp + bswap r10d + mov DWORD[((80+12))+rsp],r9d + xor r10d,ebp + lea r9,[7+r8] + mov DWORD[((96+12))+rsp],r10d + bswap r9d + lea r10,[GFp_ia32cap_P] + mov r10d,DWORD[4+r10] + xor r9d,ebp + and r10d,71303168 + mov DWORD[((112+12))+rsp],r9d + + movups xmm1,XMMWORD[16+rcx] + + movdqa xmm6,XMMWORD[64+rsp] + movdqa xmm7,XMMWORD[80+rsp] + + cmp rdx,8 + jb NEAR $L$ctr32_tail + + sub rdx,6 + cmp r10d,4194304 + je NEAR $L$ctr32_6x + + lea rcx,[128+rcx] + sub rdx,2 + jmp NEAR $L$ctr32_loop8 + +ALIGN 16 +$L$ctr32_6x: + shl eax,4 + mov r10d,48 + bswap ebp + lea 
rcx,[32+rax*1+rcx] + sub r10,rax + jmp NEAR $L$ctr32_loop6 + +ALIGN 16 +$L$ctr32_loop6: + add r8d,6 + movups xmm0,XMMWORD[((-48))+r10*1+rcx] +DB 102,15,56,220,209 + mov eax,r8d + xor eax,ebp +DB 102,15,56,220,217 +DB 0x0f,0x38,0xf1,0x44,0x24,12 + lea eax,[1+r8] +DB 102,15,56,220,225 + xor eax,ebp +DB 0x0f,0x38,0xf1,0x44,0x24,28 +DB 102,15,56,220,233 + lea eax,[2+r8] + xor eax,ebp +DB 102,15,56,220,241 +DB 0x0f,0x38,0xf1,0x44,0x24,44 + lea eax,[3+r8] +DB 102,15,56,220,249 + movups xmm1,XMMWORD[((-32))+r10*1+rcx] + xor eax,ebp + +DB 102,15,56,220,208 +DB 0x0f,0x38,0xf1,0x44,0x24,60 + lea eax,[4+r8] +DB 102,15,56,220,216 + xor eax,ebp +DB 0x0f,0x38,0xf1,0x44,0x24,76 +DB 102,15,56,220,224 + lea eax,[5+r8] + xor eax,ebp +DB 102,15,56,220,232 +DB 0x0f,0x38,0xf1,0x44,0x24,92 + mov rax,r10 +DB 102,15,56,220,240 +DB 102,15,56,220,248 + movups xmm0,XMMWORD[((-16))+r10*1+rcx] + + call $L$enc_loop6 + + movdqu xmm8,XMMWORD[rdi] + movdqu xmm9,XMMWORD[16+rdi] + movdqu xmm10,XMMWORD[32+rdi] + movdqu xmm11,XMMWORD[48+rdi] + movdqu xmm12,XMMWORD[64+rdi] + movdqu xmm13,XMMWORD[80+rdi] + lea rdi,[96+rdi] + movups xmm1,XMMWORD[((-64))+r10*1+rcx] + pxor xmm8,xmm2 + movaps xmm2,XMMWORD[rsp] + pxor xmm9,xmm3 + movaps xmm3,XMMWORD[16+rsp] + pxor xmm10,xmm4 + movaps xmm4,XMMWORD[32+rsp] + pxor xmm11,xmm5 + movaps xmm5,XMMWORD[48+rsp] + pxor xmm12,xmm6 + movaps xmm6,XMMWORD[64+rsp] + pxor xmm13,xmm7 + movaps xmm7,XMMWORD[80+rsp] + movdqu XMMWORD[rsi],xmm8 + movdqu XMMWORD[16+rsi],xmm9 + movdqu XMMWORD[32+rsi],xmm10 + movdqu XMMWORD[48+rsi],xmm11 + movdqu XMMWORD[64+rsi],xmm12 + movdqu XMMWORD[80+rsi],xmm13 + lea rsi,[96+rsi] + + sub rdx,6 + jnc NEAR $L$ctr32_loop6 + + add rdx,6 + jz NEAR $L$ctr32_done + + lea eax,[((-48))+r10] + lea rcx,[((-80))+r10*1+rcx] + neg eax + shr eax,4 + jmp NEAR $L$ctr32_tail + +ALIGN 32 +$L$ctr32_loop8: + add r8d,8 + movdqa xmm8,XMMWORD[96+rsp] +DB 102,15,56,220,209 + mov r9d,r8d + movdqa xmm9,XMMWORD[112+rsp] +DB 102,15,56,220,217 + bswap r9d + movups 
xmm0,XMMWORD[((32-128))+rcx] +DB 102,15,56,220,225 + xor r9d,ebp + nop +DB 102,15,56,220,233 + mov DWORD[((0+12))+rsp],r9d + lea r9,[1+r8] +DB 102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 +DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((48-128))+rcx] + bswap r9d +DB 102,15,56,220,208 +DB 102,15,56,220,216 + xor r9d,ebp +DB 0x66,0x90 +DB 102,15,56,220,224 +DB 102,15,56,220,232 + mov DWORD[((16+12))+rsp],r9d + lea r9,[2+r8] +DB 102,15,56,220,240 +DB 102,15,56,220,248 +DB 102,68,15,56,220,192 +DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((64-128))+rcx] + bswap r9d +DB 102,15,56,220,209 +DB 102,15,56,220,217 + xor r9d,ebp +DB 0x66,0x90 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + mov DWORD[((32+12))+rsp],r9d + lea r9,[3+r8] +DB 102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 +DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((80-128))+rcx] + bswap r9d +DB 102,15,56,220,208 +DB 102,15,56,220,216 + xor r9d,ebp +DB 0x66,0x90 +DB 102,15,56,220,224 +DB 102,15,56,220,232 + mov DWORD[((48+12))+rsp],r9d + lea r9,[4+r8] +DB 102,15,56,220,240 +DB 102,15,56,220,248 +DB 102,68,15,56,220,192 +DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((96-128))+rcx] + bswap r9d +DB 102,15,56,220,209 +DB 102,15,56,220,217 + xor r9d,ebp +DB 0x66,0x90 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + mov DWORD[((64+12))+rsp],r9d + lea r9,[5+r8] +DB 102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 +DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((112-128))+rcx] + bswap r9d +DB 102,15,56,220,208 +DB 102,15,56,220,216 + xor r9d,ebp +DB 0x66,0x90 +DB 102,15,56,220,224 +DB 102,15,56,220,232 + mov DWORD[((80+12))+rsp],r9d + lea r9,[6+r8] +DB 102,15,56,220,240 +DB 102,15,56,220,248 +DB 102,68,15,56,220,192 +DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((128-128))+rcx] + bswap r9d +DB 102,15,56,220,209 +DB 102,15,56,220,217 + xor r9d,ebp +DB 0x66,0x90 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + mov DWORD[((96+12))+rsp],r9d + lea r9,[7+r8] +DB 
102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 +DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((144-128))+rcx] + bswap r9d +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 + xor r9d,ebp + movdqu xmm10,XMMWORD[rdi] +DB 102,15,56,220,232 + mov DWORD[((112+12))+rsp],r9d + cmp eax,11 +DB 102,15,56,220,240 +DB 102,15,56,220,248 +DB 102,68,15,56,220,192 +DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((160-128))+rcx] + + jb NEAR $L$ctr32_enc_done + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 +DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((176-128))+rcx] + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 +DB 102,15,56,220,240 +DB 102,15,56,220,248 +DB 102,68,15,56,220,192 +DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((192-128))+rcx] + + + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 +DB 102,68,15,56,220,201 + movups xmm1,XMMWORD[((208-128))+rcx] + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 +DB 102,15,56,220,240 +DB 102,15,56,220,248 +DB 102,68,15,56,220,192 +DB 102,68,15,56,220,200 + movups xmm0,XMMWORD[((224-128))+rcx] + jmp NEAR $L$ctr32_enc_done + +ALIGN 16 +$L$ctr32_enc_done: + movdqu xmm11,XMMWORD[16+rdi] + pxor xmm10,xmm0 + movdqu xmm12,XMMWORD[32+rdi] + pxor xmm11,xmm0 + movdqu xmm13,XMMWORD[48+rdi] + pxor xmm12,xmm0 + movdqu xmm14,XMMWORD[64+rdi] + pxor xmm13,xmm0 + movdqu xmm15,XMMWORD[80+rdi] + pxor xmm14,xmm0 + pxor xmm15,xmm0 +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 +DB 102,68,15,56,220,201 + movdqu xmm1,XMMWORD[96+rdi] + lea rdi,[128+rdi] + +DB 102,65,15,56,221,210 + pxor xmm1,xmm0 + movdqu 
xmm10,XMMWORD[((112-128))+rdi] +DB 102,65,15,56,221,219 + pxor xmm10,xmm0 + movdqa xmm11,XMMWORD[rsp] +DB 102,65,15,56,221,228 +DB 102,65,15,56,221,237 + movdqa xmm12,XMMWORD[16+rsp] + movdqa xmm13,XMMWORD[32+rsp] +DB 102,65,15,56,221,246 +DB 102,65,15,56,221,255 + movdqa xmm14,XMMWORD[48+rsp] + movdqa xmm15,XMMWORD[64+rsp] +DB 102,68,15,56,221,193 + movdqa xmm0,XMMWORD[80+rsp] + movups xmm1,XMMWORD[((16-128))+rcx] +DB 102,69,15,56,221,202 + + movups XMMWORD[rsi],xmm2 + movdqa xmm2,xmm11 + movups XMMWORD[16+rsi],xmm3 + movdqa xmm3,xmm12 + movups XMMWORD[32+rsi],xmm4 + movdqa xmm4,xmm13 + movups XMMWORD[48+rsi],xmm5 + movdqa xmm5,xmm14 + movups XMMWORD[64+rsi],xmm6 + movdqa xmm6,xmm15 + movups XMMWORD[80+rsi],xmm7 + movdqa xmm7,xmm0 + movups XMMWORD[96+rsi],xmm8 + movups XMMWORD[112+rsi],xmm9 + lea rsi,[128+rsi] + + sub rdx,8 + jnc NEAR $L$ctr32_loop8 + + add rdx,8 + jz NEAR $L$ctr32_done + lea rcx,[((-128))+rcx] + +$L$ctr32_tail: + + + lea rcx,[16+rcx] + cmp rdx,4 + jb NEAR $L$ctr32_loop3 + je NEAR $L$ctr32_loop4 + + + shl eax,4 + movdqa xmm8,XMMWORD[96+rsp] + pxor xmm9,xmm9 + + movups xmm0,XMMWORD[16+rcx] +DB 102,15,56,220,209 +DB 102,15,56,220,217 + lea rcx,[((32-16))+rax*1+rcx] + neg rax +DB 102,15,56,220,225 + add rax,16 + movups xmm10,XMMWORD[rdi] +DB 102,15,56,220,233 +DB 102,15,56,220,241 + movups xmm11,XMMWORD[16+rdi] + movups xmm12,XMMWORD[32+rdi] +DB 102,15,56,220,249 +DB 102,68,15,56,220,193 + + call $L$enc_loop8_enter + + movdqu xmm13,XMMWORD[48+rdi] + pxor xmm2,xmm10 + movdqu xmm10,XMMWORD[64+rdi] + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + pxor xmm4,xmm12 + movdqu XMMWORD[16+rsi],xmm3 + pxor xmm5,xmm13 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm6,xmm10 + movdqu XMMWORD[48+rsi],xmm5 + movdqu XMMWORD[64+rsi],xmm6 + cmp rdx,6 + jb NEAR $L$ctr32_done + + movups xmm11,XMMWORD[80+rdi] + xorps xmm7,xmm11 + movups XMMWORD[80+rsi],xmm7 + je NEAR $L$ctr32_done + + movups xmm12,XMMWORD[96+rdi] + xorps xmm8,xmm12 + movups XMMWORD[96+rsi],xmm8 + jmp NEAR 
$L$ctr32_done + +ALIGN 32 +$L$ctr32_loop4: +DB 102,15,56,220,209 + lea rcx,[16+rcx] + dec eax +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + movups xmm1,XMMWORD[rcx] + jnz NEAR $L$ctr32_loop4 +DB 102,15,56,221,209 +DB 102,15,56,221,217 + movups xmm10,XMMWORD[rdi] + movups xmm11,XMMWORD[16+rdi] +DB 102,15,56,221,225 +DB 102,15,56,221,233 + movups xmm12,XMMWORD[32+rdi] + movups xmm13,XMMWORD[48+rdi] + + xorps xmm2,xmm10 + movups XMMWORD[rsi],xmm2 + xorps xmm3,xmm11 + movups XMMWORD[16+rsi],xmm3 + pxor xmm4,xmm12 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm5,xmm13 + movdqu XMMWORD[48+rsi],xmm5 + jmp NEAR $L$ctr32_done + +ALIGN 32 +$L$ctr32_loop3: +DB 102,15,56,220,209 + lea rcx,[16+rcx] + dec eax +DB 102,15,56,220,217 +DB 102,15,56,220,225 + movups xmm1,XMMWORD[rcx] + jnz NEAR $L$ctr32_loop3 +DB 102,15,56,221,209 +DB 102,15,56,221,217 +DB 102,15,56,221,225 + + movups xmm10,XMMWORD[rdi] + xorps xmm2,xmm10 + movups XMMWORD[rsi],xmm2 + cmp rdx,2 + jb NEAR $L$ctr32_done + + movups xmm11,XMMWORD[16+rdi] + xorps xmm3,xmm11 + movups XMMWORD[16+rsi],xmm3 + je NEAR $L$ctr32_done + + movups xmm12,XMMWORD[32+rdi] + xorps xmm4,xmm12 + movups XMMWORD[32+rsi],xmm4 + +$L$ctr32_done: + xorps xmm0,xmm0 + xor ebp,ebp + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps 
XMMWORD[(-24)+r11],xmm0 + movaps XMMWORD[rsp],xmm0 + movaps XMMWORD[16+rsp],xmm0 + movaps XMMWORD[32+rsp],xmm0 + movaps XMMWORD[48+rsp],xmm0 + movaps XMMWORD[64+rsp],xmm0 + movaps XMMWORD[80+rsp],xmm0 + movaps XMMWORD[96+rsp],xmm0 + movaps XMMWORD[112+rsp],xmm0 + mov rbp,QWORD[((-8))+r11] + + lea rsp,[r11] + +$L$ctr32_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_aes_hw_ctr32_encrypt_blocks: +global GFp_aes_hw_set_encrypt_key + +ALIGN 16 +GFp_aes_hw_set_encrypt_key: +__aesni_set_encrypt_key: + +DB 0x48,0x83,0xEC,0x08 + + mov rax,-1 + test rcx,rcx + jz NEAR $L$enc_key_ret + test r8,r8 + jz NEAR $L$enc_key_ret + + movups xmm0,XMMWORD[rcx] + xorps xmm4,xmm4 + lea r10,[GFp_ia32cap_P] + mov r10d,DWORD[4+r10] + and r10d,268437504 + lea rax,[16+r8] + cmp edx,256 + je NEAR $L$14rounds + + cmp edx,128 + jne NEAR $L$bad_keybits + +$L$10rounds: + mov edx,9 + cmp r10d,268435456 + je NEAR $L$10rounds_alt + + movups XMMWORD[r8],xmm0 +DB 102,15,58,223,200,1 + call $L$key_expansion_128_cold +DB 102,15,58,223,200,2 + call $L$key_expansion_128 +DB 102,15,58,223,200,4 + call $L$key_expansion_128 +DB 102,15,58,223,200,8 + call $L$key_expansion_128 +DB 102,15,58,223,200,16 + call $L$key_expansion_128 +DB 102,15,58,223,200,32 + call $L$key_expansion_128 +DB 102,15,58,223,200,64 + call $L$key_expansion_128 +DB 102,15,58,223,200,128 + call $L$key_expansion_128 +DB 102,15,58,223,200,27 + call $L$key_expansion_128 +DB 102,15,58,223,200,54 + call $L$key_expansion_128 + movups XMMWORD[rax],xmm0 + mov DWORD[80+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret + +ALIGN 16 +$L$10rounds_alt: + movdqa xmm5,XMMWORD[$L$key_rotate] + mov r10d,8 + movdqa xmm4,XMMWORD[$L$key_rcon1] + movdqa xmm2,xmm0 + movdqu XMMWORD[r8],xmm0 + jmp NEAR $L$oop_key128 + +ALIGN 16 +$L$oop_key128: +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + lea rax,[16+rax] + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor 
xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[(-16)+rax],xmm0 + movdqa xmm2,xmm0 + + dec r10d + jnz NEAR $L$oop_key128 + + movdqa xmm4,XMMWORD[$L$key_rcon1b] + +DB 102,15,56,0,197 +DB 102,15,56,221,196 + pslld xmm4,1 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[rax],xmm0 + + movdqa xmm2,xmm0 +DB 102,15,56,0,197 +DB 102,15,56,221,196 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[16+rax],xmm0 + + mov DWORD[96+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret + + + +ALIGN 16 +$L$14rounds: + movups xmm2,XMMWORD[16+rcx] + mov edx,13 + lea rax,[16+rax] + cmp r10d,268435456 + je NEAR $L$14rounds_alt + + movups XMMWORD[r8],xmm0 + movups XMMWORD[16+r8],xmm2 +DB 102,15,58,223,202,1 + call $L$key_expansion_256a_cold +DB 102,15,58,223,200,1 + call $L$key_expansion_256b +DB 102,15,58,223,202,2 + call $L$key_expansion_256a +DB 102,15,58,223,200,2 + call $L$key_expansion_256b +DB 102,15,58,223,202,4 + call $L$key_expansion_256a +DB 102,15,58,223,200,4 + call $L$key_expansion_256b +DB 102,15,58,223,202,8 + call $L$key_expansion_256a +DB 102,15,58,223,200,8 + call $L$key_expansion_256b +DB 102,15,58,223,202,16 + call $L$key_expansion_256a +DB 102,15,58,223,200,16 + call $L$key_expansion_256b +DB 102,15,58,223,202,32 + call $L$key_expansion_256a +DB 102,15,58,223,200,32 + call $L$key_expansion_256b +DB 102,15,58,223,202,64 + call $L$key_expansion_256a + movups XMMWORD[rax],xmm0 + mov DWORD[16+rax],edx + xor rax,rax + jmp NEAR $L$enc_key_ret + +ALIGN 16 +$L$14rounds_alt: + movdqa xmm5,XMMWORD[$L$key_rotate] + movdqa xmm4,XMMWORD[$L$key_rcon1] + mov r10d,7 + movdqu XMMWORD[r8],xmm0 + movdqa xmm1,xmm2 + movdqu XMMWORD[16+r8],xmm2 + jmp NEAR $L$oop_key256 + +ALIGN 16 +$L$oop_key256: +DB 102,15,56,0,213 +DB 102,15,56,221,212 + + movdqa 
xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + + pxor xmm0,xmm2 + movdqu XMMWORD[rax],xmm0 + + dec r10d + jz NEAR $L$done_key256 + + pshufd xmm2,xmm0,0xff + pxor xmm3,xmm3 +DB 102,15,56,221,211 + + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + + pxor xmm2,xmm1 + movdqu XMMWORD[16+rax],xmm2 + lea rax,[32+rax] + movdqa xmm1,xmm2 + + jmp NEAR $L$oop_key256 + +$L$done_key256: + mov DWORD[16+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret + +ALIGN 16 +$L$bad_keybits: + mov rax,-2 +$L$enc_key_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + add rsp,8 + + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_set_encrypt_key: + +ALIGN 16 +$L$key_expansion_128: + movups XMMWORD[rax],xmm0 + lea rax,[16+rax] +$L$key_expansion_128_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + DB 0F3h,0C3h ;repret + +ALIGN 16 +$L$key_expansion_192a: + movups XMMWORD[rax],xmm0 + lea rax,[16+rax] +$L$key_expansion_192a_cold: + movaps xmm5,xmm2 +$L$key_expansion_192b_warm: + shufps xmm4,xmm0,16 + movdqa xmm3,xmm2 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + pslldq xmm3,4 + xorps xmm0,xmm4 + pshufd xmm1,xmm1,85 + pxor xmm2,xmm3 + pxor xmm0,xmm1 + pshufd xmm3,xmm0,255 + pxor xmm2,xmm3 + DB 0F3h,0C3h ;repret + +ALIGN 16 +$L$key_expansion_192b: + movaps xmm3,xmm0 + shufps xmm5,xmm0,68 + movups XMMWORD[rax],xmm5 + shufps xmm3,xmm2,78 + movups XMMWORD[16+rax],xmm3 + lea rax,[32+rax] + jmp NEAR $L$key_expansion_192b_warm + +ALIGN 16 +$L$key_expansion_256a: + movups XMMWORD[rax],xmm2 + lea rax,[16+rax] +$L$key_expansion_256a_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + DB 0F3h,0C3h ;repret + +ALIGN 16 +$L$key_expansion_256b: + movups 
XMMWORD[rax],xmm0 + lea rax,[16+rax] + + shufps xmm4,xmm2,16 + xorps xmm2,xmm4 + shufps xmm4,xmm2,140 + xorps xmm2,xmm4 + shufps xmm1,xmm1,170 + xorps xmm2,xmm1 + DB 0F3h,0C3h ;repret + + +ALIGN 64 +$L$bswap_mask: +DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$increment32: + DD 6,6,6,0 +$L$increment64: + DD 1,0,0,0 +$L$increment1: +DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$key_rotate: + DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +$L$key_rotate192: + DD 0x04070605,0x04070605,0x04070605,0x04070605 +$L$key_rcon1: + DD 1,1,1,1 +$L$key_rcon1b: + DD 0x1b,0x1b,0x1b,0x1b + +DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +DB 115,108,46,111,114,103,62,0 +ALIGN 64 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +ctr_xts_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[208+r8] + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp + + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + 
pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_GFp_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_GFp_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_GFp_ctr32 wrt ..imagebase + DD GFp_aes_hw_set_encrypt_key wrt ..imagebase + DD $L$SEH_end_GFp_set_encrypt_key wrt ..imagebase + DD $L$SEH_info_GFp_key wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_GFp_ctr32: +DB 9,0,0,0 + DD ctr_xts_se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase +$L$SEH_info_GFp_key: +DB 0x01,0x04,0x01,0x00 +DB 0x04,0x02,0x00,0x00 diff --git a/zeroidc/vendor/ring/pregenerated/tmp/chacha-x86-win32n.asm b/zeroidc/vendor/ring/pregenerated/tmp/chacha-x86-win32n.asm new file mode 100644 index 000000000..09a1f090e --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/chacha-x86-win32n.asm @@ -0,0 +1,973 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _GFp_ChaCha20_ctr32 +align 16 +_GFp_ChaCha20_ctr32: +L$_GFp_ChaCha20_ctr32_begin: + push ebp + push ebx + push esi + push edi + xor eax,eax + cmp eax,DWORD [28+esp] + je NEAR L$000no_data + call L$pic_point +L$pic_point: + pop eax + lea ebp,[_GFp_ia32cap_P] + test DWORD [ebp],16777216 + jz NEAR L$001x86 + test DWORD [4+ebp],512 + jz NEAR L$001x86 + jmp NEAR L$ssse3_shortcut +L$001x86: + mov esi,DWORD [32+esp] + mov edi,DWORD [36+esp] + sub esp,132 + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edx,DWORD [12+esi] + mov DWORD [80+esp],eax + mov DWORD [84+esp],ebx + mov DWORD [88+esp],ecx + mov DWORD [92+esp],edx + mov eax,DWORD [16+esi] + mov ebx,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov edx,DWORD [28+esi] + mov DWORD [96+esp],eax + mov DWORD [100+esp],ebx + mov DWORD [104+esp],ecx + mov DWORD [108+esp],edx + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + mov edx,DWORD [12+edi] + sub eax,1 + mov DWORD [112+esp],eax + mov DWORD [116+esp],ebx + mov DWORD [120+esp],ecx + mov DWORD [124+esp],edx + jmp NEAR L$002entry +align 16 +L$003outer_loop: + mov DWORD [156+esp],ebx + mov DWORD [152+esp],eax + mov DWORD [160+esp],ecx +L$002entry: + mov eax,1634760805 + mov DWORD [4+esp],857760878 + mov DWORD [8+esp],2036477234 + mov DWORD [12+esp],1797285236 + mov ebx,DWORD [84+esp] + mov ebp,DWORD [88+esp] + mov ecx,DWORD [104+esp] + mov esi,DWORD [108+esp] + mov edx,DWORD [116+esp] + mov edi,DWORD [120+esp] + mov DWORD [20+esp],ebx + mov DWORD [24+esp],ebp + mov DWORD [40+esp],ecx + mov DWORD [44+esp],esi + mov DWORD [52+esp],edx + mov DWORD [56+esp],edi + mov ebx,DWORD [92+esp] + mov edi,DWORD [124+esp] + mov edx,DWORD [112+esp] + mov ebp,DWORD [80+esp] + 
mov ecx,DWORD [96+esp] + mov esi,DWORD [100+esp] + add edx,1 + mov DWORD [28+esp],ebx + mov DWORD [60+esp],edi + mov DWORD [112+esp],edx + mov ebx,10 + jmp NEAR L$004loop +align 16 +L$004loop: + add eax,ebp + mov DWORD [128+esp],ebx + mov ebx,ebp + xor edx,eax + rol edx,16 + add ecx,edx + xor ebx,ecx + mov edi,DWORD [52+esp] + rol ebx,12 + mov ebp,DWORD [20+esp] + add eax,ebx + xor edx,eax + mov DWORD [esp],eax + rol edx,8 + mov eax,DWORD [4+esp] + add ecx,edx + mov DWORD [48+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + mov DWORD [32+esp],ecx + rol edi,16 + mov DWORD [16+esp],ebx + add esi,edi + mov ecx,DWORD [40+esp] + xor ebp,esi + mov edx,DWORD [56+esp] + rol ebp,12 + mov ebx,DWORD [24+esp] + add eax,ebp + xor edi,eax + mov DWORD [4+esp],eax + rol edi,8 + mov eax,DWORD [8+esp] + add esi,edi + mov DWORD [52+esp],edi + xor ebp,esi + add eax,ebx + rol ebp,7 + xor edx,eax + mov DWORD [36+esp],esi + rol edx,16 + mov DWORD [20+esp],ebp + add ecx,edx + mov esi,DWORD [44+esp] + xor ebx,ecx + mov edi,DWORD [60+esp] + rol ebx,12 + mov ebp,DWORD [28+esp] + add eax,ebx + xor edx,eax + mov DWORD [8+esp],eax + rol edx,8 + mov eax,DWORD [12+esp] + add ecx,edx + mov DWORD [56+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + rol edi,16 + mov DWORD [24+esp],ebx + add esi,edi + xor ebp,esi + rol ebp,12 + mov ebx,DWORD [20+esp] + add eax,ebp + xor edi,eax + mov DWORD [12+esp],eax + rol edi,8 + mov eax,DWORD [esp] + add esi,edi + mov edx,edi + xor ebp,esi + add eax,ebx + rol ebp,7 + xor edx,eax + rol edx,16 + mov DWORD [28+esp],ebp + add ecx,edx + xor ebx,ecx + mov edi,DWORD [48+esp] + rol ebx,12 + mov ebp,DWORD [24+esp] + add eax,ebx + xor edx,eax + mov DWORD [esp],eax + rol edx,8 + mov eax,DWORD [4+esp] + add ecx,edx + mov DWORD [60+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + mov DWORD [40+esp],ecx + rol edi,16 + mov DWORD [20+esp],ebx + add esi,edi + mov ecx,DWORD [32+esp] + xor ebp,esi + mov edx,DWORD [52+esp] + rol ebp,12 
+ mov ebx,DWORD [28+esp] + add eax,ebp + xor edi,eax + mov DWORD [4+esp],eax + rol edi,8 + mov eax,DWORD [8+esp] + add esi,edi + mov DWORD [48+esp],edi + xor ebp,esi + add eax,ebx + rol ebp,7 + xor edx,eax + mov DWORD [44+esp],esi + rol edx,16 + mov DWORD [24+esp],ebp + add ecx,edx + mov esi,DWORD [36+esp] + xor ebx,ecx + mov edi,DWORD [56+esp] + rol ebx,12 + mov ebp,DWORD [16+esp] + add eax,ebx + xor edx,eax + mov DWORD [8+esp],eax + rol edx,8 + mov eax,DWORD [12+esp] + add ecx,edx + mov DWORD [52+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + rol edi,16 + mov DWORD [28+esp],ebx + add esi,edi + xor ebp,esi + mov edx,DWORD [48+esp] + rol ebp,12 + mov ebx,DWORD [128+esp] + add eax,ebp + xor edi,eax + mov DWORD [12+esp],eax + rol edi,8 + mov eax,DWORD [esp] + add esi,edi + mov DWORD [56+esp],edi + xor ebp,esi + rol ebp,7 + dec ebx + jnz NEAR L$004loop + mov ebx,DWORD [160+esp] + add eax,1634760805 + add ebp,DWORD [80+esp] + add ecx,DWORD [96+esp] + add esi,DWORD [100+esp] + cmp ebx,64 + jb NEAR L$005tail + mov ebx,DWORD [156+esp] + add edx,DWORD [112+esp] + add edi,DWORD [120+esp] + xor eax,DWORD [ebx] + xor ebp,DWORD [16+ebx] + mov DWORD [esp],eax + mov eax,DWORD [152+esp] + xor ecx,DWORD [32+ebx] + xor esi,DWORD [36+ebx] + xor edx,DWORD [48+ebx] + xor edi,DWORD [56+ebx] + mov DWORD [16+eax],ebp + mov DWORD [32+eax],ecx + mov DWORD [36+eax],esi + mov DWORD [48+eax],edx + mov DWORD [56+eax],edi + mov ebp,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov esi,DWORD [12+esp] + mov edx,DWORD [20+esp] + mov edi,DWORD [24+esp] + add ebp,857760878 + add ecx,2036477234 + add esi,1797285236 + add edx,DWORD [84+esp] + add edi,DWORD [88+esp] + xor ebp,DWORD [4+ebx] + xor ecx,DWORD [8+ebx] + xor esi,DWORD [12+ebx] + xor edx,DWORD [20+ebx] + xor edi,DWORD [24+ebx] + mov DWORD [4+eax],ebp + mov DWORD [8+eax],ecx + mov DWORD [12+eax],esi + mov DWORD [20+eax],edx + mov DWORD [24+eax],edi + mov ebp,DWORD [28+esp] + mov ecx,DWORD [40+esp] + mov esi,DWORD [44+esp] + mov 
edx,DWORD [52+esp] + mov edi,DWORD [60+esp] + add ebp,DWORD [92+esp] + add ecx,DWORD [104+esp] + add esi,DWORD [108+esp] + add edx,DWORD [116+esp] + add edi,DWORD [124+esp] + xor ebp,DWORD [28+ebx] + xor ecx,DWORD [40+ebx] + xor esi,DWORD [44+ebx] + xor edx,DWORD [52+ebx] + xor edi,DWORD [60+ebx] + lea ebx,[64+ebx] + mov DWORD [28+eax],ebp + mov ebp,DWORD [esp] + mov DWORD [40+eax],ecx + mov ecx,DWORD [160+esp] + mov DWORD [44+eax],esi + mov DWORD [52+eax],edx + mov DWORD [60+eax],edi + mov DWORD [eax],ebp + lea eax,[64+eax] + sub ecx,64 + jnz NEAR L$003outer_loop + jmp NEAR L$006done +L$005tail: + add edx,DWORD [112+esp] + add edi,DWORD [120+esp] + mov DWORD [esp],eax + mov DWORD [16+esp],ebp + mov DWORD [32+esp],ecx + mov DWORD [36+esp],esi + mov DWORD [48+esp],edx + mov DWORD [56+esp],edi + mov ebp,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov esi,DWORD [12+esp] + mov edx,DWORD [20+esp] + mov edi,DWORD [24+esp] + add ebp,857760878 + add ecx,2036477234 + add esi,1797285236 + add edx,DWORD [84+esp] + add edi,DWORD [88+esp] + mov DWORD [4+esp],ebp + mov DWORD [8+esp],ecx + mov DWORD [12+esp],esi + mov DWORD [20+esp],edx + mov DWORD [24+esp],edi + mov ebp,DWORD [28+esp] + mov ecx,DWORD [40+esp] + mov esi,DWORD [44+esp] + mov edx,DWORD [52+esp] + mov edi,DWORD [60+esp] + add ebp,DWORD [92+esp] + add ecx,DWORD [104+esp] + add esi,DWORD [108+esp] + add edx,DWORD [116+esp] + add edi,DWORD [124+esp] + mov DWORD [28+esp],ebp + mov ebp,DWORD [156+esp] + mov DWORD [40+esp],ecx + mov ecx,DWORD [152+esp] + mov DWORD [44+esp],esi + xor esi,esi + mov DWORD [52+esp],edx + mov DWORD [60+esp],edi + xor eax,eax + xor edx,edx +L$007tail_loop: + mov al,BYTE [ebp*1+esi] + mov dl,BYTE [esi*1+esp] + lea esi,[1+esi] + xor al,dl + mov BYTE [esi*1+ecx-1],al + dec ebx + jnz NEAR L$007tail_loop +L$006done: + add esp,132 +L$000no_data: + pop edi + pop esi + pop ebx + pop ebp + ret +align 16 +__ChaCha20_ssse3: + push ebp + push ebx + push esi + push edi +L$ssse3_shortcut: + mov edi,DWORD 
[20+esp] + mov esi,DWORD [24+esp] + mov ecx,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ebp,esp + sub esp,524 + and esp,-64 + mov DWORD [512+esp],ebp + lea eax,[(L$ssse3_data-L$pic_point)+eax] + movdqu xmm3,[ebx] + cmp ecx,256 + jb NEAR L$0081x + mov DWORD [516+esp],edx + mov DWORD [520+esp],ebx + sub ecx,256 + lea ebp,[384+esp] + movdqu xmm7,[edx] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + paddd xmm0,[48+eax] + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + psubd xmm0,[64+eax] + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [64+ebp],xmm0 + movdqa [80+ebp],xmm1 + movdqa [96+ebp],xmm2 + movdqa [112+ebp],xmm3 + movdqu xmm3,[16+edx] + movdqa [ebp-64],xmm4 + movdqa [ebp-48],xmm5 + movdqa [ebp-32],xmm6 + movdqa [ebp-16],xmm7 + movdqa xmm7,[32+eax] + lea ebx,[128+esp] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [ebp],xmm0 + movdqa [16+ebp],xmm1 + movdqa [32+ebp],xmm2 + movdqa [48+ebp],xmm3 + movdqa [ebp-128],xmm4 + movdqa [ebp-112],xmm5 + movdqa [ebp-96],xmm6 + movdqa [ebp-80],xmm7 + lea esi,[128+esi] + lea edi,[128+edi] + jmp NEAR L$009outer_loop +align 16 +L$009outer_loop: + movdqa xmm1,[ebp-112] + movdqa xmm2,[ebp-96] + movdqa xmm3,[ebp-80] + movdqa xmm5,[ebp-48] + movdqa xmm6,[ebp-32] + movdqa xmm7,[ebp-16] + movdqa [ebx-112],xmm1 + movdqa [ebx-96],xmm2 + movdqa [ebx-80],xmm3 + movdqa [ebx-48],xmm5 + movdqa [ebx-32],xmm6 + movdqa [ebx-16],xmm7 + movdqa xmm2,[32+ebp] + movdqa xmm3,[48+ebp] + movdqa xmm4,[64+ebp] + movdqa xmm5,[80+ebp] + movdqa xmm6,[96+ebp] + movdqa xmm7,[112+ebp] + paddd xmm4,[64+eax] + movdqa [32+ebx],xmm2 + movdqa [48+ebx],xmm3 + movdqa [64+ebx],xmm4 + movdqa [80+ebx],xmm5 + movdqa [96+ebx],xmm6 + movdqa [112+ebx],xmm7 + movdqa [64+ebp],xmm4 + movdqa xmm0,[ebp-128] + movdqa xmm6,xmm4 + movdqa xmm3,[ebp-64] + movdqa xmm4,[ebp] + 
movdqa xmm5,[16+ebp] + mov edx,10 + nop +align 16 +L$010loop: + paddd xmm0,xmm3 + movdqa xmm2,xmm3 + pxor xmm6,xmm0 + pshufb xmm6,[eax] + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-48] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[80+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [64+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-64],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[32+ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-32] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[96+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [80+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [16+ebx],xmm5 + pshufb xmm6,[eax] + movdqa [ebx-48],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[48+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-16] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[112+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [96+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-32],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm2,[ebx-48] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa xmm6,xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + pshufb xmm6,[eax] + 
movdqa [ebx-16],xmm3 + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-32] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[64+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [112+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [32+ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-48],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-16] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[80+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [64+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [48+ebx],xmm5 + pshufb xmm6,[eax] + movdqa [ebx-32],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[16+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-64] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[96+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [80+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-16],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + movdqa xmm6,[64+ebx] + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [96+ebx],xmm7 + pxor xmm3,xmm5 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + por xmm3,xmm1 + dec edx + jnz NEAR L$010loop + movdqa [ebx-64],xmm3 + movdqa [ebx],xmm4 + movdqa [16+ebx],xmm5 + movdqa [64+ebx],xmm6 + movdqa [96+ebx],xmm7 + movdqa xmm1,[ebx-112] + 
movdqa xmm2,[ebx-96] + movdqa xmm3,[ebx-80] + paddd xmm0,[ebp-128] + paddd xmm1,[ebp-112] + paddd xmm2,[ebp-96] + paddd xmm3,[ebp-80] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx-64] + pxor xmm5,xmm1 + movdqa xmm1,[ebx-48] + pxor xmm6,xmm2 + movdqa xmm2,[ebx-32] + pxor xmm7,xmm3 + movdqa xmm3,[ebx-16] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp-64] + paddd xmm1,[ebp-48] + paddd xmm2,[ebp-32] + paddd xmm3,[ebp-16] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx] + pxor xmm5,xmm1 + movdqa xmm1,[16+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[32+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[48+ebx] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp] + paddd xmm1,[16+ebp] + paddd xmm2,[32+ebp] + paddd xmm3,[48+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[64+ebx] + pxor xmm5,xmm1 + 
movdqa xmm1,[80+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[96+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[112+ebx] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[64+ebp] + paddd xmm1,[80+ebp] + paddd xmm2,[96+ebp] + paddd xmm3,[112+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[208+esi] + pxor xmm4,xmm0 + pxor xmm5,xmm1 + pxor xmm6,xmm2 + pxor xmm7,xmm3 + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[208+edi] + sub ecx,256 + jnc NEAR L$009outer_loop + add ecx,256 + jz NEAR L$011done + mov ebx,DWORD [520+esp] + lea esi,[esi-128] + mov edx,DWORD [516+esp] + lea edi,[edi-128] + movd xmm2,DWORD [64+ebp] + movdqu xmm3,[ebx] + paddd xmm2,[96+eax] + pand xmm3,[112+eax] + por xmm3,xmm2 +L$0081x: + movdqa xmm0,[32+eax] + movdqu xmm1,[edx] + movdqu xmm2,[16+edx] + movdqa xmm6,[eax] + movdqa xmm7,[16+eax] + mov DWORD [48+esp],ebp + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + mov edx,10 + jmp NEAR L$012loop1x +align 16 +L$013outer1x: + movdqa xmm3,[80+eax] + movdqa xmm0,[esp] + movdqa xmm1,[16+esp] + movdqa xmm2,[32+esp] + paddd xmm3,[48+esp] + mov edx,10 + movdqa [48+esp],xmm3 + jmp NEAR L$012loop1x +align 16 +L$012loop1x: + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,57 + pshufd xmm3,xmm3,147 + nop + paddd 
xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,147 + pshufd xmm3,xmm3,57 + dec edx + jnz NEAR L$012loop1x + paddd xmm0,[esp] + paddd xmm1,[16+esp] + paddd xmm2,[32+esp] + paddd xmm3,[48+esp] + cmp ecx,64 + jb NEAR L$014tail + movdqu xmm4,[esi] + movdqu xmm5,[16+esi] + pxor xmm0,xmm4 + movdqu xmm4,[32+esi] + pxor xmm1,xmm5 + movdqu xmm5,[48+esi] + pxor xmm2,xmm4 + pxor xmm3,xmm5 + lea esi,[64+esi] + movdqu [edi],xmm0 + movdqu [16+edi],xmm1 + movdqu [32+edi],xmm2 + movdqu [48+edi],xmm3 + lea edi,[64+edi] + sub ecx,64 + jnz NEAR L$013outer1x + jmp NEAR L$011done +L$014tail: + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + xor eax,eax + xor edx,edx + xor ebp,ebp +L$015tail_loop: + mov al,BYTE [ebp*1+esp] + mov dl,BYTE [ebp*1+esi] + lea ebp,[1+ebp] + xor al,dl + mov BYTE [ebp*1+edi-1],al + dec ecx + jnz NEAR L$015tail_loop +L$011done: + mov esp,DWORD [512+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$ssse3_data: +db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +db 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +dd 1634760805,857760878,2036477234,1797285236 +dd 0,1,2,3 +dd 4,4,4,4 +dd 1,0,0,0 +dd 4,0,0,0 +dd 0,-1,-1,-1 +align 64 +db 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +db 114,103,62,0 +segment .bss +common _GFp_ia32cap_P 16 diff --git a/zeroidc/vendor/ring/pregenerated/tmp/chacha-x86_64-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/chacha-x86_64-nasm.asm new file mode 100644 index 000000000..1169fa553 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/chacha-x86_64-nasm.asm @@ -0,0 +1,1922 @@ 
+; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +EXTERN GFp_ia32cap_P + +ALIGN 64 +$L$zero: + DD 0,0,0,0 +$L$one: + DD 1,0,0,0 +$L$inc: + DD 0,1,2,3 +$L$four: + DD 4,4,4,4 +$L$incy: + DD 0,2,4,6,1,3,5,7 +$L$eight: + DD 8,8,8,8,8,8,8,8 +$L$rot16: +DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd +$L$rot24: +DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe +$L$sigma: +DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 +DB 0 +ALIGN 64 +$L$zeroz: + DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 +$L$fourz: + DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 +$L$incz: + DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +$L$sixteen: + DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 +DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 +DB 108,46,111,114,103,62,0 +global GFp_ChaCha20_ctr32 + +ALIGN 64 +GFp_ChaCha20_ctr32: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_ChaCha20_ctr32: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + cmp rdx,0 + je NEAR $L$no_data + mov r10,QWORD[((GFp_ia32cap_P+4))] + test r10d,512 + jnz NEAR $L$ChaCha20_ssse3 + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,64+24 + +$L$ctr32_body: + + + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm4,XMMWORD[$L$one] + + + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov rbp,rdx + jmp NEAR $L$oop_outer + +ALIGN 32 +$L$oop_outer: + mov eax,0x61707865 + mov ebx,0x3320646e + mov ecx,0x79622d32 + mov edx,0x6b206574 + mov r8d,DWORD[16+rsp] + mov r9d,DWORD[20+rsp] + mov r10d,DWORD[24+rsp] + mov 
r11d,DWORD[28+rsp] + movd r12d,xmm3 + mov r13d,DWORD[52+rsp] + mov r14d,DWORD[56+rsp] + mov r15d,DWORD[60+rsp] + + mov QWORD[((64+0))+rsp],rbp + mov ebp,10 + mov QWORD[((64+8))+rsp],rsi +DB 102,72,15,126,214 + mov QWORD[((64+16))+rsp],rdi + mov rdi,rsi + shr rdi,32 + jmp NEAR $L$oop + +ALIGN 32 +$L$oop: + add eax,r8d + xor r12d,eax + rol r12d,16 + add ebx,r9d + xor r13d,ebx + rol r13d,16 + add esi,r12d + xor r8d,esi + rol r8d,12 + add edi,r13d + xor r9d,edi + rol r9d,12 + add eax,r8d + xor r12d,eax + rol r12d,8 + add ebx,r9d + xor r13d,ebx + rol r13d,8 + add esi,r12d + xor r8d,esi + rol r8d,7 + add edi,r13d + xor r9d,edi + rol r9d,7 + mov DWORD[32+rsp],esi + mov DWORD[36+rsp],edi + mov esi,DWORD[40+rsp] + mov edi,DWORD[44+rsp] + add ecx,r10d + xor r14d,ecx + rol r14d,16 + add edx,r11d + xor r15d,edx + rol r15d,16 + add esi,r14d + xor r10d,esi + rol r10d,12 + add edi,r15d + xor r11d,edi + rol r11d,12 + add ecx,r10d + xor r14d,ecx + rol r14d,8 + add edx,r11d + xor r15d,edx + rol r15d,8 + add esi,r14d + xor r10d,esi + rol r10d,7 + add edi,r15d + xor r11d,edi + rol r11d,7 + add eax,r9d + xor r15d,eax + rol r15d,16 + add ebx,r10d + xor r12d,ebx + rol r12d,16 + add esi,r15d + xor r9d,esi + rol r9d,12 + add edi,r12d + xor r10d,edi + rol r10d,12 + add eax,r9d + xor r15d,eax + rol r15d,8 + add ebx,r10d + xor r12d,ebx + rol r12d,8 + add esi,r15d + xor r9d,esi + rol r9d,7 + add edi,r12d + xor r10d,edi + rol r10d,7 + mov DWORD[40+rsp],esi + mov DWORD[44+rsp],edi + mov esi,DWORD[32+rsp] + mov edi,DWORD[36+rsp] + add ecx,r11d + xor r13d,ecx + rol r13d,16 + add edx,r8d + xor r14d,edx + rol r14d,16 + add esi,r13d + xor r11d,esi + rol r11d,12 + add edi,r14d + xor r8d,edi + rol r8d,12 + add ecx,r11d + xor r13d,ecx + rol r13d,8 + add edx,r8d + xor r14d,edx + rol r14d,8 + add esi,r13d + xor r11d,esi + rol r11d,7 + add edi,r14d + xor r8d,edi + rol r8d,7 + dec ebp + jnz NEAR $L$oop + mov DWORD[36+rsp],edi + mov DWORD[32+rsp],esi + mov rbp,QWORD[64+rsp] + movdqa xmm1,xmm2 + mov 
rsi,QWORD[((64+8))+rsp] + paddd xmm3,xmm4 + mov rdi,QWORD[((64+16))+rsp] + + add eax,0x61707865 + add ebx,0x3320646e + add ecx,0x79622d32 + add edx,0x6b206574 + add r8d,DWORD[16+rsp] + add r9d,DWORD[20+rsp] + add r10d,DWORD[24+rsp] + add r11d,DWORD[28+rsp] + add r12d,DWORD[48+rsp] + add r13d,DWORD[52+rsp] + add r14d,DWORD[56+rsp] + add r15d,DWORD[60+rsp] + paddd xmm1,XMMWORD[32+rsp] + + cmp rbp,64 + jb NEAR $L$tail + + xor eax,DWORD[rsi] + xor ebx,DWORD[4+rsi] + xor ecx,DWORD[8+rsi] + xor edx,DWORD[12+rsi] + xor r8d,DWORD[16+rsi] + xor r9d,DWORD[20+rsi] + xor r10d,DWORD[24+rsi] + xor r11d,DWORD[28+rsi] + movdqu xmm0,XMMWORD[32+rsi] + xor r12d,DWORD[48+rsi] + xor r13d,DWORD[52+rsi] + xor r14d,DWORD[56+rsi] + xor r15d,DWORD[60+rsi] + lea rsi,[64+rsi] + pxor xmm0,xmm1 + + movdqa XMMWORD[32+rsp],xmm2 + movd DWORD[48+rsp],xmm3 + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + movdqu XMMWORD[32+rdi],xmm0 + mov DWORD[48+rdi],r12d + mov DWORD[52+rdi],r13d + mov DWORD[56+rdi],r14d + mov DWORD[60+rdi],r15d + lea rdi,[64+rdi] + + sub rbp,64 + jnz NEAR $L$oop_outer + + jmp NEAR $L$done + +ALIGN 16 +$L$tail: + mov DWORD[rsp],eax + mov DWORD[4+rsp],ebx + xor rbx,rbx + mov DWORD[8+rsp],ecx + mov DWORD[12+rsp],edx + mov DWORD[16+rsp],r8d + mov DWORD[20+rsp],r9d + mov DWORD[24+rsp],r10d + mov DWORD[28+rsp],r11d + movdqa XMMWORD[32+rsp],xmm1 + mov DWORD[48+rsp],r12d + mov DWORD[52+rsp],r13d + mov DWORD[56+rsp],r14d + mov DWORD[60+rsp],r15d + +$L$oop_tail: + movzx eax,BYTE[rbx*1+rsi] + movzx edx,BYTE[rbx*1+rsp] + lea rbx,[1+rbx] + xor eax,edx + mov BYTE[((-1))+rbx*1+rdi],al + dec rbp + jnz NEAR $L$oop_tail + +$L$done: + lea rsi,[((64+24+48))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] 
+ +$L$no_data: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_ChaCha20_ctr32: + +ALIGN 32 +ChaCha20_ssse3: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ssse3: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + +$L$ChaCha20_ssse3: + + mov r9,rsp + + cmp rdx,128 + ja NEAR $L$ChaCha20_4x + +$L$do_sse3_after_all: + sub rsp,64+40 + movaps XMMWORD[(-40)+r9],xmm6 + movaps XMMWORD[(-24)+r9],xmm7 +$L$ssse3_body: + movdqa xmm0,XMMWORD[$L$sigma] + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm6,XMMWORD[$L$rot16] + movdqa xmm7,XMMWORD[$L$rot24] + + movdqa XMMWORD[rsp],xmm0 + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov r8,10 + jmp NEAR $L$oop_ssse3 + +ALIGN 32 +$L$oop_outer_ssse3: + movdqa xmm3,XMMWORD[$L$one] + movdqa xmm0,XMMWORD[rsp] + movdqa xmm1,XMMWORD[16+rsp] + movdqa xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + mov r8,10 + movdqa XMMWORD[48+rsp],xmm3 + jmp NEAR $L$oop_ssse3 + +ALIGN 32 +$L$oop_ssse3: + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,57 + pshufd xmm3,xmm3,147 + nop + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,147 + pshufd xmm3,xmm3,57 + dec r8 + jnz NEAR $L$oop_ssse3 + paddd xmm0,XMMWORD[rsp] + paddd 
xmm1,XMMWORD[16+rsp] + paddd xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + + cmp rdx,64 + jb NEAR $L$tail_ssse3 + + movdqu xmm4,XMMWORD[rsi] + movdqu xmm5,XMMWORD[16+rsi] + pxor xmm0,xmm4 + movdqu xmm4,XMMWORD[32+rsi] + pxor xmm1,xmm5 + movdqu xmm5,XMMWORD[48+rsi] + lea rsi,[64+rsi] + pxor xmm2,xmm4 + pxor xmm3,xmm5 + + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + lea rdi,[64+rdi] + + sub rdx,64 + jnz NEAR $L$oop_outer_ssse3 + + jmp NEAR $L$done_ssse3 + +ALIGN 16 +$L$tail_ssse3: + movdqa XMMWORD[rsp],xmm0 + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + xor r8,r8 + +$L$oop_tail_ssse3: + movzx eax,BYTE[r8*1+rsi] + movzx ecx,BYTE[r8*1+rsp] + lea r8,[1+r8] + xor eax,ecx + mov BYTE[((-1))+r8*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail_ssse3 + +$L$done_ssse3: + movaps xmm6,XMMWORD[((-40))+r9] + movaps xmm7,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$ssse3_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_ssse3: + +ALIGN 32 +ChaCha20_4x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_4x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + +$L$ChaCha20_4x: + + mov r9,rsp + + mov r11,r10 + shr r10,32 + test r10,32 + jnz NEAR $L$ChaCha20_8x + cmp rdx,192 + ja NEAR $L$proceed4x + + and r11,71303168 + cmp r11,4194304 + je NEAR $L$do_sse3_after_all + +$L$proceed4x: + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$4x_body: + movdqa xmm11,XMMWORD[$L$sigma] + movdqu xmm15,XMMWORD[rcx] + movdqu xmm7,XMMWORD[16+rcx] + 
movdqu xmm3,XMMWORD[r8] + lea rcx,[256+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + pshufd xmm8,xmm11,0x00 + pshufd xmm9,xmm11,0x55 + movdqa XMMWORD[64+rsp],xmm8 + pshufd xmm10,xmm11,0xaa + movdqa XMMWORD[80+rsp],xmm9 + pshufd xmm11,xmm11,0xff + movdqa XMMWORD[96+rsp],xmm10 + movdqa XMMWORD[112+rsp],xmm11 + + pshufd xmm12,xmm15,0x00 + pshufd xmm13,xmm15,0x55 + movdqa XMMWORD[(128-256)+rcx],xmm12 + pshufd xmm14,xmm15,0xaa + movdqa XMMWORD[(144-256)+rcx],xmm13 + pshufd xmm15,xmm15,0xff + movdqa XMMWORD[(160-256)+rcx],xmm14 + movdqa XMMWORD[(176-256)+rcx],xmm15 + + pshufd xmm4,xmm7,0x00 + pshufd xmm5,xmm7,0x55 + movdqa XMMWORD[(192-256)+rcx],xmm4 + pshufd xmm6,xmm7,0xaa + movdqa XMMWORD[(208-256)+rcx],xmm5 + pshufd xmm7,xmm7,0xff + movdqa XMMWORD[(224-256)+rcx],xmm6 + movdqa XMMWORD[(240-256)+rcx],xmm7 + + pshufd xmm0,xmm3,0x00 + pshufd xmm1,xmm3,0x55 + paddd xmm0,XMMWORD[$L$inc] + pshufd xmm2,xmm3,0xaa + movdqa XMMWORD[(272-256)+rcx],xmm1 + pshufd xmm3,xmm3,0xff + movdqa XMMWORD[(288-256)+rcx],xmm2 + movdqa XMMWORD[(304-256)+rcx],xmm3 + + jmp NEAR $L$oop_enter4x + +ALIGN 32 +$L$oop_outer4x: + movdqa xmm8,XMMWORD[64+rsp] + movdqa xmm9,XMMWORD[80+rsp] + movdqa xmm10,XMMWORD[96+rsp] + movdqa xmm11,XMMWORD[112+rsp] + movdqa xmm12,XMMWORD[((128-256))+rcx] + movdqa xmm13,XMMWORD[((144-256))+rcx] + movdqa xmm14,XMMWORD[((160-256))+rcx] + movdqa xmm15,XMMWORD[((176-256))+rcx] + movdqa xmm4,XMMWORD[((192-256))+rcx] + movdqa xmm5,XMMWORD[((208-256))+rcx] + movdqa xmm6,XMMWORD[((224-256))+rcx] + movdqa xmm7,XMMWORD[((240-256))+rcx] + movdqa xmm0,XMMWORD[((256-256))+rcx] + movdqa xmm1,XMMWORD[((272-256))+rcx] + movdqa xmm2,XMMWORD[((288-256))+rcx] + movdqa xmm3,XMMWORD[((304-256))+rcx] + paddd xmm0,XMMWORD[$L$four] + +$L$oop_enter4x: + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm7 + movdqa xmm7,XMMWORD[r10] + mov eax,10 + movdqa XMMWORD[(256-256)+rcx],xmm0 + jmp NEAR $L$oop4x + +ALIGN 32 +$L$oop4x: + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor 
xmm1,xmm9 +DB 102,15,56,0,199 +DB 102,15,56,0,207 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm6,xmm12 + pslld xmm12,12 + psrld xmm6,20 + movdqa xmm7,xmm13 + pslld xmm13,12 + por xmm12,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm13,xmm7 + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 +DB 102,15,56,0,198 +DB 102,15,56,0,206 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm7,xmm12 + pslld xmm12,7 + psrld xmm7,25 + movdqa xmm6,xmm13 + pslld xmm13,7 + por xmm12,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm13,xmm6 + movdqa XMMWORD[rsp],xmm4 + movdqa XMMWORD[16+rsp],xmm5 + movdqa xmm4,XMMWORD[32+rsp] + movdqa xmm5,XMMWORD[48+rsp] + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,215 +DB 102,15,56,0,223 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm6,xmm14 + pslld xmm14,12 + psrld xmm6,20 + movdqa xmm7,xmm15 + pslld xmm15,12 + por xmm14,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm15,xmm7 + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,214 +DB 102,15,56,0,222 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm7,xmm14 + pslld xmm14,7 + psrld xmm7,25 + movdqa xmm6,xmm15 + pslld xmm15,7 + por xmm14,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm15,xmm6 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,223 +DB 102,15,56,0,199 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm6,xmm13 + pslld xmm13,12 + psrld xmm6,20 + movdqa xmm7,xmm14 + pslld xmm14,12 + por xmm13,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm14,xmm7 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,222 +DB 102,15,56,0,198 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + 
movdqa xmm7,xmm13 + pslld xmm13,7 + psrld xmm7,25 + movdqa xmm6,xmm14 + pslld xmm14,7 + por xmm13,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm14,xmm6 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm5 + movdqa xmm4,XMMWORD[rsp] + movdqa xmm5,XMMWORD[16+rsp] + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,207 +DB 102,15,56,0,215 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm6,xmm15 + pslld xmm15,12 + psrld xmm6,20 + movdqa xmm7,xmm12 + pslld xmm12,12 + por xmm15,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm12,xmm7 + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,206 +DB 102,15,56,0,214 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm7,xmm15 + pslld xmm15,7 + psrld xmm7,25 + movdqa xmm6,xmm12 + pslld xmm12,7 + por xmm15,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm12,xmm6 + dec eax + jnz NEAR $L$oop4x + + paddd xmm8,XMMWORD[64+rsp] + paddd xmm9,XMMWORD[80+rsp] + paddd xmm10,XMMWORD[96+rsp] + paddd xmm11,XMMWORD[112+rsp] + + movdqa xmm6,xmm8 + punpckldq xmm8,xmm9 + movdqa xmm7,xmm10 + punpckldq xmm10,xmm11 + punpckhdq xmm6,xmm9 + punpckhdq xmm7,xmm11 + movdqa xmm9,xmm8 + punpcklqdq xmm8,xmm10 + movdqa xmm11,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm9,xmm10 + punpckhqdq xmm11,xmm7 + paddd xmm12,XMMWORD[((128-256))+rcx] + paddd xmm13,XMMWORD[((144-256))+rcx] + paddd xmm14,XMMWORD[((160-256))+rcx] + paddd xmm15,XMMWORD[((176-256))+rcx] + + movdqa XMMWORD[rsp],xmm8 + movdqa XMMWORD[16+rsp],xmm9 + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + + movdqa xmm10,xmm12 + punpckldq xmm12,xmm13 + movdqa xmm7,xmm14 + punpckldq xmm14,xmm15 + punpckhdq xmm10,xmm13 + punpckhdq xmm7,xmm15 + movdqa xmm13,xmm12 + punpcklqdq xmm12,xmm14 + movdqa xmm15,xmm10 + punpcklqdq xmm10,xmm7 + punpckhqdq xmm13,xmm14 + punpckhqdq xmm15,xmm7 + paddd xmm4,XMMWORD[((192-256))+rcx] + 
paddd xmm5,XMMWORD[((208-256))+rcx] + paddd xmm8,XMMWORD[((224-256))+rcx] + paddd xmm9,XMMWORD[((240-256))+rcx] + + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm11 + + movdqa xmm14,xmm4 + punpckldq xmm4,xmm5 + movdqa xmm7,xmm8 + punpckldq xmm8,xmm9 + punpckhdq xmm14,xmm5 + punpckhdq xmm7,xmm9 + movdqa xmm5,xmm4 + punpcklqdq xmm4,xmm8 + movdqa xmm9,xmm14 + punpcklqdq xmm14,xmm7 + punpckhqdq xmm5,xmm8 + punpckhqdq xmm9,xmm7 + paddd xmm0,XMMWORD[((256-256))+rcx] + paddd xmm1,XMMWORD[((272-256))+rcx] + paddd xmm2,XMMWORD[((288-256))+rcx] + paddd xmm3,XMMWORD[((304-256))+rcx] + + movdqa xmm8,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm8,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm8 + punpcklqdq xmm8,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + cmp rdx,64*4 + jb NEAR $L$tail4x + + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu 
xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[48+rsp] + pxor xmm11,xmm15 + pxor xmm2,xmm9 + pxor xmm7,xmm3 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + + sub rdx,64*4 + jnz NEAR $L$oop_outer4x + + jmp NEAR $L$done4x + +$L$tail4x: + cmp rdx,192 + jae NEAR $L$192_or_more4x + cmp rdx,128 + jae NEAR $L$128_or_more4x + cmp rdx,64 + jae NEAR $L$64_or_more4x + + + xor r10,r10 + + movdqa XMMWORD[16+rsp],xmm12 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm0 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$64_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[16+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm13 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm5 + sub rdx,64 + movdqa XMMWORD[48+rsp],xmm1 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$128_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[32+rsp] + lea rsi,[128+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + 
movdqa XMMWORD[16+rsp],xmm10 + lea rdi,[128+rdi] + movdqa XMMWORD[32+rsp],xmm14 + sub rdx,128 + movdqa XMMWORD[48+rsp],xmm8 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$192_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[48+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm15 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm9 + sub rdx,192 + movdqa XMMWORD[48+rsp],xmm3 + +$L$oop_tail4x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail4x + +$L$done4x: + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$4x_epilogue: + mov 
rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_4x: + +ALIGN 32 +ChaCha20_8x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_8x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + +$L$ChaCha20_8x: + + mov r9,rsp + + sub rsp,0x280+168 + and rsp,-32 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$8x_body: + vzeroupper + + + + + + + + + + + vbroadcasti128 ymm11,XMMWORD[$L$sigma] + vbroadcasti128 ymm3,XMMWORD[rcx] + vbroadcasti128 ymm15,XMMWORD[16+rcx] + vbroadcasti128 ymm7,XMMWORD[r8] + lea rcx,[256+rsp] + lea rax,[512+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + vpshufd ymm8,ymm11,0x00 + vpshufd ymm9,ymm11,0x55 + vmovdqa YMMWORD[(128-256)+rcx],ymm8 + vpshufd ymm10,ymm11,0xaa + vmovdqa YMMWORD[(160-256)+rcx],ymm9 + vpshufd ymm11,ymm11,0xff + vmovdqa YMMWORD[(192-256)+rcx],ymm10 + vmovdqa YMMWORD[(224-256)+rcx],ymm11 + + vpshufd ymm0,ymm3,0x00 + vpshufd ymm1,ymm3,0x55 + vmovdqa YMMWORD[(256-256)+rcx],ymm0 + vpshufd ymm2,ymm3,0xaa + vmovdqa YMMWORD[(288-256)+rcx],ymm1 + vpshufd ymm3,ymm3,0xff + vmovdqa YMMWORD[(320-256)+rcx],ymm2 + vmovdqa YMMWORD[(352-256)+rcx],ymm3 + + vpshufd ymm12,ymm15,0x00 + vpshufd ymm13,ymm15,0x55 + vmovdqa YMMWORD[(384-512)+rax],ymm12 + vpshufd ymm14,ymm15,0xaa + vmovdqa YMMWORD[(416-512)+rax],ymm13 + vpshufd ymm15,ymm15,0xff + vmovdqa YMMWORD[(448-512)+rax],ymm14 + vmovdqa YMMWORD[(480-512)+rax],ymm15 + + vpshufd ymm4,ymm7,0x00 + vpshufd ymm5,ymm7,0x55 + vpaddd ymm4,ymm4,YMMWORD[$L$incy] + vpshufd ymm6,ymm7,0xaa + vmovdqa YMMWORD[(544-512)+rax],ymm5 + vpshufd ymm7,ymm7,0xff + vmovdqa YMMWORD[(576-512)+rax],ymm6 + vmovdqa 
YMMWORD[(608-512)+rax],ymm7 + + jmp NEAR $L$oop_enter8x + +ALIGN 32 +$L$oop_outer8x: + vmovdqa ymm8,YMMWORD[((128-256))+rcx] + vmovdqa ymm9,YMMWORD[((160-256))+rcx] + vmovdqa ymm10,YMMWORD[((192-256))+rcx] + vmovdqa ymm11,YMMWORD[((224-256))+rcx] + vmovdqa ymm0,YMMWORD[((256-256))+rcx] + vmovdqa ymm1,YMMWORD[((288-256))+rcx] + vmovdqa ymm2,YMMWORD[((320-256))+rcx] + vmovdqa ymm3,YMMWORD[((352-256))+rcx] + vmovdqa ymm12,YMMWORD[((384-512))+rax] + vmovdqa ymm13,YMMWORD[((416-512))+rax] + vmovdqa ymm14,YMMWORD[((448-512))+rax] + vmovdqa ymm15,YMMWORD[((480-512))+rax] + vmovdqa ymm4,YMMWORD[((512-512))+rax] + vmovdqa ymm5,YMMWORD[((544-512))+rax] + vmovdqa ymm6,YMMWORD[((576-512))+rax] + vmovdqa ymm7,YMMWORD[((608-512))+rax] + vpaddd ymm4,ymm4,YMMWORD[$L$eight] + +$L$oop_enter8x: + vmovdqa YMMWORD[64+rsp],ymm14 + vmovdqa YMMWORD[96+rsp],ymm15 + vbroadcasti128 ymm15,XMMWORD[r10] + vmovdqa YMMWORD[(512-512)+rax],ymm4 + mov eax,10 + jmp NEAR $L$oop8x + +ALIGN 32 +$L$oop8x: + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm14,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm14,ymm0 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm15,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm15,ymm1 + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm14 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm15,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm15,ymm0 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm14,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm14,ymm1 + vmovdqa YMMWORD[rsp],ymm12 + vmovdqa YMMWORD[32+rsp],ymm13 + vmovdqa ymm12,YMMWORD[64+rsp] + vmovdqa ymm13,YMMWORD[96+rsp] + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 + vpshufb 
ymm6,ymm6,ymm15 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm14,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm14,ymm2 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm15,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor ymm3,ymm15,ymm3 + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm15,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm15,ymm2 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm14,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm14,ymm3 + vpaddd ymm8,ymm8,ymm1 + vpxor ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm14,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm14,ymm1 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm15,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm15,ymm2 + vpaddd ymm8,ymm8,ymm1 + vpxor ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm15,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm15,ymm1 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm14,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm14,ymm2 + vmovdqa YMMWORD[64+rsp],ymm12 + vmovdqa YMMWORD[96+rsp],ymm13 + vmovdqa ymm12,YMMWORD[rsp] + vmovdqa ymm13,YMMWORD[32+rsp] + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm15 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm14,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor 
ymm3,ymm14,ymm3 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm15,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm15,ymm0 + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm14 + vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm15,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm15,ymm3 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm14,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm14,ymm0 + dec eax + jnz NEAR $L$oop8x + + lea rax,[512+rsp] + vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] + vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] + vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] + vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] + + vpunpckldq ymm14,ymm8,ymm9 + vpunpckldq ymm15,ymm10,ymm11 + vpunpckhdq ymm8,ymm8,ymm9 + vpunpckhdq ymm10,ymm10,ymm11 + vpunpcklqdq ymm9,ymm14,ymm15 + vpunpckhqdq ymm14,ymm14,ymm15 + vpunpcklqdq ymm11,ymm8,ymm10 + vpunpckhqdq ymm8,ymm8,ymm10 + vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] + vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] + vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] + vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] + + vpunpckldq ymm10,ymm0,ymm1 + vpunpckldq ymm15,ymm2,ymm3 + vpunpckhdq ymm0,ymm0,ymm1 + vpunpckhdq ymm2,ymm2,ymm3 + vpunpcklqdq ymm1,ymm10,ymm15 + vpunpckhqdq ymm10,ymm10,ymm15 + vpunpcklqdq ymm3,ymm0,ymm2 + vpunpckhqdq ymm0,ymm0,ymm2 + vperm2i128 ymm15,ymm9,ymm1,0x20 + vperm2i128 ymm1,ymm9,ymm1,0x31 + vperm2i128 ymm9,ymm14,ymm10,0x20 + vperm2i128 ymm10,ymm14,ymm10,0x31 + vperm2i128 ymm14,ymm11,ymm3,0x20 + vperm2i128 ymm3,ymm11,ymm3,0x31 + vperm2i128 ymm11,ymm8,ymm0,0x20 + vperm2i128 ymm0,ymm8,ymm0,0x31 + vmovdqa YMMWORD[rsp],ymm15 + vmovdqa YMMWORD[32+rsp],ymm9 + vmovdqa ymm15,YMMWORD[64+rsp] + vmovdqa ymm9,YMMWORD[96+rsp] + + vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] + vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] + vpaddd 
ymm15,ymm15,YMMWORD[((448-512))+rax] + vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] + + vpunpckldq ymm2,ymm12,ymm13 + vpunpckldq ymm8,ymm15,ymm9 + vpunpckhdq ymm12,ymm12,ymm13 + vpunpckhdq ymm15,ymm15,ymm9 + vpunpcklqdq ymm13,ymm2,ymm8 + vpunpckhqdq ymm2,ymm2,ymm8 + vpunpcklqdq ymm9,ymm12,ymm15 + vpunpckhqdq ymm12,ymm12,ymm15 + vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] + vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] + vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] + vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] + + vpunpckldq ymm15,ymm4,ymm5 + vpunpckldq ymm8,ymm6,ymm7 + vpunpckhdq ymm4,ymm4,ymm5 + vpunpckhdq ymm6,ymm6,ymm7 + vpunpcklqdq ymm5,ymm15,ymm8 + vpunpckhqdq ymm15,ymm15,ymm8 + vpunpcklqdq ymm7,ymm4,ymm6 + vpunpckhqdq ymm4,ymm4,ymm6 + vperm2i128 ymm8,ymm13,ymm5,0x20 + vperm2i128 ymm5,ymm13,ymm5,0x31 + vperm2i128 ymm13,ymm2,ymm15,0x20 + vperm2i128 ymm15,ymm2,ymm15,0x31 + vperm2i128 ymm2,ymm9,ymm7,0x20 + vperm2i128 ymm7,ymm9,ymm7,0x31 + vperm2i128 ymm9,ymm12,ymm4,0x20 + vperm2i128 ymm4,ymm12,ymm4,0x31 + vmovdqa ymm6,YMMWORD[rsp] + vmovdqa ymm12,YMMWORD[32+rsp] + + cmp rdx,64*8 + jb NEAR $L$tail8x + + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + lea rdi,[128+rdi] + + vpxor ymm12,ymm12,YMMWORD[rsi] + vpxor ymm13,ymm13,YMMWORD[32+rsi] + vpxor ymm10,ymm10,YMMWORD[64+rsi] + vpxor ymm15,ymm15,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm12 + vmovdqu YMMWORD[32+rdi],ymm13 + vmovdqu YMMWORD[64+rdi],ymm10 + vmovdqu YMMWORD[96+rdi],ymm15 + lea rdi,[128+rdi] + + vpxor ymm14,ymm14,YMMWORD[rsi] + vpxor ymm2,ymm2,YMMWORD[32+rsi] + vpxor ymm3,ymm3,YMMWORD[64+rsi] + vpxor ymm7,ymm7,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm14 + vmovdqu YMMWORD[32+rdi],ymm2 + vmovdqu YMMWORD[64+rdi],ymm3 + vmovdqu YMMWORD[96+rdi],ymm7 + lea 
rdi,[128+rdi] + + vpxor ymm11,ymm11,YMMWORD[rsi] + vpxor ymm9,ymm9,YMMWORD[32+rsi] + vpxor ymm0,ymm0,YMMWORD[64+rsi] + vpxor ymm4,ymm4,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm11 + vmovdqu YMMWORD[32+rdi],ymm9 + vmovdqu YMMWORD[64+rdi],ymm0 + vmovdqu YMMWORD[96+rdi],ymm4 + lea rdi,[128+rdi] + + sub rdx,64*8 + jnz NEAR $L$oop_outer8x + + jmp NEAR $L$done8x + +$L$tail8x: + cmp rdx,448 + jae NEAR $L$448_or_more8x + cmp rdx,384 + jae NEAR $L$384_or_more8x + cmp rdx,320 + jae NEAR $L$320_or_more8x + cmp rdx,256 + jae NEAR $L$256_or_more8x + cmp rdx,192 + jae NEAR $L$192_or_more8x + cmp rdx,128 + jae NEAR $L$128_or_more8x + cmp rdx,64 + jae NEAR $L$64_or_more8x + + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm6 + vmovdqa YMMWORD[32+rsp],ymm8 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$64_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + je NEAR $L$done8x + + lea rsi,[64+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm1 + lea rdi,[64+rdi] + sub rdx,64 + vmovdqa YMMWORD[32+rsp],ymm5 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$128_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + je NEAR $L$done8x + + lea rsi,[128+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm12 + lea rdi,[128+rdi] + sub rdx,128 + vmovdqa YMMWORD[32+rsp],ymm13 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$192_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + 
je NEAR $L$done8x + + lea rsi,[192+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm10 + lea rdi,[192+rdi] + sub rdx,192 + vmovdqa YMMWORD[32+rsp],ymm15 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$256_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + je NEAR $L$done8x + + lea rsi,[256+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm14 + lea rdi,[256+rdi] + sub rdx,256 + vmovdqa YMMWORD[32+rsp],ymm2 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$320_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + je NEAR $L$done8x + + lea rsi,[320+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm3 + lea rdi,[320+rdi] + sub rdx,320 + vmovdqa YMMWORD[32+rsp],ymm7 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$384_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor 
ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + je NEAR $L$done8x + + lea rsi,[384+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm11 + lea rdi,[384+rdi] + sub rdx,384 + vmovdqa YMMWORD[32+rsp],ymm9 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$448_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] + vpxor ymm11,ymm11,YMMWORD[384+rsi] + vpxor ymm9,ymm9,YMMWORD[416+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + vmovdqu YMMWORD[384+rdi],ymm11 + vmovdqu YMMWORD[416+rdi],ymm9 + je NEAR $L$done8x + + lea rsi,[448+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm0 + lea rdi,[448+rdi] + sub rdx,448 + vmovdqa YMMWORD[32+rsp],ymm4 + +$L$oop_tail8x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + 
lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail8x + +$L$done8x: + vzeroall + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$8x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ChaCha20_8x: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + lea r10,[$L$ctr32_body] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea r10,[$L$no_data] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[((64+24+48))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop 
rdi + pop rsi + DB 0F3h,0C3h ;repret + + + +ALIGN 16 +ssse3_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-40))+rax] + lea rdi,[512+r8] + mov ecx,4 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_GFp_ChaCha20_ctr32 wrt ..imagebase + DD $L$SEH_end_GFp_ChaCha20_ctr32 wrt ..imagebase + DD $L$SEH_info_GFp_ChaCha20_ctr32 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase + DD $L$SEH_end_ChaCha20_4x wrt ..imagebase + DD $L$SEH_info_ChaCha20_4x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase + DD $L$SEH_end_ChaCha20_8x wrt ..imagebase + DD $L$SEH_info_ChaCha20_8x wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_GFp_ChaCha20_ctr32: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + +$L$SEH_info_ChaCha20_ssse3: +DB 9,0,0,0 + DD ssse3_handler wrt ..imagebase + DD $L$ssse3_body wrt 
..imagebase,$L$ssse3_epilogue wrt ..imagebase + +$L$SEH_info_ChaCha20_4x: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase +$L$SEH_info_ChaCha20_8x: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase diff --git a/zeroidc/vendor/ring/pregenerated/tmp/chacha20_poly1305_x86_64-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/chacha20_poly1305_x86_64-nasm.asm new file mode 100644 index 000000000..150930f03 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/chacha20_poly1305_x86_64-nasm.asm @@ -0,0 +1,8941 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + +EXTERN GFp_ia32cap_P + +chacha20_poly1305_constants: + +ALIGN 64 +$L$chacha20_consts: +DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +$L$rol8: +DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +$L$rol16: +DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +$L$avx2_init: + DD 0,0,0,0 +$L$sse_inc: + DD 1,0,0,0 +$L$avx2_inc: + DD 2,0,0,0,2,0,0,0 +$L$clamp: + DQ 0x0FFFFFFC0FFFFFFF,0x0FFFFFFC0FFFFFFC + DQ 0xFFFFFFFFFFFFFFFF,0xFFFFFFFFFFFFFFFF +ALIGN 16 +$L$and_masks: +DB 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +DB 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + +ALIGN 64 +poly_hash_ad_internal: + + + xor r10,r10 + xor r11,r11 + xor r12,r12 + cmp r8,13 + jne NEAR $L$hash_ad_loop +$L$poly_fast_tls_ad: + + mov r10,QWORD[rcx] + mov r11,QWORD[5+rcx] + shr r11,24 + mov r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + DB 0F3h,0C3h ;repret +$L$hash_ad_loop: + + cmp r8,16 + jb NEAR $L$hash_ad_tail + add r10,QWORD[((0+0))+rcx] + adc r11,QWORD[((8+0))+rcx] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + 
add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rcx,[16+rcx] + sub r8,16 + jmp NEAR $L$hash_ad_loop +$L$hash_ad_tail: + cmp r8,0 + je NEAR $L$hash_ad_done + + xor r13,r13 + xor r14,r14 + xor r15,r15 + add rcx,r8 +$L$hash_ad_tail_loop: + shld r14,r13,8 + shl r13,8 + movzx r15,BYTE[((-1))+rcx] + xor r13,r15 + dec rcx + dec r8 + jne NEAR $L$hash_ad_tail_loop + + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$hash_ad_done: + DB 0F3h,0C3h ;repret + + + +global GFp_chacha20_poly1305_open + +ALIGN 64 +GFp_chacha20_poly1305_open: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_chacha20_poly1305_open: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps 
XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + + mov eax,DWORD[((GFp_ia32cap_P+8))] + and eax,288 + xor eax,288 + jz NEAR chacha20_poly1305_open_avx2 + + cmp rbx,128 + jbe NEAR $L$open_sse_128 + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqu xmm4,XMMWORD[r9] + movdqu xmm8,XMMWORD[16+r9] + movdqu xmm12,XMMWORD[32+r9] + + movdqa xmm7,xmm12 + + movdqa XMMWORD[(160+48)+rbp],xmm4 + movdqa XMMWORD[(160+64)+rbp],xmm8 + movdqa XMMWORD[(160+96)+rbp],xmm12 + mov r10,10 +$L$open_sse_init_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + dec r10 + jne NEAR $L$open_sse_init_rounds + + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + + pand xmm0,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm0 + movdqa XMMWORD[(160+16)+rbp],xmm4 + + mov r8,r8 + call poly_hash_ad_internal +$L$open_sse_main_loop: + cmp rbx,16*16 + jb NEAR $L$open_sse_tail + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa 
xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + + + + mov rcx,4 + mov r8,rsi +$L$open_sse_main_loop_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + + lea r8,[16+r8] + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd 
xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + imul r9,r12 + add r15,r10 + adc r9,rdx +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa 
xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + dec rcx + jge NEAR $L$open_sse_main_loop_rounds + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + cmp rcx,-6 + jg NEAR $L$open_sse_main_loop_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd 
xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqa XMMWORD[(160+80)+rbp],xmm12 + movdqu xmm12,XMMWORD[((0 + 0))+rsi] + pxor xmm12,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((16 + 0))+rsi] + pxor xmm12,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((32 + 0))+rsi] + pxor xmm12,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((48 + 0))+rsi] + pxor xmm12,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm12 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 192))+rsi] + movdqu xmm7,XMMWORD[((16 + 192))+rsi] + movdqu xmm11,XMMWORD[((32 + 192))+rsi] + movdqu xmm15,XMMWORD[((48 + 192))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,XMMWORD[((160+80))+rbp] + movdqu XMMWORD[(0 + 192)+rdi],xmm0 + movdqu XMMWORD[(16 + 192)+rdi],xmm4 + 
movdqu XMMWORD[(32 + 192)+rdi],xmm8 + movdqu XMMWORD[(48 + 192)+rdi],xmm15 + + lea rsi,[256+rsi] + lea rdi,[256+rdi] + sub rbx,16*16 + jmp NEAR $L$open_sse_main_loop +$L$open_sse_tail: + + test rbx,rbx + jz NEAR $L$open_sse_finalize + cmp rbx,12*16 + ja NEAR $L$open_sse_tail_256 + cmp rbx,8*16 + ja NEAR $L$open_sse_tail_192 + cmp rbx,4*16 + ja NEAR $L$open_sse_tail_128 + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm12,XMMWORD[((160+96))+rbp] + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + + xor r8,r8 + mov rcx,rbx + cmp rcx,16 + jb NEAR $L$open_sse_tail_64_rounds +$L$open_sse_tail_64_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 +$L$open_sse_tail_64_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + 
pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + cmp rcx,16 + jae NEAR $L$open_sse_tail_64_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_64_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_128: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm13,XMMWORD[((160+96))+rbp] + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + + mov rcx,rbx + and rcx,-16 + xor r8,r8 +$L$open_sse_tail_128_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_128_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd 
xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + + cmp r8,rcx + jb NEAR $L$open_sse_tail_128_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_128_rounds + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 
0))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 0)+rdi],xmm1 + movdqu XMMWORD[(16 + 0)+rdi],xmm5 + movdqu XMMWORD[(32 + 0)+rdi],xmm9 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + + sub rbx,4*16 + lea rsi,[64+rsi] + lea rdi,[64+rdi] + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_192: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm14,XMMWORD[((160+96))+rbp] + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + + mov rcx,rbx + mov r8,10*16 + cmp rcx,10*16 + cmovg rcx,r8 + and rcx,-16 + xor r8,r8 +$L$open_sse_tail_192_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_192_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld 
xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor 
xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + cmp r8,rcx + jb NEAR $L$open_sse_tail_192_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_192_rounds + cmp rbx,11*16 + jb NEAR $L$open_sse_tail_192_finish + add r10,QWORD[((0+160))+rsi] + adc r11,QWORD[((8+160))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + cmp rbx,12*16 + jb NEAR $L$open_sse_tail_192_finish + add r10,QWORD[((0+176))+rsi] + adc r11,QWORD[((8+176))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_192_finish: + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + 
paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + sub rbx,8*16 + lea rsi,[128+rsi] + lea rdi,[128+rdi] + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_256: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + + xor r8,r8 +$L$open_sse_tail_256_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + movdqa XMMWORD[(160+80)+rbp],xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb 
xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,12 + psrld xmm4,20 + pxor xmm4,xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,7 + psrld xmm4,25 + pxor xmm4,xmm11 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,12 + psrld xmm5,20 + pxor xmm5,xmm11 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,7 + psrld xmm5,25 + pxor xmm5,xmm11 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,12 + psrld xmm6,20 + pxor xmm6,xmm11 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,7 + psrld xmm6,25 + pxor xmm6,xmm11 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + movdqa xmm11,XMMWORD[((160+80))+rbp] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa XMMWORD[(160+80)+rbp],xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol16] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,12 + psrld xmm7,20 + pxor xmm7,xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol8] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,7 + psrld xmm7,25 + pxor xmm7,xmm9 +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 + movdqa xmm9,XMMWORD[((160+80))+rbp] + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul 
r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + movdqa XMMWORD[(160+80)+rbp],xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,12 + psrld xmm4,20 + pxor xmm4,xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,7 + psrld xmm4,25 + pxor xmm4,xmm11 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,12 + psrld xmm5,20 + pxor xmm5,xmm11 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,7 + psrld xmm5,25 + pxor xmm5,xmm11 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + imul r9,r12 + add r15,r10 + adc r9,rdx + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,12 + psrld xmm6,20 + pxor xmm6,xmm11 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,7 + psrld xmm6,25 + pxor xmm6,xmm11 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + movdqa xmm11,XMMWORD[((160+80))+rbp] + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + movdqa XMMWORD[(160+80)+rbp],xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol16] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,12 + psrld xmm7,20 + pxor xmm7,xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol8] + paddd xmm11,xmm15 + 
pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,7 + psrld xmm7,25 + pxor xmm7,xmm9 +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 + movdqa xmm9,XMMWORD[((160+80))+rbp] + + add r8,16 + cmp r8,10*16 + jb NEAR $L$open_sse_tail_256_rounds_and_x1hash + + mov rcx,rbx + and rcx,-16 +$L$open_sse_tail_256_hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + add r8,16 + cmp r8,rcx + jb NEAR $L$open_sse_tail_256_hash + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqa XMMWORD[(160+80)+rbp],xmm12 + movdqu xmm12,XMMWORD[((0 + 0))+rsi] + pxor xmm12,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((16 + 0))+rsi] + pxor xmm12,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((32 + 0))+rsi] + pxor xmm12,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm12 + movdqu 
xmm12,XMMWORD[((48 + 0))+rsi] + pxor xmm12,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm12 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + movdqa xmm12,XMMWORD[((160+80))+rbp] + sub rbx,12*16 + lea rsi,[192+rsi] + lea rdi,[192+rdi] + + +$L$open_sse_tail_64_dec_loop: + cmp rbx,16 + jb NEAR $L$open_sse_tail_16_init + sub rbx,16 + movdqu xmm3,XMMWORD[rsi] + pxor xmm0,xmm3 + movdqu XMMWORD[rdi],xmm0 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + movdqa xmm0,xmm4 + movdqa xmm4,xmm8 + movdqa xmm8,xmm12 + jmp NEAR $L$open_sse_tail_64_dec_loop +$L$open_sse_tail_16_init: + movdqa xmm1,xmm0 + + +$L$open_sse_tail_16: + test rbx,rbx + jz NEAR $L$open_sse_finalize + + + + pxor xmm3,xmm3 + lea rsi,[((-1))+rbx*1+rsi] + mov r8,rbx +$L$open_sse_tail_16_compose: + pslldq xmm3,1 + pinsrb xmm3,BYTE[rsi],0 + sub rsi,1 + sub r8,1 + jnz NEAR $L$open_sse_tail_16_compose + +DB 102,73,15,126,221 + pextrq r14,xmm3,1 + + pxor xmm3,xmm1 + + +$L$open_sse_tail_16_extract: + pextrb XMMWORD[rdi],xmm3,0 + psrldq xmm3,1 + add rdi,1 + sub rbx,1 + jne NEAR $L$open_sse_tail_16_extract + + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov 
rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$open_sse_finalize: + add r10,QWORD[((0+160+32))+rbp] + adc r11,QWORD[((8+160+32))+rbp] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + mov r13,r10 + mov r14,r11 + mov r15,r12 + sub r10,-5 + sbb r11,-1 + sbb r12,3 + cmovc r10,r13 + cmovc r11,r14 + cmovc r12,r15 + + add r10,QWORD[((0+160+16))+rbp] + adc r11,QWORD[((8+160+16))+rbp] + + movaps xmm6,XMMWORD[((0+0))+rbp] + movaps xmm7,XMMWORD[((16+0))+rbp] + movaps xmm8,XMMWORD[((32+0))+rbp] + movaps xmm9,XMMWORD[((48+0))+rbp] + movaps xmm10,XMMWORD[((64+0))+rbp] + movaps xmm11,XMMWORD[((80+0))+rbp] + movaps xmm12,XMMWORD[((96+0))+rbp] + movaps xmm13,XMMWORD[((112+0))+rbp] + movaps xmm14,XMMWORD[((128+0))+rbp] + movaps xmm15,XMMWORD[((144+0))+rbp] + + + add rsp,288 + 160 + 32 + + + pop r9 + + mov QWORD[r9],r10 + mov QWORD[8+r9],r11 + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$open_sse_128: + + movdqu xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqu 
xmm4,XMMWORD[r9] + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqu xmm8,XMMWORD[16+r9] + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqu xmm12,XMMWORD[32+r9] + movdqa xmm13,xmm12 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm13 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,xmm13 + mov r10,10 + +$L$open_sse_128_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 
102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + dec r10 + jnz NEAR $L$open_sse_128_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm4,xmm7 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + paddd xmm9,xmm11 + paddd xmm10,xmm11 + paddd xmm13,xmm15 + paddd xmm15,XMMWORD[$L$sse_inc] + paddd xmm14,xmm15 + + pand xmm0,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm0 + movdqa XMMWORD[(160+16)+rbp],xmm4 + + mov r8,r8 + call poly_hash_ad_internal +$L$open_sse_128_xor_hash: + cmp rbx,16 + jb NEAR $L$open_sse_tail_16 + sub rbx,16 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + + + movdqu xmm3,XMMWORD[rsi] + pxor xmm1,xmm3 + movdqu XMMWORD[rdi],xmm1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add 
r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + movdqa xmm1,xmm5 + movdqa xmm5,xmm9 + movdqa xmm9,xmm13 + movdqa xmm13,xmm2 + movdqa xmm2,xmm6 + movdqa xmm6,xmm10 + movdqa xmm10,xmm14 + jmp NEAR $L$open_sse_128_xor_hash +$L$SEH_end_GFp_chacha20_poly1305_open: + + + + + + + +global GFp_chacha20_poly1305_seal + +ALIGN 64 +GFp_chacha20_poly1305_seal: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_chacha20_poly1305_seal: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,QWORD[56+r9] + add rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + mov rbx,rdx + + mov eax,DWORD[((GFp_ia32cap_P+8))] + and eax,288 + xor eax,288 + jz NEAR chacha20_poly1305_seal_avx2 + + cmp rbx,128 + jbe NEAR $L$seal_sse_128 + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqu xmm4,XMMWORD[r9] + movdqu xmm8,XMMWORD[16+r9] + movdqu xmm12,XMMWORD[32+r9] + + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqa xmm3,xmm0 + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqa xmm7,xmm4 + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqa xmm11,xmm8 + movdqa xmm15,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + + movdqa 
XMMWORD[(160+48)+rbp],xmm4 + movdqa XMMWORD[(160+64)+rbp],xmm8 + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + mov r10,10 +$L$seal_sse_init_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 
+DB 102,69,15,58,15,246,12 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 +DB 102,15,58,15,237,12 +DB 
102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + dec r10 + jnz NEAR $L$seal_sse_init_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + + pand xmm3,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm3 + movdqa XMMWORD[(160+16)+rbp],xmm7 + + mov r8,r8 + call poly_hash_ad_internal + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + cmp rbx,12*16 + ja NEAR $L$seal_sse_main_init + mov rcx,8*16 + sub rbx,8*16 + lea rsi,[128+rsi] + jmp NEAR $L$seal_sse_128_tail_hash +$L$seal_sse_main_init: + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 
128))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,xmm12 + movdqu XMMWORD[(0 + 128)+rdi],xmm0 + movdqu XMMWORD[(16 + 128)+rdi],xmm4 + movdqu XMMWORD[(32 + 128)+rdi],xmm8 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + mov rcx,12*16 + sub rbx,12*16 + lea rsi,[192+rsi] + mov rcx,2 + mov r8,8 + cmp rbx,4*16 + jbe NEAR $L$seal_sse_tail_64 + cmp rbx,8*16 + jbe NEAR $L$seal_sse_tail_128 + cmp rbx,12*16 + jbe NEAR $L$seal_sse_tail_192 + +$L$seal_sse_main_loop: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + +ALIGN 32 +$L$seal_sse_main_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor 
xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + imul r9,r12 + add r15,r10 + adc r9,rdx +DB 102,15,58,15,255,4 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,12 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 
102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 +DB 102,69,15,56,0,248 +DB 102,69,15,56,0,240 +DB 102,69,15,56,0,232 +DB 102,69,15,56,0,224 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] +DB 102,15,58,15,255,12 +DB 102,69,15,58,15,219,8 +DB 102,69,15,58,15,255,4 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + + lea rdi,[16+rdi] + dec r8 + jge NEAR $L$seal_sse_main_rounds + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov 
rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_main_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + movdqa XMMWORD[(160+80)+rbp],xmm14 + movdqa XMMWORD[(160+80)+rbp],xmm14 + movdqu xmm14,XMMWORD[((0 + 0))+rsi] + pxor xmm14,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((16 + 0))+rsi] + pxor xmm14,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((32 + 0))+rsi] + pxor xmm14,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((48 + 0))+rsi] + pxor xmm14,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm14 + + movdqa xmm14,XMMWORD[((160+80))+rbp] + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu 
xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + cmp rbx,16*16 + ja NEAR $L$seal_sse_main_loop_xor + + mov rcx,12*16 + sub rbx,12*16 + lea rsi,[192+rsi] + jmp NEAR $L$seal_sse_128_tail_hash +$L$seal_sse_main_loop_xor: + movdqu xmm3,XMMWORD[((0 + 192))+rsi] + movdqu xmm7,XMMWORD[((16 + 192))+rsi] + movdqu xmm11,XMMWORD[((32 + 192))+rsi] + movdqu xmm15,XMMWORD[((48 + 192))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,xmm12 + movdqu XMMWORD[(0 + 192)+rdi],xmm0 + movdqu XMMWORD[(16 + 192)+rdi],xmm4 + movdqu XMMWORD[(32 + 192)+rdi],xmm8 + movdqu XMMWORD[(48 + 192)+rdi],xmm15 + + lea rsi,[256+rsi] + sub rbx,16*16 + mov rcx,6 + mov r8,4 + cmp rbx,12*16 + jg NEAR $L$seal_sse_main_loop + mov rcx,rbx + test rbx,rbx + je NEAR $L$seal_sse_128_tail_hash + mov rcx,6 + cmp rbx,8*16 + ja NEAR $L$seal_sse_tail_192 + cmp rbx,4*16 + ja NEAR $L$seal_sse_tail_128 + +$L$seal_sse_tail_64: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm12,XMMWORD[((160+96))+rbp] + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + +$L$seal_sse_tail_64_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov 
r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_64_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_64_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_64_rounds_and_x1hash + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + jmp NEAR $L$seal_sse_128_tail_xor + +$L$seal_sse_tail_128: + movdqa 
xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm13,XMMWORD[((160+96))+rbp] + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + +$L$seal_sse_tail_128_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_128_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + add r10,QWORD[((0+0))+rdi] + adc 
r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_128_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_128_rounds_and_x1hash + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] 
+ movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 0)+rdi],xmm1 + movdqu XMMWORD[(16 + 0)+rdi],xmm5 + movdqu XMMWORD[(32 + 0)+rdi],xmm9 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + + mov rcx,4*16 + sub rbx,4*16 + lea rsi,[64+rsi] + jmp NEAR $L$seal_sse_128_tail_hash + +$L$seal_sse_tail_192: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm14,XMMWORD[((160+96))+rbp] + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + +$L$seal_sse_tail_192_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_192_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 
102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor 
xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_192_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_192_rounds_and_x1hash + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 
+ pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + mov rcx,8*16 + sub rbx,8*16 + lea rsi,[128+rsi] + +$L$seal_sse_128_tail_hash: + cmp rcx,16 + jb NEAR $L$seal_sse_128_tail_xor + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 + lea rdi,[16+rdi] + jmp NEAR $L$seal_sse_128_tail_hash + +$L$seal_sse_128_tail_xor: + cmp rbx,16 + jb NEAR $L$seal_sse_tail_16 + sub rbx,16 + + movdqu xmm3,XMMWORD[rsi] + pxor xmm0,xmm3 + movdqu XMMWORD[rdi],xmm0 + + add r10,QWORD[rdi] + adc r11,QWORD[8+rdi] + adc r12,1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + movdqa xmm0,xmm4 + movdqa xmm4,xmm8 + movdqa xmm8,xmm12 + movdqa xmm12,xmm1 + movdqa xmm1,xmm5 + movdqa xmm5,xmm9 + movdqa xmm9,xmm13 + jmp NEAR $L$seal_sse_128_tail_xor + 
+$L$seal_sse_tail_16: + test rbx,rbx + jz NEAR $L$process_blocks_of_extra_in + + mov r8,rbx + mov rcx,rbx + lea rsi,[((-1))+rbx*1+rsi] + pxor xmm15,xmm15 +$L$seal_sse_tail_16_compose: + pslldq xmm15,1 + pinsrb xmm15,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + dec rcx + jne NEAR $L$seal_sse_tail_16_compose + + + pxor xmm15,xmm0 + + + mov rcx,rbx + movdqu xmm0,xmm15 +$L$seal_sse_tail_16_extract: + pextrb XMMWORD[rdi],xmm0,0 + psrldq xmm0,1 + add rdi,1 + sub rcx,1 + jnz NEAR $L$seal_sse_tail_16_extract + + + + + + + + + mov r9,QWORD[((288 + 160 + 32))+rsp] + mov r14,QWORD[56+r9] + mov r13,QWORD[48+r9] + test r14,r14 + jz NEAR $L$process_partial_block + + mov r15,16 + sub r15,rbx + cmp r14,r15 + + jge NEAR $L$load_extra_in + mov r15,r14 + +$L$load_extra_in: + + + lea rsi,[((-1))+r15*1+r13] + + + add r13,r15 + sub r14,r15 + mov QWORD[48+r9],r13 + mov QWORD[56+r9],r14 + + + + add r8,r15 + + + pxor xmm11,xmm11 +$L$load_extra_load_loop: + pslldq xmm11,1 + pinsrb xmm11,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + sub r15,1 + jnz NEAR $L$load_extra_load_loop + + + + + mov r15,rbx + +$L$load_extra_shift_loop: + pslldq xmm11,1 + sub r15,1 + jnz NEAR $L$load_extra_shift_loop + + + + + lea r15,[$L$and_masks] + shl rbx,4 + pand xmm15,XMMWORD[((-16))+rbx*1+r15] + + + por xmm15,xmm11 + + + +DB 102,77,15,126,253 + pextrq r14,xmm15,1 + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$process_blocks_of_extra_in: + + mov r9,QWORD[((288+32+160 ))+rsp] 
+ mov rsi,QWORD[48+r9] + mov r8,QWORD[56+r9] + mov rcx,r8 + shr r8,4 + +$L$process_extra_hash_loop: + jz NEAR process_extra_in_trailer + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rsi,[16+rsi] + sub r8,1 + jmp NEAR $L$process_extra_hash_loop +process_extra_in_trailer: + and rcx,15 + mov rbx,rcx + jz NEAR $L$do_length_block + lea rsi,[((-1))+rcx*1+rsi] + +$L$process_extra_in_trailer_load: + pslldq xmm15,1 + pinsrb xmm15,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + sub rcx,1 + jnz NEAR $L$process_extra_in_trailer_load + +$L$process_partial_block: + + lea r15,[$L$and_masks] + shl rbx,4 + pand xmm15,XMMWORD[((-16))+rbx*1+r15] +DB 102,77,15,126,253 + pextrq r14,xmm15,1 + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$do_length_block: + add r10,QWORD[((0+160+32))+rbp] + adc r11,QWORD[((8+160+32))+rbp] + adc r12,1 + mov 
rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + mov r13,r10 + mov r14,r11 + mov r15,r12 + sub r10,-5 + sbb r11,-1 + sbb r12,3 + cmovc r10,r13 + cmovc r11,r14 + cmovc r12,r15 + + add r10,QWORD[((0+160+16))+rbp] + adc r11,QWORD[((8+160+16))+rbp] + + movaps xmm6,XMMWORD[((0+0))+rbp] + movaps xmm7,XMMWORD[((16+0))+rbp] + movaps xmm8,XMMWORD[((32+0))+rbp] + movaps xmm9,XMMWORD[((48+0))+rbp] + movaps xmm10,XMMWORD[((64+0))+rbp] + movaps xmm11,XMMWORD[((80+0))+rbp] + movaps xmm12,XMMWORD[((96+0))+rbp] + movaps xmm13,XMMWORD[((112+0))+rbp] + movaps xmm14,XMMWORD[((128+0))+rbp] + movaps xmm15,XMMWORD[((144+0))+rbp] + + + add rsp,288 + 160 + 32 + + + pop r9 + + mov QWORD[r9],r10 + mov QWORD[8+r9],r11 + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$seal_sse_128: + + movdqu xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqu xmm4,XMMWORD[r9] + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqu xmm8,XMMWORD[16+r9] + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqu xmm14,XMMWORD[32+r9] + movdqa xmm12,xmm14 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm12 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,xmm12 + mov r10,10 + +$L$seal_sse_128_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld 
xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,4 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,4 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,4 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 +DB 102,15,58,15,228,12 +DB 102,69,15,58,15,192,8 +DB 102,69,15,58,15,228,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 +DB 102,15,58,15,237,12 +DB 102,69,15,58,15,201,8 +DB 102,69,15,58,15,237,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + 
pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 +DB 102,15,58,15,246,12 +DB 102,69,15,58,15,210,8 +DB 102,69,15,58,15,246,4 + + dec r10 + jnz NEAR $L$seal_sse_128_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm4,xmm7 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + paddd xmm8,xmm11 + paddd xmm9,xmm11 + paddd xmm12,xmm15 + paddd xmm15,XMMWORD[$L$sse_inc] + paddd xmm13,xmm15 + + pand xmm2,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm2 + movdqa XMMWORD[(160+16)+rbp],xmm6 + + mov r8,r8 + call poly_hash_ad_internal + jmp NEAR $L$seal_sse_128_tail_xor +$L$SEH_end_GFp_chacha20_poly1305_seal: + + + + +ALIGN 64 +chacha20_poly1305_open_avx2: + + + + + + + + + + + + + vzeroupper + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vbroadcasti128 ymm4,XMMWORD[r9] + vbroadcasti128 ymm8,XMMWORD[16+r9] + vbroadcasti128 ymm12,XMMWORD[32+r9] + vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] + cmp rbx,6*32 + jbe NEAR $L$open_avx2_192 + cmp rbx,10*32 + jbe NEAR $L$open_avx2_320 + + vmovdqa YMMWORD[(160+64)+rbp],ymm4 + vmovdqa YMMWORD[(160+96)+rbp],ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + mov r10,10 +$L$open_avx2_init_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm0,ymm0,ymm4 + vpxor 
ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + dec r10 + jne NEAR $L$open_avx2_init_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + + mov r8,r8 + call poly_hash_ad_internal + + xor rcx,rcx +$L$open_avx2_init_hash: + add r10,QWORD[((0+0))+rcx*1+rsi] + adc r11,QWORD[((8+0))+rcx*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + add rcx,16 + cmp rcx,2*32 + jne NEAR $L$open_avx2_init_hash + + vpxor ymm0,ymm0,YMMWORD[rsi] + vpxor ymm4,ymm4,YMMWORD[32+rsi] + + vmovdqu YMMWORD[rdi],ymm0 + vmovdqu YMMWORD[32+rdi],ymm4 + lea rsi,[64+rsi] + lea rdi,[64+rdi] + sub rbx,2*32 +$L$open_avx2_main_loop: + + cmp rbx,16*32 + jb NEAR $L$open_avx2_main_loop_done + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa 
ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor rcx,rcx +$L$open_avx2_main_loop_rounds: + add r10,QWORD[((0+0))+rcx*1+rsi] + adc r11,QWORD[((8+0))+rcx*1+rsi] + adc r12,1 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + add r15,rax + adc r9,rdx + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + 
vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + add r10,QWORD[((0+16))+rcx*1+rsi] + adc r11,QWORD[((8+16))+rcx*1+rsi] + adc r12,1 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb 
ymm14,ymm14,ymm8 + add r15,rax + adc r9,rdx + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + add r10,QWORD[((0+32))+rcx*1+rsi] + adc r11,QWORD[((8+32))+rcx*1+rsi] + adc r12,1 + + lea rcx,[48+rcx] + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + add r15,rax + adc r9,rdx + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor 
ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpalignr ymm12,ymm12,ymm12,4 + + cmp rcx,10*6*8 + jne NEAR $L$open_avx2_main_loop_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + add r10,QWORD[((0+480))+rsi] + adc r11,QWORD[((8+480))+rsi] + adc r12,1 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu 
YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + add r10,QWORD[((0+480+16))+rsi] + adc r11,QWORD[((8+480+16))+rsi] + adc r12,1 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov 
r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vperm2i128 ymm3,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm12,ymm8,0x02 + vperm2i128 ymm8,ymm12,ymm8,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] + vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] + vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi] + vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] + vmovdqu YMMWORD[(0+384)+rdi],ymm3 + vmovdqu YMMWORD[(32+384)+rdi],ymm0 + vmovdqu YMMWORD[(64+384)+rdi],ymm4 + vmovdqu YMMWORD[(96+384)+rdi],ymm8 + + lea rsi,[512+rsi] + lea rdi,[512+rdi] + sub rbx,16*32 + jmp NEAR $L$open_avx2_main_loop +$L$open_avx2_main_loop_done: + test rbx,rbx + vzeroupper + je NEAR $L$open_sse_finalize + + cmp rbx,12*32 + ja NEAR $L$open_avx2_tail_512 + cmp rbx,8*32 + ja NEAR $L$open_avx2_tail_384 + cmp rbx,4*32 + ja NEAR $L$open_avx2_tail_256 + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor r8,r8 + mov rcx,rbx + and rcx,-16 + test rcx,rcx + je NEAR $L$open_avx2_tail_128_rounds +$L$open_avx2_tail_128_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + 
+$L$open_avx2_tail_128_rounds: + add r8,16 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_128_rounds_and_x1hash + cmp r8,160 + jne NEAR $L$open_avx2_tail_128_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_256: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + + mov QWORD[((160+128))+rbp],rbx + mov rcx,rbx + sub rcx,4*32 + shr rcx,4 + mov 
r8,10 + cmp rcx,10 + cmovg rcx,r8 + mov rbx,rsi + xor r8,r8 +$L$open_avx2_tail_256_rounds_and_x1hash: + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] +$L$open_avx2_tail_256_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + + inc r8 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd 
ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_256_rounds_and_x1hash + cmp r8,10 + jne NEAR $L$open_avx2_tail_256_rounds + mov r8,rbx + sub rbx,rsi + mov rcx,rbx + mov rbx,QWORD[((160+128))+rbp] +$L$open_avx2_tail_256_hash: + add rcx,16 + cmp rcx,rbx + jg NEAR $L$open_avx2_tail_256_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + 
adc r12,0 + + lea r8,[16+r8] + jmp NEAR $L$open_avx2_tail_256_hash +$L$open_avx2_tail_256_done: + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm1 + vmovdqu YMMWORD[(64+0)+rdi],ymm5 + vmovdqu YMMWORD[(96+0)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[128+rsi] + lea rdi,[128+rdi] + sub rbx,4*32 + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_384: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + + mov QWORD[((160+128))+rbp],rbx + mov rcx,rbx + sub rcx,8*32 + shr rcx,4 + add rcx,6 + mov r8,10 + cmp rcx,10 + cmovg rcx,r8 + mov rbx,rsi + xor r8,r8 +$L$open_avx2_tail_384_rounds_and_x2hash: + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx 
rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] +$L$open_avx2_tail_384_rounds_and_x1hash: + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + add r10,QWORD[((0+0))+rbx] + adc 
r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] + inc r8 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld 
ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_384_rounds_and_x2hash + cmp r8,10 + jne NEAR $L$open_avx2_tail_384_rounds_and_x1hash + mov r8,rbx + sub rbx,rsi + mov rcx,rbx + mov rbx,QWORD[((160+128))+rbp] +$L$open_avx2_384_tail_hash: + add rcx,16 + cmp rcx,rbx + jg NEAR $L$open_avx2_384_tail_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + jmp NEAR $L$open_avx2_384_tail_hash +$L$open_avx2_384_tail_done: + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm2 + vmovdqu YMMWORD[(64+0)+rdi],ymm6 + vmovdqu 
YMMWORD[(96+0)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm1 + vmovdqu YMMWORD[(64+128)+rdi],ymm5 + vmovdqu YMMWORD[(96+128)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[256+rsi] + lea rdi,[256+rdi] + sub rbx,8*32 + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_512: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor rcx,rcx + mov r8,rsi +$L$open_avx2_tail_512_rounds_and_x2hash: + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + 
shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] +$L$open_avx2_tail_512_rounds_and_x1hash: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + 
vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + add r10,QWORD[((0+16))+r8] + adc r11,QWORD[((8+16))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[32+r8] + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor 
ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + inc rcx + cmp rcx,4 + jl NEAR $L$open_avx2_tail_512_rounds_and_x2hash + cmp rcx,10 + jne NEAR $L$open_avx2_tail_512_rounds_and_x1hash + mov rcx,rbx + sub rcx,12*32 + and rcx,-16 +$L$open_avx2_tail_512_hash: + test rcx,rcx + je NEAR $L$open_avx2_tail_512_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 
+ mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + sub rcx,2*8 + jmp NEAR $L$open_avx2_tail_512_hash +$L$open_avx2_tail_512_done: + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor 
ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[384+rsi] + lea rdi,[384+rdi] + sub rbx,12*32 +$L$open_avx2_tail_128_xor: + cmp rbx,32 + jb NEAR $L$open_avx2_tail_32_xor + sub rbx,32 + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + lea rdi,[32+rdi] + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + jmp NEAR $L$open_avx2_tail_128_xor +$L$open_avx2_tail_32_xor: + cmp rbx,16 + vmovdqa xmm1,xmm0 + jb NEAR $L$open_avx2_exit + sub rbx,16 + + vpxor xmm1,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + vperm2i128 ymm0,ymm0,ymm0,0x11 + vmovdqa xmm1,xmm0 +$L$open_avx2_exit: + vzeroupper + jmp NEAR $L$open_sse_tail_16 + +$L$open_avx2_192: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vmovdqa ymm11,ymm12 + vmovdqa ymm15,ymm13 + mov r10,10 +$L$open_avx2_192_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor 
ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + + dec r10 + jne NEAR $L$open_avx2_192_rounds + vpaddd ymm0,ymm0,ymm2 + vpaddd ymm1,ymm1,ymm2 + vpaddd ymm4,ymm4,ymm6 + vpaddd ymm5,ymm5,ymm6 + vpaddd ymm8,ymm8,ymm10 + vpaddd ymm9,ymm9,ymm10 + vpaddd ymm12,ymm12,ymm11 + vpaddd ymm13,ymm13,ymm15 + vperm2i128 
ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 +$L$open_avx2_short: + mov r8,r8 + call poly_hash_ad_internal +$L$open_avx2_short_hash_and_xor_loop: + cmp rbx,32 + jb NEAR $L$open_avx2_short_tail_32 + sub rbx,32 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rsi] + adc r11,QWORD[((8+16))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + lea rdi,[32+rdi] + + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + vmovdqa ymm12,ymm1 + vmovdqa ymm1,ymm5 + vmovdqa ymm5,ymm9 + vmovdqa ymm9,ymm13 + vmovdqa ymm13,ymm2 
+ vmovdqa ymm2,ymm6 + jmp NEAR $L$open_avx2_short_hash_and_xor_loop +$L$open_avx2_short_tail_32: + cmp rbx,16 + vmovdqa xmm1,xmm0 + jb NEAR $L$open_avx2_short_tail_32_exit + sub rbx,16 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + vpxor xmm3,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm3 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + vextracti128 xmm1,ymm0,1 +$L$open_avx2_short_tail_32_exit: + vzeroupper + jmp NEAR $L$open_sse_tail_16 + +$L$open_avx2_320: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + mov r10,10 +$L$open_avx2_320_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb 
ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb 
ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + dec r10 + jne NEAR $L$open_avx2_320_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,ymm7 + vpaddd ymm5,ymm5,ymm7 + vpaddd ymm6,ymm6,ymm7 + vpaddd ymm8,ymm8,ymm11 + vpaddd ymm9,ymm9,ymm11 + vpaddd ymm10,ymm10,ymm11 + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 + vperm2i128 ymm9,ymm6,ymm2,0x02 + vperm2i128 ymm13,ymm14,ymm10,0x02 + vperm2i128 ymm2,ymm6,ymm2,0x13 + vperm2i128 ymm6,ymm14,ymm10,0x13 + jmp NEAR $L$open_avx2_short + + + + + +ALIGN 64 +chacha20_poly1305_seal_avx2: + + + + + + + + + + + + + vzeroupper + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vbroadcasti128 ymm4,XMMWORD[r9] + vbroadcasti128 ymm8,XMMWORD[16+r9] + vbroadcasti128 ymm12,XMMWORD[32+r9] + vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] + cmp rbx,6*32 + jbe NEAR $L$seal_avx2_192 + cmp rbx,10*32 + jbe NEAR $L$seal_avx2_320 + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm3,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm7,ymm4 + vmovdqa YMMWORD[(160+64)+rbp],ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vmovdqa ymm11,ymm8 + vmovdqa 
YMMWORD[(160+96)+rbp],ymm8 + vmovdqa ymm15,ymm12 + vpaddd ymm14,ymm15,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm14,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + mov r10,10 +$L$seal_avx2_init_rounds: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor 
ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + 
vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + dec r10 + jnz NEAR $L$seal_avx2_init_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vperm2i128 ymm11,ymm15,ymm11,0x13 + vperm2i128 ymm15,ymm7,ymm3,0x02 + vperm2i128 ymm3,ymm7,ymm3,0x13 + vpand ymm15,ymm15,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm15 + mov r8,r8 + call poly_hash_ad_internal + + vpxor ymm3,ymm3,YMMWORD[rsi] + vpxor ymm11,ymm11,YMMWORD[32+rsi] + vmovdqu YMMWORD[rdi],ymm3 + vmovdqu YMMWORD[32+rdi],ymm11 + vperm2i128 ymm15,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + 
vpxor ymm15,ymm15,YMMWORD[((0+64))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+64))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+64))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+64))+rsi] + vmovdqu YMMWORD[(0+64)+rdi],ymm15 + vmovdqu YMMWORD[(32+64)+rdi],ymm2 + vmovdqu YMMWORD[(64+64)+rdi],ymm6 + vmovdqu YMMWORD[(96+64)+rdi],ymm10 + vperm2i128 ymm15,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm15,ymm15,YMMWORD[((0+192))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+192))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+192))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+192))+rsi] + vmovdqu YMMWORD[(0+192)+rdi],ymm15 + vmovdqu YMMWORD[(32+192)+rdi],ymm1 + vmovdqu YMMWORD[(64+192)+rdi],ymm5 + vmovdqu YMMWORD[(96+192)+rdi],ymm9 + vperm2i128 ymm15,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm15 + + lea rsi,[320+rsi] + sub rbx,10*32 + mov rcx,10*32 + cmp rbx,4*32 + jbe NEAR $L$seal_avx2_short_hash_remainder + vpxor ymm0,ymm0,YMMWORD[rsi] + vpxor ymm4,ymm4,YMMWORD[32+rsi] + vpxor ymm8,ymm8,YMMWORD[64+rsi] + vpxor ymm12,ymm12,YMMWORD[96+rsi] + vmovdqu YMMWORD[320+rdi],ymm0 + vmovdqu YMMWORD[352+rdi],ymm4 + vmovdqu YMMWORD[384+rdi],ymm8 + vmovdqu YMMWORD[416+rdi],ymm12 + lea rsi,[128+rsi] + sub rbx,4*32 + mov rcx,8 + mov r8,2 + cmp rbx,4*32 + jbe NEAR $L$seal_avx2_tail_128 + cmp rbx,8*32 + jbe NEAR $L$seal_avx2_tail_256 + cmp rbx,12*32 + jbe NEAR $L$seal_avx2_tail_384 + cmp rbx,16*32 + jbe NEAR $L$seal_avx2_tail_512 + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd 
ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + 
vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 
+ vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + + sub rdi,16 + mov rcx,9 + jmp NEAR $L$seal_avx2_main_loop_rounds_entry +ALIGN 32 +$L$seal_avx2_main_loop: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + 
vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + mov rcx,10 +ALIGN 32 +$L$seal_avx2_main_loop_rounds: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + add r15,rax + adc r9,rdx + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + 
+$L$seal_avx2_main_loop_rounds_entry: + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + add r15,rax + adc r9,rdx + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd 
ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + add r10,QWORD[((0+32))+rdi] + adc r11,QWORD[((8+32))+rdi] + adc r12,1 + + lea rdi,[48+rdi] + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + add r15,rax + adc r9,rdx + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr 
ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpalignr ymm12,ymm12,ymm12,4 + + dec rcx + jne NEAR $L$seal_avx2_main_loop_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + 
add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm12,ymm8,0x02 + vperm2i128 ymm8,ymm12,ymm8,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] + vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] + vpxor 
ymm4,ymm4,YMMWORD[((64+384))+rsi] + vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] + vmovdqu YMMWORD[(0+384)+rdi],ymm3 + vmovdqu YMMWORD[(32+384)+rdi],ymm0 + vmovdqu YMMWORD[(64+384)+rdi],ymm4 + vmovdqu YMMWORD[(96+384)+rdi],ymm8 + + lea rsi,[512+rsi] + sub rbx,16*32 + cmp rbx,16*32 + jg NEAR $L$seal_avx2_main_loop + + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + mov rcx,10 + xor r8,r8 + + cmp rbx,12*32 + ja NEAR $L$seal_avx2_tail_512 + cmp rbx,8*32 + ja NEAR $L$seal_avx2_tail_384 + cmp rbx,4*32 + ja NEAR $L$seal_avx2_tail_256 + +$L$seal_avx2_tail_128: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + +$L$seal_avx2_tail_128_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov 
r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_128_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr 
ymm4,ymm4,ymm4,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_128_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_128_rounds_and_2xhash + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + jmp NEAR $L$seal_avx2_short_loop + +$L$seal_avx2_tail_256: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + +$L$seal_avx2_tail_256_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + 
and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_256_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld 
ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_256_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_256_rounds_and_2xhash + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm5,ymm1,0x02 + 
vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm1 + vmovdqu YMMWORD[(64+0)+rdi],ymm5 + vmovdqu YMMWORD[(96+0)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,4*32 + lea rsi,[128+rsi] + sub rbx,4*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_tail_384: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + +$L$seal_avx2_tail_384_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_384_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb 
ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr 
ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] 
+ vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_384_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_384_rounds_and_2xhash + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm2 + vmovdqu YMMWORD[(64+0)+rdi],ymm6 + vmovdqu YMMWORD[(96+0)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm1 + vmovdqu YMMWORD[(64+128)+rdi],ymm5 + vmovdqu YMMWORD[(96+128)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,8*32 + lea rsi,[256+rsi] + sub rbx,8*32 + jmp NEAR 
$L$seal_avx2_short_hash_remainder + +$L$seal_avx2_tail_512: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + +$L$seal_avx2_tail_512_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_512_rounds_and_2xhash: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa 
YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + add r15,rax + adc r9,rdx + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 
+ vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld 
ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + + + + + + + + + + + + + + + + add r15,rax + adc r9,rdx + + + + + + + + + + + + + + + + + + + + + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_512_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_512_rounds_and_2xhash + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor 
ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,12*32 + lea rsi,[384+rsi] + sub rbx,12*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_320: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + mov r10,10 +$L$seal_avx2_320_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + 
vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld 
ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + dec r10 + jne NEAR $L$seal_avx2_320_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,ymm7 + vpaddd ymm5,ymm5,ymm7 + vpaddd ymm6,ymm6,ymm7 + vpaddd ymm8,ymm8,ymm11 + vpaddd ymm9,ymm9,ymm11 + vpaddd ymm10,ymm10,ymm11 + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 + vperm2i128 ymm9,ymm6,ymm2,0x02 + vperm2i128 ymm13,ymm14,ymm10,0x02 + vperm2i128 ymm2,ymm6,ymm2,0x13 + vperm2i128 ymm6,ymm14,ymm10,0x13 + jmp NEAR $L$seal_avx2_short + +$L$seal_avx2_192: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vmovdqa ymm11,ymm12 + vmovdqa 
ymm15,ymm13 + mov r10,10 +$L$seal_avx2_192_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + 
vpalignr ymm5,ymm5,ymm5,12 + + dec r10 + jne NEAR $L$seal_avx2_192_rounds + vpaddd ymm0,ymm0,ymm2 + vpaddd ymm1,ymm1,ymm2 + vpaddd ymm4,ymm4,ymm6 + vpaddd ymm5,ymm5,ymm6 + vpaddd ymm8,ymm8,ymm10 + vpaddd ymm9,ymm9,ymm10 + vpaddd ymm12,ymm12,ymm11 + vpaddd ymm13,ymm13,ymm15 + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 +$L$seal_avx2_short: + mov r8,r8 + call poly_hash_ad_internal + xor rcx,rcx +$L$seal_avx2_short_hash_remainder: + cmp rcx,16 + jb NEAR $L$seal_avx2_short_loop + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 + add rdi,16 + jmp NEAR $L$seal_avx2_short_hash_remainder +$L$seal_avx2_short_loop: + cmp rbx,32 + jb NEAR $L$seal_avx2_short_tail + sub rbx,32 + + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 
+ add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + vmovdqa ymm12,ymm1 + vmovdqa ymm1,ymm5 + vmovdqa ymm5,ymm9 + vmovdqa ymm9,ymm13 + vmovdqa ymm13,ymm2 + vmovdqa ymm2,ymm6 + jmp NEAR $L$seal_avx2_short_loop +$L$seal_avx2_short_tail: + cmp rbx,16 + jb NEAR $L$seal_avx2_exit + sub rbx,16 + vpxor xmm3,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm3 + lea rsi,[16+rsi] + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + vextracti128 xmm0,ymm0,1 +$L$seal_avx2_exit: + 
vzeroupper + jmp NEAR $L$seal_sse_tail_16 + + diff --git a/zeroidc/vendor/ring/pregenerated/tmp/ecp_nistz256-x86-win32n.asm b/zeroidc/vendor/ring/pregenerated/tmp/ecp_nistz256-x86-win32n.asm new file mode 100644 index 000000000..85b53b38e --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/ecp_nistz256-x86-win32n.asm @@ -0,0 +1,1105 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +;extern _GFp_ia32cap_P +L$ONE_mont: +dd 1,0,0,-1,-1,-1,-2,0 +align 16 +__ecp_nistz256_div_by_2: + mov ebp,DWORD [esi] + xor edx,edx + mov ebx,DWORD [4+esi] + mov eax,ebp + and ebp,1 + mov ecx,DWORD [8+esi] + sub edx,ebp + add eax,edx + adc ebx,edx + mov DWORD [edi],eax + adc ecx,edx + mov DWORD [4+edi],ebx + mov DWORD [8+edi],ecx + mov eax,DWORD [12+esi] + mov ebx,DWORD [16+esi] + adc eax,0 + mov ecx,DWORD [20+esi] + adc ebx,0 + mov DWORD [12+edi],eax + adc ecx,0 + mov DWORD [16+edi],ebx + mov DWORD [20+edi],ecx + mov eax,DWORD [24+esi] + mov ebx,DWORD [28+esi] + adc eax,ebp + adc ebx,edx + mov DWORD [24+edi],eax + sbb esi,esi + mov DWORD [28+edi],ebx + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + mov edx,DWORD [12+edi] + shr eax,1 + mov ebp,ebx + shl ebx,31 + or eax,ebx + shr ebp,1 + mov ebx,ecx + shl ecx,31 + mov DWORD [edi],eax + or ebp,ecx + mov eax,DWORD [16+edi] + shr ebx,1 + mov ecx,edx + shl edx,31 + mov DWORD [4+edi],ebp + or ebx,edx + mov ebp,DWORD [20+edi] + shr ecx,1 + mov edx,eax + shl eax,31 + mov DWORD [8+edi],ebx + or ecx,eax + mov ebx,DWORD [24+edi] + shr edx,1 + mov eax,ebp + shl ebp,31 + mov DWORD [12+edi],ecx + or edx,ebp + mov ecx,DWORD [28+edi] + shr eax,1 + mov ebp,ebx + shl ebx,31 + mov DWORD [16+edi],edx 
+       or eax,ebx                      ; tail of __ecp_nistz256_div_by_2: the eight dwords at [edi] are being shifted right by one bit
+       shr ebp,1
+       mov ebx,ecx
+       shl ecx,31                      ; bit 0 of the higher word is moved to bit 31 so it can be OR-ed into the word below
+       mov DWORD [20+edi],eax
+       or ebp,ecx
+       shr ebx,1
+       shl esi,31                      ; esi was produced by the earlier "sbb esi,esi"; its bit 0 is moved to bit 31
+       mov DWORD [24+edi],ebp
+       or ebx,esi                      ; that bit becomes the top bit of the most significant result word
+       mov DWORD [28+edi],ebx
+       ret
+global _GFp_nistz256_add               ; exported wrapper: loads three stack arguments and calls __ecp_nistz256_add
+align 16
+_GFp_nistz256_add:                     ; stack arguments, lowest address first: result pointer, operand pointer, operand pointer
+L$_GFp_nistz256_add_begin:
+       push ebp                        ; ebp, ebx, esi and edi are saved here and restored before ret
+       push ebx
+       push esi
+       push edi                        ; 16 bytes pushed plus the return address: the first argument is now at [20+esp]
+       mov esi,DWORD [24+esp]          ; second stack argument: first addend
+       mov ebp,DWORD [28+esp]          ; third stack argument: second addend
+       mov edi,DWORD [20+esp]          ; first stack argument: destination
+       call __ecp_nistz256_add
+       pop edi
+       pop esi
+       pop ebx
+       pop ebp
+       ret
+align 16
+__ecp_nistz256_add:                    ; in: esi, ebp = 8-dword operands, edi = 8-dword result; writes eax, ebx, ecx, edx, esi, ebp and flags; contains no branches
+       mov eax,DWORD [esi]             ; first half: [edi] = [esi] + [ebp], dword by dword; the mov instructions in between leave the carry flag alone
+       mov ebx,DWORD [4+esi]
+       mov ecx,DWORD [8+esi]
+       add eax,DWORD [ebp]
+       mov edx,DWORD [12+esi]
+       adc ebx,DWORD [4+ebp]
+       mov DWORD [edi],eax
+       adc ecx,DWORD [8+ebp]
+       mov DWORD [4+edi],ebx
+       adc edx,DWORD [12+ebp]
+       mov DWORD [8+edi],ecx
+       mov DWORD [12+edi],edx
+       mov eax,DWORD [16+esi]
+       mov ebx,DWORD [20+esi]
+       mov ecx,DWORD [24+esi]
+       adc eax,DWORD [16+ebp]          ; the carry from dword 3 continues into dwords 4..7
+       mov edx,DWORD [28+esi]
+       adc ebx,DWORD [20+ebp]
+       mov DWORD [16+edi],eax
+       adc ecx,DWORD [24+ebp]
+       mov DWORD [20+edi],ebx
+       mov esi,0                       ; zero esi with mov so that the pending carry flag survives (xor would clear it)
+       adc edx,DWORD [28+ebp]
+       mov DWORD [24+edi],ecx
+       adc esi,0                       ; esi = carry out of the 256-bit addition (0 or 1)
+       mov DWORD [28+edi],edx
+       mov eax,DWORD [edi]             ; trial subtraction: only the borrow chain matters, the register results are discarded
+       mov ebx,DWORD [4+edi]
+       mov ecx,DWORD [8+edi]
+       sub eax,-1                      ; constant subtracted, low dword first: ffffffff,ffffffff,ffffffff,0,0,0,1,ffffffff (presumably the P-256 field prime; confirm)
+       mov edx,DWORD [12+edi]
+       sbb ebx,-1
+       mov eax,DWORD [16+edi]
+       sbb ecx,-1
+       mov ebx,DWORD [20+edi]
+       sbb edx,0
+       mov ecx,DWORD [24+edi]
+       sbb eax,0
+       mov edx,DWORD [28+edi]
+       sbb ebx,0
+       sbb ecx,1
+       sbb edx,-1
+       sbb esi,0                       ; fold the final borrow into the carry word
+       not esi                         ; esi = 0 when the trial subtraction underflowed, otherwise all ones (assumes the sum is below twice the constant; confirm with callers)
+       mov eax,DWORD [edi]
+       mov ebp,esi
+       mov ebx,DWORD [4+edi]
+       shr ebp,31                      ; ebp = 0 or 1: the masked value of dword 6 of the constant
+       mov ecx,DWORD [8+edi]
+       sub eax,esi                     ; subtract the constant ANDed with the mask: dwords 0..2 use esi, 3..5 use 0, 6 uses ebp, 7 uses esi
+       mov edx,DWORD [12+edi]
+       sbb ebx,esi
+       mov DWORD [edi],eax
+       sbb ecx,esi
+       mov DWORD [4+edi],ebx
+       sbb edx,0
+       mov DWORD [8+edi],ecx
+       mov DWORD [12+edi],edx
+       mov eax,DWORD [16+edi]
+       mov ebx,DWORD [20+edi]
+       mov ecx,DWORD [24+edi]
+       sbb eax,0
+       mov edx,DWORD [28+edi]
+       sbb ebx,0
+       mov DWORD [16+edi],eax
+       sbb ecx,ebp
+       mov DWORD [20+edi],ebx
+       sbb edx,esi
+       mov DWORD [24+edi],ecx
+       mov DWORD [28+edi],edx
+       ret
+align 16
+__ecp_nistz256_sub:                    ; in: esi = minuend, ebp = subtrahend, edi = result (8 dwords each); writes eax, ebx, ecx, edx, esi, ebp and flags; contains no branches
+       mov eax,DWORD [esi]             ; [edi] = [esi] - [ebp], dword by dword with the borrow carried through sbb
+       mov ebx,DWORD [4+esi]
+       mov ecx,DWORD [8+esi]
+       sub eax,DWORD [ebp]
+       mov edx,DWORD [12+esi]
+       sbb ebx,DWORD [4+ebp]
+       mov DWORD [edi],eax
+       sbb ecx,DWORD [8+ebp]
+       mov DWORD [4+edi],ebx
+       sbb edx,DWORD [12+ebp]
+       mov DWORD [8+edi],ecx
+       mov DWORD [12+edi],edx
+       mov eax,DWORD [16+esi]
+       mov ebx,DWORD [20+esi]
+       mov ecx,DWORD [24+esi]
+       sbb eax,DWORD [16+ebp]
+       mov edx,DWORD [28+esi]
+       sbb ebx,DWORD [20+ebp]
+       sbb ecx,DWORD [24+ebp]
+       mov DWORD [16+edi],eax
+       sbb edx,DWORD [28+ebp]
+       mov DWORD [20+edi],ebx
+       sbb esi,esi                     ; esi = all ones if the subtraction borrowed, otherwise 0
+       mov DWORD [24+edi],ecx
+       mov DWORD [28+edi],edx
+       mov eax,DWORD [edi]
+       mov ebp,esi
+       mov ebx,DWORD [4+edi]
+       shr ebp,31                      ; ebp = 0 or 1: the masked value of dword 6 of the constant
+       mov ecx,DWORD [8+edi]
+       add eax,esi                     ; add back the constant ffffffff,ffffffff,ffffffff,0,0,0,1,ffffffff ANDed with the borrow mask
+       mov edx,DWORD [12+edi]
+       adc ebx,esi
+       mov DWORD [edi],eax
+       adc ecx,esi
+       mov DWORD [4+edi],ebx
+       adc edx,0
+       mov DWORD [8+edi],ecx
+       mov DWORD [12+edi],edx
+       mov eax,DWORD [16+edi]
+       mov ebx,DWORD [20+edi]
+       mov ecx,DWORD [24+edi]
+       adc eax,0
+       mov edx,DWORD [28+edi]
+       adc ebx,0
+       mov DWORD [16+edi],eax
+       adc ecx,ebp
+       mov DWORD [20+edi],ebx
+       adc edx,esi
+       mov DWORD [24+edi],ecx
+       mov DWORD [28+edi],edx
+       ret
+global _GFp_nistz256_neg               ; exported: stores (0 - operand) through __ecp_nistz256_sub
+align 16
+_GFp_nistz256_neg:                     ; stack arguments, lowest address first: result pointer, operand pointer
+L$_GFp_nistz256_neg_begin:
+       push ebp
+       push ebx
+       push esi
+       push edi
+       mov ebp,DWORD [24+esp]          ; second stack argument: the value to negate (used as subtrahend)
+       mov edi,DWORD [20+esp]          ; first stack argument: destination
+       xor eax,eax
+       sub esp,32                      ; 32-byte temporary that is filled with zeros below
+       mov DWORD [esp],eax
+       mov esi,esp                     ; minuend = the zeroed temporary
+       mov DWORD [4+esp],eax
+       mov DWORD [8+esp],eax
+       mov DWORD [12+esp],eax
+       mov DWORD [16+esp],eax
+       mov DWORD [20+esp],eax
+       mov DWORD [24+esp],eax
+       mov DWORD [28+esp],eax
+       call __ecp_nistz256_sub
+       add esp,32                      ; release the temporary
+       pop edi
+       pop esi
+       pop ebx
+       pop ebp
+       ret
+align 16
+__picup_eax:                           ; returns in eax the dword at the top of the stack, i.e. its own return address
+       mov eax,DWORD [esp]
+       ret
+global _GFp_nistz256_mul_mont          ; exported wrapper around __ecp_nistz256_mul_mont
+align 16
+_GFp_nistz256_mul_mont:                ; stack arguments, lowest address first: result pointer, operand pointer, operand pointer
+L$_GFp_nistz256_mul_mont_begin:
+       push ebp
+       push ebx
+       push esi
+       push edi
+       mov esi,DWORD [24+esp]          ; second stack argument
+       mov ebp,DWORD [28+esp]          ; third stack argument
+       call __picup_eax                ; NOTE(review): the address returned in eax is overwritten by the next lea without being used
+L$000pic:
+       lea eax,[_GFp_ia32cap_P]
+       mov eax,DWORD [eax]             ; eax = first dword of _GFp_ia32cap_P
+       mov edi,DWORD [20+esp]          ; first stack argument: destination
+       call __ecp_nistz256_mul_mont
+       pop edi
+       pop
esi
+       pop ebx
+       pop ebp
+       ret
+align 16
+__ecp_nistz256_mul_mont:               ; in: esi = operand cached as a table, ebp = operand consumed one dword per pass, edi = 8-dword result; uses xmm0-xmm7; writes eax, ebx, ecx, edx, esi, ebp
+       mov edx,esp                     ; remember the caller's esp; it is restored by "mov esp,edx" near the end
+       sub esp,256
+       movd xmm7,DWORD [ebp]           ; first dword of the ebp operand
+       lea ebp,[4+ebp]
+       pcmpeqd xmm6,xmm6               ; xmm6 = all ones
+       psrlq xmm6,48                   ; xmm6 = 0xffff in each 64-bit lane
+       pshuflw xmm7,xmm7,220           ; together with the pshufd below: low 16 bits of the dword go to lane 0, high 16 bits to lane 1
+       and esp,-64                     ; align the scratch frame to 64 bytes (the movdqa accesses below need 16-byte alignment)
+       pshufd xmm7,xmm7,220
+       lea ebx,[128+esp]               ; ebx = 128-byte table holding the broadcast dwords of the esi operand
+       movd xmm0,DWORD [esi]
+       pshufd xmm0,xmm0,204            ; copy the dword into the low half of both 64-bit lanes
+       movd xmm1,DWORD [4+esi]
+       movdqa [ebx],xmm0
+       pmuludq xmm0,xmm7               ; per lane: 32-bit table word times one 16-bit half of the current multiplier dword
+       movd xmm2,DWORD [8+esi]
+       pshufd xmm1,xmm1,204
+       movdqa [16+ebx],xmm1
+       pmuludq xmm1,xmm7
+       movq xmm4,xmm0                  ; NOTE(review): this movq/pslldq/paddq/psrldq/pand group looks like the per-pass reduction step; not verified here
+       pslldq xmm4,6
+       paddq xmm4,xmm0
+       movdqa xmm5,xmm4
+       psrldq xmm4,10
+       pand xmm5,xmm6
+       movd xmm3,DWORD [12+esi]
+       pshufd xmm2,xmm2,204
+       movdqa [32+ebx],xmm2
+       pmuludq xmm2,xmm7
+       paddq xmm1,xmm4
+       movdqa [esp],xmm1               ; partial sums are kept at [esp] .. [96+esp], one 16-byte slot each
+       movd xmm0,DWORD [16+esi]
+       pshufd xmm3,xmm3,204
+       movdqa [48+ebx],xmm3
+       pmuludq xmm3,xmm7
+       movdqa [16+esp],xmm2
+       movd xmm1,DWORD [20+esi]
+       pshufd xmm0,xmm0,204
+       movdqa [64+ebx],xmm0
+       pmuludq xmm0,xmm7
+       paddq xmm3,xmm5
+       movdqa [32+esp],xmm3
+       movd xmm2,DWORD [24+esi]
+       pshufd xmm1,xmm1,204
+       movdqa [80+ebx],xmm1
+       pmuludq xmm1,xmm7
+       movdqa [48+esp],xmm0
+       pshufd xmm4,xmm5,177            ; swap the two dwords inside each 64-bit lane
+       movd xmm3,DWORD [28+esi]
+       pshufd xmm2,xmm2,204
+       movdqa [96+ebx],xmm2
+       pmuludq xmm2,xmm7
+       movdqa [64+esp],xmm1
+       psubq xmm4,xmm5
+       movd xmm0,DWORD [ebp]           ; second dword of the ebp operand
+       pshufd xmm3,xmm3,204
+       movdqa [112+ebx],xmm3
+       pmuludq xmm3,xmm7
+       pshuflw xmm7,xmm0,220           ; split the new multiplier dword into 16-bit halves, as above
+       movdqa xmm0,[ebx]
+       pshufd xmm7,xmm7,220
+       mov ecx,6                       ; the loop body below runs six times, loading one further multiplier dword per pass
+       lea ebp,[4+ebp]
+       jmp NEAR L$001madd_sse2
+align 16
+L$001madd_sse2:                        ; one pass: multiply the cached table by the current multiplier dword and add to the partial sums
+       paddq xmm2,xmm5
+       paddq xmm3,xmm4
+       movdqa xmm1,[16+ebx]
+       pmuludq xmm0,xmm7
+       movdqa [80+esp],xmm2
+       movdqa xmm2,[32+ebx]
+       pmuludq xmm1,xmm7
+       movdqa [96+esp],xmm3
+       paddq xmm0,[esp]
+       movdqa xmm3,[48+ebx]
+       pmuludq xmm2,xmm7
+       movq xmm4,xmm0
+       pslldq xmm4,6
+       paddq xmm1,[16+esp]
+       paddq xmm4,xmm0
+       movdqa xmm5,xmm4
+       psrldq xmm4,10
+       movdqa xmm0,[64+ebx]
+       pmuludq xmm3,xmm7
+       paddq xmm1,xmm4
+       paddq xmm2,[32+esp]
+       movdqa [esp],xmm1
+       movdqa xmm1,[80+ebx]
+       pmuludq xmm0,xmm7
+       paddq xmm3,[48+esp]
+       movdqa [16+esp],xmm2
+       pand xmm5,xmm6
+       movdqa xmm2,[96+ebx]
+       pmuludq xmm1,xmm7
+       paddq xmm3,xmm5
+       paddq xmm0,[64+esp]
+       movdqa [32+esp],xmm3
+       pshufd xmm4,xmm5,177
+       movdqa xmm3,xmm7                ; keep a copy of the current multiplier so xmm7 can be reloaded early
+       pmuludq xmm2,xmm7
+       movd xmm7,DWORD [ebp]           ; next dword of the ebp operand
+       lea ebp,[4+ebp]
+       paddq xmm1,[80+esp]
+       psubq xmm4,xmm5
+       movdqa [48+esp],xmm0
+       pshuflw xmm7,xmm7,220
+       pmuludq xmm3,[112+ebx]          ; last table entry times the saved copy of the current multiplier
+       pshufd xmm7,xmm7,220
+       movdqa xmm0,[ebx]
+       movdqa [64+esp],xmm1
+       paddq xmm2,[96+esp]
+       dec ecx
+       jnz NEAR L$001madd_sse2
+       paddq xmm2,xmm5                 ; final pass: same sequence as the loop body, but no further multiplier dword is loaded
+       paddq xmm3,xmm4
+       movdqa xmm1,[16+ebx]
+       pmuludq xmm0,xmm7
+       movdqa [80+esp],xmm2
+       movdqa xmm2,[32+ebx]
+       pmuludq xmm1,xmm7
+       movdqa [96+esp],xmm3
+       paddq xmm0,[esp]
+       movdqa xmm3,[48+ebx]
+       pmuludq xmm2,xmm7
+       movq xmm4,xmm0
+       pslldq xmm4,6
+       paddq xmm1,[16+esp]
+       paddq xmm4,xmm0
+       movdqa xmm5,xmm4
+       psrldq xmm4,10
+       movdqa xmm0,[64+ebx]
+       pmuludq xmm3,xmm7
+       paddq xmm1,xmm4
+       paddq xmm2,[32+esp]
+       movdqa [esp],xmm1
+       movdqa xmm1,[80+ebx]
+       pmuludq xmm0,xmm7
+       paddq xmm3,[48+esp]
+       movdqa [16+esp],xmm2
+       pand xmm5,xmm6
+       movdqa xmm2,[96+ebx]
+       pmuludq xmm1,xmm7
+       paddq xmm3,xmm5
+       paddq xmm0,[64+esp]
+       movdqa [32+esp],xmm3
+       pshufd xmm4,xmm5,177
+       movdqa xmm3,[112+ebx]
+       pmuludq xmm2,xmm7
+       paddq xmm1,[80+esp]
+       psubq xmm4,xmm5
+       movdqa [48+esp],xmm0
+       pmuludq xmm3,xmm7
+       pcmpeqd xmm7,xmm7               ; xmm7 = all ones
+       movdqa xmm0,[esp]
+       pslldq xmm7,8                   ; xmm7 = mask that keeps only the upper 64-bit lane
+       movdqa [64+esp],xmm1
+       paddq xmm2,[96+esp]
+       paddq xmm2,xmm5
+       paddq xmm3,xmm4
+       movdqa [80+esp],xmm2
+       movdqa [96+esp],xmm3
+       movdqa xmm1,[16+esp]
+       movdqa xmm2,[32+esp]
+       movdqa xmm3,[48+esp]
+       movq xmm4,xmm0                  ; from here on: combine the two lanes of each slot and peel off one 32-bit result word at a time
+       pand xmm0,xmm7
+       xor ebp,ebp                     ; ebp = 0; used by "sub ebp,esi" near the end
+       pslldq xmm4,6
+       movq xmm5,xmm1
+       paddq xmm0,xmm4
+       pand xmm1,xmm7
+       psrldq xmm0,6
+       movd eax,xmm0                   ; result word 0
+       psrldq xmm0,4
+       paddq xmm5,xmm0
+       movdqa xmm0,[64+esp]
+       sub eax,-1                      ; start subtracting the constant ffffffff,ffffffff,ffffffff,0,0,0,1,ffffffff; the SSE and mov instructions in between do not touch the borrow flag
+       pslldq xmm5,6
+       movq xmm4,xmm2
+       paddq xmm1,xmm5
+       pand xmm2,xmm7
+       psrldq xmm1,6
+       mov DWORD [edi],eax
+       movd eax,xmm1                   ; result word 1
+       psrldq xmm1,4
+       paddq xmm4,xmm1
+       movdqa xmm1,[80+esp]
+       sbb eax,-1
+       pslldq xmm4,6
+       movq xmm5,xmm3
+       paddq xmm2,xmm4
+       pand xmm3,xmm7
+       psrldq xmm2,6
+       mov DWORD [4+edi],eax
+       movd eax,xmm2                   ; result word 2
+       psrldq xmm2,4
+       paddq xmm5,xmm2
+       movdqa xmm2,[96+esp]
+       sbb eax,-1
+       pslldq xmm5,6
+       movq xmm4,xmm0
+       paddq xmm3,xmm5
+       pand xmm0,xmm7
+       psrldq xmm3,6
+       mov DWORD [8+edi],eax
+       movd eax,xmm3                   ; result word 3
+       psrldq xmm3,4
+       paddq xmm4,xmm3
+       sbb eax,0
+       pslldq xmm4,6
+       movq xmm5,xmm1
+       paddq xmm0,xmm4
+       pand xmm1,xmm7
+       psrldq xmm0,6
+       mov DWORD [12+edi],eax
+       movd eax,xmm0                   ; result word 4 (kept in eax until the final store)
+       psrldq xmm0,4
+       paddq xmm5,xmm0
+       sbb eax,0
+       pslldq xmm5,6
+       movq xmm4,xmm2
+       paddq xmm1,xmm5
+       pand xmm2,xmm7
+       psrldq xmm1,6
+       movd ebx,xmm1                   ; result word 5
+       psrldq xmm1,4
+       mov esp,edx                     ; restore the caller's esp; the scratch frame is not read after this point
+       paddq xmm4,xmm1
+       pslldq xmm4,6
+       paddq xmm2,xmm4
+       psrldq xmm2,6
+       movd ecx,xmm2                   ; result word 6
+       psrldq xmm2,4
+       sbb ebx,0
+       movd edx,xmm2                   ; result word 7
+       pextrw esi,xmm2,2               ; esi = 16-bit word 2 of xmm2, zero-extended: the bits above result word 7
+       sbb ecx,1
+       sbb edx,-1
+       sbb esi,0                       ; fold the final borrow into the overflow word
+       sub ebp,esi                     ; ebp was zeroed above, so ebp = -esi
+       add DWORD [edi],esi             ; add the constant back under the esi mask: dwords 0..2 use esi, 3..5 use 0, 6 uses ebp, 7 uses esi
+       adc DWORD [4+edi],esi
+       adc DWORD [8+edi],esi
+       adc DWORD [12+edi],0
+       adc eax,0
+       adc ebx,0
+       mov DWORD [16+edi],eax
+       adc ecx,ebp
+       mov DWORD [20+edi],ebx
+       adc edx,esi
+       mov DWORD [24+edi],ecx
+       mov DWORD [28+edi],edx
+       ret
+global _GFp_nistz256_point_double      ; exported; the name suggests elliptic-curve point doubling (the steps below are described by offset only)
+align 16
+_GFp_nistz256_point_double:            ; stack arguments, lowest address first: result pointer, input pointer
+L$_GFp_nistz256_point_double_begin:
+       push ebp
+       push ebx
+       push esi
+       push edi
+       mov esi,DWORD [24+esp]          ; second stack argument: input
+       sub esp,164                     ; frame: 32-byte temporaries at offsets 0, 32, 64, 96, 128 and one dword at [160+esp]
+       call __picup_eax                ; NOTE(review): the value returned in eax is not used before eax is overwritten
+L$002pic:
+       lea edx,[_GFp_ia32cap_P]
+       mov ebp,DWORD [edx]             ; ebp = first dword of _GFp_ia32cap_P
+L$point_double_shortcut:
+       mov eax,DWORD [esi]             ; copy input bytes 0..31 to the temporary at [96+esp]
+       mov ebx,DWORD [4+esi]
+       mov ecx,DWORD [8+esi]
+       mov edx,DWORD [12+esi]
+       mov DWORD [96+esp],eax
+       mov DWORD [100+esp],ebx
+       mov DWORD [104+esp],ecx
+       mov DWORD [108+esp],edx
+       mov eax,DWORD [16+esi]
+       mov ebx,DWORD [20+esi]
+       mov ecx,DWORD [24+esi]
+       mov edx,DWORD [28+esi]
+       mov DWORD [112+esp],eax
+       mov DWORD [116+esp],ebx
+       mov DWORD [120+esp],ecx
+       mov DWORD [124+esp],edx
+       mov DWORD [160+esp],ebp         ; stash the _GFp_ia32cap_P dword; it is reloaded into eax before every mul_mont call
+       lea ebp,[32+esi]
+       lea esi,[32+esi]
+       lea edi,[esp]
+       call __ecp_nistz256_add         ; [esp] = input[32..63] + input[32..63]
+       mov eax,DWORD [160+esp]
+       mov esi,64
+       add esi,DWORD [188+esp]         ; [188+esp] is the second stack argument again (164 frame + 16 saved registers + 4 return address + 4)
+       lea edi,[64+esp]
+       mov ebp,esi
+       call __ecp_nistz256_mul_mont    ; [64+esp] = input[64..95] * input[64..95]
+       mov eax,DWORD [160+esp]
+       lea esi,[esp]                   ; continuation of _GFp_nistz256_point_double; [184+esp] = result pointer, [188+esp] = input pointer
+       lea ebp,[esp]
+       lea edi,[esp]
+       call __ecp_nistz256_mul_mont    ; [esp] = [esp] * [esp]
+       mov eax,DWORD [160+esp]
+       mov ebp,DWORD [188+esp]
+       lea esi,[32+ebp]
+       lea ebp,[64+ebp]
+       lea edi,[128+esp]
+       call __ecp_nistz256_mul_mont    ; [128+esp] = input[32..63] * input[64..95]
+       lea esi,[96+esp]
+       lea ebp,[64+esp]
+       lea edi,[32+esp]
+       call __ecp_nistz256_add         ; [32+esp] = [96+esp] + [64+esp]
+       mov edi,64
+       lea esi,[128+esp]
+       lea ebp,[128+esp]
+       add edi,DWORD [184+esp]
+       call __ecp_nistz256_add         ; result[64..95] = [128+esp] + [128+esp]
+       lea esi,[96+esp]
+       lea ebp,[64+esp]
+       lea edi,[64+esp]
+       call __ecp_nistz256_sub         ; [64+esp] = [96+esp] - [64+esp]
+       mov eax,DWORD [160+esp]
+       lea esi,[esp]
+       lea ebp,[esp]
+       lea edi,[128+esp]
+       call __ecp_nistz256_mul_mont    ; [128+esp] = [esp] * [esp]
+       mov eax,DWORD [160+esp]
+       lea esi,[32+esp]
+       lea ebp,[64+esp]
+       lea edi,[32+esp]
+       call __ecp_nistz256_mul_mont    ; [32+esp] = [32+esp] * [64+esp]
+       mov edi,32
+       lea esi,[128+esp]
+       add edi,DWORD [184+esp]
+       call __ecp_nistz256_div_by_2    ; result[32..63] = [128+esp] halved
+       lea esi,[32+esp]
+       lea ebp,[32+esp]
+       lea edi,[128+esp]
+       call __ecp_nistz256_add         ; [128+esp] = [32+esp] + [32+esp]
+       mov eax,DWORD [160+esp]
+       lea esi,[96+esp]
+       lea ebp,[esp]
+       lea edi,[esp]
+       call __ecp_nistz256_mul_mont    ; [esp] = [96+esp] * [esp]
+       lea esi,[128+esp]
+       lea ebp,[32+esp]
+       lea edi,[32+esp]
+       call __ecp_nistz256_add         ; [32+esp] = [128+esp] + [32+esp], i.e. three times the earlier [32+esp]
+       lea esi,[esp]
+       lea ebp,[esp]
+       lea edi,[128+esp]
+       call __ecp_nistz256_add         ; [128+esp] = [esp] + [esp]
+       mov eax,DWORD [160+esp]
+       lea esi,[32+esp]
+       lea ebp,[32+esp]
+       mov edi,DWORD [184+esp]
+       call __ecp_nistz256_mul_mont    ; result[0..31] = [32+esp] * [32+esp]
+       mov esi,edi                     ; edi still holds the result pointer (the helper does not modify edi)
+       lea ebp,[128+esp]
+       call __ecp_nistz256_sub         ; result[0..31] = result[0..31] - [128+esp]
+       lea esi,[esp]
+       mov ebp,edi
+       lea edi,[esp]
+       call __ecp_nistz256_sub         ; [esp] = [esp] - result[0..31]
+       mov eax,DWORD [160+esp]
+       mov esi,edi
+       lea ebp,[32+esp]
+       call __ecp_nistz256_mul_mont    ; [esp] = [esp] * [32+esp]
+       mov ebp,32
+       lea esi,[esp]
+       add ebp,DWORD [184+esp]
+       mov edi,ebp
+       call __ecp_nistz256_sub         ; result[32..63] = [esp] - result[32..63]
+       add esp,164                     ; release the frame
+       pop edi
+       pop esi
+       pop ebx
+       pop ebp
+       ret
+global _GFp_nistz256_point_add_affine  ; exported; only the beginning of this routine is visible here
+align 16
+_GFp_nistz256_point_add_affine:
+L$_GFp_nistz256_point_add_affine_begin:
+       push ebp
+       push ebx
+       push esi
+       push edi
+       mov esi,DWORD [24+esp]          ; second stack argument
+       sub esp,492                     ; 492-byte local frame
+       call __picup_eax
+L$003pic:
+       lea edx,[_GFp_ia32cap_P]
+       mov ebp,DWORD [edx]             ; ebp = first dword of _GFp_ia32cap_P
+       lea edi,[96+esp]
+       mov eax,DWORD [esi]
+       mov ebx,DWORD
[4+esi] + mov ecx,DWORD [8+esi] + mov edx,DWORD [12+esi] + mov DWORD [edi],eax + mov DWORD [488+esp],ebp + mov DWORD [4+edi],ebx + mov DWORD [8+edi],ecx + mov DWORD [12+edi],edx + mov eax,DWORD [16+esi] + mov ebx,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov edx,DWORD [28+esi] + mov DWORD [16+edi],eax + mov DWORD [20+edi],ebx + mov DWORD [24+edi],ecx + mov DWORD [28+edi],edx + mov eax,DWORD [32+esi] + mov ebx,DWORD [36+esi] + mov ecx,DWORD [40+esi] + mov edx,DWORD [44+esi] + mov DWORD [32+edi],eax + mov DWORD [36+edi],ebx + mov DWORD [40+edi],ecx + mov DWORD [44+edi],edx + mov eax,DWORD [48+esi] + mov ebx,DWORD [52+esi] + mov ecx,DWORD [56+esi] + mov edx,DWORD [60+esi] + mov DWORD [48+edi],eax + mov DWORD [52+edi],ebx + mov DWORD [56+edi],ecx + mov DWORD [60+edi],edx + mov eax,DWORD [64+esi] + mov ebx,DWORD [68+esi] + mov ecx,DWORD [72+esi] + mov edx,DWORD [76+esi] + mov DWORD [64+edi],eax + mov ebp,eax + mov DWORD [68+edi],ebx + or ebp,ebx + mov DWORD [72+edi],ecx + or ebp,ecx + mov DWORD [76+edi],edx + or ebp,edx + mov eax,DWORD [80+esi] + mov ebx,DWORD [84+esi] + mov ecx,DWORD [88+esi] + mov edx,DWORD [92+esi] + mov DWORD [80+edi],eax + or ebp,eax + mov DWORD [84+edi],ebx + or ebp,ebx + mov DWORD [88+edi],ecx + or ebp,ecx + mov DWORD [92+edi],edx + or ebp,edx + xor eax,eax + mov esi,DWORD [520+esp] + sub eax,ebp + or ebp,eax + sar ebp,31 + mov DWORD [480+esp],ebp + lea edi,[192+esp] + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edx,DWORD [12+esi] + mov DWORD [edi],eax + mov ebp,eax + mov DWORD [4+edi],ebx + or ebp,ebx + mov DWORD [8+edi],ecx + or ebp,ecx + mov DWORD [12+edi],edx + or ebp,edx + mov eax,DWORD [16+esi] + mov ebx,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov edx,DWORD [28+esi] + mov DWORD [16+edi],eax + or ebp,eax + mov DWORD [20+edi],ebx + or ebp,ebx + mov DWORD [24+edi],ecx + or ebp,ecx + mov DWORD [28+edi],edx + or ebp,edx + mov eax,DWORD [32+esi] + mov ebx,DWORD [36+esi] + mov ecx,DWORD [40+esi] + mov edx,DWORD 
[44+esi] + mov DWORD [32+edi],eax + or ebp,eax + mov DWORD [36+edi],ebx + or ebp,ebx + mov DWORD [40+edi],ecx + or ebp,ecx + mov DWORD [44+edi],edx + or ebp,edx + mov eax,DWORD [48+esi] + mov ebx,DWORD [52+esi] + mov ecx,DWORD [56+esi] + mov edx,DWORD [60+esi] + mov DWORD [48+edi],eax + or ebp,eax + mov DWORD [52+edi],ebx + or ebp,ebx + mov DWORD [56+edi],ecx + or ebp,ecx + mov DWORD [60+edi],edx + or ebp,edx + xor ebx,ebx + mov eax,DWORD [488+esp] + sub ebx,ebp + lea esi,[160+esp] + or ebx,ebp + lea ebp,[160+esp] + sar ebx,31 + lea edi,[288+esp] + mov DWORD [484+esp],ebx + call __ecp_nistz256_mul_mont + mov eax,DWORD [488+esp] + lea esi,[192+esp] + mov ebp,edi + lea edi,[256+esp] + call __ecp_nistz256_mul_mont + mov eax,DWORD [488+esp] + lea esi,[160+esp] + lea ebp,[288+esp] + lea edi,[288+esp] + call __ecp_nistz256_mul_mont + lea esi,[256+esp] + lea ebp,[96+esp] + lea edi,[320+esp] + call __ecp_nistz256_sub + mov eax,DWORD [488+esp] + lea esi,[224+esp] + lea ebp,[288+esp] + lea edi,[288+esp] + call __ecp_nistz256_mul_mont + mov eax,DWORD [488+esp] + lea esi,[160+esp] + lea ebp,[320+esp] + lea edi,[64+esp] + call __ecp_nistz256_mul_mont + lea esi,[288+esp] + lea ebp,[128+esp] + lea edi,[352+esp] + call __ecp_nistz256_sub + mov eax,DWORD [488+esp] + lea esi,[320+esp] + lea ebp,[320+esp] + lea edi,[384+esp] + call __ecp_nistz256_mul_mont + mov eax,DWORD [488+esp] + lea esi,[352+esp] + lea ebp,[352+esp] + lea edi,[448+esp] + call __ecp_nistz256_mul_mont + mov eax,DWORD [488+esp] + lea esi,[96+esp] + lea ebp,[384+esp] + lea edi,[256+esp] + call __ecp_nistz256_mul_mont + mov eax,DWORD [488+esp] + lea esi,[320+esp] + lea ebp,[384+esp] + lea edi,[416+esp] + call __ecp_nistz256_mul_mont + lea esi,[256+esp] + lea ebp,[256+esp] + lea edi,[384+esp] + call __ecp_nistz256_add + lea esi,[448+esp] + lea ebp,[384+esp] + lea edi,[esp] + call __ecp_nistz256_sub + lea esi,[esp] + lea ebp,[416+esp] + lea edi,[esp] + call __ecp_nistz256_sub + lea esi,[256+esp] + lea ebp,[esp] + lea 
edi,[32+esp] + call __ecp_nistz256_sub + mov eax,DWORD [488+esp] + lea esi,[416+esp] + lea ebp,[128+esp] + lea edi,[288+esp] + call __ecp_nistz256_mul_mont + mov eax,DWORD [488+esp] + lea esi,[352+esp] + lea ebp,[32+esp] + lea edi,[32+esp] + call __ecp_nistz256_mul_mont + lea esi,[32+esp] + lea ebp,[288+esp] + lea edi,[32+esp] + call __ecp_nistz256_sub + mov ebp,DWORD [480+esp] + mov esi,DWORD [484+esp] + mov edi,DWORD [512+esp] + mov edx,ebp + not ebp + and edx,esi + and ebp,esi + not esi + mov eax,edx + and eax,DWORD [64+esp] + mov ebx,ebp + and ebx,1 + mov ecx,esi + and ecx,DWORD [160+esp] + or eax,ebx + or eax,ecx + mov DWORD [64+edi],eax + mov eax,edx + and eax,DWORD [68+esp] + mov ecx,esi + and ecx,DWORD [164+esp] + or eax,ecx + mov DWORD [68+edi],eax + mov eax,edx + and eax,DWORD [72+esp] + mov ecx,esi + and ecx,DWORD [168+esp] + or eax,ecx + mov DWORD [72+edi],eax + mov eax,edx + and eax,DWORD [76+esp] + mov ecx,esi + and ecx,DWORD [172+esp] + or eax,ebp + or eax,ecx + mov DWORD [76+edi],eax + mov eax,edx + and eax,DWORD [80+esp] + mov ecx,esi + and ecx,DWORD [176+esp] + or eax,ebp + or eax,ecx + mov DWORD [80+edi],eax + mov eax,edx + and eax,DWORD [84+esp] + mov ecx,esi + and ecx,DWORD [180+esp] + or eax,ebp + or eax,ecx + mov DWORD [84+edi],eax + mov eax,edx + and eax,DWORD [88+esp] + mov ebx,ebp + and ebx,-2 + mov ecx,esi + and ecx,DWORD [184+esp] + or eax,ebx + or eax,ecx + mov DWORD [88+edi],eax + mov eax,edx + and eax,DWORD [92+esp] + mov ecx,esi + and ecx,DWORD [188+esp] + or eax,ecx + mov DWORD [92+edi],eax + mov eax,edx + and eax,DWORD [esp] + mov ebx,ebp + and ebx,DWORD [192+esp] + mov ecx,esi + and ecx,DWORD [96+esp] + or eax,ebx + or eax,ecx + mov DWORD [edi],eax + mov eax,edx + and eax,DWORD [4+esp] + mov ebx,ebp + and ebx,DWORD [196+esp] + mov ecx,esi + and ecx,DWORD [100+esp] + or eax,ebx + or eax,ecx + mov DWORD [4+edi],eax + mov eax,edx + and eax,DWORD [8+esp] + mov ebx,ebp + and ebx,DWORD [200+esp] + mov ecx,esi + and ecx,DWORD [104+esp] + 
or eax,ebx + or eax,ecx + mov DWORD [8+edi],eax + mov eax,edx + and eax,DWORD [12+esp] + mov ebx,ebp + and ebx,DWORD [204+esp] + mov ecx,esi + and ecx,DWORD [108+esp] + or eax,ebx + or eax,ecx + mov DWORD [12+edi],eax + mov eax,edx + and eax,DWORD [16+esp] + mov ebx,ebp + and ebx,DWORD [208+esp] + mov ecx,esi + and ecx,DWORD [112+esp] + or eax,ebx + or eax,ecx + mov DWORD [16+edi],eax + mov eax,edx + and eax,DWORD [20+esp] + mov ebx,ebp + and ebx,DWORD [212+esp] + mov ecx,esi + and ecx,DWORD [116+esp] + or eax,ebx + or eax,ecx + mov DWORD [20+edi],eax + mov eax,edx + and eax,DWORD [24+esp] + mov ebx,ebp + and ebx,DWORD [216+esp] + mov ecx,esi + and ecx,DWORD [120+esp] + or eax,ebx + or eax,ecx + mov DWORD [24+edi],eax + mov eax,edx + and eax,DWORD [28+esp] + mov ebx,ebp + and ebx,DWORD [220+esp] + mov ecx,esi + and ecx,DWORD [124+esp] + or eax,ebx + or eax,ecx + mov DWORD [28+edi],eax + mov eax,edx + and eax,DWORD [32+esp] + mov ebx,ebp + and ebx,DWORD [224+esp] + mov ecx,esi + and ecx,DWORD [128+esp] + or eax,ebx + or eax,ecx + mov DWORD [32+edi],eax + mov eax,edx + and eax,DWORD [36+esp] + mov ebx,ebp + and ebx,DWORD [228+esp] + mov ecx,esi + and ecx,DWORD [132+esp] + or eax,ebx + or eax,ecx + mov DWORD [36+edi],eax + mov eax,edx + and eax,DWORD [40+esp] + mov ebx,ebp + and ebx,DWORD [232+esp] + mov ecx,esi + and ecx,DWORD [136+esp] + or eax,ebx + or eax,ecx + mov DWORD [40+edi],eax + mov eax,edx + and eax,DWORD [44+esp] + mov ebx,ebp + and ebx,DWORD [236+esp] + mov ecx,esi + and ecx,DWORD [140+esp] + or eax,ebx + or eax,ecx + mov DWORD [44+edi],eax + mov eax,edx + and eax,DWORD [48+esp] + mov ebx,ebp + and ebx,DWORD [240+esp] + mov ecx,esi + and ecx,DWORD [144+esp] + or eax,ebx + or eax,ecx + mov DWORD [48+edi],eax + mov eax,edx + and eax,DWORD [52+esp] + mov ebx,ebp + and ebx,DWORD [244+esp] + mov ecx,esi + and ecx,DWORD [148+esp] + or eax,ebx + or eax,ecx + mov DWORD [52+edi],eax + mov eax,edx + and eax,DWORD [56+esp] + mov ebx,ebp + and ebx,DWORD [248+esp] + 
mov ecx,esi + and ecx,DWORD [152+esp] + or eax,ebx + or eax,ecx + mov DWORD [56+edi],eax + mov eax,edx + and eax,DWORD [60+esp] + mov ebx,ebp + and ebx,DWORD [252+esp] + mov ecx,esi + and ecx,DWORD [156+esp] + or eax,ebx + or eax,ecx + mov DWORD [60+edi],eax + add esp,492 + pop edi + pop esi + pop ebx + pop ebp + ret +segment .bss +common _GFp_ia32cap_P 16 diff --git a/zeroidc/vendor/ring/pregenerated/tmp/ghash-x86-win32n.asm b/zeroidc/vendor/ring/pregenerated/tmp/ghash-x86-win32n.asm new file mode 100644 index 000000000..2ac02cb79 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/ghash-x86-win32n.asm @@ -0,0 +1,359 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _GFp_gcm_init_clmul +align 16 +_GFp_gcm_init_clmul: +L$_GFp_gcm_init_clmul_begin: + mov edx,DWORD [4+esp] + mov eax,DWORD [8+esp] + call L$000pic +L$000pic: + pop ecx + lea ecx,[(L$bswap-L$000pic)+ecx] + movdqu xmm2,[eax] + pshufd xmm2,xmm2,78 + pshufd xmm4,xmm2,255 + movdqa xmm3,xmm2 + psllq xmm2,1 + pxor xmm5,xmm5 + psrlq xmm3,63 + pcmpgtd xmm5,xmm4 + pslldq xmm3,8 + por xmm2,xmm3 + pand xmm5,[16+ecx] + pxor xmm2,xmm5 + movdqa xmm0,xmm2 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pshufd xmm4,xmm2,78 + pxor xmm3,xmm0 + pxor xmm4,xmm2 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,220,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor 
xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm2,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm2 + movdqu [edx],xmm2 + pxor xmm4,xmm0 + movdqu [16+edx],xmm0 +db 102,15,58,15,227,8 + movdqu [32+edx],xmm4 + ret +global _GFp_gcm_gmult_clmul +align 16 +_GFp_gcm_gmult_clmul: +L$_GFp_gcm_gmult_clmul_begin: + mov eax,DWORD [4+esp] + mov edx,DWORD [8+esp] + call L$001pic +L$001pic: + pop ecx + lea ecx,[(L$bswap-L$001pic)+ecx] + movdqu xmm0,[eax] + movdqa xmm5,[ecx] + movups xmm2,[edx] +db 102,15,56,0,197 + movups xmm4,[32+edx] + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,220,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +db 102,15,56,0,197 + movdqu [eax],xmm0 + ret +global _GFp_gcm_ghash_clmul +align 16 +_GFp_gcm_ghash_clmul: +L$_GFp_gcm_ghash_clmul_begin: + push ebp + push ebx + push esi + push edi + mov eax,DWORD [20+esp] + mov edx,DWORD [24+esp] + mov esi,DWORD [28+esp] + mov ebx,DWORD [32+esp] + call L$002pic +L$002pic: + pop ecx + lea ecx,[(L$bswap-L$002pic)+ecx] + movdqu xmm0,[eax] + movdqa xmm5,[ecx] + movdqu xmm2,[edx] +db 102,15,56,0,197 + sub ebx,16 + jz NEAR L$003odd_tail + movdqu xmm3,[esi] + movdqu xmm6,[16+esi] +db 102,15,56,0,221 +db 102,15,56,0,245 + movdqu xmm5,[32+edx] + pxor xmm0,xmm3 + pshufd xmm3,xmm6,78 + movdqa xmm7,xmm6 + pxor xmm3,xmm6 + lea esi,[32+esi] +db 102,15,58,68,242,0 +db 102,15,58,68,250,17 +db 102,15,58,68,221,0 + movups xmm2,[16+edx] + nop + sub ebx,32 + jbe NEAR L$004even_tail + jmp NEAR L$005mod_loop 
+align 32 +L$005mod_loop: + pshufd xmm4,xmm0,78 + movdqa xmm1,xmm0 + pxor xmm4,xmm0 + nop +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,229,16 + movups xmm2,[edx] + xorps xmm0,xmm6 + movdqa xmm5,[ecx] + xorps xmm1,xmm7 + movdqu xmm7,[esi] + pxor xmm3,xmm0 + movdqu xmm6,[16+esi] + pxor xmm3,xmm1 +db 102,15,56,0,253 + pxor xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,8 + pslldq xmm3,8 + pxor xmm1,xmm4 + pxor xmm0,xmm3 +db 102,15,56,0,245 + pxor xmm1,xmm7 + movdqa xmm7,xmm6 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 +db 102,15,58,68,242,0 + movups xmm5,[32+edx] + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + pshufd xmm3,xmm7,78 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm3,xmm7 + pxor xmm1,xmm4 +db 102,15,58,68,250,17 + movups xmm2,[16+edx] + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +db 102,15,58,68,221,0 + lea esi,[32+esi] + sub ebx,32 + ja NEAR L$005mod_loop +L$004even_tail: + pshufd xmm4,xmm0,78 + movdqa xmm1,xmm0 + pxor xmm4,xmm0 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 102,15,58,68,229,16 + movdqa xmm5,[ecx] + xorps xmm0,xmm6 + xorps xmm1,xmm7 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + pxor xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,8 + pslldq xmm3,8 + pxor xmm1,xmm4 + pxor xmm0,xmm3 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + test ebx,ebx + jnz NEAR L$006done + movups xmm2,[edx] +L$003odd_tail: + movdqu xmm3,[esi] +db 102,15,56,0,221 + pxor xmm0,xmm3 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pshufd xmm4,xmm2,78 + pxor xmm3,xmm0 + pxor xmm4,xmm2 +db 102,15,58,68,194,0 +db 102,15,58,68,202,17 +db 
102,15,58,68,220,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +L$006done: +db 102,15,56,0,197 + movdqu [eax],xmm0 + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$bswap: +db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +db 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +align 64 +L$007rem_8bit: +dw 0,450,900,582,1800,1738,1164,1358 +dw 3600,4050,3476,3158,2328,2266,2716,2910 +dw 7200,7650,8100,7782,6952,6890,6316,6510 +dw 4656,5106,4532,4214,5432,5370,5820,6014 +dw 14400,14722,15300,14854,16200,16010,15564,15630 +dw 13904,14226,13780,13334,12632,12442,13020,13086 +dw 9312,9634,10212,9766,9064,8874,8428,8494 +dw 10864,11186,10740,10294,11640,11450,12028,12094 +dw 28800,28994,29444,29382,30600,30282,29708,30158 +dw 32400,32594,32020,31958,31128,30810,31260,31710 +dw 27808,28002,28452,28390,27560,27242,26668,27118 +dw 25264,25458,24884,24822,26040,25722,26172,26622 +dw 18624,18690,19268,19078,20424,19978,19532,19854 +dw 18128,18194,17748,17558,16856,16410,16988,17310 +dw 21728,21794,22372,22182,21480,21034,20588,20910 +dw 23280,23346,22900,22710,24056,23610,24188,24510 +dw 57600,57538,57988,58182,58888,59338,58764,58446 +dw 61200,61138,60564,60758,59416,59866,60316,59998 +dw 64800,64738,65188,65382,64040,64490,63916,63598 +dw 62256,62194,61620,61814,62520,62970,63420,63102 +dw 55616,55426,56004,56070,56904,57226,56780,56334 +dw 55120,54930,54484,54550,53336,53658,54236,53790 +dw 50528,50338,50916,50982,49768,50090,49644,49198 +dw 52080,51890,51444,51510,52344,52666,53244,52798 +dw 37248,36930,37380,37830,38536,38730,38156,38094 +dw 
40848,40530,39956,40406,39064,39258,39708,39646 +dw 36256,35938,36388,36838,35496,35690,35116,35054 +dw 33712,33394,32820,33270,33976,34170,34620,34558 +dw 43456,43010,43588,43910,44744,44810,44364,44174 +dw 42960,42514,42068,42390,41176,41242,41820,41630 +dw 46560,46114,46692,47014,45800,45866,45420,45230 +dw 48112,47666,47220,47542,48376,48442,49020,48830 +db 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +db 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +db 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +db 0 diff --git a/zeroidc/vendor/ring/pregenerated/tmp/ghash-x86_64-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/ghash-x86_64-nasm.asm new file mode 100644 index 000000000..89e118555 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/ghash-x86_64-nasm.asm @@ -0,0 +1,1209 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + +EXTERN GFp_ia32cap_P +global GFp_gcm_init_clmul + +ALIGN 16 +GFp_gcm_init_clmul: + +$L$_init_clmul: +$L$SEH_begin_GFp_gcm_init_clmul: + +DB 0x48,0x83,0xec,0x18 +DB 0x0f,0x29,0x34,0x24 + movdqu xmm2,XMMWORD[rdx] + pshufd xmm2,xmm2,78 + + + pshufd xmm4,xmm2,255 + movdqa xmm3,xmm2 + psllq xmm2,1 + pxor xmm5,xmm5 + psrlq xmm3,63 + pcmpgtd xmm5,xmm4 + pslldq xmm3,8 + por xmm2,xmm3 + + + pand xmm5,XMMWORD[$L$0x1c2_polynomial] + pxor xmm2,xmm5 + + + pshufd xmm6,xmm2,78 + movdqa xmm0,xmm2 + pxor xmm6,xmm2 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,222,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + 
movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm2,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm2 + movdqu XMMWORD[rcx],xmm2 + pxor xmm4,xmm0 + movdqu XMMWORD[16+rcx],xmm0 +DB 102,15,58,15,227,8 + movdqu XMMWORD[32+rcx],xmm4 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,222,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + movdqa xmm5,xmm0 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,222,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm5,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm5 + movdqu XMMWORD[48+rcx],xmm5 + pxor xmm4,xmm0 + movdqu XMMWORD[64+rcx],xmm0 +DB 102,15,58,15,227,8 + movdqu XMMWORD[80+rcx],xmm4 + movaps xmm6,XMMWORD[rsp] + lea rsp,[24+rsp] +$L$SEH_end_GFp_gcm_init_clmul: + DB 0F3h,0C3h ;repret + + +global GFp_gcm_gmult_clmul + +ALIGN 16 +GFp_gcm_gmult_clmul: + +$L$_gmult_clmul: + movdqu xmm0,XMMWORD[rcx] + movdqa xmm5,XMMWORD[$L$bswap_mask] + movdqu xmm2,XMMWORD[rdx] + movdqu 
xmm4,XMMWORD[32+rdx] +DB 102,15,56,0,197 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,220,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +DB 102,15,56,0,197 + movdqu XMMWORD[rcx],xmm0 + DB 0F3h,0C3h ;repret + + +global GFp_gcm_ghash_clmul + +ALIGN 32 +GFp_gcm_ghash_clmul: + +$L$_ghash_clmul: + lea rax,[((-136))+rsp] +$L$SEH_begin_GFp_gcm_ghash_clmul: + +DB 0x48,0x8d,0x60,0xe0 +DB 0x0f,0x29,0x70,0xe0 +DB 0x0f,0x29,0x78,0xf0 +DB 0x44,0x0f,0x29,0x00 +DB 0x44,0x0f,0x29,0x48,0x10 +DB 0x44,0x0f,0x29,0x50,0x20 +DB 0x44,0x0f,0x29,0x58,0x30 +DB 0x44,0x0f,0x29,0x60,0x40 +DB 0x44,0x0f,0x29,0x68,0x50 +DB 0x44,0x0f,0x29,0x70,0x60 +DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm10,XMMWORD[$L$bswap_mask] + + movdqu xmm0,XMMWORD[rcx] + movdqu xmm2,XMMWORD[rdx] + movdqu xmm7,XMMWORD[32+rdx] +DB 102,65,15,56,0,194 + + sub r9,0x10 + jz NEAR $L$odd_tail + + movdqu xmm6,XMMWORD[16+rdx] + lea rax,[GFp_ia32cap_P] + mov eax,DWORD[4+rax] + cmp r9,0x30 + jb NEAR $L$skip4x + + and eax,71303168 + cmp eax,4194304 + je NEAR $L$skip4x + + sub r9,0x30 + mov rax,0xA040608020C0E000 + movdqu xmm14,XMMWORD[48+rdx] + movdqu xmm15,XMMWORD[64+rdx] + + + + + movdqu xmm3,XMMWORD[48+r8] + movdqu xmm11,XMMWORD[32+r8] +DB 102,65,15,56,0,218 +DB 102,69,15,56,0,218 + movdqa xmm5,xmm3 + pshufd xmm4,xmm3,78 + pxor xmm4,xmm3 +DB 102,15,58,68,218,0 +DB 102,15,58,68,234,17 +DB 102,15,58,68,231,0 + + movdqa xmm13,xmm11 + pshufd xmm12,xmm11,78 + pxor xmm12,xmm11 +DB 102,68,15,58,68,222,0 +DB 102,68,15,58,68,238,17 +DB 
102,68,15,58,68,231,16 + xorps xmm3,xmm11 + xorps xmm5,xmm13 + movups xmm7,XMMWORD[80+rdx] + xorps xmm4,xmm12 + + movdqu xmm11,XMMWORD[16+r8] + movdqu xmm8,XMMWORD[r8] +DB 102,69,15,56,0,218 +DB 102,69,15,56,0,194 + movdqa xmm13,xmm11 + pshufd xmm12,xmm11,78 + pxor xmm0,xmm8 + pxor xmm12,xmm11 +DB 102,69,15,58,68,222,0 + movdqa xmm1,xmm0 + pshufd xmm8,xmm0,78 + pxor xmm8,xmm0 +DB 102,69,15,58,68,238,17 +DB 102,68,15,58,68,231,0 + xorps xmm3,xmm11 + xorps xmm5,xmm13 + + lea r8,[64+r8] + sub r9,0x40 + jc NEAR $L$tail4x + + jmp NEAR $L$mod4_loop +ALIGN 32 +$L$mod4_loop: +DB 102,65,15,58,68,199,0 + xorps xmm4,xmm12 + movdqu xmm11,XMMWORD[48+r8] +DB 102,69,15,56,0,218 +DB 102,65,15,58,68,207,17 + xorps xmm0,xmm3 + movdqu xmm3,XMMWORD[32+r8] + movdqa xmm13,xmm11 +DB 102,68,15,58,68,199,16 + pshufd xmm12,xmm11,78 + xorps xmm1,xmm5 + pxor xmm12,xmm11 +DB 102,65,15,56,0,218 + movups xmm7,XMMWORD[32+rdx] + xorps xmm8,xmm4 +DB 102,68,15,58,68,218,0 + pshufd xmm4,xmm3,78 + + pxor xmm8,xmm0 + movdqa xmm5,xmm3 + pxor xmm8,xmm1 + pxor xmm4,xmm3 + movdqa xmm9,xmm8 +DB 102,68,15,58,68,234,17 + pslldq xmm8,8 + psrldq xmm9,8 + pxor xmm0,xmm8 + movdqa xmm8,XMMWORD[$L$7_mask] + pxor xmm1,xmm9 +DB 102,76,15,110,200 + + pand xmm8,xmm0 +DB 102,69,15,56,0,200 + pxor xmm9,xmm0 +DB 102,68,15,58,68,231,0 + psllq xmm9,57 + movdqa xmm8,xmm9 + pslldq xmm9,8 +DB 102,15,58,68,222,0 + psrldq xmm8,8 + pxor xmm0,xmm9 + pxor xmm1,xmm8 + movdqu xmm8,XMMWORD[r8] + + movdqa xmm9,xmm0 + psrlq xmm0,1 +DB 102,15,58,68,238,17 + xorps xmm3,xmm11 + movdqu xmm11,XMMWORD[16+r8] +DB 102,69,15,56,0,218 +DB 102,15,58,68,231,16 + xorps xmm5,xmm13 + movups xmm7,XMMWORD[80+rdx] +DB 102,69,15,56,0,194 + pxor xmm1,xmm9 + pxor xmm9,xmm0 + psrlq xmm0,5 + + movdqa xmm13,xmm11 + pxor xmm4,xmm12 + pshufd xmm12,xmm11,78 + pxor xmm0,xmm9 + pxor xmm1,xmm8 + pxor xmm12,xmm11 +DB 102,69,15,58,68,222,0 + psrlq xmm0,1 + pxor xmm0,xmm1 + movdqa xmm1,xmm0 +DB 102,69,15,58,68,238,17 + xorps xmm3,xmm11 + pshufd xmm8,xmm0,78 + pxor 
xmm8,xmm0 + +DB 102,68,15,58,68,231,0 + xorps xmm5,xmm13 + + lea r8,[64+r8] + sub r9,0x40 + jnc NEAR $L$mod4_loop + +$L$tail4x: +DB 102,65,15,58,68,199,0 +DB 102,65,15,58,68,207,17 +DB 102,68,15,58,68,199,16 + xorps xmm4,xmm12 + xorps xmm0,xmm3 + xorps xmm1,xmm5 + pxor xmm1,xmm0 + pxor xmm8,xmm4 + + pxor xmm8,xmm1 + pxor xmm1,xmm0 + + movdqa xmm9,xmm8 + psrldq xmm8,8 + pslldq xmm9,8 + pxor xmm1,xmm8 + pxor xmm0,xmm9 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + add r9,0x40 + jz NEAR $L$done + movdqu xmm7,XMMWORD[32+rdx] + sub r9,0x10 + jz NEAR $L$odd_tail +$L$skip4x: + + + + + + movdqu xmm8,XMMWORD[r8] + movdqu xmm3,XMMWORD[16+r8] +DB 102,69,15,56,0,194 +DB 102,65,15,56,0,218 + pxor xmm0,xmm8 + + movdqa xmm5,xmm3 + pshufd xmm4,xmm3,78 + pxor xmm4,xmm3 +DB 102,15,58,68,218,0 +DB 102,15,58,68,234,17 +DB 102,15,58,68,231,0 + + lea r8,[32+r8] + nop + sub r9,0x20 + jbe NEAR $L$even_tail + nop + jmp NEAR $L$mod_loop + +ALIGN 32 +$L$mod_loop: + movdqa xmm1,xmm0 + movdqa xmm8,xmm4 + pshufd xmm4,xmm0,78 + pxor xmm4,xmm0 + +DB 102,15,58,68,198,0 +DB 102,15,58,68,206,17 +DB 102,15,58,68,231,16 + + pxor xmm0,xmm3 + pxor xmm1,xmm5 + movdqu xmm9,XMMWORD[r8] + pxor xmm8,xmm0 +DB 102,69,15,56,0,202 + movdqu xmm3,XMMWORD[16+r8] + + pxor xmm8,xmm1 + pxor xmm1,xmm9 + pxor xmm4,xmm8 +DB 102,65,15,56,0,218 + movdqa xmm8,xmm4 + psrldq xmm8,8 + pslldq xmm4,8 + pxor xmm1,xmm8 + pxor xmm0,xmm4 + + movdqa xmm5,xmm3 + + movdqa xmm9,xmm0 + movdqa xmm8,xmm0 + psllq xmm0,5 + pxor xmm8,xmm0 +DB 102,15,58,68,218,0 + psllq xmm0,1 + pxor xmm0,xmm8 + psllq xmm0,57 + movdqa xmm8,xmm0 + pslldq xmm0,8 + psrldq xmm8,8 + pxor xmm0,xmm9 + pshufd xmm4,xmm5,78 + pxor xmm1,xmm8 + pxor xmm4,xmm5 + + movdqa xmm9,xmm0 + 
psrlq xmm0,1 +DB 102,15,58,68,234,17 + pxor xmm1,xmm9 + pxor xmm9,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm9 + lea r8,[32+r8] + psrlq xmm0,1 +DB 102,15,58,68,231,0 + pxor xmm0,xmm1 + + sub r9,0x20 + ja NEAR $L$mod_loop + +$L$even_tail: + movdqa xmm1,xmm0 + movdqa xmm8,xmm4 + pshufd xmm4,xmm0,78 + pxor xmm4,xmm0 + +DB 102,15,58,68,198,0 +DB 102,15,58,68,206,17 +DB 102,15,58,68,231,16 + + pxor xmm0,xmm3 + pxor xmm1,xmm5 + pxor xmm8,xmm0 + pxor xmm8,xmm1 + pxor xmm4,xmm8 + movdqa xmm8,xmm4 + psrldq xmm8,8 + pslldq xmm4,8 + pxor xmm1,xmm8 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + test r9,r9 + jnz NEAR $L$done + +$L$odd_tail: + movdqu xmm8,XMMWORD[r8] +DB 102,69,15,56,0,194 + pxor xmm0,xmm8 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 +DB 102,15,58,68,194,0 +DB 102,15,58,68,202,17 +DB 102,15,58,68,223,0 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +$L$done: +DB 102,65,15,56,0,194 + movdqu XMMWORD[rcx],xmm0 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] 
+$L$SEH_end_GFp_gcm_ghash_clmul: + DB 0F3h,0C3h ;repret + + +global GFp_gcm_init_avx + +ALIGN 32 +GFp_gcm_init_avx: + +$L$SEH_begin_GFp_gcm_init_avx: + +DB 0x48,0x83,0xec,0x18 +DB 0x0f,0x29,0x34,0x24 + vzeroupper + + vmovdqu xmm2,XMMWORD[rdx] + vpshufd xmm2,xmm2,78 + + + vpshufd xmm4,xmm2,255 + vpsrlq xmm3,xmm2,63 + vpsllq xmm2,xmm2,1 + vpxor xmm5,xmm5,xmm5 + vpcmpgtd xmm5,xmm5,xmm4 + vpslldq xmm3,xmm3,8 + vpor xmm2,xmm2,xmm3 + + + vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial] + vpxor xmm2,xmm2,xmm5 + + vpunpckhqdq xmm6,xmm2,xmm2 + vmovdqa xmm0,xmm2 + vpxor xmm6,xmm6,xmm2 + mov r10,4 + jmp NEAR $L$init_start_avx +ALIGN 32 +$L$init_loop_avx: + vpalignr xmm5,xmm4,xmm3,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 +$L$init_start_avx: + vmovdqa xmm5,xmm0 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor 
xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 + vpshufd xmm3,xmm5,78 + vpshufd xmm4,xmm0,78 + vpxor xmm3,xmm3,xmm5 + vmovdqu XMMWORD[rcx],xmm5 + vpxor xmm4,xmm4,xmm0 + vmovdqu XMMWORD[16+rcx],xmm0 + lea rcx,[48+rcx] + sub r10,1 + jnz NEAR $L$init_loop_avx + + vpalignr xmm5,xmm3,xmm4,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + + vzeroupper + movaps xmm6,XMMWORD[rsp] + lea rsp,[24+rsp] +$L$SEH_end_GFp_gcm_init_avx: + DB 0F3h,0C3h ;repret + + +global GFp_gcm_ghash_avx + +ALIGN 32 +GFp_gcm_ghash_avx: + + lea rax,[((-136))+rsp] +$L$SEH_begin_GFp_gcm_ghash_avx: + +DB 0x48,0x8d,0x60,0xe0 +DB 0x0f,0x29,0x70,0xe0 +DB 0x0f,0x29,0x78,0xf0 +DB 0x44,0x0f,0x29,0x00 +DB 0x44,0x0f,0x29,0x48,0x10 +DB 0x44,0x0f,0x29,0x50,0x20 +DB 0x44,0x0f,0x29,0x58,0x30 +DB 0x44,0x0f,0x29,0x60,0x40 +DB 0x44,0x0f,0x29,0x68,0x50 +DB 0x44,0x0f,0x29,0x70,0x60 +DB 0x44,0x0f,0x29,0x78,0x70 + vzeroupper + + vmovdqu xmm10,XMMWORD[rcx] + lea r10,[$L$0x1c2_polynomial] + lea rdx,[64+rdx] + vmovdqu xmm13,XMMWORD[$L$bswap_mask] + vpshufb xmm10,xmm10,xmm13 + cmp r9,0x80 + jb NEAR $L$short_avx + sub r9,0x80 + + vmovdqu xmm14,XMMWORD[112+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + + vpunpckhqdq xmm9,xmm14,xmm14 + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm9,xmm9,xmm14 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[80+r8] + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vmovdqu xmm15,XMMWORD[64+r8] + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + + vpshufb xmm15,xmm15,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm8,xmm15,xmm15 + 
vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[48+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[32+r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[16+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + + lea r8,[128+r8] + cmp r9,0x80 + jb NEAR $L$tail_avx + + vpxor xmm15,xmm15,xmm10 + sub r9,0x80 + jmp NEAR $L$oop8x_avx + +ALIGN 32 +$L$oop8x_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[112+r8] + vpxor xmm3,xmm3,xmm0 + vpxor xmm8,xmm8,xmm15 + vpclmulqdq xmm10,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm11,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm12,xmm8,xmm7,0x00 + vmovdqu 
xmm7,XMMWORD[((32-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm10,xmm10,xmm3 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vxorps xmm11,xmm11,xmm4 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm12,xmm12,xmm5 + vxorps xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[80+r8] + vpxor xmm12,xmm12,xmm10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm12,xmm12,xmm11 + vpslldq xmm9,xmm12,8 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vpsrldq xmm12,xmm12,8 + vpxor xmm10,xmm10,xmm9 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vxorps xmm11,xmm11,xmm12 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[64+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vxorps xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + + vmovdqu xmm14,XMMWORD[48+r8] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[32+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + vxorps 
xmm10,xmm10,xmm12 + + vmovdqu xmm14,XMMWORD[16+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vxorps xmm12,xmm12,xmm11 + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm15,xmm15,xmm12 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + vpxor xmm15,xmm15,xmm10 + + lea r8,[128+r8] + sub r9,0x80 + jnc NEAR $L$oop8x_avx + + add r9,0x80 + jmp NEAR $L$tail_no_xor_avx + +ALIGN 32 +$L$short_avx: + vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8] + lea r8,[r9*1+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + + vmovdqa xmm3,xmm0 + vmovdqa xmm4,xmm1 + vmovdqa xmm5,xmm2 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-32))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-48))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq 
xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-64))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-80))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-96))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-112))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovq xmm7,QWORD[((184-64))+rdx] + sub r9,0x10 + jmp NEAR $L$tail_avx + +ALIGN 32 +$L$tail_avx: + vpxor xmm15,xmm15,xmm10 +$L$tail_no_xor_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + + vmovdqu xmm12,XMMWORD[r10] + + vpxor xmm10,xmm3,xmm0 + vpxor xmm11,xmm4,xmm1 + vpxor xmm5,xmm5,xmm2 + + vpxor xmm5,xmm5,xmm10 + vpxor xmm5,xmm5,xmm11 + vpslldq xmm9,xmm5,8 + vpsrldq 
xmm5,xmm5,8 + vpxor xmm10,xmm10,xmm9 + vpxor xmm11,xmm11,xmm5 + + vpclmulqdq xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm9 + + vpclmulqdq xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm11 + vpxor xmm10,xmm10,xmm9 + + cmp r9,0 + jne NEAR $L$short_avx + + vpshufb xmm10,xmm10,xmm13 + vmovdqu XMMWORD[rcx],xmm10 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] +$L$SEH_end_GFp_gcm_ghash_avx: + DB 0F3h,0C3h ;repret + + +ALIGN 64 +$L$bswap_mask: +DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$0x1c2_polynomial: +DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +$L$7_mask: + DD 7,0,7,0 +ALIGN 64 + +DB 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52 +DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +DB 114,103,62,0 +ALIGN 64 +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_GFp_gcm_init_clmul wrt ..imagebase + DD $L$SEH_end_GFp_gcm_init_clmul wrt ..imagebase + DD $L$SEH_info_GFp_gcm_init_clmul wrt ..imagebase + + DD $L$SEH_begin_GFp_gcm_ghash_clmul wrt ..imagebase + DD $L$SEH_end_GFp_gcm_ghash_clmul wrt ..imagebase + DD $L$SEH_info_GFp_gcm_ghash_clmul wrt ..imagebase + DD $L$SEH_begin_GFp_gcm_init_avx wrt ..imagebase + DD $L$SEH_end_GFp_gcm_init_avx wrt ..imagebase + DD $L$SEH_info_GFp_gcm_init_clmul wrt ..imagebase + + DD $L$SEH_begin_GFp_gcm_ghash_avx wrt ..imagebase + DD $L$SEH_end_GFp_gcm_ghash_avx wrt ..imagebase + DD $L$SEH_info_GFp_gcm_ghash_clmul wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_GFp_gcm_init_clmul: +DB 0x01,0x08,0x03,0x00 +DB 0x08,0x68,0x00,0x00 +DB 0x04,0x22,0x00,0x00 +$L$SEH_info_GFp_gcm_ghash_clmul: +DB 0x01,0x33,0x16,0x00 
+DB 0x33,0xf8,0x09,0x00 +DB 0x2e,0xe8,0x08,0x00 +DB 0x29,0xd8,0x07,0x00 +DB 0x24,0xc8,0x06,0x00 +DB 0x1f,0xb8,0x05,0x00 +DB 0x1a,0xa8,0x04,0x00 +DB 0x15,0x98,0x03,0x00 +DB 0x10,0x88,0x02,0x00 +DB 0x0c,0x78,0x01,0x00 +DB 0x08,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 diff --git a/zeroidc/vendor/ring/pregenerated/tmp/p256-x86_64-asm-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/p256-x86_64-asm-nasm.asm new file mode 100644 index 000000000..f1051be75 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/p256-x86_64-asm-nasm.asm @@ -0,0 +1,5037 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + +EXTERN GFp_ia32cap_P + + +ALIGN 64 +$L$poly: + DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 + +$L$One: + DD 1,1,1,1,1,1,1,1 +$L$Two: + DD 2,2,2,2,2,2,2,2 +$L$Three: + DD 3,3,3,3,3,3,3,3 +$L$ONE_mont: + DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe + + +$L$ord: + DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +$L$ordK: + DQ 0xccd1c8aaee00bc4f + + + +global GFp_nistz256_add + +ALIGN 32 +GFp_nistz256_add: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_add: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + push r12 + push r13 + + mov r8,QWORD[rsi] + xor r13,r13 + mov r9,QWORD[8+rsi] + mov r10,QWORD[16+rsi] + mov r11,QWORD[24+rsi] + lea rsi,[$L$poly] + + add r8,QWORD[rdx] + adc r9,QWORD[8+rdx] + mov rax,r8 + adc r10,QWORD[16+rdx] + adc r11,QWORD[24+rdx] + mov rdx,r9 + adc r13,0 + + sub r8,QWORD[rsi] + mov rcx,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r12,r11 + sbb r11,QWORD[24+rsi] + sbb r13,0 + + cmovc r8,rax + cmovc r9,rdx + mov QWORD[rdi],r8 + cmovc r10,rcx + mov QWORD[8+rdi],r9 + cmovc r11,r12 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + pop r13 
+ pop r12 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_GFp_nistz256_add: + + + +global GFp_nistz256_neg + +ALIGN 32 +GFp_nistz256_neg: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_neg: + mov rdi,rcx + mov rsi,rdx + + + + push r12 + + push r13 + +$L$neg_body: + + xor r8,r8 + xor r9,r9 + xor r10,r10 + xor r11,r11 + xor r13,r13 + + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov rax,r8 + sbb r11,QWORD[24+rsi] + lea rsi,[$L$poly] + mov rdx,r9 + sbb r13,0 + + add r8,QWORD[rsi] + mov rcx,r10 + adc r9,QWORD[8+rsi] + adc r10,QWORD[16+rsi] + mov r12,r11 + adc r11,QWORD[24+rsi] + test r13,r13 + + cmovz r8,rax + cmovz r9,rdx + mov QWORD[rdi],r8 + cmovz r10,rcx + mov QWORD[8+rdi],r9 + cmovz r11,r12 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$neg_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_neg: + + + + + + +global GFp_p256_scalar_mul_mont + +ALIGN 32 +GFp_p256_scalar_mul_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_p256_scalar_mul_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rcx,[GFp_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$ecp_nistz256_ord_mul_montx + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mul_body: + + mov rax,QWORD[rdx] + mov rbx,rdx + lea r14,[$L$ord] + mov r15,QWORD[$L$ordK] + + + mov rcx,rax + mul QWORD[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD[8+rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov r10,rdx + + mul QWORD[16+rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + + mov r13,r8 + imul r8,r15 + + mov r11,rdx + mul QWORD[24+rsi] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r12,rdx + + + mul QWORD[r14] + mov rbp,r8 + 
add r13,rax + mov rax,r8 + adc rdx,0 + mov rcx,rdx + + sub r10,r8 + sbb r8,0 + + mul QWORD[8+r14] + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,rbp + adc r10,rdx + mov rdx,rbp + adc r8,0 + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[8+rbx] + sbb rbp,rdx + + add r11,r8 + adc r12,rbp + adc r13,0 + + + mov rcx,rax + mul QWORD[rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r10,rbp + adc rdx,0 + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r9 + imul r9,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r12,rbp + adc rdx,0 + xor r8,r8 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + mul QWORD[r14] + mov rbp,r9 + add rcx,rax + mov rax,r9 + adc rcx,rdx + + sub r11,r9 + sbb r9,0 + + mul QWORD[8+r14] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc r11,rdx + mov rdx,rbp + adc r9,0 + + shl rax,32 + shr rdx,32 + sub r12,rax + mov rax,QWORD[16+rbx] + sbb rbp,rdx + + add r12,r9 + adc r13,rbp + adc r8,0 + + + mov rcx,rax + mul QWORD[rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r10 + imul r10,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r13,rbp + adc rdx,0 + xor r9,r9 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + mul QWORD[r14] + mov rbp,r10 + add rcx,rax + mov rax,r10 + adc rcx,rdx + + sub r12,r10 + sbb r10,0 + + mul QWORD[8+r14] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc r12,rdx + mov rdx,rbp + adc r10,0 + + shl rax,32 + shr rdx,32 + sub r13,rax + mov rax,QWORD[24+rbx] + sbb rbp,rdx + + add r13,r10 + adc r8,rbp + adc r9,0 + + + mov rcx,rax + mul QWORD[rsi] + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r12,rbp + adc rdx,0 + add 
r12,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r11 + imul r11,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r8,rbp + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + mul QWORD[r14] + mov rbp,r11 + add rcx,rax + mov rax,r11 + adc rcx,rdx + + sub r13,r11 + sbb r11,0 + + mul QWORD[8+r14] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc r13,rdx + mov rdx,rbp + adc r11,0 + + shl rax,32 + shr rdx,32 + sub r8,rax + sbb rbp,rdx + + add r8,r11 + adc r9,rbp + adc r10,0 + + + mov rsi,r12 + sub r12,QWORD[r14] + mov r11,r13 + sbb r13,QWORD[8+r14] + mov rcx,r8 + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rsi + cmovc r13,r11 + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_p256_scalar_mul_mont: + + + + + + + +global GFp_p256_scalar_sqr_rep_mont + +ALIGN 32 +GFp_p256_scalar_sqr_rep_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_p256_scalar_sqr_rep_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rcx,[GFp_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$ecp_nistz256_ord_sqr_montx + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqr_body: + + mov r8,QWORD[rsi] + mov rax,QWORD[8+rsi] + mov r14,QWORD[16+rsi] + mov r15,QWORD[24+rsi] + lea rsi,[$L$ord] + mov rbx,rdx + jmp NEAR $L$oop_ord_sqr + +ALIGN 32 +$L$oop_ord_sqr: + + mov rbp,rax + mul r8 + mov r9,rax +DB 102,72,15,110,205 + mov rax,r14 + mov r10,rdx 
+ + mul r8 + add r10,rax + mov rax,r15 +DB 102,73,15,110,214 + adc rdx,0 + mov r11,rdx + + mul r8 + add r11,rax + mov rax,r15 +DB 102,73,15,110,223 + adc rdx,0 + mov r12,rdx + + + mul r14 + mov r13,rax + mov rax,r14 + mov r14,rdx + + + mul rbp + add r11,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + + mul rbp + add r12,rax + adc rdx,0 + + add r12,r15 + adc r13,rdx + adc r14,0 + + + xor r15,r15 + mov rax,r8 + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + + mul rax + mov r8,rax +DB 102,72,15,126,200 + mov rbp,rdx + + mul rax + add r9,rbp + adc r10,rax +DB 102,72,15,126,208 + adc rdx,0 + mov rbp,rdx + + mul rax + add r11,rbp + adc r12,rax +DB 102,72,15,126,216 + adc rdx,0 + mov rbp,rdx + + mov rcx,r8 + imul r8,QWORD[32+rsi] + + mul rax + add r13,rbp + adc r14,rax + mov rax,QWORD[rsi] + adc r15,rdx + + + mul r8 + mov rbp,r8 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r10,r8 + sbb rbp,0 + + mul r8 + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,r8 + adc r10,rdx + mov rdx,r8 + adc rbp,0 + + mov rcx,r9 + imul r9,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[rsi] + sbb r8,rdx + + add r11,rbp + adc r8,0 + + + mul r9 + mov rbp,r9 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r11,r9 + sbb rbp,0 + + mul r9 + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,r9 + adc r11,rdx + mov rdx,r9 + adc rbp,0 + + mov rcx,r10 + imul r10,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r8,rax + mov rax,QWORD[rsi] + sbb r9,rdx + + add r8,rbp + adc r9,0 + + + mul r10 + mov rbp,r10 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r8,r10 + sbb rbp,0 + + mul r10 + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,r10 + adc r8,rdx + mov rdx,r10 + adc rbp,0 + + mov rcx,r11 + imul r11,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r9,rax + mov rax,QWORD[rsi] + sbb r10,rdx + + add r9,rbp + adc r10,0 + + + mul r11 + mov rbp,r11 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r9,r11 
+ sbb rbp,0 + + mul r11 + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + mov rdx,r11 + adc rbp,0 + + shl rax,32 + shr rdx,32 + sub r10,rax + sbb r11,rdx + + add r10,rbp + adc r11,0 + + + xor rdx,rdx + add r8,r12 + adc r9,r13 + mov r12,r8 + adc r10,r14 + adc r11,r15 + mov rax,r9 + adc rdx,0 + + + sub r8,QWORD[rsi] + mov r14,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r15,r11 + sbb r11,QWORD[24+rsi] + sbb rdx,0 + + cmovc r8,r12 + cmovnc rax,r9 + cmovnc r14,r10 + cmovnc r15,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqr + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],rax + pxor xmm1,xmm1 + mov QWORD[16+rdi],r14 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r15 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_p256_scalar_sqr_rep_mont: + + +ALIGN 32 +ecp_nistz256_ord_mul_montx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_montx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_mul_montx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mulx_body: + + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + lea r14,[(($L$ord-128))] + mov r15,QWORD[$L$ordK] + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mulx r11,rbp,r11 + add r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + mulx rax,rdx,r15 + adc r10,rbp + adc r11,rcx + adc r12,0 + + + xor r13,r13 + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r8,rcx + adox r9,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov 
rdx,QWORD[8+rbx] + adcx r11,rcx + adox r12,rbp + adcx r12,r8 + adox r13,r8 + adc r13,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + mulx rax,rdx,r15 + adcx r12,rcx + adox r13,rbp + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[16+rbx] + adcx r12,rcx + adox r13,rbp + adcx r13,r9 + adox r8,r9 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + mulx rax,rdx,r15 + adcx r13,rcx + adox r8,rbp + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[24+rbx] + adcx r13,rcx + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + mulx rax,rdx,r15 + adcx r8,rcx + adox r9,rbp + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r13,rcx + 
adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + lea r14,[128+r14] + mov rbx,r12 + adcx r8,rcx + adox r9,rbp + mov rdx,r13 + adcx r9,r11 + adox r10,r11 + adc r10,0 + + + + mov rcx,r8 + sub r12,QWORD[r14] + sbb r13,QWORD[8+r14] + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_mul_montx: + + +ALIGN 32 +ecp_nistz256_ord_sqr_montx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_montx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_sqr_montx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqrx_body: + + mov rbx,rdx + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[$L$ord] + jmp NEAR $L$oop_ord_sqrx + +ALIGN 32 +$L$oop_ord_sqrx: + mulx r10,r9,r14 + mulx r11,rcx,r15 + mov rax,rdx +DB 102,73,15,110,206 + mulx r12,rbp,r8 + mov rdx,r14 + add r10,rcx +DB 102,73,15,110,215 + adc r11,rbp + adc r12,0 + xor r13,r13 + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + mulx r14,rcx,r8 + mov rdx,rax +DB 102,73,15,110,216 + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + + mulx rbp,r8,rdx +DB 102,72,15,126,202 + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx +DB 102,72,15,126,210 + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + mulx rbp,rcx,rdx +DB 0x67 +DB 102,72,15,126,218 + adox r11,rax + adcx r15,r15 + 
adox r12,rcx + adox r13,rbp + mulx rax,rcx,rdx + adox r14,rcx + adox r15,rax + + + mov rdx,r8 + mulx rcx,rdx,QWORD[32+rsi] + + xor rax,rax + mulx rbp,rcx,QWORD[rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r9,rcx + adox r10,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r11,rcx + adox r8,rbp + adcx r8,rax + + + mov rdx,r9 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r10,rcx + adcx r11,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r8,rcx + adcx r9,rbp + adox r9,rax + + + mov rdx,r10 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r11,rcx + adox r8,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r9,rcx + adox r10,rbp + adcx r10,rax + + + mov rdx,r11 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r8,rcx + adcx r9,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r10,rcx + adcx r11,rbp + adox r11,rax + + + add r12,r8 + adc r9,r13 + mov rdx,r12 + adc r10,r14 + adc r11,r15 + mov r14,r9 + adc rax,0 + + + sub r12,QWORD[rsi] + mov r15,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r8,r11 + sbb r11,QWORD[24+rsi] + sbb rax,0 + + cmovnc rdx,r12 + cmovnc r14,r9 + cmovnc r15,r10 + cmovnc r8,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqrx + + mov QWORD[rdi],rdx + mov QWORD[8+rdi],r14 + pxor xmm1,xmm1 + mov QWORD[16+rdi],r15 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r8 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + 
DB 0F3h,0C3h ;repret + +$L$SEH_end_ecp_nistz256_ord_sqr_montx: + + + + + + +global GFp_nistz256_mul_mont + +ALIGN 32 +GFp_nistz256_mul_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_mul_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rcx,[GFp_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 +$L$mul_mont: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mul_body: + cmp ecx,0x80100 + je NEAR $L$mul_montx + mov rbx,rdx + mov rax,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + + call __ecp_nistz256_mul_montq + jmp NEAR $L$mul_mont_done + +ALIGN 32 +$L$mul_montx: + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_mul_montx +$L$mul_mont_done: + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_mul_mont: + + +ALIGN 32 +__ecp_nistz256_mul_montq: + + + + mov rbp,rax + mul r9 + mov r14,QWORD[(($L$poly+8))] + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r10 + mov r15,QWORD[(($L$poly+24))] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r11 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r8 + adc rdx,0 + xor r13,r13 + mov r12,rdx + + + + + + + + + + + mov rbp,r8 + shl r8,32 + mul r15 + shr rbp,32 + add r9,r8 + adc r10,rbp + adc r11,rax + mov rax,QWORD[8+rbx] + adc r12,rdx + adc r13,0 + xor r8,r8 + + + + mov rbp,rax + mul QWORD[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul 
QWORD[16+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + + mov rbp,r9 + shl r9,32 + mul r15 + shr rbp,32 + add r10,r9 + adc r11,rbp + adc r12,rax + mov rax,QWORD[16+rbx] + adc r13,rdx + adc r8,0 + xor r9,r9 + + + + mov rbp,rax + mul QWORD[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + + mov rbp,r10 + shl r10,32 + mul r15 + shr rbp,32 + add r11,r10 + adc r12,rbp + adc r13,rax + mov rax,QWORD[24+rbx] + adc r8,rdx + adc r9,0 + xor r10,r10 + + + + mov rbp,rax + mul QWORD[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + + mov rbp,r11 + shl r11,32 + mul r15 + shr rbp,32 + add r12,r11 + adc r13,rbp + mov rcx,r12 + adc r8,rax + adc r9,rdx + mov rbp,r13 + adc r10,0 + + + + sub r12,-1 + mov rbx,r8 + sbb r13,r14 + sbb r8,0 + mov rdx,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rcx + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rbx + mov QWORD[8+rdi],r13 + cmovc r9,rdx + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + + + + + + + +global GFp_nistz256_sqr_mont + +ALIGN 32 +GFp_nistz256_sqr_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_sqr_mont: + mov rdi,rcx + mov rsi,rdx + + + + lea rcx,[GFp_ia32cap_P] + mov 
rcx,QWORD[8+rcx] + and ecx,0x80100 + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqr_body: + cmp ecx,0x80100 + je NEAR $L$sqr_montx + mov rax,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + + call __ecp_nistz256_sqr_montq + jmp NEAR $L$sqr_mont_done + +ALIGN 32 +$L$sqr_montx: + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_sqr_montx +$L$sqr_mont_done: + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_sqr_mont: + + +ALIGN 32 +__ecp_nistz256_sqr_montq: + + mov r13,rax + mul r14 + mov r9,rax + mov rax,r15 + mov r10,rdx + + mul r13 + add r10,rax + mov rax,r8 + adc rdx,0 + mov r11,rdx + + mul r13 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + + mul r14 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul r14 + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + mov r13,rdx + adc r13,0 + + + mul r15 + xor r15,r15 + add r13,rax + mov rax,QWORD[rsi] + mov r14,rdx + adc r14,0 + + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + mul rax + mov r8,rax + mov rax,QWORD[8+rsi] + mov rcx,rdx + + mul rax + add r9,rcx + adc r10,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r11,rcx + adc r12,rax + mov rax,QWORD[24+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r13,rcx + adc r14,rax + mov rax,r8 + adc r15,rdx + + mov rsi,QWORD[(($L$poly+8))] + mov rbp,QWORD[(($L$poly+24))] + + + + + mov rcx,r8 + shl r8,32 + mul rbp + shr rcx,32 + add r9,r8 + adc r10,rcx + adc r11,rax + mov rax,r9 + adc rdx,0 + + + + mov rcx,r9 + shl r9,32 + mov r8,rdx + mul rbp + shr rcx,32 + add r10,r9 + adc 
r11,rcx + adc r8,rax + mov rax,r10 + adc rdx,0 + + + + mov rcx,r10 + shl r10,32 + mov r9,rdx + mul rbp + shr rcx,32 + add r11,r10 + adc r8,rcx + adc r9,rax + mov rax,r11 + adc rdx,0 + + + + mov rcx,r11 + shl r11,32 + mov r10,rdx + mul rbp + shr rcx,32 + add r8,r11 + adc r9,rcx + adc r10,rax + adc rdx,0 + xor r11,r11 + + + + add r12,r8 + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,rdx + mov r9,r13 + adc r11,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov rcx,r15 + sbb r15,rbp + sbb r11,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,rcx + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ecp_nistz256_mul_montx: + + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mov r14,32 + xor r13,r13 + mulx r11,rbp,r11 + mov r15,QWORD[(($L$poly+24))] + adc r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + adc r10,rbp + shlx rbp,r8,r14 + adc r11,rcx + shrx rcx,r8,r14 + adc r12,0 + + + + add r9,rbp + adc r10,rcx + + mulx rbp,rcx,r15 + mov rdx,QWORD[8+rbx] + adc r11,rcx + adc r12,rbp + adc r13,0 + xor r8,r8 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + adcx r12,rcx + shlx rcx,r9,r14 + adox r13,rbp + shrx rbp,r9,r14 + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + + add r10,rcx + adc r11,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[16+rbx] + adc r12,rcx + adc r13,rbp + adc r8,0 + xor r9,r9 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + adcx r13,rcx + shlx rcx,r10,r14 + adox r8,rbp + shrx rbp,r10,r14 + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + + add r11,rcx + adc 
r12,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[24+rbx] + adc r13,rcx + adc r8,rbp + adc r9,0 + xor r10,r10 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + adcx r8,rcx + shlx rcx,r11,r14 + adox r9,rbp + shrx rbp,r11,r14 + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + + add r12,rcx + adc r13,rbp + + mulx rbp,rcx,r15 + mov rbx,r12 + mov r14,QWORD[(($L$poly+8))] + adc r8,rcx + mov rdx,r13 + adc r9,rbp + adc r10,0 + + + + xor eax,eax + mov rcx,r8 + sbb r12,-1 + sbb r13,r14 + sbb r8,0 + mov rbp,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,rbp + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_sqr_montx: + + mulx r10,r9,r14 + mulx r11,rcx,r15 + xor eax,eax + adc r10,rcx + mulx r12,rbp,r8 + mov rdx,r14 + adc r11,rbp + adc r12,0 + xor r13,r13 + + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + + mulx r14,rcx,r8 + mov rdx,QWORD[((0+128))+rsi] + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + mulx rbp,r8,rdx + mov rdx,QWORD[((8+128))+rsi] + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx + mov rdx,QWORD[((16+128))+rsi] + adcx r13,r13 + adox r10,rcx + adcx r14,r14 +DB 0x67 + mulx rbp,rcx,rdx + mov rdx,QWORD[((24+128))+rsi] + adox r11,rax + adcx r15,r15 + adox r12,rcx + mov rsi,32 + adox r13,rbp +DB 0x67,0x67 + mulx rax,rcx,rdx + mov rdx,QWORD[(($L$poly+24))] + adox r14,rcx + shlx rcx,r8,rsi + adox r15,rax + shrx rax,r8,rsi + mov rbp,rdx + + + add r9,rcx + adc r10,rax + + mulx r8,rcx,r8 + adc r11,rcx + shlx rcx,r9,rsi + adc r8,0 + shrx rax,r9,rsi + + + add r10,rcx + adc r11,rax + + mulx r9,rcx,r9 + adc r8,rcx + 
shlx rcx,r10,rsi + adc r9,0 + shrx rax,r10,rsi + + + add r11,rcx + adc r8,rax + + mulx r10,rcx,r10 + adc r9,rcx + shlx rcx,r11,rsi + adc r10,0 + shrx rax,r11,rsi + + + add r8,rcx + adc r9,rax + + mulx r11,rcx,r11 + adc r10,rcx + adc r11,0 + + xor rdx,rdx + add r12,r8 + mov rsi,QWORD[(($L$poly+8))] + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,r11 + mov r9,r13 + adc rdx,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov r11,r15 + sbb r15,rbp + sbb rdx,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,r11 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + DB 0F3h,0C3h ;repret + + + + +global GFp_nistz256_select_w5 + +ALIGN 32 +GFp_nistz256_select_w5: + + lea rax,[GFp_ia32cap_P] + mov rax,QWORD[8+rax] + test eax,32 + jnz NEAR $L$avx2_select_w5 + lea rax,[((-136))+rsp] +$L$SEH_begin_GFp_nistz256_select_w5: +DB 0x48,0x8d,0x60,0xe0 +DB 0x0f,0x29,0x70,0xe0 +DB 0x0f,0x29,0x78,0xf0 +DB 0x44,0x0f,0x29,0x00 +DB 0x44,0x0f,0x29,0x48,0x10 +DB 0x44,0x0f,0x29,0x50,0x20 +DB 0x44,0x0f,0x29,0x58,0x30 +DB 0x44,0x0f,0x29,0x60,0x40 +DB 0x44,0x0f,0x29,0x68,0x50 +DB 0x44,0x0f,0x29,0x70,0x60 +DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm0,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + + movdqa xmm8,xmm0 + pshufd xmm1,xmm1,0 + + mov rax,16 +$L$select_loop_sse_w5: + + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + pcmpeqd xmm15,xmm1 + + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + movdqa xmm13,XMMWORD[64+rdx] + movdqa xmm14,XMMWORD[80+rdx] + lea rdx,[96+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + pand xmm13,xmm15 + por xmm5,xmm12 + pand xmm14,xmm15 + por xmm6,xmm13 + por xmm7,xmm14 + + dec rax + jnz NEAR $L$select_loop_sse_w5 + + movdqu XMMWORD[rcx],xmm2 + movdqu 
XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movdqu XMMWORD[64+rcx],xmm6 + movdqu XMMWORD[80+rcx],xmm7 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_select_w5: + + + + +global GFp_nistz256_select_w7 + +ALIGN 32 +GFp_nistz256_select_w7: + + lea rax,[GFp_ia32cap_P] + mov rax,QWORD[8+rax] + test eax,32 + jnz NEAR $L$avx2_select_w7 + lea rax,[((-136))+rsp] +$L$SEH_begin_GFp_nistz256_select_w7: +DB 0x48,0x8d,0x60,0xe0 +DB 0x0f,0x29,0x70,0xe0 +DB 0x0f,0x29,0x78,0xf0 +DB 0x44,0x0f,0x29,0x00 +DB 0x44,0x0f,0x29,0x48,0x10 +DB 0x44,0x0f,0x29,0x50,0x20 +DB 0x44,0x0f,0x29,0x58,0x30 +DB 0x44,0x0f,0x29,0x60,0x40 +DB 0x44,0x0f,0x29,0x68,0x50 +DB 0x44,0x0f,0x29,0x70,0x60 +DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm8,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + movdqa xmm0,xmm8 + pshufd xmm1,xmm1,0 + mov rax,64 + +$L$select_loop_sse_w7: + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + pcmpeqd xmm15,xmm1 + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + lea rdx,[64+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + prefetcht0 [255+rdx] + por xmm5,xmm12 + + dec rax + jnz NEAR $L$select_loop_sse_w7 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + 
movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_select_w7: + + + + +ALIGN 32 +GFp_nistz256_avx2_select_w5: + +$L$avx2_select_w5: + vzeroupper + lea rax,[((-136))+rsp] + mov r11,rsp +$L$SEH_begin_GFp_nistz256_avx2_select_w5: +DB 0x48,0x8d,0x60,0xe0 +DB 0xc5,0xf8,0x29,0x70,0xe0 +DB 0xc5,0xf8,0x29,0x78,0xf0 +DB 0xc5,0x78,0x29,0x40,0x00 +DB 0xc5,0x78,0x29,0x48,0x10 +DB 0xc5,0x78,0x29,0x50,0x20 +DB 0xc5,0x78,0x29,0x58,0x30 +DB 0xc5,0x78,0x29,0x60,0x40 +DB 0xc5,0x78,0x29,0x68,0x50 +DB 0xc5,0x78,0x29,0x70,0x60 +DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Two] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + vpxor ymm4,ymm4,ymm4 + + vmovdqa ymm5,YMMWORD[$L$One] + vmovdqa ymm10,YMMWORD[$L$Two] + + vmovd xmm1,r8d + vpermd ymm1,ymm2,ymm1 + + mov rax,8 +$L$select_loop_avx2_w5: + + vmovdqa ymm6,YMMWORD[rdx] + vmovdqa ymm7,YMMWORD[32+rdx] + vmovdqa ymm8,YMMWORD[64+rdx] + + vmovdqa ymm11,YMMWORD[96+rdx] + vmovdqa ymm12,YMMWORD[128+rdx] + vmovdqa ymm13,YMMWORD[160+rdx] + + vpcmpeqd ymm9,ymm5,ymm1 + vpcmpeqd ymm14,ymm10,ymm1 + + vpaddd ymm5,ymm5,ymm0 + vpaddd ymm10,ymm10,ymm0 + lea rdx,[192+rdx] + + vpand ymm6,ymm6,ymm9 + vpand ymm7,ymm7,ymm9 + vpand ymm8,ymm8,ymm9 + vpand ymm11,ymm11,ymm14 + vpand ymm12,ymm12,ymm14 + vpand ymm13,ymm13,ymm14 + + vpxor ymm2,ymm2,ymm6 + vpxor ymm3,ymm3,ymm7 + vpxor ymm4,ymm4,ymm8 + vpxor ymm2,ymm2,ymm11 + vpxor ymm3,ymm3,ymm12 + vpxor ymm4,ymm4,ymm13 + + dec rax + jnz NEAR $L$select_loop_avx2_w5 + + vmovdqu YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vmovdqu YMMWORD[64+rcx],ymm4 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[r11] + DB 
0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_avx2_select_w5: + + + + +global GFp_nistz256_avx2_select_w7 + +ALIGN 32 +GFp_nistz256_avx2_select_w7: + +$L$avx2_select_w7: + vzeroupper + mov r11,rsp + lea rax,[((-136))+rsp] +$L$SEH_begin_GFp_nistz256_avx2_select_w7: +DB 0x48,0x8d,0x60,0xe0 +DB 0xc5,0xf8,0x29,0x70,0xe0 +DB 0xc5,0xf8,0x29,0x78,0xf0 +DB 0xc5,0x78,0x29,0x40,0x00 +DB 0xc5,0x78,0x29,0x48,0x10 +DB 0xc5,0x78,0x29,0x50,0x20 +DB 0xc5,0x78,0x29,0x58,0x30 +DB 0xc5,0x78,0x29,0x60,0x40 +DB 0xc5,0x78,0x29,0x68,0x50 +DB 0xc5,0x78,0x29,0x70,0x60 +DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Three] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + + vmovdqa ymm4,YMMWORD[$L$One] + vmovdqa ymm8,YMMWORD[$L$Two] + vmovdqa ymm12,YMMWORD[$L$Three] + + vmovd xmm1,r8d + vpermd ymm1,ymm2,ymm1 + + + mov rax,21 +$L$select_loop_avx2_w7: + + vmovdqa ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vmovdqa ymm9,YMMWORD[64+rdx] + vmovdqa ymm10,YMMWORD[96+rdx] + + vmovdqa ymm13,YMMWORD[128+rdx] + vmovdqa ymm14,YMMWORD[160+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + vpcmpeqd ymm11,ymm8,ymm1 + vpcmpeqd ymm15,ymm12,ymm1 + + vpaddd ymm4,ymm4,ymm0 + vpaddd ymm8,ymm8,ymm0 + vpaddd ymm12,ymm12,ymm0 + lea rdx,[192+rdx] + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + vpand ymm9,ymm9,ymm11 + vpand ymm10,ymm10,ymm11 + vpand ymm13,ymm13,ymm15 + vpand ymm14,ymm14,ymm15 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + vpxor ymm2,ymm2,ymm9 + vpxor ymm3,ymm3,ymm10 + vpxor ymm2,ymm2,ymm13 + vpxor ymm3,ymm3,ymm14 + + dec rax + jnz NEAR $L$select_loop_avx2_w7 + + + vmovdqa ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + + vmovdqu YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + 
movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[r11] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_avx2_select_w7: + + +ALIGN 32 +__ecp_nistz256_add_toq: + + xor r11,r11 + add r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromq: + + sub r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,r11 + + add r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_subq: + + sub rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,r11 + + add rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + test r11,r11 + + cmovnz r12,rax + cmovnz r13,rbp + cmovnz r8,rcx + cmovnz r9,r10 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2q: + + xor r11,r11 + add r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + +global GFp_nistz256_point_double + +ALIGN 32 +GFp_nistz256_point_double: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov 
QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_point_double: + mov rdi,rcx + mov rsi,rdx + + + + lea rcx,[GFp_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$point_doublex + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doubleq_body: + +$L$point_double_shortcutq: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-0))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-0))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov 
r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rax,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subq + + mov rax,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-0))+rsp] + cmovz r12,r9 + mov 
QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doubleq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_point_double: +global GFp_nistz256_point_add + +ALIGN 32 +GFp_nistz256_point_add: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_point_add: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rcx,[GFp_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$point_addx + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addq_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 +DB 
102,72,15,110,199 + + lea rsi,[((64-0))+rsi] + mov QWORD[((544+0))+rsp],rax + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rax,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] +DB 102,72,15,110,203 + + lea rsi,[((64-0))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((0+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + 
call __ecp_nistz256_mul_montq + + mov rax,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 102,73,15,126,208 +DB 102,73,15,126,217 + or r12,r8 +DB 0x3e + jnz NEAR $L$add_proceedq + + + + test r9,r9 + jz NEAR $L$add_doubleq + + + + + + +DB 102,72,15,126,199 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_doneq + +ALIGN 32 +$L$add_doubleq: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + + jmp NEAR $L$point_double_shortcutq + + +ALIGN 32 +$L$add_proceedq: + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((0+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + 
lea rdi,[128+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + 
pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_doneq: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_point_add: +global GFp_nistz256_point_add_affine + +ALIGN 32 +GFp_nistz256_point_add_affine: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_point_add_affine: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rcx,[GFp_ia32cap_P] + mov rcx,QWORD[8+rcx] + and ecx,0x80100 + cmp ecx,0x80100 + je NEAR $L$point_add_affinex + push rbp + + push rbx + + push r12 + + push r13 + + 
push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affineq_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-0))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rax,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-0))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_mul_montq + + mov 
rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((0+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[352+rsp] + lea rbx,[352+rsp] + mov 
r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((0+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por 
xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affineq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_point_add_affine: + +ALIGN 32 +__ecp_nistz256_add_tox: + + xor r11,r11 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromx: + + xor r11,r11 + sbb r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,0 + + xor r10,r10 + adc r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + + bt r11,0 + cmovnc r12,rax + cmovnc r13,rbp + mov QWORD[rdi],r12 + cmovnc r8,rcx + mov QWORD[8+rdi],r13 + cmovnc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_subx: + + xor r11,r11 + sbb rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,0 + + xor r9,r9 + adc rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + + bt r11,0 + cmovc r12,rax + cmovc r13,rbp + cmovc r8,rcx + cmovc r9,r10 + + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2x: + + xor r11,r11 + adc r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + 
sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +GFp_nistz256_point_doublex: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_point_doublex: + mov rdi,rcx + mov rsi,rdx + + + +$L$point_doublex: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doublex_body: + +$L$point_double_shortcutx: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-128))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-128))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov 
r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rdx,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subx + + mov rdx,QWORD[32+rsp] + lea 
rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-128))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doublex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_point_doublex: + +ALIGN 32 +GFp_nistz256_point_addx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_point_addx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$point_addx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addx_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa 
XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 +DB 102,72,15,110,199 + + lea rsi,[((64-128))+rsi] + mov QWORD[((544+0))+rsp],rdx + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rdx,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] +DB 102,72,15,110,203 + + lea rsi,[((64-128))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((-128+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea 
rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 102,73,15,126,208 +DB 102,73,15,126,217 + or r12,r8 +DB 0x3e + jnz NEAR $L$add_proceedx + + + + test r9,r9 + jz NEAR $L$add_doublex + + + + + + +DB 102,72,15,126,199 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_donex + +ALIGN 32 +$L$add_doublex: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + + jmp NEAR $L$point_double_shortcutx + + +ALIGN 32 +$L$add_proceedx: + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((-128+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[rsp] + lea rbx,[rsp] + mov 
r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand 
xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_donex: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_point_addx: + +ALIGN 32 +GFp_nistz256_point_add_affinex: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_nistz256_point_add_affinex: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$point_add_affinex: + push 
rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affinex_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-128))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rdx,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-128))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + 
lea rdi,[288+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((-128+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov 
QWORD[24+rdi],r9 + mov rdx,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((-128+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromx + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + 
pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affinex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_nistz256_point_add_affinex: +EXTERN __imp_RtlVirtualUnwind + + +ALIGN 16 +short_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[16+rax] + + mov r12,QWORD[((-8))+rax] + mov r13,QWORD[((-16))+rax] + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea rax,[r10*1+rax] + + mov rbp,QWORD[((-8))+rax] + mov rbx,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + 
+$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_GFp_nistz256_neg wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_neg wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_neg wrt ..imagebase + + DD $L$SEH_begin_GFp_p256_scalar_mul_mont wrt ..imagebase + DD $L$SEH_end_GFp_p256_scalar_mul_mont wrt ..imagebase + DD $L$SEH_info_GFp_p256_scalar_mul_mont wrt ..imagebase + + DD $L$SEH_begin_GFp_p256_scalar_sqr_rep_mont wrt ..imagebase + DD $L$SEH_end_GFp_p256_scalar_sqr_rep_mont wrt ..imagebase + DD $L$SEH_info_GFp_p256_scalar_sqr_rep_mont wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_ord_mul_montx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_montx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_montx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_montx wrt ..imagebase + DD $L$SEH_begin_GFp_nistz256_mul_mont wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_mul_mont wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_mul_mont wrt ..imagebase + + DD $L$SEH_begin_GFp_nistz256_sqr_mont wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_sqr_mont wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_sqr_mont wrt ..imagebase + + DD $L$SEH_begin_GFp_nistz256_select_w5 wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_select_w5 wrt 
..imagebase + DD $L$SEH_info_GFp_nistz256_select_wX wrt ..imagebase + + DD $L$SEH_begin_GFp_nistz256_select_w7 wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_select_w7 wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_select_wX wrt ..imagebase + DD $L$SEH_begin_GFp_nistz256_avx2_select_w5 wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_avx2_select_w5 wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_avx2_select_wX wrt ..imagebase + + DD $L$SEH_begin_GFp_nistz256_avx2_select_w7 wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_avx2_select_w7 wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_avx2_select_wX wrt ..imagebase + DD $L$SEH_begin_GFp_nistz256_point_double wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_point_double wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_point_double wrt ..imagebase + + DD $L$SEH_begin_GFp_nistz256_point_add wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_point_add wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_point_add wrt ..imagebase + + DD $L$SEH_begin_GFp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_point_add_affine wrt ..imagebase + DD $L$SEH_begin_GFp_nistz256_point_doublex wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_point_doublex wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_point_doublex wrt ..imagebase + + DD $L$SEH_begin_GFp_nistz256_point_addx wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_point_addx wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_point_addx wrt ..imagebase + + DD $L$SEH_begin_GFp_nistz256_point_add_affinex wrt ..imagebase + DD $L$SEH_end_GFp_nistz256_point_add_affinex wrt ..imagebase + DD $L$SEH_info_GFp_nistz256_point_add_affinex wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_GFp_nistz256_neg: +DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase +$L$SEH_info_GFp_p256_scalar_mul_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mul_body wrt 
..imagebase,$L$ord_mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_GFp_p256_scalar_sqr_rep_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_mul_montx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_montx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_GFp_nistz256_mul_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_GFp_nistz256_sqr_mont: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_GFp_nistz256_select_wX: +DB 0x01,0x33,0x16,0x00 +DB 0x33,0xf8,0x09,0x00 +DB 0x2e,0xe8,0x08,0x00 +DB 0x29,0xd8,0x07,0x00 +DB 0x24,0xc8,0x06,0x00 +DB 0x1f,0xb8,0x05,0x00 +DB 0x1a,0xa8,0x04,0x00 +DB 0x15,0x98,0x03,0x00 +DB 0x10,0x88,0x02,0x00 +DB 0x0c,0x78,0x01,0x00 +DB 0x08,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 +ALIGN 8 +$L$SEH_info_GFp_nistz256_avx2_select_wX: +DB 0x01,0x36,0x17,0x0b +DB 0x36,0xf8,0x09,0x00 +DB 0x31,0xe8,0x08,0x00 +DB 0x2c,0xd8,0x07,0x00 +DB 0x27,0xc8,0x06,0x00 +DB 0x22,0xb8,0x05,0x00 +DB 0x1d,0xa8,0x04,0x00 +DB 0x18,0x98,0x03,0x00 +DB 0x13,0x88,0x02,0x00 +DB 0x0e,0x78,0x01,0x00 +DB 0x09,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 +DB 0x00,0xb3,0x00,0x00 +ALIGN 8 +$L$SEH_info_GFp_nistz256_point_double: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_GFp_nistz256_point_add: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_GFp_nistz256_point_add_affine: +DB 
9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase + DD 32*15+56,0 +ALIGN 8 +$L$SEH_info_GFp_nistz256_point_doublex: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_GFp_nistz256_point_addx: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_GFp_nistz256_point_add_affinex: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase + DD 32*15+56,0 diff --git a/zeroidc/vendor/ring/pregenerated/tmp/sha256-x86_64-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/sha256-x86_64-nasm.asm new file mode 100644 index 000000000..2c526b027 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/sha256-x86_64-nasm.asm @@ -0,0 +1,4138 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +EXTERN GFp_ia32cap_P +global GFp_sha256_block_data_order + +ALIGN 16 +GFp_sha256_block_data_order: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_sha256_block_data_order: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea r11,[GFp_ia32cap_P] + mov r9d,DWORD[r11] + mov r10d,DWORD[4+r11] + mov r11d,DWORD[8+r11] + and r9d,1073741824 + and r10d,268435968 + or r10d,r9d + cmp r10d,1342177792 + je NEAR $L$avx_shortcut + test r10d,512 + jnz NEAR $L$ssse3_shortcut + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,16*4+4*8 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + +$L$prologue: + + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + jmp NEAR $L$loop + +ALIGN 16 +$L$loop: + mov edi,ebx + lea rbp,[K256] + xor edi,ecx + mov r12d,DWORD[rsi] + mov r13d,r8d + mov r14d,eax + bswap r12d + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + add r11d,r14d + mov r12d,DWORD[4+rsi] + mov r13d,edx + mov r14d,r11d + bswap r12d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[4+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d 
+ add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + add r10d,r14d + mov r12d,DWORD[8+rsi] + mov r13d,ecx + mov r14d,r10d + bswap r12d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[8+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + add r9d,r14d + mov r12d,DWORD[12+rsi] + mov r13d,ebx + mov r14d,r9d + bswap r12d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[12+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + add r8d,r14d + mov r12d,DWORD[16+rsi] + mov r13d,eax + mov r14d,r8d + bswap r12d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[16+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + add edx,r14d + mov r12d,DWORD[20+rsi] + mov r13d,r11d + mov r14d,edx + bswap r12d + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[20+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor 
edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + add ecx,r14d + mov r12d,DWORD[24+rsi] + mov r13d,r10d + mov r14d,ecx + bswap r12d + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[24+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + add ebx,r14d + mov r12d,DWORD[28+rsi] + mov r13d,r9d + mov r14d,ebx + bswap r12d + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[28+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + add eax,r14d + mov r12d,DWORD[32+rsi] + mov r13d,r8d + mov r14d,eax + bswap r12d + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[32+rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + add r11d,r14d + mov r12d,DWORD[36+rsi] + mov r13d,edx + mov r14d,r11d + bswap r12d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov 
DWORD[36+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + add r10d,r14d + mov r12d,DWORD[40+rsi] + mov r13d,ecx + mov r14d,r10d + bswap r12d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[40+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + add r9d,r14d + mov r12d,DWORD[44+rsi] + mov r13d,ebx + mov r14d,r9d + bswap r12d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[44+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + add r8d,r14d + mov r12d,DWORD[48+rsi] + mov r13d,eax + mov r14d,r8d + bswap r12d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[48+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + add edx,r14d + mov r12d,DWORD[52+rsi] + mov r13d,r11d + mov r14d,edx + bswap r12d + ror 
r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[52+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + add ecx,r14d + mov r12d,DWORD[56+rsi] + mov r13d,r10d + mov r14d,ecx + bswap r12d + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[56+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + add ebx,r14d + mov r12d,DWORD[60+rsi] + mov r13d,r9d + mov r14d,ebx + bswap r12d + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[60+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + jmp NEAR $L$rounds_16_xx +ALIGN 16 +$L$rounds_16_xx: + mov r13d,DWORD[4+rsp] + mov r15d,DWORD[56+rsp] + + mov r12d,r13d + ror r13d,11 + add eax,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[36+rsp] + + add r12d,DWORD[rsp] + mov r13d,r8d + add r12d,r15d + mov r14d,eax + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[rsp],r12d + xor r14d,eax + and r15d,r8d + + ror 
r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[8+rsp] + mov edi,DWORD[60+rsp] + + mov r12d,r13d + ror r13d,11 + add r11d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[40+rsp] + + add r12d,DWORD[4+rsp] + mov r13d,edx + add r12d,edi + mov r14d,r11d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[4+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[12+rsp] + mov r15d,DWORD[rsp] + + mov r12d,r13d + ror r13d,11 + add r10d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[44+rsp] + + add r12d,DWORD[8+rsp] + mov r13d,ecx + add r12d,r15d + mov r14d,r10d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[8+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[16+rsp] + mov edi,DWORD[4+rsp] + + mov r12d,r13d + ror r13d,11 + add r9d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr 
r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[48+rsp] + + add r12d,DWORD[12+rsp] + mov r13d,ebx + add r12d,edi + mov r14d,r9d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[12+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[20+rsp] + mov r15d,DWORD[8+rsp] + + mov r12d,r13d + ror r13d,11 + add r8d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[52+rsp] + + add r12d,DWORD[16+rsp] + mov r13d,eax + add r12d,r15d + mov r14d,r8d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[16+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[24+rsp] + mov edi,DWORD[12+rsp] + + mov r12d,r13d + ror r13d,11 + add edx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[56+rsp] + + add r12d,DWORD[20+rsp] + mov r13d,r11d + add r12d,edi + mov r14d,edx + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[20+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] 
+ xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[28+rsp] + mov r15d,DWORD[16+rsp] + + mov r12d,r13d + ror r13d,11 + add ecx,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[60+rsp] + + add r12d,DWORD[24+rsp] + mov r13d,r10d + add r12d,r15d + mov r14d,ecx + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[24+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[32+rsp] + mov edi,DWORD[20+rsp] + + mov r12d,r13d + ror r13d,11 + add ebx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[rsp] + + add r12d,DWORD[28+rsp] + mov r13d,r9d + add r12d,edi + mov r14d,ebx + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[28+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[36+rsp] + mov r15d,DWORD[24+rsp] + + mov r12d,r13d + ror r13d,11 + add eax,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[4+rsp] + + 
add r12d,DWORD[32+rsp] + mov r13d,r8d + add r12d,r15d + mov r14d,eax + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[32+rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[40+rsp] + mov edi,DWORD[28+rsp] + + mov r12d,r13d + ror r13d,11 + add r11d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[8+rsp] + + add r12d,DWORD[36+rsp] + mov r13d,edx + add r12d,edi + mov r14d,r11d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[36+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[44+rsp] + mov r15d,DWORD[32+rsp] + + mov r12d,r13d + ror r13d,11 + add r10d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[12+rsp] + + add r12d,DWORD[40+rsp] + mov r13d,ecx + add r12d,r15d + mov r14d,r10d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[40+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add 
r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[48+rsp] + mov edi,DWORD[36+rsp] + + mov r12d,r13d + ror r13d,11 + add r9d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[16+rsp] + + add r12d,DWORD[44+rsp] + mov r13d,ebx + add r12d,edi + mov r14d,r9d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[44+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[52+rsp] + mov r15d,DWORD[40+rsp] + + mov r12d,r13d + ror r13d,11 + add r8d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[20+rsp] + + add r12d,DWORD[48+rsp] + mov r13d,eax + add r12d,r15d + mov r14d,r8d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[48+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[56+rsp] + mov edi,DWORD[44+rsp] + + mov r12d,r13d + ror r13d,11 + add edx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[24+rsp] + + add r12d,DWORD[52+rsp] + mov r13d,r11d + add r12d,edi + mov r14d,edx + ror r13d,14 + mov edi,eax + + 
xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[52+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[60+rsp] + mov r15d,DWORD[48+rsp] + + mov r12d,r13d + ror r13d,11 + add ecx,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[28+rsp] + + add r12d,DWORD[56+rsp] + mov r13d,r10d + add r12d,r15d + mov r14d,ecx + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[56+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[rsp] + mov edi,DWORD[52+rsp] + + mov r12d,r13d + ror r13d,11 + add ebx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[32+rsp] + + add r12d,DWORD[60+rsp] + mov r13d,r9d + add r12d,edi + mov r14d,ebx + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[60+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + cmp BYTE[3+rbp],0 + jnz NEAR $L$rounds_16_xx + 
+ mov rdi,QWORD[((64+0))+rsp] + add eax,r14d + lea rsi,[64+rsi] + + add eax,DWORD[rdi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop + + mov rsi,QWORD[88+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_sha256_block_data_order: +ALIGN 64 + +K256: + DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 
0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 +DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +DB 111,114,103,62,0 + +ALIGN 64 +GFp_sha256_block_data_order_ssse3: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_sha256_block_data_order_ssse3: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ssse3_shortcut: + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_ssse3: + + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + + + jmp NEAR $L$loop_ssse3 +ALIGN 16 +$L$loop_ssse3: + movdqa xmm7,XMMWORD[((K256+512))] + 
movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] +DB 102,15,56,0,199 + movdqu xmm3,XMMWORD[48+rsi] + lea rbp,[K256] +DB 102,15,56,0,207 + movdqa xmm4,XMMWORD[rbp] + movdqa xmm5,XMMWORD[32+rbp] +DB 102,15,56,0,215 + paddd xmm4,xmm0 + movdqa xmm6,XMMWORD[64+rbp] +DB 102,15,56,0,223 + movdqa xmm7,XMMWORD[96+rbp] + paddd xmm5,xmm1 + paddd xmm6,xmm2 + paddd xmm7,xmm3 + movdqa XMMWORD[rsp],xmm4 + mov r14d,eax + movdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + movdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + movdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$ssse3_00_47 + +ALIGN 16 +$L$ssse3_00_47: + sub rbp,-128 + ror r13d,14 + movdqa xmm4,xmm1 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm3 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,224,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,250,4 + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm0,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm3,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD[4+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm0,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd 
xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm0,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm0,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD[rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm0,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm0 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD[rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm2 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm0 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,225,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,251,4 + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm1,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm0,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD[20+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + 
paddd xmm1,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm1,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm1,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD[32+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm1,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm1 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD[16+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm3 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm1 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,226,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,248,4 + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm2,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm1,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d 
+ pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD[36+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm2,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm2,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm2,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD[64+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm2,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm2 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD[32+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm0 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm2 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,227,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,249,4 + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor 
edi,r9d + paddd xmm3,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm2,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD[52+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm3,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm3,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm3,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD[96+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm3,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm3 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$ssse3_00_47 + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror 
r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor 
r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov 
r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor 
edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_ssse3 + + mov rsi,QWORD[88+rsp] + + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_ssse3: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_sha256_block_data_order_ssse3: +; GFp_sha256_block_data_order_avx: AVX code path of the SHA-256 block function, Win64 entry. rcx is used as the pointer to the eight 32-bit state words, rdx as the input pointer, and r8 as a count that is scaled by 64 bytes to find the end of the input (presumably the number of 64-byte blocks; confirm against the C prototype). +ALIGN 64 +GFp_sha256_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi ; rdi and rsi are callee-saved on Win64; they are parked at offsets 8 and 16 above the entry rsp and reloaded by the epilogue + mov rax,rsp +$L$SEH_begin_GFp_sha256_block_data_order_avx: + mov rdi,rcx ; remap the Win64 argument registers rcx, rdx, r8 to the rdi, rsi, rdx roles the body uses + mov rsi,rdx + mov rdx,r8 + + +; $L$avx_shortcut expects the arguments already in rdi, rsi, rdx; presumably it is the jump target of the CPU-feature dispatch in GFp_sha256_block_data_order (outside this chunk, confirm there) +$L$avx_shortcut: + mov rax,rsp ; rax = rsp before the pushes; stored in the frame below so the epilogue can find the saved registers + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 ; with the x4 scale in the lea below: rdx = rsi plus 64 times the incoming count, i.e. the end-of-input pointer + sub rsp,160 ; frame: 64 bytes of scratch at rsp, four qword slots at 64..95, xmm6-xmm9 save area at 96..159 + lea rdx,[rdx*4+rsi] + and rsp,-64 ; align the frame to 64 bytes; the movaps and vmovdqa accesses to it require 16-byte alignment + mov QWORD[((64+0))+rsp],rdi ; slot 64: state pointer + mov QWORD[((64+8))+rsp],rsi ; slot 72: input pointer + mov QWORD[((64+16))+rsp],rdx ; slot 80: end-of-input pointer, compared against rsi at the bottom of $L$loop_avx + mov QWORD[88+rsp],rax ; slot 88: pre-push rsp, read back by the epilogue and by se_handler + + movaps XMMWORD[(64+32)+rsp],xmm6 ; xmm6-xmm9 are callee-saved on Win64 and are used by the body, so spill them + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_avx: + + vzeroupper + mov eax,DWORD[rdi] ; load the eight 32-bit state words into eax, ebx, ecx, edx, r8d-r11d + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov
r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + vmovdqa xmm8,XMMWORD[((K256+512+32))] + vmovdqa xmm9,XMMWORD[((K256+512+64))] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm7,XMMWORD[((K256+512))] + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm0,xmm0,xmm7 + lea rbp,[K256] + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,XMMWORD[rbp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,XMMWORD[32+rbp] + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + vpaddd xmm7,xmm3,XMMWORD[96+rbp] + vmovdqa XMMWORD[rsp],xmm4 + mov r14d,eax + vmovdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + vmovdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + vmovdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + sub rbp,-128 + vpalignr xmm4,xmm1,xmm0,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm3,xmm2,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm0,xmm0,xmm7 + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm3,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm0,xmm0,xmm4 + add 
r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm0,xmm0,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + vpshufd xmm7,xmm0,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm0,xmm0,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm0,XMMWORD[rbp] + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[rsp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm0,xmm3,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm1,xmm1,xmm7 + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm0,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + 
mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm1,xmm1,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm1,xmm1,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + vpshufd xmm7,xmm1,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm1,xmm1,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm1,XMMWORD[32+rbp] + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[16+rsp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm1,xmm0,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd 
xmm2,xmm2,xmm7 + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm1,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm2,xmm2,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm2,xmm2,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + vpshufd xmm7,xmm2,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm2,xmm2,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + 
xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[32+rsp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm2,xmm1,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm3,xmm3,xmm7 + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm2,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm3,xmm3,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm3,xmm3,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + vpshufd xmm7,xmm3,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + 
shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm3,xmm3,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm3,XMMWORD[96+rbp] + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd 
r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + 
xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + 
add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov 
DWORD[28+rdi],r11d + jb NEAR $L$loop_avx + + mov rsi,QWORD[88+rsp] ; pre-push rsp saved by the prologue + + vzeroupper + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_sha256_block_data_order_avx: +EXTERN __imp_RtlVirtualUnwind +; se_handler: Win64 language-specific exception handler named by the three .xdata records at the end of this file. It reads r8 (CONTEXT record) and r9 (DISPATCHER_CONTEXT); rcx and rdx are not read. When context->Rip lies between the prologue and epilogue labels stored as HandlerData, it reloads rbx, rbp, r12-r15 and, for the SSSE3 and AVX frames, xmm6-xmm9 from the function frame into the CONTEXT. In every case it restores Rsp, Rsi and Rdi in the CONTEXT, copies the CONTEXT to disp->ContextRecord, calls RtlVirtualUnwind and returns 1 (ExceptionContinueSearch). +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 ; 32 bytes of shadow space and four stack arguments for the RtlVirtualUnwind call + + mov rax,QWORD[120+r8] ; context->Rax; the functions copy rsp into rax on entry, so before the prologue label this is the entry rsp + mov rbx,QWORD[248+r8] ; context->Rip + + mov rsi,QWORD[8+r9] ; disp->ImageBase + mov r11,QWORD[56+r9] ; disp->HandlerData + + mov r10d,DWORD[r11] ; HandlerData[0]: image-relative prologue label + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue ; Rip before the prologue label: nothing to reload + + mov rax,QWORD[152+r8] ; context->Rsp + + mov r10d,DWORD[4+r11] ; HandlerData[1]: image-relative epilogue label + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue ; Rip at or past the epilogue label: rsp is already back at its entry value + mov rsi,rax ; keep the frame base for the xmm copy below + mov rax,QWORD[((64+24))+rax] ; pre-push rsp from frame slot 88 + + mov rbx,QWORD[((-8))+rax] ; from here on rbx holds the saved rbx, not context->Rip + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx ; context->Rbx + mov QWORD[160+r8],rbp ; context->Rbp + mov QWORD[216+r8],r12 ; context->R12 + mov QWORD[224+r8],r13 ; context->R13 + mov QWORD[232+r8],r14 ; context->R14 + mov QWORD[240+r8],r15 ; context->R15 +; Decide whether the frame has an xmm save area. The test must use context->Rip: rbx was overwritten with the saved rbx above, so the generated compare of rbx against $L$epilogue tested an unrelated value. NOTE: this file is generated, so the same change belongs in the Perl generator. + lea r10,[$L$epilogue] + cmp QWORD[248+r8],r10 ; context->Rip against the end of the base function + jb NEAR $L$in_prologue ; Rip inside the base function, which presumably saves no xmm registers (its body is outside this chunk) + + lea rsi,[((64+32))+rsi] ; xmm6-xmm9 save area in the frame + lea rdi,[512+r8] ; &context->Xmm6 + mov ecx,8 ; 8 qwords = 4 xmm registers + DD 0xa548f3fc ; encodes cld followed by rep movsq + +$L$in_prologue: + mov rdi,QWORD[8+rax] ; rdi and rsi were parked at offsets 8 and 16 above the entry rsp by the WIN64 prologue + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax ; context->Rsp + mov QWORD[168+r8],rsi ; context->Rsi + mov QWORD[176+r8],rdi ; context->Rdi + + mov rdi,QWORD[40+r9] ; disp->ContextRecord + mov rsi,r8 + mov ecx,154 ; 154 qwords, presumably sizeof(CONTEXT) divided by 8 + DD 0xa548f3fc ; encodes cld followed by rep movsq + + mov rsi,r9 + xor rcx,rcx ; arg1: HandlerType = 0 (UNW_FLAG_NHANDLER) + mov rdx,QWORD[8+rsi] ; arg2: disp->ImageBase + mov r8,QWORD[rsi] ; arg3: disp->ControlPc + mov r9,QWORD[16+rsi] ; arg4: disp->FunctionEntry + mov r10,QWORD[40+rsi] ; disp->ContextRecord + lea r11,[56+rsi] ; &disp->HandlerData + lea r12,[24+rsi] ; &disp->EstablisherFrame + mov QWORD[32+rsp],r10 ; arg5 + mov QWORD[40+rsp],r11 ; arg6 + mov QWORD[48+rsp],r12 ; arg7 + mov
QWORD[56+rsp],rcx ; arg8: ContextPointers = NULL + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 ; ExceptionContinueSearch + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret +; .pdata holds one record per function: image-relative begin, end and unwind-info addresses +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_GFp_sha256_block_data_order wrt ..imagebase + DD $L$SEH_end_GFp_sha256_block_data_order wrt ..imagebase + DD $L$SEH_info_GFp_sha256_block_data_order wrt ..imagebase + DD $L$SEH_begin_GFp_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_end_GFp_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_info_GFp_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_GFp_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_GFp_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_GFp_sha256_block_data_order_avx wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_GFp_sha256_block_data_order: +DB 9,0,0,0 ; UNWIND_INFO header: version 1 with UNW_FLAG_EHANDLER, no unwind codes + DD se_handler wrt ..imagebase ; image-relative address of the handler + DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase ; HandlerData[0] and HandlerData[1] as read by se_handler +$L$SEH_info_GFp_sha256_block_data_order_ssse3: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_GFp_sha256_block_data_order_avx: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase diff --git a/zeroidc/vendor/ring/pregenerated/tmp/sha512-x86_64-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/sha512-x86_64-nasm.asm new file mode 100644 index 000000000..386de48d6 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/sha512-x86_64-nasm.asm @@ -0,0 +1,3135 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand.
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +EXTERN GFp_ia32cap_P +global GFp_sha512_block_data_order + +ALIGN 16 +GFp_sha512_block_data_order: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_sha512_block_data_order: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea r11,[GFp_ia32cap_P] + mov r9d,DWORD[r11] + mov r10d,DWORD[4+r11] + mov r11d,DWORD[8+r11] + and r9d,1073741824 + and r10d,268435968 + or r10d,r9d + cmp r10d,1342177792 + je NEAR $L$avx_shortcut + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,16*8+4*8 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[152+rsp],rax + +$L$prologue: + + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop + +ALIGN 16 +$L$loop: + mov rdi,rbx + lea rbp,[K512] + xor rdi,rcx + mov r12,QWORD[rsi] + mov r13,r8 + mov r14,rax + bswap r12 + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + add r11,r14 + mov r12,QWORD[8+rsi] + mov r13,rdx + mov r14,r11 + bswap r12 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[8+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 
+ add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + add r10,r14 + mov r12,QWORD[16+rsi] + mov r13,rcx + mov r14,r10 + bswap r12 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[16+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + add r9,r14 + mov r12,QWORD[24+rsi] + mov r13,rbx + mov r14,r9 + bswap r12 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[24+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + add r8,r14 + mov r12,QWORD[32+rsi] + mov r13,rax + mov r14,r8 + bswap r12 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[32+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + add rdx,r14 + mov r12,QWORD[40+rsi] + mov r13,r11 + mov r14,rdx + bswap r12 + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[40+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea 
rbp,[24+rbp] + add rcx,r14 + mov r12,QWORD[48+rsi] + mov r13,r10 + mov r14,rcx + bswap r12 + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[48+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + add rbx,r14 + mov r12,QWORD[56+rsi] + mov r13,r9 + mov r14,rbx + bswap r12 + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[56+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + add rax,r14 + mov r12,QWORD[64+rsi] + mov r13,r8 + mov r14,rax + bswap r12 + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[64+rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + add r11,r14 + mov r12,QWORD[72+rsi] + mov r13,rdx + mov r14,r11 + bswap r12 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[72+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + add r10,r14 + mov r12,QWORD[80+rsi] + mov r13,rcx + 
mov r14,r10 + bswap r12 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[80+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + add r9,r14 + mov r12,QWORD[88+rsi] + mov r13,rbx + mov r14,r9 + bswap r12 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[88+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + add r8,r14 + mov r12,QWORD[96+rsi] + mov r13,rax + mov r14,r8 + bswap r12 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[96+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + add rdx,r14 + mov r12,QWORD[104+rsi] + mov r13,r11 + mov r14,rdx + bswap r12 + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[104+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + add rcx,r14 + mov r12,QWORD[112+rsi] + mov r13,r10 + mov r14,rcx + bswap r12 + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror 
r14,5 + xor r15,rax + + mov QWORD[112+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + add rbx,r14 + mov r12,QWORD[120+rsi] + mov r13,r9 + mov r14,rbx + bswap r12 + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[120+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + jmp NEAR $L$rounds_16_xx +ALIGN 16 +$L$rounds_16_xx: + mov r13,QWORD[8+rsp] + mov r15,QWORD[112+rsp] + + mov r12,r13 + ror r13,7 + add rax,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[72+rsp] + + add r12,QWORD[rsp] + mov r13,r8 + add r12,r15 + mov r14,rax + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[16+rsp] + mov rdi,QWORD[120+rsp] + + mov r12,r13 + ror r13,7 + add r11,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[80+rsp] + + add r12,QWORD[8+rsp] + mov r13,rdx + add r12,rdi + mov r14,r11 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + 
+ mov QWORD[8+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[24+rsp] + mov r15,QWORD[rsp] + + mov r12,r13 + ror r13,7 + add r10,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[88+rsp] + + add r12,QWORD[16+rsp] + mov r13,rcx + add r12,r15 + mov r14,r10 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[16+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[32+rsp] + mov rdi,QWORD[8+rsp] + + mov r12,r13 + ror r13,7 + add r9,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[96+rsp] + + add r12,QWORD[24+rsp] + mov r13,rbx + add r12,rdi + mov r14,r9 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[24+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[40+rsp] + mov r15,QWORD[16+rsp] + + mov r12,r13 + ror r13,7 + add r8,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + 
add r12,QWORD[104+rsp] + + add r12,QWORD[32+rsp] + mov r13,rax + add r12,r15 + mov r14,r8 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[32+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[48+rsp] + mov rdi,QWORD[24+rsp] + + mov r12,r13 + ror r13,7 + add rdx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[112+rsp] + + add r12,QWORD[40+rsp] + mov r13,r11 + add r12,rdi + mov r14,rdx + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[40+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[56+rsp] + mov r15,QWORD[32+rsp] + + mov r12,r13 + ror r13,7 + add rcx,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[120+rsp] + + add r12,QWORD[48+rsp] + mov r13,r10 + add r12,r15 + mov r14,rcx + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[48+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[64+rsp] + mov rdi,QWORD[40+rsp] + + 
mov r12,r13 + ror r13,7 + add rbx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[rsp] + + add r12,QWORD[56+rsp] + mov r13,r9 + add r12,rdi + mov r14,rbx + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[56+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[72+rsp] + mov r15,QWORD[48+rsp] + + mov r12,r13 + ror r13,7 + add rax,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[8+rsp] + + add r12,QWORD[64+rsp] + mov r13,r8 + add r12,r15 + mov r14,rax + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[64+rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[80+rsp] + mov rdi,QWORD[56+rsp] + + mov r12,r13 + ror r13,7 + add r11,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[16+rsp] + + add r12,QWORD[72+rsp] + mov r13,rdx + add r12,rdi + mov r14,r11 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[72+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov 
r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[88+rsp] + mov r15,QWORD[64+rsp] + + mov r12,r13 + ror r13,7 + add r10,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[24+rsp] + + add r12,QWORD[80+rsp] + mov r13,rcx + add r12,r15 + mov r14,r10 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[80+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[96+rsp] + mov rdi,QWORD[72+rsp] + + mov r12,r13 + ror r13,7 + add r9,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[32+rsp] + + add r12,QWORD[88+rsp] + mov r13,rbx + add r12,rdi + mov r14,r9 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[88+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[104+rsp] + mov r15,QWORD[80+rsp] + + mov r12,r13 + ror r13,7 + add r8,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[40+rsp] + + add r12,QWORD[96+rsp] + mov r13,rax + add r12,r15 + mov r14,r8 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[96+rsp],r12 + xor r14,r8 + and r15,rax + + 
ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[112+rsp] + mov rdi,QWORD[88+rsp] + + mov r12,r13 + ror r13,7 + add rdx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[48+rsp] + + add r12,QWORD[104+rsp] + mov r13,r11 + add r12,rdi + mov r14,rdx + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[104+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[120+rsp] + mov r15,QWORD[96+rsp] + + mov r12,r13 + ror r13,7 + add rcx,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[56+rsp] + + add r12,QWORD[112+rsp] + mov r13,r10 + add r12,r15 + mov r14,rcx + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[112+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[rsp] + mov rdi,QWORD[104+rsp] + + mov r12,r13 + ror r13,7 + add rbx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[64+rsp] + + add 
r12,QWORD[120+rsp] + mov r13,r9 + add r12,rdi + mov r14,rbx + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[120+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + cmp BYTE[7+rbp],0 + jnz NEAR $L$rounds_16_xx + + mov rdi,QWORD[((128+0))+rsp] + add rax,r14 + lea rsi,[128+rsi] + + add rax,QWORD[rdi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop + + mov rsi,QWORD[152+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_sha512_block_data_order: +ALIGN 64 + +K512: + DQ 0x428a2f98d728ae22,0x7137449123ef65cd + DQ 0x428a2f98d728ae22,0x7137449123ef65cd + DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + DQ 0x3956c25bf348b538,0x59f111f1b605d019 + DQ 0x3956c25bf348b538,0x59f111f1b605d019 + DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + DQ 0xd807aa98a3030242,0x12835b0145706fbe + DQ 0xd807aa98a3030242,0x12835b0145706fbe + DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + DQ 
0x72be5d74f27b896f,0x80deb1fe3b1696b1 + DQ 0x9bdc06a725c71235,0xc19bf174cf692694 + DQ 0x9bdc06a725c71235,0xc19bf174cf692694 + DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + DQ 0x983e5152ee66dfab,0xa831c66d2db43210 + DQ 0x983e5152ee66dfab,0xa831c66d2db43210 + DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 + DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 + DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 + DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 + DQ 0x06ca6351e003826f,0x142929670a0e6e70 + DQ 0x06ca6351e003826f,0x142929670a0e6e70 + DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 + DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 + DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 + DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 + DQ 0x81c2c92e47edaee6,0x92722c851482353b + DQ 0x81c2c92e47edaee6,0x92722c851482353b + DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 + DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 + DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 + DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 + DQ 0xd192e819d6ef5218,0xd69906245565a910 + DQ 0xd192e819d6ef5218,0xd69906245565a910 + DQ 0xf40e35855771202a,0x106aa07032bbd1b8 + DQ 0xf40e35855771202a,0x106aa07032bbd1b8 + DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 + DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 + DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec + DQ 
0x84c87814a1f0ab72,0x8cc702081a6439ec + DQ 0x90befffa23631e28,0xa4506cebde82bde9 + DQ 0x90befffa23631e28,0xa4506cebde82bde9 + DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b + DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b + DQ 0xca273eceea26619c,0xd186b8c721c0c207 + DQ 0xca273eceea26619c,0xd186b8c721c0c207 + DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 + DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 + DQ 0x113f9804bef90dae,0x1b710b35131c471b + DQ 0x113f9804bef90dae,0x1b710b35131c471b + DQ 0x28db77f523047d84,0x32caab7b40c72493 + DQ 0x28db77f523047d84,0x32caab7b40c72493 + DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + + DQ 0x0001020304050607,0x08090a0b0c0d0e0f + DQ 0x0001020304050607,0x08090a0b0c0d0e0f +DB 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 +DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 +DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +DB 111,114,103,62,0 + +ALIGN 64 +GFp_sha512_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_sha512_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$avx_shortcut: + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[152+rsp],rax + + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_avx: + + 
vzeroupper + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm0,xmm0,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + 
xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm7,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm7,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm0,xmm0,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm7,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[8+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm0,xmm0,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm1,xmm1,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[16+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm0,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm0,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm1,xmm1,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm0,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[24+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd 
r13,r13,14 + and r15,rdi + vpaddq xmm1,xmm1,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[16+rsp],xmm10 + vpalignr xmm8,xmm3,xmm2,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm7,xmm6,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm2,xmm2,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[32+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm1,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm1,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm2,xmm2,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm1,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[40+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm2,xmm2,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[32+rsp],xmm10 + vpalignr xmm8,xmm4,xmm3,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm0,xmm7,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm3,xmm3,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[48+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 
+ xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm2,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm2,3 + shrd r13,r13,23 + mov rbx,r14 + vpaddq xmm3,xmm3,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm2,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[56+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm3,xmm3,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[48+rsp],xmm10 + vpalignr xmm8,xmm5,xmm4,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm1,xmm0,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm4,xmm4,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[64+rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm3,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm3,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm4,xmm4,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm3,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + 
xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[72+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm4,xmm4,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm4,XMMWORD[rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[64+rsp],xmm10 + vpalignr xmm8,xmm6,xmm5,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm2,xmm1,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm5,xmm5,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[80+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm4,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm4,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm5,xmm5,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm4,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[88+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm5,xmm5,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm5,XMMWORD[32+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[80+rsp],xmm10 + vpalignr xmm8,xmm7,xmm6,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm3,xmm2,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor 
r12,rcx + vpaddq xmm6,xmm6,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[96+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm5,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm5,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm6,xmm6,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm5,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[104+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm6,xmm6,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[96+rsp],xmm10 + vpalignr xmm8,xmm0,xmm7,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm4,xmm3,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm7,xmm7,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[112+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm6,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm6,3 + shrd r13,r13,23 + mov rbx,r14 + 
vpaddq xmm7,xmm7,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm6,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[120+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm7,xmm7,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm7,XMMWORD[96+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[112+rsp],xmm10 + cmp BYTE[135+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor 
r13,rbx + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax + xor 
r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add 
rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + mov rdi,QWORD[((128+0))+rsp] + mov rax,r14 + + add rax,QWORD[rdi] + lea rsi,[128+rsi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop_avx + + mov rsi,QWORD[152+rsp] + + vzeroupper + movaps xmm6,XMMWORD[((128+32))+rsp] + movaps xmm7,XMMWORD[((128+48))+rsp] + movaps xmm8,XMMWORD[((128+64))+rsp] + movaps xmm9,XMMWORD[((128+80))+rsp] + movaps xmm10,XMMWORD[((128+96))+rsp] + movaps xmm11,XMMWORD[((128+112))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 
0F3h,0C3h ;repret + +$L$SEH_end_GFp_sha512_block_data_order_avx: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov rsi,rax + mov rax,QWORD[((128+24))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + + lea r10,[$L$epilogue] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea rsi,[((128+32))+rsi] + lea rdi,[512+r8] + mov ecx,12 + DD 0xa548f3fc + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_GFp_sha512_block_data_order wrt ..imagebase + DD $L$SEH_end_GFp_sha512_block_data_order wrt ..imagebase + DD $L$SEH_info_GFp_sha512_block_data_order wrt ..imagebase + DD $L$SEH_begin_GFp_sha512_block_data_order_avx wrt ..imagebase + DD 
$L$SEH_end_GFp_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_GFp_sha512_block_data_order_avx wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_GFp_sha512_block_data_order: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +$L$SEH_info_GFp_sha512_block_data_order_avx: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase diff --git a/zeroidc/vendor/ring/pregenerated/tmp/vpaes-x86-win32n.asm b/zeroidc/vendor/ring/pregenerated/tmp/vpaes-x86-win32n.asm new file mode 100644 index 000000000..8061dc3de --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/vpaes-x86-win32n.asm @@ -0,0 +1,378 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +align 64 +L$_vpaes_consts: +dd 218628480,235210255,168496130,67568393 +dd 252381056,17041926,33884169,51187212 +dd 252645135,252645135,252645135,252645135 +dd 1512730624,3266504856,1377990664,3401244816 +dd 830229760,1275146365,2969422977,3447763452 +dd 3411033600,2979783055,338359620,2782886510 +dd 4209124096,907596821,221174255,1006095553 +dd 191964160,3799684038,3164090317,1589111125 +dd 182528256,1777043520,2877432650,3265356744 +dd 1874708224,3503451415,3305285752,363511674 +dd 1606117888,3487855781,1093350906,2384367825 +dd 197121,67569157,134941193,202313229 +dd 67569157,134941193,202313229,197121 +dd 134941193,202313229,197121,67569157 +dd 202313229,197121,67569157,134941193 +dd 33619971,100992007,168364043,235736079 +dd 235736079,33619971,100992007,168364043 +dd 168364043,235736079,33619971,100992007 +dd 100992007,168364043,235736079,33619971 +dd 
50462976,117835012,185207048,252579084 +dd 252314880,51251460,117574920,184942860 +dd 184682752,252054788,50987272,118359308 +dd 118099200,185467140,251790600,50727180 +dd 2946363062,528716217,1300004225,1881839624 +dd 1532713819,1532713819,1532713819,1532713819 +dd 3602276352,4288629033,3737020424,4153884961 +dd 1354558464,32357713,2958822624,3775749553 +dd 1201988352,132424512,1572796698,503232858 +dd 2213177600,1597421020,4103937655,675398315 +db 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +db 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +db 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +db 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +db 118,101,114,115,105,116,121,41,0 +align 64 +align 16 +__vpaes_preheat: + add ebp,DWORD [esp] + movdqa xmm7,[ebp-48] + movdqa xmm6,[ebp-16] + ret +align 16 +__vpaes_encrypt_core: + mov ecx,16 + mov eax,DWORD [240+edx] + movdqa xmm1,xmm6 + movdqa xmm2,[ebp] + pandn xmm1,xmm0 + pand xmm0,xmm6 + movdqu xmm5,[edx] +db 102,15,56,0,208 + movdqa xmm0,[16+ebp] + pxor xmm2,xmm5 + psrld xmm1,4 + add edx,16 +db 102,15,56,0,193 + lea ebx,[192+ebp] + pxor xmm0,xmm2 + jmp NEAR L$000enc_entry +align 16 +L$001enc_loop: + movdqa xmm4,[32+ebp] + movdqa xmm0,[48+ebp] +db 102,15,56,0,226 +db 102,15,56,0,195 + pxor xmm4,xmm5 + movdqa xmm5,[64+ebp] + pxor xmm0,xmm4 + movdqa xmm1,[ecx*1+ebx-64] +db 102,15,56,0,234 + movdqa xmm2,[80+ebp] + movdqa xmm4,[ecx*1+ebx] +db 102,15,56,0,211 + movdqa xmm3,xmm0 + pxor xmm2,xmm5 +db 102,15,56,0,193 + add edx,16 + pxor xmm0,xmm2 +db 102,15,56,0,220 + add ecx,16 + pxor xmm3,xmm0 +db 102,15,56,0,193 + and ecx,48 + sub eax,1 + pxor xmm0,xmm3 +L$000enc_entry: + movdqa xmm1,xmm6 + movdqa xmm5,[ebp-32] + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm6 +db 102,15,56,0,232 + movdqa xmm3,xmm7 + pxor xmm0,xmm1 +db 102,15,56,0,217 + movdqa xmm4,xmm7 + pxor xmm3,xmm5 +db 102,15,56,0,224 + movdqa xmm2,xmm7 + pxor xmm4,xmm5 +db 102,15,56,0,211 + movdqa xmm3,xmm7 + pxor xmm2,xmm0 +db 
102,15,56,0,220 + movdqu xmm5,[edx] + pxor xmm3,xmm1 + jnz NEAR L$001enc_loop + movdqa xmm4,[96+ebp] + movdqa xmm0,[112+ebp] +db 102,15,56,0,226 + pxor xmm4,xmm5 +db 102,15,56,0,195 + movdqa xmm1,[64+ecx*1+ebx] + pxor xmm0,xmm4 +db 102,15,56,0,193 + ret +align 16 +__vpaes_schedule_core: + add ebp,DWORD [esp] + movdqu xmm0,[esi] + movdqa xmm2,[320+ebp] + movdqa xmm3,xmm0 + lea ebx,[ebp] + movdqa [4+esp],xmm2 + call __vpaes_schedule_transform + movdqa xmm7,xmm0 + test edi,edi + jnz NEAR L$002schedule_am_decrypting + movdqu [edx],xmm0 + jmp NEAR L$003schedule_go +L$002schedule_am_decrypting: + movdqa xmm1,[256+ecx*1+ebp] +db 102,15,56,0,217 + movdqu [edx],xmm3 + xor ecx,48 +L$003schedule_go: + cmp eax,192 + ja NEAR L$004schedule_256 +L$005schedule_128: + mov eax,10 +L$006loop_schedule_128: + call __vpaes_schedule_round + dec eax + jz NEAR L$007schedule_mangle_last + call __vpaes_schedule_mangle + jmp NEAR L$006loop_schedule_128 +align 16 +L$004schedule_256: + movdqu xmm0,[16+esi] + call __vpaes_schedule_transform + mov eax,7 +L$008loop_schedule_256: + call __vpaes_schedule_mangle + movdqa xmm6,xmm0 + call __vpaes_schedule_round + dec eax + jz NEAR L$007schedule_mangle_last + call __vpaes_schedule_mangle + pshufd xmm0,xmm0,255 + movdqa [20+esp],xmm7 + movdqa xmm7,xmm6 + call L$_vpaes_schedule_low_round + movdqa xmm7,[20+esp] + jmp NEAR L$008loop_schedule_256 +align 16 +L$007schedule_mangle_last: + lea ebx,[384+ebp] + test edi,edi + jnz NEAR L$009schedule_mangle_last_dec + movdqa xmm1,[256+ecx*1+ebp] +db 102,15,56,0,193 + lea ebx,[352+ebp] + add edx,32 +L$009schedule_mangle_last_dec: + add edx,-16 + pxor xmm0,[336+ebp] + call __vpaes_schedule_transform + movdqu [edx],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + ret +align 16 +__vpaes_schedule_round: + movdqa xmm2,[8+esp] + pxor xmm1,xmm1 +db 102,15,58,15,202,15 +db 102,15,58,15,210,15 + pxor xmm7,xmm1 + pshufd xmm0,xmm0,255 
+db 102,15,58,15,192,1 + movdqa [8+esp],xmm2 +L$_vpaes_schedule_low_round: + movdqa xmm1,xmm7 + pslldq xmm7,4 + pxor xmm7,xmm1 + movdqa xmm1,xmm7 + pslldq xmm7,8 + pxor xmm7,xmm1 + pxor xmm7,[336+ebp] + movdqa xmm4,[ebp-16] + movdqa xmm5,[ebp-48] + movdqa xmm1,xmm4 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm4 + movdqa xmm2,[ebp-32] +db 102,15,56,0,208 + pxor xmm0,xmm1 + movdqa xmm3,xmm5 +db 102,15,56,0,217 + pxor xmm3,xmm2 + movdqa xmm4,xmm5 +db 102,15,56,0,224 + pxor xmm4,xmm2 + movdqa xmm2,xmm5 +db 102,15,56,0,211 + pxor xmm2,xmm0 + movdqa xmm3,xmm5 +db 102,15,56,0,220 + pxor xmm3,xmm1 + movdqa xmm4,[32+ebp] +db 102,15,56,0,226 + movdqa xmm0,[48+ebp] +db 102,15,56,0,195 + pxor xmm0,xmm4 + pxor xmm0,xmm7 + movdqa xmm7,xmm0 + ret +align 16 +__vpaes_schedule_transform: + movdqa xmm2,[ebp-16] + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + movdqa xmm2,[ebx] +db 102,15,56,0,208 + movdqa xmm0,[16+ebx] +db 102,15,56,0,193 + pxor xmm0,xmm2 + ret +align 16 +__vpaes_schedule_mangle: + movdqa xmm4,xmm0 + movdqa xmm5,[128+ebp] + test edi,edi + jnz NEAR L$010schedule_mangle_dec + add edx,16 + pxor xmm4,[336+ebp] +db 102,15,56,0,229 + movdqa xmm3,xmm4 +db 102,15,56,0,229 + pxor xmm3,xmm4 +db 102,15,56,0,229 + pxor xmm3,xmm4 + jmp NEAR L$011schedule_mangle_both +align 16 +L$010schedule_mangle_dec: + movdqa xmm2,[ebp-16] + lea esi,[ebp] + movdqa xmm1,xmm2 + pandn xmm1,xmm4 + psrld xmm1,4 + pand xmm4,xmm2 + movdqa xmm2,[esi] +db 102,15,56,0,212 + movdqa xmm3,[16+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 +db 102,15,56,0,221 + movdqa xmm2,[32+esi] +db 102,15,56,0,212 + pxor xmm2,xmm3 + movdqa xmm3,[48+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 +db 102,15,56,0,221 + movdqa xmm2,[64+esi] +db 102,15,56,0,212 + pxor xmm2,xmm3 + movdqa xmm3,[80+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 +db 102,15,56,0,221 + movdqa xmm2,[96+esi] +db 102,15,56,0,212 + pxor xmm2,xmm3 + movdqa xmm3,[112+esi] +db 102,15,56,0,217 + pxor xmm3,xmm2 + add edx,-16 
+L$011schedule_mangle_both: + movdqa xmm1,[256+ecx*1+ebp] +db 102,15,56,0,217 + add ecx,-16 + and ecx,48 + movdqu [edx],xmm3 + ret +global _GFp_vpaes_set_encrypt_key +align 16 +_GFp_vpaes_set_encrypt_key: +L$_GFp_vpaes_set_encrypt_key_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov eax,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + mov ebx,eax + shr ebx,5 + add ebx,5 + mov DWORD [240+edx],ebx + mov ecx,48 + mov edi,0 + lea ebp,[(L$_vpaes_consts+0x30-L$012pic_point)] + call __vpaes_schedule_core +L$012pic_point: + mov esp,DWORD [48+esp] + xor eax,eax + pop edi + pop esi + pop ebx + pop ebp + ret +global _GFp_vpaes_encrypt +align 16 +_GFp_vpaes_encrypt: +L$_GFp_vpaes_encrypt_begin: + push ebp + push ebx + push esi + push edi + lea ebp,[(L$_vpaes_consts+0x30-L$013pic_point)] + call __vpaes_preheat +L$013pic_point: + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov edi,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + movdqu xmm0,[esi] + call __vpaes_encrypt_core + movdqu [edi],xmm0 + mov esp,DWORD [48+esp] + pop edi + pop esi + pop ebx + pop ebp + ret diff --git a/zeroidc/vendor/ring/pregenerated/tmp/vpaes-x86_64-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/vpaes-x86_64-nasm.asm new file mode 100644 index 000000000..ba9348566 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/vpaes-x86_64-nasm.asm @@ -0,0 +1,982 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + pandn xmm1,xmm0 + movdqu xmm5,XMMWORD[r9] + psrld xmm1,4 + pand xmm0,xmm9 +DB 102,15,56,0,208 + movdqa xmm0,XMMWORD[(($L$k_ipt+16))] +DB 102,15,56,0,193 + pxor xmm2,xmm5 + add r9,16 + pxor xmm0,xmm2 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc_entry + +ALIGN 16 +$L$enc_loop: + + movdqa xmm4,xmm13 + movdqa xmm0,xmm12 +DB 102,15,56,0,226 +DB 102,15,56,0,195 + pxor xmm4,xmm5 + movdqa xmm5,xmm15 + pxor xmm0,xmm4 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] +DB 102,15,56,0,234 + movdqa xmm4,XMMWORD[r10*1+r11] + movdqa xmm2,xmm14 +DB 102,15,56,0,211 + movdqa xmm3,xmm0 + pxor xmm2,xmm5 +DB 102,15,56,0,193 + add r9,16 + pxor xmm0,xmm2 +DB 102,15,56,0,220 + add r11,16 + pxor xmm3,xmm0 +DB 102,15,56,0,193 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + +$L$enc_entry: + + movdqa xmm1,xmm9 + movdqa xmm5,xmm11 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 +DB 102,15,56,0,232 + movdqa xmm3,xmm10 + pxor xmm0,xmm1 +DB 102,15,56,0,217 + movdqa xmm4,xmm10 + pxor xmm3,xmm5 +DB 102,15,56,0,224 + movdqa xmm2,xmm10 + pxor xmm4,xmm5 +DB 102,15,56,0,211 + movdqa xmm3,xmm10 + pxor xmm2,xmm0 +DB 102,15,56,0,220 + movdqu xmm5,XMMWORD[r9] + pxor xmm3,xmm1 + jnz NEAR $L$enc_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] +DB 102,15,56,0,226 + pxor xmm4,xmm5 +DB 102,15,56,0,195 + movdqa xmm1,XMMWORD[64+r10*1+r11] + pxor xmm0,xmm4 +DB 102,15,56,0,193 + DB 0F3h,0C3h ;repret + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core_2x: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + movdqa xmm8,xmm2 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + movdqu xmm5,XMMWORD[r9] + + psrld xmm1,4 + psrld 
xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,208 +DB 102,68,15,56,0,198 + movdqa xmm0,XMMWORD[(($L$k_ipt+16))] + movdqa xmm6,xmm0 +DB 102,15,56,0,193 +DB 102,15,56,0,247 + pxor xmm2,xmm5 + pxor xmm8,xmm5 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc2x_entry + +ALIGN 16 +$L$enc2x_loop: + + movdqa xmm4,XMMWORD[$L$k_sb1] + movdqa xmm0,XMMWORD[(($L$k_sb1+16))] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + pxor xmm4,xmm5 + pxor xmm12,xmm5 + movdqa xmm5,XMMWORD[$L$k_sb2] + movdqa xmm13,xmm5 + pxor xmm0,xmm4 + pxor xmm6,xmm12 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] + +DB 102,15,56,0,234 +DB 102,69,15,56,0,232 + movdqa xmm4,XMMWORD[r10*1+r11] + + movdqa xmm2,XMMWORD[(($L$k_sb2+16))] + movdqa xmm8,xmm2 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm0 + movdqa xmm11,xmm6 + pxor xmm2,xmm5 + pxor xmm8,xmm13 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 +DB 102,15,56,0,220 +DB 102,68,15,56,0,220 + add r11,16 + pxor xmm3,xmm0 + pxor xmm11,xmm6 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + pxor xmm6,xmm11 + +$L$enc2x_entry: + + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm5,XMMWORD[(($L$k_inv+16))] + movdqa xmm13,xmm5 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 +DB 102,15,56,0,232 +DB 102,68,15,56,0,238 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm0,xmm1 + pxor xmm6,xmm7 +DB 102,15,56,0,217 +DB 102,68,15,56,0,223 + movdqa xmm4,xmm10 + movdqa xmm12,xmm10 + pxor xmm3,xmm5 + pxor xmm11,xmm13 +DB 102,15,56,0,224 +DB 102,68,15,56,0,230 + movdqa xmm2,xmm10 + movdqa xmm8,xmm10 + pxor xmm4,xmm5 + pxor xmm12,xmm13 +DB 102,15,56,0,211 +DB 102,69,15,56,0,195 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm2,xmm0 + pxor xmm8,xmm6 +DB 102,15,56,0,220 +DB 102,69,15,56,0,220 + movdqu 
xmm5,XMMWORD[r9] + + pxor xmm3,xmm1 + pxor xmm11,xmm7 + jnz NEAR $L$enc2x_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 +DB 102,15,56,0,226 +DB 102,69,15,56,0,224 + pxor xmm4,xmm5 + pxor xmm12,xmm5 +DB 102,15,56,0,195 +DB 102,65,15,56,0,243 + movdqa xmm1,XMMWORD[64+r10*1+r11] + + pxor xmm0,xmm4 + pxor xmm6,xmm12 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + DB 0F3h,0C3h ;repret + + + + + + + + + +ALIGN 16 +_vpaes_schedule_core: + + + + + + + call _vpaes_preheat + movdqa xmm8,XMMWORD[$L$k_rcon] + movdqu xmm0,XMMWORD[rdi] + + + movdqa xmm3,xmm0 + lea r11,[$L$k_ipt] + call _vpaes_schedule_transform + movdqa xmm7,xmm0 + + lea r10,[$L$k_sr] + + + movdqu XMMWORD[rdx],xmm0 + +$L$schedule_go: + cmp esi,192 + ja NEAR $L$schedule_256 + + + + + + + + + + + +$L$schedule_128: + mov esi,10 + +$L$oop_schedule_128: + call _vpaes_schedule_round + dec rsi + jz NEAR $L$schedule_mangle_last + call _vpaes_schedule_mangle + jmp NEAR $L$oop_schedule_128 + + + + + + + + + + + +ALIGN 16 +$L$schedule_256: + movdqu xmm0,XMMWORD[16+rdi] + call _vpaes_schedule_transform + mov esi,7 + +$L$oop_schedule_256: + call _vpaes_schedule_mangle + movdqa xmm6,xmm0 + + + call _vpaes_schedule_round + dec rsi + jz NEAR $L$schedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd xmm0,xmm0,0xFF + movdqa xmm5,xmm7 + movdqa xmm7,xmm6 + call _vpaes_schedule_low_round + movdqa xmm7,xmm5 + + jmp NEAR $L$oop_schedule_256 + + + + + + + + + + + + +ALIGN 16 +$L$schedule_mangle_last: + + lea r11,[$L$k_deskew] + + + movdqa xmm1,XMMWORD[r10*1+r8] +DB 102,15,56,0,193 + lea r11,[$L$k_opt] + add rdx,32 + +$L$schedule_mangle_last_dec: + add rdx,-16 + pxor xmm0,XMMWORD[$L$k_s63] + call _vpaes_schedule_transform + movdqu XMMWORD[rdx],xmm0 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + DB 0F3h,0C3h ;repret + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 
+_vpaes_schedule_round: + + + pxor xmm1,xmm1 +DB 102,65,15,58,15,200,15 +DB 102,69,15,58,15,192,15 + pxor xmm7,xmm1 + + + pshufd xmm0,xmm0,0xFF +DB 102,15,58,15,192,1 + + + + +_vpaes_schedule_low_round: + + movdqa xmm1,xmm7 + pslldq xmm7,4 + pxor xmm7,xmm1 + movdqa xmm1,xmm7 + pslldq xmm7,8 + pxor xmm7,xmm1 + pxor xmm7,XMMWORD[$L$k_s63] + + + movdqa xmm1,xmm9 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 + movdqa xmm2,xmm11 +DB 102,15,56,0,208 + pxor xmm0,xmm1 + movdqa xmm3,xmm10 +DB 102,15,56,0,217 + pxor xmm3,xmm2 + movdqa xmm4,xmm10 +DB 102,15,56,0,224 + pxor xmm4,xmm2 + movdqa xmm2,xmm10 +DB 102,15,56,0,211 + pxor xmm2,xmm0 + movdqa xmm3,xmm10 +DB 102,15,56,0,220 + pxor xmm3,xmm1 + movdqa xmm4,xmm13 +DB 102,15,56,0,226 + movdqa xmm0,xmm12 +DB 102,15,56,0,195 + pxor xmm0,xmm4 + + + pxor xmm0,xmm7 + movdqa xmm7,xmm0 + DB 0F3h,0C3h ;repret + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_transform: + + movdqa xmm1,xmm9 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 + movdqa xmm2,XMMWORD[r11] +DB 102,15,56,0,208 + movdqa xmm0,XMMWORD[16+r11] +DB 102,15,56,0,193 + pxor xmm0,xmm2 + DB 0F3h,0C3h ;repret + + + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_mangle: + + movdqa xmm4,xmm0 + movdqa xmm5,XMMWORD[$L$k_mc_forward] + + + add rdx,16 + pxor xmm4,XMMWORD[$L$k_s63] +DB 102,15,56,0,229 + movdqa xmm3,xmm4 +DB 102,15,56,0,229 + pxor xmm3,xmm4 +DB 102,15,56,0,229 + pxor xmm3,xmm4 + +$L$schedule_mangle_both: + movdqa xmm1,XMMWORD[r10*1+r8] +DB 102,15,56,0,217 + add r8,-16 + and r8,0x30 + movdqu XMMWORD[rdx],xmm3 + DB 0F3h,0C3h ;repret + + + + + + +global GFp_vpaes_set_encrypt_key + +ALIGN 16 +GFp_vpaes_set_encrypt_key: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_vpaes_set_encrypt_key: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+5))],1 +%endif + + lea rsp,[((-184))+rsp] + movaps 
XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$enc_key_body: + mov eax,esi + shr eax,5 + add eax,5 + mov DWORD[240+rdx],eax + + mov ecx,0 + mov r8d,0x30 + call _vpaes_schedule_core + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$enc_key_epilogue: + xor eax,eax + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_vpaes_set_encrypt_key: + +global GFp_vpaes_encrypt + +ALIGN 16 +GFp_vpaes_encrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_vpaes_encrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$enc_body: + movdqu xmm0,XMMWORD[rdi] + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu XMMWORD[rsi],xmm0 + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$enc_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov 
rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_vpaes_encrypt: +global GFp_vpaes_ctr32_encrypt_blocks + +ALIGN 16 +GFp_vpaes_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_vpaes_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + + xchg rdx,rcx + test rcx,rcx + jz NEAR $L$ctr32_abort + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$ctr32_body: + movdqu xmm0,XMMWORD[r8] + movdqa xmm8,XMMWORD[$L$ctr_add_one] + sub rsi,rdi + call _vpaes_preheat + movdqa xmm6,xmm0 + pshufb xmm6,XMMWORD[$L$rev_ctr] + + test rcx,1 + jz NEAR $L$ctr32_prep_loop + + + + movdqu xmm7,XMMWORD[rdi] + call _vpaes_encrypt_core + pxor xmm0,xmm7 + paddd xmm6,xmm8 + movdqu XMMWORD[rdi*1+rsi],xmm0 + sub rcx,1 + lea rdi,[16+rdi] + jz NEAR $L$ctr32_done + +$L$ctr32_prep_loop: + + + movdqa xmm14,xmm6 + movdqa xmm15,xmm6 + paddd xmm15,xmm8 + +$L$ctr32_loop: + movdqa xmm1,XMMWORD[$L$rev_ctr] + movdqa xmm0,xmm14 + movdqa xmm6,xmm15 +DB 102,15,56,0,193 +DB 102,15,56,0,241 + call _vpaes_encrypt_core_2x + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm3,XMMWORD[$L$ctr_add_two] + pxor xmm0,xmm1 + pxor xmm6,xmm2 + paddd xmm14,xmm3 + paddd xmm15,xmm3 + movdqu XMMWORD[rdi*1+rsi],xmm0 + movdqu XMMWORD[16+rdi*1+rsi],xmm6 + sub rcx,2 + lea rdi,[32+rdi] + jnz NEAR $L$ctr32_loop + +$L$ctr32_done: + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps 
xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$ctr32_epilogue: +$L$ctr32_abort: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_vpaes_ctr32_encrypt_blocks: + + + + + + + +ALIGN 16 +_vpaes_preheat: + + lea r10,[$L$k_s0F] + movdqa xmm10,XMMWORD[((-32))+r10] + movdqa xmm11,XMMWORD[((-16))+r10] + movdqa xmm9,XMMWORD[r10] + movdqa xmm13,XMMWORD[48+r10] + movdqa xmm12,XMMWORD[64+r10] + movdqa xmm15,XMMWORD[80+r10] + movdqa xmm14,XMMWORD[96+r10] + DB 0F3h,0C3h ;repret + + + + + + + + +ALIGN 64 +_vpaes_consts: +$L$k_inv: + DQ 0x0E05060F0D080180,0x040703090A0B0C02 + DQ 0x01040A060F0B0780,0x030D0E0C02050809 + +$L$k_s0F: + DQ 0x0F0F0F0F0F0F0F0F,0x0F0F0F0F0F0F0F0F + +$L$k_ipt: + DQ 0xC2B2E8985A2A7000,0xCABAE09052227808 + DQ 0x4C01307D317C4D00,0xCD80B1FCB0FDCC81 + +$L$k_sb1: + DQ 0xB19BE18FCB503E00,0xA5DF7A6E142AF544 + DQ 0x3618D415FAE22300,0x3BF7CCC10D2ED9EF +$L$k_sb2: + DQ 0xE27A93C60B712400,0x5EB7E955BC982FCD + DQ 0x69EB88400AE12900,0xC2A163C8AB82234A +$L$k_sbo: + DQ 0xD0D26D176FBDC700,0x15AABF7AC502A878 + DQ 0xCFE474A55FBB6A00,0x8E1E90D1412B35FA + +$L$k_mc_forward: + DQ 0x0407060500030201,0x0C0F0E0D080B0A09 + DQ 0x080B0A0904070605,0x000302010C0F0E0D + DQ 0x0C0F0E0D080B0A09,0x0407060500030201 + DQ 0x000302010C0F0E0D,0x080B0A0904070605 + +$L$k_mc_backward: + DQ 0x0605040702010003,0x0E0D0C0F0A09080B + DQ 0x020100030E0D0C0F,0x0A09080B06050407 + DQ 0x0E0D0C0F0A09080B,0x0605040702010003 + DQ 0x0A09080B06050407,0x020100030E0D0C0F + +$L$k_sr: + DQ 0x0706050403020100,0x0F0E0D0C0B0A0908 + DQ 0x030E09040F0A0500,0x0B06010C07020D08 + DQ 0x0F060D040B020900,0x070E050C030A0108 + DQ 0x0B0E0104070A0D00,0x0306090C0F020508 + +$L$k_rcon: + DQ 0x1F8391B9AF9DEEB6,0x702A98084D7C7D81 + +$L$k_s63: + DQ 0x5B5B5B5B5B5B5B5B,0x5B5B5B5B5B5B5B5B + +$L$k_opt: + DQ 0xFF9F4929D6B66000,0xF7974121DEBE6808 + DQ 0x01EDBD5150BCEC00,0xE10D5DB1B05C0CE0 + +$L$k_deskew: + DQ 0x07E4A34047A4E300,0x1DFEB95A5DBEF91A + DQ 0x5F36B5DC83EA6900,0x2841C2ABF49D1E77 + + 
+$L$rev_ctr: + DQ 0x0706050403020100,0x0c0d0e0f0b0a0908 + + +$L$ctr_add_one: + DQ 0x0000000000000000,0x0000000100000000 +$L$ctr_add_two: + DQ 0x0000000000000000,0x0000000200000000 + +DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54 +DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97 +DB 109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32 +DB 85,110,105,118,101,114,115,105,116,121,41,0 +ALIGN 64 + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + + lea rsi,[16+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + lea rax,[184+rax] + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_GFp_vpaes_set_encrypt_key wrt ..imagebase + DD $L$SEH_end_GFp_vpaes_set_encrypt_key wrt ..imagebase + DD $L$SEH_info_GFp_vpaes_set_encrypt_key wrt ..imagebase + + DD $L$SEH_begin_GFp_vpaes_encrypt wrt ..imagebase + DD $L$SEH_end_GFp_vpaes_encrypt wrt ..imagebase + DD 
$L$SEH_info_GFp_vpaes_encrypt wrt ..imagebase + DD $L$SEH_begin_GFp_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_GFp_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_GFp_vpaes_ctr32_encrypt_blocks wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_GFp_vpaes_set_encrypt_key: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$enc_key_body wrt ..imagebase,$L$enc_key_epilogue wrt ..imagebase +$L$SEH_info_GFp_vpaes_encrypt: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$enc_body wrt ..imagebase,$L$enc_epilogue wrt ..imagebase +$L$SEH_info_GFp_vpaes_ctr32_encrypt_blocks: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase diff --git a/zeroidc/vendor/ring/pregenerated/tmp/x86-mont-win32n.asm b/zeroidc/vendor/ring/pregenerated/tmp/x86-mont-win32n.asm new file mode 100644 index 000000000..bb7a249e0 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/x86-mont-win32n.asm @@ -0,0 +1,227 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_nasm.inc" +%endif +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +;extern _GFp_ia32cap_P +global _GFp_bn_mul_mont +align 16 +_GFp_bn_mul_mont: +L$_GFp_bn_mul_mont_begin: + push ebp + push ebx + push esi + push edi + xor eax,eax + mov edi,DWORD [40+esp] + lea esi,[20+esp] + lea edx,[24+esp] + add edi,2 + neg edi + lea ebp,[edi*4+esp-32] + neg edi + mov eax,ebp + sub eax,edx + and eax,2047 + sub ebp,eax + xor edx,ebp + and edx,2048 + xor edx,2048 + sub ebp,edx + and ebp,-64 + mov eax,esp + sub eax,ebp + and eax,-4096 + mov edx,esp + lea esp,[eax*1+ebp] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$000page_walk + jmp NEAR L$001page_walk_done +align 16 +L$000page_walk: + lea esp,[esp-4096] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$000page_walk +L$001page_walk_done: + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov ebp,DWORD [12+esi] + mov esi,DWORD [16+esi] + mov esi,DWORD [esi] + mov DWORD [4+esp],eax + mov DWORD [8+esp],ebx + mov DWORD [12+esp],ecx + mov DWORD [16+esp],ebp + mov DWORD [20+esp],esi + lea ebx,[edi-3] + mov DWORD [24+esp],edx + lea eax,[_GFp_ia32cap_P] + bt DWORD [eax],26 + mov eax,-1 + movd mm7,eax + mov esi,DWORD [8+esp] + mov edi,DWORD [12+esp] + mov ebp,DWORD [16+esp] + xor edx,edx + xor ecx,ecx + movd mm4,DWORD [edi] + movd mm5,DWORD [esi] + movd mm3,DWORD [ebp] + pmuludq mm5,mm4 + movq mm2,mm5 + movq mm0,mm5 + pand mm0,mm7 + pmuludq mm5,[20+esp] + pmuludq mm3,mm5 + paddq mm3,mm0 + movd mm1,DWORD [4+ebp] + movd mm0,DWORD [4+esi] + psrlq mm2,32 + psrlq mm3,32 + inc ecx +align 16 +L$0021st: + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + movd mm1,DWORD [4+ecx*4+ebp] + paddq mm3,mm0 + movd mm0,DWORD [4+ecx*4+esi] + psrlq mm2,32 + movd DWORD [28+ecx*4+esp],mm3 + psrlq 
mm3,32 + lea ecx,[1+ecx] + cmp ecx,ebx + jl NEAR L$0021st + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + paddq mm3,mm0 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm2,32 + psrlq mm3,32 + paddq mm3,mm2 + movq [32+ebx*4+esp],mm3 + inc edx +L$003outer: + xor ecx,ecx + movd mm4,DWORD [edx*4+edi] + movd mm5,DWORD [esi] + movd mm6,DWORD [32+esp] + movd mm3,DWORD [ebp] + pmuludq mm5,mm4 + paddq mm5,mm6 + movq mm0,mm5 + movq mm2,mm5 + pand mm0,mm7 + pmuludq mm5,[20+esp] + pmuludq mm3,mm5 + paddq mm3,mm0 + movd mm6,DWORD [36+esp] + movd mm1,DWORD [4+ebp] + movd mm0,DWORD [4+esi] + psrlq mm2,32 + psrlq mm3,32 + paddq mm2,mm6 + inc ecx + dec ebx +L$004inner: + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + movd mm6,DWORD [36+ecx*4+esp] + pand mm0,mm7 + movd mm1,DWORD [4+ecx*4+ebp] + paddq mm3,mm0 + movd mm0,DWORD [4+ecx*4+esi] + psrlq mm2,32 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm3,32 + paddq mm2,mm6 + dec ebx + lea ecx,[1+ecx] + jnz NEAR L$004inner + mov ebx,ecx + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + paddq mm3,mm0 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm2,32 + psrlq mm3,32 + movd mm6,DWORD [36+ebx*4+esp] + paddq mm3,mm2 + paddq mm3,mm6 + movq [32+ebx*4+esp],mm3 + lea edx,[1+edx] + cmp edx,ebx + jle NEAR L$003outer + emms +align 16 +L$005common_tail: + mov ebp,DWORD [16+esp] + mov edi,DWORD [4+esp] + lea esi,[32+esp] + mov eax,DWORD [esi] + mov ecx,ebx + xor edx,edx +align 16 +L$006sub: + sbb eax,DWORD [edx*4+ebp] + mov DWORD [edx*4+edi],eax + dec ecx + mov eax,DWORD [4+edx*4+esi] + lea edx,[1+edx] + jge NEAR L$006sub + sbb eax,0 + mov edx,-1 + xor edx,eax + jmp NEAR L$007copy +align 16 +L$007copy: + mov esi,DWORD [32+ebx*4+esp] + mov ebp,DWORD [ebx*4+edi] + mov DWORD [32+ebx*4+esp],ecx + and esi,eax + and ebp,edx + or ebp,esi + mov DWORD [ebx*4+edi],ebp + dec ebx + jge NEAR L$007copy + mov esp,DWORD [24+esp] + mov eax,1 + pop 
edi + pop esi + pop ebx + pop ebp + ret +db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +db 111,114,103,62,0 +segment .bss +common _GFp_ia32cap_P 16 diff --git a/zeroidc/vendor/ring/pregenerated/tmp/x86_64-mont-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/x86_64-mont-nasm.asm new file mode 100644 index 000000000..38f355329 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/x86_64-mont-nasm.asm @@ -0,0 +1,1475 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +EXTERN GFp_ia32cap_P + +global GFp_bn_mul_mont + +ALIGN 16 +GFp_bn_mul_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_bn_mul_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov r9d,r9d + mov rax,rsp + + test r9d,3 + jnz NEAR $L$mul_enter + cmp r9d,8 + jb NEAR $L$mul_enter + mov r11d,DWORD[((GFp_ia32cap_P+8))] + cmp rdx,rsi + jne NEAR $L$mul4x_enter + test r9d,7 + jz NEAR $L$sqr8x_enter + jmp NEAR $L$mul4x_enter + +ALIGN 16 +$L$mul_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-16))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR $L$mul_page_walk_done + +ALIGN 16 +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax + +$L$mul_body: + mov r12,rdx + mov r8,QWORD[r8] + mov rbx,QWORD[r12] + mov rax,QWORD[rsi] + + xor r14,r14 
+ xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$1st_enter + +ALIGN 16 +$L$1st: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r11 + mov r11,r10 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$1st_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + lea r15,[1+r15] + mov r10,rdx + + mul rbp + cmp r15,r9 + jne NEAR $L$1st + + add r13,rax + mov rax,QWORD[rsi] + adc rdx,0 + add r13,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + mov r11,r10 + + xor rdx,rdx + add r13,r11 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + jmp NEAR $L$outer +ALIGN 16 +$L$outer: + mov rbx,QWORD[r14*8+r12] + xor r15,r15 + mov rbp,r8 + mov r10,QWORD[rsp] + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r10,QWORD[8+rsp] + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$inner_enter + +ALIGN 16 +$L$inner: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$inner_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + lea r15,[1+r15] + + mul rbp + cmp r15,r9 + jne NEAR $L$inner + + add r13,rax + mov rax,QWORD[rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + + xor rdx,rdx + add r13,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + cmp r14,r9 + jb NEAR $L$outer + + xor r14,r14 + mov rax,QWORD[rsp] + mov r15,r9 + +ALIGN 16 +$L$sub: sbb rax,QWORD[r14*8+rcx] + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[8+r14*8+rsp] + lea 
r14,[1+r14] + dec r15 + jnz NEAR $L$sub + + sbb rax,0 + mov rbx,-1 + xor rbx,rax + xor r14,r14 + mov r15,r9 + +$L$copy: + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx + and rdx,rax + mov QWORD[r14*8+rsp],r9 + or rdx,rcx + mov QWORD[r14*8+rdi],rdx + lea r14,[1+r14] + sub r15,1 + jnz NEAR $L$copy + + mov rsi,QWORD[8+r9*8+rsp] + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_bn_mul_mont: + +ALIGN 16 +bn_mul4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov r9d,r9d + mov rax,rsp + +$L$mul4x_enter: + and r11d,0x80100 + cmp r11d,0x80100 + je NEAR $L$mulx4x_enter + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-32))+r9*8+rsp] + neg r9 + and r10,-1024 + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax + +$L$mul4x_body: + mov QWORD[16+r9*8+rsp],rdi + mov r12,rdx + mov r8,QWORD[r8] + mov rbx,QWORD[r12] + mov rax,QWORD[rsi] + + xor r14,r14 + xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + add rdi,r11 + 
lea r15,[4+r15] + adc rdx,0 + mov QWORD[rsp],rdi + mov r13,rdx + jmp NEAR $L$1st4x +ALIGN 16 +$L$1st4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+r15*8+rcx] + adc rdx,0 + lea r15,[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-32))+r15*8+rsp],rdi + mov r13,rdx + cmp r15,r9 + jb NEAR $L$1st4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + xor rdi,rdi + add r13,r10 + adc rdi,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov QWORD[r15*8+rsp],rdi + + lea r14,[1+r14] +ALIGN 4 +$L$outer4x: + mov rbx,QWORD[r14*8+r12] + xor r15,r15 + mov r10,QWORD[rsp] + mov rbp,r8 + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+rsp] + adc rdx,0 + mov 
r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[4+r15] + adc rdx,0 + mov QWORD[rsp],rdi + mov r13,rdx + jmp NEAR $L$inner4x +ALIGN 16 +$L$inner4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + add r10,QWORD[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r15*8+rsp] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,QWORD[r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+r15*8+rcx] + adc rdx,0 + add r11,QWORD[8+r15*8+rsp] + adc rdx,0 + lea r15,[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-32))+r15*8+rsp],rdi + mov r13,rdx + cmp r15,r9 + jb NEAR $L$inner4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + add r10,QWORD[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r15*8+rsp] + adc rdx,0 + lea r14,[1+r14] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + xor rdi,rdi + add r13,r10 + adc rdi,0 + add r13,QWORD[r9*8+rsp] + adc rdi,0 + mov 
QWORD[((-8))+r15*8+rsp],r13 + mov QWORD[r15*8+rsp],rdi + + cmp r14,r9 + jb NEAR $L$outer4x + mov rdi,QWORD[16+r9*8+rsp] + lea r15,[((-4))+r9] + mov rax,QWORD[rsp] + mov rdx,QWORD[8+rsp] + shr r15,2 + lea rsi,[rsp] + xor r14,r14 + + sub rax,QWORD[rcx] + mov rbx,QWORD[16+rsi] + mov rbp,QWORD[24+rsi] + sbb rdx,QWORD[8+rcx] + +$L$sub4x: + mov QWORD[r14*8+rdi],rax + mov QWORD[8+r14*8+rdi],rdx + sbb rbx,QWORD[16+r14*8+rcx] + mov rax,QWORD[32+r14*8+rsi] + mov rdx,QWORD[40+r14*8+rsi] + sbb rbp,QWORD[24+r14*8+rcx] + mov QWORD[16+r14*8+rdi],rbx + mov QWORD[24+r14*8+rdi],rbp + sbb rax,QWORD[32+r14*8+rcx] + mov rbx,QWORD[48+r14*8+rsi] + mov rbp,QWORD[56+r14*8+rsi] + sbb rdx,QWORD[40+r14*8+rcx] + lea r14,[4+r14] + dec r15 + jnz NEAR $L$sub4x + + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[32+r14*8+rsi] + sbb rbx,QWORD[16+r14*8+rcx] + mov QWORD[8+r14*8+rdi],rdx + sbb rbp,QWORD[24+r14*8+rcx] + mov QWORD[16+r14*8+rdi],rbx + + sbb rax,0 + mov QWORD[24+r14*8+rdi],rbp + pxor xmm0,xmm0 +DB 102,72,15,110,224 + pcmpeqd xmm5,xmm5 + pshufd xmm4,xmm4,0 + mov r15,r9 + pxor xmm5,xmm4 + shr r15,2 + xor eax,eax + + jmp NEAR $L$copy4x +ALIGN 16 +$L$copy4x: + movdqa xmm1,XMMWORD[rax*1+rsp] + movdqu xmm2,XMMWORD[rax*1+rdi] + pand xmm1,xmm4 + pand xmm2,xmm5 + movdqa xmm3,XMMWORD[16+rax*1+rsp] + movdqa XMMWORD[rax*1+rsp],xmm0 + por xmm1,xmm2 + movdqu xmm2,XMMWORD[16+rax*1+rdi] + movdqu XMMWORD[rax*1+rdi],xmm1 + pand xmm3,xmm4 + pand xmm2,xmm5 + movdqa XMMWORD[16+rax*1+rsp],xmm0 + por xmm3,xmm2 + movdqu XMMWORD[16+rax*1+rdi],xmm3 + lea rax,[32+rax] + dec r15 + jnz NEAR $L$copy4x + mov rsi,QWORD[8+r9*8+rsp] + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_mul4x_mont: +EXTERN GFp_bn_sqrx8x_internal +EXTERN 
GFp_bn_sqr8x_internal + + +ALIGN 32 +bn_sqr8x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_sqr8x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$sqr8x_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqr8x_prologue: + + mov r10d,r9d + shl r9d,3 + shl r10,3+2 + neg r9 + + + + + + + lea r11,[((-64))+r9*2+rsp] + mov rbp,rsp + mov r8,QWORD[r8] + sub r11,rsi + and r11,4095 + cmp r10,r11 + jb NEAR $L$sqr8x_sp_alt + sub rbp,r11 + lea rbp,[((-64))+r9*2+rbp] + jmp NEAR $L$sqr8x_sp_done + +ALIGN 32 +$L$sqr8x_sp_alt: + lea r10,[((4096-64))+r9*2] + lea rbp,[((-64))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$sqr8x_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk + jmp NEAR $L$sqr8x_page_walk_done + +ALIGN 16 +$L$sqr8x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk +$L$sqr8x_page_walk_done: + + mov r10,r9 + neg r9 + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$sqr8x_body: + +DB 102,72,15,110,209 + pxor xmm0,xmm0 +DB 102,72,15,110,207 +DB 102,73,15,110,218 + mov eax,DWORD[((GFp_ia32cap_P+8))] + and eax,0x80100 + cmp eax,0x80100 + jne NEAR $L$sqr8x_nox + + call GFp_bn_sqrx8x_internal + + + + + lea rbx,[rcx*1+r8] + mov r9,rcx + mov rdx,rcx +DB 102,72,15,126,207 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_nox: + call GFp_bn_sqr8x_internal + + + + + lea rbx,[r9*1+rdi] + mov rcx,r9 + mov rdx,r9 +DB 102,72,15,126,207 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_sub: + mov r12,QWORD[rbx] + mov r13,QWORD[8+rbx] + mov r14,QWORD[16+rbx] + mov r15,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r12,QWORD[rbp] + sbb r13,QWORD[8+rbp] + sbb r14,QWORD[16+rbp] + sbb r15,QWORD[24+rbp] + lea rbp,[32+rbp] + mov 
QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + lea rdi,[32+rdi] + inc rcx + jnz NEAR $L$sqr8x_sub + + sbb rax,0 + lea rbx,[r9*1+rbx] + lea rdi,[r9*1+rdi] + +DB 102,72,15,110,200 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$sqr8x_cond_copy + +ALIGN 32 +$L$sqr8x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + movdqa XMMWORD[(-32)+rdx*1+rbx],xmm0 + movdqa XMMWORD[(-16)+rdx*1+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + add r9,32 + jnz NEAR $L$sqr8x_cond_copy + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$sqr8x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_sqr8x_mont: + +ALIGN 32 +bn_mulx4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$mulx4x_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + shl r9d,3 + xor r10,r10 + sub r10,r9 + mov r8,QWORD[r8] + lea rbp,[((-72))+r10*1+rsp] + and rbp,-128 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +ALIGN 16 +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR 
$L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + lea r10,[r9*1+rdx] + + + + + + + + + + + + + mov QWORD[rsp],r9 + shr r9,5 + mov QWORD[16+rsp],r10 + sub r9,1 + mov QWORD[24+rsp],r8 + mov QWORD[32+rsp],rdi + mov QWORD[40+rsp],rax + + mov QWORD[48+rsp],r9 + jmp NEAR $L$mulx4x_body + +ALIGN 32 +$L$mulx4x_body: + lea rdi,[8+rdx] + mov rdx,QWORD[rdx] + lea rbx,[((64+32))+rsp] + mov r9,rdx + + mulx rax,r8,QWORD[rsi] + mulx r14,r11,QWORD[8+rsi] + add r11,rax + mov QWORD[8+rsp],rdi + mulx r13,r12,QWORD[16+rsi] + adc r12,r14 + adc r13,0 + + mov rdi,r8 + imul r8,QWORD[24+rsp] + xor rbp,rbp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx rdi,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 +DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + mov rdi,QWORD[48+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] +DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + add r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 
+$L$mulx4x_outer: + mov rdx,QWORD[rdi] + lea rdi,[8+rdi] + sub rsi,rax + mov QWORD[rbx],r15 + lea rbx,[((64+32))+rsp] + sub rcx,rax + + mulx r11,r8,QWORD[rsi] + xor ebp,ebp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + adox r12,QWORD[((-16))+rbx] + adcx r13,rbp + adox r13,rbp + + mov QWORD[8+rsp],rdi + mov r15,r8 + imul r8,QWORD[24+rsp] + xor ebp,ebp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + adcx r13,rax + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + lea rsi,[32+rsi] + adox r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + lea rcx,[32+rcx] + adcx r12,rax + adox r15,rbp + mov rdi,QWORD[48+rsp] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-32))+rbx],r11 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + sub rbp,QWORD[rbx] + adc r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + + cmp 
rdi,QWORD[16+rsp] + jne NEAR $L$mulx4x_outer + + lea rbx,[64+rsp] + sub rcx,rax + neg r15 + mov rdx,rax + shr rax,3+2 + mov rdi,QWORD[32+rsp] + jmp NEAR $L$mulx4x_sub + +ALIGN 32 +$L$mulx4x_sub: + mov r11,QWORD[rbx] + mov r12,QWORD[8+rbx] + mov r13,QWORD[16+rbx] + mov r14,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r11,QWORD[rcx] + sbb r12,QWORD[8+rcx] + sbb r13,QWORD[16+rcx] + sbb r14,QWORD[24+rcx] + lea rcx,[32+rcx] + mov QWORD[rdi],r11 + mov QWORD[8+rdi],r12 + mov QWORD[16+rdi],r13 + mov QWORD[24+rdi],r14 + lea rdi,[32+rdi] + dec rax + jnz NEAR $L$mulx4x_sub + + sbb r15,0 + lea rbx,[64+rsp] + sub rdi,rdx + +DB 102,73,15,110,207 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$mulx4x_cond_copy + +ALIGN 32 +$L$mulx4x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + sub rdx,32 + jnz NEAR $L$mulx4x_cond_copy + + mov QWORD[rbx],rdx + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_mulx4x_mont: +DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83 +DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +DB 115,108,46,111,114,103,62,0 +ALIGN 16 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +mul_handler: + push rsi + push rdi + push rbx + push rbp + push 
r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10,QWORD[192+r8] + mov rax,QWORD[8+r10*8+rax] + + jmp NEAR $L$common_pop_regs + + + +ALIGN 16 +sqr_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[40+rax] + +$L$common_pop_regs: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + 
+section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_GFp_bn_mul_mont wrt ..imagebase + DD $L$SEH_end_GFp_bn_mul_mont wrt ..imagebase + DD $L$SEH_info_GFp_bn_mul_mont wrt ..imagebase + + DD $L$SEH_begin_bn_mul4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mul4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mul4x_mont wrt ..imagebase + + DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_GFp_bn_mul_mont: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase +$L$SEH_info_bn_mul4x_mont: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase +$L$SEH_info_bn_sqr8x_mont: +DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mulx4x_mont: +DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 diff --git a/zeroidc/vendor/ring/pregenerated/tmp/x86_64-mont5-nasm.asm b/zeroidc/vendor/ring/pregenerated/tmp/x86_64-mont5-nasm.asm new file mode 100644 index 000000000..14f7ecb00 --- /dev/null +++ b/zeroidc/vendor/ring/pregenerated/tmp/x86_64-mont5-nasm.asm @@ -0,0 +1,4031 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +EXTERN GFp_ia32cap_P + +global GFp_bn_mul_mont_gather5 + +ALIGN 64 +GFp_bn_mul_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_bn_mul_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov r9d,r9d + mov rax,rsp + + test r9d,7 + jnz NEAR $L$mul_enter + lea r11,[GFp_ia32cap_P] + mov r11d,DWORD[8+r11] + jmp NEAR $L$mul4x_enter + +ALIGN 16 +$L$mul_enter: + movd xmm5,DWORD[56+rsp] + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-280))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR $L$mul_page_walk_done + +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + lea r10,[$L$inc] + mov QWORD[8+r9*8+rsp],rax + +$L$mul_body: + + lea r12,[128+rdx] + movdqa xmm0,XMMWORD[r10] + movdqa xmm1,XMMWORD[16+r10] + lea r10,[((24-112))+r9*8+rsp] + and r10,-16 + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 +DB 0x67 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa 
xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 +DB 0x67 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + pand xmm0,XMMWORD[64+r12] + + pand xmm1,XMMWORD[80+r12] + pand xmm2,XMMWORD[96+r12] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+r12] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+r12] + movdqa xmm5,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+r12] + movdqa xmm5,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[r12] + movdqa xmm5,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+r12] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea r12,[256+r12] +DB 102,72,15,126,195 + + mov r8,QWORD[r8] + mov rax,QWORD[rsi] + + xor r14,r14 + xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov 
rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$1st_enter + +ALIGN 16 +$L$1st: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r11 + mov r11,r10 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$1st_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + lea r15,[1+r15] + mov r10,rdx + + mul rbp + cmp r15,r9 + jne NEAR $L$1st + + + add r13,rax + adc rdx,0 + add r13,r11 + adc rdx,0 + mov QWORD[((-16))+r9*8+rsp],r13 + mov r13,rdx + mov r11,r10 + + xor rdx,rdx + add r13,r11 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + jmp NEAR $L$outer +ALIGN 16 +$L$outer: + lea rdx,[((24+128))+r9*8+rsp] + and rdx,-16 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r12] + movdqa xmm1,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm0,XMMWORD[((-128))+rdx] + pand xmm1,XMMWORD[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r12] + movdqa xmm1,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm0,XMMWORD[((-64))+rdx] + pand xmm1,XMMWORD[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r12] + movdqa xmm1,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + movdqa xmm3,XMMWORD[48+r12] + pand xmm0,XMMWORD[rdx] + pand xmm1,XMMWORD[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r12] + movdqa xmm1,XMMWORD[80+r12] + movdqa xmm2,XMMWORD[96+r12] + movdqa xmm3,XMMWORD[112+r12] + pand xmm0,XMMWORD[64+rdx] + pand xmm1,XMMWORD[80+rdx] + por 
xmm4,xmm0 + pand xmm2,XMMWORD[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea r12,[256+r12] + + mov rax,QWORD[rsi] +DB 102,72,15,126,195 + + xor r15,r15 + mov rbp,r8 + mov r10,QWORD[rsp] + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r10,QWORD[8+rsp] + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$inner_enter + +ALIGN 16 +$L$inner: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$inner_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + lea r15,[1+r15] + + mul rbp + cmp r15,r9 + jne NEAR $L$inner + + add r13,rax + adc rdx,0 + add r13,r10 + mov r10,QWORD[r9*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r9*8+rsp],r13 + mov r13,rdx + + xor rdx,rdx + add r13,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + cmp r14,r9 + jb NEAR $L$outer + + xor r14,r14 + mov rax,QWORD[rsp] + lea rsi,[rsp] + mov r15,r9 + jmp NEAR $L$sub +ALIGN 16 +$L$sub: sbb rax,QWORD[r14*8+rcx] + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[8+r14*8+rsi] + lea r14,[1+r14] + dec r15 + jnz NEAR $L$sub + + sbb rax,0 + mov rbx,-1 + xor rbx,rax + xor r14,r14 + mov r15,r9 + +$L$copy: + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx + and rdx,rax + mov QWORD[r14*8+rsp],r14 + or rdx,rcx + mov QWORD[r14*8+rdi],rdx + lea r14,[1+r14] + sub r15,1 + jnz NEAR $L$copy + + mov rsi,QWORD[8+r9*8+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 
epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_bn_mul_mont_gather5: + +ALIGN 32 +bn_mul4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +DB 0x67 + mov rax,rsp + +$L$mul4x_enter: + and r11d,0x80108 + cmp r11d,0x80108 + je NEAR $L$mulx4x_enter + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mul4x_prologue: + +DB 0x67 + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mul4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mul4xsp_done + +ALIGN 32 +$L$mul4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mul4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + neg r9 + + mov QWORD[40+rsp],rax + +$L$mul4x_body: + + call mul4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_mul4x_mont_gather5: + + +ALIGN 32 +mul4x_internal: + + shl r9,5 + movd xmm5,DWORD[56+rax] + lea rax,[$L$inc] + lea r13,[128+r9*1+rdx] + shr r9,5 + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r9*1+rsp] + lea r12,[128+rdx] + + 
pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 +DB 0x67,0x67 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 +DB 0x67 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 +DB 0x67 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + pand xmm0,XMMWORD[64+r12] + + pand xmm1,XMMWORD[80+r12] + pand xmm2,XMMWORD[96+r12] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+r12] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+r12] + movdqa xmm5,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+r12] + movdqa xmm5,XMMWORD[((-48))+r12] + movdqa 
xmm2,XMMWORD[((-32))+r12] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[r12] + movdqa xmm5,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+r12] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea r12,[256+r12] +DB 102,72,15,126,195 + + mov QWORD[((16+8))+rsp],r13 + mov QWORD[((56+8))+rsp],rdi + + mov r8,QWORD[r8] + mov rax,QWORD[rsi] + lea rsi,[r9*1+rsi] + neg r9 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + lea r14,[((64+8))+rsp] + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+r9*1+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r9*1+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[32+r9] + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[r14],rdi + mov r13,rdx + jmp NEAR $L$1st4x + +ALIGN 32 +$L$1st4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + 
add rdi,rax + mov rax,QWORD[16+r15*1+rsi] + adc rdx,0 + add rdi,r11 + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[r14],rdi + mov r13,rdx + + add r15,32 + jnz NEAR $L$1st4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r9*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov r13,rdx + + lea rcx,[r9*1+rcx] + + xor rdi,rdi + add r13,r10 + adc rdi,0 + mov QWORD[((-8))+r14],r13 + + jmp NEAR $L$outer4x + +ALIGN 32 +$L$outer4x: + lea rdx,[((16+128))+r14] + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r12] + movdqa xmm1,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm0,XMMWORD[((-128))+rdx] + pand xmm1,XMMWORD[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r12] + movdqa xmm1,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm0,XMMWORD[((-64))+rdx] + pand xmm1,XMMWORD[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r12] + movdqa xmm1,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + movdqa xmm3,XMMWORD[48+r12] + pand xmm0,XMMWORD[rdx] + pand xmm1,XMMWORD[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r12] + movdqa xmm1,XMMWORD[80+r12] + movdqa xmm2,XMMWORD[96+r12] + movdqa xmm3,XMMWORD[112+r12] + pand xmm0,XMMWORD[64+rdx] + pand xmm1,XMMWORD[80+rdx] + por xmm4,xmm0 + pand 
xmm2,XMMWORD[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea r12,[256+r12] +DB 102,72,15,126,195 + + mov r10,QWORD[r9*1+r14] + mov rbp,r8 + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + mov QWORD[r14],rdi + + lea r14,[r9*1+r14] + + mul rbp + add r10,rax + mov rax,QWORD[8+r9*1+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r9*1+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[32+r9] + lea rcx,[32+rcx] + adc rdx,0 + mov r13,rdx + jmp NEAR $L$inner4x + +ALIGN 32 +$L$inner4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + adc rdx,0 + add r10,QWORD[16+r14] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-32))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + add r10,QWORD[r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r15*1+rsi] + adc rdx,0 + add rdi,r11 + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[((-8))+r14],r13 + mov r13,rdx + + add r15,32 + jnz NEAR $L$inner4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + adc rdx,0 + add r10,QWORD[16+r14] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov 
rax,QWORD[((-8))+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-32))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,rbp + mov rbp,QWORD[((-8))+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r9*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov r13,rdx + + mov QWORD[((-16))+r14],rdi + lea rcx,[r9*1+rcx] + + xor rdi,rdi + add r13,r10 + adc rdi,0 + add r13,QWORD[r14] + adc rdi,0 + mov QWORD[((-8))+r14],r13 + + cmp r12,QWORD[((16+8))+rsp] + jb NEAR $L$outer4x + xor rax,rax + sub rbp,r13 + adc r15,r15 + or rdi,r15 + sub rax,rdi + lea rbx,[r9*1+r14] + mov r12,QWORD[rcx] + lea rbp,[rcx] + mov rcx,r9 + sar rcx,3+2 + mov rdi,QWORD[((56+8))+rsp] + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry + + +global GFp_bn_power5 + +ALIGN 32 +GFp_bn_power5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_GFp_bn_power5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + + lea r11,[GFp_ia32cap_P] + mov r11d,DWORD[8+r11] + and r11d,0x80108 + cmp r11d,0x80108 + je NEAR $L$powerx5_enter + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$power5_prologue: + + shl r9d,3 + lea r10d,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwr_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwr_sp_done + +ALIGN 32 +$L$pwr_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$pwr_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk + jmp NEAR $L$pwr_page_walk_done + +$L$pwr_page_walk: 
+ lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk +$L$pwr_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$power5_body: +DB 102,72,15,110,207 +DB 102,72,15,110,209 +DB 102,73,15,110,218 +DB 102,72,15,110,226 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + +DB 102,72,15,126,209 +DB 102,72,15,126,226 + mov rdi,rsi + mov rax,QWORD[40+rsp] + lea r8,[32+rsp] + + call mul4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$power5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_GFp_bn_power5: + +global GFp_bn_sqr8x_internal + + +ALIGN 32 +GFp_bn_sqr8x_internal: +__bn_sqr8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rbp,[32+r10] + lea rsi,[r9*1+rsi] + + mov rcx,r9 + + + mov r14,QWORD[((-32))+rbp*1+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rbp*1+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rbp*1+rsi] + mov r15,rax + + mul r14 + mov r10,rax + mov rax,rbx + mov r11,rdx + mov QWORD[((-24))+rbp*1+rdi],r10 + + mul r14 + add r11,rax + mov rax,rbx + adc rdx,0 + mov QWORD[((-16))+rbp*1+rdi],r11 + mov r10,rdx + + + mov rbx,QWORD[((-8))+rbp*1+rsi] + mul r15 + mov r12,rax + mov rax,rbx + mov r13,rdx + + lea rcx,[rbp] + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + jmp 
NEAR $L$sqr4x_1st + +ALIGN 32 +$L$sqr4x_1st: + mov rbx,QWORD[rcx*1+rsi] + mul r15 + add r13,rax + mov rax,rbx + mov r12,rdx + adc r12,0 + + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[8+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[rcx*1+rdi],r11 + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + mov rbx,QWORD[16+rcx*1+rsi] + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + + mul r15 + add r13,rax + mov rax,rbx + mov QWORD[8+rcx*1+rdi],r10 + mov r12,rdx + adc r12,0 + + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[24+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[16+rcx*1+rdi],r11 + mov r13,rdx + adc r13,0 + lea rcx,[32+rcx] + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + + cmp rcx,0 + jne NEAR $L$sqr4x_1st + + mul r15 + add r13,rax + lea rbp,[16+rbp] + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + jmp NEAR $L$sqr4x_outer + +ALIGN 32 +$L$sqr4x_outer: + mov r14,QWORD[((-32))+rbp*1+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rbp*1+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rbp*1+rsi] + mov r15,rax + + mul r14 + mov r10,QWORD[((-24))+rbp*1+rdi] + add r10,rax + mov rax,rbx + adc rdx,0 + mov QWORD[((-24))+rbp*1+rdi],r10 + mov r11,rdx + + mul r14 + add r11,rax + mov rax,rbx + adc rdx,0 + add r11,QWORD[((-16))+rbp*1+rdi] + mov r10,rdx + adc r10,0 + mov QWORD[((-16))+rbp*1+rdi],r11 + + xor r12,r12 + + mov rbx,QWORD[((-8))+rbp*1+rsi] + mul r15 + add r12,rax + mov rax,rbx + adc rdx,0 + add r12,QWORD[((-8))+rbp*1+rdi] + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + adc rdx,0 + add r10,r12 + mov r11,rdx + adc r11,0 + mov QWORD[((-8))+rbp*1+rdi],r10 + + lea rcx,[rbp] + jmp NEAR $L$sqr4x_inner + +ALIGN 32 +$L$sqr4x_inner: + mov 
rbx,QWORD[rcx*1+rsi] + mul r15 + add r13,rax + mov rax,rbx + mov r12,rdx + adc r12,0 + add r13,QWORD[rcx*1+rdi] + adc r12,0 + +DB 0x67 + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[8+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + mul r15 + add r12,rax + mov QWORD[rcx*1+rdi],r11 + mov rax,rbx + mov r13,rdx + adc r13,0 + add r12,QWORD[8+rcx*1+rdi] + lea rcx,[16+rcx] + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + adc rdx,0 + add r10,r12 + mov r11,rdx + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + + cmp rcx,0 + jne NEAR $L$sqr4x_inner + +DB 0x67 + mul r15 + add r13,rax + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + + add rbp,16 + jnz NEAR $L$sqr4x_outer + + + mov r14,QWORD[((-32))+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rsi] + mov r15,rax + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + + mul r14 + add r11,rax + mov rax,rbx + mov QWORD[((-24))+rdi],r10 + mov r10,rdx + adc r10,0 + add r11,r13 + mov rbx,QWORD[((-8))+rsi] + adc r10,0 + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[((-16))+rdi],r11 + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rdi],r10 + + mul r15 + add r13,rax + mov rax,QWORD[((-16))+rsi] + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + + mul rbx + add rbp,16 + xor r14,r14 + sub rbp,r9 + xor r15,r15 + + add rax,r12 + adc rdx,0 + mov QWORD[8+rdi],rax + mov QWORD[16+rdi],rdx + mov QWORD[24+rdi],r15 + + mov rax,QWORD[((-16))+rbp*1+rsi] + lea rdi,[((48+8))+rsp] + xor r10,r10 + mov r11,QWORD[8+rdi] + + lea r12,[r10*2+r14] + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[16+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[24+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rbp*1+rsi] + mov QWORD[rdi],r12 + 
adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[8+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[32+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[40+rdi] + adc rbx,rax + mov rax,QWORD[rbp*1+rsi] + mov QWORD[16+rdi],rbx + adc r8,rdx + lea rbp,[16+rbp] + mov QWORD[24+rdi],r8 + sbb r15,r15 + lea rdi,[64+rdi] + jmp NEAR $L$sqr4x_shift_n_add + +ALIGN 32 +$L$sqr4x_shift_n_add: + lea r12,[r10*2+r14] + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[((-16))+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[((-8))+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rbp*1+rsi] + mov QWORD[((-32))+rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[((-24))+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[8+rdi] + adc rbx,rax + mov rax,QWORD[rbp*1+rsi] + mov QWORD[((-16))+rdi],rbx + adc r8,rdx + + lea r12,[r10*2+r14] + mov QWORD[((-8))+rdi],r8 + sbb r15,r15 + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[16+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[24+rdi] + adc r12,rax + mov rax,QWORD[8+rbp*1+rsi] + mov QWORD[rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[8+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[32+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[40+rdi] + adc rbx,rax + mov rax,QWORD[16+rbp*1+rsi] + mov QWORD[16+rdi],rbx + adc r8,rdx + mov QWORD[24+rdi],r8 + sbb r15,r15 + lea rdi,[64+rdi] + add rbp,32 + jnz NEAR $L$sqr4x_shift_n_add + + lea r12,[r10*2+r14] +DB 0x67 + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[((-16))+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[((-8))+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rsi] + mov QWORD[((-32))+rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[((-24))+rdi],r13 + sbb r15,r15 + shr 
r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mul rax + neg r15 + adc rbx,rax + adc r8,rdx + mov QWORD[((-16))+rdi],rbx + mov QWORD[((-8))+rdi],r8 +DB 102,72,15,126,213 +__bn_sqr8x_reduction: + xor rax,rax + lea rcx,[rbp*1+r9] + lea rdx,[((48+8))+r9*2+rsp] + mov QWORD[((0+8))+rsp],rcx + lea rdi,[((48+8))+r9*1+rsp] + mov QWORD[((8+8))+rsp],rdx + neg r9 + jmp NEAR $L$8x_reduction_loop + +ALIGN 32 +$L$8x_reduction_loop: + lea rdi,[r9*1+rdi] +DB 0x66 + mov rbx,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov QWORD[rdx],rax + lea rdi,[64+rdi] + +DB 0x67 + mov r8,rbx + imul rbx,QWORD[((32+8))+rsp] + mov rax,QWORD[rbp] + mov ecx,8 + jmp NEAR $L$8x_reduce + +ALIGN 32 +$L$8x_reduce: + mul rbx + mov rax,QWORD[8+rbp] + neg r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rbp] + adc rdx,0 + add r8,r9 + mov QWORD[((48-8+8))+rcx*8+rsp],rbx + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rbp] + adc rdx,0 + add r9,r10 + mov rsi,QWORD[((32+8))+rsp] + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rbp] + adc rdx,0 + imul rsi,r8 + add r10,r11 + mov r11,rdx + adc r11,0 + + mul rbx + add r12,rax + mov rax,QWORD[40+rbp] + adc rdx,0 + add r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rbp] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rbp] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + mov rbx,rsi + add r15,rax + mov rax,QWORD[rbp] + adc rdx,0 + add r14,r15 + mov r15,rdx + adc r15,0 + + dec ecx + jnz NEAR $L$8x_reduce + + lea rbp,[64+rbp] + xor rax,rax + mov rdx,QWORD[((8+8))+rsp] + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$8x_no_tail + +DB 0x66 + add r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + 
adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + sbb rsi,rsi + + mov rbx,QWORD[((48+56+8))+rsp] + mov ecx,8 + mov rax,QWORD[rbp] + jmp NEAR $L$8x_tail + +ALIGN 32 +$L$8x_tail: + mul rbx + add r8,rax + mov rax,QWORD[8+rbp] + mov QWORD[rdi],r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rbp] + adc rdx,0 + add r8,r9 + lea rdi,[8+rdi] + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rbp] + adc rdx,0 + add r9,r10 + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rbp] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + + mul rbx + add r12,rax + mov rax,QWORD[40+rbp] + adc rdx,0 + add r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rbp] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rbp] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + mov rbx,QWORD[((48-16+8))+rcx*8+rsp] + add r15,rax + adc rdx,0 + add r14,r15 + mov rax,QWORD[rbp] + mov r15,rdx + adc r15,0 + + dec ecx + jnz NEAR $L$8x_tail + + lea rbp,[64+rbp] + mov rdx,QWORD[((8+8))+rsp] + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$8x_tail_done + + mov rbx,QWORD[((48+56+8))+rsp] + neg rsi + mov rax,QWORD[rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + sbb rsi,rsi + + mov ecx,8 + jmp NEAR $L$8x_tail + +ALIGN 32 +$L$8x_tail_done: + xor rax,rax + add r8,QWORD[rdx] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + neg rsi +$L$8x_no_tail: + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + mov rcx,QWORD[((-8))+rbp] + xor rsi,rsi + +DB 102,72,15,126,213 + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 +DB 102,73,15,126,217 + mov 
QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + lea rdi,[64+rdi] + + cmp rdi,rdx + jb NEAR $L$8x_reduction_loop + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__bn_post4x_internal: + + mov r12,QWORD[rbp] + lea rbx,[r9*1+rdi] + mov rcx,r9 +DB 102,72,15,126,207 + neg rax +DB 102,72,15,126,206 + sar rcx,3+2 + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry + +ALIGN 16 +$L$sqr4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqr4x_sub_entry: + lea rbp,[32+rbp] + not r12 + not r13 + not r14 + not r15 + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + neg r10 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + adc r14,QWORD[16+rbx] + adc r15,QWORD[24+rbx] + mov QWORD[rdi],r12 + lea rbx,[32+rbx] + mov QWORD[8+rdi],r13 + sbb r10,r10 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + lea rdi,[32+rdi] + + inc rcx + jnz NEAR $L$sqr4x_sub + + mov r10,r9 + neg r9 + DB 0F3h,0C3h ;repret + + +global GFp_bn_from_montgomery + +ALIGN 32 +GFp_bn_from_montgomery: + + test DWORD[48+rsp],7 + jz NEAR bn_from_mont8x + xor eax,eax + DB 0F3h,0C3h ;repret + + + + +ALIGN 32 +bn_from_mont8x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_from_mont8x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +DB 0x67 + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$from_prologue: + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$from_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$from_sp_done + +ALIGN 32 +$L$from_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + 
mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$from_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$from_page_walk + jmp NEAR $L$from_page_walk_done + +$L$from_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$from_page_walk +$L$from_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$from_body: + mov r11,r9 + lea rax,[48+rsp] + pxor xmm0,xmm0 + jmp NEAR $L$mul_by_1 + +ALIGN 32 +$L$mul_by_1: + movdqu xmm1,XMMWORD[rsi] + movdqu xmm2,XMMWORD[16+rsi] + movdqu xmm3,XMMWORD[32+rsi] + movdqa XMMWORD[r9*1+rax],xmm0 + movdqu xmm4,XMMWORD[48+rsi] + movdqa XMMWORD[16+r9*1+rax],xmm0 +DB 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 + movdqa XMMWORD[rax],xmm1 + movdqa XMMWORD[32+r9*1+rax],xmm0 + movdqa XMMWORD[16+rax],xmm2 + movdqa XMMWORD[48+r9*1+rax],xmm0 + movdqa XMMWORD[32+rax],xmm3 + movdqa XMMWORD[48+rax],xmm4 + lea rax,[64+rax] + sub r11,64 + jnz NEAR $L$mul_by_1 + +DB 102,72,15,110,207 +DB 102,72,15,110,209 +DB 0x67 + mov rbp,rcx +DB 102,73,15,110,218 + lea r11,[GFp_ia32cap_P] + mov r11d,DWORD[8+r11] + and r11d,0x80108 + cmp r11d,0x80108 + jne NEAR $L$from_mont_nox + + lea rdi,[r9*1+rax] + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor xmm0,xmm0 + lea rax,[48+rsp] + jmp NEAR $L$from_mont_zero + +ALIGN 32 +$L$from_mont_nox: + call __bn_sqr8x_reduction + call __bn_post4x_internal + + pxor xmm0,xmm0 + lea rax,[48+rsp] + jmp NEAR $L$from_mont_zero + +ALIGN 32 +$L$from_mont_zero: + mov rsi,QWORD[40+rsp] + + movdqa XMMWORD[rax],xmm0 + movdqa XMMWORD[16+rax],xmm0 + movdqa XMMWORD[32+rax],xmm0 + movdqa XMMWORD[48+rax],xmm0 + lea rax,[64+rax] + sub r9,32 + jnz NEAR $L$from_mont_zero + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea 
rsp,[rsi] + +$L$from_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_from_mont8x: + +ALIGN 32 +bn_mulx4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$mulx4x_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mulx4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mulx4xsp_done + +$L$mulx4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mulx4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + + + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$mulx4x_body: + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_mulx4x_mont_gather5: + + +ALIGN 32 +mulx4x_internal: + + mov QWORD[8+rsp],r9 + mov r10,r9 + neg r9 + shl r9,5 + neg r10 + lea r13,[128+r9*1+rdx] + shr r9,5+5 + movd xmm5,DWORD[56+rax] + sub r9,1 + lea rax,[$L$inc] + mov 
QWORD[((16+8))+rsp],r13 + mov QWORD[((24+8))+rsp],r9 + mov QWORD[((56+8))+rsp],rdi + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r10*1+rsp] + lea rdi,[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 +DB 0x67 + movdqa xmm2,xmm1 +DB 0x67 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 +DB 0x67 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + + pand xmm0,XMMWORD[64+rdi] + pand xmm1,XMMWORD[80+rdi] + pand xmm2,XMMWORD[96+rdi] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+rdi] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+rdi] + movdqa xmm5,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 
+ pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+rdi] + movdqa xmm5,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[rdi] + movdqa xmm5,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + pxor xmm0,xmm1 + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + lea rbx,[((64+32+8))+rsp] + + mov r9,rdx + mulx rax,r8,QWORD[rsi] + mulx r12,r11,QWORD[8+rsi] + add r11,rax + mulx r13,rax,QWORD[16+rsi] + adc r12,rax + adc r13,0 + mulx r14,rax,QWORD[24+rsi] + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + xor rbp,rbp + mov rdx,r8 + + mov QWORD[((8+8))+rsp],rdi + + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] +DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox 
r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[8+rsp] + adc r15,rbp + lea rsi,[rax*1+rsi] + add r14,r15 + mov rdi,QWORD[((8+8))+rsp] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + lea r10,[((16-256))+rbx] + pxor xmm4,xmm4 +DB 0x67,0x67 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+rdi] + movdqa xmm1,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm0,XMMWORD[256+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm1,XMMWORD[272+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[288+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[304+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+rdi] + movdqa xmm1,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm0,XMMWORD[320+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand xmm1,XMMWORD[336+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[352+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[368+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[rdi] + movdqa xmm1,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm0,XMMWORD[384+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm1,XMMWORD[400+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[416+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[432+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+rdi] + movdqa xmm1,XMMWORD[80+rdi] + movdqa xmm2,XMMWORD[96+rdi] + pand xmm0,XMMWORD[448+r10] + movdqa xmm3,XMMWORD[112+rdi] + pand xmm1,XMMWORD[464+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[480+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[496+r10] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea rdi,[256+rdi] +DB 102,72,15,126,194 + + mov QWORD[rbx],rbp + lea rbx,[32+rax*1+rbx] + mulx r11,r8,QWORD[rsi] 
+ xor rbp,rbp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + mulx r14,rdx,QWORD[24+rsi] + adox r12,QWORD[((-16))+rbx] + adcx r13,rdx + lea rcx,[rax*1+rcx] + lea rsi,[32+rsi] + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + adox r14,rbp + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + + mov rdx,r8 + xor rbp,rbp + mov QWORD[((8+8))+rsp],rdi + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r12,rax + mov QWORD[((-24))+rbx],r11 + adox r15,rbp + mov QWORD[((-16))+rbx],r12 + lea rcx,[32+rcx] + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mov QWORD[((-32))+rbx],r11 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + lea rcx,[32+rcx] + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[((0+8))+rsp] + adc r15,rbp + sub rdi,QWORD[rbx] + mov rdi,QWORD[((8+8))+rsp] + mov r10,QWORD[((16+8))+rsp] + adc r14,r15 + lea rsi,[rax*1+rsi] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + + cmp rdi,r10 + jb NEAR $L$mulx4x_outer + + mov r10,QWORD[((-8))+rcx] + mov r8,rbp + mov 
r12,QWORD[rax*1+rcx] + lea rbp,[rax*1+rcx] + mov rcx,rax + lea rdi,[rax*1+rbx] + xor eax,eax + xor r15,r15 + sub r10,r14 + adc r15,r15 + or r8,r15 + sar rcx,3+2 + sub rax,r8 + mov rdx,QWORD[((56+8))+rsp] + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + + + +ALIGN 32 +bn_powerx5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_powerx5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + mov rax,rsp + +$L$powerx5_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$powerx5_prologue: + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwrx_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwrx_sp_done + +ALIGN 32 +$L$pwrx_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$pwrx_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk + jmp NEAR $L$pwrx_page_walk_done + +$L$pwrx_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk +$L$pwrx_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + + + pxor xmm0,xmm0 +DB 102,72,15,110,207 +DB 102,72,15,110,209 +DB 102,73,15,110,218 +DB 102,72,15,110,226 + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + mov r9,r10 + mov rdi,rsi +DB 102,72,15,126,209 +DB 
102,72,15,126,226 + mov rax,QWORD[40+rsp] + + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$powerx5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_bn_powerx5: + +global GFp_bn_sqrx8x_internal + +ALIGN 32 +GFp_bn_sqrx8x_internal: +__bn_sqrx8x_internal: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rdi,[((48+8))+rsp] + lea rbp,[r9*1+rsi] + mov QWORD[((0+8))+rsp],r9 + mov QWORD[((8+8))+rsp],rbp + jmp NEAR $L$sqr8x_zero_start + +ALIGN 32 +DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +$L$sqrx8x_zero: +DB 0x3e + movdqa XMMWORD[rdi],xmm0 + movdqa XMMWORD[16+rdi],xmm0 + movdqa XMMWORD[32+rdi],xmm0 + movdqa XMMWORD[48+rdi],xmm0 +$L$sqr8x_zero_start: + movdqa XMMWORD[64+rdi],xmm0 + movdqa XMMWORD[80+rdi],xmm0 + movdqa XMMWORD[96+rdi],xmm0 + movdqa XMMWORD[112+rdi],xmm0 + lea rdi,[128+rdi] + sub r9,64 + jnz NEAR $L$sqrx8x_zero + + mov rdx,QWORD[rsi] + + xor r10,r10 + xor r11,r11 + xor r12,r12 + xor r13,r13 + xor r14,r14 + xor r15,r15 + lea rdi,[((48+8))+rsp] + xor rbp,rbp + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_loop: + mulx rax,r8,QWORD[8+rsi] + adcx r8,r9 + adox r10,rax + mulx rax,r9,QWORD[16+rsi] + adcx r9,r10 + adox r11,rax +DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcx r10,r11 + adox r12,rax +DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcx r11,r12 + adox r13,rax + mulx rax,r12,QWORD[40+rsi] + adcx r12,r13 + adox r14,rax + mulx rax,r13,QWORD[48+rsi] + adcx r13,r14 + adox rax,r15 + mulx r15,r14,QWORD[56+rsi] + mov rdx,QWORD[8+rsi] + adcx r14,rax + adox r15,rbp + adc r15,QWORD[64+rdi] + mov QWORD[8+rdi],r8 + mov QWORD[16+rdi],r9 + sbb rcx,rcx + xor rbp,rbp + + + mulx rbx,r8,QWORD[16+rsi] + mulx 
rax,r9,QWORD[24+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[32+rsi] + adcx r9,r11 + adox r10,rax +DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcx r10,r12 + adox r11,rbx +DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcx r11,r13 + adox r12,r14 +DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + mov rdx,QWORD[16+rsi] + adcx r12,rax + adox r13,rbx + adcx r13,r15 + adox r14,rbp + adcx r14,rbp + + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + + mulx rbx,r8,QWORD[24+rsi] + mulx rax,r9,QWORD[32+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[40+rsi] + adcx r9,r11 + adox r10,rax +DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcx r10,r12 + adox r11,r13 +DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +DB 0x3e + mov rdx,QWORD[24+rsi] + adcx r11,rbx + adox r12,rax + adcx r12,r14 + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mulx rax,r8,QWORD[32+rsi] + adox r13,rbp + adcx r13,rbp + + mulx rbx,r9,QWORD[40+rsi] + adcx r8,r10 + adox r9,rax + mulx rax,r10,QWORD[48+rsi] + adcx r9,r11 + adox r10,r12 + mulx r12,r11,QWORD[56+rsi] + mov rdx,QWORD[32+rsi] + mov r14,QWORD[40+rsi] + adcx r10,rbx + adox r11,rax + mov r15,QWORD[48+rsi] + adcx r11,r13 + adox r12,rbp + adcx r12,rbp + + mov QWORD[56+rdi],r8 + mov QWORD[64+rdi],r9 + + mulx rax,r9,r14 + mov r8,QWORD[56+rsi] + adcx r9,r10 + mulx rbx,r10,r15 + adox r10,rax + adcx r10,r11 + mulx rax,r11,r8 + mov rdx,r14 + adox r11,rbx + adcx r11,r12 + + adcx rax,rbp + + mulx rbx,r14,r15 + mulx r13,r12,r8 + mov rdx,r15 + lea rsi,[64+rsi] + adcx r11,r14 + adox r12,rbx + adcx r12,rax + adox r13,rbp + +DB 0x67,0x67 + mulx r14,r8,r8 + adcx r13,r8 + adcx r14,rbp + + cmp rsi,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_outer_break + + neg rcx + mov rcx,-8 + mov r15,rbp + mov r8,QWORD[64+rdi] + adcx r9,QWORD[72+rdi] + adcx r10,QWORD[80+rdi] + adcx r11,QWORD[88+rdi] + adc r12,QWORD[96+rdi] + adc r13,QWORD[104+rdi] + adc r14,QWORD[112+rdi] + adc r15,QWORD[120+rdi] + lea rbp,[rsi] + lea rdi,[128+rdi] + sbb rax,rax + + mov 
rdx,QWORD[((-64))+rsi] + mov QWORD[((16+8))+rsp],rax + mov QWORD[((24+8))+rsp],rdi + + + xor eax,eax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_loop: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + +DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + mov QWORD[rcx*8+rdi],rbx + mov ebx,0 + adcx r13,rax + adox r14,r15 + +DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + mov rdx,QWORD[8+rcx*8+rsi] + adcx r14,rax + adox r15,rbx + adcx r15,rbx + +DB 0x67 + inc rcx + jnz NEAR $L$sqrx8x_loop + + lea rbp,[64+rbp] + mov rcx,-8 + cmp rbp,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_break + + sub rbx,QWORD[((16+8))+rsp] +DB 0x66 + mov rdx,QWORD[((-64))+rsi] + adcx r8,QWORD[rdi] + adcx r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] +DB 0x67 + sbb rax,rax + xor ebx,ebx + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_break: + xor rbp,rbp + sub rbx,QWORD[((16+8))+rsp] + adcx r8,rbp + mov rcx,QWORD[((24+8))+rsp] + adcx r9,rbp + mov rdx,QWORD[rsi] + adc r10,0 + mov QWORD[rdi],r8 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + cmp rdi,rcx + je NEAR $L$sqrx8x_outer_loop + + mov QWORD[8+rdi],r9 + mov r9,QWORD[8+rcx] + mov QWORD[16+rdi],r10 + mov r10,QWORD[16+rcx] + mov QWORD[24+rdi],r11 + mov r11,QWORD[24+rcx] + mov QWORD[32+rdi],r12 + mov r12,QWORD[32+rcx] + mov QWORD[40+rdi],r13 + mov r13,QWORD[40+rcx] + mov QWORD[48+rdi],r14 + mov r14,QWORD[48+rcx] + mov QWORD[56+rdi],r15 + mov r15,QWORD[56+rcx] + mov rdi,rcx + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_break: + mov 
QWORD[72+rdi],r9 +DB 102,72,15,126,217 + mov QWORD[80+rdi],r10 + mov QWORD[88+rdi],r11 + mov QWORD[96+rdi],r12 + mov QWORD[104+rdi],r13 + mov QWORD[112+rdi],r14 + lea rdi,[((48+8))+rsp] + mov rdx,QWORD[rcx*1+rsi] + + mov r11,QWORD[8+rdi] + xor r10,r10 + mov r9,QWORD[((0+8))+rsp] + adox r11,r11 + mov r12,QWORD[16+rdi] + mov r13,QWORD[24+rdi] + + +ALIGN 32 +$L$sqrx4x_shift_n_add: + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 +DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[40+rdi] + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + mov rdx,QWORD[16+rcx*1+rsi] + mov r12,QWORD[48+rdi] + adox r11,r11 + adcx rbx,r13 + mov r13,QWORD[56+rdi] + mov QWORD[16+rdi],rax + mov QWORD[24+rdi],rbx + + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 + mov rdx,QWORD[24+rcx*1+rsi] + lea rcx,[32+rcx] + mov r10,QWORD[64+rdi] + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[72+rdi] + mov QWORD[32+rdi],rax + mov QWORD[40+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + jrcxz $L$sqrx4x_shift_n_add_break +DB 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adox r11,r11 + adcx rbx,r13 + mov r12,QWORD[80+rdi] + mov r13,QWORD[88+rdi] + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] + nop + jmp NEAR $L$sqrx4x_shift_n_add + +ALIGN 32 +$L$sqrx4x_shift_n_add_break: + adcx rbx,r13 + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] +DB 102,72,15,126,213 +__bn_sqrx8x_reduction: + xor eax,eax + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rcx,[((-64))+r9*1+rbp] + + mov QWORD[((0+8))+rsp],rcx + mov QWORD[((8+8))+rsp],rdi + + lea rdi,[((48+8))+rsp] + jmp NEAR $L$sqrx8x_reduction_loop + +ALIGN 32 +$L$sqrx8x_reduction_loop: + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r8,rdx + imul rdx,rbx + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov 
QWORD[((24+8))+rsp],rax + + lea rdi,[64+rdi] + xor rsi,rsi + mov rcx,-8 + jmp NEAR $L$sqrx8x_reduce + +ALIGN 32 +$L$sqrx8x_reduce: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rax,rbx + adox r8,r9 + + mulx r9,rbx,QWORD[8+rbp] + adcx r8,rbx + adox r9,r10 + + mulx r10,rbx,QWORD[16+rbp] + adcx r9,rbx + adox r10,r11 + + mulx r11,rbx,QWORD[24+rbp] + adcx r10,rbx + adox r11,r12 + +DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + mov rax,rdx + mov rdx,r8 + adcx r11,rbx + adox r12,r13 + + mulx rdx,rbx,QWORD[((32+8))+rsp] + mov rdx,rax + mov QWORD[((64+48+8))+rcx*8+rsp],rax + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,rbx + adcx r14,rax + adox r15,rsi + adcx r15,rsi + +DB 0x67,0x67,0x67 + inc rcx + jnz NEAR $L$sqrx8x_reduce + + mov rax,rsi + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_no_tail + + mov rdx,QWORD[((48+8))+rsp] + add r8,QWORD[rdi] + lea rbp,[64+rbp] + mov rcx,-8 + adcx r9,QWORD[8+rdi] + adcx r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + +DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,QWORD[((72+48+8))+rcx*8+rsp] + adcx r14,rax + adox r15,rsi + mov QWORD[rcx*8+rdi],rbx + mov rbx,r8 + adcx r15,rsi + + inc rcx + jnz NEAR $L$sqrx8x_tail + + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR 
$L$sqrx8x_tail_done + + sub rsi,QWORD[((16+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rbp,[64+rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + sub rcx,8 + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail_done: + xor rax,rax + add r8,QWORD[((24+8))+rsp] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + sub rsi,QWORD[((16+8))+rsp] +$L$sqrx8x_no_tail: + adc r8,QWORD[rdi] +DB 102,72,15,126,217 + adc r9,QWORD[8+rdi] + mov rsi,QWORD[56+rbp] +DB 102,72,15,126,213 + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[64+rcx*1+rdi] + + mov QWORD[rdi],r8 + lea r8,[64+rdi] + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + + lea rdi,[64+rcx*1+rdi] + cmp r8,QWORD[((8+8))+rsp] + jb NEAR $L$sqrx8x_reduction_loop + DB 0F3h,0C3h ;repret + + +ALIGN 32 + +__bn_postx4x_internal: + + mov r12,QWORD[rbp] + mov r10,rcx + mov r9,rcx + neg rax + sar rcx,3+2 + +DB 102,72,15,126,202 +DB 102,72,15,126,206 + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + +ALIGN 16 +$L$sqrx4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqrx4x_sub_entry: + andn r12,r12,rax + lea rbp,[32+rbp] + andn r13,r13,rax + andn r14,r14,rax + andn r15,r15,rax + + neg r8 + adc r12,QWORD[rdi] + adc r13,QWORD[8+rdi] + adc r14,QWORD[16+rdi] + adc r15,QWORD[24+rdi] + mov QWORD[rdx],r12 + lea rdi,[32+rdi] + mov QWORD[8+rdx],r13 + sbb r8,r8 + mov QWORD[16+rdx],r14 + 
mov QWORD[24+rdx],r15 + lea rdx,[32+rdx] + + inc rcx + jnz NEAR $L$sqrx4x_sub + + neg r9 + + DB 0F3h,0C3h ;repret + + +global GFp_bn_scatter5 + +ALIGN 16 +GFp_bn_scatter5: + + cmp edx,0 + jz NEAR $L$scatter_epilogue + lea r8,[r9*8+r8] +$L$scatter: + mov rax,QWORD[rcx] + lea rcx,[8+rcx] + mov QWORD[r8],rax + lea r8,[256+r8] + sub edx,1 + jnz NEAR $L$scatter +$L$scatter_epilogue: + DB 0F3h,0C3h ;repret + + + +global GFp_bn_gather5 + +ALIGN 32 +GFp_bn_gather5: + +$L$SEH_begin_GFp_bn_gather5: + +DB 0x4c,0x8d,0x14,0x24 + +DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + lea rax,[$L$inc] + and rsp,-16 + + movd xmm5,r9d + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r11,[128+r8] + lea rax,[128+rsp] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-128)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-112)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-96)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-80)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-64)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-48)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-32)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-16)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[16+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[32+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[48+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[64+rax],xmm0 + movdqa xmm0,xmm4 
+ + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[80+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[96+rax],xmm2 + movdqa xmm2,xmm4 + movdqa XMMWORD[112+rax],xmm3 + jmp NEAR $L$gather + +ALIGN 32 +$L$gather: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r11] + movdqa xmm1,XMMWORD[((-112))+r11] + movdqa xmm2,XMMWORD[((-96))+r11] + pand xmm0,XMMWORD[((-128))+rax] + movdqa xmm3,XMMWORD[((-80))+r11] + pand xmm1,XMMWORD[((-112))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r11] + movdqa xmm1,XMMWORD[((-48))+r11] + movdqa xmm2,XMMWORD[((-32))+r11] + pand xmm0,XMMWORD[((-64))+rax] + movdqa xmm3,XMMWORD[((-16))+r11] + pand xmm1,XMMWORD[((-48))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r11] + movdqa xmm1,XMMWORD[16+r11] + movdqa xmm2,XMMWORD[32+r11] + pand xmm0,XMMWORD[rax] + movdqa xmm3,XMMWORD[48+r11] + pand xmm1,XMMWORD[16+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r11] + movdqa xmm1,XMMWORD[80+r11] + movdqa xmm2,XMMWORD[96+r11] + pand xmm0,XMMWORD[64+rax] + movdqa xmm3,XMMWORD[112+r11] + pand xmm1,XMMWORD[80+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rax] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + lea r11,[256+r11] + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + movq QWORD[rcx],xmm0 + lea rcx,[8+rcx] + sub edx,1 + jnz NEAR $L$gather + + lea rsp,[r10] + + DB 0F3h,0C3h ;repret +$L$SEH_end_GFp_bn_gather5: + + +ALIGN 64 +$L$inc: + DD 0,0,1,1 + DD 2,2,2,2 +DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 +DB 
99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 +DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79 +DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111 +DB 112,101,110,115,115,108,46,111,114,103,62,0 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +mul_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea r10,[$L$mul_epilogue] + cmp rbx,r10 + ja NEAR $L$body_40 + + mov r10,QWORD[192+r8] + mov rax,QWORD[8+r10*8+rax] + + jmp NEAR $L$common_pop_regs + +$L$body_40: + mov rax,QWORD[40+rax] +$L$common_pop_regs: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD 
$L$SEH_begin_GFp_bn_mul_mont_gather5 wrt ..imagebase + DD $L$SEH_end_GFp_bn_mul_mont_gather5 wrt ..imagebase + DD $L$SEH_info_GFp_bn_mul_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_GFp_bn_power5 wrt ..imagebase + DD $L$SEH_end_GFp_bn_power5 wrt ..imagebase + DD $L$SEH_info_GFp_bn_power5 wrt ..imagebase + + DD $L$SEH_begin_bn_from_mont8x wrt ..imagebase + DD $L$SEH_end_bn_from_mont8x wrt ..imagebase + DD $L$SEH_info_bn_from_mont8x wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_powerx5 wrt ..imagebase + DD $L$SEH_end_bn_powerx5 wrt ..imagebase + DD $L$SEH_info_GFp_bn_powerx5 wrt ..imagebase + DD $L$SEH_begin_GFp_bn_gather5 wrt ..imagebase + DD $L$SEH_end_GFp_bn_gather5 wrt ..imagebase + DD $L$SEH_info_GFp_bn_gather5 wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_GFp_bn_mul_mont_gather5: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mul4x_mont_gather5: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_GFp_bn_power5: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_from_mont8x: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mulx4x_mont_gather5: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body 
wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_GFp_bn_powerx5: +DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_GFp_bn_gather5: +DB 0x01,0x0b,0x03,0x0a +DB 0x0b,0x01,0x21,0x00 +DB 0x04,0xa3,0x00,0x00 +ALIGN 8