diff --git a/ext/arm32-neon-salsa2012-asm/README.md b/ext/arm32-neon-salsa2012-asm/README.md new file mode 100644 index 000000000..54fc6f5fa --- /dev/null +++ b/ext/arm32-neon-salsa2012-asm/README.md @@ -0,0 +1,6 @@ +ARM NEON (32-bit) ASM implementation of Salsa20/12 +====== + +This is from [supercop](http://bench.cr.yp.to/supercop.html) and was originally written by Daniel J. Bernstein. The code is in the public domain, like the rest of Salsa20. It's much faster than the naive implementation. + +It's included automatically in 32-bit Linux ARM builds. It will likely not work on 64-bit ARM without being ported first. Unfortunately, that keeps it out of mobile versions for now, since those are all moving to 64-bit. diff --git a/ext/arm32-neon-salsa2012-asm/salsa2012.h b/ext/arm32-neon-salsa2012-asm/salsa2012.h new file mode 100644 index 000000000..7820a2e69 --- /dev/null +++ b/ext/arm32-neon-salsa2012-asm/salsa2012.h @@ -0,0 +1,15 @@ +#ifndef ZT_SALSA2012_ARM32NEON_ASM +#define ZT_SALSA2012_ARM32NEON_ASM + +#ifdef __cplusplus +extern "C" { +#endif + +// ciphertext buffer, message/NULL, length, nonce (8 bytes), key (32 bytes) +extern int zt_salsa2012_armneon3_xor(unsigned char *c,const unsigned char *m,unsigned long long len,const unsigned char *n,const unsigned char *k); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/ext/arm32-neon-salsa2012-asm/salsa2012.s b/ext/arm32-neon-salsa2012-asm/salsa2012.s new file mode 100644 index 000000000..9e5989cd3 --- /dev/null +++ b/ext/arm32-neon-salsa2012-asm/salsa2012.s @@ -0,0 +1,2231 @@ + +# qhasm: int32 input_0 + +# qhasm: int32 input_1 + +# qhasm: int32 input_2 + +# qhasm: int32 input_3 + +# qhasm: stack32 input_4 + +# qhasm: stack32 input_5 + +# qhasm: stack32 input_6 + +# qhasm: stack32 input_7 + +# qhasm: int32 caller_r4 + +# qhasm: int32 caller_r5 + +# qhasm: int32 caller_r6 + +# qhasm: int32 caller_r7 + +# qhasm: int32 caller_r8 + +# qhasm: int32 caller_r9 + +# qhasm: int32 caller_r10 + +# qhasm: int32 
caller_r11 + +# qhasm: int32 caller_r14 + +# qhasm: reg128 caller_q4 + +# qhasm: reg128 caller_q5 + +# qhasm: reg128 caller_q6 + +# qhasm: reg128 caller_q7 + +# qhasm: startcode +.fpu neon +.text + +# qhasm: constant sigma: +.align 2 +sigma: + +# qhasm: const32 1634760805 +.word 1634760805 + +# qhasm: const32 857760878 +.word 857760878 + +# qhasm: const32 2036477234 +.word 2036477234 + +# qhasm: const32 1797285236 +.word 1797285236 + +# qhasm: int128 abab + +# qhasm: int128 diag0 + +# qhasm: int128 diag1 + +# qhasm: int128 diag2 + +# qhasm: int128 diag3 + +# qhasm: int128 a0 + +# qhasm: int128 a1 + +# qhasm: int128 a2 + +# qhasm: int128 a3 + +# qhasm: int128 b0 + +# qhasm: int128 b1 + +# qhasm: int128 b2 + +# qhasm: int128 b3 + +# qhasm: int128 next_diag0 + +# qhasm: int128 next_diag1 + +# qhasm: int128 next_diag2 + +# qhasm: int128 next_diag3 + +# qhasm: int128 next_a0 + +# qhasm: int128 next_a1 + +# qhasm: int128 next_a2 + +# qhasm: int128 next_a3 + +# qhasm: int128 next_b0 + +# qhasm: int128 next_b1 + +# qhasm: int128 next_b2 + +# qhasm: int128 next_b3 + +# qhasm: int128 x0x5x10x15 + +# qhasm: int128 x12x1x6x11 + +# qhasm: int128 x8x13x2x7 + +# qhasm: int128 x4x9x14x3 + +# qhasm: int128 x0x1x10x11 + +# qhasm: int128 x12x13x6x7 + +# qhasm: int128 x8x9x2x3 + +# qhasm: int128 x4x5x14x15 + +# qhasm: int128 x0x1x2x3 + +# qhasm: int128 x4x5x6x7 + +# qhasm: int128 x8x9x10x11 + +# qhasm: int128 x12x13x14x15 + +# qhasm: int128 m0m1m2m3 + +# qhasm: int128 m4m5m6m7 + +# qhasm: int128 m8m9m10m11 + +# qhasm: int128 m12m13m14m15 + +# qhasm: int128 start0 + +# qhasm: int128 start1 + +# qhasm: int128 start2 + +# qhasm: int128 start3 + +# qhasm: stack128 stack_start3 + +# qhasm: stack128 next_start2 + +# qhasm: stack128 next_start3 + +# qhasm: int128 k0k1k2k3 + +# qhasm: int128 k4k5k6k7 + +# qhasm: int128 k1n1k7k2 + +# qhasm: int128 n2n3n3n2 + +# qhasm: int128 k2k3k6k7 + +# qhasm: int128 nextblock + +# qhasm: stack128 stack_q4 + +# qhasm: stack128 stack_q5 + +# qhasm: stack128 
stack_q6 + +# qhasm: stack128 stack_q7 + +# qhasm: stack32 stack_r4 + +# qhasm: stack128 k2k3k6k7_stack + +# qhasm: stack128 k1n1k7k2_stack + +# qhasm: stack512 tmp + +# qhasm: stack32 savec + +# qhasm: int32 i + +# qhasm: int32 ci + +# qhasm: int32 mi + +# qhasm: enter zt_salsa2012_armneon3_xor +.align 2 +.global _zt_salsa2012_armneon3_xor +.global zt_salsa2012_armneon3_xor +.type _zt_salsa2012_armneon3_xor STT_FUNC +.type zt_salsa2012_armneon3_xor STT_FUNC +_zt_salsa2012_armneon3_xor: +zt_salsa2012_armneon3_xor: +sub sp,sp,#256 + +# qhasm: new stack_q4 + +# qhasm: new stack_q5 + +# qhasm: new stack_q6 + +# qhasm: new stack_q7 + +# qhasm: stack_q4 bot = caller_q4 bot +# asm 1: vstr stack_r4=stack32#2 +# asm 2: str stack_r4=[sp,#68] +str r4,[sp,#68] + +# qhasm: int32 c + +# qhasm: c = input_0 +# asm 1: mov >c=int32#1,c=r0,m=int32#2,m=r1,mlenlow=int32#3,mlenlow=r2,mlenhigh=int32#4,mlenhigh=r3,n=int32#5,n=r4,k=int32#13,k=r12,k0k1k2k3=reg128#1%bot->k0k1k2k3=reg128#1%top},[k0k1k2k3=d0->k0k1k2k3=d1},[k4k5k6k7=reg128#2%bot->k4k5k6k7=reg128#2%top},[k4k5k6k7=d2->k4k5k6k7=d3},[i=int32#13,=sigma +# asm 2: ldr >i=r12,=sigma +ldr r12,=sigma + +# qhasm: start0 = mem128[i] +# asm 1: vld1.8 {>start0=reg128#3%bot->start0=reg128#3%top},[start0=d4->start0=d5},[start1=reg128#4,#0 +# asm 2: vmov.i64 >start1=q3,#0 +vmov.i64 q3,#0 + +# qhasm: start1 bot = mem64[n] +# asm 1: vld1.8 {k2k3k6k7=reg128#6,k2k3k6k7=q5,n2n3n3n2=reg128#1,#0 +# asm 2: vmov.i64 >n2n3n3n2=q0,#0 +vmov.i64 q0,#0 + +# qhasm: unsigneddiag0=reg128#8,diag0=q7,diag1=reg128#9,diag1=q8,start2=reg128#10,start2=q9,nextblock=reg128#11,#0xff +# asm 2: vmov.i64 >nextblock=q10,#0xff +vmov.i64 q10,#0xff + +# qhasm: 4x nextblock unsigned>>= 7 +# asm 1: vshr.u32 >nextblock=reg128#11,nextblock=q10,n2n3n3n2=reg128#1,n2n3n3n2=q0,n2n3n3n2=reg128#1,n2n3n3n2=q0,next_diag0=reg128#2,next_diag0=q1,next_diag1=reg128#5,next_diag1=q4,i=int32#5,=12 +# asm 2: ldr >i=r4,=12 +ldr r4,=12 + +# qhasm: mainloop2: +._mainloop2: + +# qhasm: 4x a0 = diag1 
+ diag0 +# asm 1: vadd.i32 >a0=reg128#11,a0=q10,next_a0=reg128#14,next_a0=q13,b0=reg128#15,b0=q14,next_b0=reg128#16,next_b0=q15,> 25 +# asm 1: vsri.i32 > 25 +# asm 1: vsri.i32 diag3=reg128#7,diag3=q6,next_diag3=reg128#11,next_diag3=q10,a1=reg128#13,a1=q12,next_a1=reg128#14,next_a1=q13,b1=reg128#15,b1=q14,next_b1=reg128#16,next_b1=q15,> 23 +# asm 1: vsri.i32 > 23 +# asm 1: vsri.i32 diag2=reg128#6,diag2=q5,next_diag2=reg128#12,next_diag2=q11,a2=reg128#13,a2=q12,diag3=reg128#7,diag3=q6,next_a2=reg128#14,next_a2=q13,b2=reg128#15,b2=q14,next_diag3=reg128#11,next_diag3=q10,next_b2=reg128#16,next_b2=q15,> 19 +# asm 1: vsri.i32 > 19 +# asm 1: vsri.i32 diag1=reg128#9,diag1=q8,next_diag1=reg128#5,next_diag1=q4,a3=reg128#13,a3=q12,next_a3=reg128#14,next_a3=q13,b3=reg128#15,b3=q14,next_b3=reg128#16,next_b3=q15,> 14 +# asm 1: vsri.i32 diag1=reg128#9,diag1=q8,> 14 +# asm 1: vsri.i32 diag0=reg128#8,diag0=q7,next_diag1=reg128#5,next_diag1=q4,next_diag0=reg128#2,next_diag0=q1,a0=reg128#13,a0=q12,next_a0=reg128#14,next_a0=q13,b0=reg128#15,b0=q14,next_b0=reg128#16,next_b0=q15,> 25 +# asm 1: vsri.i32 > 25 +# asm 1: vsri.i32 diag1=reg128#9,diag1=q8,next_diag1=reg128#5,next_diag1=q4,a1=reg128#13,a1=q12,next_a1=reg128#14,next_a1=q13,b1=reg128#15,b1=q14,next_b1=reg128#16,next_b1=q15,> 23 +# asm 1: vsri.i32 ? 
i -= 2 +# asm 1: subs > 23 +# asm 1: vsri.i32 diag2=reg128#6,diag2=q5,next_diag2=reg128#12,next_diag2=q11,a2=reg128#13,a2=q12,diag1=reg128#9,diag1=q8,next_a2=reg128#14,next_a2=q13,b2=reg128#15,b2=q14,next_diag1=reg128#5,next_diag1=q4,next_b2=reg128#16,next_b2=q15,> 19 +# asm 1: vsri.i32 > 19 +# asm 1: vsri.i32 diag3=reg128#7,diag3=q6,next_diag3=reg128#11,next_diag3=q10,a3=reg128#13,a3=q12,next_a3=reg128#14,next_a3=q13,b3=reg128#15,b3=q14,next_b3=reg128#16,next_b3=q15,> 14 +# asm 1: vsri.i32 diag3=reg128#7,diag3=q6,> 14 +# asm 1: vsri.i32 diag0=reg128#8,diag0=q7,next_diag3=reg128#13,next_diag3=q12,next_diag0=reg128#2,next_diag0=q1, +bhi ._mainloop2 + +# qhasm: 2x abab = 0xffffffff +# asm 1: vmov.i64 >abab=reg128#11,#0xffffffff +# asm 2: vmov.i64 >abab=q10,#0xffffffff +vmov.i64 q10,#0xffffffff + +# qhasm: new x4x9x14x3 + +# qhasm: x4x9x14x3 bot = stack_start3 bot +# asm 1: vldr x0x5x10x15=reg128#8,x0x5x10x15=q7,x12x1x6x11=reg128#9,x12x1x6x11=q8,x8x13x2x7=reg128#6,x8x13x2x7=q5,x4x9x14x3=reg128#7,x4x9x14x3=q6,x0x1x10x11=reg128#10,x0x1x10x11=q9,x12x13x6x7=reg128#14,x12x13x6x7=q13,x8x9x2x3=reg128#15,x8x9x2x3=q14,x4x5x14x15=reg128#16,x4x5x14x15=q15,x0x1x2x3=reg128#6,x0x1x2x3=q5,x4x5x6x7=reg128#7,x4x5x6x7=q6,x8x9x10x11=reg128#8,x8x9x10x11=q7,x12x13x14x15=reg128#9,x12x13x14x15=q8,m0m1m2m3=reg128#10%bot->m0m1m2m3=reg128#10%top},[m0m1m2m3=d18->m0m1m2m3=d19},[m4m5m6m7=reg128#14%bot->m4m5m6m7=reg128#14%top},[m4m5m6m7=d26->m4m5m6m7=d27},[m8m9m10m11=reg128#15%bot->m8m9m10m11=reg128#15%top},[m8m9m10m11=d28->m8m9m10m11=d29},[m12m13m14m15=reg128#16%bot->m12m13m14m15=reg128#16%top},[m12m13m14m15=d30->m12m13m14m15=d31},[x0x1x2x3=reg128#6,x0x1x2x3=q5,x4x5x6x7=reg128#7,x4x5x6x7=q6,x8x9x10x11=reg128#8,x8x9x10x11=q7,x12x13x14x15=reg128#9,x12x13x14x15=q8,x0x5x10x15=reg128#2,x0x5x10x15=q1,x12x1x6x11=reg128#5,x12x1x6x11=q4,x8x13x2x7=reg128#6,x8x13x2x7=q5,x4x9x14x3=reg128#7,x4x9x14x3=q6,x0x1x10x11=reg128#8,x0x1x10x11=q7,x12x13x6x7=reg128#9,x12x13x6x7=q8,x8x9x2x3=reg128#10,x8x9x2x3=q9,x4x5x14x1
5=reg128#12,x4x5x14x15=q11,x0x1x2x3=reg128#2,x0x1x2x3=q1,x4x5x6x7=reg128#5,x4x5x6x7=q4,x8x9x10x11=reg128#6,x8x9x10x11=q5,x12x13x14x15=reg128#7,x12x13x14x15=q6,m0m1m2m3=reg128#8%bot->m0m1m2m3=reg128#8%top},[m0m1m2m3=d14->m0m1m2m3=d15},[m4m5m6m7=reg128#9%bot->m4m5m6m7=reg128#9%top},[m4m5m6m7=d16->m4m5m6m7=d17},[m8m9m10m11=reg128#10%bot->m8m9m10m11=reg128#10%top},[m8m9m10m11=d18->m8m9m10m11=d19},[m12m13m14m15=reg128#11%bot->m12m13m14m15=reg128#11%top},[m12m13m14m15=d20->m12m13m14m15=d21},[x0x1x2x3=reg128#2,x0x1x2x3=q1,x4x5x6x7=reg128#5,x4x5x6x7=q4,x8x9x10x11=reg128#6,x8x9x10x11=q5,x12x13x14x15=reg128#7,x12x13x14x15=q6,? mlenhigh - 0 +# asm 1: cmp +bhi ._mlenatleast128 + +# qhasm: =? mlenlow - 0 +# asm 1: cmp savec=stack32#1 +# asm 2: str savec=[sp,#64] +str r0,[sp,#64] + +# qhasm: c = &tmp +# asm 1: lea >c=int32#1,c=r0,i=int32#4,=0 +# asm 2: ldr >i=r3,=0 +ldr r3,=0 + +# qhasm: mcopy: +._mcopy: + +# qhasm: mi = mem8[m + 0] +# asm 1: ldrb >mi=int32#5,[mi=r4,[mi=int32#2,=0 +# asm 2: ldr >mi=r1,=0 +ldr r1,=0 + +# qhasm: pad: +._pad: + +# qhasm: mem8[c + 0] = mi +# asm 1: strb m=int32#2,m=r1,diag0=reg128#2,diag0=q1,diag1=reg128#5,diag1=q4,diag2=reg128#8,diag2=q7,diag3=reg128#9,diag3=q8,nextblock=reg128#10,#0xff +# asm 2: vmov.i64 >nextblock=q9,#0xff +vmov.i64 q9,#0xff + +# qhasm: 4x nextblock unsigned>>= 7 +# asm 1: vshr.u32 >nextblock=reg128#10,nextblock=q9,n2n3n3n2=reg128#1,n2n3n3n2=q0,i=int32#4,=12 +# asm 2: ldr >i=r3,=12 +ldr r3,=12 + +# qhasm: mainloop1: +._mainloop1: + +# qhasm: 4x a0 = diag1 + diag0 +# asm 1: vadd.i32 >a0=reg128#10,a0=q9,b0=reg128#11,b0=q10,> 25 +# asm 1: vsri.i32 diag3=reg128#9,diag3=q8,a1=reg128#10,a1=q9,b1=reg128#11,b1=q10,> 23 +# asm 1: vsri.i32 diag2=reg128#8,diag2=q7,a2=reg128#10,a2=q9,diag3=reg128#9,diag3=q8,b2=reg128#11,b2=q10,> 19 +# asm 1: vsri.i32 diag1=reg128#5,diag1=q4,a3=reg128#10,a3=q9,b3=reg128#11,b3=q10,> 14 +# asm 1: vsri.i32 diag1=reg128#5,diag1=q4,diag0=reg128#2,diag0=q1,a0=reg128#10,a0=q9,b0=reg128#11,b0=q10,> 25 +# asm 1: 
vsri.i32 diag1=reg128#5,diag1=q4,a1=reg128#10,a1=q9,b1=reg128#11,b1=q10,> 23 +# asm 1: vsri.i32 ? i -= 2 +# asm 1: subs diag2=reg128#8,diag2=q7,a2=reg128#10,a2=q9,diag1=reg128#5,diag1=q4,b2=reg128#11,b2=q10,> 19 +# asm 1: vsri.i32 diag3=reg128#9,diag3=q8,a3=reg128#10,a3=q9,b3=reg128#11,b3=q10,> 14 +# asm 1: vsri.i32 diag3=reg128#9,diag3=q8,diag0=reg128#2,diag0=q1, +bhi ._mainloop1 + +# qhasm: 2x abab = 0xffffffff +# asm 1: vmov.i64 >abab=reg128#10,#0xffffffff +# asm 2: vmov.i64 >abab=q9,#0xffffffff +vmov.i64 q9,#0xffffffff + +# qhasm: 4x x0x5x10x15 = diag0 + start0 +# asm 1: vadd.i32 >x0x5x10x15=reg128#2,x0x5x10x15=q1,x12x1x6x11=reg128#5,x12x1x6x11=q4,x8x13x2x7=reg128#6,x8x13x2x7=q5,x4x9x14x3=reg128#7,x4x9x14x3=q6,x0x1x10x11=reg128#8,x0x1x10x11=q7,x12x13x6x7=reg128#9,x12x13x6x7=q8,x8x9x2x3=reg128#11,x8x9x2x3=q10,x4x5x14x15=reg128#12,x4x5x14x15=q11,x0x1x2x3=reg128#2,x0x1x2x3=q1,x4x5x6x7=reg128#5,x4x5x6x7=q4,x8x9x10x11=reg128#6,x8x9x10x11=q5,x12x13x14x15=reg128#7,x12x13x14x15=q6,m0m1m2m3=reg128#8%bot->m0m1m2m3=reg128#8%top},[m0m1m2m3=d14->m0m1m2m3=d15},[m4m5m6m7=reg128#9%bot->m4m5m6m7=reg128#9%top},[m4m5m6m7=d16->m4m5m6m7=d17},[m8m9m10m11=reg128#10%bot->m8m9m10m11=reg128#10%top},[m8m9m10m11=d18->m8m9m10m11=d19},[m12m13m14m15=reg128#11%bot->m12m13m14m15=reg128#11%top},[m12m13m14m15=d20->m12m13m14m15=d21},[x0x1x2x3=reg128#2,x0x1x2x3=q1,x4x5x6x7=reg128#5,x4x5x6x7=q4,x8x9x10x11=reg128#6,x8x9x10x11=q5,x12x13x14x15=reg128#7,x12x13x14x15=q6,i=int32#4,=0 +# asm 2: ldr >i=r3,=0 +ldr r3,=0 + +# qhasm: m = c - 64 +# asm 1: sub >m=int32#2,m=r1,c=int32#1,c=r0,ci=int32#5,[ci=r4,[? 
mlenlow -= 64 +# asm 1: subs +bhi ._mlenatleast1 + +# qhasm: done: +._done: + +# qhasm: new caller_r4 + +# qhasm: caller_r4 = stack_r4 +# asm 1: ldr >caller_r4=int32#5,caller_r4=r4,result=int32#1,=0 +# asm 2: ldr >result=r0,=0 +ldr r0,=0 + +# qhasm: return result +add sp,sp,#256 +bx lr diff --git a/ext/x64-salsa2012-asm/salsa2012.h b/ext/x64-salsa2012-asm/salsa2012.h index d8c2e48cd..73e375ebe 100644 --- a/ext/x64-salsa2012-asm/salsa2012.h +++ b/ext/x64-salsa2012-asm/salsa2012.h @@ -1,3 +1,6 @@ +#ifndef ZT_X64_SALSA2012_ASM +#define ZT_X64_SALSA2012_ASM + #ifdef __cplusplus extern "C" { #endif @@ -9,3 +12,5 @@ extern int zt_salsa2012_amd64_xmm6(unsigned char *, unsigned long long, const un #ifdef __cplusplus } #endif + +#endif