// Origin: ZeroTierOne/zeroidc/vendor/ring/pregenerated/ecp_nistz256-armv4-ios32.S
//
// Repository viewer metadata captured with this copy: 1115 lines, 25 KiB,
// ArmAsm (viewer controls: Raw / Normal View / History).

// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
// Provide a fallback so the __has_feature test below also works with
// compilers that do not define that macro.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
// When building with MemorySanitizer, force OPENSSL_NO_ASM so that the rest
// of this file is preprocessed away.
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
// Everything from here to the matching #endif at the end of the file is
// assembled only when OPENSSL_NO_ASM is not defined.
#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>
.text
// Thumb-2 builds use unified syntax and Thumb encoding; all other builds
// emit 32-bit ARM code.
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
// NUL-terminated ASCII identification string:
// "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 6
@ __ecp_nistz256_mul_by_2: internal helper, r0[0:7] = 2*r1[0:7] followed
@ by a conditional subtraction of the modulus.
@ In:    r0 = destination (8 words), r1 = source (8 words).
@ Work:  r4-r11 hold the eight words (least significant in r4); r3
@        receives the carry out of the doubling.
@ Exit:  branches to Lreduce_by_sub, which stores the result through r0
@        and returns with "mov pc,lr". lr and sp are not modified here.
#ifdef __thumb2__
.thumb_func __ecp_nistz256_mul_by_2
#endif
.align 4
__ecp_nistz256_mul_by_2:
ldr r4,[r1,#0]
ldr r5,[r1,#4]
ldr r6,[r1,#8]
adds r4,r4,r4 @ a[0:7]+=a[0:7], i.e. add with itself
ldr r7,[r1,#12] @ loads are interleaved with the carry chain; ldr leaves flags alone
adcs r5,r5,r5
ldr r8,[r1,#16]
adcs r6,r6,r6
ldr r9,[r1,#20]
adcs r7,r7,r7
ldr r10,[r1,#24]
adcs r8,r8,r8
ldr r11,[r1,#28]
adcs r9,r9,r9
adcs r10,r10,r10
mov r3,#0 @ mov without the s suffix keeps the carry flag intact
adcs r11,r11,r11
adc r3,r3,#0 @ r3 = carry out of the 256-bit doubling (0 or 1)
b Lreduce_by_sub
@ void GFp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@ const BN_ULONG r2[8]);
@
@ Exported entry point. It saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_add routine with r0/r1/r2 passed through unchanged, then
@ restores the saved registers and returns.
.globl _GFp_nistz256_add
.private_extern _GFp_nistz256_add
#ifdef __thumb2__
.thumb_func _GFp_nistz256_add
#endif
.align 4
_GFp_nistz256_add:
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
bl __ecp_nistz256_add
@ Return either by loading the saved lr straight into pc, or (second
@ branch of the conditional) by restoring lr and using bx.
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
@ __ecp_nistz256_add: internal helper, r0[0:7] = r1[0:7] + r2[0:7]
@ followed by a conditional subtraction of the modulus.
@ In:    r0 = destination, r1 = first operand, r2 = second operand
@        (8 words each).
@ Work:  r4-r11 hold the running sum (least significant word in r4);
@        r3, r12, r14 and r1 are used as scratch for the words of the
@        second operand, so the pointer in r1 is lost and lr (r14) is
@        pushed on entry and popped before the reduction.
@ Exit:  falls through into Lreduce_by_sub with the carry out in r3.
#ifdef __thumb2__
.thumb_func __ecp_nistz256_add
#endif
.align 4
__ecp_nistz256_add:
str lr,[sp,#-4]! @ push lr
ldr r4,[r1,#0]
ldr r5,[r1,#4]
ldr r6,[r1,#8]
ldr r7,[r1,#12]
ldr r8,[r1,#16]
ldr r3,[r2,#0]
ldr r9,[r1,#20]
ldr r12,[r2,#4]
ldr r10,[r1,#24]
ldr r14,[r2,#8]
ldr r11,[r1,#28] @ last read through r1 ...
ldr r1,[r2,#12] @ ... so r1 can now be reused for b[3]
adds r4,r4,r3
ldr r3,[r2,#16]
adcs r5,r5,r12
ldr r12,[r2,#20]
adcs r6,r6,r14
ldr r14,[r2,#24]
adcs r7,r7,r1
ldr r1,[r2,#28]
adcs r8,r8,r3
adcs r9,r9,r12
adcs r10,r10,r14
mov r3,#0 @ flags are preserved by mov
adcs r11,r11,r1
adc r3,r3,#0 @ r3 = carry out of the 256-bit addition (0 or 1)
ldr lr,[sp],#4 @ pop lr
@ Lreduce_by_sub: shared tail, also reached by branch from
@ __ecp_nistz256_mul_by_2 and __ecp_nistz256_mul_by_3.
@ In:    r3:r11..r4 = 257-bit value (r3 is the top bit), r0 = destination.
@ Out:   reduced value stored to r0[0:7] and left in r4-r11.
@ Returns with "mov pc,lr".
Lreduce_by_sub:
@ if a+b >= modulus, subtract modulus.
@
@ But since comparison implies subtraction, we subtract
@ modulus and then add it back if subtraction borrowed.
@ The immediates below are the modulus words, least significant first:
@ -1, -1, -1, 0, 0, 0, 1, -1.
subs r4,r4,#-1
sbcs r5,r5,#-1
sbcs r6,r6,#-1
sbcs r7,r7,#0
sbcs r8,r8,#0
sbcs r9,r9,#0
sbcs r10,r10,#1
sbcs r11,r11,#-1
sbc r3,r3,#0 @ borrow propagates into the carry word held in r3
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ using value of borrow as a whole or extracting single bit.
@ Follow r3 register...
adds r4,r4,r3 @ add synthesized modulus
adcs r5,r5,r3
str r4,[r0,#0] @ stores are interleaved; str does not change flags
adcs r6,r6,r3
str r5,[r0,#4]
adcs r7,r7,#0
str r6,[r0,#8]
adcs r8,r8,#0
str r7,[r0,#12]
adcs r9,r9,#0
str r8,[r0,#16]
adcs r10,r10,r3,lsr#31 @ top bit of r3 supplies the single 1 word
str r9,[r0,#20]
adcs r11,r11,r3
str r10,[r0,#24]
str r11,[r0,#28]
mov pc,lr
@ __ecp_nistz256_mul_by_3: internal helper, r0[0:7] = 3*r1[0:7], reduced.
@ In:    r0 = destination, r1 = source (8 words each).
@ Work:  r4-r11 hold the running value; r2, r3, r12 and r14 are scratch,
@        and r1 itself is overwritten by the final load. lr (r14) is
@        pushed on entry and popped before the tail branch.
@ Exit:  branches to Lreduce_by_sub for the last reduction and the stores.
#ifdef __thumb2__
.thumb_func __ecp_nistz256_mul_by_3
#endif
.align 4
__ecp_nistz256_mul_by_3:
str lr,[sp,#-4]! @ push lr
@ As multiplication by 3 is performed as 2*n+n, below are inline
@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
@ corresponding subroutines for details.
ldr r4,[r1,#0]
ldr r5,[r1,#4]
ldr r6,[r1,#8]
adds r4,r4,r4 @ a[0:7]+=a[0:7]
ldr r7,[r1,#12]
adcs r5,r5,r5
ldr r8,[r1,#16]
adcs r6,r6,r6
ldr r9,[r1,#20]
adcs r7,r7,r7
ldr r10,[r1,#24]
adcs r8,r8,r8
ldr r11,[r1,#28]
adcs r9,r9,r9
adcs r10,r10,r10
mov r3,#0
adcs r11,r11,r11
adc r3,r3,#0 @ r3 = carry out of the doubling
subs r4,r4,#-1 @ Lreduce_by_sub but without stores
sbcs r5,r5,#-1
sbcs r6,r6,#-1
sbcs r7,r7,#0
sbcs r8,r8,#0
sbcs r9,r9,#0
sbcs r10,r10,#1
sbcs r11,r11,#-1
sbc r3,r3,#0
adds r4,r4,r3 @ add synthesized modulus
adcs r5,r5,r3
adcs r6,r6,r3
adcs r7,r7,#0
adcs r8,r8,#0
ldr r2,[r1,#0] @ start reloading a[0:7] for the "+n" step
adcs r9,r9,#0
ldr r12,[r1,#4]
adcs r10,r10,r3,lsr#31
ldr r14,[r1,#8]
adc r11,r11,r3 @ no s suffix: the next adds starts a fresh carry chain
ldr r3,[r1,#12]
adds r4,r4,r2 @ 2*a[0:7]+=a[0:7]
ldr r2,[r1,#16]
adcs r5,r5,r12
ldr r12,[r1,#20]
adcs r6,r6,r14
ldr r14,[r1,#24]
adcs r7,r7,r3
ldr r1,[r1,#28] @ final load overwrites the source pointer
adcs r8,r8,r2
adcs r9,r9,r12
adcs r10,r10,r14
mov r3,#0
adcs r11,r11,r1
adc r3,r3,#0 @ r3 = carry out, as expected by Lreduce_by_sub
ldr lr,[sp],#4 @ pop lr
b Lreduce_by_sub
@ __ecp_nistz256_div_by_2: internal helper, halves an 8-word value,
@ adding the modulus first when the value is odd.
@ In:    r0 = destination, r1 = source (8 words each).
@ Work:  r4-r11 hold the words; r3 carries the replicated low bit used
@        to synthesize the modulus; r2 receives the carry out of the
@        conditional addition and becomes the new top bit.
@ Out:   result stored to r0[0:7] and left in r4-r11.
@ Returns with "mov pc,lr"; lr and sp are not modified.
#ifdef __thumb2__
.thumb_func __ecp_nistz256_div_by_2
#endif
.align 4
__ecp_nistz256_div_by_2:
@ ret = (a is odd ? a+mod : a) >> 1
ldr r4,[r1,#0]
ldr r5,[r1,#4]
ldr r6,[r1,#8]
mov r3,r4,lsl#31 @ place least significant bit to most
@ significant position, now arithmetic
@ right shift by 31 will produce -1 or
@ 0, while logical right shift 1 or 0,
@ this is how modulus is conditionally
@ synthesized in this case...
ldr r7,[r1,#12]
adds r4,r4,r3,asr#31
ldr r8,[r1,#16]
adcs r5,r5,r3,asr#31
ldr r9,[r1,#20]
adcs r6,r6,r3,asr#31
ldr r10,[r1,#24]
adcs r7,r7,#0
ldr r11,[r1,#28]
adcs r8,r8,#0
mov r4,r4,lsr#1 @ a[0:7]>>=1, we can start early
@ because it doesn't affect flags
adcs r9,r9,#0
orr r4,r4,r5,lsl#31 @ bring bit 0 of the next word into bit 31
adcs r10,r10,r3,lsr#31
mov r2,#0
adcs r11,r11,r3,asr#31
mov r5,r5,lsr#1
adc r2,r2,#0 @ top-most carry bit from addition
orr r5,r5,r6,lsl#31
mov r6,r6,lsr#1
str r4,[r0,#0]
orr r6,r6,r7,lsl#31
mov r7,r7,lsr#1
str r5,[r0,#4]
orr r7,r7,r8,lsl#31
mov r8,r8,lsr#1
str r6,[r0,#8]
orr r8,r8,r9,lsl#31
mov r9,r9,lsr#1
str r7,[r0,#12]
orr r9,r9,r10,lsl#31
mov r10,r10,lsr#1
str r8,[r0,#16]
orr r10,r10,r11,lsl#31
mov r11,r11,lsr#1
str r9,[r0,#20]
orr r11,r11,r2,lsl#31 @ don't forget the top-most carry bit
str r10,[r0,#24]
str r11,[r0,#28]
mov pc,lr
@ __ecp_nistz256_sub: internal helper, r0[0:7] = r1[0:7] - r2[0:7],
@ adding the modulus back when the subtraction borrows.
@ In:    r0 = destination, r1 = minuend, r2 = subtrahend (8 words each).
@ Work:  r4-r11 hold the running difference; r3, r12, r14 and r1 are
@        scratch for the subtrahend words, so the pointer in r1 is lost
@        and lr (r14) is pushed on entry and popped before the tail.
@ Exit:  falls through into Lreduce_by_add with the borrow mask in r3.
#ifdef __thumb2__
.thumb_func __ecp_nistz256_sub
#endif
.align 4
__ecp_nistz256_sub:
str lr,[sp,#-4]! @ push lr
ldr r4,[r1,#0]
ldr r5,[r1,#4]
ldr r6,[r1,#8]
ldr r7,[r1,#12]
ldr r8,[r1,#16]
ldr r3,[r2,#0]
ldr r9,[r1,#20]
ldr r12,[r2,#4]
ldr r10,[r1,#24]
ldr r14,[r2,#8]
ldr r11,[r1,#28] @ last read through r1 ...
ldr r1,[r2,#12] @ ... so r1 can now be reused for b[3]
subs r4,r4,r3
ldr r3,[r2,#16]
sbcs r5,r5,r12
ldr r12,[r2,#20]
sbcs r6,r6,r14
ldr r14,[r2,#24]
sbcs r7,r7,r1
ldr r1,[r2,#28]
sbcs r8,r8,r3
sbcs r9,r9,r12
sbcs r10,r10,r14
sbcs r11,r11,r1
sbc r3,r3,r3 @ broadcast borrow bit
ldr lr,[sp],#4 @ pop lr
@ Lreduce_by_add: shared tail, also reached by branch from
@ __ecp_nistz256_neg.
@ In:    r4-r11 = 256-bit difference, r3 = 0 or all-ones borrow mask,
@        r0 = destination.
@ Out:   corrected value stored to r0[0:7] and left in r4-r11.
@ Returns with "mov pc,lr".
Lreduce_by_add:
@ if a-b borrows, add modulus.
@
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ broadcasting borrow bit to a register, r3, and using it as
@ a whole or extracting single bit.
adds r4,r4,r3 @ add synthesized modulus
adcs r5,r5,r3
str r4,[r0,#0]
adcs r6,r6,r3
str r5,[r0,#4]
adcs r7,r7,#0
str r6,[r0,#8]
adcs r8,r8,#0
str r7,[r0,#12]
adcs r9,r9,#0
str r8,[r0,#16]
adcs r10,r10,r3,lsr#31 @ top bit of the mask supplies the single 1 word
str r9,[r0,#20]
adcs r11,r11,r3
str r10,[r0,#24]
str r11,[r0,#28]
mov pc,lr
@ void GFp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
@
@ Exported entry point. It saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_neg routine with r0/r1 passed through unchanged, then
@ restores the saved registers and returns.
.globl _GFp_nistz256_neg
.private_extern _GFp_nistz256_neg
#ifdef __thumb2__
.thumb_func _GFp_nistz256_neg
#endif
.align 4
_GFp_nistz256_neg:
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
bl __ecp_nistz256_neg
@ Return either by loading the saved lr straight into pc, or (second
@ branch of the conditional) by restoring lr and using bx.
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
@ __ecp_nistz256_neg: internal helper, r0[0:7] = 0 - r1[0:7], adding the
@ modulus back when the subtraction borrows.
@ In:    r0 = destination, r1 = source (8 words each).
@ Work:  r3 is zeroed and used as the minuend for every word, then turned
@        into the borrow mask; r4-r11 hold the difference.
@ Exit:  branches to Lreduce_by_add, which stores the result and returns
@        with "mov pc,lr". lr and sp are not modified here.
#ifdef __thumb2__
.thumb_func __ecp_nistz256_neg
#endif
.align 4
__ecp_nistz256_neg:
ldr r4,[r1,#0]
eor r3,r3,r3 @ r3 = 0
ldr r5,[r1,#4]
ldr r6,[r1,#8]
subs r4,r3,r4 @ 0 - a[0], starts the borrow chain
ldr r7,[r1,#12]
sbcs r5,r3,r5
ldr r8,[r1,#16]
sbcs r6,r3,r6
ldr r9,[r1,#20]
sbcs r7,r3,r7
ldr r10,[r1,#24]
sbcs r8,r3,r8
ldr r11,[r1,#28]
sbcs r9,r3,r9
sbcs r10,r3,r10
sbcs r11,r3,r11
sbc r3,r3,r3 @ r3 = 0 or all ones, depending on the final borrow
b Lreduce_by_add
@ void GFp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@ const BN_ULONG r2[8]);
@
@ Exported entry point. It saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_mul_mont routine with r0/r1/r2 passed through unchanged,
@ then restores the saved registers and returns.
.globl _GFp_nistz256_mul_mont
.private_extern _GFp_nistz256_mul_mont
#ifdef __thumb2__
.thumb_func _GFp_nistz256_mul_mont
#endif
.align 4
_GFp_nistz256_mul_mont:
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
bl __ecp_nistz256_mul_mont
@ Return either by loading the saved lr straight into pc, or (second
@ branch of the conditional) by restoring lr and using bx.
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
@ __ecp_nistz256_mul_mont: internal helper, multiplies the 8-word values
@ at r1 and r2 with a word-by-word reduction interleaved between the
@ partial products, and writes the 8-word result to r0.
@ In:    r0 = destination, r1 = a, r2 = b (8 words each).
@ Out:   result stored to r0[0:7] and also left in registers, least
@        significant word first: r11, r3, r4, r5, r6, r7, r8, r9.
@        r0 still holds the destination pointer on return.
@ Stack: 48 bytes while running, released before returning:
@          [sp,#0..#28]  copy of a[0:7]
@          [sp,#32]      saved r0 (destination pointer)
@          [sp,#36]      saved r1; the slot is reused to spill the
@                        overflow word between steps
@          [sp,#40]      saved r2 (b pointer)
@          [sp,#44]      saved lr
@ Work:  r0, r1, r12 and r14 are scratch for loads and high product
@        halves; r2 holds the current b[i]. The accumulator occupies a
@        window of registers that rotates by one on every step: the
@        register that held the lowest word is freed by the reduction and
@        becomes the top word of the next step.
@ Each "multiplication-less reduction" adds the lowest word r[0] into
@ words 3, 6 and 8 and subtracts it from word 7. That is the effect of
@ adding r[0] times a modulus of the form
@ 2^256 - 2^224 + 2^192 + 2^96 - 1, which cancels word 0.
@ Returns with "mov pc,lr".
#ifdef __thumb2__
.thumb_func __ecp_nistz256_mul_mont
#endif
.align 4
__ecp_nistz256_mul_mont:
stmdb sp!,{r0,r1,r2,lr} @ make a copy of arguments too
ldr r2,[r2,#0] @ b[0]
ldmia r1,{r4,r5,r6,r7,r8,r9,r10,r11}
umull r3,r14,r4,r2 @ r[0]=a[0]*b[0]
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy a[0-7] to stack, so
@ that it can be addressed
@ without spending register
@ on address
umull r4,r0,r5,r2 @ r[1]=a[1]*b[0]
umull r5,r1,r6,r2
adds r4,r4,r14 @ accumulate high part of mult
umull r6,r12,r7,r2
adcs r5,r5,r0
umull r7,r14,r8,r2
adcs r6,r6,r1
umull r8,r0,r9,r2
adcs r7,r7,r12
umull r9,r1,r10,r2
adcs r8,r8,r14
umull r10,r12,r11,r2
adcs r9,r9,r0
adcs r10,r10,r1
eor r14,r14,r14 @ first overflow bit is zero
adc r11,r12,#0
@ multiplication-less reduction 1
adds r6,r6,r3 @ r[3]+=r[0]
ldr r2,[sp,#40] @ restore b_ptr
adcs r7,r7,#0 @ r[4]+=0
adcs r8,r8,#0 @ r[5]+=0
adcs r9,r9,r3 @ r[6]+=r[0]
ldr r1,[sp,#0] @ load a[0]
adcs r10,r10,#0 @ r[7]+=0
ldr r2,[r2,#4*1] @ load b[i]
adcs r11,r11,r3 @ r[8]+=r[0]
eor r0,r0,r0
adc r14,r14,#0 @ overflow bit
subs r10,r10,r3 @ r[7]-=r[0]
ldr r12,[sp,#4] @ a[1]
sbcs r11,r11,#0 @ r[8]-=0
umlal r4,r0,r1,r2 @ "r[0]"+=a[0]*b[i]
eor r1,r1,r1
sbc r3,r14,#0 @ overflow bit, keep in mind
@ that net result is
@ addition of a value which
@ makes underflow impossible
ldr r14,[sp,#8] @ a[2]
umlal r5,r1,r12,r2 @ "r[1]"+=a[1]*b[i]
str r3,[sp,#36] @ temporarily offload overflow
eor r12,r12,r12
ldr r3,[sp,#12] @ a[3]; r3 is free, its value was just spilled
umlal r6,r12,r14,r2 @ "r[2]"+=a[2]*b[i]
eor r14,r14,r14
adds r5,r5,r0 @ accumulate high part of mult
ldr r0,[sp,#16] @ a[4]
umlal r7,r14,r3,r2 @ "r[3]"+=a[3]*b[i]
eor r3,r3,r3
adcs r6,r6,r1
ldr r1,[sp,#20] @ a[5]
umlal r8,r3,r0,r2 @ "r[4]"+=a[4]*b[i]
eor r0,r0,r0
adcs r7,r7,r12
ldr r12,[sp,#24] @ a[6]
umlal r9,r0,r1,r2 @ "r[5]"+=a[5]*b[i]
eor r1,r1,r1
adcs r8,r8,r14
ldr r14,[sp,#28] @ a[7]
umlal r10,r1,r12,r2 @ "r[6]"+=a[6]*b[i]
eor r12,r12,r12
adcs r9,r9,r3
ldr r3,[sp,#36] @ restore overflow bit
umlal r11,r12,r14,r2 @ "r[7]"+=a[7]*b[i]
eor r14,r14,r14
adcs r10,r10,r0
adcs r11,r11,r1
adcs r3,r3,r12
adc r14,r14,#0 @ new overflow bit
@ multiplication-less reduction 2
adds r7,r7,r4 @ r[3]+=r[0]
ldr r2,[sp,#40] @ restore b_ptr
adcs r8,r8,#0 @ r[4]+=0
adcs r9,r9,#0 @ r[5]+=0
adcs r10,r10,r4 @ r[6]+=r[0]
ldr r1,[sp,#0] @ load a[0]
adcs r11,r11,#0 @ r[7]+=0
ldr r2,[r2,#4*2] @ load b[i]
adcs r3,r3,r4 @ r[8]+=r[0]
eor r0,r0,r0
adc r14,r14,#0 @ overflow bit
subs r11,r11,r4 @ r[7]-=r[0]
ldr r12,[sp,#4] @ a[1]
sbcs r3,r3,#0 @ r[8]-=0
umlal r5,r0,r1,r2 @ "r[0]"+=a[0]*b[i]
eor r1,r1,r1
sbc r4,r14,#0 @ overflow bit, keep in mind
@ that net result is
@ addition of a value which
@ makes underflow impossible
ldr r14,[sp,#8] @ a[2]
umlal r6,r1,r12,r2 @ "r[1]"+=a[1]*b[i]
str r4,[sp,#36] @ temporarily offload overflow
eor r12,r12,r12
ldr r4,[sp,#12] @ a[3]; r4 is free, its value was just spilled
umlal r7,r12,r14,r2 @ "r[2]"+=a[2]*b[i]
eor r14,r14,r14
adds r6,r6,r0 @ accumulate high part of mult
ldr r0,[sp,#16] @ a[4]
umlal r8,r14,r4,r2 @ "r[3]"+=a[3]*b[i]
eor r4,r4,r4
adcs r7,r7,r1
ldr r1,[sp,#20] @ a[5]
umlal r9,r4,r0,r2 @ "r[4]"+=a[4]*b[i]
eor r0,r0,r0
adcs r8,r8,r12
ldr r12,[sp,#24] @ a[6]
umlal r10,r0,r1,r2 @ "r[5]"+=a[5]*b[i]
eor r1,r1,r1
adcs r9,r9,r14
ldr r14,[sp,#28] @ a[7]
umlal r11,r1,r12,r2 @ "r[6]"+=a[6]*b[i]
eor r12,r12,r12
adcs r10,r10,r4
ldr r4,[sp,#36] @ restore overflow bit
umlal r3,r12,r14,r2 @ "r[7]"+=a[7]*b[i]
eor r14,r14,r14
adcs r11,r11,r0
adcs r3,r3,r1
adcs r4,r4,r12
adc r14,r14,#0 @ new overflow bit
@ multiplication-less reduction 3
adds r8,r8,r5 @ r[3]+=r[0]
ldr r2,[sp,#40] @ restore b_ptr
adcs r9,r9,#0 @ r[4]+=0
adcs r10,r10,#0 @ r[5]+=0
adcs r11,r11,r5 @ r[6]+=r[0]
ldr r1,[sp,#0] @ load a[0]
adcs r3,r3,#0 @ r[7]+=0
ldr r2,[r2,#4*3] @ load b[i]
adcs r4,r4,r5 @ r[8]+=r[0]
eor r0,r0,r0
adc r14,r14,#0 @ overflow bit
subs r3,r3,r5 @ r[7]-=r[0]
ldr r12,[sp,#4] @ a[1]
sbcs r4,r4,#0 @ r[8]-=0
umlal r6,r0,r1,r2 @ "r[0]"+=a[0]*b[i]
eor r1,r1,r1
sbc r5,r14,#0 @ overflow bit, keep in mind
@ that net result is
@ addition of a value which
@ makes underflow impossible
ldr r14,[sp,#8] @ a[2]
umlal r7,r1,r12,r2 @ "r[1]"+=a[1]*b[i]
str r5,[sp,#36] @ temporarily offload overflow
eor r12,r12,r12
ldr r5,[sp,#12] @ a[3]; r5 is free, its value was just spilled
umlal r8,r12,r14,r2 @ "r[2]"+=a[2]*b[i]
eor r14,r14,r14
adds r7,r7,r0 @ accumulate high part of mult
ldr r0,[sp,#16] @ a[4]
umlal r9,r14,r5,r2 @ "r[3]"+=a[3]*b[i]
eor r5,r5,r5
adcs r8,r8,r1
ldr r1,[sp,#20] @ a[5]
umlal r10,r5,r0,r2 @ "r[4]"+=a[4]*b[i]
eor r0,r0,r0
adcs r9,r9,r12
ldr r12,[sp,#24] @ a[6]
umlal r11,r0,r1,r2 @ "r[5]"+=a[5]*b[i]
eor r1,r1,r1
adcs r10,r10,r14
ldr r14,[sp,#28] @ a[7]
umlal r3,r1,r12,r2 @ "r[6]"+=a[6]*b[i]
eor r12,r12,r12
adcs r11,r11,r5
ldr r5,[sp,#36] @ restore overflow bit
umlal r4,r12,r14,r2 @ "r[7]"+=a[7]*b[i]
eor r14,r14,r14
adcs r3,r3,r0
adcs r4,r4,r1
adcs r5,r5,r12
adc r14,r14,#0 @ new overflow bit
@ multiplication-less reduction 4
adds r9,r9,r6 @ r[3]+=r[0]
ldr r2,[sp,#40] @ restore b_ptr
adcs r10,r10,#0 @ r[4]+=0
adcs r11,r11,#0 @ r[5]+=0
adcs r3,r3,r6 @ r[6]+=r[0]
ldr r1,[sp,#0] @ load a[0]
adcs r4,r4,#0 @ r[7]+=0
ldr r2,[r2,#4*4] @ load b[i]
adcs r5,r5,r6 @ r[8]+=r[0]
eor r0,r0,r0
adc r14,r14,#0 @ overflow bit
subs r4,r4,r6 @ r[7]-=r[0]
ldr r12,[sp,#4] @ a[1]
sbcs r5,r5,#0 @ r[8]-=0
umlal r7,r0,r1,r2 @ "r[0]"+=a[0]*b[i]
eor r1,r1,r1
sbc r6,r14,#0 @ overflow bit, keep in mind
@ that net result is
@ addition of a value which
@ makes underflow impossible
ldr r14,[sp,#8] @ a[2]
umlal r8,r1,r12,r2 @ "r[1]"+=a[1]*b[i]
str r6,[sp,#36] @ temporarily offload overflow
eor r12,r12,r12
ldr r6,[sp,#12] @ a[3]; r6 is free, its value was just spilled
umlal r9,r12,r14,r2 @ "r[2]"+=a[2]*b[i]
eor r14,r14,r14
adds r8,r8,r0 @ accumulate high part of mult
ldr r0,[sp,#16] @ a[4]
umlal r10,r14,r6,r2 @ "r[3]"+=a[3]*b[i]
eor r6,r6,r6
adcs r9,r9,r1
ldr r1,[sp,#20] @ a[5]
umlal r11,r6,r0,r2 @ "r[4]"+=a[4]*b[i]
eor r0,r0,r0
adcs r10,r10,r12
ldr r12,[sp,#24] @ a[6]
umlal r3,r0,r1,r2 @ "r[5]"+=a[5]*b[i]
eor r1,r1,r1
adcs r11,r11,r14
ldr r14,[sp,#28] @ a[7]
umlal r4,r1,r12,r2 @ "r[6]"+=a[6]*b[i]
eor r12,r12,r12
adcs r3,r3,r6
ldr r6,[sp,#36] @ restore overflow bit
umlal r5,r12,r14,r2 @ "r[7]"+=a[7]*b[i]
eor r14,r14,r14
adcs r4,r4,r0
adcs r5,r5,r1
adcs r6,r6,r12
adc r14,r14,#0 @ new overflow bit
@ multiplication-less reduction 5
adds r10,r10,r7 @ r[3]+=r[0]
ldr r2,[sp,#40] @ restore b_ptr
adcs r11,r11,#0 @ r[4]+=0
adcs r3,r3,#0 @ r[5]+=0
adcs r4,r4,r7 @ r[6]+=r[0]
ldr r1,[sp,#0] @ load a[0]
adcs r5,r5,#0 @ r[7]+=0
ldr r2,[r2,#4*5] @ load b[i]
adcs r6,r6,r7 @ r[8]+=r[0]
eor r0,r0,r0
adc r14,r14,#0 @ overflow bit
subs r5,r5,r7 @ r[7]-=r[0]
ldr r12,[sp,#4] @ a[1]
sbcs r6,r6,#0 @ r[8]-=0
umlal r8,r0,r1,r2 @ "r[0]"+=a[0]*b[i]
eor r1,r1,r1
sbc r7,r14,#0 @ overflow bit, keep in mind
@ that net result is
@ addition of a value which
@ makes underflow impossible
ldr r14,[sp,#8] @ a[2]
umlal r9,r1,r12,r2 @ "r[1]"+=a[1]*b[i]
str r7,[sp,#36] @ temporarily offload overflow
eor r12,r12,r12
ldr r7,[sp,#12] @ a[3]; r7 is free, its value was just spilled
umlal r10,r12,r14,r2 @ "r[2]"+=a[2]*b[i]
eor r14,r14,r14
adds r9,r9,r0 @ accumulate high part of mult
ldr r0,[sp,#16] @ a[4]
umlal r11,r14,r7,r2 @ "r[3]"+=a[3]*b[i]
eor r7,r7,r7
adcs r10,r10,r1
ldr r1,[sp,#20] @ a[5]
umlal r3,r7,r0,r2 @ "r[4]"+=a[4]*b[i]
eor r0,r0,r0
adcs r11,r11,r12
ldr r12,[sp,#24] @ a[6]
umlal r4,r0,r1,r2 @ "r[5]"+=a[5]*b[i]
eor r1,r1,r1
adcs r3,r3,r14
ldr r14,[sp,#28] @ a[7]
umlal r5,r1,r12,r2 @ "r[6]"+=a[6]*b[i]
eor r12,r12,r12
adcs r4,r4,r7
ldr r7,[sp,#36] @ restore overflow bit
umlal r6,r12,r14,r2 @ "r[7]"+=a[7]*b[i]
eor r14,r14,r14
adcs r5,r5,r0
adcs r6,r6,r1
adcs r7,r7,r12
adc r14,r14,#0 @ new overflow bit
@ multiplication-less reduction 6
adds r11,r11,r8 @ r[3]+=r[0]
ldr r2,[sp,#40] @ restore b_ptr
adcs r3,r3,#0 @ r[4]+=0
adcs r4,r4,#0 @ r[5]+=0
adcs r5,r5,r8 @ r[6]+=r[0]
ldr r1,[sp,#0] @ load a[0]
adcs r6,r6,#0 @ r[7]+=0
ldr r2,[r2,#4*6] @ load b[i]
adcs r7,r7,r8 @ r[8]+=r[0]
eor r0,r0,r0
adc r14,r14,#0 @ overflow bit
subs r6,r6,r8 @ r[7]-=r[0]
ldr r12,[sp,#4] @ a[1]
sbcs r7,r7,#0 @ r[8]-=0
umlal r9,r0,r1,r2 @ "r[0]"+=a[0]*b[i]
eor r1,r1,r1
sbc r8,r14,#0 @ overflow bit, keep in mind
@ that net result is
@ addition of a value which
@ makes underflow impossible
ldr r14,[sp,#8] @ a[2]
umlal r10,r1,r12,r2 @ "r[1]"+=a[1]*b[i]
str r8,[sp,#36] @ temporarily offload overflow
eor r12,r12,r12
ldr r8,[sp,#12] @ a[3]; r8 is free, its value was just spilled
umlal r11,r12,r14,r2 @ "r[2]"+=a[2]*b[i]
eor r14,r14,r14
adds r10,r10,r0 @ accumulate high part of mult
ldr r0,[sp,#16] @ a[4]
umlal r3,r14,r8,r2 @ "r[3]"+=a[3]*b[i]
eor r8,r8,r8
adcs r11,r11,r1
ldr r1,[sp,#20] @ a[5]
umlal r4,r8,r0,r2 @ "r[4]"+=a[4]*b[i]
eor r0,r0,r0
adcs r3,r3,r12
ldr r12,[sp,#24] @ a[6]
umlal r5,r0,r1,r2 @ "r[5]"+=a[5]*b[i]
eor r1,r1,r1
adcs r4,r4,r14
ldr r14,[sp,#28] @ a[7]
umlal r6,r1,r12,r2 @ "r[6]"+=a[6]*b[i]
eor r12,r12,r12
adcs r5,r5,r8
ldr r8,[sp,#36] @ restore overflow bit
umlal r7,r12,r14,r2 @ "r[7]"+=a[7]*b[i]
eor r14,r14,r14
adcs r6,r6,r0
adcs r7,r7,r1
adcs r8,r8,r12
adc r14,r14,#0 @ new overflow bit
@ multiplication-less reduction 7
adds r3,r3,r9 @ r[3]+=r[0]
ldr r2,[sp,#40] @ restore b_ptr
adcs r4,r4,#0 @ r[4]+=0
adcs r5,r5,#0 @ r[5]+=0
adcs r6,r6,r9 @ r[6]+=r[0]
ldr r1,[sp,#0] @ load a[0]
adcs r7,r7,#0 @ r[7]+=0
ldr r2,[r2,#4*7] @ load b[i]
adcs r8,r8,r9 @ r[8]+=r[0]
eor r0,r0,r0
adc r14,r14,#0 @ overflow bit
subs r7,r7,r9 @ r[7]-=r[0]
ldr r12,[sp,#4] @ a[1]
sbcs r8,r8,#0 @ r[8]-=0
umlal r10,r0,r1,r2 @ "r[0]"+=a[0]*b[i]
eor r1,r1,r1
sbc r9,r14,#0 @ overflow bit, keep in mind
@ that net result is
@ addition of a value which
@ makes underflow impossible
ldr r14,[sp,#8] @ a[2]
umlal r11,r1,r12,r2 @ "r[1]"+=a[1]*b[i]
str r9,[sp,#36] @ temporarily offload overflow
eor r12,r12,r12
ldr r9,[sp,#12] @ a[3]; r9 is free, its value was just spilled
umlal r3,r12,r14,r2 @ "r[2]"+=a[2]*b[i]
eor r14,r14,r14
adds r11,r11,r0 @ accumulate high part of mult
ldr r0,[sp,#16] @ a[4]
umlal r4,r14,r9,r2 @ "r[3]"+=a[3]*b[i]
eor r9,r9,r9
adcs r3,r3,r1
ldr r1,[sp,#20] @ a[5]
umlal r5,r9,r0,r2 @ "r[4]"+=a[4]*b[i]
eor r0,r0,r0
adcs r4,r4,r12
ldr r12,[sp,#24] @ a[6]
umlal r6,r0,r1,r2 @ "r[5]"+=a[5]*b[i]
eor r1,r1,r1
adcs r5,r5,r14
ldr r14,[sp,#28] @ a[7]
umlal r7,r1,r12,r2 @ "r[6]"+=a[6]*b[i]
eor r12,r12,r12
adcs r6,r6,r9
ldr r9,[sp,#36] @ restore overflow bit
umlal r8,r12,r14,r2 @ "r[7]"+=a[7]*b[i]
eor r14,r14,r14
adcs r7,r7,r0
adcs r8,r8,r1
adcs r9,r9,r12
adc r14,r14,#0 @ new overflow bit
@ last multiplication-less reduction
@ (same pattern as above with r10 as r[0]; no further b[i] to multiply)
adds r4,r4,r10
ldr r0,[sp,#32] @ restore r_ptr
adcs r5,r5,#0
adcs r6,r6,#0
adcs r7,r7,r10
adcs r8,r8,#0
adcs r9,r9,r10
adc r14,r14,#0
subs r8,r8,r10
sbcs r9,r9,#0
sbc r10,r14,#0 @ overflow bit
@ Final step is "if result > mod, subtract mod", but we do it
@ "other way around", namely subtract modulus from result
@ and if it borrowed, add modulus back.
@ Subtracting -1 is written as adding 1 (or adding 0 with carry in) in
@ the lines below; the trailing comments give the equivalent sbcs form.
adds r11,r11,#1 @ subs r11,r11,#-1
adcs r3,r3,#0 @ sbcs r3,r3,#-1
adcs r4,r4,#0 @ sbcs r4,r4,#-1
sbcs r5,r5,#0
sbcs r6,r6,#0
sbcs r7,r7,#0
sbcs r8,r8,#1
adcs r9,r9,#0 @ sbcs r9,r9,#-1
ldr lr,[sp,#44] @ restore lr
sbc r10,r10,#0 @ broadcast borrow bit
add sp,sp,#48 @ release a[0:7] copy plus the four saved words
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ broadcasting borrow bit to a register, r10, and using it as
@ a whole or extracting single bit.
adds r11,r11,r10 @ add modulus or zero
adcs r3,r3,r10
str r11,[r0,#0]
adcs r4,r4,r10
str r3,[r0,#4]
adcs r5,r5,#0
str r4,[r0,#8]
adcs r6,r6,#0
str r5,[r0,#12]
adcs r7,r7,#0
str r6,[r0,#16]
adcs r8,r8,r10,lsr#31
str r7,[r0,#20]
adc r9,r9,r10
str r8,[r0,#24]
str r9,[r0,#28]
mov pc,lr
@ __ecp_nistz256_sub_from: internal helper, subtracts the 8 words at r2
@ from a value already held in registers, adds the modulus back on
@ borrow, and stores the result to r0.
@ In:    r11, r3, r4, r5, r6, r7, r8, r9 = minuend, least significant
@        word first (the layout __ecp_nistz256_mul_mont leaves behind);
@        r2 = subtrahend pointer; r0 = destination.
@ Out:   result stored to r0[0:7] and left in the same registers.
@ Work:  r10, r12, r14 and r1 are scratch; r2 is overwritten with the
@        borrow mask. lr (r14) is pushed on entry and popped again.
@ Returns with "mov pc,lr".
#ifdef __thumb2__
.thumb_func __ecp_nistz256_sub_from
#endif
.align 5
__ecp_nistz256_sub_from:
str lr,[sp,#-4]! @ push lr
ldr r10,[r2,#0]
ldr r12,[r2,#4]
ldr r14,[r2,#8]
ldr r1,[r2,#12]
subs r11,r11,r10
ldr r10,[r2,#16]
sbcs r3,r3,r12
ldr r12,[r2,#20]
sbcs r4,r4,r14
ldr r14,[r2,#24]
sbcs r5,r5,r1
ldr r1,[r2,#28]
sbcs r6,r6,r10
sbcs r7,r7,r12
sbcs r8,r8,r14
sbcs r9,r9,r1
sbc r2,r2,r2 @ broadcast borrow bit
ldr lr,[sp],#4 @ pop lr
adds r11,r11,r2 @ add synthesized modulus
adcs r3,r3,r2
str r11,[r0,#0]
adcs r4,r4,r2
str r3,[r0,#4]
adcs r5,r5,#0
str r4,[r0,#8]
adcs r6,r6,#0
str r5,[r0,#12]
adcs r7,r7,#0
str r6,[r0,#16]
adcs r8,r8,r2,lsr#31 @ top bit of the mask supplies the single 1 word
str r7,[r0,#20]
adcs r9,r9,r2
str r8,[r0,#24]
str r9,[r0,#28]
mov pc,lr
@ __ecp_nistz256_sub_morf: internal helper, same as
@ __ecp_nistz256_sub_from but with the operands swapped: the value held
@ in registers is subtracted from the 8 words at r2.
@ In:    r11, r3, r4, r5, r6, r7, r8, r9 = subtrahend, least significant
@        word first; r2 = minuend pointer; r0 = destination.
@ Out:   result stored to r0[0:7] and left in the same registers.
@ Work:  r10, r12, r14 and r1 are scratch; r2 is overwritten with the
@        borrow mask. lr (r14) is pushed on entry and popped again.
@ Returns with "mov pc,lr".
#ifdef __thumb2__
.thumb_func __ecp_nistz256_sub_morf
#endif
.align 5
__ecp_nistz256_sub_morf:
str lr,[sp,#-4]! @ push lr
ldr r10,[r2,#0]
ldr r12,[r2,#4]
ldr r14,[r2,#8]
ldr r1,[r2,#12]
subs r11,r10,r11 @ memory word minus register word
ldr r10,[r2,#16]
sbcs r3,r12,r3
ldr r12,[r2,#20]
sbcs r4,r14,r4
ldr r14,[r2,#24]
sbcs r5,r1,r5
ldr r1,[r2,#28]
sbcs r6,r10,r6
sbcs r7,r12,r7
sbcs r8,r14,r8
sbcs r9,r1,r9
sbc r2,r2,r2 @ broadcast borrow bit
ldr lr,[sp],#4 @ pop lr
adds r11,r11,r2 @ add synthesized modulus
adcs r3,r3,r2
str r11,[r0,#0]
adcs r4,r4,r2
str r3,[r0,#4]
adcs r5,r5,#0
str r4,[r0,#8]
adcs r6,r6,#0
str r5,[r0,#12]
adcs r7,r7,#0
str r6,[r0,#16]
adcs r8,r8,r2,lsr#31 @ top bit of the mask supplies the single 1 word
str r7,[r0,#20]
adcs r9,r9,r2
str r8,[r0,#24]
str r9,[r0,#28]
mov pc,lr
@ __ecp_nistz256_add_self: internal helper, doubles a value already held
@ in registers, conditionally subtracts the modulus, and stores the
@ result to r0.
@ In:    r11, r3, r4, r5, r6, r7, r8, r9 = operand, least significant
@        word first (the layout __ecp_nistz256_mul_mont leaves behind);
@        r0 = destination.
@ Out:   result stored to r0[0:7] and left in the same registers.
@ Work:  r2 receives the carry out and then the borrow mask.
@ Returns with "mov pc,lr"; lr and sp are not modified.
#ifdef __thumb2__
.thumb_func __ecp_nistz256_add_self
#endif
.align 4
__ecp_nistz256_add_self:
adds r11,r11,r11 @ a[0:7]+=a[0:7]
adcs r3,r3,r3
adcs r4,r4,r4
adcs r5,r5,r5
adcs r6,r6,r6
adcs r7,r7,r7
adcs r8,r8,r8
mov r2,#0 @ flags are preserved by mov
adcs r9,r9,r9
adc r2,r2,#0 @ r2 = carry out of the doubling (0 or 1)
@ if a+b >= modulus, subtract modulus.
@
@ But since comparison implies subtraction, we subtract
@ modulus and then add it back if subtraction borrowed.
subs r11,r11,#-1
sbcs r3,r3,#-1
sbcs r4,r4,#-1
sbcs r5,r5,#0
sbcs r6,r6,#0
sbcs r7,r7,#0
sbcs r8,r8,#1
sbcs r9,r9,#-1
sbc r2,r2,#0
@ Note that because mod has special form, i.e. consists of
@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
@ using value of borrow as a whole or extracting single bit.
@ Follow r2 register...
adds r11,r11,r2 @ add synthesized modulus
adcs r3,r3,r2
str r11,[r0,#0]
adcs r4,r4,r2
str r3,[r0,#4]
adcs r5,r5,#0
str r4,[r0,#8]
adcs r6,r6,#0
str r5,[r0,#12]
adcs r7,r7,#0
str r6,[r0,#16]
adcs r8,r8,r2,lsr#31
str r7,[r0,#20]
adcs r9,r9,r2
str r8,[r0,#24]
str r9,[r0,#28]
mov pc,lr
@ _GFp_nistz256_point_double: exported entry point.
@ In:    r0 = result pointer, r1 = input pointer. Both are treated as
@        three consecutive 8-word values at byte offsets 0, 32 and 64,
@        named x, y and z in the per-call comments below.
@ Stack: r0-r12 and lr are pushed (56 bytes), then 32*5 bytes of locals
@        are reserved:
@          [sp,#0]    S
@          [sp,#32]   M
@          [sp,#64]   Zsqr
@          [sp,#96]   in_x (copy of the input x)
@          [sp,#128]  tmp0
@          [sp,#32*5]   saved r0 (result pointer)
@          [sp,#32*5+4] saved r1 (input pointer)
@ Several helper calls rely on values left in registers by the previous
@ call: __ecp_nistz256_mul_mont leaves its result in r11,r3-r9 and its
@ destination pointer in r0, which __ecp_nistz256_add_self,
@ __ecp_nistz256_sub_from and __ecp_nistz256_sub_morf then consume.
.globl _GFp_nistz256_point_double
.private_extern _GFp_nistz256_point_double
#ifdef __thumb2__
.thumb_func _GFp_nistz256_point_double
#endif
.align 5
_GFp_nistz256_point_double:
stmdb sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ push from r0, unusual, but intentional
sub sp,sp,#32*5
@ NOTE(review): nothing in the visible part of the file branches to this
@ label; presumably it is an entry for code outside this view - confirm.
Lpoint_double_shortcut:
add r3,sp,#96
ldmia r1!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy in_x
stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
add r0,sp,#0 @ r1 was advanced by the ldmia and now points at in_y
bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
add r2,r1,#32 @ r1 is unchanged by the call; +32 gives in_z
add r1,r1,#32
add r0,sp,#64
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
add r1,sp,#0
add r2,sp,#0
add r0,sp,#0
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
ldr r2,[sp,#32*5+4] @ reload the original input pointer
add r1,r2,#32
add r2,r2,#64
add r0,sp,#128
bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
ldr r0,[sp,#32*5] @ reload the result pointer
add r0,r0,#64
bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0);
add r1,sp,#96
add r2,sp,#64
add r0,sp,#32
bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
add r1,sp,#96
add r2,sp,#64
add r0,sp,#64
bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
add r1,sp,#0
add r2,sp,#0
add r0,sp,#128
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
add r1,sp,#64
add r2,sp,#32
add r0,sp,#32
bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
ldr r0,[sp,#32*5]
add r1,sp,#128
add r0,r0,#32
bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
add r1,sp,#32
add r0,sp,#32
bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
add r1,sp,#96
add r2,sp,#0
add r0,sp,#0
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
add r0,sp,#128 @ operand is the product still held in registers
bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S);
ldr r0,[sp,#32*5]
add r1,sp,#32
add r2,sp,#32
bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
add r2,sp,#128 @ r0 still points at res_x, registers hold its value
bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0);
add r2,sp,#0
add r0,sp,#0
bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x);
add r1,sp,#32 @ r0 still points at S from the previous call
add r2,sp,#0
bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
ldr r0,[sp,#32*5]
add r2,r0,#32
add r0,r0,#32
bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y);
add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3"
@ Return either by loading the saved lr straight into pc, or (second
@ branch of the conditional) by restoring lr and using bx.
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
bx lr @ interoperable with Thumb ISA:-)
#endif
#endif // !OPENSSL_NO_ASM