// Mirror of https://github.com/zerotier/ZeroTierOne.git (synced 2025-01-28 07:03:58 +00:00).
// 1115 lines, 25 KiB, ARM assembly.
|
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

// Compatibility shim: compilers without __has_feature() see every feature
// test below evaluate to 0.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
// MemorySanitizer builds define OPENSSL_NO_ASM, which compiles out everything
// guarded by the OPENSSL_NO_ASM check below.
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>

.text
// Assemble as unified-syntax Thumb when the compiler targets Thumb-2,
// otherwise as 32-bit ARM code.
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

// NUL-terminated ASCII identification string:
// "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	6
|
||
|
@ __ecp_nistz256_mul_by_2
@
@ Internal helper: doubles the 8-word value at r1 and hands the 9-word
@ sum (r4-r11 plus the carry-out in r3) to the shared Lreduce_by_sub
@ tail. That tail subtracts the modulus, adds it back if the subtraction
@ borrowed, stores the eight result words at r0 and returns through lr.
@
@ In:       r0 = result pointer, r1 = source pointer
@ Clobbers: r3-r11, flags
@
@ Loads are interleaved with the carry chain; ldr and mov leave the
@ flags untouched, so the adds/adcs sequence stays intact.
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_mul_by_2
#endif
.align	4
__ecp_nistz256_mul_by_2:
	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	adds	r4,r4,r4		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	r7,[r1,#12]
	adcs	r5,r5,r5
	ldr	r8,[r1,#16]
	adcs	r6,r6,r6
	ldr	r9,[r1,#20]
	adcs	r7,r7,r7
	ldr	r10,[r1,#24]
	adcs	r8,r8,r8
	ldr	r11,[r1,#28]
	adcs	r9,r9,r9
	adcs	r10,r10,r10
	mov	r3,#0
	adcs	r11,r11,r11
	adc	r3,r3,#0		@ r3 = carry out of the top word

	b	Lreduce_by_sub
|
||
|
|
||
|
|
||
|
@ void	GFp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
@
@ Exported wrapper around __ecp_nistz256_add. It saves r4-r12 and lr,
@ calls the internal routine with r0/r1/r2 passed through unchanged,
@ then restores the saved registers and returns.
.globl	_GFp_nistz256_add
.private_extern	_GFp_nistz256_add
#ifdef __thumb2__
.thumb_func	_GFp_nistz256_add
#endif
.align	4
_GFp_nistz256_add:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ Return by popping the saved lr straight into pc.
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
|
||
|
|
||
|
|
||
|
@ __ecp_nistz256_add
@
@ Internal helper: r0[0..7] = r1[0..7] + r2[0..7], followed by one
@ conditional subtraction of the modulus (the Lreduce_by_sub tail).
@
@ In:       r0 = result pointer, r1 = a pointer, r2 = b pointer
@ Clobbers: r1, r3-r12, flags. lr (r14) is used as scratch for words of
@           b, which is why it is pushed on entry and popped before the
@           tail.
@
@ Lreduce_by_sub is also entered directly by other routines in this
@ file. It expects the 9-word value in r4-r11 (least significant first)
@ with the top carry in r3, the result pointer in r0 and the return
@ address in lr.
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_add
#endif
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	ldr	r7,[r1,#12]
	ldr	r8,[r1,#16]
	ldr	r3,[r2,#0]
	ldr	r9,[r1,#20]
	ldr	r12,[r2,#4]
	ldr	r10,[r1,#24]
	ldr	r14,[r2,#8]
	ldr	r11,[r1,#28]
	ldr	r1,[r2,#12]		@ last use of a pointer; r1 becomes scratch
	adds	r4,r4,r3
	ldr	r3,[r2,#16]
	adcs	r5,r5,r12
	ldr	r12,[r2,#20]
	adcs	r6,r6,r14
	ldr	r14,[r2,#24]
	adcs	r7,r7,r1
	ldr	r1,[r2,#28]
	adcs	r8,r8,r3
	adcs	r9,r9,r12
	adcs	r10,r10,r14
	mov	r3,#0
	adcs	r11,r11,r1
	adc	r3,r3,#0		@ r3 = carry out of the top word
	ldr	lr,[sp],#4		@ pop lr

Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	@ The immediates below are the modulus words, least significant
	@ first: -1,-1,-1,0,0,0,1,-1.
	subs	r4,r4,#-1
	sbcs	r5,r5,#-1
	sbcs	r6,r6,#-1
	sbcs	r7,r7,#0
	sbcs	r8,r8,#0
	sbcs	r9,r9,#0
	sbcs	r10,r10,#1
	sbcs	r11,r11,#-1
	sbc	r3,r3,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow r3 register...

	adds	r4,r4,r3		@ add synthesized modulus
	adcs	r5,r5,r3
	str	r4,[r0,#0]
	adcs	r6,r6,r3
	str	r5,[r0,#4]
	adcs	r7,r7,#0
	str	r6,[r0,#8]
	adcs	r8,r8,#0
	str	r7,[r0,#12]
	adcs	r9,r9,#0
	str	r8,[r0,#16]
	adcs	r10,r10,r3,lsr#31	@ top bit of r3 supplies the lone 1 word
	str	r9,[r0,#20]
	adcs	r11,r11,r3
	str	r10,[r0,#24]
	str	r11,[r0,#28]

	mov	pc,lr
|
||
|
|
||
|
|
||
|
@ __ecp_nistz256_mul_by_3
@
@ Internal helper: r0[0..7] = 3 * r1[0..7], computed as 2*a (reduced
@ in registers, no stores) plus a, with the final reduction and the
@ stores done by the shared Lreduce_by_sub tail.
@
@ In:       r0 = result pointer, r1 = source pointer
@ Clobbers: r1-r12, flags. lr (r14) is used as scratch, hence the
@           push on entry and the pop before the tail branch.
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_mul_by_3
#endif
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	adds	r4,r4,r4		@ a[0:7]+=a[0:7]
	ldr	r7,[r1,#12]
	adcs	r5,r5,r5
	ldr	r8,[r1,#16]
	adcs	r6,r6,r6
	ldr	r9,[r1,#20]
	adcs	r7,r7,r7
	ldr	r10,[r1,#24]
	adcs	r8,r8,r8
	ldr	r11,[r1,#28]
	adcs	r9,r9,r9
	adcs	r10,r10,r10
	mov	r3,#0
	adcs	r11,r11,r11
	adc	r3,r3,#0

	subs	r4,r4,#-1		@ Lreduce_by_sub but without stores
	sbcs	r5,r5,#-1
	sbcs	r6,r6,#-1
	sbcs	r7,r7,#0
	sbcs	r8,r8,#0
	sbcs	r9,r9,#0
	sbcs	r10,r10,#1
	sbcs	r11,r11,#-1
	sbc	r3,r3,#0

	@ The reloads of a[] for the second addition are interleaved
	@ with the add-back chain; r2, r12 and r14 serve as scratch.
	adds	r4,r4,r3		@ add synthesized modulus
	adcs	r5,r5,r3
	adcs	r6,r6,r3
	adcs	r7,r7,#0
	adcs	r8,r8,#0
	ldr	r2,[r1,#0]
	adcs	r9,r9,#0
	ldr	r12,[r1,#4]
	adcs	r10,r10,r3,lsr#31
	ldr	r14,[r1,#8]
	adc	r11,r11,r3

	ldr	r3,[r1,#12]
	adds	r4,r4,r2		@ 2*a[0:7]+=a[0:7]
	ldr	r2,[r1,#16]
	adcs	r5,r5,r12
	ldr	r12,[r1,#20]
	adcs	r6,r6,r14
	ldr	r14,[r1,#24]
	adcs	r7,r7,r3
	ldr	r1,[r1,#28]		@ last load through r1, pointer is dead now
	adcs	r8,r8,r2
	adcs	r9,r9,r12
	adcs	r10,r10,r14
	mov	r3,#0
	adcs	r11,r11,r1
	adc	r3,r3,#0
	ldr	lr,[sp],#4		@ pop lr

	b	Lreduce_by_sub
|
||
|
|
||
|
|
||
|
@ __ecp_nistz256_div_by_2
@
@ Internal helper: r0[0..7] = r1[0..7] / 2. If the input is odd the
@ modulus is added first, then the 257-bit sum is shifted right by
@ one bit.
@
@ In:       r0 = result pointer, r1 = source pointer
@ Clobbers: r2-r11, flags
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_div_by_2
#endif
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	mov	r3,r4,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
	ldr	r7,[r1,#12]
	adds	r4,r4,r3,asr#31
	ldr	r8,[r1,#16]
	adcs	r5,r5,r3,asr#31
	ldr	r9,[r1,#20]
	adcs	r6,r6,r3,asr#31
	ldr	r10,[r1,#24]
	adcs	r7,r7,#0
	ldr	r11,[r1,#28]
	adcs	r8,r8,#0
	mov	r4,r4,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	r9,r9,#0
	orr	r4,r4,r5,lsl#31
	adcs	r10,r10,r3,lsr#31
	mov	r2,#0
	adcs	r11,r11,r3,asr#31
	mov	r5,r5,lsr#1
	adc	r2,r2,#0		@ top-most carry bit from addition

	@ Each word takes its new top bit from the low bit of the next
	@ higher word; stores are interleaved with the shifts.
	orr	r5,r5,r6,lsl#31
	mov	r6,r6,lsr#1
	str	r4,[r0,#0]
	orr	r6,r6,r7,lsl#31
	mov	r7,r7,lsr#1
	str	r5,[r0,#4]
	orr	r7,r7,r8,lsl#31
	mov	r8,r8,lsr#1
	str	r6,[r0,#8]
	orr	r8,r8,r9,lsl#31
	mov	r9,r9,lsr#1
	str	r7,[r0,#12]
	orr	r9,r9,r10,lsl#31
	mov	r10,r10,lsr#1
	str	r8,[r0,#16]
	orr	r10,r10,r11,lsl#31
	mov	r11,r11,lsr#1
	str	r9,[r0,#20]
	orr	r11,r11,r2,lsl#31	@ don't forget the top-most carry bit
	str	r10,[r0,#24]
	str	r11,[r0,#28]

	mov	pc,lr
|
||
|
|
||
|
|
||
|
@ __ecp_nistz256_sub
@
@ Internal helper: r0[0..7] = r1[0..7] - r2[0..7], followed by a
@ conditional addition of the modulus when the subtraction borrowed
@ (the Lreduce_by_add tail).
@
@ In:       r0 = result pointer, r1 = a pointer, r2 = b pointer
@ Clobbers: r1, r3-r12, flags. lr (r14) is used as scratch for words of
@           b, hence the push on entry and the pop before the tail.
@
@ Lreduce_by_add is also entered directly from __ecp_nistz256_neg. It
@ expects the eight difference words in r4-r11, the broadcast borrow
@ (0 or all ones) in r3, the result pointer in r0 and the return
@ address in lr.
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_sub
#endif
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	r4,[r1,#0]
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	ldr	r7,[r1,#12]
	ldr	r8,[r1,#16]
	ldr	r3,[r2,#0]
	ldr	r9,[r1,#20]
	ldr	r12,[r2,#4]
	ldr	r10,[r1,#24]
	ldr	r14,[r2,#8]
	ldr	r11,[r1,#28]
	ldr	r1,[r2,#12]		@ last use of a pointer; r1 becomes scratch
	subs	r4,r4,r3
	ldr	r3,[r2,#16]
	sbcs	r5,r5,r12
	ldr	r12,[r2,#20]
	sbcs	r6,r6,r14
	ldr	r14,[r2,#24]
	sbcs	r7,r7,r1
	ldr	r1,[r2,#28]
	sbcs	r8,r8,r3
	sbcs	r9,r9,r12
	sbcs	r10,r10,r14
	sbcs	r11,r11,r1
	sbc	r3,r3,r3		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, r3, and using it as
	@ a whole or extracting single bit.

	adds	r4,r4,r3		@ add synthesized modulus
	adcs	r5,r5,r3
	str	r4,[r0,#0]
	adcs	r6,r6,r3
	str	r5,[r0,#4]
	adcs	r7,r7,#0
	str	r6,[r0,#8]
	adcs	r8,r8,#0
	str	r7,[r0,#12]
	adcs	r9,r9,#0
	str	r8,[r0,#16]
	adcs	r10,r10,r3,lsr#31	@ top bit of r3 supplies the lone 1 word
	str	r9,[r0,#20]
	adcs	r11,r11,r3
	str	r10,[r0,#24]
	str	r11,[r0,#28]

	mov	pc,lr
|
||
|
|
||
|
|
||
|
@ void	GFp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
@
@ Exported wrapper around __ecp_nistz256_neg. It saves r4-r12 and lr,
@ calls the internal routine with r0/r1 passed through unchanged, then
@ restores the saved registers and returns.
.globl	_GFp_nistz256_neg
.private_extern	_GFp_nistz256_neg
#ifdef __thumb2__
.thumb_func	_GFp_nistz256_neg
#endif
.align	4
_GFp_nistz256_neg:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ Return by popping the saved lr straight into pc.
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
|
||
|
|
||
|
|
||
|
@ __ecp_nistz256_neg
@
@ Internal helper: r0[0..7] = 0 - r1[0..7]. The difference and the
@ broadcast borrow are handed to the shared Lreduce_by_add tail, which
@ adds the modulus when the subtraction borrowed, stores the result at
@ r0 and returns through lr.
@
@ In:       r0 = result pointer, r1 = source pointer
@ Clobbers: r3-r11, flags
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_neg
#endif
.align	4
__ecp_nistz256_neg:
	ldr	r4,[r1,#0]
	eor	r3,r3,r3		@ r3 = 0, the minuend for every word
	ldr	r5,[r1,#4]
	ldr	r6,[r1,#8]
	subs	r4,r3,r4
	ldr	r7,[r1,#12]
	sbcs	r5,r3,r5
	ldr	r8,[r1,#16]
	sbcs	r6,r3,r6
	ldr	r9,[r1,#20]
	sbcs	r7,r3,r7
	ldr	r10,[r1,#24]
	sbcs	r8,r3,r8
	ldr	r11,[r1,#28]
	sbcs	r9,r3,r9
	sbcs	r10,r3,r10
	sbcs	r11,r3,r11
	sbc	r3,r3,r3		@ r3 = 0 or all ones (broadcast borrow)

	b	Lreduce_by_add
|
||
|
|
||
|
@ void	GFp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
@
@ Exported wrapper around __ecp_nistz256_mul_mont. It saves r4-r12 and
@ lr, calls the internal routine with r0/r1/r2 passed through
@ unchanged, then restores the saved registers and returns.
.globl	_GFp_nistz256_mul_mont
.private_extern	_GFp_nistz256_mul_mont
#ifdef __thumb2__
.thumb_func	_GFp_nistz256_mul_mont
#endif
.align	4
_GFp_nistz256_mul_mont:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ Return by popping the saved lr straight into pc.
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
|
||
|
|
||
|
|
||
|
@ __ecp_nistz256_mul_mont
@
@ Internal multiplication core (Montgomery-style, per the routine
@ name): multiplies a[0..7] at r1 by b[0..7] at r2 one word of b at a
@ time. After every pass a multiplication-less reduction folds the
@ lowest accumulator word into the higher words. A final conditional
@ subtraction of the modulus follows, and the result is stored at r0.
@
@ In:       r0 = result pointer, r1 = a pointer, r2 = b pointer
@ Out:      eight result words stored at r0; the same words are also
@           left in r11,r3,r4,r5,r6,r7,r8,r9 (least significant first)
@ Clobbers: r1-r12, flags. r0 and lr are used as scratch but are
@           reloaded from the stack before returning.
@
@ Stack frame while running (48 bytes, released before returning):
@   sp+0  .. sp+28  copy of a[0..7]
@   sp+32           saved r0 (r_ptr)
@   sp+36           saved r1 slot, reused to park the overflow word
@   sp+40           saved r2 (b_ptr)
@   sp+44           saved lr
@
@ The accumulator rotates through r3-r11 by one register per pass, so
@ the word the comments call r[0] lives in r3 during reduction 1, r4
@ during reduction 2, and so on up to r10 for the last reduction.
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_mul_mont
#endif
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0,r1,r2,lr}	@ make a copy of arguments too

	ldr	r2,[r2,#0]		@ b[0]
	ldmia	r1,{r4,r5,r6,r7,r8,r9,r10,r11}

	umull	r3,r14,r4,r2		@ r[0]=a[0]*b[0]
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy a[0-7] to stack, so
						@ that it can be addressed
						@ without spending register
						@ on address
	umull	r4,r0,r5,r2		@ r[1]=a[1]*b[0]
	umull	r5,r1,r6,r2
	adds	r4,r4,r14		@ accumulate high part of mult
	umull	r6,r12,r7,r2
	adcs	r5,r5,r0
	umull	r7,r14,r8,r2
	adcs	r6,r6,r1
	umull	r8,r0,r9,r2
	adcs	r7,r7,r12
	umull	r9,r1,r10,r2
	adcs	r8,r8,r14
	umull	r10,r12,r11,r2
	adcs	r9,r9,r0
	adcs	r10,r10,r1
	eor	r14,r14,r14		@ first overflow bit is zero
	adc	r11,r12,#0
	@ multiplication-less reduction 1
	adds	r6,r6,r3		@ r[3]+=r[0]
	ldr	r2,[sp,#40]		@ restore b_ptr
	adcs	r7,r7,#0		@ r[4]+=0
	adcs	r8,r8,#0		@ r[5]+=0
	adcs	r9,r9,r3		@ r[6]+=r[0]
	ldr	r1,[sp,#0]		@ load a[0]
	adcs	r10,r10,#0		@ r[7]+=0
	ldr	r2,[r2,#4*1]		@ load b[i]
	adcs	r11,r11,r3		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0		@ overflow bit
	subs	r10,r10,r3		@ r[7]-=r[0]
	ldr	r12,[sp,#4]		@ a[1]
	sbcs	r11,r11,#0		@ r[8]-=0
	umlal	r4,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r3,r14,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	r14,[sp,#8]		@ a[2]
	umlal	r5,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r3,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r3,[sp,#12]		@ a[3], r3 is alias r3
	umlal	r6,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r5,r5,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]		@ a[4]
	umlal	r7,r14,r3,r2		@ "r[3]"+=a[3]*b[i]
	eor	r3,r3,r3
	adcs	r6,r6,r1
	ldr	r1,[sp,#20]		@ a[5]
	umlal	r8,r3,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r7,r7,r12
	ldr	r12,[sp,#24]		@ a[6]
	umlal	r9,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r8,r8,r14
	ldr	r14,[sp,#28]		@ a[7]
	umlal	r10,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r9,r9,r3
	ldr	r3,[sp,#36]		@ restore overflow bit
	umlal	r11,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r10,r10,r0
	adcs	r11,r11,r1
	adcs	r3,r3,r12
	adc	r14,r14,#0		@ new overflow bit
	@ multiplication-less reduction 2
	adds	r7,r7,r4		@ r[3]+=r[0]
	ldr	r2,[sp,#40]		@ restore b_ptr
	adcs	r8,r8,#0		@ r[4]+=0
	adcs	r9,r9,#0		@ r[5]+=0
	adcs	r10,r10,r4		@ r[6]+=r[0]
	ldr	r1,[sp,#0]		@ load a[0]
	adcs	r11,r11,#0		@ r[7]+=0
	ldr	r2,[r2,#4*2]		@ load b[i]
	adcs	r3,r3,r4		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0		@ overflow bit
	subs	r11,r11,r4		@ r[7]-=r[0]
	ldr	r12,[sp,#4]		@ a[1]
	sbcs	r3,r3,#0		@ r[8]-=0
	umlal	r5,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r4,r14,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	r14,[sp,#8]		@ a[2]
	umlal	r6,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r4,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r4,[sp,#12]		@ a[3], r4 is alias r4
	umlal	r7,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r6,r6,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]		@ a[4]
	umlal	r8,r14,r4,r2		@ "r[3]"+=a[3]*b[i]
	eor	r4,r4,r4
	adcs	r7,r7,r1
	ldr	r1,[sp,#20]		@ a[5]
	umlal	r9,r4,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r8,r8,r12
	ldr	r12,[sp,#24]		@ a[6]
	umlal	r10,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r9,r9,r14
	ldr	r14,[sp,#28]		@ a[7]
	umlal	r11,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r10,r10,r4
	ldr	r4,[sp,#36]		@ restore overflow bit
	umlal	r3,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r11,r11,r0
	adcs	r3,r3,r1
	adcs	r4,r4,r12
	adc	r14,r14,#0		@ new overflow bit
	@ multiplication-less reduction 3
	adds	r8,r8,r5		@ r[3]+=r[0]
	ldr	r2,[sp,#40]		@ restore b_ptr
	adcs	r9,r9,#0		@ r[4]+=0
	adcs	r10,r10,#0		@ r[5]+=0
	adcs	r11,r11,r5		@ r[6]+=r[0]
	ldr	r1,[sp,#0]		@ load a[0]
	adcs	r3,r3,#0		@ r[7]+=0
	ldr	r2,[r2,#4*3]		@ load b[i]
	adcs	r4,r4,r5		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0		@ overflow bit
	subs	r3,r3,r5		@ r[7]-=r[0]
	ldr	r12,[sp,#4]		@ a[1]
	sbcs	r4,r4,#0		@ r[8]-=0
	umlal	r6,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r5,r14,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	r14,[sp,#8]		@ a[2]
	umlal	r7,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r5,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r5,[sp,#12]		@ a[3], r5 is alias r5
	umlal	r8,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r7,r7,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]		@ a[4]
	umlal	r9,r14,r5,r2		@ "r[3]"+=a[3]*b[i]
	eor	r5,r5,r5
	adcs	r8,r8,r1
	ldr	r1,[sp,#20]		@ a[5]
	umlal	r10,r5,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r9,r9,r12
	ldr	r12,[sp,#24]		@ a[6]
	umlal	r11,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r10,r10,r14
	ldr	r14,[sp,#28]		@ a[7]
	umlal	r3,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r11,r11,r5
	ldr	r5,[sp,#36]		@ restore overflow bit
	umlal	r4,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r3,r3,r0
	adcs	r4,r4,r1
	adcs	r5,r5,r12
	adc	r14,r14,#0		@ new overflow bit
	@ multiplication-less reduction 4
	adds	r9,r9,r6		@ r[3]+=r[0]
	ldr	r2,[sp,#40]		@ restore b_ptr
	adcs	r10,r10,#0		@ r[4]+=0
	adcs	r11,r11,#0		@ r[5]+=0
	adcs	r3,r3,r6		@ r[6]+=r[0]
	ldr	r1,[sp,#0]		@ load a[0]
	adcs	r4,r4,#0		@ r[7]+=0
	ldr	r2,[r2,#4*4]		@ load b[i]
	adcs	r5,r5,r6		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0		@ overflow bit
	subs	r4,r4,r6		@ r[7]-=r[0]
	ldr	r12,[sp,#4]		@ a[1]
	sbcs	r5,r5,#0		@ r[8]-=0
	umlal	r7,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r6,r14,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	r14,[sp,#8]		@ a[2]
	umlal	r8,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r6,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r6,[sp,#12]		@ a[3], r6 is alias r6
	umlal	r9,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r8,r8,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]		@ a[4]
	umlal	r10,r14,r6,r2		@ "r[3]"+=a[3]*b[i]
	eor	r6,r6,r6
	adcs	r9,r9,r1
	ldr	r1,[sp,#20]		@ a[5]
	umlal	r11,r6,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r10,r10,r12
	ldr	r12,[sp,#24]		@ a[6]
	umlal	r3,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r11,r11,r14
	ldr	r14,[sp,#28]		@ a[7]
	umlal	r4,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r3,r3,r6
	ldr	r6,[sp,#36]		@ restore overflow bit
	umlal	r5,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r4,r4,r0
	adcs	r5,r5,r1
	adcs	r6,r6,r12
	adc	r14,r14,#0		@ new overflow bit
	@ multiplication-less reduction 5
	adds	r10,r10,r7		@ r[3]+=r[0]
	ldr	r2,[sp,#40]		@ restore b_ptr
	adcs	r11,r11,#0		@ r[4]+=0
	adcs	r3,r3,#0		@ r[5]+=0
	adcs	r4,r4,r7		@ r[6]+=r[0]
	ldr	r1,[sp,#0]		@ load a[0]
	adcs	r5,r5,#0		@ r[7]+=0
	ldr	r2,[r2,#4*5]		@ load b[i]
	adcs	r6,r6,r7		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0		@ overflow bit
	subs	r5,r5,r7		@ r[7]-=r[0]
	ldr	r12,[sp,#4]		@ a[1]
	sbcs	r6,r6,#0		@ r[8]-=0
	umlal	r8,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r7,r14,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	r14,[sp,#8]		@ a[2]
	umlal	r9,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r7,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r7,[sp,#12]		@ a[3], r7 is alias r7
	umlal	r10,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r9,r9,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]		@ a[4]
	umlal	r11,r14,r7,r2		@ "r[3]"+=a[3]*b[i]
	eor	r7,r7,r7
	adcs	r10,r10,r1
	ldr	r1,[sp,#20]		@ a[5]
	umlal	r3,r7,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r11,r11,r12
	ldr	r12,[sp,#24]		@ a[6]
	umlal	r4,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r3,r3,r14
	ldr	r14,[sp,#28]		@ a[7]
	umlal	r5,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r4,r4,r7
	ldr	r7,[sp,#36]		@ restore overflow bit
	umlal	r6,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r5,r5,r0
	adcs	r6,r6,r1
	adcs	r7,r7,r12
	adc	r14,r14,#0		@ new overflow bit
	@ multiplication-less reduction 6
	adds	r11,r11,r8		@ r[3]+=r[0]
	ldr	r2,[sp,#40]		@ restore b_ptr
	adcs	r3,r3,#0		@ r[4]+=0
	adcs	r4,r4,#0		@ r[5]+=0
	adcs	r5,r5,r8		@ r[6]+=r[0]
	ldr	r1,[sp,#0]		@ load a[0]
	adcs	r6,r6,#0		@ r[7]+=0
	ldr	r2,[r2,#4*6]		@ load b[i]
	adcs	r7,r7,r8		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0		@ overflow bit
	subs	r6,r6,r8		@ r[7]-=r[0]
	ldr	r12,[sp,#4]		@ a[1]
	sbcs	r7,r7,#0		@ r[8]-=0
	umlal	r9,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r8,r14,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	r14,[sp,#8]		@ a[2]
	umlal	r10,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r8,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r8,[sp,#12]		@ a[3], r8 is alias r8
	umlal	r11,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r10,r10,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]		@ a[4]
	umlal	r3,r14,r8,r2		@ "r[3]"+=a[3]*b[i]
	eor	r8,r8,r8
	adcs	r11,r11,r1
	ldr	r1,[sp,#20]		@ a[5]
	umlal	r4,r8,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r3,r3,r12
	ldr	r12,[sp,#24]		@ a[6]
	umlal	r5,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r4,r4,r14
	ldr	r14,[sp,#28]		@ a[7]
	umlal	r6,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r5,r5,r8
	ldr	r8,[sp,#36]		@ restore overflow bit
	umlal	r7,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r6,r6,r0
	adcs	r7,r7,r1
	adcs	r8,r8,r12
	adc	r14,r14,#0		@ new overflow bit
	@ multiplication-less reduction 7
	adds	r3,r3,r9		@ r[3]+=r[0]
	ldr	r2,[sp,#40]		@ restore b_ptr
	adcs	r4,r4,#0		@ r[4]+=0
	adcs	r5,r5,#0		@ r[5]+=0
	adcs	r6,r6,r9		@ r[6]+=r[0]
	ldr	r1,[sp,#0]		@ load a[0]
	adcs	r7,r7,#0		@ r[7]+=0
	ldr	r2,[r2,#4*7]		@ load b[i]
	adcs	r8,r8,r9		@ r[8]+=r[0]
	eor	r0,r0,r0
	adc	r14,r14,#0		@ overflow bit
	subs	r7,r7,r9		@ r[7]-=r[0]
	ldr	r12,[sp,#4]		@ a[1]
	sbcs	r8,r8,#0		@ r[8]-=0
	umlal	r10,r0,r1,r2		@ "r[0]"+=a[0]*b[i]
	eor	r1,r1,r1
	sbc	r9,r14,#0		@ overflow bit, keep in mind
					@ that netto result is
					@ addition of a value which
					@ makes underflow impossible

	ldr	r14,[sp,#8]		@ a[2]
	umlal	r11,r1,r12,r2		@ "r[1]"+=a[1]*b[i]
	str	r9,[sp,#36]		@ temporarily offload overflow
	eor	r12,r12,r12
	ldr	r9,[sp,#12]		@ a[3], r9 is alias r9
	umlal	r3,r12,r14,r2		@ "r[2]"+=a[2]*b[i]
	eor	r14,r14,r14
	adds	r11,r11,r0		@ accumulate high part of mult
	ldr	r0,[sp,#16]		@ a[4]
	umlal	r4,r14,r9,r2		@ "r[3]"+=a[3]*b[i]
	eor	r9,r9,r9
	adcs	r3,r3,r1
	ldr	r1,[sp,#20]		@ a[5]
	umlal	r5,r9,r0,r2		@ "r[4]"+=a[4]*b[i]
	eor	r0,r0,r0
	adcs	r4,r4,r12
	ldr	r12,[sp,#24]		@ a[6]
	umlal	r6,r0,r1,r2		@ "r[5]"+=a[5]*b[i]
	eor	r1,r1,r1
	adcs	r5,r5,r14
	ldr	r14,[sp,#28]		@ a[7]
	umlal	r7,r1,r12,r2		@ "r[6]"+=a[6]*b[i]
	eor	r12,r12,r12
	adcs	r6,r6,r9
	ldr	r9,[sp,#36]		@ restore overflow bit
	umlal	r8,r12,r14,r2		@ "r[7]"+=a[7]*b[i]
	eor	r14,r14,r14
	adcs	r7,r7,r0
	adcs	r8,r8,r1
	adcs	r9,r9,r12
	adc	r14,r14,#0		@ new overflow bit
	@ last multiplication-less reduction
	adds	r4,r4,r10
	ldr	r0,[sp,#32]		@ restore r_ptr
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,r10
	adcs	r8,r8,#0
	adcs	r9,r9,r10
	adc	r14,r14,#0
	subs	r8,r8,r10
	sbcs	r9,r9,#0
	sbc	r10,r14,#0		@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.

	adds	r11,r11,#1		@ subs	r11,r11,#-1
	adcs	r3,r3,#0		@ sbcs	r3,r3,#-1
	adcs	r4,r4,#0		@ sbcs	r4,r4,#-1
	sbcs	r5,r5,#0
	sbcs	r6,r6,#0
	sbcs	r7,r7,#0
	sbcs	r8,r8,#1
	adcs	r9,r9,#0		@ sbcs	r9,r9,#-1
	ldr	lr,[sp,#44]		@ restore lr
	sbc	r10,r10,#0		@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, r10, and using it as
	@ a whole or extracting single bit.

	adds	r11,r11,r10		@ add modulus or zero
	adcs	r3,r3,r10
	str	r11,[r0,#0]
	adcs	r4,r4,r10
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r10,lsr#31
	str	r7,[r0,#20]
	adc	r9,r9,r10
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr
|
||
|
|
||
|
@ __ecp_nistz256_sub_from
@
@ Internal helper working on a value already held in registers:
@ r0[0..7] = (r11,r3,r4,r5,r6,r7,r8,r9) - r2[0..7], with the modulus
@ added back when the subtraction borrowed.
@
@ In:       r0 = result pointer, r2 = subtrahend pointer,
@           r11,r3-r9 = minuend words (least significant first)
@ Out:      result stored at r0 and left in r11,r3-r9
@ Clobbers: r1, r2, r10, r12, flags. lr (r14) is used as scratch, hence
@           the push on entry and the pop after the subtract chain.
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_sub_from
#endif
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!		@ push lr

	ldr	r10,[r2,#0]
	ldr	r12,[r2,#4]
	ldr	r14,[r2,#8]
	ldr	r1,[r2,#12]
	subs	r11,r11,r10
	ldr	r10,[r2,#16]
	sbcs	r3,r3,r12
	ldr	r12,[r2,#20]
	sbcs	r4,r4,r14
	ldr	r14,[r2,#24]
	sbcs	r5,r5,r1
	ldr	r1,[r2,#28]
	sbcs	r6,r6,r10
	sbcs	r7,r7,r12
	sbcs	r8,r8,r14
	sbcs	r9,r9,r1
	sbc	r2,r2,r2		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	r11,r11,r2		@ add synthesized modulus
	adcs	r3,r3,r2
	str	r11,[r0,#0]
	adcs	r4,r4,r2
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r2,lsr#31	@ top bit of r2 supplies the lone 1 word
	str	r7,[r0,#20]
	adcs	r9,r9,r2
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr
|
||
|
|
||
|
|
||
|
@ __ecp_nistz256_sub_morf
@
@ Mirror image of __ecp_nistz256_sub_from, with the operands of the
@ subtraction swapped:
@ r0[0..7] = r2[0..7] - (r11,r3,r4,r5,r6,r7,r8,r9), with the modulus
@ added back when the subtraction borrowed.
@
@ In:       r0 = result pointer, r2 = minuend pointer,
@           r11,r3-r9 = subtrahend words (least significant first)
@ Out:      result stored at r0 and left in r11,r3-r9
@ Clobbers: r1, r2, r10, r12, flags. lr (r14) is used as scratch, hence
@           the push on entry and the pop after the subtract chain.
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_sub_morf
#endif
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!		@ push lr

	ldr	r10,[r2,#0]
	ldr	r12,[r2,#4]
	ldr	r14,[r2,#8]
	ldr	r1,[r2,#12]
	subs	r11,r10,r11
	ldr	r10,[r2,#16]
	sbcs	r3,r12,r3
	ldr	r12,[r2,#20]
	sbcs	r4,r14,r4
	ldr	r14,[r2,#24]
	sbcs	r5,r1,r5
	ldr	r1,[r2,#28]
	sbcs	r6,r10,r6
	sbcs	r7,r12,r7
	sbcs	r8,r14,r8
	sbcs	r9,r1,r9
	sbc	r2,r2,r2		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	r11,r11,r2		@ add synthesized modulus
	adcs	r3,r3,r2
	str	r11,[r0,#0]
	adcs	r4,r4,r2
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r2,lsr#31	@ top bit of r2 supplies the lone 1 word
	str	r7,[r0,#20]
	adcs	r9,r9,r2
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr
|
||
|
|
||
|
|
||
|
@ __ecp_nistz256_add_self
@
@ Internal helper working on a value already held in registers:
@ doubles (r11,r3,r4,r5,r6,r7,r8,r9), subtracts the modulus, adds it
@ back if that subtraction borrowed, and stores the result at r0.
@
@ In:       r0 = result pointer,
@           r11,r3-r9 = input words (least significant first)
@ Out:      result stored at r0 and left in r11,r3-r9
@ Clobbers: r2, flags
#ifdef __thumb2__
.thumb_func	__ecp_nistz256_add_self
#endif
.align	4
__ecp_nistz256_add_self:
	adds	r11,r11,r11		@ a[0:7]+=a[0:7]
	adcs	r3,r3,r3
	adcs	r4,r4,r4
	adcs	r5,r5,r5
	adcs	r6,r6,r6
	adcs	r7,r7,r7
	adcs	r8,r8,r8
	mov	r2,#0
	adcs	r9,r9,r9
	adc	r2,r2,#0		@ r2 = carry out of the top word

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	r11,r11,#-1
	sbcs	r3,r3,#-1
	sbcs	r4,r4,#-1
	sbcs	r5,r5,#0
	sbcs	r6,r6,#0
	sbcs	r7,r7,#0
	sbcs	r8,r8,#1
	sbcs	r9,r9,#-1
	sbc	r2,r2,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow r2 register...

	adds	r11,r11,r2		@ add synthesized modulus
	adcs	r3,r3,r2
	str	r11,[r0,#0]
	adcs	r4,r4,r2
	str	r3,[r0,#4]
	adcs	r5,r5,#0
	str	r4,[r0,#8]
	adcs	r6,r6,#0
	str	r5,[r0,#12]
	adcs	r7,r7,#0
	str	r6,[r0,#16]
	adcs	r8,r8,r2,lsr#31
	str	r7,[r0,#20]
	adcs	r9,r9,r2
	str	r8,[r0,#24]
	str	r9,[r0,#28]

	mov	pc,lr
|
||
|
|
||
|
|
||
|
@ _GFp_nistz256_point_double
@
@ Exported point-doubling routine built from the field helpers above.
@
@ In:  r0 = result pointer (res_x at +0, res_y at +32, res_z at +64)
@      r1 = input pointer  (in_x  at +0, in_y  at +32, in_z  at +64)
@
@ All of r0-r12 and lr are pushed, so the original r0 and r1 can be
@ reloaded later from [sp,#32*5] and [sp,#32*5+4]. Below them, five
@ 32-byte temporaries live on the stack:
@   sp+0   S
@   sp+32  M
@   sp+64  Zsqr
@   sp+96  in_x (private copy)
@   sp+128 tmp0
@
@ Several calls set up only some of r0/r1/r2. They rely on the callee
@ leaving r0 untouched, and on mul_mont, add_self and sub_from leaving
@ their result in r11,r3-r9 for the next register-based helper.
.globl	_GFp_nistz256_point_double
.private_extern	_GFp_nistz256_point_double
#ifdef __thumb2__
.thumb_func	_GFp_nistz256_point_double
#endif
.align	5
_GFp_nistz256_point_double:
	stmdb	sp!,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}	@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

Lpoint_double_shortcut:
	add	r3,sp,#96
	ldmia	r1!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy in_x
	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}

	@ The writeback above leaves r1 pointing at in_y.
	add	r0,sp,#0
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	r2,r1,#32
	add	r1,r1,#32
	add	r0,sp,#64
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	r1,sp,#0
	add	r2,sp,#0
	add	r0,sp,#0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	r2,[sp,#32*5+4]		@ reload the saved input pointer
	add	r1,r2,#32
	add	r2,r2,#64
	add	r0,sp,#128
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	r0,[sp,#32*5]		@ reload the saved result pointer
	add	r0,r0,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	r1,sp,#96
	add	r2,sp,#64
	add	r0,sp,#32
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	r1,sp,#96
	add	r2,sp,#64
	add	r0,sp,#64
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	r1,sp,#0
	add	r2,sp,#0
	add	r0,sp,#128
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	r1,sp,#64
	add	r2,sp,#32
	add	r0,sp,#32
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	r0,[sp,#32*5]
	add	r1,sp,#128
	add	r0,r0,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	r1,sp,#32
	add	r0,sp,#32
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	r1,sp,#96
	add	r2,sp,#0
	add	r0,sp,#0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	r0,sp,#128
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	r0,[sp,#32*5]
	add	r1,sp,#32
	add	r2,sp,#32
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	r2,sp,#128
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	r2,sp,#0
	add	r0,sp,#0
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	r1,sp,#32
	add	r2,sp,#0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	r0,[sp,#32*5]
	add	r2,r0,#32
	add	r0,r0,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ Return by popping the saved lr straight into pc.
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
|
||
|
|
||
|
#endif // !OPENSSL_NO_ASM
|