kernel: 5.4: import wireguard backport

Rather than using the clunky, old, slower wireguard-linux-compat
out-of-tree module, this commit does a patch-by-patch backport of
upstream's wireguard to 5.4. This specific backport is in widespread
use, being part of SUSE's enterprise kernel, Oracle's enterprise
kernel, Google's Android kernel, Gentoo's distro kernel, and probably
more I've forgotten about. It's definitely the "more proper" way of
adding wireguard to a kernel than the ugly compat.h hell of the
wireguard-linux-compat repo. And most importantly for OpenWRT, it
allows using the same module configuration code for 5.10 as for 5.4,
with no need for bifurcation.

These patches are from the backport tree, which is maintained in the
open here: https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y

I'll be sending PRs to update this as needed.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Thu, 27 Aug 2020 19:30:58 +0200
Subject: [PATCH] crypto: curve25519-x86_64 - Use XORL r32,32

commit db719539fd3889836900bf912755aa30a5985e9a upstream.

x86_64 zero-extends 32-bit operations, so for 64-bit operands,
XORL r32,r32 is functionally equivalent to XORQ r64,r64, but avoids
a REX prefix byte when legacy registers are used.
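
For illustration (an editor's sketch, not part of the upstream change):
clearing a register through its 32-bit name assembles to the two-byte
encoding 31 c0 (xorl %eax,%eax) rather than the three-byte 48 31 c0
(xorq %rax,%rax), and the result is identical because a write to %eax
zero-extends into %rax. The "%k1" and "%k3" operands in the hunks below
use the compiler's "k" operand modifier, which prints the 32-bit name
of whatever register it picked for that operand. A minimal, compilable
demonstration, assuming GCC or Clang on x86_64 (file name illustrative):

  /* xorl-demo.c: a 32-bit self-XOR clears the full 64-bit register. */
  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          uint64_t v = 0xdeadbeefcafebabeULL;

          /* %k0 prints the 32-bit name of v's register (e.g. %eax for
           * %rax), so this assembles to the short, REX-free xorl form;
           * the CPU zero-extends the 32-bit write, clearing all 64 bits. */
          asm("xor %k0, %k0" : "+r"(v) : : "cc");

          printf("v = %llu\n", (unsigned long long)v); /* prints v = 0 */
          return 0;
  }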

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
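
A note on why the xor is load-bearing here (kept below the "---" fold
so it stays out of the commit message; the sketch is an editor's
addition, not part of the patch): the mulx-based routines run two carry
chains at once, adcx carrying through CF and adox carrying through OF,
and the leading self-XOR both zeroes a scratch register and clears CF
and OF so each chain starts with a clean carry-in. A 32-bit self-XOR
clears those flags exactly as the 64-bit form does, which is why the
substitution is safe. A standalone sketch of the idiom, assuming a CPU
and assembler with ADX support (Broadwell or later):

  /* adx-demo.c: two independent carry chains after one flag-clearing
   * XOR, mirroring the pattern in the patched fmul/fsqr routines. */
  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          uint64_t a = UINT64_MAX, c = UINT64_MAX;
          uint64_t b = 1, d = 2, scratch;

          asm("xor %k2, %k2;"          /* zero scratch, clear CF and OF */
              "adcx %3, %0;"           /* chain 1: a += b, carry in CF  */
              "adox %4, %1;"           /* chain 2: c += d, carry in OF  */
              : "+r"(a), "+r"(c), "=&r"(scratch)
              : "r"(b), "r"(d)
              : "cc");
          (void)scratch;

          /* Both additions wrap: a == 0 (CF was set), c == 1 (OF set). */
          printf("a = %llu, c = %llu\n",
                 (unsigned long long)a, (unsigned long long)c);
          return 0;
  }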

 arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
1 file changed, 34 insertions(+), 34 deletions(-)

--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, c

asm volatile(
/* Clear registers to propagate the carry bit */
- " xor %%r8, %%r8;"
- " xor %%r9, %%r9;"
- " xor %%r10, %%r10;"
- " xor %%r11, %%r11;"
- " xor %1, %1;"
+ " xor %%r8d, %%r8d;"
+ " xor %%r9d, %%r9d;"
+ " xor %%r10d, %%r10d;"
+ " xor %%r11d, %%r11d;"
+ " xor %k1, %k1;"

/* Begin addition chain */
" addq 0(%3), %0;"
@@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const
" cmovc %0, %%rax;"

/* Step 2: Add carry*38 to the original sum */
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%1);"
@@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const

/* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
@@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const

/* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
@@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const

/* Compute src1[0] * src2 */
" movq 32(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 40(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 48(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 56(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
@@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
" adoxq 64(%1), %%r8;"
" mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const

/* Step 1: Compute all partial products */
" movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
@@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */

/* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
@@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const
asm volatile(
/* Step 1: Compute all partial products */
" movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
@@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */

/* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
@@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const

/* Step 1: Compute all partial products */
" movq 32(%1), %%rdx;" /* f[0] */
- " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 56(%1), %%rdx;" /* f[3] */
@@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const
" mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */

/* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
@@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" adoxq 64(%1), %%r8;"
" mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"