2020-08-27 17:31:49

by Uros Bizjak

[permalink] [raw]
Subject: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

x86_64 zero extends 32bit operations, so for 64bit operands,
XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
a REX prefix byte when legacy registers are used.

Signed-off-by: Uros Bizjak <[email protected]>
Cc: Herbert Xu <[email protected]>
Cc: "David S. Miller" <[email protected]>
---
arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 8acbb6584a37..a9edb6f8a0ba 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)

asm volatile(
/* Clear registers to propagate the carry bit */
- " xor %%r8, %%r8;"
- " xor %%r9, %%r9;"
- " xor %%r10, %%r10;"
- " xor %%r11, %%r11;"
- " xor %1, %1;"
+ " xor %%r8d, %%r8d;"
+ " xor %%r9d, %%r9d;"
+ " xor %%r10d, %%r10d;"
+ " xor %%r11d, %%r11d;"
+ " xor %k1, %k1;"

/* Begin addition chain */
" addq 0(%3), %0;"
@@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
" cmovc %0, %%rax;"

/* Step 2: Add carry*38 to the original sum */
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%1);"
@@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)

/* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
@@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)

/* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
@@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)

/* Compute src1[0] * src2 */
" movq 32(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
/* Compute src1[1] * src2 */
" movq 40(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[2] * src2 */
" movq 48(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
/* Compute src1[3] * src2 */
" movq 56(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
@@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
- " xor %3, %3;"
+ " xor %k3, %k3;"
" adoxq 64(%1), %%r8;"
" mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)

/* Step 1: Compute all partial products */
" movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
@@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */

/* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
@@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
asm volatile(
/* Step 1: Compute all partial products */
" movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%1), %%rdx;" /* f[3] */
@@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */

/* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
@@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)

/* Step 1: Compute all partial products */
" movq 32(%1), %%rdx;" /* f[0] */
- " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 56(%1), %%rdx;" /* f[3] */
@@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */

/* Step 2: Compute two parallel carry chains */
- " xor %%r15, %%r15;"
+ " xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
@@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" adoxq 0(%1), %%r8;"
" mulxq 40(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
@@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
- " xor %%rcx, %%rcx;"
+ " xor %%ecx, %%ecx;"
" adoxq 64(%1), %%r8;"
" mulxq 104(%1), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
--
2.26.2


2020-09-01 16:09:24

by Ard Biesheuvel

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

(+ Jason)

On Thu, 27 Aug 2020 at 20:31, Uros Bizjak <[email protected]> wrote:
>
> x86_64 zero extends 32bit operations, so for 64bit operands,
> XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> a REX prefix byte when legacy registers are used.
>
> Signed-off-by: Uros Bizjak <[email protected]>
> Cc: Herbert Xu <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> ---
> arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
> 1 file changed, 34 insertions(+), 34 deletions(-)
>
> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> index 8acbb6584a37..a9edb6f8a0ba 100644
> --- a/arch/x86/crypto/curve25519-x86_64.c
> +++ b/arch/x86/crypto/curve25519-x86_64.c
> @@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
>
> asm volatile(
> /* Clear registers to propagate the carry bit */
> - " xor %%r8, %%r8;"
> - " xor %%r9, %%r9;"
> - " xor %%r10, %%r10;"
> - " xor %%r11, %%r11;"
> - " xor %1, %1;"
> + " xor %%r8d, %%r8d;"
> + " xor %%r9d, %%r9d;"
> + " xor %%r10d, %%r10d;"
> + " xor %%r11d, %%r11d;"
> + " xor %k1, %k1;"
>
> /* Begin addition chain */
> " addq 0(%3), %0;"
> @@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
> " cmovc %0, %%rax;"
>
> /* Step 2: Add carry*38 to the original sum */
> - " xor %%rcx, %%rcx;"
> + " xor %%ecx, %%ecx;"
> " add %%rax, %%r8;"
> " adcx %%rcx, %%r9;"
> " movq %%r9, 8(%1);"
> @@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
> /* Compute src1[0] * src2 */
> " movq 0(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;"
> /* Compute src1[1] * src2 */
> " movq 8(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[2] * src2 */
> " movq 16(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[3] * src2 */
> " movq 24(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
> @@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 32(%1), %%r8, %%r13;"
> - " xor %3, %3;"
> + " xor %k3, %k3;"
> " adoxq 0(%1), %%r8;"
> " mulxq 40(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
> /* Compute src1[0] * src2 */
> " movq 0(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;"
> /* Compute src1[1] * src2 */
> " movq 8(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[2] * src2 */
> " movq 16(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[3] * src2 */
> " movq 24(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
> @@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
> /* Compute src1[0] * src2 */
> " movq 32(%1), %%rdx;"
> - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
> - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
> + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
> + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
> " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
> " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;"
> /* Compute src1[1] * src2 */
> " movq 40(%1), %%rdx;"
> - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
> - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
> + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
> + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
> " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[2] * src2 */
> " movq 48(%1), %%rdx;"
> - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
> - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
> + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
> + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
> " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[3] * src2 */
> " movq 56(%1), %%rdx;"
> - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
> - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
> + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
> + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
> " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
> " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
> @@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 32(%1), %%r8, %%r13;"
> - " xor %3, %3;"
> + " xor %k3, %k3;"
> " adoxq 0(%1), %%r8;"
> " mulxq 40(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 96(%1), %%r8, %%r13;"
> - " xor %3, %3;"
> + " xor %k3, %k3;"
> " adoxq 64(%1), %%r8;"
> " mulxq 104(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>
> /* Step 1: Compute all partial products */
> " movq 0(%1), %%rdx;" /* f[0] */
> - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
> + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
> " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
> " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
> " movq 24(%1), %%rdx;" /* f[3] */
> @@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
>
> /* Step 2: Compute two parallel carry chains */
> - " xor %%r15, %%r15;"
> + " xor %%r15d, %%r15d;"
> " adox %%rax, %%r10;"
> " adcx %%r8, %%r8;"
> " adox %%rcx, %%r11;"
> @@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 32(%1), %%r8, %%r13;"
> - " xor %%rcx, %%rcx;"
> + " xor %%ecx, %%ecx;"
> " adoxq 0(%1), %%r8;"
> " mulxq 40(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> asm volatile(
> /* Step 1: Compute all partial products */
> " movq 0(%1), %%rdx;" /* f[0] */
> - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
> + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
> " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
> " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
> " movq 24(%1), %%rdx;" /* f[3] */
> @@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
>
> /* Step 2: Compute two parallel carry chains */
> - " xor %%r15, %%r15;"
> + " xor %%r15d, %%r15d;"
> " adox %%rax, %%r10;"
> " adcx %%r8, %%r8;"
> " adox %%rcx, %%r11;"
> @@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>
> /* Step 1: Compute all partial products */
> " movq 32(%1), %%rdx;" /* f[0] */
> - " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
> + " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
> " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
> " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
> " movq 56(%1), %%rdx;" /* f[3] */
> @@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
>
> /* Step 2: Compute two parallel carry chains */
> - " xor %%r15, %%r15;"
> + " xor %%r15d, %%r15d;"
> " adox %%rax, %%r10;"
> " adcx %%r8, %%r8;"
> " adox %%rcx, %%r11;"
> @@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 32(%1), %%r8, %%r13;"
> - " xor %%rcx, %%rcx;"
> + " xor %%ecx, %%ecx;"
> " adoxq 0(%1), %%r8;"
> " mulxq 40(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 96(%1), %%r8, %%r13;"
> - " xor %%rcx, %%rcx;"
> + " xor %%ecx, %%ecx;"
> " adoxq 64(%1), %%r8;"
> " mulxq 104(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> --
> 2.26.2
>

2020-09-01 18:14:34

by Jason A. Donenfeld

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

Hi Uros,

Thanks for this patch. This seems correct to me, since indeed those
clear the top bits anyway, and having smaller code seems good. But
first -- I'm wondering -- have you stuck this into Vale/Hacl to see if
it still checks out there? I'm CC'ing Karthik/Aymeric/Chris/Jonathan
who might be interested in taking a look at that. Seems like it might
be easy to just special case this in Xor64 for the case where the
operands are the same. Also, have you seen any measurable differences
when benching this? I can stick it into kbench9000 to see if you
haven't looked yet.

Jason

On Tue, Sep 1, 2020 at 5:46 PM Ard Biesheuvel <[email protected]> wrote:
>
> (+ Jason)
>
> On Thu, 27 Aug 2020 at 20:31, Uros Bizjak <[email protected]> wrote:
> >
> > x86_64 zero extends 32bit operations, so for 64bit operands,
> > XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> > a REX prefix byte when legacy registers are used.
> >
> > Signed-off-by: Uros Bizjak <[email protected]>
> > Cc: Herbert Xu <[email protected]>
> > Cc: "David S. Miller" <[email protected]>
> > ---
> > arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
> > 1 file changed, 34 insertions(+), 34 deletions(-)
> >
> > diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> > index 8acbb6584a37..a9edb6f8a0ba 100644
> > --- a/arch/x86/crypto/curve25519-x86_64.c
> > +++ b/arch/x86/crypto/curve25519-x86_64.c
> > @@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
> >
> > asm volatile(
> > /* Clear registers to propagate the carry bit */
> > - " xor %%r8, %%r8;"
> > - " xor %%r9, %%r9;"
> > - " xor %%r10, %%r10;"
> > - " xor %%r11, %%r11;"
> > - " xor %1, %1;"
> > + " xor %%r8d, %%r8d;"
> > + " xor %%r9d, %%r9d;"
> > + " xor %%r10d, %%r10d;"
> > + " xor %%r11d, %%r11d;"
> > + " xor %k1, %k1;"
> >
> > /* Begin addition chain */
> > " addq 0(%3), %0;"
> > @@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
> > " cmovc %0, %%rax;"
> >
> > /* Step 2: Add carry*38 to the original sum */
> > - " xor %%rcx, %%rcx;"
> > + " xor %%ecx, %%ecx;"
> > " add %%rax, %%r8;"
> > " adcx %%rcx, %%r9;"
> > " movq %%r9, 8(%1);"
> > @@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >
> > /* Compute src1[0] * src2 */
> > " movq 0(%1), %%rdx;"
> > - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
> > + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
> > " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
> > " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
> > " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;"
> > /* Compute src1[1] * src2 */
> > " movq 8(%1), %%rdx;"
> > - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> > + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> > " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
> > " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> > " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> > /* Compute src1[2] * src2 */
> > " movq 16(%1), %%rdx;"
> > - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> > + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> > " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
> > " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> > " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> > /* Compute src1[3] * src2 */
> > " movq 24(%1), %%rdx;"
> > - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> > + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> > " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
> > " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
> > " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
> > @@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> > /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> > " mov $38, %%rdx;"
> > " mulxq 32(%1), %%r8, %%r13;"
> > - " xor %3, %3;"
> > + " xor %k3, %k3;"
> > " adoxq 0(%1), %%r8;"
> > " mulxq 40(%1), %%r9, %%rbx;"
> > " adcx %%r13, %%r9;"
> > @@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >
> > /* Compute src1[0] * src2 */
> > " movq 0(%1), %%rdx;"
> > - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
> > + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
> > " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
> > " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
> > " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;"
> > /* Compute src1[1] * src2 */
> > " movq 8(%1), %%rdx;"
> > - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> > + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> > " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
> > " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> > " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> > /* Compute src1[2] * src2 */
> > " movq 16(%1), %%rdx;"
> > - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> > + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> > " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
> > " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> > " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> > /* Compute src1[3] * src2 */
> > " movq 24(%1), %%rdx;"
> > - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> > + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> > " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
> > " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
> > " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
> > @@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> >
> > /* Compute src1[0] * src2 */
> > " movq 32(%1), %%rdx;"
> > - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
> > - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
> > + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
> > + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
> > " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
> > " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;"
> > /* Compute src1[1] * src2 */
> > " movq 40(%1), %%rdx;"
> > - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
> > - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
> > + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
> > + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
> > " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> > " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> > /* Compute src1[2] * src2 */
> > " movq 48(%1), %%rdx;"
> > - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
> > - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
> > + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
> > + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
> > " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> > " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> > /* Compute src1[3] * src2 */
> > " movq 56(%1), %%rdx;"
> > - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
> > - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
> > + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
> > + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
> > " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
> > " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
> > " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
> > @@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> > /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> > " mov $38, %%rdx;"
> > " mulxq 32(%1), %%r8, %%r13;"
> > - " xor %3, %3;"
> > + " xor %k3, %k3;"
> > " adoxq 0(%1), %%r8;"
> > " mulxq 40(%1), %%r9, %%rbx;"
> > " adcx %%r13, %%r9;"
> > @@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> > /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> > " mov $38, %%rdx;"
> > " mulxq 96(%1), %%r8, %%r13;"
> > - " xor %3, %3;"
> > + " xor %k3, %k3;"
> > " adoxq 64(%1), %%r8;"
> > " mulxq 104(%1), %%r9, %%rbx;"
> > " adcx %%r13, %%r9;"
> > @@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> >
> > /* Step 1: Compute all partial products */
> > " movq 0(%1), %%rdx;" /* f[0] */
> > - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
> > + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
> > " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
> > " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
> > " movq 24(%1), %%rdx;" /* f[3] */
> > @@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> > " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
> >
> > /* Step 2: Compute two parallel carry chains */
> > - " xor %%r15, %%r15;"
> > + " xor %%r15d, %%r15d;"
> > " adox %%rax, %%r10;"
> > " adcx %%r8, %%r8;"
> > " adox %%rcx, %%r11;"
> > @@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> > /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> > " mov $38, %%rdx;"
> > " mulxq 32(%1), %%r8, %%r13;"
> > - " xor %%rcx, %%rcx;"
> > + " xor %%ecx, %%ecx;"
> > " adoxq 0(%1), %%r8;"
> > " mulxq 40(%1), %%r9, %%rbx;"
> > " adcx %%r13, %%r9;"
> > @@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> > asm volatile(
> > /* Step 1: Compute all partial products */
> > " movq 0(%1), %%rdx;" /* f[0] */
> > - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
> > + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
> > " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
> > " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
> > " movq 24(%1), %%rdx;" /* f[3] */
> > @@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> > " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
> >
> > /* Step 2: Compute two parallel carry chains */
> > - " xor %%r15, %%r15;"
> > + " xor %%r15d, %%r15d;"
> > " adox %%rax, %%r10;"
> > " adcx %%r8, %%r8;"
> > " adox %%rcx, %%r11;"
> > @@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> >
> > /* Step 1: Compute all partial products */
> > " movq 32(%1), %%rdx;" /* f[0] */
> > - " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
> > + " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
> > " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
> > " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
> > " movq 56(%1), %%rdx;" /* f[3] */
> > @@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> > " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
> >
> > /* Step 2: Compute two parallel carry chains */
> > - " xor %%r15, %%r15;"
> > + " xor %%r15d, %%r15d;"
> > " adox %%rax, %%r10;"
> > " adcx %%r8, %%r8;"
> > " adox %%rcx, %%r11;"
> > @@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> > /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> > " mov $38, %%rdx;"
> > " mulxq 32(%1), %%r8, %%r13;"
> > - " xor %%rcx, %%rcx;"
> > + " xor %%ecx, %%ecx;"
> > " adoxq 0(%1), %%r8;"
> > " mulxq 40(%1), %%r9, %%rbx;"
> > " adcx %%r13, %%r9;"
> > @@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> > /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> > " mov $38, %%rdx;"
> > " mulxq 96(%1), %%r8, %%r13;"
> > - " xor %%rcx, %%rcx;"
> > + " xor %%ecx, %%ecx;"
> > " adoxq 64(%1), %%r8;"
> > " mulxq 104(%1), %%r9, %%rbx;"
> > " adcx %%r13, %%r9;"
> > --
> > 2.26.2
> >

2020-09-01 19:13:10

by Jason A. Donenfeld

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <[email protected]> wrote:
> operands are the same. Also, have you seen any measurable differences
> when benching this? I can stick it into kbench9000 to see if you
> haven't looked yet.

On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
difference with this, at all, no matter how much I median or mean or
reduce noise by disabling interrupts.

One thing that sticks out is that all the replacements of r8-r15 by
their %r8d-r15d counterparts still have the REX prefix, as is
necessary to access those registers. The only ones worth changing,
then, are the legacy registers, and on a whole, this amounts to only
48 bytes of difference.

2020-09-02 05:51:27

by Uros Bizjak

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <[email protected]> wrote:
>
> On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <[email protected]> wrote:
> > operands are the same. Also, have you seen any measurable differences
> > when benching this? I can stick it into kbench9000 to see if you
> > haven't looked yet.
>
> On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> difference with this, at all, no matter how much I median or mean or
> reduce noise by disabling interrupts.
>
> One thing that sticks out is that all the replacements of r8-r15 by
> their %r8d-r15d counterparts still have the REX prefix, as is
> necessary to access those registers. The only ones worth changing,
> then, are the legacy registers, and on a whole, this amounts to only
> 48 bytes of difference.

The patch implements one of x86 target specific optimizations,
performed by gcc. The optimization results in code size savings of one
byte, where REX prefix is omitted with legacy registers, but otherwise
should have no measurable runtime effect. Since gcc applies this
optimization universally to all integer registers, I took the same
approach and implemented the same change to legacy and REX registers.
As measured above, 48 bytes saved is a good result for such a trivial
optimization.

Uros.

2020-09-02 09:18:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <[email protected]> wrote:
> >
> > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <[email protected]> wrote:
> > > operands are the same. Also, have you seen any measurable differences
> > > when benching this? I can stick it into kbench9000 to see if you
> > > haven't looked yet.
> >
> > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > difference with this, at all, no matter how much I median or mean or
> > reduce noise by disabling interrupts.
> >
> > One thing that sticks out is that all the replacements of r8-r15 by
> > their %r8d-r15d counterparts still have the REX prefix, as is
> > necessary to access those registers. The only ones worth changing,
> > then, are the legacy registers, and on a whole, this amounts to only
> > 48 bytes of difference.
>
> The patch implements one of x86 target specific optimizations,
> performed by gcc. The optimization results in code size savings of one
> byte, where REX prefix is omitted with legacy registers, but otherwise
> should have no measurable runtime effect. Since gcc applies this
> optimization universally to all integer registers, I took the same
> approach and implemented the same change to legacy and REX registers.
> As measured above, 48 bytes saved is a good result for such a trivial
> optimization.

Could we instead implement this optimization in GAS ? Then we can leave
the code as-is.

2020-09-02 11:36:59

by Uros Bizjak

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

On Wed, Sep 2, 2020 at 11:17 AM <[email protected]> wrote:
>
> On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> > On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <[email protected]> wrote:
> > >
> > > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <[email protected]> wrote:
> > > > operands are the same. Also, have you seen any measurable differences
> > > > when benching this? I can stick it into kbench9000 to see if you
> > > > haven't looked yet.
> > >
> > > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > > difference with this, at all, no matter how much I median or mean or
> > > reduce noise by disabling interrupts.
> > >
> > > One thing that sticks out is that all the replacements of r8-r15 by
> > > their %r8d-r15d counterparts still have the REX prefix, as is
> > > necessary to access those registers. The only ones worth changing,
> > > then, are the legacy registers, and on a whole, this amounts to only
> > > 48 bytes of difference.
> >
> > The patch implements one of x86 target specific optimizations,
> > performed by gcc. The optimization results in code size savings of one
> > byte, where REX prefix is omitted with legacy registers, but otherwise
> > should have no measurable runtime effect. Since gcc applies this
> > optimization universally to all integer registers, I took the same
> > approach and implemented the same change to legacy and REX registers.
> > As measured above, 48 bytes saved is a good result for such a trivial
> > optimization.
>
> Could we instead implement this optimization in GAS ? Then we can leave
> the code as-is.

I don't think that e.g. "xorq %rax,%rax" should universally be
translated to "xorl %eax,%eax" in the assembler. Perhaps the author
expected exactly 3 bytes (to align the code or similar), and the
assembler would change the length to 2 bytes behind his back, breaking
the expectations.

Uros.

2020-09-02 15:00:28

by Jason A. Donenfeld

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

On Wed, Sep 2, 2020 at 11:44 AM <[email protected]> wrote:
>
> On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> > On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <[email protected]> wrote:
> > >
> > > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <[email protected]> wrote:
> > > > operands are the same. Also, have you seen any measurable differences
> > > > when benching this? I can stick it into kbench9000 to see if you
> > > > haven't looked yet.
> > >
> > > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > > difference with this, at all, no matter how much I median or mean or
> > > reduce noise by disabling interrupts.
> > >
> > > One thing that sticks out is that all the replacements of r8-r15 by
> > > their %r8d-r15d counterparts still have the REX prefix, as is
> > > necessary to access those registers. The only ones worth changing,
> > > then, are the legacy registers, and on a whole, this amounts to only
> > > 48 bytes of difference.
> >
> > The patch implements one of x86 target specific optimizations,
> > performed by gcc. The optimization results in code size savings of one
> > byte, where REX prefix is omitted with legacy registers, but otherwise
> > should have no measurable runtime effect. Since gcc applies this
> > optimization universally to all integer registers, I took the same
> > approach and implemented the same change to legacy and REX registers.
> > As measured above, 48 bytes saved is a good result for such a trivial
> > optimization.
>
> Could we instead implement this optimization in GAS ? Then we can leave
> the code as-is.

I rather like that idea. Though I wonder if some would balk at it for
smelling a bit like the MIPS assembler with its optimization pass...

2020-09-02 15:05:12

by Jason A. Donenfeld

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

On Wed, Sep 2, 2020 at 1:42 PM Uros Bizjak <[email protected]> wrote:
>
> On Wed, Sep 2, 2020 at 11:17 AM <[email protected]> wrote:
> >
> > On Wed, Sep 02, 2020 at 07:50:36AM +0200, Uros Bizjak wrote:
> > > On Tue, Sep 1, 2020 at 9:12 PM Jason A. Donenfeld <[email protected]> wrote:
> > > >
> > > > On Tue, Sep 1, 2020 at 8:13 PM Jason A. Donenfeld <[email protected]> wrote:
> > > > > operands are the same. Also, have you seen any measurable differences
> > > > > when benching this? I can stick it into kbench9000 to see if you
> > > > > haven't looked yet.
> > > >
> > > > On a Skylake server (Xeon Gold 5120), I'm unable to see any measurable
> > > > difference with this, at all, no matter how much I median or mean or
> > > > reduce noise by disabling interrupts.
> > > >
> > > > One thing that sticks out is that all the replacements of r8-r15 by
> > > > their %r8d-r15d counterparts still have the REX prefix, as is
> > > > necessary to access those registers. The only ones worth changing,
> > > > then, are the legacy registers, and on a whole, this amounts to only
> > > > 48 bytes of difference.
> > >
> > > The patch implements one of x86 target specific optimizations,
> > > performed by gcc. The optimization results in code size savings of one
> > > byte, where REX prefix is omitted with legacy registers, but otherwise
> > > should have no measurable runtime effect. Since gcc applies this
> > > optimization universally to all integer registers, I took the same
> > > approach and implemented the same change to legacy and REX registers.
> > > As measured above, 48 bytes saved is a good result for such a trivial
> > > optimization.
> >
> > Could we instead implement this optimization in GAS ? Then we can leave
> > the code as-is.
>
> I don't think that e.g. "xorq %rax,%rax" should universally be
> translated to "xorl %eax,%eax" in the assembler. Perhaps the author
> expected exactly 3 bytes (to align the code or similar), and the
> assembler would change the length to 2 bytes behind his back, breaking
> the expectations.

Are you sure that's something that GAS actually provides now? Seems
like a lot of mnemonics have ambiguous/injective encodings, and this
wouldn't make things any different. Most authors use .byte or .align
when they care about the actual bytes, no?

2020-09-07 17:48:09

by Jason A. Donenfeld

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

Hi Uros, Herbert,

On Thu, Aug 27, 2020 at 07:30:58PM +0200, Uros Bizjak wrote:
> x86_64 zero extends 32bit operations, so for 64bit operands,
> XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> a REX prefix byte when legacy registers are used.
>
> Signed-off-by: Uros Bizjak <[email protected]>
> Cc: Herbert Xu <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> ---
> arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
> 1 file changed, 34 insertions(+), 34 deletions(-)
>
> diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
> index 8acbb6584a37..a9edb6f8a0ba 100644
> --- a/arch/x86/crypto/curve25519-x86_64.c
> +++ b/arch/x86/crypto/curve25519-x86_64.c
> @@ -45,11 +45,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
>
> asm volatile(
> /* Clear registers to propagate the carry bit */
> - " xor %%r8, %%r8;"
> - " xor %%r9, %%r9;"
> - " xor %%r10, %%r10;"
> - " xor %%r11, %%r11;"
> - " xor %1, %1;"
> + " xor %%r8d, %%r8d;"
> + " xor %%r9d, %%r9d;"
> + " xor %%r10d, %%r10d;"
> + " xor %%r11d, %%r11d;"
> + " xor %k1, %k1;"
>
> /* Begin addition chain */
> " addq 0(%3), %0;"
> @@ -93,7 +93,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
> " cmovc %0, %%rax;"
>
> /* Step 2: Add carry*38 to the original sum */
> - " xor %%rcx, %%rcx;"
> + " xor %%ecx, %%ecx;"
> " add %%rax, %%r8;"
> " adcx %%rcx, %%r9;"
> " movq %%r9, 8(%1);"
> @@ -165,28 +165,28 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
> /* Compute src1[0] * src2 */
> " movq 0(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;"
> /* Compute src1[1] * src2 */
> " movq 8(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[2] * src2 */
> " movq 16(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[3] * src2 */
> " movq 24(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
> @@ -200,7 +200,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 32(%1), %%r8, %%r13;"
> - " xor %3, %3;"
> + " xor %k3, %k3;"
> " adoxq 0(%1), %%r8;"
> " mulxq 40(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -246,28 +246,28 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
> /* Compute src1[0] * src2 */
> " movq 0(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;"
> /* Compute src1[1] * src2 */
> " movq 8(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[2] * src2 */
> " movq 16(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[3] * src2 */
> " movq 24(%1), %%rdx;"
> - " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> + " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
> " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
> " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
> " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
> @@ -277,29 +277,29 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
>
> /* Compute src1[0] * src2 */
> " movq 32(%1), %%rdx;"
> - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
> - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
> + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
> + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
> " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
> " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;"
> /* Compute src1[1] * src2 */
> " movq 40(%1), %%rdx;"
> - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
> - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
> + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
> + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
> " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[2] * src2 */
> " movq 48(%1), %%rdx;"
> - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
> - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
> + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
> + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
> " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
> " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
> /* Compute src1[3] * src2 */
> " movq 56(%1), %%rdx;"
> - " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
> - " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
> + " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
> + " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
> " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
> " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
> " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
> @@ -312,7 +312,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 32(%1), %%r8, %%r13;"
> - " xor %3, %3;"
> + " xor %k3, %k3;"
> " adoxq 0(%1), %%r8;"
> " mulxq 40(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -345,7 +345,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 96(%1), %%r8, %%r13;"
> - " xor %3, %3;"
> + " xor %k3, %k3;"
> " adoxq 64(%1), %%r8;"
> " mulxq 104(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -516,7 +516,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
>
> /* Step 1: Compute all partial products */
> " movq 0(%1), %%rdx;" /* f[0] */
> - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
> + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
> " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
> " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
> " movq 24(%1), %%rdx;" /* f[3] */
> @@ -526,7 +526,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
>
> /* Step 2: Compute two parallel carry chains */
> - " xor %%r15, %%r15;"
> + " xor %%r15d, %%r15d;"
> " adox %%rax, %%r10;"
> " adcx %%r8, %%r8;"
> " adox %%rcx, %%r11;"
> @@ -563,7 +563,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 32(%1), %%r8, %%r13;"
> - " xor %%rcx, %%rcx;"
> + " xor %%ecx, %%ecx;"
> " adoxq 0(%1), %%r8;"
> " mulxq 40(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -607,7 +607,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> asm volatile(
> /* Step 1: Compute all partial products */
> " movq 0(%1), %%rdx;" /* f[0] */
> - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
> + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
> " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
> " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
> " movq 24(%1), %%rdx;" /* f[3] */
> @@ -617,7 +617,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
>
> /* Step 2: Compute two parallel carry chains */
> - " xor %%r15, %%r15;"
> + " xor %%r15d, %%r15d;"
> " adox %%rax, %%r10;"
> " adcx %%r8, %%r8;"
> " adox %%rcx, %%r11;"
> @@ -647,7 +647,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
>
> /* Step 1: Compute all partial products */
> " movq 32(%1), %%rdx;" /* f[0] */
> - " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
> + " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
> " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
> " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
> " movq 56(%1), %%rdx;" /* f[3] */
> @@ -657,7 +657,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
>
> /* Step 2: Compute two parallel carry chains */
> - " xor %%r15, %%r15;"
> + " xor %%r15d, %%r15d;"
> " adox %%rax, %%r10;"
> " adcx %%r8, %%r8;"
> " adox %%rcx, %%r11;"
> @@ -692,7 +692,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 32(%1), %%r8, %%r13;"
> - " xor %%rcx, %%rcx;"
> + " xor %%ecx, %%ecx;"
> " adoxq 0(%1), %%r8;"
> " mulxq 40(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> @@ -725,7 +725,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
> /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
> " mov $38, %%rdx;"
> " mulxq 96(%1), %%r8, %%r13;"
> - " xor %%rcx, %%rcx;"
> + " xor %%ecx, %%ecx;"
> " adoxq 64(%1), %%r8;"
> " mulxq 104(%1), %%r9, %%rbx;"
> " adcx %%r13, %%r9;"
> --
> 2.26.2
>

Looks like this is going into HACL in the end, so:

Acked-by: Jason A. Donenfeld <[email protected]>

for cryptodev-2.6.git, rather than crypto-2.6.git

Thanks,
Jason

2020-09-11 06:56:38

by Herbert Xu

[permalink] [raw]
Subject: Re: [PATCH] crypto/x86: Use XORL r32,32 in curve25519-x86_64.c

On Thu, Aug 27, 2020 at 07:30:58PM +0200, Uros Bizjak wrote:
> x86_64 zero extends 32bit operations, so for 64bit operands,
> XORL r32,r32 is functionally equal to XORL r64,r64, but avoids
> a REX prefix byte when legacy registers are used.
>
> Signed-off-by: Uros Bizjak <[email protected]>
> Cc: Herbert Xu <[email protected]>
> Cc: "David S. Miller" <[email protected]>
> ---
> arch/x86/crypto/curve25519-x86_64.c | 68 ++++++++++++++---------------
> 1 file changed, 34 insertions(+), 34 deletions(-)

Patch applied. Thanks.
--
Email: Herbert Xu <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt