2018-06-25 10:26:49

by Jan Beulich

Subject: [PATCH] x86-64: use 32-bit XOR to zero registers

Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms - use
32-bit ones instead.

Signed-off-by: Jan Beulich <[email protected]>
---
arch/x86/crypto/aegis128-aesni-asm.S | 2 +-
arch/x86/crypto/aegis128l-aesni-asm.S | 2 +-
arch/x86/crypto/aegis256-aesni-asm.S | 2 +-
arch/x86/crypto/aesni-intel_asm.S | 8 ++++----
arch/x86/crypto/aesni-intel_avx-x86_64.S | 4 ++--
arch/x86/crypto/morus1280-avx2-asm.S | 2 +-
arch/x86/crypto/morus1280-sse2-asm.S | 2 +-
arch/x86/crypto/morus640-sse2-asm.S | 2 +-
arch/x86/crypto/sha1_ssse3_asm.S | 2 +-
arch/x86/kernel/head_64.S | 2 +-
arch/x86/kernel/paravirt_patch_64.c | 2 +-
arch/x86/lib/memcpy_64.S | 2 +-
arch/x86/power/hibernate_asm_64.S | 2 +-
13 files changed, 17 insertions(+), 17 deletions(-)

--- 4.18-rc2/arch/x86/crypto/aegis128-aesni-asm.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/crypto/aegis128-aesni-asm.S
@@ -75,7 +75,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG

mov LEN, %r8
--- 4.18-rc2/arch/x86/crypto/aegis128l-aesni-asm.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/crypto/aegis128l-aesni-asm.S
@@ -66,7 +66,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG0, MSG0
pxor MSG1, MSG1

--- 4.18-rc2/arch/x86/crypto/aegis256-aesni-asm.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/crypto/aegis256-aesni-asm.S
@@ -59,7 +59,7 @@
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG

mov LEN, %r8
--- 4.18-rc2/arch/x86/crypto/aesni-intel_asm.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/crypto/aesni-intel_asm.S
@@ -258,7 +258,7 @@ ALL_F: .octa 0xffffffffffffffffffff
.macro GCM_INIT Iv SUBKEY AAD AADLEN
mov \AADLEN, %r11
mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
- xor %r11, %r11
+ xor %r11d, %r11d
mov %r11, InLen(%arg2) # ctx_data.in_length = 0
mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
@@ -286,7 +286,7 @@ ALL_F: .octa 0xffffffffffffffffffff
movdqu HashKey(%arg2), %xmm13
add %arg5, InLen(%arg2)

- xor %r11, %r11 # initialise the data pointer offset as zero
+ xor %r11d, %r11d # initialise the data pointer offset as zero
PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

sub %r11, %arg5 # sub partial block data used
@@ -702,7 +702,7 @@ _no_extra_mask_1_\@:

# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %rax,%rax
+ xor %eax, %eax

mov %rax, PBlockLen(%arg2)
jmp _dec_done_\@
@@ -737,7 +737,7 @@ _no_extra_mask_2_\@:

# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- xor %rax,%rax
+ xor %eax, %eax

mov %rax, PBlockLen(%arg2)
jmp _encode_done_\@
--- 4.18-rc2/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -463,7 +463,7 @@ _get_AAD_rest_final\@:

_get_AAD_done\@:
# initialize the data pointer offset as zero
- xor %r11, %r11
+ xor %r11d, %r11d

# start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0
@@ -1770,7 +1770,7 @@ _get_AAD_rest_final\@:

_get_AAD_done\@:
# initialize the data pointer offset as zero
- xor %r11, %r11
+ xor %r11d, %r11d

# start AES for num_initial_blocks blocks
mov arg5, %rax # rax = *Y0
--- 4.18-rc2/arch/x86/crypto/morus1280-avx2-asm.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/crypto/morus1280-avx2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus1280_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
vpxor MSG, MSG, MSG

mov %rcx, %r8
--- 4.18-rc2/arch/x86/crypto/morus1280-sse2-asm.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/crypto/morus1280-sse2-asm.S
@@ -235,7 +235,7 @@ ENDPROC(__morus1280_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG_LO, MSG_LO
pxor MSG_HI, MSG_HI

--- 4.18-rc2/arch/x86/crypto/morus640-sse2-asm.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/crypto/morus640-sse2-asm.S
@@ -113,7 +113,7 @@ ENDPROC(__morus640_update_zero)
* %r9
*/
__load_partial:
- xor %r9, %r9
+ xor %r9d, %r9d
pxor MSG, MSG

mov %rcx, %r8
--- 4.18-rc2/arch/x86/crypto/sha1_ssse3_asm.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/crypto/sha1_ssse3_asm.S
@@ -96,7 +96,7 @@
# cleanup workspace
mov $8, %ecx
mov %rsp, %rdi
- xor %rax, %rax
+ xor %eax, %eax
rep stosq

mov %rbp, %rsp # deallocate workspace
--- 4.18-rc2/arch/x86/kernel/head_64.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/kernel/head_64.S
@@ -235,7 +235,7 @@ ENTRY(secondary_startup_64)
* address given in m16:64.
*/
pushq $.Lafter_lret # put return address on stack for unwinder
- xorq %rbp, %rbp # clear frame pointer
+ xorl %ebp, %ebp # clear frame pointer
movq initial_code(%rip), %rax
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
--- 4.18-rc2/arch/x86/kernel/paravirt_patch_64.c
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/kernel/paravirt_patch_64.c
@@ -20,7 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");

#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
-DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
#endif

unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
--- 4.18-rc2/arch/x86/lib/memcpy_64.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/lib/memcpy_64.S
@@ -256,7 +256,7 @@ ENTRY(__memcpy_mcsafe)

/* Copy successful. Return zero */
.L_done_memcpy_trap:
- xorq %rax, %rax
+ xorl %eax, %eax
ret
ENDPROC(__memcpy_mcsafe)
EXPORT_SYMBOL_GPL(__memcpy_mcsafe)
--- 4.18-rc2/arch/x86/power/hibernate_asm_64.S
+++ 4.18-rc2-x86_64-32bit-XOR/arch/x86/power/hibernate_asm_64.S
@@ -137,7 +137,7 @@ ENTRY(restore_registers)
/* Saved in save_processor_state. */
lgdt saved_context_gdt_desc(%rax)

- xorq %rax, %rax
+ xorl %eax, %eax

/* tell the hibernation core that we've just restored the memory */
movq %rax, in_suspend(%rip)





2018-06-25 16:34:45

by Randy Dunlap

Subject: Re: [PATCH] x86-64: use 32-bit XOR to zero registers

On 06/25/2018 03:25 AM, Jan Beulich wrote:
> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms - use
> 32-bit ones instead.

Hmph. Is that considered a bug (errata)?

URL/references?

Are these changes really only zeroing the lower 32 bits of the register?
and that's all that the code cares about?

thanks.

--
~Randy

2018-06-25 16:50:59

by H. Peter Anvin

Subject: Re: [PATCH] x86-64: use 32-bit XOR to zero registers

On June 25, 2018 9:33:35 AM PDT, Randy Dunlap <[email protected]> wrote:
>On 06/25/2018 03:25 AM, Jan Beulich wrote:
>> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms - use
>> 32-bit ones instead.
>
>Hmph. Is that considered a bug (errata)?
>
>URL/references?
>
>Are these changes really only zeroing the lower 32 bits of the
>register?
>and that's all that the code cares about?
>
>thanks.

Writing the low 32 bits zero-extends the result to 64 bits anyway.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.

2018-06-26 06:34:49

by Jan Beulich

Subject: Re: [PATCH] x86-64: use 32-bit XOR to zero registers

>>> On 25.06.18 at 18:33, <[email protected]> wrote:
> On 06/25/2018 03:25 AM, Jan Beulich wrote:
>> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms - use
>> 32-bit ones instead.
>
> Hmph. Is that considered a bug (errata)?

No.

> URL/references?

Intel's Optimization Reference Manual says so (in rev 040 this is in section
16.2.2.5 "Zeroing Idioms" as a subsection of the Goldmont/Silvermont
descriptions).

> Are these changes really only zeroing the lower 32 bits of the register?
> and that's all that the code cares about?

No - like all operations targeting a 32-bit register, the result is zero
extended to the entire 64-bit destination register.

Jan
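
(For illustration only, not part of the original patch or thread: the zero-extension
behaviour described above can be checked from user space with a small standalone
program. This is just a sketch and assumes an x86-64 build of GCC or Clang.)

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t r = 0xffffffffffffffffULL;

            /* Fill %rax with all ones, then clear it with the 32-bit idiom. */
            asm volatile("movq %1, %%rax\n\t"
                         "xorl %%eax, %%eax\n\t" /* writes EAX; the CPU zero-extends into RAX */
                         "movq %%rax, %0"
                         : "=r" (r)
                         : "r" (r)
                         : "rax");

            /* Prints 0: the whole 64-bit register was cleared. */
            printf("%#llx\n", (unsigned long long)r);
            return 0;
    }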



2018-06-26 07:18:11

by Ingo Molnar

Subject: Re: [PATCH] x86-64: use 32-bit XOR to zero registers


* Jan Beulich <[email protected]> wrote:

> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms

Please write out the consequence of that in the changelog.

Thanks,

Ingo

2018-06-26

by Henrique de Moraes Holschuh

Subject: Re: [PATCH] x86-64: use 32-bit XOR to zero registers

On Tue, 26 Jun 2018, Jan Beulich wrote:
> >>> On 25.06.18 at 18:33, <[email protected]> wrote:
> > On 06/25/2018 03:25 AM, Jan Beulich wrote:
> >> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms - use
> >> 32-bit ones instead.
> >
> > Hmph. Is that considered a bug (errata)?
>
> No.
>
> > URL/references?
>
> Intel's Optimization Reference Manual says so (in rev 040 this is in section
> 16.2.2.5 "Zeroing Idioms" as a subsection of the Goldmont/Silvermont
> descriptions).
>
> > Are these changes really only zeroing the lower 32 bits of the register?
> > and that's all that the code cares about?
>
> No - like all operations targeting a 32-bit register, the result is zero
> extended to the entire 64-bit destination register.

Missing information that would have been helpful in the commit message:

When the processor can recognize something as a zeroing idiom, it
optimizes that operation on the front-end. Only 32-bit XOR r,r is
documented as a zeroing idiom according to the Intel optimization
manual. While a few Intel processors recognize the 64-bit version of
XOR r,r as a zeroing idiom, many won't.

Note that the 32-bit operation extends to the high part of the 64-bit
register, so it will zero the entire 64-bit register. The 32-bit
instruction is also one byte shorter.

The last sentence is just a reminder, for completeness...

--
Henrique Holschuh
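
(For reference, the size difference mentioned above is easy to see by assembling
a small scratch file and disassembling it with objdump; the file layout and label
name below are made up for the example, while the byte sequences are the standard
encodings.)

    	.text
    zero_idioms:
    	xorq	%rax, %rax		# 48 31 c0 - 3 bytes (needs REX.W)
    	xorl	%eax, %eax		# 31 c0    - 2 bytes, the documented zeroing idiom
    	xorq	%r9, %r9		# 4d 31 c9 - 3 bytes (REX.WB)
    	xorl	%r9d, %r9d		# 45 31 c9 - also 3 bytes (REX.B is still needed)
    	ret

So the one-byte saving only applies to the legacy registers (%rax ... %rdi); for
%r8-%r15 a REX prefix is required either way, and the benefit there is purely that
the 32-bit form is the recognized zeroing idiom.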

2018-07-26 09:21:08

by Pavel Machek

Subject: Re: [PATCH] x86-64: use 32-bit XOR to zero registers

On Tue 2018-06-26 08:38:22, Henrique de Moraes Holschuh wrote:
> On Tue, 26 Jun 2018, Jan Beulich wrote:
> > >>> On 25.06.18 at 18:33, <[email protected]> wrote:
> > > On 06/25/2018 03:25 AM, Jan Beulich wrote:
> > >> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms - use
> > >> 32-bit ones instead.
> > >
> > > Hmph. Is that considered a bug (errata)?
> >
> > No.
> >
> > > URL/references?
> >
> > Intel's Optimization Reference Manual says so (in rev 040 this is in section
> > 16.2.2.5 "Zeroing Idioms" as a subsection of the Goldmont/Silvermont
> > descriptions).
> >
> > > Are these changes really only zeroing the lower 32 bits of the register?
> > > and that's all that the code cares about?
> >
> > No - like all operations targeting a 32-bit register, the result is zero
> > extended to the entire 64-bit destination register.
>
> Missing information that would have been helpful in the commit message:
>
> When the processor can recognize something as a zeroing idiom, it
> optimizes that operation on the front-end. Only 32-bit XOR r,r is
> documented as a zeroing idiom according to the Intel optimization
> manual. While a few Intel processors recognize the 64-bit version of
> XOR r,r as a zeroing idiom, many won't.
>
> Note that the 32-bit operation extends to the high part of the 64-bit
> register, so it will zero the entire 64-bit register. The 32-bit
> instruction is also one byte shorter.

Actually, I believe that should be a comment in the code. But Ingo (?) told
me everyone knows about this quirk...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html



2018-07-26 11:46:51

by Ingo Molnar

Subject: Re: [PATCH] x86-64: use 32-bit XOR to zero registers


* Pavel Machek <[email protected]> wrote:

> On Tue 2018-06-26 08:38:22, Henrique de Moraes Holschuh wrote:
> > On Tue, 26 Jun 2018, Jan Beulich wrote:
> > > >>> On 25.06.18 at 18:33, <[email protected]> wrote:
> > > > On 06/25/2018 03:25 AM, Jan Beulich wrote:
> > > >> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms - use
> > > >> 32-bit ones instead.
> > > >
> > > > Hmph. Is that considered a bug (errata)?
> > >
> > > No.
> > >
> > > > URL/references?
> > >
> > > Intel's Optimization Reference Manual says so (in rev 040 this is in section
> > > 16.2.2.5 "Zeroing Idioms" as a subsection of the Goldmont/Silvermont
> > > descriptions).
> > >
> > > > Are these changes really only zeroing the lower 32 bits of the register?
> > > > and that's all that the code cares about?
> > >
> > > No - like all operations targeting a 32-bit register, the result is zero
> > > extended to the entire 64-bit destination register.
> >
> > Missing information that would have been helpful in the commit message:
> >
> > When the processor can recognize something as a zeroing idiom, it
> > optimizes that operation on the front-end. Only 32-bit XOR r,r is
> > documented as a zeroing idiom according to the Intel optimization
> > manual. While a few Intel processors recognize the 64-bit version of
> > XOR r,r as a zeroing idiom, many won't.
> >
> > Note that the 32-bit operation extends to the high part of the 64-bit
> > register, so it will zero the entire 64-bit register. The 32-bit
> > instruction is also one byte shorter.
>
> Actually, I believe that should be a comment in the code.

Agreed - mind sending a patch that adds it?

> But Ingo (?) told me everyone knows about this quirk...

I was wrong.

Thanks,

Ingo

2018-07-26 18:35:41

by Pavel Machek

Subject: Re: [PATCH] x86-64: use 32-bit XOR to zero registers

On Thu 2018-07-26 13:45:37, Ingo Molnar wrote:
>
> * Pavel Machek <[email protected]> wrote:
>
> > On Tue 2018-06-26 08:38:22, Henrique de Moraes Holschuh wrote:
> > > On Tue, 26 Jun 2018, Jan Beulich wrote:
> > > > >>> On 25.06.18 at 18:33, <[email protected]> wrote:
> > > > > On 06/25/2018 03:25 AM, Jan Beulich wrote:
> > > > >> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms - use
> > > > >> 32-bit ones instead.
> > > > >
> > > > > Hmph. Is that considered a bug (errata)?
> > > >
> > > > No.
> > > >
> > > > > URL/references?
> > > >
> > > > Intel's Optimization Reference Manual says so (in rev 040 this is in section
> > > > 16.2.2.5 "Zeroing Idioms" as a subsection of the Goldmont/Silvermont
> > > > descriptions).
> > > >
> > > > > Are these changes really only zeroing the lower 32 bits of the register?
> > > > > and that's all that the code cares about?
> > > >
> > > > No - like all operations targeting a 32-bit register, the result is zero
> > > > extended to the entire 64-bit destination register.
> > >
> > > Missing information that would have been helpful in the commit message:
> > >
> > > When the processor can recognize something as a zeroing idiom, it
> > > optimizes that operation on the front-end. Only 32-bit XOR r,r is
> > > documented as a zeroing idiom according to the Intel optimization
> > > manual. While a few Intel processors recognize the 64-bit version of
> > > XOR r,r as a zeroing idiom, many won't.
> > >
> > > Note that the 32-bit operation extends to the high part of the 64-bit
> > > register, so it will zero the entire 64-bit register. The 32-bit
> > > instruction is also one byte shorter.
> >
> > Actually, I believe that should be a comment in the code.
>
> Agreed - mind sending a patch that adds it?

Ok. Would /* write to low 32 bits clears high 32 bits, too */ be a
reasonable comment?

Thanks,
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html



2018-07-26 19:07:47

by Ingo Molnar

Subject: Re: [PATCH] x86-64: use 32-bit XOR to zero registers


* Pavel Machek <[email protected]> wrote:

> On Thu 2018-07-26 13:45:37, Ingo Molnar wrote:
> >
> > * Pavel Machek <[email protected]> wrote:
> >
> > > On Tue 2018-06-26 08:38:22, Henrique de Moraes Holschuh wrote:
> > > > On Tue, 26 Jun 2018, Jan Beulich wrote:
> > > > > >>> On 25.06.18 at 18:33, <[email protected]> wrote:
> > > > > > On 06/25/2018 03:25 AM, Jan Beulich wrote:
> > > > > >> Some Intel CPUs don't recognize 64-bit XORs as zeroing idioms - use
> > > > > >> 32-bit ones instead.
> > > > > >
> > > > > > Hmph. Is that considered a bug (errata)?
> > > > >
> > > > > No.
> > > > >
> > > > > > URL/references?
> > > > >
> > > > > Intel's Optimization Reference Manual says so (in rev 040 this is in section
> > > > > 16.2.2.5 "Zeroing Idioms" as a subsection of the Goldmont/Silvermont
> > > > > descriptions).
> > > > >
> > > > > > Are these changes really only zeroing the lower 32 bits of the register?
> > > > > > and that's all that the code cares about?
> > > > >
> > > > > No - like all operations targeting a 32-bit register, the result is zero
> > > > > extended to the entire 64-bit destination register.
> > > >
> > > > Missing information that would have been helpful in the commit message:
> > > >
> > > > When the processor can recognize something as a zeroing idiom, it
> > > > optimizes that operation on the front-end. Only 32-bit XOR r,r is
> > > > documented as a zeroing idiom according to the Intel optimization
> > > > manual. While a few Intel processors recognize the 64-bit version of
> > > > XOR r,r as a zeroing idiom, many won't.
> > > >
> > > > Note that the 32-bit operation extends to the high part of the 64-bit
> > > > register, so it will zero the entire 64-bit register. The 32-bit
> > > > instruction is also one byte shorter.
> > >
> > > Actually, I believe that should be a comment in the code.
> >
> > Agreed - mind sending a patch that adds it?
>
> Ok. Would /* write to low 32 bits clears high 32 bits, too */ be a
> reasonable comment?

So I'd suggest putting the above description somewhere strategic - such as the top
of entry_64.S, or calling.h, or so?

Thanks,

Ingo
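
(Purely a sketch of what such a comment might look like if it were added near the
top of arch/x86/entry/calling.h or entry_64.S as suggested above; this is not an
actual patch and the wording is only illustrative.)

    /*
     * On x86-64, any write to a 32-bit register is zero-extended into the
     * containing 64-bit register, so "xorl %eax, %eax" clears all of %rax.
     * Only the 32-bit XOR is documented as a zeroing idiom in Intel's
     * optimization manual, and it can also be one byte shorter, so prefer
     * the 32-bit form when zeroing a register.
     */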