From: Alexey Dobriyan <adobriyan@gmail.com>
To: linux-kernel@vger.kernel.org
Cc: x86@kernel.org, tglx@linutronix.de, mingo@redhat.com, hpa@zytor.com,
	Alexey Dobriyan <adobriyan@gmail.com>
Subject: [PATCH 4/5] -march=native: REP STOSB
Date: Fri, 8 Dec 2017 01:41:53 +0300
Message-Id: <20171207224154.4687-4-adobriyan@gmail.com>
X-Mailer: git-send-email 2.13.6
In-Reply-To: <20171207224154.4687-1-adobriyan@gmail.com>
References: <20171207224154.4687-1-adobriyan@gmail.com>

If the CPU advertises fast REP STOSB (the ERMS bit, CPUID.07H:EBX[9]),
use it.

Inline clear_page() so that only 3 registers (RDI, RCX, RAX) are
clobbered across the call instead of everything the ABI permits.

Also, tell gcc to emit REP STOSB for memset(), which saves a noticeable
amount of .text.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
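For reference, the REP STOSB binding used by the inlined clear_page()
can be tried standalone in userspace. The sketch below is illustrative
only; clear_buf() and the demo main() are not part of the patch. "+D"
and "+c" pin the destination and byte count to RDI and RCX, which the
instruction updates in place, and "a" (0) supplies the zero fill byte
in AL.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same constraint scheme as the inlined clear_page():
 * "+D" = destination in RDI, "+c" = count in RCX (both are
 * read-write because REP STOSB advances them), "a" (0) = fill
 * byte in AL. Only these three registers plus memory are touched.
 */
static inline void clear_buf(void *buf, uint32_t len)
{
	asm volatile (
		"rep stosb"
		: "+D" (buf), "+c" (len)
		: "a" (0)
		: "memory"
	);
}

int main(void)
{
	char page[4096];

	memset(page, 0xff, sizeof(page));
	clear_buf(page, sizeof(page));
	printf("page[0] = %d, page[4095] = %d\n", page[0], page[4095]);
	return 0;
}

On x86-64 this prints zero for both bytes; the kernel version differs
only in taking the length from PAGE_SIZE and being __always_inline.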
 Makefile                           |  3 +++
 arch/x86/boot/compressed/head_64.S |  4 ++++
 arch/x86/crypto/sha1_ssse3_asm.S   |  7 ++++++-
 arch/x86/include/asm/page_64.h     | 13 +++++++++++++
 arch/x86/lib/Makefile              |  2 ++
 arch/x86/lib/memset_64.S           | 15 +++++++++++++++
 scripts/kconfig/cpuid.c            |  6 +++++-
 scripts/march-native.sh            |  1 +
 8 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 84abac4c181a..70d91d52ee60 100644
--- a/Makefile
+++ b/Makefile
@@ -593,6 +593,9 @@ endif
 ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
 KBUILD_CFLAGS += -mmemcpy-strategy=rep_byte:-1:align
 endif
+ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+KBUILD_CFLAGS += -mmemset-strategy=rep_byte:-1:align
+endif
 
 ifeq ($(KBUILD_EXTMOD),)
 # Read in dependencies to all Kconfig* files, make sure to run
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 20919b4f3133..a7913f5e18b6 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -447,8 +447,12 @@ relocated:
 	leaq	_bss(%rip), %rdi
 	leaq	_ebss(%rip), %rcx
 	subq	%rdi, %rcx
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	rep stosb
+#else
 	shrq	$3, %rcx
 	rep	stosq
+#endif
 
 /*
  * Adjust our own GOT
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 6204bd53528c..ffa41d7a582a 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -94,10 +94,15 @@
 	SHA1_PIPELINED_MAIN_BODY
 
 	# cleanup workspace
-	mov	$8, %ecx
 	mov	%rsp, %rdi
 	xor	%rax, %rax
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+	mov	$64, %ecx
+	rep stosb
+#else
+	mov	$8, %ecx
 	rep	stosq
+#endif
 
 	mov	%rbp, %rsp		# deallocate workspace
 	pop	%rbp
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index c2353661eaf1..b3d275b07624 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -36,6 +36,18 @@ extern unsigned long __phys_addr_symbol(unsigned long);
 #define pfn_valid(pfn)          ((pfn) < max_pfn)
 #endif
 
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+static __always_inline void clear_page(void *page)
+{
+	uint32_t len = PAGE_SIZE;
+	asm volatile (
+		"rep stosb"
+		: "+D" (page), "+c" (len)
+		: "a" (0)
+		: "memory"
+	);
+}
+#else
 void clear_page_orig(void *page);
 void clear_page_rep(void *page);
 void clear_page_erms(void *page);
@@ -49,6 +61,7 @@ static inline void clear_page(void *page)
 			   "0" (page)
 			   : "memory", "rax", "rcx");
 }
+#endif
 
 #ifdef CONFIG_MARCH_NATIVE_REP_MOVSB
 static __always_inline void copy_page(void *to, void *from)
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 8f9460bef2ec..6cb356408ebb 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -45,7 +45,9 @@ endif
 else
         obj-y += iomap_copy_64.o
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
+ifneq ($(CONFIG_MARCH_NATIVE_REP_STOSB),y)
         lib-y += clear_page_64.o
+endif
 ifneq ($(CONFIG_MARCH_NATIVE_REP_MOVSB),y)
         lib-y += copy_page_64.o
 endif
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9bc861c71e75..7786d1a65423 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -8,6 +8,20 @@
 
 .weak memset
 
+#ifdef CONFIG_MARCH_NATIVE_REP_STOSB
+ENTRY(memset)
+ENTRY(__memset)
+	mov	%esi, %eax
+	mov	%rdi, %rsi
+	mov	%rdx, %rcx
+	rep stosb
+	mov	%rsi, %rax
+	ret
+ENDPROC(memset)
+ENDPROC(__memset)
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
+#else
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
@@ -140,3 +154,4 @@ ENTRY(memset_orig)
 	jmp .Lafter_bad_alignment
 .Lfinal:
 ENDPROC(memset_orig)
+#endif
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index 4947f47e7728..ecb285183581 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -44,6 +44,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
 
 static bool popcnt = false;
 static bool rep_movsb = false;
+static bool rep_stosb = false;
 
 static uint32_t eax0_max;
 
@@ -62,8 +63,10 @@ static void intel(void)
 		cpuid2(7, 0, &eax, &ecx, &edx, &ebx);
 //		printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);
 
-		if (ebx & (1 << 9))
+		if (ebx & (1 << 9)) {
 			rep_movsb = true;
+			rep_stosb = true;
+		}
 	}
 }
 
@@ -85,6 +88,7 @@ int main(int argc, char *argv[])
 #define _(x)	if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
 	_(popcnt);
 	_(rep_movsb);
+	_(rep_stosb);
 #undef _
 
 	return EXIT_FAILURE;
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index eb52c20c56b4..d3adf0edb2be 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -32,6 +32,7 @@ option() {
 if test -x "$CPUID"; then
 	"$CPUID" popcnt    && option "CONFIG_MARCH_NATIVE_POPCNT"
 	"$CPUID" rep_movsb && option "CONFIG_MARCH_NATIVE_REP_MOVSB"
+	"$CPUID" rep_stosb && option "CONFIG_MARCH_NATIVE_REP_STOSB"
 fi
 
 if test ! -f "$AUTOCONF1" -o ! -f "$AUTOCONF2"; then
-- 
2.13.6