Date: Sun, 5 Mar 2017 12:18:23 +0100
From: Borislav Petkov
To: hpa@zytor.com
Cc: Logan Gunthorpe, Thomas Gleixner, Ingo Molnar, Tony Luck, Al Viro,
 the arch/x86 maintainers, Linux Kernel Mailing List, Linus Torvalds
Subject: Re: Question Regarding ERMS memcpy
Message-ID: <20170305111823.6ir7qwrgkn7fhlap@pd.tnic>
References: <20170304224341.zfp4fl37ypt57amg@pd.tnic>
 <5CCEF10D-5647-4503-A398-0681DF2C8847@zytor.com>
 <20170305001447.kcxignj3nsq35vci@pd.tnic>
 <20170305003349.6kgq4ovj7ipezfxu@pd.tnic>
 <20170305095059.l4od2yjqm5yxx6ln@pd.tnic>
In-Reply-To: <20170305095059.l4od2yjqm5yxx6ln@pd.tnic>

On Sun, Mar 05, 2017 at 10:50:59AM +0100, Borislav Petkov wrote:
> On Sat, Mar 04, 2017 at 04:56:38PM -0800, hpa@zytor.com wrote:
> > That's what the -march= and -mtune= options do!
>
> How does that even help with a distro kernel built with -mtune=generic?
>
> gcc can't possibly know what targets that kernel is going to be booted
> on. So it probably does some universally optimal things, like in the
> dmi_scan_machine() case:
>
>	memcpy_fromio(buf, p, 32);
>
> turns into:
>
>	.loc 3 219 0
>	movl	$8, %ecx	#, tmp79
>	movq	%rax, %rsi	# p, p
>	movq	%rsp, %rdi	#, tmp77
>	rep movsl
>
> Apparently it thinks it is fine to do 8 4-byte MOVS. But why not
> 4 8-byte MOVS?
>
> That's half the loops.
>
> [ It is a whole different story what the machine actually does underneath.
>   The copy being half a cacheline probably doesn't help, and if it really
>   does execute the separate MOVs, then it would be cheaper to do 4 8-byte
>   ones. ]
>
> One thing's for sure - both variants are certainly cheaper than a CALL
> to a memcpy variant.
>
> What we probably should try to do, though, is simply patch in the body
> of REP; MOVSQ or REP; MOVSB at the call sites and only leave a call to
> memcpy_orig(), because that last one is fat.
>
> I remember we did talk about it at some point but don't remember why we
> didn't do it.

Right, and I believe it was Linus who mentioned we should simply patch
in the small REP_GOOD and ERMS memcpy bodies at the call sites. CCed.

Anyway, something like below. It almost builds here - I still need to
sort out arch/x86/boot/compressed/misc.c, which pulls in decompression
code that does memcpy; that memcpy turns into the alternatives-based
version, which we don't want in arch/x86/boot/compressed/.

Also, I need to check how much vmlinuz size bloat we're talking about:
with the diff below, we do add padding which looks like this:

  3d:	90	nop
  3e:	90	nop
  3f:	90	nop
  40:	90	nop
  41:	90	nop
  42:	90	nop
  43:	90	nop
  44:	90	nop
  45:	90	nop
  46:	90	nop
  47:	90	nop
  48:	90	nop
  49:	90	nop
  4a:	90	nop
  4b:	90	nop
  4c:	90	nop
  4d:	90	nop
  4e:	90	nop
  4f:	90	nop

and that at every call site. But at least we save ourselves the CALL
overhead and use the optimal memcpy variant on each machine.
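[ Side note, not part of the patch below: here is a minimal, hypothetical
  userspace sketch of the two REP-MOVS bodies the diff patches in - the
  REP_GOOD flavor (REP MOVSQ plus a REP MOVSB tail) and the ERMS flavor
  (plain REP MOVSB) - written as standalone GCC inline asm so the register
  usage can be eyeballed or benchmarked outside the kernel. The function
  names are made up for illustration; build with gcc -O2 and look at the
  result with objdump -d to compare against what the alternatives patch
  in at the call sites. ]

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* REP_GOOD flavor: copy 8-byte chunks with REP MOVSQ, the tail with REP MOVSB. */
static void *rep_movsq_copy(void *to, const void *from, size_t len)
{
	void *ret = to;

	asm volatile("movq %%rdx, %%rcx\n\t"
		     "shrq $3, %%rcx\n\t"	/* rcx = len / 8 */
		     "andl $7, %%edx\n\t"	/* edx = len % 8 */
		     "rep movsq\n\t"
		     "movl %%edx, %%ecx\n\t"
		     "rep movsb"
		     : "+D" (to), "+S" (from), "+d" (len)
		     : : "rcx", "cc", "memory");

	return ret;
}

/* ERMS flavor: let the string-op microcode handle everything with REP MOVSB. */
static void *rep_movsb_copy(void *to, const void *from, size_t len)
{
	void *ret = to;

	asm volatile("rep movsb"
		     : "+D" (to), "+S" (from), "+c" (len)
		     : : "memory");

	return ret;
}

int main(void)
{
	char src[32] = "0123456789abcdef0123456789abcde";
	char dst1[32], dst2[32];

	rep_movsq_copy(dst1, src, sizeof(src));
	rep_movsb_copy(dst2, src, sizeof(src));

	/* Both should produce byte-identical copies of src, i.e. print "0 0". */
	printf("%d %d\n", memcmp(dst1, src, sizeof(src)),
			  memcmp(dst2, src, sizeof(src)));

	return 0;
}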
---
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d449337a360..24711ca4033b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -143,9 +143,7 @@ endif
 export CONFIG_X86_X32_ABI
 
 # Don't unroll struct assignments with kmemcheck enabled
-ifeq ($(CONFIG_KMEMCHECK),y)
-	KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
-endif
+KBUILD_CFLAGS += $(call cc-option,-fno-builtin)
 
 # Stackpointer is addressed different for 32 bit and 64 bit x86
 sp-$(CONFIG_X86_32) := esp
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1b020381ab38..8216c2e610ae 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -204,6 +204,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
 		: output : "i" (0), ## input)
 
+#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2, feature2, \
+			 output, input...)				      \
+	asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+		: output						      \
+		: "i" (0), ## input)
+
 /* Like alternative_io, but for replacing a direct call with another one. */
 #define alternative_call(oldfunc, newfunc, feature, output, input...)	\
 	asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index a164862d77e3..a529a12a35ed 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -3,6 +3,8 @@
 
 #ifdef __KERNEL__
 #include
+#include
+#include
 
 /* Written 2002 by Andi Kleen */
 
@@ -28,8 +30,37 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
    function. */
 
 #define __HAVE_ARCH_MEMCPY 1
-extern void *memcpy(void *to, const void *from, size_t len);
-extern void *__memcpy(void *to, const void *from, size_t len);
+void *memcpy_orig(void *to, const void *from, size_t len);
+
+/*
+ * We build a call to memcpy_orig by default which gets replaced on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change the REP_GOOD
+ * routine to a REP; MOVSB mem copy.
+ */
+static inline void *memcpy(void *to, const void *from, size_t len)
+{
+	void *ret;
+
+	alternative_io_2("call memcpy_orig\n\t",
+			 "movq %%rdi, %%rax\n\t"
+			 "movq %%rdx, %%rcx\n\t"
+			 "shrq $3, %%rcx\n\t"
+			 "andl $7, %%edx\n\t"
+			 "rep movsq\n\t"
+			 "movl %%edx, %%ecx\n\t"
+			 "rep movsb\n\t",
+			 X86_FEATURE_REP_GOOD,
+			 "movq %%rdi, %%rax\n\t"
+			 "movq %%rdx, %%rcx\n\t"
+			 "rep movsb\n\t",
+			 X86_FEATURE_ERMS,
+			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)),
+			 "1" (to), "2" (from), "3" (len)
+			 : "memory", "rcx");
+
+	return ret;
+}
 
 #ifndef CONFIG_KMEMCHECK
 #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 779782f58324..c3c8890b155c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -6,13 +6,6 @@
 #include
 #include
 
-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-.weak memcpy
 
 /*
@@ -26,36 +19,12 @@
  * Output:
  * rax original destination
  */
-ENTRY(__memcpy)
-ENTRY(memcpy)
-	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memcpy_erms", X86_FEATURE_ERMS
-
-	movq %rdi, %rax
-	movq %rdx, %rcx
-	shrq $3, %rcx
-	andl $7, %edx
-	rep movsq
-	movl %edx, %ecx
-	rep movsb
-	ret
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-EXPORT_SYMBOL(memcpy)
-EXPORT_SYMBOL(__memcpy)
-
-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-ENTRY(memcpy_erms)
-	movq %rdi, %rax
-	movq %rdx, %rcx
-	rep movsb
-	ret
-ENDPROC(memcpy_erms)
-
 ENTRY(memcpy_orig)
+	pushq %r8
+	pushq %r9
+	pushq %r10
+	pushq %r11
+
 	movq %rdi, %rax
 
 	cmpq $0x20, %rdx
@@ -179,8 +148,14 @@ ENTRY(memcpy_orig)
 	movb %cl, (%rdi)
 
 .Lend:
+	popq %r11
+	popq %r10
+	popq %r9
+	popq %r8
+
 	retq
 ENDPROC(memcpy_orig)
+EXPORT_SYMBOL_GPL(memcpy_orig)
 
 #ifndef CONFIG_UML
 /*

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 