Date: Sun, 5 Mar 2017 12:18:23 +0100
From: Borislav Petkov
To: hpa@zytor.com
Cc: Logan Gunthorpe, Thomas Gleixner, Ingo Molnar, Tony Luck, Al Viro,
 the arch/x86 maintainers, Linux Kernel Mailing List, Linus Torvalds
Subject: Re: Question Regarding ERMS memcpy
Message-ID: <20170305111823.6ir7qwrgkn7fhlap@pd.tnic>
References: <20170304224341.zfp4fl37ypt57amg@pd.tnic>
 <5CCEF10D-5647-4503-A398-0681DF2C8847@zytor.com>
 <20170305001447.kcxignj3nsq35vci@pd.tnic>
 <20170305003349.6kgq4ovj7ipezfxu@pd.tnic>
 <20170305095059.l4od2yjqm5yxx6ln@pd.tnic>
In-Reply-To: <20170305095059.l4od2yjqm5yxx6ln@pd.tnic>

On Sun, Mar 05, 2017 at 10:50:59AM +0100, Borislav Petkov wrote:
> On Sat, Mar 04, 2017 at 04:56:38PM -0800, hpa@zytor.com wrote:
> > That's what the -march= and -mtune= options do!
>
> How does that even help with a distro kernel built with -mtune=generic?
>
> gcc can't possibly know what targets that kernel is going to be booted
> on. So it probably does some universally optimal things, like in the
> dmi_scan_machine() case:
>
>	memcpy_fromio(buf, p, 32);
>
> turns into:
>
>	.loc 3 219 0
>	movl	$8, %ecx	#, tmp79
>	movq	%rax, %rsi	# p, p
>	movq	%rsp, %rdi	#, tmp77
>	rep movsl
>
> Apparently it thinks it is fine to do 8 4-byte MOVS. But why not
> 4 8-byte MOVS?
>
> That's half the loops.
>
> [ It is a whole different story what the machine actually does underneath.
>   The copy being half a cacheline probably doesn't help, and if it really
>   does execute the separate MOVs, then it would be cheaper to do 4 8-byte
>   ones. ]
>
> One thing's for sure - both variants are certainly cheaper than a CALL
> to a memcpy variant.
>
> What we probably should try to do, though, is simply patch in the body
> of REP; MOVSQ or REP; MOVSB at the call sites and only leave a call to
> memcpy_orig(), because that last one is fat.
>
> I remember we did talk about it at some point but don't remember why we
> didn't do it.

Right, and I believe it was Linus who mentioned we should simply patch
in the small REP_GOOD and ERMS memcpy bodies at the call sites. CCed.

Anyway, something like below. It almost builds here - I still need to
sort out arch/x86/boot/compressed/misc.c, which pulls in decompression
code that does memcpy; that memcpy turns into the alternatives-based
version, which we don't want in arch/x86/boot/compressed/.

Also, I need to check how much vmlinuz size bloat we're talking about:
with the diff below, we do add padding which looks like this:

  3d:	90	nop
  3e:	90	nop
  3f:	90	nop
  40:	90	nop
  41:	90	nop
  42:	90	nop
  43:	90	nop
  44:	90	nop
  45:	90	nop
  46:	90	nop
  47:	90	nop
  48:	90	nop
  49:	90	nop
  4a:	90	nop
  4b:	90	nop
  4c:	90	nop
  4d:	90	nop
  4e:	90	nop
  4f:	90	nop

and that at every call site. But at least we save ourselves the CALL
overhead and use the optimal memcpy variant on each machine.
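[ Side note, not part of the patch below: here is a minimal, hypothetical
  userspace sketch of the two REP-MOVS bodies the diff patches in - the
  REP_GOOD flavor (REP MOVSQ plus a REP MOVSB tail) and the ERMS flavor
  (plain REP MOVSB) - written as standalone GCC inline asm so the register
  usage can be eyeballed or benchmarked outside the kernel. The function
  names are made up for illustration; build with gcc -O2 and look at the
  result with objdump -d to compare against what the alternatives patch
  in at the call sites. ]

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* REP_GOOD flavor: copy 8-byte chunks with REP MOVSQ, the tail with REP MOVSB. */
static void *rep_movsq_copy(void *to, const void *from, size_t len)
{
	void *ret = to;

	asm volatile("movq %%rdx, %%rcx\n\t"
		     "shrq $3, %%rcx\n\t"	/* rcx = len / 8 */
		     "andl $7, %%edx\n\t"	/* edx = len % 8 */
		     "rep movsq\n\t"
		     "movl %%edx, %%ecx\n\t"
		     "rep movsb"
		     : "+D" (to), "+S" (from), "+d" (len)
		     : : "rcx", "cc", "memory");

	return ret;
}

/* ERMS flavor: let the string-op microcode handle everything with REP MOVSB. */
static void *rep_movsb_copy(void *to, const void *from, size_t len)
{
	void *ret = to;

	asm volatile("rep movsb"
		     : "+D" (to), "+S" (from), "+c" (len)
		     : : "memory");

	return ret;
}

int main(void)
{
	char src[32] = "0123456789abcdef0123456789abcde";
	char dst1[32], dst2[32];

	rep_movsq_copy(dst1, src, sizeof(src));
	rep_movsb_copy(dst2, src, sizeof(src));

	/* Both should produce byte-identical copies of src, i.e. print "0 0". */
	printf("%d %d\n", memcmp(dst1, src, sizeof(src)),
			  memcmp(dst2, src, sizeof(src)));

	return 0;
}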
---
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d449337a360..24711ca4033b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -143,9 +143,7 @@ endif
 export CONFIG_X86_X32_ABI
 
 # Don't unroll struct assignments with kmemcheck enabled
-ifeq ($(CONFIG_KMEMCHECK),y)
-	KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
-endif
+KBUILD_CFLAGS += $(call cc-option,-fno-builtin)
 
 # Stackpointer is addressed different for 32 bit and 64 bit x86
 sp-$(CONFIG_X86_32) := esp
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1b020381ab38..8216c2e610ae 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -204,6 +204,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
 		: output : "i" (0), ## input)
 
+#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2, feature2, \
+			 output, input...)				      \
+	asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+		: output						      \
+		: "i" (0), ## input)
+
 /* Like alternative_io, but for replacing a direct call with another one. */
 #define alternative_call(oldfunc, newfunc, feature, output, input...)	\
 	asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index a164862d77e3..a529a12a35ed 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -3,6 +3,8 @@
 
 #ifdef __KERNEL__
 #include
+#include
+#include
 
 /* Written 2002 by Andi Kleen */
 
@@ -28,8 +30,37 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
    function. */
 
 #define __HAVE_ARCH_MEMCPY 1
-extern void *memcpy(void *to, const void *from, size_t len);
-extern void *__memcpy(void *to, const void *from, size_t len);
+void *memcpy_orig(void *to, const void *from, size_t len);
+
+/*
+ * We build a call to memcpy_orig by default which gets replaced on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change the REP_GOOD
+ * routine to a REP; MOVSB mem copy.
+ */
+static inline void *memcpy(void *to, const void *from, size_t len)
+{
+	void *ret;
+
+	alternative_io_2("call memcpy_orig\n\t",
+			 "movq %%rdi, %%rax\n\t"
+			 "movq %%rdx, %%rcx\n\t"
+			 "shrq $3, %%rcx\n\t"
+			 "andl $7, %%edx\n\t"
+			 "rep movsq\n\t"
+			 "movl %%edx, %%ecx\n\t"
+			 "rep movsb\n\t",
+			 X86_FEATURE_REP_GOOD,
+			 "movq %%rdi, %%rax\n\t"
+			 "movq %%rdx, %%rcx\n\t"
+			 "rep movsb\n\t",
+			 X86_FEATURE_ERMS,
+			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)),
+			 "1" (to), "2" (from), "3" (len)
+			 : "memory", "rcx");
+
+	return ret;
+}
 
 #ifndef CONFIG_KMEMCHECK
 #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 779782f58324..c3c8890b155c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -6,13 +6,6 @@
 #include
 #include
 
-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-.weak memcpy
 
 /*
@@ -26,36 +19,12 @@
  * Output:
  * rax original destination
  */
-ENTRY(__memcpy)
-ENTRY(memcpy)
-	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
-		      "jmp memcpy_erms", X86_FEATURE_ERMS
-
-	movq %rdi, %rax
-	movq %rdx, %rcx
-	shrq $3, %rcx
-	andl $7, %edx
-	rep movsq
-	movl %edx, %ecx
-	rep movsb
-	ret
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-EXPORT_SYMBOL(memcpy)
-EXPORT_SYMBOL(__memcpy)
-
-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-ENTRY(memcpy_erms)
-	movq %rdi, %rax
-	movq %rdx, %rcx
-	rep movsb
-	ret
-ENDPROC(memcpy_erms)
-
 ENTRY(memcpy_orig)
+	pushq %r8
+	pushq %r9
+	pushq %r10
+	pushq %r11
+
 	movq %rdi, %rax
 
 	cmpq $0x20, %rdx
@@ -179,8 +148,14 @@ ENTRY(memcpy_orig)
 	movb %cl, (%rdi)
 
 .Lend:
+	popq %r11
+	popq %r10
+	popq %r9
+	popq %r8
+
 	retq
 ENDPROC(memcpy_orig)
+EXPORT_SYMBOL_GPL(memcpy_orig)
 
 #ifndef CONFIG_UML
 /*

-- 
Regards/Gruss,
    Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
-- 