From: Mikulas Patocka <[email protected]>
Subject: [PATCH v2] x86: optimize memcpy_flushcache
In the context of constant short length stores to persistent memory,
memcpy_flushcache suffers from a 2% performance degradation compared to
explicitly using the "movnti" instruction.
Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
movnti instruction with inline assembler.
Signed-off-by: Mikulas Patocka <[email protected]>
Reviewed-by: Dan Williams <[email protected]>
Signed-off-by: Mike Snitzer <[email protected]>
---
arch/x86/include/asm/string_64.h | 28 +++++++++++++++++++++++++++-
arch/x86/lib/usercopy_64.c | 4 ++--
2 files changed, 29 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 533f74c300c2..aaba83478cdc 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -147,7 +147,33 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+ if (__builtin_constant_p(cnt)) {
+ switch (cnt) {
+ case 4:
+ asm volatile("movntil %1, %0"
+ : "=m" (*(u32 *)dst)
+ : "r" (*(u32 *)src));
+ return;
+ case 8:
+ asm volatile("movntiq %1, %0"
+ : "=m" (*(u64 *)dst)
+ : "r" (*(u64 *)src));
+ return;
+ case 16:
+ asm volatile("movntiq %1, %0"
+ : "=m" (*(u64 *)dst)
+ : "r" (*(u64 *)src));
+ asm volatile("movntiq %1, %0"
+ : "=m" (*(u64 *)(dst + 8))
+ : "r" (*(u64 *)(src + 8)));
+ return;
+ }
+ }
+ __memcpy_flushcache(dst, src, cnt);
+}
#endif
#endif /* __KERNEL__ */
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 75d3776123cc..26f515aa3529 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -133,7 +133,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
return rc;
}
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
{
unsigned long dest = (unsigned long) _dst;
unsigned long source = (unsigned long) _src;
@@ -196,7 +196,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size)
clean_cache_range((void *) dest, size);
}
}
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len)
--
2.15.0
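
For illustration, a minimal sketch of a hypothetical caller (the struct
and function names below are invented, not taken from dm-writecache)
showing which calls take the new inline movnti path and which still go
through __memcpy_flushcache():

#include <linux/string.h>	/* memcpy_flushcache() */
#include <linux/types.h>

struct pmem_sb {			/* lives in persistent memory */
	u64 seq_count;
	u64 entries[];
};

static void pmem_commit_seq(struct pmem_sb *sb, u64 seq)
{
	/* sizeof(seq) == 8 is a compile-time constant: one movntiq */
	memcpy_flushcache(&sb->seq_count, &seq, sizeof(seq));
}

static void pmem_copy_entries(struct pmem_sb *sb, const u64 *src, size_t n)
{
	/* length is not a compile-time constant: __memcpy_flushcache() */
	memcpy_flushcache(sb->entries, src, n * sizeof(*src));
}

The __always_inline plus __builtin_constant_p(cnt) combination is what
lets the compiler resolve the switch at build time; for a non-constant
cnt the switch folds away and only the out-of-line call remains.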
* Mike Snitzer <[email protected]> wrote:
> From: Mikulas Patocka <[email protected]>
> Subject: [PATCH v2] x86: optimize memcpy_flushcache
>
> In the context of constant short length stores to persistent memory,
> memcpy_flushcache suffers from a 2% performance degradation compared to
> explicitly using the "movnti" instruction.
>
> Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
> movnti instruction with inline assembler.
Linus requested asm optimizations to include actual benchmarks, so it would be
nice to describe how this was tested, on what hardware, and what the before/after
numbers are.
Thanks,
Ingo
On Thu, 21 Jun 2018, Ingo Molnar wrote:
>
> * Mike Snitzer <[email protected]> wrote:
>
> > From: Mikulas Patocka <[email protected]>
> > Subject: [PATCH v2] x86: optimize memcpy_flushcache
> >
> > In the context of constant short length stores to persistent memory,
> > memcpy_flushcache suffers from a 2% performance degradation compared to
> > explicitly using the "movnti" instruction.
> >
> > Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
> > movnti instruction with inline assembler.
>
> Linus requested asm optimizations to include actual benchmarks, so it would be
> nice to describe how this was tested, on what hardware, and what the before/after
> numbers are.
>
> Thanks,
>
> Ingo
It was tested on a 4-core Skylake machine with persistent memory being
emulated using the memmap kernel option. The dm-writecache target used
the emulated persistent memory as a cache and a SATA SSD as a backing
device. The patch results in 2% improved throughput when writing data
using dd.
I don't have access to the machine anymore.
Mikulas
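
For readers without persistent-memory hardware, the flavor of this
trade-off can be approximated in userspace. The sketch below is not the
original dm-writecache/dd test - it runs against ordinary DRAM, and the
buffer size and loop shape are arbitrary assumptions - but it contrasts
cached stores flushed with clflushopt against non-temporal movnti
stores, which is the choice memcpy_flushcache navigates:

/*
 * Rough userspace sketch, not the original benchmark. Build with:
 *   gcc -O2 -mclflushopt nt_bench.c -o nt_bench
 * Compares 8-byte cached stores plus one clflushopt per cache line
 * against non-temporal stores (_mm_stream_si64 emits movnti).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <immintrin.h>

#define BUF_SIZE (64UL * 1024 * 1024)
#define N        (BUF_SIZE / sizeof(uint64_t))

static double now(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int main(void)
{
	uint64_t *buf = aligned_alloc(64, BUF_SIZE);
	size_t i;
	double t;

	/* Cached stores, one clflushopt per filled 64-byte line. */
	t = now();
	for (i = 0; i < N; i++) {
		buf[i] = i;
		if ((i & 7) == 7)	/* 8 x 8 bytes = one line */
			_mm_clflushopt(&buf[i & ~7UL]);
	}
	_mm_sfence();
	printf("store+clflushopt: %.3f s\n", now() - t);

	/* Non-temporal movnti stores: bypass the cache entirely. */
	t = now();
	for (i = 0; i < N; i++)
		_mm_stream_si64((long long *)&buf[i], (long long)i);
	_mm_sfence();
	printf("movnti:           %.3f s\n", now() - t);

	free(buf);
	return 0;
}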
* Mikulas Patocka <[email protected]> wrote:
> On Thu, 21 Jun 2018, Ingo Molnar wrote:
>
> >
> > * Mike Snitzer <[email protected]> wrote:
> >
> > > From: Mikulas Patocka <[email protected]>
> > > Subject: [PATCH v2] x86: optimize memcpy_flushcache
> > >
> > > In the context of constant short length stores to persistent memory,
> > > memcpy_flushcache suffers from a 2% performance degradation compared to
> > > explicitly using the "movnti" instruction.
> > >
> > > Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
> > > movnti instruction with inline assembler.
> >
> > Linus requested asm optimizations to include actual benchmarks, so it would be
> > nice to describe how this was tested, on what hardware, and what the before/after
> > numbers are.
> >
> > Thanks,
> >
> > Ingo
>
> It was tested on a 4-core Skylake machine with persistent memory being
> emulated using the memmap kernel option. The dm-writecache target used
> the emulated persistent memory as a cache and a SATA SSD as a backing
> device. The patch results in 2% improved throughput when writing data
> using dd.
>
> I don't have access to the machine anymore.
I think this information is enough, but do we know how well memmap emulation
represents true persistent memory speed and cache management characteristics?
It might be representative - but I don't know for sure, and probably
neither do most readers of the changelog.
So could you please put all this into an updated changelog, and also add a short
description that outlines exactly which codepaths end up using this method in a
typical persistent memory setup? All filesystem ops - or only reads, etc?
Thanks,
Ingo
On Fri, 22 Jun 2018, Ingo Molnar wrote:
>
> * Mikulas Patocka <[email protected]> wrote:
>
> > On Thu, 21 Jun 2018, Ingo Molnar wrote:
> >
> > >
> > > * Mike Snitzer <[email protected]> wrote:
> > >
> > > > From: Mikulas Patocka <[email protected]>
> > > > Subject: [PATCH v2] x86: optimize memcpy_flushcache
> > > >
> > > > In the context of constant short length stores to persistent memory,
> > > > memcpy_flushcache suffers from a 2% performance degradation compared to
> > > > explicitly using the "movnti" instruction.
> > > >
> > > > Optimize 4, 8, and 16 byte memcpy_flushcache calls to explicitly use the
> > > > movnti instruction with inline assembler.
> > >
> > > Linus requested asm optimizations to include actual benchmarks, so it would be
> > > nice to describe how this was tested, on what hardware, and what the before/after
> > > numbers are.
> > >
> > > Thanks,
> > >
> > > Ingo
> >
> > It was tested on a 4-core Skylake machine with persistent memory being
> > emulated using the memmap kernel option. The dm-writecache target used
> > the emulated persistent memory as a cache and a SATA SSD as a backing
> > device. The patch results in 2% improved throughput when writing data
> > using dd.
> >
> > I don't have access to the machine anymore.
>
> I think this information is enough, but do we know how well memmap emulation
> represents true persistent memory speed and cache management characteristics?
> It might be representative - but I don't know for sure, and probably
> neither do most readers of the changelog.
>
> So could you please put all this into an updated changelog, and also add a short
> description that outlines exactly which codepaths end up using this method in a
> typical persistent memory setup? All filesystem ops - or only reads, etc?
>
> Thanks,
>
> Ingo
Here I resend it:
From: Mikulas Patocka <[email protected]>
Subject: [PATCH] x86: optimize memcpy_flushcache
I use memcpy_flushcache in my persistent memory driver for metadata
updates; there are many 8-byte and 16-byte updates, and it turns out
that the overhead of memcpy_flushcache causes a 2% performance
degradation compared to the "movnti" instruction explicitly coded using
inline assembler.
The tests were done on a Skylake processor with persistent memory emulated
using the "memmap" kernel parameter. dd was used to copy data to the
dm-writecache target.
This patch recognizes memcpy_flushcache calls with constant short length
and turns them into inline assembler - so that I don't have to use inline
assembler in the driver.
Signed-off-by: Mikulas Patocka <[email protected]>
---
arch/x86/include/asm/string_64.h | 20 +++++++++++++++++++-
arch/x86/lib/usercopy_64.c | 4 ++--
2 files changed, 21 insertions(+), 3 deletions(-)
Index: linux-2.6/arch/x86/include/asm/string_64.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/string_64.h
+++ linux-2.6/arch/x86/include/asm/string_64.h
@@ -149,7 +149,25 @@ memcpy_mcsafe(void *dst, const void *src
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+ if (__builtin_constant_p(cnt)) {
+ switch (cnt) {
+ case 4:
+ asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
+ return;
+ case 8:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ return;
+ case 16:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
+ return;
+ }
+ }
+ __memcpy_flushcache(dst, src, cnt);
+}
#endif
#endif /* __KERNEL__ */
Index: linux-2.6/arch/x86/lib/usercopy_64.c
===================================================================
--- linux-2.6.orig/arch/x86/lib/usercopy_64.c
+++ linux-2.6/arch/x86/lib/usercopy_64.c
@@ -153,7 +153,7 @@ long __copy_user_flushcache(void *dst, c
return rc;
}
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
{
unsigned long dest = (unsigned long) _dst;
unsigned long source = (unsigned long) _src;
@@ -216,7 +216,7 @@ void memcpy_flushcache(void *_dst, const
clean_cache_range((void *) dest, size);
}
}
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len)
* Mikulas Patocka <[email protected]> wrote:
> Here I resend it:
>
>
> From: Mikulas Patocka <[email protected]>
> Subject: [PATCH] x86: optimize memcpy_flushcache
>
> I use memcpy_flushcache in my persistent memory driver for metadata
> updates; there are many 8-byte and 16-byte updates, and it turns out
> that the overhead of memcpy_flushcache causes a 2% performance
> degradation compared to the "movnti" instruction explicitly coded
> using inline assembler.
>
> The tests were done on a Skylake processor with persistent memory emulated
> using the "memmap" kernel parameter. dd was used to copy data to the
> dm-writecache target.
>
> This patch recognizes memcpy_flushcache calls with constant short length
> and turns them into inline assembler - so that I don't have to use inline
> assembler in the driver.
>
> Signed-off-by: Mikulas Patocka <[email protected]>
>
> ---
> arch/x86/include/asm/string_64.h | 20 +++++++++++++++++++-
> arch/x86/lib/usercopy_64.c | 4 ++--
> 2 files changed, 21 insertions(+), 3 deletions(-)
Applied to tip:x86/asm, thanks!
I'll push it out later today after some testing.
Thanks,
Ingo
Commit-ID: 02101c45ec5b19d607af7372680f5259050b4e9c
Gitweb: https://git.kernel.org/tip/02101c45ec5b19d607af7372680f5259050b4e9c
Author: Mikulas Patocka <[email protected]>
AuthorDate: Wed, 8 Aug 2018 17:22:16 -0400
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 10 Sep 2018 15:17:12 +0200
x86/asm: Optimize memcpy_flushcache()
I use memcpy_flushcache() in my persistent memory driver for metadata
updates; there are many 8-byte and 16-byte updates, and it turns out
that the overhead of memcpy_flushcache causes a 2% performance
degradation compared to the "movnti" instruction explicitly coded using
inline assembler.
The tests were done on a Skylake processor with persistent memory emulated
using the "memmap" kernel parameter. dd was used to copy data to the
dm-writecache target.
This patch recognizes memcpy_flushcache calls with constant short length
and turns them into inline assembler - so that I don't have to use inline
assembler in the driver.
Signed-off-by: Mikulas Patocka <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mike Snitzer <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: device-mapper development <[email protected]>
Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1808081720460.24747@file01.intranet.prod.int.rdu2.redhat.com
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/string_64.h | 20 +++++++++++++++++++-
arch/x86/lib/usercopy_64.c | 4 ++--
2 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index d33f92b9fa22..7ad41bfcc16c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -149,7 +149,25 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+ if (__builtin_constant_p(cnt)) {
+ switch (cnt) {
+ case 4:
+ asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
+ return;
+ case 8:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ return;
+ case 16:
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+ asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
+ return;
+ }
+ }
+ __memcpy_flushcache(dst, src, cnt);
+}
#endif
#endif /* __KERNEL__ */
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 9c5606d88f61..c50a1d815a37 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -153,7 +153,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
return rc;
}
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
{
unsigned long dest = (unsigned long) _dst;
unsigned long source = (unsigned long) _src;
@@ -216,7 +216,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size)
clean_cache_range((void *) dest, size);
}
}
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
size_t len)