2017-06-01 06:59:05

by Chris Wilson

Subject: x86: Static optimisations for copy_user

I was looking at the overhead of drmIoctl() in a microbenchmark that
repeatedly did a copy_from_user(.size=8) followed by a
copy_to_user(.size=8) as part of the DRM_IOCTL_I915_GEM_BUSY ioctl. I
found that if I force-inlined the get_user/put_user instead, the
walltime of the ioctl improved by about 20%. If
copy_user_generic_unrolled was used instead of
copy_user_enhanced_fast_string, performance of the microbenchmark
improved by 10%. Benchmarking on a few machines gave:

(Broadwell)
benchmark_copy_user(hot):
size  unrolled  string  fast-string
   1       158      77           79
   2       306     154          158
   4       614     308          317
   6       926     462          476
   8      1344     298          635
  12      1773     482          952
  16      2797     602         1269
  24      4020     903         1906
  32      5055    1204         2540
  48      6150    1806         3810
  64      9564    2409         5082
  96     13583    3612         6483
 128     18108    4815         8434

(Broxton)
benchmark_copy_user(hot):
size  unrolled  string  fast-string
   1       270      52           53
   2       364     106          109
   4       460     213          218
   6       486     305          312
   8      1250     253          437
  12      1009     332          625
  16      2059     514          897
  24      2624     672         1071
  32      3043    1014         1750
  48      3620    1499         2561
  64      7777    1971         3333
  96      7499    2876         4772
 128      9999    3733         6088

which says that for this cache-hot case, the rep mov microcode
noticeably underperforms in this benchmark. Though once we pass a few
cachelines, and definitely after exceeding the L1 cache, rep mov is the
clear winner. From cold, there is no difference in timings.
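
For reference, the drmIoctl() microbenchmark mentioned at the top is
essentially a tight loop around DRM_IOCTL_I915_GEM_BUSY, whose 8-byte
argument is copied in and back out on every call. A minimal sketch
(illustrative only: object creation and error handling are omitted, and
the device node and handle values are placeholders):

#include <stdio.h>
#include <fcntl.h>
#include <time.h>
#include <xf86drm.h>
#include <i915_drm.h>

int main(void)
{
	int fd = open("/dev/dri/card0", O_RDWR);	 /* placeholder node */
	struct drm_i915_gem_busy busy = { .handle = 1 }; /* placeholder handle */
	struct timespec start, end;
	const long loops = 1 << 20;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (long i = 0; i < loops; i++)
		/* 8 bytes copied from user and 8 bytes copied back per call */
		drmIoctl(fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
	clock_gettime(CLOCK_MONOTONIC, &end);

	printf("%.1f ns/ioctl\n",
	       ((end.tv_sec - start.tv_sec) * 1e9 +
		(end.tv_nsec - start.tv_nsec)) / loops);
	return 0;
}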

I can improve the microbenchmark by either force-inlining the
raw_copy_*_user switches, or by switching to copy_user_generic_unrolled.
Both leave a sour taste. The switch is too big to be inlined, and if
called out-of-line the function call overhead negates its benefits.
Switching between fast-string and unrolled makes a presumption about
behaviour.

In the end, I limited this series to just adding a few extra
translations for copy_*_user() with statically known sizes.
-Chris


2017-06-01 06:59:07

by Chris Wilson

Subject: [PATCH 3/3] x86-64: Inline 6/12 byte copy_user

Extend the list of replacements for compile-time known sizes to include
6- and 12-byte copies. These expand to two movs (along with their
exception-table entries) and are cheaper to inline than the function
call (similar to the 10-byte copy already handled).
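
As an illustration only (this caller is hypothetical and not part of
the patch), copying a packed 6-byte structure with a compile-time
constant size now takes the new case 6 branch instead of calling out to
copy_user_generic():

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

/* Hypothetical 6-byte ioctl argument, purely for illustration. */
struct six_byte_arg {
	u32 handle;
	u16 flags;
} __packed;

static inline int fetch_arg(struct six_byte_arg *dst, const void __user *uptr)
{
	/*
	 * sizeof(*dst) == 6 is known at compile time, so the copy expands
	 * to a movl + movw pair with exception-table entries.
	 */
	if (copy_from_user(dst, uptr, sizeof(*dst)))
		return -EFAULT;
	return 0;
}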

Signed-off-by: Chris Wilson <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
---
arch/x86/include/asm/uaccess_64.h | 42 +++++++++++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c5504b9a472e..ff2d65baa988 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -71,6 +71,16 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
 			      ret, "l", "k", "=r", 4);
 		__uaccess_end();
 		return ret;
+	case 6:
+		__uaccess_begin();
+		__get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
+			      ret, "l", "k", "=r", 6);
+		if (likely(!ret))
+			__get_user_asm_nozero(*(u16 *)(4 + (char *)dst),
+				       (u16 __user *)(4 + (char __user *)src),
+				       ret, "w", "w", "=r", 2);
+		__uaccess_end();
+		return ret;
 	case 8:
 		__uaccess_begin();
 		__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
@@ -87,6 +97,16 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
 				       ret, "w", "w", "=r", 2);
 		__uaccess_end();
 		return ret;
+	case 12:
+		__uaccess_begin();
+		__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
+			      ret, "q", "", "=r", 12);
+		if (likely(!ret))
+			__get_user_asm_nozero(*(u32 *)(8 + (char *)dst),
+				       (u32 __user *)(8 + (char __user *)src),
+				       ret, "l", "k", "=r", 4);
+		__uaccess_end();
+		return ret;
 	case 16:
 		__uaccess_begin();
 		__get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
@@ -128,6 +148,17 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
 			      ret, "l", "k", "ir", 4);
 		__uaccess_end();
 		return ret;
+	case 6:
+		__uaccess_begin();
+		__put_user_asm(*(u32 *)src, (u32 __user *)dst,
+			      ret, "l", "k", "ir", 6);
+		if (likely(!ret)) {
+			asm("":::"memory");
+			__put_user_asm(2[(u16 *)src], 2 + (u16 __user *)dst,
+				       ret, "w", "w", "ir", 2);
+		}
+		__uaccess_end();
+		return ret;
 	case 8:
 		__uaccess_begin();
 		__put_user_asm(*(u64 *)src, (u64 __user *)dst,
@@ -145,6 +176,17 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
 		}
 		__uaccess_end();
 		return ret;
+	case 12:
+		__uaccess_begin();
+		__put_user_asm(*(u64 *)src, (u64 __user *)dst,
+			      ret, "q", "", "er", 12);
+		if (likely(!ret)) {
+			asm("":::"memory");
+			__put_user_asm(2[(u32 *)src], 2 + (u32 __user *)dst,
+				       ret, "l", "k", "ir", 4);
+		}
+		__uaccess_end();
+		return ret;
 	case 16:
 		__uaccess_begin();
 		__put_user_asm(*(u64 *)src, (u64 __user *)dst,
--
2.11.0

2017-06-01 07:00:30

by Chris Wilson

Subject: [PATCH 2/3] x86-32: Expand static copy_to_user()

For compile-time known sizes, teach x86-32 copy_to_user() to convert
them into simple inlined put_user sequences, similar to the
optimisation already applied to copy_from_user() and already used by
x86-64.
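
For illustration (hypothetical caller, not part of the patch), the kind
of user this helps is a fixed-size write-back of a result to userspace,
such as the 8-byte busy result from the ioctl in the cover letter; with
this change the constant-size copy_to_user() below becomes two inlined
stores on x86-32 rather than a call to __copy_user_ll():

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

/* Hypothetical write-back of an 8-byte result, for illustration only. */
static inline int report_result(void __user *uptr, u64 value)
{
	/*
	 * sizeof(value) == 8 is a compile-time constant, so on x86-32 this
	 * now expands to two movl stores plus their exception-table entries.
	 */
	return copy_to_user(uptr, &value, sizeof(value)) ? -EFAULT : 0;
}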

Signed-off-by: Chris Wilson <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
---
arch/x86/include/asm/uaccess_32.h | 48 +++++++++++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)

diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 44d17d1ab07c..a02aa9db34ed 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -16,6 +16,54 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero
 
 static __always_inline unsigned long __must_check
 raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
+	if (__builtin_constant_p(n)) {
+		unsigned long ret = 0;
+
+		switch (n) {
+		case 1:
+			__uaccess_begin();
+			__put_user_asm(*(u8 *)from, to, ret,
+				       "b", "b", "iq", 1);
+			__uaccess_end();
+			return ret;
+		case 2:
+			__uaccess_begin();
+			__put_user_asm(*(u16 *)from, to, ret,
+				       "w", "w", "ir", 2);
+			__uaccess_end();
+			return ret;
+		case 4:
+			__uaccess_begin();
+			__put_user_asm(*(u32 *)from, to, ret,
+				       "l", "k", "ir", 4);
+			__uaccess_end();
+			return ret;
+		case 6:
+			__uaccess_begin();
+			__put_user_asm(*(u32 *)from, to, ret,
+				       "l", "k", "ir", 6);
+			if (likely(!ret)) {
+				asm("":::"memory");
+				__put_user_asm(*(u16 *)(4 + (char *)from),
+					       (u16 __user *)(4 + (char __user *)to),
+					       ret, "w", "w", "ir", 2);
+			}
+			__uaccess_end();
+			return ret;
+		case 8:
+			__uaccess_begin();
+			__put_user_asm(*(u32 *)from, to, ret,
+				       "l", "k", "ir", 8);
+			if (likely(!ret)) {
+				asm("":::"memory");
+				__put_user_asm(*(u32 *)(4 + (char *)from),
+					       (u32 __user *)(4 + (char __user *)to),
+					       ret, "l", "k", "ir", 4);
+			}
+			__uaccess_end();
+			return ret;
+		}
+	}
 	return __copy_user_ll((__force void *)to, from, n);
 }

--
2.11.0

2017-06-01 07:00:28

by Chris Wilson

Subject: [PATCH 1/3] x86-32: Teach copy_from_user to unroll .size=6/8

Two exception-handled register moves are faster to inline than a call
to __copy_user_ll(). We already apply this conversion for a get_user()
call, so for symmetry we should also apply the optimisation to
copy_from_user().
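
The mechanism being extended here is the existing __builtin_constant_p()
switch in raw_copy_from_user(). A stand-alone userspace analogue
(illustrative only, with memcpy standing in for the exception-handled
user accesses) shows how the compiler folds the dispatch away for
constant sizes:

#include <stdint.h>
#include <string.h>

/* Stand-in for the out-of-line __copy_user_ll(). */
static size_t copy_out_of_line(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, n);
	return 0;
}

static inline size_t copy_fixed(void *dst, const void *src, size_t n)
{
	/*
	 * When n is a compile-time constant, the switch below collapses to
	 * a single case and the copy is inlined; otherwise we fall back to
	 * the out-of-line helper, much as raw_copy_from_user() does (minus
	 * the fault handling).
	 */
	if (__builtin_constant_p(n)) {
		switch (n) {
		case 4:
			memcpy(dst, src, 4);	/* one 32-bit move */
			return 0;
		case 8:
			memcpy(dst, src, 8);	/* two 32-bit moves on x86-32 */
			return 0;
		}
	}
	return copy_out_of_line(dst, src, n);
}

int main(void)
{
	uint64_t in = 0x0123456789abcdefULL, out;

	return (int)copy_fixed(&out, &in, sizeof(out));	/* takes the case 8 path */
}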

Signed-off-by: Chris Wilson <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
---
arch/x86/include/asm/uaccess_32.h | 25 +++++++++++++++++++++----
1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index aeda9bb8af50..44d17d1ab07c 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -23,30 +23,47 @@ static __always_inline unsigned long
 raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	if (__builtin_constant_p(n)) {
-		unsigned long ret;
+		unsigned long ret = 0;
 
 		switch (n) {
 		case 1:
-			ret = 0;
 			__uaccess_begin();
 			__get_user_asm_nozero(*(u8 *)to, from, ret,
 					      "b", "b", "=q", 1);
 			__uaccess_end();
 			return ret;
 		case 2:
-			ret = 0;
 			__uaccess_begin();
 			__get_user_asm_nozero(*(u16 *)to, from, ret,
 					      "w", "w", "=r", 2);
 			__uaccess_end();
 			return ret;
 		case 4:
-			ret = 0;
 			__uaccess_begin();
 			__get_user_asm_nozero(*(u32 *)to, from, ret,
 					      "l", "k", "=r", 4);
 			__uaccess_end();
 			return ret;
+		case 6:
+			__uaccess_begin();
+			__get_user_asm_nozero(*(u32 *)to, from, ret,
+					      "l", "k", "=r", 6);
+			if (likely(!ret))
+				__get_user_asm_nozero(*(u16 *)(4 + (char *)to),
+						      (u16 __user *)(4 + (char __user *)from),
+						      ret, "w", "w", "=r", 2);
+			__uaccess_end();
+			return ret;
+		case 8:
+			__uaccess_begin();
+			__get_user_asm_nozero(*(u32 *)to, from, ret,
+					      "l", "k", "=r", 8);
+			if (likely(!ret))
+				__get_user_asm_nozero(*(u32 *)(4 + (char *)to),
+						      (u32 __user *)(4 + (char __user *)from),
+						      ret, "l", "k", "=r", 4);
+			__uaccess_end();
+			return ret;
 		}
 	}
 	return __copy_user_ll(to, (__force const void *)from, n);
--
2.11.0