2009-11-19 07:19:17

by Hitoshi Mitake

[permalink] [raw]
Subject: [PATCH] perf bench: Add new functions for utilizing rep of Nehalem

This patch adds two functions to bench/mem-memcpy.c:
memcpy_nehalem_rep() and memcpy_orig_of_nehalem_rep().

These functions are based on Ling Ma's post,
http://marc.info/?l=linux-kernel&m=125800144419838&w=2
whose purpose is to improve the performance of memcpy() on x86_64.
The nehalem-rep variant switches to "rep movsq"/"rep movsb" for
copies of 1024 bytes or more, instead of using the unrolled
64-byte loop of the current memcpy() for the whole copy.

On my Core i7 box, I measured the following improvement:

|% perf bench mem memcpy -c -l 1GB -r nehalem-rep
| # Running mem/memcpy benchmark...
| # Copying 1GB Bytes from 0x7ff9fc95a010 to 0x7ffa3c95b010 ...
|
| 3.099325 Clock/Byte
| % perf bench mem memcpy -c -l 1GB -r orig-of-nehalem-rep
| # Running mem/memcpy benchmark...
| # Copying 1GB Bytes from 0x7f314d1d7010 to 0x7f318d1d8010 ...
|
| 4.353351 Clock/Byte

(I ran the benchmark several times; the results were consistently
similar to the above.)
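
The Clock/Byte figure above is simply elapsed cycles divided by the
number of bytes copied. For reference, here is a minimal standalone
sketch of the same measurement, using rdtsc as a stand-in for the
cycle counter that perf bench reads (this snippet is not part of the
patch):

  /*
   * Standalone sketch, not part of this patch: Clock/Byte measured
   * with rdtsc as a stand-in for the cycle counter used by perf bench.
   */
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  static inline uint64_t rdtsc(void)
  {
  	uint32_t lo, hi;

  	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
  	return ((uint64_t)hi << 32) | lo;
  }

  int main(void)
  {
  	size_t len = 1UL << 30;		/* 1GB, as in the runs above */
  	void *src = calloc(1, len);
  	void *dst = calloc(1, len);
  	uint64_t start, end;

  	if (!src || !dst)
  		return 1;

  	start = rdtsc();
  	memcpy(dst, src, len);
  	end = rdtsc();

  	printf("%lf Clock/Byte\n", (double)(end - start) / (double)len);

  	free(src);
  	free(dst);
  	return 0;
  }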

Signed-off-by: Hitoshi Mitake <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ling Ma <[email protected]>
---
tools/perf/bench/mem-memcpy.c | 167 +++++++++++++++++++++++++++++++++++++++++
1 files changed, 167 insertions(+), 0 deletions(-)

diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index d4f4f98..349aeaf 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -37,6 +37,165 @@ static const struct option options[] = {
OPT_END()
};

+#ifdef __x86_64__
+
+/*
+ * This memcpy_nehalem_rep() is based on Ling Ma <[email protected]>'s
+ * memcpy_new(), optimized for the Nehalem architecture.
+ *
+ * http://marc.info/?l=linux-kernel&m=125800144419838&w=2
+ */
+static void *memcpy_nehalem_rep(void *dst, const void *src __used,
+ size_t len __used)
+{
+ __asm__(
+ "movq %rdi, %rax\n\t"
+ "movl %edx, %ecx\n\t"
+ "shrl $6, %ecx\n\t"
+ "jz nehalem_rep_2\n\t"
+
+ "cmp $0x400, %edx\n\t"
+ "jae nehalem_rep_7\n\t"
+
+ "nehalem_rep_1:\n\t"
+ "decl %ecx\n\t"
+ "movq 0*8(%rsi), %r11\n\t"
+ "movq 1*8(%rdi), %r8\n\t"
+ "movq %r11, 0*8(%rdi)\n\t"
+ "movq %r8, 1*8(%rdi)\n\t"
+ "movq 2*8(%rsi), %r9\n\t"
+ "movq 3*8(%rdi), %r10\n\t"
+ "movq %r9, 2*8(%rdi)\n\t"
+ "movq %r10, 3*8(%rdi)\n\t"
+ "movq 4*8(%rsi), %r11\n\t"
+ "movq 5*8(%rdi), %r8\n\t"
+ "movq %r11, 4*8(%rdi)\n\t"
+ "movq %r8, 5*8(%rdi)\n\t"
+ "movq 6*8(%rsi), %r9\n\t"
+ "movq 7*8(%rdi), %r10\n\t"
+ "movq %r9, 6*8(%rdi)\n\t"
+ "movq %r10, 7*8(%rdi)\n\t"
+ "leaq 64(%rsi), %rsi\n\t"
+ "leaq 64(%rdi), %rdi\n\t"
+ "jnz nehalem_rep_1\n\t"
+
+ "nehalem_rep_2:\n\t"
+ "movl %edx, %ecx\n\t"
+ "andl $63, %ecx\n\t"
+ "shrl $3, %ecx\n\t"
+ "jz nehalem_rep_4\n\t"
+
+ "nehalem_rep_3:\n\t"
+ "decl %ecx\n\t"
+ "movq (%rsi), %r8\n\t"
+ "movq %r8, (%rdi)\n\t"
+ "leaq 8(%rdi), %rdi\n\t"
+ "leaq 8(%rsi), %rsi\n\t"
+ "jnz nehalem_rep_3\n\t"
+
+ "nehalem_rep_4:\n\t"
+ "movl %edx, %ecx\n\t"
+ "andl $7, %ecx\n\t"
+ "jz nehalem_rep_6\n\t"
+
+ "nehalem_rep_5:\n\t"
+ "movb (%rsi), %r8b\n\t"
+ "movb %r8b, (%rdi)\n\t"
+ "incq %rdi\n\t"
+ "incq %rsi\n\t"
+ "decl %ecx\n\t"
+ "jnz nehalem_rep_5\n\t"
+ "nehalem_rep_6:\n\t"
+
+ "retq\n\t"
+ "nehalem_rep_7:\n\t"
+
+ "movl %edx, %ecx\n\t"
+ "shr $3, %ecx\n\t"
+ "andl $7, %edx\n\t"
+ "rep movsq \n\t"
+ "jz nehalem_rep_8\n\t"
+
+ "movl %edx, %ecx\n\t"
+ "rep movsb\n\t"
+ "nehalem_rep_8:\n\t"
+ );
+
+ return dst;
+}
+
+/*
+ * Original memcpy() from arch/x86/lib/memcpy_64.S.
+ * The main purpose of this function is comparison with
+ * memcpy_nehalem_rep().
+ */
+
+static void *memcpy_orig_of_nehalem_rep(void *dst, const void *src __used,
+ size_t len __used)
+{
+ __asm__(
+ "movq %rdi, %rax\n\t"
+ "movl %edx, %ecx\n\t"
+ "shrl $6, %ecx\n\t"
+ "jz orig_of_nehalem_rep2\n\t"
+
+ "mov $0x80, %r8d\n\t"
+ "orig_of_nehalem_rep1:\n\t"
+ "decl %ecx\n\t"
+ "movq 0*8(%rsi), %r11\n\t"
+ "movq 1*8(%rdi), %r8\n\t"
+ "movq %r11, 0*8(%rdi)\n\t"
+ "movq %r8, 1*8(%rdi)\n\t"
+ "movq 2*8(%rsi), %r9\n\t"
+ "movq 3*8(%rdi), %r10\n\t"
+ "movq %r9, 2*8(%rdi)\n\t"
+ "movq %r10, 3*8(%rdi)\n\t"
+ "movq 4*8(%rsi), %r11\n\t"
+ "movq 5*8(%rdi), %r8\n\t"
+ "movq %r11, 4*8(%rdi)\n\t"
+ "movq %r8, 5*8(%rdi)\n\t"
+ "movq 6*8(%rsi), %r9\n\t"
+ "movq 7*8(%rdi), %r10\n\t"
+ "movq %r9, 6*8(%rdi)\n\t"
+ "movq %r10, 7*8(%rdi)\n\t"
+ "leaq 64(%rsi), %rsi\n\t"
+ "leaq 64(%rdi), %rdi\n\t"
+ "jnz orig_of_nehalem_rep1\n\t"
+
+ "orig_of_nehalem_rep2:\n\t"
+ "movl %edx, %ecx\n\t"
+ "andl $63, %ecx\n\t"
+ "shrl $3, %ecx\n\t"
+ "jz orig_of_nehalem_rep4\n\t"
+
+ "orig_of_nehalem_rep3:\n\t"
+ "decl %ecx\n\t"
+ "movq (%rsi), %r8\n\t"
+ "movq %r8, (%rdi)\n\t"
+ "leaq 8(%rdi), %rdi\n\t"
+ "leaq 8(%rsi), %rsi\n\t"
+ "jnz orig_of_nehalem_rep3\n\t"
+
+ "orig_of_nehalem_rep4:\n\t"
+ "movl %edx, %ecx\n\t"
+ "andl $7, %ecx\n\t"
+ "jz orig_of_nehalem_rep6\n\t"
+
+ "orig_of_nehalem_rep5:\n\t"
+ "movb (%rsi), %r8b\n\t"
+ "movb %r8b, (%rdi)\n\t"
+ "incq %rdi\n\t"
+ "incq %rsi\n\t"
+ "decl %ecx\n\t"
+ "jnz orig_of_nehalem_rep5\n\t"
+
+ "orig_of_nehalem_rep6:\n\t"
+ );
+ return dst;
+}
+
+#endif /* __x86_64__ */
+
struct routine {
const char *name;
const char *desc;
@@ -47,6 +206,14 @@ struct routine routines[] = {
{ "default",
"Default memcpy() provided by glibc",
memcpy },
+#ifdef __x86_64__
+ { "nehalem-rep",
+ "Optimized memcpy() for Nehalem architecture",
+ memcpy_nehalem_rep },
+ { "orig-of-nehalem-rep",
+ "Original memcpy() from arch/x86/lib/memcpy_64.S",
+ memcpy_orig_of_nehalem_rep },
+#endif /* __x86_64__ */
{ NULL,
NULL,
NULL }
--
1.6.5.2
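
As an aside, plugging another candidate into the benchmark only
requires adding an entry to the routines[] table above: a name to
pass to -r, a description, and a function with memcpy()'s signature.
A hypothetical standalone sketch of that dispatch pattern (not part
of this patch, routine and variable names are made up):

  /*
   * Standalone sketch of the name -> function dispatch used by
   * tools/perf/bench/mem-memcpy.c; "my-variant" etc. are placeholders.
   */
  #include <stdio.h>
  #include <string.h>

  typedef void *(*memcpy_t)(void *dst, const void *src, size_t len);

  struct routine {
  	const char *name;
  	const char *desc;
  	memcpy_t fn;
  };

  static struct routine routines[] = {
  	{ "default", "Default memcpy() provided by glibc", memcpy },
  	{ NULL, NULL, NULL }
  };

  static memcpy_t find_routine(const char *name)
  {
  	int i;

  	for (i = 0; routines[i].name; i++)
  		if (!strcmp(routines[i].name, name))
  			return routines[i].fn;
  	return NULL;
  }

  int main(void)
  {
  	char src[64] = "hello", dst[64];
  	memcpy_t fn = find_routine("default");

  	if (!fn)
  		return 1;
  	fn(dst, src, sizeof(src));
  	printf("%s\n", dst);
  	return 0;
  }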