Hi Ingo,
Misaligned memory accesses can degrade the performance of a
simple memory copy. So this patch adds options to specify the
alignment of the source and destination regions passed to memcpy().
The current maximum alignment is 8 bytes; should this value
be configurable?
I'll test Miao Xie's patch with this option later.
Example of use:
| mitake@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 748.866217 MB/Sec
| 4.521793 GB/Sec (with prefault)
| mitake@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB -r x86-64-unrolled -d 3
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 769.653487 MB/Sec
| 3.518181 GB/Sec (with prefault)
In the latter case, access to the destination memory region is shifted by 3 bytes,
and performance degradation is observed in the prefaulted copy.
Signed-off-by: Hitoshi Mitake <[email protected]>
Cc: Miao Xie <[email protected]>
Cc: Ma Ling <[email protected]>
Cc: Zhao Yakui <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Paul Mackerras <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Andi Kleen <[email protected]>
---
tools/perf/bench/mem-memcpy.c | 42 +++++++++++++++++++++++++++++-----------
1 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index db82021..ac88f52 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -19,6 +19,7 @@
#include <string.h>
#include <sys/time.h>
#include <errno.h>
+#include <unistd.h>
#define K 1024
@@ -28,6 +29,8 @@ static bool use_clock;
static int clock_fd;
static bool only_prefault;
static bool no_prefault;
+static int src_align;
+static int dst_align;
static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
@@ -41,6 +44,10 @@ static const struct option options[] = {
"Show only the result with page faults before memcpy()"),
OPT_BOOLEAN('n', "no-prefault", &no_prefault,
"Show only the result without page faults before memcpy()"),
+ OPT_INTEGER('s', "src-alignment", &src_align,
+ "Alignment of source memory region (in byte)"),
+ OPT_INTEGER('d', "dst-alignment", &dst_align,
+ "Alignment of destination memory region (in byte)"),
OPT_END()
};
@@ -79,6 +86,9 @@ static struct perf_event_attr clock_attr = {
.config = PERF_COUNT_HW_CPU_CYCLES
};
+/* Should this alignment be configurable? */
+#define ALIGNMENT 8
+
static void init_clock(void)
{
clock_fd = sys_perf_event_open(&clock_attr, getpid(), -1, -1, 0);
@@ -108,27 +118,29 @@ static double timeval2double(struct timeval *ts)
static void alloc_mem(void **dst, void **src, size_t length)
{
- *dst = zalloc(length);
- if (!dst)
+ int ret;
+
+ ret = posix_memalign(dst, ALIGNMENT, length + ALIGNMENT - 1);
+ if (ret)
die("memory allocation failed - maybe length is too large?\n");
- *src = zalloc(length);
- if (!src)
+ ret = posix_memalign(src, ALIGNMENT, length + ALIGNMENT - 1);
+ if (ret)
die("memory allocation failed - maybe length is too large?\n");
}
static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
{
u64 clock_start = 0ULL, clock_end = 0ULL;
- void *src = NULL, *dst = NULL;
+ char *src = NULL, *dst = NULL;
- alloc_mem(&src, &dst, len);
+ alloc_mem((void **)&src, (void **)&dst, len);
if (prefault)
- fn(dst, src, len);
+ fn(dst + dst_align, src + src_align, len);
clock_start = get_clock();
- fn(dst, src, len);
+ fn(dst + dst_align, src + src_align, len);
clock_end = get_clock();
free(src);
@@ -139,15 +151,15 @@ static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
{
struct timeval tv_start, tv_end, tv_diff;
- void *src = NULL, *dst = NULL;
+ char *src = NULL, *dst = NULL;
- alloc_mem(&src, &dst, len);
+ alloc_mem((void **)&src, (void **)&dst, len);
if (prefault)
- fn(dst, src, len);
+ fn(dst + dst_align, src + src_align, len);
BUG_ON(gettimeofday(&tv_start, NULL));
- fn(dst, src, len);
+ fn(dst + dst_align, src + src_align, len);
BUG_ON(gettimeofday(&tv_end, NULL));
timersub(&tv_end, &tv_start, &tv_diff);
@@ -198,6 +210,12 @@ int bench_mem_memcpy(int argc, const char **argv,
if (only_prefault && no_prefault)
only_prefault = no_prefault = false;
+ if (ALIGNMENT <= src_align || ALIGNMENT <= dst_align) {
+ fprintf(stderr, "Alignment is too large,"
+ "it should be shorter than %d Byte\n", ALIGNMENT);
+ return 1;
+ }
+
for (i = 0; routines[i].name; i++) {
if (!strcmp(routines[i].name, routine))
break;
--
1.7.1.1