Subject: [PATCH 1/2] perf bench futex: cache align the worker struct

It popped up in perf testing that the worker consumes some amount of
CPU. It boils down to the increment of `ops` which causes cache line
bouncing between the individual threads.
The patch aligns the struct to 256 bytes to ensure that no cache line
is shared among CPUs. 128 bytes is the x86 worst case and grep says that
L1_CACHE_SHIFT is set to 8 on s390.

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
tools/perf/bench/futex-hash.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 8024cd5febd2..d9e5e80bb4d0 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -39,12 +39,15 @@ static unsigned int threads_starting;
static struct stats throughput_stats;
static pthread_cond_t thread_parent, thread_worker;

+#define SMP_CACHE_BYTES 256
+#define __cacheline_aligned __attribute__ ((aligned (SMP_CACHE_BYTES)))
+
struct worker {
int tid;
u_int32_t *futex;
pthread_t thread;
unsigned long ops;
-};
+} __cacheline_aligned;

static const struct option options[] = {
OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),
--
2.9.3


Subject: [PATCH 2/2] perf bench futex: add NUMA support

By default the application uses malloc() and all available CPUs. This
patch introduces NUMA support which means:
- memory is allocated node local via numa_alloc_local()
- all CPUs of the specified NUMA node are used. This is also true if the
number of threads set is greater than the number of CPUs available on
this node.

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
tools/perf/bench/Build | 4 ++
tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
2 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index 60bf11943047..9e6e518d7d62 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -1,3 +1,7 @@
+ifdef CONFIG_NUMA
+CFLAGS_futex-hash.o += -DCONFIG_NUMA=1
+endif
+
perf-y += sched-messaging.o
perf-y += sched-pipe.o
perf-y += mem-functions.o
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index d9e5e80bb4d0..8db4a5bd6a4e 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -25,6 +25,9 @@

#include <err.h>
#include <sys/time.h>
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#endif

static unsigned int nthreads = 0;
static unsigned int nsecs = 10;
@@ -32,6 +35,7 @@ static unsigned int nsecs = 10;
static unsigned int nfutexes = 1024;
static bool fshared = false, done = false, silent = false;
static int futex_flag = 0;
+static int numa_node = -1;

struct timeval start, end, runtime;
static pthread_mutex_t thread_lock;
@@ -55,9 +59,28 @@ static const struct option options[] = {
OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per threads"),
OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"),
OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"),
+#ifdef CONFIG_NUMA
+ OPT_INTEGER( 'n', "numa", &numa_node, "Specify the NUMA node"),
+#endif
OPT_END()
};

+#ifndef CONFIG_NUMA
+static int numa_run_on_node(int node __maybe_unused) { return 0; }
+static int numa_node_of_cpu(int node __maybe_unused) { return 0; }
+static void *numa_alloc_local(size_t size) { return malloc(size); }
+static void numa_free(void *p, size_t size __maybe_unused) { return free(p); }
+#endif
+
+static bool cpu_is_local(int cpu)
+{
+ if (numa_node < 0)
+ return true;
+ if (numa_node_of_cpu(cpu) == numa_node)
+ return true;
+ return false;
+}
+
static const char * const bench_futex_hash_usage[] = {
"perf bench futex hash <options>",
NULL
@@ -123,6 +146,8 @@ int bench_futex_hash(int argc, const char **argv,
unsigned int i, ncpus;
pthread_attr_t thread_attr;
struct worker *worker = NULL;
+ char *node_str = NULL;
+ unsigned int cpunum;

argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
if (argc) {
@@ -136,18 +161,50 @@ int bench_futex_hash(int argc, const char **argv,
act.sa_sigaction = toggle_done;
sigaction(SIGINT, &act, NULL);

- if (!nthreads) /* default to the number of CPUs */
- nthreads = ncpus;
+ if (!nthreads) {
+ /* default to the number of CPUs per NUMA node */
+ if (numa_node < 0) {
+ nthreads = ncpus;
+ } else {
+ for (i = 0; i < ncpus; i++) {
+ if (cpu_is_local(i))
+ nthreads++;
+ }
+ if (!nthreads)
+ err(EXIT_FAILURE, "No online CPUs for this node");
+ }
+ } else {
+ int cpu_available = 0;

- worker = calloc(nthreads, sizeof(*worker));
+ for (i = 0; i < ncpus && !cpu_available; i++) {
+ if (cpu_is_local(i))
+ cpu_available = 1;
+ }
+ if (!cpu_available)
+ err(EXIT_FAILURE, "No online CPUs for this node");
+ }
+
+ if (numa_node >= 0) {
+ ret = numa_run_on_node(numa_node);
+ if (ret < 0)
+ err(EXIT_FAILURE, "numa_run_on_node");
+ ret = asprintf(&node_str, " on node %d", numa_node);
+ if (ret < 0)
+ err(EXIT_FAILURE, "numa_node, asprintf");
+ }
+
+ worker = numa_alloc_local(nthreads * sizeof(*worker));
if (!worker)
goto errmem;

if (!fshared)
futex_flag = FUTEX_PRIVATE_FLAG;

- printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n",
- getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs);
+ printf("Run summary [PID %d]: %d threads%s, each operating on %d [%s] futexes for %d secs.\n\n",
+ getpid(), nthreads,
+ node_str ? : "",
+ nfutexes, fshared ? "shared":"private",
+ nsecs);

init_stats(&throughput_stats);
pthread_mutex_init(&thread_lock, NULL);
@@ -157,14 +214,24 @@ int bench_futex_hash(int argc, const char **argv,
threads_starting = nthreads;
pthread_attr_init(&thread_attr);
gettimeofday(&start, NULL);
- for (i = 0; i < nthreads; i++) {
+ for (cpunum = 0, i = 0; i < nthreads; i++, cpunum++) {
+
+ do {
+ if (cpu_is_local(cpunum))
+ break;
+ cpunum++;
+ if (cpunum > ncpus)
+ cpunum = 0;
+ } while (1);
+
worker[i].tid = i;
- worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex));
+ worker[i].futex = numa_alloc_local(nfutexes *
+ sizeof(*worker[i].futex));
if (!worker[i].futex)
goto errmem;

CPU_ZERO(&cpu);
- CPU_SET(i % ncpus, &cpu);
+ CPU_SET(cpunum % ncpus, &cpu);

ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
if (ret)
@@ -211,12 +278,12 @@ int bench_futex_hash(int argc, const char **argv,
&worker[i].futex[nfutexes-1], t);
}

- free(worker[i].futex);
+ numa_free(worker[i].futex, nfutexes * sizeof(*worker[i].futex));
}

print_summary();

- free(worker);
+ numa_free(worker, nthreads * sizeof(*worker));
return ret;
errmem:
err(EXIT_FAILURE, "calloc");
--
2.9.3

2016-10-17 14:38:35

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf bench futex: add NUMA support

Em Sun, Oct 16, 2016 at 09:08:03PM +0200, Sebastian Andrzej Siewior escreveu:
> By default the application uses malloc() and all available CPUs. This
> patch introduces NUMA support which means:
> - memory is allocated node local via numa_alloc_local()
> - all CPUs of the specified NUMA node are used. This is also true if the
> number of threads set is greater than the number of CPUs available on
> this node.
>
> Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
> ---
> tools/perf/bench/Build | 4 ++
> tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 81 insertions(+), 10 deletions(-)
>
> diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
> index 60bf11943047..9e6e518d7d62 100644
> --- a/tools/perf/bench/Build
> +++ b/tools/perf/bench/Build
> @@ -1,3 +1,7 @@
> +ifdef CONFIG_NUMA
> +CFLAGS_futex-hash.o += -DCONFIG_NUMA=1
> +endif

Jiri, do we really need this? I.e. aren't the CONFIG_FOO defines
available to tools?

> perf-y += sched-messaging.o
> perf-y += sched-pipe.o
> perf-y += mem-functions.o
> diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
> index d9e5e80bb4d0..8db4a5bd6a4e 100644
> --- a/tools/perf/bench/futex-hash.c
> +++ b/tools/perf/bench/futex-hash.c
> @@ -25,6 +25,9 @@
>
> #include <err.h>
> #include <sys/time.h>
> +#ifdef CONFIG_NUMA
> +#include <numa.h>
> +#endif
>
> static unsigned int nthreads = 0;
> static unsigned int nsecs = 10;
> @@ -32,6 +35,7 @@ static unsigned int nsecs = 10;
> static unsigned int nfutexes = 1024;
> static bool fshared = false, done = false, silent = false;
> static int futex_flag = 0;
> +static int numa_node = -1;
>
> struct timeval start, end, runtime;
> static pthread_mutex_t thread_lock;
> @@ -55,9 +59,28 @@ static const struct option options[] = {
> OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per threads"),
> OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"),
> OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"),
> +#ifdef CONFIG_NUMA
> + OPT_INTEGER( 'n', "numa", &numa_node, "Specify the NUMA node"),
> +#endif
> OPT_END()
> };
>
> +#ifndef CONFIG_NUMA
> +static int numa_run_on_node(int node __maybe_unused) { return 0; }
> +static int numa_node_of_cpu(int node __maybe_unused) { return 0; }
> +static void *numa_alloc_local(size_t size) { return malloc(size); }
> +static void numa_free(void *p, size_t size __maybe_unused) { return free(p); }
> +#endif
> +
> +static bool cpu_is_local(int cpu)
> +{
> + if (numa_node < 0)
> + return true;
> + if (numa_node_of_cpu(cpu) == numa_node)
> + return true;
> + return false;
> +}
> +
> static const char * const bench_futex_hash_usage[] = {
> "perf bench futex hash <options>",
> NULL
> @@ -123,6 +146,8 @@ int bench_futex_hash(int argc, const char **argv,
> unsigned int i, ncpus;
> pthread_attr_t thread_attr;
> struct worker *worker = NULL;
> + char *node_str = NULL;
> + unsigned int cpunum;
>
> argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
> if (argc) {
> @@ -136,18 +161,50 @@ int bench_futex_hash(int argc, const char **argv,
> act.sa_sigaction = toggle_done;
> sigaction(SIGINT, &act, NULL);
>
> - if (!nthreads) /* default to the number of CPUs */
> - nthreads = ncpus;
> + if (!nthreads) {
> + /* default to the number of CPUs per NUMA node */
> + if (numa_node < 0) {
> + nthreads = ncpus;
> + } else {
> + for (i = 0; i < ncpus; i++) {
> + if (cpu_is_local(i))
> + nthreads++;
> + }
> + if (!nthreads)
> + err(EXIT_FAILURE, "No online CPUs for this node");
> + }
> + } else {
> + int cpu_available = 0;
>
> - worker = calloc(nthreads, sizeof(*worker));
> + for (i = 0; i < ncpus && !cpu_available; i++) {
> + if (cpu_is_local(i))
> + cpu_available = 1;
> + }
> + if (!cpu_available)
> + err(EXIT_FAILURE, "No online CPUs for this node");
> + }
> +
> + if (numa_node >= 0) {
> + ret = numa_run_on_node(numa_node);
> + if (ret < 0)
> + err(EXIT_FAILURE, "numa_run_on_node");
> + ret = asprintf(&node_str, " on node %d", numa_node);
> + if (ret < 0)
> + err(EXIT_FAILURE, "numa_node, asprintf");
> + }
> +
> + worker = numa_alloc_local(nthreads * sizeof(*worker));
> if (!worker)
> goto errmem;
>
> if (!fshared)
> futex_flag = FUTEX_PRIVATE_FLAG;
>
> - printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n",
> - getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs);
> + printf("Run summary [PID %d]: %d threads%s, each operating on %d [%s] futexes for %d secs.\n\n",
> + getpid(), nthreads,
> + node_str ? : "",
> + nfutexes, fshared ? "shared":"private",
> + nsecs);
>
> init_stats(&throughput_stats);
> pthread_mutex_init(&thread_lock, NULL);
> @@ -157,14 +214,24 @@ int bench_futex_hash(int argc, const char **argv,
> threads_starting = nthreads;
> pthread_attr_init(&thread_attr);
> gettimeofday(&start, NULL);
> - for (i = 0; i < nthreads; i++) {
> + for (cpunum = 0, i = 0; i < nthreads; i++, cpunum++) {
> +
> + do {
> + if (cpu_is_local(cpunum))
> + break;
> + cpunum++;
> + if (cpunum > ncpus)
> + cpunum = 0;
> + } while (1);
> +
> worker[i].tid = i;
> - worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex));
> + worker[i].futex = numa_alloc_local(nfutexes *
> + sizeof(*worker[i].futex));
> if (!worker[i].futex)
> goto errmem;
>
> CPU_ZERO(&cpu);
> - CPU_SET(i % ncpus, &cpu);
> + CPU_SET(cpunum % ncpus, &cpu);
>
> ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
> if (ret)
> @@ -211,12 +278,12 @@ int bench_futex_hash(int argc, const char **argv,
> &worker[i].futex[nfutexes-1], t);
> }
>
> - free(worker[i].futex);
> + numa_free(worker[i].futex, nfutexes * sizeof(*worker[i].futex));
> }
>
> print_summary();
>
> - free(worker);
> + numa_free(worker, nthreads * sizeof(*worker));
> return ret;
> errmem:
> err(EXIT_FAILURE, "calloc");
> --
> 2.9.3

2016-10-17 15:01:36

by Jiri Olsa

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf bench futex: add NUMA support

On Mon, Oct 17, 2016 at 11:38:21AM -0300, Arnaldo Carvalho de Melo wrote:
> Em Sun, Oct 16, 2016 at 09:08:03PM +0200, Sebastian Andrzej Siewior escreveu:
> > By default the application uses malloc() and all available CPUs. This
> > patch introduces NUMA support which means:
> > - memory is allocated node local via numa_alloc_local()
> > - all CPUs of the specified NUMA node are used. This is also true if the
> > number of threads set is greater than the number of CPUs available on
> > this node.
> >
> > Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
> > ---
> > tools/perf/bench/Build | 4 ++
> > tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
> > 2 files changed, 81 insertions(+), 10 deletions(-)
> >
> > diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
> > index 60bf11943047..9e6e518d7d62 100644
> > --- a/tools/perf/bench/Build
> > +++ b/tools/perf/bench/Build
> > @@ -1,3 +1,7 @@
> > +ifdef CONFIG_NUMA
> > +CFLAGS_futex-hash.o += -DCONFIG_NUMA=1
> > +endif
>
> Jiri, do we really need this? I.e. aren't the CONFIG_FOO defines
> available to tools?

not directly ATM.. it's prepared for the .config customary setting feature

meanwhile we set HAVE_* defines, like for CONFIG_NUMA we have:
-DHAVE_LIBNUMA_SUPPORT

you can check it in Makefile.config

jirka

2016-10-17 15:04:59

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 2/2] perf bench futex: add NUMA support

Em Mon, Oct 17, 2016 at 05:01:23PM +0200, Jiri Olsa escreveu:
> On Mon, Oct 17, 2016 at 11:38:21AM -0300, Arnaldo Carvalho de Melo wrote:
> > Em Sun, Oct 16, 2016 at 09:08:03PM +0200, Sebastian Andrzej Siewior escreveu:
> > > By default the application uses malloc() and all available CPUs. This
> > > patch introduces NUMA support which means:
> > > - memory is allocated node local via numa_alloc_local()
> > > - all CPUs of the specified NUMA node are used. This is also true if the
> > > number of threads set is greater than the number of CPUs available on
> > > this node.
> > >
> > > Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
> > > ---
> > > tools/perf/bench/Build | 4 ++
> > > tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
> > > 2 files changed, 81 insertions(+), 10 deletions(-)
> > >
> > > diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
> > > index 60bf11943047..9e6e518d7d62 100644
> > > --- a/tools/perf/bench/Build
> > > +++ b/tools/perf/bench/Build
> > > @@ -1,3 +1,7 @@
> > > +ifdef CONFIG_NUMA
> > > +CFLAGS_futex-hash.o += -DCONFIG_NUMA=1
> > > +endif
> >
> > Jiri, do we really need this? I.e. aren't the CONFIG_FOO defines
> > available to tools?
>
> not directly ATM.. it's prepared for the .config customary setting feature
>
> meanwhile we set HAVE_* defines, like for CONFIG_NUMA we have:
> -DHAVE_LIBNUMA_SUPPORT
>
> you can check it in Makefile.config

So, Andrzej, can you use that instead? I merged the first patch already.

- Arnaldo

Subject: [PATCH 2/2 v2] perf bench futex: add NUMA support

By default the application uses malloc() and all available CPUs. This
patch introduces NUMA support which means:
- memory is allocated node local via numa_alloc_local()
- all CPUs of the specified NUMA node are used. This is also true if the
number of threads set is greater than the number of CPUs available on
this node.

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
---
v1…v2: use HAVE_LIBNUMA_SUPPORT instead of the custom variant

tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
1 file changed, 77 insertions(+), 10 deletions(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index d9e5e80bb4d0..17d38ce829f8 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -25,6 +25,9 @@

#include <err.h>
#include <sys/time.h>
+#ifdef HAVE_LIBNUMA_SUPPORT
+#include <numa.h>
+#endif

static unsigned int nthreads = 0;
static unsigned int nsecs = 10;
@@ -32,6 +35,7 @@ static unsigned int nsecs = 10;
static unsigned int nfutexes = 1024;
static bool fshared = false, done = false, silent = false;
static int futex_flag = 0;
+static int numa_node = -1;

struct timeval start, end, runtime;
static pthread_mutex_t thread_lock;
@@ -55,9 +59,28 @@ static const struct option options[] = {
OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per threads"),
OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"),
OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"),
+#ifdef HAVE_LIBNUMA_SUPPORT
+ OPT_INTEGER( 'n', "numa", &numa_node, "Specify the NUMA node"),
+#endif
OPT_END()
};

+#ifndef HAVE_LIBNUMA_SUPPORT
+static int numa_run_on_node(int node __maybe_unused) { return 0; }
+static int numa_node_of_cpu(int node __maybe_unused) { return 0; }
+static void *numa_alloc_local(size_t size) { return malloc(size); }
+static void numa_free(void *p, size_t size __maybe_unused) { return free(p); }
+#endif
+
+static bool cpu_is_local(int cpu)
+{
+ if (numa_node < 0)
+ return true;
+ if (numa_node_of_cpu(cpu) == numa_node)
+ return true;
+ return false;
+}
+
static const char * const bench_futex_hash_usage[] = {
"perf bench futex hash <options>",
NULL
@@ -123,6 +146,8 @@ int bench_futex_hash(int argc, const char **argv,
unsigned int i, ncpus;
pthread_attr_t thread_attr;
struct worker *worker = NULL;
+ char *node_str = NULL;
+ unsigned int cpunum;

argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
if (argc) {
@@ -136,18 +161,50 @@ int bench_futex_hash(int argc, const char **argv,
act.sa_sigaction = toggle_done;
sigaction(SIGINT, &act, NULL);

- if (!nthreads) /* default to the number of CPUs */
- nthreads = ncpus;
+ if (!nthreads) {
+ /* default to the number of CPUs per NUMA node */
+ if (numa_node < 0) {
+ nthreads = ncpus;
+ } else {
+ for (i = 0; i < ncpus; i++) {
+ if (cpu_is_local(i))
+ nthreads++;
+ }
+ if (!nthreads)
+ err(EXIT_FAILURE, "No online CPUs for this node");
+ }
+ } else {
+ int cpu_available = 0;

- worker = calloc(nthreads, sizeof(*worker));
+ for (i = 0; i < ncpus && !cpu_available; i++) {
+ if (cpu_is_local(i))
+ cpu_available = 1;
+ }
+ if (!cpu_available)
+ err(EXIT_FAILURE, "No online CPUs for this node");
+ }
+
+ if (numa_node >= 0) {
+ ret = numa_run_on_node(numa_node);
+ if (ret < 0)
+ err(EXIT_FAILURE, "numa_run_on_node");
+ ret = asprintf(&node_str, " on node %d", numa_node);
+ if (ret < 0)
+ err(EXIT_FAILURE, "numa_node, asprintf");
+ }
+
+ worker = numa_alloc_local(nthreads * sizeof(*worker));
if (!worker)
goto errmem;

if (!fshared)
futex_flag = FUTEX_PRIVATE_FLAG;

- printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n",
- getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs);
+ printf("Run summary [PID %d]: %d threads%s, each operating on %d [%s] futexes for %d secs.\n\n",
+ getpid(), nthreads,
+ node_str ? : "",
+ nfutexes, fshared ? "shared":"private",
+ nsecs);

init_stats(&throughput_stats);
pthread_mutex_init(&thread_lock, NULL);
@@ -157,14 +214,24 @@ int bench_futex_hash(int argc, const char **argv,
threads_starting = nthreads;
pthread_attr_init(&thread_attr);
gettimeofday(&start, NULL);
- for (i = 0; i < nthreads; i++) {
+ for (cpunum = 0, i = 0; i < nthreads; i++, cpunum++) {
+
+ do {
+ if (cpu_is_local(cpunum))
+ break;
+ cpunum++;
+ if (cpunum > ncpus)
+ cpunum = 0;
+ } while (1);
+
worker[i].tid = i;
- worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex));
+ worker[i].futex = numa_alloc_local(nfutexes *
+ sizeof(*worker[i].futex));
if (!worker[i].futex)
goto errmem;

CPU_ZERO(&cpu);
- CPU_SET(i % ncpus, &cpu);
+ CPU_SET(cpunum % ncpus, &cpu);

ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
if (ret)
@@ -211,12 +278,12 @@ int bench_futex_hash(int argc, const char **argv,
&worker[i].futex[nfutexes-1], t);
}

- free(worker[i].futex);
+ numa_free(worker[i].futex, nfutexes * sizeof(*worker[i].futex));
}

print_summary();

- free(worker);
+ numa_free(worker, nthreads * sizeof(*worker));
return ret;
errmem:
err(EXIT_FAILURE, "calloc");
--
2.9.3

2016-10-18 01:10:09

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH 1/2] perf bench futex: cache align the worker struct

On Sun, 16 Oct 2016, Sebastian Andrzej Siewior wrote:

>It popped up in perf testing that the worker consumes some amount of
>CPU. It boils down to the increment of `ops` which causes cache line
>bouncing between the individual threads.

Are you referring to this?

│ for (i = 0; i < nfutexes; i++, w->ops++) {
│ be: add $0x1,%ebx
65.87 │ addq $0x1,0x18(%r12)

(which is like 65% of 13% on my box with a default futex-hash run).

Even better, could we get rid entirely of the ops increments and just
use a local variable, then update the worker at the end of the function.
The following makes 'perf' pretty much disappear in the profile.

Thanks,
Davidlohr

----8<--------------
From: Davidlohr Bueso <[email protected]>
Subject: [PATCH] perf/bench-futex: Avoid worker cacheline bouncing

Sebastian noted that overhead for worker thread ops (throughput)
accounting was causing 'perf' to appear in the profiles, consuming
a non-trivial (ie 13%) amount of CPU. This is due to cacheline
bouncing caused by the increment of w->ops. We can easily fix this by
just working on a local copy and updating the actual worker once
done running, and ready to show the program summary. There is no
danger of the worker being concurrent, so we can trust that no stale
value is being seen by another thread.

Reported-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Davidlohr Bueso <[email protected]>
---
tools/perf/bench/futex-hash.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 8024cd5febd2..0b9583164da0 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -63,8 +63,10 @@ static const char * const bench_futex_hash_usage[] = {
static void *workerfn(void *arg)
{
int ret;
- unsigned int i;
struct worker *w = (struct worker *) arg;
+ unsigned int i;
+ unsigned long ops = w->ops; /* avoid cacheline-bouncing */
+

pthread_mutex_lock(&thread_lock);
threads_starting--;
@@ -74,7 +76,7 @@ static void *workerfn(void *arg)
pthread_mutex_unlock(&thread_lock);

do {
- for (i = 0; i < nfutexes; i++, w->ops++) {
+ for (i = 0; i < nfutexes; i++, ops++) {
/*
* We want the futex calls to fail in order to stress
* the hashing of uaddr and not measure other steps,
@@ -88,6 +90,8 @@ static void *workerfn(void *arg)
}
} while (!done);

+ w->ops = ops;
+
return NULL;
}

--
2.6.6

Subject: Re: [PATCH 1/2] perf bench futex: cache align the worker struct

On 2016-10-17 18:09:49 [-0700], Davidlohr Bueso wrote:
> On Sun, 16 Oct 2016, Sebastian Andrzej Siewior wrote:
>
> > It popped up in perf testing that the worker consumes some amount of
> > CPU. It boils down to the increment of `ops` which causes cache line
> > bouncing between the individual threads.
>
> Are you referring to this?
>
> │ for (i = 0; i < nfutexes; i++, w->ops++) {
> │ be: add $0x1,%ebx
> 65.87 │ addq $0x1,0x18(%r12)
>
> (which is like 65% of 13% on my box with a default futex-hash run).
correct.

> Even better, could we get rid entirely of the ops increments and just
> use a local variable, then update the worker at the end of the function.
> The following makes 'perf' pretty much disappear in the profile.

this should do it, too. So what remains is the read access for w->futex
but since it does not pop up in perf, it is probably not that important.

> Thanks,
> Davidlohr

Sebastian

2016-10-19 17:59:48

by Davidlohr Bueso

[permalink] [raw]
Subject: [PATCH] perf/bench-futex: Avoid worker cacheline bouncing

Sebastian noted that overhead for worker thread ops (throughput)
accounting was causing 'perf' to appear in the profiles, consuming
a non-trivial (ie 13%) amount of CPU. This is due to cacheline
bouncing caused by the increment of w->ops. We can easily fix this by
just working on a local copy and updating the actual worker once
done running, and ready to show the program summary. There is no
danger of the worker being concurrent, so we can trust that no stale
value is being seen by another thread.

Reported-by: Sebastian Andrzej Siewior <[email protected]>
Signed-off-by: Davidlohr Bueso <[email protected]>
---
This is an updated version as I noticed lock-pi was also doing this.
acme, could you please pick up if no objection? Thanks.

tools/perf/bench/futex-hash.c | 6 ++++--
tools/perf/bench/futex-lock-pi.c | 4 +++-
2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 8024cd5febd2..da04b8c5568a 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -63,8 +63,9 @@ static const char * const bench_futex_hash_usage[] = {
static void *workerfn(void *arg)
{
int ret;
- unsigned int i;
struct worker *w = (struct worker *) arg;
+ unsigned int i;
+ unsigned long ops = w->ops; /* avoid cacheline bouncing */

pthread_mutex_lock(&thread_lock);
threads_starting--;
@@ -74,7 +75,7 @@ static void *workerfn(void *arg)
pthread_mutex_unlock(&thread_lock);

do {
- for (i = 0; i < nfutexes; i++, w->ops++) {
+ for (i = 0; i < nfutexes; i++, ops++) {
/*
* We want the futex calls to fail in order to stress
* the hashing of uaddr and not measure other steps,
@@ -88,6 +89,7 @@ static void *workerfn(void *arg)
}
} while (!done);

+ w->ops = ops;
return NULL;
}

diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index 936d89d30483..7032e4643c65 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -75,6 +75,7 @@ static void toggle_done(int sig __maybe_unused,
static void *workerfn(void *arg)
{
struct worker *w = (struct worker *) arg;
+ unsigned long ops = w->ops;

pthread_mutex_lock(&thread_lock);
threads_starting--;
@@ -103,9 +104,10 @@ static void *workerfn(void *arg)
if (ret && !silent)
warn("thread %d: Could not unlock pi-lock for %p (%d)",
w->tid, w->futex, ret);
- w->ops++; /* account for thread's share of work */
+ ops++; /* account for thread's share of work */
} while (!done);

+ w->ops = ops;
return NULL;
}

--
2.6.6

Subject: Re: [PATCH] perf/bench-futex: Avoid worker cacheline bouncing

On 2016-10-19 10:59:33 [-0700], Davidlohr Bueso wrote:
> Sebastian noted that overhead for worker thread ops (throughput)
> accounting was producing 'perf' to appear in the profiles, consuming
> a non-trivial (ie 13%) amount of CPU. This is due to cacheline
> bouncing due to the increment of w->ops. We can easily fix this by
> just working on a local copy and updating the actual worker once
> done running, and ready to show the program summary. There is no
> danger of the worker being concurrent, so we can trust that no stale
> value is being seen by another thread.
>
> Reported-by: Sebastian Andrzej Siewior <[email protected]>
Acked-by: Sebastian Andrzej Siewior <[email protected]>

> --- a/tools/perf/bench/futex-hash.c
> +++ b/tools/perf/bench/futex-hash.c
> @@ -63,8 +63,9 @@ static const char * const bench_futex_hash_usage[] = {
> static void *workerfn(void *arg)
> {
> int ret;
> - unsigned int i;
> struct worker *w = (struct worker *) arg;
> + unsigned int i;
> + unsigned long ops = w->ops; /* avoid cacheline bouncing */

we start at 0 so there is probably no need to init it with w->ops.

Sebastian

2016-10-19 18:16:29

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH 2/2 v2] perf bench futex: add NUMA support

On Mon, 17 Oct 2016, Sebastian Andrzej Siewior wrote:

>By default the application uses malloc() and all available CPUs. This
>patch introduces NUMA support which means:
>- memory is allocated node local via numa_alloc_local()
>- all CPUs of the specified NUMA node are used. This is also true if the
> number of threads set is greater than the number of CPUs available on
> this node.

Can't we just use numactl to bind cpus and memory to be node-local?

Thanks,
Davidlohr

Subject: Re: [PATCH 2/2 v2] perf bench futex: add NUMA support

On 2016-10-19 11:16:16 [-0700], Davidlohr Bueso wrote:
> On Mon, 17 Oct 2016, Sebastian Andrzej Siewior wrote:
>
> > By default the application uses malloc() and all available CPUs. This
> > patch introduces NUMA support which means:
> > - memory is allocated node local via numa_alloc_local()
> > - all CPUs of the specified NUMA node are used. This is also true if the
> > number of threads set is greater than the number of CPUs available on
> > this node.
>
> Can't we just use numactl to bind cpus and memory to be node-local?

something like
numactl --cpunodebind=$NODE --membind=$NODE perf …
?
This should work for memory however since we use
pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
we would need to query the affinity mask, and deploy threads based on
that mask.
Using NUMA support within this bench-tool has also the side effect that
the output gives all the option used.

> Thanks,
> Davidlohr

Sebastian

2016-10-19 18:41:44

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH] perf/bench-futex: Avoid worker cacheline bouncing

On Wed, 19 Oct 2016, Sebastian Andrzej Siewior wrote:

>On 2016-10-19 10:59:33 [-0700], Davidlohr Bueso wrote:
>> Sebastian noted that overhead for worker thread ops (throughput)
>> accounting was producing 'perf' to appear in the profiles, consuming
>> a non-trivial (ie 13%) amount of CPU. This is due to cacheline
>> bouncing due to the increment of w->ops. We can easily fix this by
>> just working on a local copy and updating the actual worker once
>> done running, and ready to show the program summary. There is no
>> danger of the worker being concurrent, so we can trust that no stale
>> value is being seen by another thread.
>>
>> Reported-by: Sebastian Andrzej Siewior <[email protected]>
>Acked-by: Sebastian Andrzej Siewior <[email protected]>

Thanks.

>
>> --- a/tools/perf/bench/futex-hash.c
>> +++ b/tools/perf/bench/futex-hash.c
>> @@ -63,8 +63,9 @@ static const char * const bench_futex_hash_usage[] = {
>> static void *workerfn(void *arg)
>> {
>> int ret;
>> - unsigned int i;
>> struct worker *w = (struct worker *) arg;
>> + unsigned int i;
>> + unsigned long ops = w->ops; /* avoid cacheline bouncing */
>
>we start at 0 so there is probably no need to init it with w->ops.

Yeah, but I prefer having it this way - separates the init from the actual
work (although no big deal here). The extra load happens ncpu times, so
also no big deal.

2016-10-21 02:34:25

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH 2/2 v2] perf bench futex: add NUMA support

On Wed, 19 Oct 2016, Sebastian Andrzej Siewior wrote:

>On 2016-10-19 11:16:16 [-0700], Davidlohr Bueso wrote:
>> On Mon, 17 Oct 2016, Sebastian Andrzej Siewior wrote:
>>
>> > By default the application uses malloc() and all available CPUs. This
>> > patch introduces NUMA support which means:
>> > - memory is allocated node local via numa_alloc_local()
>> > - all CPUs of the specified NUMA node are used. This is also true if the
>> > number of threads set is greater than the number of CPUs available on
>> > this node.
>>
>> Can't we just use numactl to bind cpus and memory to be node-local?
>
>something like
> numactl --cpunodebind=$NODE --membind=$NODE perf ???
>?

Yes.

>This should work for memory however since we use
> pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
>we would need to query the affinity mask, and deploy threads based on
>that mask.

Ah right. I also considered getting rid of the affinity, but that would
probably hurt more than help (or at least alter) for non-numa options.

>Using NUMA support within this bench-tool also has the side effect that
>the output lists all the options used.

So if we are going to support the numa option for the benchmark, could you please
move the new code into futex.h instead of futex-hash.c? That way we can integrate
the support for the other futex programs as well.

Thanks,
Davidlohr

2016-10-21 03:03:17

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH 2/2 v2] perf bench futex: add NUMA support

On Mon, 17 Oct 2016, Sebastian Andrzej Siewior wrote:

>+#ifdef HAVE_LIBNUMA_SUPPORT
>+#include <numa.h>
>+#endif

In futex.h

>+static int numa_node = -1;

In futex.h (perhaps rename to futexbench_numa_node?)

>+#ifndef HAVE_LIBNUMA_SUPPORT
>+static int numa_run_on_node(int node __maybe_unused) { return 0; }
>+static int numa_node_of_cpu(int node __maybe_unused) { return 0; }
>+static void *numa_alloc_local(size_t size) { return malloc(size); }
>+static void numa_free(void *p, size_t size __maybe_unused) { return free(p); }
>+#endif
>+
>+static bool cpu_is_local(int cpu)
>+{
>+ if (numa_node < 0)
>+ return true;
>+ if (numa_node_of_cpu(cpu) == numa_node)
>+ return true;
>+ return false;
>+}

In futex.h

>- if (!nthreads) /* default to the number of CPUs */
>- nthreads = ncpus;
>+ if (!nthreads) {
>+ /* default to the number of CPUs per NUMA node */

This comment should go...

>+ if (numa_node < 0) {
>+ nthreads = ncpus;
>+ } else {

here.

>+ for (i = 0; i < ncpus; i++) {
>+ if (cpu_is_local(i))
>+ nthreads++;
>+ }
>+ if (!nthreads)
>+ err(EXIT_FAILURE, "No online CPUs for this node");
>+ }
>+ } else {
>+ int cpu_available = 0;
>
>- worker = calloc(nthreads, sizeof(*worker));
>+ for (i = 0; i < ncpus && !cpu_available; i++) {
>+ if (cpu_is_local(i))
>+ cpu_available = 1;
>+ }

Is this really necessary? If the user passes the number of threads, then we shouldn't
care about ncpus; we just run all the threads on the specified node. Wouldn't the
below numa_run_on_node() ensure that the node is not, for example, CPU-less?

>+ if (!cpu_available)
>+ err(EXIT_FAILURE, "No online CPUs for this node");
>+ }
>+
>+ if (numa_node >= 0) {
>+ ret = numa_run_on_node(numa_node);
>+ if (ret < 0)
>+ err(EXIT_FAILURE, "numa_run_on_node");


Thanks,
Davidlohr

Subject: [tip:perf/core] perf bench futex: Cache align the worker struct

Commit-ID: 34b753007d646482a4125a7095e1d1986d395f95
Gitweb: http://git.kernel.org/tip/34b753007d646482a4125a7095e1d1986d395f95
Author: Sebastian Andrzej Siewior <[email protected]>
AuthorDate: Sun, 16 Oct 2016 21:08:02 +0200
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 24 Oct 2016 11:07:45 -0300

perf bench futex: Cache align the worker struct

It popped up in perf testing that the worker consumes some amount of
CPU. It boils down to the increment of `ops` which causes cache line
bouncing between the individual threads.

This patch aligns the struct to 256 bytes to ensure that no cache
line is shared among CPUs. 128 bytes is the x86 worst case, and grep
says that L1_CACHE_SHIFT is set to 8 on s390.

Signed-off-by: Sebastian Andrzej Siewior <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/bench/futex-hash.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 8024cd5..d9e5e80 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -39,12 +39,15 @@ static unsigned int threads_starting;
static struct stats throughput_stats;
static pthread_cond_t thread_parent, thread_worker;

+#define SMP_CACHE_BYTES 256
+#define __cacheline_aligned __attribute__ ((aligned (SMP_CACHE_BYTES)))
+
struct worker {
int tid;
u_int32_t *futex;
pthread_t thread;
unsigned long ops;
-};
+} __cacheline_aligned;

static const struct option options[] = {
OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),