2015-04-16 15:59:09

by Petr Holasek

[permalink] [raw]
Subject: [PATCH 0/3] a few perf numa benchmark fixes

Hi,

This small series fixes two bugs in perf bench numa and adds some
per-thread stats.

Petr Holasek (3):
perf bench numa: Fixes of --quiet argument
perf bench numa: show more stats of particular threads in verbose mode
perf bench numa: fix immediate meeting of convergence condition

tools/perf/bench/numa.c | 44 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 41 insertions(+), 3 deletions(-)

--
2.1.0


2015-04-16 15:38:38

by Petr Holasek

[permalink] [raw]
Subject: [PATCH 1/3] perf bench numa: Fixes of --quiet argument

Correct the description of the --quiet argument and fix its inverted behavior.

Signed-off-by: Petr Holasek <[email protected]>
---
tools/perf/bench/numa.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index ebfa163..cd872e9c 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -180,7 +180,7 @@ static const struct option options[] = {
OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
- OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "bzero the initial allocations"),
+ OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"),
OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),

/* Special option string parsing callbacks: */
@@ -1395,7 +1395,7 @@ static void print_res(const char *name, double val,
if (!name)
name = "main,";

- if (g->p.show_quiet)
+ if (!g->p.show_quiet)
printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
else
printf(" %14.3f %s\n", val, txt_long);
--
2.1.0

2015-04-16 15:38:56

by Petr Holasek

[permalink] [raw]
Subject: [PATCH 2/3] perf bench numa: show more stats of particular threads in verbose mode

In verbose mode, perf bench numa now also shows the GB/s speed and the system
and user CPU time of each thread. Using getrusage() makes it possible to
provide many more per-process or per-thread stats in the future.

Signed-off-by: Petr Holasek <[email protected]>
---
tools/perf/bench/numa.c | 32 +++++++++++++++++++++++++++++++-
1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index cd872e9c..72edc49 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -23,6 +23,7 @@
#include <pthread.h>
#include <sys/mman.h>
#include <sys/time.h>
+#include <sys/resource.h>
#include <sys/wait.h>
#include <sys/prctl.h>
#include <sys/types.h>
@@ -51,6 +52,9 @@ struct thread_data {
unsigned int loops_done;
u64 val;
u64 runtime_ns;
+ u64 system_time_ns;
+ u64 user_time_ns;
+ double speed_gbs;
pthread_mutex_t *process_lock;
};

@@ -1034,6 +1038,7 @@ static void *worker_thread(void *__tdata)
u64 bytes_done;
long work_done;
u32 l;
+ struct rusage usage;

bind_to_cpumask(td->bind_cpumask);
bind_to_memnode(td->bind_node);
@@ -1186,6 +1191,13 @@ static void *worker_thread(void *__tdata)
timersub(&stop, &start0, &diff);
td->runtime_ns = diff.tv_sec * 1000000000ULL;
td->runtime_ns += diff.tv_usec * 1000ULL;
+ td->speed_gbs = bytes_done / (td->runtime_ns / 1e9) / 1e9;
+
+ getrusage(RUSAGE_THREAD, &usage);
+ td->system_time_ns = usage.ru_stime.tv_sec * 1000000000ULL;
+ td->system_time_ns += usage.ru_stime.tv_usec * 1000ULL;
+ td->user_time_ns = usage.ru_utime.tv_sec * 1000000000ULL;
+ td->user_time_ns += usage.ru_utime.tv_usec * 1000ULL;

free_data(thread_data, g->p.bytes_thread);

@@ -1412,7 +1424,7 @@ static int __bench_numa(const char *name)
double runtime_sec_min;
int wait_stat;
double bytes;
- int i, t;
+ int i, t, p;

if (init())
return -1;
@@ -1548,6 +1560,24 @@ static int __bench_numa(const char *name)
print_res(name, bytes / runtime_sec_max / 1e9,
"GB/sec,", "total-speed", "GB/sec total speed");

+ if (g->p.show_details >= 2) {
+ char tname[32];
+ struct thread_data *td;
+ for (p = 0; p < g->p.nr_proc; p++) {
+ for (t = 0; t < g->p.nr_threads; t++) {
+ memset(tname, 0, 32);
+ td = g->threads + p*g->p.nr_threads + t;
+ snprintf(tname, 32, "process%d:thread%d", p, t);
+ print_res(tname, td->speed_gbs,
+ "GB/sec", "thread-speed", "GB/sec/thread speed");
+ print_res(tname, td->system_time_ns / 1e9,
+ "secs", "thread-system-time", "system CPU time/thread");
+ print_res(tname, td->user_time_ns / 1e9,
+ "secs", "thread-user-time", "user CPU time/thread");
+ }
+ }
+ }
+
free(pids);

deinit();
--
2.1.0

2015-04-16 15:38:51

by Petr Holasek

[permalink] [raw]
Subject: [PATCH 3/3] perf bench numa: fix immediate meeting of convergence condition

This patch fixes a race at the beginning of a benchmark run: when some
threads have not been assigned a curr_cpu yet, they do not appear in the
nodes-of-process stats, and the benchmark concludes that all remaining
threads have already converged.

The race can be reproduced with a small number of threads and a larger amount
of shared process memory, e.g. one process, two threads and 5GB of process
memory.

Signed-off-by: Petr Holasek <[email protected]>
---
tools/perf/bench/numa.c | 8 ++++++++
1 file changed, 8 insertions(+)

diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 72edc49..1704929 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -832,6 +832,9 @@ static int count_process_nodes(int process_nr)
td = g->threads + task_nr;

node = numa_node_of_cpu(td->curr_cpu);
+ if (node < 0) /* curr_cpu was likely still -1 */
+ return 0;
+
node_present[node] = 1;
}

@@ -886,6 +889,11 @@ static void calc_convergence_compression(int *strong)
for (p = 0; p < g->p.nr_proc; p++) {
unsigned int nodes = count_process_nodes(p);

+ if (!nodes) {
+ *strong = 0;
+ return;
+ }
+
nodes_min = min(nodes, nodes_min);
nodes_max = max(nodes, nodes_max);
}
--
2.1.0

2015-04-16 16:13:19

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 0/3] a few perf numa benchmark fixes


* Petr Holasek <[email protected]> wrote:

> Hi,
>
> this small series is fixing two bugs in perf bench numa and adds some per
> thread stats.
>
> Petr Holasek (3):
> perf bench numa: Fixes of --quiet argument
> perf bench numa: show more stats of particular threads in verbose mode
> perf bench numa: fix immediate meeting of convergence condition
>
> tools/perf/bench/numa.c | 44 +++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 41 insertions(+), 3 deletions(-)

Nice!

Reviewed-by: Ingo Molnar <[email protected]>

Thanks,

Ingo

2015-04-27 16:44:52

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 0/3] a few perf numa benchmark fixes

Em Thu, Apr 16, 2015 at 06:13:04PM +0200, Ingo Molnar escreveu:
>
> * Petr Holasek <[email protected]> wrote:
>
> > Hi,
> >
> > this small series is fixing two bugs in perf bench numa and adds some per
> > thread stats.
> >
> > Petr Holasek (3):
> > perf bench numa: Fixes of --quiet argument
> > perf bench numa: show more stats of particular threads in verbose mode
> > perf bench numa: fix immediate meeting of convergence condition

Applied the first and the last ones to perf/urgent (fixes), will apply the
second to perf/core (improvement).

- Arnaldo

> >
> > tools/perf/bench/numa.c | 44 +++++++++++++++++++++++++++++++++++++++++---
> > 1 file changed, 41 insertions(+), 3 deletions(-)
>
> Nice!
>
> Reviewed-by: Ingo Molnar <[email protected]>
>
> Thanks,
>
> Ingo

Subject: [tip:perf/urgent] perf bench numa: Fixes of --quiet argument

Commit-ID: 24f1ced167e5e011040b4c3aae75aee45a79eed5
Gitweb: http://git.kernel.org/tip/24f1ced167e5e011040b4c3aae75aee45a79eed5
Author: Petr Holasek <[email protected]>
AuthorDate: Thu, 16 Apr 2015 17:38:17 +0200
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 27 Apr 2015 13:57:49 -0300

perf bench numa: Fixes of --quiet argument

Correct the description of the --quiet argument and fix its inverted behavior.

Signed-off-by: Petr Holasek <[email protected]>
Reviewed-by: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/bench/numa.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index ebfa163..cd872e9c 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -180,7 +180,7 @@ static const struct option options[] = {
OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
- OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "bzero the initial allocations"),
+ OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"),
OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),

/* Special option string parsing callbacks: */
@@ -1395,7 +1395,7 @@ static void print_res(const char *name, double val,
if (!name)
name = "main,";

- if (g->p.show_quiet)
+ if (!g->p.show_quiet)
printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
else
printf(" %14.3f %s\n", val, txt_long);

Subject: [tip:perf/urgent] perf bench numa: Fix immediate meeting of convergence condition

Commit-ID: 1d90a685eb75a56648d7dd22c704a1a6da516de9
Gitweb: http://git.kernel.org/tip/1d90a685eb75a56648d7dd22c704a1a6da516de9
Author: Petr Holasek <[email protected]>
AuthorDate: Thu, 16 Apr 2015 17:38:19 +0200
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 27 Apr 2015 13:57:50 -0300

perf bench numa: Fix immediate meeting of convergence condition

This patch fixes a race at the beginning of a benchmark run: when some
threads have not been assigned a curr_cpu yet, they do not appear in the
nodes-of-process stats, and the benchmark concludes that all remaining
threads have already converged.

The race can be reproduced with a small number of threads and a larger
amount of shared process memory, e.g. one process, two threads and 5GB
of process memory.

Signed-off-by: Petr Holasek <[email protected]>
Reviewed-by: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/bench/numa.c | 8 ++++++++
1 file changed, 8 insertions(+)

diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index cd872e9c..ba5efa4 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -828,6 +828,9 @@ static int count_process_nodes(int process_nr)
td = g->threads + task_nr;

node = numa_node_of_cpu(td->curr_cpu);
+ if (node < 0) /* curr_cpu was likely still -1 */
+ return 0;
+
node_present[node] = 1;
}

@@ -882,6 +885,11 @@ static void calc_convergence_compression(int *strong)
for (p = 0; p < g->p.nr_proc; p++) {
unsigned int nodes = count_process_nodes(p);

+ if (!nodes) {
+ *strong = 0;
+ return;
+ }
+
nodes_min = min(nodes, nodes_min);
nodes_max = max(nodes, nodes_max);
}

Subject: [tip:perf/core] perf bench numa: Show more stats of particular threads in verbose mode

Commit-ID: b64aa553d8430aabd24f303899cfa4de678e2c3a
Gitweb: http://git.kernel.org/tip/b64aa553d8430aabd24f303899cfa4de678e2c3a
Author: Petr Holasek <[email protected]>
AuthorDate: Thu, 16 Apr 2015 17:38:18 +0200
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 4 May 2015 12:43:41 -0300

perf bench numa: Show more stats of particular threads in verbose mode

In verbose mode, perf bench numa now also shows the GB/s speed and the system
and user CPU time of each thread. Using getrusage() makes it possible to
provide many more per-process or per-thread stats in the future.

Signed-off-by: Petr Holasek <[email protected]>
Reviewed-by: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
[ Rename 'usage' variable to not shadow util.h's usage() ]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/bench/numa.c | 32 +++++++++++++++++++++++++++++++-
1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index ebfa163..0b704c5 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -23,6 +23,7 @@
#include <pthread.h>
#include <sys/mman.h>
#include <sys/time.h>
+#include <sys/resource.h>
#include <sys/wait.h>
#include <sys/prctl.h>
#include <sys/types.h>
@@ -51,6 +52,9 @@ struct thread_data {
unsigned int loops_done;
u64 val;
u64 runtime_ns;
+ u64 system_time_ns;
+ u64 user_time_ns;
+ double speed_gbs;
pthread_mutex_t *process_lock;
};

@@ -1034,6 +1038,7 @@ static void *worker_thread(void *__tdata)
u64 bytes_done;
long work_done;
u32 l;
+ struct rusage rusage;

bind_to_cpumask(td->bind_cpumask);
bind_to_memnode(td->bind_node);
@@ -1186,6 +1191,13 @@ static void *worker_thread(void *__tdata)
timersub(&stop, &start0, &diff);
td->runtime_ns = diff.tv_sec * 1000000000ULL;
td->runtime_ns += diff.tv_usec * 1000ULL;
+ td->speed_gbs = bytes_done / (td->runtime_ns / 1e9) / 1e9;
+
+ getrusage(RUSAGE_THREAD, &rusage);
+ td->system_time_ns = rusage.ru_stime.tv_sec * 1000000000ULL;
+ td->system_time_ns += rusage.ru_stime.tv_usec * 1000ULL;
+ td->user_time_ns = rusage.ru_utime.tv_sec * 1000000000ULL;
+ td->user_time_ns += rusage.ru_utime.tv_usec * 1000ULL;

free_data(thread_data, g->p.bytes_thread);

@@ -1412,7 +1424,7 @@ static int __bench_numa(const char *name)
double runtime_sec_min;
int wait_stat;
double bytes;
- int i, t;
+ int i, t, p;

if (init())
return -1;
@@ -1548,6 +1560,24 @@ static int __bench_numa(const char *name)
print_res(name, bytes / runtime_sec_max / 1e9,
"GB/sec,", "total-speed", "GB/sec total speed");

+ if (g->p.show_details >= 2) {
+ char tname[32];
+ struct thread_data *td;
+ for (p = 0; p < g->p.nr_proc; p++) {
+ for (t = 0; t < g->p.nr_threads; t++) {
+ memset(tname, 0, 32);
+ td = g->threads + p*g->p.nr_threads + t;
+ snprintf(tname, 32, "process%d:thread%d", p, t);
+ print_res(tname, td->speed_gbs,
+ "GB/sec", "thread-speed", "GB/sec/thread speed");
+ print_res(tname, td->system_time_ns / 1e9,
+ "secs", "thread-system-time", "system CPU time/thread");
+ print_res(tname, td->user_time_ns / 1e9,
+ "secs", "thread-user-time", "user CPU time/thread");
+ }
+ }
+ }
+
free(pids);

deinit();