The event modifier "percore" is already supported; it sums up the
event counts for all hardware threads in a core and shows the counts
per core.
For example,
# perf stat -e cpu/event=cpu-cycles,percore/ -a -A -- sleep 1
Performance counter stats for 'system wide':
S0-D0-C0 395,072 cpu/event=cpu-cycles,percore/
S0-D0-C1 851,248 cpu/event=cpu-cycles,percore/
S0-D0-C2 954,226 cpu/event=cpu-cycles,percore/
S0-D0-C3 1,233,659 cpu/event=cpu-cycles,percore/
This patch provides a new option "--percore-show-thread". It is
used together with the event modifier "percore" to sum up the event counts
for all hardware threads in a core but show the counts per hardware
thread.
For example,
# perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -- sleep 1
Performance counter stats for 'system wide':
CPU0 2,453,061 cpu/event=cpu-cycles,percore/
CPU1 1,823,921 cpu/event=cpu-cycles,percore/
CPU2 1,383,166 cpu/event=cpu-cycles,percore/
CPU3 1,102,652 cpu/event=cpu-cycles,percore/
CPU4 2,453,061 cpu/event=cpu-cycles,percore/
CPU5 1,823,921 cpu/event=cpu-cycles,percore/
CPU6 1,383,166 cpu/event=cpu-cycles,percore/
CPU7 1,102,652 cpu/event=cpu-cycles,percore/
We can see counts are duplicated in some CPU pairs
(CPU0/CPU4, CPU1/CPU5, CPU2/CPU6, CPU3/CPU7).
This new option may be useful for some script processing.
Signed-off-by: Jin Yao <[email protected]>
---
tools/perf/Documentation/perf-stat.txt | 7 ++++
tools/perf/builtin-stat.c | 4 ++
tools/perf/util/stat-display.c | 57 ++++++++++++++++++++++----
tools/perf/util/stat.h | 1 +
4 files changed, 60 insertions(+), 9 deletions(-)
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 9431b8066fb4..f6033b3d0971 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -334,6 +334,13 @@ Configure all used events to run in kernel space.
--all-user::
Configure all used events to run in user space.
+--percore-show-thread::
+The existing event modifier "percore" sums up the event counts
+for all hardware threads in a core and shows the counts per core.
+
+This option, when used with the event modifier "percore", also sums up the
+event counts for all hardware threads in a core but shows the counts per thread.
+
EXAMPLES
--------
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a098c2ebf4ea..ec053dc1e35c 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -929,6 +929,10 @@ static struct option stat_options[] = {
OPT_BOOLEAN_FLAG(0, "all-user", &stat_config.all_user,
"Configure all used events to run in user space.",
PARSE_OPT_EXCLUSIVE),
+ OPT_BOOLEAN(0, "percore-show-thread", &stat_config.percore_show_thread,
+ "Use with 'percore' event qualifier to show the event "
+ "counts of one hardware thread by sum up total hardware "
+ "threads of same physical core"),
OPT_END()
};
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index bc31fccc0057..ca603e59dfe1 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -110,7 +110,7 @@ static void aggr_printout(struct perf_stat_config *config,
config->csv_sep);
break;
case AGGR_NONE:
- if (evsel->percore) {
+ if (evsel->percore && !config->percore_show_thread) {
fprintf(config->output, "S%d-D%d-C%*d%s",
cpu_map__id_to_socket(id),
cpu_map__id_to_die(id),
@@ -628,7 +628,7 @@ static void aggr_cb(struct perf_stat_config *config,
static void print_counter_aggrdata(struct perf_stat_config *config,
struct evsel *counter, int s,
char *prefix, bool metric_only,
- bool *first)
+ bool *first, int cpu)
{
struct aggr_data ad;
FILE *output = config->output;
@@ -654,8 +654,15 @@ static void print_counter_aggrdata(struct perf_stat_config *config,
fprintf(output, "%s", prefix);
uval = val * counter->scale;
- printout(config, id, nr, counter, uval, prefix,
- run, ena, 1.0, &rt_stat);
+
+ if (cpu == -1) {
+ printout(config, id, nr, counter, uval, prefix,
+ run, ena, 1.0, &rt_stat);
+ } else {
+ printout(config, cpu, nr, counter, uval, prefix,
+ run, ena, 1.0, &rt_stat);
+ }
+
if (!metric_only)
fputc('\n', output);
}
@@ -687,7 +694,7 @@ static void print_aggr(struct perf_stat_config *config,
evlist__for_each_entry(evlist, counter) {
print_counter_aggrdata(config, counter, s,
prefix, metric_only,
- &first);
+ &first, -1);
}
if (metric_only)
fputc('\n', output);
@@ -1163,13 +1170,38 @@ static void print_percore(struct perf_stat_config *config,
print_counter_aggrdata(config, counter, s,
prefix, metric_only,
- &first);
+ &first, -1);
}
if (metric_only)
fputc('\n', output);
}
+static void print_percore_thread(struct perf_stat_config *config,
+ struct evsel *counter, char *prefix)
+{
+ int cpu, s, s2, id;
+ bool first = true;
+ FILE *output = config->output;
+
+ for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+ s2 = config->aggr_get_id(config, evsel__cpus(counter), cpu);
+
+ for (s = 0; s < config->aggr_map->nr; s++) {
+ id = config->aggr_map->map[s];
+ if (s2 == id)
+ break;
+ }
+
+ if (prefix)
+ fprintf(output, "%s", prefix);
+
+ print_counter_aggrdata(config, counter, s,
+ prefix, false,
+ &first, cpu);
+ }
+}
+
void
perf_evlist__print_counters(struct evlist *evlist,
struct perf_stat_config *config,
@@ -1222,9 +1254,16 @@ perf_evlist__print_counters(struct evlist *evlist,
print_no_aggr_metric(config, evlist, prefix);
else {
evlist__for_each_entry(evlist, counter) {
- if (counter->percore)
- print_percore(config, counter, prefix);
- else
+ if (counter->percore) {
+ if (config->percore_show_thread) {
+ print_percore_thread(config,
+ counter,
+ prefix);
+ } else {
+ print_percore(config, counter,
+ prefix);
+ }
+ } else
print_counter(config, counter, prefix);
}
}
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index fb990efa54a8..b4fdfaa7f2c0 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -109,6 +109,7 @@ struct perf_stat_config {
bool walltime_run_table;
bool all_kernel;
bool all_user;
+ bool percore_show_thread;
FILE *output;
unsigned int interval;
unsigned int timeout;
--
2.17.1
On Thu, Feb 06, 2020 at 09:56:13AM +0800, Jin Yao wrote:
> We have supported the event modifier "percore" which sums up the
> event counts for all hardware threads in a core and show the counts
> per core.
>
> For example,
>
> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A -- sleep 1
>
> Performance counter stats for 'system wide':
>
> S0-D0-C0 395,072 cpu/event=cpu-cycles,percore/
> S0-D0-C1 851,248 cpu/event=cpu-cycles,percore/
> S0-D0-C2 954,226 cpu/event=cpu-cycles,percore/
> S0-D0-C3 1,233,659 cpu/event=cpu-cycles,percore/
>
> This patch provides a new option "--percore-show-thread". It is
> used with event modifier "percore" together to sum up the event counts
> for all hardware threads in a core but show the counts per hardware
> thread.
>
> For example,
>
> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -- sleep 1
>
> Performance counter stats for 'system wide':
>
> CPU0 2,453,061 cpu/event=cpu-cycles,percore/
> CPU1 1,823,921 cpu/event=cpu-cycles,percore/
> CPU2 1,383,166 cpu/event=cpu-cycles,percore/
> CPU3 1,102,652 cpu/event=cpu-cycles,percore/
> CPU4 2,453,061 cpu/event=cpu-cycles,percore/
> CPU5 1,823,921 cpu/event=cpu-cycles,percore/
> CPU6 1,383,166 cpu/event=cpu-cycles,percore/
> CPU7 1,102,652 cpu/event=cpu-cycles,percore/
I don't understand how this is different from the -A output:
# ./perf stat -e cpu/event=cpu-cycles/ -A
^C
Performance counter stats for 'system wide':
CPU0 56,847,497 cpu/event=cpu-cycles/
CPU1 75,274,384 cpu/event=cpu-cycles/
CPU2 63,866,342 cpu/event=cpu-cycles/
CPU3 89,559,693 cpu/event=cpu-cycles/
CPU4 74,761,132 cpu/event=cpu-cycles/
CPU5 76,320,191 cpu/event=cpu-cycles/
CPU6 55,100,175 cpu/event=cpu-cycles/
CPU7 48,472,895 cpu/event=cpu-cycles/
1.074800857 seconds time elapsed
also the interval output is mangled:
# ./perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -I 1000
# time CPU counts unit events
1.000177375 1.000177375 CPU0 138,483,540 cpu/event=cpu-cycles,percore/
1.000177375 1.000177375 CPU1 143,159,477 cpu/event=cpu-cycles,percore/
1.000177375 1.000177375 CPU2 177,554,642 cpu/event=cpu-cycles,percore/
1.000177375 1.000177375 CPU3 150,974,512 cpu/event=cpu-cycles,percore/
1.000177375 1.000177375 CPU4 138,483,540 cpu/event=cpu-cycles,percore/
1.000177375 1.000177375 CPU5 143,159,477 cpu/event=cpu-cycles,percore/
1.000177375 1.000177375 CPU6 177,554,642 cpu/event=cpu-cycles,percore/
jirka
On 2/10/2020 9:28 PM, Jiri Olsa wrote:
> On Thu, Feb 06, 2020 at 09:56:13AM +0800, Jin Yao wrote:
>> We have supported the event modifier "percore" which sums up the
>> event counts for all hardware threads in a core and show the counts
>> per core.
>>
>> For example,
>>
>> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A -- sleep 1
>>
>> Performance counter stats for 'system wide':
>>
>> S0-D0-C0 395,072 cpu/event=cpu-cycles,percore/
>> S0-D0-C1 851,248 cpu/event=cpu-cycles,percore/
>> S0-D0-C2 954,226 cpu/event=cpu-cycles,percore/
>> S0-D0-C3 1,233,659 cpu/event=cpu-cycles,percore/
>>
>> This patch provides a new option "--percore-show-thread". It is
>> used with event modifier "percore" together to sum up the event counts
>> for all hardware threads in a core but show the counts per hardware
>> thread.
>>
>> For example,
>>
>> # perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -- sleep 1
>>
>> Performance counter stats for 'system wide':
>>
>> CPU0 2,453,061 cpu/event=cpu-cycles,percore/
>> CPU1 1,823,921 cpu/event=cpu-cycles,percore/
>> CPU2 1,383,166 cpu/event=cpu-cycles,percore/
>> CPU3 1,102,652 cpu/event=cpu-cycles,percore/
>> CPU4 2,453,061 cpu/event=cpu-cycles,percore/
>> CPU5 1,823,921 cpu/event=cpu-cycles,percore/
>> CPU6 1,383,166 cpu/event=cpu-cycles,percore/
>> CPU7 1,102,652 cpu/event=cpu-cycles,percore/
>
> I don't understand how is this different from -A output:
>
> # ./perf stat -e cpu/event=cpu-cycles/ -A
> ^C
> Performance counter stats for 'system wide':
>
> CPU0 56,847,497 cpu/event=cpu-cycles/
> CPU1 75,274,384 cpu/event=cpu-cycles/
> CPU2 63,866,342 cpu/event=cpu-cycles/
> CPU3 89,559,693 cpu/event=cpu-cycles/
> CPU4 74,761,132 cpu/event=cpu-cycles/
> CPU5 76,320,191 cpu/event=cpu-cycles/
> CPU6 55,100,175 cpu/event=cpu-cycles/
> CPU7 48,472,895 cpu/event=cpu-cycles/
>
> 1.074800857 seconds time elapsed
>
The results are different.
With --percore-show-thread, CPU0 and CPU4 have the same counts (CPU0 and
CPU4 are siblings, e.g. 2,453,061 in my example). The value is the sum of
CPU0 + CPU4.
Without --percore-show-thread, CPU0 and CPU4 have their own counts.
> also the interval output is mangled:
>
> # ./perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -I 1000
> # time CPU counts unit events
> 1.000177375 1.000177375 CPU0 138,483,540 cpu/event=cpu-cycles,percore/
> 1.000177375 1.000177375 CPU1 143,159,477 cpu/event=cpu-cycles,percore/
> 1.000177375 1.000177375 CPU2 177,554,642 cpu/event=cpu-cycles,percore/
> 1.000177375 1.000177375 CPU3 150,974,512 cpu/event=cpu-cycles,percore/
> 1.000177375 1.000177375 CPU4 138,483,540 cpu/event=cpu-cycles,percore/
> 1.000177375 1.000177375 CPU5 143,159,477 cpu/event=cpu-cycles,percore/
> 1.000177375 1.000177375 CPU6 177,554,642 cpu/event=cpu-cycles,percore/
>
> jirka
>
Sorry, why is the interval output mangled? It's expected that CPU0 and
CPU4 have the same counts.
Thanks
Jin Yao
On Mon, Feb 10, 2020 at 09:46:46PM +0800, Jin, Yao wrote:
>
>
> On 2/10/2020 9:28 PM, Jiri Olsa wrote:
> > On Thu, Feb 06, 2020 at 09:56:13AM +0800, Jin Yao wrote:
> > > We have supported the event modifier "percore" which sums up the
> > > event counts for all hardware threads in a core and show the counts
> > > per core.
> > >
> > > For example,
> > >
> > > # perf stat -e cpu/event=cpu-cycles,percore/ -a -A -- sleep 1
> > >
> > > Performance counter stats for 'system wide':
> > >
> > > S0-D0-C0 395,072 cpu/event=cpu-cycles,percore/
> > > S0-D0-C1 851,248 cpu/event=cpu-cycles,percore/
> > > S0-D0-C2 954,226 cpu/event=cpu-cycles,percore/
> > > S0-D0-C3 1,233,659 cpu/event=cpu-cycles,percore/
> > >
> > > This patch provides a new option "--percore-show-thread". It is
> > > used with event modifier "percore" together to sum up the event counts
> > > for all hardware threads in a core but show the counts per hardware
> > > thread.
> > >
> > > For example,
> > >
> > > # perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -- sleep 1
> > >
> > > Performance counter stats for 'system wide':
> > >
> > > CPU0 2,453,061 cpu/event=cpu-cycles,percore/
> > > CPU1 1,823,921 cpu/event=cpu-cycles,percore/
> > > CPU2 1,383,166 cpu/event=cpu-cycles,percore/
> > > CPU3 1,102,652 cpu/event=cpu-cycles,percore/
> > > CPU4 2,453,061 cpu/event=cpu-cycles,percore/
> > > CPU5 1,823,921 cpu/event=cpu-cycles,percore/
> > > CPU6 1,383,166 cpu/event=cpu-cycles,percore/
> > > CPU7 1,102,652 cpu/event=cpu-cycles,percore/
> >
> > I don't understand how is this different from -A output:
> >
> > # ./perf stat -e cpu/event=cpu-cycles/ -A
> > ^C
> > Performance counter stats for 'system wide':
> >
> > CPU0 56,847,497 cpu/event=cpu-cycles/
> > CPU1 75,274,384 cpu/event=cpu-cycles/
> > CPU2 63,866,342 cpu/event=cpu-cycles/
> > CPU3 89,559,693 cpu/event=cpu-cycles/
> > CPU4 74,761,132 cpu/event=cpu-cycles/
> > CPU5 76,320,191 cpu/event=cpu-cycles/
> > CPU6 55,100,175 cpu/event=cpu-cycles/
> > CPU7 48,472,895 cpu/event=cpu-cycles/
> >
> > 1.074800857 seconds time elapsed
> >
>
> The results are different.
>
> With --percore-show-thread, CPU0 and CPU4 have the same counts (CPU0 and
> CPU4 are siblings, e.g. 2,453,061 in my example). The value is sum of CPU0 +
> CPU4.
so it shows percore stats but displays all the cpus? what is this good for?
to see which cpus are in core? if that's the case then I think we could
somehow display the cpu numbers for core in --per-core output, like:
S0-D0-C0(0,4) 395,072 cpu/event=cpu-cycles,percore/
S0-D0-C1(1,5) 851,248 cpu/event=cpu-cycles,percore/
S0-D0-C2(2,6) 954,226 cpu/event=cpu-cycles,percore/
S0-D0-C3(3,7) 1,233,659 cpu/event=cpu-cycles,percore/
>
> Without --percore-show-thread, CPU0 and CPU4 have their own counts.
>
> > also the interval output is mangled:
> >
> > # ./perf stat -e cpu/event=cpu-cycles,percore/ -a -A --percore-show-thread -I 1000
> > # time CPU counts unit events
> > 1.000177375 1.000177375 CPU0 138,483,540 cpu/event=cpu-cycles,percore/
> > 1.000177375 1.000177375 CPU1 143,159,477 cpu/event=cpu-cycles,percore/
> > 1.000177375 1.000177375 CPU2 177,554,642 cpu/event=cpu-cycles,percore/
> > 1.000177375 1.000177375 CPU3 150,974,512 cpu/event=cpu-cycles,percore/
> > 1.000177375 1.000177375 CPU4 138,483,540 cpu/event=cpu-cycles,percore/
> > 1.000177375 1.000177375 CPU5 143,159,477 cpu/event=cpu-cycles,percore/
> > 1.000177375 1.000177375 CPU6 177,554,642 cpu/event=cpu-cycles,percore/
> >
> > jirka
> >
>
> Sorry, why the interval output is mangled? It's expected that CPU0 and CPU4
> have the same counts.
there are 2 timestamp columns and the header line does
not align with the data
jirka
> > With --percore-show-thread, CPU0 and CPU4 have the same counts (CPU0 and
> > CPU4 are siblings, e.g. 2,453,061 in my example). The value is sum of CPU0 +
> > CPU4.
>
> so it shows percore stats but displays all the cpus? what is this good for?
This is essentially a replacement for the any bit (which is gone in Icelake).
Per core counts are useful for some formulas, e.g. CoreIPC
The original percore version was inconvenient to post process. This
variant matches the output of the any bit.
-Andi
On Mon, Feb 10, 2020 at 09:01:59AM -0800, Andi Kleen wrote:
> > > With --percore-show-thread, CPU0 and CPU4 have the same counts (CPU0 and
> > > CPU4 are siblings, e.g. 2,453,061 in my example). The value is sum of CPU0 +
> > > CPU4.
> >
> > so it shows percore stats but displays all the cpus? what is this good for?
>
> This is essentially a replacement for the any bit (which is gone in Icelake).
> Per core counts are useful for some formulas, e.g. CoreIPC
>
> The original percore version was inconvenient to post process. This
> variant matches the output of the any bit.
I see, please put this in the changelog/doc
thanks,
jirka
On 2/11/2020 5:04 AM, Jiri Olsa wrote:
> On Mon, Feb 10, 2020 at 09:01:59AM -0800, Andi Kleen wrote:
>>>> With --percore-show-thread, CPU0 and CPU4 have the same counts (CPU0 and
>>>> CPU4 are siblings, e.g. 2,453,061 in my example). The value is sum of CPU0 +
>>>> CPU4.
>>>
>>> so it shows percore stats but displays all the cpus? what is this good for?
>>
>> This is essentially a replacement for the any bit (which is gone in Icelake).
>> Per core counts are useful for some formulas, e.g. CoreIPC
>>
>> The original percore version was inconvenient to post process. This
>> variant matches the output of the any bit.
>
> I see, please put this to the changelog/doc
>
> thanks,
> jirka
>
Thanks Jiri, thanks Andi!
I will put the explanation in v2.
Thanks
Jin Yao