LinuxLists.cc - [PATCH] perf record: add a shortcut for metrics

2024-05-27 10:15:43

Subject: [PATCH] perf record: add a shortcut for metrics

Add -M/--metrics option to perf-record providing a shortcut to record
metrics and metricgroups. This option mirrors the one in perf-stat.

Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
Signed-off-by: Artem Savkov <[email protected]>
---
tools/perf/Documentation/perf-record.txt | 7 +++-
tools/perf/builtin-record.c | 43 ++++++++++++++++++++++++
2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 6015fdd08fb63..ebb560d137e62 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -18,7 +18,6 @@ from it, into perf.data - without displaying anything.

This file can then be inspected later on, using 'perf report'.

-
OPTIONS
-------
<command>...::
@@ -216,6 +215,12 @@ OPTIONS
na, by_data, by_addr (for mem_blk)
hops0, hops1, hops2, hops3 (for mem_hops)

+-M::
+--metrics::
+Record metrics or metricgroups specified in a comma separated list.
+For a group all metrics from the group are added.
+See perf list output for the possible metrics and metricgroups.
+
--exclude-perf::
Don't record events issued by perf itself. This option should follow
an event selector (-e) which selects tracepoint event(s). It adds a
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 66a3de8ac6618..5828051ff2736 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -40,6 +40,7 @@
#include "util/trigger.h"
#include "util/perf-hooks.h"
#include "util/cpu-set-sched.h"
+#include "util/metricgroup.h"
#include "util/synthetic-events.h"
#include "util/time-utils.h"
#include "util/units.h"
@@ -188,6 +189,7 @@ static volatile int done;
static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);
+static char *metrics;

static const char *affinity_tags[PERF_AFFINITY_MAX] = {
"SYS", "NODE", "CPU"
@@ -200,6 +202,25 @@ static inline pid_t gettid(void)
}
#endif

+static int append_metric_groups(const struct option *opt __maybe_unused,
+ const char *str,
+ int unset __maybe_unused)
+{
+ if (metrics) {
+ char *tmp;
+
+ if (asprintf(&tmp, "%s,%s", metrics, str) < 0)
+ return -ENOMEM;
+ free(metrics);
+ metrics = tmp;
+ } else {
+ metrics = strdup(str);
+ if (!metrics)
+ return -ENOMEM;
+ }
+ return 0;
+}
+
static int record__threads_enabled(struct record *rec)
{
return rec->opts.threads_spec;
@@ -3382,6 +3403,9 @@ static struct option __record_options[] = {
parse_events_option),
OPT_CALLBACK(0, "filter", &record.evlist, "filter",
"event filter", parse_filter),
+ OPT_CALLBACK('M', "metrics", &record.evlist, "metric/metric group list",
+ "monitor specified metrics or metric groups (separated by ,)",
+ append_metric_groups),
OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
NULL, "don't record events from perf itself",
exclude_perf),
@@ -3984,6 +4008,7 @@ int cmd_record(int argc, const char **argv)
int err;
struct record *rec = &record;
char errbuf[BUFSIZ];
+ struct rblist mevents;

setlocale(LC_ALL, "");

@@ -4153,6 +4178,23 @@ int cmd_record(int argc, const char **argv)
if (record.opts.overwrite)
record.opts.tail_synthesize = true;

+ if (metrics) {
+ const char *pmu = parse_events_option_args.pmu_filter ?: "all";
+ int ret = metricgroup__parse_groups(rec->evlist, pmu, metrics,
+ false, /* metric_no_group */
+ false, /* metric_no_merge */
+ false, /* metric_no_threshold */
+ rec->opts.target.cpu_list,
+ rec->opts.target.system_wide,
+ false, /* hardware_aware_grouping */
+ &mevents);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ zfree(&metrics);
+ }
+
if (rec->evlist->core.nr_entries == 0) {
bool can_profile_kernel = perf_event_paranoid_check(1);

@@ -4264,6 +4306,7 @@ int cmd_record(int argc, const char **argv)
out_opts:
record__free_thread_masks(rec, rec->nr_threads);
rec->nr_threads = 0;
+ metricgroup__rblist_exit(&mevents);
evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
return err;
}
--
2.45.1

2024-05-27 17:08:31

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH] perf record: add a shortcut for metrics

On Mon, May 27, 2024 at 02:02:33PM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, May 27, 2024 at 12:15:19PM +0200, Artem Savkov wrote:
> > Add -M/--metrics option to perf-record providing a shortcut to record
> > metrics and metricgroups. This option mirrors the one in perf-stat.
> >
> > Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
> > Signed-off-by: Artem Savkov <[email protected]>
>
> Not building for me, I needed to add the rblist.h header and also I
> think we need to use metricgroup__rblist_init(&mevents), right?

Argh, that is a static function, it seems we trigger it by having
nr_entries = 0, so the following should do the trick:

struct rblist mevents = { .nr_entries = 0, };

So that we don't depend on the compiler zeroing that field, which it is
not required to do for local variables.

- Arnaldo

> Testing it now.
>
> - Arnaldo
>
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 18da3ce380152ad1..5d67b0711c166fae 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -27,6 +27,7 @@
> #include "util/session.h"
> #include "util/tool.h"
> #include "util/symbol.h"
> +#include "util/rblist.h"
> #include "util/record.h"
> #include "util/cpumap.h"
> #include "util/thread_map.h"
> @@ -4017,6 +4018,7 @@ int cmd_record(int argc, const char **argv)
> set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
> # undef set_nobuild
> #endif
> + metricgroup__rblist_init(&mevents);
>
> /* Disable eager loading of kernel symbols that adds overhead to perf record. */
> symbol_conf.lazy_load_kernel_maps = true;
>
> > ---
> > tools/perf/Documentation/perf-record.txt | 7 +++-
> > tools/perf/builtin-record.c | 43 ++++++++++++++++++++++++
> > 2 files changed, 49 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
> > index 6015fdd08fb63..ebb560d137e62 100644
> > --- a/tools/perf/Documentation/perf-record.txt
> > +++ b/tools/perf/Documentation/perf-record.txt
> > @@ -18,7 +18,6 @@ from it, into perf.data - without displaying anything.
> >
> > This file can then be inspected later on, using 'perf report'.
> >
> > -
> > OPTIONS
> > -------
> > <command>...::
> > @@ -216,6 +215,12 @@ OPTIONS
> > na, by_data, by_addr (for mem_blk)
> > hops0, hops1, hops2, hops3 (for mem_hops)
> >
> > +-M::
> > +--metrics::
> > +Record metrics or metricgroups specified in a comma separated list.
> > +For a group all metrics from the group are added.
> > +See perf list output for the possible metrics and metricgroups.
> > +
> > --exclude-perf::
> > Don't record events issued by perf itself. This option should follow
> > an event selector (-e) which selects tracepoint event(s). It adds a
> > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> > index 66a3de8ac6618..5828051ff2736 100644
> > --- a/tools/perf/builtin-record.c
> > +++ b/tools/perf/builtin-record.c
> > @@ -40,6 +40,7 @@
> > #include "util/trigger.h"
> > #include "util/perf-hooks.h"
> > #include "util/cpu-set-sched.h"
> > +#include "util/metricgroup.h"
> > #include "util/synthetic-events.h"
> > #include "util/time-utils.h"
> > #include "util/units.h"
> > @@ -188,6 +189,7 @@ static volatile int done;
> > static volatile int auxtrace_record__snapshot_started;
> > static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
> > static DEFINE_TRIGGER(switch_output_trigger);
> > +static char *metrics;
> >
> > static const char *affinity_tags[PERF_AFFINITY_MAX] = {
> > "SYS", "NODE", "CPU"
> > @@ -200,6 +202,25 @@ static inline pid_t gettid(void)
> > }
> > #endif
> >
> > +static int append_metric_groups(const struct option *opt __maybe_unused,
> > + const char *str,
> > + int unset __maybe_unused)
> > +{
> > + if (metrics) {
> > + char *tmp;
> > +
> > + if (asprintf(&tmp, "%s,%s", metrics, str) < 0)
> > + return -ENOMEM;
> > + free(metrics);
> > + metrics = tmp;
> > + } else {
> > + metrics = strdup(str);
> > + if (!metrics)
> > + return -ENOMEM;
> > + }
> > + return 0;
> > +}
> > +
> > static int record__threads_enabled(struct record *rec)
> > {
> > return rec->opts.threads_spec;
> > @@ -3382,6 +3403,9 @@ static struct option __record_options[] = {
> > parse_events_option),
> > OPT_CALLBACK(0, "filter", &record.evlist, "filter",
> > "event filter", parse_filter),
> > + OPT_CALLBACK('M', "metrics", &record.evlist, "metric/metric group list",
> > + "monitor specified metrics or metric groups (separated by ,)",
> > + append_metric_groups),
> > OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
> > NULL, "don't record events from perf itself",
> > exclude_perf),
> > @@ -3984,6 +4008,7 @@ int cmd_record(int argc, const char **argv)
> > int err;
> > struct record *rec = &record;
> > char errbuf[BUFSIZ];
> > + struct rblist mevents;
> >
> > setlocale(LC_ALL, "");
> >
> > @@ -4153,6 +4178,23 @@ int cmd_record(int argc, const char **argv)
> > if (record.opts.overwrite)
> > record.opts.tail_synthesize = true;
> >
> > + if (metrics) {
> > + const char *pmu = parse_events_option_args.pmu_filter ?: "all";
> > + int ret = metricgroup__parse_groups(rec->evlist, pmu, metrics,
> > + false, /* metric_no_group */
> > + false, /* metric_no_merge */
> > + false, /* metric_no_threshold */
> > + rec->opts.target.cpu_list,
> > + rec->opts.target.system_wide,
> > + false, /* hardware_aware_grouping */
> > + &mevents);
> > + if (ret) {
> > + err = ret;
> > + goto out;
> > + }
> > + zfree(&metrics);
> > + }
> > +
> > if (rec->evlist->core.nr_entries == 0) {
> > bool can_profile_kernel = perf_event_paranoid_check(1);
> >
> > @@ -4264,6 +4306,7 @@ int cmd_record(int argc, const char **argv)
> > out_opts:
> > record__free_thread_masks(rec, rec->nr_threads);
> > rec->nr_threads = 0;
> > + metricgroup__rblist_exit(&mevents);
> > evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
> > return err;
> > }
> > --
> > 2.45.1

2024-05-27 17:12:57

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH] perf record: add a shortcut for metrics

On Mon, May 27, 2024 at 12:15:19PM +0200, Artem Savkov wrote:
> Add -M/--metrics option to perf-record providing a shortcut to record
> metrics and metricgroups. This option mirrors the one in perf-stat.
>
> Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
> Signed-off-by: Artem Savkov <[email protected]>

Not building for me, I needed to add the rblist.h header and also I
think we need to use metricgroup__rblist_init(&mevents), right?

Testing it now.

- Arnaldo

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 18da3ce380152ad1..5d67b0711c166fae 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -27,6 +27,7 @@
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
+#include "util/rblist.h"
#include "util/record.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
@@ -4017,6 +4018,7 @@ int cmd_record(int argc, const char **argv)
set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
# undef set_nobuild
#endif
+ metricgroup__rblist_init(&mevents);

/* Disable eager loading of kernel symbols that adds overhead to perf record. */
symbol_conf.lazy_load_kernel_maps = true;

> ---
> tools/perf/Documentation/perf-record.txt | 7 +++-
> tools/perf/builtin-record.c | 43 ++++++++++++++++++++++++
> 2 files changed, 49 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
> index 6015fdd08fb63..ebb560d137e62 100644
> --- a/tools/perf/Documentation/perf-record.txt
> +++ b/tools/perf/Documentation/perf-record.txt
> @@ -18,7 +18,6 @@ from it, into perf.data - without displaying anything.
>
> This file can then be inspected later on, using 'perf report'.
>
> -
> OPTIONS
> -------
> <command>...::
> @@ -216,6 +215,12 @@ OPTIONS
> na, by_data, by_addr (for mem_blk)
> hops0, hops1, hops2, hops3 (for mem_hops)
>
> +-M::
> +--metrics::
> +Record metrics or metricgroups specified in a comma separated list.
> +For a group all metrics from the group are added.
> +See perf list output for the possible metrics and metricgroups.
> +
> --exclude-perf::
> Don't record events issued by perf itself. This option should follow
> an event selector (-e) which selects tracepoint event(s). It adds a
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 66a3de8ac6618..5828051ff2736 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -40,6 +40,7 @@
> #include "util/trigger.h"
> #include "util/perf-hooks.h"
> #include "util/cpu-set-sched.h"
> +#include "util/metricgroup.h"
> #include "util/synthetic-events.h"
> #include "util/time-utils.h"
> #include "util/units.h"
> @@ -188,6 +189,7 @@ static volatile int done;
> static volatile int auxtrace_record__snapshot_started;
> static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
> static DEFINE_TRIGGER(switch_output_trigger);
> +static char *metrics;
>
> static const char *affinity_tags[PERF_AFFINITY_MAX] = {
> "SYS", "NODE", "CPU"
> @@ -200,6 +202,25 @@ static inline pid_t gettid(void)
> }
> #endif
>
> +static int append_metric_groups(const struct option *opt __maybe_unused,
> + const char *str,
> + int unset __maybe_unused)
> +{
> + if (metrics) {
> + char *tmp;
> +
> + if (asprintf(&tmp, "%s,%s", metrics, str) < 0)
> + return -ENOMEM;
> + free(metrics);
> + metrics = tmp;
> + } else {
> + metrics = strdup(str);
> + if (!metrics)
> + return -ENOMEM;
> + }
> + return 0;
> +}
> +
> static int record__threads_enabled(struct record *rec)
> {
> return rec->opts.threads_spec;
> @@ -3382,6 +3403,9 @@ static struct option __record_options[] = {
> parse_events_option),
> OPT_CALLBACK(0, "filter", &record.evlist, "filter",
> "event filter", parse_filter),
> + OPT_CALLBACK('M', "metrics", &record.evlist, "metric/metric group list",
> + "monitor specified metrics or metric groups (separated by ,)",
> + append_metric_groups),
> OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
> NULL, "don't record events from perf itself",
> exclude_perf),
> @@ -3984,6 +4008,7 @@ int cmd_record(int argc, const char **argv)
> int err;
> struct record *rec = &record;
> char errbuf[BUFSIZ];
> + struct rblist mevents;
>
> setlocale(LC_ALL, "");
>
> @@ -4153,6 +4178,23 @@ int cmd_record(int argc, const char **argv)
> if (record.opts.overwrite)
> record.opts.tail_synthesize = true;
>
> + if (metrics) {
> + const char *pmu = parse_events_option_args.pmu_filter ?: "all";
> + int ret = metricgroup__parse_groups(rec->evlist, pmu, metrics,
> + false, /* metric_no_group */
> + false, /* metric_no_merge */
> + false, /* metric_no_threshold */
> + rec->opts.target.cpu_list,
> + rec->opts.target.system_wide,
> + false, /* hardware_aware_grouping */
> + &mevents);
> + if (ret) {
> + err = ret;
> + goto out;
> + }
> + zfree(&metrics);
> + }
> +
> if (rec->evlist->core.nr_entries == 0) {
> bool can_profile_kernel = perf_event_paranoid_check(1);
>
> @@ -4264,6 +4306,7 @@ int cmd_record(int argc, const char **argv)
> out_opts:
> record__free_thread_masks(rec, rec->nr_threads);
> rec->nr_threads = 0;
> + metricgroup__rblist_exit(&mevents);
> evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
> return err;
> }
> --
> 2.45.1

2024-05-27 17:28:42

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH] perf record: add a shortcut for metrics

On Mon, May 27, 2024 at 02:04:54PM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, May 27, 2024 at 02:02:33PM -0300, Arnaldo Carvalho de Melo wrote:
> > On Mon, May 27, 2024 at 12:15:19PM +0200, Artem Savkov wrote:
> > > Add -M/--metrics option to perf-record providing a shortcut to record
> > > metrics and metricgroups. This option mirrors the one in perf-stat.

> > > Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
> > > Signed-off-by: Artem Savkov <[email protected]>

> > Not building for me, I needed to add the rblist.h header and also I
> > think we need to use metricgroup__rblist_init(&mevents), right?

> Argh, that is a static function, it seems we trigger it by having
> nr_entries = 0, so the following should do the trick:

> struct rblist mevents = { .nr_entries = 0, }

> So that we don't depend on the compiler zeroing that field, which for
> local variables it should not.

How did you test this?

I'm trying:

perf list metric

pick a metric then:

perf record -M tma_core_bound

And it gets in a long loop doing perf_event_open() calls...

root@number:~# perf stat -a -M tma_clears_resteers sleep 1

Performance counter stats for 'system wide':

4,248,865,818 cpu_core/TOPDOWN.SLOTS/ # 0.5 % tma_clears_resteers
652,979,004 cpu_core/topdown-retiring/
332,409,986 cpu_core/topdown-bad-spec/
1,535,823,405 cpu_core/topdown-fetch-lat/
322,562,930 cpu_core/topdown-br-mispredict/
1,977,392,925 cpu_core/topdown-fe-bound/
1,301,619,465 cpu_core/topdown-be-bound/
78,222,034 cpu_core/INT_MISC.CLEAR_RESTEER_CYCLES/
727,201,022 cpu_core/CPU_CLK_UNHALTED.THREAD/
105,140,481 cpu_core/INT_MISC.UNKNOWN_BRANCH_CYCLES/
5,067,924 cpu_core/INT_MISC.UOP_DROPPING/

1.002715853 seconds time elapsed

root@number:~# gdb perf
GNU gdb (Fedora Linux) 14.2-1.fc39
Copyright (C) 2023 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<https://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from perf...
(gdb) run record -a -M tma_clears_resteers sleep 1
Starting program: /root/bin/perf record -a -M tma_clears_resteers sleep 1

This GDB supports auto-downloading debuginfo from the following URLs:
<https://debuginfod.fedoraproject.org/>
Enable debuginfod for this session? (y or [n]) n
Debuginfod has been disabled.
To make this setting permanent, add 'set debuginfod enabled off' to .gdbinit.
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
[Detaching after fork from child process 688237]
^C
Program received signal SIGINT, Interrupt.
0x00007ffff6f21804 in close () from /lib64/libc.so.6
Missing separate debuginfos, use: dnf debuginfo-install bzip2-libs-1.0.8-16.fc39.x86_64 capstone-4.0.2-15.fc39.x86_64 elfutils-debuginfod-client-0.191-2.fc39.x86_64 elfutils-libelf-0.191-2.fc39.x86_64 elfutils-libs-0.191-2.fc39.x86_64 glib2-2.78.6-1.fc39.x86_64 glibc-2.38-18.fc39.x86_64 keyutils-libs-1.6.3-1.fc39.x86_64 krb5-libs-1.21.2-3.fc39.x86_64 libbabeltrace-1.5.11-5.fc39.x86_64 libcap-2.48-9.fc39.x86_64 libcom_err-1.47.0-2.fc39.x86_64 libcurl-minimal-8.2.1-5.fc39.x86_64 libidn2-2.3.7-1.fc39.x86_64 libnghttp2-1.55.1-5.fc39.x86_64 libpfm-4.13.0-4.fc39.x86_64 libselinux-3.5-5.fc39.x86_64 libstdc++-13.2.1-7.fc39.x86_64 libtraceevent-1.7.2-3.fc39.x86_64 libunistring-1.1-5.fc39.x86_64 libunwind-1.7.0-0.2.rc2.fc39.x86_64 libuuid-2.39.4-1.fc39.x86_64 libzstd-1.5.6-1.fc39.x86_64 numactl-libs-2.0.16-3.fc39.x86_64 opencsd-1.4.0-1.fc39.x86_64 openssl-libs-3.1.1-4.fc39.x86_64 pcre2-10.42-1.fc39.2.x86_64 perl-libs-5.38.2-502.fc39.x86_64 popt-1.19-3.fc39.x86_64 python3-libs-3.12.3-2.fc39.x86_64 slang-2.3.3-4.fc39.x86_64 xz-libs-5.4.4-1.fc39.x86_64 zlib-1.2.13-4.fc39.x86_64
(gdb) bt
#0 0x00007ffff6f21804 in close () from /lib64/libc.so.6
#1 0x000000000061fbd2 in perf_evsel__close_fd_cpu (evsel=0xdab470, cpu_map_idx=6) at evsel.c:188
#2 0x000000000061fc22 in perf_evsel__close_fd (evsel=0xdab470) at evsel.c:197
#3 0x000000000061fc9b in perf_evsel__close (evsel=0xdab470) at evsel.c:211
#4 0x00000000004e0b5f in evlist.reset_weak_group ()
#5 0x0000000000423bb9 in __cmd_record.constprop.0 ()
#6 0x00000000004276c5 in cmd_record ()
#7 0x00000000004c4579 in run_builtin ()
#8 0x00000000004c4889 in handle_internal_command ()
#9 0x0000000000410e57 in main ()
(gdb) c
Continuing.
^C
Program received signal SIGINT, Interrupt.
0x00007ffff6f21804 in close () from /lib64/libc.so.6
(gdb)

So you should investigate this further.

The idea, from my notes, was to be able to have extra columns in 'perf
report' with things like IPC and other metrics, probably not all metrics
will apply. We need to find a way to find out which ones are OK for that
purpose, for instance:

root@number:~# perf stat -a -M tma_branch_resteers sleep 1

Performance counter stats for 'system wide':

209,159,606,886 cpu_core/TOPDOWN.SLOTS/ # 3.2 % tma_branch_resteers
55,156,278,851 cpu_core/topdown-retiring/
7,779,703,706 cpu_core/topdown-bad-spec/
17,644,918,779 cpu_core/topdown-fetch-lat/
39,431,478,422 cpu_core/topdown-fe-bound/
107,325,133,399 cpu_core/topdown-be-bound/
1,066,765,398 cpu_core/INT_MISC.CLEAR_RESTEER_CYCLES/
35,367,316,520 cpu_core/CPU_CLK_UNHALTED.THREAD/
73,066,635 cpu_core/INT_MISC.UNKNOWN_BRANCH_CYCLES/
106,828,690 cpu_core/INT_MISC.UOP_DROPPING/

1.001581758 seconds time elapsed

root@number:~#

But then:

root@number:~# perf record -e cpu_core/TOPDOWN.SLOTS/,cpu_core/topdown-bad-spec/,cpu_core/topdown-fetch-lat/,cpu_core/topdown-fe-bound/,cpu_core/topdown-be-bound/,cpu_core/INT_MISC.CLEAR_RESTEER_CYCLES/,cpu_core/CPU_CLK_UNHALTED.THREAD/,cpu_core/INT_MISC.UNKNOWN_BRANCH_CYCLES/,cpu_core/INT_MISC.UOP_DROPPING/
WARNING: events were regrouped to match PMUs
Error:
The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (cpu_core/topdown-bad-spec/).
/bin/dmesg | grep -i perf may provide additional information.

root@number:~#

That invalid argument error message needs improvement, but it's one
example of a metric where events can't be sampled with 'perf record' for
some reason:

Opening: cpu_core/topdown-bad-spec/
------------------------------------------------------------
perf_event_attr:
type 4 (cpu_core)
size 136
config 0x8100 (topdown-bad-spec)
{ sample_period, sample_freq } 4000
sample_type IP|TID|TIME|CPU|PERIOD|IDENTIFIER
read_format ID|LOST
disabled 1
inherit 1
freq 1
sample_id_all 1
exclude_guest 1
------------------------------------------------------------
sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8
sys_perf_event_open failed, error -22
switching off PERF_FORMAT_LOST support
Opening: cpu_core/topdown-bad-spec/

It goes down disabling several perf_event_attr features, assuming the
kernel doesn't have support for them, but ultimately fails and returns the
cryptic EINVAL.

Ian, can you take a look at this:

root@number:~# perf stat -a -M tma_branch_resteers sleep 1

Performance counter stats for 'system wide':

207,780,999,822 cpu_core/TOPDOWN.SLOTS/ # 5.6 % tma_branch_resteers
46,114,346,088 cpu_core/topdown-retiring/
12,533,625,786 cpu_core/topdown-bad-spec/
25,845,036,349 cpu_core/topdown-fetch-lat/
50,198,057,652 cpu_core/topdown-fe-bound/
99,605,368,200 cpu_core/topdown-be-bound/
1,720,994,647 cpu_core/INT_MISC.CLEAR_RESTEER_CYCLES/
39,224,461,225 cpu_core/CPU_CLK_UNHALTED.THREAD/
469,464,484 cpu_core/INT_MISC.UNKNOWN_BRANCH_CYCLES/
260,388,972 cpu_core/INT_MISC.UOP_DROPPING/

1.004820692 seconds time elapsed

root@number:~# perf stat -a -e cpu_core/topdown-bad-spec/ sleep 1

Performance counter stats for 'system wide':

<not supported> cpu_core/topdown-bad-spec/

1.003017044 seconds time elapsed

root@number:~# perf stat -a -e cpu_atom/topdown-bad-spec/ sleep 1

Performance counter stats for 'system wide':

19,178,297,593 cpu_atom/topdown-bad-spec/

1.002640873 seconds time elapsed

root@number:~#

It states that it was able to count cpu_core/topdown-bad-spec/ when
calling via the tma_branch_resteers metric, but then if I call it
directly it says it's not supported for cpu_core, while it works for
cpu_atom. This looks wrong, no?

- Arnaldo

2024-05-27 17:46:46

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH] perf record: add a shortcut for metrics

On Mon, May 27, 2024 at 02:28:32PM -0300, Arnaldo Carvalho de Melo wrote:
> On Mon, May 27, 2024 at 02:04:54PM -0300, Arnaldo Carvalho de Melo wrote:
> > On Mon, May 27, 2024 at 02:02:33PM -0300, Arnaldo Carvalho de Melo wrote:
> > > On Mon, May 27, 2024 at 12:15:19PM +0200, Artem Savkov wrote:
> > > > Add -M/--metrics option to perf-record providing a shortcut to record
> > > > metrics and metricgroups. This option mirrors the one in perf-stat.
>
> > > > Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
> > > > Signed-off-by: Artem Savkov <[email protected]>

> How did you test this?

> The idea, from my notes, was to be able to have extra columns in 'perf
> report' with things like IPC and other metrics, probably not all metrics
> will apply. We need to find a way to find out which ones are OK for that
> purpose, for instance:

One that may make sense:

root@number:~# perf record -M tma_fb_full
^C[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 3.846 MB perf.data (21745 samples) ]

root@number:~# perf evlist
cpu_core/CPU_CLK_UNHALTED.THREAD/
cpu_core/L1D_PEND_MISS.FB_FULL/
dummy:u
root@number:~#

But then we need to read both to do the math, maybe something like:

root@number:~# perf record -e '{cpu_core/CPU_CLK_UNHALTED.THREAD/,cpu_core/L1D_PEND_MISS.FB_FULL/}:S'
^C[ perf record: Woken up 40 times to write data ]
[ perf record: Captured and wrote 14.640 MB perf.data (219990 samples) ]

root@number:~# perf script | head
cc1plus 1339704 [000] 36028.995981: 2011389 cpu_core/CPU_CLK_UNHALTED.THREAD/: 1097303 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
cc1plus 1339704 [000] 36028.995981: 26231 cpu_core/L1D_PEND_MISS.FB_FULL/: 1097303 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
cc1plus 1340011 [001] 36028.996008: 2004568 cpu_core/CPU_CLK_UNHALTED.THREAD/: 8c23b4 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
cc1plus 1340011 [001] 36028.996008: 20113 cpu_core/L1D_PEND_MISS.FB_FULL/: 8c23b4 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
clang 1340462 [002] 36028.996043: 2007356 cpu_core/CPU_CLK_UNHALTED.THREAD/: ffffffffb43b045d release_pages+0x3dd ([kernel.kallsyms])
clang 1340462 [002] 36028.996043: 23481 cpu_core/L1D_PEND_MISS.FB_FULL/: ffffffffb43b045d release_pages+0x3dd ([kernel.kallsyms])
cc1plus 1339622 [003] 36028.996066: 2004148 cpu_core/CPU_CLK_UNHALTED.THREAD/: 760874 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
cc1plus 1339622 [003] 36028.996066: 31935 cpu_core/L1D_PEND_MISS.FB_FULL/: 760874 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
as 1340513 [004] 36028.996097: 2005052 cpu_core/CPU_CLK_UNHALTED.THREAD/: ffffffffb4491d65 __count_memcg_events+0x55 ([kernel.kallsyms])
as 1340513 [004] 36028.996097: 45084 cpu_core/L1D_PEND_MISS.FB_FULL/: ffffffffb4491d65 __count_memcg_events+0x55 ([kernel.kallsyms])
root@number:~#

root@number:~# perf report --stdio -F +period | head -20
# To display the perf.data header info, please use --header/--header-only options.
#
#
# Total Lost Samples: 0
#
# Samples: 219K of events 'anon group { cpu_core/CPU_CLK_UNHALTED.THREAD/, cpu_core/L1D_PEND_MISS.FB_FULL/ }'
# Event count (approx.): 216528524863
#
# Overhead Period Command Shared Object Symbol
# ................ .................... ......... ................. ....................................
#
4.01% 1.09% 8538169256 39826572 podman [kernel.kallsyms] [k] native_queued_spin_lock_slowpath
1.35% 1.17% 2863376078 42829266 cc1plus cc1plus [.] 0x00000000003f6bcc
0.94% 0.78% 1990639149 28408591 cc1plus cc1plus [.] 0x00000000003f6be4
0.65% 0.17% 1375916283 6109515 podman [kernel.kallsyms] [k] _raw_spin_lock_irqsave
0.61% 0.99% 1304418325 36198834 cc1plus [kernel.kallsyms] [k] get_mem_cgroup_from_mm
0.52% 0.42% 1103054030 15427418 cc1plus cc1plus [.] 0x0000000000ca6c69
0.51% 0.17% 1094200572 6299289 podman [kernel.kallsyms] [k] psi_group_change
0.42% 0.41% 893633315 14778675 cc1plus cc1plus [.] 0x00000000018afafe
0.42% 1.29% 887664793 47046952 cc1plus [kernel.kallsyms] [k] asm_exc_page_fault
root@number:~#

That 'tma_fb_full' metric then would be another column, calculated from
the sampled components of its metric equation:

root@number:~# perf list tma_fb_full | head

Metric Groups:

MemoryBW: [Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet]
tma_fb_full
[This metric does a *rough estimation* of how often L1D Fill Buffer
unavailability limited additional L1D miss memory access requests to
proceed]

TopdownL4: [Metrics for top-down breakdown at level 4]
root@number:~#

This is roughly what we brainstormed: supporting metrics in tools other
than 'perf stat'. But we need to check the possibilities and limitations
of such an idea; hopefully this discussion will help with that.

- Arnaldo

2024-05-28 14:48:46

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH] perf record: add a shortcut for metrics

On Tue, May 28, 2024 at 01:45:25PM +0200, Artem Savkov wrote:
> On Mon, May 27, 2024 at 02:28:29PM -0300, Arnaldo Carvalho de Melo wrote:
> > On Mon, May 27, 2024 at 02:04:54PM -0300, Arnaldo Carvalho de Melo wrote:
> > > On Mon, May 27, 2024 at 02:02:33PM -0300, Arnaldo Carvalho de Melo wrote:
> > > > On Mon, May 27, 2024 at 12:15:19PM +0200, Artem Savkov wrote:
> > > > > Add -M/--metrics option to perf-record providing a shortcut to record
> > > > > metrics and metricgroups. This option mirrors the one in perf-stat.
> >
> > > > > Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
> > > > > Signed-off-by: Artem Savkov <[email protected]>

<SNIP>

> > How did you test this?
> >
> > I'm trying:
> >
> > perf list metric
> >
> > pick a metric then:
> >
> > perf record -M tma_core_bound
> >
> > And it gets in a long loop doing perf_event_open() calls...
>
> [snip]
>
> > (gdb) bt
> > #0 0x00007ffff6f21804 in close () from /lib64/libc.so.6
> > #1 0x000000000061fbd2 in perf_evsel__close_fd_cpu (evsel=0xdab470, cpu_map_idx=6) at evsel.c:188
> > #2 0x000000000061fc22 in perf_evsel__close_fd (evsel=0xdab470) at evsel.c:197
> > #3 0x000000000061fc9b in perf_evsel__close (evsel=0xdab470) at evsel.c:211
> > #4 0x00000000004e0b5f in evlist.reset_weak_group ()
> > #5 0x0000000000423bb9 in __cmd_record.constprop.0 ()
> > #6 0x00000000004276c5 in cmd_record ()
> > #7 0x00000000004c4579 in run_builtin ()
> > #8 0x00000000004c4889 in handle_internal_command ()
> > #9 0x0000000000410e57 in main ()
> > (gdb) c
> > Continuing.
> > ^C
> > Program received signal SIGINT, Interrupt.
> > 0x00007ffff6f21804 in close () from /lib64/libc.so.6
> > (gdb)
> >
> > So you should investigate this further.
>
> I tried a bunch of random metrics from perf list but didn't hit this.
>
> It spins forever in evlist__for_each_entry() loop in record__open() with
> the same error:
>
> Weak group for TOPDOWN.SLOTS/5 failed
>
> Looks like the culprit is one of those unsupported metrics, will
> investigate.

Right, when trying something new, in a different way than the
pre-existing codebase was envisioned to be used, we may uncover latent
problems. That endless loop seems like something we want fixed :-)

> > The idea, from my notes, was to be able to have extra columns in 'perf
> > report' with things like IPC and other metrics, probably not all metrics
> > will apply. We need to find a way to find out which ones are OK for that
> > purpose, for instance:
> >
> > Opening: cpu_core/topdown-bad-spec/
> > ------------------------------------------------------------
> > perf_event_attr:
> > type 4 (cpu_core)
> > size 136
> > config 0x8100 (topdown-bad-spec)
> > { sample_period, sample_freq } 4000
> > sample_type IP|TID|TIME|CPU|PERIOD|IDENTIFIER
> > read_format ID|LOST
> > disabled 1
> > inherit 1
> > freq 1
> > sample_id_all 1
> > exclude_guest 1
> > ------------------------------------------------------------
> > sys_perf_event_open: pid -1 cpu 0 group_fd -1 flags 0x8
> > sys_perf_event_open failed, error -22
> > switching off PERF_FORMAT_LOST support
> > Opening: cpu_core/topdown-bad-spec/
>
> Is it just metrics containing unsupported events that need to be skipped
> or there are other cases that wouldn't make much sense? If the latter
> maybe it will be easier to just tag the ones that are supported (or not) in
> pmu-events?

Maybe we can use some criteria to look at the metric and filter out
things that are not working right now? As you go on studying the
codebase you will figure out the reasons: sometimes it's a bug (the
forever loop above), sometimes it plainly doesn't make sense and we just
skip it, leaving things like IPC, i.e. we have instructions, we have
cycles, that is what is needed for IPC, ok, that makes sense and we should
have an IPC column when collecting both cycles and instructions, just
like it has been done in an ad hoc way for IPC in perf stat since forever.

People want to have those columns in 'perf report' and 'perf top'.

- Arnaldo

2024-05-28 15:55:14

by Liang, Kan

[permalink] [raw]

Subject: Re: [PATCH] perf record: add a shortcut for metrics

On 2024-05-28 7:57 a.m., Artem Savkov wrote:
> On Mon, May 27, 2024 at 10:01:37PM -0700, Ian Rogers wrote:
>> On Mon, May 27, 2024 at 10:46 AM Arnaldo Carvalho de Melo
>> <[email protected]> wrote:
>>>
>>> On Mon, May 27, 2024 at 02:28:32PM -0300, Arnaldo Carvalho de Melo wrote:
>>>> On Mon, May 27, 2024 at 02:04:54PM -0300, Arnaldo Carvalho de Melo wrote:
>>>>> On Mon, May 27, 2024 at 02:02:33PM -0300, Arnaldo Carvalho de Melo wrote:
>>>>>> On Mon, May 27, 2024 at 12:15:19PM +0200, Artem Savkov wrote:
>>>>>>> Add -M/--metrics option to perf-record providing a shortcut to record
>>>>>>> metrics and metricgroups. This option mirrors the one in perf-stat.
>>>>
>>>>>>> Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
>>>>>>> Signed-off-by: Artem Savkov <[email protected]>
>>>
>>>> How did you test this?
>>>
>>>> The idea, from my notes, was to be able to have extra columns in 'perf
>>>> report' with things like IPC and other metrics, probably not all metrics
>>>> will apply. We need to find a way to find out which ones are OK for that
>>>> purpose, for instance:
>>>
>>> One that may make sense:
>>>
>>> root@number:~# perf record -M tma_fb_full
>>> ^C[ perf record: Woken up 1 times to write data ]
>>> [ perf record: Captured and wrote 3.846 MB perf.data (21745 samples) ]
>>>
>>> root@number:~# perf evlist
>>> cpu_core/CPU_CLK_UNHALTED.THREAD/
>>> cpu_core/L1D_PEND_MISS.FB_FULL/
>>> dummy:u
>>> root@number:~#
>>>
>>> But then we need to read both to do the math, maybe something like:
>>>
>>> root@number:~# perf record -e '{cpu_core/CPU_CLK_UNHALTED.THREAD/,cpu_core/L1D_PEND_MISS.FB_FULL/}:S'
>>> ^C[ perf record: Woken up 40 times to write data ]
>>> [ perf record: Captured and wrote 14.640 MB perf.data (219990 samples) ]
>>>
>>> root@number:~# perf script | head
>>> cc1plus 1339704 [000] 36028.995981: 2011389 cpu_core/CPU_CLK_UNHALTED.THREAD/: 1097303 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>> cc1plus 1339704 [000] 36028.995981: 26231 cpu_core/L1D_PEND_MISS.FB_FULL/: 1097303 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>> cc1plus 1340011 [001] 36028.996008: 2004568 cpu_core/CPU_CLK_UNHALTED.THREAD/: 8c23b4 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>> cc1plus 1340011 [001] 36028.996008: 20113 cpu_core/L1D_PEND_MISS.FB_FULL/: 8c23b4 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>> clang 1340462 [002] 36028.996043: 2007356 cpu_core/CPU_CLK_UNHALTED.THREAD/: ffffffffb43b045d release_pages+0x3dd ([kernel.kallsyms])
>>> clang 1340462 [002] 36028.996043: 23481 cpu_core/L1D_PEND_MISS.FB_FULL/: ffffffffb43b045d release_pages+0x3dd ([kernel.kallsyms])
>>> cc1plus 1339622 [003] 36028.996066: 2004148 cpu_core/CPU_CLK_UNHALTED.THREAD/: 760874 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>> cc1plus 1339622 [003] 36028.996066: 31935 cpu_core/L1D_PEND_MISS.FB_FULL/: 760874 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>> as 1340513 [004] 36028.996097: 2005052 cpu_core/CPU_CLK_UNHALTED.THREAD/: ffffffffb4491d65 __count_memcg_events+0x55 ([kernel.kallsyms])
>>> as 1340513 [004] 36028.996097: 45084 cpu_core/L1D_PEND_MISS.FB_FULL/: ffffffffb4491d65 __count_memcg_events+0x55 ([kernel.kallsyms])
>>> root@number:~#
>>>
>>> root@number:~# perf report --stdio -F +period | head -20
>>> # To display the perf.data header info, please use --header/--header-only options.
>>> #
>>> #
>>> # Total Lost Samples: 0
>>> #
>>> # Samples: 219K of events 'anon group { cpu_core/CPU_CLK_UNHALTED.THREAD/, cpu_core/L1D_PEND_MISS.FB_FULL/ }'
>>> # Event count (approx.): 216528524863
>>> #
>>> # Overhead Period Command Shared Object Symbol
>>> # ................ .................... ......... ................. ....................................
>>> #
>>> 4.01% 1.09% 8538169256 39826572 podman [kernel.kallsyms] [k] native_queued_spin_lock_slowpath
>>> 1.35% 1.17% 2863376078 42829266 cc1plus cc1plus [.] 0x00000000003f6bcc
>>> 0.94% 0.78% 1990639149 28408591 cc1plus cc1plus [.] 0x00000000003f6be4
>>> 0.65% 0.17% 1375916283 6109515 podman [kernel.kallsyms] [k] _raw_spin_lock_irqsave
>>> 0.61% 0.99% 1304418325 36198834 cc1plus [kernel.kallsyms] [k] get_mem_cgroup_from_mm
>>> 0.52% 0.42% 1103054030 15427418 cc1plus cc1plus [.] 0x0000000000ca6c69
>>> 0.51% 0.17% 1094200572 6299289 podman [kernel.kallsyms] [k] psi_group_change
>>> 0.42% 0.41% 893633315 14778675 cc1plus cc1plus [.] 0x00000000018afafe
>>> 0.42% 1.29% 887664793 47046952 cc1plus [kernel.kallsyms] [k] asm_exc_page_fault
>>> root@number:~#
>>>
>>> That 'tma_fb_full' metric then would be another column, calculated from
>>> the sampled components of its metric equation:
>>>
>>> root@number:~# perf list tma_fb_full | head
>>>
>>> Metric Groups:
>>>
>>> MemoryBW: [Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet]
>>> tma_fb_full
>>> [This metric does a *rough estimation* of how often L1D Fill Buffer
>>> unavailability limited additional L1D miss memory access requests to
>>> proceed]
>>>
>>> TopdownL4: [Metrics for top-down breakdown at level 4]
>>> root@number:~#
>>>
>>> This is roughly what we brainstormed, to support metrics in other tools
>>> than 'perf stat' but we need to check the possibilities and limitations
>>> of such an idea, hopefully this discussion will help with that,
>>
>> Putting metrics next to code in perf report/annotate sounds good to
>> me, opening all events from a metric as if we want to sample on them
>> less so.
>
> The idea was to record whatever data was asked on record step and
> provide the list of all metrics that can be calculated out of that data
> in perf report, e.g. you could record tma_info_thread_ipc but report
> will suggest both it and tma_info_thread_cpi.
>

Do you mean sampling all the events in a metric, and reporting both the
samples and the metric's calculation result in the report?
That doesn't work for all the metrics.

- For the topdown related metrics, especially on ICL and later
platforms, the perf metrics feature is used by default. It doesn't
support sampling.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/topdown.txt?#n293
- Some PMUs don't support sampling either, e.g., uncore, Power,
MSR.
- There are some SW events, e.g., duration_time, that you may not want to
sample.

You probably need to introduce a flag to ignore those metrics in perf
record.

>> We don't have metrics working with `perf stat record`, I
>> think Kan may have volunteered for that, but it seems like something
>> more urgent than expanding `perf record`. Presumably the way the
>> metric would be recorded for that could also benefit this effort.
>>
>> If you look at the tma metrics a number of them have a "Sample with".
>> For example:
>> ```
>> $ perf list -v
>> ...
>> tma_branch_mispredicts
>> [This metric represents fraction of slots the CPU has wasted
>> due to Branch Misprediction.
>> These slots are either wasted by uops fetched from an
>> incorrectly speculated program path;
>> or stalls when the out-of-order part of the machine needs to
>> recover its state from a
>> speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES.
>> Related metrics:
>> tma_info_bad_spec_branch_misprediction_cost,tma_info_bottleneck_mispredictions,
>> tma_mispredicts_resteers]
>> ...
>> ```
>> It could be logical for `perf record -M tma_branch_mispredicts ...` to
>> be translated to `perf record -e BR_MISP_RETIRED.ALL_BRANCHES ...`
>> rather than to do any form of counting.
>
> Thanks for the pointer, I'll see how this could be done.

It sounds more reasonable to me that we can sample some typical events,
and read the other members in the metrics. So we can put metrics next to
the code in perf report/annotate as Ian mentioned. It could also address
limits of some metrics, especially for the topdown related metrics.
(But I'm not sure if the "Sample with" can give you the right hints. I
will ask around internally.)

But there are also some limits to the sampling read. Everything has to
be in a group. That could be a problem for some big metrics.
Thanks,
Kan

2024-05-28 18:20:21

by Arnaldo Carvalho de Melo

[permalink] [raw]

Subject: Re: [PATCH] perf record: add a shortcut for metrics

On Tue, May 28, 2024 at 11:55:00AM -0400, Liang, Kan wrote:
> On 2024-05-28 7:57 a.m., Artem Savkov wrote:
> > On Mon, May 27, 2024 at 10:01:37PM -0700, Ian Rogers wrote:
> >> On Mon, May 27, 2024 at 10:46 AM Arnaldo Carvalho de Melo
> >> <[email protected]> wrote:
> >>>
> >>> On Mon, May 27, 2024 at 02:28:32PM -0300, Arnaldo Carvalho de Melo wrote:
> >>>> On Mon, May 27, 2024 at 02:04:54PM -0300, Arnaldo Carvalho de Melo wrote:
> >>>>> On Mon, May 27, 2024 at 02:02:33PM -0300, Arnaldo Carvalho de Melo wrote:
> >>>>>> On Mon, May 27, 2024 at 12:15:19PM +0200, Artem Savkov wrote:
> >>>>>>> Add -M/--metrics option to perf-record providing a shortcut to record
> >>>>>>> metrics and metricgroups. This option mirrors the one in perf-stat.
> >>>>
> >>>>>>> Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
> >>>>>>> Signed-off-by: Artem Savkov <[email protected]>
> >>>
> >>>> How did you test this?
> >>>
> >>>> The idea, from my notes, was to be able to have extra columns in 'perf
> >>>> report' with things like IPC and other metrics, probably not all metrics
> >>>> will apply. We need to find a way to find out which ones are OK for that
> >>>> purpose, for instance:
> >>>
> >>> One that may make sense:
> >>>
> >>> root@number:~# perf record -M tma_fb_full
> >>> ^C[ perf record: Woken up 1 times to write data ]
> >>> [ perf record: Captured and wrote 3.846 MB perf.data (21745 samples) ]
> >>>
> >>> root@number:~# perf evlist
> >>> cpu_core/CPU_CLK_UNHALTED.THREAD/
> >>> cpu_core/L1D_PEND_MISS.FB_FULL/
> >>> dummy:u
> >>> root@number:~#
> >>>
> >>> But then we need to read both to do the math, maybe something like:
> >>>
> >>> root@number:~# perf record -e '{cpu_core/CPU_CLK_UNHALTED.THREAD/,cpu_core/L1D_PEND_MISS.FB_FULL/}:S'
> >>> ^C[ perf record: Woken up 40 times to write data ]
> >>> [ perf record: Captured and wrote 14.640 MB perf.data (219990 samples) ]
> >>>
> >>> root@number:~# perf script | head
> >>> cc1plus 1339704 [000] 36028.995981: 2011389 cpu_core/CPU_CLK_UNHALTED.THREAD/: 1097303 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
> >>> cc1plus 1339704 [000] 36028.995981: 26231 cpu_core/L1D_PEND_MISS.FB_FULL/: 1097303 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
> >>> cc1plus 1340011 [001] 36028.996008: 2004568 cpu_core/CPU_CLK_UNHALTED.THREAD/: 8c23b4 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
> >>> cc1plus 1340011 [001] 36028.996008: 20113 cpu_core/L1D_PEND_MISS.FB_FULL/: 8c23b4 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
> >>> clang 1340462 [002] 36028.996043: 2007356 cpu_core/CPU_CLK_UNHALTED.THREAD/: ffffffffb43b045d release_pages+0x3dd ([kernel.kallsyms])
> >>> clang 1340462 [002] 36028.996043: 23481 cpu_core/L1D_PEND_MISS.FB_FULL/: ffffffffb43b045d release_pages+0x3dd ([kernel.kallsyms])
> >>> cc1plus 1339622 [003] 36028.996066: 2004148 cpu_core/CPU_CLK_UNHALTED.THREAD/: 760874 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
> >>> cc1plus 1339622 [003] 36028.996066: 31935 cpu_core/L1D_PEND_MISS.FB_FULL/: 760874 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
> >>> as 1340513 [004] 36028.996097: 2005052 cpu_core/CPU_CLK_UNHALTED.THREAD/: ffffffffb4491d65 __count_memcg_events+0x55 ([kernel.kallsyms])
> >>> as 1340513 [004] 36028.996097: 45084 cpu_core/L1D_PEND_MISS.FB_FULL/: ffffffffb4491d65 __count_memcg_events+0x55 ([kernel.kallsyms])
> >>> root@number:~#
> >>>
> >>> root@number:~# perf report --stdio -F +period | head -20
> >>> # To display the perf.data header info, please use --header/--header-only options.
> >>> #
> >>> #
> >>> # Total Lost Samples: 0
> >>> #
> >>> # Samples: 219K of events 'anon group { cpu_core/CPU_CLK_UNHALTED.THREAD/, cpu_core/L1D_PEND_MISS.FB_FULL/ }'
> >>> # Event count (approx.): 216528524863
> >>> #
> >>> # Overhead Period Command Shared Object Symbol
> >>> # ................ .................... ......... ................. ....................................
> >>> #
> >>> 4.01% 1.09% 8538169256 39826572 podman [kernel.kallsyms] [k] native_queued_spin_lock_slowpath
> >>> 1.35% 1.17% 2863376078 42829266 cc1plus cc1plus [.] 0x00000000003f6bcc
> >>> 0.94% 0.78% 1990639149 28408591 cc1plus cc1plus [.] 0x00000000003f6be4
> >>> 0.65% 0.17% 1375916283 6109515 podman [kernel.kallsyms] [k] _raw_spin_lock_irqsave
> >>> 0.61% 0.99% 1304418325 36198834 cc1plus [kernel.kallsyms] [k] get_mem_cgroup_from_mm
> >>> 0.52% 0.42% 1103054030 15427418 cc1plus cc1plus [.] 0x0000000000ca6c69
> >>> 0.51% 0.17% 1094200572 6299289 podman [kernel.kallsyms] [k] psi_group_change
> >>> 0.42% 0.41% 893633315 14778675 cc1plus cc1plus [.] 0x00000000018afafe
> >>> 0.42% 1.29% 887664793 47046952 cc1plus [kernel.kallsyms] [k] asm_exc_page_fault
> >>> root@number:~#
> >>>
> >>> That 'tma_fb_full' metric then would be another column, calculated from
> >>> the sampled components of its metric equation:
> >>>
> >>> root@number:~# perf list tma_fb_full | head
> >>>
> >>> Metric Groups:
> >>>
> >>> MemoryBW: [Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet]
> >>> tma_fb_full
> >>> [This metric does a *rough estimation* of how often L1D Fill Buffer
> >>> unavailability limited additional L1D miss memory access requests to
> >>> proceed]
> >>>
> >>> TopdownL4: [Metrics for top-down breakdown at level 4]
> >>> root@number:~#
> >>>
> >>> This is roughly what we brainstormed, to support metrics in other tools
> >>> than 'perf stat' but we need to check the possibilities and limitations
> >>> of such an idea, hopefully this discussion will help with that,
> >>
> >> Putting metrics next to code in perf report/annotate sounds good to
> >> me, opening all events from a metric as if we want to sample on them
> >> less so.
> >
> > The idea was to record whatever data was asked on record step and
> > provide the list of all metrics that can be calculated out of that data
> > in perf report, e.g. you could record tma_info_thread_ipc but report
> > will suggest both it and tma_info_thread_cpi.
> >
>
> Do you mean that sample all the events in a metrics, and report both
> samples and its metrics calculation result in the report?
> That doesn't work for all the metrics.

IIRC Guilherme was mentioning having extra metrics on report was
something he missed that is available on tools such as VTune, Guilherme?

- Arnaldo

> - For the topdown related metrics, especially on ICL and later
> platforms, the perf metrics feature is used by default. It doesn't
> support sampling.
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/topdown.txt?#n293
> - Some PMUs which doesn't support sampling as well, e.g., uncore, Power,
> MSR.
> - There are some SW events, e.g.,duration_time, you may don't want to do
> sampling
>
> You probable need to introduce a flag to ignore those metrics in perf
> record.
>
> >> We don't have metrics working with `perf stat record`, I
> >> think Kan may have volunteered for that, but it seems like something
> >> more urgent than expanding `perf record`. Presumably the way the
> >> metric would be recorded for that could also benefit this effort.
> >>
> >> If you look at the tma metrics a number of them have a "Sample with".
> >> For example:
> >> ```
> >> $ perf list -v
> >> ...
> >> tma_branch_mispredicts
> >> [This metric represents fraction of slots the CPU has wasted
> >> due to Branch Misprediction.
> >> These slots are either wasted by uops fetched from an
> >> incorrectly speculated program path;
> >> or stalls when the out-of-order part of the machine needs to
> >> recover its state from a
> >> speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES.
> >> Related metrics:
> >> tma_info_bad_spec_branch_misprediction_cost,tma_info_bottleneck_mispredictions,
> >> tma_mispredicts_resteers]
> >> ...
> >> ```
> >> It could be logical for `perf record -M tma_branch_mispredicts ...` to
> >> be translated to `perf record -e BR_MISP_RETIRED.ALL_BRANCHES ...`
> >> rather than to do any form of counting.
> >
> > Thanks for the pointer, I'll see how this could be done.
>
> It sounds more reasonable to me that we can sample some typical events,
> and read the other members in the metrics. So we can put metrics next to
> the code in perf report/annotate as Ian mentioned. It could also address
> limits of some metrics, especially for the topdown related metrics.
> (But I'm not sure if the "Sample with" can give you the right hints. I
> will ask around internally.)
>
> But there is also some limits for the sampling read. Everything has to
> be in a group. That could be a problem for some big metrics.
> Thanks,
> Kan

2024-05-29 20:57:47

by Liang, Kan

[permalink] [raw]

Subject: Re: [PATCH] perf record: add a shortcut for metrics

On 2024-05-29 11:15 a.m., Guilherme Amadio wrote:
> Hi Arnaldo,
>
> On Tue, May 28, 2024 at 08:20:05PM +0200, Arnaldo Carvalho de Melo wrote:
>> On Tue, May 28, 2024 at 11:55:00AM -0400, Liang, Kan wrote:
>>> On 2024-05-28 7:57 a.m., Artem Savkov wrote:
>>>> On Mon, May 27, 2024 at 10:01:37PM -0700, Ian Rogers wrote:
>>>>> On Mon, May 27, 2024 at 10:46 AM Arnaldo Carvalho de Melo
>>>>> <[email protected]> wrote:
>>>>>>
>>>>>> On Mon, May 27, 2024 at 02:28:32PM -0300, Arnaldo Carvalho de Melo wrote:
>>>>>>> On Mon, May 27, 2024 at 02:04:54PM -0300, Arnaldo Carvalho de Melo wrote:
>>>>>>>> On Mon, May 27, 2024 at 02:02:33PM -0300, Arnaldo Carvalho de Melo wrote:
>>>>>>>>> On Mon, May 27, 2024 at 12:15:19PM +0200, Artem Savkov wrote:
>>>>>>>>>> Add -M/--metrics option to perf-record providing a shortcut to record
>>>>>>>>>> metrics and metricgroups. This option mirrors the one in perf-stat.
>>>>>>>
>>>>>>>>>> Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
>>>>>>>>>> Signed-off-by: Artem Savkov <[email protected]>
>>>>>>
>>>>>>> How did you test this?
>>>>>>
>>>>>>> The idea, from my notes, was to be able to have extra columns in 'perf
>>>>>>> report' with things like IPC and other metrics, probably not all metrics
>>>>>>> will apply. We need to find a way to find out which ones are OK for that
>>>>>>> purpose, for instance:
>>>>>>
>>>>>> One that may make sense:
>>>>>>
>>>>>> root@number:~# perf record -M tma_fb_full
>>>>>> ^C[ perf record: Woken up 1 times to write data ]
>>>>>> [ perf record: Captured and wrote 3.846 MB perf.data (21745 samples) ]
>>>>>>
>>>>>> root@number:~# perf evlist
>>>>>> cpu_core/CPU_CLK_UNHALTED.THREAD/
>>>>>> cpu_core/L1D_PEND_MISS.FB_FULL/
>>>>>> dummy:u
>>>>>> root@number:~#
>>>>>>
>>>>>> But then we need to read both to do the math, maybe something like:
>>>>>>
>>>>>> root@number:~# perf record -e '{cpu_core/CPU_CLK_UNHALTED.THREAD/,cpu_core/L1D_PEND_MISS.FB_FULL/}:S'
>>>>>> ^C[ perf record: Woken up 40 times to write data ]
>>>>>> [ perf record: Captured and wrote 14.640 MB perf.data (219990 samples) ]
>>>>>>
>>>>>> root@number:~# perf script | head
>>>>>> cc1plus 1339704 [000] 36028.995981: 2011389 cpu_core/CPU_CLK_UNHALTED.THREAD/: 1097303 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>>>>> cc1plus 1339704 [000] 36028.995981: 26231 cpu_core/L1D_PEND_MISS.FB_FULL/: 1097303 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>>>>> cc1plus 1340011 [001] 36028.996008: 2004568 cpu_core/CPU_CLK_UNHALTED.THREAD/: 8c23b4 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>>>>> cc1plus 1340011 [001] 36028.996008: 20113 cpu_core/L1D_PEND_MISS.FB_FULL/: 8c23b4 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>>>>> clang 1340462 [002] 36028.996043: 2007356 cpu_core/CPU_CLK_UNHALTED.THREAD/: ffffffffb43b045d release_pages+0x3dd ([kernel.kallsyms])
>>>>>> clang 1340462 [002] 36028.996043: 23481 cpu_core/L1D_PEND_MISS.FB_FULL/: ffffffffb43b045d release_pages+0x3dd ([kernel.kallsyms])
>>>>>> cc1plus 1339622 [003] 36028.996066: 2004148 cpu_core/CPU_CLK_UNHALTED.THREAD/: 760874 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>>>>> cc1plus 1339622 [003] 36028.996066: 31935 cpu_core/L1D_PEND_MISS.FB_FULL/: 760874 [unknown] (/usr/libexec/gcc/x86_64-pc-linux-gnu/13/cc1plus)
>>>>>> as 1340513 [004] 36028.996097: 2005052 cpu_core/CPU_CLK_UNHALTED.THREAD/: ffffffffb4491d65 __count_memcg_events+0x55 ([kernel.kallsyms])
>>>>>> as 1340513 [004] 36028.996097: 45084 cpu_core/L1D_PEND_MISS.FB_FULL/: ffffffffb4491d65 __count_memcg_events+0x55 ([kernel.kallsyms])
>>>>>> root@number:~#
>>>>>>
>>>>>> root@number:~# perf report --stdio -F +period | head -20
>>>>>> # To display the perf.data header info, please use --header/--header-only options.
>>>>>> #
>>>>>> #
>>>>>> # Total Lost Samples: 0
>>>>>> #
>>>>>> # Samples: 219K of events 'anon group { cpu_core/CPU_CLK_UNHALTED.THREAD/, cpu_core/L1D_PEND_MISS.FB_FULL/ }'
>>>>>> # Event count (approx.): 216528524863
>>>>>> #
>>>>>> # Overhead Period Command Shared Object Symbol
>>>>>> # ................ .................... ......... ................. ....................................
>>>>>> #
>>>>>> 4.01% 1.09% 8538169256 39826572 podman [kernel.kallsyms] [k] native_queued_spin_lock_slowpath
>>>>>> 1.35% 1.17% 2863376078 42829266 cc1plus cc1plus [.] 0x00000000003f6bcc
>>>>>> 0.94% 0.78% 1990639149 28408591 cc1plus cc1plus [.] 0x00000000003f6be4
>>>>>> 0.65% 0.17% 1375916283 6109515 podman [kernel.kallsyms] [k] _raw_spin_lock_irqsave
>>>>>> 0.61% 0.99% 1304418325 36198834 cc1plus [kernel.kallsyms] [k] get_mem_cgroup_from_mm
>>>>>> 0.52% 0.42% 1103054030 15427418 cc1plus cc1plus [.] 0x0000000000ca6c69
>>>>>> 0.51% 0.17% 1094200572 6299289 podman [kernel.kallsyms] [k] psi_group_change
>>>>>> 0.42% 0.41% 893633315 14778675 cc1plus cc1plus [.] 0x00000000018afafe
>>>>>> 0.42% 1.29% 887664793 47046952 cc1plus [kernel.kallsyms] [k] asm_exc_page_fault
>>>>>> root@number:~#
>>>>>>
>>>>>> That 'tma_fb_full' metric then would be another column, calculated from
>>>>>> the sampled components of its metric equation:
>>>>>>
>>>>>> root@number:~# perf list tma_fb_full | head
>>>>>>
>>>>>> Metric Groups:
>>>>>>
>>>>>> MemoryBW: [Grouping from Top-down Microarchitecture Analysis Metrics spreadsheet]
>>>>>> tma_fb_full
>>>>>> [This metric does a *rough estimation* of how often L1D Fill Buffer
>>>>>> unavailability limited additional L1D miss memory access requests to
>>>>>> proceed]
>>>>>>
>>>>>> TopdownL4: [Metrics for top-down breakdown at level 4]
>>>>>> root@number:~#
>>>>>>
>>>>>> This is roughly what we brainstormed, to support metrics in other tools
>>>>>> than 'perf stat' but we need to check the possibilities and limitations
>>>>>> of such an idea, hopefully this discussion will help with that,
>>>>>
>>>>> Putting metrics next to code in perf report/annotate sounds good to
>>>>> me, opening all events from a metric as if we want to sample on them
>>>>> less so.
>>>>
>>>> The idea was to record whatever data was asked on record step and
>>>> provide the list of all metrics that can be calculated out of that data
>>>> in perf report, e.g. you could record tma_info_thread_ipc but report
>>>> will suggest both it and tma_info_thread_cpi.
>>>>
>>>
>>> Do you mean that sample all the events in a metrics, and report both
>>> samples and its metrics calculation result in the report?
>>> That doesn't work for all the metrics.
>>
>> IIRC Guilherme was mentioning having extra metrics on report was
>> something he missed that is available on tools such as VTune, Guilherme?
>
> Thanks for asking. I will try to explain the motivation behind metric
> sampling. VTune offers something called a Microarchitecture Analysis
> report, which will show a break down of all the TMA metrics per symbol:
>
> https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/top-down-microarchitecture-analysis-method.html
>
> The link above has a small screenshot showing function, instructions,
> CPI, and the metrics. This is much better than just counting, because in
> a large detector simulation, for example, there are many different kinds
> of bottlenecks the code can have, and the break down per symbol helps to
> identify which functions suffer from bad speculation, which suffer from
> cache misses, etc. This allows one to choose what kind of change to make
> to the software to optimize it. So as a first step, having TMA level 0
> (i.e. a breakdown of the pipelines for Front-End Bound, Bad Speculation,
> Core Bound, and Memory Bound) would already be quite far towards the
> goal of understanding bottlenecks in specific parts of the code. VTune
> forces sampling without collecting call stacks for this, perf could do
> the same. Hotspots usually have lots of samples, which then allows
> computing metrics relatively accurately.

Yes, that's the assumption the VTune method relies on. Otherwise, the
result may be dubious.

> VTune uses multiplexing and
> very large sampling expression, which I am pasting at the end of this
> message². I extracted that command from the report file after using
> "vtune -collect uarch-exploration <command>" to produce a report. I
> tried that with standard perf record and it failed to parse, so likely
> amplxe-perf is required to be able to record that, but I thought it'd
> be useful information.

Actually, there is already similar support in perf script, which
was provided by Andi.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4bd1bef8bba2f99ff472ae3617864dda301f81bd

It should be possible to extend it to other tools, e.g., annotate or report.

But it seems the feature is broken now. It's better to fix it first.
$ sudo perf script -I -F cpu,ip,sym,event,period,metric
Segmentation fault

The solution relies on the sample read feature. So you probably have to
divide the metric into several groups if the metric is too big.
For the leading event, the ref-cycles suggested in Andi's example should
be a good default choice; after all, you want to measure time.

For example, "perf record -M tma_info_thread_ipc"
may be interpreted as
perf record -e "{ref-cycles,INST_RETIRED.ANY,CPU_CLK_UNHALTED.THREAD}:S"

The implementation should be simpler than the VTune method.

>
> As for the interface, I suggest adding a "perf mlist" similar to
> perf evlist, that would just print what metrics could be calculated
> from the events recorded in the input file. Then one could be selected
> for use with perf report or perf annotate.
>

The "perf mlist" looks good, since the metrics are used more widely.

Thanks,
Kan

> I hope this explains enough to clarify things for you. I am attaching a
> screenshot example for the classic matrix multiplication with wrong
> indexing as well, which shows that only certain lines get the metric,
> whereas lines with low number of samples just get 0.0%.
>
> Best regards,
> -Guilherme
>
>>> - For the topdown related metrics, especially on ICL and later
>>> platforms, the perf metrics feature is used by default. It doesn't
>>> support sampling.
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/topdown.txt?#n293
>>> - Some PMUs which doesn't support sampling as well, e.g., uncore, Power,
>>> MSR.
>>> - There are some SW events, e.g.,duration_time, you may don't want to do
>>> sampling
>>>
>>> You probable need to introduce a flag to ignore those metrics in perf
>>> record.
>>>
>>>>> We don't have metrics working with `perf stat record`, I
>>>>> think Kan may have volunteered for that, but it seems like something
>>>>> more urgent than expanding `perf record`. Presumably the way the
>>>>> metric would be recorded for that could also benefit this effort.
>>>>>
>>>>> If you look at the tma metrics a number of them have a "Sample with".
>>>>> For example:
>>>>> ```
>>>>> $ perf list -v
>>>>> ...
>>>>> tma_branch_mispredicts
>>>>> [This metric represents fraction of slots the CPU has wasted
>>>>> due to Branch Misprediction.
>>>>> These slots are either wasted by uops fetched from an
>>>>> incorrectly speculated program path;
>>>>> or stalls when the out-of-order part of the machine needs to
>>>>> recover its state from a
>>>>> speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES.
>>>>> Related metrics:
>>>>> tma_info_bad_spec_branch_misprediction_cost,tma_info_bottleneck_mispredictions,
>>>>> tma_mispredicts_resteers]
>>>>> ...
>>>>> ```
>>>>> It could be logical for `perf record -M tma_branch_mispredicts ...` to
>>>>> be translated to `perf record -e BR_MISP_RETIRED.ALL_BRANCHES ...`
>>>>> rather than to do any form of counting.
>>>>
>>>> Thanks for the pointer, I'll see how this could be done.
>>>
>>> It sounds more reasonable to me that we can sample some typical events,
>>> and read the other members in the metrics. So we can put metrics next to
>>> the code in perf report/annotate as Ian mentioned. It could also address
>>> limits of some metrics, especially for the topdown related metrics.
>>> (But I'm not sure if the "Sample with" can give you the right hints. I
>>> will ask around internally.)
>>>
>>> But there is also some limits for the sampling read. Everything has to
>>> be in a group. That could be a problem for some big metrics.
>>> Thanks,
>>> Kan
>
> 2. runCmd: amplxe-perf record -v --control=fd:21,24 -o system-wide.perf -N -B -T --sample-cpu -d -a --compression-level=1 --threads --clockid=CLOCK_MONOTONIC_RAW -e cpu/period=0xa037a0,event=0x3c,name='CPU_CLK_UNHALTED.THREAD'/Duk,cpu/period=0xa037a0,umask=0x3,name='CPU_CLK_UNHALTED.REF_TSC'/Duk,cpu/period=0xa037a0,event=0xc0,name='INST_RETIRED.ANY'/Duk,cpu/period=0x7a12f,event=0x3c,umask=0x1,name='CPU_CLK_UNHALTED.REF_XCLK'/uk,cpu/period=0x7a12f,event=0x3c,umask=0x2,name='CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE'/uk,cpu/period=0x98968f,event=0x3c,name='CPU_CLK_UNHALTED.THREAD_P'/uk,cpu/period=0x98968f,event=0xa3,umask=0x14,cmask=0x14,name='CYCLE_ACTIVITY.STALLS_MEM_ANY'/uk,cpu/period=0x98968f,event=0xa3,umask=0x4,cmask=0x4,name='CYCLE_ACTIVITY.STALLS_TOTAL'/uk,cpu/period=0x98968f,event=0xa6,umask=0x2,name='EXE_ACTIVITY.1_PORTS_UTIL'/uk,cpu/period=0x98968f,event=0xa6,umask=0x4,name='EXE_ACTIVITY.2_PORTS_UTIL'/uk,cpu/period=0x98968f,event=0xa6,umask=0x40,name='EXE_ACTIVITY.BOUND_ON_STORES'/uk,cpu/period=0x7a143,event=0xc6,umask=0x1,frontend=0x400406,name='FRONTEND_RETIRED.LATENCY_GE_4_PS'/ukpp,cpu/period=0x98968f,event=0x9c,umask=0x1,name='IDQ_UOPS_NOT_DELIVERED.CORE'/uk,cpu/period=0x98968f,event=0xd,umask=0x1,name='INT_MISC.RECOVERY_CYCLES'/uk,cpu/period=0x98968f,event=0xe,umask=0x1,name='UOPS_ISSUED.ANY'/uk,cpu/period=0x98968f,event=0xc2,umask=0x2,name='UOPS_RETIRED.RETIRE_SLOTS'/uk,cpu/period=0x7a12f,event=0xe6,umask=0x1,name='BACLEARS.ANY'/uk,cpu/period=0x1e84ad,event=0xc5,name='BR_MISP_RETIRED.ALL_BRANCHES'/uk,cpu/period=0x98968f,event=0xab,umask=0x2,name='DSB2MITE_SWITCHES.PENALTY_CYCLES'/uk,cpu/period=0x7a143,event=0xc6,umask=0x1,frontend=0x1,name='FRONTEND_RETIRED.ANY_DSB_MISS'/uk,cpu/period=0x7a143,event=0xc6,umask=0x1,frontend=0x11,name='FRONTEND_RETIRED.DSB_MISS_PS'/ukpp,cpu/period=0x7a143,event=0xc6,umask=0x1,frontend=0x13,name='FRONTEND_RETIRED.L2_MISS_PS'/ukpp,cpu/period=0x7a143,event=0xc6,umask=0x1,frontend=0x401006,name='FRONTEND_RETIRED.LATENCY_GE_16_PS'
/ukpp,cpu/period=0x7a143,event=0xc6,umask=0x1,frontend=0x100206,name='FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS'/ukpp,cpu/period=0x7a143,event=0xc6,umask=0x1,frontend=0x15,name='FRONTEND_RETIRED.STLB_MISS_PS'/ukpp,cpu/period=0x98968f,event=0x80,umask=0x4,name='ICACHE_16B.IFDATA_STALL'/uk,cpu/period=0x98968f,event=0x80,edge=0x1,umask=0x4,cmask=0x1,name='ICACHE_16B.IFDATA_STALL:cmask=1:e=yes'/uk,cpu/period=0xf424f,event=0x83,umask=0x4,name='ICACHE_64B.IFTAG_STALL'/uk,cpu/period=0x98968f,event=0x79,umask=0x18,cmask=0x4,name='IDQ.ALL_DSB_CYCLES_4_UOPS'/uk,cpu/period=0x98968f,event=0x79,umask=0x18,cmask=0x1,name='IDQ.ALL_DSB_CYCLES_ANY_UOPS'/uk,cpu/period=0x98968f,event=0x79,umask=0x24,cmask=0x4,name='IDQ.ALL_MITE_CYCLES_4_UOPS'/uk,cpu/period=0x98968f,event=0x79,umask=0x24,cmask=0x1,name='IDQ.ALL_MITE_CYCLES_ANY_UOPS'/uk,cpu/period=0x98968f,event=0x79,umask=0x8,name='IDQ.DSB_UOPS'/uk,cpu/period=0x98968f,event=0x79,umask=0x4,name='IDQ.MITE_UOPS'/uk,cpu/period=0x98968f,event=0x79,edge=0x1,umask=0x30,cmask=0x1,name='IDQ.MS_SWITCHES'/uk,cpu/period=0x98968f,event=0x79,umask=0x30,name='IDQ.MS_UOPS'/uk,cpu/period=0x98968f,event=0x9c,umask=0x1,cmask=0x4,name='IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE'/uk,cpu/period=0x98968f,event=0x87,umask=0x1,name='ILD_STALL.LCP'/uk,cpu/period=0x98968f,event=0x55,umask=0x1,cmask=0x1,name='INST_DECODED.DECODERS:cmask=1'/uk,cpu/period=0x98968f,event=0x55,umask=0x1,cmask=0x2,name='INST_DECODED.DECODERS:cmask=2'/uk,cpu/period=0x98968f,event=0xd,umask=0x80,name='INT_MISC.CLEAR_RESTEER_CYCLES'/uk,cpu/period=0x7a12f,event=0xc3,edge=0x1,umask=0x1,cmask=0x1,name='MACHINE_CLEARS.COUNT'/uk,cpu/period=0x1e84ad,event=0xc5,umask=0x4,name='BR_MISP_RETIRED.ALL_BRANCHES_PS'/ukpp,cpu/period=0x98968f,event=0xa3,umask=0x8,cmask=0x8,name='CYCLE_ACTIVITY.CYCLES_L1D_MISS'/uk,cpu/period=0x98968f,event=0xa3,umask=0x10,cmask=0x10,name='CYCLE_ACTIVITY.CYCLES_MEM_ANY'/uk,cpu/period=0x98968f,event=0xa3,umask=0xc,cmask=0xc,name='CYCLE_ACTIVITY.STALLS_L1D_MISS'/uk
,cpu/period=0x98968f,event=0xa3,umask=0x5,cmask=0x5,name='CYCLE_ACTIVITY.STALLS_L2_MISS'/uk,cpu/period=0x98968f,event=0xa3,umask=0x6,cmask=0x6,name='CYCLE_ACTIVITY.STALLS_L3_MISS'/uk,cpu/period=0x98968f,event=0x8,umask=0x20,cmask=0x1,name='DTLB_LOAD_MISSES.STLB_HIT:cmask=1'/uk,cpu/period=0x7a12f,event=0x8,umask=0x10,cmask=0x1,name='DTLB_LOAD_MISSES.WALK_ACTIVE'/uk,cpu/period=0x7a12f,event=0x49,umask=0x20,cmask=0x1,name='DTLB_STORE_MISSES.STLB_HIT:cmask=1'/uk,cpu/period=0x7a12f,event=0x49,umask=0x10,cmask=0x1,name='DTLB_STORE_MISSES.WALK_ACTIVE'/uk,cpu/period=0x98968f,event=0x48,umask=0x2,cmask=0x1,name='L1D_PEND_MISS.FB_FULL:cmask=1'/uk,cpu/period=0x98968f,event=0x48,umask=0x1,name='L1D_PEND_MISS.PENDING'/uk,cpu/period=0xf424f,event=0x24,umask=0xe2,name='L2_RQSTS.ALL_RFO'/uk,cpu/period=0xf424f,event=0x24,umask=0xc2,name='L2_RQSTS.RFO_HIT'/uk,cpu/period=0x7a12f,event=0x3,umask=0x8,name='LD_BLOCKS.NO_SR'/uk,cpu/period=0x7a12f,event=0x3,umask=0x2,name='LD_BLOCKS.STORE_FORWARD'/uk,cpu/period=0x7a12f,event=0x7,umask=0x1,name='LD_BLOCKS_PARTIAL.ADDRESS_ALIAS'/uk,cpu/period=0x98968f,event=0xd0,umask=0x82,name='MEM_INST_RETIRED.ALL_STORES_PS'/ukpp,cpu/period=0x7a143,event=0xd0,umask=0x21,name='MEM_INST_RETIRED.LOCK_LOADS_PS'/ukpp,cpu/period=0x7a12f,event=0xd0,umask=0x41,name='MEM_INST_RETIRED.SPLIT_LOADS_PS'/ukpp,cpu/period=0x7a12f,event=0xd0,umask=0x42,name='MEM_INST_RETIRED.SPLIT_STORES_PS'/ukpp,cpu/period=0x7a12f,event=0xd0,umask=0x11,name='MEM_INST_RETIRED.STLB_MISS_LOADS_PS'/ukpp,cpu/period=0x7a12f,event=0xd0,umask=0x12,name='MEM_INST_RETIRED.STLB_MISS_STORES_PS'/ukpp,cpu/period=0x186d7,event=0xd2,umask=0x4,name='MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS'/ukpp,cpu/period=0x186d7,event=0xd2,umask=0x2,name='MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS'/ukpp,cpu/period=0x186d7,event=0xd2,umask=0x1,name='MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS'/ukpp,cpu/period=0x7a143,event=0xd3,umask=0x1,name='MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS'/ukpp,cpu/period=0x7a143,event=0xd3,umask=0x2,name='MEM_L
OAD_L3_MISS_RETIRED.REMOTE_DRAM_PS'/ukpp,cpu/period=0x7a143,event=0xd3,umask=0x8,name='MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD'/uk,cpu/period=0x7a143,event=0xd3,umask=0x4,name='MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM_PS'/ukpp,cpu/period=0x7a143,event=0xd3,umask=0x10,name='MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM_PS'/ukpp,cpu/period=0x7a12f,event=0xd1,umask=0x40,name='MEM_LOAD_RETIRED.FB_HIT_PS'/ukpp,cpu/period=0x98968f,event=0xd1,umask=0x1,name='MEM_LOAD_RETIRED.L1_HIT_PS'/ukpp,cpu/period=0x7a12f,event=0xd1,umask=0x8,name='MEM_LOAD_RETIRED.L1_MISS_PS'/ukpp,cpu/period=0x7a12f,event=0xd1,umask=0x2,name='MEM_LOAD_RETIRED.L2_HIT_PS'/ukpp,cpu/period=0x3d0f9,event=0xd1,umask=0x4,name='MEM_LOAD_RETIRED.L3_HIT_PS'/ukpp,cpu/period=0x7a143,event=0xd1,umask=0x20,name='MEM_LOAD_RETIRED.L3_MISS_PS'/ukpp,cpu/period=0x7a143,event=0xd1,umask=0x80,name='MEM_LOAD_RETIRED.LOCAL_PMM_PS'/ukpp,cpu/period=0x98968f,event=0xb2,umask=0x1,name='OFFCORE_REQUESTS_BUFFER.SQ_FULL'/uk,cpu/period=0x98968f,event=0x60,umask=0x8,cmask=0x4,name='OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD:cmask=4'/uk,cpu/period=0x98968f,event=0x60,umask=0x8,cmask=0x1,name='OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD'/uk,cpu/period=0x98968f,event=0x60,umask=0x4,cmask=0x1,name='OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO'/uk,cpu/period=0x98968f,event=0x14,umask=0x1,cmask=0x1,name='ARITH.DIVIDER_ACTIVE'/uk,cpu/period=0x98968f,event=0xa6,umask=0x1,name='EXE_ACTIVITY.EXE_BOUND_0_PORTS'/uk,cpu/period=0x98968f,event=0xc7,name='FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:umask=0xfc'/uk,cpu/period=0x98968f,event=0xc7,name='FP_ARITH_INST_RETIRED.SCALAR_SINGLE:umask=0x03'/uk,cpu/period=0x98968f,event=0x59,umask=0x1,name='PARTIAL_RAT_STALLS.SCOREBOARD'/uk,cpu/period=0x98968f,event=0xc0,umask=0x1,name='INST_RETIRED.PREC_DIST'/ukpp,cpu/period=0x98968f,event=0xcc,umask=0x40,name='ROB_MISC_EVENTS.PAUSE_INST'/uk,cpu/period=0x98968f,event=0xa1,umask=0x1,name='UOPS_DISPATCHED_PORT.PORT_0'/uk,cpu/period=0x98968f,event=0xa1,umask=0x2,name='UOPS
_DISPATCHED_PORT.PORT_1'/uk,cpu/period=0x98968f,event=0xa1,umask=0x4,name='UOPS_DISPATCHED_PORT.PORT_2'/uk,cpu/period=0x98968f,event=0xa1,umask=0x8,name='UOPS_DISPATCHED_PORT.PORT_3'/uk,cpu/period=0x98968f,event=0xa1,umask=0x10,name='UOPS_DISPATCHED_PORT.PORT_4'/uk,cpu/period=0x98968f,event=0xa1,umask=0x20,name='UOPS_DISPATCHED_PORT.PORT_5'/uk,cpu/period=0x98968f,event=0xa1,umask=0x40,name='UOPS_DISPATCHED_PORT.PORT_6'/uk,cpu/period=0x98968f,event=0xa1,umask=0x80,name='UOPS_DISPATCHED_PORT.PORT_7'/uk,cpu/period=0x98968f,event=0xb1,umask=0x2,cmask=0x1,name='UOPS_EXECUTED.CORE_CYCLES_GE_1'/uk,cpu/period=0x98968f,event=0xb1,umask=0x2,cmask=0x2,name='UOPS_EXECUTED.CORE_CYCLES_GE_2'/uk,cpu/period=0x98968f,event=0xb1,umask=0x2,cmask=0x3,name='UOPS_EXECUTED.CORE_CYCLES_GE_3'/uk,cpu/period=0x98968f,event=0xb1,inv=0x1,umask=0x2,cmask=0x1,name='UOPS_EXECUTED.CORE_CYCLES_NONE'/uk,cpu/period=0x98968f,event=0xb1,umask=0x1,name='UOPS_EXECUTED.THREAD'/uk,cpu/period=0x98968f,event=0xb1,umask=0x10,name='UOPS_EXECUTED.X87'/uk,cpu/period=0x98968f,event=0xe,umask=0x2,name='UOPS_ISSUED.VECTOR_WIDTH_MISMATCH'/uk,cpu/period=0x98968f,event=0xc2,umask=0x4,name='UOPS_RETIRED.MACRO_FUSED'/uk,cpu/period=0x1e84ad,event=0xc4,name='BR_INST_RETIRED.ALL_BRANCHES'/uk,cpu/period=0x98968f,event=0xc7,umask=0x4,name='FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE'/uk,cpu/period=0x98968f,event=0xc7,umask=0x8,name='FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE'/uk,cpu/period=0x98968f,event=0xc7,umask=0x10,name='FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE'/uk,cpu/period=0x98968f,event=0xc7,umask=0x20,name='FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE'/uk,cpu/period=0x98968f,event=0xc7,umask=0x40,name='FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE'/uk,cpu/period=0x98968f,event=0xc7,umask=0x80,name='FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE'/uk,cpu/period=0x7a12f,event=0xca,umask=0x1e,cmask=0x1,name='FP_ASSIST.ANY'/uk,cpu/period=0x98968f,event=0xc0,umask=0x2,name='INST_RETIRED.NOP'/uk,cpu/period=0x98968f,event=0xd0,umask=0x83,name=
'MEM_INST_RETIRED.ANY'/uk,cpu/period=0x7a12f,event=0xc1,umask=0x3f,name='OTHER_ASSISTS.ANY'/uk,cpu/period=0x7a12f,event=0xb7,offcore_rsp=0x8003c0001,umask=0x1,name='OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD'/uk,cpu/period=0x7a12f,event=0xbb,offcore_rsp=0x10003c0002,umask=0x1,name='OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'/uk,cpu/period=0x7a12f,event=0xb7,offcore_rsp=0x103fc00020,umask=0x1,name='OCR.PF_L2_RFO.L3_MISS.REMOTE_HITM'/uk,cpu/period=0x7a12f,event=0xbb,offcore_rsp=0x10003c0001,umask=0x1,name='OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE'/uk,cpu/period=0x98968f,event=0xc7,umask=0x2,name='FP_ARITH_INST_RETIRED.SCALAR_SINGLE'/uk,cpu/period=0x98968f,event=0xc7,umask=0x1,name='FP_ARITH_INST_RETIRED.SCALAR_DOUBLE'/uk,cpu/period=0x7a12f,event=0xb7,offcore_rsp=0x103fc00002,umask=0x1,name='OCR.DEMAND_RFO.L3_MISS.REMOTE_HITM'/uk,cpu/period=0x7a12f,event=0xbb,offcore_rsp=0x10003c0020,umask=0x1,name='OCR.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE'/uk amplxe-perf-sync sync sys