2024-04-28 05:36:30

by Ian Rogers

Subject: [RFC PATCH v2 0/3] Retirement latency perf stat support

Support 'R' as a retirement latency modifier on events. When present,
the evsel will fork perf record and perf report commands, parsing the
perf report output as the count value. The intent is to do something
similar to Weilin's series:
https://lore.kernel.org/lkml/[email protected]/

While 'R' and retirement latency are Intel-specific, in the future I
can imagine more evsel-like commands that require child processes. We
can make the logic more generic at that point.

The code is untested on hardware that supports retirement latency and
with metrics that have retirement latency events in them. The forked
perf record is also just recording a sleep workload, and various
things need tweaking, but I think this version is good enough for
people to give input.

The first patch refactors tool events. I came across various issues
while looking into this, and that patch can likely just be picked up
first. I kept it in the series for cleanliness' sake.

The code has benefitted greatly from Weilin's work and Namhyung's
great review input.

v2. Better refactor tool events to support interval mode and more.
Switch "weight1" to "retire_lat" in perf report command.
Ensure max precision 'P' is enabled on the perf record command.
Kill the perf record at read but keep sleep 0.1.
Fix issues around perf record of tool and retirement latency events.

Ian Rogers (3):
perf evsel: Refactor tool events
perf parse-events: Add a retirement latency modifier
perf evsel: Add retirement latency event support

tools/perf/builtin-stat.c | 74 +++---
tools/perf/util/evsel.c | 409 ++++++++++++++++++++++++++++++++-
tools/perf/util/evsel.h | 10 +
tools/perf/util/parse-events.c | 2 +
tools/perf/util/parse-events.h | 1 +
tools/perf/util/parse-events.l | 3 +-
6 files changed, 453 insertions(+), 46 deletions(-)

--
2.44.0.769.g3c40516874-goog



2024-04-28 05:36:42

by Ian Rogers

Subject: [RFC PATCH v2 1/3] perf evsel: Refactor tool events

Tool events unnecessarily open a dummy perf event that is useless even
with `perf record`, which will still open its own dummy event. Change
the behavior of tool events so that:

- duration_time - call `rdclock` on open and then report the count as
a delta since the start in evsel__read_counter. This moves code out
of builtin-stat, making it more general purpose.

- user_time/system_time - open the fd as either `/proc/pid/stat` or
`/proc/stat` for cases like system-wide. evsel__read_counter will
read the appropriate field out of the procfs file (see the sketch of
the field layout below). These values were previously supplied by
wait4; if the procfs read fails then the wait4 values are used,
assuming the process/thread terminated. By reading user_time and
system_time this way, interval mode can be supported.
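
For reference, the procfs layouts being parsed look like this (field
positions per proc(5), values elided):

  /proc/stat:       cpu0 user nice system ...      <- fields 1 and 3
  /proc/<pid>/stat: pid (comm) state ppid ... utime stime ...
                                                   <- fields 14 and 15

Both are in clock ticks and are converted to nanoseconds using
sysconf(_SC_CLK_TCK).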

Opening any of the tool events for `perf record` returns an error
(-EINVAL).

Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/builtin-stat.c | 74 ++++++-------
tools/perf/util/evsel.c | 223 +++++++++++++++++++++++++++++++++++++-
tools/perf/util/evsel.h | 6 +
3 files changed, 259 insertions(+), 44 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 65a3dd7ffac3..428e9721b908 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -255,45 +255,37 @@ static int evsel__write_stat_event(struct evsel *counter, int cpu_map_idx, u32 t
process_synthesized_event, NULL);
}

-static int read_single_counter(struct evsel *counter, int cpu_map_idx,
- int thread, struct timespec *rs)
-{
- switch(counter->tool_event) {
- case PERF_TOOL_DURATION_TIME: {
- u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL;
- struct perf_counts_values *count =
- perf_counts(counter->counts, cpu_map_idx, thread);
- count->ena = count->run = val;
- count->val = val;
- return 0;
- }
- case PERF_TOOL_USER_TIME:
- case PERF_TOOL_SYSTEM_TIME: {
- u64 val;
- struct perf_counts_values *count =
- perf_counts(counter->counts, cpu_map_idx, thread);
- if (counter->tool_event == PERF_TOOL_USER_TIME)
- val = ru_stats.ru_utime_usec_stat.mean;
- else
- val = ru_stats.ru_stime_usec_stat.mean;
- count->ena = count->run = val;
- count->val = val;
- return 0;
- }
- default:
- case PERF_TOOL_NONE:
- return evsel__read_counter(counter, cpu_map_idx, thread);
- case PERF_TOOL_MAX:
- /* This should never be reached */
- return 0;
+static int read_single_counter(struct evsel *counter, int cpu_map_idx, int thread)
+{
+ int err = evsel__read_counter(counter, cpu_map_idx, thread);
+
+ /*
+ * Reading user and system time will fail when the process
+ * terminates. Use the wait4 values in that case.
+ */
+ if (err &&
+ (counter->tool_event == PERF_TOOL_USER_TIME ||
+ counter->tool_event == PERF_TOOL_SYSTEM_TIME)) {
+ u64 val;
+ struct perf_counts_values *count =
+ perf_counts(counter->counts, cpu_map_idx, thread);
+
+ if (counter->tool_event == PERF_TOOL_USER_TIME)
+ val = ru_stats.ru_utime_usec_stat.mean;
+ else
+ val = ru_stats.ru_stime_usec_stat.mean;
+ count->ena = count->run = val;
+ count->val = val;
+ return 0;
}
+ return err;
}

/*
* Read out the results of a single counter:
* do not aggregate counts across CPUs in system-wide mode
*/
-static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_map_idx)
+static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
{
int nthreads = perf_thread_map__nr(evsel_list->core.threads);
int thread;
@@ -311,7 +303,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
* (via evsel__read_counter()) and sets their count->loaded.
*/
if (!perf_counts__is_loaded(counter->counts, cpu_map_idx, thread) &&
- read_single_counter(counter, cpu_map_idx, thread, rs)) {
+ read_single_counter(counter, cpu_map_idx, thread)) {
counter->counts->scaled = -1;
perf_counts(counter->counts, cpu_map_idx, thread)->ena = 0;
perf_counts(counter->counts, cpu_map_idx, thread)->run = 0;
@@ -340,7 +332,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
return 0;
}

-static int read_affinity_counters(struct timespec *rs)
+static int read_affinity_counters(void)
{
struct evlist_cpu_iterator evlist_cpu_itr;
struct affinity saved_affinity, *affinity;
@@ -361,10 +353,8 @@ static int read_affinity_counters(struct timespec *rs)
if (evsel__is_bpf(counter))
continue;

- if (!counter->err) {
- counter->err = read_counter_cpu(counter, rs,
- evlist_cpu_itr.cpu_map_idx);
- }
+ if (!counter->err)
+ counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
}
if (affinity)
affinity__cleanup(&saved_affinity);
@@ -388,11 +378,11 @@ static int read_bpf_map_counters(void)
return 0;
}

-static int read_counters(struct timespec *rs)
+static int read_counters(void)
{
if (!stat_config.stop_read_counter) {
if (read_bpf_map_counters() ||
- read_affinity_counters(rs))
+ read_affinity_counters())
return -1;
}
return 0;
@@ -423,7 +413,7 @@ static void process_interval(void)

evlist__reset_aggr_stats(evsel_list);

- if (read_counters(&rs) == 0)
+ if (read_counters() == 0)
process_counters();

if (STAT_RECORD) {
@@ -911,7 +901,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
* avoid arbitrary skew, we must read all counters before closing any
* group leaders.
*/
- if (read_counters(&(struct timespec) { .tv_nsec = t1-t0 }) == 0)
+ if (read_counters() == 0)
process_counters();

/*
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 3536404e9447..a0a8aee7d6b9 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -10,6 +10,7 @@
#include <errno.h>
#include <inttypes.h>
#include <linux/bitops.h>
+#include <api/io.h>
#include <api/fs/fs.h>
#include <api/fs/tracing_path.h>
#include <linux/hw_breakpoint.h>
@@ -30,6 +31,7 @@
#include "counts.h"
#include "event.h"
#include "evsel.h"
+#include "time-utils.h"
#include "util/env.h"
#include "util/evsel_config.h"
#include "util/evsel_fprintf.h"
@@ -1600,11 +1602,183 @@ static int evsel__read_group(struct evsel *leader, int cpu_map_idx, int thread)
return evsel__process_group_data(leader, cpu_map_idx, thread, data);
}

+static int read_stat_field(int fd, struct perf_cpu cpu, int field, __u64 *val)
+{
+ char buf[256];
+ struct io io;
+ int c, i;
+
+ io__init(&io, fd, buf, sizeof(buf));
+
+ /* Skip lines to relevant CPU. */
+ for (i = -1; i < cpu.cpu; i++) {
+ do {
+ c = io__get_char(&io);
+ if (c == -1)
+ return -EINVAL;
+ } while (c != '\n');
+ }
+ /* Skip to "cpu". */
+ c = io__get_char(&io);
+ if (c != 'c')
+ return -EINVAL;
+ c = io__get_char(&io);
+ if (c != 'p')
+ return -EINVAL;
+ c = io__get_char(&io);
+ if (c != 'u')
+ return -EINVAL;
+ /* Skip N of cpuN. */
+ do {
+ c = io__get_char(&io);
+ if (c == -1)
+ return -EINVAL;
+ } while (c != ' ');
+
+ i = 1;
+ while (true) {
+ c = io__get_dec(&io, val);
+ if (c != ' ')
+ break;
+ if (field == i)
+ return 0;
+ i++;
+ }
+ return -EINVAL;
+}
+
+static int read_pid_stat_field(int fd, int field, __u64 *val)
+{
+ char buf[256];
+ struct io io;
+ int c, i;
+
+ io__init(&io, fd, buf, sizeof(buf));
+ c = io__get_dec(&io, val);
+ if (c != ' ')
+ return -EINVAL;
+ if (field == 1)
+ return 0;
+
+ /* Skip comm. */
+ c = io__get_char(&io);
+ if (c != '(')
+ return -EINVAL;
+ do {
+ c = io__get_char(&io);
+ if (c == -1)
+ return -EINVAL;
+ } while (c != ')');
+ if (field == 2)
+ return -EINVAL;
+
+ /* Skip state */
+ c = io__get_char(&io);
+ if (c != ' ')
+ return -EINVAL;
+ c = io__get_char(&io);
+ if (c == -1)
+ return -EINVAL;
+ if (field == 3)
+ return -EINVAL;
+
+ /* Loop over numeric fields*/
+ c = io__get_char(&io);
+ if (c != ' ')
+ return -EINVAL;
+
+ i = 4;
+ while (true) {
+ c = io__get_dec(&io, val);
+ if (c == -1)
+ return -EINVAL;
+ if (c == -2) {
+ /* Assume a -ve was read */
+ c = io__get_dec(&io, val);
+ *val *= -1;
+ }
+ if (c != ' ')
+ return -EINVAL;
+ if (field == i)
+ return 0;
+ i++;
+ }
+ return -EINVAL;
+}
+
+static int evsel__read_tool(struct evsel *evsel, int cpu_map_idx, int thread)
+{
+ __u64 cur_time, delta_start;
+ int fd, err = 0;
+ struct perf_counts_values *count;
+ bool adjust = false;
+
+ count = perf_counts(evsel->counts, cpu_map_idx, thread);
+
+ switch (evsel->tool_event) {
+ case PERF_TOOL_DURATION_TIME:
+ /*
+ * Pretend duration_time is only on the first CPU and thread, or
+ * else aggregation will scale duration_time by the number of
+ * CPUs/threads.
+ */
+ if (cpu_map_idx == 0 && thread == 0)
+ cur_time = rdclock();
+ else
+ cur_time = evsel->start_time;
+ break;
+ case PERF_TOOL_USER_TIME:
+ case PERF_TOOL_SYSTEM_TIME: {
+ bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
+
+ fd = FD(evsel, cpu_map_idx, thread);
+ lseek(fd, SEEK_SET, 0);
+ if (evsel->pid_stat) {
+ /* The event exists solely on 1 CPU. */
+ if (cpu_map_idx == 0)
+ err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
+ else
+ cur_time = 0;
+ } else {
+ /* The event is for all threads. */
+ if (thread == 0) {
+ struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
+ cpu_map_idx);
+
+ err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
+ } else {
+ cur_time = 0;
+ }
+ }
+ adjust = true;
+ break;
+ }
+ case PERF_TOOL_NONE:
+ case PERF_TOOL_MAX:
+ default:
+ err = -EINVAL;
+ }
+ if (err)
+ return err;
+
+ delta_start = cur_time - evsel->start_time;
+ if (adjust) {
+ __u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
+
+ delta_start *= 1000000000 / ticks_per_sec;
+ }
+ count->val = delta_start;
+ count->ena = count->run = delta_start;
+ count->lost = 0;
+ return 0;
+}
+
int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
{
- u64 read_format = evsel->core.attr.read_format;
+ if (evsel__is_tool(evsel))
+ return evsel__read_tool(evsel, cpu_map_idx, thread);

- if (read_format & PERF_FORMAT_GROUP)
+ if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
return evsel__read_group(evsel, cpu_map_idx, thread);

return evsel__read_one(evsel, cpu_map_idx, thread);
@@ -2005,6 +2179,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
int pid = -1, err, old_errno;
enum rlimit_action set_rlimit = NO_CHANGE;

+ if (evsel->tool_event == PERF_TOOL_DURATION_TIME) {
+ if (evsel->core.attr.sample_period) /* no sampling */
+ return -EINVAL;
+ evsel->start_time = rdclock();
+ return 0;
+ }
+
err = __evsel__prepare_open(evsel, cpus, threads);
if (err)
return err;
@@ -2037,6 +2218,44 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
if (!evsel->cgrp && !evsel->core.system_wide)
pid = perf_thread_map__pid(threads, thread);

+ if (evsel->tool_event == PERF_TOOL_USER_TIME ||
+ evsel->tool_event == PERF_TOOL_SYSTEM_TIME) {
+ bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
+
+ if (evsel->core.attr.sample_period) {
+ /* no sampling */
+ err = -EINVAL;
+ goto out_close;
+ }
+ if (pid > -1) {
+ char buf[64];
+
+ snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
+ fd = open(buf, O_RDONLY);
+ evsel->pid_stat = true;
+ } else {
+ fd = open("/proc/stat", O_RDONLY);
+ }
+ FD(evsel, idx, thread) = fd;
+ if (fd < 0) {
+ err = -errno;
+ goto out_close;
+ }
+ if (pid > -1) {
+ err = read_pid_stat_field(fd, system ? 15 : 14,
+ &evsel->start_time);
+ } else {
+ struct perf_cpu cpu;
+
+ cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
+ err = read_stat_field(fd, cpu, system ? 3 : 1,
+ &evsel->start_time);
+ }
+ if (err)
+ goto out_close;
+ continue;
+ }
+
group_fd = get_group_fd(evsel, idx, thread);

if (group_fd == -2) {
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 517cff431de2..43f6fd1dcb4d 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -170,6 +170,12 @@ struct evsel {

/* for missing_features */
struct perf_pmu *pmu;
+
+ /* For tool events */
+ /* Beginning time subtracted when the counter is read. */
+ __u64 start_time;
+ /* Is the tool's fd for /proc/pid/stat or /proc/stat. */
+ bool pid_stat;
};

struct perf_missing_features {
--
2.44.0.769.g3c40516874-goog


2024-04-28 05:36:49

by Ian Rogers

Subject: [RFC PATCH v2 2/3] perf parse-events: Add a retirement latency modifier

Retirement latency is a separate sampled count used on newer Intel
CPUs.
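
After this change 'R' parses like the existing modifiers and can be
combined with them, for example (hypothetical, the behavior behind the
flag is added in the next patch):

  $ perf stat -e 'instructions:uR' ...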

Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/util/evsel.h | 1 +
tools/perf/util/parse-events.c | 2 ++
tools/perf/util/parse-events.h | 1 +
tools/perf/util/parse-events.l | 3 ++-
4 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 43f6fd1dcb4d..bd8e84954e34 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -98,6 +98,7 @@ struct evsel {
bool bpf_counter;
bool use_config_name;
bool skippable;
+ bool retire_lat;
int bpf_fd;
struct bpf_object *bpf_obj;
struct list_head config_terms;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 0f308b4db2b9..9c2a76ec8c99 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1818,6 +1818,8 @@ static int parse_events__modifier_list(struct parse_events_state *parse_state,
evsel->weak_group = true;
if (mod.bpf)
evsel->bpf_counter = true;
+ if (mod.retire_lat)
+ evsel->retire_lat = true;
}
return 0;
}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 5695308efab9..eb94d1247dae 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -201,6 +201,7 @@ struct parse_events_modifier {
bool hypervisor : 1; /* 'h' */
bool guest : 1; /* 'G' */
bool host : 1; /* 'H' */
+ bool retire_lat : 1; /* 'R' */
};

int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc,
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 08ea2d845dc3..85015f080240 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -209,6 +209,7 @@ static int modifiers(struct parse_events_state *parse_state, yyscan_t scanner)
CASE('W', weak);
CASE('e', exclusive);
CASE('b', bpf);
+ CASE('R', retire_lat);
default:
return PE_ERROR;
}
@@ -250,7 +251,7 @@ drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)?
* If you add a modifier you need to update check_modifier().
* Also, the letters in modifier_event must not be in modifier_bp.
*/
-modifier_event [ukhpPGHSDIWeb]{1,15}
+modifier_event [ukhpPGHSDIWebR]{1,16}
modifier_bp [rwx]{1,3}
lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node)
lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss)
--
2.44.0.769.g3c40516874-goog


2024-04-28 05:37:07

by Ian Rogers

Subject: [RFC PATCH v2 3/3] perf evsel: Add retirement latency event support

When a retirement latency event is parsed, it sets a flag on the
evsel. This change makes it so that when the flag is set, evsel
opening, reading and exiting report values from child perf record and
perf report processes.

Something similar was suggested by Namhyung Kim in:
https://lore.kernel.org/lkml/CAM9d7cgdQQn5GYB7t++xuoMdeqPXiEkkcop69+rD06RAnu9-EQ@mail.gmail.com/

This is trying to add support for retirement latency directly in
events rather than through metric changes, as suggested by Weilin Wang in:
https://lore.kernel.org/lkml/[email protected]/
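
Per CPU map index, the forked commands are roughly equivalent to the
following pipeline (reconstructed from the argv arrays below, with the
event's 'R' modifier rewritten to the maximum precision 'P'):

  $ perf record --synth=no --no-bpf-event -W -o - -e '<event>' \
      -C <cpu> sleep 0.1 | perf report -i - -q -F retire_lat

The perf report output is then parsed as the count value.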

Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/util/evsel.c | 186 +++++++++++++++++++++++++++++++++++++++-
tools/perf/util/evsel.h | 3 +
2 files changed, 188 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index a0a8aee7d6b9..17c123cddde3 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -22,6 +22,7 @@
#include <sys/resource.h>
#include <sys/types.h>
#include <dirent.h>
+#include <signal.h>
#include <stdlib.h>
#include <perf/evsel.h>
#include "asm/bug.h"
@@ -58,6 +59,7 @@
#include <internal/xyarray.h>
#include <internal/lib.h>
#include <internal/threadmap.h>
+#include <subcmd/run-command.h>

#include <linux/ctype.h>

@@ -493,6 +495,162 @@ struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx)
}
#endif

+static int evsel__start_retire_latency_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
+ int cpu_map_idx)
+{
+ char buf[16];
+ int pipefd[2];
+ int err, i;
+ struct perf_cpu cpu = perf_cpu_map__cpu(cpus, cpu_map_idx);
+ struct child_process *child_record =
+ xyarray__entry(evsel->children, cpu_map_idx, 0);
+ struct child_process *child_report =
+ xyarray__entry(evsel->children, cpu_map_idx, 1);
+ char *event = strdup(evsel__name(evsel));
+ /* TODO: the dummy event also won't be used, but there's no option to disable. */
+ const char *record_argv[15] = {
+ [0] = "perf",
+ [1] = "record",
+ [2] = "--synth=no",
+ [3] = "--no-bpf-event",
+ [4] = "-W",
+ [5] = "-o",
+ [6] = "-",
+ [7] = "-e",
+ };
+ const char *report_argv[] = {
+ [0] = "perf",
+ [1] = "report",
+ [2] = "-i",
+ [3] = "-",
+ [4] = "-q",
+ [5] = "-F",
+ [6] = "retire_lat",
+ NULL,
+ };
+
+ if (evsel->core.attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ if (!event)
+ return -ENOMEM;
+
+ /* Change the R modifier to the maximum precision one. */
+ for (i = strlen(event) - 1; i > 0; i--) {
+ if (event[i] == 'R')
+ break;
+ if (event[i] == ':' || event[i] == '/')
+ i = 0;
+ }
+ if (i <= 0) {
+ pr_err("Expected retired latency 'R'\n");
+ return -EINVAL;
+ }
+ event[i] = 'P';
+
+ i = 8;
+ record_argv[i++] = event;
+ if (verbose) {
+ record_argv[i++] = verbose > 1 ? "-vv" : "-v";
+ }
+ if (cpu.cpu >= 0) {
+ record_argv[i++] = "-C";
+ snprintf(buf, sizeof(buf), "%d", cpu.cpu);
+ record_argv[i++] = buf;
+ } else {
+ record_argv[i++] = "-a";
+ }
+ /* TODO: use something like the control fds to control perf record behavior. */
+ record_argv[i++] = "sleep";
+ record_argv[i++] = "0.1";
+
+ if (pipe(pipefd) < 0) {
+ free(event);
+ return -errno;
+ }
+
+ child_record->argv = record_argv;
+ child_record->pid = -1;
+ child_record->no_stdin = 1;
+ if (verbose)
+ child_record->err = fileno(stderr);
+ else
+ child_record->no_stderr = 1;
+ child_record->out = pipefd[1];
+ err = start_command(child_record);
+ free(event);
+ if (err)
+ return err;
+
+ child_report->argv = report_argv;
+ child_report->pid = -1;
+ if (verbose)
+ child_report->err = fileno(stderr);
+ else
+ child_report->no_stderr = 1;
+ child_report->in = pipefd[0];
+ child_report->out = -1;
+ return start_command(child_report);
+}
+
+static int evsel__finish_retire_latency_cpu(struct evsel *evsel, int cpu_map_idx)
+{
+ struct child_process *child_record =
+ xyarray__entry(evsel->children, cpu_map_idx, 0);
+ struct child_process *child_report =
+ xyarray__entry(evsel->children, cpu_map_idx, 1);
+
+ if (child_record->pid > 0)
+ finish_command(child_record);
+ if (child_report->pid > 0)
+ finish_command(child_report);
+ return 0;
+}
+
+static int evsel__read_retire_latency(struct evsel *evsel, int cpu_map_idx, int thread)
+{
+ struct child_process *child_record = xyarray__entry(evsel->children, cpu_map_idx, 0);
+ struct child_process *child_report = xyarray__entry(evsel->children, cpu_map_idx, 1);
+ struct perf_counts_values *count = perf_counts(evsel->counts, cpu_map_idx, thread);
+ char buf[256];
+ int err;
+
+ kill(child_record->pid, SIGTERM);
+ err = read(child_report->out, buf, sizeof(buf));
+ if (err < 0 || strlen(buf) == 0)
+ return -1;
+
+ count->val = atoll(buf);
+ count->ena = 1;
+ count->run = 1;
+ count->id = 0;
+ count->lost = 0;
+
+ /*
+ * TODO: The SIGCHLD from the children exiting will cause interval mode
+ * to stop, use the control fd to avoid this.
+ */
+ err = evsel__finish_retire_latency_cpu(evsel, cpu_map_idx);
+ if (err)
+ return err;
+
+ /* Restart the counter. */
+ return evsel__start_retire_latency_cpu(evsel, evsel->core.cpus, cpu_map_idx);
+}
+
+static int evsel__finish_retire_latency(struct evsel *evsel)
+{
+ int idx;
+
+ perf_cpu_map__for_each_idx(idx, evsel->core.cpus) {
+ int err = evsel__finish_retire_latency_cpu(evsel, idx);
+
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
const char *const evsel__hw_names[PERF_COUNT_HW_MAX] = {
"cycles",
"instructions",
@@ -1465,6 +1623,10 @@ static void evsel__free_config_terms(struct evsel *evsel)

void evsel__exit(struct evsel *evsel)
{
+ if (evsel->children) {
+ evsel__finish_retire_latency(evsel);
+ zfree(&evsel->children);
+ }
assert(list_empty(&evsel->core.node));
assert(evsel->evlist == NULL);
bpf_counter__destroy(evsel);
@@ -1778,6 +1940,9 @@ int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
if (evsel__is_tool(evsel))
return evsel__read_tool(evsel, cpu_map_idx, thread);

+ if (evsel->retire_lat)
+ return evsel__read_retire_latency(evsel, cpu_map_idx, thread);
+
if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
return evsel__read_group(evsel, cpu_map_idx, thread);

@@ -1993,10 +2158,22 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus,
threads = empty_thread_map;
}

- if (evsel->core.fd == NULL &&
+ if (!evsel->retire_lat && evsel->core.fd == NULL &&
perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0)
return -ENOMEM;

+ if (evsel->retire_lat && evsel->children == NULL) {
+ /*
+ * Use ylen of 2, [0] is the record and [1] is the report
+ * command. Currently retirement latency doesn't support
+ * per-thread mode.
+ */
+ evsel->children = xyarray__new(perf_cpu_map__nr(cpus), /*ylen=*/2,
+ sizeof(struct child_process));
+ if (!evsel->children)
+ return -ENOMEM;
+ }
+
evsel->open_flags = PERF_FLAG_FD_CLOEXEC;
if (evsel->cgrp)
evsel->open_flags |= PERF_FLAG_PID_CGROUP;
@@ -2209,6 +2386,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,

for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) {

+ if (evsel->retire_lat) {
+ err = evsel__start_retire_latency_cpu(evsel, cpus, idx);
+ if (err)
+ goto out_close;
+ continue;
+ }
+
for (thread = 0; thread < nthreads; thread++) {
int fd, group_fd;
retry_open:
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index bd8e84954e34..3c0806436e64 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -177,6 +177,9 @@ struct evsel {
__u64 start_time;
/* Is the tool's fd for /proc/pid/stat or /proc/stat. */
bool pid_stat;
+
+ /* Used for retire_lat child process. */
+ struct xyarray *children;
};

struct perf_missing_features {
--
2.44.0.769.g3c40516874-goog


2024-04-29 21:00:56

by Namhyung Kim

Subject: Re: [RFC PATCH v2 1/3] perf evsel: Refactor tool events

Hi Ian,

On Sat, Apr 27, 2024 at 10:36 PM Ian Rogers <[email protected]> wrote:
>
> Tool events unnecessarily open a dummy perf event which is useless
> even with `perf record` which will still open a dummy event. Change
> the behavior of tool events so:

Right, I was surprised that we need a dummy event for that.

>
> - duration_time - call `rdclock` on open and then report the count as
> a delta since the start in evsel__read_counter. This moves code out
> of builtin-stat making it more general purpose.
>
> - user_time/system_time - open the fd as either `/proc/pid/stat` or
> `/proc/stat` for cases like system wide. evsel__read_counter will
> read the appropriate field out of the procfs file. These values
> were previously supplied by wait4, if the procfs read fails then
> the wait4 values are used, assuming the process/thread terminated.
> By reading user_time and system_time this way, interval mode can be
> supported.

Good improvement!

>
> Opening any of the tool events for `perf record` returns invalid.

Ok, I think those tool events are for `perf stat` only.

But I feel like this change is a separate optimization and can be
independently submitted (from the retire-latency or tPEBS change).

>
> Signed-off-by: Ian Rogers <[email protected]>
> ---
> tools/perf/builtin-stat.c | 74 ++++++-------
> tools/perf/util/evsel.c | 223 +++++++++++++++++++++++++++++++++++++-
> tools/perf/util/evsel.h | 6 +
> 3 files changed, 259 insertions(+), 44 deletions(-)
>
> diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> index 65a3dd7ffac3..428e9721b908 100644
> --- a/tools/perf/builtin-stat.c
> +++ b/tools/perf/builtin-stat.c
> @@ -255,45 +255,37 @@ static int evsel__write_stat_event(struct evsel *counter, int cpu_map_idx, u32 t
> process_synthesized_event, NULL);
> }
>
> -static int read_single_counter(struct evsel *counter, int cpu_map_idx,
> - int thread, struct timespec *rs)
> -{
> - switch(counter->tool_event) {
> - case PERF_TOOL_DURATION_TIME: {
> - u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL;
> - struct perf_counts_values *count =
> - perf_counts(counter->counts, cpu_map_idx, thread);
> - count->ena = count->run = val;
> - count->val = val;
> - return 0;
> - }
> - case PERF_TOOL_USER_TIME:
> - case PERF_TOOL_SYSTEM_TIME: {
> - u64 val;
> - struct perf_counts_values *count =
> - perf_counts(counter->counts, cpu_map_idx, thread);
> - if (counter->tool_event == PERF_TOOL_USER_TIME)
> - val = ru_stats.ru_utime_usec_stat.mean;
> - else
> - val = ru_stats.ru_stime_usec_stat.mean;
> - count->ena = count->run = val;
> - count->val = val;
> - return 0;
> - }
> - default:
> - case PERF_TOOL_NONE:
> - return evsel__read_counter(counter, cpu_map_idx, thread);
> - case PERF_TOOL_MAX:
> - /* This should never be reached */
> - return 0;
> +static int read_single_counter(struct evsel *counter, int cpu_map_idx, int thread)
> +{
> + int err = evsel__read_counter(counter, cpu_map_idx, thread);
> +
> + /*
> + * Reading user and system time will fail when the process
> + * terminates. Use the wait4 values in that case.
> + */
> + if (err &&
> + (counter->tool_event == PERF_TOOL_USER_TIME ||
> + counter->tool_event == PERF_TOOL_SYSTEM_TIME)) {

Off-topic. I feel like we should have a (fake) pmu for the tool events
rather than the evsel->tool_event enum.


> + u64 val;
> + struct perf_counts_values *count =
> + perf_counts(counter->counts, cpu_map_idx, thread);
> +
> + if (counter->tool_event == PERF_TOOL_USER_TIME)
> + val = ru_stats.ru_utime_usec_stat.mean;
> + else
> + val = ru_stats.ru_stime_usec_stat.mean;
> + count->ena = count->run = val;
> + count->val = val;
> + return 0;
> }
> + return err;
> }
>
> /*
> * Read out the results of a single counter:
> * do not aggregate counts across CPUs in system-wide mode
> */
> -static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_map_idx)
> +static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
> {
> int nthreads = perf_thread_map__nr(evsel_list->core.threads);
> int thread;
> @@ -311,7 +303,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
> * (via evsel__read_counter()) and sets their count->loaded.
> */
> if (!perf_counts__is_loaded(counter->counts, cpu_map_idx, thread) &&
> - read_single_counter(counter, cpu_map_idx, thread, rs)) {
> + read_single_counter(counter, cpu_map_idx, thread)) {
> counter->counts->scaled = -1;
> perf_counts(counter->counts, cpu_map_idx, thread)->ena = 0;
> perf_counts(counter->counts, cpu_map_idx, thread)->run = 0;
> @@ -340,7 +332,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
> return 0;
> }
>
> -static int read_affinity_counters(struct timespec *rs)
> +static int read_affinity_counters(void)
> {
> struct evlist_cpu_iterator evlist_cpu_itr;
> struct affinity saved_affinity, *affinity;
> @@ -361,10 +353,8 @@ static int read_affinity_counters(struct timespec *rs)
> if (evsel__is_bpf(counter))
> continue;
>
> - if (!counter->err) {
> - counter->err = read_counter_cpu(counter, rs,
> - evlist_cpu_itr.cpu_map_idx);
> - }
> + if (!counter->err)
> + counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
> }
> if (affinity)
> affinity__cleanup(&saved_affinity);
> @@ -388,11 +378,11 @@ static int read_bpf_map_counters(void)
> return 0;
> }
>
> -static int read_counters(struct timespec *rs)
> +static int read_counters(void)
> {
> if (!stat_config.stop_read_counter) {
> if (read_bpf_map_counters() ||
> - read_affinity_counters(rs))
> + read_affinity_counters())
> return -1;
> }
> return 0;
> @@ -423,7 +413,7 @@ static void process_interval(void)
>
> evlist__reset_aggr_stats(evsel_list);
>
> - if (read_counters(&rs) == 0)
> + if (read_counters() == 0)
> process_counters();
>
> if (STAT_RECORD) {
> @@ -911,7 +901,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
> * avoid arbitrary skew, we must read all counters before closing any
> * group leaders.
> */
> - if (read_counters(&(struct timespec) { .tv_nsec = t1-t0 }) == 0)
> + if (read_counters() == 0)
> process_counters();
>
> /*

I think this part can be a separate commit. You can implement the
tool event handling before actually using it. :)


> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index 3536404e9447..a0a8aee7d6b9 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -10,6 +10,7 @@
> #include <errno.h>
> #include <inttypes.h>
> #include <linux/bitops.h>
> +#include <api/io.h>
> #include <api/fs/fs.h>
> #include <api/fs/tracing_path.h>
> #include <linux/hw_breakpoint.h>
> @@ -30,6 +31,7 @@
> #include "counts.h"
> #include "event.h"
> #include "evsel.h"
> +#include "time-utils.h"
> #include "util/env.h"
> #include "util/evsel_config.h"
> #include "util/evsel_fprintf.h"
> @@ -1600,11 +1602,183 @@ static int evsel__read_group(struct evsel *leader, int cpu_map_idx, int thread)
> return evsel__process_group_data(leader, cpu_map_idx, thread, data);
> }
>
> +static int read_stat_field(int fd, struct perf_cpu cpu, int field, __u64 *val)
> +{
> + char buf[256];
> + struct io io;
> + int c, i;
> +
> + io__init(&io, fd, buf, sizeof(buf));
> +
> + /* Skip lines to relevant CPU. */
> + for (i = -1; i < cpu.cpu; i++) {
> + do {
> + c = io__get_char(&io);
> + if (c == -1)
> + return -EINVAL;
> + } while (c != '\n');
> + }
> + /* Skip to "cpu". */
> + c = io__get_char(&io);
> + if (c != 'c')
> + return -EINVAL;
> + c = io__get_char(&io);
> + if (c != 'p')
> + return -EINVAL;
> + c = io__get_char(&io);
> + if (c != 'u')
> + return -EINVAL;
> + /* Skip N of cpuN. */
> + do {
> + c = io__get_char(&io);
> + if (c == -1)
> + return -EINVAL;
> + } while (c != ' ');
> +
> + i = 1;
> + while (true) {
> + c = io__get_dec(&io, val);
> + if (c != ' ')
> + break;
> + if (field == i)
> + return 0;
> + i++;
> + }
> + return -EINVAL;
> +}

Looks somewhat verbose, can we have something like
io__get_str_field() or io__get_num_field()?
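
Something like this, perhaps (just an untested sketch, assuming the
io__get_dec() behavior above where the return value is the delimiter
that ended the number):

  /* Read the Nth space-separated decimal field, counting from 1. */
  static int io__get_num_field(struct io *io, int field, __u64 *val)
  {
          int i;

          for (i = 1; ; i++) {
                  int c = io__get_dec(io, val);

                  if (c == -1)
                          return -EINVAL;
                  if (field == i)
                          return 0;
                  if (c != ' ')
                          return -EINVAL;
          }
  }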


> +
> +static int read_pid_stat_field(int fd, int field, __u64 *val)
> +{
> + char buf[256];
> + struct io io;
> + int c, i;
> +
> + io__init(&io, fd, buf, sizeof(buf));
> + c = io__get_dec(&io, val);
> + if (c != ' ')
> + return -EINVAL;
> + if (field == 1)
> + return 0;
> +
> + /* Skip comm. */
> + c = io__get_char(&io);
> + if (c != '(')
> + return -EINVAL;
> + do {
> + c = io__get_char(&io);
> + if (c == -1)
> + return -EINVAL;
> + } while (c != ')');
> + if (field == 2)
> + return -EINVAL;
> +
> + /* Skip state */
> + c = io__get_char(&io);
> + if (c != ' ')
> + return -EINVAL;
> + c = io__get_char(&io);
> + if (c == -1)
> + return -EINVAL;
> + if (field == 3)
> + return -EINVAL;
> +
> + /* Loop over numeric fields*/
> + c = io__get_char(&io);
> + if (c != ' ')
> + return -EINVAL;
> +
> + i = 4;
> + while (true) {
> + c = io__get_dec(&io, val);
> + if (c == -1)
> + return -EINVAL;
> + if (c == -2) {
> + /* Assume a -ve was read */
> + c = io__get_dec(&io, val);
> + *val *= -1;
> + }
> + if (c != ' ')
> + return -EINVAL;
> + if (field == i)
> + return 0;
> + i++;
> + }
> + return -EINVAL;
> +}
> +
> +static int evsel__read_tool(struct evsel *evsel, int cpu_map_idx, int thread)
> +{
> + __u64 cur_time, delta_start;
> + int fd, err = 0;
> + struct perf_counts_values *count;
> + bool adjust = false;
> +
> + count = perf_counts(evsel->counts, cpu_map_idx, thread);
> +
> + switch (evsel->tool_event) {
> + case PERF_TOOL_DURATION_TIME:
> + /*
> + * Pretend duration_time is only on the first CPU and thread, or
> + * else aggregation will scale duration_time by the number of
> + * CPUs/threads.
> + */

We could set evsel->pmu->cpus to 0 if there's a tool pmu.


> + if (cpu_map_idx == 0 && thread == 0)
> + cur_time = rdclock();
> + else
> + cur_time = evsel->start_time;
> + break;
> + case PERF_TOOL_USER_TIME:
> + case PERF_TOOL_SYSTEM_TIME: {
> + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
> +
> + fd = FD(evsel, cpu_map_idx, thread);
> + lseek(fd, SEEK_SET, 0);
> + if (evsel->pid_stat) {
> + /* The event exists solely on 1 CPU. */

Probably the same.

Thanks,
Namhyung


> + if (cpu_map_idx == 0)
> + err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
> + else
> + cur_time = 0;
> + } else {
> + /* The event is for all threads. */
> + if (thread == 0) {
> + struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
> + cpu_map_idx);
> +
> + err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
> + } else {
> + cur_time = 0;
> + }
> + }
> + adjust = true;
> + break;
> + }
> + case PERF_TOOL_NONE:
> + case PERF_TOOL_MAX:
> + default:
> + err = -EINVAL;
> + }
> + if (err)
> + return err;
> +
> + delta_start = cur_time - evsel->start_time;
> + if (adjust) {
> + __u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
> +
> + delta_start *= 1000000000 / ticks_per_sec;
> + }
> + count->val = delta_start;
> + count->ena = count->run = delta_start;
> + count->lost = 0;
> + return 0;
> +}
> +
> int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
> {
> - u64 read_format = evsel->core.attr.read_format;
> + if (evsel__is_tool(evsel))
> + return evsel__read_tool(evsel, cpu_map_idx, thread);
>
> - if (read_format & PERF_FORMAT_GROUP)
> + if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
> return evsel__read_group(evsel, cpu_map_idx, thread);
>
> return evsel__read_one(evsel, cpu_map_idx, thread);
> @@ -2005,6 +2179,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> int pid = -1, err, old_errno;
> enum rlimit_action set_rlimit = NO_CHANGE;
>
> + if (evsel->tool_event == PERF_TOOL_DURATION_TIME) {
> + if (evsel->core.attr.sample_period) /* no sampling */
> + return -EINVAL;
> + evsel->start_time = rdclock();
> + return 0;
> + }
> +
> err = __evsel__prepare_open(evsel, cpus, threads);
> if (err)
> return err;
> @@ -2037,6 +2218,44 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> if (!evsel->cgrp && !evsel->core.system_wide)
> pid = perf_thread_map__pid(threads, thread);
>
> + if (evsel->tool_event == PERF_TOOL_USER_TIME ||
> + evsel->tool_event == PERF_TOOL_SYSTEM_TIME) {
> + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
> +
> + if (evsel->core.attr.sample_period) {
> + /* no sampling */
> + err = -EINVAL;
> + goto out_close;
> + }
> + if (pid > -1) {
> + char buf[64];
> +
> + snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
> + fd = open(buf, O_RDONLY);
> + evsel->pid_stat = true;
> + } else {
> + fd = open("/proc/stat", O_RDONLY);
> + }
> + FD(evsel, idx, thread) = fd;
> + if (fd < 0) {
> + err = -errno;
> + goto out_close;
> + }
> + if (pid > -1) {
> + err = read_pid_stat_field(fd, system ? 15 : 14,
> + &evsel->start_time);
> + } else {
> + struct perf_cpu cpu;
> +
> + cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
> + err = read_stat_field(fd, cpu, system ? 3 : 1,
> + &evsel->start_time);
> + }
> + if (err)
> + goto out_close;
> + continue;
> + }
> +
> group_fd = get_group_fd(evsel, idx, thread);
>
> if (group_fd == -2) {
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 517cff431de2..43f6fd1dcb4d 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -170,6 +170,12 @@ struct evsel {
>
> /* for missing_features */
> struct perf_pmu *pmu;
> +
> + /* For tool events */
> + /* Beginning time subtracted when the counter is read. */
> + __u64 start_time;
> + /* Is the tool's fd for /proc/pid/stat or /proc/stat. */
> + bool pid_stat;
> };
>
> struct perf_missing_features {
> --
> 2.44.0.769.g3c40516874-goog
>

2024-04-29 21:08:41

by Namhyung Kim

Subject: Re: [RFC PATCH v2 2/3] perf parse-events: Add a retirement latency modifier

On Sat, Apr 27, 2024 at 10:36 PM Ian Rogers <[email protected]> wrote:
>
> Retirement latency is a separate sampled count used on newer Intel
> CPUs.

Can you please update the "event modifiers" section in the perf list
documentation too?

Also I'm curious if we have a doc for the JSON metric format.
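
For the perf list documentation, I'd imagine just a one-line entry in
the modifier table of tools/perf/Documentation/perf-list.txt, along
the lines of (wording is only a suggestion):

  R - retirement latency, read via forked perf record/report processes
      (newer Intel CPUs only)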

Thanks,
Namhyung

>
> Signed-off-by: Ian Rogers <[email protected]>
> ---
> tools/perf/util/evsel.h | 1 +
> tools/perf/util/parse-events.c | 2 ++
> tools/perf/util/parse-events.h | 1 +
> tools/perf/util/parse-events.l | 3 ++-
> 4 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index 43f6fd1dcb4d..bd8e84954e34 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -98,6 +98,7 @@ struct evsel {
> bool bpf_counter;
> bool use_config_name;
> bool skippable;
> + bool retire_lat;
> int bpf_fd;
> struct bpf_object *bpf_obj;
> struct list_head config_terms;
> diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> index 0f308b4db2b9..9c2a76ec8c99 100644
> --- a/tools/perf/util/parse-events.c
> +++ b/tools/perf/util/parse-events.c
> @@ -1818,6 +1818,8 @@ static int parse_events__modifier_list(struct parse_events_state *parse_state,
> evsel->weak_group = true;
> if (mod.bpf)
> evsel->bpf_counter = true;
> + if (mod.retire_lat)
> + evsel->retire_lat = true;
> }
> return 0;
> }
> diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
> index 5695308efab9..eb94d1247dae 100644
> --- a/tools/perf/util/parse-events.h
> +++ b/tools/perf/util/parse-events.h
> @@ -201,6 +201,7 @@ struct parse_events_modifier {
> bool hypervisor : 1; /* 'h' */
> bool guest : 1; /* 'G' */
> bool host : 1; /* 'H' */
> + bool retire_lat : 1; /* 'R' */
> };
>
> int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc,
> diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
> index 08ea2d845dc3..85015f080240 100644
> --- a/tools/perf/util/parse-events.l
> +++ b/tools/perf/util/parse-events.l
> @@ -209,6 +209,7 @@ static int modifiers(struct parse_events_state *parse_state, yyscan_t scanner)
> CASE('W', weak);
> CASE('e', exclusive);
> CASE('b', bpf);
> + CASE('R', retire_lat);
> default:
> return PE_ERROR;
> }
> @@ -250,7 +251,7 @@ drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)?
> * If you add a modifier you need to update check_modifier().
> * Also, the letters in modifier_event must not be in modifier_bp.
> */
> -modifier_event [ukhpPGHSDIWeb]{1,15}
> +modifier_event [ukhpPGHSDIWebR]{1,16}
> modifier_bp [rwx]{1,3}
> lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node)
> lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss)
> --
> 2.44.0.769.g3c40516874-goog
>

2024-04-29 21:40:01

by Namhyung Kim

Subject: Re: [RFC PATCH v2 3/3] perf evsel: Add retirement latency event support

On Sat, Apr 27, 2024 at 10:36 PM Ian Rogers <[email protected]> wrote:
>
> When a retirement latency event is processed it sets a flag on the
> evsel. This change makes it so that when the flag is set evsel
> opening, reading and exiting report values from child perf record and
> perf report processes.
>
> Something similar was suggested by Namhyung Kim in:
> https://lore.kernel.org/lkml/CAM9d7cgdQQn5GYB7t++xuoMdeqPXiEkkcop69+rD06RAnu9-EQ@mail.gmail.com/
>
> This is trying to add support for retirement latency directly in
> events rather than through metric changes, as suggested by Weilin Wang in:
> https://lore.kernel.org/lkml/[email protected]/

This seems to create a perf record + report pair for each R event
while Weilin's patch handled multiple events at once.

>
> Signed-off-by: Ian Rogers <[email protected]>
> ---
> tools/perf/util/evsel.c | 186 +++++++++++++++++++++++++++++++++++++++-
> tools/perf/util/evsel.h | 3 +
> 2 files changed, 188 insertions(+), 1 deletion(-)
>
> diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> index a0a8aee7d6b9..17c123cddde3 100644
> --- a/tools/perf/util/evsel.c
> +++ b/tools/perf/util/evsel.c
> @@ -22,6 +22,7 @@
> #include <sys/resource.h>
> #include <sys/types.h>
> #include <dirent.h>
> +#include <signal.h>
> #include <stdlib.h>
> #include <perf/evsel.h>
> #include "asm/bug.h"
> @@ -58,6 +59,7 @@
> #include <internal/xyarray.h>
> #include <internal/lib.h>
> #include <internal/threadmap.h>
> +#include <subcmd/run-command.h>
>
> #include <linux/ctype.h>
>
> @@ -493,6 +495,162 @@ struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx)
> }
> #endif
>
> +static int evsel__start_retire_latency_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> + int cpu_map_idx)
> +{
> + char buf[16];
> + int pipefd[2];
> + int err, i;
> + struct perf_cpu cpu = perf_cpu_map__cpu(cpus, cpu_map_idx);
> + struct child_process *child_record =
> + xyarray__entry(evsel->children, cpu_map_idx, 0);
> + struct child_process *child_report =
> + xyarray__entry(evsel->children, cpu_map_idx, 1);
> + char *event = strdup(evsel__name(evsel));
> + /* TODO: the dummy event also won't be used, but there's no option to disable. */
> + const char *record_argv[15] = {
> + [0] = "perf",
> + [1] = "record",
> + [2] = "--synth=no",
> + [3] = "--no-bpf-event",
> + [4] = "-W",
> + [5] = "-o",
> + [6] = "-",
> + [7] = "-e",
> + };
> + const char *report_argv[] = {
> + [0] = "perf",
> + [1] = "report",
> + [2] = "-i",
> + [3] = "-",
> + [4] = "-q",
> + [5] = "-F",
> + [6] = "retire_lat",
> + NULL,
> + };
> +
> + if (evsel->core.attr.sample_period) /* no sampling */
> + return -EINVAL;
> +
> + if (!event)
> + return -ENOMEM;
> +
> + /* Change the R modifier to the maximum precision one. */
> + for (i = strlen(event) - 1; i > 0; i--) {
> + if (event[i] == 'R')
> + break;
> + if (event[i] == ':' || event[i] == '/')
> + i = 0;
> + }
> + if (i <= 0) {
> + pr_err("Expected retired latency 'R'\n");
> + return -EINVAL;
> + }
> + event[i] = 'P';
> +
> + i = 8;
> + record_argv[i++] = event;
> + if (verbose) {
> + record_argv[i++] = verbose > 1 ? "-vv" : "-v";
> + }
> + if (cpu.cpu >= 0) {
> + record_argv[i++] = "-C";
> + snprintf(buf, sizeof(buf), "%d", cpu.cpu);
> + record_argv[i++] = buf;
> + } else {
> + record_argv[i++] = "-a";
> + }
> + /* TODO: use something like the control fds to control perf record behavior. */
> + record_argv[i++] = "sleep";
> + record_argv[i++] = "0.1";

This might be too short and the record process can exit before perf
report (but I guess it's fine when using a pipe). Also I'm not sure if
it's ok to get the retire latency from only 100 ms of sampling,
regardless of the execution of the given workload.

> +
> + if (pipe(pipefd) < 0) {
> + free(event);
> + return -errno;
> + }
> +
> + child_record->argv = record_argv;
> + child_record->pid = -1;
> + child_record->no_stdin = 1;
> + if (verbose)
> + child_record->err = fileno(stderr);
> + else
> + child_record->no_stderr = 1;
> + child_record->out = pipefd[1];
> + err = start_command(child_record);
> + free(event);
> + if (err)
> + return err;
> +
> + child_report->argv = report_argv;
> + child_report->pid = -1;
> + if (verbose)
> + child_report->err = fileno(stderr);
> + else
> + child_report->no_stderr = 1;
> + child_report->in = pipefd[0];
> + child_report->out = -1;
> + return start_command(child_report);
> +}
> +
> +static int evsel__finish_retire_latency_cpu(struct evsel *evsel, int cpu_map_idx)
> +{
> + struct child_process *child_record =
> + xyarray__entry(evsel->children, cpu_map_idx, 0);
> + struct child_process *child_report =
> + xyarray__entry(evsel->children, cpu_map_idx, 1);
> +
> + if (child_record->pid > 0)
> + finish_command(child_record);
> + if (child_report->pid > 0)
> + finish_command(child_report);
> + return 0;
> +}
> +
> +static int evsel__read_retire_latency(struct evsel *evsel, int cpu_map_idx, int thread)
> +{
> + struct child_process *child_record = xyarray__entry(evsel->children, cpu_map_idx, 0);
> + struct child_process *child_report = xyarray__entry(evsel->children, cpu_map_idx, 1);
> + struct perf_counts_values *count = perf_counts(evsel->counts, cpu_map_idx, thread);
> + char buf[256];
> + int err;
> +
> + kill(child_record->pid, SIGTERM);
> + err = read(child_report->out, buf, sizeof(buf));
> + if (err < 0 || strlen(buf) == 0)
> + return -1;
> +
> + count->val = atoll(buf);
> + count->ena = 1;
> + count->run = 1;
> + count->id = 0;
> + count->lost = 0;
> +
> + /*
> + * TODO: The SIGCHLD from the children exiting will cause interval mode
> + * to stop, use the control fd to avoid this.
> + */
> + err = evsel__finish_retire_latency_cpu(evsel, cpu_map_idx);
> + if (err)
> + return err;
> +
> + /* Restart the counter. */
> + return evsel__start_retire_latency_cpu(evsel, evsel->core.cpus, cpu_map_idx);

Is this for the interval mode? Then I think it's better to move the
logic there and let this code just do the 'read'.


> +}
> +
> +static int evsel__finish_retire_latency(struct evsel *evsel)
> +{
> + int idx;
> +
> + perf_cpu_map__for_each_idx(idx, evsel->core.cpus) {
> + int err = evsel__finish_retire_latency_cpu(evsel, idx);
> +
> + if (err)
> + return err;
> + }
> + return 0;
> +}
> +
> const char *const evsel__hw_names[PERF_COUNT_HW_MAX] = {
> "cycles",
> "instructions",
> @@ -1465,6 +1623,10 @@ static void evsel__free_config_terms(struct evsel *evsel)
>
> void evsel__exit(struct evsel *evsel)
> {
> + if (evsel->children) {
> + evsel__finish_retire_latency(evsel);
> + zfree(&evsel->children);

You'd better call xyarray__delete() and set it to NULL.

Thanks,
Namhyung

> + }
> assert(list_empty(&evsel->core.node));
> assert(evsel->evlist == NULL);
> bpf_counter__destroy(evsel);
> @@ -1778,6 +1940,9 @@ int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
> if (evsel__is_tool(evsel))
> return evsel__read_tool(evsel, cpu_map_idx, thread);
>
> + if (evsel->retire_lat)
> + return evsel__read_retire_latency(evsel, cpu_map_idx, thread);
> +
> if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
> return evsel__read_group(evsel, cpu_map_idx, thread);
>
> @@ -1993,10 +2158,22 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus,
> threads = empty_thread_map;
> }
>
> - if (evsel->core.fd == NULL &&
> + if (!evsel->retire_lat && evsel->core.fd == NULL &&
> perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0)
> return -ENOMEM;
>
> + if (evsel->retire_lat && evsel->children == NULL) {
> + /*
> + * Use ylen of 2, [0] is the record and [1] is the report
> + * command. Currently retirement latency doesn't support
> + * per-thread mode.
> + */
> + evsel->children = xyarray__new(perf_cpu_map__nr(cpus), /*ylen=*/2,
> + sizeof(struct child_process));
> + if (!evsel->children)
> + return -ENOMEM;
> + }
> +
> evsel->open_flags = PERF_FLAG_FD_CLOEXEC;
> if (evsel->cgrp)
> evsel->open_flags |= PERF_FLAG_PID_CGROUP;
> @@ -2209,6 +2386,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
>
> for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) {
>
> + if (evsel->retire_lat) {
> + err = evsel__start_retire_latency_cpu(evsel, cpus, idx);
> + if (err)
> + goto out_close;
> + continue;
> + }
> +
> for (thread = 0; thread < nthreads; thread++) {
> int fd, group_fd;
> retry_open:
> diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> index bd8e84954e34..3c0806436e64 100644
> --- a/tools/perf/util/evsel.h
> +++ b/tools/perf/util/evsel.h
> @@ -177,6 +177,9 @@ struct evsel {
> __u64 start_time;
> /* Is the tool's fd for /proc/pid/stat or /proc/stat. */
> bool pid_stat;
> +
> + /* Used for retire_lat child process. */
> + struct xyarray *children;
> };
>
> struct perf_missing_features {
> --
> 2.44.0.769.g3c40516874-goog
>

2024-04-30 02:48:01

by Ian Rogers

Subject: Re: [RFC PATCH v2 1/3] perf evsel: Refactor tool events

On Mon, Apr 29, 2024 at 2:00 PM Namhyung Kim <[email protected]> wrote:
>
> Hi Ian,
>
> On Sat, Apr 27, 2024 at 10:36 PM Ian Rogers <[email protected]> wrote:
> >
> > Tool events unnecessarily open a dummy perf event which is useless
> > even with `perf record` which will still open a dummy event. Change
> > the behavior of tool events so:
>
> Right, I was surprised that we need a dummy event for that.
>
> >
> > - duration_time - call `rdclock` on open and then report the count as
> > a delta since the start in evsel__read_counter. This moves code out
> > of builtin-stat making it more general purpose.
> >
> > - user_time/system_time - open the fd as either `/proc/pid/stat` or
> > `/proc/stat` for cases like system wide. evsel__read_counter will
> > read the appropriate field out of the procfs file. These values
> > were previously supplied by wait4, if the procfs read fails then
> > the wait4 values are used, assuming the process/thread terminated.
> > By reading user_time and system_time this way, interval mode can be
> > supported.
>
> Good improvement!
>
> >
> > Opening any of the tool events for `perf record` returns invalid.
>
> Ok, I think those tool events are for `perf stat` only.
>
> But I feel like this change is a separate optimization and can be
> independently submitted (from the retire-latency or tPEBS change).

You're right, but as both pieces of code change the
evsel__read_counter implementation I've kept them together.

> >
> > Signed-off-by: Ian Rogers <[email protected]>
> > ---
> > tools/perf/builtin-stat.c | 74 ++++++-------
> > tools/perf/util/evsel.c | 223 +++++++++++++++++++++++++++++++++++++-
> > tools/perf/util/evsel.h | 6 +
> > 3 files changed, 259 insertions(+), 44 deletions(-)
> >
> > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > index 65a3dd7ffac3..428e9721b908 100644
> > --- a/tools/perf/builtin-stat.c
> > +++ b/tools/perf/builtin-stat.c
> > @@ -255,45 +255,37 @@ static int evsel__write_stat_event(struct evsel *counter, int cpu_map_idx, u32 t
> > process_synthesized_event, NULL);
> > }
> >
> > -static int read_single_counter(struct evsel *counter, int cpu_map_idx,
> > - int thread, struct timespec *rs)
> > -{
> > - switch(counter->tool_event) {
> > - case PERF_TOOL_DURATION_TIME: {
> > - u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL;
> > - struct perf_counts_values *count =
> > - perf_counts(counter->counts, cpu_map_idx, thread);
> > - count->ena = count->run = val;
> > - count->val = val;
> > - return 0;
> > - }
> > - case PERF_TOOL_USER_TIME:
> > - case PERF_TOOL_SYSTEM_TIME: {
> > - u64 val;
> > - struct perf_counts_values *count =
> > - perf_counts(counter->counts, cpu_map_idx, thread);
> > - if (counter->tool_event == PERF_TOOL_USER_TIME)
> > - val = ru_stats.ru_utime_usec_stat.mean;
> > - else
> > - val = ru_stats.ru_stime_usec_stat.mean;
> > - count->ena = count->run = val;
> > - count->val = val;
> > - return 0;
> > - }
> > - default:
> > - case PERF_TOOL_NONE:
> > - return evsel__read_counter(counter, cpu_map_idx, thread);
> > - case PERF_TOOL_MAX:
> > - /* This should never be reached */
> > - return 0;
> > +static int read_single_counter(struct evsel *counter, int cpu_map_idx, int thread)
> > +{
> > + int err = evsel__read_counter(counter, cpu_map_idx, thread);
> > +
> > + /*
> > + * Reading user and system time will fail when the process
> > + * terminates. Use the wait4 values in that case.
> > + */
> > + if (err &&
> > + (counter->tool_event == PERF_TOOL_USER_TIME ||
> > + counter->tool_event == PERF_TOOL_SYSTEM_TIME)) {
>
> Off-topic. I feel like we should have a (fake) pmu for the tool events
> rather than the evsel->tool_event enum.

Sounds good to me. The refactoring in this (unmerged) test change:
https://lore.kernel.org/lkml/[email protected]/
makes it so we can load a PMU from something like a temporary
directory. It would be nice to get rid of the fake_pmu special case in
parse-events and similarly the tool event special case. A problem is
that libperf lacks knowledge of fake_pmus and tool events in the
evlist and evsel code. libperf can try to apply operations to these
events even though the only knowledge of them is in the perf tool.


>
> > + u64 val;
> > + struct perf_counts_values *count =
> > + perf_counts(counter->counts, cpu_map_idx, thread);
> > +
> > + if (counter->tool_event == PERF_TOOL_USER_TIME)
> > + val = ru_stats.ru_utime_usec_stat.mean;
> > + else
> > + val = ru_stats.ru_stime_usec_stat.mean;
> > + count->ena = count->run = val;
> > + count->val = val;
> > + return 0;
> > }
> > + return err;
> > }
> >
> > /*
> > * Read out the results of a single counter:
> > * do not aggregate counts across CPUs in system-wide mode
> > */
> > -static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_map_idx)
> > +static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
> > {
> > int nthreads = perf_thread_map__nr(evsel_list->core.threads);
> > int thread;
> > @@ -311,7 +303,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
> > * (via evsel__read_counter()) and sets their count->loaded.
> > */
> > if (!perf_counts__is_loaded(counter->counts, cpu_map_idx, thread) &&
> > - read_single_counter(counter, cpu_map_idx, thread, rs)) {
> > + read_single_counter(counter, cpu_map_idx, thread)) {
> > counter->counts->scaled = -1;
> > perf_counts(counter->counts, cpu_map_idx, thread)->ena = 0;
> > perf_counts(counter->counts, cpu_map_idx, thread)->run = 0;
> > @@ -340,7 +332,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
> > return 0;
> > }
> >
> > -static int read_affinity_counters(struct timespec *rs)
> > +static int read_affinity_counters(void)
> > {
> > struct evlist_cpu_iterator evlist_cpu_itr;
> > struct affinity saved_affinity, *affinity;
> > @@ -361,10 +353,8 @@ static int read_affinity_counters(struct timespec *rs)
> > if (evsel__is_bpf(counter))
> > continue;
> >
> > - if (!counter->err) {
> > - counter->err = read_counter_cpu(counter, rs,
> > - evlist_cpu_itr.cpu_map_idx);
> > - }
> > + if (!counter->err)
> > + counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
> > }
> > if (affinity)
> > affinity__cleanup(&saved_affinity);
> > @@ -388,11 +378,11 @@ static int read_bpf_map_counters(void)
> > return 0;
> > }
> >
> > -static int read_counters(struct timespec *rs)
> > +static int read_counters(void)
> > {
> > if (!stat_config.stop_read_counter) {
> > if (read_bpf_map_counters() ||
> > - read_affinity_counters(rs))
> > + read_affinity_counters())
> > return -1;
> > }
> > return 0;
> > @@ -423,7 +413,7 @@ static void process_interval(void)
> >
> > evlist__reset_aggr_stats(evsel_list);
> >
> > - if (read_counters(&rs) == 0)
> > + if (read_counters() == 0)
> > process_counters();
> >
> > if (STAT_RECORD) {
> > @@ -911,7 +901,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
> > * avoid arbitrary skew, we must read all counters before closing any
> > * group leaders.
> > */
> > - if (read_counters(&(struct timespec) { .tv_nsec = t1-t0 }) == 0)
> > + if (read_counters() == 0)
> > process_counters();
> >
> > /*
>
> I think this part can be a separate commit. You can implement the
> tool event handling before actually using it. :)

It's possible, but it opens a 3rd world where tool code is implemented
both in evsel and in builtin-stat. The builtin-stat code uses
wait4 while the evsel code uses file descriptors. If tests fail,
should the 3rd world be fixed? The intent here is that we're deleting
all the builtin-stat evsel tool code, and a consequence of that is
deleting unused function arguments. As the changes are purely deletions
I'd prefer not to have a combined world to temporarily maintain.

>
> > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> > index 3536404e9447..a0a8aee7d6b9 100644
> > --- a/tools/perf/util/evsel.c
> > +++ b/tools/perf/util/evsel.c
> > @@ -10,6 +10,7 @@
> > #include <errno.h>
> > #include <inttypes.h>
> > #include <linux/bitops.h>
> > +#include <api/io.h>
> > #include <api/fs/fs.h>
> > #include <api/fs/tracing_path.h>
> > #include <linux/hw_breakpoint.h>
> > @@ -30,6 +31,7 @@
> > #include "counts.h"
> > #include "event.h"
> > #include "evsel.h"
> > +#include "time-utils.h"
> > #include "util/env.h"
> > #include "util/evsel_config.h"
> > #include "util/evsel_fprintf.h"
> > @@ -1600,11 +1602,183 @@ static int evsel__read_group(struct evsel *leader, int cpu_map_idx, int thread)
> > return evsel__process_group_data(leader, cpu_map_idx, thread, data);
> > }
> >
> > +static int read_stat_field(int fd, struct perf_cpu cpu, int field, __u64 *val)
> > +{
> > + char buf[256];
> > + struct io io;
> > + int c, i;
> > +
> > + io__init(&io, fd, buf, sizeof(buf));
> > +
> > + /* Skip lines to relevant CPU. */
> > + for (i = -1; i < cpu.cpu; i++) {
> > + do {
> > + c = io__get_char(&io);
> > + if (c == -1)
> > + return -EINVAL;
> > + } while (c != '\n');
> > + }
> > + /* Skip to "cpu". */
> > + c = io__get_char(&io);
> > + if (c != 'c')
> > + return -EINVAL;
> > + c = io__get_char(&io);
> > + if (c != 'p')
> > + return -EINVAL;
> > + c = io__get_char(&io);
> > + if (c != 'u')
> > + return -EINVAL;
> > + /* Skip N of cpuN. */
> > + do {
> > + c = io__get_char(&io);
> > + if (c == -1)
> > + return -EINVAL;
> > + } while (c != ' ');
> > +
> > + i = 1;
> > + while (true) {
> > + c = io__get_dec(&io, val);
> > + if (c != ' ')
> > + break;
> > + if (field == i)
> > + return 0;
> > + i++;
> > + }
> > + return -EINVAL;
> > +}
>
> Looks somewhat verbose, can we have something like
> io__get_str_field() or io__get_num_filed()?

I think macro magic would probably be the best way to cut down on
verbosity, but I'm not sure it's a big deal given that the majority of
the field handling happens in the while loops. I'd like to skip this
for now.

>
> > +
> > +static int read_pid_stat_field(int fd, int field, __u64 *val)
> > +{
> > + char buf[256];
> > + struct io io;
> > + int c, i;
> > +
> > + io__init(&io, fd, buf, sizeof(buf));
> > + c = io__get_dec(&io, val);
> > + if (c != ' ')
> > + return -EINVAL;
> > + if (field == 1)
> > + return 0;
> > +
> > + /* Skip comm. */
> > + c = io__get_char(&io);
> > + if (c != '(')
> > + return -EINVAL;
> > + do {
> > + c = io__get_char(&io);
> > + if (c == -1)
> > + return -EINVAL;
> > + } while (c != ')');
> > + if (field == 2)
> > + return -EINVAL;
> > +
> > + /* Skip state */
> > + c = io__get_char(&io);
> > + if (c != ' ')
> > + return -EINVAL;
> > + c = io__get_char(&io);
> > + if (c == -1)
> > + return -EINVAL;
> > + if (field == 3)
> > + return -EINVAL;
> > +
> > + /* Loop over numeric fields */
> > + c = io__get_char(&io);
> > + if (c != ' ')
> > + return -EINVAL;
> > +
> > + i = 4;
> > + while (true) {
> > + c = io__get_dec(&io, val);
> > + if (c == -1)
> > + return -EINVAL;
> > + if (c == -2) {
> > + /* Assume a -ve was read */
> > + c = io__get_dec(&io, val);
> > + *val *= -1;
> > + }
> > + if (c != ' ')
> > + return -EINVAL;
> > + if (field == i)
> > + return 0;
> > + i++;
> > + }
> > + return -EINVAL;
> > +}
> > +
> > +static int evsel__read_tool(struct evsel *evsel, int cpu_map_idx, int thread)
> > +{
> > + __u64 cur_time, delta_start;
> > + int fd, err = 0;
> > + struct perf_counts_values *count;
> > + bool adjust = false;
> > +
> > + count = perf_counts(evsel->counts, cpu_map_idx, thread);
> > +
> > + switch (evsel->tool_event) {
> > + case PERF_TOOL_DURATION_TIME:
> > + /*
> > + * Pretend duration_time is only on the first CPU and thread, or
> > + * else aggregation will scale duration_time by the number of
> > + * CPUs/threads.
> > + */
>
> We could set evsel->pmu->cpus to 0, if there's a tool pmu.

For the CPU it already is:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/util/parse-events.c#n314
There isn't an equivalent for the thread. I was tempted to add an
assert that cpu_map_idx == 0, but it seemed more forgiving to treat it
the way threads are handled - I'm always concerned about breaking
existing code given how often I succeed at doing it :-). We could also
try to divide the delta across all CPUs/threads, but that seemed
weird.

>
> > + if (cpu_map_idx == 0 && thread == 0)
> > + cur_time = rdclock();
> > + else
> > + cur_time = evsel->start_time;
> > + break;
> > + case PERF_TOOL_USER_TIME:
> > + case PERF_TOOL_SYSTEM_TIME: {
> > + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
> > +
> > + fd = FD(evsel, cpu_map_idx, thread);
> > + lseek(fd, 0, SEEK_SET);
> > + if (evsel->pid_stat) {
> > + /* The event exists solely on 1 CPU. */
>
> Probably the same.

Similar to above. In system-wide mode we can give per-CPU user and
system time breakdowns, whereas the /proc/pid/stat version doesn't support
that. The code is trying to make the most out of the information /proc
will give it. Thinking about it, we shouldn't be pinning the user and
system time to CPU0 in that case, so add_event_tool:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/util/parse-events.c#n302
should pass "0" in the duration_time case and NULL for the others.
This should allow, say, "perf stat -A -I 1000 -e user_time,system_time"
to give a per-second, per-CPU user and system time value. This reminds
me that it'd be nice to do something similar for hwmon values, another
fake PMU case.
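
As a rough sketch (tool_event_cpu_list() is a made-up helper here; the
real change would be in the cpu list that add_event_tool() passes to
__add_event()):

/* Sketch for util/parse-events.c: pin only duration_time to CPU 0; a
 * NULL list lets user_time/system_time be opened on every CPU in
 * system-wide mode. */
static const char *tool_event_cpu_list(enum perf_tool_event tool_event)
{
	return tool_event == PERF_TOOL_DURATION_TIME ? "0" : NULL;
}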

Thanks,
Ian

> Thanks,
> Namhyung
>
>
> > + if (cpu_map_idx == 0)
> > + err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
> > + else
> > + cur_time = 0;
> > + } else {
> > + /* The event is for all threads. */
> > + if (thread == 0) {
> > + struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
> > + cpu_map_idx);
> > +
> > + err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
> > + } else {
> > + cur_time = 0;
> > + }
> > + }
> > + adjust = true;
> > + break;
> > + }
> > + case PERF_TOOL_NONE:
> > + case PERF_TOOL_MAX:
> > + default:
> > + err = -EINVAL;
> > + }
> > + if (err)
> > + return err;
> > +
> > + delta_start = cur_time - evsel->start_time;
> > + if (adjust) {
> > + __u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
> > +
> > + delta_start *= 1000000000 / ticks_per_sec;
> > + }
> > + count->val = delta_start;
> > + count->ena = count->run = delta_start;
> > + count->lost = 0;
> > + return 0;
> > +}
> > +
> > int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
> > {
> > - u64 read_format = evsel->core.attr.read_format;
> > + if (evsel__is_tool(evsel))
> > + return evsel__read_tool(evsel, cpu_map_idx, thread);
> >
> > - if (read_format & PERF_FORMAT_GROUP)
> > + if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
> > return evsel__read_group(evsel, cpu_map_idx, thread);
> >
> > return evsel__read_one(evsel, cpu_map_idx, thread);
> > @@ -2005,6 +2179,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> > int pid = -1, err, old_errno;
> > enum rlimit_action set_rlimit = NO_CHANGE;
> >
> > + if (evsel->tool_event == PERF_TOOL_DURATION_TIME) {
> > + if (evsel->core.attr.sample_period) /* no sampling */
> > + return -EINVAL;
> > + evsel->start_time = rdclock();
> > + return 0;
> > + }
> > +
> > err = __evsel__prepare_open(evsel, cpus, threads);
> > if (err)
> > return err;
> > @@ -2037,6 +2218,44 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> > if (!evsel->cgrp && !evsel->core.system_wide)
> > pid = perf_thread_map__pid(threads, thread);
> >
> > + if (evsel->tool_event == PERF_TOOL_USER_TIME ||
> > + evsel->tool_event == PERF_TOOL_SYSTEM_TIME) {
> > + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
> > +
> > + if (evsel->core.attr.sample_period) {
> > + /* no sampling */
> > + err = -EINVAL;
> > + goto out_close;
> > + }
> > + if (pid > -1) {
> > + char buf[64];
> > +
> > + snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
> > + fd = open(buf, O_RDONLY);
> > + evsel->pid_stat = true;
> > + } else {
> > + fd = open("/proc/stat", O_RDONLY);
> > + }
> > + FD(evsel, idx, thread) = fd;
> > + if (fd < 0) {
> > + err = -errno;
> > + goto out_close;
> > + }
> > + if (pid > -1) {
> > + err = read_pid_stat_field(fd, system ? 15 : 14,
> > + &evsel->start_time);
> > + } else {
> > + struct perf_cpu cpu;
> > +
> > + cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
> > + err = read_stat_field(fd, cpu, system ? 3 : 1,
> > + &evsel->start_time);
> > + }
> > + if (err)
> > + goto out_close;
> > + continue;
> > + }
> > +
> > group_fd = get_group_fd(evsel, idx, thread);
> >
> > if (group_fd == -2) {
> > diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> > index 517cff431de2..43f6fd1dcb4d 100644
> > --- a/tools/perf/util/evsel.h
> > +++ b/tools/perf/util/evsel.h
> > @@ -170,6 +170,12 @@ struct evsel {
> >
> > /* for missing_features */
> > struct perf_pmu *pmu;
> > +
> > + /* For tool events */
> > + /* Beginning time subtracted when the counter is read. */
> > + __u64 start_time;
> > + /* Is the tool's fd for /proc/pid/stat or /proc/stat. */
> > + bool pid_stat;
> > };
> >
> > struct perf_missing_features {
> > --
> > 2.44.0.769.g3c40516874-goog
> >

2024-04-30 02:52:38

by Ian Rogers

[permalink] [raw]
Subject: Re: [RFC PATCH v2 2/3] perf parse-events: Add a retirement latency modifier

On Mon, Apr 29, 2024 at 2:08 PM Namhyung Kim <[email protected]> wrote:
>
> On Sat, Apr 27, 2024 at 10:36 PM Ian Rogers <[email protected]> wrote:
> >
> > Retirement latency is a separate sampled count used on newer Intel
> > CPUs.
>
> Can you please update the "event modifiers" section in the perf list
> documentation too?

Will do; currently the only users are TMA metrics.

> Also I'm curious if we have a doc for the JSON metric format.

Nope, nor a schema. It'd be worth setting one up and, say, using it in
the jevents.py parser. Perhaps the JSON format could be the same as the
`perf list -j` JSON output too.

Thanks,
Ian

> Thanks,
> Namhyung
>
> >
> > Signed-off-by: Ian Rogers <[email protected]>
> > ---
> > tools/perf/util/evsel.h | 1 +
> > tools/perf/util/parse-events.c | 2 ++
> > tools/perf/util/parse-events.h | 1 +
> > tools/perf/util/parse-events.l | 3 ++-
> > 4 files changed, 6 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> > index 43f6fd1dcb4d..bd8e84954e34 100644
> > --- a/tools/perf/util/evsel.h
> > +++ b/tools/perf/util/evsel.h
> > @@ -98,6 +98,7 @@ struct evsel {
> > bool bpf_counter;
> > bool use_config_name;
> > bool skippable;
> > + bool retire_lat;
> > int bpf_fd;
> > struct bpf_object *bpf_obj;
> > struct list_head config_terms;
> > diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
> > index 0f308b4db2b9..9c2a76ec8c99 100644
> > --- a/tools/perf/util/parse-events.c
> > +++ b/tools/perf/util/parse-events.c
> > @@ -1818,6 +1818,8 @@ static int parse_events__modifier_list(struct parse_events_state *parse_state,
> > evsel->weak_group = true;
> > if (mod.bpf)
> > evsel->bpf_counter = true;
> > + if (mod.retire_lat)
> > + evsel->retire_lat = true;
> > }
> > return 0;
> > }
> > diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
> > index 5695308efab9..eb94d1247dae 100644
> > --- a/tools/perf/util/parse-events.h
> > +++ b/tools/perf/util/parse-events.h
> > @@ -201,6 +201,7 @@ struct parse_events_modifier {
> > bool hypervisor : 1; /* 'h' */
> > bool guest : 1; /* 'G' */
> > bool host : 1; /* 'H' */
> > + bool retire_lat : 1; /* 'R' */
> > };
> >
> > int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc,
> > diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
> > index 08ea2d845dc3..85015f080240 100644
> > --- a/tools/perf/util/parse-events.l
> > +++ b/tools/perf/util/parse-events.l
> > @@ -209,6 +209,7 @@ static int modifiers(struct parse_events_state *parse_state, yyscan_t scanner)
> > CASE('W', weak);
> > CASE('e', exclusive);
> > CASE('b', bpf);
> > + CASE('R', retire_lat);
> > default:
> > return PE_ERROR;
> > }
> > @@ -250,7 +251,7 @@ drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)?
> > * If you add a modifier you need to update check_modifier().
> > * Also, the letters in modifier_event must not be in modifier_bp.
> > */
> > -modifier_event [ukhpPGHSDIWeb]{1,15}
> > +modifier_event [ukhpPGHSDIWebR]{1,16}
> > modifier_bp [rwx]{1,3}
> > lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node)
> > lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss)
> > --
> > 2.44.0.769.g3c40516874-goog
> >

2024-04-30 19:02:02

by Namhyung Kim

[permalink] [raw]
Subject: Re: [RFC PATCH v2 1/3] perf evsel: Refactor tool events

On Mon, Apr 29, 2024 at 7:47 PM Ian Rogers <[email protected]> wrote:
>
> On Mon, Apr 29, 2024 at 2:00 PM Namhyung Kim <[email protected]> wrote:
> >
> > Hi Ian,
> >
> > On Sat, Apr 27, 2024 at 10:36 PM Ian Rogers <[email protected]> wrote:
> > >
> > > Tool events unnecessarily open a dummy perf event which is useless
> > > even with `perf record` which will still open a dummy event. Change
> > > the behavior of tool events so:
> >
> > Right, I was surprised that we need a dummy event for that.
> >
> > >
> > > - duration_time - call `rdclock` on open and then report the count as
> > > a delta since the start in evsel__read_counter. This moves code out
> > > of builtin-stat making it more general purpose.
> > >
> > > - user_time/system_time - open the fd as either `/proc/pid/stat` or
> > > `/proc/stat` for cases like system wide. evsel__read_counter will
> > > read the appropriate field out of the procfs file. These values
> > > were previously supplied by wait4, if the procfs read fails then
> > > the wait4 values are used, assuming the process/thread terminated.
> > > By reading user_time and system_time this way, interval mode can be
> > > supported.
> >
> > Good improvement!
> >
> > >
> > > Opening any of the tool events for `perf record` returns invalid.
> >
> > Ok, I think those tool events are for `perf stat` only.
> >
> > But I feel like this change is a separate optimization and can be
> > independently submitted (from the retire-latency or tPEBS change).
>
> You're right, but as both pieces of code are changing the
> evsel__read_counter implementation I'd kept them together.

Ok.

>
> > >
> > > Signed-off-by: Ian Rogers <[email protected]>
> > > ---
> > > tools/perf/builtin-stat.c | 74 ++++++-------
> > > tools/perf/util/evsel.c | 223 +++++++++++++++++++++++++++++++++++++-
> > > tools/perf/util/evsel.h | 6 +
> > > 3 files changed, 259 insertions(+), 44 deletions(-)
> > >
> > > diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
> > > index 65a3dd7ffac3..428e9721b908 100644
> > > --- a/tools/perf/builtin-stat.c
> > > +++ b/tools/perf/builtin-stat.c
> > > @@ -255,45 +255,37 @@ static int evsel__write_stat_event(struct evsel *counter, int cpu_map_idx, u32 t
> > > process_synthesized_event, NULL);
> > > }
> > >
> > > -static int read_single_counter(struct evsel *counter, int cpu_map_idx,
> > > - int thread, struct timespec *rs)
> > > -{
> > > - switch(counter->tool_event) {
> > > - case PERF_TOOL_DURATION_TIME: {
> > > - u64 val = rs->tv_nsec + rs->tv_sec*1000000000ULL;
> > > - struct perf_counts_values *count =
> > > - perf_counts(counter->counts, cpu_map_idx, thread);
> > > - count->ena = count->run = val;
> > > - count->val = val;
> > > - return 0;
> > > - }
> > > - case PERF_TOOL_USER_TIME:
> > > - case PERF_TOOL_SYSTEM_TIME: {
> > > - u64 val;
> > > - struct perf_counts_values *count =
> > > - perf_counts(counter->counts, cpu_map_idx, thread);
> > > - if (counter->tool_event == PERF_TOOL_USER_TIME)
> > > - val = ru_stats.ru_utime_usec_stat.mean;
> > > - else
> > > - val = ru_stats.ru_stime_usec_stat.mean;
> > > - count->ena = count->run = val;
> > > - count->val = val;
> > > - return 0;
> > > - }
> > > - default:
> > > - case PERF_TOOL_NONE:
> > > - return evsel__read_counter(counter, cpu_map_idx, thread);
> > > - case PERF_TOOL_MAX:
> > > - /* This should never be reached */
> > > - return 0;
> > > +static int read_single_counter(struct evsel *counter, int cpu_map_idx, int thread)
> > > +{
> > > + int err = evsel__read_counter(counter, cpu_map_idx, thread);
> > > +
> > > + /*
> > > + * Reading user and system time will fail when the process
> > > + * terminates. Use the wait4 values in that case.
> > > + */
> > > + if (err &&
> > > + (counter->tool_event == PERF_TOOL_USER_TIME ||
> > > + counter->tool_event == PERF_TOOL_SYSTEM_TIME)) {
> >
> > Off-topic. I feel like we should have a (fake) pmu for the tool events
> > rather than the evsel->tool_event enum.
>
> Sounds good to me. The refactoring in this (unmerged) test change:
> https://lore.kernel.org/lkml/[email protected]/
> makes it so we can load a PMU from something like a temporary
> directory. It would be nice to get rid of the fake_pmu special case in
> parse-events and similarly the tool event special case. A problem is
> that libperf lacks knowledge of fake_pmus and tool events in the
> evlist and evsel code. libperf can try to apply operations to these
> events even though the only knowledge of them is in the perf tool.

Right, libperf should be aware of the tool pmu then.

>
> >
> > > + u64 val;
> > > + struct perf_counts_values *count =
> > > + perf_counts(counter->counts, cpu_map_idx, thread);
> > > +
> > > + if (counter->tool_event == PERF_TOOL_USER_TIME)
> > > + val = ru_stats.ru_utime_usec_stat.mean;
> > > + else
> > > + val = ru_stats.ru_stime_usec_stat.mean;
> > > + count->ena = count->run = val;
> > > + count->val = val;
> > > + return 0;
> > > }
> > > + return err;
> > > }
> > >
> > > /*
> > > * Read out the results of a single counter:
> > > * do not aggregate counts across CPUs in system-wide mode
> > > */
> > > -static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_map_idx)
> > > +static int read_counter_cpu(struct evsel *counter, int cpu_map_idx)
> > > {
> > > int nthreads = perf_thread_map__nr(evsel_list->core.threads);
> > > int thread;
> > > @@ -311,7 +303,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
> > > * (via evsel__read_counter()) and sets their count->loaded.
> > > */
> > > if (!perf_counts__is_loaded(counter->counts, cpu_map_idx, thread) &&
> > > - read_single_counter(counter, cpu_map_idx, thread, rs)) {
> > > + read_single_counter(counter, cpu_map_idx, thread)) {
> > > counter->counts->scaled = -1;
> > > perf_counts(counter->counts, cpu_map_idx, thread)->ena = 0;
> > > perf_counts(counter->counts, cpu_map_idx, thread)->run = 0;
> > > @@ -340,7 +332,7 @@ static int read_counter_cpu(struct evsel *counter, struct timespec *rs, int cpu_
> > > return 0;
> > > }
> > >
> > > -static int read_affinity_counters(struct timespec *rs)
> > > +static int read_affinity_counters(void)
> > > {
> > > struct evlist_cpu_iterator evlist_cpu_itr;
> > > struct affinity saved_affinity, *affinity;
> > > @@ -361,10 +353,8 @@ static int read_affinity_counters(struct timespec *rs)
> > > if (evsel__is_bpf(counter))
> > > continue;
> > >
> > > - if (!counter->err) {
> > > - counter->err = read_counter_cpu(counter, rs,
> > > - evlist_cpu_itr.cpu_map_idx);
> > > - }
> > > + if (!counter->err)
> > > + counter->err = read_counter_cpu(counter, evlist_cpu_itr.cpu_map_idx);
> > > }
> > > if (affinity)
> > > affinity__cleanup(&saved_affinity);
> > > @@ -388,11 +378,11 @@ static int read_bpf_map_counters(void)
> > > return 0;
> > > }
> > >
> > > -static int read_counters(struct timespec *rs)
> > > +static int read_counters(void)
> > > {
> > > if (!stat_config.stop_read_counter) {
> > > if (read_bpf_map_counters() ||
> > > - read_affinity_counters(rs))
> > > + read_affinity_counters())
> > > return -1;
> > > }
> > > return 0;
> > > @@ -423,7 +413,7 @@ static void process_interval(void)
> > >
> > > evlist__reset_aggr_stats(evsel_list);
> > >
> > > - if (read_counters(&rs) == 0)
> > > + if (read_counters() == 0)
> > > process_counters();
> > >
> > > if (STAT_RECORD) {
> > > @@ -911,7 +901,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx)
> > > * avoid arbitrary skew, we must read all counters before closing any
> > > * group leaders.
> > > */
> > > - if (read_counters(&(struct timespec) { .tv_nsec = t1-t0 }) == 0)
> > > + if (read_counters() == 0)
> > > process_counters();
> > >
> > > /*
> >
> > I think this part can be a separate commit. You can implement the
> > tool event handling before actually using it. :)
>
> It's possible, but it opens a 3rd world where tool code is implemented
> both in evsel and in builtin-stat. The builtin-stat code uses
> wait4 while the evsel code uses file descriptors. If tests fail,
> should the 3rd world be fixed? The intent here is that we're deleting
> all the builtin-stat evsel tool code, and a consequence of that is
> deleting unused function arguments. As the changes are purely deletions
> I'd prefer not to have a combined world to temporarily maintain.

I guess the tool events are only used by perf stat; are there any
tests that use them?

Anyway it's not a big deal. I just wanted to keep each change
small. I think it's ok to add the code for tool events to evsel as
long as it's not used yet. But I don't insist on it strongly.

>
> >
> > > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> > > index 3536404e9447..a0a8aee7d6b9 100644
> > > --- a/tools/perf/util/evsel.c
> > > +++ b/tools/perf/util/evsel.c
> > > @@ -10,6 +10,7 @@
> > > #include <errno.h>
> > > #include <inttypes.h>
> > > #include <linux/bitops.h>
> > > +#include <api/io.h>
> > > #include <api/fs/fs.h>
> > > #include <api/fs/tracing_path.h>
> > > #include <linux/hw_breakpoint.h>
> > > @@ -30,6 +31,7 @@
> > > #include "counts.h"
> > > #include "event.h"
> > > #include "evsel.h"
> > > +#include "time-utils.h"
> > > #include "util/env.h"
> > > #include "util/evsel_config.h"
> > > #include "util/evsel_fprintf.h"
> > > @@ -1600,11 +1602,183 @@ static int evsel__read_group(struct evsel *leader, int cpu_map_idx, int thread)
> > > return evsel__process_group_data(leader, cpu_map_idx, thread, data);
> > > }
> > >
> > > +static int read_stat_field(int fd, struct perf_cpu cpu, int field, __u64 *val)
> > > +{
> > > + char buf[256];
> > > + struct io io;
> > > + int c, i;
> > > +
> > > + io__init(&io, fd, buf, sizeof(buf));
> > > +
> > > + /* Skip lines to relevant CPU. */
> > > + for (i = -1; i < cpu.cpu; i++) {
> > > + do {
> > > + c = io__get_char(&io);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + } while (c != '\n');
> > > + }
> > > + /* Skip to "cpu". */
> > > + c = io__get_char(&io);
> > > + if (c != 'c')
> > > + return -EINVAL;
> > > + c = io__get_char(&io);
> > > + if (c != 'p')
> > > + return -EINVAL;
> > > + c = io__get_char(&io);
> > > + if (c != 'u')
> > > + return -EINVAL;
> > > + /* Skip N of cpuN. */
> > > + do {
> > > + c = io__get_char(&io);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + } while (c != ' ');
> > > +
> > > + i = 1;
> > > + while (true) {
> > > + c = io__get_dec(&io, val);
> > > + if (c != ' ')
> > > + break;
> > > + if (field == i)
> > > + return 0;
> > > + i++;
> > > + }
> > > + return -EINVAL;
> > > +}
> >
> > Looks somewhat verbose, can we have something like
> > io__get_str_field() or io__get_num_filed()?
>
> I think macro magic would probably be the best way to cut down on
> verbosity, but I'm not sure it's a big deal given that the majority of
> the field handling happens in the while loops. I'd like to skip this
> for now.

It's not a big deal. But I just thought this could be generic and
live in tools/lib. :)
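
Something like the below, as a rough sketch on top of the api/io.h
helpers (io__skip_fields() is a made-up name):

/* Sketch for tools/lib/api/io.h: skip the next nr space-terminated
 * fields, leaving io positioned at the start of the field after
 * them. */
static inline int io__skip_fields(struct io *io, int nr)
{
	while (nr-- > 0) {
		int c;

		do {
			c = io__get_char(io);
			if (c == -1)
				return -1;
		} while (c != ' ');
	}
	return 0;
}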

>
> >
> > > +
> > > +static int read_pid_stat_field(int fd, int field, __u64 *val)
> > > +{
> > > + char buf[256];
> > > + struct io io;
> > > + int c, i;
> > > +
> > > + io__init(&io, fd, buf, sizeof(buf));
> > > + c = io__get_dec(&io, val);
> > > + if (c != ' ')
> > > + return -EINVAL;
> > > + if (field == 1)
> > > + return 0;
> > > +
> > > + /* Skip comm. */
> > > + c = io__get_char(&io);
> > > + if (c != '(')
> > > + return -EINVAL;
> > > + do {
> > > + c = io__get_char(&io);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + } while (c != ')');

Hmm.. but the possible space in the comm would make it
complicated. :(
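
A rough sketch of a more robust skip, assuming the whole line has
already been read into a NUL-terminated buffer (skip_comm() is a
made-up helper):

#include <string.h>

/* The comm in /proc/<pid>/stat may contain spaces and even ')', so
 * scan for the last ')' rather than stopping at the first one. */
static const char *skip_comm(const char *line)
{
	const char *end = strrchr(line, ')');

	return end ? end + 1 : NULL; /* now at the " S ..." state field */
}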


> > > + if (field == 2)
> > > + return -EINVAL;
> > > +
> > > + /* Skip state */
> > > + c = io__get_char(&io);
> > > + if (c != ' ')
> > > + return -EINVAL;
> > > + c = io__get_char(&io);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + if (field == 3)
> > > + return -EINVAL;
> > > +
> > > + /* Loop over numeric fields */
> > > + c = io__get_char(&io);
> > > + if (c != ' ')
> > > + return -EINVAL;
> > > +
> > > + i = 4;
> > > + while (true) {
> > > + c = io__get_dec(&io, val);
> > > + if (c == -1)
> > > + return -EINVAL;
> > > + if (c == -2) {
> > > + /* Assume a -ve was read */
> > > + c = io__get_dec(&io, val);
> > > + *val *= -1;
> > > + }
> > > + if (c != ' ')
> > > + return -EINVAL;
> > > + if (field == i)
> > > + return 0;
> > > + i++;
> > > + }
> > > + return -EINVAL;
> > > +}
> > > +
> > > +static int evsel__read_tool(struct evsel *evsel, int cpu_map_idx, int thread)
> > > +{
> > > + __u64 cur_time, delta_start;
> > > + int fd, err = 0;
> > > + struct perf_counts_values *count;
> > > + bool adjust = false;
> > > +
> > > + count = perf_counts(evsel->counts, cpu_map_idx, thread);
> > > +
> > > + switch (evsel->tool_event) {
> > > + case PERF_TOOL_DURATION_TIME:
> > > + /*
> > > + * Pretend duration_time is only on the first CPU and thread, or
> > > + * else aggregation will scale duration_time by the number of
> > > + * CPUs/threads.
> > > + */
> >
> > We could set evsel->pmu->cpus to 0, if there's a tool pmu.
>
> For the CPU it already is:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/util/parse-events.c#n314
> There isn't an equivalent for the thread. I was tempted to add an
> assert that cpu_map_idx == 0, but it seemed more forgiving to treat it
> the way threads are handled - I'm always concerned about breaking
> existing code given how often I succeed at doing it :-). We could also
> try to divide the delta across all CPUs/threads, but that seemed
> weird.

Yeah, I think it's best if we keep a single event (cpu = -1, thread = 0?).
Then it needs to set the cpu/thread map regardless of user options.
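
Roughly like this, as an untested sketch (tool_evsel__pin_single() is
a made-up name and the exact libperf calls would need checking):

/* Sketch for util/evsel.c: force a tool evsel onto a single
 * cpu/thread slot so aggregation cannot scale its count. */
static void tool_evsel__pin_single(struct evsel *evsel)
{
	perf_cpu_map__put(evsel->core.own_cpus);
	evsel->core.own_cpus = perf_cpu_map__new_any_cpu(); /* cpu == -1 */
	perf_thread_map__put(evsel->core.threads);
	evsel->core.threads = perf_thread_map__new_dummy(); /* 1 thread */
}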

>
> >
> > > + if (cpu_map_idx == 0 && thread == 0)
> > > + cur_time = rdclock();
> > > + else
> > > + cur_time = evsel->start_time;
> > > + break;
> > > + case PERF_TOOL_USER_TIME:
> > > + case PERF_TOOL_SYSTEM_TIME: {
> > > + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
> > > +
> > > + fd = FD(evsel, cpu_map_idx, thread);
> > > + lseek(fd, 0, SEEK_SET);
> > > + if (evsel->pid_stat) {
> > > + /* The event exists solely on 1 CPU. */
> >
> > Probably the same.
>
> Similar to above. In system-wide mode we can give per-CPU user and
> system time breakdowns, whereas the /proc/pid/stat version doesn't support
> that. The code is trying to make the most out of the information /proc
> will give it. Thinking about it, we shouldn't be pinning the user and
> system time to CPU0 in that case, so add_event_tool:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/util/parse-events.c#n302
> should pass "0" in the duration_time case and NULL for the others.
> This should allow, say, "perf stat -A -I 1000 -e user_time,system_time"
> to give a per-second, per-CPU user and system time value. This reminds
> me that it'd be nice to do something similar for hwmon values, another
> fake PMU case.

That makes sense. Actually I'm wondering if we could make the
cpu-clock and task-clock events accept the k and u modifiers.
But it'd require programming the timers on every kernel/user
context switch, and everyone would hate it. ;-)

Thanks,
Namhyung

> >
> >
> > > + if (cpu_map_idx == 0)
> > > + err = read_pid_stat_field(fd, system ? 15 : 14, &cur_time);
> > > + else
> > > + cur_time = 0;
> > > + } else {
> > > + /* The event is for all threads. */
> > > + if (thread == 0) {
> > > + struct perf_cpu cpu = perf_cpu_map__cpu(evsel->core.cpus,
> > > + cpu_map_idx);
> > > +
> > > + err = read_stat_field(fd, cpu, system ? 3 : 1, &cur_time);
> > > + } else {
> > > + cur_time = 0;
> > > + }
> > > + }
> > > + adjust = true;
> > > + break;
> > > + }
> > > + case PERF_TOOL_NONE:
> > > + case PERF_TOOL_MAX:
> > > + default:
> > > + err = -EINVAL;
> > > + }
> > > + if (err)
> > > + return err;
> > > +
> > > + delta_start = cur_time - evsel->start_time;
> > > + if (adjust) {
> > > + __u64 ticks_per_sec = sysconf(_SC_CLK_TCK);
> > > +
> > > + delta_start *= 1000000000 / ticks_per_sec;
> > > + }
> > > + count->val = delta_start;
> > > + count->ena = count->run = delta_start;
> > > + count->lost = 0;
> > > + return 0;
> > > +}
> > > +
> > > int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
> > > {
> > > - u64 read_format = evsel->core.attr.read_format;
> > > + if (evsel__is_tool(evsel))
> > > + return evsel__read_tool(evsel, cpu_map_idx, thread);
> > >
> > > - if (read_format & PERF_FORMAT_GROUP)
> > > + if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
> > > return evsel__read_group(evsel, cpu_map_idx, thread);
> > >
> > > return evsel__read_one(evsel, cpu_map_idx, thread);
> > > @@ -2005,6 +2179,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> > > int pid = -1, err, old_errno;
> > > enum rlimit_action set_rlimit = NO_CHANGE;
> > >
> > > + if (evsel->tool_event == PERF_TOOL_DURATION_TIME) {
> > > + if (evsel->core.attr.sample_period) /* no sampling */
> > > + return -EINVAL;
> > > + evsel->start_time = rdclock();
> > > + return 0;
> > > + }
> > > +
> > > err = __evsel__prepare_open(evsel, cpus, threads);
> > > if (err)
> > > return err;
> > > @@ -2037,6 +2218,44 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> > > if (!evsel->cgrp && !evsel->core.system_wide)
> > > pid = perf_thread_map__pid(threads, thread);
> > >
> > > + if (evsel->tool_event == PERF_TOOL_USER_TIME ||
> > > + evsel->tool_event == PERF_TOOL_SYSTEM_TIME) {
> > > + bool system = evsel->tool_event == PERF_TOOL_SYSTEM_TIME;
> > > +
> > > + if (evsel->core.attr.sample_period) {
> > > + /* no sampling */
> > > + err = -EINVAL;
> > > + goto out_close;
> > > + }
> > > + if (pid > -1) {
> > > + char buf[64];
> > > +
> > > + snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
> > > + fd = open(buf, O_RDONLY);
> > > + evsel->pid_stat = true;
> > > + } else {
> > > + fd = open("/proc/stat", O_RDONLY);
> > > + }
> > > + FD(evsel, idx, thread) = fd;
> > > + if (fd < 0) {
> > > + err = -errno;
> > > + goto out_close;
> > > + }
> > > + if (pid > -1) {
> > > + err = read_pid_stat_field(fd, system ? 15 : 14,
> > > + &evsel->start_time);
> > > + } else {
> > > + struct perf_cpu cpu;
> > > +
> > > + cpu = perf_cpu_map__cpu(evsel->core.cpus, idx);
> > > + err = read_stat_field(fd, cpu, system ? 3 : 1,
> > > + &evsel->start_time);
> > > + }
> > > + if (err)
> > > + goto out_close;
> > > + continue;
> > > + }
> > > +
> > > group_fd = get_group_fd(evsel, idx, thread);
> > >
> > > if (group_fd == -2) {
> > > diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> > > index 517cff431de2..43f6fd1dcb4d 100644
> > > --- a/tools/perf/util/evsel.h
> > > +++ b/tools/perf/util/evsel.h
> > > @@ -170,6 +170,12 @@ struct evsel {
> > >
> > > /* for missing_features */
> > > struct perf_pmu *pmu;
> > > +
> > > + /* For tool events */
> > > + /* Beginning time subtracted when the counter is read. */
> > > + __u64 start_time;
> > > + /* Is the tool's fd for /proc/pid/stat or /proc/stat. */
> > > + bool pid_stat;
> > > };
> > >
> > > struct perf_missing_features {
> > > --
> > > 2.44.0.769.g3c40516874-goog
> > >

2024-04-30 21:00:40

by Namhyung Kim

[permalink] [raw]
Subject: Re: [RFC PATCH v2 3/3] perf evsel: Add retirement latency event support

On Mon, Apr 29, 2024 at 8:27 PM Ian Rogers <[email protected]> wrote:
>
> On Mon, Apr 29, 2024 at 2:31 PM Namhyung Kim <[email protected]> wrote:
> >
> > On Sat, Apr 27, 2024 at 10:36 PM Ian Rogers <[email protected]> wrote:
> > >
> > > When a retirement latency event is processed it sets a flag on the
> > > evsel. This change makes it so that when the flag is set evsel
> > > opening, reading and exiting report values from child perf record and
> > > perf report processes.
> > >
> > > Something similar was suggested by Namhyung Kim in:
> > > https://lore.kernel.org/lkml/CAM9d7cgdQQn5GYB7t++xuoMdeqPXiEkkcop69+rD06RAnu9-EQ@mail.gmail.com/
> > >
> > > This is trying to add support for retirement latency directly in
> > > events rather than through metric changes, as suggested by Weilin Wang in:
> > > https://lore.kernel.org/lkml/[email protected]/
> >
> > This seems to create perf record + report pair for each R event
> > while Weilin's patch handled multiple events at once.
>
> Some issues with this change:
> - perf stat treats any SIGCHLD to the parent as the workload
> completing. Properly supporting child processes means perhaps using
> sigaction and detecting the child process that terminates. As
> evsel__read_counter sends a kill, it triggers a SIGCHLD that ends
> perf stat, thereby breaking interval mode.

Right, the interval mode is tricky to support.
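
For reference, a rough, untested sketch of the sigaction approach
(workload_pid/workload_done are made-up names for illustration):

#include <signal.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>

static pid_t workload_pid;                /* set when the workload forks */
static volatile sig_atomic_t workload_done;

static void sigchld_handler(int sig)
{
	int status;
	pid_t pid;

	(void)sig;
	/* Reap all children but only the workload flips the flag, so
	 * the evsel record/report children no longer end perf stat. */
	while ((pid = waitpid(-1, &status, WNOHANG)) > 0) {
		if (pid == workload_pid)
			workload_done = 1;
	}
}

static void install_sigchld(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = sigchld_handler;
	sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGCHLD, &sa, NULL);
}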

> - there's a pair of processes per CPU so that per-CPU mode is
> supported. Ideally we'd start a single process and read every CPU in
> that process in one go, then the evsel__read_counter code would read
> each CPU's count at once.

Also if there's more than one event.

> - record/report restarts happen on each evsel__read_counter as there
> doesn't seem to be enough functionality in the control fds to do
> periodic dumping.

Hmm.. maybe something like perf top.

> - the evsel__open_cpu has a "sleep 0.1" workload to avoid gathering
> too many samples and overloading the perf report process.

Yeah, maybe with a lower frequency too.

> - there's no way to disable the dummy event in perf record, even
> though all we want is one field out of retirement latency samples.

Right, in this case we don't care about the sideband events.

>
> Given that fixing all of these would be a lot of work, I think it is
> a similar or lesser amount of work to just read the retirement latency
> directly from events in perf stat. Making perf stat allow events that
> have a sampling buffer is troublesome, not least because the mmaps are
> associated with the evlist:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/util/evlist.h#n65
> and so the evlist code assumes every evsel is sampling.
>
> Given all of this, I think these patches are a quick way to add the
> retirement latency support, and then proper sampling support in `perf
> stat` can be worked on next to lower the overhead.

Ok, one more thing I asked Weilin for is documentation.
I don't think we all clearly understand what TPEBS is and
why we want to do this stuff. Writing down the situation
would help others (including future me) understand the
problem and the current solution.

Thanks,
Namhyung


>
> > >
> > > Signed-off-by: Ian Rogers <[email protected]>
> > > ---
> > > tools/perf/util/evsel.c | 186 +++++++++++++++++++++++++++++++++++++++-
> > > tools/perf/util/evsel.h | 3 +
> > > 2 files changed, 188 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
> > > index a0a8aee7d6b9..17c123cddde3 100644
> > > --- a/tools/perf/util/evsel.c
> > > +++ b/tools/perf/util/evsel.c
> > > @@ -22,6 +22,7 @@
> > > #include <sys/resource.h>
> > > #include <sys/types.h>
> > > #include <dirent.h>
> > > +#include <signal.h>
> > > #include <stdlib.h>
> > > #include <perf/evsel.h>
> > > #include "asm/bug.h"
> > > @@ -58,6 +59,7 @@
> > > #include <internal/xyarray.h>
> > > #include <internal/lib.h>
> > > #include <internal/threadmap.h>
> > > +#include <subcmd/run-command.h>
> > >
> > > #include <linux/ctype.h>
> > >
> > > @@ -493,6 +495,162 @@ struct evsel *evsel__newtp_idx(const char *sys, const char *name, int idx)
> > > }
> > > #endif
> > >
> > > +static int evsel__start_retire_latency_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> > > + int cpu_map_idx)
> > > +{
> > > + char buf[16];
> > > + int pipefd[2];
> > > + int err, i;
> > > + struct perf_cpu cpu = perf_cpu_map__cpu(cpus, cpu_map_idx);
> > > + struct child_process *child_record =
> > > + xyarray__entry(evsel->children, cpu_map_idx, 0);
> > > + struct child_process *child_report =
> > > + xyarray__entry(evsel->children, cpu_map_idx, 1);
> > > + char *event = strdup(evsel__name(evsel));
> > > + /* TODO: the dummy event also won't be used, but there's no option to disable. */
> > > + const char *record_argv[15] = {
> > > + [0] = "perf",
> > > + [1] = "record",
> > > + [2] = "--synth=no",
> > > + [3] = "--no-bpf-event",
> > > + [4] = "-W",
> > > + [5] = "-o",
> > > + [6] = "-",
> > > + [7] = "-e",
> > > + };
> > > + const char *report_argv[] = {
> > > + [0] = "perf",
> > > + [1] = "report",
> > > + [2] = "-i",
> > > + [3] = "-",
> > > + [4] = "-q",
> > > + [5] = "-F",
> > > + [6] = "retire_lat",
> > > + NULL,
> > > + };
> > > +
> > > + if (evsel->core.attr.sample_period) /* no sampling */
> > > + return -EINVAL;
> > > +
> > > + if (!event)
> > > + return -ENOMEM;
> > > +
> > > + /* Change the R modifier to the maximum precision one. */
> > > + for (i = strlen(event) - 1; i > 0; i--) {
> > > + if (event[i] == 'R')
> > > + break;
> > > + if (event[i] == ':' || event[i] == '/')
> > > + i = 0;
> > > + }
> > > + if (i <= 0) {
> > > + pr_err("Expected retired latency 'R'\n");
> > > + return -EINVAL;
> > > + }
> > > + event[i] = 'P';
> > > +
> > > + i = 8;
> > > + record_argv[i++] = event;
> > > + if (verbose) {
> > > + record_argv[i++] = verbose > 1 ? "-vv" : "-v";
> > > + }
> > > + if (cpu.cpu >= 0) {
> > > + record_argv[i++] = "-C";
> > > + snprintf(buf, sizeof(buf), "%d", cpu.cpu);
> > > + record_argv[i++] = buf;
> > > + } else {
> > > + record_argv[i++] = "-a";
> > > + }
> > > + /* TODO: use something like the control fds to control perf record behavior. */
> > > + record_argv[i++] = "sleep";
> > > + record_argv[i++] = "0.1";
> >
> This might be too short and the record process can exit
> before perf report (though I guess it's fine when using a pipe).
> Also I'm not sure it's ok to sample the retirement latency for only
> 100 ms regardless of how long the given workload runs.
>
> Ack. As said above, I think the correct longer term thing is to remove
> the forked processes.
>
> > > +
> > > + if (pipe(pipefd) < 0) {
> > > + free(event);
> > > + return -errno;
> > > + }
> > > +
> > > + child_record->argv = record_argv;
> > > + child_record->pid = -1;
> > > + child_record->no_stdin = 1;
> > > + if (verbose)
> > > + child_record->err = fileno(stderr);
> > > + else
> > > + child_record->no_stderr = 1;
> > > + child_record->out = pipefd[1];
> > > + err = start_command(child_record);
> > > + free(event);
> > > + if (err)
> > > + return err;
> > > +
> > > + child_report->argv = report_argv;
> > > + child_report->pid = -1;
> > > + if (verbose)
> > > + child_report->err = fileno(stderr);
> > > + else
> > > + child_report->no_stderr = 1;
> > > + child_report->in = pipefd[0];
> > > + child_report->out = -1;
> > > + return start_command(child_report);
> > > +}
> > > +
> > > +static int evsel__finish_retire_latency_cpu(struct evsel *evsel, int cpu_map_idx)
> > > +{
> > > + struct child_process *child_record =
> > > + xyarray__entry(evsel->children, cpu_map_idx, 0);
> > > + struct child_process *child_report =
> > > + xyarray__entry(evsel->children, cpu_map_idx, 1);
> > > +
> > > + if (child_record->pid > 0)
> > > + finish_command(child_record);
> > > + if (child_report->pid > 0)
> > > + finish_command(child_report);
> > > + return 0;
> > > +}
> > > +
> > > +static int evsel__read_retire_latency(struct evsel *evsel, int cpu_map_idx, int thread)
> > > +{
> > > + struct child_process *child_record = xyarray__entry(evsel->children, cpu_map_idx, 0);
> > > + struct child_process *child_report = xyarray__entry(evsel->children, cpu_map_idx, 1);
> > > + struct perf_counts_values *count = perf_counts(evsel->counts, cpu_map_idx, thread);
> > > + char buf[256];
> > > + int err;
> > > +
> > > + kill(child_record->pid, SIGTERM);
> > > + err = read(child_report->out, buf, sizeof(buf) - 1);
> > > + if (err <= 0)
> > > + return -1;
> > > + buf[err] = '\0';
> > > +
> > > + count->val = atoll(buf);
> > > + count->ena = 1;
> > > + count->run = 1;
> > > + count->id = 0;
> > > + count->lost = 0;
> > > +
> > > + /*
> > > + * TODO: The SIGCHLD from the children exiting will cause interval mode
> > > + * to stop, use the control fd to avoid this.
> > > + */
> > > + err = evsel__finish_retire_latency_cpu(evsel, cpu_map_idx);
> > > + if (err)
> > > + return err;
> > > +
> > > + /* Restart the counter. */
> > > + return evsel__start_retire_latency_cpu(evsel, evsel->core.cpus, cpu_map_idx);
> >
> > Is this for the interval mode? Then I think it's better to move the
> > logic there and let this code just do the 'read'.
>
> For encapsulation's sake I'm trying to make it so that we don't put
> event reading logic into perf stat. perf stat just makes
> counters/evsels and asks the code to read them. Behind the scenes we
> do things like this, reading /proc for tool events, potentially other
> things for hwmon, etc. My hope is that we can update event parsing
> and evsel, and from that alone get support for a new counter type.
>
> >
> > > +}
> > > +
> > > +static int evsel__finish_retire_latency(struct evsel *evsel)
> > > +{
> > > + int idx;
> > > +
> > > + perf_cpu_map__for_each_idx(idx, evsel->core.cpus) {
> > > + int err = evsel__finish_retire_latency_cpu(evsel, idx);
> > > +
> > > + if (err)
> > > + return err;
> > > + }
> > > + return 0;
> > > +}
> > > +
> > > const char *const evsel__hw_names[PERF_COUNT_HW_MAX] = {
> > > "cycles",
> > > "instructions",
> > > @@ -1465,6 +1623,10 @@ static void evsel__free_config_terms(struct evsel *evsel)
> > >
> > > void evsel__exit(struct evsel *evsel)
> > > {
> > > + if (evsel->children) {
> > > + evsel__finish_retire_latency(evsel);
> > > + zfree(&evsel->children);
> >
> > You'd better call xyarray__delete() and set it to NULL.
>
> Okay.
>
> Thanks,
> Ian
>
> > Thanks,
> > Namhyung
> >
> > > + }
> > > assert(list_empty(&evsel->core.node));
> > > assert(evsel->evlist == NULL);
> > > bpf_counter__destroy(evsel);
> > > @@ -1778,6 +1940,9 @@ int evsel__read_counter(struct evsel *evsel, int cpu_map_idx, int thread)
> > > if (evsel__is_tool(evsel))
> > > return evsel__read_tool(evsel, cpu_map_idx, thread);
> > >
> > > + if (evsel->retire_lat)
> > > + return evsel__read_retire_latency(evsel, cpu_map_idx, thread);
> > > +
> > > if (evsel->core.attr.read_format & PERF_FORMAT_GROUP)
> > > return evsel__read_group(evsel, cpu_map_idx, thread);
> > >
> > > @@ -1993,10 +2158,22 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus,
> > > threads = empty_thread_map;
> > > }
> > >
> > > - if (evsel->core.fd == NULL &&
> > > + if (!evsel->retire_lat && evsel->core.fd == NULL &&
> > > perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0)
> > > return -ENOMEM;
> > >
> > > + if (evsel->retire_lat && evsel->children == NULL) {
> > > + /*
> > > + * Use ylen of 2, [0] is the record and [1] is the report
> > > + * command. Currently retirement latency doesn't support
> > > + * per-thread mode.
> > > + */
> > > + evsel->children = xyarray__new(perf_cpu_map__nr(cpus), /*ylen=*/2,
> > > + sizeof(struct child_process));
> > > + if (!evsel->children)
> > > + return -ENOMEM;
> > > + }
> > > +
> > > evsel->open_flags = PERF_FLAG_FD_CLOEXEC;
> > > if (evsel->cgrp)
> > > evsel->open_flags |= PERF_FLAG_PID_CGROUP;
> > > @@ -2209,6 +2386,13 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
> > >
> > > for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) {
> > >
> > > + if (evsel->retire_lat) {
> > > + err = evsel__start_retire_latency_cpu(evsel, cpus, idx);
> > > + if (err)
> > > + goto out_close;
> > > + continue;
> > > + }
> > > +
> > > for (thread = 0; thread < nthreads; thread++) {
> > > int fd, group_fd;
> > > retry_open:
> > > diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
> > > index bd8e84954e34..3c0806436e64 100644
> > > --- a/tools/perf/util/evsel.h
> > > +++ b/tools/perf/util/evsel.h
> > > @@ -177,6 +177,9 @@ struct evsel {
> > > __u64 start_time;
> > > /* Is the tool's fd for /proc/pid/stat or /proc/stat. */
> > > bool pid_stat;
> > > +
> > > + /* Used for retire_lat child process. */
> > > + struct xyarray *children;
> > > };
> > >
> > > struct perf_missing_features {
> > > --
> > > 2.44.0.769.g3c40516874-goog
> > >