2023-04-03 05:41:55

by Mike Galbraith

[permalink] [raw]
Subject: [hacklet] perf sched: Add report latency max/sum fields and body filter options

Dunno if anyone else will find this useful, but who knows. After a
recent bout of repetitive perf sched data frobing, I felt motivated to
can and condense a few keystrokes. It's dressed up a bit like a patch,
but that's to inspire future leaky self to recall its excuse for
existing, $subject is correctly speeled :)

Anyone trying it should dig up and apply the below first, as perf sched
is kinda busted without it.
[PATCH v4] perf sched: Fix sched latency analysis incorrect

perf sched: Add report latency max/sum fields and body filter options

When using perf sched lat, I almost always have to do some gymnastics to
concentrate the output, so I dug up a 2017 hacklet to tally wait time,
and added some basic conveniences to it. 'G' 'S' and 'T' are new, as
are the 'sum' sort key and associated output field.

The footer is always full report regardless of body trimming, and grew
max latency and latency tally fields.

Usage: perf sched latency [<options>]

-C, --CPU <n> CPU to profile on
-G, --greater <n> only output entries with a ms field >= N ms
-p, --pids latency stats per pid instead of per comm
-s, --sort <key[,key2...]>
sort by key(s): runtime, switch, avg, max or sum
-S, --summary <n> summarize the top N lines of sorted entries
-T, --truncate truncate the max delay start/end fields

That '-G' is meant as a simple noise filter. Say during a PornHub vs
Kbuild recording, you capture a bunch of tasks that weren't involved
enough in either competing activity to generate a 4 digit time tally,
with -G 1000, signal to noise improves.

Signed-off-by: Mike Galbraith <[email protected]>
---
tools/perf/builtin-sched.c | 94 ++++++++++++++++++++++++++++++++++++---------
1 file changed, 77 insertions(+), 17 deletions(-)

--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -188,6 +188,9 @@ struct perf_sched {
struct mutex start_work_mutex;
struct mutex work_done_wait_mutex;
int profile_cpu;
+ int lat_minimum;
+ int lat_summary;
+ bool lat_truncate;
/*
* Track the current task - that way we can know whether there's any
* weird events, such as a task being switched away that is not current.
@@ -224,6 +227,8 @@ struct perf_sched {
u64 run_avg;
u64 all_runtime;
u64 all_count;
+ u64 all_lat;
+ u64 max_lat;
u64 cpu_last_switched[MAX_CPUS];
struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root;
struct list_head sort_list, cmp_pid;
@@ -1341,11 +1346,12 @@ static int latency_migrate_task_event(st
return err;
}

-static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
+static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list,
+ bool summarize)
{
int i;
int ret;
- u64 avg;
+ u64 avg, min = sched->lat_minimum * NSEC_PER_MSEC;
char max_lat_start[32], max_lat_end[32];

if (!work_list->nb_atoms)
@@ -1358,6 +1364,18 @@ static void output_lat_thread(struct per

sched->all_runtime += work_list->total_runtime;
sched->all_count += work_list->nb_atoms;
+ sched->all_lat += work_list->total_lat;
+ if (sched->max_lat < work_list->max_lat)
+ sched->max_lat = work_list->max_lat;
+
+ /* Toss lines that don't meet the user's significance criteria. */
+ if (min > work_list->total_runtime && min > work_list->total_lat &&
+ min > work_list->max_lat)
+ return;
+
+ /* We've reached the user's TLDNR threshold, so we're done. */
+ if (summarize && sched->lat_summary-- <= 0)
+ return;

if (work_list->num_merged > 1)
ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
@@ -1371,11 +1389,14 @@ static void output_lat_thread(struct per
timestamp__scnprintf_usec(work_list->max_lat_start, max_lat_start, sizeof(max_lat_start));
timestamp__scnprintf_usec(work_list->max_lat_end, max_lat_end, sizeof(max_lat_end));

- printf("|%11.3f ms |%9" PRIu64 " | avg:%8.3f ms | max:%8.3f ms | max start: %12s s | max end: %12s s\n",
- (double)work_list->total_runtime / NSEC_PER_MSEC,
- work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
- (double)work_list->max_lat / NSEC_PER_MSEC,
- max_lat_start, max_lat_end);
+ printf("|%11.3f ms |%9" PRIu64 " | avg:%8.3f ms | max:%8.3f ms | sum:%9.3f ms |",
+ (double)work_list->total_runtime / NSEC_PER_MSEC,
+ work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
+ (double)work_list->max_lat / NSEC_PER_MSEC,
+ (double)work_list->total_lat / NSEC_PER_MSEC);
+ if (!sched->lat_truncate)
+ printf(" max start: %12s s | max end: %12s s |", max_lat_start, max_lat_end);
+ printf("\n");
}

static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
@@ -1420,6 +1441,16 @@ static int max_cmp(struct work_atoms *l,
return 0;
}

+static int sum_cmp(struct work_atoms *l, struct work_atoms *r)
+{
+ if (l->total_lat < r->total_lat)
+ return -1;
+ if (l->total_lat > r->total_lat)
+ return 1;
+
+ return 0;
+}
+
static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
if (l->nb_atoms < r->nb_atoms)
@@ -1451,6 +1482,10 @@ static int sort_dimension__add(const cha
.name = "max",
.cmp = max_cmp,
};
+ static struct sort_dimension sum_sort_dimension = {
+ .name = "sum",
+ .cmp = sum_cmp,
+ };
static struct sort_dimension pid_sort_dimension = {
.name = "pid",
.cmp = pid_cmp,
@@ -1467,6 +1502,7 @@ static int sort_dimension__add(const cha
&pid_sort_dimension,
&avg_sort_dimension,
&max_sort_dimension,
+ &sum_sort_dimension,
&switch_sort_dimension,
&runtime_sort_dimension,
};
@@ -3199,6 +3235,9 @@ static void perf_sched__merge_lat(struct
static int perf_sched__lat(struct perf_sched *sched)
{
struct rb_node *next;
+ bool summarize = sched->lat_summary > 0;
+ char buf[2][255];
+ int truncate;

setup_pager();

@@ -3208,26 +3247,38 @@ static int perf_sched__lat(struct perf_s
perf_sched__merge_lat(sched);
perf_sched__sort_lat(sched);

- printf("\n -------------------------------------------------------------------------------------------------------------------------------------------\n");
- printf(" Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Max delay start | Max delay end |\n");
- printf(" -------------------------------------------------------------------------------------------------------------------------------------------\n");
+
+ /* Header. */
+ strcpy(buf[0], "----------------------------------------------------------------------------------------------------------");
+ strcpy(buf[1], " Task | Runtime ms | Switches | Avg delay ms | Max delay ms | Sum delay ms |");
+ /* The last footer line is always a short line, so may need a trim. */
+ truncate = strlen(buf[0]);
+ if (!sched->lat_truncate) {
+ strcat(buf[0], "-----------------------------------------------------");
+ strcat(buf[1], " Max delay start | Max delay end |");
+ }
+ printf("\n %s\n %s\n %s\n", buf[0], buf[1], buf[0]);

next = rb_first_cached(&sched->sorted_atom_root);

+ /* Body. */
while (next) {
struct work_atoms *work_list;

work_list = rb_entry(next, struct work_atoms, node);
- output_lat_thread(sched, work_list);
+ output_lat_thread(sched, work_list, summarize);
next = rb_next(next);
thread__zput(work_list->thread);
}

- printf(" -----------------------------------------------------------------------------------------------------------------\n");
- printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n",
- (double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
-
- printf(" ---------------------------------------------------\n");
+ /* Footer. */
+ sprintf(buf[1], " TOTAL: |%11.3f ms |%9" PRIu64 " | |%13.3f ms |%14.3f ms |",
+ (double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count,
+ (double)sched->max_lat / NSEC_PER_MSEC,
+ (double)sched->all_lat / NSEC_PER_MSEC);
+ printf(" %s\n %s\n", buf[0], buf[1]);
+ buf[0][truncate] = '\0';
+ printf(" %s\n", buf[0]);

print_bad_events(sched);
printf("\n");
@@ -3478,6 +3529,9 @@ int cmd_sched(int argc, const char **arg
.skip_merge = 0,
.show_callchain = 1,
.max_stack = 5,
+ .lat_summary = 0,
+ .lat_minimum = 0,
+ .lat_truncate = false,
};
const struct option sched_options[] = {
OPT_STRING('i', "input", &input_name, "file",
@@ -3491,11 +3545,17 @@ int cmd_sched(int argc, const char **arg
};
const struct option latency_options[] = {
OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
- "sort by key(s): runtime, switch, avg, max"),
+ "sort by key(s): runtime, switch, avg, max or sum"),
OPT_INTEGER('C', "CPU", &sched.profile_cpu,
"CPU to profile on"),
OPT_BOOLEAN('p', "pids", &sched.skip_merge,
"latency stats per pid instead of per comm"),
+ OPT_INTEGER('G', "greater", &sched.lat_minimum,
+ "only output entries with a ms field >= N ms"),
+ OPT_INTEGER('S', "summary", &sched.lat_summary,
+ "summarize the top N lines of sorted entries"),
+ OPT_BOOLEAN('T', "truncate", &sched.lat_truncate,
+ "truncate the max delay start/end fields"),
OPT_PARENT(sched_options)
};
const struct option replay_options[] = {