2023-02-02 05:05:04

by Namhyung Kim

[permalink] [raw]
Subject: [PATCH 0/4] perf lock contention: Improve aggr x filter combination (v1)

Hello,

The callstack filter can be useful to debug lock issues but it has a
limitation that it only works with caller aggregation mode (which is the
default setting). IOW it cannot filter by callstack when showing tasks
or lock addresses/names.

But sometimes users want to use the filter for other aggregation mode.
Like "show me lock addresses/names from this caller only" or "show me
tasks having these callers".

When it's using tracepoint events from the data file, the situation is
good since the tracepoints have all the necessary info. But when using
BPF it needs to extend the key of lock stat BPF map to have more than
one info like 'pid + stack_id' or 'lock_addr + stack_id'. As callstack
filter works in userspace, it should save the both info.

With this change we can now use the -S/--callstack-filter with the
-t/--threads option or -l/--lock-addr option. It's also possible to use
it with other filter options.

The following example shows the top 5 tasks that have contention
somewhere in the epoll handling.

$ sudo perf lock con -abt -S epoll -E5 -- sleep 1
contended total wait max wait avg wait pid comm

2 58.64 us 32.38 us 29.32 us 1514752 Chrome_IOThread
3 29.31 us 12.65 us 9.77 us 3773 Xorg
1 17.45 us 17.45 us 17.45 us 1514906 Chrome_ChildIOT
1 15.41 us 15.41 us 15.41 us 1515382 Chrome_ChildIOT
1 12.52 us 12.52 us 12.52 us 293878 IPC I/O Parent

You get get the code at 'perf/lock-filter-v1' branch in

git://git.kernel.org/pub/scm/linux/kernel/git/namhyung/linux-perf.git

Thanks,
Namhyung

Namhyung Kim (4):
perf lock contention: Factor out lock_contention_get_name()
perf lock contention: Use lock_stat_find{,new}
perf lock contention: Support filters for different aggregation
perf test: Add more test cases for perf lock contention

tools/perf/builtin-lock.c | 79 ++++----
tools/perf/tests/shell/lock_contention.sh | 66 ++++++-
tools/perf/util/bpf_lock_contention.c | 178 +++++++++++-------
.../perf/util/bpf_skel/lock_contention.bpf.c | 15 +-
tools/perf/util/bpf_skel/lock_data.h | 4 +-
tools/perf/util/lock-contention.h | 5 +
6 files changed, 234 insertions(+), 113 deletions(-)


base-commit: 7cfa9f5e440054db7c7e28e83a045d36993ff958
--
2.39.1.456.gfc5497dd1b-goog



2023-02-02 05:05:09

by Namhyung Kim

[permalink] [raw]
Subject: [PATCH 1/4] perf lock contention: Factor out lock_contention_get_name()

The lock_contention_get_name() returns a name for the lock stat entry
based on the current aggregation mode. As it's called sequentially in a
single thread, it can return the address of a static buffer for symbol
and offset of the caller.

Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/util/bpf_lock_contention.c | 113 ++++++++++++++------------
1 file changed, 63 insertions(+), 50 deletions(-)

diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 4902ac331f41..967ce168f163 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -163,9 +163,68 @@ int lock_contention_stop(void)
return 0;
}

+static const char *lock_contention_get_name(struct lock_contention *con,
+ struct contention_key *key,
+ u64 *stack_trace)
+{
+ int idx = 0;
+ u64 addr;
+ const char *name = "";
+ static char name_buf[KSYM_NAME_LEN];
+ struct symbol *sym;
+ struct map *kmap;
+ struct machine *machine = con->machine;
+
+ if (con->aggr_mode == LOCK_AGGR_TASK) {
+ struct contention_task_data task;
+ struct thread *t;
+ int pid = key->aggr_key;
+ int task_fd = bpf_map__fd(skel->maps.task_data);
+
+ /* do not update idle comm which contains CPU number */
+ if (pid) {
+ bpf_map_lookup_elem(task_fd, &pid, &task);
+ t = __machine__findnew_thread(machine, /*pid=*/-1, pid);
+ thread__set_comm(t, task.comm, /*timestamp=*/0);
+ name = task.comm;
+ }
+ return name;
+ }
+
+ if (con->aggr_mode == LOCK_AGGR_ADDR) {
+ sym = machine__find_kernel_symbol(machine, key->aggr_key, &kmap);
+ if (sym)
+ name = sym->name;
+ return name;
+ }
+
+ /* LOCK_AGGR_CALLER: skip lock internal functions */
+ while (machine__is_lock_function(machine, stack_trace[idx]) &&
+ idx < con->max_stack - 1)
+ idx++;
+
+ addr = stack_trace[idx];
+ sym = machine__find_kernel_symbol(machine, addr, &kmap);
+
+ if (sym) {
+ unsigned long offset;
+
+ offset = kmap->map_ip(kmap, addr) - sym->start;
+
+ if (offset == 0)
+ return sym->name;
+
+ snprintf(name_buf, sizeof(name_buf), "%s+%#lx", sym->name, offset);
+ } else {
+ snprintf(name_buf, sizeof(name_buf), "%#lx", (unsigned long)addr);
+ }
+
+ return name_buf;
+}
+
int lock_contention_read(struct lock_contention *con)
{
- int fd, stack, task_fd, err = 0;
+ int fd, stack, err = 0;
struct contention_key *prev_key, key;
struct contention_data data = {};
struct lock_stat *st = NULL;
@@ -175,7 +234,6 @@ int lock_contention_read(struct lock_contention *con)

fd = bpf_map__fd(skel->maps.lock_stat);
stack = bpf_map__fd(skel->maps.stacks);
- task_fd = bpf_map__fd(skel->maps.task_data);

con->lost = skel->bss->lost;

@@ -195,9 +253,6 @@ int lock_contention_read(struct lock_contention *con)

prev_key = NULL;
while (!bpf_map_get_next_key(fd, prev_key, &key)) {
- struct map *kmap;
- struct symbol *sym;
- int idx = 0;
s32 stack_id;

/* to handle errors in the loop body */
@@ -219,61 +274,19 @@ int lock_contention_read(struct lock_contention *con)
st->flags = data.flags;
st->addr = key.aggr_key;

- if (con->aggr_mode == LOCK_AGGR_TASK) {
- struct contention_task_data task;
- struct thread *t;
- int pid = key.aggr_key;
-
- /* do not update idle comm which contains CPU number */
- if (st->addr) {
- bpf_map_lookup_elem(task_fd, &pid, &task);
- t = __machine__findnew_thread(machine, /*pid=*/-1, pid);
- thread__set_comm(t, task.comm, /*timestamp=*/0);
- }
- goto next;
- }
-
- if (con->aggr_mode == LOCK_AGGR_ADDR) {
- sym = machine__find_kernel_symbol(machine, st->addr, &kmap);
- if (sym)
- st->name = strdup(sym->name);
- goto next;
- }
-
stack_id = key.aggr_key;
bpf_map_lookup_elem(stack, &stack_id, stack_trace);

- /* skip lock internal functions */
- while (machine__is_lock_function(machine, stack_trace[idx]) &&
- idx < con->max_stack - 1)
- idx++;
-
- st->addr = stack_trace[idx];
- sym = machine__find_kernel_symbol(machine, st->addr, &kmap);
-
- if (sym) {
- unsigned long offset;
- int ret = 0;
-
- offset = kmap->map_ip(kmap, st->addr) - sym->start;
-
- if (offset)
- ret = asprintf(&st->name, "%s+%#lx", sym->name, offset);
- else
- st->name = strdup(sym->name);
-
- if (ret < 0 || st->name == NULL)
- break;
- } else if (asprintf(&st->name, "%#lx", (unsigned long)st->addr) < 0) {
+ st->name = strdup(lock_contention_get_name(con, &key, stack_trace));
+ if (st->name == NULL)
break;
- }

if (con->save_callstack) {
st->callstack = memdup(stack_trace, stack_size);
if (st->callstack == NULL)
break;
}
-next:
+
hlist_add_head(&st->hash_entry, con->result);
prev_key = &key;

--
2.39.1.456.gfc5497dd1b-goog


2023-02-02 05:05:12

by Namhyung Kim

[permalink] [raw]
Subject: [PATCH 2/4] perf lock contention: Use lock_stat_find{,new}

This is a preparation work to support complex keys of BPF maps. Now it
has single value key according to the aggregation mode like stack_id or
pid. But we want to use a combination of those keys.

Then lock_contention_read() should still aggregate the result based on
the key that was requested by user. The other key info will be used for
filtering.

So instead of creating a lock_stat entry always, Check if it's already
there using lock_stat_find() first.

Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/builtin-lock.c | 4 +--
tools/perf/util/bpf_lock_contention.c | 41 ++++++++++++++++-----------
tools/perf/util/lock-contention.h | 3 ++
3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 216a9a252bf4..0593c6e636c6 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -465,7 +465,7 @@ static struct lock_stat *pop_from_result(void)
return container_of(node, struct lock_stat, rb);
}

-static struct lock_stat *lock_stat_find(u64 addr)
+struct lock_stat *lock_stat_find(u64 addr)
{
struct hlist_head *entry = lockhashentry(addr);
struct lock_stat *ret;
@@ -477,7 +477,7 @@ static struct lock_stat *lock_stat_find(u64 addr)
return NULL;
}

-static struct lock_stat *lock_stat_findnew(u64 addr, const char *name, int flags)
+struct lock_stat *lock_stat_findnew(u64 addr, const char *name, int flags)
{
struct hlist_head *entry = lockhashentry(addr);
struct lock_stat *ret, *new;
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index 967ce168f163..c6f2db603d5a 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -254,12 +254,34 @@ int lock_contention_read(struct lock_contention *con)
prev_key = NULL;
while (!bpf_map_get_next_key(fd, prev_key, &key)) {
s32 stack_id;
+ const char *name;

/* to handle errors in the loop body */
err = -1;

bpf_map_lookup_elem(fd, &key, &data);
- st = zalloc(sizeof(*st));
+
+ if (con->save_callstack) {
+ stack_id = key.aggr_key;
+ bpf_map_lookup_elem(stack, &stack_id, stack_trace);
+ }
+
+ st = lock_stat_find(key.aggr_key);
+ if (st != NULL) {
+ st->wait_time_total += data.total_time;
+ if (st->wait_time_max < data.max_time)
+ st->wait_time_max = data.max_time;
+ if (st->wait_time_min > data.min_time)
+ st->wait_time_min = data.min_time;
+
+ st->nr_contended += data.count;
+ if (st->nr_contended)
+ st->avg_wait_time = st->wait_time_total / st->nr_contended;
+ goto next;
+ }
+
+ name = lock_contention_get_name(con, &key, stack_trace);
+ st = lock_stat_findnew(key.aggr_key, name, data.flags);
if (st == NULL)
break;

@@ -272,14 +294,6 @@ int lock_contention_read(struct lock_contention *con)
st->avg_wait_time = data.total_time / data.count;

st->flags = data.flags;
- st->addr = key.aggr_key;
-
- stack_id = key.aggr_key;
- bpf_map_lookup_elem(stack, &stack_id, stack_trace);
-
- st->name = strdup(lock_contention_get_name(con, &key, stack_trace));
- if (st->name == NULL)
- break;

if (con->save_callstack) {
st->callstack = memdup(stack_trace, stack_size);
@@ -287,19 +301,14 @@ int lock_contention_read(struct lock_contention *con)
break;
}

- hlist_add_head(&st->hash_entry, con->result);
+next:
prev_key = &key;

- /* we're fine now, reset the values */
- st = NULL;
+ /* we're fine now, reset the error */
err = 0;
}

free(stack_trace);
- if (st) {
- free(st->name);
- free(st);
- }

return err;
}
diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
index 17e594d57a61..39d5bfc77f4e 100644
--- a/tools/perf/util/lock-contention.h
+++ b/tools/perf/util/lock-contention.h
@@ -65,6 +65,9 @@ struct lock_stat {
*/
#define MAX_LOCK_DEPTH 48

+struct lock_stat *lock_stat_find(u64 addr);
+struct lock_stat *lock_stat_findnew(u64 addr, const char *name, int flags);
+
/*
* struct lock_seq_stat:
* Place to put on state of one lock sequence
--
2.39.1.456.gfc5497dd1b-goog


2023-02-02 05:05:23

by Namhyung Kim

[permalink] [raw]
Subject: [PATCH 3/4] perf lock contention: Support filters for different aggregation

It'd be useful to filter other than the current aggregation mode. For
example, users may want to see callstacks for specific locks only. Or
they may want tasks from a certain callstack.

The tracepoints already collected the information but it needs to check
the condition again when processing the event. And it needs to change
BPF to allow the key combinations.

The lock contentions on 'rcu_state' spinlock can be monitored:

$ sudo perf lock con -abv -L rcu_state sleep 1
...
contended total wait max wait avg wait type caller

4 151.39 us 62.57 us 37.85 us spinlock rcu_core+0xcb
0xffffffff81fd1666 _raw_spin_lock_irqsave+0x46
0xffffffff8172d76b rcu_core+0xcb
0xffffffff822000eb __softirqentry_text_start+0xeb
0xffffffff816a0ba9 __irq_exit_rcu+0xc9
0xffffffff81fc0112 sysvec_apic_timer_interrupt+0xa2
0xffffffff82000e46 asm_sysvec_apic_timer_interrupt+0x16
0xffffffff81d49f78 cpuidle_enter_state+0xd8
0xffffffff81d4a259 cpuidle_enter+0x29
1 30.21 us 30.21 us 30.21 us spinlock rcu_core+0xcb
0xffffffff81fd1666 _raw_spin_lock_irqsave+0x46
0xffffffff8172d76b rcu_core+0xcb
0xffffffff822000eb __softirqentry_text_start+0xeb
0xffffffff816a0ba9 __irq_exit_rcu+0xc9
0xffffffff81fc00c4 sysvec_apic_timer_interrupt+0x54
0xffffffff82000e46 asm_sysvec_apic_timer_interrupt+0x16
1 28.84 us 28.84 us 28.84 us spinlock rcu_accelerate_cbs_unlocked+0x40
0xffffffff81fd1c60 _raw_spin_lock+0x30
0xffffffff81728cf0 rcu_accelerate_cbs_unlocked+0x40
0xffffffff8172da82 rcu_core+0x3e2
0xffffffff822000eb __softirqentry_text_start+0xeb
0xffffffff816a0ba9 __irq_exit_rcu+0xc9
0xffffffff81fc0112 sysvec_apic_timer_interrupt+0xa2
0xffffffff82000e46 asm_sysvec_apic_timer_interrupt+0x16
0xffffffff81d49f78 cpuidle_enter_state+0xd8
...

To see tasks calling 'rcu_core' function:

$ sudo perf lock con -abt -S rcu_core sleep 1
contended total wait max wait avg wait pid comm

19 23.46 us 2.21 us 1.23 us 0 swapper
2 18.37 us 17.01 us 9.19 us 2061859 ThreadPoolForeg
3 5.76 us 1.97 us 1.92 us 3909 pipewire-pulse
1 2.26 us 2.26 us 2.26 us 1809271 MediaSu~isor #2
1 1.97 us 1.97 us 1.97 us 1514882 Chrome_ChildIOT
1 987 ns 987 ns 987 ns 3740 pipewire-pulse

Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/builtin-lock.c | 75 +++++++++++--------
tools/perf/util/bpf_lock_contention.c | 44 +++++++----
.../perf/util/bpf_skel/lock_contention.bpf.c | 15 ++--
tools/perf/util/bpf_skel/lock_data.h | 4 +-
tools/perf/util/lock-contention.h | 2 +
5 files changed, 89 insertions(+), 51 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 0593c6e636c6..0d11f301fd72 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -509,6 +509,34 @@ struct lock_stat *lock_stat_findnew(u64 addr, const char *name, int flags)
return NULL;
}

+bool match_callstack_filter(struct machine *machine, u64 *callstack)
+{
+ struct map *kmap;
+ struct symbol *sym;
+ u64 ip;
+
+ if (list_empty(&callstack_filters))
+ return true;
+
+ for (int i = 0; i < max_stack_depth; i++) {
+ struct callstack_filter *filter;
+
+ if (!callstack || !callstack[i])
+ break;
+
+ ip = callstack[i];
+ sym = machine__find_kernel_symbol(machine, ip, &kmap);
+ if (sym == NULL)
+ continue;
+
+ list_for_each_entry(filter, &callstack_filters, list) {
+ if (strstr(sym->name, filter->name))
+ return true;
+ }
+ }
+ return false;
+}
+
struct trace_lock_handler {
/* it's used on CONFIG_LOCKDEP */
int (*acquire_event)(struct evsel *evsel,
@@ -1070,12 +1098,6 @@ static int report_lock_contention_begin_event(struct evsel *evsel,
ls = lock_stat_findnew(key, name, flags);
if (!ls)
return -ENOMEM;
-
- if (aggr_mode == LOCK_AGGR_CALLER && needs_callstack()) {
- ls->callstack = get_callstack(sample, max_stack_depth);
- if (ls->callstack == NULL)
- return -ENOMEM;
- }
}

if (filters.nr_types) {
@@ -1106,6 +1128,22 @@ static int report_lock_contention_begin_event(struct evsel *evsel,
return 0;
}

+ if (needs_callstack()) {
+ u64 *callstack = get_callstack(sample, max_stack_depth);
+ if (callstack == NULL)
+ return -ENOMEM;
+
+ if (!match_callstack_filter(machine, callstack)) {
+ free(callstack);
+ return 0;
+ }
+
+ if (ls->callstack == NULL)
+ ls->callstack = callstack;
+ else
+ free(callstack);
+ }
+
ts = thread_stat_findnew(sample->tid);
if (!ts)
return -ENOMEM;
@@ -1606,31 +1644,6 @@ static void print_contention_result(struct lock_contention *con)
if (!st->wait_time_total)
continue;

- if (aggr_mode == LOCK_AGGR_CALLER && !list_empty(&callstack_filters)) {
- struct map *kmap;
- struct symbol *sym;
- u64 ip;
-
- for (int i = 0; i < max_stack_depth; i++) {
- struct callstack_filter *filter;
-
- if (!st->callstack || !st->callstack[i])
- break;
-
- ip = st->callstack[i];
- sym = machine__find_kernel_symbol(con->machine, ip, &kmap);
- if (sym == NULL)
- continue;
-
- list_for_each_entry(filter, &callstack_filters, list) {
- if (strstr(sym->name, filter->name))
- goto found;
- }
- }
- continue;
- }
-
-found:
list_for_each_entry(key, &lock_keys, list) {
key->print(key, st);
pr_info(" ");
diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
index c6f2db603d5a..3b4308b7b6bf 100644
--- a/tools/perf/util/bpf_lock_contention.c
+++ b/tools/perf/util/bpf_lock_contention.c
@@ -34,13 +34,15 @@ int lock_contention_prepare(struct lock_contention *con)
bpf_map__set_max_entries(skel->maps.lock_stat, con->map_nr_entries);
bpf_map__set_max_entries(skel->maps.tstamp, con->map_nr_entries);

- if (con->aggr_mode == LOCK_AGGR_TASK) {
+ if (con->aggr_mode == LOCK_AGGR_TASK)
bpf_map__set_max_entries(skel->maps.task_data, con->map_nr_entries);
- bpf_map__set_max_entries(skel->maps.stacks, 1);
- } else {
+ else
bpf_map__set_max_entries(skel->maps.task_data, 1);
+
+ if (con->save_callstack)
bpf_map__set_max_entries(skel->maps.stacks, con->map_nr_entries);
- }
+ else
+ bpf_map__set_max_entries(skel->maps.stacks, 1);

if (target__has_cpu(target))
ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
@@ -146,6 +148,7 @@ int lock_contention_prepare(struct lock_contention *con)
/* these don't work well if in the rodata section */
skel->bss->stack_skip = con->stack_skip;
skel->bss->aggr_mode = con->aggr_mode;
+ skel->bss->needs_callstack = con->save_callstack;

lock_contention_bpf__attach(skel);
return 0;
@@ -178,7 +181,7 @@ static const char *lock_contention_get_name(struct lock_contention *con,
if (con->aggr_mode == LOCK_AGGR_TASK) {
struct contention_task_data task;
struct thread *t;
- int pid = key->aggr_key;
+ int pid = key->pid;
int task_fd = bpf_map__fd(skel->maps.task_data);

/* do not update idle comm which contains CPU number */
@@ -192,7 +195,7 @@ static const char *lock_contention_get_name(struct lock_contention *con,
}

if (con->aggr_mode == LOCK_AGGR_ADDR) {
- sym = machine__find_kernel_symbol(machine, key->aggr_key, &kmap);
+ sym = machine__find_kernel_symbol(machine, key->lock_addr, &kmap);
if (sym)
name = sym->name;
return name;
@@ -253,20 +256,35 @@ int lock_contention_read(struct lock_contention *con)

prev_key = NULL;
while (!bpf_map_get_next_key(fd, prev_key, &key)) {
- s32 stack_id;
+ s64 ls_key;
const char *name;

/* to handle errors in the loop body */
err = -1;

bpf_map_lookup_elem(fd, &key, &data);
-
if (con->save_callstack) {
- stack_id = key.aggr_key;
- bpf_map_lookup_elem(stack, &stack_id, stack_trace);
+ bpf_map_lookup_elem(stack, &key.stack_id, stack_trace);
+
+ if (!match_callstack_filter(machine, stack_trace))
+ goto next;
}

- st = lock_stat_find(key.aggr_key);
+ switch (con->aggr_mode) {
+ case LOCK_AGGR_CALLER:
+ ls_key = key.stack_id;
+ break;
+ case LOCK_AGGR_TASK:
+ ls_key = key.pid;
+ break;
+ case LOCK_AGGR_ADDR:
+ ls_key = key.lock_addr;
+ break;
+ default:
+ goto next;
+ }
+
+ st = lock_stat_find(ls_key);
if (st != NULL) {
st->wait_time_total += data.total_time;
if (st->wait_time_max < data.max_time)
@@ -281,7 +299,7 @@ int lock_contention_read(struct lock_contention *con)
}

name = lock_contention_get_name(con, &key, stack_trace);
- st = lock_stat_findnew(key.aggr_key, name, data.flags);
+ st = lock_stat_findnew(ls_key, name, data.flags);
if (st == NULL)
break;

@@ -293,8 +311,6 @@ int lock_contention_read(struct lock_contention *con)
if (data.count)
st->avg_wait_time = data.total_time / data.count;

- st->flags = data.flags;
-
if (con->save_callstack) {
st->callstack = memdup(stack_trace, stack_size);
if (st->callstack == NULL)
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index ad0ca5d50557..7ce276ed987e 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -82,6 +82,7 @@ int has_cpu;
int has_task;
int has_type;
int has_addr;
+int needs_callstack;
int stack_skip;

/* determine the key of lock stat */
@@ -173,7 +174,7 @@ int contention_begin(u64 *ctx)
pelem->lock = (__u64)ctx[0];
pelem->flags = (__u32)ctx[1];

- if (aggr_mode == LOCK_AGGR_CALLER) {
+ if (needs_callstack) {
pelem->stack_id = bpf_get_stackid(ctx, &stacks,
BPF_F_FAST_STACK_CMP | stack_skip);
if (pelem->stack_id < 0)
@@ -188,7 +189,7 @@ int contention_end(u64 *ctx)
{
__u32 pid;
struct tstamp_data *pelem;
- struct contention_key key;
+ struct contention_key key = {};
struct contention_data *data;
__u64 duration;

@@ -204,14 +205,18 @@ int contention_end(u64 *ctx)

switch (aggr_mode) {
case LOCK_AGGR_CALLER:
- key.aggr_key = pelem->stack_id;
+ key.stack_id = pelem->stack_id;
break;
case LOCK_AGGR_TASK:
- key.aggr_key = pid;
+ key.pid = pid;
update_task_data(pid);
+ if (needs_callstack)
+ key.stack_id = pelem->stack_id;
break;
case LOCK_AGGR_ADDR:
- key.aggr_key = pelem->lock;
+ key.lock_addr = pelem->lock;
+ if (needs_callstack)
+ key.stack_id = pelem->stack_id;
break;
default:
/* should not happen */
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index ce71cf1a7e1e..3d35fd4407ac 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -4,7 +4,9 @@
#define UTIL_BPF_SKEL_LOCK_DATA_H

struct contention_key {
- u64 aggr_key; /* can be stack_id, pid or lock addr */
+ u32 stack_id;
+ u32 pid;
+ u64 lock_addr;
};

#define TASK_COMM_LEN 16
diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
index 39d5bfc77f4e..e5fc036108ec 100644
--- a/tools/perf/util/lock-contention.h
+++ b/tools/perf/util/lock-contention.h
@@ -68,6 +68,8 @@ struct lock_stat {
struct lock_stat *lock_stat_find(u64 addr);
struct lock_stat *lock_stat_findnew(u64 addr, const char *name, int flags);

+bool match_callstack_filter(struct machine *machine, u64 *callstack);
+
/*
* struct lock_seq_stat:
* Place to put on state of one lock sequence
--
2.39.1.456.gfc5497dd1b-goog


2023-02-02 05:05:25

by Namhyung Kim

[permalink] [raw]
Subject: [PATCH 4/4] perf test: Add more test cases for perf lock contention

Check callstack filter with two different aggregation mode.

$ sudo ./perf test -v contention
88: kernel lock contention analysis test :
--- start ---
test child forked, pid 83416
Testing perf lock record and perf lock contention
Testing perf lock contention --use-bpf
Testing perf lock record and perf lock contention at the same time
Testing perf lock contention --threads
Testing perf lock contention --lock-addr
Testing perf lock contention --type-filter (w/ spinlock)
Testing perf lock contention --lock-filter (w/ tasklist_lock)
Testing perf lock contention --callstack-filter (w/ unix_stream)
Testing perf lock contention --callstack-filter with task aggregation
test child finished with 0
---- end ----
kernel lock contention analysis test: Ok

Signed-off-by: Namhyung Kim <[email protected]>
---
tools/perf/tests/shell/lock_contention.sh | 66 +++++++++++++++++++++--
1 file changed, 62 insertions(+), 4 deletions(-)

diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh
index b05f1b1ca6c8..be5fcafb26aa 100755
--- a/tools/perf/tests/shell/lock_contention.sh
+++ b/tools/perf/tests/shell/lock_contention.sh
@@ -128,7 +128,7 @@ test_type_filter()
echo "Testing perf lock contention --type-filter (w/ spinlock)"
perf lock contention -i ${perfdata} -Y spinlock -q 2> ${result}
if [ $(grep -c -v spinlock "${result}") != "0" ]; then
- echo "[Fail] Recorded should not have non-spinlocks:" $(cat "${result}")
+ echo "[Fail] Recorded result should not have non-spinlocks:" $(cat "${result}")
err=1
exit
fi
@@ -139,7 +139,7 @@ test_type_filter()

perf lock con -a -b -Y spinlock -q -- perf bench sched messaging > /dev/null 2> ${result}
if [ $(grep -c -v spinlock "${result}") != "0" ]; then
- echo "[Fail] Recorded should not have non-spinlocks:" $(cat "${result}")
+ echo "[Fail] BPF result should not have non-spinlocks:" $(cat "${result}")
err=1
exit
fi
@@ -160,7 +160,7 @@ test_lock_filter()
local type=$(head -1 "${result}" | awk '{ print $8 }' | sed -e 's/:.*//')

if [ $(grep -c -v "${type}" "${result}") != "0" ]; then
- echo "[Fail] Recorded should not have non-${type} locks:" $(cat "${result}")
+ echo "[Fail] Recorded result should not have non-${type} locks:" $(cat "${result}")
err=1
exit
fi
@@ -171,7 +171,63 @@ test_lock_filter()

perf lock con -a -b -L tasklist_lock -q -- perf bench sched messaging > /dev/null 2> ${result}
if [ $(grep -c -v "${type}" "${result}") != "0" ]; then
- echo "[Fail] Recorded should not have non-${type} locks:" $(cat "${result}")
+ echo "[Fail] BPF result should not have non-${type} locks:" $(cat "${result}")
+ err=1
+ exit
+ fi
+}
+
+test_stack_filter()
+{
+ echo "Testing perf lock contention --callstack-filter (w/ unix_stream)"
+ perf lock contention -i ${perfdata} -v -q 2> ${result}
+ if [ $(grep -c unix_stream "${result}") == "0" ]; then
+ echo "[Skip] Could not find 'unix_stream'"
+ return
+ fi
+
+ perf lock contention -i ${perfdata} -E 1 -S unix_stream -q 2> ${result}
+ if [ $(cat "${result}" | wc -l) != "1" ]; then
+ echo "[Fail] Recorded result should have a lock from unix_stream:" $(cat "${result}")
+ err=1
+ exit
+ fi
+
+ if ! perf lock con -b true > /dev/null 2>&1 ; then
+ return
+ fi
+
+ perf lock con -a -b -S unix_stream -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result}
+ if [ $(cat "${result}" | wc -l) != "1" ]; then
+ echo "[Fail] BPF result should have a lock from unix_stream:" $(cat "${result}")
+ err=1
+ exit
+ fi
+}
+
+test_aggr_task_stack_filter()
+{
+ echo "Testing perf lock contention --callstack-filter with task aggregation"
+ perf lock contention -i ${perfdata} -v -q 2> ${result}
+ if [ $(grep -c unix_stream "${result}") == "0" ]; then
+ echo "[Skip] Could not find 'unix_stream'"
+ return
+ fi
+
+ perf lock contention -i ${perfdata} -t -E 1 -S unix_stream -q 2> ${result}
+ if [ $(cat "${result}" | wc -l) != "1" ]; then
+ echo "[Fail] Recorded result should have a task from unix_stream:" $(cat "${result}")
+ err=1
+ exit
+ fi
+
+ if ! perf lock con -b true > /dev/null 2>&1 ; then
+ return
+ fi
+
+ perf lock con -a -b -t -S unix_stream -E 1 -q -- perf bench sched messaging > /dev/null 2> ${result}
+ if [ $(cat "${result}" | wc -l) != "1" ]; then
+ echo "[Fail] BPF result should have a task from unix_stream:" $(cat "${result}")
err=1
exit
fi
@@ -186,5 +242,7 @@ test_aggr_task
test_aggr_addr
test_type_filter
test_lock_filter
+test_stack_filter
+test_aggr_task_stack_filter

exit ${err}
--
2.39.1.456.gfc5497dd1b-goog


2023-02-02 12:53:29

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 1/4] perf lock contention: Factor out lock_contention_get_name()

Em Wed, Feb 01, 2023 at 09:04:52PM -0800, Namhyung Kim escreveu:
> The lock_contention_get_name() returns a name for the lock stat entry
> based on the current aggregation mode. As it's called sequentially in a
> single thread, it can return the address of a static buffer for symbol
> and offset of the caller.
>
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> tools/perf/util/bpf_lock_contention.c | 113 ++++++++++++++------------
> 1 file changed, 63 insertions(+), 50 deletions(-)
>
> diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> index 4902ac331f41..967ce168f163 100644
> --- a/tools/perf/util/bpf_lock_contention.c
> +++ b/tools/perf/util/bpf_lock_contention.c
> @@ -163,9 +163,68 @@ int lock_contention_stop(void)
> return 0;
> }
>
> +static const char *lock_contention_get_name(struct lock_contention *con,
> + struct contention_key *key,
> + u64 *stack_trace)
> +{
> + int idx = 0;
> + u64 addr;
> + const char *name = "";
> + static char name_buf[KSYM_NAME_LEN];
> + struct symbol *sym;
> + struct map *kmap;
> + struct machine *machine = con->machine;
> +
> + if (con->aggr_mode == LOCK_AGGR_TASK) {
> + struct contention_task_data task;
> + struct thread *t;
> + int pid = key->aggr_key;
> + int task_fd = bpf_map__fd(skel->maps.task_data);
> +

I'm processing this as-is, but please consider to reduce the number of
lines by declaring variables where they are needed, for instance, the
't' variable is only used inside the next if block, so it could have
been:

> + /* do not update idle comm which contains CPU number */
> + if (pid) {
> + bpf_map_lookup_elem(task_fd, &pid, &task);
> + t = __machine__findnew_thread(machine, /*pid=*/-1, pid);

+ struct thread *t = __machine__findnew_thread(machine, /*pid=*/-1, pid);

But since __machine__findnew_thread() can fail, please check for that
and send a new patch series... Humm, you're just factoring out, the
problem was there before, so, to make progress, I'll process it as is
and later we can add fixes, can you please look into that?

> + thread__set_comm(t, task.comm, /*timestamp=*/0);

thread__set_comm() can fail as well.

> + name = task.comm;
> + }
> + return name;
> + }
> +
> + if (con->aggr_mode == LOCK_AGGR_ADDR) {
> + sym = machine__find_kernel_symbol(machine, key->aggr_key, &kmap);
> + if (sym)
> + name = sym->name;

Here you check :-)

> + return name;
> + }
> +
> + /* LOCK_AGGR_CALLER: skip lock internal functions */
> + while (machine__is_lock_function(machine, stack_trace[idx]) &&
> + idx < con->max_stack - 1)
> + idx++;
> +
> + addr = stack_trace[idx];
> + sym = machine__find_kernel_symbol(machine, addr, &kmap);
> +
> + if (sym) {
> + unsigned long offset;
> +
> + offset = kmap->map_ip(kmap, addr) - sym->start;
> +
> + if (offset == 0)
> + return sym->name;
> +
> + snprintf(name_buf, sizeof(name_buf), "%s+%#lx", sym->name, offset);
> + } else {
> + snprintf(name_buf, sizeof(name_buf), "%#lx", (unsigned long)addr);
> + }
> +
> + return name_buf;
> +}
> +
> int lock_contention_read(struct lock_contention *con)
> {
> - int fd, stack, task_fd, err = 0;
> + int fd, stack, err = 0;
> struct contention_key *prev_key, key;
> struct contention_data data = {};
> struct lock_stat *st = NULL;
> @@ -175,7 +234,6 @@ int lock_contention_read(struct lock_contention *con)
>
> fd = bpf_map__fd(skel->maps.lock_stat);
> stack = bpf_map__fd(skel->maps.stacks);
> - task_fd = bpf_map__fd(skel->maps.task_data);
>
> con->lost = skel->bss->lost;
>
> @@ -195,9 +253,6 @@ int lock_contention_read(struct lock_contention *con)
>
> prev_key = NULL;
> while (!bpf_map_get_next_key(fd, prev_key, &key)) {
> - struct map *kmap;
> - struct symbol *sym;
> - int idx = 0;
> s32 stack_id;
>
> /* to handle errors in the loop body */
> @@ -219,61 +274,19 @@ int lock_contention_read(struct lock_contention *con)
> st->flags = data.flags;
> st->addr = key.aggr_key;
>
> - if (con->aggr_mode == LOCK_AGGR_TASK) {
> - struct contention_task_data task;
> - struct thread *t;
> - int pid = key.aggr_key;
> -
> - /* do not update idle comm which contains CPU number */
> - if (st->addr) {
> - bpf_map_lookup_elem(task_fd, &pid, &task);
> - t = __machine__findnew_thread(machine, /*pid=*/-1, pid);
> - thread__set_comm(t, task.comm, /*timestamp=*/0);
> - }
> - goto next;
> - }
> -
> - if (con->aggr_mode == LOCK_AGGR_ADDR) {
> - sym = machine__find_kernel_symbol(machine, st->addr, &kmap);
> - if (sym)
> - st->name = strdup(sym->name);
> - goto next;
> - }
> -
> stack_id = key.aggr_key;
> bpf_map_lookup_elem(stack, &stack_id, stack_trace);
>
> - /* skip lock internal functions */
> - while (machine__is_lock_function(machine, stack_trace[idx]) &&
> - idx < con->max_stack - 1)
> - idx++;
> -
> - st->addr = stack_trace[idx];
> - sym = machine__find_kernel_symbol(machine, st->addr, &kmap);
> -
> - if (sym) {
> - unsigned long offset;
> - int ret = 0;
> -
> - offset = kmap->map_ip(kmap, st->addr) - sym->start;
> -
> - if (offset)
> - ret = asprintf(&st->name, "%s+%#lx", sym->name, offset);
> - else
> - st->name = strdup(sym->name);
> -
> - if (ret < 0 || st->name == NULL)
> - break;
> - } else if (asprintf(&st->name, "%#lx", (unsigned long)st->addr) < 0) {
> + st->name = strdup(lock_contention_get_name(con, &key, stack_trace));
> + if (st->name == NULL)
> break;
> - }
>
> if (con->save_callstack) {
> st->callstack = memdup(stack_trace, stack_size);
> if (st->callstack == NULL)
> break;
> }
> -next:
> +
> hlist_add_head(&st->hash_entry, con->result);
> prev_key = &key;
>
> --
> 2.39.1.456.gfc5497dd1b-goog
>

--

- Arnaldo

2023-02-02 12:55:11

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 0/4] perf lock contention: Improve aggr x filter combination (v1)

Em Wed, Feb 01, 2023 at 09:04:51PM -0800, Namhyung Kim escreveu:
> Hello,
>
> The callstack filter can be useful to debug lock issues but it has a
> limitation that it only works with caller aggregation mode (which is the
> default setting). IOW it cannot filter by callstack when showing tasks
> or lock addresses/names.

Thanks, applied, the only nit was for a preexisting problem I only
caught now in a refactoring, please consider fixing those.

- Arnaldo

> But sometimes users want to use the filter for other aggregation mode.
> Like "show me lock addresses/names from this caller only" or "show me
> tasks having these callers".
>
> When it's using tracepoint events from the data file, the situation is
> good since the tracepoints have all the necessary info. But when using
> BPF it needs to extend the key of lock stat BPF map to have more than
> one info like 'pid + stack_id' or 'lock_addr + stack_id'. As callstack
> filter works in userspace, it should save the both info.
>
> With this change we can now use the -S/--callstack-filter with the
> -t/--threads option or -l/--lock-addr option. It's also possible to use
> it with other filter options.
>
> The following example shows the top 5 tasks that have contention
> somewhere in the epoll handling.
>
> $ sudo perf lock con -abt -S epoll -E5 -- sleep 1
> contended total wait max wait avg wait pid comm
>
> 2 58.64 us 32.38 us 29.32 us 1514752 Chrome_IOThread
> 3 29.31 us 12.65 us 9.77 us 3773 Xorg
> 1 17.45 us 17.45 us 17.45 us 1514906 Chrome_ChildIOT
> 1 15.41 us 15.41 us 15.41 us 1515382 Chrome_ChildIOT
> 1 12.52 us 12.52 us 12.52 us 293878 IPC I/O Parent
>
> You get get the code at 'perf/lock-filter-v1' branch in
>
> git://git.kernel.org/pub/scm/linux/kernel/git/namhyung/linux-perf.git
>
> Thanks,
> Namhyung
>
> Namhyung Kim (4):
> perf lock contention: Factor out lock_contention_get_name()
> perf lock contention: Use lock_stat_find{,new}
> perf lock contention: Support filters for different aggregation
> perf test: Add more test cases for perf lock contention
>
> tools/perf/builtin-lock.c | 79 ++++----
> tools/perf/tests/shell/lock_contention.sh | 66 ++++++-
> tools/perf/util/bpf_lock_contention.c | 178 +++++++++++-------
> .../perf/util/bpf_skel/lock_contention.bpf.c | 15 +-
> tools/perf/util/bpf_skel/lock_data.h | 4 +-
> tools/perf/util/lock-contention.h | 5 +
> 6 files changed, 234 insertions(+), 113 deletions(-)
>
>
> base-commit: 7cfa9f5e440054db7c7e28e83a045d36993ff958
> --
> 2.39.1.456.gfc5497dd1b-goog
>

--

- Arnaldo

2023-02-02 20:28:01

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 2/4] perf lock contention: Use lock_stat_find{,new}

Em Wed, Feb 01, 2023 at 09:04:53PM -0800, Namhyung Kim escreveu:
> This is a preparation work to support complex keys of BPF maps. Now it
> has single value key according to the aggregation mode like stack_id or
> pid. But we want to use a combination of those keys.
>
> Then lock_contention_read() should still aggregate the result based on
> the key that was requested by user. The other key info will be used for
> filtering.
>
> So instead of creating a lock_stat entry always, Check if it's already
> there using lock_stat_find() first.

Hey, try building without libtraceevent-devel installed, should be
equivalent to NO_LIBTRACEEVENT=1.

At this point I think you should move bpf_lock_contention.o to inside
that CONFIG_LIBTRACEEVENT if block.

perf-$(CONFIG_PERF_BPF_SKEL) += bpf_lock_contention.o

ifeq ($(CONFIG_LIBTRACEEVENT),y)
perf-$(CONFIG_PERF_BPF_SKEL) += bpf_kwork.o
endif

I'm removing this series from tmp.perf/core for now.

- Arnaldo

> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> tools/perf/builtin-lock.c | 4 +--
> tools/perf/util/bpf_lock_contention.c | 41 ++++++++++++++++-----------
> tools/perf/util/lock-contention.h | 3 ++
> 3 files changed, 30 insertions(+), 18 deletions(-)
>
> diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
> index 216a9a252bf4..0593c6e636c6 100644
> --- a/tools/perf/builtin-lock.c
> +++ b/tools/perf/builtin-lock.c
> @@ -465,7 +465,7 @@ static struct lock_stat *pop_from_result(void)
> return container_of(node, struct lock_stat, rb);
> }
>
> -static struct lock_stat *lock_stat_find(u64 addr)
> +struct lock_stat *lock_stat_find(u64 addr)
> {
> struct hlist_head *entry = lockhashentry(addr);
> struct lock_stat *ret;
> @@ -477,7 +477,7 @@ static struct lock_stat *lock_stat_find(u64 addr)
> return NULL;
> }
>
> -static struct lock_stat *lock_stat_findnew(u64 addr, const char *name, int flags)
> +struct lock_stat *lock_stat_findnew(u64 addr, const char *name, int flags)
> {
> struct hlist_head *entry = lockhashentry(addr);
> struct lock_stat *ret, *new;
> diff --git a/tools/perf/util/bpf_lock_contention.c b/tools/perf/util/bpf_lock_contention.c
> index 967ce168f163..c6f2db603d5a 100644
> --- a/tools/perf/util/bpf_lock_contention.c
> +++ b/tools/perf/util/bpf_lock_contention.c
> @@ -254,12 +254,34 @@ int lock_contention_read(struct lock_contention *con)
> prev_key = NULL;
> while (!bpf_map_get_next_key(fd, prev_key, &key)) {
> s32 stack_id;
> + const char *name;
>
> /* to handle errors in the loop body */
> err = -1;
>
> bpf_map_lookup_elem(fd, &key, &data);
> - st = zalloc(sizeof(*st));
> +
> + if (con->save_callstack) {
> + stack_id = key.aggr_key;
> + bpf_map_lookup_elem(stack, &stack_id, stack_trace);
> + }
> +
> + st = lock_stat_find(key.aggr_key);
> + if (st != NULL) {
> + st->wait_time_total += data.total_time;
> + if (st->wait_time_max < data.max_time)
> + st->wait_time_max = data.max_time;
> + if (st->wait_time_min > data.min_time)
> + st->wait_time_min = data.min_time;
> +
> + st->nr_contended += data.count;
> + if (st->nr_contended)
> + st->avg_wait_time = st->wait_time_total / st->nr_contended;
> + goto next;
> + }
> +
> + name = lock_contention_get_name(con, &key, stack_trace);
> + st = lock_stat_findnew(key.aggr_key, name, data.flags);
> if (st == NULL)
> break;
>
> @@ -272,14 +294,6 @@ int lock_contention_read(struct lock_contention *con)
> st->avg_wait_time = data.total_time / data.count;
>
> st->flags = data.flags;
> - st->addr = key.aggr_key;
> -
> - stack_id = key.aggr_key;
> - bpf_map_lookup_elem(stack, &stack_id, stack_trace);
> -
> - st->name = strdup(lock_contention_get_name(con, &key, stack_trace));
> - if (st->name == NULL)
> - break;
>
> if (con->save_callstack) {
> st->callstack = memdup(stack_trace, stack_size);
> @@ -287,19 +301,14 @@ int lock_contention_read(struct lock_contention *con)
> break;
> }
>
> - hlist_add_head(&st->hash_entry, con->result);
> +next:
> prev_key = &key;
>
> - /* we're fine now, reset the values */
> - st = NULL;
> + /* we're fine now, reset the error */
> err = 0;
> }
>
> free(stack_trace);
> - if (st) {
> - free(st->name);
> - free(st);
> - }
>
> return err;
> }
> diff --git a/tools/perf/util/lock-contention.h b/tools/perf/util/lock-contention.h
> index 17e594d57a61..39d5bfc77f4e 100644
> --- a/tools/perf/util/lock-contention.h
> +++ b/tools/perf/util/lock-contention.h
> @@ -65,6 +65,9 @@ struct lock_stat {
> */
> #define MAX_LOCK_DEPTH 48
>
> +struct lock_stat *lock_stat_find(u64 addr);
> +struct lock_stat *lock_stat_findnew(u64 addr, const char *name, int flags);
> +
> /*
> * struct lock_seq_stat:
> * Place to put on state of one lock sequence
> --
> 2.39.1.456.gfc5497dd1b-goog
>

--

- Arnaldo

2023-02-02 23:51:26

by Namhyung Kim

[permalink] [raw]
Subject: Re: [PATCH 2/4] perf lock contention: Use lock_stat_find{,new}

Hi Arnaldo,

On Thu, Feb 2, 2023 at 12:27 PM Arnaldo Carvalho de Melo
<[email protected]> wrote:
>
> Em Wed, Feb 01, 2023 at 09:04:53PM -0800, Namhyung Kim escreveu:
> > This is a preparation work to support complex keys of BPF maps. Now it
> > has single value key according to the aggregation mode like stack_id or
> > pid. But we want to use a combination of those keys.
> >
> > Then lock_contention_read() should still aggregate the result based on
> > the key that was requested by user. The other key info will be used for
> > filtering.
> >
> > So instead of creating a lock_stat entry always, Check if it's already
> > there using lock_stat_find() first.
>
> Hey, try building without libtraceevent-devel installed, should be
> equivalent to NO_LIBTRACEEVENT=1.
>
> At this point I think you should move bpf_lock_contention.o to inside
> that CONFIG_LIBTRACEEVENT if block.
>
> perf-$(CONFIG_PERF_BPF_SKEL) += bpf_lock_contention.o
>
> ifeq ($(CONFIG_LIBTRACEEVENT),y)
> perf-$(CONFIG_PERF_BPF_SKEL) += bpf_kwork.o
> endif
>
> I'm removing this series from tmp.perf/core for now.

Thanks for the suggestion. I've tested it builds with the change.
Will send v2.

Thanks,
Namhyung