2020-06-26 00:14:33

by Song Liu

Subject: [PATCH v2 bpf-next 0/4] bpf: introduce bpf_get_task_stack()

This set introduces a new helper bpf_get_task_stack(). The primary use case
is to dump all /proc/*/stack to seq_file via bpf_iter__task.
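
As a rough sketch of that use case (assuming the bpf_iter_task_stack
skeleton added in patch 4/4 and libbpf's bpf_iter support; the program
below is illustrative, not part of this series), userspace can attach the
iterator and read the dumped stacks as text:

#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "bpf_iter_task_stack.skel.h"

int main(void)
{
	struct bpf_iter_task_stack *skel;
	struct bpf_link *link;
	char buf[4096];
	ssize_t len;
	int iter_fd;

	skel = bpf_iter_task_stack__open_and_load();
	if (!skel)
		return 1;

	/* attach the iter/task program and create a readable iterator fd */
	link = bpf_program__attach_iter(skel->progs.dump_task_stack, NULL);
	if (libbpf_get_error(link))
		goto out;

	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0)
		goto out_link;

	/* each read() returns the next chunk of BPF_SEQ_PRINTF output */
	while ((len = read(iter_fd, buf, sizeof(buf) - 1)) > 0) {
		buf[len] = '\0';
		fputs(buf, stdout);
	}

	close(iter_fd);
out_link:
	bpf_link__destroy(link);
out:
	bpf_iter_task_stack__destroy(skel);
	return 0;
}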

A few different approaches have been explored and compared:

1. A simple wrapper around stack_trace_save_tsk(), as v1 [1].

This approach introduces a new interface, which is different from the
existing helper bpf_get_stack(). Therefore, this is not ideal.

2. Extend get_perf_callchain() to support "task" as argument.

This approach reuses most of bpf_get_stack(). However, extending
get_perf_callchain() requires non-trivial changes to architecture-specific
code, which is error prone.

3. The current (v2) approach, which leverages most of the existing
bpf_get_stack() and uses stack_trace_save_tsk() to handle the
architecture-specific logic.

[1] https://lore.kernel.org/netdev/[email protected]/

Changes v1 => v2:
1. Reuse most of bpf_get_stack() logic. (Andrii)
2. Fix unsigned long vs. u64 mismatch for 32-bit systems. (Yonghong)
3. Add %pB support in bpf_trace_printk(). (Daniel)
4. Specify the buffer size in bytes.

Song Liu (4):
perf: export get/put_chain_entry()
bpf: introduce helper bpf_get_task_stak()
bpf: allow %pB in bpf_seq_printf() and bpf_trace_printk()
selftests/bpf: add bpf_iter test with bpf_get_task_stack()

include/linux/bpf.h | 1 +
include/linux/perf_event.h | 2 +
include/uapi/linux/bpf.h | 35 +++++++-
kernel/bpf/stackmap.c | 79 ++++++++++++++++++-
kernel/events/callchain.c | 4 +-
kernel/trace/bpf_trace.c | 14 +++-
scripts/bpf_helpers_doc.py | 2 +
tools/include/uapi/linux/bpf.h | 35 +++++++-
.../selftests/bpf/prog_tests/bpf_iter.c | 17 ++++
.../selftests/bpf/progs/bpf_iter_task_stack.c | 60 ++++++++++++++
10 files changed, 239 insertions(+), 10 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c

--
2.24.1


2020-06-26 00:14:38

by Song Liu

Subject: [PATCH v2 bpf-next 4/4] selftests/bpf: add bpf_iter test with bpf_get_task_stack()

The new test is similar to other bpf_iter tests.

Signed-off-by: Song Liu <[email protected]>
---
.../selftests/bpf/prog_tests/bpf_iter.c | 17 ++++++
.../selftests/bpf/progs/bpf_iter_task_stack.c | 60 +++++++++++++++++++
2 files changed, 77 insertions(+)
create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index 87c29dde1cf96..baa83328f810d 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -5,6 +5,7 @@
#include "bpf_iter_netlink.skel.h"
#include "bpf_iter_bpf_map.skel.h"
#include "bpf_iter_task.skel.h"
+#include "bpf_iter_task_stack.skel.h"
#include "bpf_iter_task_file.skel.h"
#include "bpf_iter_test_kern1.skel.h"
#include "bpf_iter_test_kern2.skel.h"
@@ -106,6 +107,20 @@ static void test_task(void)
bpf_iter_task__destroy(skel);
}

+static void test_task_stack(void)
+{
+ struct bpf_iter_task_stack *skel;
+
+ skel = bpf_iter_task_stack__open_and_load();
+ if (CHECK(!skel, "bpf_iter_task_stack__open_and_load",
+ "skeleton open_and_load failed\n"))
+ return;
+
+ do_dummy_read(skel->progs.dump_task_stack);
+
+ bpf_iter_task_stack__destroy(skel);
+}
+
static void test_task_file(void)
{
struct bpf_iter_task_file *skel;
@@ -392,6 +407,8 @@ void test_bpf_iter(void)
test_bpf_map();
if (test__start_subtest("task"))
test_task();
+ if (test__start_subtest("task_stack"))
+ test_task_stack();
if (test__start_subtest("task_file"))
test_task_file();
if (test__start_subtest("anon"))
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
new file mode 100644
index 0000000000000..83aca5b1a7965
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__task bpf_iter__task___not_used
+#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__task
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* bpf_get_task_stack needs a stackmap to work */
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, 16384);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u64) * 20);
+} stackmap SEC(".maps");
+
+struct bpf_iter_meta {
+ struct seq_file *seq;
+ __u64 session_id;
+ __u64 seq_num;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__task {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+} __attribute__((preserve_access_index));
+
+#define MAX_STACK_TRACE_DEPTH 64
+unsigned long entries[MAX_STACK_TRACE_DEPTH];
+
+SEC("iter/task")
+int dump_task_stack(struct bpf_iter__task *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct task_struct *task = ctx->task;
+ int i, retlen;
+
+ if (task == (void *)0)
+ return 0;
+
+ retlen = bpf_get_task_stack(task, entries,
+ MAX_STACK_TRACE_DEPTH * sizeof(unsigned long), 0);
+ if (retlen < 0)
+ return 0;
+
+ BPF_SEQ_PRINTF(seq, "pid: %8u num_entries: %8u\n", task->pid,
+ retlen / sizeof(unsigned long));
+ for (i = 0; i < MAX_STACK_TRACE_DEPTH / sizeof(unsigned long); i++) {
+ if (retlen > i * sizeof(unsigned long))
+ BPF_SEQ_PRINTF(seq, "[<0>] %pB\n", (void *)entries[i]);
+ }
+ BPF_SEQ_PRINTF(seq, "\n");
+
+ return 0;
+}
--
2.24.1

2020-06-26 00:14:48

by Song Liu

Subject: [PATCH v2 bpf-next 3/4] bpf: allow %pB in bpf_seq_printf() and bpf_trace_printk()

This makes it easy to dump a stack trace in text format.

Signed-off-by: Song Liu <[email protected]>
---
kernel/trace/bpf_trace.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 65fa62723e2f8..1cb90b0868817 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -376,7 +376,7 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,

/*
* Only limited trace_printk() conversion specifiers allowed:
- * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s
+ * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s
*/
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
@@ -420,6 +420,11 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
goto fmt_str;
}

+ if (fmt[i + 1] == 'B') {
+ i++;
+ goto fmt_next;
+ }
+
/* disallow any further format extensions */
if (fmt[i + 1] != 0 &&
!isspace(fmt[i + 1]) &&
@@ -479,7 +484,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
#define __BPF_TP_EMIT() __BPF_ARG3_TP()
#define __BPF_TP(...) \
__trace_printk(0 /* Fake ip */, \
- fmt, ##__VA_ARGS__)
+ fmt, ##__VA_ARGS__)\

#define __BPF_ARG1_TP(...) \
((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
@@ -636,7 +641,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
if (fmt[i] == 'p') {
if (fmt[i + 1] == 0 ||
fmt[i + 1] == 'K' ||
- fmt[i + 1] == 'x') {
+ fmt[i + 1] == 'x' ||
+ fmt[i + 1] == 'B') {
/* just kernel pointers */
params[fmt_cnt] = args[fmt_cnt];
fmt_cnt++;
--
2.24.1
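
As a rough sketch of the new specifier in bpf_trace_printk() (illustrative
only, not part of this series; the probed function and the selftests-style
includes/build flags that make PT_REGS_IP available are assumptions), a
tracing program can now symbolize a kernel text address when printing:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("kprobe/schedule")
int print_probed_ip(struct pt_regs *ctx)
{
	char fmt[] = "probed [<0>] %pB\n";
	unsigned long ip = PT_REGS_IP(ctx);

	/* with this patch, %pB is accepted and resolves ip to symbol+offset */
	bpf_trace_printk(fmt, sizeof(fmt), ip);
	return 0;
}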

2020-06-26 00:15:31

by Song Liu

Subject: [PATCH v2 bpf-next 1/4] perf: export get/put_chain_entry()

This would be used by the bpf stack map.

Signed-off-by: Song Liu <[email protected]>
---
include/linux/perf_event.h | 2 ++
kernel/events/callchain.c | 4 ++--
2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b4bb32082342c..00ab5efa38334 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1244,6 +1244,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
+extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
+extern void put_callchain_entry(int rctx);

extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 334d48b16c36d..50b8a1622807f 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -149,7 +149,7 @@ void put_callchain_buffers(void)
}
}

-static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+struct perf_callchain_entry *get_callchain_entry(int *rctx)
{
int cpu;
struct callchain_cpus_entries *entries;
@@ -168,7 +168,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
(*rctx * perf_callchain_entry__sizeof()));
}

-static void
+void
put_callchain_entry(int rctx)
{
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
--
2.24.1

2020-06-26 00:18:26

by Song Liu

Subject: [PATCH v2 bpf-next 2/4] bpf: introduce helper bpf_get_task_stak()

Introduce helper bpf_get_task_stack(), which dumps the stack trace of a
given task. This is different from bpf_get_stack(), which gets the stack
trace of the current task. One potential use case of bpf_get_task_stack()
is to call it from bpf_iter__task and dump all /proc/<pid>/stack to a
seq_file.

bpf_get_task_stack() uses stack_trace_save_tsk() instead of
get_perf_callchain() for the kernel stack. The benefit of this choice is
that stack_trace_save_tsk() doesn't require changes in arch/. The downside
is that stack_trace_save_tsk() dumps the stack trace into an unsigned long
array, so for 32-bit systems we need to translate it to a u64 array.

Signed-off-by: Song Liu <[email protected]>
---
include/linux/bpf.h | 1 +
include/uapi/linux/bpf.h | 35 ++++++++++++++-
kernel/bpf/stackmap.c | 79 ++++++++++++++++++++++++++++++++--
kernel/trace/bpf_trace.c | 2 +
scripts/bpf_helpers_doc.py | 2 +
tools/include/uapi/linux/bpf.h | 35 ++++++++++++++-
6 files changed, 149 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 07052d44bca1c..cee31ee56367b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1607,6 +1607,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_proto;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 19684813faaed..7638412987354 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3252,6 +3252,38 @@ union bpf_attr {
* case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
* is returned or the error code -EACCES in case the skb is not
* subject to CHECKSUM_UNNECESSARY.
+ *
+ * int bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
+ * Description
+ * Return a user or a kernel stack in bpf program provided buffer.
+ * To achieve this, the helper needs *task*, which is a valid
+ * pointer to struct task_struct. To store the stacktrace, the
+ * bpf program provides *buf* with a nonnegative *size*.
+ *
+ * The last argument, *flags*, holds the number of stack frames to
+ * skip (from 0 to 255), masked with
+ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * the following flags:
+ *
+ * **BPF_F_USER_STACK**
+ * Collect a user space stack instead of a kernel stack.
+ * **BPF_F_USER_BUILD_ID**
+ * Collect buildid+offset instead of ips for user stack,
+ * only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * **bpf_get_task_stack**\ () can collect up to
+ * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * to sufficient large buffer size. Note that
+ * this limit can be controlled with the **sysctl** program, and
+ * that it should be manually increased in order to profile long
+ * user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * ::
+ *
+ * # sysctl kernel.perf_event_max_stack=<new value>
+ * Return
+ * A non-negative value equal to or less than *size* on success,
+ * or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3389,7 +3421,8 @@ union bpf_attr {
FN(ringbuf_submit), \
FN(ringbuf_discard), \
FN(ringbuf_query), \
- FN(csum_level),
+ FN(csum_level), \
+ FN(get_task_stack),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 599488f25e404..64b7843057a23 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -348,6 +348,44 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
}

+static struct perf_callchain_entry *
+get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+{
+ struct perf_callchain_entry *entry;
+ int rctx;
+
+ entry = get_callchain_entry(&rctx);
+
+ if (rctx == -1)
+ return NULL;
+
+ if (!entry)
+ goto exit_put;
+
+ entry->nr = init_nr +
+ stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
+ sysctl_perf_event_max_stack - init_nr, 0);
+
+ /* stack_trace_save_tsk() works on unsigned long array, while
+ * perf_callchain_entry uses u64 array. For 32-bit systems, it is
+ * necessary to fix this mismatch.
+ */
+ if (__BITS_PER_LONG != 64) {
+ unsigned long *from = (unsigned long *) entry->ip;
+ u64 *to = entry->ip;
+ int i;
+
+ /* copy data from the end to avoid using extra buffer */
+ for (i = entry->nr - 1; i >= (int)init_nr; i--)
+ to[i] = (u64)(from[i]);
+ }
+
+exit_put:
+ put_callchain_entry(rctx);
+
+ return entry;
+}
+
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
@@ -448,8 +486,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
.arg3_type = ARG_ANYTHING,
};

-BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
- u64, flags)
+static int __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
+ void *buf, u32 size, u64 flags)
{
u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -471,13 +509,22 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
if (unlikely(size % elem_size))
goto clear;

+ /* cannot get valid user stack for task without user_mode regs */
+ if (task && user && !user_mode(regs))
+ goto err_fault;
+
num_elem = size / elem_size;
if (sysctl_perf_event_max_stack < num_elem)
init_nr = 0;
else
init_nr = sysctl_perf_event_max_stack - num_elem;
+
+ if (kernel && task)
+ trace = get_callchain_entry_for_task(task, init_nr);
+ else
trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
+ sysctl_perf_event_max_stack,
+ false, false);
if (unlikely(!trace))
goto err_fault;

@@ -505,6 +552,12 @@ BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
return err;
}

+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, buf, size, flags);
+}
+
const struct bpf_func_proto bpf_get_stack_proto = {
.func = bpf_get_stack,
.gpl_only = true,
@@ -515,6 +568,26 @@ const struct bpf_func_proto bpf_get_stack_proto = {
.arg4_type = ARG_ANYTHING,
};

+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ struct pt_regs *regs = task_pt_regs(task);
+
+ return __bpf_get_stack(regs, task, buf, size, flags);
+}
+
+static int bpf_get_task_stack_btf_ids[5];
+const struct bpf_func_proto bpf_get_task_stack_proto = {
+ .func = bpf_get_task_stack,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .btf_id = bpf_get_task_stack_btf_ids,
+};
+
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e729c9e587a07..65fa62723e2f8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1134,6 +1134,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
+ case BPF_FUNC_get_task_stack:
+ return &bpf_get_task_stack_proto;
default:
return NULL;
}
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 91fa668fa8602..a8783d668c5b7 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -421,6 +421,7 @@ class PrinterHelpers(Printer):
'struct sockaddr',
'struct tcphdr',
'struct seq_file',
+ 'struct task_struct',

'struct __sk_buff',
'struct sk_msg_md',
@@ -458,6 +459,7 @@ class PrinterHelpers(Printer):
'struct sockaddr',
'struct tcphdr',
'struct seq_file',
+ 'struct task_struct',
}
mapped_types = {
'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 19684813faaed..7638412987354 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3252,6 +3252,38 @@ union bpf_attr {
* case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
* is returned or the error code -EACCES in case the skb is not
* subject to CHECKSUM_UNNECESSARY.
+ *
+ * int bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
+ * Description
+ * Return a user or a kernel stack in bpf program provided buffer.
+ * To achieve this, the helper needs *task*, which is a valid
+ * pointer to struct task_struct. To store the stacktrace, the
+ * bpf program provides *buf* with a nonnegative *size*.
+ *
+ * The last argument, *flags*, holds the number of stack frames to
+ * skip (from 0 to 255), masked with
+ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * the following flags:
+ *
+ * **BPF_F_USER_STACK**
+ * Collect a user space stack instead of a kernel stack.
+ * **BPF_F_USER_BUILD_ID**
+ * Collect buildid+offset instead of ips for user stack,
+ * only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * **bpf_get_task_stack**\ () can collect up to
+ * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * to sufficient large buffer size. Note that
+ * this limit can be controlled with the **sysctl** program, and
+ * that it should be manually increased in order to profile long
+ * user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * ::
+ *
+ * # sysctl kernel.perf_event_max_stack=<new value>
+ * Return
+ * A non-negative value equal to or less than *size* on success,
+ * or a negative error in case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3389,7 +3421,8 @@ union bpf_attr {
FN(ringbuf_submit), \
FN(ringbuf_discard), \
FN(ringbuf_query), \
- FN(csum_level),
+ FN(csum_level), \
+ FN(get_task_stack),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
--
2.24.1

2020-06-26 12:09:31

by Peter Zijlstra

Subject: Re: [PATCH v2 bpf-next 1/4] perf: export get/put_chain_entry()

On Thu, Jun 25, 2020 at 05:13:29PM -0700, Song Liu wrote:
> This would be used by bpf stack mapo.

Would it make sense to sanitize the API a little before exposing it?

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 334d48b16c36..016894b0d2c2 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
return NULL;

entries = rcu_dereference(callchain_cpus_entries);
- if (!entries)
+ if (!entries) {
+ put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
return NULL;
+ }

cpu = smp_processor_id();

@@ -183,12 +185,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
int rctx;

entry = get_callchain_entry(&rctx);
- if (rctx == -1)
+ if (!entry || rctx == -1)
return NULL;

- if (!entry)
- goto exit_put;
-
ctx.entry = entry;
ctx.max_stack = max_stack;
ctx.nr = entry->nr = init_nr;

2020-06-26 17:01:09

by Yonghong Song

Subject: Re: [PATCH v2 bpf-next 2/4] bpf: introduce helper bpf_get_task_stak()



On 6/25/20 5:13 PM, Song Liu wrote:
> Introduce helper bpf_get_task_stack(), which dumps stack trace of given
> task. This is different to bpf_get_stack(), which gets stack track of
> current task. One potential use case of bpf_get_task_stack() is to call
> it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.
>
> bpf_get_task_stack() uses stack_trace_save_tsk() instead of
> get_perf_callchain() for kernel stack. The benefit of this choice is that
> stack_trace_save_tsk() doesn't require changes in arch/. The downside of
> using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the
> stack trace to unsigned long array. For 32-bit systems, we need to
> translate it to u64 array.
>
> Signed-off-by: Song Liu <[email protected]>
> ---
> include/linux/bpf.h | 1 +
> include/uapi/linux/bpf.h | 35 ++++++++++++++-
> kernel/bpf/stackmap.c | 79 ++++++++++++++++++++++++++++++++--
> kernel/trace/bpf_trace.c | 2 +
> scripts/bpf_helpers_doc.py | 2 +
> tools/include/uapi/linux/bpf.h | 35 ++++++++++++++-
> 6 files changed, 149 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 07052d44bca1c..cee31ee56367b 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1607,6 +1607,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
> extern const struct bpf_func_proto bpf_get_current_comm_proto;
> extern const struct bpf_func_proto bpf_get_stackid_proto;
> extern const struct bpf_func_proto bpf_get_stack_proto;
> +extern const struct bpf_func_proto bpf_get_task_stack_proto;
> extern const struct bpf_func_proto bpf_sock_map_update_proto;
> extern const struct bpf_func_proto bpf_sock_hash_update_proto;
> extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 19684813faaed..7638412987354 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3252,6 +3252,38 @@ union bpf_attr {
> * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
> * is returned or the error code -EACCES in case the skb is not
> * subject to CHECKSUM_UNNECESSARY.
> + *
> + * int bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)

Andrii's recent patch changed the return type to 'long' to align with
kernel u64 return type for better llvm code generation.

Please rebase and you will see the new convention.

> + * Description
> + * Return a user or a kernel stack in bpf program provided buffer.
> + * To achieve this, the helper needs *task*, which is a valid
> + * pointer to struct task_struct. To store the stacktrace, the
> + * bpf program provides *buf* with a nonnegative *size*.
> + *
> + * The last argument, *flags*, holds the number of stack frames to
> + * skip (from 0 to 255), masked with
> + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
> + * the following flags:
> + *
> + * **BPF_F_USER_STACK**
> + * Collect a user space stack instead of a kernel stack.
> + * **BPF_F_USER_BUILD_ID**
> + * Collect buildid+offset instead of ips for user stack,
> + * only valid if **BPF_F_USER_STACK** is also specified.
> + *
> + * **bpf_get_task_stack**\ () can collect up to
> + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
> + * to sufficient large buffer size. Note that
> + * this limit can be controlled with the **sysctl** program, and
> + * that it should be manually increased in order to profile long
> + * user stacks (such as stacks for Java programs). To do so, use:
> + *
> + * ::
> + *
> + * # sysctl kernel.perf_event_max_stack=<new value>
> + * Return
> + * A non-negative value equal to or less than *size* on success,
> + * or a negative error in case of failure.
> */
> #define __BPF_FUNC_MAPPER(FN) \
> FN(unspec), \
> @@ -3389,7 +3421,8 @@ union bpf_attr {
> FN(ringbuf_submit), \
> FN(ringbuf_discard), \
> FN(ringbuf_query), \
> - FN(csum_level),
> + FN(csum_level), \
> + FN(get_task_stack),
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> * function eBPF program intends to call
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 599488f25e404..64b7843057a23 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -348,6 +348,44 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
> }
> }
>
> +static struct perf_callchain_entry *
> +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
> +{
> + struct perf_callchain_entry *entry;
> + int rctx;
> +
> + entry = get_callchain_entry(&rctx);
> +
> + if (rctx == -1)
> + return NULL;

Is this needed? Shouldn't the !entry check below be enough?

> +
> + if (!entry)
> + goto exit_put;
> +
> + entry->nr = init_nr +
> + stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
> + sysctl_perf_event_max_stack - init_nr, 0);
> +
> + /* stack_trace_save_tsk() works on unsigned long array, while
> + * perf_callchain_entry uses u64 array. For 32-bit systems, it is
> + * necessary to fix this mismatch.
> + */
> + if (__BITS_PER_LONG != 64) {
> + unsigned long *from = (unsigned long *) entry->ip;
> + u64 *to = entry->ip;
> + int i;
> +
> + /* copy data from the end to avoid using extra buffer */
> + for (i = entry->nr - 1; i >= (int)init_nr; i--)
> + to[i] = (u64)(from[i]);
> + }
> +
> +exit_put:
> + put_callchain_entry(rctx);
> +
> + return entry;
> +}
> +
> BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
> u64, flags)
> {
> @@ -448,8 +486,8 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
> .arg3_type = ARG_ANYTHING,
> };
>
> -BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
> - u64, flags)
[...]

2020-06-26 17:08:39

by Yonghong Song

Subject: Re: [PATCH v2 bpf-next 3/4] bpf: allow %pB in bpf_seq_printf() and bpf_trace_printk()



On 6/25/20 5:13 PM, Song Liu wrote:
> This makes it easy to dump stack trace in text.
>
> Signed-off-by: Song Liu <[email protected]>

Ack with a small nit below.
Acked-by: Yonghong Song <[email protected]>

> ---
> kernel/trace/bpf_trace.c | 12 +++++++++---
> 1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 65fa62723e2f8..1cb90b0868817 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -376,7 +376,7 @@ static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
>
> /*
> * Only limited trace_printk() conversion specifiers allowed:
> - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s
> + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s
> */
> BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
> u64, arg2, u64, arg3)
> @@ -420,6 +420,11 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
> goto fmt_str;
> }
>
> + if (fmt[i + 1] == 'B') {
> + i++;
> + goto fmt_next;
> + }
> +
> /* disallow any further format extensions */
> if (fmt[i + 1] != 0 &&
> !isspace(fmt[i + 1]) &&
> @@ -479,7 +484,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
> #define __BPF_TP_EMIT() __BPF_ARG3_TP()
> #define __BPF_TP(...) \
> __trace_printk(0 /* Fake ip */, \
> - fmt, ##__VA_ARGS__)
> + fmt, ##__VA_ARGS__)\

Accidental change?

>
> #define __BPF_ARG1_TP(...) \
> ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
> @@ -636,7 +641,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
> if (fmt[i] == 'p') {
> if (fmt[i + 1] == 0 ||
> fmt[i + 1] == 'K' ||
> - fmt[i + 1] == 'x') {
> + fmt[i + 1] == 'x' ||
> + fmt[i + 1] == 'B') {
> /* just kernel pointers */
> params[fmt_cnt] = args[fmt_cnt];
> fmt_cnt++;
>

2020-06-26 17:12:00

by Yonghong Song

Subject: Re: [PATCH v2 bpf-next 4/4] selftests/bpf: add bpf_iter test with bpf_get_task_stack()



On 6/25/20 5:13 PM, Song Liu wrote:
> The new test is similar to other bpf_iter tests.
>
> Signed-off-by: Song Liu <[email protected]>
> ---
> .../selftests/bpf/prog_tests/bpf_iter.c | 17 ++++++
> .../selftests/bpf/progs/bpf_iter_task_stack.c | 60 +++++++++++++++++++
> 2 files changed, 77 insertions(+)
> create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> index 87c29dde1cf96..baa83328f810d 100644
> --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> @@ -5,6 +5,7 @@
> #include "bpf_iter_netlink.skel.h"
> #include "bpf_iter_bpf_map.skel.h"
> #include "bpf_iter_task.skel.h"
> +#include "bpf_iter_task_stack.skel.h"
> #include "bpf_iter_task_file.skel.h"
> #include "bpf_iter_test_kern1.skel.h"
> #include "bpf_iter_test_kern2.skel.h"
> @@ -106,6 +107,20 @@ static void test_task(void)
> bpf_iter_task__destroy(skel);
> }
>
> +static void test_task_stack(void)
> +{
> + struct bpf_iter_task_stack *skel;
> +
> + skel = bpf_iter_task_stack__open_and_load();
> + if (CHECK(!skel, "bpf_iter_task_stack__open_and_load",
> + "skeleton open_and_load failed\n"))
> + return;
> +
> + do_dummy_read(skel->progs.dump_task_stack);
> +
> + bpf_iter_task_stack__destroy(skel);
> +}
> +
> static void test_task_file(void)
> {
> struct bpf_iter_task_file *skel;
> @@ -392,6 +407,8 @@ void test_bpf_iter(void)
> test_bpf_map();
> if (test__start_subtest("task"))
> test_task();
> + if (test__start_subtest("task_stack"))
> + test_task_stack();
> if (test__start_subtest("task_file"))
> test_task_file();
> if (test__start_subtest("anon"))
> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
> new file mode 100644
> index 0000000000000..83aca5b1a7965
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
> @@ -0,0 +1,60 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2020 Facebook */
> +/* "undefine" structs in vmlinux.h, because we "override" them below */
> +#define bpf_iter_meta bpf_iter_meta___not_used
> +#define bpf_iter__task bpf_iter__task___not_used
> +#include "vmlinux.h"
> +#undef bpf_iter_meta
> +#undef bpf_iter__task
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>

Could you rebase on top of the latest bpf-next?
A new header, bpf_iter.h, has been introduced and it will
make the code simpler.

> +
> +char _license[] SEC("license") = "GPL";
> +
> +/* bpf_get_task_stack needs a stackmap to work */
> +struct {
> + __uint(type, BPF_MAP_TYPE_STACK_TRACE);
> + __uint(max_entries, 16384);
> + __uint(key_size, sizeof(__u32));
> + __uint(value_size, sizeof(__u64) * 20);
> +} stackmap SEC(".maps");
> +
> +struct bpf_iter_meta {
> + struct seq_file *seq;
> + __u64 session_id;
> + __u64 seq_num;
> +} __attribute__((preserve_access_index));
> +
> +struct bpf_iter__task {
> + struct bpf_iter_meta *meta;
> + struct task_struct *task;
> +} __attribute__((preserve_access_index));
> +
> +#define MAX_STACK_TRACE_DEPTH 64
> +unsigned long entries[MAX_STACK_TRACE_DEPTH];
> +
> +SEC("iter/task")
> +int dump_task_stack(struct bpf_iter__task *ctx)
> +{
> + struct seq_file *seq = ctx->meta->seq;
> + struct task_struct *task = ctx->task;
> + int i, retlen;

retlen should be 'long' after the rebase?

> +
> + if (task == (void *)0)
> + return 0;
> +
> + retlen = bpf_get_task_stack(task, entries,
> + MAX_STACK_TRACE_DEPTH * sizeof(unsigned long), 0);
> + if (retlen < 0)
> + return 0;
> +
> + BPF_SEQ_PRINTF(seq, "pid: %8u num_entries: %8u\n", task->pid,
> + retlen / sizeof(unsigned long));

sizeof(unsigned long) is used a few times. It is worthwhile to
have a variable for it.

> + for (i = 0; i < MAX_STACK_TRACE_DEPTH / sizeof(unsigned long); i++) {
> + if (retlen > i * sizeof(unsigned long))
> + BPF_SEQ_PRINTF(seq, "[<0>] %pB\n", (void *)entries[i]);
> + }
> + BPF_SEQ_PRINTF(seq, "\n");
> +
> + return 0;
> +}
>

2020-06-26 20:14:35

by Andrii Nakryiko

Subject: Re: [PATCH v2 bpf-next 1/4] perf: export get/put_chain_entry()

On Fri, Jun 26, 2020 at 5:10 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Jun 25, 2020 at 05:13:29PM -0700, Song Liu wrote:
> > This would be used by bpf stack mapo.
>
> Would it make sense to sanitize the API a little before exposing it?
>
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 334d48b16c36..016894b0d2c2 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
> return NULL;
>
> entries = rcu_dereference(callchain_cpus_entries);
> - if (!entries)
> + if (!entries) {
> + put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
> return NULL;
> + }
>
> cpu = smp_processor_id();
>
> @@ -183,12 +185,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
> int rctx;
>
> entry = get_callchain_entry(&rctx);
> - if (rctx == -1)
> + if (!entry || rctx == -1)
> return NULL;
>

isn't the rctx == -1 check here unnecessary now? It seems like
get_callchain_entry() will always return NULL if rctx == -1?

> - if (!entry)
> - goto exit_put;
> -
> ctx.entry = entry;
> ctx.max_stack = max_stack;
> ctx.nr = entry->nr = init_nr;

2020-06-26 20:18:28

by Andrii Nakryiko

Subject: Re: [PATCH v2 bpf-next 2/4] bpf: introduce helper bpf_get_task_stak()

On Thu, Jun 25, 2020 at 5:14 PM Song Liu <[email protected]> wrote:
>
> Introduce helper bpf_get_task_stack(), which dumps stack trace of given
> task. This is different to bpf_get_stack(), which gets stack track of
> current task. One potential use case of bpf_get_task_stack() is to call
> it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.
>
> bpf_get_task_stack() uses stack_trace_save_tsk() instead of
> get_perf_callchain() for kernel stack. The benefit of this choice is that
> stack_trace_save_tsk() doesn't require changes in arch/. The downside of
> using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the
> stack trace to unsigned long array. For 32-bit systems, we need to
> translate it to u64 array.
>
> Signed-off-by: Song Liu <[email protected]>
> ---

Looks great. I just think that there are cases where the user doesn't
necessarily have a valid task_struct pointer, just a pid, so it would be
nice to have an extra helper so that such cases aren't artificially
restricted.

Acked-by: Andrii Nakryiko <[email protected]>

> include/linux/bpf.h | 1 +
> include/uapi/linux/bpf.h | 35 ++++++++++++++-
> kernel/bpf/stackmap.c | 79 ++++++++++++++++++++++++++++++++--
> kernel/trace/bpf_trace.c | 2 +
> scripts/bpf_helpers_doc.py | 2 +
> tools/include/uapi/linux/bpf.h | 35 ++++++++++++++-
> 6 files changed, 149 insertions(+), 5 deletions(-)
>

[...]

> + /* stack_trace_save_tsk() works on unsigned long array, while
> + * perf_callchain_entry uses u64 array. For 32-bit systems, it is
> + * necessary to fix this mismatch.
> + */
> + if (__BITS_PER_LONG != 64) {
> + unsigned long *from = (unsigned long *) entry->ip;
> + u64 *to = entry->ip;
> + int i;
> +
> + /* copy data from the end to avoid using extra buffer */
> + for (i = entry->nr - 1; i >= (int)init_nr; i--)
> + to[i] = (u64)(from[i]);

doing this forward would be just fine as well, no? First iteration
will cast and overwrite low 32-bits, all the subsequent iterations
won't even overlap.

> + }
> +
> +exit_put:
> + put_callchain_entry(rctx);
> +
> + return entry;
> +}
> +

[...]

> +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
> + u32, size, u64, flags)
> +{
> + struct pt_regs *regs = task_pt_regs(task);
> +
> + return __bpf_get_stack(regs, task, buf, size, flags);
> +}


So this takes advantage of BTF and having a direct task_struct
pointer. But for kprobes/tracepoint I think it would also be extremely
helpful to be able to request stack trace by PID. How about one more
helper which will wrap this one with get/put task by PID, e.g.,
bpf_get_pid_stack(int pid, void *buf, u32 size, u64 flags)? Would that
be a problem?

> +
> +static int bpf_get_task_stack_btf_ids[5];
> +const struct bpf_func_proto bpf_get_task_stack_proto = {
> + .func = bpf_get_task_stack,
> + .gpl_only = false,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_BTF_ID,
> + .arg2_type = ARG_PTR_TO_UNINIT_MEM,
> + .arg3_type = ARG_CONST_SIZE_OR_ZERO,
> + .arg4_type = ARG_ANYTHING,
> + .btf_id = bpf_get_task_stack_btf_ids,
> +};
> +

[...]

2020-06-26 20:22:04

by Andrii Nakryiko

Subject: Re: [PATCH v2 bpf-next 4/4] selftests/bpf: add bpf_iter test with bpf_get_task_stack()

On Thu, Jun 25, 2020 at 5:15 PM Song Liu <[email protected]> wrote:
>
> The new test is similar to other bpf_iter tests.
>
> Signed-off-by: Song Liu <[email protected]>
> ---
> .../selftests/bpf/prog_tests/bpf_iter.c | 17 ++++++
> .../selftests/bpf/progs/bpf_iter_task_stack.c | 60 +++++++++++++++++++
> 2 files changed, 77 insertions(+)
> create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> index 87c29dde1cf96..baa83328f810d 100644
> --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
> @@ -5,6 +5,7 @@
> #include "bpf_iter_netlink.skel.h"
> #include "bpf_iter_bpf_map.skel.h"
> #include "bpf_iter_task.skel.h"
> +#include "bpf_iter_task_stack.skel.h"
> #include "bpf_iter_task_file.skel.h"
> #include "bpf_iter_test_kern1.skel.h"
> #include "bpf_iter_test_kern2.skel.h"
> @@ -106,6 +107,20 @@ static void test_task(void)
> bpf_iter_task__destroy(skel);
> }
>
> +static void test_task_stack(void)
> +{
> + struct bpf_iter_task_stack *skel;
> +
> + skel = bpf_iter_task_stack__open_and_load();
> + if (CHECK(!skel, "bpf_iter_task_stack__open_and_load",
> + "skeleton open_and_load failed\n"))
> + return;
> +
> + do_dummy_read(skel->progs.dump_task_stack);
> +
> + bpf_iter_task_stack__destroy(skel);
> +}
> +
> static void test_task_file(void)
> {
> struct bpf_iter_task_file *skel;
> @@ -392,6 +407,8 @@ void test_bpf_iter(void)
> test_bpf_map();
> if (test__start_subtest("task"))
> test_task();
> + if (test__start_subtest("task_stack"))
> + test_task_stack();
> if (test__start_subtest("task_file"))
> test_task_file();
> if (test__start_subtest("anon"))
> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
> new file mode 100644
> index 0000000000000..83aca5b1a7965
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
> @@ -0,0 +1,60 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2020 Facebook */
> +/* "undefine" structs in vmlinux.h, because we "override" them below */
> +#define bpf_iter_meta bpf_iter_meta___not_used
> +#define bpf_iter__task bpf_iter__task___not_used
> +#include "vmlinux.h"
> +#undef bpf_iter_meta
> +#undef bpf_iter__task
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +
> +char _license[] SEC("license") = "GPL";
> +
> +/* bpf_get_task_stack needs a stackmap to work */

no it doesn't anymore :) please drop


[...]

2020-06-26 20:25:11

by Andrii Nakryiko

Subject: Re: [PATCH v2 bpf-next 2/4] bpf: introduce helper bpf_get_task_stak()

On Fri, Jun 26, 2020 at 1:17 PM Andrii Nakryiko
<[email protected]> wrote:
>
> On Thu, Jun 25, 2020 at 5:14 PM Song Liu <[email protected]> wrote:
> >
> > Introduce helper bpf_get_task_stack(), which dumps stack trace of given
> > task. This is different to bpf_get_stack(), which gets stack track of
> > current task. One potential use case of bpf_get_task_stack() is to call
> > it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.
> >
> > bpf_get_task_stack() uses stack_trace_save_tsk() instead of
> > get_perf_callchain() for kernel stack. The benefit of this choice is that
> > stack_trace_save_tsk() doesn't require changes in arch/. The downside of
> > using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the
> > stack trace to unsigned long array. For 32-bit systems, we need to
> > translate it to u64 array.
> >
> > Signed-off-by: Song Liu <[email protected]>
> > ---
>
> Looks great, I just think that there are cases where user doesn't
> necessarily has valid task_struct pointer, just pid, so would be nice
> to not artificially restrict such cases by having extra helper.
>
> Acked-by: Andrii Nakryiko <[email protected]>

oh, please also fix a typo in the subject, it will make grepping more
frustrating

[...]

2020-06-26 21:30:51

by Song Liu

Subject: Re: [PATCH v2 bpf-next 1/4] perf: export get/put_chain_entry()



> On Jun 26, 2020, at 4:00 AM, Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Jun 25, 2020 at 05:13:29PM -0700, Song Liu wrote:
>> This would be used by bpf stack mapo.
>
> Would it make sense to sanitize the API a little before exposing it?

I will fold this in the next version.

Thanks,
Song

>
> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
> index 334d48b16c36..016894b0d2c2 100644
> --- a/kernel/events/callchain.c
> +++ b/kernel/events/callchain.c
> @@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
> return NULL;
>
> entries = rcu_dereference(callchain_cpus_entries);
> - if (!entries)
> + if (!entries) {
> + put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
> return NULL;
> + }
>
> cpu = smp_processor_id();
>
> @@ -183,12 +185,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
> int rctx;
>
> entry = get_callchain_entry(&rctx);
> - if (rctx == -1)
> + if (!entry || rctx == -1)
> return NULL;
>
> - if (!entry)
> - goto exit_put;
> -
> ctx.entry = entry;
> ctx.max_stack = max_stack;
> ctx.nr = entry->nr = init_nr;

2020-06-26 21:39:39

by Song Liu

Subject: Re: [PATCH v2 bpf-next 1/4] perf: export get/put_chain_entry()



> On Jun 26, 2020, at 1:06 PM, Andrii Nakryiko <[email protected]> wrote:
>
> On Fri, Jun 26, 2020 at 5:10 AM Peter Zijlstra <[email protected]> wrote:
>>
>> On Thu, Jun 25, 2020 at 05:13:29PM -0700, Song Liu wrote:
>>> This would be used by bpf stack mapo.
>>
>> Would it make sense to sanitize the API a little before exposing it?
>>
>> diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
>> index 334d48b16c36..016894b0d2c2 100644
>> --- a/kernel/events/callchain.c
>> +++ b/kernel/events/callchain.c
>> @@ -159,8 +159,10 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
>> return NULL;
>>
>> entries = rcu_dereference(callchain_cpus_entries);
>> - if (!entries)
>> + if (!entries) {
>> + put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
>> return NULL;
>> + }
>>
>> cpu = smp_processor_id();
>>
>> @@ -183,12 +185,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
>> int rctx;
>>
>> entry = get_callchain_entry(&rctx);
>> - if (rctx == -1)
>> + if (!entry || rctx == -1)
>> return NULL;
>>
>
> isn't rctx == -1 check here not necessary anymore? Seems like
> get_callchain_entry() will always return NULL if rctx == -1?

Yes, looks like we only need to check entry.

Thanks,
Song

2020-06-26 22:41:11

by Song Liu

Subject: Re: [PATCH v2 bpf-next 2/4] bpf: introduce helper bpf_get_task_stak()



> On Jun 26, 2020, at 8:40 AM, Yonghong Song <[email protected]> wrote:
>
>
>
> On 6/25/20 5:13 PM, Song Liu wrote:
>> Introduce helper bpf_get_task_stack(), which dumps stack trace of given
>> task. This is different to bpf_get_stack(), which gets stack track of
>> current task. One potential use case of bpf_get_task_stack() is to call
>> it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.
>> bpf_get_task_stack() uses stack_trace_save_tsk() instead of
>> get_perf_callchain() for kernel stack. The benefit of this choice is that
>> stack_trace_save_tsk() doesn't require changes in arch/. The downside of
>> using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the
>> stack trace to unsigned long array. For 32-bit systems, we need to
>> translate it to u64 array.
>> Signed-off-by: Song Liu <[email protected]>
>>
[...]
>> +++ b/include/uapi/linux/bpf.h
>> @@ -3252,6 +3252,38 @@ union bpf_attr {
>> * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
>> * is returned or the error code -EACCES in case the skb is not
>> * subject to CHECKSUM_UNNECESSARY.
>> + *
>> + * int bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
>
> Andrii's recent patch changed the return type to 'long' to align with
> kernel u64 return type for better llvm code generation.
>
> Please rebase and you will see the new convention.

Will fix.

>
>> + * Description
>>

[...]

>> +static struct perf_callchain_entry *
>> +get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
>> +{
>> + struct perf_callchain_entry *entry;
>> + int rctx;
>> +
>> + entry = get_callchain_entry(&rctx);
>> +
>> + if (rctx == -1)
>> + return NULL;
>
> Is this needed? Should be below !entry enough?

It is needed before Peter's suggestion. After applying Peter's patch,
this is no longer needed.

Thanks,
Song


2020-06-26 22:46:31

by Song Liu

Subject: Re: [PATCH v2 bpf-next 2/4] bpf: introduce helper bpf_get_task_stak()



> On Jun 26, 2020, at 1:17 PM, Andrii Nakryiko <[email protected]> wrote:
>
> On Thu, Jun 25, 2020 at 5:14 PM Song Liu <[email protected]> wrote:
>>
>> Introduce helper bpf_get_task_stack(), which dumps stack trace of given
>> task. This is different to bpf_get_stack(), which gets stack track of
>> current task. One potential use case of bpf_get_task_stack() is to call
>> it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.
>>
>> bpf_get_task_stack() uses stack_trace_save_tsk() instead of
>> get_perf_callchain() for kernel stack. The benefit of this choice is that
>> stack_trace_save_tsk() doesn't require changes in arch/. The downside of
>> using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the
>> stack trace to unsigned long array. For 32-bit systems, we need to
>> translate it to u64 array.
>>
>> Signed-off-by: Song Liu <[email protected]>
>> ---
>
> Looks great, I just think that there are cases where user doesn't
> necessarily has valid task_struct pointer, just pid, so would be nice
> to not artificially restrict such cases by having extra helper.
>
> Acked-by: Andrii Nakryiko <[email protected]>

Thanks!

>
>> include/linux/bpf.h | 1 +
>> include/uapi/linux/bpf.h | 35 ++++++++++++++-
>> kernel/bpf/stackmap.c | 79 ++++++++++++++++++++++++++++++++--
>> kernel/trace/bpf_trace.c | 2 +
>> scripts/bpf_helpers_doc.py | 2 +
>> tools/include/uapi/linux/bpf.h | 35 ++++++++++++++-
>> 6 files changed, 149 insertions(+), 5 deletions(-)
>>
>
> [...]
>
>> + /* stack_trace_save_tsk() works on unsigned long array, while
>> + * perf_callchain_entry uses u64 array. For 32-bit systems, it is
>> + * necessary to fix this mismatch.
>> + */
>> + if (__BITS_PER_LONG != 64) {
>> + unsigned long *from = (unsigned long *) entry->ip;
>> + u64 *to = entry->ip;
>> + int i;
>> +
>> + /* copy data from the end to avoid using extra buffer */
>> + for (i = entry->nr - 1; i >= (int)init_nr; i--)
>> + to[i] = (u64)(from[i]);
>
> doing this forward would be just fine as well, no? First iteration
> will cast and overwrite low 32-bits, all the subsequent iterations
> won't even overlap.

I think first iteration will write zeros to higher 32 bits, no?

>
>> + }
>> +
>> +exit_put:
>> + put_callchain_entry(rctx);
>> +
>> + return entry;
>> +}
>> +
>
> [...]
>
>> +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
>> + u32, size, u64, flags)
>> +{
>> + struct pt_regs *regs = task_pt_regs(task);
>> +
>> + return __bpf_get_stack(regs, task, buf, size, flags);
>> +}
>
>
> So this takes advantage of BTF and having a direct task_struct
> pointer. But for kprobes/tracepoint I think it would also be extremely
> helpful to be able to request stack trace by PID. How about one more
> helper which will wrap this one with get/put task by PID, e.g.,
> bpf_get_pid_stack(int pid, void *buf, u32 size, u64 flags)? Would that
> be a problem?

That should work. Let me add that in a follow up patch.

2020-06-26 22:51:52

by Andrii Nakryiko

Subject: Re: [PATCH v2 bpf-next 2/4] bpf: introduce helper bpf_get_task_stak()

On Fri, Jun 26, 2020 at 3:45 PM Song Liu <[email protected]> wrote:
>
>
>
> > On Jun 26, 2020, at 1:17 PM, Andrii Nakryiko <[email protected]> wrote:
> >
> > On Thu, Jun 25, 2020 at 5:14 PM Song Liu <[email protected]> wrote:
> >>
> >> Introduce helper bpf_get_task_stack(), which dumps stack trace of given
> >> task. This is different to bpf_get_stack(), which gets stack track of
> >> current task. One potential use case of bpf_get_task_stack() is to call
> >> it from bpf_iter__task and dump all /proc/<pid>/stack to a seq_file.
> >>
> >> bpf_get_task_stack() uses stack_trace_save_tsk() instead of
> >> get_perf_callchain() for kernel stack. The benefit of this choice is that
> >> stack_trace_save_tsk() doesn't require changes in arch/. The downside of
> >> using stack_trace_save_tsk() is that stack_trace_save_tsk() dumps the
> >> stack trace to unsigned long array. For 32-bit systems, we need to
> >> translate it to u64 array.
> >>
> >> Signed-off-by: Song Liu <[email protected]>
> >> ---
> >
> > Looks great, I just think that there are cases where user doesn't
> > necessarily has valid task_struct pointer, just pid, so would be nice
> > to not artificially restrict such cases by having extra helper.
> >
> > Acked-by: Andrii Nakryiko <[email protected]>
>
> Thanks!
>
> >
> >> include/linux/bpf.h | 1 +
> >> include/uapi/linux/bpf.h | 35 ++++++++++++++-
> >> kernel/bpf/stackmap.c | 79 ++++++++++++++++++++++++++++++++--
> >> kernel/trace/bpf_trace.c | 2 +
> >> scripts/bpf_helpers_doc.py | 2 +
> >> tools/include/uapi/linux/bpf.h | 35 ++++++++++++++-
> >> 6 files changed, 149 insertions(+), 5 deletions(-)
> >>
> >
> > [...]
> >
> >> + /* stack_trace_save_tsk() works on unsigned long array, while
> >> + * perf_callchain_entry uses u64 array. For 32-bit systems, it is
> >> + * necessary to fix this mismatch.
> >> + */
> >> + if (__BITS_PER_LONG != 64) {
> >> + unsigned long *from = (unsigned long *) entry->ip;
> >> + u64 *to = entry->ip;
> >> + int i;
> >> +
> >> + /* copy data from the end to avoid using extra buffer */
> >> + for (i = entry->nr - 1; i >= (int)init_nr; i--)
> >> + to[i] = (u64)(from[i]);
> >
> > doing this forward would be just fine as well, no? First iteration
> > will cast and overwrite low 32-bits, all the subsequent iterations
> > won't even overlap.
>
> I think first iteration will write zeros to higher 32 bits, no?

Oh, wait, I completely misread what this is doing. It up-converts from
32-bit to 64-bit, sorry. Yeah, ignore me on this :)

But then I have another question. How do you know that entry->ip has
enough space to keep the same number of 2x bigger entries?

>
> >
> >> + }
> >> +
> >> +exit_put:
> >> + put_callchain_entry(rctx);
> >> +
> >> + return entry;
> >> +}
> >> +
> >
> > [...]
> >
> >> +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
> >> + u32, size, u64, flags)
> >> +{
> >> + struct pt_regs *regs = task_pt_regs(task);
> >> +
> >> + return __bpf_get_stack(regs, task, buf, size, flags);
> >> +}
> >
> >
> > So this takes advantage of BTF and having a direct task_struct
> > pointer. But for kprobes/tracepoint I think it would also be extremely
> > helpful to be able to request stack trace by PID. How about one more
> > helper which will wrap this one with get/put task by PID, e.g.,
> > bpf_get_pid_stack(int pid, void *buf, u32 size, u64 flags)? Would that
> > be a problem?
>
> That should work. Let me add that in a follow up patch.
>
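
For reference, a small standalone sketch of the in-place up-conversion
discussed above (illustrative, not the kernel code; build with
-fno-strict-aliasing, as the kernel does, since it aliases the buffer).
Walking backward means each 64-bit store lands at or past the 32-bit words
that are still unread, while a forward copy would clobber the first few
unread words:

#include <stdint.h>
#include <stdio.h>

/* Widen nr packed 32-bit values stored at the start of a u64 buffer, in
 * place, mirroring the loop in get_callchain_entry_for_task() on 32-bit
 * systems (minus the init_nr handling).
 */
static void widen_in_place(uint64_t *buf, int nr)
{
	uint32_t *from = (uint32_t *)buf;
	int i;

	/* copy from the end so no unread 32-bit slot is overwritten */
	for (i = nr - 1; i >= 0; i--)
		buf[i] = (uint64_t)from[i];
}

int main(void)
{
	uint64_t buf[4];
	uint32_t *packed = (uint32_t *)buf;
	int i;

	/* stand-in for what stack_trace_save_tsk() would have written */
	for (i = 0; i < 4; i++)
		packed[i] = 0x1000 + i;

	widen_in_place(buf, 4);

	for (i = 0; i < 4; i++)
		printf("0x%llx\n", (unsigned long long)buf[i]);
	return 0;
}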

2020-06-26 23:07:26

by Song Liu

Subject: Re: [PATCH v2 bpf-next 4/4] selftests/bpf: add bpf_iter test with bpf_get_task_stack()



> On Jun 26, 2020, at 1:21 PM, Andrii Nakryiko <[email protected]> wrote:
>
> On Thu, Jun 25, 2020 at 5:15 PM Song Liu <[email protected]> wrote:
>>
>> The new test is similar to other bpf_iter tests.
>>
>> Signed-off-by: Song Liu <[email protected]>
>> ---
>> .../selftests/bpf/prog_tests/bpf_iter.c | 17 ++++++
>> .../selftests/bpf/progs/bpf_iter_task_stack.c | 60 +++++++++++++++++++
>> 2 files changed, 77 insertions(+)
>> create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
>>
>> diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
>> index 87c29dde1cf96..baa83328f810d 100644
>> --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
>> +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
>> @@ -5,6 +5,7 @@
>> #include "bpf_iter_netlink.skel.h"
>> #include "bpf_iter_bpf_map.skel.h"
>> #include "bpf_iter_task.skel.h"
>> +#include "bpf_iter_task_stack.skel.h"
>> #include "bpf_iter_task_file.skel.h"
>> #include "bpf_iter_test_kern1.skel.h"
>> #include "bpf_iter_test_kern2.skel.h"
>> @@ -106,6 +107,20 @@ static void test_task(void)
>> bpf_iter_task__destroy(skel);
>> }
>>
>> +static void test_task_stack(void)
>> +{
>> + struct bpf_iter_task_stack *skel;
>> +
>> + skel = bpf_iter_task_stack__open_and_load();
>> + if (CHECK(!skel, "bpf_iter_task_stack__open_and_load",
>> + "skeleton open_and_load failed\n"))
>> + return;
>> +
>> + do_dummy_read(skel->progs.dump_task_stack);
>> +
>> + bpf_iter_task_stack__destroy(skel);
>> +}
>> +
>> static void test_task_file(void)
>> {
>> struct bpf_iter_task_file *skel;
>> @@ -392,6 +407,8 @@ void test_bpf_iter(void)
>> test_bpf_map();
>> if (test__start_subtest("task"))
>> test_task();
>> + if (test__start_subtest("task_stack"))
>> + test_task_stack();
>> if (test__start_subtest("task_file"))
>> test_task_file();
>> if (test__start_subtest("anon"))
>> diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
>> new file mode 100644
>> index 0000000000000..83aca5b1a7965
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
>> @@ -0,0 +1,60 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/* Copyright (c) 2020 Facebook */
>> +/* "undefine" structs in vmlinux.h, because we "override" them below */
>> +#define bpf_iter_meta bpf_iter_meta___not_used
>> +#define bpf_iter__task bpf_iter__task___not_used
>> +#include "vmlinux.h"
>> +#undef bpf_iter_meta
>> +#undef bpf_iter__task
>> +#include <bpf/bpf_helpers.h>
>> +#include <bpf/bpf_tracing.h>
>> +
>> +char _license[] SEC("license") = "GPL";
>> +
>> +/* bpf_get_task_stack needs a stackmap to work */
>
> no it doesn't anymore :) please drop

We still need stack_map_alloc() to call get_callchain_buffers() in this
case. Without an active stack map, get_callchain_buffers() may fail.

Thanks,
Song

2020-06-26 23:15:02

by Andrii Nakryiko

Subject: Re: [PATCH v2 bpf-next 4/4] selftests/bpf: add bpf_iter test with bpf_get_task_stack()

On Fri, Jun 26, 2020 at 4:05 PM Song Liu <[email protected]> wrote:
>
>
>
> > On Jun 26, 2020, at 1:21 PM, Andrii Nakryiko <[email protected]> wrote:
> >
> > On Thu, Jun 25, 2020 at 5:15 PM Song Liu <[email protected]> wrote:
> >>
> >> The new test is similar to other bpf_iter tests.
> >>
> >> [...]
> >>
> >> +/* bpf_get_task_stack needs a stackmap to work */
> >
> > no it doesn't anymore :) please drop
>
> We still need stack_map_alloc() to call get_callchain_buffers() in this
> case. Without an active stack map, get_callchain_buffers() may fail.

Oh... um... is it possible to do it some other way? It's an extremely
confusing dependency. Does bpf_get_stack() also require a stackmap?

>
> Thanks,
> Song

2020-06-26 23:41:53

by Song Liu

Subject: Re: [PATCH v2 bpf-next 4/4] selftests/bpf: add bpf_iter test with bpf_get_task_stack()



> On Jun 26, 2020, at 4:11 PM, Andrii Nakryiko <[email protected]> wrote:
>
> On Fri, Jun 26, 2020 at 4:05 PM Song Liu <[email protected]> wrote:
>>
>>
>>
>>> On Jun 26, 2020, at 1:21 PM, Andrii Nakryiko <[email protected]> wrote:
>>>
>>> On Thu, Jun 25, 2020 at 5:15 PM Song Liu <[email protected]> wrote:
>>>>
>>>> The new test is similar to other bpf_iter tests.
>>>>
>>>> [...]
>>>>
>>>> +/* bpf_get_task_stack needs a stackmap to work */
>>>
>>> no it doesn't anymore :) please drop
>>
>> We still need stack_map_alloc() to call get_callchain_buffers() in this
>> case. Without an active stack map, get_callchain_buffers() may fail.
>
> Oh... um... is it possible to do it some other way? It's an extremely
> confusing dependency. Does bpf_get_stack() also require a stackmap?
>

Aha, I thought bpf_get_stack() also required a stackmap, but it doesn't: its
callchain buffers are set up from check_helper_call() instead. Let me do the
same for bpf_get_task_stack().
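
A minimal sketch of that approach, loosely modeled on the existing
BPF_FUNC_get_stack handling in the verifier (paraphrased from memory, not a
verbatim copy; names and details may differ):

	/* In check_helper_call(): allocate callchain buffers the first time a
	 * program uses a stack-dumping helper, so no stackmap needs to exist.
	 */
	if ((func_id == BPF_FUNC_get_stack ||
	     func_id == BPF_FUNC_get_task_stack) &&
	    !env->prog->has_callchain_buf) {
		const char *err_str;

#ifdef CONFIG_PERF_EVENTS
		err = get_callchain_buffers(sysctl_perf_event_max_stack);
		err_str = "cannot get callchain buffer for func %s#%d\n";
#else
		err = -ENOTSUPP;
		err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
#endif
		if (err) {
			verbose(env, err_str, func_id_name(func_id), func_id);
			return err;
		}

		env->prog->has_callchain_buf = true;
	}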

Thanks,
Song

2020-06-26 23:52:56

by Song Liu

Subject: Re: [PATCH v2 bpf-next 2/4] bpf: introduce helper bpf_get_task_stak()



> On Jun 26, 2020, at 3:51 PM, Andrii Nakryiko <[email protected]> wrote:
>
> On Fri, Jun 26, 2020 at 3:45 PM Song Liu <[email protected]> wrote:
>>
>>
>>
>>> On Jun 26, 2020, at 1:17 PM, Andrii Nakryiko <[email protected]> wrote:
>>>
>>> On Thu, Jun 25, 2020 at 5:14 PM Song Liu <[email protected]> wrote:
>>>>
>>>> Introduce helper bpf_get_task_stack(), which dumps the stack trace of a
>>>> given task. This is different from bpf_get_stack(), which gets the stack
>>>> trace of the current task. One potential use case of bpf_get_task_stack()
>>>> is to call it from bpf_iter__task and dump all /proc/<pid>/stack to a
>>>> seq_file.
>>>>
>>>> bpf_get_task_stack() uses stack_trace_save_tsk() instead of
>>>> get_perf_callchain() for the kernel stack. The benefit of this choice is
>>>> that stack_trace_save_tsk() doesn't require changes in arch/. The downside
>>>> is that stack_trace_save_tsk() dumps the stack trace into an unsigned long
>>>> array, so for 32-bit systems we need to translate it to a u64 array.
>>>>
>>>> Signed-off-by: Song Liu <[email protected]>
>>>> ---
>>>
>>> Looks great. I just think that there are cases where the user doesn't
>>> necessarily have a valid task_struct pointer, just a pid, so it would be
>>> nice not to artificially restrict such cases by having an extra helper.
>>>
>>> Acked-by: Andrii Nakryiko <[email protected]>
>>
>> Thanks!
>>
>>>
>>>> include/linux/bpf.h | 1 +
>>>> include/uapi/linux/bpf.h | 35 ++++++++++++++-
>>>> kernel/bpf/stackmap.c | 79 ++++++++++++++++++++++++++++++++--
>>>> kernel/trace/bpf_trace.c | 2 +
>>>> scripts/bpf_helpers_doc.py | 2 +
>>>> tools/include/uapi/linux/bpf.h | 35 ++++++++++++++-
>>>> 6 files changed, 149 insertions(+), 5 deletions(-)
>>>>
>>>
>>> [...]
>>>
>>>> + /* stack_trace_save_tsk() works on unsigned long array, while
>>>> + * perf_callchain_entry uses u64 array. For 32-bit systems, it is
>>>> + * necessary to fix this mismatch.
>>>> + */
>>>> + if (__BITS_PER_LONG != 64) {
>>>> + unsigned long *from = (unsigned long *) entry->ip;
>>>> + u64 *to = entry->ip;
>>>> + int i;
>>>> +
>>>> + /* copy data from the end to avoid using extra buffer */
>>>> + for (i = entry->nr - 1; i >= (int)init_nr; i--)
>>>> + to[i] = (u64)(from[i]);
>>>
>>> doing this forward would be just fine as well, no? First iteration
>>> will cast and overwrite low 32-bits, all the subsequent iterations
>>> won't even overlap.
>>
>> I think first iteration will write zeros to higher 32 bits, no?
>
> Oh, wait, I completely misread what this is doing. It up-converts from
> 32-bit to 64-bit, sorry. Yeah, ignore me on this :)
>
> But then I have another question. How do you know that entry->ip has
> enough space to keep the same number of 2x bigger entries?

The buffer is sized for sysctl_perf_event_max_stack u64 entries.
stack_trace_save_tsk() will put at most sysctl_perf_event_max_stack unsigned
longs in it (init_nr == 0), so the buffer is big enough.
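
To make the in-place widening concrete, here is a small standalone user-space
sketch of the same pattern (plain C, not kernel code; buffer size and values
are made up). uint32_t stands in for unsigned long on a 32-bit system, and the
type-punning mirrors what the helper does (the kernel builds with
-fno-strict-aliasing):

#include <stdint.h>
#include <stdio.h>

#define N 4	/* pretend sysctl_perf_event_max_stack == 4 */

int main(void)
{
	/* Buffer sized for N 64-bit entries; initially the first N*4 bytes
	 * hold N 32-bit values, as stack_trace_save_tsk() would leave them
	 * on a 32-bit kernel.
	 */
	uint64_t buf[N];
	uint32_t *from = (uint32_t *)buf;
	int i;

	for (i = 0; i < N; i++)
		from[i] = 0x1000 + i;

	/* Widen in place, last entry first, so no source slot is clobbered
	 * before it has been read.
	 */
	for (i = N - 1; i >= 0; i--)
		buf[i] = (uint64_t)from[i];

	for (i = 0; i < N; i++)
		printf("0x%llx\n", (unsigned long long)buf[i]);
	return 0;
}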

Thanks,
Song

2020-06-27 00:09:02

by Andrii Nakryiko

Subject: Re: [PATCH v2 bpf-next 2/4] bpf: introduce helper bpf_get_task_stak()

On Fri, Jun 26, 2020 at 4:47 PM Song Liu <[email protected]> wrote:
>
>
>
> > On Jun 26, 2020, at 3:51 PM, Andrii Nakryiko <[email protected]> wrote:
> >
> > On Fri, Jun 26, 2020 at 3:45 PM Song Liu <[email protected]> wrote:
> >>
> >>
> >>
> >>> On Jun 26, 2020, at 1:17 PM, Andrii Nakryiko <[email protected]> wrote:
> >>>
> >>> On Thu, Jun 25, 2020 at 5:14 PM Song Liu <[email protected]> wrote:
> >>>>
> >>>> [...]
> >>>
> >>>> + /* stack_trace_save_tsk() works on unsigned long array, while
> >>>> + * perf_callchain_entry uses u64 array. For 32-bit systems, it is
> >>>> + * necessary to fix this mismatch.
> >>>> + */
> >>>> + if (__BITS_PER_LONG != 64) {
> >>>> + unsigned long *from = (unsigned long *) entry->ip;
> >>>> + u64 *to = entry->ip;
> >>>> + int i;
> >>>> +
> >>>> + /* copy data from the end to avoid using extra buffer */
> >>>> + for (i = entry->nr - 1; i >= (int)init_nr; i--)
> >>>> + to[i] = (u64)(from[i]);
> >>>
> >>> doing this forward would be just fine as well, no? First iteration
> >>> will cast and overwrite low 32-bits, all the subsequent iterations
> >>> won't even overlap.
> >>
> >> I think first iteration will write zeros to higher 32 bits, no?
> >
> > Oh, wait, I completely misread what this is doing. It up-converts from
> > 32-bit to 64-bit, sorry. Yeah, ignore me on this :)
> >
> > But then I have another question. How do you know that entry->ip has
> > enough space to keep the same number of 2x bigger entries?
>
> The buffer is sized for sysctl_perf_event_max_stack u64 entries.
> stack_trace_save_tsk() will put at most sysctl_perf_event_max_stack unsigned
> longs in it (init_nr == 0), so the buffer is big enough.
>

Awesome, thanks for clarification!

> Thanks,
> Song