Previously, a BPF event built from augmented_raw_syscalls.c could be
used to enable augmentation of syscalls by perf trace. As BPF events
are no longer supported, switch to using a BPF skeleton which, when
attached, explicitly opens the sys_enter and sys_exit tracepoints.
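For reference, the skeleton generated from the .bpf.c source by bpftool
follows the usual libbpf open/load/attach lifecycle. A minimal sketch
using the names this patch introduces (error handling elided, so
illustrative only, not the exact code below):

  struct augmented_raw_syscalls_bpf *skel;

  skel = augmented_raw_syscalls_bpf__open();      /* parse the embedded object */
  if (skel) {
          /* bpf_program__set_autoattach() tweaks go here, before load */
          if (augmented_raw_syscalls_bpf__load(skel) == 0)  /* create maps, verify */
                  augmented_raw_syscalls_bpf__attach(skel); /* open the tracepoints */
  }
  /* ... use it ... */
  augmented_raw_syscalls_bpf__destroy(skel);      /* detach and free, NULL-safe */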
The dump map is removed, as debugging wasn't supported by the
augmentation; bpf_printk can be used when necessary.
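For example, a one-off debug print from the BPF side would look
something like this sketch (syscall_nr assumes the tracepoint args
struct declared in this file), with output readable from the tracefs
trace_pipe while the programs run:

  /* illustrative only */
  bpf_printk("sys_enter: nr=%ld", args->syscall_nr);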
Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
rename/migration to a BPF skeleton captures that this was the source.
Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/Makefile.perf | 1 +
tools/perf/builtin-trace.c | 180 +++++++++++-------
.../bpf_skel/augmented_raw_syscalls.bpf.c} | 27 +--
3 files changed, 131 insertions(+), 77 deletions(-)
rename tools/perf/{examples/bpf/augmented_raw_syscalls.c => util/bpf_skel/augmented_raw_syscalls.bpf.c} (96%)
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 6ec5079fd697..0e1597712b95 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1042,6 +1042,7 @@ SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
SKELETONS += $(SKEL_OUT)/off_cpu.skel.h $(SKEL_OUT)/lock_contention.skel.h
SKELETONS += $(SKEL_OUT)/kwork_trace.skel.h $(SKEL_OUT)/sample_filter.skel.h
SKELETONS += $(SKEL_OUT)/bench_uprobe.skel.h
+SKELETONS += $(SKEL_OUT)/augmented_raw_syscalls.skel.h
$(SKEL_TMP_OUT) $(LIBAPI_OUTPUT) $(LIBBPF_OUTPUT) $(LIBPERF_OUTPUT) $(LIBSUBCMD_OUTPUT) $(LIBSYMBOL_OUTPUT):
$(Q)$(MKDIR) -p $@
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 59862467e781..8625fca42cd8 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -19,6 +19,9 @@
#ifdef HAVE_LIBBPF_SUPPORT
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
+#ifdef HAVE_BPF_SKEL
+#include "bpf_skel/augmented_raw_syscalls.skel.h"
+#endif
#endif
#include "util/bpf_map.h"
#include "util/rlimit.h"
@@ -127,25 +130,19 @@ struct trace {
struct syscalltbl *sctbl;
struct {
struct syscall *table;
- struct { // per syscall BPF_MAP_TYPE_PROG_ARRAY
- struct bpf_map *sys_enter,
- *sys_exit;
- } prog_array;
struct {
struct evsel *sys_enter,
- *sys_exit,
- *augmented;
+ *sys_exit,
+ *bpf_output;
} events;
- struct bpf_program *unaugmented_prog;
} syscalls;
- struct {
- struct bpf_map *map;
- } dump;
+#ifdef HAVE_BPF_SKEL
+ struct augmented_raw_syscalls_bpf *skel;
+#endif
struct record_opts opts;
struct evlist *evlist;
struct machine *host;
struct thread *current;
- struct bpf_object *bpf_obj;
struct cgroup *cgroup;
u64 base_time;
FILE *output;
@@ -415,6 +412,7 @@ static int evsel__init_syscall_tp(struct evsel *evsel)
if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
return -ENOENT;
+
return 0;
}
@@ -2845,7 +2843,7 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel,
if (thread)
trace__fprintf_comm_tid(trace, thread, trace->output);
- if (evsel == trace->syscalls.events.augmented) {
+ if (evsel == trace->syscalls.events.bpf_output) {
int id = perf_evsel__sc_tp_uint(evsel, id, sample);
struct syscall *sc = trace__syscall_info(trace, evsel, id);
@@ -3278,24 +3276,16 @@ static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
goto out;
}
-#ifdef HAVE_LIBBPF_SUPPORT
-static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace, const char *name)
-{
- if (trace->bpf_obj == NULL)
- return NULL;
-
- return bpf_object__find_map_by_name(trace->bpf_obj, name);
-}
-
+#ifdef HAVE_BPF_SKEL
static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name)
{
struct bpf_program *pos, *prog = NULL;
const char *sec_name;
- if (trace->bpf_obj == NULL)
+ if (trace->skel->obj == NULL)
return NULL;
- bpf_object__for_each_program(pos, trace->bpf_obj) {
+ bpf_object__for_each_program(pos, trace->skel->obj) {
sec_name = bpf_program__section_name(pos);
if (sec_name && !strcmp(sec_name, name)) {
prog = pos;
@@ -3313,12 +3303,14 @@ static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, str
if (prog_name == NULL) {
char default_prog_name[256];
- scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->name);
+ scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s",
+ type, sc->name);
prog = trace__find_bpf_program_by_title(trace, default_prog_name);
if (prog != NULL)
goto out_found;
if (sc->fmt && sc->fmt->alias) {
- scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->fmt->alias);
+ scnprintf(default_prog_name, sizeof(default_prog_name),
+ "tp/syscalls/sys_%s_%s", type, sc->fmt->alias);
prog = trace__find_bpf_program_by_title(trace, default_prog_name);
if (prog != NULL)
goto out_found;
@@ -3336,7 +3328,7 @@ static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, str
pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n",
prog_name, type, sc->name);
out_unaugmented:
- return trace->syscalls.unaugmented_prog;
+ return trace->skel->progs.syscall_unaugmented;
}
static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
@@ -3353,13 +3345,21 @@ static void trace__init_syscall_bpf_progs(struct trace *trace, int id)
static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id)
{
struct syscall *sc = trace__syscall_info(trace, NULL, id);
- return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->syscalls.unaugmented_prog);
+
+ if (sc)
+ return bpf_program__fd(sc->bpf_prog.sys_enter);
+
+ return bpf_program__fd(trace->skel->progs.syscall_unaugmented);
}
static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id)
{
struct syscall *sc = trace__syscall_info(trace, NULL, id);
- return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->syscalls.unaugmented_prog);
+
+ if (sc)
+ return bpf_program__fd(sc->bpf_prog.sys_exit);
+
+ return bpf_program__fd(trace->skel->progs.syscall_unaugmented);
}
static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc)
@@ -3384,7 +3384,7 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
bool is_candidate = false;
if (pair == NULL || pair == sc ||
- pair->bpf_prog.sys_enter == trace->syscalls.unaugmented_prog)
+ pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented)
continue;
for (field = sc->args, candidate_field = pair->args;
@@ -3437,7 +3437,7 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
*/
if (pair_prog == NULL) {
pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter");
- if (pair_prog == trace->syscalls.unaugmented_prog)
+ if (pair_prog == trace->skel->progs.syscall_unaugmented)
goto next_candidate;
}
@@ -3452,8 +3452,8 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace
static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
{
- int map_enter_fd = bpf_map__fd(trace->syscalls.prog_array.sys_enter),
- map_exit_fd = bpf_map__fd(trace->syscalls.prog_array.sys_exit);
+ int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter);
+ int map_exit_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_exit);
int err = 0, key;
for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) {
@@ -3515,7 +3515,7 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
* For now we're just reusing the sys_enter prog, and if it
* already has an augmenter, we don't need to find one.
*/
- if (sc->bpf_prog.sys_enter != trace->syscalls.unaugmented_prog)
+ if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented)
continue;
/*
@@ -3538,22 +3538,9 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace)
break;
}
-
return err;
}
-
-#else // HAVE_LIBBPF_SUPPORT
-static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace __maybe_unused,
- const char *name __maybe_unused)
-{
- return NULL;
-}
-
-static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace __maybe_unused)
-{
- return 0;
-}
-#endif // HAVE_LIBBPF_SUPPORT
+#endif // HAVE_BPF_SKEL
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
@@ -3917,13 +3904,31 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
err = evlist__open(evlist);
if (err < 0)
goto out_error_open;
+#ifdef HAVE_BPF_SKEL
+ {
+ struct perf_cpu cpu;
+ /*
+ * Set up the __augmented_syscalls__ BPF map to hold for each
+ * CPU the bpf-output event's file descriptor.
+ */
+ perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) {
+ bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__,
+ &cpu.cpu, sizeof(int),
+ xyarray__entry(trace->syscalls.events.bpf_output->core.fd,
+ cpu.cpu, 0),
+ sizeof(__u32), BPF_ANY);
+ }
+ }
+#endif
err = trace__set_filter_pids(trace);
if (err < 0)
goto out_error_mem;
- if (trace->syscalls.prog_array.sys_enter)
+#ifdef HAVE_BPF_SKEL
+ if (trace->skel->progs.sys_enter)
trace__init_syscalls_bpf_prog_array_maps(trace);
+#endif
if (trace->ev_qualifier_ids.nr > 0) {
err = trace__set_ev_qualifier_filter(trace);
@@ -3956,9 +3961,6 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
if (err < 0)
goto out_error_apply_filters;
- if (trace->dump.map)
- bpf_map__fprintf(trace->dump.map, trace->output);
-
err = evlist__mmap(evlist, trace->opts.mmap_pages);
if (err < 0)
goto out_error_mmap;
@@ -4655,6 +4657,18 @@ static void trace__exit(struct trace *trace)
zfree(&trace->perfconfig_events);
}
+#ifdef HAVE_BPF_SKEL
+static int bpf__setup_bpf_output(struct evlist *evlist)
+{
+ int err = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/");
+
+ if (err)
+ pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n");
+
+ return err;
+}
+#endif
+
int cmd_trace(int argc, const char **argv)
{
const char *trace_usage[] = {
@@ -4686,7 +4700,6 @@ int cmd_trace(int argc, const char **argv)
.max_stack = UINT_MAX,
.max_events = ULONG_MAX,
};
- const char *map_dump_str = NULL;
const char *output_name = NULL;
const struct option trace_options[] = {
OPT_CALLBACK('e', "event", &trace, "event",
@@ -4720,9 +4733,6 @@ int cmd_trace(int argc, const char **argv)
OPT_CALLBACK(0, "duration", &trace, "float",
"show only events with duration > N.M ms",
trace__set_duration),
-#ifdef HAVE_LIBBPF_SUPPORT
- OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"),
-#endif
OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
OPT_INCR('v', "verbose", &verbose, "be more verbose"),
OPT_BOOLEAN('T', "time", &trace.full_time,
@@ -4849,16 +4859,55 @@ int cmd_trace(int argc, const char **argv)
"cgroup monitoring only available in system-wide mode");
}
- err = -1;
+#ifdef HAVE_BPF_SKEL
+ trace.skel = augmented_raw_syscalls_bpf__open();
+ if (!trace.skel) {
+ pr_debug("Failed to open augmented syscalls BPF skeleton");
+ } else {
+ /*
+ * Disable attaching the BPF programs except for sys_enter and
+ * sys_exit that tail call into this as necessary.
+ */
+ bpf_program__set_autoattach(trace.skel->progs.syscall_unaugmented,
+ /*autoattach=*/false);
+ bpf_program__set_autoattach(trace.skel->progs.sys_enter_connect,
+ /*autoattach=*/false);
+ bpf_program__set_autoattach(trace.skel->progs.sys_enter_sendto,
+ /*autoattach=*/false);
+ bpf_program__set_autoattach(trace.skel->progs.sys_enter_open,
+ /*autoattach=*/false);
+ bpf_program__set_autoattach(trace.skel->progs.sys_enter_openat,
+ /*autoattach=*/false);
+ bpf_program__set_autoattach(trace.skel->progs.sys_enter_rename,
+ /*autoattach=*/false);
+ bpf_program__set_autoattach(trace.skel->progs.sys_enter_renameat,
+ /*autoattach=*/false);
+ bpf_program__set_autoattach(trace.skel->progs.sys_enter_perf_event_open,
+ /*autoattach=*/false);
+ bpf_program__set_autoattach(trace.skel->progs.sys_enter_clock_nanosleep,
+ /*autoattach=*/false);
+
+ err = augmented_raw_syscalls_bpf__load(trace.skel);
- if (map_dump_str) {
- trace.dump.map = trace__find_bpf_map_by_name(&trace, map_dump_str);
- if (trace.dump.map == NULL) {
- pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str);
- goto out;
+ if (err < 0) {
+ pr_debug("Failed to load augmented syscalls BPF skeleton\n");
+ } else {
+ augmented_raw_syscalls_bpf__attach(trace.skel);
+ trace__add_syscall_newtp(&trace);
}
}
+ err = bpf__setup_bpf_output(trace.evlist);
+ if (err) {
+ libbpf_strerror(err, bf, sizeof(bf));
+ pr_err("ERROR: Setup BPF output event failed: %s\n", bf);
+ goto out;
+ }
+ trace.syscalls.events.bpf_output = evlist__last(trace.evlist);
+ assert(!strcmp(evsel__name(trace.syscalls.events.bpf_output), "__augmented_syscalls__"));
+#endif
+ err = -1;
+
if (trace.trace_pgfaults) {
trace.opts.sample_address = true;
trace.opts.sample_time = true;
@@ -4909,7 +4958,7 @@ int cmd_trace(int argc, const char **argv)
* buffers that are being copied from kernel to userspace, think 'read'
* syscall.
*/
- if (trace.syscalls.events.augmented) {
+ if (trace.syscalls.events.bpf_output) {
evlist__for_each_entry(trace.evlist, evsel) {
bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0;
@@ -4918,9 +4967,9 @@ int cmd_trace(int argc, const char **argv)
goto init_augmented_syscall_tp;
}
- if (trace.syscalls.events.augmented->priv == NULL &&
+ if (trace.syscalls.events.bpf_output->priv == NULL &&
strstr(evsel__name(evsel), "syscalls:sys_enter")) {
- struct evsel *augmented = trace.syscalls.events.augmented;
+ struct evsel *augmented = trace.syscalls.events.bpf_output;
if (evsel__init_augmented_syscall_tp(augmented, evsel) ||
evsel__init_augmented_syscall_tp_args(augmented))
goto out;
@@ -5025,5 +5074,8 @@ int cmd_trace(int argc, const char **argv)
fclose(trace.output);
out:
trace__exit(&trace);
+#ifdef HAVE_BPF_SKEL
+ augmented_raw_syscalls_bpf__destroy(trace.skel);
+#endif
return err;
}
diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
similarity index 96%
rename from tools/perf/examples/bpf/augmented_raw_syscalls.c
rename to tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 9a03189d33d3..70478b9460ee 100644
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -18,6 +18,8 @@
#include <bpf/bpf_helpers.h>
#include <linux/limits.h>
+#define MAX_CPUS 4096
+
// FIXME: These should come from system headers
typedef char bool;
typedef int pid_t;
@@ -34,7 +36,7 @@ struct __augmented_syscalls__ {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__type(key, int);
__type(value, __u32);
- __uint(max_entries, __NR_CPUS__);
+ __uint(max_entries, MAX_CPUS);
} __augmented_syscalls__ SEC(".maps");
/*
@@ -170,7 +172,7 @@ unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const
return augmented_len;
}
-SEC("!raw_syscalls:unaugmented")
+SEC("tp/raw_syscalls/sys_enter")
int syscall_unaugmented(struct syscall_enter_args *args)
{
return 1;
@@ -182,7 +184,7 @@ int syscall_unaugmented(struct syscall_enter_args *args)
* on from there, reading the first syscall arg as a string, i.e. open's
* filename.
*/
-SEC("!syscalls:sys_enter_connect")
+SEC("tp/syscalls/sys_enter_connect")
int sys_enter_connect(struct syscall_enter_args *args)
{
struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -201,7 +203,7 @@ int sys_enter_connect(struct syscall_enter_args *args)
return augmented__output(args, augmented_args, len + socklen);
}
-SEC("!syscalls:sys_enter_sendto")
+SEC("tp/syscalls/sys_enter_sendto")
int sys_enter_sendto(struct syscall_enter_args *args)
{
struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -220,7 +222,7 @@ int sys_enter_sendto(struct syscall_enter_args *args)
return augmented__output(args, augmented_args, len + socklen);
}
-SEC("!syscalls:sys_enter_open")
+SEC("tp/syscalls/sys_enter_open")
int sys_enter_open(struct syscall_enter_args *args)
{
struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -235,7 +237,7 @@ int sys_enter_open(struct syscall_enter_args *args)
return augmented__output(args, augmented_args, len);
}
-SEC("!syscalls:sys_enter_openat")
+SEC("tp/syscalls/sys_enter_openat")
int sys_enter_openat(struct syscall_enter_args *args)
{
struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -250,7 +252,7 @@ int sys_enter_openat(struct syscall_enter_args *args)
return augmented__output(args, augmented_args, len);
}
-SEC("!syscalls:sys_enter_rename")
+SEC("tp/syscalls/sys_enter_rename")
int sys_enter_rename(struct syscall_enter_args *args)
{
struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -267,7 +269,7 @@ int sys_enter_rename(struct syscall_enter_args *args)
return augmented__output(args, augmented_args, len);
}
-SEC("!syscalls:sys_enter_renameat")
+SEC("tp/syscalls/sys_enter_renameat")
int sys_enter_renameat(struct syscall_enter_args *args)
{
struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -295,7 +297,7 @@ struct perf_event_attr_size {
__u32 size;
};
-SEC("!syscalls:sys_enter_perf_event_open")
+SEC("tp/syscalls/sys_enter_perf_event_open")
int sys_enter_perf_event_open(struct syscall_enter_args *args)
{
struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -327,7 +329,7 @@ int sys_enter_perf_event_open(struct syscall_enter_args *args)
return 1; /* Failure: don't filter */
}
-SEC("!syscalls:sys_enter_clock_nanosleep")
+SEC("tp/syscalls/sys_enter_clock_nanosleep")
int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
{
struct augmented_args_payload *augmented_args = augmented_args_payload();
@@ -358,7 +360,7 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
return bpf_map_lookup_elem(pids, &pid) != NULL;
}
-SEC("raw_syscalls:sys_enter")
+SEC("tp/raw_syscalls/sys_enter")
int sys_enter(struct syscall_enter_args *args)
{
struct augmented_args_payload *augmented_args;
@@ -371,7 +373,6 @@ int sys_enter(struct syscall_enter_args *args)
* We'll add to this as we add augmented syscalls right after that
* initial, non-augmented raw_syscalls:sys_enter payload.
*/
- unsigned int len = sizeof(augmented_args->args);
if (pid_filter__has(&pids_filtered, getpid()))
return 0;
@@ -393,7 +394,7 @@ int sys_enter(struct syscall_enter_args *args)
return 0;
}
-SEC("raw_syscalls:sys_exit")
+SEC("tp/raw_syscalls/sys_exit")
int sys_exit(struct syscall_exit_args *args)
{
struct syscall_exit_args exit_args;
--
2.41.0.640.ga95def55d0-goog
On Thu, Aug 10, 2023 at 11:48:51AM -0700, Ian Rogers wrote:
> Previously, a BPF event built from augmented_raw_syscalls.c could be
> used to enable augmentation of syscalls by perf trace. As BPF events
> are no longer supported, switch to using a BPF skeleton which, when
> attached, explicitly opens the sys_enter and sys_exit tracepoints.
>
> The dump map is removed, as debugging wasn't supported by the
> augmentation; bpf_printk can be used when necessary.
>
> Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
> rename/migration to a BPF skeleton captures that this was the source.
there's still some:
[jolsa@krava perf]$ grep -r augmented_raw_syscalls.c
builtin-trace.c: * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
builtin-trace.c: * tools/perf/examples/bpf/augmented_raw_syscalls.c,
Documentation/perf-trace.txt: living in tools/perf/examples/bpf/augmented_raw_syscalls.c. For now this
jirka
On Fri, Aug 11, 2023 at 9:09 AM Jiri Olsa <[email protected]> wrote:
>
> On Thu, Aug 10, 2023 at 11:48:51AM -0700, Ian Rogers wrote:
> > Previously, a BPF event built from augmented_raw_syscalls.c could be
> > used to enable augmentation of syscalls by perf trace. As BPF events
> > are no longer supported, switch to using a BPF skeleton which, when
> > attached, explicitly opens the sys_enter and sys_exit tracepoints.
> >
> > The dump map is removed, as debugging wasn't supported by the
> > augmentation; bpf_printk can be used when necessary.
> >
> > Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
> > rename/migration to a BPF skeleton captures that this was the source.
>
> there's still some:
>
> [jolsa@krava perf]$ grep -r augmented_raw_syscalls.c
> builtin-trace.c: * (now tools/perf/examples/bpf/augmented_raw_syscalls.c, so that it
> builtin-trace.c: * tools/perf/examples/bpf/augmented_raw_syscalls.c,
> Documentation/perf-trace.txt: living in tools/perf/examples/bpf/augmented_raw_syscalls.c. For now this
Agreed, I'll double check, but the later patches remove these. I was
trying to keep this patch down to the minimum needed to switch from one
approach to the other.
Thanks,
Ian
On Thu, Aug 10, 2023 at 11:48:51AM -0700, Ian Rogers wrote:
> Previously a BPF event of augmented_raw_syscalls.c could be used to
> enable augmentation of syscalls by perf trace. As BPF events are no
> longer supported, switch to using a BPF skeleton which when attached
> explicitly opens the sysenter and sysexit tracepoints.
>
> The dump map is removed as debugging wasn't supported by the
> augmentation and bpf_printk can be used when necessary.
>
> Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
> rename/migration to a BPF skeleton captures that this was the source.
So, there is a problem where the augmented_raw_syscalls connect/sendto
handlers are being rejected by the verifier. The way you did it makes
perf trace print the verifier output and then continue without
augmentation; unsure if this is a good default, opinions?
[root@quaco ~]# perf trace -e open*
libbpf: prog 'sys_enter_connect': BPF program load failed: Permission denied
libbpf: prog 'sys_enter_connect': -- BEGIN PROG LOAD LOG --
reg type unsupported for arg#0 function sys_enter_connect#59
0: R1=ctx(off=0,imm=0) R10=fp0
; int sys_enter_connect(struct syscall_enter_args *args)
0: (bf) r6 = r1 ; R1=ctx(off=0,imm=0) R6_w=ctx(off=0,imm=0)
1: (b7) r1 = 0 ; R1_w=0
; int key = 0;
2: (63) *(u32 *)(r10 -4) = r1 ; R1_w=0 R10=fp0 fp-8=0000????
3: (bf) r2 = r10 ; R2_w=fp0 R10=fp0
;
4: (07) r2 += -4 ; R2_w=fp-4
; return bpf_map_lookup_elem(&augmented_args_tmp, &key);
5: (18) r1 = 0xffff8de5ae1d4600 ; R1_w=map_ptr(off=0,ks=4,vs=8272,imm=0)
7: (85) call bpf_map_lookup_elem#1 ; R0_w=map_value_or_null(id=1,off=0,ks=4,vs=8272,imm=0)
8: (bf) r7 = r0 ; R0_w=map_value_or_null(id=1,off=0,ks=4,vs=8272,imm=0) R7_w=map_value_or_null(id=1,off=0,ks=4,vs=8272,imm=0)
9: (b7) r0 = 1 ; R0_w=1
; if (augmented_args == NULL)
10: (15) if r7 == 0x0 goto pc+25 ; R7_w=map_value(off=0,ks=4,vs=8272,imm=0)
; unsigned int socklen = args->args[2];
11: (79) r1 = *(u64 *)(r6 +32) ; R1_w=scalar() R6_w=ctx(off=0,imm=0)
;
12: (bf) r2 = r1 ; R1_w=scalar(id=2) R2_w=scalar(id=2)
13: (67) r2 <<= 32 ; R2_w=scalar(smax=9223372032559808512,umax=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min=0,s32_max=0,u32_max=0)
14: (77) r2 >>= 32 ; R2_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
15: (b7) r8 = 128 ; R8=128
; if (socklen > sizeof(augmented_args->saddr))
16: (25) if r2 > 0x80 goto pc+1 ; R2=scalar(umax=128,var_off=(0x0; 0xff))
17: (bf) r8 = r1 ; R1=scalar(id=2) R8_w=scalar(id=2)
; const void *sockaddr_arg = (const void *)args->args[1];
18: (79) r3 = *(u64 *)(r6 +24) ; R3_w=scalar() R6=ctx(off=0,imm=0)
; bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
19: (bf) r1 = r7 ; R1_w=map_value(off=0,ks=4,vs=8272,imm=0) R7=map_value(off=0,ks=4,vs=8272,imm=0)
20: (07) r1 += 64 ; R1_w=map_value(off=64,ks=4,vs=8272,imm=0)
; bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg);
21: (bf) r2 = r8 ; R2_w=scalar(id=2) R8_w=scalar(id=2)
22: (85) call bpf_probe_read#4
R2 min value is negative, either use unsigned or 'var &= const'
processed 22 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1
-- END PROG LOAD LOG --
libbpf: prog 'sys_enter_connect': failed to load: -13
libbpf: failed to load object 'augmented_raw_syscalls_bpf'
libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -13
0.000 systemd-oomd/959 openat(dfd: CWD, filename: 0xc0a2a2bd, flags: RDONLY|CLOEXEC) = 12
86.339 thermald/1234 openat(dfd: CWD, filename: 0xac000ba0) = 13
87.008 thermald/1234 openat(dfd: CWD, filename: 0xac000eb0) = 13
87.270 thermald/1234 openat(dfd: CWD, filename: 0xac000b70) = 13
89.657 thermald/1234 openat(dfd: CWD, filename: 0xac000eb0) = 13
^C
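The complaint ("R2 min value is negative, either use unsigned or 'var &=
const'") is the classic one: the length register reaching
bpf_probe_read() carries no bounds the verifier can see, because the
range check was done on a truncated copy of the value. A minimal sketch
of the usual workaround, masking with a constant so the bound is on the
very register passed to the helper (an illustration, not necessarily the
final fix):

	unsigned int socklen = args->args[2];

	/* Clamp, then mask: sizeof(augmented_args->saddr) is 128 here,
	 * so the mask gives the verifier an explicit [0, 127] range. */
	if (socklen > sizeof(augmented_args->saddr))
		socklen = sizeof(augmented_args->saddr);
	socklen &= sizeof(augmented_args->saddr) - 1;

	bpf_probe_read(&augmented_args->saddr, socklen, sockaddr_arg);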
If I comment out the connect and sendto handlers it doesn't build
anymore, whereas before it would continue with the other handlers:
CLANG /tmp/build/perf-tools-next/util/bpf_skel/.tmp/augmented_raw_syscalls.bpf.o
GENSKEL /tmp/build/perf-tools-next/util/bpf_skel/augmented_raw_syscalls.skel.h
CC /tmp/build/perf-tools-next/builtin-trace.o
builtin-trace.c: In function ‘cmd_trace’:
builtin-trace.c:4873:63: error: ‘struct <anonymous>’ has no member named ‘sys_enter_connect’; did you mean ‘sys_enter_openat’?
4873 | bpf_program__set_autoattach(trace.skel->progs.sys_enter_connect,
| ^~~~~~~~~~~~~~~~~
| sys_enter_openat
builtin-trace.c:4875:63: error: ‘struct <anonymous>’ has no member named ‘sys_enter_sendto’; did you mean ‘sys_enter_openat’?
4875 | bpf_program__set_autoattach(trace.skel->progs.sys_enter_sendto,
| ^~~~~~~~~~~~~~~~
| sys_enter_openat
make[3]: *** [/home/acme/git/perf-tools-next/tools/build/Makefile.build:97: /tmp/build/perf-tools-next/builtin-trace.o] Error 1
make[2]: *** [Makefile.perf:662: /tmp/build/perf-tools-next/perf-in.o] Error 2
make[1]: *** [Makefile.perf:238: sub-make] Error 2
make: *** [Makefile:113: install-bin] Error 2
make: Leaving directory '/home/acme/git/perf-tools-next/tools/perf'
[acme@quaco perf-tools-next]$
I.e. there is no need to refer to those explicitly; I think in the past
it just checked whether the program was there and, if so, attached it.
I'll try to fix this.
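One way to avoid the compile-time references would be to look the
programs up by name, which libbpf allows, so builtin-trace.c keeps
building when a handler is removed from the .bpf.c; a sketch of the idea
(not what ended up in the tree):

	struct bpf_program *prog;

	/* NULL if the handler isn't in the object anymore. */
	prog = bpf_object__find_program_by_name(trace.skel->obj,
						"sys_enter_connect");
	if (prog != NULL)
		bpf_program__set_autoattach(prog, /*autoattach=*/false);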
If I remove the explicit references in builtin-trace.c:
[root@quaco ~]# perf trace -e open* --max-events=10
0.000 thermald/1234 openat(dfd: CWD, filename: "/sys/class/powercap/intel-rapl/intel-rapl:0/intel-rapl:0:2/energy_uj") = 13
0.236 thermald/1234 openat(dfd: CWD, filename: "/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj") = 13
0.334 thermald/1234 openat(dfd: CWD, filename: "/sys/class/thermal/thermal_zone2/temp") = 13
9.092 systemd-oomd/959 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 12
259.212 systemd-oomd/959 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 12
497.464 gpm/1049 openat(dfd: CWD, filename: "/dev/tty0") = 4
509.044 systemd-oomd/959 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 12
509.559 systemd-oomd/959 openat(dfd: CWD, filename: "/sys/fs/cgroup/user.slice/user-1000.slice/[email protected]/session.slice/memory.pressure", flags: RDONLY|CLOEXEC) = 12
509.917 systemd-oomd/959 openat(dfd: CWD, filename: "/sys/fs/cgroup/user.slice/user-1000.slice/[email protected]/session.slice/memory.current", flags: RDONLY|CLOEXEC) = 12
510.111 systemd-oomd/959 openat(dfd: CWD, filename: "/sys/fs/cgroup/user.slice/user-1000.slice/[email protected]/session.slice/memory.min", flags: RDONLY|CLOEXEC) = 12
[root@quaco ~]#
Cool!
Some inception:
[root@quaco ~]# perf trace -e perf_event_open perf stat -e cycles,instructions,cache-misses sleep 1
0.000 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0 (PERF_COUNT_HW_CPU_CYCLES), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 232297 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 3
0.063 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x1 (PERF_COUNT_HW_INSTRUCTIONS), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 232297 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 4
0.070 perf_event_open(attr_uptr: { type: 0 (PERF_TYPE_HARDWARE), size: 136, config: 0x3 (PERF_COUNT_HW_CACHE_MISSES), sample_type: IDENTIFIER, read_format: TOTAL_TIME_ENABLED|TOTAL_TIME_RUNNING, disabled: 1, inherit: 1, enable_on_exec: 1, exclude_guest: 1 }, pid: 232297 (perf), cpu: -1, group_fd: -1, flags: FD_CLOEXEC) = 5
Performance counter stats for 'sleep 1':
2,669,464 cycles
1,842,319 instructions # 0.69 insn per cycle
27,716 cache-misses
1.001948592 seconds time elapsed
0.000000000 seconds user
0.001657000 seconds sys
[root@quaco ~]#
I'm putting what I have in the tmp.perf-tools-next branch, will continue
later today.
- Arnaldo
On Thu, Aug 10, 2023 at 11:48:51AM -0700, Ian Rogers wrote:
> Previously a BPF event of augmented_raw_syscalls.c could be used to
> enable augmentation of syscalls by perf trace. As BPF events are no
> longer supported, switch to using a BPF skeleton which when attached
> explicitly opens the sysenter and sysexit tracepoints.
>
> The dump map is removed as debugging wasn't supported by the
> augmentation and bpf_printk can be used when necessary.
>
> Remove tools/perf/examples/bpf/augmented_raw_syscalls.c so that the
> rename/migration to a BPF skeleton captures that this was the source.
> +#ifdef HAVE_BPF_SKEL
> + trace.skel = augmented_raw_syscalls_bpf__open();
> + if (!trace.skel) {
> + pr_debug("Failed to open augmented syscalls BPF skeleton");
> + } else {
> + /*
> + * Disable attaching the BPF programs except for sys_enter and
> + * sys_exit that tail call into this as necessary.
> + */
> + bpf_program__set_autoattach(trace.skel->progs.syscall_unaugmented,
> + /*autoattach=*/false);
> + bpf_program__set_autoattach(trace.skel->progs.sys_enter_connect,
> + /*autoattach=*/false);
> + bpf_program__set_autoattach(trace.skel->progs.sys_enter_sendto,
> + /*autoattach=*/false);
> + bpf_program__set_autoattach(trace.skel->progs.sys_enter_open,
> + /*autoattach=*/false);
> + bpf_program__set_autoattach(trace.skel->progs.sys_enter_openat,
> + /*autoattach=*/false);
> + bpf_program__set_autoattach(trace.skel->progs.sys_enter_rename,
> + /*autoattach=*/false);
> + bpf_program__set_autoattach(trace.skel->progs.sys_enter_renameat,
> + /*autoattach=*/false);
> + bpf_program__set_autoattach(trace.skel->progs.sys_enter_perf_event_open,
> + /*autoattach=*/false);
> + bpf_program__set_autoattach(trace.skel->progs.sys_enter_clock_nanosleep,
> + /*autoattach=*/false);
> +
> + err = augmented_raw_syscalls_bpf__load(trace.skel);
>
So I converted the above to:
	struct bpf_program *prog;

	/* Only sys_enter/sys_exit should auto-attach; the per-syscall
	 * programs are reached via tail calls instead. */
	bpf_object__for_each_program(prog, trace.skel->obj) {
		if (prog != trace.skel->progs.sys_enter &&
		    prog != trace.skel->progs.sys_exit)
			bpf_program__set_autoattach(prog, /*autoattach=*/false);
	}
So that we don't have to add new lines disabling attachment when adding
support for other pointer-receiving syscalls.
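For completeness, a sketch of how the non-auto-attached programs still
get run: perf trace installs their fds into the prog array maps keyed by
syscall number, and the generic sys_enter program tail-calls into them.
Names follow the skeleton; the syscalltbl lookup is shown just for
illustration:

	/* Sketch; assumes the skeleton has been loaded. */
	int map_fd = bpf_map__fd(trace.skel->maps.syscalls_sys_enter);
	int prog_fd = bpf_program__fd(trace.skel->progs.sys_enter_openat);
	int id = syscalltbl__id(trace.sctbl, "openat");

	if (id >= 0)
		bpf_map_update_elem(map_fd, &id, &prog_fd, BPF_ANY);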
- Arnaldo
On Wed, Aug 16, 2023 at 10:22:15AM -0300, Arnaldo Carvalho de Melo wrote:
> On Wed, Aug 16, 2023 at 10:11:11AM -0300, Arnaldo Carvalho de Melo wrote:
> > Just taking notes about things to work on top of what is in
> > tmp.perf-tools-next, that will move to perf-tools-next soon:
> > We need to make these libbpf error messages appear only in verbose mode,
> > and probably have a hint about unprivileged BPF, a quick attempt failed
> > after several attempts at getting privileges :-\
> > Probably attaching to tracepoints is off limits to !root even with
> > /proc/sys/kernel/unprivileged_bpf_disabled set to zero.
> yep, the libbpf sys_bpf call to check if it could load a basic BPF
> bytecode (prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2) succeeds,
> but then, later we manage to create the maps, etc to then stumble on
> bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERCPU_ARRAY, key_size=4, value_size=8272, max_entries=1, map_flags=0, inner_map_fd=0, map_name="augmented_args_", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0, map_extra=0}, 72) = 7
> bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0000\0\0\0000\0\0\0\t\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=81, btf_log_size=0, btf_log_level=0}, 32) = -1 EPERM (Operation not permitted)
> and:
> bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=2, insns=0x1758340, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(6, 4, 7), prog_flags=0, prog_name="syscall_unaugme", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0, fd_array=NULL}, 144) = -1 EPERM (Operation not permitted)
> So 'perf trace' should just not try to load the augmented_raw_syscalls
> BPF skel for !root.
Not really: I insisted and it is (was?) possible to make it work,
testing on another machine, after having to change the permissions
recursively on tracefs (previously a remount with mode=755 seemed to be
enough?).
I managed to make it work for !root, with BPF collecting the pointer
args for openat, access (perf trace looks for syscall signatures and
reuses BPF progs for the ones matching one of the explicitly provided
handlers), clock_nanosleep, etc.
(re)Reading Documentation/admin-guide/perf-security.rst and getting it
into the hints system of 'perf trace' may make this process simpler and
safer, by using a group, etc. But it is possible, great!
I didn't even have to touch /proc/sys/kernel/unprivileged_bpf_disabled,
just the capabilities for the perf binary (which is a pretty big window,
but way smaller than touching /proc/sys/kernel/unprivileged_bpf_disabled).
So now we need to make BUILD_BPF_SKEL=1 the default but just emit a
warning when what is needed isn't available, just like with other
features; in that case 'perf trace' continues as today, with no pointer
arg contents collection.
Unfortunately it is too late in the process even for v6.6, so as soon as
perf-tools-next becomes perf-tools and we reopen it for v6.7, the first
patch should be this one: build BPF skels if what is needed is
available.
I'll also check if we can enable BUILD_BPF_SKEL=1 in the distro packages
so that we collect some info from them about possible problems.
What I have is now in perf-tools-next, so it should get into linux-next
and hopefully help in testing; IIRC there are CIs that enable
BUILD_BPF_SKEL=1.
- Arnaldo
[acme@five ~]$ uname -a
Linux five 6.2.15-100.fc36.x86_64 #1 SMP PREEMPT_DYNAMIC Thu May 11 16:51:53 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
[acme@five ~]$ id
uid=1000(acme) gid=1000(acme) groups=1000(acme),10(wheel) context=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
[acme@five ~]$ perf trace sleep 1
? ( ): sleep/980735 ... [continued]: execve()) = 0
0.031 ( 0.002 ms): sleep/980735 brk() = 0x55c621548000
0.039 ( 0.001 ms): sleep/980735 arch_prctl(option: 0x3001, arg2: 0x7ffeb8a6a460) = -1 EINVAL (Invalid argument)
0.058 ( 0.006 ms): sleep/980735 access(filename: "/etc/ld.so.preload", mode: R) = -1 ENOENT (No such file or directory)
0.068 ( 0.005 ms): sleep/980735 openat(dfd: CWD, filename: "/etc/ld.so.cache", flags: RDONLY|CLOEXEC) = 3
0.074 ( 0.002 ms): sleep/980735 newfstatat(dfd: 3, filename: "", statbuf: 0x7ffeb8a69680, flag: 4096) = 0
0.077 ( 0.006 ms): sleep/980735 mmap(len: 54771, prot: READ, flags: PRIVATE, fd: 3) = 0x7f6b95ad9000
0.084 ( 0.001 ms): sleep/980735 close(fd: 3) = 0
0.094 ( 0.006 ms): sleep/980735 openat(dfd: CWD, filename: "/lib64/libc.so.6", flags: RDONLY|CLOEXEC) = 3
0.101 ( 0.002 ms): sleep/980735 read(fd: 3, buf: 0x7ffeb8a697e8, count: 832) = 832
0.105 ( 0.001 ms): sleep/980735 pread64(fd: 3, buf: 0x7ffeb8a693e0, count: 784, pos: 64) = 784
0.107 ( 0.001 ms): sleep/980735 pread64(fd: 3, buf: 0x7ffeb8a69380, count: 80, pos: 848) = 80
0.110 ( 0.001 ms): sleep/980735 pread64(fd: 3, buf: 0x7ffeb8a69330, count: 68, pos: 928) = 68
0.113 ( 0.002 ms): sleep/980735 newfstatat(dfd: 3, filename: "", statbuf: 0x7ffeb8a69680, flag: 4096) = 0
0.115 ( 0.003 ms): sleep/980735 mmap(len: 8192, prot: READ|WRITE, flags: PRIVATE|ANONYMOUS) = 0x7f6b95ad7000
0.122 ( 0.001 ms): sleep/980735 pread64(fd: 3, buf: 0x7ffeb8a692d0, count: 784, pos: 64) = 784
0.126 ( 0.006 ms): sleep/980735 mmap(len: 2104720, prot: READ, flags: PRIVATE|DENYWRITE, fd: 3) = 0x7f6b95800000
0.133 ( 0.013 ms): sleep/980735 mmap(addr: 0x7f6b95828000, len: 1523712, prot: READ|EXEC, flags: PRIVATE|FIXED|DENYWRITE, fd: 3, off: 0x28000) = 0x7f6b95828000
0.147 ( 0.008 ms): sleep/980735 mmap(addr: 0x7f6b9599c000, len: 360448, prot: READ, flags: PRIVATE|FIXED|DENYWRITE, fd: 3, off: 0x19c000) = 0x7f6b9599c000
0.156 ( 0.010 ms): sleep/980735 mmap(addr: 0x7f6b959f4000, len: 24576, prot: READ|WRITE, flags: PRIVATE|FIXED|DENYWRITE, fd: 3, off: 0x1f3000) = 0x7f6b959f4000
0.171 ( 0.005 ms): sleep/980735 mmap(addr: 0x7f6b959fa000, len: 32144, prot: READ|WRITE, flags: PRIVATE|FIXED|ANONYMOUS) = 0x7f6b959fa000
0.182 ( 0.001 ms): sleep/980735 close(fd: 3) = 0
0.193 ( 0.003 ms): sleep/980735 mmap(len: 12288, prot: READ|WRITE, flags: PRIVATE|ANONYMOUS) = 0x7f6b95ad4000
0.199 ( 0.001 ms): sleep/980735 arch_prctl(option: SET_FS, arg2: 0x7f6b95ad4740) = 0
0.202 ( 0.001 ms): sleep/980735 set_tid_address(tidptr: 0x7f6b95ad4a10) = 980735 (sleep)
0.204 ( 0.001 ms): sleep/980735 set_robust_list(head: 0x7f6b95ad4a20, len: 24) = 0
0.206 ( 0.001 ms): sleep/980735 rseq(rseq: 0x7f6b95ad50e0, rseq_len: 32, sig: 1392848979) = 0
0.277 ( 0.010 ms): sleep/980735 mprotect(start: 0x7f6b959f4000, len: 16384, prot: READ) = 0
0.306 ( 0.007 ms): sleep/980735 mprotect(start: 0x55c61fa4a000, len: 4096, prot: READ) = 0
0.320 ( 0.010 ms): sleep/980735 mprotect(start: 0x7f6b95b1c000, len: 8192, prot: READ) = 0
0.340 ( 0.002 ms): sleep/980735 prlimit64(resource: STACK, old_rlim: 0x7ffeb8a6a1c0) = 0
0.349 ( 0.009 ms): sleep/980735 munmap(addr: 0x7f6b95ad9000, len: 54771) = 0
0.381 ( 0.002 ms): sleep/980735 getrandom(ubuf: 0x7f6b959ff4d8, len: 8, flags: NONBLOCK) = 8
0.386 ( 0.001 ms): sleep/980735 brk() = 0x55c621548000
0.388 ( 0.006 ms): sleep/980735 brk(brk: 0x55c621569000) = 0x55c621569000
0.403 ( 0.012 ms): sleep/980735 openat(dfd: CWD, filename: "", flags: RDONLY|CLOEXEC) = 3
0.417 ( 0.003 ms): sleep/980735 newfstatat(dfd: 3, filename: "", statbuf: 0x7f6b959f9b80, flag: 4096) = 0
0.422 ( 0.008 ms): sleep/980735 mmap(len: 224096080, prot: READ, flags: PRIVATE, fd: 3) = 0x7f6b88200000
0.436 ( 0.002 ms): sleep/980735 close(fd: 3) = 0
0.480 (1000.041 ms): sleep/980735 clock_nanosleep(rqtp: { .tv_sec: 1, .tv_nsec: 0 }, rmtp: 0x7ffeb8a6a450) = 0
1000.552 ( 0.003 ms): sleep/980735 close(fd: 1) = 0
1000.558 ( 0.002 ms): sleep/980735 close(fd: 2) = 0
1000.565 ( ): sleep/980735 exit_group() = ?
[acme@five ~]$ getcap ~/bin/perf
/var/home/acme/bin/perf cap_perfmon,cap_bpf=ep
[acme@five ~]$ cat /proc/sys/kernel/unprivileged_bpf_disabled
2
[acme@five ~]$ cat /proc/sys/kernel/perf_event_paranoid
-1
[acme@five ~]$
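For reference, those file capabilities can be granted with setcap; using
the binary path from the transcript above (adjust for the local
install), and assuming cap_perfmon,cap_bpf=ep is the minimal set, as the
getcap output suggests:

  [acme@five ~]$ sudo setcap cap_perfmon,cap_bpf=ep /var/home/acme/bin/perf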
On Wed, Aug 16, 2023 at 10:11:11AM -0300, Arnaldo Carvalho de Melo wrote:
> Just taking notes about things to work on top of what is in
> tmp.perf-tools-next, that will move to perf-tools-next soon:
>
> We need to make these libbpf error messages appear only in verbose mode,
> and probably have a hint about unprivileged BPF, a quick attempt failed
> after several attempts at getting privileges :-\
>
> Probably attaching to tracepoints is off limits to !root even with
> /proc/sys/kernel/unprivileged_bpf_disabled set to zero.
yep, the libbpf sys_bpf call to check if it could load a basic BPF
bytecode (prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2) succeeds,
but then, later we manage to create the maps, etc to then stumble on
bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERCPU_ARRAY, key_size=4, value_size=8272, max_entries=1, map_flags=0, inner_map_fd=0, map_name="augmented_args_", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0, map_extra=0}, 72) = 7
bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0000\0\0\0000\0\0\0\t\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=81, btf_log_size=0, btf_log_level=0}, 32) = -1 EPERM (Operation not permitted)
and:
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=2, insns=0x1758340, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(6, 4, 7), prog_flags=0, prog_name="syscall_unaugme", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0, fd_array=NULL}, 144) = -1 EPERM (Operation not permitted)
So 'perf trace' should just not try to load the augmented_raw_syscalls
BPF skel for !root.
- Arnaldo
[acme@quaco perf-tools-next]$ strace -e bpf perf trace -vv -e open* sleep 1
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7ffe95185300, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0}, 116) = 3
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7ffe951854a0, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0, fd_array=NULL}, 144) = 3
bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0\20\0\0\0\20\0\0\0\5\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=45, btf_log_size=0, btf_log_level=0}, 32) = -1 EPERM (Operation not permitted)
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_SOCKET_FILTER, insn_cnt=2, insns=0x7ffe95185110, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(0, 0, 0), prog_flags=0, prog_name="libbpf_nametest"}, 64) = 3
bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_HASH, key_size=4, value_size=1, max_entries=64, map_flags=0, inner_map_fd=0, map_name="pids_filtered", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0, map_extra=0}, 72) = 3
bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PROG_ARRAY, key_size=4, value_size=4, max_entries=512, map_flags=0, inner_map_fd=0, map_name="syscalls_sys_en", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0, map_extra=0}, 72) = 4
bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PROG_ARRAY, key_size=4, value_size=4, max_entries=512, map_flags=0, inner_map_fd=0, map_name="syscalls_sys_ex", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0, map_extra=0}, 72) = 5
bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERF_EVENT_ARRAY, key_size=4, value_size=4, max_entries=4096, map_flags=0, inner_map_fd=0, map_name="__augmented_sys", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0, map_extra=0}, 72) = 6
bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_PERCPU_ARRAY, key_size=4, value_size=8272, max_entries=1, map_flags=0, inner_map_fd=0, map_name="augmented_args_", map_ifindex=0, btf_fd=0, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0, map_extra=0}, 72) = 7
bpf(BPF_BTF_LOAD, {btf="\237\353\1\0\30\0\0\0\0\0\0\0000\0\0\0000\0\0\0\t\0\0\0\1\0\0\0\0\0\0\1"..., btf_log_buf=NULL, btf_size=81, btf_log_size=0, btf_log_level=0}, 32) = -1 EPERM (Operation not permitted)
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=2, insns=0x1758340, license="GPL", log_level=0, log_size=0, log_buf=NULL, kern_version=KERNEL_VERSION(6, 4, 7), prog_flags=0, prog_name="syscall_unaugme", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0, fd_array=NULL}, 144) = -1 EPERM (Operation not permitted)
bpf(BPF_PROG_LOAD, {prog_type=BPF_PROG_TYPE_TRACEPOINT, insn_cnt=2, insns=0x1758340, license="GPL", log_level=1, log_size=16777215, log_buf="", kern_version=KERNEL_VERSION(6, 4, 7), prog_flags=0, prog_name="syscall_unaugme", prog_ifindex=0, expected_attach_type=BPF_CGROUP_INET_INGRESS, prog_btf_fd=0, func_info_rec_size=0, func_info=NULL, func_info_cnt=0, line_info_rec_size=0, line_info=NULL, line_info_cnt=0, attach_btf_id=0, attach_prog_fd=0, fd_array=NULL}, 144) = -1 EPERM (Operation not permitted)
libbpf: prog 'syscall_unaugmented': BPF program load failed: Operation not permitted
libbpf: prog 'syscall_unaugmented': failed to load: -1
libbpf: failed to load object 'augmented_raw_syscalls_bpf'
libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -1
Failed to load augmented syscalls BPF skeleton: Operation not permitted
Using CPUID GenuineIntel-6-8E-A
intel_pt default config: tsc,mtc,mtc_period=3,psb_period=3,pt,branch
Error: No permissions to read /sys/kernel/tracing//events/raw_syscalls/sys_(enter|exit)
Hint: Try 'sudo mount -o remount,mode=755 /sys/kernel/tracing/'
+++ exited with 255 +++
[acme@quaco perf-tools-next]$
On Wed, Aug 16, 2023 at 01:08:14PM -0300, Arnaldo Carvalho de Melo wrote:
> [acme@five ~]$ getcap ~/bin/perf
> /var/home/acme/bin/perf cap_perfmon,cap_bpf=ep
> [acme@five ~]$ cat /proc/sys/kernel/unprivileged_bpf_disabled
> 2
> [acme@five ~]$ cat /proc/sys/kernel/perf_event_paranoid
> -1
This last one can remain at 2:
[acme@five ~]$ cat /proc/sys/kernel/perf_event_paranoid
2
[acme@five ~]$ perf trace -e bpf*,perf*,openat,connect* --max-events=10
0.000 ( 0.031 ms): systemd-oomd/1151 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 11
25.532 ( ): gnome-terminal/3223 openat(dfd: CWD, filename: "/proc/1244100/cmdline") ...
249.996 ( 0.031 ms): systemd-oomd/1151 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 11
423.853 ( 0.036 ms): pool/2490 connect(fd: 7, uservaddr: { .family: LOCAL, path: /var/run/.heim_org.h5l.kcm-socket }, addrlen: 110) = 0
423.929 ( 0.021 ms): sssd_kcm/2514 openat(dfd: CWD, filename: "/proc/2486/cmdline") = 16
499.988 ( 0.030 ms): systemd-oomd/1151 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 11
749.981 ( 0.032 ms): systemd-oomd/1151 openat(dfd: CWD, filename: "/proc/meminfo", flags: RDONLY|CLOEXEC) = 11
775.441 ( ): gnome-terminal/3223 openat(dfd: CWD, filename: "/proc/1244100/cmdline") ...
999.988 ( 0.044 ms): systemd-oomd/1151 openat(dfd: CWD, filename: "/sys/fs/cgroup/user.slice/user-1001.slice/[email protected]/memory.pressure", flags: RDONLY|CLOEXEC) = 11
1000.091 ( 0.010 ms): systemd-oomd/1151 openat(dfd: CWD, filename: "/sys/fs/cgroup/user.slice/user-1001.slice/[email protected]/memory.current", flags: RDONLY|CLOEXEC) = 11
[acme@five ~]$
On Tue, Aug 15, 2023 at 07:34:08AM -0700, Ian Rogers wrote:
> On Tue, Aug 15, 2023, 7:25 AM Arnaldo Carvalho de Melo <[email protected]> wrote:
> > bpf_object__for_each_program(prog, trace.skel->obj) {
> > if (prog != trace.skel->progs.sys_enter && prog !=
> > trace.skel->progs.sys_exit)
> > bpf_program__set_autoattach(prog,
> > /*autoattach=*/false);
> > }
> >
> > So that we don't have to add new lines disabling attachment when adding
> > support for other pointer receiving syscalls.
> Makes sense. Thanks,
Just taking notes about things to work on top of what is in
tmp.perf-tools-next, that will move to perf-tools-next soon:
We need to make these libbpf error messages appear only in verbose mode,
and probably have a hint about unprivileged BPF, a quick attempt failed
after several attempts at getting privileges :-\
Probably attaching to tracepoints is off limits to !root even with
/proc/sys/kernel/unprivileged_bpf_disabled set to zero.
[acme@quaco perf-tools-next]$ perf trace ls
libbpf: Failed to bump RLIMIT_MEMLOCK (err = -1), you might need to do it explicitly!
libbpf: Error in bpf_object__probe_loading():Operation not permitted(1). Couldn't load trivial BPF program. Make sure your kernel supports BPF (CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is set to big enough value.
libbpf: failed to load object 'augmented_raw_syscalls_bpf'
libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -1
Error: No permissions to read /sys/kernel/tracing//events/raw_syscalls/sys_(enter|exit)
Hint: Try 'sudo mount -o remount,mode=755 /sys/kernel/tracing/'
[acme@quaco perf-tools-next]$
[acme@quaco perf-tools-next]$ perf trace -e open* sleep 1
libbpf: Failed to bump RLIMIT_MEMLOCK (err = -1), you might need to do it explicitly!
libbpf: Error in bpf_object__probe_loading():Operation not permitted(1). Couldn't load trivial BPF program. Make sure your kernel supports BPF (CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is set to big enough value.
libbpf: failed to load object 'augmented_raw_syscalls_bpf'
libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -1
Error: No permissions to read /sys/kernel/tracing//events/raw_syscalls/sys_(enter|exit)
Hint: Try 'sudo mount -o remount,mode=755 /sys/kernel/tracing/'
[acme@quaco perf-tools-next]$ sudo mount -o remount,mode=755 /sys/kernel/tracing/
[sudo] password for acme:
[acme@quaco perf-tools-next]$ perf trace -e open* sleep 1
libbpf: Failed to bump RLIMIT_MEMLOCK (err = -1), you might need to do it explicitly!
libbpf: Error in bpf_object__probe_loading():Operation not permitted(1). Couldn't load trivial BPF program. Make sure your kernel supports BPF (CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is set to big enough value.
libbpf: failed to load object 'augmented_raw_syscalls_bpf'
libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -1
Error: No permissions to read /sys/kernel/tracing//events/raw_syscalls/sys_(enter|exit)
Hint: Try 'sudo mount -o remount,mode=755 /sys/kernel/tracing/'
[acme@quaco perf-tools-next]$ sudo mount -o remount,mode=755 /sys/kernel/debug
[acme@quaco perf-tools-next]$ perf trace -e open* sleep 1
libbpf: Failed to bump RLIMIT_MEMLOCK (err = -1), you might need to do it explicitly!
libbpf: Error in bpf_object__probe_loading():Operation not permitted(1). Couldn't load trivial BPF program. Make sure your kernel supports BPF (CONFIG_BPF_SYSCALL=y) and/or that RLIMIT_MEMLOCK is set to big enough value.
libbpf: failed to load object 'augmented_raw_syscalls_bpf'
libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -1
Error: No permissions to read /sys/kernel/tracing//events/raw_syscalls/sys_(enter|exit)
Hint: Try 'sudo mount -o remount,mode=755 /sys/kernel/tracing/'
[acme@quaco perf-tools-next]$ sudo sh -c "echo 0 > /proc/sys/kernel/unprivileged_bpf_disabled"
[acme@quaco perf-tools-next]$ perf trace -e open* sleep 1
libbpf: prog 'syscall_unaugmented': BPF program load failed: Operation not permitted
libbpf: prog 'syscall_unaugmented': failed to load: -1
libbpf: failed to load object 'augmented_raw_syscalls_bpf'
libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -1
Error: No permissions to read /sys/kernel/tracing//events/raw_syscalls/sys_(enter|exit)
Hint: Try 'sudo mount -o remount,mode=755 /sys/kernel/tracing/'
[acme@quaco perf-tools-next]$ cat /proc/sys/kernel/unprivileged_bpf_disabled
0
[acme@quaco perf-tools-next]$
[acme@quaco perf-tools-next]$
[acme@quaco perf-tools-next]$ cat /proc/sys/kernel/perf_event_paranoid
2
[acme@quaco perf-tools-next]$ sudo sh -c "echo -1 > /proc/sys/kernel/perf_event_paranoid"
[acme@quaco perf-tools-next]$ perf trace -e open* sleep 1
libbpf: prog 'syscall_unaugmented': BPF program load failed: Operation not permitted
libbpf: prog 'syscall_unaugmented': failed to load: -1
libbpf: failed to load object 'augmented_raw_syscalls_bpf'
libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -1
Error: No permissions to read /sys/kernel/tracing//events/raw_syscalls/sys_(enter|exit)
Hint: Try 'sudo mount -o remount,mode=755 /sys/kernel/tracing/'
[acme@quaco perf-tools-next]$