2016-04-08 15:07:47

by Wang Nan

[permalink] [raw]
Subject: [PATCH 0/4] perf bpf: Add __bpf_stdout__ support

Hi Arnaldo,

Following your suggestion, patches 3-4 in this patch set add __bpf_stdout__
support. If a map is called __bpf_stdout__, perf will create a bpf-output
event and attach to it automatically. There is no need to set up a bpf-output
event on the command line explicitly.

# perf record -e ./test_bpf_trace.c usleep 100000
# ~/perf script
usleep 25656 [003] 264724.900051: 0 __bpf_stdout__: ffffffff810eb9a1 sys_nanosleep (/lib/modules
BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a
0008: 42 50 46 20 65 76 65 6e BPF even
0010: 74 21 00 00 t!..
BPF string: "Raise a BPF event!"

usleep 25656 [003] 264725.000112: 0 __bpf_stdout__: ffffffff8105d609 kretprobe_trampoline_holder
BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a
0008: 42 50 46 20 65 76 65 6e BPF even
0010: 74 21 00 00 t!..
BPF string: "Raise a BPF event!"

Wang Nan (4):
perf trace: Improve error message when receive non-tracepoint events
perf trace: Print content of bpf-output event
perf bpf: Clone bpf stdout events in multiple bpf scripts
perf bpf: Automatically create bpf-output event __bpf_stdout__

tools/perf/builtin-record.c | 8 +++
tools/perf/builtin-trace.c | 33 +++++++++-
tools/perf/util/bpf-loader.c | 143 +++++++++++++++++++++++++++++++++++++++++++
tools/perf/util/bpf-loader.h | 19 ++++++
4 files changed, 202 insertions(+), 1 deletion(-)

--
1.8.3.4


2016-04-08 15:08:03

by Wang Nan

[permalink] [raw]
Subject: [PATCH 2/4] perf trace: Print content of bpf-output event

With this patch the content of the BPF output event is printed by
'perf trace'. For example:

# ./perf trace -a --ev bpf-output/no-inherit,name=evt/ \
--ev ./test_bpf_trace.c/map:channel.event=evt/ \
usleep 100000
...
1.787 ( 0.004 ms): usleep/3832 nanosleep(rqtp: 0x7ffc78b18980 ) ...
1.787 ( ): evt:Raise a BPF event!..)
1.788 ( ): perf_bpf_probe:func_begin:(ffffffff810e97d0))
...
101.866 (87.038 ms): gmain/1654 poll(ufds: 0x7f57a80008c0, nfds: 2, timeout_msecs: 1000 ) ...
101.866 ( ): evt:Raise a BPF event!..)
101.867 ( ): perf_bpf_probe:func_end:(ffffffff810e97d0 <- ffffffff81796173))
101.869 (100.087 ms): usleep/3832 ... [continued]: nanosleep()) = 0
...

(There is an extra ')' at the end of several lines. However, it is
another problem, unrelated to this commit.)

Where test_bpf_trace.c is:
/************************ BEGIN **************************/
#include <uapi/linux/bpf.h>
struct bpf_map_def {
unsigned int type;
unsigned int key_size;
unsigned int value_size;
unsigned int max_entries;
};
#define SEC(NAME) __attribute__((section(NAME), used))
static u64 (*ktime_get_ns)(void) =
(void *)BPF_FUNC_ktime_get_ns;
static int (*trace_printk)(const char *fmt, int fmt_size, ...) =
(void *)BPF_FUNC_trace_printk;
static int (*get_smp_processor_id)(void) =
(void *)BPF_FUNC_get_smp_processor_id;
static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) =
(void *)BPF_FUNC_perf_event_output;

struct bpf_map_def SEC("maps") channel = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(u32),
.max_entries = __NR_CPUS__,
};

static inline int __attribute__((always_inline))
func(void *ctx, int type)
{
char output_str[] = "Raise a BPF event!";
char err_str[] = "BAD %d\n";
int err;

err = perf_event_output(ctx, &channel, get_smp_processor_id(),
&output_str, sizeof(output_str));
if (err)
trace_printk(err_str, sizeof(err_str), err);
return 1;
}
SEC("func_begin=sys_nanosleep")
int func_begin(void *ctx) {return func(ctx, 1);}
SEC("func_end=sys_nanosleep%return")
int func_end(void *ctx) { return func(ctx, 2);}
char _license[] SEC("license") = "GPL";
int _version SEC("version") = LINUX_VERSION_CODE;
/************************* END ***************************/

Signed-off-by: Wang Nan <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: [email protected]
---
tools/perf/builtin-trace.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 6fbed86..fb8257f 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2680,11 +2680,15 @@ out_enomem:
goto out;
}

-static int validate_evlist(struct perf_evlist *evlist)
+static int validate_evlist(struct perf_evlist *evlist, bool *has_bpf_output)
{
struct perf_evsel *evsel;

evlist__for_each(evlist, evsel) {
+ if (perf_evsel__is_bpf_output(evsel)) {
+ *has_bpf_output = true;
+ continue;
+ }
if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
}
@@ -3268,6 +3272,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
const char * const trace_subcommands[] = { "record", NULL };
int err;
char bf[BUFSIZ];
+ bool has_bpf_output = false;

signal(SIGSEGV, sighandler_dump_stack);
signal(SIGFPE, sighandler_dump_stack);
@@ -3284,12 +3289,12 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

- if (validate_evlist(trace.evlist)) {
- pr_err("Only support tracepoint events!\n");
+ if (validate_evlist(trace.evlist, &has_bpf_output)) {
+ pr_err("Only support tracepoint and bpf-output events!\n");
return -EINVAL;
}

- if (trace.trace_pgfaults) {
+ if (trace.trace_pgfaults || has_bpf_output) {
trace.opts.sample_address = true;
trace.opts.sample_time = true;
}
--
1.8.3.4

2016-04-08 15:08:07

by Wang Nan

[permalink] [raw]
Subject: [PATCH 1/4] perf trace: Improve error message when receive non-tracepoint events

Before this patch, a strange error message is printed if a
non-tracepoint event is passed to 'perf trace':

# perf trace -a --ev cycles sleep 1
Failed to set filter "common_pid != 27500" on event cycles with 22 (Invalid argument)

This is because 'perf trace' accepts all valid events during cmdline
parsing, but in fact the user can only provide tracepoints, because it
needs to set a filter on them.

This patch validates the evlist and reports the error earlier:

# ./perf trace -a --ev cycles sleep 1
Only support tracepoint events!

Signed-off-by: Wang Nan <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: [email protected]
---
tools/perf/builtin-trace.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 11290b5..6fbed86 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2680,6 +2680,17 @@ out_enomem:
goto out;
}

+static int validate_evlist(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel;
+
+ evlist__for_each(evlist, evsel) {
+ if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
+ return -EINVAL;
+ }
+ return 0;
+}
+
static int trace__run(struct trace *trace, int argc, const char **argv)
{
struct perf_evlist *evlist = trace->evlist;
@@ -3273,6 +3284,11 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

+ if (validate_evlist(trace.evlist)) {
+ pr_err("Only support tracepoint events!\n");
+ return -EINVAL;
+ }
+
if (trace.trace_pgfaults) {
trace.opts.sample_address = true;
trace.opts.sample_time = true;
--
1.8.3.4

2016-04-08 15:11:14

by Wang Nan

[permalink] [raw]
Subject: [PATCH 4/4] perf bpf: Automatically create bpf-output event __bpf_stdout__

This patch makes it possible to omit the bpf-output event setup on the
command line. When a map named '__bpf_stdout__' is present, perf
automatically creates an event for it.

For example:

# perf record -e ./test_bpf_trace.c usleep 100000
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.012 MB perf.data (2 samples) ]
# ~/perf script
usleep 4639 [000] 261895.307826: 0 __bpf_stdout__: ffffffff810eb9a1 ...
BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a
0008: 42 50 46 20 65 76 65 6e BPF even
0010: 74 21 00 00 t!..
BPF string: "Raise a BPF event!"

usleep 4639 [000] 261895.407883: 0 __bpf_stdout__: ffffffff8105d609 ...
BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a
0008: 42 50 46 20 65 76 65 6e BPF even
0010: 74 21 00 00 t!..
BPF string: "Raise a BPF event!"

perf record -e ./test_bpf_trace.c usleep 100000

is equivalent to

perf record -e bpf-output/no-inherit=1,name=__bpf_stdout__/ \
-e ./test_bpf_trace.c/map:__bpf_stdout__.event=__bpf_stdout__/ \
usleep 100000

Where test_bpf_trace.c is:
/************************ BEGIN **************************/
#include <uapi/linux/bpf.h>
struct bpf_map_def {
unsigned int type;
unsigned int key_size;
unsigned int value_size;
unsigned int max_entries;
};
#define SEC(NAME) __attribute__((section(NAME), used))
static u64 (*ktime_get_ns)(void) =
(void *)BPF_FUNC_ktime_get_ns;
static int (*trace_printk)(const char *fmt, int fmt_size, ...) =
(void *)BPF_FUNC_trace_printk;
static int (*get_smp_processor_id)(void) =
(void *)BPF_FUNC_get_smp_processor_id;
static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) =
(void *)BPF_FUNC_perf_event_output;

struct bpf_map_def SEC("maps") __bpf_stdout__ = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(u32),
.max_entries = __NR_CPUS__,
};

static inline int __attribute__((always_inline))
func(void *ctx, int type)
{
char output_str[] = "Raise a BPF event!";
char err_str[] = "BAD %d\n";
int err;

err = perf_event_output(ctx, &__bpf_stdout__, get_smp_processor_id(),
&output_str, sizeof(output_str));
if (err)
trace_printk(err_str, sizeof(err_str), err);
return 1;
}
SEC("func_begin=sys_nanosleep")
int func_begin(void *ctx) {return func(ctx, 1);}
SEC("func_end=sys_nanosleep%return")
int func_end(void *ctx) { return func(ctx, 2);}
char _license[] SEC("license") = "GPL";
int _version SEC("version") = LINUX_VERSION_CODE;
/************************* END ***************************/

Signed-off-by: Wang Nan <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: [email protected]
---
tools/perf/util/bpf-loader.c | 37 ++++++++++++++++++++++++++++---------
1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index a0d2802..1bd7d5b 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -1483,6 +1483,7 @@ int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
{
struct bpf_map_priv *tmpl_priv = NULL;
struct bpf_object *obj, *tmp;
+ struct perf_evsel *evsel = NULL;
struct bpf_map *map;
int err;
bool need_init = false;
@@ -1507,8 +1508,16 @@ int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
if (!need_init)
return 0;

- if (!tmpl_priv)
- return 0;
+ if (!tmpl_priv) {
+ err = parse_events(evlist, "bpf-output/no-inherit=1,name=__bpf_stdout__/",
+ NULL);
+ if (err) {
+ pr_debug("ERROR: failed to create bpf-output event\n");
+ return -err;
+ }
+
+ evsel = perf_evlist__last(evlist);
+ }

bpf__for_each_stdout_map(map, obj, tmp) {
struct bpf_map_priv *priv;
@@ -1519,14 +1528,24 @@ int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
if (priv)
continue;

- priv = bpf_map_priv__clone(tmpl_priv);
- if (!priv)
- return -ENOMEM;
+ if (tmpl_priv) {
+ priv = bpf_map_priv__clone(tmpl_priv);
+ if (!priv)
+ return -ENOMEM;

- err = bpf_map__set_private(map, priv, bpf_map_priv__clear);
- if (err) {
- bpf_map_priv__clear(map, priv);
- return err;
+ err = bpf_map__set_private(map, priv, bpf_map_priv__clear);
+ if (err) {
+ bpf_map_priv__clear(map, priv);
+ return err;
+ }
+ } else if (evsel) {
+ struct bpf_map_op *op;
+
+ op = bpf_map__add_newop(map, NULL);
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+ op->op_type = BPF_MAP_OP_SET_EVSEL;
+ op->v.evsel = evsel;
}
}

--
1.8.3.4

2016-04-08 15:11:17

by Wang Nan

[permalink] [raw]
Subject: [PATCH 3/4] perf bpf: Clone bpf stdout events in multiple bpf scripts

This patch allows cloning bpf-output event configuration among multiple
bpf scripts. If there exists a map named '__bpf_stdout__' that is not configured
using 'map:__bpf_stdout__.event=', this patch clones the configuration of
another '__bpf_stdout__' map. For example, the following command:

# perf trace --ev bpf-output/no-inherit,name=evt/ \
--ev ./test_bpf_trace.c/map:__bpf_stdout__.event=evt/ \
--ev ./test_bpf_trace2.c usleep 100000

is equivalent to:

# perf trace --ev bpf-output/no-inherit,name=evt/ \
--ev ./test_bpf_trace.c/map:__bpf_stdout__.event=evt/ \
--ev ./test_bpf_trace2.c/map:__bpf_stdout__.event=evt/ \
usleep 100000

Signed-off-by: Wang Nan <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Li Zefan <[email protected]>
Cc: [email protected]
---
tools/perf/builtin-record.c | 8 +++
tools/perf/builtin-trace.c | 10 ++++
tools/perf/util/bpf-loader.c | 124 +++++++++++++++++++++++++++++++++++++++++++
tools/perf/util/bpf-loader.h | 19 +++++++
4 files changed, 161 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 410035c..e64bd1e 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1276,6 +1276,14 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
if (err)
return err;

+ err = bpf__setup_stdout(rec->evlist);
+ if (err) {
+ bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
+ pr_err("ERROR: Setup BPF stdout failed: %s\n",
+ errbuf);
+ return err;
+ }
+
err = -ENOMEM;

symbol__init(NULL);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index fb8257f..9d47bba 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -3289,6 +3289,16 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

+ err = bpf__setup_stdout(trace.evlist);
+ if (err) {
+ char errbuf[BUFSIZ];
+
+ bpf__strerror_setup_stdout(trace.evlist, err, errbuf, sizeof(errbuf));
+ pr_err("ERROR: Setup BPF stdout failed: %s\n",
+ errbuf);
+ goto out;
+ }
+
if (validate_evlist(trace.evlist, &has_bpf_output)) {
pr_err("Only support tracepoint and bpf-output events!\n");
return -EINVAL;
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 0967ce6..a0d2802 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -842,6 +842,58 @@ bpf_map_op__new(struct parse_events_term *term)
return op;
}

+static struct bpf_map_op *
+bpf_map_op__clone(struct bpf_map_op *op)
+{
+ struct bpf_map_op *newop;
+
+ newop = memdup(op, sizeof(*op));
+ if (!newop) {
+ pr_debug("Failed to alloc bpf_map_op\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&newop->list);
+ if (op->key_type == BPF_MAP_KEY_RANGES) {
+ size_t memsz = op->k.array.nr_ranges *
+ sizeof(op->k.array.ranges[0]);
+
+ newop->k.array.ranges = memdup(op->k.array.ranges, memsz);
+ if (!newop->k.array.ranges) {
+ pr_debug("Failed to alloc indices for map\n");
+ free(newop);
+ return NULL;
+ }
+ }
+
+ return newop;
+}
+
+static struct bpf_map_priv *
+bpf_map_priv__clone(struct bpf_map_priv *priv)
+{
+ struct bpf_map_priv *newpriv;
+ struct bpf_map_op *pos, *newop;
+
+ newpriv = zalloc(sizeof(*newpriv));
+ if (!newpriv) {
+ pr_debug("No enough memory to alloc map private\n");
+ return NULL;
+ }
+ INIT_LIST_HEAD(&newpriv->ops_list);
+
+ list_for_each_entry(pos, &priv->ops_list, list) {
+ newop = bpf_map_op__clone(pos);
+ if (!newop) {
+ bpf_map_priv__purge(newpriv);
+ return NULL;
+ }
+ list_add_tail(&newop->list, &newpriv->ops_list);
+ }
+
+ return newpriv;
+}
+
static int
bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op)
{
@@ -1417,6 +1469,70 @@ int bpf__apply_obj_config(void)
return 0;
}

+#define bpf__for_each_map(pos, obj, objtmp) \
+ bpf_object__for_each_safe(obj, objtmp) \
+ bpf_map__for_each(pos, obj)
+
+#define bpf__for_each_stdout_map(pos, obj, objtmp) \
+ bpf__for_each_map(pos, obj, objtmp) \
+ if (bpf_map__get_name(pos) && \
+ (strcmp("__bpf_stdout__", \
+ bpf_map__get_name(pos)) == 0))
+
+int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
+{
+ struct bpf_map_priv *tmpl_priv = NULL;
+ struct bpf_object *obj, *tmp;
+ struct bpf_map *map;
+ int err;
+ bool need_init = false;
+
+ bpf__for_each_stdout_map(map, obj, tmp) {
+ struct bpf_map_priv *priv;
+
+ err = bpf_map__get_private(map, (void **)&priv);
+ if (err)
+ return -BPF_LOADER_ERRNO__INTERNAL;
+
+ /*
+ * No need to check map type: type should have been
+ * verified by kernel.
+ */
+ if (!need_init && !priv)
+ need_init = !priv;
+ if (!tmpl_priv && priv)
+ tmpl_priv = priv;
+ }
+
+ if (!need_init)
+ return 0;
+
+ if (!tmpl_priv)
+ return 0;
+
+ bpf__for_each_stdout_map(map, obj, tmp) {
+ struct bpf_map_priv *priv;
+
+ err = bpf_map__get_private(map, (void **)&priv);
+ if (err)
+ return -BPF_LOADER_ERRNO__INTERNAL;
+ if (priv)
+ continue;
+
+ priv = bpf_map_priv__clone(tmpl_priv);
+ if (!priv)
+ return -ENOMEM;
+
+ err = bpf_map__set_private(map, priv, bpf_map_priv__clear);
+ if (err) {
+ bpf_map_priv__clear(map, priv);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
#define ERRNO_OFFSET(e) ((e) - __BPF_LOADER_ERRNO__START)
#define ERRCODE_OFFSET(c) ERRNO_OFFSET(BPF_LOADER_ERRNO__##c)
#define NR_ERRNO (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START)
@@ -1590,3 +1706,11 @@ int bpf__strerror_apply_obj_config(int err, char *buf, size_t size)
bpf__strerror_end(buf, size);
return 0;
}
+
+int bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused,
+ int err, char *buf, size_t size)
+{
+ bpf__strerror_head(err, buf, size);
+ bpf__strerror_end(buf, size);
+ return 0;
+}
diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h
index be43119..941e172 100644
--- a/tools/perf/util/bpf-loader.h
+++ b/tools/perf/util/bpf-loader.h
@@ -79,6 +79,11 @@ int bpf__strerror_config_obj(struct bpf_object *obj,
size_t size);
int bpf__apply_obj_config(void);
int bpf__strerror_apply_obj_config(int err, char *buf, size_t size);
+
+int bpf__setup_stdout(struct perf_evlist *evlist);
+int bpf__strerror_setup_stdout(struct perf_evlist *evlist, int err,
+ char *buf, size_t size);
+
#else
static inline struct bpf_object *
bpf__prepare_load(const char *filename __maybe_unused,
@@ -125,6 +130,12 @@ bpf__apply_obj_config(void)
}

static inline int
+bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
+{
+ return 0;
+}
+
+static inline int
__bpf_strerror(char *buf, size_t size)
{
if (!size)
@@ -177,5 +188,13 @@ bpf__strerror_apply_obj_config(int err __maybe_unused,
{
return __bpf_strerror(buf, size);
}
+
+static inline int
+bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused,
+ int err __maybe_unused, char *buf,
+ size_t size)
+{
+ return __bpf_strerror(buf, size);
+}
#endif
#endif
--
1.8.3.4

2016-04-08 15:23:14

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 1/4] perf trace: Improve error message when receive non-tracepoint events

Em Fri, Apr 08, 2016 at 03:07:22PM +0000, Wang Nan escreveu:
> Before this patch, strange error message is provided if passed a
> non-tracepoint event to 'perf trace':
>
> # perf trace -a --ev cycles sleep 1
> Failed to set filter "common_pid != 27500" on event cycles with 22 (Invalid argument)
>
> This is because 'perf trace' accepts all valid events during cmdline
> parsing, but in fact user can only provide tracepoints, because it
> needs filter.
>
> This patch validate evlist, report error earlier:
>
> # ./perf trace -a --ev cycles sleep 1
> Only support tracepoint events!

Humm, perhaps we should instead refrain from setting filters to
non-tracepoint events? I.e. I don't see why we shouldn't support, say,
software events...

/me trying some now, i.e.:

# trace --ev minor-faults --no-syscalls

But it has some issues...

- Arnaldo

> Signed-off-by: Wang Nan <[email protected]>
> Cc: Arnaldo Carvalho de Melo <[email protected]>
> Cc: Jiri Olsa <[email protected]>
> Cc: Li Zefan <[email protected]>
> Cc: [email protected]
> ---
> tools/perf/builtin-trace.c | 16 ++++++++++++++++
> 1 file changed, 16 insertions(+)
>
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index 11290b5..6fbed86 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -2680,6 +2680,17 @@ out_enomem:
> goto out;
> }
>
> +static int validate_evlist(struct perf_evlist *evlist)
> +{
> + struct perf_evsel *evsel;
> +
> + evlist__for_each(evlist, evsel) {
> + if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
> + return -EINVAL;
> + }
> + return 0;
> +}
> +
> static int trace__run(struct trace *trace, int argc, const char **argv)
> {
> struct perf_evlist *evlist = trace->evlist;
> @@ -3273,6 +3284,11 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
> argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
> trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
>
> + if (validate_evlist(trace.evlist)) {
> + pr_err("Only support tracepoint events!\n");
> + return -EINVAL;
> + }
> +
> if (trace.trace_pgfaults) {
> trace.opts.sample_address = true;
> trace.opts.sample_time = true;
> --
> 1.8.3.4

2016-04-08 15:55:44

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH 2/4] perf trace: Print content of bpf-output event

Sorry. This patch should be squashed into patch 1/4. The commit message
is wrong. The main part of the described patch is already accepted and merged:

1d6c940 perf trace: Print content of bpf-output event

Thank you.

On 2016/4/8 23:07, Wang Nan wrote:
> With this patch the contend of BPF output event is printed by
> 'perf trace'. For example:
>
> # ./perf trace -a --ev bpf-output/no-inherit,name=evt/ \
> --ev ./test_bpf_trace.c/map:channel.event=evt/ \
> usleep 100000
> ...
> 1.787 ( 0.004 ms): usleep/3832 nanosleep(rqtp: 0x7ffc78b18980 ) ...
> 1.787 ( ): evt:Raise a BPF event!..)
> 1.788 ( ): perf_bpf_probe:func_begin:(ffffffff810e97d0))
> ...
> 101.866 (87.038 ms): gmain/1654 poll(ufds: 0x7f57a80008c0, nfds: 2, timeout_msecs: 1000 ) ...
> 101.866 ( ): evt:Raise a BPF event!..)
> 101.867 ( ): perf_bpf_probe:func_end:(ffffffff810e97d0 <- ffffffff81796173))
> 101.869 (100.087 ms): usleep/3832 ... [continued]: nanosleep()) = 0
> ...
>
> (There is an extra ')' at the end of several lines. However, it is
> another problem, unrelated to this commit.)
>
> Where test_bpf_trace.c is:
> /************************ BEGIN **************************/
> #include <uapi/linux/bpf.h>
> struct bpf_map_def {
> unsigned int type;
> unsigned int key_size;
> unsigned int value_size;
> unsigned int max_entries;
> };
> #define SEC(NAME) __attribute__((section(NAME), used))
> static u64 (*ktime_get_ns)(void) =
> (void *)BPF_FUNC_ktime_get_ns;
> static int (*trace_printk)(const char *fmt, int fmt_size, ...) =
> (void *)BPF_FUNC_trace_printk;
> static int (*get_smp_processor_id)(void) =
> (void *)BPF_FUNC_get_smp_processor_id;
> static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) =
> (void *)BPF_FUNC_perf_event_output;
>
> struct bpf_map_def SEC("maps") channel = {
> .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> .key_size = sizeof(int),
> .value_size = sizeof(u32),
> .max_entries = __NR_CPUS__,
> };
>
> static inline int __attribute__((always_inline))
> func(void *ctx, int type)
> {
> char output_str[] = "Raise a BPF event!";
> char err_str[] = "BAD %d\n";
> int err;
>
> err = perf_event_output(ctx, &channel, get_smp_processor_id(),
> &output_str, sizeof(output_str));
> if (err)
> trace_printk(err_str, sizeof(err_str), err);
> return 1;
> }
> SEC("func_begin=sys_nanosleep")
> int func_begin(void *ctx) {return func(ctx, 1);}
> SEC("func_end=sys_nanosleep%return")
> int func_end(void *ctx) { return func(ctx, 2);}
> char _license[] SEC("license") = "GPL";
> int _version SEC("version") = LINUX_VERSION_CODE;
> /************************* END ***************************/
>
> Signed-off-by: Wang Nan <[email protected]>
> Cc: Arnaldo Carvalho de Melo <[email protected]>
> Cc: Jiri Olsa <[email protected]>
> Cc: Li Zefan <[email protected]>
> Cc: [email protected]
> ---
> tools/perf/builtin-trace.c | 13 +++++++++----
> 1 file changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> index 6fbed86..fb8257f 100644
> --- a/tools/perf/builtin-trace.c
> +++ b/tools/perf/builtin-trace.c
> @@ -2680,11 +2680,15 @@ out_enomem:
> goto out;
> }
>
> -static int validate_evlist(struct perf_evlist *evlist)
> +static int validate_evlist(struct perf_evlist *evlist, bool *has_bpf_output)
> {
> struct perf_evsel *evsel;
>
> evlist__for_each(evlist, evsel) {
> + if (perf_evsel__is_bpf_output(evsel)) {
> + *has_bpf_output = true;
> + continue;
> + }
> if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
> return -EINVAL;
> }
> @@ -3268,6 +3272,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
> const char * const trace_subcommands[] = { "record", NULL };
> int err;
> char bf[BUFSIZ];
> + bool has_bpf_output = false;
>
> signal(SIGSEGV, sighandler_dump_stack);
> signal(SIGFPE, sighandler_dump_stack);
> @@ -3284,12 +3289,12 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
> argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
> trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
>
> - if (validate_evlist(trace.evlist)) {
> - pr_err("Only support tracepoint events!\n");
> + if (validate_evlist(trace.evlist, &has_bpf_output)) {
> + pr_err("Only support tracepoint and bpf-output events!\n");
> return -EINVAL;
> }
>
> - if (trace.trace_pgfaults) {
> + if (trace.trace_pgfaults || has_bpf_output) {
> trace.opts.sample_address = true;
> trace.opts.sample_time = true;
> }


2016-04-08 16:16:54

by Wang Nan

[permalink] [raw]
Subject: Re: [PATCH 1/4] perf trace: Improve error message when receive non-tracepoint events



On 2016/4/8 23:22, Arnaldo Carvalho de Melo wrote:
> Em Fri, Apr 08, 2016 at 03:07:22PM +0000, Wang Nan escreveu:
>> Before this patch, strange error message is provided if passed a
>> non-tracepoint event to 'perf trace':
>>
>> # perf trace -a --ev cycles sleep 1
>> Failed to set filter "common_pid != 27500" on event cycles with 22 (Invalid argument)
>>
>> This is because 'perf trace' accepts all valid events during cmdline
>> parsing, but in fact user can only provide tracepoints, because it
>> needs filter.
>>
>> This patch validate evlist, report error earlier:
>>
>> # ./perf trace -a --ev cycles sleep 1
>> Only support tracepoint events!
> Humm, perhaps we should instead refrain from setting filters to non
> tracepoint events? I.e. I don't see why we whouldn't support, say,
> software events...
>
> /me trying some now, i.e.:
>
> # trace --ev minor-faults --no-syscalls
>
> But it has some issues...
>
> - Arnaldo
>

We already have commit fdf14720fbd02 ("perf tools: Only set filter for
tracepoints events") so you won't see the ugly error message again.

However, I think parsing non-tracepoint events in 'perf trace' is still
a challenge. We have never supported it in 'perf trace', and I'm not
sure who would need this feature, how they would use it, and why they
can't use 'perf record' instead.

Thank you.

2016-04-08 17:25:16

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 2/4] perf trace: Print content of bpf-output event

Em Fri, Apr 08, 2016 at 11:55:11PM +0800, Wangnan (F) escreveu:
> Sorry. This patch should be squash into patch 1/4. The commit message
> is wrong. The main part of described patch is already accepted and merged:
>
> 1d6c940 perf trace: Print content of bpf-output event

I'll check

> Thank you.
>
> On 2016/4/8 23:07, Wang Nan wrote:
> >With this patch the contend of BPF output event is printed by
> >'perf trace'. For example:
> >
> > # ./perf trace -a --ev bpf-output/no-inherit,name=evt/ \
> > --ev ./test_bpf_trace.c/map:channel.event=evt/ \
> > usleep 100000
> > ...
> > 1.787 ( 0.004 ms): usleep/3832 nanosleep(rqtp: 0x7ffc78b18980 ) ...
> > 1.787 ( ): evt:Raise a BPF event!..)
> > 1.788 ( ): perf_bpf_probe:func_begin:(ffffffff810e97d0))
> > ...
> > 101.866 (87.038 ms): gmain/1654 poll(ufds: 0x7f57a80008c0, nfds: 2, timeout_msecs: 1000 ) ...
> > 101.866 ( ): evt:Raise a BPF event!..)
> > 101.867 ( ): perf_bpf_probe:func_end:(ffffffff810e97d0 <- ffffffff81796173))
> > 101.869 (100.087 ms): usleep/3832 ... [continued]: nanosleep()) = 0
> > ...
> >
> > (There is an extra ')' at the end of several lines. However, it is
> > another problem, unrelated to this commit.)
> >
> >Where test_bpf_trace.c is:
> > /************************ BEGIN **************************/
> > #include <uapi/linux/bpf.h>
> > struct bpf_map_def {
> > unsigned int type;
> > unsigned int key_size;
> > unsigned int value_size;
> > unsigned int max_entries;
> > };
> > #define SEC(NAME) __attribute__((section(NAME), used))
> > static u64 (*ktime_get_ns)(void) =
> > (void *)BPF_FUNC_ktime_get_ns;
> > static int (*trace_printk)(const char *fmt, int fmt_size, ...) =
> > (void *)BPF_FUNC_trace_printk;
> > static int (*get_smp_processor_id)(void) =
> > (void *)BPF_FUNC_get_smp_processor_id;
> > static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) =
> > (void *)BPF_FUNC_perf_event_output;
> >
> > struct bpf_map_def SEC("maps") channel = {
> > .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> > .key_size = sizeof(int),
> > .value_size = sizeof(u32),
> > .max_entries = __NR_CPUS__,
> > };
> >
> > static inline int __attribute__((always_inline))
> > func(void *ctx, int type)
> > {
> > char output_str[] = "Raise a BPF event!";
> > char err_str[] = "BAD %d\n";
> > int err;
> >
> > err = perf_event_output(ctx, &channel, get_smp_processor_id(),
> > &output_str, sizeof(output_str));
> > if (err)
> > trace_printk(err_str, sizeof(err_str), err);
> > return 1;
> > }
> > SEC("func_begin=sys_nanosleep")
> > int func_begin(void *ctx) {return func(ctx, 1);}
> > SEC("func_end=sys_nanosleep%return")
> > int func_end(void *ctx) { return func(ctx, 2);}
> > char _license[] SEC("license") = "GPL";
> > int _version SEC("version") = LINUX_VERSION_CODE;
> > /************************* END ***************************/
> >
> >Signed-off-by: Wang Nan <[email protected]>
> >Cc: Arnaldo Carvalho de Melo <[email protected]>
> >Cc: Jiri Olsa <[email protected]>
> >Cc: Li Zefan <[email protected]>
> >Cc: [email protected]
> >---
> > tools/perf/builtin-trace.c | 13 +++++++++----
> > 1 file changed, 9 insertions(+), 4 deletions(-)
> >
> >diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
> >index 6fbed86..fb8257f 100644
> >--- a/tools/perf/builtin-trace.c
> >+++ b/tools/perf/builtin-trace.c
> >@@ -2680,11 +2680,15 @@ out_enomem:
> > goto out;
> > }
> >-static int validate_evlist(struct perf_evlist *evlist)
> >+static int validate_evlist(struct perf_evlist *evlist, bool *has_bpf_output)
> > {
> > struct perf_evsel *evsel;
> > evlist__for_each(evlist, evsel) {
> >+ if (perf_evsel__is_bpf_output(evsel)) {
> >+ *has_bpf_output = true;
> >+ continue;
> >+ }
> > if (evsel->attr.type != PERF_TYPE_TRACEPOINT)
> > return -EINVAL;
> > }
> >@@ -3268,6 +3272,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
> > const char * const trace_subcommands[] = { "record", NULL };
> > int err;
> > char bf[BUFSIZ];
> >+ bool has_bpf_output = false;
> > signal(SIGSEGV, sighandler_dump_stack);
> > signal(SIGFPE, sighandler_dump_stack);
> >@@ -3284,12 +3289,12 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
> > argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
> > trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
> >- if (validate_evlist(trace.evlist)) {
> >- pr_err("Only support tracepoint events!\n");
> >+ if (validate_evlist(trace.evlist, &has_bpf_output)) {
> >+ pr_err("Only support tracepoint and bpf-output events!\n");
> > return -EINVAL;
> > }
> >- if (trace.trace_pgfaults) {
> >+ if (trace.trace_pgfaults || has_bpf_output) {
> > trace.opts.sample_address = true;
> > trace.opts.sample_time = true;
> > }
>

2016-04-08 17:33:09

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 1/4] perf trace: Improve error message when receive non-tracepoint events

Em Sat, Apr 09, 2016 at 12:12:41AM +0800, Wangnan (F) escreveu:
>
>
> On 2016/4/8 23:22, Arnaldo Carvalho de Melo wrote:
> >Em Fri, Apr 08, 2016 at 03:07:22PM +0000, Wang Nan escreveu:
> >>Before this patch, strange error message is provided if passed a
> >>non-tracepoint event to 'perf trace':
> >>
> >> # perf trace -a --ev cycles sleep 1
> >> Failed to set filter "common_pid != 27500" on event cycles with 22 (Invalid argument)
> >>
> >>This is because 'perf trace' accepts all valid events during cmdline
> >>parsing, but in fact user can only provide tracepoints, because it
> >>needs filter.
> >>
> >>This patch validate evlist, report error earlier:
> >>
> >> # ./perf trace -a --ev cycles sleep 1
> >> Only support tracepoint events!
> >Humm, perhaps we should instead refrain from setting filters to non
> >tracepoint events? I.e. I don't see why we whouldn't support, say,
> >software events...
> >
> >/me trying some now, i.e.:
> >
> > # trace --ev minor-faults --no-syscalls
> >
> >But it has some issues...
> >
> >- Arnaldo
>
> We already have commit fdf14720fbd02 ("perf tools: Only set filter for
> tracepoints events") so you won't see the ugly error message again.

Ok

> However, I think parsing non-tracepoint events in 'perf trace' is still
> a challange. We never support it in 'perf trace' and I'm not too much
> sure who will need this feature and how to use them, and why he/she can't
> use 'perf record' instead.

Well, it works already; the issue I had with 'trace --ev minor-faults'
is that we start with freq=0. If we do:

# trace --ev minor-faults/freq=1234/

We set it up in a way that we get the events:

{ sample_period, sample_freq } 1234

# trace --ev minor-faults/freq=1234/ -e nanosleep usleep 1
18446744073709.551 ( ): minor-faults/freq=1234/:)
18446744073709.551 ( ): minor-faults/freq=1234/:)
18446744073709.551 ( ): minor-faults/freq=1234/:)
18446744073709.551 ( ): minor-faults/freq=1234/:)
18446744073709.551 ( ): minor-faults/freq=1234/:)
0.345 ( 0.058 ms): usleep/24424 nanosleep(rqtp: 0x7ffc39aca490) = 0
#

But we're not setting up the sample_type, another minor issue, will fix.

In general I think we should not artificially limit what can be done
with one of the tools, trying to leave policy to the user, and being
able to have sampling events mixed with strace-like formatted syscall
entry+exit, tracepoints and other kinds of events like minor-faults, etc
looks sensible.

We can even make the default for PERF_[SH]W_EVENTS be to resolve the
symbol, like we do in 'perf script'.

And if a frequency is not provided for a sampling event, set a default,
like 'perf record' and 'top' do. It's just that the default one for 'perf
trace' now is 0, making me think it was busted in a more serious
fashion.

Now to test Millian's 'perf trace --call-chain' patch...

- Arnaldo

2016-04-08 17:35:26

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 3/4] perf bpf: Clone bpf stdout events in multiple bpf scripts

Em Fri, Apr 08, 2016 at 03:07:24PM +0000, Wang Nan escreveu:
> This patch allows cloning bpf-output event configuration among multiple
> bpf scripts. If there exist a map named '__bpf_output__' and not configured
> using 'map:__bpf_output__.event=', this patch clones the configuration of
> another '__bpf_stdout__' map. For example, following command:
>
> # perf trace --ev bpf-output/no-inherit,name=evt/ \
> --ev ./test_bpf_trace.c/map:__bpf_stdout__.event=evt/ \
> --ev ./test_bpf_trace2.c usleep 100000
>
> equals to:
>
> # perf trace --ev bpf-output/no-inherit,name=evt/ \
> --ev ./test_bpf_trace.c/map:__bpf_stdout__.event=evt/ \
> --ev ./test_bpf_trace2.c/map:__bpf_stdout__.event=evt/ \
> usleep 100000

I'm fixing it, minor stuff, thanks for working on this!

[acme@jouet linux]$ am /wb/1.patch
Applying: perf bpf: Clone bpf stdout events in multiple bpf scripts
/home/acme/git/linux/.git/rebase-apply/patch:135: trailing whitespace.

/home/acme/git/linux/.git/rebase-apply/patch:140: trailing whitespace.
/*
/home/acme/git/linux/.git/rebase-apply/patch:158: trailing whitespace.

error: patch failed: tools/perf/builtin-trace.c:3289
error: tools/perf/builtin-trace.c: patch does not apply

2016-04-08 17:54:35

by Arnaldo Carvalho de Melo

[permalink] [raw]
Subject: Re: [PATCH 4/4] perf bpf: Automatically create bpf-output event __bpf_stdout__

Em Fri, Apr 08, 2016 at 03:07:25PM +0000, Wang Nan escreveu:
> struct bpf_map_def SEC("maps") __bpf_stdout__ = {
> .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> .key_size = sizeof(int),
> .value_size = sizeof(u32),
> .max_entries = __NR_CPUS__,
> };
>
> static inline int __attribute__((always_inline))
> func(void *ctx, int type)
> {
> char output_str[] = "Raise a BPF event!";
> char err_str[] = "BAD %d\n";
> int err;
>
> err = perf_event_output(ctx, &channel, get_smp_processor_id(),

I had to change channel here to __bpf_stdout__, then it works :-)

> &output_str, sizeof(output_str));
> if (err)
> trace_printk(err_str, sizeof(err_str), err);
> return 1;

Asking for strace-like output for 'nanosleep', the function hooked:

[root@jouet bpf]# trace -e nanosleep --ev test_bpf_stdout.c usleep 1
0.007 ( 0.007 ms): usleep/729 nanosleep(rqtp: 0x7ffc5bbc5fe0) ...
0.007 ( ): __bpf_stdout__:Raise a BPF event!..)
0.008 ( ): perf_bpf_probe:func_begin:(ffffffff81112460))
0.069 ( ): __bpf_stdout__:Raise a BPF event!..)
0.070 ( ): perf_bpf_probe:func_end:(ffffffff81112460 <- ffffffff81003d92))
0.072 ( 0.072 ms): usleep/729 ... [continued]: nanosleep()) = 0
[root@jouet bpf]#

Ok, merged those, now let's see how all this mixes with callchains...

- Arnaldo

Subject: [tip:perf/core] perf bpf: Clone bpf stdout events in multiple bpf scripts

Commit-ID: d78885739a7df111dc7b081f8a09e08a5fcfecc2
Gitweb: http://git.kernel.org/tip/d78885739a7df111dc7b081f8a09e08a5fcfecc2
Author: Wang Nan <[email protected]>
AuthorDate: Fri, 8 Apr 2016 15:07:24 +0000
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 11 Apr 2016 22:17:45 -0300

perf bpf: Clone bpf stdout events in multiple bpf scripts

This patch allows cloning bpf-output event configuration among multiple
bpf scripts. If there exists a map named '__bpf_stdout__' that is not
configured using 'map:__bpf_stdout__.event=', this patch clones the
configuration of another '__bpf_stdout__' map. For example, the following
command:

# perf trace --ev bpf-output/no-inherit,name=evt/ \
--ev ./test_bpf_trace.c/map:__bpf_stdout__.event=evt/ \
--ev ./test_bpf_trace2.c usleep 100000

is equivalent to:

# perf trace --ev bpf-output/no-inherit,name=evt/ \
--ev ./test_bpf_trace.c/map:__bpf_stdout__.event=evt/ \
--ev ./test_bpf_trace2.c/map:__bpf_stdout__.event=evt/ \
usleep 100000

Signed-off-by: Wang Nan <[email protected]>
Suggested-by: Arnaldo Carvalho de Melo <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Zefan Li <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/builtin-record.c | 8 +++
tools/perf/builtin-trace.c | 7 +++
tools/perf/util/bpf-loader.c | 124 +++++++++++++++++++++++++++++++++++++++++++
tools/perf/util/bpf-loader.h | 19 +++++++
4 files changed, 158 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 410035c..e64bd1e 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1276,6 +1276,14 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
if (err)
return err;

+ err = bpf__setup_stdout(rec->evlist);
+ if (err) {
+ bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
+ pr_err("ERROR: Setup BPF stdout failed: %s\n",
+ errbuf);
+ return err;
+ }
+
err = -ENOMEM;

symbol__init(NULL);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 11290b5..27d9870 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -3273,6 +3273,13 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

+ err = bpf__setup_stdout(trace.evlist);
+ if (err) {
+ bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
+ pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
+ goto out;
+ }
+
if (trace.trace_pgfaults) {
trace.opts.sample_address = true;
trace.opts.sample_time = true;
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 0967ce6..67f61a9 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -842,6 +842,58 @@ bpf_map_op__new(struct parse_events_term *term)
return op;
}

+static struct bpf_map_op *
+bpf_map_op__clone(struct bpf_map_op *op)
+{
+ struct bpf_map_op *newop;
+
+ newop = memdup(op, sizeof(*op));
+ if (!newop) {
+ pr_debug("Failed to alloc bpf_map_op\n");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&newop->list);
+ if (op->key_type == BPF_MAP_KEY_RANGES) {
+ size_t memsz = op->k.array.nr_ranges *
+ sizeof(op->k.array.ranges[0]);
+
+ newop->k.array.ranges = memdup(op->k.array.ranges, memsz);
+ if (!newop->k.array.ranges) {
+ pr_debug("Failed to alloc indices for map\n");
+ free(newop);
+ return NULL;
+ }
+ }
+
+ return newop;
+}
+
+static struct bpf_map_priv *
+bpf_map_priv__clone(struct bpf_map_priv *priv)
+{
+ struct bpf_map_priv *newpriv;
+ struct bpf_map_op *pos, *newop;
+
+ newpriv = zalloc(sizeof(*newpriv));
+ if (!newpriv) {
+ pr_debug("No enough memory to alloc map private\n");
+ return NULL;
+ }
+ INIT_LIST_HEAD(&newpriv->ops_list);
+
+ list_for_each_entry(pos, &priv->ops_list, list) {
+ newop = bpf_map_op__clone(pos);
+ if (!newop) {
+ bpf_map_priv__purge(newpriv);
+ return NULL;
+ }
+ list_add_tail(&newop->list, &newpriv->ops_list);
+ }
+
+ return newpriv;
+}
+
static int
bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op)
{
@@ -1417,6 +1469,70 @@ int bpf__apply_obj_config(void)
return 0;
}

+#define bpf__for_each_map(pos, obj, objtmp) \
+ bpf_object__for_each_safe(obj, objtmp) \
+ bpf_map__for_each(pos, obj)
+
+#define bpf__for_each_stdout_map(pos, obj, objtmp) \
+ bpf__for_each_map(pos, obj, objtmp) \
+ if (bpf_map__get_name(pos) && \
+ (strcmp("__bpf_stdout__", \
+ bpf_map__get_name(pos)) == 0))
+
+int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
+{
+ struct bpf_map_priv *tmpl_priv = NULL;
+ struct bpf_object *obj, *tmp;
+ struct bpf_map *map;
+ int err;
+ bool need_init = false;
+
+ bpf__for_each_stdout_map(map, obj, tmp) {
+ struct bpf_map_priv *priv;
+
+ err = bpf_map__get_private(map, (void **)&priv);
+ if (err)
+ return -BPF_LOADER_ERRNO__INTERNAL;
+
+ /*
+ * No need to check map type: type should have been
+ * verified by kernel.
+ */
+ if (!need_init && !priv)
+ need_init = !priv;
+ if (!tmpl_priv && priv)
+ tmpl_priv = priv;
+ }
+
+ if (!need_init)
+ return 0;
+
+ if (!tmpl_priv)
+ return 0;
+
+ bpf__for_each_stdout_map(map, obj, tmp) {
+ struct bpf_map_priv *priv;
+
+ err = bpf_map__get_private(map, (void **)&priv);
+ if (err)
+ return -BPF_LOADER_ERRNO__INTERNAL;
+ if (priv)
+ continue;
+
+ priv = bpf_map_priv__clone(tmpl_priv);
+ if (!priv)
+ return -ENOMEM;
+
+ err = bpf_map__set_private(map, priv, bpf_map_priv__clear);
+ if (err) {
+ bpf_map_priv__clear(map, priv);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
#define ERRNO_OFFSET(e) ((e) - __BPF_LOADER_ERRNO__START)
#define ERRCODE_OFFSET(c) ERRNO_OFFSET(BPF_LOADER_ERRNO__##c)
#define NR_ERRNO (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START)
@@ -1590,3 +1706,11 @@ int bpf__strerror_apply_obj_config(int err, char *buf, size_t size)
bpf__strerror_end(buf, size);
return 0;
}
+
+int bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused,
+ int err, char *buf, size_t size)
+{
+ bpf__strerror_head(err, buf, size);
+ bpf__strerror_end(buf, size);
+ return 0;
+}
diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h
index be43119..941e172 100644
--- a/tools/perf/util/bpf-loader.h
+++ b/tools/perf/util/bpf-loader.h
@@ -79,6 +79,11 @@ int bpf__strerror_config_obj(struct bpf_object *obj,
size_t size);
int bpf__apply_obj_config(void);
int bpf__strerror_apply_obj_config(int err, char *buf, size_t size);
+
+int bpf__setup_stdout(struct perf_evlist *evlist);
+int bpf__strerror_setup_stdout(struct perf_evlist *evlist, int err,
+ char *buf, size_t size);
+
#else
static inline struct bpf_object *
bpf__prepare_load(const char *filename __maybe_unused,
@@ -125,6 +130,12 @@ bpf__apply_obj_config(void)
}

static inline int
+bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
+{
+ return 0;
+}
+
+static inline int
__bpf_strerror(char *buf, size_t size)
{
if (!size)
@@ -177,5 +188,13 @@ bpf__strerror_apply_obj_config(int err __maybe_unused,
{
return __bpf_strerror(buf, size);
}
+
+static inline int
+bpf__strerror_setup_stdout(struct perf_evlist *evlist __maybe_unused,
+ int err __maybe_unused, char *buf,
+ size_t size)
+{
+ return __bpf_strerror(buf, size);
+}
#endif
#endif

Subject: [tip:perf/core] perf bpf: Automatically create bpf-output event __bpf_stdout__

Commit-ID: 72c0809856b9174e71eab4e293089f6a114e0d41
Gitweb: http://git.kernel.org/tip/72c0809856b9174e71eab4e293089f6a114e0d41
Author: Wang Nan <[email protected]>
AuthorDate: Fri, 8 Apr 2016 15:07:25 +0000
Committer: Arnaldo Carvalho de Melo <[email protected]>
CommitDate: Mon, 11 Apr 2016 22:18:04 -0300

perf bpf: Automatically create bpf-output event __bpf_stdout__

This patch removes the need to set a bpf-output event in cmdline. By
referencing a map named '__bpf_stdout__', perf automatically creates an
event for it.

For example:

# perf record -e ./test_bpf_trace.c usleep 100000
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.012 MB perf.data (2 samples) ]
# perf script
usleep 4639 [000] 261895.307826: 0 __bpf_stdout__: ffffffff810eb9a1 ...
BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a
0008: 42 50 46 20 65 76 65 6e BPF even
0010: 74 21 00 00 t!..
BPF string: "Raise a BPF event!"

usleep 4639 [000] 261895.407883: 0 __bpf_stdout__: ffffffff8105d609 ...
BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a
0008: 42 50 46 20 65 76 65 6e BPF even
0010: 74 21 00 00 t!..
BPF string: "Raise a BPF event!"

perf record -e ./test_bpf_trace.c usleep 100000

is equivalent to:

perf record -e bpf-output/no-inherit=1,name=__bpf_stdout__/ \
-e ./test_bpf_trace.c/map:__bpf_stdout__.event=__bpf_stdout__/ \
usleep 100000

Where test_bpf_trace.c is:

/************************ BEGIN **************************/
#include <uapi/linux/bpf.h>
struct bpf_map_def {
unsigned int type;
unsigned int key_size;
unsigned int value_size;
unsigned int max_entries;
};
#define SEC(NAME) __attribute__((section(NAME), used))
static u64 (*ktime_get_ns)(void) =
(void *)BPF_FUNC_ktime_get_ns;
static int (*trace_printk)(const char *fmt, int fmt_size, ...) =
(void *)BPF_FUNC_trace_printk;
static int (*get_smp_processor_id)(void) =
(void *)BPF_FUNC_get_smp_processor_id;
static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) =
(void *)BPF_FUNC_perf_event_output;

struct bpf_map_def SEC("maps") __bpf_stdout__ = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(u32),
.max_entries = __NR_CPUS__,
};

static inline int __attribute__((always_inline))
func(void *ctx, int type)
{
char output_str[] = "Raise a BPF event!";
char err_str[] = "BAD %d\n";
int err;

err = perf_event_output(ctx, &__bpf_stdout__, get_smp_processor_id(),
&output_str, sizeof(output_str));
if (err)
trace_printk(err_str, sizeof(err_str), err);
return 1;
}
SEC("func_begin=sys_nanosleep")
int func_begin(void *ctx) {return func(ctx, 1);}
SEC("func_end=sys_nanosleep%return")
int func_end(void *ctx) { return func(ctx, 2);}
char _license[] SEC("license") = "GPL";
int _version SEC("version") = LINUX_VERSION_CODE;
/************************* END ***************************/

Committer note:

Testing with 'perf trace':

# trace -e nanosleep --ev test_bpf_stdout.c usleep 1
0.007 ( 0.007 ms): usleep/729 nanosleep(rqtp: 0x7ffc5bbc5fe0) ...
0.007 ( ): __bpf_stdout__:Raise a BPF event!..)
0.008 ( ): perf_bpf_probe:func_begin:(ffffffff81112460))
0.069 ( ): __bpf_stdout__:Raise a BPF event!..)
0.070 ( ): perf_bpf_probe:func_end:(ffffffff81112460 <- ffffffff81003d92))
0.072 ( 0.072 ms): usleep/729 ... [continued]: nanosleep()) = 0
#

Suggested-and-Tested-by: Arnaldo Carvalho de Melo <[email protected]>
Signed-off-by: Wang Nan <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Zefan Li <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
---
tools/perf/util/bpf-loader.c | 37 ++++++++++++++++++++++++++++---------
1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index 67f61a9..493307d 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -1483,6 +1483,7 @@ int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
{
struct bpf_map_priv *tmpl_priv = NULL;
struct bpf_object *obj, *tmp;
+ struct perf_evsel *evsel = NULL;
struct bpf_map *map;
int err;
bool need_init = false;
@@ -1507,8 +1508,16 @@ int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
if (!need_init)
return 0;

- if (!tmpl_priv)
- return 0;
+ if (!tmpl_priv) {
+ err = parse_events(evlist, "bpf-output/no-inherit=1,name=__bpf_stdout__/",
+ NULL);
+ if (err) {
+ pr_debug("ERROR: failed to create bpf-output event\n");
+ return -err;
+ }
+
+ evsel = perf_evlist__last(evlist);
+ }

bpf__for_each_stdout_map(map, obj, tmp) {
struct bpf_map_priv *priv;
@@ -1519,14 +1528,24 @@ int bpf__setup_stdout(struct perf_evlist *evlist __maybe_unused)
if (priv)
continue;

- priv = bpf_map_priv__clone(tmpl_priv);
- if (!priv)
- return -ENOMEM;
+ if (tmpl_priv) {
+ priv = bpf_map_priv__clone(tmpl_priv);
+ if (!priv)
+ return -ENOMEM;

- err = bpf_map__set_private(map, priv, bpf_map_priv__clear);
- if (err) {
- bpf_map_priv__clear(map, priv);
- return err;
+ err = bpf_map__set_private(map, priv, bpf_map_priv__clear);
+ if (err) {
+ bpf_map_priv__clear(map, priv);
+ return err;
+ }
+ } else if (evsel) {
+ struct bpf_map_op *op;
+
+ op = bpf_map__add_newop(map, NULL);
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+ op->op_type = BPF_MAP_OP_SET_EVSEL;
+ op->v.evsel = evsel;
}
}