Allow uid and gid to be terms in BPF filters by first breaking the
connection between filter terms and PERF_SAMPLE_xx values. Calculate
the uid and gid using the bpf_get_current_uid_gid helper, rather than
from a value in the sample. Allow filters to be passed to perf top, this allows:
$ perf top -e cycles:P --filter "uid == $(id -u)"
to work as a "perf top -u" workaround, as "perf top -u" usually fails
due to processes/threads terminating between the /proc scan and the
perf_event_open.
Ian Rogers (3):
perf bpf filter: Give terms their own enum
perf bpf filter: Add uid and gid terms
perf top: Allow filters on events
tools/perf/Documentation/perf-record.txt | 2 +-
tools/perf/Documentation/perf-top.txt | 4 ++
tools/perf/builtin-top.c | 9 +++
tools/perf/util/bpf-filter.c | 55 ++++++++++++----
tools/perf/util/bpf-filter.h | 5 +-
tools/perf/util/bpf-filter.l | 66 +++++++++----------
tools/perf/util/bpf-filter.y | 7 +-
tools/perf/util/bpf_skel/sample-filter.h | 27 +++++++-
tools/perf/util/bpf_skel/sample_filter.bpf.c | 67 +++++++++++++++-----
9 files changed, 172 insertions(+), 70 deletions(-)
--
2.45.0.rc1.225.g2a3ae87e7f-goog
Allow filters to be added to perf top events. One use is to workaround
issues with:
```
$ perf top --uid="$(id -u)"
```
which tries to scan /proc find processes belonging to the uid and can
fail in such a pid terminates between the scan and the
perf_event_open reporting:
```
Error:
The sys_perf_event_open() syscall returned with 3 (No such process) for event (cycles:P).
/bin/dmesg | grep -i perf may provide additional information.
```
A similar filter:
```
$ perf top -e cycles:P --filter "uid == $(id -u)"
```
doesn't fail this way.
Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/Documentation/perf-top.txt | 4 ++++
tools/perf/builtin-top.c | 9 +++++++++
2 files changed, 13 insertions(+)
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index a754875fa5bb..667e5102075e 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -43,6 +43,10 @@ Default is to monitor all CPUS.
encoding with the layout of the event control registers as described
by entries in /sys/bus/event_source/devices/cpu/format/*.
+--filter=<filter>::
+ Event filter. This option should follow an event selector (-e). For
+ syntax see linkperf:perf-record[1].
+
-E <entries>::
--entries=<entries>::
Display this many functions.
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 1d6aef51c122..e8cbbf10d361 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1055,6 +1055,13 @@ static int perf_top__start_counters(struct perf_top *top)
}
}
+ if (evlist__apply_filters(evlist, &counter)) {
+ pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
+ counter->filter ?: "BPF", evsel__name(counter), errno,
+ str_error_r(errno, msg, sizeof(msg)));
+ goto out_err;
+ }
+
if (evlist__mmap(evlist, opts->mmap_pages) < 0) {
ui__error("Failed to mmap with %d (%s)\n",
errno, str_error_r(errno, msg, sizeof(msg)));
@@ -1462,6 +1469,8 @@ int cmd_top(int argc, const char **argv)
OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
"event selector. use 'perf list' to list available events",
parse_events_option),
+ OPT_CALLBACK(0, "filter", &top.evlist, "filter",
+ "event filter", parse_filter),
OPT_U64('c', "count", &opts->user_interval, "event period to sample"),
OPT_STRING('p', "pid", &target->pid, "pid",
"profile events on existing process id"),
--
2.45.0.rc1.225.g2a3ae87e7f-goog
Allow the BPF filter to use the uid and gid terms determined by the
bpf_get_current_uid_gid BPF helper. For example, the following will
record the cpu-clock event system wide discarding samples that don't
belong to the current user.
$ perf record -e cpu-clock --filter "uid == $(id -u)" -a sleep 0.1
Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/Documentation/perf-record.txt | 2 +-
tools/perf/util/bpf-filter.c | 4 ++++
tools/perf/util/bpf-filter.l | 2 ++
tools/perf/util/bpf_skel/sample-filter.h | 3 +++
tools/perf/util/bpf_skel/sample_filter.bpf.c | 4 ++++
5 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 6015fdd08fb6..059bc40c5ee1 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -200,7 +200,7 @@ OPTIONS
ip, id, tid, pid, cpu, time, addr, period, txn, weight, phys_addr,
code_pgsz, data_pgsz, weight1, weight2, weight3, ins_lat, retire_lat,
p_stage_cyc, mem_op, mem_lvl, mem_snoop, mem_remote, mem_lock,
- mem_dtlb, mem_blk, mem_hops
+ mem_dtlb, mem_blk, mem_hops, uid, gid
The <operator> can be one of:
==, !=, >, >=, <, <=, &
diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c
index 7e8d179f03dc..c5d6c192d33a 100644
--- a/tools/perf/util/bpf-filter.c
+++ b/tools/perf/util/bpf-filter.c
@@ -80,6 +80,10 @@ static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr *
CHECK_TERM(DATA_PAGE_SIZE);
CHECK_TERM(WEIGHT_STRUCT);
CHECK_TERM(DATA_SRC);
+ case PBF_TERM_UID:
+ case PBF_TERM_GID:
+ /* Not dependent on the sample_type as computed from a BPF helper. */
+ return 0;
case PBF_TERM_NONE:
default:
break;
diff --git a/tools/perf/util/bpf-filter.l b/tools/perf/util/bpf-filter.l
index 62c959813466..2a7c839f3fae 100644
--- a/tools/perf/util/bpf-filter.l
+++ b/tools/perf/util/bpf-filter.l
@@ -95,6 +95,8 @@ mem_lock { return sample_part(PBF_TERM_DATA_SRC, 5); }
mem_dtlb { return sample_part(PBF_TERM_DATA_SRC, 6); }
mem_blk { return sample_part(PBF_TERM_DATA_SRC, 7); }
mem_hops { return sample_part(PBF_TERM_DATA_SRC, 8); }
+uid { return sample(PBF_TERM_UID); }
+gid { return sample(PBF_TERM_GID); }
"==" { return operator(PBF_OP_EQ); }
"!=" { return operator(PBF_OP_NEQ); }
diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h
index 161d5ff49cb6..3e64ccacc5e5 100644
--- a/tools/perf/util/bpf_skel/sample-filter.h
+++ b/tools/perf/util/bpf_skel/sample-filter.h
@@ -34,6 +34,9 @@ enum perf_bpf_filter_term {
PBF_TERM_DATA_PAGE_SIZE,
PBF_TERM_WEIGHT_STRUCT,
PBF_TERM_DATA_SRC,
+ /* Terms computed from BPF helpers. */
+ PBF_TERM_UID,
+ PBF_TERM_GID,
};
/* BPF map entry for filtering */
diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
index 8666c85e9333..da4b5eb7cce3 100644
--- a/tools/perf/util/bpf_skel/sample_filter.bpf.c
+++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
@@ -146,6 +146,10 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
}
/* return the whole word */
return kctx->data->data_src.val;
+ case PBF_TERM_UID:
+ return bpf_get_current_uid_gid() & 0xFFFFFFFF;
+ case PBF_TERM_GID:
+ return bpf_get_current_uid_gid() >> 32;
default:
break;
}
--
2.45.0.rc1.225.g2a3ae87e7f-goog
Give the term types their own enum so that additional terms can be
added that don't correspond to a PERF_SAMPLE_xx flag. The term values
are numerically ascending rather than bit field positions, this means
they need translating to a PERF_SAMPLE_xx bit field in certain places
and they are more densely encoded.
Signed-off-by: Ian Rogers <[email protected]>
---
tools/perf/util/bpf-filter.c | 53 +++++++++++-----
tools/perf/util/bpf-filter.h | 5 +-
tools/perf/util/bpf-filter.l | 64 ++++++++++----------
tools/perf/util/bpf-filter.y | 7 ++-
tools/perf/util/bpf_skel/sample-filter.h | 24 +++++++-
tools/perf/util/bpf_skel/sample_filter.bpf.c | 63 +++++++++++++------
6 files changed, 146 insertions(+), 70 deletions(-)
diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c
index b51544996046..7e8d179f03dc 100644
--- a/tools/perf/util/bpf-filter.c
+++ b/tools/perf/util/bpf-filter.c
@@ -17,11 +17,11 @@
#define FD(e, x, y) (*(int *)xyarray__entry(e->core.fd, x, y))
-#define __PERF_SAMPLE_TYPE(st, opt) { st, #st, opt }
-#define PERF_SAMPLE_TYPE(_st, opt) __PERF_SAMPLE_TYPE(PERF_SAMPLE_##_st, opt)
+#define __PERF_SAMPLE_TYPE(tt, st, opt) { tt, #st, opt }
+#define PERF_SAMPLE_TYPE(_st, opt) __PERF_SAMPLE_TYPE(PBF_TERM_##_st, PERF_SAMPLE_##_st, opt)
static const struct perf_sample_info {
- u64 type;
+ enum perf_bpf_filter_term type;
const char *name;
const char *option;
} sample_table[] = {
@@ -44,12 +44,12 @@ static const struct perf_sample_info {
PERF_SAMPLE_TYPE(DATA_PAGE_SIZE, "--data-page-size"),
};
-static const struct perf_sample_info *get_sample_info(u64 flags)
+static const struct perf_sample_info *get_sample_info(enum perf_bpf_filter_term type)
{
size_t i;
for (i = 0; i < ARRAY_SIZE(sample_table); i++) {
- if (sample_table[i].type == flags)
+ if (sample_table[i].type == type)
return &sample_table[i];
}
return NULL;
@@ -59,8 +59,32 @@ static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr *
{
const struct perf_sample_info *info;
- if (evsel->core.attr.sample_type & expr->sample_flags)
- return 0;
+#define CHECK_TERM(x) \
+ case PBF_TERM_##x: \
+ if (evsel->core.attr.sample_type & PERF_SAMPLE_##x) \
+ return 0; \
+ break
+
+ switch (expr->term) {
+ CHECK_TERM(IP);
+ CHECK_TERM(ID);
+ CHECK_TERM(TID);
+ CHECK_TERM(CPU);
+ CHECK_TERM(TIME);
+ CHECK_TERM(ADDR);
+ CHECK_TERM(PERIOD);
+ CHECK_TERM(TRANSACTION);
+ CHECK_TERM(WEIGHT);
+ CHECK_TERM(PHYS_ADDR);
+ CHECK_TERM(CODE_PAGE_SIZE);
+ CHECK_TERM(DATA_PAGE_SIZE);
+ CHECK_TERM(WEIGHT_STRUCT);
+ CHECK_TERM(DATA_SRC);
+ case PBF_TERM_NONE:
+ default:
+ break;
+ }
+#undef CHECK_TERM
if (expr->op == PBF_OP_GROUP_BEGIN) {
struct perf_bpf_filter_expr *group;
@@ -72,10 +96,10 @@ static int check_sample_flags(struct evsel *evsel, struct perf_bpf_filter_expr *
return 0;
}
- info = get_sample_info(expr->sample_flags);
+ info = get_sample_info(expr->term);
if (info == NULL) {
- pr_err("Error: %s event does not have sample flags %lx\n",
- evsel__name(evsel), expr->sample_flags);
+ pr_err("Error: %s event does not have sample flags %d\n",
+ evsel__name(evsel), expr->term);
return -1;
}
@@ -105,7 +129,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel)
struct perf_bpf_filter_entry entry = {
.op = expr->op,
.part = expr->part,
- .flags = expr->sample_flags,
+ .term = expr->term,
.value = expr->val,
};
@@ -122,7 +146,7 @@ int perf_bpf_filter__prepare(struct evsel *evsel)
struct perf_bpf_filter_entry group_entry = {
.op = group->op,
.part = group->part,
- .flags = group->sample_flags,
+ .term = group->term,
.value = group->val,
};
bpf_map_update_elem(fd, &i, &group_entry, BPF_ANY);
@@ -173,7 +197,8 @@ u64 perf_bpf_filter__lost_count(struct evsel *evsel)
return skel ? skel->bss->dropped : 0;
}
-struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(unsigned long sample_flags, int part,
+struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term term,
+ int part,
enum perf_bpf_filter_op op,
unsigned long val)
{
@@ -181,7 +206,7 @@ struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(unsigned long sample_flag
expr = malloc(sizeof(*expr));
if (expr != NULL) {
- expr->sample_flags = sample_flags;
+ expr->term = term,
expr->part = part;
expr->op = op;
expr->val = val;
diff --git a/tools/perf/util/bpf-filter.h b/tools/perf/util/bpf-filter.h
index 7afd159411b8..cd6764442c16 100644
--- a/tools/perf/util/bpf-filter.h
+++ b/tools/perf/util/bpf-filter.h
@@ -11,14 +11,15 @@ struct perf_bpf_filter_expr {
struct list_head groups;
enum perf_bpf_filter_op op;
int part;
- unsigned long sample_flags;
+ enum perf_bpf_filter_term term;
unsigned long val;
};
struct evsel;
#ifdef HAVE_BPF_SKEL
-struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(unsigned long sample_flags, int part,
+struct perf_bpf_filter_expr *perf_bpf_filter_expr__new(enum perf_bpf_filter_term term,
+ int part,
enum perf_bpf_filter_op op,
unsigned long val);
int perf_bpf_filter__parse(struct list_head *expr_head, const char *str);
diff --git a/tools/perf/util/bpf-filter.l b/tools/perf/util/bpf-filter.l
index d4ff0f1345cd..62c959813466 100644
--- a/tools/perf/util/bpf-filter.l
+++ b/tools/perf/util/bpf-filter.l
@@ -9,16 +9,16 @@
#include "bpf-filter.h"
#include "bpf-filter-bison.h"
-static int sample(unsigned long sample_flag)
+static int sample(enum perf_bpf_filter_term term)
{
- perf_bpf_filter_lval.sample.type = sample_flag;
+ perf_bpf_filter_lval.sample.term = term;
perf_bpf_filter_lval.sample.part = 0;
return BFT_SAMPLE;
}
-static int sample_part(unsigned long sample_flag, int part)
+static int sample_part(enum perf_bpf_filter_term term, int part)
{
- perf_bpf_filter_lval.sample.type = sample_flag;
+ perf_bpf_filter_lval.sample.term = term;
perf_bpf_filter_lval.sample.part = part;
return BFT_SAMPLE;
}
@@ -67,34 +67,34 @@ ident [_a-zA-Z][_a-zA-Z0-9]+
{num_hex} { return value(16); }
{space} { }
-ip { return sample(PERF_SAMPLE_IP); }
-id { return sample(PERF_SAMPLE_ID); }
-tid { return sample(PERF_SAMPLE_TID); }
-pid { return sample_part(PERF_SAMPLE_TID, 1); }
-cpu { return sample(PERF_SAMPLE_CPU); }
-time { return sample(PERF_SAMPLE_TIME); }
-addr { return sample(PERF_SAMPLE_ADDR); }
-period { return sample(PERF_SAMPLE_PERIOD); }
-txn { return sample(PERF_SAMPLE_TRANSACTION); }
-weight { return sample(PERF_SAMPLE_WEIGHT); }
-weight1 { return sample_part(PERF_SAMPLE_WEIGHT_STRUCT, 1); }
-weight2 { return sample_part(PERF_SAMPLE_WEIGHT_STRUCT, 2); }
-weight3 { return sample_part(PERF_SAMPLE_WEIGHT_STRUCT, 3); }
-ins_lat { return sample_part(PERF_SAMPLE_WEIGHT_STRUCT, 2); } /* alias for weight2 */
-p_stage_cyc { return sample_part(PERF_SAMPLE_WEIGHT_STRUCT, 3); } /* alias for weight3 */
-retire_lat { return sample_part(PERF_SAMPLE_WEIGHT_STRUCT, 3); } /* alias for weight3 */
-phys_addr { return sample(PERF_SAMPLE_PHYS_ADDR); }
-code_pgsz { return sample(PERF_SAMPLE_CODE_PAGE_SIZE); }
-data_pgsz { return sample(PERF_SAMPLE_DATA_PAGE_SIZE); }
-mem_op { return sample_part(PERF_SAMPLE_DATA_SRC, 1); }
-mem_lvlnum { return sample_part(PERF_SAMPLE_DATA_SRC, 2); }
-mem_lvl { return sample_part(PERF_SAMPLE_DATA_SRC, 2); } /* alias for mem_lvlnum */
-mem_snoop { return sample_part(PERF_SAMPLE_DATA_SRC, 3); } /* include snoopx */
-mem_remote { return sample_part(PERF_SAMPLE_DATA_SRC, 4); }
-mem_lock { return sample_part(PERF_SAMPLE_DATA_SRC, 5); }
-mem_dtlb { return sample_part(PERF_SAMPLE_DATA_SRC, 6); }
-mem_blk { return sample_part(PERF_SAMPLE_DATA_SRC, 7); }
-mem_hops { return sample_part(PERF_SAMPLE_DATA_SRC, 8); }
+ip { return sample(PBF_TERM_IP); }
+id { return sample(PBF_TERM_ID); }
+tid { return sample(PBF_TERM_TID); }
+pid { return sample_part(PBF_TERM_TID, 1); }
+cpu { return sample(PBF_TERM_CPU); }
+time { return sample(PBF_TERM_TIME); }
+addr { return sample(PBF_TERM_ADDR); }
+period { return sample(PBF_TERM_PERIOD); }
+txn { return sample(PBF_TERM_TRANSACTION); }
+weight { return sample(PBF_TERM_WEIGHT); }
+weight1 { return sample_part(PBF_TERM_WEIGHT_STRUCT, 1); }
+weight2 { return sample_part(PBF_TERM_WEIGHT_STRUCT, 2); }
+weight3 { return sample_part(PBF_TERM_WEIGHT_STRUCT, 3); }
+ins_lat { return sample_part(PBF_TERM_WEIGHT_STRUCT, 2); } /* alias for weight2 */
+p_stage_cyc { return sample_part(PBF_TERM_WEIGHT_STRUCT, 3); } /* alias for weight3 */
+retire_lat { return sample_part(PBF_TERM_WEIGHT_STRUCT, 3); } /* alias for weight3 */
+phys_addr { return sample(PBF_TERM_PHYS_ADDR); }
+code_pgsz { return sample(PBF_TERM_CODE_PAGE_SIZE); }
+data_pgsz { return sample(PBF_TERM_DATA_PAGE_SIZE); }
+mem_op { return sample_part(PBF_TERM_DATA_SRC, 1); }
+mem_lvlnum { return sample_part(PBF_TERM_DATA_SRC, 2); }
+mem_lvl { return sample_part(PBF_TERM_DATA_SRC, 2); } /* alias for mem_lvlnum */
+mem_snoop { return sample_part(PBF_TERM_DATA_SRC, 3); } /* include snoopx */
+mem_remote { return sample_part(PBF_TERM_DATA_SRC, 4); }
+mem_lock { return sample_part(PBF_TERM_DATA_SRC, 5); }
+mem_dtlb { return sample_part(PBF_TERM_DATA_SRC, 6); }
+mem_blk { return sample_part(PBF_TERM_DATA_SRC, 7); }
+mem_hops { return sample_part(PBF_TERM_DATA_SRC, 8); }
"==" { return operator(PBF_OP_EQ); }
"!=" { return operator(PBF_OP_NEQ); }
diff --git a/tools/perf/util/bpf-filter.y b/tools/perf/util/bpf-filter.y
index 0e4d6de3c2ad..0c56fccb8874 100644
--- a/tools/perf/util/bpf-filter.y
+++ b/tools/perf/util/bpf-filter.y
@@ -27,7 +27,7 @@ static void perf_bpf_filter_error(struct list_head *expr __maybe_unused,
{
unsigned long num;
struct {
- unsigned long type;
+ enum perf_bpf_filter_term term;
int part;
} sample;
enum perf_bpf_filter_op op;
@@ -62,7 +62,8 @@ filter_term BFT_LOGICAL_OR filter_expr
if ($1->op == PBF_OP_GROUP_BEGIN) {
expr = $1;
} else {
- expr = perf_bpf_filter_expr__new(0, 0, PBF_OP_GROUP_BEGIN, 1);
+ expr = perf_bpf_filter_expr__new(PBF_TERM_NONE, /*part=*/0,
+ PBF_OP_GROUP_BEGIN, /*val=*/1);
list_add_tail(&$1->list, &expr->groups);
}
expr->val++;
@@ -78,7 +79,7 @@ filter_expr
filter_expr:
BFT_SAMPLE BFT_OP BFT_NUM
{
- $$ = perf_bpf_filter_expr__new($1.type, $1.part, $2, $3);
+ $$ = perf_bpf_filter_expr__new($1.term, $1.part, $2, $3);
}
%%
diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h
index 2e96e1ab084a..161d5ff49cb6 100644
--- a/tools/perf/util/bpf_skel/sample-filter.h
+++ b/tools/perf/util/bpf_skel/sample-filter.h
@@ -16,12 +16,32 @@ enum perf_bpf_filter_op {
PBF_OP_GROUP_END,
};
+enum perf_bpf_filter_term {
+ /* No term is in use. */
+ PBF_TERM_NONE,
+ /* Terms that correspond to PERF_SAMPLE_xx values. */
+ PBF_TERM_IP,
+ PBF_TERM_ID,
+ PBF_TERM_TID,
+ PBF_TERM_CPU,
+ PBF_TERM_TIME,
+ PBF_TERM_ADDR,
+ PBF_TERM_PERIOD,
+ PBF_TERM_TRANSACTION,
+ PBF_TERM_WEIGHT,
+ PBF_TERM_PHYS_ADDR,
+ PBF_TERM_CODE_PAGE_SIZE,
+ PBF_TERM_DATA_PAGE_SIZE,
+ PBF_TERM_WEIGHT_STRUCT,
+ PBF_TERM_DATA_SRC,
+};
+
/* BPF map entry for filtering */
struct perf_bpf_filter_entry {
enum perf_bpf_filter_op op;
__u32 part; /* sub-sample type info when it has multiple values */
- __u64 flags; /* perf sample type flags */
+ enum perf_bpf_filter_term term;
__u64 value;
};
-#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
\ No newline at end of file
+#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
index fb94f5280626..8666c85e9333 100644
--- a/tools/perf/util/bpf_skel/sample_filter.bpf.c
+++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
@@ -48,31 +48,50 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
{
struct perf_sample_data___new *data = (void *)kctx->data;
- if (!bpf_core_field_exists(data->sample_flags) ||
- (data->sample_flags & entry->flags) == 0)
+ if (!bpf_core_field_exists(data->sample_flags))
return 0;
- switch (entry->flags) {
- case PERF_SAMPLE_IP:
+ switch (entry->term) {
+ case PBF_TERM_NONE:
+ return 0;
+ case PBF_TERM_IP:
+ if ((data->sample_flags & PERF_SAMPLE_IP) == 0)
+ return 0;
return kctx->data->ip;
- case PERF_SAMPLE_ID:
+ case PBF_TERM_ID:
+ if ((data->sample_flags & PERF_SAMPLE_ID) == 0)
+ return 0;
return kctx->data->id;
- case PERF_SAMPLE_TID:
+ case PBF_TERM_TID:
+ if ((data->sample_flags & PERF_SAMPLE_TID) == 0)
+ return 0;
if (entry->part)
return kctx->data->tid_entry.pid;
else
return kctx->data->tid_entry.tid;
- case PERF_SAMPLE_CPU:
+ case PBF_TERM_CPU:
+ if ((data->sample_flags & PERF_SAMPLE_CPU) == 0)
+ return 0;
return kctx->data->cpu_entry.cpu;
- case PERF_SAMPLE_TIME:
+ case PBF_TERM_TIME:
+ if ((data->sample_flags & PERF_SAMPLE_TIME) == 0)
+ return 0;
return kctx->data->time;
- case PERF_SAMPLE_ADDR:
+ case PBF_TERM_ADDR:
+ if ((data->sample_flags & PERF_SAMPLE_ADDR) == 0)
+ return 0;
return kctx->data->addr;
- case PERF_SAMPLE_PERIOD:
+ case PBF_TERM_PERIOD:
+ if ((data->sample_flags & PERF_SAMPLE_PERIOD) == 0)
+ return 0;
return kctx->data->period;
- case PERF_SAMPLE_TRANSACTION:
+ case PBF_TERM_TRANSACTION:
+ if ((data->sample_flags & PERF_SAMPLE_TRANSACTION) == 0)
+ return 0;
return kctx->data->txn;
- case PERF_SAMPLE_WEIGHT_STRUCT:
+ case PBF_TERM_WEIGHT_STRUCT:
+ if ((data->sample_flags & PERF_SAMPLE_WEIGHT_STRUCT) == 0)
+ return 0;
if (entry->part == 1)
return kctx->data->weight.var1_dw;
if (entry->part == 2)
@@ -80,15 +99,25 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
if (entry->part == 3)
return kctx->data->weight.var3_w;
/* fall through */
- case PERF_SAMPLE_WEIGHT:
+ case PBF_TERM_WEIGHT:
+ if ((data->sample_flags & PERF_SAMPLE_WEIGHT) == 0)
+ return 0;
return kctx->data->weight.full;
- case PERF_SAMPLE_PHYS_ADDR:
+ case PBF_TERM_PHYS_ADDR:
+ if ((data->sample_flags & PERF_SAMPLE_PHYS_ADDR) == 0)
+ return 0;
return kctx->data->phys_addr;
- case PERF_SAMPLE_CODE_PAGE_SIZE:
+ case PBF_TERM_CODE_PAGE_SIZE:
+ if ((data->sample_flags & PERF_SAMPLE_CODE_PAGE_SIZE) == 0)
+ return 0;
return kctx->data->code_page_size;
- case PERF_SAMPLE_DATA_PAGE_SIZE:
+ case PBF_TERM_DATA_PAGE_SIZE:
+ if ((data->sample_flags & PERF_SAMPLE_DATA_PAGE_SIZE) == 0)
+ return 0;
return kctx->data->data_page_size;
- case PERF_SAMPLE_DATA_SRC:
+ case PBF_TERM_DATA_SRC:
+ if ((data->sample_flags & PERF_SAMPLE_DATA_SRC) == 0)
+ return 0;
if (entry->part == 1)
return kctx->data->data_src.mem_op;
if (entry->part == 2)
--
2.45.0.rc1.225.g2a3ae87e7f-goog
On Wed, May 15, 2024 at 9:20 PM Ian Rogers <[email protected]> wrote:
>
> Allow uid and gid to be terms in BPF filters by first breaking the
> connection between filter terms and PERF_SAMPLE_xx values. Calculate
> the uid and gid using the bpf_get_current_uid_gid helper, rather than
> from a value in the sample. Allow filters to be passed to perf top, this allows:
>
> $ perf top -e cycles:P --filter "uid == $(id -u)"
>
> to work as a "perf top -u" workaround, as "perf top -u" usually fails
> due to processes/threads terminating between the /proc scan and the
> perf_event_open.
Fwiw, something I noticed playing around with this (my workload was
`perf test -w noploop 100000` as different users) is that old samples
appeared to linger around making terminated processes still appear in
the top list. My guess is that there aren't other samples showing up
and pushing the old sample events out of the ring buffers due to the
filter. This can look quite odd and I don't know if we have a way to
improve upon it, flush the ring buffers, histograms, etc. It appears
to be a latent `perf top` issue that you could encounter on other low
frequency events, but I thought I'd mention it anyway.
Thanks,
Ian
> Ian Rogers (3):
> perf bpf filter: Give terms their own enum
> perf bpf filter: Add uid and gid terms
> perf top: Allow filters on events
>
> tools/perf/Documentation/perf-record.txt | 2 +-
> tools/perf/Documentation/perf-top.txt | 4 ++
> tools/perf/builtin-top.c | 9 +++
> tools/perf/util/bpf-filter.c | 55 ++++++++++++----
> tools/perf/util/bpf-filter.h | 5 +-
> tools/perf/util/bpf-filter.l | 66 +++++++++----------
> tools/perf/util/bpf-filter.y | 7 +-
> tools/perf/util/bpf_skel/sample-filter.h | 27 +++++++-
> tools/perf/util/bpf_skel/sample_filter.bpf.c | 67 +++++++++++++++-----
> 9 files changed, 172 insertions(+), 70 deletions(-)
>
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
On Wed, May 15, 2024 at 10:04 PM Ian Rogers <[email protected]> wrote:
>
> On Wed, May 15, 2024 at 9:20 PM Ian Rogers <[email protected]> wrote:
> >
> > Allow uid and gid to be terms in BPF filters by first breaking the
> > connection between filter terms and PERF_SAMPLE_xx values. Calculate
> > the uid and gid using the bpf_get_current_uid_gid helper, rather than
> > from a value in the sample. Allow filters to be passed to perf top, this allows:
> >
> > $ perf top -e cycles:P --filter "uid == $(id -u)"
> >
> > to work as a "perf top -u" workaround, as "perf top -u" usually fails
> > due to processes/threads terminating between the /proc scan and the
> > perf_event_open.
>
> Fwiw, something I noticed playing around with this (my workload was
> `perf test -w noploop 100000` as different users) is that old samples
> appeared to linger around making terminated processes still appear in
> the top list. My guess is that there aren't other samples showing up
> and pushing the old sample events out of the ring buffers due to the
> filter. This can look quite odd and I don't know if we have a way to
> improve upon it, flush the ring buffers, histograms, etc. It appears
> to be a latent `perf top` issue that you could encounter on other low
> frequency events, but I thought I'd mention it anyway.
Some other thoughts:
- It is kind of annoying with the --filter option (either on top or
record) that there first needs to be an event to filter on. It'd be
nice if we could just filter the default event.
- Should "perf top --uid=1234" be removed or turned into an alias
for '--filter "uid == $(id -u)"' given the --uid option generally
doesn't work?
- What should happen to the perf top --pid and --tid options, should
they be filters? Should they fallback on /proc scanning if there
aren't sufficient BPF permissions? The plumbing for that is going to
be messy.
- There should probably be a way to filter on cgroups.
- Does the user care that there are 3 kinds of filter that will work
differently? Could we break them apart to make it more explicit, I may
want tracepoint events with a BPF filter. How can we ensure 1 syntax
for the 3 kinds of filter.
- Filtering on register values could be potentially interesting, for
example, sampling on memcpy-s where the length is over a threshold. We
have a register capture test:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/tests/shell/record.sh#n81
Perhaps the filter could look something like 'perf record -g -e
mem:$ADDRESS_OF_MEMCPY:x --filter "reg:rdx > 1024"' - this makes me
think we need to make a more convenient way to specify memory
addresses as symbols.
Thanks,
Ian
>
> > Ian Rogers (3):
> > perf bpf filter: Give terms their own enum
> > perf bpf filter: Add uid and gid terms
> > perf top: Allow filters on events
> >
> > tools/perf/Documentation/perf-record.txt | 2 +-
> > tools/perf/Documentation/perf-top.txt | 4 ++
> > tools/perf/builtin-top.c | 9 +++
> > tools/perf/util/bpf-filter.c | 55 ++++++++++++----
> > tools/perf/util/bpf-filter.h | 5 +-
> > tools/perf/util/bpf-filter.l | 66 +++++++++----------
> > tools/perf/util/bpf-filter.y | 7 +-
> > tools/perf/util/bpf_skel/sample-filter.h | 27 +++++++-
> > tools/perf/util/bpf_skel/sample_filter.bpf.c | 67 +++++++++++++++-----
> > 9 files changed, 172 insertions(+), 70 deletions(-)
> >
> > --
> > 2.45.0.rc1.225.g2a3ae87e7f-goog
> >
Hi Ian,
On Thu, May 16, 2024 at 10:34 AM Ian Rogers <[email protected]> wrote:
>
> On Wed, May 15, 2024 at 10:04 PM Ian Rogers <[email protected]> wrote:
> >
> > On Wed, May 15, 2024 at 9:20 PM Ian Rogers <[email protected]> wrote:
> > >
> > > Allow uid and gid to be terms in BPF filters by first breaking the
> > > connection between filter terms and PERF_SAMPLE_xx values. Calculate
> > > the uid and gid using the bpf_get_current_uid_gid helper, rather than
> > > from a value in the sample. Allow filters to be passed to perf top, this allows:
> > >
> > > $ perf top -e cycles:P --filter "uid == $(id -u)"
> > >
> > > to work as a "perf top -u" workaround, as "perf top -u" usually fails
> > > due to processes/threads terminating between the /proc scan and the
> > > perf_event_open.
> >
> > Fwiw, something I noticed playing around with this (my workload was
> > `perf test -w noploop 100000` as different users) is that old samples
> > appeared to linger around making terminated processes still appear in
> > the top list. My guess is that there aren't other samples showing up
> > and pushing the old sample events out of the ring buffers due to the
> > filter. This can look quite odd and I don't know if we have a way to
> > improve upon it, flush the ring buffers, histograms, etc. It appears
> > to be a latent `perf top` issue that you could encounter on other low
> > frequency events, but I thought I'd mention it anyway.
>
> Some other thoughts:
>
> - It is kind of annoying with the --filter option (either on top or
> record) that there first needs to be an event to filter on. It'd be
> nice if we could just filter the default event.
Hmm.. right. It should work with the default event when
no -e option is given.
>
> - Should "perf top --uid=1234" be removed or turned into an alias
> for '--filter "uid == $(id -u)"' given the --uid option generally
> doesn't work?
I think --uid should not fail if it cannot find the task.
I had a similar situation for perf stat --for-each-cgroup
and made it ignore the failures.
>
> - What should happen to the perf top --pid and --tid options, should
> they be filters? Should they fallback on /proc scanning if there
> aren't sufficient BPF permissions? The plumbing for that is going to
> be messy.
I'm not inclined to do such things.
>
> - There should probably be a way to filter on cgroups.
+1
>
> - Does the user care that there are 3 kinds of filter that will work
> differently? Could we break them apart to make it more explicit, I may
> want tracepoint events with a BPF filter. How can we ensure 1 syntax
> for the 3 kinds of filter.
>
> - Filtering on register values could be potentially interesting, for
> example, sampling on memcpy-s where the length is over a threshold. We
> have a register capture test:
> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/tests/shell/record.sh#n81
> Perhaps the filter could look something like 'perf record -g -e
> mem:$ADDRESS_OF_MEMCPY:x --filter "reg:rdx > 1024"' - this makes me
> think we need to make a more convenient way to specify memory
> addresses as symbols.
I've been thinking about a similar idea on uftrace.
It would filter the function based on the value of an
argument or a global variable.
Thanks,
Namhyung
> >
> > > Ian Rogers (3):
> > > perf bpf filter: Give terms their own enum
> > > perf bpf filter: Add uid and gid terms
> > > perf top: Allow filters on events
> > >
> > > tools/perf/Documentation/perf-record.txt | 2 +-
> > > tools/perf/Documentation/perf-top.txt | 4 ++
> > > tools/perf/builtin-top.c | 9 +++
> > > tools/perf/util/bpf-filter.c | 55 ++++++++++++----
> > > tools/perf/util/bpf-filter.h | 5 +-
> > > tools/perf/util/bpf-filter.l | 66 +++++++++----------
> > > tools/perf/util/bpf-filter.y | 7 +-
> > > tools/perf/util/bpf_skel/sample-filter.h | 27 +++++++-
> > > tools/perf/util/bpf_skel/sample_filter.bpf.c | 67 +++++++++++++++-----
> > > 9 files changed, 172 insertions(+), 70 deletions(-)
> > >
> > > --
> > > 2.45.0.rc1.225.g2a3ae87e7f-goog
> > >
On Wed, May 15, 2024 at 9:20 PM Ian Rogers <[email protected]> wrote:
>
> Give the term types their own enum so that additional terms can be
> added that don't correspond to a PERF_SAMPLE_xx flag. The term values
> are numerically ascending rather than bit field positions, this means
> they need translating to a PERF_SAMPLE_xx bit field in certain places
> and they are more densely encoded.
>
> Signed-off-by: Ian Rogers <[email protected]>
> ---
[SNIP]
> diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h
> index 2e96e1ab084a..161d5ff49cb6 100644
> --- a/tools/perf/util/bpf_skel/sample-filter.h
> +++ b/tools/perf/util/bpf_skel/sample-filter.h
> @@ -16,12 +16,32 @@ enum perf_bpf_filter_op {
> PBF_OP_GROUP_END,
> };
>
> +enum perf_bpf_filter_term {
> + /* No term is in use. */
> + PBF_TERM_NONE,
> + /* Terms that correspond to PERF_SAMPLE_xx values. */
> + PBF_TERM_IP,
> + PBF_TERM_ID,
> + PBF_TERM_TID,
> + PBF_TERM_CPU,
> + PBF_TERM_TIME,
> + PBF_TERM_ADDR,
> + PBF_TERM_PERIOD,
> + PBF_TERM_TRANSACTION,
> + PBF_TERM_WEIGHT,
> + PBF_TERM_PHYS_ADDR,
> + PBF_TERM_CODE_PAGE_SIZE,
> + PBF_TERM_DATA_PAGE_SIZE,
> + PBF_TERM_WEIGHT_STRUCT,
> + PBF_TERM_DATA_SRC,
> +};
> +
> /* BPF map entry for filtering */
> struct perf_bpf_filter_entry {
> enum perf_bpf_filter_op op;
> __u32 part; /* sub-sample type info when it has multiple values */
> - __u64 flags; /* perf sample type flags */
> + enum perf_bpf_filter_term term;
> __u64 value;
> };
>
> -#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
> \ No newline at end of file
> +#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
> diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
> index fb94f5280626..8666c85e9333 100644
> --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c
> +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
> @@ -48,31 +48,50 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
> {
> struct perf_sample_data___new *data = (void *)kctx->data;
>
> - if (!bpf_core_field_exists(data->sample_flags) ||
> - (data->sample_flags & entry->flags) == 0)
> + if (!bpf_core_field_exists(data->sample_flags))
> return 0;
>
> - switch (entry->flags) {
> - case PERF_SAMPLE_IP:
> + switch (entry->term) {
> + case PBF_TERM_NONE:
> + return 0;
> + case PBF_TERM_IP:
> + if ((data->sample_flags & PERF_SAMPLE_IP) == 0)
> + return 0;
Can we check this in a single place like in the original code
instead of doing it in every case? I think we can keep the
entry->flags and check it only if it's non zero. Then uid and
gid will have 0 to skip.
Thanks,
Namhyung
> return kctx->data->ip;
> - case PERF_SAMPLE_ID:
> + case PBF_TERM_ID:
> + if ((data->sample_flags & PERF_SAMPLE_ID) == 0)
> + return 0;
> return kctx->data->id;
> - case PERF_SAMPLE_TID:
> + case PBF_TERM_TID:
> + if ((data->sample_flags & PERF_SAMPLE_TID) == 0)
> + return 0;
> if (entry->part)
> return kctx->data->tid_entry.pid;
> else
> return kctx->data->tid_entry.tid;
> - case PERF_SAMPLE_CPU:
> + case PBF_TERM_CPU:
> + if ((data->sample_flags & PERF_SAMPLE_CPU) == 0)
> + return 0;
> return kctx->data->cpu_entry.cpu;
> - case PERF_SAMPLE_TIME:
> + case PBF_TERM_TIME:
> + if ((data->sample_flags & PERF_SAMPLE_TIME) == 0)
> + return 0;
> return kctx->data->time;
> - case PERF_SAMPLE_ADDR:
> + case PBF_TERM_ADDR:
> + if ((data->sample_flags & PERF_SAMPLE_ADDR) == 0)
> + return 0;
> return kctx->data->addr;
> - case PERF_SAMPLE_PERIOD:
> + case PBF_TERM_PERIOD:
> + if ((data->sample_flags & PERF_SAMPLE_PERIOD) == 0)
> + return 0;
> return kctx->data->period;
> - case PERF_SAMPLE_TRANSACTION:
> + case PBF_TERM_TRANSACTION:
> + if ((data->sample_flags & PERF_SAMPLE_TRANSACTION) == 0)
> + return 0;
> return kctx->data->txn;
> - case PERF_SAMPLE_WEIGHT_STRUCT:
> + case PBF_TERM_WEIGHT_STRUCT:
> + if ((data->sample_flags & PERF_SAMPLE_WEIGHT_STRUCT) == 0)
> + return 0;
> if (entry->part == 1)
> return kctx->data->weight.var1_dw;
> if (entry->part == 2)
> @@ -80,15 +99,25 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
> if (entry->part == 3)
> return kctx->data->weight.var3_w;
> /* fall through */
> - case PERF_SAMPLE_WEIGHT:
> + case PBF_TERM_WEIGHT:
> + if ((data->sample_flags & PERF_SAMPLE_WEIGHT) == 0)
> + return 0;
> return kctx->data->weight.full;
> - case PERF_SAMPLE_PHYS_ADDR:
> + case PBF_TERM_PHYS_ADDR:
> + if ((data->sample_flags & PERF_SAMPLE_PHYS_ADDR) == 0)
> + return 0;
> return kctx->data->phys_addr;
> - case PERF_SAMPLE_CODE_PAGE_SIZE:
> + case PBF_TERM_CODE_PAGE_SIZE:
> + if ((data->sample_flags & PERF_SAMPLE_CODE_PAGE_SIZE) == 0)
> + return 0;
> return kctx->data->code_page_size;
> - case PERF_SAMPLE_DATA_PAGE_SIZE:
> + case PBF_TERM_DATA_PAGE_SIZE:
> + if ((data->sample_flags & PERF_SAMPLE_DATA_PAGE_SIZE) == 0)
> + return 0;
> return kctx->data->data_page_size;
> - case PERF_SAMPLE_DATA_SRC:
> + case PBF_TERM_DATA_SRC:
> + if ((data->sample_flags & PERF_SAMPLE_DATA_SRC) == 0)
> + return 0;
> if (entry->part == 1)
> return kctx->data->data_src.mem_op;
> if (entry->part == 2)
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
On Fri, May 17, 2024 at 6:36 PM Namhyung Kim <[email protected]> wrote:
>
> On Wed, May 15, 2024 at 9:20 PM Ian Rogers <[email protected]> wrote:
> >
> > Give the term types their own enum so that additional terms can be
> > added that don't correspond to a PERF_SAMPLE_xx flag. The term values
> > are numerically ascending rather than bit field positions, this means
> > they need translating to a PERF_SAMPLE_xx bit field in certain places
> > and they are more densely encoded.
> >
> > Signed-off-by: Ian Rogers <[email protected]>
> > ---
> [SNIP]
> > diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h
> > index 2e96e1ab084a..161d5ff49cb6 100644
> > --- a/tools/perf/util/bpf_skel/sample-filter.h
> > +++ b/tools/perf/util/bpf_skel/sample-filter.h
> > @@ -16,12 +16,32 @@ enum perf_bpf_filter_op {
> > PBF_OP_GROUP_END,
> > };
> >
> > +enum perf_bpf_filter_term {
> > + /* No term is in use. */
> > + PBF_TERM_NONE,
> > + /* Terms that correspond to PERF_SAMPLE_xx values. */
> > + PBF_TERM_IP,
> > + PBF_TERM_ID,
> > + PBF_TERM_TID,
> > + PBF_TERM_CPU,
> > + PBF_TERM_TIME,
> > + PBF_TERM_ADDR,
> > + PBF_TERM_PERIOD,
> > + PBF_TERM_TRANSACTION,
> > + PBF_TERM_WEIGHT,
> > + PBF_TERM_PHYS_ADDR,
> > + PBF_TERM_CODE_PAGE_SIZE,
> > + PBF_TERM_DATA_PAGE_SIZE,
> > + PBF_TERM_WEIGHT_STRUCT,
> > + PBF_TERM_DATA_SRC,
> > +};
> > +
> > /* BPF map entry for filtering */
> > struct perf_bpf_filter_entry {
> > enum perf_bpf_filter_op op;
> > __u32 part; /* sub-sample type info when it has multiple values */
> > - __u64 flags; /* perf sample type flags */
> > + enum perf_bpf_filter_term term;
> > __u64 value;
> > };
> >
> > -#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
> > \ No newline at end of file
> > +#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
> > diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
> > index fb94f5280626..8666c85e9333 100644
> > --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c
> > +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
> > @@ -48,31 +48,50 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
> > {
> > struct perf_sample_data___new *data = (void *)kctx->data;
> >
> > - if (!bpf_core_field_exists(data->sample_flags) ||
> > - (data->sample_flags & entry->flags) == 0)
> > + if (!bpf_core_field_exists(data->sample_flags))
> > return 0;
> >
> > - switch (entry->flags) {
> > - case PERF_SAMPLE_IP:
> > + switch (entry->term) {
> > + case PBF_TERM_NONE:
> > + return 0;
> > + case PBF_TERM_IP:
> > + if ((data->sample_flags & PERF_SAMPLE_IP) == 0)
> > + return 0;
>
> Can we check this in a single place like in the original code
> instead of doing it in every case? I think we can keep the
> entry->flags and check it only if it's non zero. Then uid and
> gid will have 0 to skip.
I found the old way confusing. As the flags are a bitmap it looks like
more than one can be set, if that were the case then the switch
statement would be broken as the case wouldn't exist. Using an enum
like this allows warnings to occur when a term is missing in the
switch statement - which is good when you are adding new terms. I
think it more obviously matches the man page. We could arrange for the
enum values to encode the shift position of the flag. Something like:
if ((entry->term > PBF_TERM_NONE && entry->term <= PBF_TERM_DATA_SRC) &&
(data->sample_flags & (1 << entry->term)) == 0)
return 0;
But the problem there is that not every sample type has an enum value,
and I'm not sure it makes sense for things like STREAM_ID. We could do
some macro magic to reduce the verbosity like:
#define SAMPLE_CASE(x) \
case PBF_TERM_##x: \
if ((data->sample_flags & PERF_SAMPLE_x) == 0) \
return 0
But I thought that made the code harder to read given the relatively
small number of sample cases.
Thanks,
Ian
> Thanks,
> Namhyung
>
>
> > return kctx->data->ip;
> > - case PERF_SAMPLE_ID:
> > + case PBF_TERM_ID:
> > + if ((data->sample_flags & PERF_SAMPLE_ID) == 0)
> > + return 0;
> > return kctx->data->id;
> > - case PERF_SAMPLE_TID:
> > + case PBF_TERM_TID:
> > + if ((data->sample_flags & PERF_SAMPLE_TID) == 0)
> > + return 0;
> > if (entry->part)
> > return kctx->data->tid_entry.pid;
> > else
> > return kctx->data->tid_entry.tid;
> > - case PERF_SAMPLE_CPU:
> > + case PBF_TERM_CPU:
> > + if ((data->sample_flags & PERF_SAMPLE_CPU) == 0)
> > + return 0;
> > return kctx->data->cpu_entry.cpu;
> > - case PERF_SAMPLE_TIME:
> > + case PBF_TERM_TIME:
> > + if ((data->sample_flags & PERF_SAMPLE_TIME) == 0)
> > + return 0;
> > return kctx->data->time;
> > - case PERF_SAMPLE_ADDR:
> > + case PBF_TERM_ADDR:
> > + if ((data->sample_flags & PERF_SAMPLE_ADDR) == 0)
> > + return 0;
> > return kctx->data->addr;
> > - case PERF_SAMPLE_PERIOD:
> > + case PBF_TERM_PERIOD:
> > + if ((data->sample_flags & PERF_SAMPLE_PERIOD) == 0)
> > + return 0;
> > return kctx->data->period;
> > - case PERF_SAMPLE_TRANSACTION:
> > + case PBF_TERM_TRANSACTION:
> > + if ((data->sample_flags & PERF_SAMPLE_TRANSACTION) == 0)
> > + return 0;
> > return kctx->data->txn;
> > - case PERF_SAMPLE_WEIGHT_STRUCT:
> > + case PBF_TERM_WEIGHT_STRUCT:
> > + if ((data->sample_flags & PERF_SAMPLE_WEIGHT_STRUCT) == 0)
> > + return 0;
> > if (entry->part == 1)
> > return kctx->data->weight.var1_dw;
> > if (entry->part == 2)
> > @@ -80,15 +99,25 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
> > if (entry->part == 3)
> > return kctx->data->weight.var3_w;
> > /* fall through */
> > - case PERF_SAMPLE_WEIGHT:
> > + case PBF_TERM_WEIGHT:
> > + if ((data->sample_flags & PERF_SAMPLE_WEIGHT) == 0)
> > + return 0;
> > return kctx->data->weight.full;
> > - case PERF_SAMPLE_PHYS_ADDR:
> > + case PBF_TERM_PHYS_ADDR:
> > + if ((data->sample_flags & PERF_SAMPLE_PHYS_ADDR) == 0)
> > + return 0;
> > return kctx->data->phys_addr;
> > - case PERF_SAMPLE_CODE_PAGE_SIZE:
> > + case PBF_TERM_CODE_PAGE_SIZE:
> > + if ((data->sample_flags & PERF_SAMPLE_CODE_PAGE_SIZE) == 0)
> > + return 0;
> > return kctx->data->code_page_size;
> > - case PERF_SAMPLE_DATA_PAGE_SIZE:
> > + case PBF_TERM_DATA_PAGE_SIZE:
> > + if ((data->sample_flags & PERF_SAMPLE_DATA_PAGE_SIZE) == 0)
> > + return 0;
> > return kctx->data->data_page_size;
> > - case PERF_SAMPLE_DATA_SRC:
> > + case PBF_TERM_DATA_SRC:
> > + if ((data->sample_flags & PERF_SAMPLE_DATA_SRC) == 0)
> > + return 0;
> > if (entry->part == 1)
> > return kctx->data->data_src.mem_op;
> > if (entry->part == 2)
> > --
> > 2.45.0.rc1.225.g2a3ae87e7f-goog
> >
On Fri, May 17, 2024 at 8:30 PM Ian Rogers <[email protected]> wrote:
>
> On Fri, May 17, 2024 at 6:36 PM Namhyung Kim <[email protected]> wrote:
> >
> > On Wed, May 15, 2024 at 9:20 PM Ian Rogers <[email protected]> wrote:
> > >
> > > Give the term types their own enum so that additional terms can be
> > > added that don't correspond to a PERF_SAMPLE_xx flag. The term values
> > > are numerically ascending rather than bit field positions, this means
> > > they need translating to a PERF_SAMPLE_xx bit field in certain places
> > > and they are more densely encoded.
> > >
> > > Signed-off-by: Ian Rogers <[email protected]>
> > > ---
> > [SNIP]
> > > diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
> > > index fb94f5280626..8666c85e9333 100644
> > > --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c
> > > +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
> > > @@ -48,31 +48,50 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
> > > {
> > > struct perf_sample_data___new *data = (void *)kctx->data;
> > >
> > > - if (!bpf_core_field_exists(data->sample_flags) ||
> > > - (data->sample_flags & entry->flags) == 0)
> > > + if (!bpf_core_field_exists(data->sample_flags))
> > > return 0;
> > >
> > > - switch (entry->flags) {
> > > - case PERF_SAMPLE_IP:
> > > + switch (entry->term) {
> > > + case PBF_TERM_NONE:
> > > + return 0;
> > > + case PBF_TERM_IP:
> > > + if ((data->sample_flags & PERF_SAMPLE_IP) == 0)
> > > + return 0;
> >
> > Can we check this in a single place like in the original code
> > instead of doing it in every case? I think we can keep the
> > entry->flags and check it only if it's non zero. Then uid and
> > gid will have 0 to skip.
>
> I found the old way confusing. As the flags are a bitmap it looks like
> more than one can be set, if that were the case then the switch
> statement would be broken as the case wouldn't exist. Using an enum
The entry->flags is set by the bpf-filter code and it's guaranteed
to have a single bit.
> like this allows warnings to occur when a term is missing in the
> switch statement - which is good when you are adding new terms. I
> think it more obviously matches the man page. We could arrange for the
> enum values to encode the shift position of the flag. Something like:
>
> if ((entry->term > PBF_TERM_NONE && entry->term <= PBF_TERM_DATA_SRC) &&
> (data->sample_flags & (1 << entry->term)) == 0)
> return 0;
Actually I'm ok with enum (and bit shift if needed). I'm just thinking
if it'd be nicer if we can have a check in a single place.
>
> But the problem there is that not every sample type has an enum value,
> and I'm not sure it makes sense for things like STREAM_ID. We could do
> some macro magic to reduce the verbosity like:
>
> #define SAMPLE_CASE(x) \
> case PBF_TERM_##x: \
> if ((data->sample_flags & PERF_SAMPLE_x) == 0) \
> return 0
>
> But I thought that made the code harder to read given the relatively
> small number of sample cases.
I also want to avoid the macro if possible.
Thanks,
Namhyung
> >
> > > return kctx->data->ip;
> > > - case PERF_SAMPLE_ID:
> > > + case PBF_TERM_ID:
> > > + if ((data->sample_flags & PERF_SAMPLE_ID) == 0)
> > > + return 0;
> > > return kctx->data->id;
> > > - case PERF_SAMPLE_TID:
> > > + case PBF_TERM_TID:
> > > + if ((data->sample_flags & PERF_SAMPLE_TID) == 0)
> > > + return 0;
> > > if (entry->part)
> > > return kctx->data->tid_entry.pid;
> > > else
> > > return kctx->data->tid_entry.tid;
> > > - case PERF_SAMPLE_CPU:
> > > + case PBF_TERM_CPU:
> > > + if ((data->sample_flags & PERF_SAMPLE_CPU) == 0)
> > > + return 0;
> > > return kctx->data->cpu_entry.cpu;
> > > - case PERF_SAMPLE_TIME:
> > > + case PBF_TERM_TIME:
> > > + if ((data->sample_flags & PERF_SAMPLE_TIME) == 0)
> > > + return 0;
> > > return kctx->data->time;
> > > - case PERF_SAMPLE_ADDR:
> > > + case PBF_TERM_ADDR:
> > > + if ((data->sample_flags & PERF_SAMPLE_ADDR) == 0)
> > > + return 0;
> > > return kctx->data->addr;
> > > - case PERF_SAMPLE_PERIOD:
> > > + case PBF_TERM_PERIOD:
> > > + if ((data->sample_flags & PERF_SAMPLE_PERIOD) == 0)
> > > + return 0;
> > > return kctx->data->period;
> > > - case PERF_SAMPLE_TRANSACTION:
> > > + case PBF_TERM_TRANSACTION:
> > > + if ((data->sample_flags & PERF_SAMPLE_TRANSACTION) == 0)
> > > + return 0;
> > > return kctx->data->txn;
> > > - case PERF_SAMPLE_WEIGHT_STRUCT:
> > > + case PBF_TERM_WEIGHT_STRUCT:
> > > + if ((data->sample_flags & PERF_SAMPLE_WEIGHT_STRUCT) == 0)
> > > + return 0;
> > > if (entry->part == 1)
> > > return kctx->data->weight.var1_dw;
> > > if (entry->part == 2)
> > > @@ -80,15 +99,25 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
> > > if (entry->part == 3)
> > > return kctx->data->weight.var3_w;
> > > /* fall through */
> > > - case PERF_SAMPLE_WEIGHT:
> > > + case PBF_TERM_WEIGHT:
> > > + if ((data->sample_flags & PERF_SAMPLE_WEIGHT) == 0)
> > > + return 0;
> > > return kctx->data->weight.full;
> > > - case PERF_SAMPLE_PHYS_ADDR:
> > > + case PBF_TERM_PHYS_ADDR:
> > > + if ((data->sample_flags & PERF_SAMPLE_PHYS_ADDR) == 0)
> > > + return 0;
> > > return kctx->data->phys_addr;
> > > - case PERF_SAMPLE_CODE_PAGE_SIZE:
> > > + case PBF_TERM_CODE_PAGE_SIZE:
> > > + if ((data->sample_flags & PERF_SAMPLE_CODE_PAGE_SIZE) == 0)
> > > + return 0;
> > > return kctx->data->code_page_size;
> > > - case PERF_SAMPLE_DATA_PAGE_SIZE:
> > > + case PBF_TERM_DATA_PAGE_SIZE:
> > > + if ((data->sample_flags & PERF_SAMPLE_DATA_PAGE_SIZE) == 0)
> > > + return 0;
> > > return kctx->data->data_page_size;
> > > - case PERF_SAMPLE_DATA_SRC:
> > > + case PBF_TERM_DATA_SRC:
> > > + if ((data->sample_flags & PERF_SAMPLE_DATA_SRC) == 0)
> > > + return 0;
> > > if (entry->part == 1)
> > > return kctx->data->data_src.mem_op;
> > > if (entry->part == 2)
> > > --
> > > 2.45.0.rc1.225.g2a3ae87e7f-goog
> > >