When the BPF program calls bpf_cast_to_kern_ctx(), it assumes the program will
access perf sample data directly and call perf_prepare_sample() to make sure
the sample data is populated.
Signed-off-by: Namhyung Kim <[email protected]>
---
include/linux/bpf.h | 1 +
kernel/bpf/verifier.c | 1 +
kernel/events/core.c | 3 +++
3 files changed, 5 insertions(+)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5fec2d1be6d7..6bd4c21a6dd4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1341,6 +1341,7 @@ struct bpf_prog {
enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
call_get_func_ip:1, /* Do we call get_func_ip() */
+ call_cast_kctx:1, /* Do we call bpf_cast_to_kern_ctx() */
tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index faa358b3d5d7..23a9dc187292 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9236,6 +9236,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+ env->prog->call_cast_kctx = 1;
} else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
if (!ret_t || !btf_type_is_struct(ret_t)) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e47914ac8732..a654a0cb6842 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10332,6 +10332,7 @@ static void bpf_overflow_handler(struct perf_event *event,
.event = event,
};
struct bpf_prog *prog;
+ struct perf_event_header dummy;
int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
@@ -10346,6 +10347,8 @@ static void bpf_overflow_handler(struct perf_event *event,
data->callchain = perf_callchain(event, regs);
data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
}
+ if (prog->call_cast_kctx)
+ perf_prepare_sample(&dummy, data, event, regs);
ret = bpf_prog_run(prog, &ctx);
}
--
2.39.0.314.g84b9a713c41-goog
On Tue, Dec 20, 2022 at 02:01:43PM -0800, Namhyung Kim wrote:
> When the BPF program calls bpf_cast_to_kern_ctx(), it assumes the program will
> access perf sample data directly and call perf_prepare_sample() to make sure
> the sample data is populated.
I don't understand a word of this :/ What are you doing and why?
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> include/linux/bpf.h | 1 +
> kernel/bpf/verifier.c | 1 +
> kernel/events/core.c | 3 +++
> 3 files changed, 5 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5fec2d1be6d7..6bd4c21a6dd4 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1341,6 +1341,7 @@ struct bpf_prog {
> enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
> call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
> call_get_func_ip:1, /* Do we call get_func_ip() */
> + call_cast_kctx:1, /* Do we call bpf_cast_to_kern_ctx() */
> tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */
> enum bpf_prog_type type; /* Type of BPF program */
> enum bpf_attach_type expected_attach_type; /* For some prog types */
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index faa358b3d5d7..23a9dc187292 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -9236,6 +9236,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
> regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
> regs[BPF_REG_0].btf = desc_btf;
> regs[BPF_REG_0].btf_id = meta.ret_btf_id;
> + env->prog->call_cast_kctx = 1;
> } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
> ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
> if (!ret_t || !btf_type_is_struct(ret_t)) {
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index e47914ac8732..a654a0cb6842 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -10332,6 +10332,7 @@ static void bpf_overflow_handler(struct perf_event *event,
> .event = event,
> };
> struct bpf_prog *prog;
> + struct perf_event_header dummy;
> int ret = 0;
>
> ctx.regs = perf_arch_bpf_user_pt_regs(regs);
> @@ -10346,6 +10347,8 @@ static void bpf_overflow_handler(struct perf_event *event,
> data->callchain = perf_callchain(event, regs);
> data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
> }
> + if (prog->call_cast_kctx)
> + perf_prepare_sample(&dummy, data, event, regs);
>
> ret = bpf_prog_run(prog, &ctx);
> }
> --
> 2.39.0.314.g84b9a713c41-goog
>
On 12/22/22 1:55 PM, Peter Zijlstra wrote:
> On Tue, Dec 20, 2022 at 02:01:43PM -0800, Namhyung Kim wrote:
>> When the BPF program calls bpf_cast_to_kern_ctx(), it assumes the program will
>> access perf sample data directly and call perf_prepare_sample() to make sure
>> the sample data is populated.
>
> I don't understand a word of this :/ What are you doing and why?
Yeah, above commit message is too terse and unclear. Also, not following where
this assumption comes from; bpf_cast_to_kern_ctx() can be used elsewhere, too,
not just tracing. Recent example from CI side can be found [0].
Thanks,
Daniel
[0] bpf tree, 70a00e2f1dba ("selftests/bpf: Test bpf_skb_adjust_room on CHECKSUM_PARTIAL")
Hello,
On Thu, Dec 22, 2022 at 5:17 AM Daniel Borkmann <[email protected]> wrote:
>
> On 12/22/22 1:55 PM, Peter Zijlstra wrote:
> > On Tue, Dec 20, 2022 at 02:01:43PM -0800, Namhyung Kim wrote:
> >> When the BPF program calls bpf_cast_to_kern_ctx(), it assumes the program will
> >> access perf sample data directly and call perf_prepare_sample() to make sure
> >> the sample data is populated.
> >
> > I don't understand a word of this :/ What are you doing and why?
>
> Yeah, above commit message is too terse and unclear. Also, not following where
> this assumption comes from; bpf_cast_to_kern_ctx() can be used elsewhere, too,
> not just tracing. Recent example from CI side can be found [0].
Sorry about that. Let me rephrase it like below:
With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
can access perf sample data directly from the ctx. But the perf sample
data is not fully prepared at this point, and some fields can have invalid
uninitialized values. So it needs to call perf_prepare_sample() before
calling the BPF overflow handler.
But just calling perf_prepare_sample() can be costly when the BPF
doesn't access the sample data. It's needed only if the BPF program
uses the sample data. But it seems hard for the BPF verifier to detect
if it'd access perf sample data. So I just checked if it calls the
bpf_cast_to_kern_ctx() and assumed it does.
Thanks,
Namhyung
On Thu, Dec 22, 2022 at 09:34:42AM -0800, Namhyung Kim wrote:
> Sorry about that. Let me rephrase it like below:
>
> With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
> can access perf sample data directly from the ctx.
This is the bpf_prog_run() in bpf_overflow_handler(), right?
> But the perf sample
> data is not fully prepared at this point, and some fields can have invalid
> uninitialized values. So it needs to call perf_prepare_sample() before
> calling the BPF overflow handler.
It never was, why is it a problem now?
> But just calling perf_prepare_sample() can be costly when the BPF
So you potentially call it twice now, how's that useful?
On Thu, Dec 22, 2022 at 12:16 PM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Dec 22, 2022 at 09:34:42AM -0800, Namhyung Kim wrote:
>
> > Sorry about that. Let me rephrase it like below:
> >
> > With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
> > can access perf sample data directly from the ctx.
>
> This is the bpf_prog_run() in bpf_overflow_handler(), right?
Yes.
>
> > But the perf sample
> > data is not fully prepared at this point, and some fields can have invalid
> > uninitialized values. So it needs to call perf_prepare_sample() before
> > calling the BPF overflow handler.
>
> It never was, why is it a problem now?
BPF used to allow selected fields only like period and addr, and they
are initialized always by perf_sample_data_init(). This is relaxed
by the bpf_cast_to_kern_ctx() and it can easily access arbitrary
fields of perf_sample_data now.
The background of this change is to use BPF as a filter for perf
event samples. The code is there already and returning 0 from
BPF can drop perf samples. With access to more sample data,
it'd make more educated decisions.
For example, I got some requests to limit perf samples in a
selected region of address (code or data). Or it can collect
samples only if some hardware specific information is set in
the raw data like in AMD IBS. We can easily extend it to other
sample info based on users' needs.
>
> > But just calling perf_prepare_sample() can be costly when the BPF
>
> So you potentially call it twice now, how's that useful?
Right. I think we can check data->sample_flags in
perf_prepare_sample() to minimize the duplicate work.
It already does it for some fields, but misses others.
Thanks,
Namhyung
On Thu, Dec 22, 2022 at 02:25:49PM -0800, Namhyung Kim wrote:
> On Thu, Dec 22, 2022 at 12:16 PM Peter Zijlstra <[email protected]> wrote:
> >
> > On Thu, Dec 22, 2022 at 09:34:42AM -0800, Namhyung Kim wrote:
> >
> > > Sorry about that. Let me rephrase it like below:
> > >
> > > With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
> > > can access perf sample data directly from the ctx.
> >
> > This is the bpf_prog_run() in bpf_overflow_handler(), right?
>
> Yes.
>
> >
> > > But the perf sample
> > > data is not fully prepared at this point, and some fields can have invalid
> > > uninitialized values. So it needs to call perf_prepare_sample() before
> > > calling the BPF overflow handler.
> >
> > It never was, why is it a problem now?
>
> BPF used to allow selected fields only like period and addr, and they
> are initialized always by perf_sample_data_init(). This is relaxed
> by the bpf_cast_to_kern_ctx() and it can easily access arbitrary
> fields of perf_sample_data now.
>
> The background of this change is to use BPF as a filter for perf
> event samples. The code is there already and returning 0 from
> BPF can drop perf samples. With access to more sample data,
> it'd make more educated decisions.
>
> For example, I got some requests to limit perf samples in a
> selected region of address (code or data). Or it can collect
> samples only if some hardware specific information is set in
> the raw data like in AMD IBS. We can easily extend it to other
> sample info based on users' needs.
>
> >
> > > But just calling perf_prepare_sample() can be costly when the BPF
> >
> > So you potentially call it twice now, how's that useful?
>
> Right. I think we can check data->sample_flags in
> perf_prepare_sample() to minimize the duplicate work.
> It already does it for some fields, but misses others.
we used to have __PERF_SAMPLE_CALLCHAIN_EARLY to avoid extra perf_callchain,
could we add some flag like __PERF_SAMPLE_INIT_EARLY to avoid double call to
perf_prepare_sample?
jirka
>
> Thanks,
> Namhyung
On Thu, Dec 22, 2022 at 11:53 PM Jiri Olsa <[email protected]> wrote:
>
> On Thu, Dec 22, 2022 at 02:25:49PM -0800, Namhyung Kim wrote:
> > On Thu, Dec 22, 2022 at 12:16 PM Peter Zijlstra <[email protected]> wrote:
> > >
> > > On Thu, Dec 22, 2022 at 09:34:42AM -0800, Namhyung Kim wrote:
> > >
> > > > Sorry about that. Let me rephrase it like below:
> > > >
> > > > With bpf_cast_to_kern_ctx(), BPF programs attached to a perf event
> > > > can access perf sample data directly from the ctx.
> > >
> > > This is the bpf_prog_run() in bpf_overflow_handler(), right?
> >
> > Yes.
> >
> > >
> > > > But the perf sample
> > > > data is not fully prepared at this point, and some fields can have invalid
> > > > uninitialized values. So it needs to call perf_prepare_sample() before
> > > > calling the BPF overflow handler.
> > >
> > > It never was, why is it a problem now?
> >
> > BPF used to allow selected fields only like period and addr, and they
> > are initialized always by perf_sample_data_init(). This is relaxed
> > by the bpf_cast_to_kern_ctx() and it can easily access arbitrary
> > fields of perf_sample_data now.
> >
> > The background of this change is to use BPF as a filter for perf
> > event samples. The code is there already and returning 0 from
> > BPF can drop perf samples. With access to more sample data,
> > it'd make more educated decisions.
> >
> > For example, I got some requests to limit perf samples in a
> > selected region of address (code or data). Or it can collect
> > samples only if some hardware specific information is set in
> > the raw data like in AMD IBS. We can easily extend it to other
> > sample info based on users' needs.
> >
> > >
> > > > But just calling perf_prepare_sample() can be costly when the BPF
> > >
> > > So you potentially call it twice now, how's that useful?
> >
> > Right. I think we can check data->sample_flags in
> > perf_prepare_sample() to minimize the duplicate work.
> > It already does it for some fields, but misses others.
>
> we used to have __PERF_SAMPLE_CALLCHAIN_EARLY to avoid extra perf_callchain,
> could we add some flag like __PERF_SAMPLE_INIT_EARLY to avoid double call to
> perf_prepare_sample?
I think we can check if the filtered_sample_type is 0.
But it still needs to update the perf_event_header.
I think we need to save the calculated size separately.
Thanks,
Namhyung