2019-06-12 03:48:41

by Matt Mullins

Subject: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data

BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
they do not increment bpf_prog_active while executing.

This enables three levels of nesting, to support
- a kprobe or raw tp or perf event,
- another one of the above that irq context happens to call, and
- another one in nmi context
(at most one of which may be a kprobe or perf event).

Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
Signed-off-by: Matt Mullins <[email protected]>
---
v1->v2:
* reverse-Christmas-tree-ize the declarations in bpf_perf_event_output
* instantiate err more readably

I've done additional testing with the original workload that hit the
irq+raw-tp reentrancy problem, and as far as I can tell, it's still
solved with this solution (as opposed to my earlier per-map-element
version).
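
The fix follows the usual per-CPU nesting pattern: a per-CPU nest-level
counter indexing into a small per-CPU array, one slot per context (task,
irq, nmi).  Roughly, with hypothetical names (struct foo stands in for the
per-context state; the diff below applies this shape to both
perf_sample_data and pt_regs):

    struct foo_storage {
        struct foo slots[3];    /* task, irq, nmi */
    };
    static DEFINE_PER_CPU(struct foo_storage, foo_storage);
    static DEFINE_PER_CPU(int, foo_nest_level);

    static struct foo *get_foo(void)
    {
        struct foo_storage *s = this_cpu_ptr(&foo_storage);
        int level = this_cpu_inc_return(foo_nest_level);

        /* nesting deeper than task -> irq -> nmi should not happen */
        if (WARN_ON_ONCE(level > ARRAY_SIZE(s->slots))) {
            this_cpu_dec(foo_nest_level);
            return ERR_PTR(-EBUSY);
        }
        return &s->slots[level - 1];
    }

    static void put_foo(void)
    {
        this_cpu_dec(foo_nest_level);
    }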

kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++-------
1 file changed, 84 insertions(+), 16 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f92d6ad5e080..1c9a4745e596 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.arg4_type = ARG_CONST_SIZE,
};

-static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
-
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_sample_data *sd)
@@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
return perf_event_output(event, sd, regs);
}

+/*
+ * Support executing tracepoints in normal, irq, and nmi context that each call
+ * bpf_perf_event_output
+ */
+struct bpf_trace_sample_data {
+ struct perf_sample_data sds[3];
+};
+
+static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
+static DEFINE_PER_CPU(int, bpf_trace_nest_level);
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
- struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
+ struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
+ int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
struct perf_raw_record raw = {
.frag = {
.size = size,
.data = data,
},
};
+ struct perf_sample_data *sd;
+ int err;

- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
+ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ sd = &sds->sds[nest_level - 1];
+
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
+ err = -EINVAL;
+ goto out;
+ }

perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;

- return __bpf_perf_event_output(regs, map, flags, sd);
+ err = __bpf_perf_event_output(regs, map, flags, sd);
+
+out:
+ this_cpu_dec(bpf_trace_nest_level);
+ return err;
}

static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
/*
* bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
* to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
+ *
+ * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
+ * in normal, irq, and nmi context.
*/
-static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
+struct bpf_raw_tp_regs {
+ struct pt_regs regs[3];
+};
+static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
+static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
+static struct pt_regs *get_bpf_raw_tp_regs(void)
+{
+ struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
+
+ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+ this_cpu_dec(bpf_raw_tp_nest_level);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return &tp_regs->regs[nest_level - 1];
+}
+
+static void put_bpf_raw_tp_regs(void)
+{
+ this_cpu_dec(bpf_raw_tp_nest_level);
+}
+
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);

perf_fetch_caller_regs(regs);
- return ____bpf_perf_event_output(regs, map, flags, data, size);
+ ret = ____bpf_perf_event_output(regs, map, flags, data, size);
+
+ put_bpf_raw_tp_regs();
+ return ret;
}

static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
@@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);

perf_fetch_caller_regs(regs);
/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
- return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
- flags, 0, 0);
+ ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+ flags, 0, 0);
+ put_bpf_raw_tp_regs();
+ return ret;
}

static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
@@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
void *, buf, u32, size, u64, flags)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);

perf_fetch_caller_regs(regs);
- return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
- (unsigned long) size, flags, 0);
+ ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+ put_bpf_raw_tp_regs();
+ return ret;
}

static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
--
2.17.1


2019-06-12 07:59:58

by Andrii Nakryiko

Subject: Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data

On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <[email protected]> wrote:
>
> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> they do not increment bpf_prog_active while executing.
>
> This enables three levels of nesting, to support
> - a kprobe or raw tp or perf event,
> - another one of the above that irq context happens to call, and
> - another one in nmi context
> (at most one of which may be a kprobe or perf event).
>
> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> Signed-off-by: Matt Mullins <[email protected]>
> ---

LGTM, minor nit below.

Acked-by: Andrii Nakryiko <[email protected]>

> v1->v2:
> * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output
> * instantiate err more readably
>
> I've done additional testing with the original workload that hit the
> irq+raw-tp reentrancy problem, and as far as I can tell, it's still
> solved with this solution (as opposed to my earlier per-map-element
> version).
>
> kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++-------
> 1 file changed, 84 insertions(+), 16 deletions(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index f92d6ad5e080..1c9a4745e596 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
> .arg4_type = ARG_CONST_SIZE,
> };
>
> -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
> -
> static __always_inline u64
> __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
> u64 flags, struct perf_sample_data *sd)
> @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
> return perf_event_output(event, sd, regs);
> }
>
> +/*
> + * Support executing tracepoints in normal, irq, and nmi context that each call
> + * bpf_perf_event_output
> + */
> +struct bpf_trace_sample_data {
> + struct perf_sample_data sds[3];
> +};
> +
> +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
> +static DEFINE_PER_CPU(int, bpf_trace_nest_level);
> BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
> u64, flags, void *, data, u64, size)
> {
> - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
> + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
> + int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
> struct perf_raw_record raw = {
> .frag = {
> .size = size,
> .data = data,
> },
> };
> + struct perf_sample_data *sd;
> + int err;
>
> - if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
> - return -EINVAL;
> + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
> + err = -EBUSY;
> + goto out;
> + }
> +
> + sd = &sds->sds[nest_level - 1];
> +
> + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
> + err = -EINVAL;
> + goto out;
> + }

Feel free to ignore, but just stylistically, given this check doesn't
depend on sd, I'd move it either to the very top or right after
`nest_level > ARRAY_SIZE(sds->sds)` check, so that all the error
checking is grouped without interspersed assignment.
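
For concreteness, the grouping I have in mind would be roughly (untested,
just to illustrate the ordering; same checks and error codes as in your
patch):

    if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
        err = -EBUSY;
        goto out;
    }

    if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
        err = -EINVAL;
        goto out;
    }

    sd = &sds->sds[nest_level - 1];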

>
> perf_sample_data_init(sd, 0, 0);
> sd->raw = &raw;
>
> - return __bpf_perf_event_output(regs, map, flags, sd);
> + err = __bpf_perf_event_output(regs, map, flags, sd);
> +
> +out:
> + this_cpu_dec(bpf_trace_nest_level);
> + return err;
> }

2019-06-13 22:49:04

by Daniel Borkmann

Subject: Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data

On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
> On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <[email protected]> wrote:
>>
>> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
>> they do not increment bpf_prog_active while executing.
>>
>> This enables three levels of nesting, to support
>> - a kprobe or raw tp or perf event,
>> - another one of the above that irq context happens to call, and
>> - another one in nmi context
>> (at most one of which may be a kprobe or perf event).
>>
>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")

Generally, looks good to me. Two things below:

Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
instead of the one you currently have?

One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().

Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").

1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
performance overhead (and desire to not miss events as a result of nesting)? (This
also means we're not protected by bpf_prog_active in all the map ops, of course.)
2) Wouldn't this also mean that we only need to fix the raw tp programs via
get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
the rest which relies upon trace_call_bpf()? I'm probably missing something, but
given they have separate pt_regs there, how could they be affected then?
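
(For reference, the bail-out in trace_call_bpf() that I mean is the usual
per-CPU recursion guard, essentially (slightly paraphrased):

    if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
        /* a bpf prog is already running on this CPU, skip */
        ret = 0;
        goto out;
    }
    ...
out:
    __this_cpu_dec(bpf_prog_active);

so everything dispatched via trace_call_bpf() cannot nest on the same CPU,
while __bpf_trace_run() has no such counter.)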

Thanks,
Daniel

>> Signed-off-by: Matt Mullins <[email protected]>
>> ---
>
> LGTM, minor nit below.
>
> Acked-by: Andrii Nakryiko <[email protected]>
>
>> v1->v2:
>> * reverse-Christmas-tree-ize the declarations in bpf_perf_event_output
>> * instantiate err more readably
>>
>> I've done additional testing with the original workload that hit the
>> irq+raw-tp reentrancy problem, and as far as I can tell, it's still
>> solved with this solution (as opposed to my earlier per-map-element
>> version).
>>
>> kernel/trace/bpf_trace.c | 100 ++++++++++++++++++++++++++++++++-------
>> 1 file changed, 84 insertions(+), 16 deletions(-)
>>
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index f92d6ad5e080..1c9a4745e596 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
>> .arg4_type = ARG_CONST_SIZE,
>> };
>>
>> -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
>> -
>> static __always_inline u64
>> __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
>> u64 flags, struct perf_sample_data *sd)
>> @@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
>> return perf_event_output(event, sd, regs);
>> }
>>
>> +/*
>> + * Support executing tracepoints in normal, irq, and nmi context that each call
>> + * bpf_perf_event_output
>> + */
>> +struct bpf_trace_sample_data {
>> + struct perf_sample_data sds[3];
>> +};
>> +
>> +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
>> +static DEFINE_PER_CPU(int, bpf_trace_nest_level);
>> BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
>> u64, flags, void *, data, u64, size)
>> {
>> - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
>> + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
>> + int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
>> struct perf_raw_record raw = {
>> .frag = {
>> .size = size,
>> .data = data,
>> },
>> };
>> + struct perf_sample_data *sd;
>> + int err;
>>
>> - if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
>> - return -EINVAL;
>> + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
>> + err = -EBUSY;
>> + goto out;
>> + }
>> +
>> + sd = &sds->sds[nest_level - 1];
>> +
>> + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
>> + err = -EINVAL;
>> + goto out;
>> + }
>
> Feel free to ignore, but just stylistically, given this check doesn't
> depend on sd, I'd move it either to the very top or right after
> `nest_level > ARRAY_SIZE(sds->sds)` check, so that all the error
> checking is grouped without interspersed assignment.

Makes sense.

>> perf_sample_data_init(sd, 0, 0);
>> sd->raw = &raw;
>>
>> - return __bpf_perf_event_output(regs, map, flags, sd);
>> + err = __bpf_perf_event_output(regs, map, flags, sd);
>> +
>> +out:
>> + this_cpu_dec(bpf_trace_nest_level);
>> + return err;
>> }

2019-06-14 00:53:13

by Matt Mullins

Subject: Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data

On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
> On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
> > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <[email protected]> wrote:
> > >
> > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> > > they do not increment bpf_prog_active while executing.
> > >
> > > This enables three levels of nesting, to support
> > > - a kprobe or raw tp or perf event,
> > > - another one of the above that irq context happens to call, and
> > > - another one in nmi context
> > > (at most one of which may be a kprobe or perf event).
> > >
> > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
>
> Generally, looks good to me. Two things below:
>
> Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
> instead of the one you currently have?

Ah, yeah, that's probably more reasonable; I haven't managed to come up
with a scenario where one could hit this without raw tracepoints. I'll
fix up the nits that've accumulated since v2.

> One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
>
> Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
> counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
> the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
>
> 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
> the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
> performance overhead (and desire to not miss events as a result of nesting)? (This
> also means we're not protected by bpf_prog_active in all the map ops, of course.)
> 2) Wouldn't this also mean that we only need to fix the raw tp programs via
> get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
> the rest which relies upon trace_call_bpf()? I'm probably missing something, but
> given they have separate pt_regs there, how could they be affected then?

For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
the _raw_tp variants. However, consider the following nesting:

                                       trace_nest_level   raw_tp_nest_level
(kprobe) bpf_perf_event_output                  1                  0
(raw_tp) bpf_perf_event_output_raw_tp           2                  1
(raw_tp) bpf_get_stackid_raw_tp                 2                  2

I need to increment a nest level (and ideally increment it only once)
between the kprobe and the first raw_tp, because they would otherwise
share the struct perf_sample_data. But I also need to increment a nest
level between the two raw_tps, since they share the pt_regs -- I can't
use trace_nest_level for everything because it's not used by
get_stackid, and I can't use raw_tp_nest_level for everything because
it's not incremented by kprobes.

If raw tracepoints were to bump bpf_prog_active, then I could get away
with just using that count in these callsites -- I'm reluctant to do
that, though, since it would prevent kprobes from ever running inside a
raw_tp. I'd like to retain the ability to (e.g.)
trace.py -K htab_map_update_elem
and get some stack traces from at least within raw tracepoints.

That said, as I wrote up this example, bpf_trace_nest_level seems to be
wildly misnamed; I should name those after the structure they're
protecting...

2019-06-14 00:57:28

by Alexei Starovoitov

Subject: Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data

On Thu, Jun 13, 2019 at 5:52 PM Matt Mullins <[email protected]> wrote:
>
> On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
> > On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
> > > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <[email protected]> wrote:
> > > >
> > > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> > > > they do not increment bpf_prog_active while executing.
> > > >
> > > > This enables three levels of nesting, to support
> > > > - a kprobe or raw tp or perf event,
> > > > - another one of the above that irq context happens to call, and
> > > > - another one in nmi context
> > > > (at most one of which may be a kprobe or perf event).
> > > >
> > > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> >
> > Generally, looks good to me. Two things below:
> >
> > Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
> > instead of the one you currently have?
>
> Ah, yeah, that's probably more reasonable; I haven't managed to come up
> with a scenario where one could hit this without raw tracepoints. I'll
> fix up the nits that've accumulated since v2.
>
> > One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
> >
> > Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
> > counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
> > the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
> >
> > 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
> > the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
> > performance overhead (and desire to not miss events as a result of nesting)? (This
> > also means we're not protected by bpf_prog_active in all the map ops, of course.)
> > 2) Wouldn't this also mean that we only need to fix the raw tp programs via
> > get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
> > the rest which relies upon trace_call_bpf()? I'm probably missing something, but
> > given they have separate pt_regs there, how could they be affected then?
>
> For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
> the _raw_tp variants. However, consider the following nesting:
>
>                                        trace_nest_level   raw_tp_nest_level
> (kprobe) bpf_perf_event_output                  1                  0
> (raw_tp) bpf_perf_event_output_raw_tp           2                  1
> (raw_tp) bpf_get_stackid_raw_tp                 2                  2
>
> I need to increment a nest level (and ideally increment it only once)
> between the kprobe and the first raw_tp, because they would otherwise
> share the struct perf_sample_data. But I also need to increment a nest
> level between the two raw_tps, since they share the pt_regs -- I can't
> use trace_nest_level for everything because it's not used by
> get_stackid, and I can't use raw_tp_nest_level for everything because
> it's not incremented by kprobes.
>
> If raw tracepoints were to bump bpf_prog_active, then I could get away
> with just using that count in these callsites -- I'm reluctant to do
> that, though, since it would prevent kprobes from ever running inside a
> raw_tp. I'd like to retain the ability to (e.g.)
> trace.py -K htab_map_update_elem
> and get some stack traces from at least within raw tracepoints.
>
> That said, as I wrote up this example, bpf_trace_nest_level seems to be
> wildly misnamed; I should name those after the structure they're
> protecting...

I still don't get what's wrong with the previous approach.
Didn't I manage to convince both of you that perf_sample_data
inside bpf_perf_event_array doesn't have any issue that Daniel brought up?
I think this refcnting approach is inferior.

2019-06-14 14:51:22

by Daniel Borkmann

Subject: Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data

On 06/14/2019 02:51 AM, Matt Mullins wrote:
> On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
>> On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
>>> On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <[email protected]> wrote:
>>>>
>>>> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
>>>> they do not increment bpf_prog_active while executing.
>>>>
>>>> This enables three levels of nesting, to support
>>>> - a kprobe or raw tp or perf event,
>>>> - another one of the above that irq context happens to call, and
>>>> - another one in nmi context
>>>> (at most one of which may be a kprobe or perf event).
>>>>
>>>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
>>
>> Generally, looks good to me. Two things below:
>>
>> Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
>> instead of the one you currently have?
>
> Ah, yeah, that's probably more reasonable; I haven't managed to come up
> with a scenario where one could hit this without raw tracepoints. I'll
> fix up the nits that've accumulated since v2.
>
>> One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
>>
>> Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
>> counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
>> the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
>>
>> 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
>> the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
>> performance overhead (and desire to not miss events as a result of nesting)? (This
>> also means we're not protected by bpf_prog_active in all the map ops, of course.)
>> 2) Wouldn't this also mean that we only need to fix the raw tp programs via
>> get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
>> the rest which relies upon trace_call_bpf()? I'm probably missing something, but
>> given they have separate pt_regs there, how could they be affected then?
>
> For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
> the _raw_tp variants. However, consider the following nesting:
>
>                                        trace_nest_level   raw_tp_nest_level
> (kprobe) bpf_perf_event_output                  1                  0
> (raw_tp) bpf_perf_event_output_raw_tp           2                  1
> (raw_tp) bpf_get_stackid_raw_tp                 2                  2
>
> I need to increment a nest level (and ideally increment it only once)
> between the kprobe and the first raw_tp, because they would otherwise
> share the struct perf_sample_data. But I also need to increment a nest

I'm not sure I follow on this one: the former would still keep using
bpf_trace_sd as-is today, since only ever /one/ can be active on a given CPU
as we otherwise bail out in trace_call_bpf() due to the bpf_prog_active counter.
Given these two are /not/ shared, you only need the code you have below for
nesting to deal with the raw_tps via get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs(),
which should also simplify the code quite a bit.

> level between the two raw_tps, since they share the pt_regs -- I can't
> use trace_nest_level for everything because it's not used by
> get_stackid, and I can't use raw_tp_nest_level for everything because
> it's not incremented by kprobes.

(See above wrt kprobes.)

> If raw tracepoints were to bump bpf_prog_active, then I could get away
> with just using that count in these callsites -- I'm reluctant to do
> that, though, since it would prevent kprobes from ever running inside a
> raw_tp. I'd like to retain the ability to (e.g.)
> trace.py -K htab_map_update_elem
> and get some stack traces from at least within raw tracepoints.
>
> That said, as I wrote up this example, bpf_trace_nest_level seems to be
> wildly misnamed; I should name those after the structure they're
> protecting...

2019-06-14 15:17:00

by Daniel Borkmann

Subject: Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data

On 06/14/2019 02:55 AM, Alexei Starovoitov wrote:
> On Thu, Jun 13, 2019 at 5:52 PM Matt Mullins <[email protected]> wrote:
>> On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
>>> On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
>>>> On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <[email protected]> wrote:
>>>>>
>>>>> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
>>>>> they do not increment bpf_prog_active while executing.
>>>>>
>>>>> This enables three levels of nesting, to support
>>>>> - a kprobe or raw tp or perf event,
>>>>> - another one of the above that irq context happens to call, and
>>>>> - another one in nmi context
>>>>> (at most one of which may be a kprobe or perf event).
>>>>>
>>>>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
>>>
>>> Generally, looks good to me. Two things below:
>>>
>>> Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
>>> instead of the one you currently have?
>>
>> Ah, yeah, that's probably more reasonable; I haven't managed to come up
>> with a scenario where one could hit this without raw tracepoints. I'll
>> fix up the nits that've accumulated since v2.
>>
>>> One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
>>>
>>> Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
>>> counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
>>> the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
>>>
>>> 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
>>> the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
>>> performance overhead (and desire to not miss events as a result of nesting)? (This
>>> also means we're not protected by bpf_prog_active in all the map ops, of course.)
>>> 2) Wouldn't this also mean that we only need to fix the raw tp programs via
>>> get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
>>> the rest which relies upon trace_call_bpf()? I'm probably missing something, but
>>> given they have separate pt_regs there, how could they be affected then?
>>
>> For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
>> the _raw_tp variants. However, consider the following nesting:
>>
>>                                        trace_nest_level   raw_tp_nest_level
>> (kprobe) bpf_perf_event_output                  1                  0
>> (raw_tp) bpf_perf_event_output_raw_tp           2                  1
>> (raw_tp) bpf_get_stackid_raw_tp                 2                  2
>>
>> I need to increment a nest level (and ideally increment it only once)
>> between the kprobe and the first raw_tp, because they would otherwise
>> share the struct perf_sample_data. But I also need to increment a nest
>> level between the two raw_tps, since they share the pt_regs -- I can't
>> use trace_nest_level for everything because it's not used by
>> get_stackid, and I can't use raw_tp_nest_level for everything because
>> it's not incremented by kprobes.
>>
>> If raw tracepoints were to bump bpf_prog_active, then I could get away
>> with just using that count in these callsites -- I'm reluctant to do
>> that, though, since it would prevent kprobes from ever running inside a
>> raw_tp. I'd like to retain the ability to (e.g.)
>> trace.py -K htab_map_update_elem
>> and get some stack traces from at least within raw tracepoints.
>>
>> That said, as I wrote up this example, bpf_trace_nest_level seems to be
>> wildly misnamed; I should name those after the structure they're
>> protecting...
>
> I still don't get what's wrong with the previous approach.
> Didn't I manage to convince both of you that perf_sample_data
> inside bpf_perf_event_array doesn't have any issue that Daniel brought up?
> I think this refcnting approach is inferior.

Hm, but looking at the perf RB handling code, it can deal with the nesting
situation just fine. I think this was your main concern in the prior email:

because I suspect that 'struct bpf_event_entry' is not reentrable
(even w/o issues with 'struct perf_sample_data').

Even if we always use 'struct perf_sample_data' on stack, I suspect
the same 'struct bpf_event_entry' cannot be reused in the nested way.

Check perf_output_{get,put}_handle() and the way it updates the head pointer
and does the final propagation to user_page. So if it's designed to handle
these situations, then bailing out doesn't make sense from the BPF side.

2019-06-14 17:27:54

by Matt Mullins

Subject: Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data

On Fri, 2019-06-14 at 16:50 +0200, Daniel Borkmann wrote:
> On 06/14/2019 02:51 AM, Matt Mullins wrote:
> > On Fri, 2019-06-14 at 00:47 +0200, Daniel Borkmann wrote:
> > > On 06/12/2019 07:00 AM, Andrii Nakryiko wrote:
> > > > On Tue, Jun 11, 2019 at 8:48 PM Matt Mullins <[email protected]> wrote:
> > > > >
> > > > > BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> > > > > they do not increment bpf_prog_active while executing.
> > > > >
> > > > > This enables three levels of nesting, to support
> > > > > - a kprobe or raw tp or perf event,
> > > > > - another one of the above that irq context happens to call, and
> > > > > - another one in nmi context
> > > > > (at most one of which may be a kprobe or perf event).
> > > > >
> > > > > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> > >
> > > Generally, looks good to me. Two things below:
> > >
> > > Nit, for stable, shouldn't fixes tag be c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT")
> > > instead of the one you currently have?
> >
> > Ah, yeah, that's probably more reasonable; I haven't managed to come up
> > with a scenario where one could hit this without raw tracepoints. I'll
> > fix up the nits that've accumulated since v2.
> >
> > > One more question / clarification: we have __bpf_trace_run() vs trace_call_bpf().
> > >
> > > Only raw tracepoints can be nested since the rest has the bpf_prog_active per-CPU
> > > counter via trace_call_bpf() and would bail out otherwise, iiuc. And raw ones use
> > > the __bpf_trace_run() added in c4f6699dfcb8 ("bpf: introduce BPF_RAW_TRACEPOINT").
> > >
> > > 1) I tried to recall and find a rationale for mentioned trace_call_bpf() split in
> > > the c4f6699dfcb8 log, but couldn't find any. Is the raison d'être purely because of
> > > performance overhead (and desire to not miss events as a result of nesting)? (This
> > > also means we're not protected by bpf_prog_active in all the map ops, of course.)
> > > 2) Wouldn't this also mean that we only need to fix the raw tp programs via
> > > get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs() and won't need this duplication for
> > > the rest which relies upon trace_call_bpf()? I'm probably missing something, but
> > > given they have separate pt_regs there, how could they be affected then?
> >
> > For the pt_regs, you're correct: I only used get/put_raw_tp_regs for
> > the _raw_tp variants. However, consider the following nesting:
> >
> >                                        trace_nest_level   raw_tp_nest_level
> > (kprobe) bpf_perf_event_output                  1                  0
> > (raw_tp) bpf_perf_event_output_raw_tp           2                  1
> > (raw_tp) bpf_get_stackid_raw_tp                 2                  2
> >
> > I need to increment a nest level (and ideally increment it only once)
> > between the kprobe and the first raw_tp, because they would otherwise
> > share the struct perf_sample_data. But I also need to increment a nest
>
> I'm not sure I follow on this one: the former would still keep using the
> bpf_trace_sd as-is today since only ever /one/ can be active on a given CPU
> as we otherwise bail out in trace_call_bpf() due to bpf_prog_active counter.
> Given these two are /not/ shared, you only need the code you have below for
> nesting to deal with the raw_tps via get_bpf_raw_tp_regs() / put_bpf_raw_tp_regs()
> which should also simplify the code quite a bit.

bpf_perf_event_output_raw_tp calls ____bpf_perf_event_output, so it
currently shares bpf_trace_sd with kprobes -- it _can_ be nested.
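
That is, before this patch the raw_tp variant was just (this is the code the
diff removes):

    BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
               struct bpf_map *, map, u64, flags, void *, data, u64, size)
    {
        struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);

        perf_fetch_caller_regs(regs);
        /* ____bpf_perf_event_output() is the body of bpf_perf_event_output,
         * which used the single per-CPU bpf_trace_sd
         */
        return ____bpf_perf_event_output(regs, map, flags, data, size);
    }

so a raw_tp firing inside a kprobe's bpf_perf_event_output would have reused
the same perf_sample_data.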

>
> > level between the two raw_tps, since they share the pt_regs -- I can't
> > use trace_nest_level for everything because it's not used by
> > get_stackid, and I can't use raw_tp_nest_level for everything because
> > it's not incremented by kprobes.
>
> (See above wrt kprobes.)
>
> > If raw tracepoints were to bump bpf_prog_active, then I could get away
> > with just using that count in these callsites -- I'm reluctant to do
> > that, though, since it would prevent kprobes from ever running inside a
> > raw_tp. I'd like to retain the ability to (e.g.)
> > trace.py -K htab_map_update_elem
> > and get some stack traces from at least within raw tracepoints.
> >
> > That said, as I wrote up this example, bpf_trace_nest_level seems to be
> > wildly misnamed; I should name those after the structure they're
> > protecting...

2019-06-15 23:35:58

by Alexei Starovoitov

Subject: Re: [PATCH bpf v2] bpf: fix nested bpf tracepoints with per-cpu data

On Tue, Jun 11, 2019 at 2:54 PM Matt Mullins <[email protected]> wrote:
>
> BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
> they do not increment bpf_prog_active while executing.
>
> This enables three levels of nesting, to support
> - a kprobe or raw tp or perf event,
> - another one of the above that irq context happens to call, and
> - another one in nmi context
> (at most one of which may be a kprobe or perf event).
>
> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> Signed-off-by: Matt Mullins <[email protected]>

Applied. Thanks