Add a new type of bpf tracepoint: sleepable tracepoints, which allow
the handler to make calls that may sleep. With sleepable tracepoints, a
set of syscall helpers (which may themselves sleep) also becomes
callable from the attached bpf programs.

In the following patches, we will whitelist some tracepoints to be
sleepable.
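
As a sketch, marking an event sleepable amounts to defining it with
the new macro instead of DEFINE_EVENT (the cgroup_mkdir event is used
here purely for illustration; the actual whitelist comes in the later
patches):

  DEFINE_EVENT_SLEEPABLE(cgroup, cgroup_mkdir,
          TP_PROTO(struct cgroup *cgrp, const char *path),
          TP_ARGS(cgrp, path));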
Signed-off-by: Hao Luo <[email protected]>
---
include/linux/bpf.h | 10 +++++++-
include/linux/tracepoint-defs.h | 1 +
include/trace/bpf_probe.h | 22 ++++++++++++++----
kernel/bpf/syscall.c | 41 +++++++++++++++++++++++----------
kernel/trace/bpf_trace.c | 5 ++++
5 files changed, 61 insertions(+), 18 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c36eeced3838..759ade7b24b3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1810,6 +1810,9 @@ struct bpf_prog *bpf_prog_by_id(u32 id);
struct bpf_link *bpf_link_by_id(u32 id);
const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
+const struct bpf_func_proto *
+tracing_prog_syscall_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
+
void bpf_task_storage_free(struct task_struct *task);
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
const struct btf_func_model *
@@ -1822,7 +1825,6 @@ struct bpf_core_ctx {
int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
int relo_idx, void *insn);
-
#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
{
@@ -2011,6 +2013,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return NULL;
}
+static inline const struct bpf_func_proto *
+tracing_prog_syscall_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return NULL;
+}
+
static inline void bpf_task_storage_free(struct task_struct *task)
{
}
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index e7c2276be33e..c73c7ab3680e 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -51,6 +51,7 @@ struct bpf_raw_event_map {
void *bpf_func;
u32 num_args;
u32 writable_size;
+ u32 sleepable;
} __aligned(32);
/*
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index 7660a7846586..4edfc6df2f52 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -88,7 +88,7 @@ __bpf_trace_##call(void *__data, proto) \
* to make sure that if the tracepoint handling changes, the
* bpf probe will fail to compile unless it too is updated.
*/
-#define __DEFINE_EVENT(template, call, proto, args, size) \
+#define __DEFINE_EVENT(template, call, proto, args, size, sleep) \
static inline void bpf_test_probe_##call(void) \
{ \
check_trace_callback_type_##call(__bpf_trace_##template); \
@@ -104,6 +104,7 @@ __section("__bpf_raw_tp_map") = { \
.bpf_func = __bpf_trace_##template, \
.num_args = COUNT_ARGS(args), \
.writable_size = size, \
+ .sleepable = sleep, \
}, \
};
@@ -123,11 +124,15 @@ static inline void bpf_test_buffer_##call(void) \
#undef DEFINE_EVENT_WRITABLE
#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \
__CHECK_WRITABLE_BUF_SIZE(call, PARAMS(proto), PARAMS(args), size) \
- __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size)
+ __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size, 0)
+
+#undef DEFINE_EVENT_SLEEPABLE
+#define DEFINE_EVENT_SLEEPABLE(template, call, proto, args) \
+ __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0, 1)
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args) \
- __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0)
+ __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0, 0)
#undef DEFINE_EVENT_PRINT
#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
@@ -136,19 +141,26 @@ static inline void bpf_test_buffer_##call(void) \
#undef DECLARE_TRACE
#define DECLARE_TRACE(call, proto, args) \
__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \
- __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0)
+ __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0, 0)
#undef DECLARE_TRACE_WRITABLE
#define DECLARE_TRACE_WRITABLE(call, proto, args, size) \
__CHECK_WRITABLE_BUF_SIZE(call, PARAMS(proto), PARAMS(args), size) \
__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \
- __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), size)
+ __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), size, 0)
+
+#undef DECLARE_TRACE_SLEEPABLE
+#define DECLARE_TRACE_SLEEPABLE(call, proto, args) \
+ __BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args)) \
+ __DEFINE_EVENT(call, call, PARAMS(proto), PARAMS(args), 0, 1)
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
#undef DECLARE_TRACE_WRITABLE
#undef DEFINE_EVENT_WRITABLE
#undef __CHECK_WRITABLE_BUF_SIZE
+#undef DECLARE_TRACE_SLEEPABLE
+#undef DEFINE_EVENT_SLEEPABLE
#undef __DEFINE_EVENT
#undef FIRST
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9e6d8d0c8af5..0a12f52fe8a9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4827,12 +4827,6 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
.arg3_type = ARG_CONST_SIZE,
};
-const struct bpf_func_proto * __weak
-tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
-{
- return bpf_base_func_proto(func_id);
-}
-
BPF_CALL_1(bpf_sys_close, u32, fd)
{
/* When bpf program calls this helper there should not be
@@ -5045,24 +5039,47 @@ const struct bpf_func_proto bpf_unlink_proto = {
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
};
-static const struct bpf_func_proto *
-syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+/* Syscall helpers that are also allowed in sleepable tracing prog. */
+const struct bpf_func_proto *
+tracing_prog_syscall_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_sys_bpf:
return &bpf_sys_bpf_proto;
- case BPF_FUNC_btf_find_by_name_kind:
- return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
return &bpf_sys_close_proto;
- case BPF_FUNC_kallsyms_lookup_name:
- return &bpf_kallsyms_lookup_name_proto;
case BPF_FUNC_mkdir:
return &bpf_mkdir_proto;
case BPF_FUNC_rmdir:
return &bpf_rmdir_proto;
case BPF_FUNC_unlink:
return &bpf_unlink_proto;
+ default:
+ return NULL;
+ }
+}
+
+const struct bpf_func_proto * __weak
+tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *fn;
+
+ fn = tracing_prog_syscall_func_proto(func_id, prog);
+ if (fn)
+ return fn;
+
+ return bpf_base_func_proto(func_id);
+}
+
+static const struct bpf_func_proto *
+syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_btf_find_by_name_kind:
+ return &bpf_btf_find_by_name_kind_proto;
+ case BPF_FUNC_kallsyms_lookup_name:
+ return &bpf_kallsyms_lookup_name_proto;
default:
return tracing_prog_func_proto(func_id, prog);
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a2024ba32a20..c816e0e0d4a0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1691,6 +1691,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
fn = raw_tp_prog_func_proto(func_id, prog);
if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
fn = bpf_iter_get_func_proto(func_id, prog);
+ if (!fn && prog->aux->sleepable)
+ fn = tracing_prog_syscall_func_proto(func_id, prog);
return fn;
}
}
@@ -2053,6 +2055,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *
if (prog->aux->max_tp_access > btp->writable_size)
return -EINVAL;
+ if (prog->aux->sleepable && !btp->sleepable)
+ return -EPERM;
+
return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
prog);
}
--
2.35.1.574.g5d30c73bfb-goog
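
[ For illustration, a sleepable bpf program attached to such a
tracepoint could look roughly like the sketch below. The "raw_tp.s"
section name is hypothetical, since the libbpf side is not part of
this patch. ]

  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char LICENSE[] SEC("license") = "GPL";

  /* hypothetical sleepable raw tracepoint program */
  SEC("raw_tp.s/cgroup_mkdir")
  int BPF_PROG(on_cgroup_mkdir, struct cgroup *cgrp, const char *path)
  {
          /* may call the sleepable syscall helpers whitelisted in
           * tracing_prog_syscall_func_proto(), e.g. bpf_mkdir() */
          return 0;
  }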
On 2/25/22 3:43 PM, Hao Luo wrote:
> Add a new type of bpf tracepoint: sleepable tracepoints, which allow
> the handler to make calls that may sleep. With sleepable tracepoints, a
> set of syscall helpers (which may themselves sleep) also becomes
> callable from the attached bpf programs.
There are some old discussions on sleepable tracepoints, maybe
worthwhile to take a look.
https://lore.kernel.org/bpf/[email protected]/T/
[...]
On Fri, Feb 25, 2022 at 03:43:34PM -0800, Hao Luo wrote:
> diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
> index e7c2276be33e..c73c7ab3680e 100644
> --- a/include/linux/tracepoint-defs.h
> +++ b/include/linux/tracepoint-defs.h
> @@ -51,6 +51,7 @@ struct bpf_raw_event_map {
> void *bpf_func;
> u32 num_args;
> u32 writable_size;
> + u32 sleepable;
It increases the size for all tracepoints.
See BPF_RAW_TP in include/asm-generic/vmlinux.lds.h
Please switch writable_size and sleepable to u16.
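
[ Concretely, the suggested layout would be roughly: ]

  struct bpf_raw_event_map {
          struct tracepoint       *tp;
          void                    *bpf_func;
          u32                     num_args;
          u16                     writable_size;
          u16                     sleepable;
  } __aligned(32);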
>
> -static const struct bpf_func_proto *
> -syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +/* Syscall helpers that are also allowed in sleepable tracing prog. */
> +const struct bpf_func_proto *
> +tracing_prog_syscall_func_proto(enum bpf_func_id func_id,
> + const struct bpf_prog *prog)
> {
> switch (func_id) {
> case BPF_FUNC_sys_bpf:
> return &bpf_sys_bpf_proto;
> - case BPF_FUNC_btf_find_by_name_kind:
> - return &bpf_btf_find_by_name_kind_proto;
> case BPF_FUNC_sys_close:
> return &bpf_sys_close_proto;
> - case BPF_FUNC_kallsyms_lookup_name:
> - return &bpf_kallsyms_lookup_name_proto;
> case BPF_FUNC_mkdir:
> return &bpf_mkdir_proto;
> case BPF_FUNC_rmdir:
> return &bpf_rmdir_proto;
> case BPF_FUNC_unlink:
> return &bpf_unlink_proto;
> + default:
> + return NULL;
> + }
> +}
If I read this correctly, the goal is to disallow find_by_name_kind
and lookup_name from sleepable tps. Why? What's the harm?
On Wed, Mar 2, 2022 at 1:23 PM Yonghong Song <[email protected]> wrote:
> [...]
> There are some old discussions on sleepable tracepoints, maybe
> worthwhile to take a look.
>
> https://lore.kernel.org/bpf/[email protected]/T/
Right. It's very much related, but obsolete too.
We don't need any of that for sleepable _raw_ tps.
I prefer to stay with the "sleepable" name as well to
match the rest of the bpf sleepable code.
In all cases it's faultable.
On 3/2/22 1:30 PM, Alexei Starovoitov wrote:
> [...]
>
> Right. It's very much related, but obsolete too.
> We don't need any of that for sleepable _raw_ tps.
> I prefer to stay with the "sleepable" name as well to
> match the rest of the bpf sleepable code.
> In all cases it's faultable.
Sounds good to me. Agree that for the bpf use case, Hao's
implementation should be enough.
On Wed, Mar 2, 2022 at 5:09 PM Yonghong Song <[email protected]> wrote:
> [...]
>
> Sounds good to me. Agree that for the bpf use case, Hao's
> implementation should be enough.
Just remembered that we can also do a trivial noinline __weak
nop function and mark it sleepable on the verifier side.
That's what we were planning to do to trace map update/delete ops
in Joe Burton's series.
Then we don't need to extend tp infra.
I'm fine whichever way. I see pros and cons in both options.
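
[ A minimal sketch of that alternative, with a made-up hook name: ]

  /* Empty hook for bpf progs to attach to. noinline + __weak keeps
   * the function and its call site from being optimized away, so a
   * (sleepable) fentry prog can attach here via the trampoline. */
  noinline __weak void bpf_trace_cgroup_mkdir(struct cgroup *cgrp)
  {
  }

  /* ... and at the site being traced: */
  bpf_trace_cgroup_mkdir(cgrp);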
On Thu, Mar 3, 2022 at 11:43 AM Hao Luo <[email protected]> wrote:
> [...]
> Joe is also cc'ed on this patchset; I will sync up with him on the
> status of the map tracing work.
>
> Alexei, do we have potentially other variants of tp? We can make the
> current u16 sleepable a flag, so we can reuse this flag later when we
> have other types of tracepoints.
When we added the ability to attach to kernel functions and mark them
as allow_error_inject, the usefulness of tracepoints and even
writable tracepoints was diminished.
If we do sleepable tracepoint, I suspect, it may be the last extension
in that area.
I guess I'm convincing myself that noinline weak nop func
is better here. Just like it's better for Joe's map tracing.
On Wed, Mar 2, 2022 at 11:41 AM Alexei Starovoitov
<[email protected]> wrote:
>
> On Fri, Feb 25, 2022 at 03:43:34PM -0800, Hao Luo wrote:
> > diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
> > index e7c2276be33e..c73c7ab3680e 100644
> > --- a/include/linux/tracepoint-defs.h
> > +++ b/include/linux/tracepoint-defs.h
> > @@ -51,6 +51,7 @@ struct bpf_raw_event_map {
> > void *bpf_func;
> > u32 num_args;
> > u32 writable_size;
> > + u32 sleepable;
>
> It increases the size for all tracepoints.
> See BPF_RAW_TP in include/asm-generic/vmlinux.lds.h
> Please switch writable_size and sleepable to u16.
No problem.
> > [...]
>
> If I read this correctly, the goal is to disallow find_by_name_kind
> and lookup_name from sleepable tps. Why? What's the harm?
A couple of thoughts; please correct me if they don't make sense. I
may be overthinking this.

1. The first reason is that I don't know of a use case for them in
tracing, so I thought I could leave them out for now and add them
later if the maintainers want them.

2. A related question: do we actually want all syscall helpers to
be available in sleepable tracing? Some helpers may re-enter the
tracepoints. For a hypothetical example, what if we call mkdir while
tracing some tracepoint in vfs_mkdir? Do we have protection for this?

Another potential problem is about lookup_name in particular:
sleepable tracepoints could be triggered by any user, so could
lookup_name leak kernel addresses to users in some way? The filesystem
helpers have some basic permission checks, so I would think they are
relatively safer.
On Wed, Mar 2, 2022 at 6:29 PM Alexei Starovoitov
<[email protected]> wrote:
> [...]
> Just remembered that we can also do a trivial noinline __weak
> nop function and mark it sleepable on the verifier side.
> That's what we were planning to do to trace map update/delete ops
> in Joe Burton's series.
> Then we don't need to extend tp infra.
> I'm fine whichever way. I see pros and cons in both options.
Joe is also cc'ed on this patchset; I will sync up with him on the
status of the map tracing work.

Alexei, do we have potentially other variants of tp? We can make the
current u16 sleepable a flag, so we can reuse this flag later when we
have other types of tracepoints.
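
[ Something like the following, with the flag name made up: ]

  #define TRACEPOINT_FLAG_SLEEPABLE       (1 << 0)

  struct bpf_raw_event_map {
          struct tracepoint       *tp;
          void                    *bpf_func;
          u32                     num_args;
          u16                     writable_size;
          u16                     flags;  /* TRACEPOINT_FLAG_SLEEPABLE, ... */
  } __aligned(32);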
On Thu, Mar 3, 2022 at 11:37 AM Hao Luo <[email protected]> wrote:
>
> [...]
> 2. A related question: do we actually want all syscall helpers to
> be available in sleepable tracing? Some helpers may re-enter the
> tracepoints. For a hypothetical example, what if we call mkdir while
> tracing some tracepoint in vfs_mkdir? Do we have protection for this?
If we go with the noinline weak nop function approach then we will
get recursion protection for free. All trampoline-powered progs have
it, both sleepable and not.
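
[ Roughly, the trampoline enter path skips the prog when it is already
running on this cpu; simplified from __bpf_prog_enter() in
kernel/bpf/trampoline.c: ]

  if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
          /* recursion detected: don't run the prog this time */
          inc_misses_counter(prog);
          return 0;
  }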
> Another potential problem is about lookup_name in particular:
> sleepable tracepoints could be triggered by any user, so could
> lookup_name leak kernel addresses to users in some way? The filesystem
> helpers have some basic permission checks, so I would think they are
> relatively safer.
The tracepoint may be triggerable by any user, but the sleepable
tp bpf prog will be loaded with cap_perfmon permissions, so it has
the rights to read anything.
So I don't see any concerns with enabling lookup_name for both
syscall bpf progs and tp progs.
On Thu, Mar 3, 2022 at 12:02 PM Alexei Starovoitov
<[email protected]> wrote:
> [...]
> When we added the ability to attach to kernel functions and mark them
> as allow_error_inject, the usefulness of tracepoints and even
> writable tracepoints was diminished.
> If we do sleepable tracepoint, I suspect, it may be the last extension
> in that area.
> I guess I'm convincing myself that noinline weak nop func
> is better here. Just like it's better for Joe's map tracing.
To add to the above... The only downside of a sleepable nop func
compared to a tp is the lack of static_branch.
So this nop call will always be there.
For map tracing and for cgroup mkdir/rmdir, the few-nanosecond
overhead of calling an empty function isn't even measurable.
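
[ That is, a tracepoint call site is guarded by a static key and is a
patched-out branch when disabled, roughly ("foo" is a placeholder):

  if (static_key_false(&__tracepoint_foo.key))
          __DO_TRACE(foo, ...);

whereas the nop-hook approach is an unconditional call that returns
immediately when nothing is attached:

  bpf_trace_foo_hook(args);
]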
On Thu, Mar 3, 2022 at 12:04 PM Alexei Starovoitov
<[email protected]> wrote:
> [...]
>
> To add to the above... The only downside of a sleepable nop func
> compared to a tp is the lack of static_branch.
> So this nop call will always be there.
> For map tracing and for cgroup mkdir/rmdir, the few-nanosecond
> overhead of calling an empty function isn't even measurable.
The overhead should be fine, I think. mkdir/rmdir won't be frequent
operations. Thanks for the explanation. Let me give it a try and
report back how it works.