LinuxLists.cc - [PATCH v2] cgroup/bpf: fast path for not loaded skb BPF filtering

2021-12-11 19:19:09

Subject: [PATCH v2] cgroup/bpf: fast path for not loaded skb BPF filtering

The cgroup_bpf_enabled_key static key guards against overhead in cases where
no cgroup bpf program of a specific type is loaded in any cgroup. It turns
out that's not always good enough, e.g. when there are many cgroups but
the ones that we're interested in are without bpf. It's seen in server
environments, but the problem seems to be even wider as apparently
systemd loads some BPF affecting my laptop.

Profiles for small packet or zerocopy transmissions over fast network
show __cgroup_bpf_run_filter_skb() taking 2-3%, 1% of which is from
migrate_disable/enable(), and similarly on the receiving side. Also
got +4-5% of t-put for local testing.

Signed-off-by: Pavel Begunkov <[email protected]>
---

v2: replace bitmask approach with empty_prog_array (suggested by Martin)

include/linux/bpf-cgroup.h | 24 +++++++++++++++++++++---
include/linux/bpf.h | 13 +++++++++++++
kernel/bpf/cgroup.c | 18 ++----------------
kernel/bpf/core.c | 12 ++----------
4 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 11820a430d6c..793e4f65ccb5 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -219,11 +219,28 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
void *value, u64 flags);

+static inline bool
+__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
+ enum cgroup_bpf_attach_type type)
+{
+ struct bpf_prog_array *array = rcu_access_pointer(cgrp_bpf->effective[type]);
+
+ return array == &empty_prog_array.hdr;
+}
+
+#define CGROUP_BPF_TYPE_ENABLED(sk, atype) \
+({ \
+ struct cgroup *__cgrp = sock_cgroup_ptr(&(sk)->sk_cgrp_data); \
+ \
+ !__cgroup_bpf_prog_array_is_empty(&__cgrp->bpf, (atype)); \
+})
+
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
({ \
int __ret = 0; \
- if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \
+ if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && sk && \
+ CGROUP_BPF_TYPE_ENABLED((sk), CGROUP_INET_INGRESS)) \
__ret = __cgroup_bpf_run_filter_skb(sk, skb, \
CGROUP_INET_INGRESS); \
\
@@ -235,9 +252,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
int __ret = 0; \
if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \
typeof(sk) __sk = sk_to_full_sk(sk); \
- if (sk_fullsock(__sk)) \
+ if (sk_fullsock(__sk) && \
+ CGROUP_BPF_TYPE_ENABLED(__sk, CGROUP_INET_EGRESS)) \
__ret = __cgroup_bpf_run_filter_skb(__sk, skb, \
- CGROUP_INET_EGRESS); \
+ CGROUP_INET_EGRESS); \
} \
__ret; \
})
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e7a163a3146b..4a081065b77d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1161,6 +1161,19 @@ struct bpf_prog_array {
struct bpf_prog_array_item items[];
};

+struct bpf_empty_prog_array {
+ struct bpf_prog_array hdr;
+ struct bpf_prog *null_prog;
+};
+
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+extern struct bpf_empty_prog_array empty_prog_array;
+
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
void bpf_prog_array_free(struct bpf_prog_array *progs);
int bpf_prog_array_length(struct bpf_prog_array *progs);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 43eb3501721b..99e85f44e257 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1354,20 +1354,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
}

#ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
- enum cgroup_bpf_attach_type attach_type)
-{
- struct bpf_prog_array *prog_array;
- bool empty;
-
- rcu_read_lock();
- prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
- empty = bpf_prog_array_is_empty(prog_array);
- rcu_read_unlock();
-
- return empty;
-}
-
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
struct bpf_sockopt_buf *buf)
{
@@ -1430,7 +1416,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
* attached to the hook so we don't waste time allocating
* memory and locking the socket.
*/
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
+ if (__cgroup_bpf_prog_array_is_empty(&cgrp->bpf, CGROUP_SETSOCKOPT))
return 0;

/* Allocate a bit more than the initial user buffer for
@@ -1526,7 +1512,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
* attached to the hook so we don't waste time allocating
* memory and locking the socket.
*/
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
+ if (__cgroup_bpf_prog_array_is_empty(&cgrp->bpf, CGROUP_GETSOCKOPT))
return retval;

ctx.optlen = max_optlen;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2405e39d800f..fedc7b44a1a9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1967,18 +1967,10 @@ static struct bpf_prog_dummy {
},
};

-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
- struct bpf_prog_array hdr;
- struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array empty_prog_array = {
.null_prog = NULL,
};
+EXPORT_SYMBOL(empty_prog_array);

struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
--
2.34.0

2021-12-14 07:27:44

by Martin KaFai Lau

[permalink] [raw]

Subject: Re: [PATCH v2] cgroup/bpf: fast path for not loaded skb BPF filtering

On Sat, Dec 11, 2021 at 07:17:49PM +0000, Pavel Begunkov wrote:
> cgroup_bpf_enabled_key static key guards from overhead in cases where
> no cgroup bpf program of a specific type is loaded in any cgroup. Turn
> out that's not always good enough, e.g. when there are many cgroups but
> ones that we're interesting in are without bpf. It's seen in server
> environments, but the problem seems to be even wider as apparently
> systemd loads some BPF affecting my laptop.
>
> Profiles for small packet or zerocopy transmissions over fast network
> show __cgroup_bpf_run_filter_skb() taking 2-3%, 1% of which is from
> migrate_disable/enable(), and similarly on the receiving side. Also
> got +4-5% of t-put for local testing.
What is t-put? throughput?

Local testing means sending to lo/dummy?

[ ... ]

> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 11820a430d6c..793e4f65ccb5 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -219,11 +219,28 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> void *value, u64 flags);
>
> +static inline bool
> +__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
> + enum cgroup_bpf_attach_type type)
Lets remove this.

> +{
> + struct bpf_prog_array *array = rcu_access_pointer(cgrp_bpf->effective[type]);
> +
> + return array == &empty_prog_array.hdr;
> +}
> +
> +#define CGROUP_BPF_TYPE_ENABLED(sk, atype) \
and change cgroup.c to directly use this instead, so
everywhere holding a fullsock sk will use this instead
of having two helpers for empty check.

[ ... ]

> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 2405e39d800f..fedc7b44a1a9 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1967,18 +1967,10 @@ static struct bpf_prog_dummy {
> },
> };
>
> -/* to avoid allocating empty bpf_prog_array for cgroups that
> - * don't have bpf program attached use one global 'empty_prog_array'
> - * It will not be modified the caller of bpf_prog_array_alloc()
> - * (since caller requested prog_cnt == 0)
> - * that pointer should be 'freed' by bpf_prog_array_free()
> - */
> -static struct {
> - struct bpf_prog_array hdr;
> - struct bpf_prog *null_prog;
> -} empty_prog_array = {
> +struct bpf_empty_prog_array empty_prog_array = {
> .null_prog = NULL,
> };
> +EXPORT_SYMBOL(empty_prog_array);
nit. Since it is exported, maybe prefix it with 'bpf_'.

2021-12-14 11:40:35

by Pavel Begunkov

[permalink] [raw]

Subject: Re: [PATCH v2] cgroup/bpf: fast path for not loaded skb BPF filtering

On 12/14/21 07:27, Martin KaFai Lau wrote:
> On Sat, Dec 11, 2021 at 07:17:49PM +0000, Pavel Begunkov wrote:
>> cgroup_bpf_enabled_key static key guards from overhead in cases where
>> no cgroup bpf program of a specific type is loaded in any cgroup. Turn
>> out that's not always good enough, e.g. when there are many cgroups but
>> ones that we're interesting in are without bpf. It's seen in server
>> environments, but the problem seems to be even wider as apparently
>> systemd loads some BPF affecting my laptop.
>>
>> Profiles for small packet or zerocopy transmissions over fast network
>> show __cgroup_bpf_run_filter_skb() taking 2-3%, 1% of which is from
>> migrate_disable/enable(), and similarly on the receiving side. Also
>> got +4-5% of t-put for local testing.
> What is t-put? throughput?

yes

> Local testing means sending to lo/dummy?

yes, it was dummy specifically

>
> [ ... ]
>
>> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
>> index 11820a430d6c..793e4f65ccb5 100644
>> --- a/include/linux/bpf-cgroup.h
>> +++ b/include/linux/bpf-cgroup.h
>> @@ -219,11 +219,28 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
>> int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
>> void *value, u64 flags);
>>
>> +static inline bool
>> +__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
>> + enum cgroup_bpf_attach_type type)
> Lets remove this.
>
>> +{
>> + struct bpf_prog_array *array = rcu_access_pointer(cgrp_bpf->effective[type]);
>> +
>> + return array == &empty_prog_array.hdr;
>> +}
>> +
>> +#define CGROUP_BPF_TYPE_ENABLED(sk, atype) \
> and change cgroup.c to directly use this instead, so
> everywhere holding a fullsock sk will use this instead
> of having two helpers for empty check.

Why? CGROUP_BPF_TYPE_ENABLED can't be a function atm because of header
dependency hell, and so it'd kill some of the type checking, which doesn't
add clarity. It also imposes some extra overhead on the *sockopt paths
that currently use the first helper directly.

I think it's better with two of them. I could inline the second
one, but it wouldn't have been pretty.

>
> [ ... ]
>
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 2405e39d800f..fedc7b44a1a9 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -1967,18 +1967,10 @@ static struct bpf_prog_dummy {
>> },
>> };
>>
>> -/* to avoid allocating empty bpf_prog_array for cgroups that
>> - * don't have bpf program attached use one global 'empty_prog_array'
>> - * It will not be modified the caller of bpf_prog_array_alloc()
>> - * (since caller requested prog_cnt == 0)
>> - * that pointer should be 'freed' by bpf_prog_array_free()
>> - */
>> -static struct {
>> - struct bpf_prog_array hdr;
>> - struct bpf_prog *null_prog;
>> -} empty_prog_array = {
>> +struct bpf_empty_prog_array empty_prog_array = {
>> .null_prog = NULL,
>> };
>> +EXPORT_SYMBOL(empty_prog_array);
> nit. Since it is exported, may be prefix it with 'bpf_'.

yeah, sure

--
Pavel Begunkov

2021-12-14 19:14:55

by Martin KaFai Lau

[permalink] [raw]

Subject: Re: [PATCH v2] cgroup/bpf: fast path for not loaded skb BPF filtering

On Tue, Dec 14, 2021 at 11:40:26AM +0000, Pavel Begunkov wrote:
> On 12/14/21 07:27, Martin KaFai Lau wrote:
> > On Sat, Dec 11, 2021 at 07:17:49PM +0000, Pavel Begunkov wrote:
> > > cgroup_bpf_enabled_key static key guards from overhead in cases where
> > > no cgroup bpf program of a specific type is loaded in any cgroup. Turn
> > > out that's not always good enough, e.g. when there are many cgroups but
> > > ones that we're interesting in are without bpf. It's seen in server
> > > environments, but the problem seems to be even wider as apparently
> > > systemd loads some BPF affecting my laptop.
> > >
> > > Profiles for small packet or zerocopy transmissions over fast network
> > > show __cgroup_bpf_run_filter_skb() taking 2-3%, 1% of which is from
> > > migrate_disable/enable(), and similarly on the receiving side. Also
> > > got +4-5% of t-put for local testing.
> > What is t-put? throughput?
>
> yes
>
> > Local testing means sending to lo/dummy?
>
> yes, it was dummy specifically
Thanks for confirming.

Please also put these details in the commit log.
I was slow. With only '%' as a unit, it took me a min to guess
what t-put may mean ;)

> > [ ... ]
> >
> > > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > > index 11820a430d6c..793e4f65ccb5 100644
> > > --- a/include/linux/bpf-cgroup.h
> > > +++ b/include/linux/bpf-cgroup.h
> > > @@ -219,11 +219,28 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> > > int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> > > void *value, u64 flags);
> > > +static inline bool
> > > +__cgroup_bpf_prog_array_is_empty(struct cgroup_bpf *cgrp_bpf,
> > > + enum cgroup_bpf_attach_type type)
> > Lets remove this.
> >
> > > +{
> > > + struct bpf_prog_array *array = rcu_access_pointer(cgrp_bpf->effective[type]);
> > > +
> > > + return array == &empty_prog_array.hdr;
> > > +}
> > > +
> > > +#define CGROUP_BPF_TYPE_ENABLED(sk, atype) \
> > and change cgroup.c to directly use this instead, so
> > everywhere holding a fullsock sk will use this instead
> > of having two helpers for empty check.
>
> Why?
As mentioned earlier, prefer to have one way to do the same thing
for checking with a fullsock.

> CGROUP_BPF_TYPE_ENABLED can't be a function atm because of header
> dependency hell, and so it'd kill some of typization, which doesn't add
> clarity.
I didn't mean to change it to a function. I actually think,
for the sk context, it should eventually be folded with the existing
cgroup_bpf_enabled() macro because those are the tests to ensure
there is bpf prog to run before proceeding.
Need to audit about the non fullsock case. not sure yet.

> And also it imposes some extra overhead to *sockopt using
> the first helper directly.
I think it is unimportant unless it is measurable in normal
use case.

> I think it's better with two of them.
Ok. I won't insist. There are atypes that may not have an sk, so
a separate inline function for checking emptiness may eventually
be useful there.

> I could inline the second one, but it wouldn't have been pretty.
Leaving CGROUP_BPF_TYPE_ENABLED as macro is fine.

2021-12-15 11:45:50

by Pavel Begunkov

[permalink] [raw]

Subject: Re: [PATCH v2] cgroup/bpf: fast path for not loaded skb BPF filtering

On 12/14/21 19:14, Martin KaFai Lau wrote:
> On Tue, Dec 14, 2021 at 11:40:26AM +0000, Pavel Begunkov wrote:
>> On 12/14/21 07:27, Martin KaFai Lau wrote:
>>> On Sat, Dec 11, 2021 at 07:17:49PM +0000, Pavel Begunkov wrote:
>>>> cgroup_bpf_enabled_key static key guards from overhead in cases where
>>>> no cgroup bpf program of a specific type is loaded in any cgroup. Turn
>>>> out that's not always good enough, e.g. when there are many cgroups but
>>>> ones that we're interesting in are without bpf. It's seen in server
>>>> environments, but the problem seems to be even wider as apparently
>>>> systemd loads some BPF affecting my laptop.
>>>>
>>>> Profiles for small packet or zerocopy transmissions over fast network
>>>> show __cgroup_bpf_run_filter_skb() taking 2-3%, 1% of which is from
>>>> migrate_disable/enable(), and similarly on the receiving side. Also
>>>> got +4-5% of t-put for local testing.
>>> What is t-put? throughput?
>>
>> yes
>>
>>> Local testing means sending to lo/dummy?
>>
>> yes, it was dummy specifically
> Thanks for confirming.
>
> Please also put these details in the commit log.
> I was slow. With only '%' as a unit, it took me a min to guess
> what t-put may mean ;)

I guess requests/s is a more natural metric for net. I'm going to
resend anyway, so I'll reword it a bit.

>>>> +#define CGROUP_BPF_TYPE_ENABLED(sk, atype) \
>>> and change cgroup.c to directly use this instead, so
>>> everywhere holding a fullsock sk will use this instead
>>> of having two helpers for empty check.
>>
>> Why?
> As mentioned earlier, prefer to have one way to do the same thing
> for checking with a fullsock.
>
>> CGROUP_BPF_TYPE_ENABLED can't be a function atm because of header
>> dependency hell, and so it'd kill some of typization, which doesn't add
>> clarity.
> I didn't mean to change it to a function. I actually think,
> for the sk context, it should eventually be folded with the existing
> cgroup_bpf_enabled() macro because those are the tests to ensure
> there is bpf prog to run before proceeding.
> Need to audit about the non fullsock case. not sure yet.

btw, it would be nice to rewrite the helpers as inline functions, but
sock, cgroup, etc. are not defined in bpf-cgroup.h and can't be
included. It may make sense e.g. not to include bpf-cgroup.h in bpf.h
but to move some definitions like struct cgroup_bpf into
include/linux/cgroup-defs.h.
Though I'd rather leave it to someone with a better grasp of the
BPF code base.

>> And also it imposes some extra overhead to *sockopt using
>> the first helper directly.
> I think it is unimportant unless it is measurable in normal
> use case.

I hope so

>> I think it's better with two of them.
> Ok. I won't insist. There are atype that may not have sk, so
> a separate inline function for checking emptiness may eventually
> be useful there.

--
Pavel Begunkov