While rolling out a new BPF-based TC classifier I hit a memory leak, which
manifests as large numbers of request and time-wait sockets not being released.
The root cause is that the current BPF helpers dealing with sockets are naive:
they assume that sk->sk_flags is always valid. struct request_sock and
struct inet_timewait_sock break this assumption, because both re-use the
sk_flags storage for other data.
I've fixed this up by adding a helper that checks sk_state in addition to sk_flags.
The solution is a bit clumsy: it bakes details of struct sock into the BPF code.
It would probably be nicer to have a sock_gen_put + SOCK_RCU_FREE function exposed
in sock.h, but that might be too big a change for backports.
Thoughts?
Lorenz Bauer (1):
net: bpf: don't leak time wait and request sockets
net/core/filter.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
--
2.20.1
It's possible to leak time wait and request sockets via the following
BPF pseudo code:
sk = bpf_skc_lookup_tcp(...)
if (sk)
bpf_sk_release(sk)
If sk->sk_state is TCP_NEW_SYN_RECV or TCP_TIME_WAIT the refcount taken
by bpf_skc_lookup_tcp is not undone by bpf_sk_release. This is because
sk_flags is re-used for other data in both kinds of sockets. The check
!sock_flag(sk, SOCK_RCU_FREE)
therefore returns a bogus result.
Introduce a helper to account for this complication, and call it from
the necessary places.
Fixes: edbf8c01de5a ("bpf: add skc_lookup_tcp helper")
Fixes: f7355a6c0497 ("bpf: Check sk_fullsock() before returning from bpf_sk_lookup()")
Signed-off-by: Lorenz Bauer <[email protected]>
---
net/core/filter.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 42fd17c48c5f..d98dc4526d82 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5266,6 +5266,14 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
return sk;
}
+static void __bpf_sk_release(struct sock *sk)
+{
+ /* time wait and request socks don't have sk_flags. */
+ if (sk->sk_state == TCP_TIME_WAIT || sk->sk_state == TCP_NEW_SYN_RECV ||
+ !sock_flag(sk, SOCK_RCU_FREE))
+ sock_gen_put(sk);
+}
+
static struct sock *
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
@@ -5277,8 +5285,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
if (sk) {
sk = sk_to_full_sk(sk);
if (!sk_fullsock(sk)) {
- if (!sock_flag(sk, SOCK_RCU_FREE))
- sock_gen_put(sk);
+ __bpf_sk_release(sk);
return NULL;
}
}
@@ -5315,8 +5322,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
if (sk) {
sk = sk_to_full_sk(sk);
if (!sk_fullsock(sk)) {
- if (!sock_flag(sk, SOCK_RCU_FREE))
- sock_gen_put(sk);
+ __bpf_sk_release(sk);
return NULL;
}
}
@@ -5383,8 +5389,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- if (!sock_flag(sk, SOCK_RCU_FREE))
- sock_gen_put(sk);
+ __bpf_sk_release(sk);
return 0;
}
--
2.20.1
On Thu, Jan 09, 2020 at 11:57:48AM +0000, Lorenz Bauer wrote:
> It's possible to leak time wait and request sockets via the following
> BPF pseudo code:
>
> sk = bpf_skc_lookup_tcp(...)
> if (sk)
> bpf_sk_release(sk)
>
> If sk->sk_state is TCP_NEW_SYN_RECV or TCP_TIME_WAIT the refcount taken
> by bpf_skc_lookup_tcp is not undone by bpf_sk_release. This is because
> sk_flags is re-used for other data in both kinds of sockets. The check
Thanks for the report.
>
> !sock_flag(sk, SOCK_RCU_FREE)
>
> therefore returns a bogus result.
>
> Introduce a helper to account for this complication, and call it from
> the necessary places.
>
> Fixes: edbf8c01de5a ("bpf: add skc_lookup_tcp helper")
> Fixes: f7355a6c0497 ("bpf: Check sk_fullsock() before returning from bpf_sk_lookup()")
> Signed-off-by: Lorenz Bauer <[email protected]>
> ---
> net/core/filter.c | 17 +++++++++++------
> 1 file changed, 11 insertions(+), 6 deletions(-)
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 42fd17c48c5f..d98dc4526d82 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5266,6 +5266,14 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> return sk;
> }
>
> +static void __bpf_sk_release(struct sock *sk)
> +{
> + /* time wait and request socks don't have sk_flags. */
> + if (sk->sk_state == TCP_TIME_WAIT || sk->sk_state == TCP_NEW_SYN_RECV ||
> + !sock_flag(sk, SOCK_RCU_FREE))
Would this work too?
if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
> + sock_gen_put(sk);
> +}
> +
> static struct sock *
> __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
> @@ -5277,8 +5285,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> if (sk) {
> sk = sk_to_full_sk(sk);
> if (!sk_fullsock(sk)) {
> - if (!sock_flag(sk, SOCK_RCU_FREE))
> - sock_gen_put(sk);
> + __bpf_sk_release(sk);
> return NULL;
> }
> }
> @@ -5315,8 +5322,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
> if (sk) {
> sk = sk_to_full_sk(sk);
> if (!sk_fullsock(sk)) {
> - if (!sock_flag(sk, SOCK_RCU_FREE))
> - sock_gen_put(sk);
> + __bpf_sk_release(sk);
> return NULL;
> }
> }
> @@ -5383,8 +5389,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
>
> BPF_CALL_1(bpf_sk_release, struct sock *, sk)
> {
> - if (!sock_flag(sk, SOCK_RCU_FREE))
> - sock_gen_put(sk);
> + __bpf_sk_release(sk);
> return 0;
> }
>
> --
> 2.20.1
>
It's possible to leak time wait and request sockets via the following
BPF pseudo code:
sk = bpf_skc_lookup_tcp(...)
if (sk)
bpf_sk_release(sk)
If sk->sk_state is TCP_NEW_SYN_RECV or TCP_TIME_WAIT the refcount taken
by bpf_skc_lookup_tcp is not undone by bpf_sk_release. This is because
sk_flags is re-used for other data in both kinds of sockets. The check
!sock_flag(sk, SOCK_RCU_FREE)
therefore returns a bogus result. Check that sk_flags is valid by calling
sk_fullsock. Skip checking SOCK_RCU_FREE if we already know that sk is
not a full socket.
Fixes: edbf8c01de5a ("bpf: add skc_lookup_tcp helper")
Fixes: f7355a6c0497 ("bpf: Check sk_fullsock() before returning from bpf_sk_lookup()")
Signed-off-by: Lorenz Bauer <[email protected]>
---
net/core/filter.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 42fd17c48c5f..41820ba0774c 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5277,8 +5277,7 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
if (sk) {
sk = sk_to_full_sk(sk);
if (!sk_fullsock(sk)) {
- if (!sock_flag(sk, SOCK_RCU_FREE))
- sock_gen_put(sk);
+ sock_gen_put(sk);
return NULL;
}
}
@@ -5315,8 +5314,7 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
if (sk) {
sk = sk_to_full_sk(sk);
if (!sk_fullsock(sk)) {
- if (!sock_flag(sk, SOCK_RCU_FREE))
- sock_gen_put(sk);
+ sock_gen_put(sk);
return NULL;
}
}
@@ -5383,7 +5381,8 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- if (!sock_flag(sk, SOCK_RCU_FREE))
+ /* Only full sockets have sk->sk_flags. */
+ if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
sock_gen_put(sk);
return 0;
}
--
2.20.1
On Thu, 9 Jan 2020 at 18:23, Martin Lau <[email protected]> wrote:
>
> Would this work too?
> if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
Thank you for the suggestion, this makes the patch much nicer.
--
Lorenz Bauer | Systems Engineer
6th Floor, County Hall/The Riverside Building, SE1 7PB, UK
http://www.cloudflare.com
On Fri, Jan 10, 2020 at 01:23:36PM +0000, Lorenz Bauer wrote:
> It's possible to leak time wait and request sockets via the following
> BPF pseudo code:
>
> sk = bpf_skc_lookup_tcp(...)
> if (sk)
> bpf_sk_release(sk)
>
> If sk->sk_state is TCP_NEW_SYN_RECV or TCP_TIME_WAIT the refcount taken
> by bpf_skc_lookup_tcp is not undone by bpf_sk_release. This is because
> sk_flags is re-used for other data in both kinds of sockets. The check
>
> !sock_flag(sk, SOCK_RCU_FREE)
>
> therefore returns a bogus result. Check that sk_flags is valid by calling
> sk_fullsock. Skip checking SOCK_RCU_FREE if we already know that sk is
> not a full socket.
Acked-by: Martin KaFai Lau <[email protected]>
On Fri, Jan 10, 2020 at 8:43 AM Martin Lau <[email protected]> wrote:
>
> On Fri, Jan 10, 2020 at 01:23:36PM +0000, Lorenz Bauer wrote:
> > It's possible to leak time wait and request sockets via the following
> > BPF pseudo code:
> >
> > sk = bpf_skc_lookup_tcp(...)
> > if (sk)
> > bpf_sk_release(sk)
> >
> > If sk->sk_state is TCP_NEW_SYN_RECV or TCP_TIME_WAIT the refcount taken
> > by bpf_skc_lookup_tcp is not undone by bpf_sk_release. This is because
> > sk_flags is re-used for other data in both kinds of sockets. The check
> >
> > !sock_flag(sk, SOCK_RCU_FREE)
> >
> > therefore returns a bogus result. Check that sk_flags is valid by calling
> > sk_fullsock. Skip checking SOCK_RCU_FREE if we already know that sk is
> > not a full socket.
> Acked-by: Martin KaFai Lau <[email protected]>
Applied. Thanks