From: Menglong Dong <[email protected]>
The cgroup eBPF attach type 'CGROUP_SOCK_OPS' is able to monitor the
state change of a tcp connect with 'BPF_SOCK_OPS_STATE_CB' ops.
However, it can't trace the whole state change of a tcp connect. While
a connect becomes 'TCP_TIME_WAIT' state, this sock will be release and
a tw sock will be created. While tcp sock release, 'TCP_CLOSE' state
change will be passed to eBPF program. Howeven, the real state of this
connect is 'TCP_TIME_WAIT'.
To make eBPF get the real state change of a tcp connect, add
'CGROUP_TWSK_CLOSE' cgroup attach type, which will be called when
tw sock release and tcp connect become CLOSE.
Signed-off-by: Menglong Dong <[email protected]>
---
include/linux/bpf-cgroup.h | 18 ++++++++++++++++++
include/net/inet_timewait_sock.h | 1 +
include/uapi/linux/bpf.h | 4 +++-
kernel/bpf/cgroup.c | 16 ++++++++++++++++
kernel/bpf/syscall.c | 3 +++
net/core/filter.c | 11 +++++++++++
net/ipv4/inet_timewait_sock.c | 4 ++++
net/ipv4/tcp_minisocks.c | 3 +++
8 files changed, 59 insertions(+), 1 deletion(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 11820a430d6c..eef4d9ab7701 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -48,6 +48,7 @@ enum cgroup_bpf_attach_type {
CGROUP_INET4_GETSOCKNAME,
CGROUP_INET6_GETSOCKNAME,
CGROUP_INET_SOCK_RELEASE,
+ CGROUP_TWSK_CLOSE,
MAX_CGROUP_BPF_ATTACH_TYPE
};
@@ -81,6 +82,7 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME);
CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME);
CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE);
+ CGROUP_ATYPE(CGROUP_TWSK_CLOSE);
default:
return CGROUP_BPF_ATTACH_TYPE_INVALID;
}
@@ -164,6 +166,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum cgroup_bpf_attach_type atype);
+int __cgroup_bpf_run_filter_twsk(struct sock *sk,
+ enum cgroup_bpf_attach_type atype);
+
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct sockaddr *uaddr,
enum cgroup_bpf_attach_type atype,
@@ -251,6 +256,15 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
__ret; \
})
+#define BPF_CGROUP_RUN_TWSK_PROG(sk, atype) \
+({ \
+ int __ret = 0; \
+ if (cgroup_bpf_enabled(atype)) { \
+ __ret = __cgroup_bpf_run_filter_twsk(sk, atype); \
+ } \
+ __ret; \
+})
+
#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE)
@@ -263,6 +277,9 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \
BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND)
+#define BPF_CGROUP_RUN_PROG_TWSK_CLOSE(sk) \
+ BPF_CGROUP_RUN_TWSK_PROG(sk, CGROUP_TWSK_CLOSE)
+
#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \
({ \
u32 __unused_flags; \
@@ -524,6 +541,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
optlen, retval) ({ retval; })
#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
kernel_optval) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_TWSK_CLOSE(sk) ({ 0; })
#define for_each_cgroup_storage_type(stype) for (; false; )
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index dfd919b3119e..7427219b0796 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -74,6 +74,7 @@ struct inet_timewait_sock {
u32 tw_priority;
struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb;
+ struct sock_cgroup_data tw_cgrp_data;
};
#define tw_tclass tw_tos
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c26871263f1f..641113b87f9d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -995,6 +995,7 @@ enum bpf_attach_type {
BPF_SK_REUSEPORT_SELECT,
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
BPF_PERF_EVENT,
+ BPF_CGROUP_TWSK_CLOSE,
__MAX_BPF_ATTACH_TYPE
};
@@ -5917,8 +5918,9 @@ enum {
* options first before the BPF program does.
*/
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
+ BPF_SOCK_OPS_TW_CLOSE_FLAG = (1<<7),
/* Mask of all currently supported cb flags */
- BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
+ BPF_SOCK_OPS_ALL_CB_FLAGS = 0xFF,
};
/* List of known BPF sock_ops operators.
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 43eb3501721b..2c32652e47d6 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -16,6 +16,7 @@
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>
+#include <net/inet_timewait_sock.h>
#include "../cgroup/cgroup-internal.h"
@@ -1114,6 +1115,21 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+int __cgroup_bpf_run_filter_twsk(struct sock *sk,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct inet_timewait_sock *twsk;
+ struct cgroup *cgrp;
+ int ret;
+
+ twsk = (struct inet_timewait_sock *)sk;
+ cgrp = sock_cgroup_ptr(&twsk->tw_cgrp_data);
+
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_twsk);
+
/**
* __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
* provided by user sockaddr
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ddd81d543203..da4b6ed192f4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2088,6 +2088,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
+ case BPF_CGROUP_TWSK_CLOSE:
return 0;
default:
return -EINVAL;
@@ -3140,6 +3141,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET_SOCK_RELEASE:
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
+ case BPF_CGROUP_TWSK_CLOSE:
return BPF_PROG_TYPE_CGROUP_SOCK;
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET6_BIND:
@@ -3311,6 +3313,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_SYSCTL:
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
+ case BPF_CGROUP_TWSK_CLOSE:
return cgroup_bpf_prog_query(attr, uattr);
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
diff --git a/net/core/filter.c b/net/core/filter.c
index e4cc3aff5bf7..4d3847053a78 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7927,6 +7927,7 @@ static bool __sock_filter_check_attach_type(int off,
case bpf_ctx_range(struct bpf_sock, src_ip4):
switch (attach_type) {
case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_TWSK_CLOSE:
goto read_only;
default:
return false;
@@ -7934,6 +7935,7 @@ static bool __sock_filter_check_attach_type(int off,
case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
switch (attach_type) {
case BPF_CGROUP_INET6_POST_BIND:
+ case BPF_CGROUP_TWSK_CLOSE:
goto read_only;
default:
return false;
@@ -7942,10 +7944,19 @@ static bool __sock_filter_check_attach_type(int off,
switch (attach_type) {
case BPF_CGROUP_INET4_POST_BIND:
case BPF_CGROUP_INET6_POST_BIND:
+ case BPF_CGROUP_TWSK_CLOSE:
goto read_only;
default:
return false;
}
+ case bpf_ctx_range(struct bpf_sock, protocol):
+ case bpf_ctx_range(struct bpf_sock, type):
+ switch (attach_type) {
+ case BPF_CGROUP_TWSK_CLOSE:
+ return false;
+ default:
+ break;
+ }
}
read_only:
return access_type == BPF_READ;
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 437afe392e66..1fc4c9190a79 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -47,6 +47,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
struct inet_bind_hashbucket *bhead;
+ BPF_CGROUP_RUN_PROG_TWSK_CLOSE((struct sock *)tw);
+
spin_lock(lock);
sk_nulls_del_node_init_rcu((struct sock *)tw);
spin_unlock(lock);
@@ -184,6 +186,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_ipv6only = 0;
tw->tw_transparent = inet->transparent;
tw->tw_prot = sk->sk_prot_creator;
+ tw->tw_cgrp_data = sk->sk_cgrp_data;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
@@ -195,6 +198,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
refcount_set(&tw->tw_refcnt, 0);
__module_get(tw->tw_prot->owner);
+
}
return tw;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7c2d3ac2363a..1fc88e12ba95 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -310,6 +310,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (state == TCP_TIME_WAIT)
timeo = TCP_TIMEWAIT_LEN;
+ if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_TW_CLOSE_FLAG))
+ tcp_sk(sk)->bpf_sock_ops_cb_flags &= ~BPF_SOCK_OPS_STATE_CB_FLAG;
+
/* tw_timer is pinned, so we need to make sure BH are disabled
* in following section, otherwise timer handler could run before
* we complete the initialization.
--
2.30.2
On Wed, Dec 29, 2021 at 3:33 AM <[email protected]> wrote:
>
> From: Menglong Dong <[email protected]>
>
> The cgroup eBPF attach type 'CGROUP_SOCK_OPS' is able to monitor the
> state change of a tcp connect with 'BPF_SOCK_OPS_STATE_CB' ops.
>
> However, it can't trace the whole state change of a tcp connect. While
> a connect becomes 'TCP_TIME_WAIT' state, this sock will be release and
> a tw sock will be created. While tcp sock release, 'TCP_CLOSE' state
> change will be passed to eBPF program. Howeven, the real state of this
> connect is 'TCP_TIME_WAIT'.
>
> To make eBPF get the real state change of a tcp connect, add
> 'CGROUP_TWSK_CLOSE' cgroup attach type, which will be called when
> tw sock release and tcp connect become CLOSE.
The use case is not explained.
Why bpf tracing cannot be used to achieve the same?
Also there are no selftests.
On Thu, Dec 30, 2021 at 12:46 AM Alexei Starovoitov
<[email protected]> wrote:
>
> On Wed, Dec 29, 2021 at 3:33 AM <[email protected]> wrote:
> >
> > From: Menglong Dong <[email protected]>
> >
> > The cgroup eBPF attach type 'CGROUP_SOCK_OPS' is able to monitor the
> > state change of a tcp connect with 'BPF_SOCK_OPS_STATE_CB' ops.
> >
> > However, it can't trace the whole state change of a tcp connect. While
> > a connect becomes 'TCP_TIME_WAIT' state, this sock will be release and
> > a tw sock will be created. While tcp sock release, 'TCP_CLOSE' state
> > change will be passed to eBPF program. Howeven, the real state of this
> > connect is 'TCP_TIME_WAIT'.
> >
> > To make eBPF get the real state change of a tcp connect, add
> > 'CGROUP_TWSK_CLOSE' cgroup attach type, which will be called when
> > tw sock release and tcp connect become CLOSE.
>
> The use case is not explained.
Sorry for the absence of use cases and selftests. In my case, it is for NAT of
a docker container.
Simply speaking, I'll add an element to a hash map during sys_connect() with
'BPF_SOCK_OPS_TCP_CONNECT_CB' ops of 'BPF_CGROUP_SOCK_OPS'
cgroup attach type. Therefore, the received packet of the host can do DNAT
according to the hash map.
I need to release the element in the hashmap when the connection closes.
With the help of 'BPF_SOCK_OPS_STATE_CB', I can monitor the TCP_CLOSE
of the connection. However, as I mentioned above, it doesn't work well when
it comes to tw sock. When the connect become 'FIN_WAIT2' or 'TIME_WAIT',
the state of the tcp sock becomes 'TCP_CLOSE', which doesn't match the connect
state. Therefore, the 'fin' packet that the host received can't be DNAT, as the
element is already removed.
In this patch, BPF_SOCK_OPS_TW_CLOSE_FLAG is introduced, which is used
make 'BPF_SOCK_OPS_STATE_CB' not called when this sock becomes
TCP_CLOSE if it is being replaced with a tw sock.
> Why bpf tracing cannot be used to achieve the same?
En...do you mean kprobe based eBPF trace? It can work, but I don't think it's
high-performance, especially for network NAT. Strictly speaking, attach types,
such as 'CGROUP_INET_SOCK_RELEASE', can be replaced by bpf tracing, but
they exist out of performance.
Thanks!
Menglong Dong
>
> Also there are no selftests.
On Wed, Dec 29, 2021 at 6:07 PM Menglong Dong <[email protected]> wrote:
>
> On Thu, Dec 30, 2021 at 12:46 AM Alexei Starovoitov
> <[email protected]> wrote:
> >
> > On Wed, Dec 29, 2021 at 3:33 AM <[email protected]> wrote:
> > >
> > > From: Menglong Dong <[email protected]>
> > >
> > > The cgroup eBPF attach type 'CGROUP_SOCK_OPS' is able to monitor the
> > > state change of a tcp connect with 'BPF_SOCK_OPS_STATE_CB' ops.
> > >
> > > However, it can't trace the whole state change of a tcp connect. While
> > > a connect becomes 'TCP_TIME_WAIT' state, this sock will be release and
> > > a tw sock will be created. While tcp sock release, 'TCP_CLOSE' state
> > > change will be passed to eBPF program. Howeven, the real state of this
> > > connect is 'TCP_TIME_WAIT'.
> > >
> > > To make eBPF get the real state change of a tcp connect, add
> > > 'CGROUP_TWSK_CLOSE' cgroup attach type, which will be called when
> > > tw sock release and tcp connect become CLOSE.
> >
> > The use case is not explained.
>
> Sorry for the absence of use cases and selftests. In my case, it is for NAT of
> a docker container.
>
> Simply speaking, I'll add an element to a hash map during sys_connect() with
> 'BPF_SOCK_OPS_TCP_CONNECT_CB' ops of 'BPF_CGROUP_SOCK_OPS'
> cgroup attach type. Therefore, the received packet of the host can do DNAT
> according to the hash map.
>
> I need to release the element in the hashmap when the connection closes.
> With the help of 'BPF_SOCK_OPS_STATE_CB', I can monitor the TCP_CLOSE
> of the connection. However, as I mentioned above, it doesn't work well when
> it comes to tw sock. When the connect become 'FIN_WAIT2' or 'TIME_WAIT',
> the state of the tcp sock becomes 'TCP_CLOSE', which doesn't match the connect
> state. Therefore, the 'fin' packet that the host received can't be DNAT, as the
> element is already removed.
>
> In this patch, BPF_SOCK_OPS_TW_CLOSE_FLAG is introduced, which is used
> make 'BPF_SOCK_OPS_STATE_CB' not called when this sock becomes
> TCP_CLOSE if it is being replaced with a tw sock.
>
> > Why bpf tracing cannot be used to achieve the same?
>
> En...do you mean kprobe based eBPF trace? It can work, but I don't think it's
> high-performance, especially for network NAT. Strictly speaking, attach types,
> such as 'CGROUP_INET_SOCK_RELEASE', can be replaced by bpf tracing, but
> they exist out of performance.
kprobe at the entry is pretty fast.
fentry is even faster. It's actually faster than cgroup based hook.
Give it a shot.
On Thu, Dec 30, 2021 at 10:12 AM Alexei Starovoitov
<[email protected]> wrote:
>
> On Wed, Dec 29, 2021 at 6:07 PM Menglong Dong <[email protected]> wrote:
> >
> > On Thu, Dec 30, 2021 at 12:46 AM Alexei Starovoitov
> > <[email protected]> wrote:
> > >
> > > On Wed, Dec 29, 2021 at 3:33 AM <[email protected]> wrote:
> > > >
> > > > From: Menglong Dong <[email protected]>
> > > >
> > > > The cgroup eBPF attach type 'CGROUP_SOCK_OPS' is able to monitor the
> > > > state change of a tcp connect with 'BPF_SOCK_OPS_STATE_CB' ops.
> > > >
> > > > However, it can't trace the whole state change of a tcp connect. While
> > > > a connect becomes 'TCP_TIME_WAIT' state, this sock will be release and
> > > > a tw sock will be created. While tcp sock release, 'TCP_CLOSE' state
> > > > change will be passed to eBPF program. Howeven, the real state of this
> > > > connect is 'TCP_TIME_WAIT'.
> > > >
> > > > To make eBPF get the real state change of a tcp connect, add
> > > > 'CGROUP_TWSK_CLOSE' cgroup attach type, which will be called when
> > > > tw sock release and tcp connect become CLOSE.
> > >
> > > The use case is not explained.
> >
> > Sorry for the absence of use cases and selftests. In my case, it is for NAT of
> > a docker container.
> >
> > Simply speaking, I'll add an element to a hash map during sys_connect() with
> > 'BPF_SOCK_OPS_TCP_CONNECT_CB' ops of 'BPF_CGROUP_SOCK_OPS'
> > cgroup attach type. Therefore, the received packet of the host can do DNAT
> > according to the hash map.
> >
> > I need to release the element in the hashmap when the connection closes.
> > With the help of 'BPF_SOCK_OPS_STATE_CB', I can monitor the TCP_CLOSE
> > of the connection. However, as I mentioned above, it doesn't work well when
> > it comes to tw sock. When the connect become 'FIN_WAIT2' or 'TIME_WAIT',
> > the state of the tcp sock becomes 'TCP_CLOSE', which doesn't match the connect
> > state. Therefore, the 'fin' packet that the host received can't be DNAT, as the
> > element is already removed.
> >
> > In this patch, BPF_SOCK_OPS_TW_CLOSE_FLAG is introduced, which is used
> > make 'BPF_SOCK_OPS_STATE_CB' not called when this sock becomes
> > TCP_CLOSE if it is being replaced with a tw sock.
> >
> > > Why bpf tracing cannot be used to achieve the same?
> >
> > En...do you mean kprobe based eBPF trace? It can work, but I don't think it's
> > high-performance, especially for network NAT. Strictly speaking, attach types,
> > such as 'CGROUP_INET_SOCK_RELEASE', can be replaced by bpf tracing, but
> > they exist out of performance.
>
> kprobe at the entry is pretty fast.
> fentry is even faster. It's actually faster than cgroup based hook.
Really? Doesn't it already consider the data copy of bpf_probe_read()?
After all,
sock based eBPF can read sock data directly.
What's more, I'm not sure bpf tracing is enough, because I do NAT not for all
the docker containers, which means not all tcp connect close should call my
eBPF program, and that's the advantage of cgroup based eBPF.
Thanks!
Menglong Dong
> Give it a shot.