From: Menglong Dong <[email protected]>
The return value of BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND() in
__inet_bind() is not handled properly. When the return value
is non-zero, it will set inet_saddr and inet_rcv_saddr to 0 and
exit:
err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
goto out_release_sock;
}
Let's take UDP for example and see what will happen. A UDP
socket will be added to 'udp_prot.h.udp_table->hash' and
'udp_prot.h.udp_table->hash2' after sk->sk_prot->get_port()
is called successfully. If 'inet->inet_rcv_saddr' is specified here,
then 'sk' will be in an 'hslot2' of 'hash2' that it doesn't belong
to (because inet_saddr is changed to 0), and received UDP packets
will not be passed to this sock. If 'inet->inet_rcv_saddr' is not
specified here, the sock will work fine, as it can receive packets
properly, which is weird, as the 'bind()' has already failed.
To undo the get_port() operation, introduce the 'put_port' field
for 'struct proto'. For TCP proto, it is inet_put_port(); For UDP
proto, it is udp_lib_unhash(); For icmp proto, it is
ping_unhash().
Therefore, after sys_bind() fails because of
BPF_CGROUP_RUN_PROG_INET4_POST_BIND(), the socket will be unbound,
which means that it can then be bound to another port.
The second patch is the selftests for this modification.
Changes since v1:
- introduce 'put_port' field for 'struct proto'
- add selftests for it
Menglong Dong (2):
net: bpf: handle return value of
BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND()
bpf: selftests: add bind retry for post_bind{4, 6}
include/net/sock.h | 1 +
net/ipv4/af_inet.c | 2 +
net/ipv4/ping.c | 1 +
net/ipv4/tcp_ipv4.c | 1 +
net/ipv4/udp.c | 1 +
net/ipv6/af_inet6.c | 2 +
net/ipv6/ping.c | 1 +
net/ipv6/tcp_ipv6.c | 1 +
net/ipv6/udp.c | 1 +
tools/testing/selftests/bpf/test_sock.c | 166 +++++++++++++++++++++---
10 files changed, 157 insertions(+), 20 deletions(-)
--
2.27.0
From: Menglong Dong <[email protected]>
The return value of BPF_CGROUP_RUN_PROG_INET{4,6}_POST_BIND() in
__inet_bind() is not handled properly. When the return value
is non-zero, it will set inet_saddr and inet_rcv_saddr to 0 and
exit:
err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
goto out_release_sock;
}
Let's take UDP for example and see what will happen. A UDP
socket will be added to 'udp_prot.h.udp_table->hash' and
'udp_prot.h.udp_table->hash2' after sk->sk_prot->get_port()
is called successfully. If 'inet->inet_rcv_saddr' is specified here,
then 'sk' will be in an 'hslot2' of 'hash2' that it doesn't belong
to (because inet_saddr is changed to 0), and received UDP packets
will not be passed to this sock. If 'inet->inet_rcv_saddr' is not
specified here, the sock will work fine, as it can receive packets
properly, which is weird, as the 'bind()' has already failed.
To undo the get_port() operation, introduce the 'put_port' field
for 'struct proto'. For TCP proto, it is inet_put_port(); For UDP
proto, it is udp_lib_unhash(); For icmp proto, it is
ping_unhash().
Therefore, after sys_bind() fails because of
BPF_CGROUP_RUN_PROG_INET4_POST_BIND(), the socket will be unbound,
which means that it can then be bound to another port.
Signed-off-by: Menglong Dong <[email protected]>
---
v2:
- introduce 'put_port' field for 'struct proto'
---
include/net/sock.h | 1 +
net/ipv4/af_inet.c | 2 ++
net/ipv4/ping.c | 1 +
net/ipv4/tcp_ipv4.c | 1 +
net/ipv4/udp.c | 1 +
net/ipv6/af_inet6.c | 2 ++
net/ipv6/ping.c | 1 +
net/ipv6/tcp_ipv6.c | 1 +
net/ipv6/udp.c | 1 +
9 files changed, 11 insertions(+)
diff --git a/include/net/sock.h b/include/net/sock.h
index 44cc25f0bae7..f5fc0432374e 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1209,6 +1209,7 @@ struct proto {
void (*unhash)(struct sock *sk);
void (*rehash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short snum);
+ void (*put_port)(struct sock *sk);
#ifdef CONFIG_BPF_SYSCALL
int (*psock_update_sk_prot)(struct sock *sk,
struct sk_psock *psock,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5d18d32557d2..8784e72d4b8b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -531,6 +531,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
+ if (sk->sk_prot->get_port)
+ sk->sk_prot->put_port(sk);
goto out_release_sock;
}
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index e540b0dcf085..0e56df3a45e2 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -994,6 +994,7 @@ struct proto ping_prot = {
.hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
+ .put_port = ping_unhash,
.obj_size = sizeof(struct inet_sock),
};
EXPORT_SYMBOL(ping_prot);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 03dc4c79b84b..0ffb5b5779c0 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3082,6 +3082,7 @@ struct proto tcp_prot = {
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
+ .put_port = inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = tcp_bpf_update_proto,
#endif
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f376c777e8fc..c87e3888c8f8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2926,6 +2926,7 @@ struct proto udp_prot = {
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
+ .put_port = udp_lib_unhash,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = udp_bpf_update_proto,
#endif
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d1636425654e..ddfc6821e682 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -413,6 +413,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
if (err) {
sk->sk_ipv6only = saved_ipv6only;
inet_reset_saddr(sk);
+ if (sk->sk_prot->get_port)
+ sk->sk_prot->put_port(sk);
goto out;
}
}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 6ac88fe24a8e..9256f6ba87ef 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -177,6 +177,7 @@ struct proto pingv6_prot = {
.hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
+ .put_port = ping_unhash,
.obj_size = sizeof(struct raw6_sock),
};
EXPORT_SYMBOL_GPL(pingv6_prot);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1ac243d18c2b..075ee8a2df3b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2181,6 +2181,7 @@ struct proto tcpv6_prot = {
.hash = inet6_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
+ .put_port = inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = tcp_bpf_update_proto,
#endif
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 01e53eb4875a..cd402bdf9eed 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1731,6 +1731,7 @@ struct proto udpv6_prot = {
.unhash = udp_lib_unhash,
.rehash = udp_v6_rehash,
.get_port = udp_v6_get_port,
+ .put_port = udp_lib_unhash,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = udp_bpf_update_proto,
#endif
--
2.30.2
From: Menglong Dong <[email protected]>
With the previous patch, the kernel is able to 'put_port' after
sys_bind() fails. Add a test for that case: bind to another port after
sys_bind() fails. If that bind succeeds, it means the previous bind
operation has already been undone.
Signed-off-by: Menglong Dong <[email protected]>
---
tools/testing/selftests/bpf/test_sock.c | 166 +++++++++++++++++++++---
1 file changed, 146 insertions(+), 20 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
index e8edd3dd3ec2..68525d68d4e5 100644
--- a/tools/testing/selftests/bpf/test_sock.c
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -35,12 +35,15 @@ struct sock_test {
/* Endpoint to bind() to */
const char *ip;
unsigned short port;
+ unsigned short port_retry;
/* Expected test result */
enum {
LOAD_REJECT,
ATTACH_REJECT,
BIND_REJECT,
SUCCESS,
+ RETRY_SUCCESS,
+ RETRY_REJECT
} result;
};
@@ -60,6 +63,7 @@ static struct sock_test tests[] = {
0,
NULL,
0,
+ 0,
LOAD_REJECT,
},
{
@@ -77,6 +81,7 @@ static struct sock_test tests[] = {
0,
NULL,
0,
+ 0,
LOAD_REJECT,
},
{
@@ -94,6 +99,7 @@ static struct sock_test tests[] = {
0,
NULL,
0,
+ 0,
LOAD_REJECT,
},
{
@@ -111,6 +117,7 @@ static struct sock_test tests[] = {
0,
NULL,
0,
+ 0,
LOAD_REJECT,
},
{
@@ -125,6 +132,7 @@ static struct sock_test tests[] = {
SOCK_STREAM,
"127.0.0.1",
8097,
+ 0,
SUCCESS,
},
{
@@ -139,6 +147,7 @@ static struct sock_test tests[] = {
SOCK_STREAM,
"127.0.0.1",
8097,
+ 0,
SUCCESS,
},
{
@@ -153,6 +162,7 @@ static struct sock_test tests[] = {
0,
NULL,
0,
+ 0,
ATTACH_REJECT,
},
{
@@ -167,6 +177,7 @@ static struct sock_test tests[] = {
0,
NULL,
0,
+ 0,
ATTACH_REJECT,
},
{
@@ -181,6 +192,7 @@ static struct sock_test tests[] = {
0,
NULL,
0,
+ 0,
ATTACH_REJECT,
},
{
@@ -195,6 +207,7 @@ static struct sock_test tests[] = {
0,
NULL,
0,
+ 0,
ATTACH_REJECT,
},
{
@@ -209,6 +222,7 @@ static struct sock_test tests[] = {
SOCK_STREAM,
"0.0.0.0",
0,
+ 0,
BIND_REJECT,
},
{
@@ -223,6 +237,7 @@ static struct sock_test tests[] = {
SOCK_STREAM,
"::",
0,
+ 0,
BIND_REJECT,
},
{
@@ -253,6 +268,7 @@ static struct sock_test tests[] = {
SOCK_STREAM,
"::1",
8193,
+ 0,
BIND_REJECT,
},
{
@@ -283,8 +299,102 @@ static struct sock_test tests[] = {
SOCK_STREAM,
"127.0.0.1",
4098,
+ 0,
SUCCESS,
},
+ {
+ "bind4 deny specific IP & port of TCP, and retry",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (ip == expected && port == expected) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_ip4)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+ __bpf_constant_ntohl(0x7F000001), 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_port)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET4_POST_BIND,
+ AF_INET,
+ SOCK_STREAM,
+ "127.0.0.1",
+ 4098,
+ 5000,
+ RETRY_SUCCESS,
+ },
+ {
+ "bind4 deny specific IP & port of UDP, and retry",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (ip == expected && port == expected) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_ip4)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+ __bpf_constant_ntohl(0x7F000001), 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_port)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET4_POST_BIND,
+ AF_INET,
+ SOCK_DGRAM,
+ "127.0.0.1",
+ 4098,
+ 5000,
+ RETRY_SUCCESS,
+ },
+ {
+ "bind6 deny specific IP & port, and retry",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (ip == expected && port == expected) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_ip6[3])),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+ __bpf_constant_ntohl(0x00000001), 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_port)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET6_POST_BIND,
+ BPF_CGROUP_INET6_POST_BIND,
+ AF_INET6,
+ SOCK_STREAM,
+ "::1",
+ 8193,
+ 9000,
+ RETRY_SUCCESS,
+ },
{
"bind4 allow all",
.insns = {
@@ -297,6 +407,7 @@ static struct sock_test tests[] = {
SOCK_STREAM,
"0.0.0.0",
0,
+ 0,
SUCCESS,
},
{
@@ -311,6 +422,7 @@ static struct sock_test tests[] = {
SOCK_STREAM,
"::",
0,
+ 0,
SUCCESS,
},
};
@@ -351,14 +463,15 @@ static int attach_sock_prog(int cgfd, int progfd,
return bpf_prog_attach(progfd, cgfd, attach_type, BPF_F_ALLOW_OVERRIDE);
}
-static int bind_sock(int domain, int type, const char *ip, unsigned short port)
+static int bind_sock(int domain, int type, const char *ip,
+ unsigned short port, unsigned short port_retry)
{
struct sockaddr_storage addr;
struct sockaddr_in6 *addr6;
struct sockaddr_in *addr4;
int sockfd = -1;
socklen_t len;
- int err = 0;
+ int res = SUCCESS;
sockfd = socket(domain, type, 0);
if (sockfd < 0)
@@ -384,21 +497,44 @@ static int bind_sock(int domain, int type, const char *ip, unsigned short port)
goto err;
}
- if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1)
- goto err;
+ if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) {
+ /* sys_bind() may fail for different reasons, errno has to be
+ * checked to confirm that BPF program rejected it.
+ */
+ if (errno != EPERM)
+ goto err;
+ if (port_retry)
+ goto retry;
+ res = BIND_REJECT;
+ goto out;
+ }
+ goto out;
+retry:
+ if (domain == AF_INET)
+ addr4->sin_port = htons(port_retry);
+ else
+ addr6->sin6_port = htons(port_retry);
+ if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) {
+ if (errno != EPERM)
+ goto err;
+ res = RETRY_REJECT;
+ } else {
+ res = RETRY_SUCCESS;
+ }
goto out;
err:
- err = -1;
+ res = -1;
out:
close(sockfd);
- return err;
+ return res;
}
static int run_test_case(int cgfd, const struct sock_test *test)
{
int progfd = -1;
int err = 0;
+ int res;
printf("Test case: %s .. ", test->descr);
progfd = load_sock_prog(test->insns, test->expected_attach_type);
@@ -416,21 +552,11 @@ static int run_test_case(int cgfd, const struct sock_test *test)
goto err;
}
- if (bind_sock(test->domain, test->type, test->ip, test->port) == -1) {
- /* sys_bind() may fail for different reasons, errno has to be
- * checked to confirm that BPF program rejected it.
- */
- if (test->result == BIND_REJECT && errno == EPERM)
- goto out;
- else
- goto err;
- }
-
+ res = bind_sock(test->domain, test->type, test->ip, test->port,
+ test->port_retry);
+ if (res > 0 && test->result == res)
+ goto out;
- if (test->result != SUCCESS)
- goto err;
-
- goto out;
err:
err = -1;
out:
--
2.30.2
On 12/30/21 9:03 AM, [email protected] wrote:
[...]
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 44cc25f0bae7..f5fc0432374e 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1209,6 +1209,7 @@ struct proto {
> void (*unhash)(struct sock *sk);
> void (*rehash)(struct sock *sk);
> int (*get_port)(struct sock *sk, unsigned short snum);
> + void (*put_port)(struct sock *sk);
> #ifdef CONFIG_BPF_SYSCALL
> int (*psock_update_sk_prot)(struct sock *sk,
> struct sk_psock *psock,
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 5d18d32557d2..8784e72d4b8b 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -531,6 +531,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
> err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
> if (err) {
> inet->inet_saddr = inet->inet_rcv_saddr = 0;
> + if (sk->sk_prot->get_port)
> + sk->sk_prot->put_port(sk);
> goto out_release_sock;
> }
> }
[...]
> diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> index d1636425654e..ddfc6821e682 100644
> --- a/net/ipv6/af_inet6.c
> +++ b/net/ipv6/af_inet6.c
> @@ -413,6 +413,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
> if (err) {
> sk->sk_ipv6only = saved_ipv6only;
> inet_reset_saddr(sk);
> + if (sk->sk_prot->get_port)
> + sk->sk_prot->put_port(sk);
> goto out;
> }
> }
I presume both tests above should test for non-zero sk->sk_prot->put_port
callback given that is what they end up calling when true, no?
Thanks,
Daniel
On Wed, Jan 5, 2022 at 9:01 PM Daniel Borkmann <[email protected]> wrote:
>
> On 12/30/21 9:03 AM, [email protected] wrote:
> [...]
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 44cc25f0bae7..f5fc0432374e 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -1209,6 +1209,7 @@ struct proto {
> > void (*unhash)(struct sock *sk);
> > void (*rehash)(struct sock *sk);
> > int (*get_port)(struct sock *sk, unsigned short snum);
> > + void (*put_port)(struct sock *sk);
> > #ifdef CONFIG_BPF_SYSCALL
> > int (*psock_update_sk_prot)(struct sock *sk,
> > struct sk_psock *psock,
> > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> > index 5d18d32557d2..8784e72d4b8b 100644
> > --- a/net/ipv4/af_inet.c
> > +++ b/net/ipv4/af_inet.c
> > @@ -531,6 +531,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
> > err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
> > if (err) {
> > inet->inet_saddr = inet->inet_rcv_saddr = 0;
> > + if (sk->sk_prot->get_port)
> > + sk->sk_prot->put_port(sk);
> > goto out_release_sock;
> > }
> > }
> [...]
> > diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> > index d1636425654e..ddfc6821e682 100644
> > --- a/net/ipv6/af_inet6.c
> > +++ b/net/ipv6/af_inet6.c
> > @@ -413,6 +413,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
> > if (err) {
> > sk->sk_ipv6only = saved_ipv6only;
> > inet_reset_saddr(sk);
> > + if (sk->sk_prot->get_port)
> > + sk->sk_prot->put_port(sk);
> > goto out;
> > }
> > }
>
> I presume both tests above should test for non-zero sk->sk_prot->put_port
> callback given that is what they end up calling when true, no?
>
You are right, I think that I made a big mistake here...:/
Thanks!
Menglong Dong