2019-04-18 15:59:54

by Alban Crequy

[permalink] [raw]
Subject: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context

From: Alban Crequy <[email protected]>

sockops programs can now access the network namespace inode and device
via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
to apply different policies on different network namespaces.

In the unlikely case where network namespaces are not compiled in
(CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.

The generated BPF bytecode for netns_ino is loading the correct inode
number at the time of execution.

However, the generated BPF bytecode for netns_dev is loading an
immediate value determined at BPF-load-time by looking at the initial
network namespace. In practice, this works because all netns currently
use the same virtual device. If this was to change, this code would need
to be updated too.

Signed-off-by: Alban Crequy <[email protected]>

---

Changes since v1:
- add netns_dev (review from Alexei)
---
include/uapi/linux/bpf.h | 2 ++
net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 72 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index eaf2d3284248..f4f841dde42c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
__u32 sk_txhash;
__u64 bytes_received;
__u64 bytes_acked;
+ __u64 netns_dev;
+ __u64 netns_ino;
};

/* Definitions for bpf_sock_ops_cb_flags */
diff --git a/net/core/filter.c b/net/core/filter.c
index 1833926a63fc..93e3429603d7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -75,6 +75,8 @@
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
+#include <linux/kdev_t.h>
+#include <linux/proc_ns.h>

/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -6774,6 +6776,15 @@ static bool sock_ops_is_valid_access(int off, int size,
}
} else {
switch (off) {
+ case offsetof(struct bpf_sock_ops, netns_dev):
+ case offsetof(struct bpf_sock_ops, netns_ino):
+#ifdef CONFIG_NET_NS
+ if (size != sizeof(__u64))
+ return false;
+#else
+ return false;
+#endif
+ break;
case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
bytes_acked):
if (size != sizeof(__u64))
@@ -7660,6 +7671,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}

+static struct ns_common *sockops_netns_cb(void *private_data)
+{
+ return &init_net.ns;
+}
+
static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -7668,6 +7684,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
{
struct bpf_insn *insn = insn_buf;
int off;
+ struct inode *ns_inode;
+ struct path ns_path;
+ __u64 netns_dev;
+ void *res;

/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
@@ -7914,6 +7934,56 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
struct sock, type);
break;
+
+ case offsetof(struct bpf_sock_ops, netns_dev):
+#ifdef CONFIG_NET_NS
+ /* We get the netns_dev at BPF-load-time and not at
+ * BPF-exec-time. We assume that netns_dev is a constant.
+ */
+ res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
+ if (IS_ERR(res)) {
+ netns_dev = 0;
+ } else {
+ ns_inode = ns_path.dentry->d_inode;
+ netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ }
+#else
+ netns_dev = 0;
+#endif
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
+ break;
+
+ case offsetof(struct bpf_sock_ops, netns_ino):
+#ifdef CONFIG_NET_NS
+ /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
+ * Type: (struct bpf_sock_ops_kern *)
+ * ->(struct sock *)
+ * ->(struct sock_common)
+ * .possible_net_t
+ * .(struct net *)
+ * ->(struct ns_common)
+ * .(unsigned int)
+ */
+ BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
+ BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ possible_net_t, net),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_net));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct ns_common, inum),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct net, ns) +
+ offsetof(struct ns_common, inum));
+#else
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
+#endif
+ break;
+
}
return insn - insn_buf;
}
--
2.20.1


2019-04-18 15:58:59

by Alban Crequy

[permalink] [raw]
Subject: [PATCH bpf-next v2 2/3] selftests: bpf: read netns from struct bpf_sock_ops

From: Alban Crequy <[email protected]>

This shows how a sockops program could be restricted to a specific
network namespace. The sockops program looks at the current netns via
(struct bpf_sock_ops)->netns and checks if the value matches the
configuration in the new BPF map "sock_netns".

The test program ./test_sockmap accepts a new parameter "--netns"; the
default value is the current netns found by stat() on /proc/self/ns/net,
so the previous tests still pass:

sudo ./test_sockmap
...
Summary: 412 PASSED 0 FAILED
...
Summary: 824 PASSED 0 FAILED

I run my additional test in the following way:

NETNS=$(readlink /proc/self/ns/net | sed 's/^net:\[\(.*\)\]$/\1/')
CGR=/sys/fs/cgroup/unified/user.slice/user-1000.slice/session-5.scope/
sudo ./test_sockmap --cgroup $CGR --netns $NETNS &

cat /sys/kernel/debug/tracing/trace_pipe

echo foo | nc -l 127.0.0.1 8080 &
echo bar | nc 127.0.0.1 8080

=> the connection goes through the sockmap

When testing with a wrong $NETNS, I get the trace_pipe log:
> not binding connection on netns 4026531992

Signed-off-by: Alban Crequy <[email protected]>

---

Changes since v1:
- tools/include/uapi/linux/bpf.h: update with netns_dev
- tools/testing/selftests/bpf/test_sockmap_kern.h: print debugs with
both netns_dev and netns_ino
---
tools/include/uapi/linux/bpf.h | 2 +
tools/testing/selftests/bpf/test_sockmap.c | 38 +++++++++++++++++--
.../testing/selftests/bpf/test_sockmap_kern.h | 22 +++++++++++
3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 704bb69514a2..eb56620a9d7a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3206,6 +3206,8 @@ struct bpf_sock_ops {
__u32 sk_txhash;
__u64 bytes_received;
__u64 bytes_acked;
+ __u64 netns_dev;
+ __u64 netns_ino;
};

/* Definitions for bpf_sock_ops_cb_flags */
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 3845144e2c91..5a1b9c96fca1 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -2,6 +2,7 @@
// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
#include <stdio.h>
#include <stdlib.h>
+#include <stdint.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/select.h>
@@ -21,6 +22,7 @@
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/sendfile.h>
+#include <sys/stat.h>

#include <linux/netlink.h>
#include <linux/socket.h>
@@ -63,8 +65,8 @@ int s1, s2, c1, c2, p1, p2;
int test_cnt;
int passed;
int failed;
-int map_fd[8];
-struct bpf_map *maps[8];
+int map_fd[9];
+struct bpf_map *maps[9];
int prog_fd[11];

int txmsg_pass;
@@ -84,6 +86,7 @@ int txmsg_ingress;
int txmsg_skb;
int ktls;
int peek_flag;
+uint64_t netns_opt;

static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
@@ -111,6 +114,7 @@ static const struct option long_options[] = {
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
{"ktls", no_argument, &ktls, 1 },
{"peek", no_argument, &peek_flag, 1 },
+ {"netns", required_argument, NULL, 'n'},
{0, 0, NULL, 0 }
};

@@ -1585,6 +1589,7 @@ char *map_names[] = {
"sock_bytes",
"sock_redir_flags",
"sock_skb_opts",
+ "sock_netns",
};

int prog_attach_type[] = {
@@ -1619,6 +1624,8 @@ static int populate_progs(char *bpf_file)
struct bpf_object *obj;
int i = 0;
long err;
+ struct stat netns_sb;
+ uint64_t netns_ino;

obj = bpf_object__open(bpf_file);
err = libbpf_get_error(obj);
@@ -1655,6 +1662,28 @@ static int populate_progs(char *bpf_file)
}
}

+ if (netns_opt == 0) {
+ err = stat("/proc/self/ns/net", &netns_sb);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: cannot stat network namespace: %ld (%s)\n",
+ err, strerror(errno));
+ return -1;
+ }
+ netns_ino = netns_sb.st_ino;
+ } else {
+ netns_ino = netns_opt;
+ }
+ i = 1;
+ err = bpf_map_update_elem(map_fd[8], &netns_ino, &i, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (netns): %ld (%s)\n",
+ err, strerror(errno));
+ return -1;
+ }
+
+
return 0;
}

@@ -1738,7 +1767,7 @@ int main(int argc, char **argv)
if (argc < 2)
return test_suite(-1);

- while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
+ while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:n:",
long_options, &longindex)) != -1) {
switch (opt) {
case 's':
@@ -1805,6 +1834,9 @@ int main(int argc, char **argv)
return -1;
}
break;
+ case 'n':
+ netns_opt = strtoull(optarg, NULL, 10);
+ break;
case 0:
break;
case 'h':
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
index e7639f66a941..317406dad6cf 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.h
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@@ -91,6 +91,13 @@ struct bpf_map_def SEC("maps") sock_skb_opts = {
.max_entries = 1
};

+struct bpf_map_def SEC("maps") sock_netns = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__u64),
+ .value_size = sizeof(int),
+ .max_entries = 16
+};
+
SEC("sk_skb1")
int bpf_prog1(struct __sk_buff *skb)
{
@@ -132,9 +139,24 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
{
__u32 lport, rport;
int op, err = 0, index, key, ret;
+ int i = 0;
+ __u64 netns_dev, netns_ino;
+ int *allowed;


op = (int) skops->op;
+ netns_dev = skops->netns_dev;
+ netns_ino = skops->netns_ino;
+ bpf_printk("bpf_sockmap: netns_dev = %lu netns_ino = %lu\n",
+ netns_dev, netns_ino);
+
+ // Only allow sockmap connection on the configured network namespace
+ allowed = bpf_map_lookup_elem(&sock_netns, &netns_ino);
+ if (allowed == NULL || *allowed == 0) {
+ bpf_printk("not binding connection on netns_ino %lu\n",
+ netns_ino);
+ return 0;
+ }

switch (op) {
case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
--
2.20.1

2019-04-18 16:35:30

by Alban Crequy

[permalink] [raw]
Subject: [PATCH bpf-next v2 3/3] selftests: bpf: verifier: read netns_dev and netns_ino from struct bpf_sock_ops

From: Alban Crequy <[email protected]>

Tested with:
> $ sudo ./test_verifier
> ...
> #905/p sockops accessing bpf_sock_ops->netns_dev and netns_ino, ok OK
> ...
> Summary: 1420 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Alban Crequy <[email protected]>

---

Changes since v1:
- This is a new selftest (review from Song)
---
tools/testing/selftests/bpf/verifier/var_off.c | 13 +++++++++++++
1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/var_off.c b/tools/testing/selftests/bpf/verifier/var_off.c
index 8504ac937809..1ad3f64145ba 100644
--- a/tools/testing/selftests/bpf/verifier/var_off.c
+++ b/tools/testing/selftests/bpf/verifier/var_off.c
@@ -246,3 +246,16 @@
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_LWT_IN,
},
+{
+ "sockops accessing bpf_sock_ops->netns_dev and netns_ino, ok",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_ino)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+},
--
2.20.1

2019-04-18 21:33:02

by Y Song

[permalink] [raw]
Subject: Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context

On Thu, Apr 18, 2019 at 8:58 AM Alban Crequy <[email protected]> wrote:
>
> From: Alban Crequy <[email protected]>
>
> sockops programs can now access the network namespace inode and device
> via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> to apply different policies on different network namespaces.
>
> In the unlikely case where network namespaces are not compiled in
> (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
>
> The generated BPF bytecode for netns_ino is loading the correct inode
> number at the time of execution.
>
> However, the generated BPF bytecode for netns_dev is loading an
> immediate value determined at BPF-load-time by looking at the initial
> network namespace. In practice, this works because all netns currently
> use the same virtual device. If this was to change, this code would need
> to be updated too.
>
> Signed-off-by: Alban Crequy <[email protected]>
>
> ---
>
> Changes since v1:
> - add netns_dev (review from Alexei)
> ---
> include/uapi/linux/bpf.h | 2 ++
> net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 72 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index eaf2d3284248..f4f841dde42c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> __u32 sk_txhash;
> __u64 bytes_received;
> __u64 bytes_acked;
> + __u64 netns_dev;
> + __u64 netns_ino;

Maybe we can define netns_dev as __u32?
__u64 netns_ino;
__u32 netns_dev;

There is a hole at the end which can be used if the next
field to be added in the future is a __u32.

From
static inline u32 new_encode_dev(dev_t dev)
{
unsigned major = MAJOR(dev);
unsigned minor = MINOR(dev);
return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12);
}

device num is encoded in a u32.

> };
>
> /* Definitions for bpf_sock_ops_cb_flags */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1833926a63fc..93e3429603d7 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -75,6 +75,8 @@
> #include <net/seg6_local.h>
> #include <net/lwtunnel.h>
> #include <net/ipv6_stubs.h>
> +#include <linux/kdev_t.h>
> +#include <linux/proc_ns.h>
>
> /**
> * sk_filter_trim_cap - run a packet through a socket filter
> @@ -6774,6 +6776,15 @@ static bool sock_ops_is_valid_access(int off, int size,
> }
> } else {
> switch (off) {
> + case offsetof(struct bpf_sock_ops, netns_dev):
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + if (size != sizeof(__u64))
> + return false;

for netns_dev, looks like we have encoding like
(minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12)

it is possible user may access like:
u16 dev = sk->netns_dev;
the compiler may translate into a 2-byte load
or even
u8 major = (sk->netns_dev >> 8) & 0xf
the compiler may translate to a byte load of &sk->netns_dev + 1 or 3
depends on endianness.
I suggest we relax the access pattern now to avoid patch later.

> +#else
> + return false;
> +#endif
> + break;
> case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> bytes_acked):
> if (size != sizeof(__u64))
> @@ -7660,6 +7671,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> return insn - insn_buf;
> }
>
> +static struct ns_common *sockops_netns_cb(void *private_data)
> +{
> + return &init_net.ns;
> +}
> +
> static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> const struct bpf_insn *si,
> struct bpf_insn *insn_buf,
> @@ -7668,6 +7684,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> {
> struct bpf_insn *insn = insn_buf;
> int off;
> + struct inode *ns_inode;
> + struct path ns_path;
> + __u64 netns_dev;

Although there are a few discrepancies, I suggest to use u64 for
kernel as __u64 is typically
used in user space.

> + void *res;
>
> /* Helper macro for adding read access to tcp_sock or sock fields. */
> #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> @@ -7914,6 +7934,56 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> struct sock, type);
> break;
> +
> + case offsetof(struct bpf_sock_ops, netns_dev):
> +#ifdef CONFIG_NET_NS
> + /* We get the netns_dev at BPF-load-time and not at
> + * BPF-exec-time. We assume that netns_dev is a constant.
> + */
> + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> + if (IS_ERR(res)) {
> + netns_dev = 0;
> + } else {
> + ns_inode = ns_path.dentry->d_inode;
> + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> + }
> +#else
> + netns_dev = 0;

The #else branch is not needed. During is_valid_access check, if CONFIG_NET_NS
is not defined, the program should have been rejected.

> +#endif
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> + break;
> +
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> + * Type: (struct bpf_sock_ops_kern *)
> + * ->(struct sock *)
> + * ->(struct sock_common)
> + * .possible_net_t
> + * .(struct net *)
> + * ->(struct ns_common)
> + * .(unsigned int)
> + */
> + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct bpf_sock_ops_kern, sk),
> + si->dst_reg, si->src_reg,
> + offsetof(struct bpf_sock_ops_kern, sk));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + possible_net_t, net),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct sock_common, skc_net));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct ns_common, inum),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct net, ns) +
> + offsetof(struct ns_common, inum));
> +#else
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);

The same as above, this else branch is not needed.

> +#endif
> + break;
> +
> }
> return insn - insn_buf;
> }
> --
> 2.20.1
>

2019-04-18 21:37:43

by Y Song

[permalink] [raw]
Subject: Re: [PATCH bpf-next v2 2/3] selftests: bpf: read netns from struct bpf_sock_ops

On Thu, Apr 18, 2019 at 8:58 AM Alban Crequy <[email protected]> wrote:
>
> From: Alban Crequy <[email protected]>
>
> This shows how a sockops program could be restricted to a specific
> network namespace. The sockops program looks at the current netns via
> (struct bpf_sock_ops)->netns and checks if the value matches the
> configuration in the new BPF map "sock_netns".
>
> The test program ./test_sockmap accepts a new parameter "--netns"; the
> default value is the current netns found by stat() on /proc/self/ns/net,
> so the previous tests still pass:
>
> sudo ./test_sockmap
> ...
> Summary: 412 PASSED 0 FAILED
> ...
> Summary: 824 PASSED 0 FAILED
>
> I run my additional test in the following way:
>
> NETNS=$(readlink /proc/self/ns/net | sed 's/^net:\[\(.*\)\]$/\1/')
> CGR=/sys/fs/cgroup/unified/user.slice/user-1000.slice/session-5.scope/
> sudo ./test_sockmap --cgroup $CGR --netns $NETNS &
>
> cat /sys/kernel/debug/tracing/trace_pipe
>
> echo foo | nc -l 127.0.0.1 8080 &
> echo bar | nc 127.0.0.1 8080
>
> => the connection goes through the sockmap
>
> When testing with a wrong $NETNS, I get the trace_pipe log:
> > not binding connection on netns 4026531992
>
> Signed-off-by: Alban Crequy <[email protected]>
>
> ---
>
> Changes since v1:
> - tools/include/uapi/linux/bpf.h: update with netns_dev

Please have a separate patch for the uapi header sync
(tools/include/uapi/linux/bpf.h).

> - tools/testing/selftests/bpf/test_sockmap_kern.h: print debugs with
> both netns_dev and netns_ino
> ---
> tools/include/uapi/linux/bpf.h | 2 +
> tools/testing/selftests/bpf/test_sockmap.c | 38 +++++++++++++++++--
> .../testing/selftests/bpf/test_sockmap_kern.h | 22 +++++++++++
> 3 files changed, 59 insertions(+), 3 deletions(-)
>
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 704bb69514a2..eb56620a9d7a 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -3206,6 +3206,8 @@ struct bpf_sock_ops {
> __u32 sk_txhash;
> __u64 bytes_received;
> __u64 bytes_acked;
> + __u64 netns_dev;
> + __u64 netns_ino;
> };
>
> /* Definitions for bpf_sock_ops_cb_flags */
> diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
> index 3845144e2c91..5a1b9c96fca1 100644
> --- a/tools/testing/selftests/bpf/test_sockmap.c
> +++ b/tools/testing/selftests/bpf/test_sockmap.c
> @@ -2,6 +2,7 @@
> // Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
> #include <stdio.h>
> #include <stdlib.h>
> +#include <stdint.h>
> #include <sys/socket.h>
> #include <sys/ioctl.h>
> #include <sys/select.h>
> @@ -21,6 +22,7 @@
> #include <sys/resource.h>
> #include <sys/types.h>
> #include <sys/sendfile.h>
> +#include <sys/stat.h>
>
> #include <linux/netlink.h>
> #include <linux/socket.h>
> @@ -63,8 +65,8 @@ int s1, s2, c1, c2, p1, p2;
> int test_cnt;
> int passed;
> int failed;
> -int map_fd[8];
> -struct bpf_map *maps[8];
> +int map_fd[9];
> +struct bpf_map *maps[9];
> int prog_fd[11];
>
> int txmsg_pass;
> @@ -84,6 +86,7 @@ int txmsg_ingress;
> int txmsg_skb;
> int ktls;
> int peek_flag;
> +uint64_t netns_opt;
>
> static const struct option long_options[] = {
> {"help", no_argument, NULL, 'h' },
> @@ -111,6 +114,7 @@ static const struct option long_options[] = {
> {"txmsg_skb", no_argument, &txmsg_skb, 1 },
> {"ktls", no_argument, &ktls, 1 },
> {"peek", no_argument, &peek_flag, 1 },
> + {"netns", required_argument, NULL, 'n'},
> {0, 0, NULL, 0 }
> };
>
> @@ -1585,6 +1589,7 @@ char *map_names[] = {
> "sock_bytes",
> "sock_redir_flags",
> "sock_skb_opts",
> + "sock_netns",
> };
>
> int prog_attach_type[] = {
> @@ -1619,6 +1624,8 @@ static int populate_progs(char *bpf_file)
> struct bpf_object *obj;
> int i = 0;
> long err;
> + struct stat netns_sb;
> + uint64_t netns_ino;
>
> obj = bpf_object__open(bpf_file);
> err = libbpf_get_error(obj);
> @@ -1655,6 +1662,28 @@ static int populate_progs(char *bpf_file)
> }
> }
>
> + if (netns_opt == 0) {
> + err = stat("/proc/self/ns/net", &netns_sb);
> + if (err) {
> + fprintf(stderr,
> + "ERROR: cannot stat network namespace: %ld (%s)\n",
> + err, strerror(errno));
> + return -1;
> + }
> + netns_ino = netns_sb.st_ino;
> + } else {
> + netns_ino = netns_opt;
> + }
> + i = 1;
> + err = bpf_map_update_elem(map_fd[8], &netns_ino, &i, BPF_ANY);
> + if (err) {
> + fprintf(stderr,
> + "ERROR: bpf_map_update_elem (netns): %ld (%s)\n",
> + err, strerror(errno));
> + return -1;
> + }
> +
> +
> return 0;
> }
>
> @@ -1738,7 +1767,7 @@ int main(int argc, char **argv)
> if (argc < 2)
> return test_suite(-1);
>
> - while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
> + while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:n:",
> long_options, &longindex)) != -1) {
> switch (opt) {
> case 's':
> @@ -1805,6 +1834,9 @@ int main(int argc, char **argv)
> return -1;
> }
> break;
> + case 'n':
> + netns_opt = strtoull(optarg, NULL, 10);
> + break;
> case 0:
> break;
> case 'h':
> diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
> index e7639f66a941..317406dad6cf 100644
> --- a/tools/testing/selftests/bpf/test_sockmap_kern.h
> +++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
> @@ -91,6 +91,13 @@ struct bpf_map_def SEC("maps") sock_skb_opts = {
> .max_entries = 1
> };
>
> +struct bpf_map_def SEC("maps") sock_netns = {
> + .type = BPF_MAP_TYPE_HASH,
> + .key_size = sizeof(__u64),
> + .value_size = sizeof(int),
> + .max_entries = 16
> +};
> +
> SEC("sk_skb1")
> int bpf_prog1(struct __sk_buff *skb)
> {
> @@ -132,9 +139,24 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
> {
> __u32 lport, rport;
> int op, err = 0, index, key, ret;
> + int i = 0;
> + __u64 netns_dev, netns_ino;
> + int *allowed;
>
>
> op = (int) skops->op;
> + netns_dev = skops->netns_dev;
> + netns_ino = skops->netns_ino;
> + bpf_printk("bpf_sockmap: netns_dev = %lu netns_ino = %lu\n",
> + netns_dev, netns_ino);
> +
> + // Only allow sockmap connection on the configured network namespace
> + allowed = bpf_map_lookup_elem(&sock_netns, &netns_ino);
> + if (allowed == NULL || *allowed == 0) {
> + bpf_printk("not binding connection on netns_ino %lu\n",
> + netns_ino);
> + return 0;
> + }
>
> switch (op) {
> case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
> --
> 2.20.1
>

2019-04-18 21:43:01

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context

On Thu, Apr 18, 2019 at 8:59 AM Alban Crequy <[email protected]> wrote:
>
> From: Alban Crequy <[email protected]>
>
> sockops programs can now access the network namespace inode and device
> via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> to apply different policies on different network namespaces.
>
> In the unlikely case where network namespaces are not compiled in
> (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
>
> The generated BPF bytecode for netns_ino is loading the correct inode
> number at the time of execution.
>
> However, the generated BPF bytecode for netns_dev is loading an
> immediate value determined at BPF-load-time by looking at the initial
> network namespace. In practice, this works because all netns currently
> use the same virtual device. If this was to change, this code would need
> to be updated too.
>
> Signed-off-by: Alban Crequy <[email protected]>

Acked-by: Song Liu <[email protected]>

>
> ---
>
> Changes since v1:
> - add netns_dev (review from Alexei)
> ---
> include/uapi/linux/bpf.h | 2 ++
> net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 72 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index eaf2d3284248..f4f841dde42c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> __u32 sk_txhash;
> __u64 bytes_received;
> __u64 bytes_acked;
> + __u64 netns_dev;
> + __u64 netns_ino;
> };
>
> /* Definitions for bpf_sock_ops_cb_flags */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 1833926a63fc..93e3429603d7 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -75,6 +75,8 @@
> #include <net/seg6_local.h>
> #include <net/lwtunnel.h>
> #include <net/ipv6_stubs.h>
> +#include <linux/kdev_t.h>
> +#include <linux/proc_ns.h>
>
> /**
> * sk_filter_trim_cap - run a packet through a socket filter
> @@ -6774,6 +6776,15 @@ static bool sock_ops_is_valid_access(int off, int size,
> }
> } else {
> switch (off) {
> + case offsetof(struct bpf_sock_ops, netns_dev):
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + if (size != sizeof(__u64))
> + return false;
> +#else
> + return false;
> +#endif
> + break;
> case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> bytes_acked):
> if (size != sizeof(__u64))
> @@ -7660,6 +7671,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> return insn - insn_buf;
> }
>
> +static struct ns_common *sockops_netns_cb(void *private_data)
> +{
> + return &init_net.ns;
> +}
> +
> static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> const struct bpf_insn *si,
> struct bpf_insn *insn_buf,
> @@ -7668,6 +7684,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> {
> struct bpf_insn *insn = insn_buf;
> int off;
> + struct inode *ns_inode;
> + struct path ns_path;
> + __u64 netns_dev;
> + void *res;
>
> /* Helper macro for adding read access to tcp_sock or sock fields. */
> #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> @@ -7914,6 +7934,56 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> struct sock, type);
> break;
> +
> + case offsetof(struct bpf_sock_ops, netns_dev):
> +#ifdef CONFIG_NET_NS
> + /* We get the netns_dev at BPF-load-time and not at
> + * BPF-exec-time. We assume that netns_dev is a constant.
> + */
> + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> + if (IS_ERR(res)) {
> + netns_dev = 0;
> + } else {
> + ns_inode = ns_path.dentry->d_inode;
> + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> + }
> +#else
> + netns_dev = 0;
> +#endif
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> + break;
> +
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> + * Type: (struct bpf_sock_ops_kern *)
> + * ->(struct sock *)
> + * ->(struct sock_common)
> + * .possible_net_t
> + * .(struct net *)
> + * ->(struct ns_common)
> + * .(unsigned int)
> + */
> + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct bpf_sock_ops_kern, sk),
> + si->dst_reg, si->src_reg,
> + offsetof(struct bpf_sock_ops_kern, sk));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + possible_net_t, net),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct sock_common, skc_net));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct ns_common, inum),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct net, ns) +
> + offsetof(struct ns_common, inum));
> +#else
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
> +#endif
> + break;
> +
> }
> return insn - insn_buf;
> }
> --
> 2.20.1
>

2019-04-18 21:54:20

by Song Liu

[permalink] [raw]
Subject: Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context

On Thu, Apr 18, 2019 at 2:41 PM Song Liu <[email protected]> wrote:
>
> On Thu, Apr 18, 2019 at 8:59 AM Alban Crequy <[email protected]> wrote:
> >
> > From: Alban Crequy <[email protected]>
> >
> > sockops programs can now access the network namespace inode and device
> > via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> > to apply different policies on different network namespaces.
> >
> > In the unlikely case where network namespaces are not compiled in
> > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> >
> > The generated BPF bytecode for netns_ino is loading the correct inode
> > number at the time of execution.
> >
> > However, the generated BPF bytecode for netns_dev is loading an
> > immediate value determined at BPF-load-time by looking at the initial
> > network namespace. In practice, this works because all netns currently
> > use the same virtual device. If this was to change, this code would need
> > to be updated too.
> >
> > Signed-off-by: Alban Crequy <[email protected]>
>
> Acked-by: Song Liu <[email protected]>

Sorry! I didn't see Yonghong's comments. Please resolve those.

Sorry for the confusion.
Song

>
> >
> > ---
> >
> > Changes since v1:
> > - add netns_dev (review from Alexei)
> > ---
> > include/uapi/linux/bpf.h | 2 ++
> > net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 72 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index eaf2d3284248..f4f841dde42c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> > __u32 sk_txhash;
> > __u64 bytes_received;
> > __u64 bytes_acked;
> > + __u64 netns_dev;
> > + __u64 netns_ino;
> > };
> >
> > /* Definitions for bpf_sock_ops_cb_flags */
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 1833926a63fc..93e3429603d7 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -75,6 +75,8 @@
> > #include <net/seg6_local.h>
> > #include <net/lwtunnel.h>
> > #include <net/ipv6_stubs.h>
> > +#include <linux/kdev_t.h>
> > +#include <linux/proc_ns.h>
> >
> > /**
> > * sk_filter_trim_cap - run a packet through a socket filter
> > @@ -6774,6 +6776,15 @@ static bool sock_ops_is_valid_access(int off, int size,
> > }
> > } else {
> > switch (off) {
> > + case offsetof(struct bpf_sock_ops, netns_dev):
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + if (size != sizeof(__u64))
> > + return false;
> > +#else
> > + return false;
> > +#endif
> > + break;
> > case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> > bytes_acked):
> > if (size != sizeof(__u64))
> > @@ -7660,6 +7671,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> > return insn - insn_buf;
> > }
> >
> > +static struct ns_common *sockops_netns_cb(void *private_data)
> > +{
> > + return &init_net.ns;
> > +}
> > +
> > static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > const struct bpf_insn *si,
> > struct bpf_insn *insn_buf,
> > @@ -7668,6 +7684,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > {
> > struct bpf_insn *insn = insn_buf;
> > int off;
> > + struct inode *ns_inode;
> > + struct path ns_path;
> > + __u64 netns_dev;
> > + void *res;
> >
> > /* Helper macro for adding read access to tcp_sock or sock fields. */
> > #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> > @@ -7914,6 +7934,56 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> > struct sock, type);
> > break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_dev):
> > +#ifdef CONFIG_NET_NS
> > + /* We get the netns_dev at BPF-load-time and not at
> > + * BPF-exec-time. We assume that netns_dev is a constant.
> > + */
> > + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> > + if (IS_ERR(res)) {
> > + netns_dev = 0;
> > + } else {
> > + ns_inode = ns_path.dentry->d_inode;
> > + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> > + }
> > +#else
> > + netns_dev = 0;
> > +#endif
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> > + break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> > + * Type: (struct bpf_sock_ops_kern *)
> > + * ->(struct sock *)
> > + * ->(struct sock_common)
> > + * .possible_net_t
> > + * .(struct net *)
> > + * ->(struct ns_common)
> > + * .(unsigned int)
> > + */
> > + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> > + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct bpf_sock_ops_kern, sk),
> > + si->dst_reg, si->src_reg,
> > + offsetof(struct bpf_sock_ops_kern, sk));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + possible_net_t, net),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct sock_common, skc_net));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct ns_common, inum),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct net, ns) +
> > + offsetof(struct ns_common, inum));
> > +#else
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
> > +#endif
> > + break;
> > +
> > }
> > return insn - insn_buf;
> > }
> > --
> > 2.20.1
> >

2019-04-19 19:05:05

by Alban Crequy

[permalink] [raw]
Subject: Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context

On Thu, Apr 18, 2019 at 11:31 PM Y Song <[email protected]> wrote:
>
> On Thu, Apr 18, 2019 at 8:58 AM Alban Crequy <[email protected]> wrote:
> >
> > From: Alban Crequy <[email protected]>
> >
> > sockops programs can now access the network namespace inode and device
> > via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> > to apply different policies on different network namespaces.
> >
> > In the unlikely case where network namespaces are not compiled in
> > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> >
> > The generated BPF bytecode for netns_ino is loading the correct inode
> > number at the time of execution.
> >
> > However, the generated BPF bytecode for netns_dev is loading an
> > immediate value determined at BPF-load-time by looking at the initial
> > network namespace. In practice, this works because all netns currently
> > use the same virtual device. If this was to change, this code would need
> > to be updated too.
> >
> > Signed-off-by: Alban Crequy <[email protected]>
> >
> > ---
> >
> > Changes since v1:
> > - add netns_dev (review from Alexei)
> > ---
> > include/uapi/linux/bpf.h | 2 ++
> > net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 72 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index eaf2d3284248..f4f841dde42c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> > __u32 sk_txhash;
> > __u64 bytes_received;
> > __u64 bytes_acked;
> > + __u64 netns_dev;
> > + __u64 netns_ino;
>
> Maybe we can define netns_dev as __u32?
> __u64 netns_ino;
> __u32 netns_dev;
>
> There is a hole at the end which can be used if the next
> field to be added in the future is a __u32.
>
> From
> static inline u32 new_encode_dev(dev_t dev)
> {
> unsigned major = MAJOR(dev);
> unsigned minor = MINOR(dev);
> return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12);
> }
>
> device num is encoded in a u32.

I could do that but there are already two occurrences of "__u64
netns_dev" in bpf.h:
- struct bpf_prog_info
- struct bpf_map_info

Should I keep it a u64 for consistency with the rest of bpf.h, or
change it to u32?

> > };
> >
> > /* Definitions for bpf_sock_ops_cb_flags */
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 1833926a63fc..93e3429603d7 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -75,6 +75,8 @@
> > #include <net/seg6_local.h>
> > #include <net/lwtunnel.h>
> > #include <net/ipv6_stubs.h>
> > +#include <linux/kdev_t.h>
> > +#include <linux/proc_ns.h>
> >
> > /**
> > * sk_filter_trim_cap - run a packet through a socket filter
> > @@ -6774,6 +6776,15 @@ static bool sock_ops_is_valid_access(int off, int size,
> > }
> > } else {
> > switch (off) {
> > + case offsetof(struct bpf_sock_ops, netns_dev):
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + if (size != sizeof(__u64))
> > + return false;
>
> for netns_dev, looks like we have encoding like
> (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12)
>
> it is possible user may access like:
> u16 dev = sk->netns_dev;
> the compiler may translate into a 2-byte load
> or even
> u8 major = (sk->netns_dev >> 8) & 0xf
> the compiler may translate to a byte load of &sk->netns_dev + 1 or 3
> depends on endianness.
> I suggest we relax the access pattern now to avoid patch later.

Ok, I can do that.

My use case was to compare (struct bpf_sock_ops)->netns_dev/netns_ino
with the value from a map prepared from userspace, i.e. the value of
netns_dev/ino is an opaque buffer for the BPF program, so the problem
is unlikely. But I've got bitten by a similar thing before (struct
sk_msg_md->remote_port/local_port was read in one go as a u64) so we
can improve this a bit.

Thanks for the review, I'll do the rest of the requested changes
probably end of next week.

> > +#else
> > + return false;
> > +#endif
> > + break;
> > case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> > bytes_acked):
> > if (size != sizeof(__u64))
> > @@ -7660,6 +7671,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> > return insn - insn_buf;
> > }
> >
> > +static struct ns_common *sockops_netns_cb(void *private_data)
> > +{
> > + return &init_net.ns;
> > +}
> > +
> > static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > const struct bpf_insn *si,
> > struct bpf_insn *insn_buf,
> > @@ -7668,6 +7684,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > {
> > struct bpf_insn *insn = insn_buf;
> > int off;
> > + struct inode *ns_inode;
> > + struct path ns_path;
> > + __u64 netns_dev;
>
> Although there are a few discrepancies, I suggest to use u64 for
> kernel as __u64 is typically
> used in user space.
>
> > + void *res;
> >
> > /* Helper macro for adding read access to tcp_sock or sock fields. */
> > #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> > @@ -7914,6 +7934,56 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> > struct sock, type);
> > break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_dev):
> > +#ifdef CONFIG_NET_NS
> > + /* We get the netns_dev at BPF-load-time and not at
> > + * BPF-exec-time. We assume that netns_dev is a constant.
> > + */
> > + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> > + if (IS_ERR(res)) {
> > + netns_dev = 0;
> > + } else {
> > + ns_inode = ns_path.dentry->d_inode;
> > + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> > + }
> > +#else
> > + netns_dev = 0;
>
> The #else branch is not needed. During is_valid_access check, if CONFIG_NET_NS
> is not defined, the program should have been rejected.
>
> > +#endif
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> > + break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> > + * Type: (struct bpf_sock_ops_kern *)
> > + * ->(struct sock *)
> > + * ->(struct sock_common)
> > + * .possible_net_t
> > + * .(struct net *)
> > + * ->(struct ns_common)
> > + * .(unsigned int)
> > + */
> > + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> > + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct bpf_sock_ops_kern, sk),
> > + si->dst_reg, si->src_reg,
> > + offsetof(struct bpf_sock_ops_kern, sk));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + possible_net_t, net),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct sock_common, skc_net));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct ns_common, inum),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct net, ns) +
> > + offsetof(struct ns_common, inum));
> > +#else
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
>
> The same as above, this else branch is not needed.
>
> > +#endif
> > + break;
> > +
> > }
> > return insn - insn_buf;
> > }
> > --
> > 2.20.1
> >

2019-04-19 23:02:03

by Y Song

[permalink] [raw]
Subject: Re: [PATCH bpf-next v2 1/3] bpf: sock ops: add netns ino and dev in bpf context

On Fri, Apr 19, 2019 at 4:51 AM Alban Crequy <[email protected]> wrote:
>
> On Thu, Apr 18, 2019 at 11:31 PM Y Song <[email protected]> wrote:
> >
> > On Thu, Apr 18, 2019 at 8:58 AM Alban Crequy <[email protected]> wrote:
> > >
> > > From: Alban Crequy <[email protected]>
> > >
> > > sockops programs can now access the network namespace inode and device
> > > via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> > > to apply different policies on different network namespaces.
> > >
> > > In the unlikely case where network namespaces are not compiled in
> > > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> > >
> > > The generated BPF bytecode for netns_ino is loading the correct inode
> > > number at the time of execution.
> > >
> > > However, the generated BPF bytecode for netns_dev is loading an
> > > immediate value determined at BPF-load-time by looking at the initial
> > > network namespace. In practice, this works because all netns currently
> > > use the same virtual device. If this was to change, this code would need
> > > to be updated too.
> > >
> > > Signed-off-by: Alban Crequy <[email protected]>
> > >
> > > ---
> > >
> > > Changes since v1:
> > > - add netns_dev (review from Alexei)
> > > ---
> > > include/uapi/linux/bpf.h | 2 ++
> > > net/core/filter.c | 70 ++++++++++++++++++++++++++++++++++++++++
> > > 2 files changed, 72 insertions(+)
> > >
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index eaf2d3284248..f4f841dde42c 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> > > __u32 sk_txhash;
> > > __u64 bytes_received;
> > > __u64 bytes_acked;
> > > + __u64 netns_dev;
> > > + __u64 netns_ino;
> >
> > Maybe we can define netns_dev as __u32?
> > __u64 netns_ino;
> > __u32 netns_dev;
> >
> > There is a hole at the end which can be used if the next
> > field to be added in the future is a __u32.
> >
> > From
> > static inline u32 new_encode_dev(dev_t dev)
> > {
> > unsigned major = MAJOR(dev);
> > unsigned minor = MINOR(dev);
> > return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12);
> > }
> >
> > device num is encoded in a u32.
>
> I could do that but there are already two occurrences of "__u64
> netns_dev" in bpf.h:
> - struct bpf_prog_info
> - struct bpf_map_info
>
> Should I keep it a u64 for consistency with the rest of bpf.h, or
> change it to u32?

Agreed. We probably should keep it to be __u64 to be consistent with others.