2019-04-26 15:50:47

by Alban Crequy

[permalink] [raw]
Subject: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context

From: Alban Crequy <[email protected]>

sockops programs can now access the network namespace inode and device
via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
to apply different policies on different network namespaces.

In the unlikely case where network namespaces are not compiled in
(CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.

The generated BPF bytecode for netns_ino is loading the correct inode
number at the time of execution.

However, the generated BPF bytecode for netns_dev is loading an
immediate value determined at BPF-load-time by looking at the initial
network namespace. In practice, this works because all netns currently
use the same virtual device. If this was to change, this code would need
to be updated too.

Signed-off-by: Alban Crequy <[email protected]>

---

Changes since v1:
- add netns_dev (review from Alexei)

Changes since v2:
- replace __u64 by u64 in kernel code (review from Y Song)
- remove unneeded #else branch: program would be rejected in
is_valid_access (review from Y Song)
- allow partial reads (<u64) (review from Y Song)

Note: I have not been able to fully test partial reads on netns_dev.
The following patches check partial reads in the verifier but it does
not actually execute the program to check if partial reads generate the
correct value. I tried to write a BPF program in C and declare the
struct bpf_sock_ops as a volatile variable and I could get llvm to
generate the BPF instructions to do partial loads. But then, I get the
verifier error "dereference of modified ctx ptr R2 off=184 disallowed",
explained in https://www.spinics.net/lists/netdev/msg531582.html
What do you think should be done here?
---
include/uapi/linux/bpf.h | 2 +
net/core/filter.c | 94 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 96 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index eaf2d3284248..f4f841dde42c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
__u32 sk_txhash;
__u64 bytes_received;
__u64 bytes_acked;
+ __u64 netns_dev;
+ __u64 netns_ino;
};

/* Definitions for bpf_sock_ops_cb_flags */
diff --git a/net/core/filter.c b/net/core/filter.c
index 2f88baf39cc2..9c77464b1501 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -75,6 +75,8 @@
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
+#include <linux/kdev_t.h>
+#include <linux/proc_ns.h>

/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -6810,6 +6812,24 @@ static bool sock_ops_is_valid_access(int off, int size,
}
} else {
switch (off) {
+ case offsetof(struct bpf_sock_ops, netns_dev) ...
+ offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
+#ifdef CONFIG_NET_NS
+ if (off - offsetof(struct bpf_sock_ops, netns_dev)
+ + size > sizeof(u64))
+ return false;
+#else
+ return false;
+#endif
+ break;
+ case offsetof(struct bpf_sock_ops, netns_ino):
+#ifdef CONFIG_NET_NS
+ if (size != sizeof(u64))
+ return false;
+#else
+ return false;
+#endif
+ break;
case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
bytes_acked):
if (size != sizeof(__u64))
@@ -7727,6 +7747,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}

+static struct ns_common *sockops_netns_cb(void *private_data)
+{
+ return &init_net.ns;
+}
+
static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -7735,6 +7760,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
{
struct bpf_insn *insn = insn_buf;
int off;
+ struct inode *ns_inode;
+ struct path ns_path;
+ u64 netns_dev;
+ void *res;

/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
@@ -7981,6 +8010,71 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
struct sock, type);
break;
+
+ case offsetof(struct bpf_sock_ops, netns_dev) ...
+ offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
+#ifdef CONFIG_NET_NS
+ /* We get the netns_dev at BPF-load-time and not at
+ * BPF-exec-time. We assume that netns_dev is a constant.
+ */
+ res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
+ if (IS_ERR(res)) {
+ netns_dev = 0;
+ } else {
+ ns_inode = ns_path.dentry->d_inode;
+ netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ }
+ off = si->off;
+ off -= offsetof(struct bpf_sock_ops, netns_dev);
+ switch (BPF_LDST_BYTES(si)) {
+ case sizeof(u64):
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
+ break;
+ case sizeof(u32):
+ netns_dev = *(u32 *)(((char *)&netns_dev) + off);
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
+ break;
+ case sizeof(u16):
+ netns_dev = *(u16 *)(((char *)&netns_dev) + off);
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
+ break;
+ case sizeof(u8):
+ netns_dev = *(u8 *)(((char *)&netns_dev) + off);
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
+ break;
+ }
+#endif
+ break;
+
+ case offsetof(struct bpf_sock_ops, netns_ino):
+#ifdef CONFIG_NET_NS
+ /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
+ * Type: (struct bpf_sock_ops_kern *)
+ * ->(struct sock *)
+ * ->(struct sock_common)
+ * .possible_net_t
+ * .(struct net *)
+ * ->(struct ns_common)
+ * .(unsigned int)
+ */
+ BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
+ BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ possible_net_t, net),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sock_common, skc_net));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct ns_common, inum),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct net, ns) +
+ offsetof(struct ns_common, inum));
+#endif
+ break;
+
}
return insn - insn_buf;
}
--
2.20.1


2019-04-26 15:51:03

by Alban Crequy

[permalink] [raw]
Subject: [PATCH bpf-next v3 2/4] bpf: sync bpf.h to tools/ for bpf_sock_ops->netns*

From: Alban Crequy <[email protected]>

The change in struct bpf_sock_ops is synchronised
from: include/uapi/linux/bpf.h
to: tools/include/uapi/linux/bpf.h

Signed-off-by: Alban Crequy <[email protected]>

---

Changes since v2:
- standalone patch for the sync (requested by Y Song)
---
tools/include/uapi/linux/bpf.h | 2 ++
1 file changed, 2 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 704bb69514a2..eb56620a9d7a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3206,6 +3206,8 @@ struct bpf_sock_ops {
__u32 sk_txhash;
__u64 bytes_received;
__u64 bytes_acked;
+ __u64 netns_dev;
+ __u64 netns_ino;
};

/* Definitions for bpf_sock_ops_cb_flags */
--
2.20.1

2019-04-26 15:51:42

by Alban Crequy

[permalink] [raw]
Subject: [PATCH bpf-next v3 3/4] selftests: bpf: read netns_ino from struct bpf_sock_ops

From: Alban Crequy <[email protected]>

This shows how a sockops program could be restricted to a specific
network namespace. The sockops program looks at the current netns via
(struct bpf_sock_ops)->netns_ino and checks if the value matches the
configuration in the new BPF map "sock_netns".

The test program ./test_sockmap accepts a new parameter "--netns"; the
default value is the current netns found by stat() on /proc/self/ns/net,
so the previous tests still pass:

sudo ./test_sockmap
...
Summary: 412 PASSED 0 FAILED
...
Summary: 824 PASSED 0 FAILED

I run my additional test in the following way:

NETNS=$(readlink /proc/self/ns/net | sed 's/^net:\[\(.*\)\]$/\1/')
CGR=/sys/fs/cgroup/unified/user.slice/user-1000.slice/session-5.scope/
sudo ./test_sockmap --cgroup $CGR --netns $NETNS &

cat /sys/kernel/debug/tracing/trace_pipe

echo foo | nc -l 127.0.0.1 8080 &
echo bar | nc 127.0.0.1 8080

=> the connection goes through the sockmap

When testing with a wrong $NETNS, I get the trace_pipe log:
> not binding connection on netns 4026531992

Signed-off-by: Alban Crequy <[email protected]>

---

Changes since v1:
- tools/include/uapi/linux/bpf.h: update with netns_dev
- tools/testing/selftests/bpf/test_sockmap_kern.h: print debugs with
both netns_dev and netns_ino

Changes since v2:
- update commitmsg to refer to netns_ino
---
tools/testing/selftests/bpf/test_sockmap.c | 38 +++++++++++++++++--
.../testing/selftests/bpf/test_sockmap_kern.h | 22 +++++++++++
2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 3845144e2c91..5a1b9c96fca1 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -2,6 +2,7 @@
// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
#include <stdio.h>
#include <stdlib.h>
+#include <stdint.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/select.h>
@@ -21,6 +22,7 @@
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/sendfile.h>
+#include <sys/stat.h>

#include <linux/netlink.h>
#include <linux/socket.h>
@@ -63,8 +65,8 @@ int s1, s2, c1, c2, p1, p2;
int test_cnt;
int passed;
int failed;
-int map_fd[8];
-struct bpf_map *maps[8];
+int map_fd[9];
+struct bpf_map *maps[9];
int prog_fd[11];

int txmsg_pass;
@@ -84,6 +86,7 @@ int txmsg_ingress;
int txmsg_skb;
int ktls;
int peek_flag;
+uint64_t netns_opt;

static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
@@ -111,6 +114,7 @@ static const struct option long_options[] = {
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
{"ktls", no_argument, &ktls, 1 },
{"peek", no_argument, &peek_flag, 1 },
+ {"netns", required_argument, NULL, 'n'},
{0, 0, NULL, 0 }
};

@@ -1585,6 +1589,7 @@ char *map_names[] = {
"sock_bytes",
"sock_redir_flags",
"sock_skb_opts",
+ "sock_netns",
};

int prog_attach_type[] = {
@@ -1619,6 +1624,8 @@ static int populate_progs(char *bpf_file)
struct bpf_object *obj;
int i = 0;
long err;
+ struct stat netns_sb;
+ uint64_t netns_ino;

obj = bpf_object__open(bpf_file);
err = libbpf_get_error(obj);
@@ -1655,6 +1662,28 @@ static int populate_progs(char *bpf_file)
}
}

+ if (netns_opt == 0) {
+ err = stat("/proc/self/ns/net", &netns_sb);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: cannot stat network namespace: %ld (%s)\n",
+ err, strerror(errno));
+ return -1;
+ }
+ netns_ino = netns_sb.st_ino;
+ } else {
+ netns_ino = netns_opt;
+ }
+ i = 1;
+ err = bpf_map_update_elem(map_fd[8], &netns_ino, &i, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (netns): %ld (%s)\n",
+ err, strerror(errno));
+ return -1;
+ }
+
+
return 0;
}

@@ -1738,7 +1767,7 @@ int main(int argc, char **argv)
if (argc < 2)
return test_suite(-1);

- while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:",
+ while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:p:q:n:",
long_options, &longindex)) != -1) {
switch (opt) {
case 's':
@@ -1805,6 +1834,9 @@ int main(int argc, char **argv)
return -1;
}
break;
+ case 'n':
+ netns_opt = strtoull(optarg, NULL, 10);
+ break;
case 0:
break;
case 'h':
diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.h b/tools/testing/selftests/bpf/test_sockmap_kern.h
index e7639f66a941..317406dad6cf 100644
--- a/tools/testing/selftests/bpf/test_sockmap_kern.h
+++ b/tools/testing/selftests/bpf/test_sockmap_kern.h
@@ -91,6 +91,13 @@ struct bpf_map_def SEC("maps") sock_skb_opts = {
.max_entries = 1
};

+struct bpf_map_def SEC("maps") sock_netns = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__u64),
+ .value_size = sizeof(int),
+ .max_entries = 16
+};
+
SEC("sk_skb1")
int bpf_prog1(struct __sk_buff *skb)
{
@@ -132,9 +139,24 @@ int bpf_sockmap(struct bpf_sock_ops *skops)
{
__u32 lport, rport;
int op, err = 0, index, key, ret;
+ int i = 0;
+ __u64 netns_dev, netns_ino;
+ int *allowed;


op = (int) skops->op;
+ netns_dev = skops->netns_dev;
+ netns_ino = skops->netns_ino;
+ bpf_printk("bpf_sockmap: netns_dev = %lu netns_ino = %lu\n",
+ netns_dev, netns_ino);
+
+ // Only allow sockmap connection on the configured network namespace
+ allowed = bpf_map_lookup_elem(&sock_netns, &netns_ino);
+ if (allowed == NULL || *allowed == 0) {
+ bpf_printk("not binding connection on netns_ino %lu\n",
+ netns_ino);
+ return 0;
+ }

switch (op) {
case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
--
2.20.1

2019-04-26 15:53:17

by Alban Crequy

[permalink] [raw]
Subject: [PATCH bpf-next v3 4/4] selftests: bpf: verifier: read netns_dev and netns_ino from struct bpf_sock_ops

From: Alban Crequy <[email protected]>

Tested with:
> $ sudo ./test_verifier
> ...
> #905/p sockops accessing bpf_sock_ops->netns_dev, ok OK
> #906/p sockops accessing bpf_sock_ops->netns_ino, ok OK
> ...
> Summary: 1421 PASSED, 0 SKIPPED, 0 FAILED

Signed-off-by: Alban Crequy <[email protected]>

---

Changes since v1:
- This is a new selftest (review from Song)

Changes since v2:
- test partial reads on netns_dev (review from Y Song)
- split in two tests
---
.../testing/selftests/bpf/verifier/var_off.c | 53 +++++++++++++++++++
1 file changed, 53 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/var_off.c b/tools/testing/selftests/bpf/verifier/var_off.c
index 8504ac937809..9e4c6c78eb9d 100644
--- a/tools/testing/selftests/bpf/verifier/var_off.c
+++ b/tools/testing/selftests/bpf/verifier/var_off.c
@@ -246,3 +246,56 @@
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_LWT_IN,
},
+{
+ "sockops accessing bpf_sock_ops->netns_dev, ok",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 4),
+
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 2),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 4),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 6),
+
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 4),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 5),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 6),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_dev) + 7),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+},
+{
+ "sockops accessing bpf_sock_ops->netns_ino, ok",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ netns_ino)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+},
--
2.20.1

2019-04-26 21:05:48

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context

On Fri, 26 Apr 2019 17:48:45 +0200, Alban Crequy wrote:
> In the unlikely case where network namespaces are not compiled in
> (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.

Naive question - why return an error? init_net should always be there,
no?

2019-04-27 10:49:44

by Alban Crequy

[permalink] [raw]
Subject: Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context

On Fri, Apr 26, 2019 at 11:03 PM Jakub Kicinski
<[email protected]> wrote:
>
> On Fri, 26 Apr 2019 17:48:45 +0200, Alban Crequy wrote:
> > In the unlikely case where network namespaces are not compiled in
> > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
>
> Naive question - why return an error? init_net should always be there,
> no?

True for netns_dev. However, without CONFIG_NET_NS, we cannot access netns_ino:

(struct sock_common).possible_net_t.(struct net *):

typedef struct {
#ifdef CONFIG_NET_NS
struct net *net;
#endif
} possible_net_t;

And I don't think it would make much sense to allow access to
netns_dev but not netns_ino.

2019-04-27 16:38:10

by Y Song

[permalink] [raw]
Subject: Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context

On Fri, Apr 26, 2019 at 8:50 AM Alban Crequy <[email protected]> wrote:
>
> From: Alban Crequy <[email protected]>
>
> sockops programs can now access the network namespace inode and device
> via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> to apply different policies on different network namespaces.
>
> In the unlikely case where network namespaces are not compiled in
> (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
>
> The generated BPF bytecode for netns_ino is loading the correct inode
> number at the time of execution.
>
> However, the generated BPF bytecode for netns_dev is loading an
> immediate value determined at BPF-load-time by looking at the initial
> network namespace. In practice, this works because all netns currently
> use the same virtual device. If this was to change, this code would need
> to be updated too.
>
> Signed-off-by: Alban Crequy <[email protected]>
>
> ---
>
> Changes since v1:
> - add netns_dev (review from Alexei)
>
> Changes since v2:
> - replace __u64 by u64 in kernel code (review from Y Song)
> - remove unneeded #else branch: program would be rejected in
> is_valid_access (review from Y Song)
> - allow partial reads (<u64) (review from Y Song)
>
> Note: I have not been able to fully test partial reads on netns_dev.
> The following patches check partial reads in the verifier but it does
> not actually execute the program to check if partial reads generate the
> correct value. I tried to write a BPF program in C and declare the
> struct bpf_sock_ops as a volatile variable and I could get llvm to
> generate the BPF instructions to do partial loads. But then, I get the
> verifier error "dereference of modified ctx ptr R2 off=184 disallowed",
> explained in https://www.spinics.net/lists/netdev/msg531582.html
> What do you think should be done here?

You added partial read tests in test_verifier with raw asm codes.
It should be good enough.

For the compiler generated code causing verifier error, will take
a detailed look later.

Also I did not see a cover letter. For a series with 4 patches, it would be
the best if you can provide a separate cover letter.

> ---
> include/uapi/linux/bpf.h | 2 +
> net/core/filter.c | 94 ++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 96 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index eaf2d3284248..f4f841dde42c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> __u32 sk_txhash;
> __u64 bytes_received;
> __u64 bytes_acked;
> + __u64 netns_dev;
> + __u64 netns_ino;
> };
>
> /* Definitions for bpf_sock_ops_cb_flags */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 2f88baf39cc2..9c77464b1501 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -75,6 +75,8 @@
> #include <net/seg6_local.h>
> #include <net/lwtunnel.h>
> #include <net/ipv6_stubs.h>
> +#include <linux/kdev_t.h>
> +#include <linux/proc_ns.h>
>
> /**
> * sk_filter_trim_cap - run a packet through a socket filter
> @@ -6810,6 +6812,24 @@ static bool sock_ops_is_valid_access(int off, int size,
> }
> } else {
> switch (off) {
> + case offsetof(struct bpf_sock_ops, netns_dev) ...
> + offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
> +#ifdef CONFIG_NET_NS
> + if (off - offsetof(struct bpf_sock_ops, netns_dev)
> + + size > sizeof(u64))

This will allow something off = 1, size = 4. This is not what we want as
the access is not properly aligned.

You can look at function bpf_skb_is_valid_access(), esp. the two lines below:
bpf_ctx_record_field_size(info, size_default);
if (!bpf_ctx_narrow_access_ok(off, size, size_default))
return false;


> + return false;
> +#else
> + return false;
> +#endif
> + break;
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + if (size != sizeof(u64))
> + return false;
> +#else
> + return false;
> +#endif
> + break;
> case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> bytes_acked):
> if (size != sizeof(__u64))
> @@ -7727,6 +7747,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> return insn - insn_buf;
> }
>
> +static struct ns_common *sockops_netns_cb(void *private_data)
> +{
> + return &init_net.ns;
> +}
> +
> static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> const struct bpf_insn *si,
> struct bpf_insn *insn_buf,
> @@ -7735,6 +7760,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> {
> struct bpf_insn *insn = insn_buf;
> int off;
> + struct inode *ns_inode;
> + struct path ns_path;
> + u64 netns_dev;
> + void *res;
>
> /* Helper macro for adding read access to tcp_sock or sock fields. */
> #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> @@ -7981,6 +8010,71 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> struct sock, type);
> break;
> +
> + case offsetof(struct bpf_sock_ops, netns_dev) ...
> + offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
> +#ifdef CONFIG_NET_NS
> + /* We get the netns_dev at BPF-load-time and not at
> + * BPF-exec-time. We assume that netns_dev is a constant.
> + */
> + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> + if (IS_ERR(res)) {
> + netns_dev = 0;
> + } else {
> + ns_inode = ns_path.dentry->d_inode;
> + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> + }
> + off = si->off;
> + off -= offsetof(struct bpf_sock_ops, netns_dev);
> + switch (BPF_LDST_BYTES(si)) {
> + case sizeof(u64):
> + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> + break;
> + case sizeof(u32):
> + netns_dev = *(u32 *)(((char *)&netns_dev) + off);
> + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> + break;
> + case sizeof(u16):
> + netns_dev = *(u16 *)(((char *)&netns_dev) + off);
> + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> + break;
> + case sizeof(u8):
> + netns_dev = *(u8 *)(((char *)&netns_dev) + off);
> + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> + break;
> + }
> +#endif
> + break;
> +
> + case offsetof(struct bpf_sock_ops, netns_ino):
> +#ifdef CONFIG_NET_NS
> + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> + * Type: (struct bpf_sock_ops_kern *)
> + * ->(struct sock *)
> + * ->(struct sock_common)
> + * .possible_net_t
> + * .(struct net *)
> + * ->(struct ns_common)
> + * .(unsigned int)
> + */
> + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct bpf_sock_ops_kern, sk),
> + si->dst_reg, si->src_reg,
> + offsetof(struct bpf_sock_ops_kern, sk));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + possible_net_t, net),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct sock_common, skc_net));
> + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> + struct ns_common, inum),
> + si->dst_reg, si->dst_reg,
> + offsetof(struct net, ns) +
> + offsetof(struct ns_common, inum));
> +#endif
> + break;
> +
> }
> return insn - insn_buf;
> }
> --
> 2.20.1
>

2019-04-27 18:41:04

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context

On Sat, 27 Apr 2019 12:48:25 +0200, Alban Crequy wrote:
> On Fri, Apr 26, 2019 at 11:03 PM Jakub Kicinski
> <[email protected]> wrote:
> >
> > On Fri, 26 Apr 2019 17:48:45 +0200, Alban Crequy wrote:
> > > In the unlikely case where network namespaces are not compiled in
> > > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> >
> > Naive question - why return an error? init_net should always be there,
> > no?
>
> True for netns_dev. However, without CONFIG_NET_NS, we cannot access netns_ino:
>
> (struct sock_common).possible_net_t.(struct net *):
>
> typedef struct {
> #ifdef CONFIG_NET_NS
> struct net *net;
> #endif
> } possible_net_t;
>
> And I don't think it would make much sense to allow access to
> netns_dev but not netns_ino.

Right, if CONFIG_NET_NS=n we could just take the pointer to init_net
directly, and not worry about the field. IMHO it'd be preferable to
changing the UAPI based on kernel config, but I don't feel super
strongly.

2019-04-29 15:36:15

by Alban Crequy

[permalink] [raw]
Subject: Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context

On Sat, Apr 27, 2019 at 6:35 PM Y Song <[email protected]> wrote:
>
> On Fri, Apr 26, 2019 at 8:50 AM Alban Crequy <[email protected]> wrote:
> >
> > From: Alban Crequy <[email protected]>
> >
> > sockops programs can now access the network namespace inode and device
> > via (struct bpf_sock_ops)->netns_ino and ->netns_dev. This can be useful
> > to apply different policies on different network namespaces.
> >
> > In the unlikely case where network namespaces are not compiled in
> > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> >
> > The generated BPF bytecode for netns_ino is loading the correct inode
> > number at the time of execution.
> >
> > However, the generated BPF bytecode for netns_dev is loading an
> > immediate value determined at BPF-load-time by looking at the initial
> > network namespace. In practice, this works because all netns currently
> > use the same virtual device. If this was to change, this code would need
> > to be updated too.
> >
> > Signed-off-by: Alban Crequy <[email protected]>
> >
> > ---
> >
> > Changes since v1:
> > - add netns_dev (review from Alexei)
> >
> > Changes since v2:
> > - replace __u64 by u64 in kernel code (review from Y Song)
> > - remove unneeded #else branch: program would be rejected in
> > is_valid_access (review from Y Song)
> > - allow partial reads (<u64) (review from Y Song)
> >
> > Note: I have not been able to fully test partial reads on netns_dev.
> > The following patches check partial reads in the verifier but it does
> > not actually execute the program to check if partial reads generate the
> > correct value. I tried to write a BPF program in C and declare the
> > struct bpf_sock_ops as a volatile variable and I could get llvm to
> > generate the BPF instructions to do partial loads. But then, I get the
> > verifier error "dereference of modified ctx ptr R2 off=184 disallowed",
> > explained in https://www.spinics.net/lists/netdev/msg531582.html
> > What do you think should be done here?
>
> You added partial read tests in test_verifier with raw asm codes.
> It should be good enough.
>
> For the compiler generated code causing verifier error, will take
> a detailed look later.

Thanks! To clarify my note: the patches I sent on the mailing list
don't generate a verifier error.

It only errors out when I try partial reads in C. You can see the code
of the failed attempt that generate the error here:
https://github.com/kinvolk/linux/blob/c5fe70990c897a866c7006a0068876b0fde9ee4d/tools/testing/selftests/bpf/test_sockmap_kern.h#L146-L176

So if the partial read tests in test_verifier with raw asm codes are
good enough, there is no need to investigate more on that.

> Also I did not see a cover letter. For a series with 4 patches, it would be
> the best if you can provide a separate cover letter.

Ok, I will do that for the next iteration.

> > ---
> > include/uapi/linux/bpf.h | 2 +
> > net/core/filter.c | 94 ++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 96 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index eaf2d3284248..f4f841dde42c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3213,6 +3213,8 @@ struct bpf_sock_ops {
> > __u32 sk_txhash;
> > __u64 bytes_received;
> > __u64 bytes_acked;
> > + __u64 netns_dev;
> > + __u64 netns_ino;
> > };
> >
> > /* Definitions for bpf_sock_ops_cb_flags */
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 2f88baf39cc2..9c77464b1501 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -75,6 +75,8 @@
> > #include <net/seg6_local.h>
> > #include <net/lwtunnel.h>
> > #include <net/ipv6_stubs.h>
> > +#include <linux/kdev_t.h>
> > +#include <linux/proc_ns.h>
> >
> > /**
> > * sk_filter_trim_cap - run a packet through a socket filter
> > @@ -6810,6 +6812,24 @@ static bool sock_ops_is_valid_access(int off, int size,
> > }
> > } else {
> > switch (off) {
> > + case offsetof(struct bpf_sock_ops, netns_dev) ...
> > + offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
> > +#ifdef CONFIG_NET_NS
> > + if (off - offsetof(struct bpf_sock_ops, netns_dev)
> > + + size > sizeof(u64))
>
> This will allow something off = 1, size = 4. This is not what we want as
> the access is not properly aligned.

sock_ops_is_valid_access() does not allow off = 1, size = 4. There is
this check at the beginning of the function:
if (off % size != 0)
return false;

> You can look at function bpf_skb_is_valid_access(), esp. the two lines below:
> bpf_ctx_record_field_size(info, size_default);
> if (!bpf_ctx_narrow_access_ok(off, size, size_default))
> return false;

Thanks for the pointer! I now see that if I use them, my code in
sock_ops_convert_ctx_access() can be simplified.

> > + return false;
> > +#else
> > + return false;
> > +#endif
> > + break;
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + if (size != sizeof(u64))
> > + return false;
> > +#else
> > + return false;
> > +#endif
> > + break;
> > case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
> > bytes_acked):
> > if (size != sizeof(__u64))
> > @@ -7727,6 +7747,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
> > return insn - insn_buf;
> > }
> >
> > +static struct ns_common *sockops_netns_cb(void *private_data)
> > +{
> > + return &init_net.ns;
> > +}
> > +
> > static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > const struct bpf_insn *si,
> > struct bpf_insn *insn_buf,
> > @@ -7735,6 +7760,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > {
> > struct bpf_insn *insn = insn_buf;
> > int off;
> > + struct inode *ns_inode;
> > + struct path ns_path;
> > + u64 netns_dev;
> > + void *res;
> >
> > /* Helper macro for adding read access to tcp_sock or sock fields. */
> > #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
> > @@ -7981,6 +8010,71 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
> > SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
> > struct sock, type);
> > break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_dev) ...
> > + offsetof(struct bpf_sock_ops, netns_dev) + sizeof(u64) - 1:
> > +#ifdef CONFIG_NET_NS
> > + /* We get the netns_dev at BPF-load-time and not at
> > + * BPF-exec-time. We assume that netns_dev is a constant.
> > + */
> > + res = ns_get_path_cb(&ns_path, sockops_netns_cb, NULL);
> > + if (IS_ERR(res)) {
> > + netns_dev = 0;
> > + } else {
> > + ns_inode = ns_path.dentry->d_inode;
> > + netns_dev = new_encode_dev(ns_inode->i_sb->s_dev);
> > + }
> > + off = si->off;
> > + off -= offsetof(struct bpf_sock_ops, netns_dev);
> > + switch (BPF_LDST_BYTES(si)) {
> > + case sizeof(u64):
> > + *insn++ = BPF_MOV64_IMM(si->dst_reg, netns_dev);
> > + break;
> > + case sizeof(u32):
> > + netns_dev = *(u32 *)(((char *)&netns_dev) + off);
> > + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> > + break;
> > + case sizeof(u16):
> > + netns_dev = *(u16 *)(((char *)&netns_dev) + off);
> > + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> > + break;
> > + case sizeof(u8):
> > + netns_dev = *(u8 *)(((char *)&netns_dev) + off);
> > + *insn++ = BPF_MOV32_IMM(si->dst_reg, netns_dev);
> > + break;
> > + }
> > +#endif
> > + break;
> > +
> > + case offsetof(struct bpf_sock_ops, netns_ino):
> > +#ifdef CONFIG_NET_NS
> > + /* Loading: sk_ops->sk->__sk_common.skc_net.net->ns.inum
> > + * Type: (struct bpf_sock_ops_kern *)
> > + * ->(struct sock *)
> > + * ->(struct sock_common)
> > + * .possible_net_t
> > + * .(struct net *)
> > + * ->(struct ns_common)
> > + * .(unsigned int)
> > + */
> > + BUILD_BUG_ON(offsetof(struct sock, __sk_common) != 0);
> > + BUILD_BUG_ON(offsetof(possible_net_t, net) != 0);
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct bpf_sock_ops_kern, sk),
> > + si->dst_reg, si->src_reg,
> > + offsetof(struct bpf_sock_ops_kern, sk));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + possible_net_t, net),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct sock_common, skc_net));
> > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> > + struct ns_common, inum),
> > + si->dst_reg, si->dst_reg,
> > + offsetof(struct net, ns) +
> > + offsetof(struct ns_common, inum));
> > +#endif
> > + break;
> > +
> > }
> > return insn - insn_buf;
> > }
> > --
> > 2.20.1
> >

2019-04-29 15:43:46

by Alban Crequy

[permalink] [raw]
Subject: Re: [PATCH bpf-next v3 1/4] bpf: sock ops: add netns ino and dev in bpf context

On Sat, Apr 27, 2019 at 8:39 PM Jakub Kicinski
<[email protected]> wrote:
>
> On Sat, 27 Apr 2019 12:48:25 +0200, Alban Crequy wrote:
> > On Fri, Apr 26, 2019 at 11:03 PM Jakub Kicinski
> > <[email protected]> wrote:
> > >
> > > On Fri, 26 Apr 2019 17:48:45 +0200, Alban Crequy wrote:
> > > > In the unlikely case where network namespaces are not compiled in
> > > > (CONFIG_NET_NS=n), the verifier will not allow access to ->netns_*.
> > >
> > > Naive question - why return an error? init_net should always be there,
> > > no?
> >
> > True for netns_dev. However, without CONFIG_NET_NS, we cannot access netns_ino:
> >
> > (struct sock_common).possible_net_t.(struct net *):
> >
> > typedef struct {
> > #ifdef CONFIG_NET_NS
> > struct net *net;
> > #endif
> > } possible_net_t;
> >
> > And I don't think it would make much sense to allow access to
> > netns_dev but not netns_ino.
>
> Right, if CONFIG_NET_NS=n we could just take the pointer to init_net
> directly, and not worry about the field. IMHO it'd be preferable to
> changing the UAPI based on kernel config, but I don't feel super
> strongly.

I see the point about not changing the UAPI. So I will update the patch to:
- return netns_dev unconditionally, regardless of CONFIG_NET_NS
- return netns_ino with either the correct value or zero depending on
CONFIG_NET_NS.