LinuxLists.cc - [PATCH ipsec-next v2 0/6] Add bpf_xdp_get_xfrm

2023-11-28 17:55:02

Subject: [PATCH ipsec-next v2 0/6] Add bpf_xdp_get_xfrm_state() kfunc

This patchset adds two kfunc helpers, bpf_xdp_get_xfrm_state() and
bpf_xdp_xfrm_state_release() that wrap xfrm_state_lookup() and
xfrm_state_put(). The intent is to support software RSS (via XDP) for
the ongoing/upcoming ipsec pcpu work [0]. Recent experiments performed
on (hopefully) reproducible AWS testbeds indicate that single tunnel
pcpu ipsec can reach line rate on 100G ENA nics.

Note this patchset only tests/shows generic xfrm_state access. The
"secret sauce" (if you can really even call it that) involves accessing
a soon-to-be-upstreamed pcpu_num field in xfrm_state. Early example is
available here [1].

[0]: https://datatracker.ietf.org/doc/draft-ietf-ipsecme-multi-sa-performance/03/
[1]: https://github.com/danobi/xdp-tools/blob/e89a1c617aba3b50d990f779357d6ce2863ecb27/xdp-bench/xdp_redirect_cpumap.bpf.c#L385-L406

Changes from v1:
* Move xfrm tunnel tests to test_progs
* Fix writing to opts->error when opts is invalid
* Use __bpf_kfunc_start_defs()
* Remove unused vxlanhdr definition
* Add and use BPF_CORE_WRITE_BITFIELD() macro
* Make series bisect clean

Changes from RFCv2:
* Rebased to ipsec-next
* Fix netns leak

Changes from RFCv1:
* Add Antony's commit tags
* Add KF_ACQUIRE and KF_RELEASE semantics

Daniel Xu (6):
bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc
bpf: xfrm: Add bpf_xdp_xfrm_state_release() kfunc
libbpf: Add BPF_CORE_WRITE_BITFIELD() macro
bpf: selftests: test_tunnel: Use vmlinux.h declarations
bpf: selftests: Move xfrm tunnel test to test_progs
bpf: xfrm: Add selftest for bpf_xdp_get_xfrm_state()

include/net/xfrm.h | 9 +
net/xfrm/Makefile | 1 +
net/xfrm/xfrm_policy.c | 2 +
net/xfrm/xfrm_state_bpf.c | 128 +++++++++++++++
tools/lib/bpf/bpf_core_read.h | 36 ++++
.../selftests/bpf/prog_tests/test_tunnel.c | 155 ++++++++++++++++++
.../selftests/bpf/progs/bpf_tracing_net.h | 1 +
.../selftests/bpf/progs/test_tunnel_kern.c | 138 +++++++++-------
tools/testing/selftests/bpf/test_tunnel.sh | 92 -----------
9 files changed, 412 insertions(+), 150 deletions(-)
create mode 100644 net/xfrm/xfrm_state_bpf.c

--
2.42.1

2023-11-28 17:55:51

by Daniel Xu

[permalink] [raw]

Subject: [PATCH ipsec-next v2 1/6] bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc

This commit adds an unstable kfunc helper to access internal xfrm_state
associated with an SA. This is intended to be used for the upcoming
IPsec pcpu work to assign special pcpu SAs to a particular CPU. In other
words: for custom software RSS.

That being said, the function that this kfunc wraps is fairly generic
and used for a lot of xfrm tasks. I'm sure people will find uses
elsewhere over time.

Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
include/net/xfrm.h | 9 +++
net/xfrm/Makefile | 1 +
net/xfrm/xfrm_policy.c | 2 +
net/xfrm/xfrm_state_bpf.c | 112 ++++++++++++++++++++++++++++++++++++++
4 files changed, 124 insertions(+)
create mode 100644 net/xfrm/xfrm_state_bpf.c

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index c9bb0f892f55..1d107241b901 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void)

#endif

+#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF)
+int register_xfrm_state_bpf(void);
+#else
+static inline int register_xfrm_state_bpf(void)
+{
+ return 0;
+}
+#endif
+
#endif /* _NET_XFRM_H */
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index cd47f88921f5..547cec77ba03 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
+obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c13dc3ef7910..1b7e75159727 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -4218,6 +4218,8 @@ void __init xfrm_init(void)
#ifdef CONFIG_XFRM_ESPINTCP
espintcp_init();
#endif
+
+ register_xfrm_state_bpf();
}

#ifdef CONFIG_AUDITSYSCALL
diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c
new file mode 100644
index 000000000000..1681825db506
--- /dev/null
+++ b/net/xfrm/xfrm_state_bpf.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable XFRM state BPF helpers.
+ *
+ * Note that it is allowed to break compatibility for these functions since the
+ * interface they are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <net/xdp.h>
+#include <net/xfrm.h>
+
+/* bpf_xfrm_state_opts - Options for XFRM state lookup helpers
+ *
+ * Members:
+ * @error - Out parameter, set for any errors encountered
+ * Values:
+ * -EINVAL - netns_id is less than -1
+ * -EINVAL - opts__sz isn't BPF_XFRM_STATE_OPTS_SZ
+ * -ENONET - No network namespace found for netns_id
+ * @netns_id - Specify the network namespace for lookup
+ * Values:
+ * BPF_F_CURRENT_NETNS (-1)
+ * Use namespace associated with ctx
+ * [0, S32_MAX]
+ * Network Namespace ID
+ * @mark - XFRM mark to match on
+ * @daddr - Destination address to match on
+ * @spi - Security parameter index to match on
+ * @proto - L3 protocol to match on
+ * @family - L3 protocol family to match on
+ */
+struct bpf_xfrm_state_opts {
+ s32 error;
+ s32 netns_id;
+ u32 mark;
+ xfrm_address_t daddr;
+ __be32 spi;
+ u8 proto;
+ u16 family;
+};
+
+enum {
+ BPF_XFRM_STATE_OPTS_SZ = sizeof(struct bpf_xfrm_state_opts),
+};
+
+__bpf_kfunc_start_defs();
+
+/* bpf_xdp_get_xfrm_state - Get XFRM state
+ *
+ * Parameters:
+ * @ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @opts - Options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_xfrm_state_opts structure
+ * Must be BPF_XFRM_STATE_OPTS_SZ
+ */
+__bpf_kfunc struct xfrm_state *
+bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+ struct net *net = dev_net(xdp->rxq->dev);
+ struct xfrm_state *x;
+
+ if (!opts || opts__sz < sizeof(opts->error))
+ return NULL;
+
+ if (opts__sz != BPF_XFRM_STATE_OPTS_SZ) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net)) {
+ opts->error = -ENONET;
+ return NULL;
+ }
+ }
+
+ x = xfrm_state_lookup(net, opts->mark, &opts->daddr, opts->spi,
+ opts->proto, opts->family);
+
+ if (opts->netns_id >= 0)
+ put_net(net);
+
+ return x;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(xfrm_state_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
+BTF_SET8_END(xfrm_state_kfunc_set)
+
+static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &xfrm_state_kfunc_set,
+};
+
+int __init register_xfrm_state_bpf(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+ &xfrm_state_xdp_kfunc_set);
+}
--
2.42.1

2023-11-28 17:56:18

by Daniel Xu

[permalink] [raw]

Subject: [PATCH ipsec-next v2 5/6] bpf: selftests: Move xfrm tunnel test to test_progs

test_progs is better than a shell script b/c C is a bit easier to
maintain than shell. Also it's easier to use new infra like memory
mapped global variables from C via bpf skeleton.

Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/prog_tests/test_tunnel.c | 143 ++++++++++++++++++
.../selftests/bpf/progs/test_tunnel_kern.c | 11 +-
tools/testing/selftests/bpf/test_tunnel.sh | 92 -----------
3 files changed, 151 insertions(+), 95 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
index d149ab98798d..3bcb6f96b9b5 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
@@ -50,6 +50,7 @@
*/

#include <arpa/inet.h>
+#include <linux/if_link.h>
#include <linux/if_tun.h>
#include <linux/limits.h>
#include <linux/sysctl.h>
@@ -92,6 +93,11 @@
#define IPIP_TUNL_DEV0 "ipip00"
#define IPIP_TUNL_DEV1 "ipip11"

+#define XFRM_AUTH "0x1111111111111111111111111111111111111111"
+#define XFRM_ENC "0x22222222222222222222222222222222"
+#define XFRM_SPI_IN_TO_OUT 0x1
+#define XFRM_SPI_OUT_TO_IN 0x2
+
#define PING_ARGS "-i 0.01 -c 3 -w 10 -q"

static int config_device(void)
@@ -264,6 +270,92 @@ static void delete_ipip_tunnel(void)
SYS_NOFAIL("ip fou del port 5555 2> /dev/null");
}

+static int add_xfrm_tunnel(void)
+{
+ /* at_ns0 namespace
+ * at_ns0 -> root
+ */
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 1 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm policy add src %s/32 dst %s/32 dir out "
+ "tmpl src %s dst %s proto esp reqid 1 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);
+
+ /* root -> at_ns0 */
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 2 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm policy add src %s/32 dst %s/32 dir in "
+ "tmpl src %s dst %s proto esp reqid 2 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);
+
+ /* address & route */
+ SYS(fail, "ip netns exec at_ns0 ip addr add dev veth0 %s/32",
+ IP4_ADDR_TUNL_DEV0);
+ SYS(fail, "ip netns exec at_ns0 ip route add %s dev veth0 via %s src %s",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR1_VETH1, IP4_ADDR_TUNL_DEV0);
+
+ /* root namespace
+ * at_ns0 -> root
+ */
+ SYS(fail,
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 1 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip xfrm policy add src %s/32 dst %s/32 dir in "
+ "tmpl src %s dst %s proto esp reqid 1 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);
+
+ /* root -> at_ns0 */
+ SYS(fail,
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 2 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip xfrm policy add src %s/32 dst %s/32 dir out "
+ "tmpl src %s dst %s proto esp reqid 2 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);
+
+ /* address & route */
+ SYS(fail, "ip addr add dev veth1 %s/32", IP4_ADDR_TUNL_DEV1);
+ SYS(fail, "ip route add %s dev veth1 via %s src %s",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_VETH0, IP4_ADDR_TUNL_DEV1);
+
+ return 0;
+fail:
+ return -1;
+}
+
+static void delete_xfrm_tunnel(void)
+{
+ SYS_NOFAIL("ip xfrm policy delete dir out src %s/32 dst %s/32 2> /dev/null",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0);
+ SYS_NOFAIL("ip xfrm policy delete dir in src %s/32 dst %s/32 2> /dev/null",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1);
+ SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
+ IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT);
+ SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
+ IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN);
+}
+
static int test_ping(int family, const char *addr)
{
SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr);
@@ -532,6 +624,56 @@ static void test_ipip_tunnel(enum ipip_encap encap)
test_tunnel_kern__destroy(skel);
}

+static void test_xfrm_tunnel(void)
+{
+ DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook,
+ .attach_point = BPF_TC_INGRESS);
+ struct test_tunnel_kern *skel = NULL;
+ struct nstoken *nstoken;
+ int tc_prog_fd;
+ int ifindex;
+ int err;
+
+ err = add_xfrm_tunnel();
+ if (!ASSERT_OK(err, "add_xfrm_tunnel"))
+ return;
+
+ skel = test_tunnel_kern__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load"))
+ goto done;
+
+ ifindex = if_nametoindex("veth1");
+ if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex"))
+ goto done;
+
+ /* attach tc prog to tunnel dev */
+ tc_hook.ifindex = ifindex;
+ tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state);
+ if (!ASSERT_GE(tc_prog_fd, 0, "bpf_program__fd"))
+ goto done;
+ if (attach_tc_prog(&tc_hook, tc_prog_fd, -1))
+ goto done;
+
+ /* ping from at_ns0 namespace test */
+ nstoken = open_netns("at_ns0");
+ err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
+ close_netns(nstoken);
+ if (!ASSERT_OK(err, "test_ping"))
+ goto done;
+
+ if (!ASSERT_EQ(skel->bss->xfrm_reqid, 1, "req_id"))
+ goto done;
+ if (!ASSERT_EQ(skel->bss->xfrm_spi, XFRM_SPI_IN_TO_OUT, "spi"))
+ goto done;
+ if (!ASSERT_EQ(skel->bss->xfrm_remote_ip, 0xac100164, "remote_ip"))
+ goto done;
+
+done:
+ delete_xfrm_tunnel();
+ if (skel)
+ test_tunnel_kern__destroy(skel);
+}
+
#define RUN_TEST(name, ...) \
({ \
if (test__start_subtest(#name)) { \
@@ -549,6 +691,7 @@ static void *test_tunnel_run_tests(void *arg)
RUN_TEST(ipip_tunnel, NONE);
RUN_TEST(ipip_tunnel, FOU);
RUN_TEST(ipip_tunnel, GUE);
+ RUN_TEST(xfrm_tunnel);

cleanup();

diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index b320fb7bb080..3a59eb9c34de 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -929,6 +929,10 @@ int ip6ip6_get_tunnel(struct __sk_buff *skb)
return TC_ACT_OK;
}

+volatile int xfrm_reqid = 0;
+volatile int xfrm_spi = 0;
+volatile int xfrm_remote_ip = 0;
+
SEC("tc")
int xfrm_get_state(struct __sk_buff *skb)
{
@@ -939,9 +943,10 @@ int xfrm_get_state(struct __sk_buff *skb)
if (ret < 0)
return TC_ACT_OK;

- bpf_printk("reqid %d spi 0x%x remote ip 0x%x\n",
- x.reqid, bpf_ntohl(x.spi),
- bpf_ntohl(x.remote_ipv4));
+ xfrm_reqid = x.reqid;
+ xfrm_spi = bpf_ntohl(x.spi);
+ xfrm_remote_ip = bpf_ntohl(x.remote_ipv4);
+
return TC_ACT_OK;
}

diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh
index 2dec7dbf29a2..d9661b9988ba 100755
--- a/tools/testing/selftests/bpf/test_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tunnel.sh
@@ -517,90 +517,6 @@ test_ip6ip6()
echo -e ${GREEN}"PASS: ip6$TYPE"${NC}
}

-setup_xfrm_tunnel()
-{
- auth=0x$(printf '1%.0s' {1..40})
- enc=0x$(printf '2%.0s' {1..32})
- spi_in_to_out=0x1
- spi_out_to_in=0x2
- # at_ns0 namespace
- # at_ns0 -> root
- ip netns exec at_ns0 \
- ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
- spi $spi_in_to_out reqid 1 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip netns exec at_ns0 \
- ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir out \
- tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
- mode tunnel
- # root -> at_ns0
- ip netns exec at_ns0 \
- ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
- spi $spi_out_to_in reqid 2 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip netns exec at_ns0 \
- ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir in \
- tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
- mode tunnel
- # address & route
- ip netns exec at_ns0 \
- ip addr add dev veth0 10.1.1.100/32
- ip netns exec at_ns0 \
- ip route add 10.1.1.200 dev veth0 via 172.16.1.200 \
- src 10.1.1.100
-
- # root namespace
- # at_ns0 -> root
- ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
- spi $spi_in_to_out reqid 1 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir in \
- tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
- mode tunnel
- # root -> at_ns0
- ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
- spi $spi_out_to_in reqid 2 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir out \
- tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
- mode tunnel
- # address & route
- ip addr add dev veth1 10.1.1.200/32
- ip route add 10.1.1.100 dev veth1 via 172.16.1.100 src 10.1.1.200
-}
-
-test_xfrm_tunnel()
-{
- if [[ -e /sys/kernel/tracing/trace ]]; then
- TRACE=/sys/kernel/tracing/trace
- else
- TRACE=/sys/kernel/debug/tracing/trace
- fi
- config_device
- > ${TRACE}
- setup_xfrm_tunnel
- mkdir -p ${BPF_PIN_TUNNEL_DIR}
- bpftool prog loadall ${BPF_FILE} ${BPF_PIN_TUNNEL_DIR}
- tc qdisc add dev veth1 clsact
- tc filter add dev veth1 proto ip ingress bpf da object-pinned \
- ${BPF_PIN_TUNNEL_DIR}/xfrm_get_state
- ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
- sleep 1
- grep "reqid 1" ${TRACE}
- check_err $?
- grep "spi 0x1" ${TRACE}
- check_err $?
- grep "remote ip 0xac100164" ${TRACE}
- check_err $?
- cleanup
-
- if [ $ret -ne 0 ]; then
- echo -e ${RED}"FAIL: xfrm tunnel"${NC}
- return 1
- fi
- echo -e ${GREEN}"PASS: xfrm tunnel"${NC}
-}
-
attach_bpf()
{
DEV=$1
@@ -630,10 +546,6 @@ cleanup()
ip link del ip6geneve11 2> /dev/null
ip link del erspan11 2> /dev/null
ip link del ip6erspan11 2> /dev/null
- ip xfrm policy delete dir out src 10.1.1.200/32 dst 10.1.1.100/32 2> /dev/null
- ip xfrm policy delete dir in src 10.1.1.100/32 dst 10.1.1.200/32 2> /dev/null
- ip xfrm state delete src 172.16.1.100 dst 172.16.1.200 proto esp spi 0x1 2> /dev/null
- ip xfrm state delete src 172.16.1.200 dst 172.16.1.100 proto esp spi 0x2 2> /dev/null
}

cleanup_exit()
@@ -716,10 +628,6 @@ bpf_tunnel_test()
test_ip6ip6
errors=$(( $errors + $? ))

- echo "Testing IPSec tunnel..."
- test_xfrm_tunnel
- errors=$(( $errors + $? ))
-
return $errors
}

--
2.42.1

2023-11-28 17:56:36

by Daniel Xu

[permalink] [raw]

Subject: [PATCH ipsec-next v2 2/6] bpf: xfrm: Add bpf_xdp_xfrm_state_release() kfunc

This kfunc releases a previously acquired xfrm_state from
bpf_xdp_get_xfrm_state().

Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
net/xfrm/xfrm_state_bpf.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c
index 1681825db506..1485b9da9425 100644
--- a/net/xfrm/xfrm_state_bpf.c
+++ b/net/xfrm/xfrm_state_bpf.c
@@ -94,10 +94,26 @@ bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32
return x;
}

+/* bpf_xdp_xfrm_state_release - Release acquired xfrm_state object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain in the program in all of the explored
+ * states.
+ *
+ * Parameters:
+ * @x - Pointer to referenced xfrm_state object, obtained using
+ * bpf_xdp_get_xfrm_state.
+ */
+__bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x)
+{
+ xfrm_state_put(x);
+}
+
__bpf_kfunc_end_defs();

BTF_SET8_START(xfrm_state_kfunc_set)
BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE)
BTF_SET8_END(xfrm_state_kfunc_set)

static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
--
2.42.1

2023-11-28 17:56:40

by Daniel Xu

[permalink] [raw]

Subject: [PATCH ipsec-next v2 3/6] libbpf: Add BPF_CORE_WRITE_BITFIELD() macro

Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
writing wrapper to make the verifier happy.

Two alternatives to this approach are:

1. Use the upcoming `preserve_static_offset` [0] attribute to disable
CO-RE on specific structs.
2. Use broader byte-sized writes to write to bitfields.

(1) is a bit a bit hard to use. It requires specific and
not-very-obvious annotations to bpftool generated vmlinux.h. It's also
not generally available in released LLVM versions yet.

(2) makes the code quite hard to read and write. And especially if
BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
to have an inverse helper for writing.

[0]: https://reviews.llvm.org/D133361
From: Eduard Zingerman <[email protected]>

Signed-off-by: Daniel Xu <[email protected]>
---
tools/lib/bpf/bpf_core_read.h | 36 +++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)

diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
index 1ac57bb7ac55..7a764f65d299 100644
--- a/tools/lib/bpf/bpf_core_read.h
+++ b/tools/lib/bpf/bpf_core_read.h
@@ -111,6 +111,42 @@ enum bpf_enum_value_kind {
val; \
})

+/*
+ * Write to a bitfield, identified by s->field.
+ * This is the inverse of BPF_CORE_WRITE_BITFIELD().
+ */
+#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \
+ void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \
+ unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
+ unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
+ unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
+ unsigned int bit_size = (rshift - lshift); \
+ unsigned long long nval, val, hi, lo; \
+ \
+ asm volatile("" : "+r"(p)); \
+ \
+ switch (byte_size) { \
+ case 1: val = *(unsigned char *)p; break; \
+ case 2: val = *(unsigned short *)p; break; \
+ case 4: val = *(unsigned int *)p; break; \
+ case 8: val = *(unsigned long long *)p; break; \
+ } \
+ hi = val >> (bit_size + rshift); \
+ hi <<= bit_size + rshift; \
+ lo = val << (bit_size + lshift); \
+ lo >>= bit_size + lshift; \
+ nval = new_val; \
+ nval <<= lshift; \
+ nval >>= rshift; \
+ val = hi | nval | lo; \
+ switch (byte_size) { \
+ case 1: *(unsigned char *)p = val; break; \
+ case 2: *(unsigned short *)p = val; break; \
+ case 4: *(unsigned int *)p = val; break; \
+ case 8: *(unsigned long long *)p = val; break; \
+ } \
+})
+
#define ___bpf_field_ref1(field) (field)
#define ___bpf_field_ref2(type, field) (((typeof(type) *)0)->field)
#define ___bpf_field_ref(args...) \
--
2.42.1

2023-11-28 17:56:43

by Daniel Xu

[permalink] [raw]

Subject: [PATCH ipsec-next v2 4/6] bpf: selftests: test_tunnel: Use vmlinux.h declarations

vmlinux.h declarations are more ergnomic, especially when working with
kfuncs. The uapi headers are often incomplete for kfunc definitions.

This commit also switches bitfield accesses to use CO-RE helpers.
Switching to vmlinux.h definitions makes the verifier very
unhappy with raw bitfield accesses. The error is:

; md.u.md2.dir = direction;
33: (69) r1 = *(u16 *)(r2 +11)
misaligned stack access off (0x0; 0x0)+-64+11 size 2

Fix by using CO-RE-aware bitfield reads and writes.

Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/progs/bpf_tracing_net.h | 1 +
.../selftests/bpf/progs/test_tunnel_kern.c | 76 +++++--------------
2 files changed, 22 insertions(+), 55 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index 0b793a102791..1bdc680b0e0e 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -26,6 +26,7 @@
#define IPV6_AUTOFLOWLABEL 70

#define TC_ACT_UNSPEC (-1)
+#define TC_ACT_OK 0
#define TC_ACT_SHOT 2

#define SOL_TCP 6
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index f66af753bbbb..b320fb7bb080 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -6,62 +6,26 @@
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
-#include <stddef.h>
-#include <string.h>
-#include <arpa/inet.h>
-#include <linux/bpf.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/if_tunnel.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/icmp.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/pkt_cls.h>
-#include <linux/erspan.h>
-#include <linux/udp.h>
+#include "vmlinux.h"
+#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
+#include "bpf_kfuncs.h"
+#include "bpf_tracing_net.h"

#define log_err(__ret) bpf_printk("ERROR line:%d ret:%d\n", __LINE__, __ret)

-#define VXLAN_UDP_PORT 4789
+#define VXLAN_UDP_PORT 4789
+#define ETH_P_IP 0x0800
+#define PACKET_HOST 0
+#define TUNNEL_CSUM bpf_htons(0x01)
+#define TUNNEL_KEY bpf_htons(0x04)

/* Only IPv4 address assigned to veth1.
* 172.16.1.200
*/
#define ASSIGNED_ADDR_VETH1 0xac1001c8

-struct geneve_opt {
- __be16 opt_class;
- __u8 type;
- __u8 length:5;
- __u8 r3:1;
- __u8 r2:1;
- __u8 r1:1;
- __u8 opt_data[8]; /* hard-coded to 8 byte */
-};
-
-struct vxlanhdr {
- __be32 vx_flags;
- __be32 vx_vni;
-} __attribute__((packed));
-
-struct vxlan_metadata {
- __u32 gbp;
-};
-
-struct bpf_fou_encap {
- __be16 sport;
- __be16 dport;
-};
-
-enum bpf_fou_encap_type {
- FOU_BPF_ENCAP_FOU,
- FOU_BPF_ENCAP_GUE,
-};
-
int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap, int type) __ksym;
int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
@@ -205,9 +169,9 @@ int erspan_set_tunnel(struct __sk_buff *skb)
__u8 hwid = 7;

md.version = 2;
- md.u.md2.dir = direction;
- md.u.md2.hwid = hwid & 0xf;
- md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, dir, direction);
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid, (hwid & 0xf));
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid_upper, (hwid >> 4) & 0x3);
#endif

ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
@@ -246,8 +210,9 @@ int erspan_get_tunnel(struct __sk_buff *skb)
bpf_printk("\tindex %x\n", index);
#else
bpf_printk("\tdirection %d hwid %x timestamp %u\n",
- md.u.md2.dir,
- (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+ BPF_CORE_READ_BITFIELD(&md.u.md2, dir),
+ (BPF_CORE_READ_BITFIELD(&md.u.md2, hwid_upper) << 4) +
+ BPF_CORE_READ_BITFIELD(&md.u.md2, hwid),
bpf_ntohl(md.u.md2.timestamp));
#endif

@@ -284,9 +249,9 @@ int ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
__u8 hwid = 17;

md.version = 2;
- md.u.md2.dir = direction;
- md.u.md2.hwid = hwid & 0xf;
- md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, dir, direction);
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid, (hwid & 0xf));
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid_upper, (hwid >> 4) & 0x3);
#endif

ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
@@ -326,8 +291,9 @@ int ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
bpf_printk("\tindex %x\n", index);
#else
bpf_printk("\tdirection %d hwid %x timestamp %u\n",
- md.u.md2.dir,
- (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+ BPF_CORE_READ_BITFIELD(&md.u.md2, dir),
+ (BPF_CORE_READ_BITFIELD(&md.u.md2, hwid_upper) << 4) +
+ BPF_CORE_READ_BITFIELD(&md.u.md2, hwid),
bpf_ntohl(md.u.md2.timestamp));
#endif

--
2.42.1

2023-11-28 17:56:44

by Daniel Xu

[permalink] [raw]

Subject: [PATCH ipsec-next v2 6/6] bpf: xfrm: Add selftest for bpf_xdp_get_xfrm_state()

This commit extends test_tunnel selftest to test the new XDP xfrm state
lookup kfunc.

Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/prog_tests/test_tunnel.c | 20 ++++++--
.../selftests/bpf/progs/test_tunnel_kern.c | 51 +++++++++++++++++++
2 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
index 3bcb6f96b9b5..54308afb3cdc 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
@@ -278,7 +278,7 @@ static int add_xfrm_tunnel(void)
SYS(fail,
"ip netns exec at_ns0 "
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 1 mode tunnel "
+ "spi %d reqid 1 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -292,7 +292,7 @@ static int add_xfrm_tunnel(void)
SYS(fail,
"ip netns exec at_ns0 "
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 2 mode tunnel "
+ "spi %d reqid 2 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -313,7 +313,7 @@ static int add_xfrm_tunnel(void)
*/
SYS(fail,
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 1 mode tunnel "
+ "spi %d reqid 1 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -325,7 +325,7 @@ static int add_xfrm_tunnel(void)
/* root -> at_ns0 */
SYS(fail,
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 2 mode tunnel "
+ "spi %d reqid 2 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -628,8 +628,10 @@ static void test_xfrm_tunnel(void)
{
DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook,
.attach_point = BPF_TC_INGRESS);
+ LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
struct test_tunnel_kern *skel = NULL;
struct nstoken *nstoken;
+ int xdp_prog_fd;
int tc_prog_fd;
int ifindex;
int err;
@@ -654,6 +656,14 @@ static void test_xfrm_tunnel(void)
if (attach_tc_prog(&tc_hook, tc_prog_fd, -1))
goto done;

+ /* attach xdp prog to tunnel dev */
+ xdp_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state_xdp);
+ if (!ASSERT_GE(xdp_prog_fd, 0, "bpf_program__fd"))
+ goto done;
+ err = bpf_xdp_attach(ifindex, xdp_prog_fd, XDP_FLAGS_REPLACE, &opts);
+ if (!ASSERT_OK(err, "bpf_xdp_attach"))
+ goto done;
+
/* ping from at_ns0 namespace test */
nstoken = open_netns("at_ns0");
err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
@@ -667,6 +677,8 @@ static void test_xfrm_tunnel(void)
goto done;
if (!ASSERT_EQ(skel->bss->xfrm_remote_ip, 0xac100164, "remote_ip"))
goto done;
+ if (!ASSERT_EQ(skel->bss->xfrm_replay_window, 42, "replay_window"))
+ goto done;

done:
delete_xfrm_tunnel();
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index 3a59eb9c34de..c0dd38616562 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -30,6 +30,10 @@ int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap, int type) __ksym;
int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap) __ksym;
+struct xfrm_state *
+bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts,
+ u32 opts__sz) __ksym;
+void bpf_xdp_xfrm_state_release(struct xfrm_state *x) __ksym;

struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
@@ -950,4 +954,51 @@ int xfrm_get_state(struct __sk_buff *skb)
return TC_ACT_OK;
}

+volatile int xfrm_replay_window = 0;
+
+SEC("xdp")
+int xfrm_get_state_xdp(struct xdp_md *xdp)
+{
+ struct bpf_xfrm_state_opts opts = {};
+ struct xfrm_state *x = NULL;
+ struct ip_esp_hdr *esph;
+ struct bpf_dynptr ptr;
+ u8 esph_buf[8] = {};
+ u8 iph_buf[20] = {};
+ struct iphdr *iph;
+ u32 off;
+
+ if (bpf_dynptr_from_xdp(xdp, 0, &ptr))
+ goto out;
+
+ off = sizeof(struct ethhdr);
+ iph = bpf_dynptr_slice(&ptr, off, iph_buf, sizeof(iph_buf));
+ if (!iph || iph->protocol != IPPROTO_ESP)
+ goto out;
+
+ off += sizeof(struct iphdr);
+ esph = bpf_dynptr_slice(&ptr, off, esph_buf, sizeof(esph_buf));
+ if (!esph)
+ goto out;
+
+ opts.netns_id = BPF_F_CURRENT_NETNS;
+ opts.daddr.a4 = iph->daddr;
+ opts.spi = esph->spi;
+ opts.proto = IPPROTO_ESP;
+ opts.family = AF_INET;
+
+ x = bpf_xdp_get_xfrm_state(xdp, &opts, sizeof(opts));
+ if (!x || opts.error)
+ goto out;
+
+ if (!x->replay_esn)
+ goto out;
+
+ xfrm_replay_window = x->replay_esn->replay_window;
+out:
+ if (x)
+ bpf_xdp_xfrm_state_release(x);
+ return XDP_PASS;
+}
+
char _license[] SEC("license") = "GPL";
--
2.42.1

2023-11-28 17:59:52

by Eduard Zingerman

[permalink] [raw]

Subject: Re: [PATCH ipsec-next v2 3/6] libbpf: Add BPF_CORE_WRITE_BITFIELD() macro

On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> writing wrapper to make the verifier happy.
>
> Two alternatives to this approach are:
>
> 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> CO-RE on specific structs.
> 2. Use broader byte-sized writes to write to bitfields.
>
> (1) is a bit a bit hard to use. It requires specific and
> not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> not generally available in released LLVM versions yet.
>
> (2) makes the code quite hard to read and write. And especially if
> BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> to have an inverse helper for writing.
>
> [0]: https://reviews.llvm.org/D133361
> From: Eduard Zingerman <[email protected]>
>
> Signed-off-by: Daniel Xu <[email protected]>
> ---

Could you please also add a selftest (or several) using __retval()
annotation for this macro?

2023-11-28 19:16:32

by Daniel Xu

[permalink] [raw]

Subject: Re: [PATCH ipsec-next v2 3/6] libbpf: Add BPF_CORE_WRITE_BITFIELD() macro

2023-12-01 01:34:00

by Daniel Xu

[permalink] [raw]

Subject: Re: [PATCH ipsec-next v2 3/6] libbpf: Add BPF_CORE_WRITE_BITFIELD() macro

On Tue, Nov 28, 2023 at 07:59:01PM +0200, Eduard Zingerman wrote:
> On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > writing wrapper to make the verifier happy.
> >
> > Two alternatives to this approach are:
> >
> > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > CO-RE on specific structs.
> > 2. Use broader byte-sized writes to write to bitfields.
> >
> > (1) is a bit a bit hard to use. It requires specific and
> > not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> > not generally available in released LLVM versions yet.
> >
> > (2) makes the code quite hard to read and write. And especially if
> > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > to have an inverse helper for writing.
> >
> > [0]: https://reviews.llvm.org/D133361
> > From: Eduard Zingerman <[email protected]>
> >
> > Signed-off-by: Daniel Xu <[email protected]>
> > ---
>
> Could you please also add a selftest (or several) using __retval()
> annotation for this macro?

Good call about adding tests -- I found a few bugs with the code from
the other thread. But boy did they take a lot of brain cells to figure
out.

There was some 6th grade algebra involved too -- I'll do my best to
explain it in the commit msg for v3.

Here are the fixes in case you are curious:

diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
index 7a764f65d299..8f02c558c0ff 100644
--- a/tools/lib/bpf/bpf_core_read.h
+++ b/tools/lib/bpf/bpf_core_read.h
@@ -120,7 +120,9 @@ enum bpf_enum_value_kind {
unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
- unsigned int bit_size = (rshift - lshift); \
+ unsigned int bit_size = (64 - rshift); \
+ unsigned int hi_size = lshift; \
+ unsigned int lo_size = (rshift - lshift); \
unsigned long long nval, val, hi, lo; \
\
asm volatile("" : "+r"(p)); \
@@ -131,13 +133,13 @@ enum bpf_enum_value_kind {
case 4: val = *(unsigned int *)p; break; \
case 8: val = *(unsigned long long *)p; break; \
} \
- hi = val >> (bit_size + rshift); \
- hi <<= bit_size + rshift; \
- lo = val << (bit_size + lshift); \
- lo >>= bit_size + lshift; \
+ hi = val >> (64 - hi_size); \
+ hi <<= 64 - hi_size; \
+ lo = val << (64 - lo_size); \
+ lo >>= 64 - lo_size; \
nval = new_val; \
- nval <<= lshift; \
- nval >>= rshift; \
+ nval <<= (64 - bit_size); \
+ nval >>= (64 - bit_size - lo_size); \
val = hi | nval | lo; \
switch (byte_size) { \
case 1: *(unsigned char *)p = val; break; \

Thanks,
Daniel

2023-12-01 16:13:37

by Eduard Zingerman

[permalink] [raw]

Subject: Re: [PATCH ipsec-next v2 3/6] libbpf: Add BPF_CORE_WRITE_BITFIELD() macro

On Thu, 2023-11-30 at 18:33 -0700, Daniel Xu wrote:
[...]
> Good call about adding tests -- I found a few bugs with the code from
> the other thread. But boy did they take a lot of brain cells to figure
> out.
>
> There was some 6th grade algebra involved too -- I'll do my best to
> explain it in the commit msg for v3.
>
> Here are the fixes in case you are curious:

Ouch, I knew my code from 3am can't be trusted, sorry for that.
Your math seem to make sense, thank you.

[...]

2023-12-01 19:14:55

by Andrii Nakryiko

[permalink] [raw]

Subject: Re: [PATCH ipsec-next v2 3/6] libbpf: Add BPF_CORE_WRITE_BITFIELD() macro

On Fri, Dec 1, 2023 at 11:11 AM Andrii Nakryiko
<[email protected]> wrote:
>
> On Thu, Nov 30, 2023 at 5:33 PM Daniel Xu <[email protected]> wrote:
> >
> > On Tue, Nov 28, 2023 at 07:59:01PM +0200, Eduard Zingerman wrote:
> > > On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> > > > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > > > writing wrapper to make the verifier happy.
> > > >
> > > > Two alternatives to this approach are:
> > > >
> > > > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > > > CO-RE on specific structs.
> > > > 2. Use broader byte-sized writes to write to bitfields.
> > > >
> > > > (1) is a bit a bit hard to use. It requires specific and
> > > > not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> > > > not generally available in released LLVM versions yet.
> > > >
> > > > (2) makes the code quite hard to read and write. And especially if
> > > > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > > > to have an inverse helper for writing.
> > > >
> > > > [0]: https://reviews.llvm.org/D133361
> > > > From: Eduard Zingerman <[email protected]>
> > > >
> > > > Signed-off-by: Daniel Xu <[email protected]>
> > > > ---
> > >
> > > Could you please also add a selftest (or several) using __retval()
> > > annotation for this macro?
> >
> > Good call about adding tests -- I found a few bugs with the code from
> > the other thread. But boy did they take a lot of brain cells to figure
> > out.
> >
> > There was some 6th grade algebra involved too -- I'll do my best to
> > explain it in the commit msg for v3.
> >
> >
> > Here are the fixes in case you are curious:
> >
> > diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
> > index 7a764f65d299..8f02c558c0ff 100644
> > --- a/tools/lib/bpf/bpf_core_read.h
> > +++ b/tools/lib/bpf/bpf_core_read.h
> > @@ -120,7 +120,9 @@ enum bpf_enum_value_kind {
> > unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
> > unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
> > unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
> > - unsigned int bit_size = (rshift - lshift); \
> > + unsigned int bit_size = (64 - rshift); \
> > + unsigned int hi_size = lshift; \
> > + unsigned int lo_size = (rshift - lshift); \
>
> nit: let's drop unnecessary ()
>
> > unsigned long long nval, val, hi, lo; \
> > \
> > asm volatile("" : "+r"(p)); \
> > @@ -131,13 +133,13 @@ enum bpf_enum_value_kind {
> > case 4: val = *(unsigned int *)p; break; \
> > case 8: val = *(unsigned long long *)p; break; \
> > } \
> > - hi = val >> (bit_size + rshift); \
> > - hi <<= bit_size + rshift; \
> > - lo = val << (bit_size + lshift); \
> > - lo >>= bit_size + lshift; \
> > + hi = val >> (64 - hi_size); \
> > + hi <<= 64 - hi_size; \
> > + lo = val << (64 - lo_size); \
> > + lo >>= 64 - lo_size; \
> > nval = new_val; \
> > - nval <<= lshift; \
> > - nval >>= rshift; \
> > + nval <<= (64 - bit_size); \
> > + nval >>= (64 - bit_size - lo_size); \
> > val = hi | nval | lo; \
>
> this looks.. unusual. I'd imagine we calculate a mask, mask out bits
> we are replacing, and then OR with new values, roughly (assuming all
> the right left/right shift values and stuff)
>
> /* clear bits */
> val &= ~(bitfield_mask << shift);

we can also calculate shifted mask with just

bitfield_mask = (-1ULL) << some_left_shift >> some_right_shift;
val &= ~bitfield_mask;

> /* set bits */
> val |= (nval & bitfield_mask) << shift;
>
> ?
>
> > switch (byte_size) { \
> > case 1: *(unsigned char *)p = val; break; \
> >
> >
> > Thanks,
> > Daniel

2023-12-01 19:31:29

by Andrii Nakryiko

[permalink] [raw]

Subject: Re: [PATCH ipsec-next v2 3/6] libbpf: Add BPF_CORE_WRITE_BITFIELD() macro

On Thu, Nov 30, 2023 at 5:33 PM Daniel Xu <[email protected]> wrote:
>
> On Tue, Nov 28, 2023 at 07:59:01PM +0200, Eduard Zingerman wrote:
> > On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> > > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > > writing wrapper to make the verifier happy.
> > >
> > > Two alternatives to this approach are:
> > >
> > > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > > CO-RE on specific structs.
> > > 2. Use broader byte-sized writes to write to bitfields.
> > >
> > > (1) is a bit a bit hard to use. It requires specific and
> > > not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> > > not generally available in released LLVM versions yet.
> > >
> > > (2) makes the code quite hard to read and write. And especially if
> > > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > > to have an inverse helper for writing.
> > >
> > > [0]: https://reviews.llvm.org/D133361
> > > From: Eduard Zingerman <[email protected]>
> > >
> > > Signed-off-by: Daniel Xu <[email protected]>
> > > ---
> >
> > Could you please also add a selftest (or several) using __retval()
> > annotation for this macro?
>
> Good call about adding tests -- I found a few bugs with the code from
> the other thread. But boy did they take a lot of brain cells to figure
> out.
>
> There was some 6th grade algebra involved too -- I'll do my best to
> explain it in the commit msg for v3.
>
>
> Here are the fixes in case you are curious:
>
> diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
> index 7a764f65d299..8f02c558c0ff 100644
> --- a/tools/lib/bpf/bpf_core_read.h
> +++ b/tools/lib/bpf/bpf_core_read.h
> @@ -120,7 +120,9 @@ enum bpf_enum_value_kind {
> unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
> unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
> unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
> - unsigned int bit_size = (rshift - lshift); \
> + unsigned int bit_size = (64 - rshift); \
> + unsigned int hi_size = lshift; \
> + unsigned int lo_size = (rshift - lshift); \

nit: let's drop unnecessary ()

> unsigned long long nval, val, hi, lo; \
> \
> asm volatile("" : "+r"(p)); \
> @@ -131,13 +133,13 @@ enum bpf_enum_value_kind {
> case 4: val = *(unsigned int *)p; break; \
> case 8: val = *(unsigned long long *)p; break; \
> } \
> - hi = val >> (bit_size + rshift); \
> - hi <<= bit_size + rshift; \
> - lo = val << (bit_size + lshift); \
> - lo >>= bit_size + lshift; \
> + hi = val >> (64 - hi_size); \
> + hi <<= 64 - hi_size; \
> + lo = val << (64 - lo_size); \
> + lo >>= 64 - lo_size; \
> nval = new_val; \
> - nval <<= lshift; \
> - nval >>= rshift; \
> + nval <<= (64 - bit_size); \
> + nval >>= (64 - bit_size - lo_size); \
> val = hi | nval | lo; \

this looks.. unusual. I'd imagine we calculate a mask, mask out bits
we are replacing, and then OR with new values, roughly (assuming all
the right left/right shift values and stuff)

/* clear bits */
val &= ~(bitfield_mask << shift);
/* set bits */
val |= (nval & bitfield_mask) << shift;

?

> switch (byte_size) { \
> case 1: *(unsigned char *)p = val; break; \
>
>
> Thanks,
> Daniel

2023-12-01 20:07:10

by Daniel Xu

[permalink] [raw]

Subject: Re: [PATCH ipsec-next v2 3/6] libbpf: Add BPF_CORE_WRITE_BITFIELD() macro

On Fri, Dec 01, 2023 at 11:13:13AM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 1, 2023 at 11:11 AM Andrii Nakryiko
> <[email protected]> wrote:
> >
> > On Thu, Nov 30, 2023 at 5:33 PM Daniel Xu <[email protected]> wrote:
> > >
> > > On Tue, Nov 28, 2023 at 07:59:01PM +0200, Eduard Zingerman wrote:
> > > > On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> > > > > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > > > > writing wrapper to make the verifier happy.
> > > > >
> > > > > Two alternatives to this approach are:
> > > > >
> > > > > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > > > > CO-RE on specific structs.
> > > > > 2. Use broader byte-sized writes to write to bitfields.
> > > > >
> > > > > (1) is a bit a bit hard to use. It requires specific and
> > > > > not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> > > > > not generally available in released LLVM versions yet.
> > > > >
> > > > > (2) makes the code quite hard to read and write. And especially if
> > > > > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > > > > to have an inverse helper for writing.
> > > > >
> > > > > [0]: https://reviews.llvm.org/D133361
> > > > > From: Eduard Zingerman <[email protected]>
> > > > >
> > > > > Signed-off-by: Daniel Xu <[email protected]>
> > > > > ---
> > > >
> > > > Could you please also add a selftest (or several) using __retval()
> > > > annotation for this macro?
> > >
> > > Good call about adding tests -- I found a few bugs with the code from
> > > the other thread. But boy did they take a lot of brain cells to figure
> > > out.
> > >
> > > There was some 6th grade algebra involved too -- I'll do my best to
> > > explain it in the commit msg for v3.
> > >
> > >
> > > Here are the fixes in case you are curious:
> > >
> > > diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
> > > index 7a764f65d299..8f02c558c0ff 100644
> > > --- a/tools/lib/bpf/bpf_core_read.h
> > > +++ b/tools/lib/bpf/bpf_core_read.h
> > > @@ -120,7 +120,9 @@ enum bpf_enum_value_kind {
> > > unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
> > > unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
> > > unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
> > > - unsigned int bit_size = (rshift - lshift); \
> > > + unsigned int bit_size = (64 - rshift); \
> > > + unsigned int hi_size = lshift; \
> > > + unsigned int lo_size = (rshift - lshift); \
> >
> > nit: let's drop unnecessary ()
> >
> > > unsigned long long nval, val, hi, lo; \
> > > \
> > > asm volatile("" : "+r"(p)); \
> > > @@ -131,13 +133,13 @@ enum bpf_enum_value_kind {
> > > case 4: val = *(unsigned int *)p; break; \
> > > case 8: val = *(unsigned long long *)p; break; \
> > > } \
> > > - hi = val >> (bit_size + rshift); \
> > > - hi <<= bit_size + rshift; \
> > > - lo = val << (bit_size + lshift); \
> > > - lo >>= bit_size + lshift; \
> > > + hi = val >> (64 - hi_size); \
> > > + hi <<= 64 - hi_size; \
> > > + lo = val << (64 - lo_size); \
> > > + lo >>= 64 - lo_size; \
> > > nval = new_val; \
> > > - nval <<= lshift; \
> > > - nval >>= rshift; \
> > > + nval <<= (64 - bit_size); \
> > > + nval >>= (64 - bit_size - lo_size); \
> > > val = hi | nval | lo; \
> >
> > this looks.. unusual. I'd imagine we calculate a mask, mask out bits
> > we are replacing, and then OR with new values, roughly (assuming all
> > the right left/right shift values and stuff)
> >
> > /* clear bits */
> > val &= ~(bitfield_mask << shift);
>
> we can also calculate shifted mask with just
>
> bitfield_mask = (-1ULL) << some_left_shift >> some_right_shift;
> val &= ~bitfield_mask;

Yeah I was chatting w/ JonathanL about this and I've got basically that
code ready to send for v3.

Thanks,
Daniel