This patchset adds two kfunc helpers, bpf_xdp_get_xfrm_state() and
bpf_xdp_xfrm_state_release() that wrap xfrm_state_lookup() and
xfrm_state_put(). The intent is to support software RSS (via XDP) for
the ongoing/upcoming ipsec pcpu work [0]. Recent experiments performed
on (hopefully) reproducible AWS testbeds indicate that single tunnel
pcpu ipsec can reach line rate on 100G ENA nics.
Note this patchset only tests/shows generic xfrm_state access. The
"secret sauce" (if you can really even call it that) involves accessing
a soon-to-be-upstreamed pcpu_num field in xfrm_state. Early example is
available here [1].
[0]: https://datatracker.ietf.org/doc/draft-ietf-ipsecme-multi-sa-performance/03/
[1]: https://github.com/danobi/xdp-tools/blob/e89a1c617aba3b50d990f779357d6ce2863ecb27/xdp-bench/xdp_redirect_cpumap.bpf.c#L385-L406
Changes from v1:
* Move xfrm tunnel tests to test_progs
* Fix writing to opts->error when opts is invalid
* Use __bpf_kfunc_start_defs()
* Remove unused vxlanhdr definition
* Add and use BPF_CORE_WRITE_BITFIELD() macro
* Make series bisect clean
Changes from RFCv2:
* Rebased to ipsec-next
* Fix netns leak
Changes from RFCv1:
* Add Antony's commit tags
* Add KF_ACQUIRE and KF_RELEASE semantics
Daniel Xu (6):
bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc
bpf: xfrm: Add bpf_xdp_xfrm_state_release() kfunc
libbpf: Add BPF_CORE_WRITE_BITFIELD() macro
bpf: selftests: test_tunnel: Use vmlinux.h declarations
bpf: selftests: Move xfrm tunnel test to test_progs
bpf: xfrm: Add selftest for bpf_xdp_get_xfrm_state()
include/net/xfrm.h | 9 +
net/xfrm/Makefile | 1 +
net/xfrm/xfrm_policy.c | 2 +
net/xfrm/xfrm_state_bpf.c | 128 +++++++++++++++
tools/lib/bpf/bpf_core_read.h | 36 ++++
.../selftests/bpf/prog_tests/test_tunnel.c | 155 ++++++++++++++++++
.../selftests/bpf/progs/bpf_tracing_net.h | 1 +
.../selftests/bpf/progs/test_tunnel_kern.c | 138 +++++++++-------
tools/testing/selftests/bpf/test_tunnel.sh | 92 -----------
9 files changed, 412 insertions(+), 150 deletions(-)
create mode 100644 net/xfrm/xfrm_state_bpf.c
--
2.42.1
This commit adds an unstable kfunc helper to access internal xfrm_state
associated with an SA. This is intended to be used for the upcoming
IPsec pcpu work to assign special pcpu SAs to a particular CPU. In other
words: for custom software RSS.
That being said, the function that this kfunc wraps is fairly generic
and used for a lot of xfrm tasks. I'm sure people will find uses
elsewhere over time.
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
include/net/xfrm.h | 9 +++
net/xfrm/Makefile | 1 +
net/xfrm/xfrm_policy.c | 2 +
net/xfrm/xfrm_state_bpf.c | 112 ++++++++++++++++++++++++++++++++++++++
4 files changed, 124 insertions(+)
create mode 100644 net/xfrm/xfrm_state_bpf.c
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index c9bb0f892f55..1d107241b901 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void)
#endif
+#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF)
+int register_xfrm_state_bpf(void);
+#else
+static inline int register_xfrm_state_bpf(void)
+{
+ return 0;
+}
+#endif
+
#endif /* _NET_XFRM_H */
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index cd47f88921f5..547cec77ba03 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
+obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c13dc3ef7910..1b7e75159727 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -4218,6 +4218,8 @@ void __init xfrm_init(void)
#ifdef CONFIG_XFRM_ESPINTCP
espintcp_init();
#endif
+
+ register_xfrm_state_bpf();
}
#ifdef CONFIG_AUDITSYSCALL
diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c
new file mode 100644
index 000000000000..1681825db506
--- /dev/null
+++ b/net/xfrm/xfrm_state_bpf.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable XFRM state BPF helpers.
+ *
+ * Note that it is allowed to break compatibility for these functions since the
+ * interface they are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <net/xdp.h>
+#include <net/xfrm.h>
+
+/* bpf_xfrm_state_opts - Options for XFRM state lookup helpers
+ *
+ * Members:
+ * @error - Out parameter, set for any errors encountered
+ * Values:
+ * -EINVAL - netns_id is less than -1
+ * -EINVAL - opts__sz isn't BPF_XFRM_STATE_OPTS_SZ
+ * -ENONET - No network namespace found for netns_id
+ * @netns_id - Specify the network namespace for lookup
+ * Values:
+ * BPF_F_CURRENT_NETNS (-1)
+ * Use namespace associated with ctx
+ * [0, S32_MAX]
+ * Network Namespace ID
+ * @mark - XFRM mark to match on
+ * @daddr - Destination address to match on
+ * @spi - Security parameter index to match on
+ * @proto - L3 protocol to match on
+ * @family - L3 protocol family to match on
+ */
+struct bpf_xfrm_state_opts {
+ s32 error;
+ s32 netns_id;
+ u32 mark;
+ xfrm_address_t daddr;
+ __be32 spi;
+ u8 proto;
+ u16 family;
+};
+
+enum {
+ BPF_XFRM_STATE_OPTS_SZ = sizeof(struct bpf_xfrm_state_opts),
+};
+
+__bpf_kfunc_start_defs();
+
+/* bpf_xdp_get_xfrm_state - Get XFRM state
+ *
+ * Parameters:
+ * @ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @opts - Options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_xfrm_state_opts structure
+ * Must be BPF_XFRM_STATE_OPTS_SZ
+ */
+__bpf_kfunc struct xfrm_state *
+bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+ struct net *net = dev_net(xdp->rxq->dev);
+ struct xfrm_state *x;
+
+ if (!opts || opts__sz < sizeof(opts->error))
+ return NULL;
+
+ if (opts__sz != BPF_XFRM_STATE_OPTS_SZ) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net)) {
+ opts->error = -ENONET;
+ return NULL;
+ }
+ }
+
+ x = xfrm_state_lookup(net, opts->mark, &opts->daddr, opts->spi,
+ opts->proto, opts->family);
+
+ if (opts->netns_id >= 0)
+ put_net(net);
+
+ return x;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(xfrm_state_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
+BTF_SET8_END(xfrm_state_kfunc_set)
+
+static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &xfrm_state_kfunc_set,
+};
+
+int __init register_xfrm_state_bpf(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+ &xfrm_state_xdp_kfunc_set);
+}
--
2.42.1
test_progs is better than a shell script b/c C is a bit easier to
maintain than shell. Also it's easier to use new infra like memory
mapped global variables from C via bpf skeleton.
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/prog_tests/test_tunnel.c | 143 ++++++++++++++++++
.../selftests/bpf/progs/test_tunnel_kern.c | 11 +-
tools/testing/selftests/bpf/test_tunnel.sh | 92 -----------
3 files changed, 151 insertions(+), 95 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
index d149ab98798d..3bcb6f96b9b5 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
@@ -50,6 +50,7 @@
*/
#include <arpa/inet.h>
+#include <linux/if_link.h>
#include <linux/if_tun.h>
#include <linux/limits.h>
#include <linux/sysctl.h>
@@ -92,6 +93,11 @@
#define IPIP_TUNL_DEV0 "ipip00"
#define IPIP_TUNL_DEV1 "ipip11"
+#define XFRM_AUTH "0x1111111111111111111111111111111111111111"
+#define XFRM_ENC "0x22222222222222222222222222222222"
+#define XFRM_SPI_IN_TO_OUT 0x1
+#define XFRM_SPI_OUT_TO_IN 0x2
+
#define PING_ARGS "-i 0.01 -c 3 -w 10 -q"
static int config_device(void)
@@ -264,6 +270,92 @@ static void delete_ipip_tunnel(void)
SYS_NOFAIL("ip fou del port 5555 2> /dev/null");
}
+static int add_xfrm_tunnel(void)
+{
+ /* at_ns0 namespace
+ * at_ns0 -> root
+ */
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 1 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm policy add src %s/32 dst %s/32 dir out "
+ "tmpl src %s dst %s proto esp reqid 1 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);
+
+ /* root -> at_ns0 */
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 2 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm policy add src %s/32 dst %s/32 dir in "
+ "tmpl src %s dst %s proto esp reqid 2 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);
+
+ /* address & route */
+ SYS(fail, "ip netns exec at_ns0 ip addr add dev veth0 %s/32",
+ IP4_ADDR_TUNL_DEV0);
+ SYS(fail, "ip netns exec at_ns0 ip route add %s dev veth0 via %s src %s",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR1_VETH1, IP4_ADDR_TUNL_DEV0);
+
+ /* root namespace
+ * at_ns0 -> root
+ */
+ SYS(fail,
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 1 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip xfrm policy add src %s/32 dst %s/32 dir in "
+ "tmpl src %s dst %s proto esp reqid 1 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);
+
+ /* root -> at_ns0 */
+ SYS(fail,
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 2 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip xfrm policy add src %s/32 dst %s/32 dir out "
+ "tmpl src %s dst %s proto esp reqid 2 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);
+
+ /* address & route */
+ SYS(fail, "ip addr add dev veth1 %s/32", IP4_ADDR_TUNL_DEV1);
+ SYS(fail, "ip route add %s dev veth1 via %s src %s",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_VETH0, IP4_ADDR_TUNL_DEV1);
+
+ return 0;
+fail:
+ return -1;
+}
+
+static void delete_xfrm_tunnel(void)
+{
+ SYS_NOFAIL("ip xfrm policy delete dir out src %s/32 dst %s/32 2> /dev/null",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0);
+ SYS_NOFAIL("ip xfrm policy delete dir in src %s/32 dst %s/32 2> /dev/null",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1);
+ SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
+ IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT);
+ SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
+ IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN);
+}
+
static int test_ping(int family, const char *addr)
{
SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr);
@@ -532,6 +624,56 @@ static void test_ipip_tunnel(enum ipip_encap encap)
test_tunnel_kern__destroy(skel);
}
+static void test_xfrm_tunnel(void)
+{
+ DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook,
+ .attach_point = BPF_TC_INGRESS);
+ struct test_tunnel_kern *skel = NULL;
+ struct nstoken *nstoken;
+ int tc_prog_fd;
+ int ifindex;
+ int err;
+
+ err = add_xfrm_tunnel();
+ if (!ASSERT_OK(err, "add_xfrm_tunnel"))
+ return;
+
+ skel = test_tunnel_kern__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load"))
+ goto done;
+
+ ifindex = if_nametoindex("veth1");
+ if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex"))
+ goto done;
+
+ /* attach tc prog to tunnel dev */
+ tc_hook.ifindex = ifindex;
+ tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state);
+ if (!ASSERT_GE(tc_prog_fd, 0, "bpf_program__fd"))
+ goto done;
+ if (attach_tc_prog(&tc_hook, tc_prog_fd, -1))
+ goto done;
+
+ /* ping from at_ns0 namespace test */
+ nstoken = open_netns("at_ns0");
+ err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
+ close_netns(nstoken);
+ if (!ASSERT_OK(err, "test_ping"))
+ goto done;
+
+ if (!ASSERT_EQ(skel->bss->xfrm_reqid, 1, "req_id"))
+ goto done;
+ if (!ASSERT_EQ(skel->bss->xfrm_spi, XFRM_SPI_IN_TO_OUT, "spi"))
+ goto done;
+ if (!ASSERT_EQ(skel->bss->xfrm_remote_ip, 0xac100164, "remote_ip"))
+ goto done;
+
+done:
+ delete_xfrm_tunnel();
+ if (skel)
+ test_tunnel_kern__destroy(skel);
+}
+
#define RUN_TEST(name, ...) \
({ \
if (test__start_subtest(#name)) { \
@@ -549,6 +691,7 @@ static void *test_tunnel_run_tests(void *arg)
RUN_TEST(ipip_tunnel, NONE);
RUN_TEST(ipip_tunnel, FOU);
RUN_TEST(ipip_tunnel, GUE);
+ RUN_TEST(xfrm_tunnel);
cleanup();
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index b320fb7bb080..3a59eb9c34de 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -929,6 +929,10 @@ int ip6ip6_get_tunnel(struct __sk_buff *skb)
return TC_ACT_OK;
}
+volatile int xfrm_reqid = 0;
+volatile int xfrm_spi = 0;
+volatile int xfrm_remote_ip = 0;
+
SEC("tc")
int xfrm_get_state(struct __sk_buff *skb)
{
@@ -939,9 +943,10 @@ int xfrm_get_state(struct __sk_buff *skb)
if (ret < 0)
return TC_ACT_OK;
- bpf_printk("reqid %d spi 0x%x remote ip 0x%x\n",
- x.reqid, bpf_ntohl(x.spi),
- bpf_ntohl(x.remote_ipv4));
+ xfrm_reqid = x.reqid;
+ xfrm_spi = bpf_ntohl(x.spi);
+ xfrm_remote_ip = bpf_ntohl(x.remote_ipv4);
+
return TC_ACT_OK;
}
diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh
index 2dec7dbf29a2..d9661b9988ba 100755
--- a/tools/testing/selftests/bpf/test_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tunnel.sh
@@ -517,90 +517,6 @@ test_ip6ip6()
echo -e ${GREEN}"PASS: ip6$TYPE"${NC}
}
-setup_xfrm_tunnel()
-{
- auth=0x$(printf '1%.0s' {1..40})
- enc=0x$(printf '2%.0s' {1..32})
- spi_in_to_out=0x1
- spi_out_to_in=0x2
- # at_ns0 namespace
- # at_ns0 -> root
- ip netns exec at_ns0 \
- ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
- spi $spi_in_to_out reqid 1 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip netns exec at_ns0 \
- ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir out \
- tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
- mode tunnel
- # root -> at_ns0
- ip netns exec at_ns0 \
- ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
- spi $spi_out_to_in reqid 2 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip netns exec at_ns0 \
- ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir in \
- tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
- mode tunnel
- # address & route
- ip netns exec at_ns0 \
- ip addr add dev veth0 10.1.1.100/32
- ip netns exec at_ns0 \
- ip route add 10.1.1.200 dev veth0 via 172.16.1.200 \
- src 10.1.1.100
-
- # root namespace
- # at_ns0 -> root
- ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
- spi $spi_in_to_out reqid 1 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir in \
- tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
- mode tunnel
- # root -> at_ns0
- ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
- spi $spi_out_to_in reqid 2 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir out \
- tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
- mode tunnel
- # address & route
- ip addr add dev veth1 10.1.1.200/32
- ip route add 10.1.1.100 dev veth1 via 172.16.1.100 src 10.1.1.200
-}
-
-test_xfrm_tunnel()
-{
- if [[ -e /sys/kernel/tracing/trace ]]; then
- TRACE=/sys/kernel/tracing/trace
- else
- TRACE=/sys/kernel/debug/tracing/trace
- fi
- config_device
- > ${TRACE}
- setup_xfrm_tunnel
- mkdir -p ${BPF_PIN_TUNNEL_DIR}
- bpftool prog loadall ${BPF_FILE} ${BPF_PIN_TUNNEL_DIR}
- tc qdisc add dev veth1 clsact
- tc filter add dev veth1 proto ip ingress bpf da object-pinned \
- ${BPF_PIN_TUNNEL_DIR}/xfrm_get_state
- ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
- sleep 1
- grep "reqid 1" ${TRACE}
- check_err $?
- grep "spi 0x1" ${TRACE}
- check_err $?
- grep "remote ip 0xac100164" ${TRACE}
- check_err $?
- cleanup
-
- if [ $ret -ne 0 ]; then
- echo -e ${RED}"FAIL: xfrm tunnel"${NC}
- return 1
- fi
- echo -e ${GREEN}"PASS: xfrm tunnel"${NC}
-}
-
attach_bpf()
{
DEV=$1
@@ -630,10 +546,6 @@ cleanup()
ip link del ip6geneve11 2> /dev/null
ip link del erspan11 2> /dev/null
ip link del ip6erspan11 2> /dev/null
- ip xfrm policy delete dir out src 10.1.1.200/32 dst 10.1.1.100/32 2> /dev/null
- ip xfrm policy delete dir in src 10.1.1.100/32 dst 10.1.1.200/32 2> /dev/null
- ip xfrm state delete src 172.16.1.100 dst 172.16.1.200 proto esp spi 0x1 2> /dev/null
- ip xfrm state delete src 172.16.1.200 dst 172.16.1.100 proto esp spi 0x2 2> /dev/null
}
cleanup_exit()
@@ -716,10 +628,6 @@ bpf_tunnel_test()
test_ip6ip6
errors=$(( $errors + $? ))
- echo "Testing IPSec tunnel..."
- test_xfrm_tunnel
- errors=$(( $errors + $? ))
-
return $errors
}
--
2.42.1
This kfunc releases a previously acquired xfrm_state from
bpf_xdp_get_xfrm_state().
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
net/xfrm/xfrm_state_bpf.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c
index 1681825db506..1485b9da9425 100644
--- a/net/xfrm/xfrm_state_bpf.c
+++ b/net/xfrm/xfrm_state_bpf.c
@@ -94,10 +94,26 @@ bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32
return x;
}
+/* bpf_xdp_xfrm_state_release - Release acquired xfrm_state object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain in the program in all of the explored
+ * states.
+ *
+ * Parameters:
+ * @x - Pointer to referenced xfrm_state object, obtained using
+ * bpf_xdp_get_xfrm_state.
+ */
+__bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x)
+{
+ xfrm_state_put(x);
+}
+
__bpf_kfunc_end_defs();
BTF_SET8_START(xfrm_state_kfunc_set)
BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE)
BTF_SET8_END(xfrm_state_kfunc_set)
static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
--
2.42.1
Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
writing wrapper to make the verifier happy.
Two alternatives to this approach are:
1. Use the upcoming `preserve_static_offset` [0] attribute to disable
CO-RE on specific structs.
2. Use broader byte-sized writes to write to bitfields.
(1) is a bit a bit hard to use. It requires specific and
not-very-obvious annotations to bpftool generated vmlinux.h. It's also
not generally available in released LLVM versions yet.
(2) makes the code quite hard to read and write. And especially if
BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
to have an inverse helper for writing.
[0]: https://reviews.llvm.org/D133361
From: Eduard Zingerman <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
tools/lib/bpf/bpf_core_read.h | 36 +++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
index 1ac57bb7ac55..7a764f65d299 100644
--- a/tools/lib/bpf/bpf_core_read.h
+++ b/tools/lib/bpf/bpf_core_read.h
@@ -111,6 +111,42 @@ enum bpf_enum_value_kind {
val; \
})
+/*
+ * Write to a bitfield, identified by s->field.
+ * This is the inverse of BPF_CORE_WRITE_BITFIELD().
+ */
+#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \
+ void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \
+ unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
+ unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
+ unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
+ unsigned int bit_size = (rshift - lshift); \
+ unsigned long long nval, val, hi, lo; \
+ \
+ asm volatile("" : "+r"(p)); \
+ \
+ switch (byte_size) { \
+ case 1: val = *(unsigned char *)p; break; \
+ case 2: val = *(unsigned short *)p; break; \
+ case 4: val = *(unsigned int *)p; break; \
+ case 8: val = *(unsigned long long *)p; break; \
+ } \
+ hi = val >> (bit_size + rshift); \
+ hi <<= bit_size + rshift; \
+ lo = val << (bit_size + lshift); \
+ lo >>= bit_size + lshift; \
+ nval = new_val; \
+ nval <<= lshift; \
+ nval >>= rshift; \
+ val = hi | nval | lo; \
+ switch (byte_size) { \
+ case 1: *(unsigned char *)p = val; break; \
+ case 2: *(unsigned short *)p = val; break; \
+ case 4: *(unsigned int *)p = val; break; \
+ case 8: *(unsigned long long *)p = val; break; \
+ } \
+})
+
#define ___bpf_field_ref1(field) (field)
#define ___bpf_field_ref2(type, field) (((typeof(type) *)0)->field)
#define ___bpf_field_ref(args...) \
--
2.42.1
vmlinux.h declarations are more ergnomic, especially when working with
kfuncs. The uapi headers are often incomplete for kfunc definitions.
This commit also switches bitfield accesses to use CO-RE helpers.
Switching to vmlinux.h definitions makes the verifier very
unhappy with raw bitfield accesses. The error is:
; md.u.md2.dir = direction;
33: (69) r1 = *(u16 *)(r2 +11)
misaligned stack access off (0x0; 0x0)+-64+11 size 2
Fix by using CO-RE-aware bitfield reads and writes.
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/progs/bpf_tracing_net.h | 1 +
.../selftests/bpf/progs/test_tunnel_kern.c | 76 +++++--------------
2 files changed, 22 insertions(+), 55 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index 0b793a102791..1bdc680b0e0e 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -26,6 +26,7 @@
#define IPV6_AUTOFLOWLABEL 70
#define TC_ACT_UNSPEC (-1)
+#define TC_ACT_OK 0
#define TC_ACT_SHOT 2
#define SOL_TCP 6
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index f66af753bbbb..b320fb7bb080 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -6,62 +6,26 @@
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
-#include <stddef.h>
-#include <string.h>
-#include <arpa/inet.h>
-#include <linux/bpf.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/if_tunnel.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/icmp.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/pkt_cls.h>
-#include <linux/erspan.h>
-#include <linux/udp.h>
+#include "vmlinux.h"
+#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
+#include "bpf_kfuncs.h"
+#include "bpf_tracing_net.h"
#define log_err(__ret) bpf_printk("ERROR line:%d ret:%d\n", __LINE__, __ret)
-#define VXLAN_UDP_PORT 4789
+#define VXLAN_UDP_PORT 4789
+#define ETH_P_IP 0x0800
+#define PACKET_HOST 0
+#define TUNNEL_CSUM bpf_htons(0x01)
+#define TUNNEL_KEY bpf_htons(0x04)
/* Only IPv4 address assigned to veth1.
* 172.16.1.200
*/
#define ASSIGNED_ADDR_VETH1 0xac1001c8
-struct geneve_opt {
- __be16 opt_class;
- __u8 type;
- __u8 length:5;
- __u8 r3:1;
- __u8 r2:1;
- __u8 r1:1;
- __u8 opt_data[8]; /* hard-coded to 8 byte */
-};
-
-struct vxlanhdr {
- __be32 vx_flags;
- __be32 vx_vni;
-} __attribute__((packed));
-
-struct vxlan_metadata {
- __u32 gbp;
-};
-
-struct bpf_fou_encap {
- __be16 sport;
- __be16 dport;
-};
-
-enum bpf_fou_encap_type {
- FOU_BPF_ENCAP_FOU,
- FOU_BPF_ENCAP_GUE,
-};
-
int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap, int type) __ksym;
int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
@@ -205,9 +169,9 @@ int erspan_set_tunnel(struct __sk_buff *skb)
__u8 hwid = 7;
md.version = 2;
- md.u.md2.dir = direction;
- md.u.md2.hwid = hwid & 0xf;
- md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, dir, direction);
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid, (hwid & 0xf));
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid_upper, (hwid >> 4) & 0x3);
#endif
ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
@@ -246,8 +210,9 @@ int erspan_get_tunnel(struct __sk_buff *skb)
bpf_printk("\tindex %x\n", index);
#else
bpf_printk("\tdirection %d hwid %x timestamp %u\n",
- md.u.md2.dir,
- (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+ BPF_CORE_READ_BITFIELD(&md.u.md2, dir),
+ (BPF_CORE_READ_BITFIELD(&md.u.md2, hwid_upper) << 4) +
+ BPF_CORE_READ_BITFIELD(&md.u.md2, hwid),
bpf_ntohl(md.u.md2.timestamp));
#endif
@@ -284,9 +249,9 @@ int ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
__u8 hwid = 17;
md.version = 2;
- md.u.md2.dir = direction;
- md.u.md2.hwid = hwid & 0xf;
- md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, dir, direction);
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid, (hwid & 0xf));
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid_upper, (hwid >> 4) & 0x3);
#endif
ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
@@ -326,8 +291,9 @@ int ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
bpf_printk("\tindex %x\n", index);
#else
bpf_printk("\tdirection %d hwid %x timestamp %u\n",
- md.u.md2.dir,
- (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+ BPF_CORE_READ_BITFIELD(&md.u.md2, dir),
+ (BPF_CORE_READ_BITFIELD(&md.u.md2, hwid_upper) << 4) +
+ BPF_CORE_READ_BITFIELD(&md.u.md2, hwid),
bpf_ntohl(md.u.md2.timestamp));
#endif
--
2.42.1
This commit extends test_tunnel selftest to test the new XDP xfrm state
lookup kfunc.
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/prog_tests/test_tunnel.c | 20 ++++++--
.../selftests/bpf/progs/test_tunnel_kern.c | 51 +++++++++++++++++++
2 files changed, 67 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
index 3bcb6f96b9b5..54308afb3cdc 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
@@ -278,7 +278,7 @@ static int add_xfrm_tunnel(void)
SYS(fail,
"ip netns exec at_ns0 "
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 1 mode tunnel "
+ "spi %d reqid 1 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -292,7 +292,7 @@ static int add_xfrm_tunnel(void)
SYS(fail,
"ip netns exec at_ns0 "
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 2 mode tunnel "
+ "spi %d reqid 2 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -313,7 +313,7 @@ static int add_xfrm_tunnel(void)
*/
SYS(fail,
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 1 mode tunnel "
+ "spi %d reqid 1 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -325,7 +325,7 @@ static int add_xfrm_tunnel(void)
/* root -> at_ns0 */
SYS(fail,
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 2 mode tunnel "
+ "spi %d reqid 2 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -628,8 +628,10 @@ static void test_xfrm_tunnel(void)
{
DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook,
.attach_point = BPF_TC_INGRESS);
+ LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
struct test_tunnel_kern *skel = NULL;
struct nstoken *nstoken;
+ int xdp_prog_fd;
int tc_prog_fd;
int ifindex;
int err;
@@ -654,6 +656,14 @@ static void test_xfrm_tunnel(void)
if (attach_tc_prog(&tc_hook, tc_prog_fd, -1))
goto done;
+ /* attach xdp prog to tunnel dev */
+ xdp_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state_xdp);
+ if (!ASSERT_GE(xdp_prog_fd, 0, "bpf_program__fd"))
+ goto done;
+ err = bpf_xdp_attach(ifindex, xdp_prog_fd, XDP_FLAGS_REPLACE, &opts);
+ if (!ASSERT_OK(err, "bpf_xdp_attach"))
+ goto done;
+
/* ping from at_ns0 namespace test */
nstoken = open_netns("at_ns0");
err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
@@ -667,6 +677,8 @@ static void test_xfrm_tunnel(void)
goto done;
if (!ASSERT_EQ(skel->bss->xfrm_remote_ip, 0xac100164, "remote_ip"))
goto done;
+ if (!ASSERT_EQ(skel->bss->xfrm_replay_window, 42, "replay_window"))
+ goto done;
done:
delete_xfrm_tunnel();
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index 3a59eb9c34de..c0dd38616562 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -30,6 +30,10 @@ int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap, int type) __ksym;
int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap) __ksym;
+struct xfrm_state *
+bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts,
+ u32 opts__sz) __ksym;
+void bpf_xdp_xfrm_state_release(struct xfrm_state *x) __ksym;
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
@@ -950,4 +954,51 @@ int xfrm_get_state(struct __sk_buff *skb)
return TC_ACT_OK;
}
+volatile int xfrm_replay_window = 0;
+
+SEC("xdp")
+int xfrm_get_state_xdp(struct xdp_md *xdp)
+{
+ struct bpf_xfrm_state_opts opts = {};
+ struct xfrm_state *x = NULL;
+ struct ip_esp_hdr *esph;
+ struct bpf_dynptr ptr;
+ u8 esph_buf[8] = {};
+ u8 iph_buf[20] = {};
+ struct iphdr *iph;
+ u32 off;
+
+ if (bpf_dynptr_from_xdp(xdp, 0, &ptr))
+ goto out;
+
+ off = sizeof(struct ethhdr);
+ iph = bpf_dynptr_slice(&ptr, off, iph_buf, sizeof(iph_buf));
+ if (!iph || iph->protocol != IPPROTO_ESP)
+ goto out;
+
+ off += sizeof(struct iphdr);
+ esph = bpf_dynptr_slice(&ptr, off, esph_buf, sizeof(esph_buf));
+ if (!esph)
+ goto out;
+
+ opts.netns_id = BPF_F_CURRENT_NETNS;
+ opts.daddr.a4 = iph->daddr;
+ opts.spi = esph->spi;
+ opts.proto = IPPROTO_ESP;
+ opts.family = AF_INET;
+
+ x = bpf_xdp_get_xfrm_state(xdp, &opts, sizeof(opts));
+ if (!x || opts.error)
+ goto out;
+
+ if (!x->replay_esn)
+ goto out;
+
+ xfrm_replay_window = x->replay_esn->replay_window;
+out:
+ if (x)
+ bpf_xdp_xfrm_state_release(x);
+ return XDP_PASS;
+}
+
char _license[] SEC("license") = "GPL";
--
2.42.1
On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> writing wrapper to make the verifier happy.
>
> Two alternatives to this approach are:
>
> 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> CO-RE on specific structs.
> 2. Use broader byte-sized writes to write to bitfields.
>
> (1) is a bit a bit hard to use. It requires specific and
> not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> not generally available in released LLVM versions yet.
>
> (2) makes the code quite hard to read and write. And especially if
> BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> to have an inverse helper for writing.
>
> [0]: https://reviews.llvm.org/D133361
> From: Eduard Zingerman <[email protected]>
>
> Signed-off-by: Daniel Xu <[email protected]>
> ---
Could you please also add a selftest (or several) using __retval()
annotation for this macro?
On Tue, Nov 28, 2023 at 07:59:01PM +0200, Eduard Zingerman wrote:
> On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > writing wrapper to make the verifier happy.
> >
> > Two alternatives to this approach are:
> >
> > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > CO-RE on specific structs.
> > 2. Use broader byte-sized writes to write to bitfields.
> >
> > (1) is a bit a bit hard to use. It requires specific and
> > not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> > not generally available in released LLVM versions yet.
> >
> > (2) makes the code quite hard to read and write. And especially if
> > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > to have an inverse helper for writing.
> >
> > [0]: https://reviews.llvm.org/D133361
> > From: Eduard Zingerman <[email protected]>
> >
> > Signed-off-by: Daniel Xu <[email protected]>
> > ---
>
> Could you please also add a selftest (or several) using __retval()
> annotation for this macro?
Sure, I'll take a look.
Thanks,
Daniel
On Tue, Nov 28, 2023 at 07:59:01PM +0200, Eduard Zingerman wrote:
> On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > writing wrapper to make the verifier happy.
> >
> > Two alternatives to this approach are:
> >
> > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > CO-RE on specific structs.
> > 2. Use broader byte-sized writes to write to bitfields.
> >
> > (1) is a bit a bit hard to use. It requires specific and
> > not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> > not generally available in released LLVM versions yet.
> >
> > (2) makes the code quite hard to read and write. And especially if
> > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > to have an inverse helper for writing.
> >
> > [0]: https://reviews.llvm.org/D133361
> > From: Eduard Zingerman <[email protected]>
> >
> > Signed-off-by: Daniel Xu <[email protected]>
> > ---
>
> Could you please also add a selftest (or several) using __retval()
> annotation for this macro?
Good call about adding tests -- I found a few bugs with the code from
the other thread. But boy did they take a lot of brain cells to figure
out.
There was some 6th grade algebra involved too -- I'll do my best to
explain it in the commit msg for v3.
Here are the fixes in case you are curious:
diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
index 7a764f65d299..8f02c558c0ff 100644
--- a/tools/lib/bpf/bpf_core_read.h
+++ b/tools/lib/bpf/bpf_core_read.h
@@ -120,7 +120,9 @@ enum bpf_enum_value_kind {
unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
- unsigned int bit_size = (rshift - lshift); \
+ unsigned int bit_size = (64 - rshift); \
+ unsigned int hi_size = lshift; \
+ unsigned int lo_size = (rshift - lshift); \
unsigned long long nval, val, hi, lo; \
\
asm volatile("" : "+r"(p)); \
@@ -131,13 +133,13 @@ enum bpf_enum_value_kind {
case 4: val = *(unsigned int *)p; break; \
case 8: val = *(unsigned long long *)p; break; \
} \
- hi = val >> (bit_size + rshift); \
- hi <<= bit_size + rshift; \
- lo = val << (bit_size + lshift); \
- lo >>= bit_size + lshift; \
+ hi = val >> (64 - hi_size); \
+ hi <<= 64 - hi_size; \
+ lo = val << (64 - lo_size); \
+ lo >>= 64 - lo_size; \
nval = new_val; \
- nval <<= lshift; \
- nval >>= rshift; \
+ nval <<= (64 - bit_size); \
+ nval >>= (64 - bit_size - lo_size); \
val = hi | nval | lo; \
switch (byte_size) { \
case 1: *(unsigned char *)p = val; break; \
Thanks,
Daniel
On Thu, 2023-11-30 at 18:33 -0700, Daniel Xu wrote:
[...]
> Good call about adding tests -- I found a few bugs with the code from
> the other thread. But boy did they take a lot of brain cells to figure
> out.
>
> There was some 6th grade algebra involved too -- I'll do my best to
> explain it in the commit msg for v3.
>
> Here are the fixes in case you are curious:
Ouch, I knew my code from 3am can't be trusted, sorry for that.
Your math seem to make sense, thank you.
[...]
On Fri, Dec 1, 2023 at 11:11 AM Andrii Nakryiko
<[email protected]> wrote:
>
> On Thu, Nov 30, 2023 at 5:33 PM Daniel Xu <[email protected]> wrote:
> >
> > On Tue, Nov 28, 2023 at 07:59:01PM +0200, Eduard Zingerman wrote:
> > > On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> > > > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > > > writing wrapper to make the verifier happy.
> > > >
> > > > Two alternatives to this approach are:
> > > >
> > > > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > > > CO-RE on specific structs.
> > > > 2. Use broader byte-sized writes to write to bitfields.
> > > >
> > > > (1) is a bit a bit hard to use. It requires specific and
> > > > not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> > > > not generally available in released LLVM versions yet.
> > > >
> > > > (2) makes the code quite hard to read and write. And especially if
> > > > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > > > to have an inverse helper for writing.
> > > >
> > > > [0]: https://reviews.llvm.org/D133361
> > > > From: Eduard Zingerman <[email protected]>
> > > >
> > > > Signed-off-by: Daniel Xu <[email protected]>
> > > > ---
> > >
> > > Could you please also add a selftest (or several) using __retval()
> > > annotation for this macro?
> >
> > Good call about adding tests -- I found a few bugs with the code from
> > the other thread. But boy did they take a lot of brain cells to figure
> > out.
> >
> > There was some 6th grade algebra involved too -- I'll do my best to
> > explain it in the commit msg for v3.
> >
> >
> > Here are the fixes in case you are curious:
> >
> > diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
> > index 7a764f65d299..8f02c558c0ff 100644
> > --- a/tools/lib/bpf/bpf_core_read.h
> > +++ b/tools/lib/bpf/bpf_core_read.h
> > @@ -120,7 +120,9 @@ enum bpf_enum_value_kind {
> > unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
> > unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
> > unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
> > - unsigned int bit_size = (rshift - lshift); \
> > + unsigned int bit_size = (64 - rshift); \
> > + unsigned int hi_size = lshift; \
> > + unsigned int lo_size = (rshift - lshift); \
>
> nit: let's drop unnecessary ()
>
> > unsigned long long nval, val, hi, lo; \
> > \
> > asm volatile("" : "+r"(p)); \
> > @@ -131,13 +133,13 @@ enum bpf_enum_value_kind {
> > case 4: val = *(unsigned int *)p; break; \
> > case 8: val = *(unsigned long long *)p; break; \
> > } \
> > - hi = val >> (bit_size + rshift); \
> > - hi <<= bit_size + rshift; \
> > - lo = val << (bit_size + lshift); \
> > - lo >>= bit_size + lshift; \
> > + hi = val >> (64 - hi_size); \
> > + hi <<= 64 - hi_size; \
> > + lo = val << (64 - lo_size); \
> > + lo >>= 64 - lo_size; \
> > nval = new_val; \
> > - nval <<= lshift; \
> > - nval >>= rshift; \
> > + nval <<= (64 - bit_size); \
> > + nval >>= (64 - bit_size - lo_size); \
> > val = hi | nval | lo; \
>
> this looks.. unusual. I'd imagine we calculate a mask, mask out bits
> we are replacing, and then OR with new values, roughly (assuming all
> the right left/right shift values and stuff)
>
> /* clear bits */
> val &= ~(bitfield_mask << shift);
we can also calculate shifted mask with just
bitfield_mask = (-1ULL) << some_left_shift >> some_right_shift;
val &= ~bitfield_mask;
> /* set bits */
> val |= (nval & bitfield_mask) << shift;
>
> ?
>
> > switch (byte_size) { \
> > case 1: *(unsigned char *)p = val; break; \
> >
> >
> > Thanks,
> > Daniel
On Thu, Nov 30, 2023 at 5:33 PM Daniel Xu <[email protected]> wrote:
>
> On Tue, Nov 28, 2023 at 07:59:01PM +0200, Eduard Zingerman wrote:
> > On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> > > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > > writing wrapper to make the verifier happy.
> > >
> > > Two alternatives to this approach are:
> > >
> > > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > > CO-RE on specific structs.
> > > 2. Use broader byte-sized writes to write to bitfields.
> > >
> > > (1) is a bit a bit hard to use. It requires specific and
> > > not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> > > not generally available in released LLVM versions yet.
> > >
> > > (2) makes the code quite hard to read and write. And especially if
> > > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > > to have an inverse helper for writing.
> > >
> > > [0]: https://reviews.llvm.org/D133361
> > > From: Eduard Zingerman <[email protected]>
> > >
> > > Signed-off-by: Daniel Xu <[email protected]>
> > > ---
> >
> > Could you please also add a selftest (or several) using __retval()
> > annotation for this macro?
>
> Good call about adding tests -- I found a few bugs with the code from
> the other thread. But boy did they take a lot of brain cells to figure
> out.
>
> There was some 6th grade algebra involved too -- I'll do my best to
> explain it in the commit msg for v3.
>
>
> Here are the fixes in case you are curious:
>
> diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
> index 7a764f65d299..8f02c558c0ff 100644
> --- a/tools/lib/bpf/bpf_core_read.h
> +++ b/tools/lib/bpf/bpf_core_read.h
> @@ -120,7 +120,9 @@ enum bpf_enum_value_kind {
> unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
> unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
> unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
> - unsigned int bit_size = (rshift - lshift); \
> + unsigned int bit_size = (64 - rshift); \
> + unsigned int hi_size = lshift; \
> + unsigned int lo_size = (rshift - lshift); \
nit: let's drop unnecessary ()
> unsigned long long nval, val, hi, lo; \
> \
> asm volatile("" : "+r"(p)); \
> @@ -131,13 +133,13 @@ enum bpf_enum_value_kind {
> case 4: val = *(unsigned int *)p; break; \
> case 8: val = *(unsigned long long *)p; break; \
> } \
> - hi = val >> (bit_size + rshift); \
> - hi <<= bit_size + rshift; \
> - lo = val << (bit_size + lshift); \
> - lo >>= bit_size + lshift; \
> + hi = val >> (64 - hi_size); \
> + hi <<= 64 - hi_size; \
> + lo = val << (64 - lo_size); \
> + lo >>= 64 - lo_size; \
> nval = new_val; \
> - nval <<= lshift; \
> - nval >>= rshift; \
> + nval <<= (64 - bit_size); \
> + nval >>= (64 - bit_size - lo_size); \
> val = hi | nval | lo; \
this looks.. unusual. I'd imagine we calculate a mask, mask out bits
we are replacing, and then OR with new values, roughly (assuming all
the right left/right shift values and stuff)
/* clear bits */
val &= ~(bitfield_mask << shift);
/* set bits */
val |= (nval & bitfield_mask) << shift;
?
> switch (byte_size) { \
> case 1: *(unsigned char *)p = val; break; \
>
>
> Thanks,
> Daniel
On Fri, Dec 01, 2023 at 11:13:13AM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 1, 2023 at 11:11 AM Andrii Nakryiko
> <[email protected]> wrote:
> >
> > On Thu, Nov 30, 2023 at 5:33 PM Daniel Xu <[email protected]> wrote:
> > >
> > > On Tue, Nov 28, 2023 at 07:59:01PM +0200, Eduard Zingerman wrote:
> > > > On Tue, 2023-11-28 at 10:54 -0700, Daniel Xu wrote:
> > > > > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > > > > writing wrapper to make the verifier happy.
> > > > >
> > > > > Two alternatives to this approach are:
> > > > >
> > > > > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > > > > CO-RE on specific structs.
> > > > > 2. Use broader byte-sized writes to write to bitfields.
> > > > >
> > > > > (1) is a bit a bit hard to use. It requires specific and
> > > > > not-very-obvious annotations to bpftool generated vmlinux.h. It's also
> > > > > not generally available in released LLVM versions yet.
> > > > >
> > > > > (2) makes the code quite hard to read and write. And especially if
> > > > > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > > > > to have an inverse helper for writing.
> > > > >
> > > > > [0]: https://reviews.llvm.org/D133361
> > > > > From: Eduard Zingerman <[email protected]>
> > > > >
> > > > > Signed-off-by: Daniel Xu <[email protected]>
> > > > > ---
> > > >
> > > > Could you please also add a selftest (or several) using __retval()
> > > > annotation for this macro?
> > >
> > > Good call about adding tests -- I found a few bugs with the code from
> > > the other thread. But boy did they take a lot of brain cells to figure
> > > out.
> > >
> > > There was some 6th grade algebra involved too -- I'll do my best to
> > > explain it in the commit msg for v3.
> > >
> > >
> > > Here are the fixes in case you are curious:
> > >
> > > diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
> > > index 7a764f65d299..8f02c558c0ff 100644
> > > --- a/tools/lib/bpf/bpf_core_read.h
> > > +++ b/tools/lib/bpf/bpf_core_read.h
> > > @@ -120,7 +120,9 @@ enum bpf_enum_value_kind {
> > > unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
> > > unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
> > > unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
> > > - unsigned int bit_size = (rshift - lshift); \
> > > + unsigned int bit_size = (64 - rshift); \
> > > + unsigned int hi_size = lshift; \
> > > + unsigned int lo_size = (rshift - lshift); \
> >
> > nit: let's drop unnecessary ()
> >
> > > unsigned long long nval, val, hi, lo; \
> > > \
> > > asm volatile("" : "+r"(p)); \
> > > @@ -131,13 +133,13 @@ enum bpf_enum_value_kind {
> > > case 4: val = *(unsigned int *)p; break; \
> > > case 8: val = *(unsigned long long *)p; break; \
> > > } \
> > > - hi = val >> (bit_size + rshift); \
> > > - hi <<= bit_size + rshift; \
> > > - lo = val << (bit_size + lshift); \
> > > - lo >>= bit_size + lshift; \
> > > + hi = val >> (64 - hi_size); \
> > > + hi <<= 64 - hi_size; \
> > > + lo = val << (64 - lo_size); \
> > > + lo >>= 64 - lo_size; \
> > > nval = new_val; \
> > > - nval <<= lshift; \
> > > - nval >>= rshift; \
> > > + nval <<= (64 - bit_size); \
> > > + nval >>= (64 - bit_size - lo_size); \
> > > val = hi | nval | lo; \
> >
> > this looks.. unusual. I'd imagine we calculate a mask, mask out bits
> > we are replacing, and then OR with new values, roughly (assuming all
> > the right left/right shift values and stuff)
> >
> > /* clear bits */
> > val &= ~(bitfield_mask << shift);
>
> we can also calculate shifted mask with just
>
> bitfield_mask = (-1ULL) << some_left_shift >> some_right_shift;
> val &= ~bitfield_mask;
Yeah I was chatting w/ JonathanL about this and I've got basically that
code ready to send for v3.
Thanks,
Daniel