This patchset adds two kfunc helpers, bpf_xdp_get_xfrm_state() and
bpf_xdp_xfrm_state_release(), that wrap xfrm_state_lookup() and
xfrm_state_put(). The intent is to support software RSS (via XDP) for
the ongoing/upcoming ipsec pcpu work [0]. Recent experiments performed
on (hopefully) reproducible AWS testbeds indicate that single-tunnel
pcpu ipsec can reach line rate on 100G ENA NICs.
Note this patchset only tests/shows generic xfrm_state access. The
"secret sauce" (if you can really even call it that) involves accessing
a soon-to-be-upstreamed pcpu_num field in xfrm_state. An early example
is available here [1].
[0]: https://datatracker.ietf.org/doc/draft-ietf-ipsecme-multi-sa-performance/03/
[1]: https://github.com/danobi/xdp-tools/blob/e89a1c617aba3b50d990f779357d6ce2863ecb27/xdp-bench/xdp_redirect_cpumap.bpf.c#L385-L406
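To sketch the intended data path, here is a rough illustration (a
hedged sketch only: pcpu_num is the not-yet-upstreamed field, cpu_map
is assumed to be a BPF_MAP_TYPE_CPUMAP populated by userspace, and the
packet parsing that fills opts is omitted; see [1] for a complete
example):

    x = bpf_xdp_get_xfrm_state(ctx, &opts, sizeof(opts));
    if (!x)
        return XDP_PASS;
    cpu = x->pcpu_num; /* hypothetical: CPU this pcpu SA is bound to */
    bpf_xdp_xfrm_state_release(x);
    return bpf_redirect_map(&cpu_map, cpu, 0);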
Changes from v2:
* Fix/simplify BPF_CORE_WRITE_BITFIELD() algorithm
* Added verifier tests for bitfield writes
* Fix state leakage across test_tunnel subtests
Changes from v1:
* Move xfrm tunnel tests to test_progs
* Fix writing to opts->error when opts is invalid
* Use __bpf_kfunc_start_defs()
* Remove unused vxlanhdr definition
* Add and use BPF_CORE_WRITE_BITFIELD() macro
* Make series bisect clean
Changes from RFCv2:
* Rebased to ipsec-next
* Fix netns leak
Changes from RFCv1:
* Add Antony's commit tags
* Add KF_ACQUIRE and KF_RELEASE semantics
Daniel Xu (9):
bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc
bpf: xfrm: Add bpf_xdp_xfrm_state_release() kfunc
libbpf: Add BPF_CORE_WRITE_BITFIELD() macro
bpf: selftests: test_loader: Support __btf_path() annotation
libbpf: selftests: Add verifier tests for CO-RE bitfield writes
bpf: selftests: test_tunnel: Setup fresh topology for each subtest
bpf: selftests: test_tunnel: Use vmlinux.h declarations
bpf: selftests: Move xfrm tunnel test to test_progs
bpf: xfrm: Add selftest for bpf_xdp_get_xfrm_state()
include/net/xfrm.h | 9 +
net/xfrm/Makefile | 1 +
net/xfrm/xfrm_policy.c | 2 +
net/xfrm/xfrm_state_bpf.c | 128 ++++++++++++++
tools/lib/bpf/bpf_core_read.h | 34 ++++
.../selftests/bpf/prog_tests/test_tunnel.c | 162 +++++++++++++++++-
.../selftests/bpf/prog_tests/verifier.c | 2 +
tools/testing/selftests/bpf/progs/bpf_misc.h | 1 +
.../selftests/bpf/progs/bpf_tracing_net.h | 1 +
.../selftests/bpf/progs/test_tunnel_kern.c | 138 ++++++++-------
.../bpf/progs/verifier_bitfield_write.c | 100 +++++++++++
tools/testing/selftests/bpf/test_loader.c | 7 +
tools/testing/selftests/bpf/test_tunnel.sh | 92 ----------
13 files changed, 522 insertions(+), 155 deletions(-)
create mode 100644 net/xfrm/xfrm_state_bpf.c
create mode 100644 tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
--
2.42.1
This commit adds an unstable kfunc helper to access the internal xfrm_state
associated with an SA. This is intended to be used for the upcoming
IPsec pcpu work to assign special pcpu SAs to a particular CPU. In other
words: for custom software RSS.
That being said, the function that this kfunc wraps is fairly generic
and used for a lot of xfrm tasks. I'm sure people will find uses
elsewhere over time.
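For illustration, a minimal sketch of the call pattern from an XDP
program (the selftest added later in this series is the complete
example; the header parsing that fills daddr/spi is elided here):

    struct bpf_xfrm_state_opts opts = {
        .netns_id = BPF_F_CURRENT_NETNS,
        .daddr.a4 = daddr, /* destination address from the IP header */
        .spi = spi,        /* SPI from the ESP header */
        .proto = IPPROTO_ESP,
        .family = AF_INET,
    };
    struct xfrm_state *x;

    x = bpf_xdp_get_xfrm_state(ctx, &opts, sizeof(opts));
    if (!x)
        return XDP_PASS; /* lookup miss, or an error reported in opts.error */
    /* ... inspect x ... */
    bpf_xdp_xfrm_state_release(x);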
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
include/net/xfrm.h | 9 +++
net/xfrm/Makefile | 1 +
net/xfrm/xfrm_policy.c | 2 +
net/xfrm/xfrm_state_bpf.c | 112 ++++++++++++++++++++++++++++++++++++++
4 files changed, 124 insertions(+)
create mode 100644 net/xfrm/xfrm_state_bpf.c
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index c9bb0f892f55..1d107241b901 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void)
#endif
+#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF)
+int register_xfrm_state_bpf(void);
+#else
+static inline int register_xfrm_state_bpf(void)
+{
+ return 0;
+}
+#endif
+
#endif /* _NET_XFRM_H */
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index cd47f88921f5..547cec77ba03 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
+obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c13dc3ef7910..1b7e75159727 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -4218,6 +4218,8 @@ void __init xfrm_init(void)
#ifdef CONFIG_XFRM_ESPINTCP
espintcp_init();
#endif
+
+ register_xfrm_state_bpf();
}
#ifdef CONFIG_AUDITSYSCALL
diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c
new file mode 100644
index 000000000000..1681825db506
--- /dev/null
+++ b/net/xfrm/xfrm_state_bpf.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable XFRM state BPF helpers.
+ *
+ * Note that it is allowed to break compatibility for these functions since the
+ * interface they are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <net/xdp.h>
+#include <net/xfrm.h>
+
+/* bpf_xfrm_state_opts - Options for XFRM state lookup helpers
+ *
+ * Members:
+ * @error - Out parameter, set for any errors encountered
+ * Values:
+ * -EINVAL - netns_id is less than -1
+ * -EINVAL - opts__sz isn't BPF_XFRM_STATE_OPTS_SZ
+ * -ENONET - No network namespace found for netns_id
+ * @netns_id - Specify the network namespace for lookup
+ * Values:
+ * BPF_F_CURRENT_NETNS (-1)
+ * Use namespace associated with ctx
+ * [0, S32_MAX]
+ * Network Namespace ID
+ * @mark - XFRM mark to match on
+ * @daddr - Destination address to match on
+ * @spi - Security parameter index to match on
+ * @proto - IP protocol to match on (e.g. IPPROTO_ESP)
+ * @family - L3 protocol family to match on
+ */
+struct bpf_xfrm_state_opts {
+ s32 error;
+ s32 netns_id;
+ u32 mark;
+ xfrm_address_t daddr;
+ __be32 spi;
+ u8 proto;
+ u16 family;
+};
+
+enum {
+ BPF_XFRM_STATE_OPTS_SZ = sizeof(struct bpf_xfrm_state_opts),
+};
+
+__bpf_kfunc_start_defs();
+
+/* bpf_xdp_get_xfrm_state - Get XFRM state
+ *
+ * Parameters:
+ * @ctx - Pointer to ctx (xdp_md) in XDP program
+ * Cannot be NULL
+ * @opts - Options for lookup (documented above)
+ * Cannot be NULL
+ * @opts__sz - Length of the bpf_xfrm_state_opts structure
+ * Must be BPF_XFRM_STATE_OPTS_SZ
+ */
+__bpf_kfunc struct xfrm_state *
+bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 opts__sz)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+ struct net *net = dev_net(xdp->rxq->dev);
+ struct xfrm_state *x;
+
+ if (!opts || opts__sz < sizeof(opts->error))
+ return NULL;
+
+ if (opts__sz != BPF_XFRM_STATE_OPTS_SZ) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+
+ if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) {
+ opts->error = -EINVAL;
+ return NULL;
+ }
+
+ if (opts->netns_id >= 0) {
+ net = get_net_ns_by_id(net, opts->netns_id);
+ if (unlikely(!net)) {
+ opts->error = -ENONET;
+ return NULL;
+ }
+ }
+
+ x = xfrm_state_lookup(net, opts->mark, &opts->daddr, opts->spi,
+ opts->proto, opts->family);
+
+ if (opts->netns_id >= 0)
+ put_net(net);
+
+ return x;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(xfrm_state_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
+BTF_SET8_END(xfrm_state_kfunc_set)
+
+static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &xfrm_state_kfunc_set,
+};
+
+int __init register_xfrm_state_bpf(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+ &xfrm_state_xdp_kfunc_set);
+}
--
2.42.1
This kfunc releases an xfrm_state previously acquired via
bpf_xdp_get_xfrm_state().
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
net/xfrm/xfrm_state_bpf.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c
index 1681825db506..1485b9da9425 100644
--- a/net/xfrm/xfrm_state_bpf.c
+++ b/net/xfrm/xfrm_state_bpf.c
@@ -94,10 +94,26 @@ bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32
return x;
}
+/* bpf_xdp_xfrm_state_release - Release acquired xfrm_state object
+ *
+ * This must be invoked for referenced PTR_TO_BTF_ID, and the verifier rejects
+ * the program if any references remain at program exit in any of the explored
+ * states.
+ *
+ * Parameters:
+ * @x - Pointer to referenced xfrm_state object, obtained using
+ * bpf_xdp_get_xfrm_state.
+ */
+__bpf_kfunc void bpf_xdp_xfrm_state_release(struct xfrm_state *x)
+{
+ xfrm_state_put(x);
+}
+
__bpf_kfunc_end_defs();
BTF_SET8_START(xfrm_state_kfunc_set)
BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_xdp_xfrm_state_release, KF_RELEASE)
BTF_SET8_END(xfrm_state_kfunc_set)
static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
--
2.42.1
=== Motivation ===
Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
writing wrapper to make the verifier happy.
Two alternatives to this approach are:
1. Use the upcoming `preserve_static_offset` [0] attribute to disable
CO-RE on specific structs.
2. Use broader byte-sized writes to write to bitfields.
(1) is a bit hard to use. It requires specific and not-very-obvious
annotations to bpftool-generated vmlinux.h. It's also not generally
available in released LLVM versions yet.
(2) makes the code quite hard to read and write. And especially if
BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
have an inverse helper for writing.
=== Implementation details ===
Since the logic is a bit non-obvious, I thought it would be helpful
to explain exactly what's going on.
To start, it helps to explain what LSHIFT_U64 (lshift) and RSHIFT_U64
(rshift) are designed to mean. Consider the core of the
BPF_CORE_READ_BITFIELD() algorithm:
val <<= __CORE_RELO(s, field, LSHIFT_U64);
val = val >> __CORE_RELO(s, field, RSHIFT_U64);
Basically what happens is we lshift to clear the non-relevant (blank)
higher order bits. Then we rshift to bring the relevant bits (bitfield)
down to LSB position (while also clearing blank lower order bits). To
illustrate:
Start: ........XXX......
Lshift: XXX......00000000
Rshift: 00000000000000XXX
where `.` means blank bit, `0` means 0 bit, and `X` means bitfield bit.
After the two operations, the bitfield is ready to be interpreted as a
regular integer.
Next, we want to build an alternative (but more helpful) mental model
of lshift and rshift. That is, to consider:
* rshift as the total number of blank bits in the u64
* lshift as the number of blank bits to the left of the bitfield in the u64
Take a moment to consider why that is true by consulting the above
diagram.
With this insight, we can now define the following relationship:
bitfield
_
| |
0.....00XXX0...00
| | | |
|______| | |
lshift | |
|____|
(rshift - lshift)
That is, we know the number of higher order blank bits is just lshift.
And the number of lower order blank bits is (rshift - lshift).
Finally, we can examine the core of the write side algorithm:
mask = (~0ULL << rshift) >> lshift; // 1
nval = new_val; // 2
nval = (nval << rpad) & mask; // 3
val = (val & ~mask) | nval; // 4
(1): Compute a mask where the set bits are the bitfield bits. The first
left shift zeros out exactly the number of blank bits, leaving a
bitfield sized set of 1s. The subsequent right shift inserts the
correct amount of higher order blank bits.
(2): Place the new value into a word sized container, nval.
(3): Place nval at the correct bit position and mask out blank bits.
(4): Mix the bitfield in with original surrounding blank bits.
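To make the shifts concrete, here is a worked example. Assume a
little-endian target and this illustrative layout (the constants are
emitted by the compiler via CO-RE relocations, so the exact values
depend on the target):

    struct S { unsigned char ub1:1, ub2:2; }; /* ub2 occupies bits 1-2 */

Writing new_val = 3 to ub2 through a 1-byte load/store gives:

    byte_size = 1
    rshift    = 64 - 2     = 62  /* total blank bits in the u64 */
    lshift    = 64 - 1 - 2 = 61  /* blank bits left of ub2 */
    rpad      = 62 - 61    = 1   /* ub2 starts at bit 1 */

    mask = (~0ULL << 62) >> 61 = 0b110
    nval = (3 << 1) & 0b110    = 0b110
    val  = (val & ~0b110) | 0b110

That is, exactly bits 1 and 2 of the loaded byte are replaced with the
new value.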
[0]: https://reviews.llvm.org/D133361
Co-authored-by: Eduard Zingerman <[email protected]>
Signed-off-by: Eduard Zingerman <[email protected]>
Co-authored-by: Jonathan Lemon <[email protected]>
Signed-off-by: Jonathan Lemon <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
tools/lib/bpf/bpf_core_read.h | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
index 1ac57bb7ac55..a7ffb80e3539 100644
--- a/tools/lib/bpf/bpf_core_read.h
+++ b/tools/lib/bpf/bpf_core_read.h
@@ -111,6 +111,40 @@ enum bpf_enum_value_kind {
val; \
})
+/*
+ * Write to a bitfield, identified by s->field.
+ * This is the inverse of BPF_CORE_READ_BITFIELD().
+ */
+#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \
+ void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \
+ unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
+ unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
+ unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
+ unsigned int rpad = rshift - lshift; \
+ unsigned long long nval, mask, val; \
+ \
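+	/* Launder p: the empty asm marks p as read/written, so the compiler \
+	 * cannot fold the byte-sized accesses below back into direct        \
+	 * (CO-RE relocated) bitfield accesses.                               \
+	 */                                                                   \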
+ asm volatile("" : "+r"(p)); \
+ \
+ switch (byte_size) { \
+ case 1: val = *(unsigned char *)p; break; \
+ case 2: val = *(unsigned short *)p; break; \
+ case 4: val = *(unsigned int *)p; break; \
+ case 8: val = *(unsigned long long *)p; break; \
+ } \
+ \
+ mask = (~0ULL << rshift) >> lshift; \
+ nval = new_val; \
+ nval = (nval << rpad) & mask; \
+ val = (val & ~mask) | nval; \
+ \
+ switch (byte_size) { \
+ case 1: *(unsigned char *)p = val; break; \
+ case 2: *(unsigned short *)p = val; break; \
+ case 4: *(unsigned int *)p = val; break; \
+ case 8: *(unsigned long long *)p = val; break; \
+ } \
+})
+
#define ___bpf_field_ref1(field) (field)
#define ___bpf_field_ref2(type, field) (((typeof(type) *)0)->field)
#define ___bpf_field_ref(args...) \
--
2.42.1
This commit adds support for per-prog btf_custom_path. This is necessary
for testing CO-RE relocations on non-vmlinux types using the test_loader
infrastructure.
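For example, the verifier tests added later in this series use the
annotation like so:

    SEC("tc")
    __description("single CO-RE bitfield roundtrip")
    __btf_path("btf__core_reloc_bitfields.bpf.o")
    __success __failure_unpriv
    __retval(3)
    int single_field_roundtrip(struct __sk_buff *ctx)
    { ... }

test_loader then passes the named file as open_opts->btf_custom_path
when opening the BPF object, so CO-RE relocations are resolved against
that BTF instead of vmlinux BTF.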
Signed-off-by: Daniel Xu <[email protected]>
---
tools/testing/selftests/bpf/progs/bpf_misc.h | 1 +
tools/testing/selftests/bpf/test_loader.c | 7 +++++++
2 files changed, 8 insertions(+)
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index 799fff4995d8..2fd59970c43a 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -71,6 +71,7 @@
#define __retval_unpriv(val) __attribute__((btf_decl_tag("comment:test_retval_unpriv="#val)))
#define __auxiliary __attribute__((btf_decl_tag("comment:test_auxiliary")))
#define __auxiliary_unpriv __attribute__((btf_decl_tag("comment:test_auxiliary_unpriv")))
+#define __btf_path(path) __attribute__((btf_decl_tag("comment:test_btf_path=" path)))
/* Convenience macro for use with 'asm volatile' blocks */
#define __naked __attribute__((naked))
diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c
index a350ecdfba4a..74ceb7877ae2 100644
--- a/tools/testing/selftests/bpf/test_loader.c
+++ b/tools/testing/selftests/bpf/test_loader.c
@@ -27,6 +27,7 @@
#define TEST_TAG_RETVAL_PFX_UNPRIV "comment:test_retval_unpriv="
#define TEST_TAG_AUXILIARY "comment:test_auxiliary"
#define TEST_TAG_AUXILIARY_UNPRIV "comment:test_auxiliary_unpriv"
+#define TEST_BTF_PATH "comment:test_btf_path="
/* Warning: duplicated in bpf_misc.h */
#define POINTER_VALUE 0xcafe4all
@@ -58,6 +59,7 @@ struct test_spec {
const char *prog_name;
struct test_subspec priv;
struct test_subspec unpriv;
+ const char *btf_custom_path;
int log_level;
int prog_flags;
int mode_mask;
@@ -288,6 +290,8 @@ static int parse_test_spec(struct test_loader *tester,
goto cleanup;
update_flags(&spec->prog_flags, flags, clear);
}
+ } else if (str_has_pfx(s, TEST_BTF_PATH)) {
+ spec->btf_custom_path = s + sizeof(TEST_BTF_PATH) - 1;
}
}
@@ -578,6 +582,9 @@ void run_subtest(struct test_loader *tester,
}
}
+ /* Implicitly reset to NULL if next test case doesn't specify */
+ open_opts->btf_custom_path = spec->btf_custom_path;
+
tobj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, open_opts);
if (!ASSERT_OK_PTR(tobj, "obj_open_mem")) /* shouldn't happen */
goto subtest_cleanup;
--
2.42.1
Add some tests that exercise the BPF_CORE_WRITE_BITFIELD() macro. Since some
non-trivial bit fiddling is going on, make sure various edge cases (such
as adjacent bitfields and bitfields at the edge of structs) are
exercised.
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/prog_tests/verifier.c | 2 +
.../bpf/progs/verifier_bitfield_write.c | 100 ++++++++++++++++++
2 files changed, 102 insertions(+)
create mode 100644 tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 5cfa7a6316b6..67b4948865a3 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -6,6 +6,7 @@
#include "verifier_and.skel.h"
#include "verifier_array_access.skel.h"
#include "verifier_basic_stack.skel.h"
+#include "verifier_bitfield_write.skel.h"
#include "verifier_bounds.skel.h"
#include "verifier_bounds_deduction.skel.h"
#include "verifier_bounds_deduction_non_const.skel.h"
@@ -115,6 +116,7 @@ static void run_tests_aux(const char *skel_name,
void test_verifier_and(void) { RUN(verifier_and); }
void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); }
+void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); }
void test_verifier_bounds(void) { RUN(verifier_bounds); }
void test_verifier_bounds_deduction(void) { RUN(verifier_bounds_deduction); }
void test_verifier_bounds_deduction_non_const(void) { RUN(verifier_bounds_deduction_non_const); }
diff --git a/tools/testing/selftests/bpf/progs/verifier_bitfield_write.c b/tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
new file mode 100644
index 000000000000..8fe355a19ba6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <stdint.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+#include "bpf_misc.h"
+
+struct core_reloc_bitfields {
+ /* unsigned bitfields */
+ uint8_t ub1: 1;
+ uint8_t ub2: 2;
+ uint32_t ub7: 7;
+ /* signed bitfields */
+ int8_t sb4: 4;
+ int32_t sb20: 20;
+ /* non-bitfields */
+ uint32_t u32;
+ int32_t s32;
+} __attribute__((preserve_access_index));
+
+SEC("tc")
+__description("single CO-RE bitfield roundtrip")
+__btf_path("btf__core_reloc_bitfields.bpf.o")
+__success __failure_unpriv
+__retval(3)
+int single_field_roundtrip(struct __sk_buff *ctx)
+{
+ struct core_reloc_bitfields bitfields;
+
+ __builtin_memset(&bitfields, 0, sizeof(bitfields));
+ BPF_CORE_WRITE_BITFIELD(&bitfields, ub2, 3);
+ return BPF_CORE_READ_BITFIELD(&bitfields, ub2);
+}
+
+SEC("tc")
+__description("multiple CO-RE bitfield roundtrip")
+__btf_path("btf__core_reloc_bitfields.bpf.o")
+__success __failure_unpriv
+__retval(0x3FD)
+int multiple_field_roundtrip(struct __sk_buff *ctx)
+{
+ struct core_reloc_bitfields bitfields;
+ uint8_t ub2;
+ int8_t sb4;
+
+ __builtin_memset(&bitfields, 0, sizeof(bitfields));
+ BPF_CORE_WRITE_BITFIELD(&bitfields, ub2, 1);
+ BPF_CORE_WRITE_BITFIELD(&bitfields, sb4, -1);
+
+ ub2 = BPF_CORE_READ_BITFIELD(&bitfields, ub2);
+ sb4 = BPF_CORE_READ_BITFIELD(&bitfields, sb4);
+
+ return (((uint8_t)sb4) << 2) | ub2;
+}
+
+SEC("tc")
+__description("adjacent CO-RE bitfield roundtrip")
+__btf_path("btf__core_reloc_bitfields.bpf.o")
+__success __failure_unpriv
+__retval(7)
+int adjacent_field_roundtrip(struct __sk_buff *ctx)
+{
+ struct core_reloc_bitfields bitfields;
+ uint8_t ub1, ub2;
+
+ __builtin_memset(&bitfields, 0, sizeof(bitfields));
+ BPF_CORE_WRITE_BITFIELD(&bitfields, ub1, 1);
+ BPF_CORE_WRITE_BITFIELD(&bitfields, ub2, 3);
+
+ ub1 = BPF_CORE_READ_BITFIELD(&bitfields, ub1);
+ ub2 = BPF_CORE_READ_BITFIELD(&bitfields, ub2);
+
+ return (ub2 << 1) | ub1;
+}
+
+SEC("tc")
+__description("multibyte CO-RE bitfield roundtrip")
+__btf_path("btf__core_reloc_bitfields.bpf.o")
+__success __failure_unpriv
+__retval(0x21)
+int multibyte_field_roundtrip(struct __sk_buff *ctx)
+{
+ struct core_reloc_bitfields bitfields;
+ uint32_t ub7;
+ uint8_t ub1;
+
+ __builtin_memset(&bitfields, 0, sizeof(bitfields));
+ BPF_CORE_WRITE_BITFIELD(&bitfields, ub1, 1);
+ BPF_CORE_WRITE_BITFIELD(&bitfields, ub7, 16);
+
+ ub1 = BPF_CORE_READ_BITFIELD(&bitfields, ub1);
+ ub7 = BPF_CORE_READ_BITFIELD(&bitfields, ub7);
+
+ return (ub7 << 1) | ub1;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.42.1
This helps with determinism because per-subtest setup/teardown prevents
state from leaking between subtests.
Signed-off-by: Daniel Xu <[email protected]>
---
tools/testing/selftests/bpf/prog_tests/test_tunnel.c | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
index d149ab98798d..b57d48219d0b 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
@@ -535,23 +535,20 @@ static void test_ipip_tunnel(enum ipip_encap encap)
#define RUN_TEST(name, ...) \
({ \
if (test__start_subtest(#name)) { \
+ config_device(); \
test_ ## name(__VA_ARGS__); \
+ cleanup(); \
} \
})
static void *test_tunnel_run_tests(void *arg)
{
- cleanup();
- config_device();
-
RUN_TEST(vxlan_tunnel);
RUN_TEST(ip6vxlan_tunnel);
RUN_TEST(ipip_tunnel, NONE);
RUN_TEST(ipip_tunnel, FOU);
RUN_TEST(ipip_tunnel, GUE);
- cleanup();
-
return NULL;
}
--
2.42.1
vmlinux.h declarations are more ergonomic, especially when working with
kfuncs. The uapi headers are often incomplete for kfunc definitions.
This commit also switches bitfield accesses to use CO-RE helpers.
Switching to vmlinux.h definitions makes the verifier very
unhappy with raw bitfield accesses. The error is:
; md.u.md2.dir = direction;
33: (69) r1 = *(u16 *)(r2 +11)
misaligned stack access off (0x0; 0x0)+-64+11 size 2
Fix by using CO-RE-aware bitfield reads and writes.
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/progs/bpf_tracing_net.h | 1 +
.../selftests/bpf/progs/test_tunnel_kern.c | 76 +++++--------------
2 files changed, 22 insertions(+), 55 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
index 0b793a102791..1bdc680b0e0e 100644
--- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -26,6 +26,7 @@
#define IPV6_AUTOFLOWLABEL 70
#define TC_ACT_UNSPEC (-1)
+#define TC_ACT_OK 0
#define TC_ACT_SHOT 2
#define SOL_TCP 6
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index f66af753bbbb..b320fb7bb080 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -6,62 +6,26 @@
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
-#include <stddef.h>
-#include <string.h>
-#include <arpa/inet.h>
-#include <linux/bpf.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/if_tunnel.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/icmp.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/pkt_cls.h>
-#include <linux/erspan.h>
-#include <linux/udp.h>
+#include "vmlinux.h"
+#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
+#include "bpf_kfuncs.h"
+#include "bpf_tracing_net.h"
#define log_err(__ret) bpf_printk("ERROR line:%d ret:%d\n", __LINE__, __ret)
-#define VXLAN_UDP_PORT 4789
+#define VXLAN_UDP_PORT 4789
+#define ETH_P_IP 0x0800
+#define PACKET_HOST 0
+#define TUNNEL_CSUM bpf_htons(0x01)
+#define TUNNEL_KEY bpf_htons(0x04)
/* Only IPv4 address assigned to veth1.
* 172.16.1.200
*/
#define ASSIGNED_ADDR_VETH1 0xac1001c8
-struct geneve_opt {
- __be16 opt_class;
- __u8 type;
- __u8 length:5;
- __u8 r3:1;
- __u8 r2:1;
- __u8 r1:1;
- __u8 opt_data[8]; /* hard-coded to 8 byte */
-};
-
-struct vxlanhdr {
- __be32 vx_flags;
- __be32 vx_vni;
-} __attribute__((packed));
-
-struct vxlan_metadata {
- __u32 gbp;
-};
-
-struct bpf_fou_encap {
- __be16 sport;
- __be16 dport;
-};
-
-enum bpf_fou_encap_type {
- FOU_BPF_ENCAP_FOU,
- FOU_BPF_ENCAP_GUE,
-};
-
int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap, int type) __ksym;
int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
@@ -205,9 +169,9 @@ int erspan_set_tunnel(struct __sk_buff *skb)
__u8 hwid = 7;
md.version = 2;
- md.u.md2.dir = direction;
- md.u.md2.hwid = hwid & 0xf;
- md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, dir, direction);
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid, (hwid & 0xf));
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid_upper, (hwid >> 4) & 0x3);
#endif
ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
@@ -246,8 +210,9 @@ int erspan_get_tunnel(struct __sk_buff *skb)
bpf_printk("\tindex %x\n", index);
#else
bpf_printk("\tdirection %d hwid %x timestamp %u\n",
- md.u.md2.dir,
- (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+ BPF_CORE_READ_BITFIELD(&md.u.md2, dir),
+ (BPF_CORE_READ_BITFIELD(&md.u.md2, hwid_upper) << 4) +
+ BPF_CORE_READ_BITFIELD(&md.u.md2, hwid),
bpf_ntohl(md.u.md2.timestamp));
#endif
@@ -284,9 +249,9 @@ int ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
__u8 hwid = 17;
md.version = 2;
- md.u.md2.dir = direction;
- md.u.md2.hwid = hwid & 0xf;
- md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, dir, direction);
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid, (hwid & 0xf));
+ BPF_CORE_WRITE_BITFIELD(&md.u.md2, hwid_upper, (hwid >> 4) & 0x3);
#endif
ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
@@ -326,8 +291,9 @@ int ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
bpf_printk("\tindex %x\n", index);
#else
bpf_printk("\tdirection %d hwid %x timestamp %u\n",
- md.u.md2.dir,
- (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+ BPF_CORE_READ_BITFIELD(&md.u.md2, dir),
+ (BPF_CORE_READ_BITFIELD(&md.u.md2, hwid_upper) << 4) +
+ BPF_CORE_READ_BITFIELD(&md.u.md2, hwid),
bpf_ntohl(md.u.md2.timestamp));
#endif
--
2.42.1
test_progs is better than a shell script because C is easier to
maintain than shell. It is also easier to use new infrastructure, like
memory-mapped global variables, from C via BPF skeletons.
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/prog_tests/test_tunnel.c | 143 ++++++++++++++++++
.../selftests/bpf/progs/test_tunnel_kern.c | 11 +-
tools/testing/selftests/bpf/test_tunnel.sh | 92 -----------
3 files changed, 151 insertions(+), 95 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
index b57d48219d0b..2d7f8fa82ebd 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
@@ -50,6 +50,7 @@
*/
#include <arpa/inet.h>
+#include <linux/if_link.h>
#include <linux/if_tun.h>
#include <linux/limits.h>
#include <linux/sysctl.h>
@@ -92,6 +93,11 @@
#define IPIP_TUNL_DEV0 "ipip00"
#define IPIP_TUNL_DEV1 "ipip11"
+#define XFRM_AUTH "0x1111111111111111111111111111111111111111"
+#define XFRM_ENC "0x22222222222222222222222222222222"
+#define XFRM_SPI_IN_TO_OUT 0x1
+#define XFRM_SPI_OUT_TO_IN 0x2
+
#define PING_ARGS "-i 0.01 -c 3 -w 10 -q"
static int config_device(void)
@@ -264,6 +270,92 @@ static void delete_ipip_tunnel(void)
SYS_NOFAIL("ip fou del port 5555 2> /dev/null");
}
+static int add_xfrm_tunnel(void)
+{
+ /* at_ns0 namespace
+ * at_ns0 -> root
+ */
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 1 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm policy add src %s/32 dst %s/32 dir out "
+ "tmpl src %s dst %s proto esp reqid 1 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);
+
+ /* root -> at_ns0 */
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 2 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip netns exec at_ns0 "
+ "ip xfrm policy add src %s/32 dst %s/32 dir in "
+ "tmpl src %s dst %s proto esp reqid 2 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);
+
+ /* address & route */
+ SYS(fail, "ip netns exec at_ns0 ip addr add dev veth0 %s/32",
+ IP4_ADDR_TUNL_DEV0);
+ SYS(fail, "ip netns exec at_ns0 ip route add %s dev veth0 via %s src %s",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR1_VETH1, IP4_ADDR_TUNL_DEV0);
+
+ /* root namespace
+ * at_ns0 -> root
+ */
+ SYS(fail,
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 1 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip xfrm policy add src %s/32 dst %s/32 dir in "
+ "tmpl src %s dst %s proto esp reqid 1 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1, IP4_ADDR_VETH0, IP4_ADDR1_VETH1);
+
+ /* root -> at_ns0 */
+ SYS(fail,
+ "ip xfrm state add src %s dst %s proto esp "
+ "spi %d reqid 2 mode tunnel "
+ "auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
+ IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
+ SYS(fail,
+ "ip xfrm policy add src %s/32 dst %s/32 dir out "
+ "tmpl src %s dst %s proto esp reqid 2 "
+ "mode tunnel",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0, IP4_ADDR1_VETH1, IP4_ADDR_VETH0);
+
+ /* address & route */
+ SYS(fail, "ip addr add dev veth1 %s/32", IP4_ADDR_TUNL_DEV1);
+ SYS(fail, "ip route add %s dev veth1 via %s src %s",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_VETH0, IP4_ADDR_TUNL_DEV1);
+
+ return 0;
+fail:
+ return -1;
+}
+
+static void delete_xfrm_tunnel(void)
+{
+ SYS_NOFAIL("ip xfrm policy delete dir out src %s/32 dst %s/32 2> /dev/null",
+ IP4_ADDR_TUNL_DEV1, IP4_ADDR_TUNL_DEV0);
+ SYS_NOFAIL("ip xfrm policy delete dir in src %s/32 dst %s/32 2> /dev/null",
+ IP4_ADDR_TUNL_DEV0, IP4_ADDR_TUNL_DEV1);
+ SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
+ IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT);
+ SYS_NOFAIL("ip xfrm state delete src %s dst %s proto esp spi %d 2> /dev/null",
+ IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN);
+}
+
static int test_ping(int family, const char *addr)
{
SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr);
@@ -532,6 +624,56 @@ static void test_ipip_tunnel(enum ipip_encap encap)
test_tunnel_kern__destroy(skel);
}
+static void test_xfrm_tunnel(void)
+{
+ DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook,
+ .attach_point = BPF_TC_INGRESS);
+ struct test_tunnel_kern *skel = NULL;
+ struct nstoken *nstoken;
+ int tc_prog_fd;
+ int ifindex;
+ int err;
+
+ err = add_xfrm_tunnel();
+ if (!ASSERT_OK(err, "add_xfrm_tunnel"))
+ return;
+
+ skel = test_tunnel_kern__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load"))
+ goto done;
+
+ ifindex = if_nametoindex("veth1");
+ if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex"))
+ goto done;
+
+ /* attach tc prog to tunnel dev */
+ tc_hook.ifindex = ifindex;
+ tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state);
+ if (!ASSERT_GE(tc_prog_fd, 0, "bpf_program__fd"))
+ goto done;
+ if (attach_tc_prog(&tc_hook, tc_prog_fd, -1))
+ goto done;
+
+ /* ping from at_ns0 namespace test */
+ nstoken = open_netns("at_ns0");
+ err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
+ close_netns(nstoken);
+ if (!ASSERT_OK(err, "test_ping"))
+ goto done;
+
+ if (!ASSERT_EQ(skel->bss->xfrm_reqid, 1, "req_id"))
+ goto done;
+ if (!ASSERT_EQ(skel->bss->xfrm_spi, XFRM_SPI_IN_TO_OUT, "spi"))
+ goto done;
+ if (!ASSERT_EQ(skel->bss->xfrm_remote_ip, 0xac100164, "remote_ip"))
+ goto done;
+
+done:
+ delete_xfrm_tunnel();
+ if (skel)
+ test_tunnel_kern__destroy(skel);
+}
+
#define RUN_TEST(name, ...) \
({ \
if (test__start_subtest(#name)) { \
@@ -548,6 +690,7 @@ static void *test_tunnel_run_tests(void *arg)
RUN_TEST(ipip_tunnel, NONE);
RUN_TEST(ipip_tunnel, FOU);
RUN_TEST(ipip_tunnel, GUE);
+ RUN_TEST(xfrm_tunnel);
return NULL;
}
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index b320fb7bb080..3a59eb9c34de 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -929,6 +929,10 @@ int ip6ip6_get_tunnel(struct __sk_buff *skb)
return TC_ACT_OK;
}
+volatile int xfrm_reqid = 0;
+volatile int xfrm_spi = 0;
+volatile int xfrm_remote_ip = 0;
+
SEC("tc")
int xfrm_get_state(struct __sk_buff *skb)
{
@@ -939,9 +943,10 @@ int xfrm_get_state(struct __sk_buff *skb)
if (ret < 0)
return TC_ACT_OK;
- bpf_printk("reqid %d spi 0x%x remote ip 0x%x\n",
- x.reqid, bpf_ntohl(x.spi),
- bpf_ntohl(x.remote_ipv4));
+ xfrm_reqid = x.reqid;
+ xfrm_spi = bpf_ntohl(x.spi);
+ xfrm_remote_ip = bpf_ntohl(x.remote_ipv4);
+
return TC_ACT_OK;
}
diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh
index 2dec7dbf29a2..d9661b9988ba 100755
--- a/tools/testing/selftests/bpf/test_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tunnel.sh
@@ -517,90 +517,6 @@ test_ip6ip6()
echo -e ${GREEN}"PASS: ip6$TYPE"${NC}
}
-setup_xfrm_tunnel()
-{
- auth=0x$(printf '1%.0s' {1..40})
- enc=0x$(printf '2%.0s' {1..32})
- spi_in_to_out=0x1
- spi_out_to_in=0x2
- # at_ns0 namespace
- # at_ns0 -> root
- ip netns exec at_ns0 \
- ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
- spi $spi_in_to_out reqid 1 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip netns exec at_ns0 \
- ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir out \
- tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
- mode tunnel
- # root -> at_ns0
- ip netns exec at_ns0 \
- ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
- spi $spi_out_to_in reqid 2 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip netns exec at_ns0 \
- ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir in \
- tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
- mode tunnel
- # address & route
- ip netns exec at_ns0 \
- ip addr add dev veth0 10.1.1.100/32
- ip netns exec at_ns0 \
- ip route add 10.1.1.200 dev veth0 via 172.16.1.200 \
- src 10.1.1.100
-
- # root namespace
- # at_ns0 -> root
- ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
- spi $spi_in_to_out reqid 1 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir in \
- tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
- mode tunnel
- # root -> at_ns0
- ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
- spi $spi_out_to_in reqid 2 mode tunnel \
- auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
- ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir out \
- tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
- mode tunnel
- # address & route
- ip addr add dev veth1 10.1.1.200/32
- ip route add 10.1.1.100 dev veth1 via 172.16.1.100 src 10.1.1.200
-}
-
-test_xfrm_tunnel()
-{
- if [[ -e /sys/kernel/tracing/trace ]]; then
- TRACE=/sys/kernel/tracing/trace
- else
- TRACE=/sys/kernel/debug/tracing/trace
- fi
- config_device
- > ${TRACE}
- setup_xfrm_tunnel
- mkdir -p ${BPF_PIN_TUNNEL_DIR}
- bpftool prog loadall ${BPF_FILE} ${BPF_PIN_TUNNEL_DIR}
- tc qdisc add dev veth1 clsact
- tc filter add dev veth1 proto ip ingress bpf da object-pinned \
- ${BPF_PIN_TUNNEL_DIR}/xfrm_get_state
- ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
- sleep 1
- grep "reqid 1" ${TRACE}
- check_err $?
- grep "spi 0x1" ${TRACE}
- check_err $?
- grep "remote ip 0xac100164" ${TRACE}
- check_err $?
- cleanup
-
- if [ $ret -ne 0 ]; then
- echo -e ${RED}"FAIL: xfrm tunnel"${NC}
- return 1
- fi
- echo -e ${GREEN}"PASS: xfrm tunnel"${NC}
-}
-
attach_bpf()
{
DEV=$1
@@ -630,10 +546,6 @@ cleanup()
ip link del ip6geneve11 2> /dev/null
ip link del erspan11 2> /dev/null
ip link del ip6erspan11 2> /dev/null
- ip xfrm policy delete dir out src 10.1.1.200/32 dst 10.1.1.100/32 2> /dev/null
- ip xfrm policy delete dir in src 10.1.1.100/32 dst 10.1.1.200/32 2> /dev/null
- ip xfrm state delete src 172.16.1.100 dst 172.16.1.200 proto esp spi 0x1 2> /dev/null
- ip xfrm state delete src 172.16.1.200 dst 172.16.1.100 proto esp spi 0x2 2> /dev/null
}
cleanup_exit()
@@ -716,10 +628,6 @@ bpf_tunnel_test()
test_ip6ip6
errors=$(( $errors + $? ))
- echo "Testing IPSec tunnel..."
- test_xfrm_tunnel
- errors=$(( $errors + $? ))
-
return $errors
}
--
2.42.1
This commit extends the test_tunnel selftest to exercise the new XDP
xfrm state lookup kfunc.
Co-developed-by: Antony Antony <[email protected]>
Signed-off-by: Antony Antony <[email protected]>
Signed-off-by: Daniel Xu <[email protected]>
---
.../selftests/bpf/prog_tests/test_tunnel.c | 20 ++++++--
.../selftests/bpf/progs/test_tunnel_kern.c | 51 +++++++++++++++++++
2 files changed, 67 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
index 2d7f8fa82ebd..fc804095d578 100644
--- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
+++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c
@@ -278,7 +278,7 @@ static int add_xfrm_tunnel(void)
SYS(fail,
"ip netns exec at_ns0 "
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 1 mode tunnel "
+ "spi %d reqid 1 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -292,7 +292,7 @@ static int add_xfrm_tunnel(void)
SYS(fail,
"ip netns exec at_ns0 "
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 2 mode tunnel "
+ "spi %d reqid 2 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -313,7 +313,7 @@ static int add_xfrm_tunnel(void)
*/
SYS(fail,
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 1 mode tunnel "
+ "spi %d reqid 1 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR_VETH0, IP4_ADDR1_VETH1, XFRM_SPI_IN_TO_OUT, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -325,7 +325,7 @@ static int add_xfrm_tunnel(void)
/* root -> at_ns0 */
SYS(fail,
"ip xfrm state add src %s dst %s proto esp "
- "spi %d reqid 2 mode tunnel "
+ "spi %d reqid 2 mode tunnel replay-window 42 "
"auth-trunc 'hmac(sha1)' %s 96 enc 'cbc(aes)' %s",
IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN, XFRM_AUTH, XFRM_ENC);
SYS(fail,
@@ -628,8 +628,10 @@ static void test_xfrm_tunnel(void)
{
DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook,
.attach_point = BPF_TC_INGRESS);
+ LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
struct test_tunnel_kern *skel = NULL;
struct nstoken *nstoken;
+ int xdp_prog_fd;
int tc_prog_fd;
int ifindex;
int err;
@@ -654,6 +656,14 @@ static void test_xfrm_tunnel(void)
if (attach_tc_prog(&tc_hook, tc_prog_fd, -1))
goto done;
+ /* attach xdp prog to tunnel dev */
+ xdp_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state_xdp);
+ if (!ASSERT_GE(xdp_prog_fd, 0, "bpf_program__fd"))
+ goto done;
+ err = bpf_xdp_attach(ifindex, xdp_prog_fd, XDP_FLAGS_REPLACE, &opts);
+ if (!ASSERT_OK(err, "bpf_xdp_attach"))
+ goto done;
+
/* ping from at_ns0 namespace test */
nstoken = open_netns("at_ns0");
err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1);
@@ -667,6 +677,8 @@ static void test_xfrm_tunnel(void)
goto done;
if (!ASSERT_EQ(skel->bss->xfrm_remote_ip, 0xac100164, "remote_ip"))
goto done;
+ if (!ASSERT_EQ(skel->bss->xfrm_replay_window, 42, "replay_window"))
+ goto done;
done:
delete_xfrm_tunnel();
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
index 3a59eb9c34de..c0dd38616562 100644
--- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -30,6 +30,10 @@ int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap, int type) __ksym;
int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
struct bpf_fou_encap *encap) __ksym;
+struct xfrm_state *
+bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts,
+ u32 opts__sz) __ksym;
+void bpf_xdp_xfrm_state_release(struct xfrm_state *x) __ksym;
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
@@ -950,4 +954,51 @@ int xfrm_get_state(struct __sk_buff *skb)
return TC_ACT_OK;
}
+volatile int xfrm_replay_window = 0;
+
+SEC("xdp")
+int xfrm_get_state_xdp(struct xdp_md *xdp)
+{
+ struct bpf_xfrm_state_opts opts = {};
+ struct xfrm_state *x = NULL;
+ struct ip_esp_hdr *esph;
+ struct bpf_dynptr ptr;
+ u8 esph_buf[8] = {};
+ u8 iph_buf[20] = {};
+ struct iphdr *iph;
+ u32 off;
+
+ if (bpf_dynptr_from_xdp(xdp, 0, &ptr))
+ goto out;
+
+ off = sizeof(struct ethhdr);
+ iph = bpf_dynptr_slice(&ptr, off, iph_buf, sizeof(iph_buf));
+ if (!iph || iph->protocol != IPPROTO_ESP)
+ goto out;
+
+ off += sizeof(struct iphdr);
+ esph = bpf_dynptr_slice(&ptr, off, esph_buf, sizeof(esph_buf));
+ if (!esph)
+ goto out;
+
+ opts.netns_id = BPF_F_CURRENT_NETNS;
+ opts.daddr.a4 = iph->daddr;
+ opts.spi = esph->spi;
+ opts.proto = IPPROTO_ESP;
+ opts.family = AF_INET;
+
+ x = bpf_xdp_get_xfrm_state(xdp, &opts, sizeof(opts));
+ if (!x || opts.error)
+ goto out;
+
+ if (!x->replay_esn)
+ goto out;
+
+ xfrm_replay_window = x->replay_esn->replay_window;
+out:
+ if (x)
+ bpf_xdp_xfrm_state_release(x);
+ return XDP_PASS;
+}
+
char _license[] SEC("license") = "GPL";
--
2.42.1
On Fri, Dec 01, 2023 at 01:23:14PM -0700, Daniel Xu via Devel wrote:
> [...]
> Co-authored-by: Eduard Zingerman <[email protected]>
> Signed-off-by: Eduard Zingerman <[email protected]>
Just pointing out I inserted Eduard's tags here. Eduard - I hope that's
OK. Not sure what the usual procedure for this is.
[..]
On Fri, 2023-12-01 at 13:51 -0700, Daniel Xu wrote:
> On Fri, Dec 01, 2023 at 01:23:14PM -0700, Daniel Xu via Devel wrote:
> > [...]
> > Co-authored-by: Eduard Zingerman <[email protected]>
> > Signed-off-by: Eduard Zingerman <[email protected]>
>
> Just pointing out I inserted Eduard's tags here. Eduard - I hope that's
> OK. Not sure what the usual procedure for this is.
Not that I made a big contribution; you and Andrii figured out a much
better (and correct) expression :) I'm fine with or without the tag.
Thank you for working on this.
On Fri, Dec 1, 2023 at 12:24 PM Daniel Xu <[email protected]> wrote:
>
> [...]
> To start, it helps by explaining what LSHIFT_U64 (lshift) and RSHIFT_U64
> (rshift) is designed to mean. Consider the core of the
> BPF_CORE_READ_BITFIELD() algorithm:
>
> val <<= __CORE_RELO(s, field, LSHIFT_U64);
> val = val >> __CORE_RELO(s, field, RSHIFT_U64);
nit: indentation is off?
> [...]
Nice diagrams and description, thanks!
> [...]
> + mask = (~0ULL << rshift) >> lshift; \
> + nval = new_val; \
> + nval = (nval << rpad) & mask; \
> + val = (val & ~mask) | nval; \
I'd simplify it to not need nval at all:

    val = (val & ~mask) | ((new_val << rpad) & mask);

I actually find it easier to follow and to make sure we are not doing
anything unexpected. In the first part, before the |, we take the old
value and clear the bits we are about to set; in the second part, after
the |, we take the bitfield value, shift it into position, and, just in
case, mask it out if it's too big to fit. Combine, done.
Other than that, it looks good.
> [...]
On Fri, Dec 1, 2023 at 12:24 PM Daniel Xu <[email protected]> wrote:
>
> This commit adds support for per-prog btf_custom_path. This is necessary
> for testing CO-RE relocations on non-vmlinux types using the test_loader
> infrastructure.
>
> Signed-off-by: Daniel Xu <[email protected]>
> ---
> tools/testing/selftests/bpf/progs/bpf_misc.h | 1 +
> tools/testing/selftests/bpf/test_loader.c | 7 +++++++
> 2 files changed, 8 insertions(+)
>
Acked-by: Andrii Nakryiko <[email protected]>
> diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
> index 799fff4995d8..2fd59970c43a 100644
> --- a/tools/testing/selftests/bpf/progs/bpf_misc.h
> +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
> @@ -71,6 +71,7 @@
> #define __retval_unpriv(val) __attribute__((btf_decl_tag("comment:test_retval_unpriv="#val)))
> #define __auxiliary __attribute__((btf_decl_tag("comment:test_auxiliary")))
> #define __auxiliary_unpriv __attribute__((btf_decl_tag("comment:test_auxiliary_unpriv")))
> +#define __btf_path(path) __attribute__((btf_decl_tag("comment:test_btf_path=" path)))
>
> /* Convenience macro for use with 'asm volatile' blocks */
> #define __naked __attribute__((naked))
> diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c
> index a350ecdfba4a..74ceb7877ae2 100644
> --- a/tools/testing/selftests/bpf/test_loader.c
> +++ b/tools/testing/selftests/bpf/test_loader.c
> @@ -27,6 +27,7 @@
> #define TEST_TAG_RETVAL_PFX_UNPRIV "comment:test_retval_unpriv="
> #define TEST_TAG_AUXILIARY "comment:test_auxiliary"
> #define TEST_TAG_AUXILIARY_UNPRIV "comment:test_auxiliary_unpriv"
> +#define TEST_BTF_PATH "comment:test_btf_path="
>
> /* Warning: duplicated in bpf_misc.h */
> #define POINTER_VALUE 0xcafe4all
> @@ -58,6 +59,7 @@ struct test_spec {
> const char *prog_name;
> struct test_subspec priv;
> struct test_subspec unpriv;
> + const char *btf_custom_path;
> int log_level;
> int prog_flags;
> int mode_mask;
> @@ -288,6 +290,8 @@ static int parse_test_spec(struct test_loader *tester,
> goto cleanup;
> update_flags(&spec->prog_flags, flags, clear);
> }
> + } else if (str_has_pfx(s, TEST_BTF_PATH)) {
> + spec->btf_custom_path = s + sizeof(TEST_BTF_PATH) - 1;
> }
> }
>
> @@ -578,6 +582,9 @@ void run_subtest(struct test_loader *tester,
> }
> }
>
> + /* Implicitly reset to NULL if next test case doesn't specify */
> + open_opts->btf_custom_path = spec->btf_custom_path;
> +
> tobj = bpf_object__open_mem(obj_bytes, obj_byte_cnt, open_opts);
> if (!ASSERT_OK_PTR(tobj, "obj_open_mem")) /* shouldn't happen */
> goto subtest_cleanup;
> --
> 2.42.1
>
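A minimal sketch of how the new annotation composes with the existing
test_loader tags (the program and the .bpf.o name here are made up for
illustration; real usage appears in the bitfield selftest later in the
series):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    #include "bpf_misc.h"

    SEC("tc")
    __description("load against a custom BTF blob")
    __btf_path("btf__custom_types.bpf.o")   /* hypothetical BTF object */
    __success __retval(0)
    int custom_btf_example(struct __sk_buff *ctx)
    {
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";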
On Fri, Dec 1, 2023 at 12:24 PM Daniel Xu <[email protected]> wrote:
>
> Add some tests that exercise BPF_CORE_WRITE_BITFIELD() macro. Since some
> non-trivial bit fiddling is going on, make sure various edge cases (such
> as adjacent bitfields and bitfields at the edge of structs) are
> exercised.
>
> Signed-off-by: Daniel Xu <[email protected]>
> ---
> .../selftests/bpf/prog_tests/verifier.c | 2 +
> .../bpf/progs/verifier_bitfield_write.c | 100 ++++++++++++++++++
> 2 files changed, 102 insertions(+)
> create mode 100644 tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
>
LGTM, but I'm not sure why we need all those __failure_unpriv, see
below. Regardless:
Acked-by: Andrii Nakryiko <[email protected]>
> diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
> index 5cfa7a6316b6..67b4948865a3 100644
> --- a/tools/testing/selftests/bpf/prog_tests/verifier.c
> +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
> @@ -6,6 +6,7 @@
> #include "verifier_and.skel.h"
> #include "verifier_array_access.skel.h"
> #include "verifier_basic_stack.skel.h"
> +#include "verifier_bitfield_write.skel.h"
> #include "verifier_bounds.skel.h"
> #include "verifier_bounds_deduction.skel.h"
> #include "verifier_bounds_deduction_non_const.skel.h"
> @@ -115,6 +116,7 @@ static void run_tests_aux(const char *skel_name,
>
> void test_verifier_and(void) { RUN(verifier_and); }
> void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); }
> +void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); }
> void test_verifier_bounds(void) { RUN(verifier_bounds); }
> void test_verifier_bounds_deduction(void) { RUN(verifier_bounds_deduction); }
> void test_verifier_bounds_deduction_non_const(void) { RUN(verifier_bounds_deduction_non_const); }
> diff --git a/tools/testing/selftests/bpf/progs/verifier_bitfield_write.c b/tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
> new file mode 100644
> index 000000000000..8fe355a19ba6
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
> @@ -0,0 +1,100 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/bpf.h>
> +#include <stdint.h>
> +
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_core_read.h>
> +
> +#include "bpf_misc.h"
> +
> +struct core_reloc_bitfields {
> + /* unsigned bitfields */
> + uint8_t ub1: 1;
> + uint8_t ub2: 2;
> + uint32_t ub7: 7;
> + /* signed bitfields */
> + int8_t sb4: 4;
> + int32_t sb20: 20;
> + /* non-bitfields */
> + uint32_t u32;
> + int32_t s32;
> +} __attribute__((preserve_access_index));
> +
> +SEC("tc")
> +__description("single CO-RE bitfield roundtrip")
> +__btf_path("btf__core_reloc_bitfields.bpf.o")
> +__success __failure_unpriv
do we want __failure_unpriv at all? Is this failure related to
*bitfield* logic at all?
> +__retval(3)
> +int single_field_roundtrip(struct __sk_buff *ctx)
> +{
> + struct core_reloc_bitfields bitfields;
> +
> + __builtin_memset(&bitfields, 0, sizeof(bitfields));
> + BPF_CORE_WRITE_BITFIELD(&bitfields, ub2, 3);
> + return BPF_CORE_READ_BITFIELD(&bitfields, ub2);
> +}
> +
[...]
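The elided tests include the adjacent-bitfield cases the commit message
mentions; a sketch of the shape such a case takes, following the
roundtrip test above (the program body is illustrative, not the exact
upstream test):

    SEC("tc")
    __description("adjacent CO-RE bitfields roundtrip")
    __btf_path("btf__core_reloc_bitfields.bpf.o")
    __success __retval(3)
    int adjacent_fields_roundtrip(struct __sk_buff *ctx)
    {
            struct core_reloc_bitfields bitfields;

            __builtin_memset(&bitfields, 0, sizeof(bitfields));
            /* neighboring writes must not clobber each other */
            BPF_CORE_WRITE_BITFIELD(&bitfields, ub1, 1);
            BPF_CORE_WRITE_BITFIELD(&bitfields, ub2, 2);
            return BPF_CORE_READ_BITFIELD(&bitfields, ub1) +
                   BPF_CORE_READ_BITFIELD(&bitfields, ub2);
    }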
On Fri, Dec 1, 2023 at 12:23 PM Daniel Xu <[email protected]> wrote:
>
> This patchset adds two kfunc helpers, bpf_xdp_get_xfrm_state() and
> bpf_xdp_xfrm_state_release() that wrap xfrm_state_lookup() and
> xfrm_state_put(). The intent is to support software RSS (via XDP) for
> the ongoing/upcoming ipsec pcpu work [0]. Recent experiments performed
> on (hopefully) reproducible AWS testbeds indicate that single tunnel
> pcpu ipsec can reach line rate on 100G ENA nics.
>
> Note this patchset only tests/shows generic xfrm_state access. The
> "secret sauce" (if you can really even call it that) involves accessing
> a soon-to-be-upstreamed pcpu_num field in xfrm_state. Early example is
> available here [1].
>
> [0]: https://datatracker.ietf.org/doc/draft-ietf-ipsecme-multi-sa-performance/03/
> [1]: https://github.com/danobi/xdp-tools/blob/e89a1c617aba3b50d990f779357d6ce2863ecb27/xdp-bench/xdp_redirect_cpumap.bpf.c#L385-L406
>
> Changes from v2:
> * Fix/simplify BPF_CORE_WRITE_BITFIELD() algorithm
> * Added verifier tests for bitfield writes
> * Fix state leakage across test_tunnel subtests
>
> Changes from v1:
> * Move xfrm tunnel tests to test_progs
> * Fix writing to opts->error when opts is invalid
> * Use __bpf_kfunc_start_defs()
> * Remove unused vxlanhdr definition
> * Add and use BPF_CORE_WRITE_BITFIELD() macro
> * Make series bisect clean
>
> Changes from RFCv2:
> * Rebased to ipsec-next
> * Fix netns leak
>
> Changes from RFCv1:
> * Add Antony's commit tags
> * Add KF_ACQUIRE and KF_RELEASE semantics
>
> Daniel Xu (9):
> bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc
> bpf: xfrm: Add bpf_xdp_xfrm_state_release() kfunc
> libbpf: Add BPF_CORE_WRITE_BITFIELD() macro
> bpf: selftests: test_loader: Support __btf_path() annotation
> libbpf: selftests: Add verifier tests for CO-RE bitfield writes
> bpf: selftests: test_tunnel: Setup fresh topology for each subtest
> bpf: selftests: test_tunnel: Use vmlinux.h declarations
> bpf: selftests: Move xfrm tunnel test to test_progs
> bpf: xfrm: Add selftest for bpf_xdp_get_xfrm_state()
>
> include/net/xfrm.h | 9 +
> net/xfrm/Makefile | 1 +
> net/xfrm/xfrm_policy.c | 2 +
> net/xfrm/xfrm_state_bpf.c | 128 ++++++++++++++
> tools/lib/bpf/bpf_core_read.h | 34 ++++
> .../selftests/bpf/prog_tests/test_tunnel.c | 162 +++++++++++++++++-
> .../selftests/bpf/prog_tests/verifier.c | 2 +
> tools/testing/selftests/bpf/progs/bpf_misc.h | 1 +
> .../selftests/bpf/progs/bpf_tracing_net.h | 1 +
> .../selftests/bpf/progs/test_tunnel_kern.c | 138 ++++++++-------
> .../bpf/progs/verifier_bitfield_write.c | 100 +++++++++++
> tools/testing/selftests/bpf/test_loader.c | 7 +
> tools/testing/selftests/bpf/test_tunnel.sh | 92 ----------
> 13 files changed, 522 insertions(+), 155 deletions(-)
I really think this should go via bpf-next tree.
The bpf changes are much bigger than ipsec.
Hi Andrii,
On Fri, Dec 01, 2023 at 03:52:25PM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 1, 2023 at 12:24 PM Daniel Xu <[email protected]> wrote:
> >
> > Add some tests that exercise BPF_CORE_WRITE_BITFIELD() macro. Since some
> > non-trivial bit fiddling is going on, make sure various edge cases (such
> > as adjacent bitfields and bitfields at the edge of structs) are
> > exercised.
> >
> > Signed-off-by: Daniel Xu <[email protected]>
> > ---
> > .../selftests/bpf/prog_tests/verifier.c | 2 +
> > .../bpf/progs/verifier_bitfield_write.c | 100 ++++++++++++++++++
> > 2 files changed, 102 insertions(+)
> > create mode 100644 tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
> >
>
> LGTM, but I'm not sure why we need all those __failure_unpriv, see
> below. Regardless:
>
> Acked-by: Andrii Nakryiko <[email protected]>
>
> > diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
> > index 5cfa7a6316b6..67b4948865a3 100644
> > --- a/tools/testing/selftests/bpf/prog_tests/verifier.c
> > +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
> > @@ -6,6 +6,7 @@
> > #include "verifier_and.skel.h"
> > #include "verifier_array_access.skel.h"
> > #include "verifier_basic_stack.skel.h"
> > +#include "verifier_bitfield_write.skel.h"
> > #include "verifier_bounds.skel.h"
> > #include "verifier_bounds_deduction.skel.h"
> > #include "verifier_bounds_deduction_non_const.skel.h"
> > @@ -115,6 +116,7 @@ static void run_tests_aux(const char *skel_name,
> >
> > void test_verifier_and(void) { RUN(verifier_and); }
> > void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); }
> > +void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); }
> > void test_verifier_bounds(void) { RUN(verifier_bounds); }
> > void test_verifier_bounds_deduction(void) { RUN(verifier_bounds_deduction); }
> > void test_verifier_bounds_deduction_non_const(void) { RUN(verifier_bounds_deduction_non_const); }
> > diff --git a/tools/testing/selftests/bpf/progs/verifier_bitfield_write.c b/tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
> > new file mode 100644
> > index 000000000000..8fe355a19ba6
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/verifier_bitfield_write.c
> > @@ -0,0 +1,100 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +#include <linux/bpf.h>
> > +#include <stdint.h>
> > +
> > +#include <bpf/bpf_helpers.h>
> > +#include <bpf/bpf_core_read.h>
> > +
> > +#include "bpf_misc.h"
> > +
> > +struct core_reloc_bitfields {
> > + /* unsigned bitfields */
> > + uint8_t ub1: 1;
> > + uint8_t ub2: 2;
> > + uint32_t ub7: 7;
> > + /* signed bitfields */
> > + int8_t sb4: 4;
> > + int32_t sb20: 20;
> > + /* non-bitfields */
> > + uint32_t u32;
> > + int32_t s32;
> > +} __attribute__((preserve_access_index));
> > +
> > +SEC("tc")
> > +__description("single CO-RE bitfield roundtrip")
> > +__btf_path("btf__core_reloc_bitfields.bpf.o")
> > +__success __failure_unpriv
>
> do we want __failure_unpriv at all? Is this failure related to
> *bitfield* logic at all?
Oh, I pre-emptively added it. From the docs, I thought __failure_unpriv
meant "don't try to load this as an unprivileged used cuz it'll fail".
And since I used the tc hook, I figured it'd fail.
Removing the annotation doesn't seem to do anything bad so I'll drop it
for v4.
[...]
Thanks,
Daniel
On Fri, Dec 01, 2023 at 03:49:30PM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 1, 2023 at 12:24 PM Daniel Xu <[email protected]> wrote:
> >
> > === Motivation ===
> >
> > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > writing wrapper to make the verifier happy.
> >
> > Two alternatives to this approach are:
> >
> > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > CO-RE on specific structs.
> > 2. Use broader byte-sized writes to write to bitfields.
> >
> > (1) is a bit hard to use. It requires specific and not-very-obvious
> > annotations to bpftool generated vmlinux.h. It's also not generally
> > available in released LLVM versions yet.
> >
> > (2) makes the code quite hard to read and write. And especially if
> > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > have an inverse helper for writing.
> >
> > === Implementation details ===
> >
> > Since the logic is a bit non-obvious, I thought it would be helpful
> > to explain exactly what's going on.
> >
> > To start, it helps to explain what LSHIFT_U64 (lshift) and RSHIFT_U64
> > (rshift) are designed to mean. Consider the core of the
> > BPF_CORE_READ_BITFIELD() algorithm:
> >
> > val <<= __CORE_RELO(s, field, LSHIFT_U64);
> > val = val >> __CORE_RELO(s, field, RSHIFT_U64);
>
> nit: indentation is off?
Oops, it's because I only deleted the SIGNED check. Will fix.
>
> >
> > Basically what happens is we lshift to clear the non-relevant (blank)
> > higher order bits. Then we rshift to bring the relevant bits (bitfield)
> > down to LSB position (while also clearing blank lower order bits). To
> > illustrate:
> >
> > Start: ........XXX......
> > Lshift: XXX......00000000
> > Rshift: 00000000000000XXX
> >
> > where `.` means blank bit, `0` means 0 bit, and `X` means bitfield bit.
> >
> > After the two operations, the bitfield is ready to be interpreted as a
> > regular integer.
> >
> > Next, we want to build an alternative (but more helpful) mental model
> > of lshift and rshift. That is, to consider:
> >
> > * rshift as the total number of blank bits in the u64
> > * lshift as the number of blank bits left of the bitfield in the u64
> >
> > Take a moment to consider why that is true by consulting the above
> > diagram.
> >
> > With this insight, we can now define the following relationship:
> >
> > bitfield
> > _
> > | |
> > 0.....00XXX0...00
> > | | | |
> > |______| | |
> > lshift | |
> > |____|
> > (rshift - lshift)
> >
> > That is, we know the number of higher order blank bits is just lshift.
> > And the number of lower order blank bits is (rshift - lshift).
> >
>
> Nice diagrams and description, thanks!
Thanks!
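
To go with the diagram, here is a worked read-side trace (relocation
values picked purely for illustration: a 3-bit field at bit offset 3 in
a u8, so in the u64 view lshift = 58 and rshift = 61):

    unsigned long long val = 0x28;  /* 0b00101000: bits [5:3] hold 0b101 */

    val <<= 58;                     /* lshift: field moved to bits [63:61] */
    val >>= 61;                     /* rshift: val == 0b101 == 5 */

rshift (61) is the total number of blank bits and lshift (58) is the
number of blank bits left of the field, matching the mental model above.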
>
> > Finally, we can examine the core of the write side algorithm:
> >
> > mask = (~0ULL << rshift) >> lshift; // 1
> > nval = new_val; // 2
> > nval = (nval << rpad) & mask; // 3
> > val = (val & ~mask) | nval; // 4
> >
> > (1): Compute a mask where the set bits are the bitfield bits. The first
> > left shift zeros out exactly the number of blank bits, leaving a
> > bitfield sized set of 1s. The subsequent right shift inserts the
> > correct amount of higher order blank bits.
> > (2): Place the new value into a word sized container, nval.
> > (3): Place nval at the correct bit position and mask out blank bits.
> > (4): Mix the bitfield in with original surrounding blank bits.
> >
> > [0]: https://reviews.llvm.org/D133361
> > Co-authored-by: Eduard Zingerman <[email protected]>
> > Signed-off-by: Eduard Zingerman <[email protected]>
> > Co-authored-by: Jonathan Lemon <[email protected]>
> > Signed-off-by: Jonathan Lemon <[email protected]>
> > Signed-off-by: Daniel Xu <[email protected]>
> > ---
> > tools/lib/bpf/bpf_core_read.h | 34 ++++++++++++++++++++++++++++++++++
> > 1 file changed, 34 insertions(+)
> >
> > diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
> > index 1ac57bb7ac55..a7ffb80e3539 100644
> > --- a/tools/lib/bpf/bpf_core_read.h
> > +++ b/tools/lib/bpf/bpf_core_read.h
> > @@ -111,6 +111,40 @@ enum bpf_enum_value_kind {
> > val; \
> > })
> >
> > +/*
> > + * Write to a bitfield, identified by s->field.
> > + * This is the inverse of BPF_CORE_READ_BITFIELD().
> > + */
> > +#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \
> > + void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \
> > + unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
> > + unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
> > + unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
> > + unsigned int rpad = rshift - lshift; \
> > + unsigned long long nval, mask, val; \
> > + \
> > + asm volatile("" : "+r"(p)); \
> > + \
> > + switch (byte_size) { \
> > + case 1: val = *(unsigned char *)p; break; \
> > + case 2: val = *(unsigned short *)p; break; \
> > + case 4: val = *(unsigned int *)p; break; \
> > + case 8: val = *(unsigned long long *)p; break; \
> > + } \
> > + \
> > + mask = (~0ULL << rshift) >> lshift; \
> > + nval = new_val; \
> > + nval = (nval << rpad) & mask; \
> > + val = (val & ~mask) | nval; \
>
> I'd simplify it to not need nval at all:
>
> val = (val & ~mask) | ((new_val << rpad) & mask);
>
> I actually find it easier to follow and to make sure we are not doing
> anything unexpected. In the first part, before the |, we take the old
> value and clear the bits we are about to set; in the second part, after
> the |, we take the bitfield value, shift it into position, and, just in
> case, mask it out if it's too big to fit. Combine, done.
>
> Other than that, it looks good.
I mostly left it there for the cast, since injecting the `unsigned long
long` cast made the line really long. How about this instead?
diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
index a7ffb80e3539..7325a12692a3 100644
--- a/tools/lib/bpf/bpf_core_read.h
+++ b/tools/lib/bpf/bpf_core_read.h
@@ -120,8 +120,8 @@ enum bpf_enum_value_kind {
unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
+ unsigned long long mask, val, nval = new_val; \
unsigned int rpad = rshift - lshift; \
- unsigned long long nval, mask, val; \
\
asm volatile("" : "+r"(p)); \
\
@@ -133,9 +133,7 @@ enum bpf_enum_value_kind {
} \
\
mask = (~0ULL << rshift) >> lshift; \
- nval = new_val; \
- nval = (nval << rpad) & mask; \
- val = (val & ~mask) | nval; \
+ val = (val & ~mask) | ((nval << rpad) & mask); \
\
switch (byte_size) { \
case 1: *(unsigned char *)p = val; break; \
Thanks,
Daniel
On Fri, Dec 01, 2023 at 04:10:18PM -0800, Alexei Starovoitov wrote:
> On Fri, Dec 1, 2023 at 12:23 PM Daniel Xu <[email protected]> wrote:
> >
> > This patchset adds two kfunc helpers, bpf_xdp_get_xfrm_state() and
> > bpf_xdp_xfrm_state_release() that wrap xfrm_state_lookup() and
> > xfrm_state_put(). The intent is to support software RSS (via XDP) for
> > the ongoing/upcoming ipsec pcpu work [0]. Recent experiments performed
> > on (hopefully) reproducible AWS testbeds indicate that single tunnel
> > pcpu ipsec can reach line rate on 100G ENA nics.
> >
> > Note this patchset only tests/shows generic xfrm_state access. The
> > "secret sauce" (if you can really even call it that) involves accessing
> > a soon-to-be-upstreamed pcpu_num field in xfrm_state. Early example is
> > available here [1].
> >
> > [0]: https://datatracker.ietf.org/doc/draft-ietf-ipsecme-multi-sa-performance/03/
> > [1]: https://github.com/danobi/xdp-tools/blob/e89a1c617aba3b50d990f779357d6ce2863ecb27/xdp-bench/xdp_redirect_cpumap.bpf.c#L385-L406
> >
> > Changes from v2:
> > * Fix/simplify BPF_CORE_WRITE_BITFIELD() algorithm
> > * Added verifier tests for bitfield writes
> > * Fix state leakage across test_tunnel subtests
> >
> > Changes from v1:
> > * Move xfrm tunnel tests to test_progs
> > * Fix writing to opts->error when opts is invalid
> > * Use __bpf_kfunc_start_defs()
> > * Remove unused vxlanhdr definition
> > * Add and use BPF_CORE_WRITE_BITFIELD() macro
> > * Make series bisect clean
> >
> > Changes from RFCv2:
> > * Rebased to ipsec-next
> > * Fix netns leak
> >
> > Changes from RFCv1:
> > * Add Antony's commit tags
> > * Add KF_ACQUIRE and KF_RELEASE semantics
> >
> > Daniel Xu (9):
> > bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc
> > bpf: xfrm: Add bpf_xdp_xfrm_state_release() kfunc
> > libbpf: Add BPF_CORE_WRITE_BITFIELD() macro
> > bpf: selftests: test_loader: Support __btf_path() annotation
> > libbpf: selftests: Add verifier tests for CO-RE bitfield writes
> > bpf: selftests: test_tunnel: Setup fresh topology for each subtest
> > bpf: selftests: test_tunnel: Use vmlinux.h declarations
> > bpf: selftests: Move xfrm tunnel test to test_progs
> > bpf: xfrm: Add selftest for bpf_xdp_get_xfrm_state()
> >
> > include/net/xfrm.h | 9 +
> > net/xfrm/Makefile | 1 +
> > net/xfrm/xfrm_policy.c | 2 +
> > net/xfrm/xfrm_state_bpf.c | 128 ++++++++++++++
> > tools/lib/bpf/bpf_core_read.h | 34 ++++
> > .../selftests/bpf/prog_tests/test_tunnel.c | 162 +++++++++++++++++-
> > .../selftests/bpf/prog_tests/verifier.c | 2 +
> > tools/testing/selftests/bpf/progs/bpf_misc.h | 1 +
> > .../selftests/bpf/progs/bpf_tracing_net.h | 1 +
> > .../selftests/bpf/progs/test_tunnel_kern.c | 138 ++++++++-------
> > .../bpf/progs/verifier_bitfield_write.c | 100 +++++++++++
> > tools/testing/selftests/bpf/test_loader.c | 7 +
> > tools/testing/selftests/bpf/test_tunnel.sh | 92 ----------
> > 13 files changed, 522 insertions(+), 155 deletions(-)
>
> I really think this should go via bpf-next tree.
> The bpf changes are much bigger than ipsec.
Ack. Ended up picking up a lot of stuff along the way.
On Fri, 2023-12-01 at 17:10 -0700, Daniel Xu wrote:
[...]
> > > +SEC("tc")
> > > +__description("single CO-RE bitfield roundtrip")
> > > +__btf_path("btf__core_reloc_bitfields.bpf.o")
> > > +__success __failure_unpriv
> >
> > do we want __failure_unpriv at all? Is this failure related to
> > *bitfield* logic at all?
>
> Oh, I pre-emptively added it. From the docs, I thought __failure_unpriv
> meant "don't try to load this as an unprivileged used cuz it'll fail".
> And since I used the tc hook, I figured it'd fail.
Actually it means:
"try to load as unprivileged user and expect failure,
report error on successful load".
In general, the meaning of "__xxx" and "__xxx_unpriv" annotations
is identical, except the first instructs to run the test in privileged
mode, while the second instructs to run the test in unprivileged mode:
- if only annotations w/o "*_unpriv" suffix are present the test would
be executed as privileged;
- if only annotations with "*_unpriv" suffix are present the test would
be executed as unprivileged;
- if both kinds of annotations are present the test would be executed
in both modes.
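For example, a minimal sketch of the three combinations (SEC("socket")
is chosen here because socket filters can be loaded unprivileged;
program bodies are stubs):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    #include "bpf_misc.h"

    SEC("socket")
    __description("priv mode only")
    __success __retval(0)
    int priv_only(void *ctx) { return 0; }

    SEC("socket")
    __description("unpriv mode only")
    __success_unpriv __retval_unpriv(0)
    int unpriv_only(void *ctx) { return 0; }

    SEC("socket")
    __description("both modes")
    __success __retval(0)
    __success_unpriv __retval_unpriv(0)
    int both_modes(void *ctx) { return 0; }

    char LICENSE[] SEC("license") = "GPL";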
[...]
On Fri, Dec 1, 2023 at 4:13 PM Daniel Xu <[email protected]> wrote:
>
> On Fri, Dec 01, 2023 at 03:49:30PM -0800, Andrii Nakryiko wrote:
> > On Fri, Dec 1, 2023 at 12:24 PM Daniel Xu <[email protected]> wrote:
> > >
> > > === Motivation ===
> > >
> > > Similar to reading from CO-RE bitfields, we need a CO-RE aware bitfield
> > > writing wrapper to make the verifier happy.
> > >
> > > Two alternatives to this approach are:
> > >
> > > 1. Use the upcoming `preserve_static_offset` [0] attribute to disable
> > > CO-RE on specific structs.
> > > 2. Use broader byte-sized writes to write to bitfields.
> > >
> > > (1) is a bit hard to use. It requires specific and not-very-obvious
> > > annotations to bpftool generated vmlinux.h. It's also not generally
> > > available in released LLVM versions yet.
> > >
> > > (2) makes the code quite hard to read and write. And especially if
> > > BPF_CORE_READ_BITFIELD() is already being used, it makes more sense to
> > > have an inverse helper for writing.
> > >
> > > === Implementation details ===
> > >
> > > Since the logic is a bit non-obvious, I thought it would be helpful
> > > to explain exactly what's going on.
> > >
> > > To start, it helps to explain what LSHIFT_U64 (lshift) and RSHIFT_U64
> > > (rshift) are designed to mean. Consider the core of the
> > > BPF_CORE_READ_BITFIELD() algorithm:
> > >
> > > val <<= __CORE_RELO(s, field, LSHIFT_U64);
> > > val = val >> __CORE_RELO(s, field, RSHIFT_U64);
> >
> > nit: indentation is off?
>
> Oops, it's because I only deleted the SIGNED check. Will fix.
> >
> > >
> > > Basically what happens is we lshift to clear the non-relevant (blank)
> > > higher order bits. Then we rshift to bring the relevant bits (bitfield)
> > > down to LSB position (while also clearing blank lower order bits). To
> > > illustrate:
> > >
> > > Start: ........XXX......
> > > Lshift: XXX......00000000
> > > Rshift: 00000000000000XXX
> > >
> > > where `.` means blank bit, `0` means 0 bit, and `X` means bitfield bit.
> > >
> > > After the two operations, the bitfield is ready to be interpreted as a
> > > regular integer.
> > >
> > > Next, we want to build an alternative (but more helpful) mental model
> > > of lshift and rshift. That is, to consider:
> > >
> > > * rshift as the total number of blank bits in the u64
> > > * lshift as the number of blank bits left of the bitfield in the u64
> > >
> > > Take a moment to consider why that is true by consulting the above
> > > diagram.
> > >
> > > With this insight, we can now define the following relationship:
> > >
> > > bitfield
> > > _
> > > | |
> > > 0.....00XXX0...00
> > > | | | |
> > > |______| | |
> > > lshift | |
> > > |____|
> > > (rshift - lshift)
> > >
> > > That is, we know the number of higher order blank bits is just lshift.
> > > And the number of lower order blank bits is (rshift - lshift).
> > >
> >
> > Nice diagrams and description, thanks!
>
> Thanks!
>
> >
> > > Finally, we can examine the core of the write side algorithm:
> > >
> > > mask = (~0ULL << rshift) >> lshift; // 1
> > > nval = new_val; // 2
> > > nval = (nval << rpad) & mask; // 3
> > > val = (val & ~mask) | nval; // 4
> > >
> > > (1): Compute a mask where the set bits are the bitfield bits. The first
> > > left shift zeros out exactly the number of blank bits, leaving a
> > > bitfield sized set of 1s. The subsequent right shift inserts the
> > > correct amount of higher order blank bits.
> > > (2): Place the new value into a word sized container, nval.
> > > (3): Place nval at the correct bit position and mask out blank bits.
> > > (4): Mix the bitfield in with original surrounding blank bits.
> > >
> > > [0]: https://reviews.llvm.org/D133361
> > > Co-authored-by: Eduard Zingerman <[email protected]>
> > > Signed-off-by: Eduard Zingerman <[email protected]>
> > > Co-authored-by: Jonathan Lemon <[email protected]>
> > > Signed-off-by: Jonathan Lemon <[email protected]>
> > > Signed-off-by: Daniel Xu <[email protected]>
> > > ---
> > > tools/lib/bpf/bpf_core_read.h | 34 ++++++++++++++++++++++++++++++++++
> > > 1 file changed, 34 insertions(+)
> > >
> > > diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
> > > index 1ac57bb7ac55..a7ffb80e3539 100644
> > > --- a/tools/lib/bpf/bpf_core_read.h
> > > +++ b/tools/lib/bpf/bpf_core_read.h
> > > @@ -111,6 +111,40 @@ enum bpf_enum_value_kind {
> > > val; \
> > > })
> > >
> > > +/*
> > > + * Write to a bitfield, identified by s->field.
> > > + * This is the inverse of BPF_CORE_READ_BITFIELD().
> > > + */
> > > +#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \
> > > + void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \
> > > + unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
> > > + unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
> > > + unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
> > > + unsigned int rpad = rshift - lshift; \
> > > + unsigned long long nval, mask, val; \
> > > + \
> > > + asm volatile("" : "+r"(p)); \
> > > + \
> > > + switch (byte_size) { \
> > > + case 1: val = *(unsigned char *)p; break; \
> > > + case 2: val = *(unsigned short *)p; break; \
> > > + case 4: val = *(unsigned int *)p; break; \
> > > + case 8: val = *(unsigned long long *)p; break; \
> > > + } \
> > > + \
> > > + mask = (~0ULL << rshift) >> lshift; \
> > > + nval = new_val; \
> > > + nval = (nval << rpad) & mask; \
> > > + val = (val & ~mask) | nval; \
> >
> > I'd simplify it to not need nval at all:
> >
> > val = (val & ~mask) | ((new_val << rpad) & mask);
> >
> > I actually find it easier to follow and to make sure we are not doing
> > anything unexpected. In the first part, before the |, we take the old
> > value and clear the bits we are about to set; in the second part, after
> > the |, we take the bitfield value, shift it into position, and, just in
> > case, mask it out if it's too big to fit. Combine, done.
> >
> > Other than that, it looks good.
>
> I mostly left it there for the cast, since injecting the `unsigned long
> long` cast made the line really long. How about this instead?
>
> diff --git a/tools/lib/bpf/bpf_core_read.h b/tools/lib/bpf/bpf_core_read.h
> index a7ffb80e3539..7325a12692a3 100644
> --- a/tools/lib/bpf/bpf_core_read.h
> +++ b/tools/lib/bpf/bpf_core_read.h
> @@ -120,8 +120,8 @@ enum bpf_enum_value_kind {
> unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \
> unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \
> unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \
> + unsigned long long mask, val, nval = new_val; \
> unsigned int rpad = rshift - lshift; \
> - unsigned long long nval, mask, val; \
> \
> asm volatile("" : "+r"(p)); \
> \
> @@ -133,9 +133,7 @@ enum bpf_enum_value_kind {
> } \
> \
> mask = (~0ULL << rshift) >> lshift; \
> - nval = new_val; \
> - nval = (nval << rpad) & mask; \
> - val = (val & ~mask) | nval; \
> + val = (val & ~mask) | ((nval << rpad) & mask); \
sgtm
> \
> switch (byte_size) { \
> case 1: *(unsigned char *)p = val; break; \
>
>
> Thanks,
> Daniel
On Fri, Dec 01, 2023 at 05:16:04PM -0700, Daniel Xu wrote:
> On Fri, Dec 01, 2023 at 04:10:18PM -0800, Alexei Starovoitov wrote:
> > On Fri, Dec 1, 2023 at 12:23 PM Daniel Xu <[email protected]> wrote:
> > >
> > > include/net/xfrm.h | 9 +
> > > net/xfrm/Makefile | 1 +
> > > net/xfrm/xfrm_policy.c | 2 +
> > > net/xfrm/xfrm_state_bpf.c | 128 ++++++++++++++
> > > tools/lib/bpf/bpf_core_read.h | 34 ++++
> > > .../selftests/bpf/prog_tests/test_tunnel.c | 162 +++++++++++++++++-
> > > .../selftests/bpf/prog_tests/verifier.c | 2 +
> > > tools/testing/selftests/bpf/progs/bpf_misc.h | 1 +
> > > .../selftests/bpf/progs/bpf_tracing_net.h | 1 +
> > > .../selftests/bpf/progs/test_tunnel_kern.c | 138 ++++++++-------
> > > .../bpf/progs/verifier_bitfield_write.c | 100 +++++++++++
> > > tools/testing/selftests/bpf/test_loader.c | 7 +
> > > tools/testing/selftests/bpf/test_tunnel.sh | 92 ----------
> > > 13 files changed, 522 insertions(+), 155 deletions(-)
> >
> > I really think this should go via bpf-next tree.
> > The bpf changes are much bigger than ipsec.
>
> Ack. Ended up picking up a lot of stuff along the way.
I'm fine with merging this via the bpf-next tree.
Please consider merging the bpf helper functions
into one file. We already have xfrm_interface_bpf.c
and now you introduce xfrm_state_bpf.c.
Try to merge this into a single xfrm_bpf.c file.