LinuxLists.cc - [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2

2021-03-04 10:38:04

Subject: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH

From: Xuesen Huang <[email protected]>

bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
encapsulation. But that is not appropriate when pushing Ethernet header.

Add an option to further specify encap L2 type and set the inner_protocol
as ETH_P_TEB.

Update test_tc_tunnel to verify adding vxlan encapsulation works with
this flag.

Suggested-by: Willem de Bruijn <[email protected]>
Signed-off-by: Xuesen Huang <[email protected]>
Signed-off-by: Zhiyong Cheng <[email protected]>
Signed-off-by: Li Wang <[email protected]>
---
include/uapi/linux/bpf.h | 5 +
net/core/filter.c | 11 ++-
tools/include/uapi/linux/bpf.h | 5 +
tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 107 ++++++++++++++++++---
tools/testing/selftests/bpf/test_tc_tunnel.sh | 15 ++-
5 files changed, 124 insertions(+), 19 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77d7c1b..d791596 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1751,6 +1751,10 @@ struct bpf_stack_build_id {
* Use with ENCAP_L3/L4 flags to further specify the tunnel
* type; *len* is the length of the inner MAC header.
*
+ * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ * L2 type as Ethernet.
+ *
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
@@ -4088,6 +4092,7 @@ enum {
BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3),
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
};

enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index 255aeee..8d1fb61 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3412,6 +3412,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
BPF_F_ADJ_ROOM_ENCAP_L2( \
BPF_ADJ_ROOM_ENCAP_L2_MASK))

@@ -3448,6 +3449,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
return -EINVAL;

+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+ inner_mac_len < ETH_HLEN)
+ return -EINVAL;
+
if (skb->encapsulation)
return -EALREADY;

@@ -3466,7 +3471,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
skb->inner_mac_header = inner_net - inner_mac_len;
skb->inner_network_header = inner_net;
skb->inner_transport_header = inner_trans;
- skb_set_inner_protocol(skb, skb->protocol);
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+ else
+ skb_set_inner_protocol(skb, skb->protocol);

skb->encapsulation = 1;
skb_set_network_header(skb, mac_len);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 77d7c1b..d791596 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1751,6 +1751,10 @@ struct bpf_stack_build_id {
* Use with ENCAP_L3/L4 flags to further specify the tunnel
* type; *len* is the length of the inner MAC header.
*
+ * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ * L2 type as Ethernet.
+ *
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be
@@ -4088,6 +4092,7 @@ enum {
BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3),
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
};

enum {
diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
index 37bce7a..6e144db 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
@@ -20,6 +20,14 @@
#include <bpf/bpf_endian.h>
#include <bpf/bpf_helpers.h>

+#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
+
+#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
+
+#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
+
+#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
+
static const int cfg_port = 8000;

static const int cfg_udp_src = 20000;
@@ -27,11 +35,24 @@
#define UDP_PORT 5555
#define MPLS_OVER_UDP_PORT 6635
#define ETH_OVER_UDP_PORT 7777
+#define VXLAN_UDP_PORT 8472
+
+#define EXTPROTO_VXLAN 0x1
+
+#define VXLAN_N_VID (1u << 24)
+#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8)
+#define VXLAN_FLAGS 0x8
+#define VXLAN_VNI 1

/* MPLS label 1000 with S bit (last label) set and ttl of 255. */
static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
MPLS_LS_S_MASK | 0xff);

+struct vxlanhdr {
+ __be32 vx_flags;
+ __be32 vx_vni;
+} __attribute__((packed));
+
struct gre_hdr {
__be16 flags;
__be16 protocol;
@@ -45,13 +66,13 @@ struct gre_hdr {
struct v4hdr {
struct iphdr ip;
union l4hdr l4hdr;
- __u8 pad[16]; /* enough space for L2 header */
+ __u8 pad[24]; /* space for L2 header / vxlan header ... */
} __attribute__((packed));

struct v6hdr {
struct ipv6hdr ip;
union l4hdr l4hdr;
- __u8 pad[16]; /* enough space for L2 header */
+ __u8 pad[24]; /* space for L2 header / vxlan header ... */
} __attribute__((packed));

static __always_inline void set_ipv4_csum(struct iphdr *iph)
@@ -69,14 +90,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph)
iph->check = ~((csum & 0xffff) + (csum >> 16));
}

-static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
- __u16 l2_proto)
+static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
+ __u16 l2_proto, __u16 ext_proto)
{
__u16 udp_dst = UDP_PORT;
struct iphdr iph_inner;
struct v4hdr h_outer;
struct tcphdr tcph;
int olen, l2_len;
+ __u8 *l2_hdr = NULL;
int tcp_off;
__u64 flags;

@@ -141,7 +163,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
break;
case ETH_P_TEB:
l2_len = ETH_HLEN;
- udp_dst = ETH_OVER_UDP_PORT;
+ if (ext_proto & EXTPROTO_VXLAN) {
+ udp_dst = VXLAN_UDP_PORT;
+ l2_len += sizeof(struct vxlanhdr);
+ } else
+ udp_dst = ETH_OVER_UDP_PORT;
break;
}
flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
@@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
}

/* add L2 encap (if specified) */
+ l2_hdr = (__u8 *)&h_outer + olen;
switch (l2_proto) {
case ETH_P_MPLS_UC:
- *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+ *(__u32 *)l2_hdr = mpls_label;
break;
case ETH_P_TEB:
- if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
- ETH_HLEN))
+ flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
+
+ if (ext_proto & EXTPROTO_VXLAN) {
+ struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
+
+ vxlan_hdr->vx_flags = VXLAN_FLAGS;
+ vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
+
+ l2_hdr += sizeof(struct vxlanhdr);
+ }
+
+ if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
return TC_ACT_SHOT;
+
break;
}
olen += l2_len;
@@ -214,14 +252,15 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
return TC_ACT_OK;
}

-static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
- __u16 l2_proto)
+static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
+ __u16 l2_proto, __u16 ext_proto)
{
__u16 udp_dst = UDP_PORT;
struct ipv6hdr iph_inner;
struct v6hdr h_outer;
struct tcphdr tcph;
int olen, l2_len;
+ __u8 *l2_hdr = NULL;
__u16 tot_len;
__u64 flags;

@@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
break;
case ETH_P_TEB:
l2_len = ETH_HLEN;
- udp_dst = ETH_OVER_UDP_PORT;
+ if (ext_proto & EXTPROTO_VXLAN) {
+ udp_dst = VXLAN_UDP_PORT;
+ l2_len += sizeof(struct vxlanhdr);
+ } else
+ udp_dst = ETH_OVER_UDP_PORT;
break;
}
flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
@@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
- sizeof(h_outer.l4hdr.udp);
+ sizeof(h_outer.l4hdr.udp) + l2_len;
h_outer.l4hdr.udp.check = 0;
h_outer.l4hdr.udp.len = bpf_htons(tot_len);
break;
@@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
}

/* add L2 encap (if specified) */
+ l2_hdr = (__u8 *)&h_outer + olen;
switch (l2_proto) {
case ETH_P_MPLS_UC:
- *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+ *(__u32 *)l2_hdr = mpls_label;
break;
case ETH_P_TEB:
- if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
- ETH_HLEN))
+ flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
+
+ if (ext_proto & EXTPROTO_VXLAN) {
+ struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
+
+ vxlan_hdr->vx_flags = VXLAN_FLAGS;
+ vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
+
+ l2_hdr += sizeof(struct vxlanhdr);
+ }
+
+ if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
return TC_ACT_SHOT;
break;
}
@@ -372,6 +426,16 @@ int __encap_udp_eth(struct __sk_buff *skb)
return TC_ACT_OK;
}

+SEC("encap_vxlan_eth")
+int __encap_vxlan_eth(struct __sk_buff *skb)
+{
+ if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
+ return encap_ipv4_with_ext_proto(skb, IPPROTO_UDP,
+ ETH_P_TEB, EXTPROTO_VXLAN);
+ else
+ return TC_ACT_OK;
+}
+
SEC("encap_sit_none")
int __encap_sit_none(struct __sk_buff *skb)
{
@@ -444,6 +508,16 @@ int __encap_ip6udp_eth(struct __sk_buff *skb)
return TC_ACT_OK;
}

+SEC("encap_ip6vxlan_eth")
+int __encap_ip6vxlan_eth(struct __sk_buff *skb)
+{
+ if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
+ return encap_ipv6_with_ext_proto(skb, IPPROTO_UDP,
+ ETH_P_TEB, EXTPROTO_VXLAN);
+ else
+ return TC_ACT_OK;
+}
+
static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
{
char buf[sizeof(struct v6hdr)];
@@ -479,6 +553,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
case ETH_OVER_UDP_PORT:
olen += ETH_HLEN;
break;
+ case VXLAN_UDP_PORT:
+ olen += ETH_HLEN + sizeof(struct vxlanhdr);
+ break;
}
break;
default:
diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
index 7c76b84..c9dde9b 100755
--- a/tools/testing/selftests/bpf/test_tc_tunnel.sh
+++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh
@@ -44,8 +44,8 @@ setup() {
# clamp route to reserve room for tunnel headers
ip -netns "${ns1}" -4 route flush table main
ip -netns "${ns1}" -6 route flush table main
- ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1
- ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1
+ ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1
+ ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1

sleep 1

@@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then
echo "sit"
$0 ipv6 sit none 100

+ echo "ip4 vxlan"
+ $0 ipv4 vxlan eth 2000
+
+ echo "ip6 vxlan"
+ $0 ipv6 ip6vxlan eth 2000
+
for mac in none mpls eth ; do
echo "ip gre $mac"
$0 ipv4 gre $mac 100
@@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then
targs="encap fou encap-sport auto encap-dport $dport"
elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
ttype=$gretaptype
+elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then
+ ttype="vxlan"
+ targs="id 1 dstport 8472 udp6zerocsumrx"
else
ttype=$tuntype
targs=""
@@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then
elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then
# No support for TEB fou tunnel; expect failure.
expect_tun_fail=1
-elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
+elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then
# Share ethernet address between tunnel/veth2 so L2 decap works.
ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \
awk '/ether/ { print $2 }')
--
1.8.3.1

2021-03-04 12:37:32

by Willem de Bruijn

[permalink] [raw]

Subject: Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH

On Wed, Mar 3, 2021 at 7:33 AM Xuesen Huang <[email protected]> wrote:
>
> From: Xuesen Huang <[email protected]>
>
> bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
> encapsulation. But that is not appropriate when pushing Ethernet header.
>
> Add an option to further specify encap L2 type and set the inner_protocol
> as ETH_P_TEB.
>
> Update test_tc_tunnel to verify adding vxlan encapsulation works with
> this flag.
>
> Suggested-by: Willem de Bruijn <[email protected]>
> Signed-off-by: Xuesen Huang <[email protected]>
> Signed-off-by: Zhiyong Cheng <[email protected]>
> Signed-off-by: Li Wang <[email protected]>

Thanks for adding the test. Perhaps that is better in a separate patch?

Overall looks great to me.

The patch has not (yet?) arrived on patchwork.

> enum {
> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> index 37bce7a..6e144db 100644
> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> @@ -20,6 +20,14 @@
> #include <bpf/bpf_endian.h>
> #include <bpf/bpf_helpers.h>
>
> +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
> +
> +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
> +
> +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
> +
> +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
> +

Instead of untyped macros, I'd define encap_ipv4 as a function that
calls __encap_ipv4.

And no need for encap_ipv4_with_ext_proto equivalent to __encap_ipv4.

> static const int cfg_port = 8000;
>
> static const int cfg_udp_src = 20000;
> @@ -27,11 +35,24 @@
> #define UDP_PORT 5555
> #define MPLS_OVER_UDP_PORT 6635
> #define ETH_OVER_UDP_PORT 7777
> +#define VXLAN_UDP_PORT 8472
> +
> +#define EXTPROTO_VXLAN 0x1
> +
> +#define VXLAN_N_VID (1u << 24)
> +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8)
> +#define VXLAN_FLAGS 0x8
> +#define VXLAN_VNI 1
>
> /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
> static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
> MPLS_LS_S_MASK | 0xff);
>
> +struct vxlanhdr {
> + __be32 vx_flags;
> + __be32 vx_vni;
> +} __attribute__((packed));
> +
> struct gre_hdr {
> __be16 flags;
> __be16 protocol;
> @@ -45,13 +66,13 @@ struct gre_hdr {
> struct v4hdr {
> struct iphdr ip;
> union l4hdr l4hdr;
> - __u8 pad[16]; /* enough space for L2 header */
> + __u8 pad[24]; /* space for L2 header / vxlan header ... */

could we use something like sizeof(..) instead of a constant?

> @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> }
>
> /* add L2 encap (if specified) */
> + l2_hdr = (__u8 *)&h_outer + olen;
> switch (l2_proto) {
> case ETH_P_MPLS_UC:
> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> + *(__u32 *)l2_hdr = mpls_label;
> break;
> case ETH_P_TEB:
> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> - ETH_HLEN))

This is non-standard indentation? Here and elsewhere.

> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> break;
> case ETH_P_TEB:
> l2_len = ETH_HLEN;
> - udp_dst = ETH_OVER_UDP_PORT;
> + if (ext_proto & EXTPROTO_VXLAN) {
> + udp_dst = VXLAN_UDP_PORT;
> + l2_len += sizeof(struct vxlanhdr);
> + } else
> + udp_dst = ETH_OVER_UDP_PORT;
> break;
> }
> flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
> @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
> h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
> tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
> - sizeof(h_outer.l4hdr.udp);
> + sizeof(h_outer.l4hdr.udp) + l2_len;

Was this a bug previously?

> h_outer.l4hdr.udp.check = 0;
> h_outer.l4hdr.udp.len = bpf_htons(tot_len);
> break;
> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> }
>
> /* add L2 encap (if specified) */
> + l2_hdr = (__u8 *)&h_outer + olen;
> switch (l2_proto) {
> case ETH_P_MPLS_UC:
> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> + *(__u32 *)l2_hdr = mpls_label;
> break;
> case ETH_P_TEB:
> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> - ETH_HLEN))
> + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;

This is a change also for the existing case. Correctly so, I imagine.
But the test used to pass with the wrong protocol?

2021-03-04 14:39:45

by huangxuesen

[permalink] [raw]

Subject: Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH

> 2021年3月4日上午2:53，Willem de Bruijn <[email protected]> 写道：
>
> On Wed, Mar 3, 2021 at 7:33 AM Xuesen Huang <[email protected]> wrote:
>>
>> From: Xuesen Huang <[email protected]>
>>
>> bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
>> encapsulation. But that is not appropriate when pushing Ethernet header.
>>
>> Add an option to further specify encap L2 type and set the inner_protocol
>> as ETH_P_TEB.
>>
>> Update test_tc_tunnel to verify adding vxlan encapsulation works with
>> this flag.
>>
>> Suggested-by: Willem de Bruijn <[email protected]>
>> Signed-off-by: Xuesen Huang <[email protected]>
>> Signed-off-by: Zhiyong Cheng <[email protected]>
>> Signed-off-by: Li Wang <[email protected]>
>
> Thanks for adding the test. Perhaps that is better in a separate patch?
>
> Overall looks great to me.
>
> The patch has not (yet?) arrived on patchwork.
>
Thanks Willem, I will separate it into two patch.

I will send patch/v5 with only that new flag addition, lol.

>> enum {
>> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
>> index 37bce7a..6e144db 100644
>> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
>> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
>> @@ -20,6 +20,14 @@
>> #include <bpf/bpf_endian.h>
>> #include <bpf/bpf_helpers.h>
>>
>> +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
>> +
>> +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
>> +
>> +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
>> +
>> +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
>> +
>
> Instead of untyped macros, I'd define encap_ipv4 as a function that
> calls __encap_ipv4.
>
> And no need for encap_ipv4_with_ext_proto equivalent to __encap_ipv4.
>
I defined these macros to try to keep the existing invocation for encap_ipv4/6
as the same, if we define this as a function all invocation should be modified?

>> static const int cfg_port = 8000;
>>
>> static const int cfg_udp_src = 20000;
>> @@ -27,11 +35,24 @@
>> #define UDP_PORT 5555
>> #define MPLS_OVER_UDP_PORT 6635
>> #define ETH_OVER_UDP_PORT 7777
>> +#define VXLAN_UDP_PORT 8472
>> +
>> +#define EXTPROTO_VXLAN 0x1
>> +
>> +#define VXLAN_N_VID (1u << 24)
>> +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8)
>> +#define VXLAN_FLAGS 0x8
>> +#define VXLAN_VNI 1
>>
>> /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
>> static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
>> MPLS_LS_S_MASK | 0xff);
>>
>> +struct vxlanhdr {
>> + __be32 vx_flags;
>> + __be32 vx_vni;
>> +} __attribute__((packed));
>> +
>> struct gre_hdr {
>> __be16 flags;
>> __be16 protocol;
>> @@ -45,13 +66,13 @@ struct gre_hdr {
>> struct v4hdr {
>> struct iphdr ip;
>> union l4hdr l4hdr;
>> - __u8 pad[16]; /* enough space for L2 header */
>> + __u8 pad[24]; /* space for L2 header / vxlan header ... */
>
> could we use something like sizeof(..) instead of a constant?
>
Thanks, I will try to fix this.

>> @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
>> }
>>
>> /* add L2 encap (if specified) */
>> + l2_hdr = (__u8 *)&h_outer + olen;
>> switch (l2_proto) {
>> case ETH_P_MPLS_UC:
>> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
>> + *(__u32 *)l2_hdr = mpls_label;
>> break;
>> case ETH_P_TEB:
>> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
>> - ETH_HLEN))
>
> This is non-standard indentation? Here and elsewhere.
I thinks it’s a previous issue.

>
>> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>> break;
>> case ETH_P_TEB:
>> l2_len = ETH_HLEN;
>> - udp_dst = ETH_OVER_UDP_PORT;
>> + if (ext_proto & EXTPROTO_VXLAN) {
>> + udp_dst = VXLAN_UDP_PORT;
>> + l2_len += sizeof(struct vxlanhdr);
>> + } else
>> + udp_dst = ETH_OVER_UDP_PORT;
>> break;
>> }
>> flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
>> @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>> h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
>> h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
>> tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
>> - sizeof(h_outer.l4hdr.udp);
>> + sizeof(h_outer.l4hdr.udp) + l2_len;
>
> Was this a bug previously?
>
Yes, a tiny bug.

>> h_outer.l4hdr.udp.check = 0;
>> h_outer.l4hdr.udp.len = bpf_htons(tot_len);
>> break;
>> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
>> }
>>
>> /* add L2 encap (if specified) */
>> + l2_hdr = (__u8 *)&h_outer + olen;
>> switch (l2_proto) {
>> case ETH_P_MPLS_UC:
>> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
>> + *(__u32 *)l2_hdr = mpls_label;
>> break;
>> case ETH_P_TEB:
>> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
>> - ETH_HLEN))
>> + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
>
> This is a change also for the existing case. Correctly so, I imagine.
> But the test used to pass with the wrong protocol?
Yes all tests pass. I’m not sure should we add this flag for the existing tests
which encap eth as the l2 header or only for the Vxlan test?

Waiting for your suggestion.
Thanks.

2021-03-04 14:41:37

by Willem de Bruijn

[permalink] [raw]

Subject: Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH

> > Instead of untyped macros, I'd define encap_ipv4 as a function that
> > calls __encap_ipv4.
> >
> > And no need for encap_ipv4_with_ext_proto equivalent to __encap_ipv4.
> >
> I defined these macros to try to keep the existing invocation for encap_ipv4/6
> as the same, if we define this as a function all invocation should be modified?

You can leave the existing invocations the same and make the new
callers caller __encap_ipv4 directly, which takes one extra argument?
Adding a __ prefixed variant with extra args is a common pattern.

> >> /* add L2 encap (if specified) */
> >> + l2_hdr = (__u8 *)&h_outer + olen;
> >> switch (l2_proto) {
> >> case ETH_P_MPLS_UC:
> >> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> >> + *(__u32 *)l2_hdr = mpls_label;
> >> break;
> >> case ETH_P_TEB:
> >> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> >> - ETH_HLEN))
> >
> > This is non-standard indentation? Here and elsewhere.
> I thinks it’s a previous issue.

Ah right. Bad example. How about in __encap_vxlan_eth

+ return encap_ipv4_with_ext_proto(skb, IPPROTO_UDP,
+ ETH_P_TEB, EXTPROTO_VXLAN);

> >> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> >> }
> >>
> >> /* add L2 encap (if specified) */
> >> + l2_hdr = (__u8 *)&h_outer + olen;
> >> switch (l2_proto) {
> >> case ETH_P_MPLS_UC:
> >> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> >> + *(__u32 *)l2_hdr = mpls_label;
> >> break;
> >> case ETH_P_TEB:
> >> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> >> - ETH_HLEN))
> >> + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
> >
> > This is a change also for the existing case. Correctly so, I imagine.
> > But the test used to pass with the wrong protocol?
> Yes all tests pass. I’m not sure should we add this flag for the existing tests
> which encap eth as the l2 header or only for the Vxlan test?

It is correct in both cases. If it does not break anything, I would do both.

Thanks,

Willem