This series attempts to reduce parsing overhead of IPv6 extension headers
in GRO and GSO, by removing extension header specific code and enabling
the frag0 fast path.
The following changes were made:
- Specific unnecessary HBH conditionals were removed by adding HBH offload
to inet6_offloads
- Added a utility function to support frag0 fast path in ipv6_gro_receive
- Added self-test for IPv6 packets with extension headers in GRO
Richard
Richard Gobert (3):
net: gso: add HBH extension header offload support
net: gro: parse ipv6 ext headers without frag0 invalidation
selftests/net: fix GRO coalesce test and add ext header coalesce test
net/ipv6/exthdrs_offload.c | 11 +++++
net/ipv6/ip6_offload.c | 76 ++++++++++++++++++++----------
tools/testing/selftests/net/gro.c | 78 ++++++++++++++++++++++++++++---
3 files changed, 134 insertions(+), 31 deletions(-)
--
2.36.1
This commit utilizes a new helper function, ipv6_gro_pull_exthdrs, which
is used in ipv6_gro_receive to pull ipv6 ext headers instead of
ipv6_gso_pull_exthdrs. To use ipv6_gso_pull_exthdrs, pskb_pull and
__skb_push must be used, and frag0 must be invalidated. This commit
removes unnecessary code around the call to ipv6_gso_pull_exthdrs and
enables the frag0 fast path in IPv6 packets with ext headers.
Signed-off-by: Richard Gobert <[email protected]>
---
net/ipv6/ip6_offload.c | 51 +++++++++++++++++++++++++++++++++---------
1 file changed, 41 insertions(+), 10 deletions(-)
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 0e0b5fed0995..a3b8d9127dbb 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -37,6 +37,40 @@
INDIRECT_CALL_L4(cb, f2, f1, head, skb); \
})
+static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
+{
+ const struct net_offload *ops = NULL;
+ struct ipv6_opt_hdr *opth;
+
+ for (;;) {
+ int len;
+
+ ops = rcu_dereference(inet6_offloads[proto]);
+
+ if (unlikely(!ops))
+ break;
+
+ if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
+ break;
+
+ opth = skb_gro_header(skb, off + 8, off);
+ if (unlikely(!opth))
+ break;
+
+ len = ipv6_optlen(opth);
+
+ opth = skb_gro_header(skb, off + len, off);
+ if (unlikely(!opth))
+ break;
+ proto = opth->nexthdr;
+
+ off += len;
+ }
+
+ skb_gro_pull(skb, off - skb_network_offset(skb));
+ return proto;
+}
+
static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
{
const struct net_offload *ops = NULL;
@@ -203,28 +237,25 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
goto out;
skb_set_network_header(skb, off);
- skb_gro_pull(skb, sizeof(*iph));
- skb_set_transport_header(skb, skb_gro_offset(skb));
- flush += ntohs(iph->payload_len) != skb_gro_len(skb);
+ flush += ntohs(iph->payload_len) != skb->len - hlen;
proto = iph->nexthdr;
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive) {
- pskb_pull(skb, skb_gro_offset(skb));
- skb_gro_frag0_invalidate(skb);
- proto = ipv6_gso_pull_exthdrs(skb, proto);
- skb_gro_pull(skb, -skb_transport_offset(skb));
- skb_reset_transport_header(skb);
- __skb_push(skb, skb_gro_offset(skb));
+ proto = ipv6_gro_pull_exthdrs(skb, hlen, proto);
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive)
goto out;
- iph = ipv6_hdr(skb);
+ iph = skb_gro_network_header(skb);
+ } else {
+ skb_gro_pull(skb, sizeof(*iph));
}
+ skb_set_transport_header(skb, skb_gro_offset(skb));
+
NAPI_GRO_CB(skb)->proto = proto;
flush--;
--
2.36.1
Currently there is no test which checks that IPv6 extension header packets
successfully coalesce. This commit adds a test, which verifies two IPv6
packets with HBH extension headers do coalesce.
I changed the receive socket filter to accept a packet with one extension
header. This change exposed a bug in the fragment test -- the old BPF did
not accept the fragment packet. I updated correct_num_packets in the
fragment test accordingly.
Signed-off-by: Richard Gobert <[email protected]>
---
tools/testing/selftests/net/gro.c | 78 ++++++++++++++++++++++++++++---
1 file changed, 71 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
index 30024d0ed373..4ee34dca8e5f 100644
--- a/tools/testing/selftests/net/gro.c
+++ b/tools/testing/selftests/net/gro.c
@@ -71,6 +71,10 @@
#define MAX_PAYLOAD (IP_MAXPACKET - sizeof(struct tcphdr) - sizeof(struct ipv6hdr))
#define NUM_LARGE_PKT (MAX_PAYLOAD / MSS)
#define MAX_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr))
+#define MIN_EXTHDR_SIZE 8 /* minimum size of IPv6 extension header */
+
+#define ipv6_optlen(p) (((p)->hdrlen+1) << 3) /* calculate IPv6 extension header len */
+
static const char *addr6_src = "fdaa::2";
static const char *addr6_dst = "fdaa::1";
@@ -104,7 +108,7 @@ static void setup_sock_filter(int fd)
const int dport_off = tcp_offset + offsetof(struct tcphdr, dest);
const int ethproto_off = offsetof(struct ethhdr, h_proto);
int optlen = 0;
- int ipproto_off;
+ int ipproto_off, opt_ipproto_off;
int next_off;
if (proto == PF_INET)
@@ -116,14 +120,27 @@ static void setup_sock_filter(int fd)
if (strcmp(testname, "ip") == 0) {
if (proto == PF_INET)
optlen = sizeof(struct ip_timestamp);
- else
- optlen = sizeof(struct ip6_frag);
+ else {
+ // same size for HBH and Fragment extension header types
+ optlen = MIN_EXTHDR_SIZE;
+ opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) +
+ offsetof(struct ip6_ext, ip6e_nxt);
+ }
}
+ /*
+ * this filter validates the following:
+ * - packet is IPv4/IPv6 according to the running test.
+ * - packet is TCP. Also handles the case of one extension header and then TCP.
+ * - checks the packet tcp dport equals to DPORT.
+ * (also handles the case of one extension header and then TCP.)
+ */
struct sock_filter filter[] = {
BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ethproto_off),
- BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(ethhdr_proto), 0, 7),
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(ethhdr_proto), 0, 9),
BPF_STMT(BPF_LD + BPF_B + BPF_ABS, ipproto_off),
+ BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 2, 0),
+ BPF_STMT(BPF_LD + BPF_B + BPF_ABS, opt_ipproto_off),
BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 0, 5),
BPF_STMT(BPF_LD + BPF_H + BPF_ABS, dport_off),
BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DPORT, 2, 0),
@@ -576,6 +593,39 @@ static void add_ipv4_ts_option(void *buf, void *optpkt)
iph->check = checksum_fold(iph, sizeof(struct iphdr) + optlen, 0);
}
+static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type)
+{
+ struct ipv6_opt_hdr *hbh_hdr = (struct ipv6_opt_hdr *)(optpkt + tcp_offset);
+ struct ipv6hdr *iph = (struct ipv6hdr *)(optpkt + ETH_HLEN);
+ int opt_len;
+
+ hbh_hdr->hdrlen = 0;
+ hbh_hdr->nexthdr = IPPROTO_TCP;
+ opt_len = ipv6_optlen(hbh_hdr);
+
+ memcpy(optpkt, buf, tcp_offset);
+ memcpy(optpkt + tcp_offset + MIN_EXTHDR_SIZE, buf + tcp_offset,
+ sizeof(struct tcphdr) + PAYLOAD_LEN);
+
+ iph->nexthdr = exthdr_type;
+ iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE);
+}
+
+/* Send IPv6 packet with extension header */
+static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr)
+{
+ static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
+ static char exthdr_pck[sizeof(buf) + MIN_EXTHDR_SIZE];
+
+ create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
+ add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS);
+ write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
+
+ create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0);
+ add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS);
+ write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
+}
+
/* IPv4 options shouldn't coalesce */
static void send_ip_options(int fd, struct sockaddr_ll *daddr)
{
@@ -697,7 +747,7 @@ static void send_fragment6(int fd, struct sockaddr_ll *daddr)
create_packet(buf, PAYLOAD_LEN * i, 0, PAYLOAD_LEN, 0);
write_packet(fd, buf, bufpkt_len, daddr);
}
-
+ sleep(1);
create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
memset(extpkt, 0, extpkt_len);
@@ -760,6 +810,7 @@ static void check_recv_pkts(int fd, int *correct_payload,
vlog("}, Total %d packets\nReceived {", correct_num_pkts);
while (1) {
+ ip_ext_len = 0;
pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0);
if (pkt_size < 0)
error(1, errno, "could not receive");
@@ -767,7 +818,7 @@ static void check_recv_pkts(int fd, int *correct_payload,
if (iph->version == 4)
ip_ext_len = (iph->ihl - 5) * 4;
else if (ip6h->version == 6 && ip6h->nexthdr != IPPROTO_TCP)
- ip_ext_len = sizeof(struct ip6_frag);
+ ip_ext_len = MIN_EXTHDR_SIZE;
tcph = (struct tcphdr *)(buffer + tcp_offset + ip_ext_len);
@@ -880,7 +931,14 @@ static void gro_sender(void)
sleep(1);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
} else if (proto == PF_INET6) {
+ sleep(1);
send_fragment6(txfd, &daddr);
+ sleep(1);
+ write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+ sleep(1);
+ send_ipv6_exthdr(txfd, &daddr);
+ sleep(1);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
}
} else if (strcmp(testname, "large") == 0) {
@@ -997,7 +1055,13 @@ static void gro_receiver(void)
*/
printf("fragmented ip6 doesn't coalesce: ");
correct_payload[0] = PAYLOAD_LEN * 2;
- check_recv_pkts(rxfd, correct_payload, 2);
+ correct_payload[1] = PAYLOAD_LEN;
+ correct_payload[2] = PAYLOAD_LEN;
+ check_recv_pkts(rxfd, correct_payload, 3);
+
+ correct_payload[0] = PAYLOAD_LEN * 2;
+ printf("ipv6 with extension header DOES coalesce: ");
+ check_recv_pkts(rxfd, correct_payload, 1);
}
} else if (strcmp(testname, "large") == 0) {
int offset = proto == PF_INET ? 20 : 0;
--
2.36.1
Hi Richard,
kernel test robot noticed the following build warnings:
[auto build test WARNING on net-next/main]
url: https://github.com/intel-lab-lkp/linux/commits/Richard-Gobert/net-gso-add-HBH-extension-header-offload-support/20231222-172059
base: net-next/main
patch link: https://lore.kernel.org/r/641157c0-f224-4910-874d-7906a48d914b%40gmail.com
patch subject: [PATCH net-next 3/3] selftests/net: fix GRO coalesce test and add ext
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231223/[email protected]/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
All warnings (new ones prefixed by >>):
gro.c: In function 'add_ipv6_exthdr':
>> gro.c:600:13: warning: variable 'opt_len' set but not used [-Wunused-but-set-variable]
600 | int opt_len;
| ^~~~~~~
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Richard Gobert wrote:
> Currently there is no test which checks that IPv6 extension header packets
> successfully coalesce. This commit adds a test, which verifies two IPv6
> packets with HBH extension headers do coalesce.
>
> I changed the receive socket filter to accept a packet with one extension
> header. This change exposed a bug in the fragment test -- the old BPF did
> not accept the fragment packet. I updated correct_num_packets in the
> fragment test accordingly.
>
> Signed-off-by: Richard Gobert <[email protected]>
Thanks for adding test coverage along with the feature, and as part
of the existing gro test too.
> ---
> tools/testing/selftests/net/gro.c | 78 ++++++++++++++++++++++++++++---
> 1 file changed, 71 insertions(+), 7 deletions(-)
>
> diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
> index 30024d0ed373..4ee34dca8e5f 100644
> --- a/tools/testing/selftests/net/gro.c
> +++ b/tools/testing/selftests/net/gro.c
> @@ -71,6 +71,10 @@
> #define MAX_PAYLOAD (IP_MAXPACKET - sizeof(struct tcphdr) - sizeof(struct ipv6hdr))
> #define NUM_LARGE_PKT (MAX_PAYLOAD / MSS)
> #define MAX_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr))
> +#define MIN_EXTHDR_SIZE 8 /* minimum size of IPv6 extension header */
> +
> +#define ipv6_optlen(p) (((p)->hdrlen+1) << 3) /* calculate IPv6 extension header len */
> +
>
> static const char *addr6_src = "fdaa::2";
> static const char *addr6_dst = "fdaa::1";
> @@ -104,7 +108,7 @@ static void setup_sock_filter(int fd)
> const int dport_off = tcp_offset + offsetof(struct tcphdr, dest);
> const int ethproto_off = offsetof(struct ethhdr, h_proto);
> int optlen = 0;
> - int ipproto_off;
> + int ipproto_off, opt_ipproto_off;
> int next_off;
>
> if (proto == PF_INET)
> @@ -116,14 +120,27 @@ static void setup_sock_filter(int fd)
> if (strcmp(testname, "ip") == 0) {
> if (proto == PF_INET)
> optlen = sizeof(struct ip_timestamp);
> - else
> - optlen = sizeof(struct ip6_frag);
> + else {
> + // same size for HBH and Fragment extension header types
no C++ style comments
Also, instead of comments here and at the MIN_EXTHDR_SIZE definition,
perhaps cleaner as self documenting code:
BUILD_BUG_ON(MIN_EXTHDR_SIZE != sizeof(struct ip6_hbh));
BUILD_BUG_ON(MIN_EXTHDR_SIZE != sizeof(struct ip6_dest));
BUILD_BUG_ON(MIN_EXTHDR_SIZE < sizeof(struct ip6_frag));
> + optlen = MIN_EXTHDR_SIZE;
> + opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) +
> + offsetof(struct ip6_ext, ip6e_nxt);
> + }
> }
>
> + /*
> + * this filter validates the following:
> + * - packet is IPv4/IPv6 according to the running test.
> + * - packet is TCP. Also handles the case of one extension header and then TCP.
> + * - checks the packet tcp dport equals to DPORT.
> + * (also handles the case of one extension header and then TCP.)
> + */
nit: for filewide consistency: netdev style:
/* start comment
* more comment
*/
> struct sock_filter filter[] = {
> BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ethproto_off),
> - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(ethhdr_proto), 0, 7),
> + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(ethhdr_proto), 0, 9),
> BPF_STMT(BPF_LD + BPF_B + BPF_ABS, ipproto_off),
> + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 2, 0),
> + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, opt_ipproto_off),
> BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 0, 5),
> BPF_STMT(BPF_LD + BPF_H + BPF_ABS, dport_off),
> BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DPORT, 2, 0),
> @@ -576,6 +593,39 @@ static void add_ipv4_ts_option(void *buf, void *optpkt)
> iph->check = checksum_fold(iph, sizeof(struct iphdr) + optlen, 0);
> }
>
> +static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type)
> +{
> + struct ipv6_opt_hdr *hbh_hdr = (struct ipv6_opt_hdr *)(optpkt + tcp_offset);
> + struct ipv6hdr *iph = (struct ipv6hdr *)(optpkt + ETH_HLEN);
> + int opt_len;
> +
> + hbh_hdr->hdrlen = 0;
> + hbh_hdr->nexthdr = IPPROTO_TCP;
> + opt_len = ipv6_optlen(hbh_hdr);
> +
> + memcpy(optpkt, buf, tcp_offset);
> + memcpy(optpkt + tcp_offset + MIN_EXTHDR_SIZE, buf + tcp_offset,
> + sizeof(struct tcphdr) + PAYLOAD_LEN);
> +
> + iph->nexthdr = exthdr_type;
> + iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE);
> +}
> +
> +/* Send IPv6 packet with extension header */
> +static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr)
> +{
> + static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
> + static char exthdr_pck[sizeof(buf) + MIN_EXTHDR_SIZE];
> +
> + create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
> + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS);
> + write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
> +
> + create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0);
> + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS);
> + write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
> +}
> +
> /* IPv4 options shouldn't coalesce */
> static void send_ip_options(int fd, struct sockaddr_ll *daddr)
> {
> @@ -697,7 +747,7 @@ static void send_fragment6(int fd, struct sockaddr_ll *daddr)
> create_packet(buf, PAYLOAD_LEN * i, 0, PAYLOAD_LEN, 0);
> write_packet(fd, buf, bufpkt_len, daddr);
> }
> -
> + sleep(1);
Leftover debug, or necessary fix to existing test?
> create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
> memset(extpkt, 0, extpkt_len);
>
> @@ -760,6 +810,7 @@ static void check_recv_pkts(int fd, int *correct_payload,
> vlog("}, Total %d packets\nReceived {", correct_num_pkts);
>
> while (1) {
> + ip_ext_len = 0;
> pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0);
> if (pkt_size < 0)
> error(1, errno, "could not receive");
> @@ -767,7 +818,7 @@ static void check_recv_pkts(int fd, int *correct_payload,
> if (iph->version == 4)
> ip_ext_len = (iph->ihl - 5) * 4;
> else if (ip6h->version == 6 && ip6h->nexthdr != IPPROTO_TCP)
> - ip_ext_len = sizeof(struct ip6_frag);
> + ip_ext_len = MIN_EXTHDR_SIZE;
struct ip6_frag is > MIN_EXTHDR_SIZE. Need both cases?
else if (ip6h->version == 6 && ip6h->nexthdr == NEXTHDR_FRAGMENT)
ip_ext_len = sizeof(struct ip6_frag);
else if (ip6h->version == 6 && ip6h->nexthdr != IPPROTO_TCP)
ip_ext_len = MIN_EXTHDR_SIZE;
>
> tcph = (struct tcphdr *)(buffer + tcp_offset + ip_ext_len);
>
> @@ -880,7 +931,14 @@ static void gro_sender(void)
> sleep(1);
> write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
> } else if (proto == PF_INET6) {
> + sleep(1);
> send_fragment6(txfd, &daddr);
> + sleep(1);
> + write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
> +
> + sleep(1);
> + send_ipv6_exthdr(txfd, &daddr);
> + sleep(1);
For the casual reader: these sleeps are not leftover debug statements
unfortunately. The same exists in the PF_INET branch:
/* Modified packets may be received out of order.
* Sleep function added to enforce test boundaries
* so that fin pkts are not received prior to other pkts.
*/
> write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
> }
> } else if (strcmp(testname, "large") == 0) {
> @@ -997,7 +1055,13 @@ static void gro_receiver(void)
> */
> printf("fragmented ip6 doesn't coalesce: ");
> correct_payload[0] = PAYLOAD_LEN * 2;
> - check_recv_pkts(rxfd, correct_payload, 2);
> + correct_payload[1] = PAYLOAD_LEN;
> + correct_payload[2] = PAYLOAD_LEN;
> + check_recv_pkts(rxfd, correct_payload, 3);
> +
> + correct_payload[0] = PAYLOAD_LEN * 2;
> + printf("ipv6 with extension header DOES coalesce: ");
> + check_recv_pkts(rxfd, correct_payload, 1);
It might be worth adding a test with two different extension headers.
To verify that these should not coalesce.
Either different ipprotos, like IPPROTO_DSTOPTS vs IPPROTO_HOPOPTS.
Perhaps more interesting are two of the same ipproto but with different
content.
> }
> } else if (strcmp(testname, "large") == 0) {
> int offset = proto == PF_INET ? 20 : 0;
> --
> 2.36.1
>
Richard Gobert wrote:
> This commit utilizes a new helper function, ipv6_gro_pull_exthdrs, which
> is used in ipv6_gro_receive to pull ipv6 ext headers instead of
> ipv6_gso_pull_exthdrs. To use ipv6_gso_pull_exthdr, pskb_pull and
> __skb_push must be used, and frag0 must be invalidated. This commit
> removes unnecessary code around the call to ipv6_gso_pull_exthdrs and
> enables the frag0 fast path in IPv6 packets with ext headers.
>
> Signed-off-by: Richard Gobert <[email protected]>
> ---
> net/ipv6/ip6_offload.c | 51 +++++++++++++++++++++++++++++++++---------
> 1 file changed, 41 insertions(+), 10 deletions(-)
>
> diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
> index 0e0b5fed0995..a3b8d9127dbb 100644
> --- a/net/ipv6/ip6_offload.c
> +++ b/net/ipv6/ip6_offload.c
> @@ -37,6 +37,40 @@
> INDIRECT_CALL_L4(cb, f2, f1, head, skb); \
> })
>
> +static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
> +{
> + const struct net_offload *ops = NULL;
> + struct ipv6_opt_hdr *opth;
> +
> + for (;;) {
> + int len;
> +
> + ops = rcu_dereference(inet6_offloads[proto]);
> +
> + if (unlikely(!ops))
> + break;
> +
> + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
> + break;
> +
> + opth = skb_gro_header(skb, off + 8, off);
When changing this code, it would be great to make it more self
documenting. It's not entirely clear what that 8 is based on.
sizeof(*opth) is only 2. Probably an optimization to handle the most
common extension headers in a single pskb_may_pull? If so, this new
code does not have that concern, so can just use sizeof(*opth). Or
else add a const int likely_max_opt_hdr_len = 8 or so.
> + if (unlikely(!opth))
> + break;
> +
> + len = ipv6_optlen(opth);
> +
> + opth = skb_gro_header(skb, off + len, off);
> + if (unlikely(!opth))
> + break;
> + proto = opth->nexthdr;
> +
> + off += len;
> + }
> +
> + skb_gro_pull(skb, off - skb_network_offset(skb));
> + return proto;
> +}
> +
> static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
> {
> const struct net_offload *ops = NULL;
> @@ -203,28 +237,25 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
> goto out;
>
> skb_set_network_header(skb, off);
> - skb_gro_pull(skb, sizeof(*iph));
> - skb_set_transport_header(skb, skb_gro_offset(skb));
>
> - flush += ntohs(iph->payload_len) != skb_gro_len(skb);
> + flush += ntohs(iph->payload_len) != skb->len - hlen;
>
> proto = iph->nexthdr;
> ops = rcu_dereference(inet6_offloads[proto]);
> if (!ops || !ops->callbacks.gro_receive) {
> - pskb_pull(skb, skb_gro_offset(skb));
> - skb_gro_frag0_invalidate(skb);
> - proto = ipv6_gso_pull_exthdrs(skb, proto);
> - skb_gro_pull(skb, -skb_transport_offset(skb));
> - skb_reset_transport_header(skb);
> - __skb_push(skb, skb_gro_offset(skb));
> + proto = ipv6_gro_pull_exthdrs(skb, hlen, proto);
>
> ops = rcu_dereference(inet6_offloads[proto]);
> if (!ops || !ops->callbacks.gro_receive)
> goto out;
>
> - iph = ipv6_hdr(skb);
> + iph = skb_gro_network_header(skb);
> + } else {
> + skb_gro_pull(skb, sizeof(*iph));
> }
This code is non-obvious and has proven fragile (57ea52a8651). Changes
are best as simple as they can be, with ample documentation. My
attempt, as arrived at during review:
The existing code always pulls the IPv6 header and sets the transport
offset initially. Then optionally again pulls any extension headers
in ipv6_gso_pull_exthdrs and sets the transport offset again on
return from that call.
The new code adds a small optimization to only pull and set transport
offset once.
The existing code needs to set skb->data at the start of the first
extension header before calling ipv6_gso_pull_exthdrs, and must
disable the frag0 optimization because that function uses
pskb_may_pull/pskb_pull instead of skb_gro_ helpers. It sets the
GRO offset to the inner TCP header with skb_gro_pull and sets the
transport header. Then returns skb->data to its position before
this block.
The new code is much simpler: it does not have to modify skb->data,
as all operations are with skb_gro_ helpers.
Aside from the small comment above, and suggestion to include
something like this summary in the code and/or avoid the extra
optimization,
Reviewed-by: Willem de Bruijn <[email protected]>
>
> + skb_set_transport_header(skb, skb_gro_offset(skb));
> +
> NAPI_GRO_CB(skb)->proto = proto;
>
> flush--;
> --
> 2.36.1
>
Willem de Bruijn wrote:
> Richard Gobert wrote:
>> This commit utilizes a new helper function, ipv6_gro_pull_exthdrs, which
>> is used in ipv6_gro_receive to pull ipv6 ext headers instead of
>> ipv6_gso_pull_exthdrs. To use ipv6_gso_pull_exthdr, pskb_pull and
>> __skb_push must be used, and frag0 must be invalidated. This commit
>> removes unnecessary code around the call to ipv6_gso_pull_exthdrs and
>> enables the frag0 fast path in IPv6 packets with ext headers.
>>
>> Signed-off-by: Richard Gobert <[email protected]>
>> ---
>> net/ipv6/ip6_offload.c | 51 +++++++++++++++++++++++++++++++++---------
>> 1 file changed, 41 insertions(+), 10 deletions(-)
>>
>> diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
>> index 0e0b5fed0995..a3b8d9127dbb 100644
>> --- a/net/ipv6/ip6_offload.c
>> +++ b/net/ipv6/ip6_offload.c
>> @@ -37,6 +37,40 @@
>> INDIRECT_CALL_L4(cb, f2, f1, head, skb); \
>> })
>>
>> +static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
>> +{
>> + const struct net_offload *ops = NULL;
>> + struct ipv6_opt_hdr *opth;
>> +
>> + for (;;) {
>> + int len;
>> +
>> + ops = rcu_dereference(inet6_offloads[proto]);
>> +
>> + if (unlikely(!ops))
>> + break;
>> +
>> + if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
>> + break;
>> +
>> + opth = skb_gro_header(skb, off + 8, off);
>
> When changing this code, it would be great to make it more self
> documenting. It's not entirely clear what that 8 is based on.
> sizeof(*opth) is only 2. Probably an optimization to handle the most
> common extension headers in a single pskb_may_pull? If so, this new
> code does not have that concern, so can just use sizeof(*opth). Or
> else add a const int likely_max_opt_hdr_len = 8 or so.
>
>
>> + if (unlikely(!opth))
>> + break;
>> +
>> + len = ipv6_optlen(opth);
>> +
>> + opth = skb_gro_header(skb, off + len, off);
>> + if (unlikely(!opth))
>> + break;
>> + proto = opth->nexthdr;
>> +
>> + off += len;
>> + }
>> +
>> + skb_gro_pull(skb, off - skb_network_offset(skb));
>> + return proto;
>> +}
>> +
>> static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
>> {
>> const struct net_offload *ops = NULL;
>> @@ -203,28 +237,25 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
>> goto out;
>>
>> skb_set_network_header(skb, off);
>> - skb_gro_pull(skb, sizeof(*iph));
>> - skb_set_transport_header(skb, skb_gro_offset(skb));
>>
>> - flush += ntohs(iph->payload_len) != skb_gro_len(skb);
>> + flush += ntohs(iph->payload_len) != skb->len - hlen;
>>
>> proto = iph->nexthdr;
>> ops = rcu_dereference(inet6_offloads[proto]);
>> if (!ops || !ops->callbacks.gro_receive) {
>> - pskb_pull(skb, skb_gro_offset(skb));
>> - skb_gro_frag0_invalidate(skb);
>> - proto = ipv6_gso_pull_exthdrs(skb, proto);
>> - skb_gro_pull(skb, -skb_transport_offset(skb));
>> - skb_reset_transport_header(skb);
>> - __skb_push(skb, skb_gro_offset(skb));
>> + proto = ipv6_gro_pull_exthdrs(skb, hlen, proto);
>>
>> ops = rcu_dereference(inet6_offloads[proto]);
>> if (!ops || !ops->callbacks.gro_receive)
>> goto out;
>>
>> - iph = ipv6_hdr(skb);
>> + iph = skb_gro_network_header(skb);
>> + } else {
>> + skb_gro_pull(skb, sizeof(*iph));
>> }
>
> This code is non-obvious and has proven fragile (57ea52a8651). Changes
> are best as simple as they can be, with ample documentation. My
> attempt, as arrived at during review:
>
> The existing always pulls the IPv6 header and sets the transport
> offset initially. Then optionally again pulls any extension headers
> in ipv6_gso_pull_exthdrs and sets the transport offset again on
> return from that call.
>
> The new code adds a small optimization to only pull and set transport
> offset once.
>
> The existing code needs to set skb->data at the start of the first
> extension header before calling ipv6_gso_pull_exthdrs, and must
> disable the frag0 optimization because that function uses
> pskb_may_pull/pskb_pull instead of skb_gro_ helpers. It sets the
> GRO offset to the inner TCP header with skb_gro_pull and sets the
> transport header. Then returns skb->data to its position before
> this block.
>
> The new code is much simpler: it does not have to modify skb->data,
> as all operations are with skb_gro_ helpers.
>
> Aside from the small comment above, and suggestion to include
> something like this summary in the code and/or avoid the extra
> optimization,
>
> Reviewed-by: Willem de Bruijn <[email protected]>
>
Thanks for the review, I'll submit v2 with an extended commit message
summary as suggested.
>>
>> + skb_set_transport_header(skb, skb_gro_offset(skb));
>> +
>> NAPI_GRO_CB(skb)->proto = proto;
>>
>> flush--;
>> --
>> 2.36.1
>>
>
>
Willem de Bruijn wrote:
> Richard Gobert wrote:
>> Currently there is no test which checks that IPv6 extension header packets
>> successfully coalesce. This commit adds a test, which verifies two IPv6
>> packets with HBH extension headers do coalesce.
>>
>> I changed the receive socket filter to accept a packet with one extension
>> header. This change exposed a bug in the fragment test -- the old BPF did
>> not accept the fragment packet. I updated correct_num_packets in the
>> fragment test accordingly.
>>
>> Signed-off-by: Richard Gobert <[email protected]>
>
> Thanks for adding test coverage along with the feature, and as part
> of the existing gro test too.
>
>> ---
>> tools/testing/selftests/net/gro.c | 78 ++++++++++++++++++++++++++++---
>> 1 file changed, 71 insertions(+), 7 deletions(-)
>>
>> diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
>> index 30024d0ed373..4ee34dca8e5f 100644
>> --- a/tools/testing/selftests/net/gro.c
>> +++ b/tools/testing/selftests/net/gro.c
>> @@ -71,6 +71,10 @@
>> #define MAX_PAYLOAD (IP_MAXPACKET - sizeof(struct tcphdr) - sizeof(struct ipv6hdr))
>> #define NUM_LARGE_PKT (MAX_PAYLOAD / MSS)
>> #define MAX_HDR_LEN (ETH_HLEN + sizeof(struct ipv6hdr) + sizeof(struct tcphdr))
>> +#define MIN_EXTHDR_SIZE 8 /* minimum size of IPv6 extension header */
>> +
>> +#define ipv6_optlen(p) (((p)->hdrlen+1) << 3) /* calculate IPv6 extension header len */
>> +
>>
>> static const char *addr6_src = "fdaa::2";
>> static const char *addr6_dst = "fdaa::1";
>> @@ -104,7 +108,7 @@ static void setup_sock_filter(int fd)
>> const int dport_off = tcp_offset + offsetof(struct tcphdr, dest);
>> const int ethproto_off = offsetof(struct ethhdr, h_proto);
>> int optlen = 0;
>> - int ipproto_off;
>> + int ipproto_off, opt_ipproto_off;
>> int next_off;
>>
>> if (proto == PF_INET)
>> @@ -116,14 +120,27 @@ static void setup_sock_filter(int fd)
>> if (strcmp(testname, "ip") == 0) {
>> if (proto == PF_INET)
>> optlen = sizeof(struct ip_timestamp);
>> - else
>> - optlen = sizeof(struct ip6_frag);
>> + else {
>> + // same size for HBH and Fragment extension header types
>
> no C++ style comments
>
> Also, instead of comments here and at the MIN_EXTHDR_SIZE definition,
> perhaps cleaner as self documenting code:
>
> BUILD_BUG_ON(MIN_EXTHDR_SIZE != sizeof(struct ip6_hbh));
> BUILD_BUG_ON(MIN_EXTHDR_SIZE != sizeof(struct ip6_dest));
> BUILD_BUG_ON(MIN_EXTHDR_SIZE < sizeof(struct ip6_frag));
>
Thanks for the suggestion, I will add a check that the exthdr structs
are no larger than MIN_EXTHDR_SIZE bytes:
BUILD_BUG_ON(MIN_EXTHDR_SIZE < sizeof(struct ip6_hbh));
BUILD_BUG_ON(MIN_EXTHDR_SIZE < sizeof(struct ip6_dest));
BUILD_BUG_ON(MIN_EXTHDR_SIZE < sizeof(struct ip6_frag));
(The ip6_hbh and ip6_dest structs are 2 bytes long, and the ip6_frag
struct is 8 bytes long)
>> + optlen = MIN_EXTHDR_SIZE;
>> + opt_ipproto_off = ETH_HLEN + sizeof(struct ipv6hdr) +
>> + offsetof(struct ip6_ext, ip6e_nxt);
>> + }
>> }
>>
>> + /*
>> + * this filter validates the following:
>> + * - packet is IPv4/IPv6 according to the running test.
>> + * - packet is TCP. Also handles the case of one extension header and then TCP.
>> + * - checks the packet tcp dport equals to DPORT.
>> + * (also handles the case of one extension header and then TCP.)
>> + */
>
> nit: for filewide consistency: netdev style:
>
> /* start comment
> * more comment
> */
>
>> struct sock_filter filter[] = {
>> BPF_STMT(BPF_LD + BPF_H + BPF_ABS, ethproto_off),
>> - BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(ethhdr_proto), 0, 7),
>> + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(ethhdr_proto), 0, 9),
>> BPF_STMT(BPF_LD + BPF_B + BPF_ABS, ipproto_off),
>> + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 2, 0),
>> + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, opt_ipproto_off),
>> BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_TCP, 0, 5),
>> BPF_STMT(BPF_LD + BPF_H + BPF_ABS, dport_off),
>> BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, DPORT, 2, 0),
>> @@ -576,6 +593,39 @@ static void add_ipv4_ts_option(void *buf, void *optpkt)
>> iph->check = checksum_fold(iph, sizeof(struct iphdr) + optlen, 0);
>> }
>>
>> +static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type)
>> +{
>> + struct ipv6_opt_hdr *hbh_hdr = (struct ipv6_opt_hdr *)(optpkt + tcp_offset);
>> + struct ipv6hdr *iph = (struct ipv6hdr *)(optpkt + ETH_HLEN);
>> + int opt_len;
>> +
>> + hbh_hdr->hdrlen = 0;
>> + hbh_hdr->nexthdr = IPPROTO_TCP;
>> + opt_len = ipv6_optlen(hbh_hdr);
>> +
>> + memcpy(optpkt, buf, tcp_offset);
>> + memcpy(optpkt + tcp_offset + MIN_EXTHDR_SIZE, buf + tcp_offset,
>> + sizeof(struct tcphdr) + PAYLOAD_LEN);
>> +
>> + iph->nexthdr = exthdr_type;
>> + iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE);
>> +}
>> +
>> +/* Send IPv6 packet with extension header */
>> +static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr)
>> +{
>> + static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
>> + static char exthdr_pck[sizeof(buf) + MIN_EXTHDR_SIZE];
>> +
>> + create_packet(buf, 0, 0, PAYLOAD_LEN, 0);
>> + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS);
>> + write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
>> +
>> + create_packet(buf, PAYLOAD_LEN * 1, 0, PAYLOAD_LEN, 0);
>> + add_ipv6_exthdr(buf, exthdr_pck, IPPROTO_HOPOPTS);
>> + write_packet(fd, exthdr_pck, total_hdr_len + PAYLOAD_LEN + MIN_EXTHDR_SIZE, daddr);
>> +}
>> +
>> /* IPv4 options shouldn't coalesce */
>> static void send_ip_options(int fd, struct sockaddr_ll *daddr)
>> {
>> @@ -697,7 +747,7 @@ static void send_fragment6(int fd, struct sockaddr_ll *daddr)
>> create_packet(buf, PAYLOAD_LEN * i, 0, PAYLOAD_LEN, 0);
>> write_packet(fd, buf, bufpkt_len, daddr);
>> }
>> -
>> + sleep(1);
>
> Leftover debug, or necessary fix to existing test?
This sleep was necessary for the fragment test to consistently pass
as packets may arrive out-of-order on the receiving side.
>> create_packet(buf, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
>> memset(extpkt, 0, extpkt_len);
>>
>> @@ -760,6 +810,7 @@ static void check_recv_pkts(int fd, int *correct_payload,
>> vlog("}, Total %d packets\nReceived {", correct_num_pkts);
>>
>> while (1) {
>> + ip_ext_len = 0;
>> pkt_size = recv(fd, buffer, IP_MAXPACKET + ETH_HLEN + 1, 0);
>> if (pkt_size < 0)
>> error(1, errno, "could not receive");
>> @@ -767,7 +818,7 @@ static void check_recv_pkts(int fd, int *correct_payload,
>> if (iph->version == 4)
>> ip_ext_len = (iph->ihl - 5) * 4;
>> else if (ip6h->version == 6 && ip6h->nexthdr != IPPROTO_TCP)
>> - ip_ext_len = sizeof(struct ip6_frag);
>> + ip_ext_len = MIN_EXTHDR_SIZE;
>
> struct ip6_frag is > MIN_EXTHDR_SIZE. Need both cases?
>
> else if (ip6h->version == 6 && ip6h->nexthdr == NEXTHDR_FRAGMENT)
> ip_ext_len = sizeof(struct ip6_frag);
> else if (ip6h->version == 6 && ip6h->nexthdr != IPPROTO_TCP)
> ip_ext_len = MIN_EXTHDR_SIZE;
>
sizeof(struct ip6_frag) == MIN_EXTHDR_SIZE, and every other IPv6 exthdr
test uses an 8-byte (= MIN_EXTHDR_SIZE) exthdr, since this is the minimum
size of a single IPv6 exthdr.
We can also see it by looking at the 'ipv6_optlen' macro copied from the
IPv6 exthdr parsing in GRO:
#define ipv6_optlen(p) (((p)->hdrlen+1) << 3)
When hdrlen == 0, the exthdr len will be (0 + 1) << 3 = 8.
>>
>> tcph = (struct tcphdr *)(buffer + tcp_offset + ip_ext_len);
>>
>> @@ -880,7 +931,14 @@ static void gro_sender(void)
>> sleep(1);
>> write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
>> } else if (proto == PF_INET6) {
>> + sleep(1);
>> send_fragment6(txfd, &daddr);
>> + sleep(1);
>> + write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
>> +
>> + sleep(1);
>> + send_ipv6_exthdr(txfd, &daddr);
>> + sleep(1);
>
> For the casual reader: these sleeps are not leftover debug statements
> unfortunately. The same exists in the PF_INET branch:
>
> /* Modified packets may be received out of order.
> * Sleep function added to enforce test boundaries
> * so that fin pkts are not received prior to other pkts.
> */
>
>> write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
>> }
>> } else if (strcmp(testname, "large") == 0) {
>> @@ -997,7 +1055,13 @@ static void gro_receiver(void)
>> */
>> printf("fragmented ip6 doesn't coalesce: ");
>> correct_payload[0] = PAYLOAD_LEN * 2;
>> - check_recv_pkts(rxfd, correct_payload, 2);
>> + correct_payload[1] = PAYLOAD_LEN;
>> + correct_payload[2] = PAYLOAD_LEN;
>> + check_recv_pkts(rxfd, correct_payload, 3);
>> +
>> + correct_payload[0] = PAYLOAD_LEN * 2;
>> + printf("ipv6 with extension header DOES coalesce: ");
>> + check_recv_pkts(rxfd, correct_payload, 1);
>
> It might be worth adding a test with two different extension headers.
> To verify that these should not coalesce.
>
> Either different ipprotos, like IPPROTO_DSTOPTS vs IPPROTO_HOPOPTS.
> Perhaps more interesting are two of the same ipproto but with different
> content.
>
Good idea, I will add this test in v2.
>> }
>> } else if (strcmp(testname, "large") == 0) {
>> int offset = proto == PF_INET ? 20 : 0;
>> --
>> 2.36.1
>>
>
>