2024-04-12 15:56:27

by Richard Gobert

[permalink] [raw]
Subject: [PATCH net-next v7 0/3] net: gro: move p->{flush/flush_id} calculations to L4

This patch series depends on commits in the series submitted to net.
(https://lore.kernel.org/netdev/[email protected]/)

The fields network_offset and inner_network_offset are added to
napi_gro_cb, and are both set during the receive phase of GRO. This is then
leveraged in the next commit to remove flush_id state from napi_gro_cb, and
stateful code in {ipv6,inet}_gro_receive which may be unnecessarily
complicated due to encapsulation support in GRO.

3rd patch adds tests for different flush_id flows in GRO.

v6 -> v7:
- Moved bug fixes to a separate submission in net
- Added UDP fwd benchmark
- v6:
https://lore.kernel.org/all/[email protected]/

v5 -> v6:
- Write inner_network_offset in vxlan and geneve
- Ignore is_atomic when DF=0
- v5:
https://lore.kernel.org/all/[email protected]/

v4 -> v5:
- Add 1st commit - flush id checks in udp_gro_receive_segment, which can be
backported by itself
- Add TCP measurements for the 5th commit
- Add flush id tests to ensure flush id logic is preserved in GRO
- Simplify gro_inet_flush by removing a branch
- v4:
https://lore.kernel.org/all/[email protected]/

v3 -> v4:
- Fix code comment and commit message typos
- v3:
https://lore.kernel.org/all/[email protected]/

v2 -> v3:
- Use napi_gro_cb instead of skb->{offset}
- v2:
https://lore.kernel.org/all/[email protected]/

v1 -> v2:
- Pass p_off in *_gro_complete to fix UDP bug
- Remove more conditionals and memory fetches from inet_gro_flush
- v1:
https://lore.kernel.org/netdev/[email protected]/

Richard Gobert (3):
net: gro: add {inner_}network_offset to napi_gro_cb
net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment
selftests/net: add flush id selftests

drivers/net/geneve.c | 1 +
drivers/net/vxlan/vxlan_core.c | 1 +
include/net/gro.h | 82 +++++++++++++++--
net/8021q/vlan_core.c | 2 +
net/core/gro.c | 5 +-
net/ethernet/eth.c | 1 +
net/ipv4/af_inet.c | 46 +---------
net/ipv4/gre_offload.c | 1 +
net/ipv4/tcp_offload.c | 15 +---
net/ipv4/udp_offload.c | 16 +---
net/ipv6/ip6_offload.c | 19 +---
tools/testing/selftests/net/gro.c | 144 ++++++++++++++++++++++++++++++
12 files changed, 238 insertions(+), 95 deletions(-)

--
2.36.1



2024-04-12 15:56:50

by Richard Gobert

[permalink] [raw]
Subject: [PATCH net-next v7 1/3] net: gro: add {inner_}network_offset to napi_gro_cb

This patch adds network_offset and inner_network_offset to napi_gro_cb, and
makes sure both are set correctly. In the common path there's only one
write (skb_gro_reset_offset).

Signed-off-by: Richard Gobert <[email protected]>
---
drivers/net/geneve.c | 1 +
drivers/net/vxlan/vxlan_core.c | 1 +
include/net/gro.h | 18 ++++++++++++++++--
net/8021q/vlan_core.c | 2 ++
net/core/gro.c | 1 +
net/ethernet/eth.c | 1 +
net/ipv4/af_inet.c | 5 +----
net/ipv4/gre_offload.c | 1 +
net/ipv6/ip6_offload.c | 8 ++++----
9 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 9c18a39b0d0c..a6256ea1f5bc 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -545,6 +545,7 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk,
if (!ptype)
goto out;

+ NAPI_GRO_CB(skb)->inner_network_offset = hlen;
pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
flush = 0;

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 6fb182d9d6e7..9fb93c3953c1 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -754,6 +754,7 @@ static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk,

vh = vxlan_gro_prepare_receive(sk, head, skb, &grc);
if (vh) {
+ NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);
if (!vxlan_parse_gpe_proto(vh, &protocol))
goto out;
ptype = gro_find_receive_by_type(protocol);
diff --git a/include/net/gro.h b/include/net/gro.h
index ebead1d642b4..a1cc8e8c2ebd 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -87,6 +87,15 @@ struct napi_gro_cb {

/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
+
+ /* L3 offsets */
+ union {
+ struct {
+ u16 network_offset;
+ u16 inner_network_offset;
+ };
+ u16 network_offsets[2];
+ };
};

#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
@@ -172,12 +181,17 @@ static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen,
return ptr;
}

+static inline int skb_gro_network_offset(const struct sk_buff *skb)
+{
+ return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark];
+}
+
static inline void *skb_gro_network_header(const struct sk_buff *skb)
{
if (skb_gro_may_pull(skb, skb_gro_offset(skb)))
- return skb_gro_header_fast(skb, skb_network_offset(skb));
+ return skb_gro_header_fast(skb, skb_gro_network_offset(skb));

- return skb_network_header(skb);
+ return skb->data + skb_gro_network_offset(skb);
}

static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb,
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 247704cf70af..355cafe23329 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -478,6 +478,8 @@ static struct sk_buff *vlan_gro_receive(struct list_head *head,
if (unlikely(!vhdr))
goto out;

+ NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen;
+
type = vhdr->h_vlan_encapsulated_proto;

ptype = gro_find_receive_by_type(type);
diff --git a/net/core/gro.c b/net/core/gro.c
index b129cd201937..b2156e6cc4ad 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -371,6 +371,7 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
const skb_frag_t *frag0;
unsigned int headlen;

+ NAPI_GRO_CB(skb)->network_offset = 0;
NAPI_GRO_CB(skb)->data_offset = 0;
headlen = skb_headlen(skb);
NAPI_GRO_CB(skb)->frag0 = skb->data;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 7515e6bcbb7d..e3eca605bcc7 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -441,6 +441,7 @@ struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)

skb_gro_pull(skb, sizeof(*eh));
skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));
+ NAPI_GRO_CB(skb)->inner_network_offset = hlen;

pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
ipv6_gro_receive, inet_gro_receive,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5a747d91cd0a..6546bf376b24 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1568,10 +1568,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)

NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
- skb_set_network_header(skb, off);
- /* The above will be needed by the transport layer if there is one
- * immediately following this IP hdr.
- */

/* Note : No need to call skb_gro_postpull_rcsum() here,
* as we already checked checksum over ipv4 header was 0
@@ -1597,6 +1593,7 @@ static struct sk_buff *ipip_gro_receive(struct list_head *head,
}

NAPI_GRO_CB(skb)->encap_mark = 1;
+ NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);

return inet_gro_receive(head, skb);
}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index d4520c3f7c09..ae596285d78c 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -224,6 +224,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
skb_gro_postpull_rcsum(skb, greh, grehlen);

+ NAPI_GRO_CB(skb)->inner_network_offset = hlen;
pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
flush = 0;

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 10ddbbc0e46d..ba41939537f2 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -67,7 +67,7 @@ static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
off += len;
}

- skb_gro_pull(skb, off - skb_network_offset(skb));
+ skb_gro_pull(skb, off - skb_gro_network_offset(skb));
return proto;
}

@@ -236,8 +236,6 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
if (unlikely(!iph))
goto out;

- skb_set_network_header(skb, off);
-
flush += ntohs(iph->payload_len) != skb->len - hlen;

proto = iph->nexthdr;
@@ -259,7 +257,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
NAPI_GRO_CB(skb)->proto = proto;

flush--;
- nlen = skb_network_header_len(skb);
+ nlen = skb_gro_offset(skb) - off;

list_for_each_entry(p, head, list) {
const struct ipv6hdr *iph2;
@@ -327,6 +325,7 @@ static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
}

NAPI_GRO_CB(skb)->encap_mark = 1;
+ NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);

return ipv6_gro_receive(head, skb);
}
@@ -342,6 +341,7 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
}

NAPI_GRO_CB(skb)->encap_mark = 1;
+ NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);

return inet_gro_receive(head, skb);
}
--
2.36.1


2024-04-12 15:57:27

by Richard Gobert

[permalink] [raw]
Subject: [PATCH net-next v7 2/3] net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment

{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
iph->id, ...) against all packets in a loop. These flush checks are used
currently in all tcp flows and in some UDP flows in GRO.

These checks need to be done only once and only against the found p skb,
since they only affect flush and not same_flow.

This commit leverages the previous commit in the series, in which the
correct network header offsets are saved for both outer and inner network
headers, allowing these checks to be done only once, in tcp_gro_receive and
udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at
all. In addition, flush_id checks are more declarative and contained in
inet_gro_flush, thus removing the need for flush_id in napi_gro_cb.

This results in less parsing code for UDP flows and non-loop flush tests
for TCP flows.

To make sure results are not within noise range - I've made netfilter drop
all TCP packets, and measured CPU performance in GRO (in this case GRO is
responsible for about 50% of the CPU utilization).

L3 flush/flush_id checks are not relevant to UDP connections where
skb_gro_receive_list is called. The only code change relevant to this flow
is inet_gro_receive. The rest of the code parsing this flow stays the
same.

All concurrent connections tested are with the same ip srcaddr and
dstaddr.

perf top while replaying 64 concurrent IP/UDP connections (UDP fwd flow):
net-next:
3.03% [kernel] [k] inet_gro_receive

patch applied:
2.78% [kernel] [k] inet_gro_receive

perf top while replaying encapsulated load - 64 concurrent IP/IP/UDP
connections (rx-gro-list and rx-udp-gro-forwarding are enabled):
net-next:
10.50% [kernel] [k] inet_gro_receive

patch applied:
8.19% [kernel] [k] inet_gro_receive

perf top while replaying 64 parallel IP/TCP streams merging in GRO:
(gro_network_flush is compiled inline to tcp_gro_receive)
net-next:
6.94% [kernel] [k] inet_gro_receive
3.02% [kernel] [k] tcp_gro_receive

patch applied:
4.27% [kernel] [k] tcp_gro_receive
4.22% [kernel] [k] inet_gro_receive

perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same
results for any encapsulation, in this case inet_gro_receive is top
offender in net-next)
net-next:
10.09% [kernel] [k] inet_gro_receive
2.08% [kernel] [k] tcp_gro_receive

patch applied:
6.97% [kernel] [k] inet_gro_receive
3.68% [kernel] [k] tcp_gro_receive

Signed-off-by: Richard Gobert <[email protected]>
---
include/net/gro.h | 66 ++++++++++++++++++++++++++++++++++++++----
net/core/gro.c | 4 ---
net/ipv4/af_inet.c | 41 +-------------------------
net/ipv4/tcp_offload.c | 15 ++--------
net/ipv4/udp_offload.c | 16 +++-------
net/ipv6/ip6_offload.c | 11 -------
6 files changed, 67 insertions(+), 86 deletions(-)

diff --git a/include/net/gro.h b/include/net/gro.h
index a1cc8e8c2ebd..c1f80f1156d6 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -36,15 +36,15 @@ struct napi_gro_cb {
/* This is non-zero if the packet cannot be merged with the new skb. */
u16 flush;

- /* Save the IP ID here and check when we get to the transport layer */
- u16 flush_id;
-
/* Number of segments aggregated. */
u16 count;

/* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */
u16 proto;

+ /* used to support CHECKSUM_COMPLETE for tunneling protocols */
+ __wsum csum;
+
/* Used in napi_gro_cb::free */
#define NAPI_GRO_FREE 1
#define NAPI_GRO_FREE_STOLEN_HEAD 2
@@ -85,9 +85,6 @@ struct napi_gro_cb {
u8 is_flist:1;
);

- /* used to support CHECKSUM_COMPLETE for tunneling protocols */
- __wsum csum;
-
/* L3 offsets */
union {
struct {
@@ -443,6 +440,63 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb,
skb_gro_len(skb), proto, 0));
}

+static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2,
+ struct sk_buff *p, bool outer)
+{
+ const u32 id = ntohl(*(__be32 *)&iph->id);
+ const u32 id2 = ntohl(*(__be32 *)&iph2->id);
+ const u16 flush_id = (id >> 16) - (id2 >> 16);
+ const u16 count = NAPI_GRO_CB(p)->count;
+ const u32 df = id & IP_DF;
+ u32 is_atomic;
+ int flush;
+
+ /* All fields must match except length and checksum. */
+ flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF));
+
+ if (outer && df)
+ return flush;
+
+ /* When we receive our second frame we can make a decision on if we
+ * continue this flow as an atomic flow with a fixed ID or if we use
+ * an incrementing ID.
+ */
+ NAPI_GRO_CB(p)->is_atomic |= (count == 1 && df && flush_id == 0);
+ is_atomic = (df && NAPI_GRO_CB(p)->is_atomic) - 1;
+
+ return flush | (flush_id ^ (count & is_atomic));
+}
+
+static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2)
+{
+ /* <Version:4><Traffic_Class:8><Flow_Label:20> */
+ __be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
+
+ /* Flush if Traffic Class fields are different. */
+ return !!((first_word & htonl(0x0FF00000)) |
+ (__force __be32)(iph->hop_limit ^ iph2->hop_limit));
+}
+
+static inline int gro_network_flush(const void *th, const void *th2, struct sk_buff *p, int off)
+{
+ const bool encap_mark = NAPI_GRO_CB(p)->encap_mark;
+ int flush = 0;
+ int i;
+
+ for (i = 0; i <= encap_mark; i++) {
+ const u16 diff = off - NAPI_GRO_CB(p)->network_offsets[i];
+ const void *nh = th - diff;
+ const void *nh2 = th2 - diff;
+
+ if (((struct iphdr *)nh)->version == 6)
+ flush |= ipv6_gro_flush(nh, nh2);
+ else
+ flush |= inet_gro_flush(nh, nh2, p, i != encap_mark);
+ }
+
+ return flush;
+}
+
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);

/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
diff --git a/net/core/gro.c b/net/core/gro.c
index b2156e6cc4ad..3bfdfefe4a24 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -89,7 +89,6 @@ void dev_remove_offload(struct packet_offload *po)
}
EXPORT_SYMBOL(dev_remove_offload);

-
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
{
struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
@@ -330,8 +329,6 @@ static void gro_list_prepare(const struct list_head *head,
list_for_each_entry(p, head, list) {
unsigned long diffs;

- NAPI_GRO_CB(p)->flush = 0;
-
if (hash != skb_get_hash_raw(p)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
@@ -471,7 +468,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
sizeof(u32))); /* Avoid slow unaligned acc */
*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
- NAPI_GRO_CB(skb)->is_atomic = 1;
NAPI_GRO_CB(skb)->count = 1;
if (unlikely(skb_is_gso(skb))) {
NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6546bf376b24..af094aecf38c 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1481,7 +1481,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
struct sk_buff *p;
unsigned int hlen;
unsigned int off;
- unsigned int id;
int flush = 1;
int proto;

@@ -1507,13 +1506,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
goto out;

NAPI_GRO_CB(skb)->proto = proto;
- id = ntohl(*(__be32 *)&iph->id);
- flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
- id >>= 16;
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF));

list_for_each_entry(p, head, list) {
struct iphdr *iph2;
- u16 flush_id;

if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1530,43 +1526,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
-
- /* All fields must match except length and checksum. */
- NAPI_GRO_CB(p)->flush |=
- (iph->ttl ^ iph2->ttl) |
- (iph->tos ^ iph2->tos) |
- ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
-
- NAPI_GRO_CB(p)->flush |= flush;
-
- /* We need to store of the IP ID check to be included later
- * when we can verify that this packet does in fact belong
- * to a given flow.
- */
- flush_id = (u16)(id - ntohs(iph2->id));
-
- /* This bit of code makes it much easier for us to identify
- * the cases where we are doing atomic vs non-atomic IP ID
- * checks. Specifically an atomic check can return IP ID
- * values 0 - 0xFFFF, while a non-atomic check can only
- * return 0 or 0xFFFF.
- */
- if (!NAPI_GRO_CB(p)->is_atomic ||
- !(iph->frag_off & htons(IP_DF))) {
- flush_id ^= NAPI_GRO_CB(p)->count;
- flush_id = flush_id ? 0xFFFF : 0;
- }
-
- /* If the previous IP ID value was based on an atomic
- * datagram we can overwrite the value and ignore it.
- */
- if (NAPI_GRO_CB(skb)->is_atomic)
- NAPI_GRO_CB(p)->flush_id = flush_id;
- else
- NAPI_GRO_CB(p)->flush_id |= flush_id;
}

- NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;

/* Note : No need to call skb_gro_postpull_rcsum() here,
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 7f045b881dd4..1b10ab3b0f6a 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -232,9 +232,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
goto out_check_final;

found:
- /* Include the IP ID check below from the inner most IP hdr */
- flush = NAPI_GRO_CB(p)->flush;
- flush |= (__force int)(flags & TCP_FLAG_CWR);
+ flush = (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
@@ -242,16 +240,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);

- /* When we receive our second frame we can made a decision on if we
- * continue this flow as an atomic flow with a fixed ID or if we use
- * an incrementing ID.
- */
- if (NAPI_GRO_CB(p)->flush_id != 1 ||
- NAPI_GRO_CB(p)->count != 1 ||
- !NAPI_GRO_CB(p)->is_atomic)
- flush |= NAPI_GRO_CB(p)->flush_id;
- else
- NAPI_GRO_CB(p)->is_atomic = false;
+ flush |= gro_network_flush(th, th2, p, off);

mss = skb_shinfo(p)->gso_size;

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ad4c88fe7d15..c5a5155904cf 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -466,12 +466,12 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
+ int off = skb_gro_offset(skb);
struct sk_buff *pp = NULL;
struct udphdr *uh2;
struct sk_buff *p;
unsigned int ulen;
int ret = 0;
- int flush;

/* requires non zero csum, for symmetry with GSO */
if (!uh->check) {
@@ -529,17 +529,9 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
skb_gro_postpull_rcsum(skb, uh,
sizeof(struct udphdr));

- flush = NAPI_GRO_CB(p)->flush;
-
- if (NAPI_GRO_CB(p)->flush_id != 1 ||
- NAPI_GRO_CB(p)->count != 1 ||
- !NAPI_GRO_CB(p)->is_atomic)
- flush |= NAPI_GRO_CB(p)->flush_id;
- else
- NAPI_GRO_CB(p)->is_atomic = false;
-
- if (flush || skb_gro_receive(p, skb))
- ret = 1;
+ ret = gro_network_flush(uh, uh2, p, off);
+ if (!ret)
+ ret = skb_gro_receive(p, skb);
}
}

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index ba41939537f2..c9a6bc1afc9a 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -288,19 +288,8 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
nlen - sizeof(struct ipv6hdr)))
goto not_same_flow;
}
- /* flush if Traffic Class fields are different */
- NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) |
- (__force __be32)(iph->hop_limit ^ iph2->hop_limit));
- NAPI_GRO_CB(p)->flush |= flush;
-
- /* If the previous IP ID value was based on an atomic
- * datagram we can overwrite the value and ignore it.
- */
- if (NAPI_GRO_CB(skb)->is_atomic)
- NAPI_GRO_CB(p)->flush_id = 0;
}

- NAPI_GRO_CB(skb)->is_atomic = true;
NAPI_GRO_CB(skb)->flush |= flush;

skb_gro_postpull_rcsum(skb, iph, nlen);
--
2.36.1


2024-04-12 16:06:32

by Richard Gobert

[permalink] [raw]
Subject: [PATCH net-next v7 3/3] selftests/net: add flush id selftests

Added flush id selftests to test different cases where DF flag is set or
unset and id value changes in the following packets. All cases where the
packets should coalesce or should not coalesce are tested.

Signed-off-by: Richard Gobert <[email protected]>
---
tools/testing/selftests/net/gro.c | 144 ++++++++++++++++++++++++++++++
1 file changed, 144 insertions(+)

diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
index 353e1e867fbb..74ab06953c38 100644
--- a/tools/testing/selftests/net/gro.c
+++ b/tools/testing/selftests/net/gro.c
@@ -617,6 +617,120 @@ static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext
iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE);
}

+static void fix_ip4_checksum(struct iphdr *iph)
+{
+ iph->check = 0;
+ iph->check = checksum_fold(iph, sizeof(struct iphdr), 0);
+}
+
+static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
+{
+ bool send_three = false;
+ static char buf1[MAX_HDR_LEN + PAYLOAD_LEN];
+ static char buf2[MAX_HDR_LEN + PAYLOAD_LEN];
+ static char buf3[MAX_HDR_LEN + PAYLOAD_LEN];
+
+ create_packet(buf1, 0, 0, PAYLOAD_LEN, 0);
+ create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
+ create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
+
+ struct iphdr *iph1 = (struct iphdr *)(buf1 + ETH_HLEN);
+ struct iphdr *iph2 = (struct iphdr *)(buf2 + ETH_HLEN);
+ struct iphdr *iph3 = (struct iphdr *)(buf3 + ETH_HLEN);
+
+ switch (tcase) {
+ case 0: /* DF=1, Incrementing - should coalesce */
+ iph1->frag_off |= htons(IP_DF);
+ iph1->id = htons(8);
+ fix_ip4_checksum(iph1);
+
+ iph2->frag_off |= htons(IP_DF);
+ iph2->id = htons(9);
+ fix_ip4_checksum(iph2);
+ break;
+
+ case 1: /* DF=1, Fixed - should coalesce */
+ iph1->frag_off |= htons(IP_DF);
+ iph1->id = htons(8);
+ fix_ip4_checksum(iph1);
+
+ iph2->frag_off |= htons(IP_DF);
+ iph2->id = htons(8);
+ fix_ip4_checksum(iph2);
+ break;
+
+ case 2: /* DF=0, Incrementing - should coalesce */
+ iph1->frag_off &= ~htons(IP_DF);
+ iph1->id = htons(8);
+ fix_ip4_checksum(iph1);
+
+ iph2->frag_off &= ~htons(IP_DF);
+ iph2->id = htons(9);
+ fix_ip4_checksum(iph2);
+ break;
+
+ case 3: /* DF=0, Fixed - should not coalesce */
+ iph1->frag_off &= ~htons(IP_DF);
+ iph1->id = htons(8);
+ fix_ip4_checksum(iph1);
+
+ iph2->frag_off &= ~htons(IP_DF);
+ iph2->id = htons(8);
+ fix_ip4_checksum(iph2);
+ break;
+
+ case 4: /* DF=1, two packets incrementing, and one fixed - should
+ * coalesce only the first two packets
+ */
+ iph1->frag_off |= htons(IP_DF);
+ iph1->id = htons(8);
+ fix_ip4_checksum(iph1);
+
+ iph2->frag_off |= htons(IP_DF);
+ iph2->id = htons(9);
+ fix_ip4_checksum(iph2);
+
+ iph3->frag_off |= htons(IP_DF);
+ iph3->id = htons(9);
+ fix_ip4_checksum(iph3);
+ send_three = true;
+ break;
+
+ case 5: /* DF=1, two packets fixed, and one incrementing - should
+ * coalesce only the first two packets
+ */
+ iph1->frag_off |= htons(IP_DF);
+ iph1->id = htons(8);
+ fix_ip4_checksum(iph1);
+
+ iph2->frag_off |= htons(IP_DF);
+ iph2->id = htons(8);
+ fix_ip4_checksum(iph2);
+
+ iph3->frag_off |= htons(IP_DF);
+ iph3->id = htons(9);
+ fix_ip4_checksum(iph3);
+ send_three = true;
+ break;
+ }
+
+ write_packet(fd, buf1, total_hdr_len + PAYLOAD_LEN, daddr);
+ write_packet(fd, buf2, total_hdr_len + PAYLOAD_LEN, daddr);
+
+ if (send_three)
+ write_packet(fd, buf3, total_hdr_len + PAYLOAD_LEN, daddr);
+}
+
+static void test_flush_id(int fd, struct sockaddr_ll *daddr, char *fin_pkt)
+{
+ for (int i = 0; i < 6; i++) {
+ sleep(1);
+ send_flush_id_case(fd, daddr, i);
+ sleep(1);
+ write_packet(fd, fin_pkt, total_hdr_len, daddr);
+ }
+}
+
static void send_ipv6_exthdr(int fd, struct sockaddr_ll *daddr, char *ext_data1, char *ext_data2)
{
static char buf[MAX_HDR_LEN + PAYLOAD_LEN];
@@ -935,6 +1049,8 @@ static void gro_sender(void)
send_fragment4(txfd, &daddr);
sleep(1);
write_packet(txfd, fin_pkt, total_hdr_len, &daddr);
+
+ test_flush_id(txfd, &daddr, fin_pkt);
} else if (proto == PF_INET6) {
sleep(1);
send_fragment6(txfd, &daddr);
@@ -1061,6 +1177,34 @@ static void gro_receiver(void)

printf("fragmented ip4 doesn't coalesce: ");
check_recv_pkts(rxfd, correct_payload, 2);
+
+ /* is_atomic checks */
+ printf("DF=1, Incrementing - should coalesce: ");
+ correct_payload[0] = PAYLOAD_LEN * 2;
+ check_recv_pkts(rxfd, correct_payload, 1);
+
+ printf("DF=1, Fixed - should coalesce: ");
+ correct_payload[0] = PAYLOAD_LEN * 2;
+ check_recv_pkts(rxfd, correct_payload, 1);
+
+ printf("DF=0, Incrementing - should coalesce: ");
+ correct_payload[0] = PAYLOAD_LEN * 2;
+ check_recv_pkts(rxfd, correct_payload, 1);
+
+ printf("DF=0, Fixed - should not coalesce: ");
+ correct_payload[0] = PAYLOAD_LEN;
+ correct_payload[1] = PAYLOAD_LEN;
+ check_recv_pkts(rxfd, correct_payload, 2);
+
+ printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: ");
+ correct_payload[0] = PAYLOAD_LEN * 2;
+ correct_payload[1] = PAYLOAD_LEN;
+ check_recv_pkts(rxfd, correct_payload, 2);
+
+ printf("DF=1, 2 Fixed and one incrementing - should coalesce only first 2 packets: ");
+ correct_payload[0] = PAYLOAD_LEN * 2;
+ correct_payload[1] = PAYLOAD_LEN;
+ check_recv_pkts(rxfd, correct_payload, 2);
} else if (proto == PF_INET6) {
/* GRO doesn't check for ipv6 hop limit when flushing.
* Hence no corresponding test to the ipv4 case.
--
2.36.1


2024-04-14 00:56:11

by Willem de Bruijn

[permalink] [raw]
Subject: Re: [PATCH net-next v7 3/3] selftests/net: add flush id selftests

Richard Gobert wrote:
> Added flush id selftests to test different cases where DF flag is set or
> unset and id value changes in the following packets. All cases where the
> packets should coalesce or should not coalesce are tested.
>
> Signed-off-by: Richard Gobert <[email protected]>

Thanks for adding tests. Minor point below only. The tests pass both
before and after your series, right? Then immediately a nice
validation that the optimization has no unintended side-effects.

> ---
> tools/testing/selftests/net/gro.c | 144 ++++++++++++++++++++++++++++++
> 1 file changed, 144 insertions(+)
>
> diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
> index 353e1e867fbb..74ab06953c38 100644
> --- a/tools/testing/selftests/net/gro.c
> +++ b/tools/testing/selftests/net/gro.c
> @@ -617,6 +617,120 @@ static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext
> iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE);
> }
>
> +static void fix_ip4_checksum(struct iphdr *iph)
> +{
> + iph->check = 0;
> + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0);
> +}
> +
> +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
> +{
> + bool send_three = false;
> + static char buf1[MAX_HDR_LEN + PAYLOAD_LEN];
> + static char buf2[MAX_HDR_LEN + PAYLOAD_LEN];
> + static char buf3[MAX_HDR_LEN + PAYLOAD_LEN];
> +
> + create_packet(buf1, 0, 0, PAYLOAD_LEN, 0);
> + create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
> + create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
> +
> + struct iphdr *iph1 = (struct iphdr *)(buf1 + ETH_HLEN);
> + struct iphdr *iph2 = (struct iphdr *)(buf2 + ETH_HLEN);
> + struct iphdr *iph3 = (struct iphdr *)(buf3 + ETH_HLEN);
> +

minor: variable definitions before code, and reverse Christmas tree.

2024-04-14 01:15:40

by Willem de Bruijn

[permalink] [raw]
Subject: Re: [PATCH net-next v7 1/3] net: gro: add {inner_}network_offset to napi_gro_cb

Richard Gobert wrote:
> This patch adds network_offset and inner_network_offset to napi_gro_cb, and
> makes sure both are set correctly. In the common path there's only one
> write (skb_gro_reset_offset).
>
> Signed-off-by: Richard Gobert <[email protected]>
> ---
> drivers/net/geneve.c | 1 +
> drivers/net/vxlan/vxlan_core.c | 1 +
> include/net/gro.h | 18 ++++++++++++++++--
> net/8021q/vlan_core.c | 2 ++
> net/core/gro.c | 1 +
> net/ethernet/eth.c | 1 +
> net/ipv4/af_inet.c | 5 +----
> net/ipv4/gre_offload.c | 1 +
> net/ipv6/ip6_offload.c | 8 ++++----
> 9 files changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> index d4520c3f7c09..ae596285d78c 100644
> --- a/net/ipv4/gre_offload.c
> +++ b/net/ipv4/gre_offload.c
> @@ -224,6 +224,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
> /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
> skb_gro_postpull_rcsum(skb, greh, grehlen);
>
> + NAPI_GRO_CB(skb)->inner_network_offset = hlen;
> pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
> flush = 0;

Nice that this even works for ETH_P_TEB, as eth_gro_receive will
overwrite the offset written here.


> list_for_each_entry(p, head, list) {
> const struct ipv6hdr *iph2;
> @@ -327,6 +325,7 @@ static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
> }
>
> NAPI_GRO_CB(skb)->encap_mark = 1;
> + NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);
>
> return ipv6_gro_receive(head, skb);
> }
> @@ -342,6 +341,7 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
> }
>
> NAPI_GRO_CB(skb)->encap_mark = 1;
> + NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);

Do we still need encap_mark, or is it always set at the same time that
inner_network_offset becomes non-zero?


2024-04-16 09:23:35

by Paolo Abeni

[permalink] [raw]
Subject: Re: [PATCH net-next v7 2/3] net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment

On Fri, 2024-04-12 at 17:55 +0200, Richard Gobert wrote:
> {inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
> iph->id, ...) against all packets in a loop. These flush checks are used
> currently in all tcp flows and in some UDP flows in GRO.
>
> These checks need to be done only once and only against the found p skb,
> since they only affect flush and not same_flow.
>
> Leveraging the previous commit in the series, in which correct network
> header offsets are saved for both outer and inner network headers -
> allowing these checks to be done only once, in tcp_gro_receive and
> udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at
> all. In addition, flush_id checks are more declarative and contained in
> inet_gro_flush, thus removing the need for flush_id in napi_gro_cb.
>
> This results in less parsing code for UDP flows and non-loop flush tests
> for TCP flows.
>
> To make sure results are not within noise range - I've made netfilter drop
> all TCP packets, and measured CPU performance in GRO (in this case GRO is
> responsible for about 50% of the CPU utilization).
>
> L3 flush/flush_id checks are not relevant to UDP connections where
> skb_gro_receive_list is called. The only code change relevant to this flow
> is inet_gro_receive. The rest of the code parsing this flow stays the
> same.
>
> All concurrent connections tested are with the same ip srcaddr and
> dstaddr.
>
> perf top while replaying 64 concurrent IP/UDP connections (UDP fwd flow):
> net-next:
> 3.03% [kernel] [k] inet_gro_receive
>
> patch applied:
> 2.78% [kernel] [k] inet_gro_receive

Why are there no figures for
udp_gro_receive_segment()/gro_network_flush() here?

Also you should be able to observe a very high amount of CPU usage by
GRO even with TCP with very high speed links, keeping the BH/GRO on a
CPU and the user-space/data copy on a different one (or using rx zero
copy).

Thanks,

Paolo


2024-04-16 09:37:06

by Paolo Abeni

[permalink] [raw]
Subject: Re: [PATCH net-next v7 1/3] net: gro: add {inner_}network_offset to napi_gro_cb

On Fri, 2024-04-12 at 17:55 +0200, Richard Gobert wrote:
> This patch adds network_offset and inner_network_offset to napi_gro_cb, and
> makes sure both are set correctly. In the common path there's only one
> write (skb_gro_reset_offset).
>
> Signed-off-by: Richard Gobert <[email protected]>

Does not apply cleanly to net-next. You have to wait until the net
dependency is merged into net-next before posting.

> ---
> drivers/net/geneve.c | 1 +
> drivers/net/vxlan/vxlan_core.c | 1 +
> include/net/gro.h | 18 ++++++++++++++++--
> net/8021q/vlan_core.c | 2 ++
> net/core/gro.c | 1 +
> net/ethernet/eth.c | 1 +
> net/ipv4/af_inet.c | 5 +----
> net/ipv4/gre_offload.c | 1 +
> net/ipv6/ip6_offload.c | 8 ++++----
> 9 files changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
> index 9c18a39b0d0c..a6256ea1f5bc 100644
> --- a/drivers/net/geneve.c
> +++ b/drivers/net/geneve.c
> @@ -545,6 +545,7 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk,
> if (!ptype)
> goto out;
>
> + NAPI_GRO_CB(skb)->inner_network_offset = hlen;
> pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
> flush = 0;
>
> diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
> index 6fb182d9d6e7..9fb93c3953c1 100644
> --- a/drivers/net/vxlan/vxlan_core.c
> +++ b/drivers/net/vxlan/vxlan_core.c
> @@ -754,6 +754,7 @@ static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk,
>
> vh = vxlan_gro_prepare_receive(sk, head, skb, &grc);
> if (vh) {
> + NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);
> if (!vxlan_parse_gpe_proto(vh, &protocol))
> goto out;
> ptype = gro_find_receive_by_type(protocol);

What about vxlan_gro_receive? and fou/gue?

Side note: the latter apparently exist mainly to make UDP-related
changes more difficult; can we deprecate them once and for all?

Thanks,

Paolo


2024-04-16 09:58:17

by Paolo Abeni

[permalink] [raw]
Subject: Re: [PATCH net-next v7 2/3] net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment

On Tue, 2024-04-16 at 11:21 +0200, Paolo Abeni wrote:
> On Fri, 2024-04-12 at 17:55 +0200, Richard Gobert wrote:
> > {inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
> > iph->id, ...) against all packets in a loop. These flush checks are used
> > currently in all tcp flows and in some UDP flows in GRO.
> >
> > These checks need to be done only once and only against the found p skb,
> > since they only affect flush and not same_flow.
> >
> > Leveraging the previous commit in the series, in which correct network
> > header offsets are saved for both outer and inner network headers -
> > allowing these checks to be done only once, in tcp_gro_receive and
> > udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at
> > all. In addition, flush_id checks are more declarative and contained in
> > inet_gro_flush, thus removing the need for flush_id in napi_gro_cb.
> >
> > This results in less parsing code for UDP flows and non-loop flush tests
> > for TCP flows.
> >
> > To make sure results are not within noise range - I've made netfilter drop
> > all TCP packets, and measured CPU performance in GRO (in this case GRO is
> > responsible for about 50% of the CPU utilization).
> >
> > L3 flush/flush_id checks are not relevant to UDP connections where
> > skb_gro_receive_list is called. The only code change relevant to this flow
> > is inet_gro_receive. The rest of the code parsing this flow stays the
> > same.
> >
> > All concurrent connections tested are with the same ip srcaddr and
> > dstaddr.
> >
> > perf top while replaying 64 concurrent IP/UDP connections (UDP fwd flow):
> > net-next:
> > 3.03% [kernel] [k] inet_gro_receive
> >
> > patch applied:
> > 2.78% [kernel] [k] inet_gro_receive
>
> Why there are no figures for
> udp_gro_receive_segment()/gro_network_flush() here?
>
> Also you should be able to observer a very high amount of CPU usage by
> GRO even with TCP with very high speed links, keeping the BH/GRO on a
> CPU and the user-space/data copy on a different one (or using rx zero
> copy).

To be more explicit: I think at least the above figures are required, 
and I still fear the real gain in that case would range from zero to
negative.

If you can't do the TCP part of the testing, please provide at least
the figures for a single UDP flow, that should give more indication WRT
the result we can expect with TCP.

Note that GRO is used mainly by TCP and TCP packets with different
src/dst port will land into different GRO hash buckets, having
different RX hash.

That will happen even for UDP, as at least some (most?) NICs include
the UDP ports in the RX hash.

Thanks,

Paolo


2024-04-17 13:57:47

by Richard Gobert

[permalink] [raw]
Subject: Re: [PATCH net-next v7 1/3] net: gro: add {inner_}network_offset to napi_gro_cb

Willem de Bruijn wrote:
> Richard Gobert wrote:
>> This patch adds network_offset and inner_network_offset to napi_gro_cb, and
>> makes sure both are set correctly. In the common path there's only one
>> write (skb_gro_reset_offset).
>>
>> Signed-off-by: Richard Gobert <[email protected]>
>> ---
>> drivers/net/geneve.c | 1 +
>> drivers/net/vxlan/vxlan_core.c | 1 +
>> include/net/gro.h | 18 ++++++++++++++++--
>> net/8021q/vlan_core.c | 2 ++
>> net/core/gro.c | 1 +
>> net/ethernet/eth.c | 1 +
>> net/ipv4/af_inet.c | 5 +----
>> net/ipv4/gre_offload.c | 1 +
>> net/ipv6/ip6_offload.c | 8 ++++----
>> 9 files changed, 28 insertions(+), 10 deletions(-)
>>
>> diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
>> index d4520c3f7c09..ae596285d78c 100644
>> --- a/net/ipv4/gre_offload.c
>> +++ b/net/ipv4/gre_offload.c
>> @@ -224,6 +224,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
>> /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
>> skb_gro_postpull_rcsum(skb, greh, grehlen);
>>
>> + NAPI_GRO_CB(skb)->inner_network_offset = hlen;
>> pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
>> flush = 0;
>
> Nice that this even works for ETH_P_TEB, as eth_gro_receive will
> overwrite the offset written here.
>
>
>> list_for_each_entry(p, head, list) {
>> const struct ipv6hdr *iph2;
>> @@ -327,6 +325,7 @@ static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head,
>> }
>>
>> NAPI_GRO_CB(skb)->encap_mark = 1;
>> + NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);
>>
>> return ipv6_gro_receive(head, skb);
>> }
>> @@ -342,6 +341,7 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
>> }
>>
>> NAPI_GRO_CB(skb)->encap_mark = 1;
>> + NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);
>
> Do we still need encap_mark, or is it always set at the same time that
> inner_network_offset becomes non-zero?
>

This would require setting inner_network_offset to 0 before dev_gro_receive,
which would not be favorable to the common case (as opposed to encap_mark,
which is already set to 0 as part of NAPI_GRO_CB->zeroed). In my
opinion, it might also be less readable.

2024-04-17 14:28:56

by Richard Gobert

[permalink] [raw]
Subject: Re: [PATCH net-next v7 3/3] selftests/net: add flush id selftests

Willem de Bruijn wrote:
> Richard Gobert wrote:
>> Added flush id selftests to test different cases where DF flag is set or
>> unset and id value changes in the following packets. All cases where the
>> packets should coalesce or should not coalesce are tested.
>>
>> Signed-off-by: Richard Gobert <[email protected]>
>
> Thanks for adding tests. Minor point below only. The tests pass both
> before and after your series, right? Then immediately a nice
> validation that the optimization has no unintended side-effects.
>

Yes, the logic is preserved - tests pass both in net-next and after
applying the patch :)

>> ---
>> tools/testing/selftests/net/gro.c | 144 ++++++++++++++++++++++++++++++
>> 1 file changed, 144 insertions(+)
>>
>> diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c
>> index 353e1e867fbb..74ab06953c38 100644
>> --- a/tools/testing/selftests/net/gro.c
>> +++ b/tools/testing/selftests/net/gro.c
>> @@ -617,6 +617,120 @@ static void add_ipv6_exthdr(void *buf, void *optpkt, __u8 exthdr_type, char *ext
>> iph->payload_len = htons(ntohs(iph->payload_len) + MIN_EXTHDR_SIZE);
>> }
>>
>> +static void fix_ip4_checksum(struct iphdr *iph)
>> +{
>> + iph->check = 0;
>> + iph->check = checksum_fold(iph, sizeof(struct iphdr), 0);
>> +}
>> +
>> +static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase)
>> +{
>> + bool send_three = false;
>> + static char buf1[MAX_HDR_LEN + PAYLOAD_LEN];
>> + static char buf2[MAX_HDR_LEN + PAYLOAD_LEN];
>> + static char buf3[MAX_HDR_LEN + PAYLOAD_LEN];
>> +
>> + create_packet(buf1, 0, 0, PAYLOAD_LEN, 0);
>> + create_packet(buf2, PAYLOAD_LEN, 0, PAYLOAD_LEN, 0);
>> + create_packet(buf3, PAYLOAD_LEN * 2, 0, PAYLOAD_LEN, 0);
>> +
>> + struct iphdr *iph1 = (struct iphdr *)(buf1 + ETH_HLEN);
>> + struct iphdr *iph2 = (struct iphdr *)(buf2 + ETH_HLEN);
>> + struct iphdr *iph3 = (struct iphdr *)(buf3 + ETH_HLEN);
>> +
>
> minor: variable defintions before code, and reverse chrismas tree.

Good catch, I'll apply these changes and push v8 once the relevant series
for net is merged. Thanks!

2024-04-18 15:06:24

by Richard Gobert

[permalink] [raw]
Subject: Re: [PATCH net-next v7 2/3] net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment



Paolo Abeni wrote:
> On Tue, 2024-04-16 at 11:21 +0200, Paolo Abeni wrote:
>> On Fri, 2024-04-12 at 17:55 +0200, Richard Gobert wrote:
>>> {inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
>>> iph->id, ...) against all packets in a loop. These flush checks are used
>>> currently in all tcp flows and in some UDP flows in GRO.
>>>
>>> These checks need to be done only once and only against the found p skb,
>>> since they only affect flush and not same_flow.
>>>
>>> Leveraging the previous commit in the series, in which correct network
>>> header offsets are saved for both outer and inner network headers -
>>> allowing these checks to be done only once, in tcp_gro_receive and
>>> udp_gro_receive_segment. As a result, NAPI_GRO_CB(p)->flush is not used at
>>> all. In addition, flush_id checks are more declarative and contained in
>>> inet_gro_flush, thus removing the need for flush_id in napi_gro_cb.
>>>
>>> This results in less parsing code for UDP flows and non-loop flush tests
>>> for TCP flows.
>>>
>>> To make sure results are not within noise range - I've made netfilter drop
>>> all TCP packets, and measured CPU performance in GRO (in this case GRO is
>>> responsible for about 50% of the CPU utilization).
>>>
>>> L3 flush/flush_id checks are not relevant to UDP connections where
>>> skb_gro_receive_list is called. The only code change relevant to this flow
>>> is inet_gro_receive. The rest of the code parsing this flow stays the
>>> same.
>>>
>>> All concurrent connections tested are with the same ip srcaddr and
>>> dstaddr.
>>>
>>> perf top while replaying 64 concurrent IP/UDP connections (UDP fwd flow):
>>> net-next:
>>> 3.03% [kernel] [k] inet_gro_receive
>>>
>>> patch applied:
>>> 2.78% [kernel] [k] inet_gro_receive
>>
>> Why there are no figures for
>> udp_gro_receive_segment()/gro_network_flush() here?
>>
>> Also you should be able to observer a very high amount of CPU usage by
>> GRO even with TCP with very high speed links, keeping the BH/GRO on a
>> CPU and the user-space/data copy on a different one (or using rx zero
>> copy).
>
> To be more explicit: I think at least the above figures are required, 
> and I still fear the real gain in that case would range from zero to
> negative.
>

I wrote about it in the commit message in short, sorry if I wasn't clear
enough.

gro_network_flush is compiled in-line to both udp_gro_receive_segment and
tcp_gro_receive. udp_gro_receive_segment is compiled in-line to
udp_gro_receive.

The UDP numbers I posted are not relevant anymore after Willem and
Alexander's thread, after which we understood flush and flush_id should be
calculated for all UDP flows.

I can post new numbers for the UDP fwd path after implementing the correct
change. As for TCP - the numbers I posted stay the same.

You should note there is an increase in CPU utilization in tcp_gro_receive
because of the inline compilation of gro_network_flush. The numbers make
sense and show performance enhancement in the case I showed when both
inet_gro_receive and tcp_gro_receive are accounted for.

> If you can't do the TCP part of the testing, please provide at least
> the figures for a single UDP flow, that should give more indication WRT
> the result we can expect with TCP.
>
> Note that GRO is used mainly by TCP and TCP packets with different
> src/dst port will land into different GRO hash buckets, having
> different RX hash.
>
> That will happen even for UDP, at least for some (most?) nics include
> the UDP ports in the RX hash.
>
> Thanks,
>
> Paolo
>

2024-04-18 15:32:42

by Richard Gobert

[permalink] [raw]
Subject: Re: [PATCH net-next v7 1/3] net: gro: add {inner_}network_offset to napi_gro_cb

Paolo Abeni wrote:
> On Fri, 2024-04-12 at 17:55 +0200, Richard Gobert wrote:
>> This patch adds network_offset and inner_network_offset to napi_gro_cb, and
>> makes sure both are set correctly. In the common path there's only one
>> write (skb_gro_reset_offset).
>>
>> Signed-off-by: Richard Gobert <[email protected]>
>
> Does not apply cleanly to net-next. You have to wait until the net
> dependency is merged into net-next before posting.
>
>> ---
>> drivers/net/geneve.c | 1 +
>> drivers/net/vxlan/vxlan_core.c | 1 +
>> include/net/gro.h | 18 ++++++++++++++++--
>> net/8021q/vlan_core.c | 2 ++
>> net/core/gro.c | 1 +
>> net/ethernet/eth.c | 1 +
>> net/ipv4/af_inet.c | 5 +----
>> net/ipv4/gre_offload.c | 1 +
>> net/ipv6/ip6_offload.c | 8 ++++----
>> 9 files changed, 28 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
>> index 9c18a39b0d0c..a6256ea1f5bc 100644
>> --- a/drivers/net/geneve.c
>> +++ b/drivers/net/geneve.c
>> @@ -545,6 +545,7 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk,
>> if (!ptype)
>> goto out;
>>
>> + NAPI_GRO_CB(skb)->inner_network_offset = hlen;
>> pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
>> flush = 0;
>>
>> diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
>> index 6fb182d9d6e7..9fb93c3953c1 100644
>> --- a/drivers/net/vxlan/vxlan_core.c
>> +++ b/drivers/net/vxlan/vxlan_core.c
>> @@ -754,6 +754,7 @@ static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk,
>>
>> vh = vxlan_gro_prepare_receive(sk, head, skb, &grc);
>> if (vh) {
>> + NAPI_GRO_CB(skb)->inner_network_offset = skb_gro_offset(skb);
>> if (!vxlan_parse_gpe_proto(vh, &protocol))
>> goto out;
>> ptype = gro_find_receive_by_type(protocol);
>
> What about vxlan_gro_receive? and fou/gue?
>

No need to write in fou/gue functions, as both functions call
{inet,inet6}_offloads, which means if there's an IP/IPv6 header after
fou/gue - ipip_gro_receive will be called (or ip6ip6_gro_receive, or
sit_ip6ip6_gro_receive, etc), in which inner_network_offset is written.

vxlan_gro_receive calls eth_gro_receive, in which inner_network_offset
is written as well.

> Side note: the latter apparently exist mainly to make UDP-related
> changes more difficult, can we deprecated them once for all?
>
> Thank,
>
> Paolo
>