2021-03-12 16:23:39

by Alexander Lobakin

Subject: [PATCH net-next 0/4] gro: micro-optimize dev_gro_receive()

This series addresses some of the suboptimal paths in the main GRO
entry point.
The main part is patches 3-4, which simplify the code and improve
flow distribution. The other two are mostly cosmetic and make the
code more readable.

The benefits are not huge and depend mostly on the NIC's RSS hash
function and the number of Rx flows per NAPI instance. I got
around +10-15 Mbps when NATing 4-8 flows.

Alexander Lobakin (4):
gro: give 'hash' variable in dev_gro_receive() a less confusing name
gro: don't dereference napi->gro_hash[x] multiple times in
dev_gro_receive()
gro: simplify gro_list_prepare()
gro: improve flow distribution across GRO buckets in dev_gro_receive()

net/core/dev.c | 31 ++++++++++++++-----------------
1 file changed, 14 insertions(+), 17 deletions(-)

--
2.30.2



2021-03-12 16:23:39

by Alexander Lobakin

Subject: [PATCH net-next 1/4] gro: give 'hash' variable in dev_gro_receive() a less confusing name

'hash' stores not the flow hash itself, but the index of the GRO
bucket corresponding to it.
Rename it to 'bucket' to avoid confusion when reading lines like
'__set_bit(hash, &napi->gro_bitmask)'.

Signed-off-by: Alexander Lobakin <[email protected]>
---
net/core/dev.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 2bfdd528c7c3..adc42ba7ffd8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5956,7 +5956,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)

static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
- u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
@@ -6024,7 +6024,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (pp) {
skb_list_del_init(pp);
napi_gro_complete(napi, pp);
- napi->gro_hash[hash].count--;
+ napi->gro_hash[bucket].count--;
}

if (same_flow)
@@ -6033,10 +6033,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (NAPI_GRO_CB(skb)->flush)
goto normal;

- if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
+ if (unlikely(napi->gro_hash[bucket].count >= MAX_GRO_SKBS)) {
gro_flush_oldest(napi, gro_head);
} else {
- napi->gro_hash[hash].count++;
+ napi->gro_hash[bucket].count++;
}
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
@@ -6050,11 +6050,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
- if (napi->gro_hash[hash].count) {
- if (!test_bit(hash, &napi->gro_bitmask))
- __set_bit(hash, &napi->gro_bitmask);
- } else if (test_bit(hash, &napi->gro_bitmask)) {
- __clear_bit(hash, &napi->gro_bitmask);
+ if (napi->gro_hash[bucket].count) {
+ if (!test_bit(bucket, &napi->gro_bitmask))
+ __set_bit(bucket, &napi->gro_bitmask);
+ } else if (test_bit(bucket, &napi->gro_bitmask)) {
+ __clear_bit(bucket, &napi->gro_bitmask);
}

return ret;
--
2.30.2


2021-03-12 16:23:40

by Alexander Lobakin

Subject: [PATCH net-next 2/4] gro: don't dereference napi->gro_hash[x] multiple times in dev_gro_receive()

The GRO bucket index doesn't change throughout the function.
Store a pointer to the corresponding bucket on the stack once and
use it instead of dereferencing napi->gro_hash[bucket] again and
again.

Signed-off-by: Alexander Lobakin <[email protected]>
---
net/core/dev.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index adc42ba7ffd8..ee124aecb8a2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5957,6 +5957,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct gro_list *gro_list = &napi->gro_hash[bucket];
struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
@@ -6024,7 +6025,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (pp) {
skb_list_del_init(pp);
napi_gro_complete(napi, pp);
- napi->gro_hash[bucket].count--;
+ gro_list->count--;
}

if (same_flow)
@@ -6033,10 +6034,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (NAPI_GRO_CB(skb)->flush)
goto normal;

- if (unlikely(napi->gro_hash[bucket].count >= MAX_GRO_SKBS)) {
+ if (unlikely(gro_list->count >= MAX_GRO_SKBS)) {
gro_flush_oldest(napi, gro_head);
} else {
- napi->gro_hash[bucket].count++;
+ gro_list->count++;
}
NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
@@ -6050,7 +6051,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
- if (napi->gro_hash[bucket].count) {
+ if (gro_list->count) {
if (!test_bit(bucket, &napi->gro_bitmask))
__set_bit(bucket, &napi->gro_bitmask);
} else if (test_bit(bucket, &napi->gro_bitmask)) {
--
2.30.2


2021-03-12 16:25:44

by Alexander Lobakin

Subject: [PATCH net-next 4/4] gro: improve flow distribution across GRO buckets in dev_gro_receive()

Most of the functions that "convert" a hash value into an index
(when RPS is configured, XPS is not configured, etc.) apply
reciprocal_scale() to it. Its logic is simple but fair, and it
accounts for the entire input value.
The 'hash & (GRO_HASH_BUCKETS - 1)' expression, on the other hand,
uses only the 3 least significant bits of the value, which is far
from optimal (especially for XOR RSS hashers, where the hashes of
two different flows may differ by only 1 bit somewhere in the
middle).

Use reciprocal_scale() here too to take the entire hash value into
account and improve flow dispersion across the GRO hash buckets.
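
For reference, a minimal sketch of the two computations with
GRO_HASH_BUCKETS == 8 (reciprocal_scale() as defined in
include/linux/kernel.h):

	/* old: index = the 3 least significant bits of the hash */
	bucket = hash & (GRO_HASH_BUCKETS - 1);	/* hash & 0x7 */

	/* new: scale the full 32-bit hash range down to [0, 8) */
	static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
	{
		return (u32)(((u64)val * ep_ro) >> 32);
	}

	bucket = reciprocal_scale(hash, GRO_HASH_BUCKETS);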

Signed-off-by: Alexander Lobakin <[email protected]>
---
net/core/dev.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 65d9e7d9d1e8..bd7c9ba54623 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5952,7 +5952,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)

static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
- u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ u32 bucket = reciprocal_scale(skb_get_hash_raw(skb), GRO_HASH_BUCKETS);
struct gro_list *gro_list = &napi->gro_hash[bucket];
struct list_head *gro_head = &gro_list->list;
struct list_head *head = &offload_base;
--
2.30.2


2021-03-12 16:25:47

by Alexander Lobakin

Subject: [PATCH net-next 3/4] gro: simplify gro_list_prepare()

gro_list_prepare() always returns &napi->gro_hash[bucket].list,
without any variation. Moreover, it uses its 'napi' argument only
to access that list, and to do so it calculates the bucket index a
second time (it is first computed at the beginning of
dev_gro_receive()).
Given that dev_gro_receive() already has a pointer to the needed
list, just pass it as the first argument to eliminate the redundant
calculation, and make gro_list_prepare() return void.
Also, both arguments of gro_list_prepare() can be constified, since
this function only modifies the skbs in the bucket list.

Signed-off-by: Alexander Lobakin <[email protected]>
---
net/core/dev.c | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index ee124aecb8a2..65d9e7d9d1e8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5858,15 +5858,13 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
}
EXPORT_SYMBOL(napi_gro_flush);

-static struct list_head *gro_list_prepare(struct napi_struct *napi,
- struct sk_buff *skb)
+static void gro_list_prepare(const struct list_head *head,
+ const struct sk_buff *skb)
{
unsigned int maclen = skb->dev->hard_header_len;
u32 hash = skb_get_hash_raw(skb);
- struct list_head *head;
struct sk_buff *p;

- head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
list_for_each_entry(p, head, list) {
unsigned long diffs;

@@ -5892,8 +5890,6 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi,
maclen);
NAPI_GRO_CB(p)->same_flow = !diffs;
}
-
- return head;
}

static void skb_gro_reset_offset(struct sk_buff *skb)
@@ -5958,10 +5954,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
{
u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
struct gro_list *gro_list = &napi->gro_hash[bucket];
+ struct list_head *gro_head = &gro_list->list;
struct list_head *head = &offload_base;
struct packet_offload *ptype;
__be16 type = skb->protocol;
- struct list_head *gro_head;
struct sk_buff *pp = NULL;
enum gro_result ret;
int same_flow;
@@ -5970,7 +5966,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (netif_elide_gro(skb->dev))
goto normal;

- gro_head = gro_list_prepare(napi, skb);
+ gro_list_prepare(gro_head, skb);

rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
--
2.30.2


2021-03-12 16:35:56

by Eric Dumazet

Subject: Re: [PATCH net-next 4/4] gro: improve flow distribution across GRO buckets in dev_gro_receive()

On Fri, Mar 12, 2021 at 5:22 PM Alexander Lobakin <[email protected]> wrote:
>
> Most of the functions that "convert" a hash value into an index
> (when RPS is configured, XPS is not configured, etc.) apply
> reciprocal_scale() to it. Its logic is simple but fair, and it
> accounts for the entire input value.
> The 'hash & (GRO_HASH_BUCKETS - 1)' expression, on the other hand,
> uses only the 3 least significant bits of the value, which is far
> from optimal (especially for XOR RSS hashers, where the hashes of
> two different flows may differ by only 1 bit somewhere in the
> middle).
>
> Use reciprocal_scale() here too to take the entire hash value into
> account and improve flow dispersion across the GRO hash buckets.
>
> Signed-off-by: Alexander Lobakin <[email protected]>
> ---
> net/core/dev.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 65d9e7d9d1e8..bd7c9ba54623 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -5952,7 +5952,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
>
> static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
> {
> - u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
> + u32 bucket = reciprocal_scale(skb_get_hash_raw(skb), GRO_HASH_BUCKETS);

This is going to use the 3 high-order bits instead of the 3 low-order
bits: with GRO_HASH_BUCKETS == 8, reciprocal_scale(hash, 8) is
((u64)hash * 8) >> 32, i.e. hash >> 29.
Now, had you used hash_32(skb_get_hash_raw(skb), 3), you could have
claimed to use "more bits".

Toeplitz already shuffles stuff.

Adding a multiply here seems not needed.

Please provide experimental results, because this looks unnecessary to me.

2021-03-12 16:48:49

by Eric Dumazet

Subject: Re: [PATCH net-next 2/4] gro: don't dereference napi->gro_hash[x] multiple times in dev_gro_receive()

On Fri, Mar 12, 2021 at 5:22 PM Alexander Lobakin <[email protected]> wrote:
>
> The GRO bucket index doesn't change throughout the function.
> Store a pointer to the corresponding bucket on the stack once and
> use it instead of dereferencing napi->gro_hash[bucket] again and
> again.
>
> Signed-off-by: Alexander Lobakin <[email protected]>
> ---
> net/core/dev.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index adc42ba7ffd8..ee124aecb8a2 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -5957,6 +5957,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
> static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
> {
> u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
> + struct gro_list *gro_list = &napi->gro_hash[bucket];
> struct list_head *head = &offload_base;
> struct packet_offload *ptype;
> __be16 type = skb->protocol;
> @@ -6024,7 +6025,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> if (pp) {
> skb_list_del_init(pp);
> napi_gro_complete(napi, pp);
> - napi->gro_hash[bucket].count--;
> + gro_list->count--;
> }
>
> if (same_flow)
> @@ -6033,10 +6034,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> if (NAPI_GRO_CB(skb)->flush)
> goto normal;
>
> - if (unlikely(napi->gro_hash[bucket].count >= MAX_GRO_SKBS)) {
> + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) {
> gro_flush_oldest(napi, gro_head);
> } else {
> - napi->gro_hash[bucket].count++;
> + gro_list->count++;
> }
> NAPI_GRO_CB(skb)->count = 1;
> NAPI_GRO_CB(skb)->age = jiffies;
> @@ -6050,7 +6051,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> if (grow > 0)
> gro_pull_from_frag0(skb, grow);
> ok:
> - if (napi->gro_hash[bucket].count) {
> + if (gro_list->count) {
> if (!test_bit(bucket, &napi->gro_bitmask))
> __set_bit(bucket, &napi->gro_bitmask);
> } else if (test_bit(bucket, &napi->gro_bitmask)) {
> --
> 2.30.2
>
>

This adds more register pressure; do you have precise measurements
confirming this change is a win?

Presumably the compiler should be able to optimize the code just fine,
since it can see that @bucket does not change.

2021-03-12 18:30:05

by Alexander Lobakin

Subject: Re: [PATCH net-next 4/4] gro: improve flow distribution across GRO buckets in dev_gro_receive()

From: Eric Dumazet <[email protected]>
Date: Fri, 12 Mar 2021 17:33:53 +0100

> On Fri, Mar 12, 2021 at 5:22 PM Alexander Lobakin <[email protected]> wrote:
> >
> > Most of the functions that "convert" a hash value into an index
> > (when RPS is configured, XPS is not configured, etc.) apply
> > reciprocal_scale() to it. Its logic is simple but fair, and it
> > accounts for the entire input value.
> > The 'hash & (GRO_HASH_BUCKETS - 1)' expression, on the other hand,
> > uses only the 3 least significant bits of the value, which is far
> > from optimal (especially for XOR RSS hashers, where the hashes of
> > two different flows may differ by only 1 bit somewhere in the
> > middle).
> >
> > Use reciprocal_scale() here too to take the entire hash value into
> > account and improve flow dispersion across the GRO hash buckets.
> >
> > Signed-off-by: Alexander Lobakin <[email protected]>
> > ---
> > net/core/dev.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 65d9e7d9d1e8..bd7c9ba54623 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -5952,7 +5952,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
> >
> > static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
> > {
> > - u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
> > + u32 bucket = reciprocal_scale(skb_get_hash_raw(skb), GRO_HASH_BUCKETS);
>
> This is going to use the 3 high-order bits instead of the 3 low-order bits.

We-e-ell, seems like it.

> Now, had you used hash_32(skb_get_hash_raw(skb), 3), you could have
> claimed to use "more bits".

Nice suggestion, I'll try it. If there are no visible improvements,
I'll just drop this one.
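
For reference, the hash_32() variant would be something like this
(untested; hash_32() from include/linux/hash.h multiplies by
GOLDEN_RATIO_32 before taking the top bits, so every input bit can
affect the result):

	u32 bucket = hash_32(skb_get_hash_raw(skb), ilog2(GRO_HASH_BUCKETS));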

> Toeplitz already shuffles stuff.

So do CRC and others, but I feel like we shouldn't rely solely on
the hardware.

> Adding a multiply here seems not needed.
>
> Please provide experimental results, because this looks unnecessary to me.

Thanks,
Al

2021-03-12 18:39:01

by Alexander Lobakin

Subject: Re: [PATCH net-next 2/4] gro: don't dereference napi->gro_hash[x] multiple times in dev_gro_receive()

From: Eric Dumazet <[email protected]>
Date: Fri, 12 Mar 2021 17:47:04 +0100

> On Fri, Mar 12, 2021 at 5:22 PM Alexander Lobakin <[email protected]> wrote:
> >
> > The GRO bucket index doesn't change throughout the function.
> > Store a pointer to the corresponding bucket on the stack once and
> > use it instead of dereferencing napi->gro_hash[bucket] again and
> > again.
> >
> > Signed-off-by: Alexander Lobakin <[email protected]>
> > ---
> > net/core/dev.c | 9 +++++----
> > 1 file changed, 5 insertions(+), 4 deletions(-)
> >
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index adc42ba7ffd8..ee124aecb8a2 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -5957,6 +5957,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
> > static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
> > {
> > u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
> > + struct gro_list *gro_list = &napi->gro_hash[bucket];
> > struct list_head *head = &offload_base;
> > struct packet_offload *ptype;
> > __be16 type = skb->protocol;
> > @@ -6024,7 +6025,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > if (pp) {
> > skb_list_del_init(pp);
> > napi_gro_complete(napi, pp);
> > - napi->gro_hash[bucket].count--;
> > + gro_list->count--;
> > }
> >
> > if (same_flow)
> > @@ -6033,10 +6034,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > if (NAPI_GRO_CB(skb)->flush)
> > goto normal;
> >
> > - if (unlikely(napi->gro_hash[bucket].count >= MAX_GRO_SKBS)) {
> > + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) {
> > gro_flush_oldest(napi, gro_head);
> > } else {
> > - napi->gro_hash[bucket].count++;
> > + gro_list->count++;
> > }
> > NAPI_GRO_CB(skb)->count = 1;
> > NAPI_GRO_CB(skb)->age = jiffies;
> > @@ -6050,7 +6051,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
> > if (grow > 0)
> > gro_pull_from_frag0(skb, grow);
> > ok:
> > - if (napi->gro_hash[bucket].count) {
> > + if (gro_list->count) {
> > if (!test_bit(bucket, &napi->gro_bitmask))
> > __set_bit(bucket, &napi->gro_bitmask);
> > } else if (test_bit(bucket, &napi->gro_bitmask)) {
> > --
> > 2.30.2
> >
> >
>
> This adds more register pressure; do you have precise measurements
> confirming this change is a win?
>
> Presumably the compiler should be able to optimize the code just fine,
> since it can see that @bucket does not change.

This is mostly (if not purely) cosmetic; I don't think it changes
anything at all for most sane compilers.

Regarding registers: since @gro_list and @gro_head are pretty much
the same, we could drop @gro_head in favour of @gro_list and just
use @gro_list->list instead, roughly as sketched below.
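
A rough, untested sketch of that:

	struct gro_list *gro_list = &napi->gro_hash[bucket];

	gro_list_prepare(&gro_list->list, skb);
	...
	gro_flush_oldest(napi, &gro_list->list);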

Al