In preparation for caching and reusing skbuff_heads, open-code
__build_skb() inside __napi_alloc_skb(), factoring out the
skbuff_head allocation itself.
Note that the return value of __build_skb_around() is not checked,
since it always returns the skb it was given.
Signed-off-by: Alexander Lobakin <[email protected]>
---
net/core/skbuff.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7626a33cce59..dc3300dc2ac4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -485,6 +485,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
}
EXPORT_SYMBOL(__netdev_alloc_skb);
+static struct sk_buff *napi_skb_cache_get(struct napi_alloc_cache *nc)
+{
+ return kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+}
+
/**
* __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
* @napi: napi instance this buffer was allocated for
@@ -525,12 +530,15 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (unlikely(!data))
return NULL;
- skb = __build_skb(data, len);
+ skb = napi_skb_cache_get(nc);
if (unlikely(!skb)) {
skb_free_frag(data);
return NULL;
}
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, len);
+
if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
--
2.30.0
Instead of calling kmem_cache_alloc() every time when building a NAPI
skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
this cache was only used for bulk-freeing skbuff_heads consumed via
napi_consume_skb() or __kfree_skb_defer().
Typical path is:
- skb is queued for freeing from driver or stack, its skbuff_head
goes into the cache instead of immediate freeing;
- driver or stack requests NAPI skb allocation, an skbuff_head is
taken from the cache instead of allocation.
Corner cases:
- if it's empty on skb allocation, bulk-allocate the first half;
- if it's full on skb consuming, bulk-wipe the second half.
Also try to balance its size after completing network softirqs
(__kfree_skb_flush()).
prefetchw() on CONFIG_SLUB is dropped since it makes no sense anymore.
Suggested-by: Edward Cree <[email protected]>
Signed-off-by: Alexander Lobakin <[email protected]>
---
net/core/skbuff.c | 54 ++++++++++++++++++++++++++++++-----------------
1 file changed, 35 insertions(+), 19 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dc3300dc2ac4..f42a3a04b918 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -364,6 +364,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
EXPORT_SYMBOL(build_skb_around);
#define NAPI_SKB_CACHE_SIZE 64
+#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
struct napi_alloc_cache {
struct page_frag_cache page;
@@ -487,7 +488,15 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
static struct sk_buff *napi_skb_cache_get(struct napi_alloc_cache *nc)
{
- return kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ if (unlikely(!nc->skb_count))
+ nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
+ GFP_ATOMIC,
+ NAPI_SKB_CACHE_HALF,
+ nc->skb_cache);
+ if (unlikely(!nc->skb_count))
+ return NULL;
+
+ return nc->skb_cache[--nc->skb_count];
}
/**
@@ -867,40 +876,47 @@ void __consume_stateless_skb(struct sk_buff *skb)
void __kfree_skb_flush(void)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ size_t count;
+ void **ptr;
+
+ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_HALF))
+ return;
+
+ if (nc->skb_count > NAPI_SKB_CACHE_HALF) {
+ count = nc->skb_count - NAPI_SKB_CACHE_HALF;
+ ptr = nc->skb_cache + NAPI_SKB_CACHE_HALF;
- /* flush skb_cache if containing objects */
- if (nc->skb_count) {
- kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
- nc->skb_cache);
- nc->skb_count = 0;
+ kmem_cache_free_bulk(skbuff_head_cache, count, ptr);
+ nc->skb_count = NAPI_SKB_CACHE_HALF;
+ } else {
+ count = NAPI_SKB_CACHE_HALF - nc->skb_count;
+ ptr = nc->skb_cache + nc->skb_count;
+
+ nc->skb_count += kmem_cache_alloc_bulk(skbuff_head_cache,
+ GFP_ATOMIC, count,
+ ptr);
}
}
-static inline void _kfree_skb_defer(struct sk_buff *skb)
+static void napi_skb_cache_put(struct sk_buff *skb)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
/* drop skb->head and call any destructors for packet */
skb_release_all(skb);
- /* record skb to CPU local list */
nc->skb_cache[nc->skb_count++] = skb;
-#ifdef CONFIG_SLUB
- /* SLUB writes into objects when freeing */
- prefetchw(skb);
-#endif
-
- /* flush skb_cache if it is filled */
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
- kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
- nc->skb_cache);
- nc->skb_count = 0;
+ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
+ nc->skb_cache + NAPI_SKB_CACHE_HALF);
+ nc->skb_count = NAPI_SKB_CACHE_HALF;
}
}
+
void __kfree_skb_defer(struct sk_buff *skb)
{
- _kfree_skb_defer(skb);
+ napi_skb_cache_put(skb);
}
void napi_consume_skb(struct sk_buff *skb, int budget)
@@ -925,7 +941,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- _kfree_skb_defer(skb);
+ napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
--
2.30.0
Instead of freeing them immediately, recycle GRO_MERGED_FREE skbs into
the NAPI skb cache. This is safe because napi_gro_receive() and
napi_gro_frags() are called only from NAPI softirq context.
Since many drivers call napi_alloc_skb()/napi_get_frags() on their
receive path, this is especially useful.
Signed-off-by: Alexander Lobakin <[email protected]>
---
include/linux/skbuff.h | 1 +
net/core/dev.c | 9 +--------
net/core/skbuff.c | 12 +++++++++---
3 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7a057b1f1eb8..507f1598e446 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2888,6 +2888,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget);
void __kfree_skb_flush(void);
void __kfree_skb_defer(struct sk_buff *skb);
+void napi_skb_free_stolen_head(struct sk_buff *skb);
/**
* __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index e4d77c8abe76..c28f0d601378 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6054,13 +6054,6 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
}
EXPORT_SYMBOL(gro_find_complete_by_type);
-static void napi_skb_free_stolen_head(struct sk_buff *skb)
-{
- skb_dst_drop(skb);
- skb_ext_put(skb);
- kmem_cache_free(skbuff_head_cache, skb);
-}
-
static gro_result_t napi_skb_finish(struct napi_struct *napi,
struct sk_buff *skb,
gro_result_t ret)
@@ -6074,7 +6067,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
napi_skb_free_stolen_head(skb);
else
- __kfree_skb(skb);
+ __kfree_skb_defer(skb);
break;
case GRO_HELD:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f42a3a04b918..bf6f92f1f4c7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -902,9 +902,6 @@ static void napi_skb_cache_put(struct sk_buff *skb)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- /* drop skb->head and call any destructors for packet */
- skb_release_all(skb);
-
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
@@ -916,6 +913,14 @@ static void napi_skb_cache_put(struct sk_buff *skb)
void __kfree_skb_defer(struct sk_buff *skb)
{
+ skb_release_all(skb);
+ napi_skb_cache_put(skb);
+}
+
+void napi_skb_free_stolen_head(struct sk_buff *skb)
+{
+ skb_dst_drop(skb);
+ skb_ext_put(skb);
napi_skb_cache_put(skb);
}
@@ -941,6 +946,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
+ skb_release_all(skb);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
--
2.30.0
On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <[email protected]> wrote:
>
> Instead of calling kmem_cache_alloc() every time when building a NAPI
> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
> this cache was only used for bulk-freeing skbuff_heads consumed via
> napi_consume_skb() or __kfree_skb_defer().
>
> Typical path is:
> - skb is queued for freeing from driver or stack, its skbuff_head
> goes into the cache instead of immediate freeing;
> - driver or stack requests NAPI skb allocation, an skbuff_head is
> taken from the cache instead of allocation.
>
> Corner cases:
> - if it's empty on skb allocation, bulk-allocate the first half;
> - if it's full on skb consuming, bulk-wipe the second half.
>
> Also try to balance its size after completing network softirqs
> (__kfree_skb_flush()).
I do not see the point of doing this rebalance (especially if we do not change
its name describing its purpose more accurately).
For moderate load, we will have a reduced bulk size (typically one or two).
Number of skbs in the cache is in [0, 64[ , there is really no risk of
letting skbs there for a long period of time.
(32 * sizeof(sk_buff) = 8192)
I would personally get rid of this function completely.
Also it seems you missed my KASAN support request ?
I guess this is a matter of using kasan_unpoison_range(), we can ask for help.
>
> prefetchw() on CONFIG_SLUB is dropped since it makes no sense anymore.
>
> Suggested-by: Edward Cree <[email protected]>
> Signed-off-by: Alexander Lobakin <[email protected]>
> ---
> net/core/skbuff.c | 54 ++++++++++++++++++++++++++++++-----------------
> 1 file changed, 35 insertions(+), 19 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index dc3300dc2ac4..f42a3a04b918 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -364,6 +364,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
> EXPORT_SYMBOL(build_skb_around);
>
> #define NAPI_SKB_CACHE_SIZE 64
> +#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
>
> struct napi_alloc_cache {
> struct page_frag_cache page;
> @@ -487,7 +488,15 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
>
> static struct sk_buff *napi_skb_cache_get(struct napi_alloc_cache *nc)
> {
> - return kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
> + if (unlikely(!nc->skb_count))
> + nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
> + GFP_ATOMIC,
> + NAPI_SKB_CACHE_HALF,
> + nc->skb_cache);
> + if (unlikely(!nc->skb_count))
> + return NULL;
> +
> + return nc->skb_cache[--nc->skb_count];
> }
>
> /**
> @@ -867,40 +876,47 @@ void __consume_stateless_skb(struct sk_buff *skb)
> void __kfree_skb_flush(void)
> {
> struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
> + size_t count;
> + void **ptr;
> +
> + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_HALF))
> + return;
> +
> + if (nc->skb_count > NAPI_SKB_CACHE_HALF) {
> + count = nc->skb_count - NAPI_SKB_CACHE_HALF;
> + ptr = nc->skb_cache + NAPI_SKB_CACHE_HALF;
>
> - /* flush skb_cache if containing objects */
> - if (nc->skb_count) {
> - kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
> - nc->skb_cache);
> - nc->skb_count = 0;
> + kmem_cache_free_bulk(skbuff_head_cache, count, ptr);
> + nc->skb_count = NAPI_SKB_CACHE_HALF;
> + } else {
> + count = NAPI_SKB_CACHE_HALF - nc->skb_count;
> + ptr = nc->skb_cache + nc->skb_count;
> +
> + nc->skb_count += kmem_cache_alloc_bulk(skbuff_head_cache,
> + GFP_ATOMIC, count,
> + ptr);
> }
> }
>
From: Eric Dumazet <[email protected]>
Date: Wed, 13 Jan 2021 15:36:05 +0100
> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <[email protected]> wrote:
>>
>> Instead of calling kmem_cache_alloc() every time when building a NAPI
>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
>> this cache was only used for bulk-freeing skbuff_heads consumed via
>> napi_consume_skb() or __kfree_skb_defer().
>>
>> Typical path is:
>> - skb is queued for freeing from driver or stack, its skbuff_head
>> goes into the cache instead of immediate freeing;
>> - driver or stack requests NAPI skb allocation, an skbuff_head is
>> taken from the cache instead of allocation.
>>
>> Corner cases:
>> - if it's empty on skb allocation, bulk-allocate the first half;
>> - if it's full on skb consuming, bulk-wipe the second half.
>>
>> Also try to balance its size after completing network softirqs
>> (__kfree_skb_flush()).
>
> I do not see the point of doing this rebalance (especially if we do not change
> its name describing its purpose more accurately).
>
> For moderate load, we will have a reduced bulk size (typically one or two).
> Number of skbs in the cache is in [0, 64[ , there is really no risk of
> letting skbs there for a long period of time.
> (32 * sizeof(sk_buff) = 8192)
> I would personally get rid of this function completely.
When I had a cache of 128 entries, I had worse results without this
function. But seems like I forgot to retest when I switched to the
original size of 64.
I also thought about removing this function entirely, will test.
> Also it seems you missed my KASAN support request ?
> I guess this is a matter of using kasan_unpoison_range(), we can ask for help.
I saw your request, but don't see a reason for doing this.
We are not caching already freed skbuff_heads. They don't get
kmem_cache_freed before getting into local cache. KASAN poisons
them no earlier than at kmem_cache_free() (or did I miss something?).
heads being cached just get rid of all references and at the moment
of dropping to the cache they are pretty the same as if they were
allocated.
I also remind that only skbs that are caught by napi_consume_skb() or
__kfree_skb_defer() are getting into skb_cache, not every single one.
Regarding other emails:

1. NUMA awareness.

napi_alloc_cache is percpu, so we're partly protected already. The only
thing that can happen is that napi_consume_skb() is called for an skb
whose head was allocated on a remote node, and that head is then handed
out again by napi_alloc_skb() (with no bulk-wipe in between).
This can only occur when the NAPI polling cycle cleaning up the
completion/send queue(s) is scheduled on a CPU far away from the one(s)
cleaning up the receive queue(s).
That is very unlikely to happen in practice, but...
One way to handle it inside napi_skb_cache_get() would be:

	skb = nc->skb_cache[--nc->skb_count];
	if (unlikely(pfn_to_nid(virt_to_pfn(skb)) != numa_mem_id())) {
		kmem_cache_free(skbuff_head_cache, skb);
		skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
	}

	return skb;

The whole condition is optimized out on !CONFIG_NUMA, as pfn_to_nid()
and numa_mem_id() are compile-time constants (0) there, and it doesn't
break the existing bulk-freeing.
2. Where the optimizations come from.

Not only from bulk allocations, but also from the shortcut:
napi_consume_skb()/__kfree_skb_defer() -> skb_cache -> napi_alloc_skb();
napi_alloc_skb() gets a "new" head straight from the cache without
calling into the MM layer at all.
I'm aware that kmem_cache has its own percpu caches, but the same is
true for the page allocator etc., and that doesn't prevent things like
page_frag_cache or page_pool from recycling pages and fragments
directly instead of going through the MM layer.
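To illustrate the shortcut with a purely hypothetical driver (struct
foo_ring, the foo_* helpers and FOO_RX_BUF_LEN below are made up; only
napi_consume_skb(), napi_alloc_skb(), napi_gro_receive() and
napi_complete_done() are real APIs): both sides of the shortcut sit in
the same softirq, so heads released on Tx completion can be handed
straight back on Rx:

static int foo_napi_poll(struct napi_struct *napi, int budget)
{
	struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
	struct sk_buff *skb;
	int done = 0;

	/* Tx completion: heads are stashed in napi_alloc_cache.skb_cache */
	while ((skb = foo_tx_complete(ring)))
		napi_consume_skb(skb, budget);

	/* Rx: napi_alloc_skb() takes heads back from the same per-CPU
	 * cache, so no kmem_cache_alloc() while the cache is non-empty
	 */
	while (done < budget && foo_rx_pending(ring)) {
		skb = napi_alloc_skb(napi, FOO_RX_BUF_LEN);
		if (unlikely(!skb))
			break;

		foo_rx_copy_frame(ring, skb); /* skb_put() + copy + eth_type_trans() */
		napi_gro_receive(napi, skb);
		done++;
	}

	if (done < budget)
		napi_complete_done(napi, done);

	return done;
}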
>> prefetchw() on CONFIG_SLUB is dropped since it makes no sense anymore.
>>
>> Suggested-by: Edward Cree <[email protected]>
>> Signed-off-by: Alexander Lobakin <[email protected]>
>> ---
>> net/core/skbuff.c | 54 ++++++++++++++++++++++++++++++-----------------
>> 1 file changed, 35 insertions(+), 19 deletions(-)
>>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index dc3300dc2ac4..f42a3a04b918 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -364,6 +364,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb,
>> EXPORT_SYMBOL(build_skb_around);
>>
>> #define NAPI_SKB_CACHE_SIZE 64
>> +#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
>>
>> struct napi_alloc_cache {
>> struct page_frag_cache page;
>> @@ -487,7 +488,15 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
>>
>> static struct sk_buff *napi_skb_cache_get(struct napi_alloc_cache *nc)
>> {
>> - return kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
>> + if (unlikely(!nc->skb_count))
>> + nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
>> + GFP_ATOMIC,
>> + NAPI_SKB_CACHE_HALF,
>> + nc->skb_cache);
>> + if (unlikely(!nc->skb_count))
>> + return NULL;
>> +
>> + return nc->skb_cache[--nc->skb_count];
>> }
>>
>> /**
>> @@ -867,40 +876,47 @@ void __consume_stateless_skb(struct sk_buff *skb)
>> void __kfree_skb_flush(void)
>> {
>> struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
>> + size_t count;
>> + void **ptr;
>> +
>> + if (unlikely(nc->skb_count == NAPI_SKB_CACHE_HALF))
>> + return;
>> +
>> + if (nc->skb_count > NAPI_SKB_CACHE_HALF) {
>> + count = nc->skb_count - NAPI_SKB_CACHE_HALF;
>> + ptr = nc->skb_cache + NAPI_SKB_CACHE_HALF;
>>
>> - /* flush skb_cache if containing objects */
>> - if (nc->skb_count) {
>> - kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
>> - nc->skb_cache);
>> - nc->skb_count = 0;
>> + kmem_cache_free_bulk(skbuff_head_cache, count, ptr);
>> + nc->skb_count = NAPI_SKB_CACHE_HALF;
>> + } else {
>> + count = NAPI_SKB_CACHE_HALF - nc->skb_count;
>> + ptr = nc->skb_cache + nc->skb_count;
>> +
>> + nc->skb_count += kmem_cache_alloc_bulk(skbuff_head_cache,
>> + GFP_ATOMIC, count,
>> + ptr);
>> }
>> }
>>
Al
On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <[email protected]> wrote:
>
> From: Eric Dumazet <[email protected]>
> Date: Wed, 13 Jan 2021 15:36:05 +0100
>
> > On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <[email protected]> wrote:
> >>
> >> Instead of calling kmem_cache_alloc() every time when building a NAPI
> >> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
> >> this cache was only used for bulk-freeing skbuff_heads consumed via
> >> napi_consume_skb() or __kfree_skb_defer().
> >>
> >> Typical path is:
> >> - skb is queued for freeing from driver or stack, its skbuff_head
> >> goes into the cache instead of immediate freeing;
> >> - driver or stack requests NAPI skb allocation, an skbuff_head is
> >> taken from the cache instead of allocation.
> >>
> >> Corner cases:
> >> - if it's empty on skb allocation, bulk-allocate the first half;
> >> - if it's full on skb consuming, bulk-wipe the second half.
> >>
> >> Also try to balance its size after completing network softirqs
> >> (__kfree_skb_flush()).
> >
> > I do not see the point of doing this rebalance (especially if we do not change
> > its name describing its purpose more accurately).
> >
> > For moderate load, we will have a reduced bulk size (typically one or two).
> > Number of skbs in the cache is in [0, 64[ , there is really no risk of
> > letting skbs there for a long period of time.
> > (32 * sizeof(sk_buff) = 8192)
> > I would personally get rid of this function completely.
>
> When I had a cache of 128 entries, I had worse results without this
> function. But seems like I forgot to retest when I switched to the
> original size of 64.
> I also thought about removing this function entirely, will test.
>
> > Also it seems you missed my KASAN support request ?
> > I guess this is a matter of using kasan_unpoison_range(), we can ask for help.
>
> I saw your request, but don't see a reason for doing this.
> We are not caching already freed skbuff_heads. They don't get
> kmem_cache_freed before getting into local cache. KASAN poisons
> them no earlier than at kmem_cache_free() (or did I miss something?).
> heads being cached just get rid of all references and at the moment
> of dropping to the cache they are pretty the same as if they were
> allocated.
KASAN should not report false positives in this case.
But I think Eric meant preventing false negatives. If we kmalloc 17
bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.
But we put that data into 128-byte blocks, KASAN will miss
out-of-bounds accesses beyond 17 bytes up to 128 bytes.
The same holds for "logical" use-after-frees when object is free, but
not freed into slab.
An important custom cache should use annotations like
kasan_poison_object_data/kasan_unpoison_range.
From: Dmitry Vyukov <[email protected]>
Date: Thu, 14 Jan 2021 12:47:31 +0100
> On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <[email protected]> wrote:
>>
>> From: Eric Dumazet <[email protected]>
>> Date: Wed, 13 Jan 2021 15:36:05 +0100
>>
>>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <[email protected]> wrote:
>>>>
>>>> Instead of calling kmem_cache_alloc() every time when building a NAPI
>>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
>>>> this cache was only used for bulk-freeing skbuff_heads consumed via
>>>> napi_consume_skb() or __kfree_skb_defer().
>>>>
>>>> Typical path is:
>>>> - skb is queued for freeing from driver or stack, its skbuff_head
>>>> goes into the cache instead of immediate freeing;
>>>> - driver or stack requests NAPI skb allocation, an skbuff_head is
>>>> taken from the cache instead of allocation.
>>>>
>>>> Corner cases:
>>>> - if it's empty on skb allocation, bulk-allocate the first half;
>>>> - if it's full on skb consuming, bulk-wipe the second half.
>>>>
>>>> Also try to balance its size after completing network softirqs
>>>> (__kfree_skb_flush()).
>>>
>>> I do not see the point of doing this rebalance (especially if we do not change
>>> its name describing its purpose more accurately).
>>>
>>> For moderate load, we will have a reduced bulk size (typically one or two).
>>> Number of skbs in the cache is in [0, 64[ , there is really no risk of
>>> letting skbs there for a long period of time.
>>> (32 * sizeof(sk_buff) = 8192)
>>> I would personally get rid of this function completely.
>>
>> When I had a cache of 128 entries, I had worse results without this
>> function. But seems like I forgot to retest when I switched to the
>> original size of 64.
>> I also thought about removing this function entirely, will test.
>>
>>> Also it seems you missed my KASAN support request ?
>> I guess this is a matter of using kasan_unpoison_range(), we can ask for help.
>>
>> I saw your request, but don't see a reason for doing this.
>> We are not caching already freed skbuff_heads. They don't get
>> kmem_cache_freed before getting into local cache. KASAN poisons
>> them no earlier than at kmem_cache_free() (or did I miss something?).
>> heads being cached just get rid of all references and at the moment
>> of dropping to the cache they are pretty the same as if they were
>> allocated.
>
> KASAN should not report false positives in this case.
> But I think Eric meant preventing false negatives. If we kmalloc 17
> bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.
> But we put that data into 128-byte blocks, KASAN will miss
> out-of-bounds accesses beyond 17 bytes up to 128 bytes.
> The same holds for "logical" use-after-frees when object is free, but
> not freed into slab.
>
> An important custom cache should use annotations like
> kasan_poison_object_data/kasan_unpoison_range.
As I understand, I should
kasan_poison_object_data(skbuff_head_cache, skb) and then
kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the
cache?
Thanks,
Al
On Thu, Jan 14, 2021 at 1:44 PM Alexander Lobakin <[email protected]> wrote:
>
> From: Dmitry Vyukov <[email protected]>
> Date: Thu, 14 Jan 2021 12:47:31 +0100
>
> > On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <[email protected]> wrote:
> >>
> >> From: Eric Dumazet <[email protected]>
> >> Date: Wed, 13 Jan 2021 15:36:05 +0100
> >>
> >>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <[email protected]> wrote:
> >>>>
> >>>> Instead of calling kmem_cache_alloc() every time when building a NAPI
> >>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
> >>>> this cache was only used for bulk-freeing skbuff_heads consumed via
> >>>> napi_consume_skb() or __kfree_skb_defer().
> >>>>
> >>>> Typical path is:
> >>>> - skb is queued for freeing from driver or stack, its skbuff_head
> >>>> goes into the cache instead of immediate freeing;
> >>>> - driver or stack requests NAPI skb allocation, an skbuff_head is
> >>>> taken from the cache instead of allocation.
> >>>>
> >>>> Corner cases:
> >>>> - if it's empty on skb allocation, bulk-allocate the first half;
> >>>> - if it's full on skb consuming, bulk-wipe the second half.
> >>>>
> >>>> Also try to balance its size after completing network softirqs
> >>>> (__kfree_skb_flush()).
> >>>
> >>> I do not see the point of doing this rebalance (especially if we do not change
> >>> its name describing its purpose more accurately).
> >>>
> >>> For moderate load, we will have a reduced bulk size (typically one or two).
> >>> Number of skbs in the cache is in [0, 64[ , there is really no risk of
> >>> letting skbs there for a long period of time.
> >>> (32 * sizeof(sk_buff) = 8192)
> >>> I would personally get rid of this function completely.
> >>
> >> When I had a cache of 128 entries, I had worse results without this
> >> function. But seems like I forgot to retest when I switched to the
> >> original size of 64.
> >> I also thought about removing this function entirely, will test.
> >>
> >>> Also it seems you missed my KASAN support request ?
> >> I guess this is a matter of using kasan_unpoison_range(), we can ask for help.
> >>
> >> I saw your request, but don't see a reason for doing this.
> >> We are not caching already freed skbuff_heads. They don't get
> >> kmem_cache_freed before getting into local cache. KASAN poisons
> >> them no earlier than at kmem_cache_free() (or did I miss something?).
> >> heads being cached just get rid of all references and at the moment
> >> of dropping to the cache they are pretty the same as if they were
> >> allocated.
> >
> > KASAN should not report false positives in this case.
> > But I think Eric meant preventing false negatives. If we kmalloc 17
> > bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.
> > But we put that data into 128-byte blocks, KASAN will miss
> > out-of-bounds accesses beyond 17 bytes up to 128 bytes.
> > The same holds for "logical" use-after-frees when object is free, but
> > not freed into slab.
> >
> > An important custom cache should use annotations like
> > kasan_poison_object_data/kasan_unpoison_range.
>
> As I understand, I should
> kasan_poison_object_data(skbuff_head_cache, skb) and then
> kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the
> cache?
I think it's the other way around. It should be _un_poisoned when used.
If it's fixed size, then unpoison_object_data should be a better fit:
https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253
On Thu, Jan 14, 2021 at 1:50 PM Dmitry Vyukov <[email protected]> wrote:
>
> On Thu, Jan 14, 2021 at 1:44 PM Alexander Lobakin <[email protected]> wrote:
> >
> > From: Dmitry Vyukov <[email protected]>
> > Date: Thu, 14 Jan 2021 12:47:31 +0100
> >
> > > On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <[email protected]> wrote:
> > >>
> > >> From: Eric Dumazet <[email protected]>
> > >> Date: Wed, 13 Jan 2021 15:36:05 +0100
> > >>
> > >>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <[email protected]> wrote:
> > >>>>
> > >>>> Instead of calling kmem_cache_alloc() every time when building a NAPI
> > >>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
> > >>>> this cache was only used for bulk-freeing skbuff_heads consumed via
> > >>>> napi_consume_skb() or __kfree_skb_defer().
> > >>>>
> > >>>> Typical path is:
> > >>>> - skb is queued for freeing from driver or stack, its skbuff_head
> > >>>> goes into the cache instead of immediate freeing;
> > >>>> - driver or stack requests NAPI skb allocation, an skbuff_head is
> > >>>> taken from the cache instead of allocation.
> > >>>>
> > >>>> Corner cases:
> > >>>> - if it's empty on skb allocation, bulk-allocate the first half;
> > >>>> - if it's full on skb consuming, bulk-wipe the second half.
> > >>>>
> > >>>> Also try to balance its size after completing network softirqs
> > >>>> (__kfree_skb_flush()).
> > >>>
> > >>> I do not see the point of doing this rebalance (especially if we do not change
> > >>> its name describing its purpose more accurately).
> > >>>
> > >>> For moderate load, we will have a reduced bulk size (typically one or two).
> > >>> Number of skbs in the cache is in [0, 64[ , there is really no risk of
> > >>> letting skbs there for a long period of time.
> > >>> (32 * sizeof(sk_buff) = 8192)
> > >>> I would personally get rid of this function completely.
> > >>
> > >> When I had a cache of 128 entries, I had worse results without this
> > >> function. But seems like I forgot to retest when I switched to the
> > >> original size of 64.
> > >> I also thought about removing this function entirely, will test.
> > >>
> > >>> Also it seems you missed my KASAN support request ?
> > >> I guess this is a matter of using kasan_unpoison_range(), we can ask for help.
> > >>
> > >> I saw your request, but don't see a reason for doing this.
> > >> We are not caching already freed skbuff_heads. They don't get
> > >> kmem_cache_freed before getting into local cache. KASAN poisons
> > >> them no earlier than at kmem_cache_free() (or did I miss something?).
> > >> heads being cached just get rid of all references and at the moment
> > >> of dropping to the cache they are pretty the same as if they were
> > >> allocated.
> > >
> > > KASAN should not report false positives in this case.
> > > But I think Eric meant preventing false negatives. If we kmalloc 17
> > > bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.
> > > But we put that data into 128-byte blocks, KASAN will miss
> > > out-of-bounds accesses beyond 17 bytes up to 128 bytes.
> > > The same holds for "logical" use-after-frees when object is free, but
> > > not freed into slab.
> > >
> > > An important custom cache should use annotations like
> > > kasan_poison_object_data/kasan_unpoison_range.
> >
> > As I understand, I should
> > kasan_poison_object_data(skbuff_head_cache, skb) and then
> > kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the
> > cache?
>
> I think it's the other way around. It should be _un_poisoned when used.
> If it's fixed size, then unpoison_object_data should be a better fit:
> https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253
Variable-size poisoning/unpoisoning would be needed for the skb data itself:
https://bugzilla.kernel.org/show_bug.cgi?id=199055
From: Dmitry Vyukov <[email protected]>
Date: Thu, 14 Jan 2021 13:50:25 +0100
> On Thu, Jan 14, 2021 at 1:44 PM Alexander Lobakin <[email protected]> wrote:
>>
>> From: Dmitry Vyukov <[email protected]>
>> Date: Thu, 14 Jan 2021 12:47:31 +0100
>>
>>> On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <[email protected]> wrote:
>>>>
>>>> From: Eric Dumazet <[email protected]>
>>>> Date: Wed, 13 Jan 2021 15:36:05 +0100
>>>>
>>>>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <[email protected]> wrote:
>>>>>>
>>>>>> Instead of calling kmem_cache_alloc() every time when building a NAPI
>>>>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
>>>>>> this cache was only used for bulk-freeing skbuff_heads consumed via
>>>>>> napi_consume_skb() or __kfree_skb_defer().
>>>>>>
>>>>>> Typical path is:
>>>>>> - skb is queued for freeing from driver or stack, its skbuff_head
>>>>>> goes into the cache instead of immediate freeing;
>>>>>> - driver or stack requests NAPI skb allocation, an skbuff_head is
>>>>>> taken from the cache instead of allocation.
>>>>>>
>>>>>> Corner cases:
>>>>>> - if it's empty on skb allocation, bulk-allocate the first half;
>>>>>> - if it's full on skb consuming, bulk-wipe the second half.
>>>>>>
>>>>>> Also try to balance its size after completing network softirqs
>>>>>> (__kfree_skb_flush()).
>>>>>
>>>>> I do not see the point of doing this rebalance (especially if we do not change
>>>>> its name describing its purpose more accurately).
>>>>>
>>>>> For moderate load, we will have a reduced bulk size (typically one or two).
>>>>> Number of skbs in the cache is in [0, 64[ , there is really no risk of
>>>>> letting skbs there for a long period of time.
>>>>> (32 * sizeof(sk_buff) = 8192)
>>>>> I would personally get rid of this function completely.
>>>>
>>>> When I had a cache of 128 entries, I had worse results without this
>>>> function. But seems like I forgot to retest when I switched to the
>>>> original size of 64.
>>>> I also thought about removing this function entirely, will test.
>>>>
>>>>> Also it seems you missed my KASAN support request ?
>>>> I guess this is a matter of using kasan_unpoison_range(), we can ask for help.
>>>>
>>>> I saw your request, but don't see a reason for doing this.
>>>> We are not caching already freed skbuff_heads. They don't get
>>>> kmem_cache_freed before getting into local cache. KASAN poisons
>>>> them no earlier than at kmem_cache_free() (or did I miss something?).
>>>> heads being cached just get rid of all references and at the moment
>>>> of dropping to the cache they are pretty the same as if they were
>>>> allocated.
>>>
>>> KASAN should not report false positives in this case.
>>> But I think Eric meant preventing false negatives. If we kmalloc 17
>>> bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.
>>> But we put that data into 128-byte blocks, KASAN will miss
>>> out-of-bounds accesses beyond 17 bytes up to 128 bytes.
>>> The same holds for "logical" use-after-frees when object is free, but
>>> not freed into slab.
>>>
>>> An important custom cache should use annotations like
>>> kasan_poison_object_data/kasan_unpoison_range.
>>
>> As I understand, I should
>> kasan_poison_object_data(skbuff_head_cache, skb) and then
>> kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the
>> cache?
>
> I think it's the other way around. It should be _un_poisoned when used.
> If it's fixed size, then unpoison_object_data should be a better fit:
> https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253
Ah, I thought of this too. But wouldn't there be a false-positive if
a poisoned skb hits kmem_cache_free_bulk(), not the allocation path?
We plan to use skb_cache for both reusing and bulk-freeing, and SLUB,
for example, might do writes into objects before freeing.
If it also should get unpoisoned before kmem_cache_free_bulk(), we'll
lose bulking as unpoisoning is performed per-object.
Al
On Thu, Jan 14, 2021 at 2:00 PM Alexander Lobakin <[email protected]> wrote:
> >>>>>> Instead of calling kmem_cache_alloc() every time when building a NAPI
> >>>>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
> >>>>>> this cache was only used for bulk-freeing skbuff_heads consumed via
> >>>>>> napi_consume_skb() or __kfree_skb_defer().
> >>>>>>
> >>>>>> Typical path is:
> >>>>>> - skb is queued for freeing from driver or stack, its skbuff_head
> >>>>>> goes into the cache instead of immediate freeing;
> >>>>>> - driver or stack requests NAPI skb allocation, an skbuff_head is
> >>>>>> taken from the cache instead of allocation.
> >>>>>>
> >>>>>> Corner cases:
> >>>>>> - if it's empty on skb allocation, bulk-allocate the first half;
> >>>>>> - if it's full on skb consuming, bulk-wipe the second half.
> >>>>>>
> >>>>>> Also try to balance its size after completing network softirqs
> >>>>>> (__kfree_skb_flush()).
> >>>>>
> >>>>> I do not see the point of doing this rebalance (especially if we do not change
> >>>>> its name describing its purpose more accurately).
> >>>>>
> >>>>> For moderate load, we will have a reduced bulk size (typically one or two).
> >>>>> Number of skbs in the cache is in [0, 64[ , there is really no risk of
> >>>>> letting skbs there for a long period of time.
> >>>>> (32 * sizeof(sk_buff) = 8192)
> >>>>> I would personally get rid of this function completely.
> >>>>
> >>>> When I had a cache of 128 entries, I had worse results without this
> >>>> function. But seems like I forgot to retest when I switched to the
> >>>> original size of 64.
> >>>> I also thought about removing this function entirely, will test.
> >>>>
> >>>>> Also it seems you missed my KASAN support request ?
> >>>> I guess this is a matter of using kasan_unpoison_range(), we can ask for help.
> >>>>
> >>>> I saw your request, but don't see a reason for doing this.
> >>>> We are not caching already freed skbuff_heads. They don't get
> >>>> kmem_cache_freed before getting into local cache. KASAN poisons
> >>>> them no earlier than at kmem_cache_free() (or did I miss something?).
> >>>> heads being cached just get rid of all references and at the moment
> >>>> of dropping to the cache they are pretty the same as if they were
> >>>> allocated.
> >>>
> >>> KASAN should not report false positives in this case.
> >>> But I think Eric meant preventing false negatives. If we kmalloc 17
> >>> bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.
> >>> But we put that data into 128-byte blocks, KASAN will miss
> >>> out-of-bounds accesses beyond 17 bytes up to 128 bytes.
> >>> The same holds for "logical" use-after-frees when object is free, but
> >>> not freed into slab.
> >>>
> >>> An important custom cache should use annotations like
> >>> kasan_poison_object_data/kasan_unpoison_range.
> >>
> >> As I understand, I should
> >> kasan_poison_object_data(skbuff_head_cache, skb) and then
> >> kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the
> >> cache?
> >
> > I think it's the other way around. It should be _un_poisoned when used.
> > If it's fixed size, then unpoison_object_data should be a better fit:
> > https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253
>
> Ah, I thought of this too. But wouldn't there be a false-positive if
> a poisoned skb hits kmem_cache_free_bulk(), not the allocation path?
> We plan to use skb_cache for both reusing and bulk-freeing, and SLUB,
> for example, might do writes into objects before freeing.
> If it also should get unpoisoned before kmem_cache_free_bulk(), we'll
> lose bulking as unpoisoning is performed per-object.
Yes, it needs to be unpoisoned before free.
Unpoison one-by-one, free in bulk. Unpoisoning is debug-only code anyway.
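So the cache put/get paths would gain per-object annotations roughly
like this (untested sketch, only meant to illustrate the poison-on-put /
unpoison-on-get / unpoison-before-bulk-free scheme discussed above,
reusing the NAPI_SKB_CACHE_HALF layout from the patch):

static void napi_skb_cache_put(struct sk_buff *skb)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
	u32 i;

	/* the head is "logically free" while it sits in the cache */
	kasan_poison_object_data(skbuff_head_cache, skb);
	nc->skb_cache[nc->skb_count++] = skb;

	if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
		/* unpoison one-by-one, free in bulk */
		for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
			kasan_unpoison_object_data(skbuff_head_cache,
						   nc->skb_cache[i]);

		kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
				     nc->skb_cache + NAPI_SKB_CACHE_HALF);
		nc->skb_count = NAPI_SKB_CACHE_HALF;
	}
}

and napi_skb_cache_get() would unpoison the head it hands out:

	skb = nc->skb_cache[--nc->skb_count];
	kasan_unpoison_object_data(skbuff_head_cache, skb);

	return skb;

(__kfree_skb_flush() would need the same per-object unpoisoning before
its kmem_cache_free_bulk() call.)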
From: Dmitry Vyukov <[email protected]>
Date: Thu, 14 Jan 2021 13:51:44 +0100
> On Thu, Jan 14, 2021 at 1:50 PM Dmitry Vyukov <[email protected]> wrote:
>>
>> On Thu, Jan 14, 2021 at 1:44 PM Alexander Lobakin <[email protected]> wrote:
>>>
>>> From: Dmitry Vyukov <[email protected]>
>>> Date: Thu, 14 Jan 2021 12:47:31 +0100
>>>
>>>> On Thu, Jan 14, 2021 at 12:41 PM Alexander Lobakin <[email protected]> wrote:
>>>>>
>>>>> From: Eric Dumazet <[email protected]>
>>>>> Date: Wed, 13 Jan 2021 15:36:05 +0100
>>>>>
>>>>>> On Wed, Jan 13, 2021 at 2:37 PM Alexander Lobakin <[email protected]> wrote:
>>>>>>>
>>>>>>> Instead of calling kmem_cache_alloc() every time when building a NAPI
>>>>>>> skb, (re)use skbuff_heads from napi_alloc_cache.skb_cache. Previously
>>>>>>> this cache was only used for bulk-freeing skbuff_heads consumed via
>>>>>>> napi_consume_skb() or __kfree_skb_defer().
>>>>>>>
>>>>>>> Typical path is:
>>>>>>> - skb is queued for freeing from driver or stack, its skbuff_head
>>>>>>> goes into the cache instead of immediate freeing;
>>>>>>> - driver or stack requests NAPI skb allocation, an skbuff_head is
>>>>>>> taken from the cache instead of allocation.
>>>>>>>
>>>>>>> Corner cases:
>>>>>>> - if it's empty on skb allocation, bulk-allocate the first half;
>>>>>>> - if it's full on skb consuming, bulk-wipe the second half.
>>>>>>>
>>>>>>> Also try to balance its size after completing network softirqs
>>>>>>> (__kfree_skb_flush()).
>>>>>>
>>>>>> I do not see the point of doing this rebalance (especially if we do not change
>>>>>> its name describing its purpose more accurately).
>>>>>>
>>>>>> For moderate load, we will have a reduced bulk size (typically one or two).
>>>>>> Number of skbs in the cache is in [0, 64[ , there is really no risk of
>>>>>> letting skbs there for a long period of time.
>>>>>> (32 * sizeof(sk_buff) = 8192)
>>>>>> I would personally get rid of this function completely.
>>>>>
>>>>> When I had a cache of 128 entries, I had worse results without this
>>>>> function. But seems like I forgot to retest when I switched to the
>>>>> original size of 64.
>>>>> I also thought about removing this function entirely, will test.
>>>>>
>>>>>> Also it seems you missed my KASAN support request ?
>>>>> I guess this is a matter of using kasan_unpoison_range(), we can ask for help.
>>>>>
>>>>> I saw your request, but don't see a reason for doing this.
>>>>> We are not caching already freed skbuff_heads. They don't get
>>>>> kmem_cache_freed before getting into local cache. KASAN poisons
>>>>> them no earlier than at kmem_cache_free() (or did I miss something?).
>>>>> heads being cached just get rid of all references and at the moment
>>>>> of dropping to the cache they are pretty the same as if they were
>>>>> allocated.
>>>>
>>>> KASAN should not report false positives in this case.
>>>> But I think Eric meant preventing false negatives. If we kmalloc 17
>>>> bytes, KASAN will detect out-of-bounds accesses beyond these 17 bytes.
>>>> But we put that data into 128-byte blocks, KASAN will miss
>>>> out-of-bounds accesses beyond 17 bytes up to 128 bytes.
>>>> The same holds for "logical" use-after-frees when object is free, but
>>>> not freed into slab.
>>>>
>>>> An important custom cache should use annotations like
>>>> kasan_poison_object_data/kasan_unpoison_range.
>>>
>>> As I understand, I should
>>> kasan_poison_object_data(skbuff_head_cache, skb) and then
>>> kasan_unpoison_range(skb, sizeof(*skb)) when putting it into the
>>> cache?
>>
>> I think it's the other way around. It should be _un_poisoned when used.
>> If it's fixed size, then unpoison_object_data should be a better fit:
>> https://elixir.bootlin.com/linux/v5.11-rc3/source/mm/kasan/common.c#L253
>
> Variable-size poisoning/unpoisoning would be needed for the skb data itself:
> https://bugzilla.kernel.org/show_bug.cgi?id=199055
This cache is for skbuff_heads only, not for entire skbs. All
linear data and frags get freed before the head hits the cache.
The cache will store skbuff_heads as if they were freshly allocated
by kmem_cache_alloc().
Al