2022-08-17 10:41:57

by Hyeonggon Yoo

Subject: [PATCH v4 10/17] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator

There is not much benefit to serving large objects in kmalloc().
Let's pass large requests to the page allocator like SLUB does, for
better maintenance of common code.

Signed-off-by: Hyeonggon Yoo <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
---
include/linux/slab.h | 23 ++++-------------
mm/slab.c | 60 +++++++++++++++++++++++++++++++-------------
mm/slab.h | 3 +++
mm/slab_common.c | 25 ++++++++++++------
mm/slub.c | 19 --------------
5 files changed, 68 insertions(+), 62 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index fd2e129fc813..4ee5b2fed164 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -243,27 +243,17 @@ static inline unsigned int arch_slab_minalign(void)

#ifdef CONFIG_SLAB
/*
- * The largest kmalloc size supported by the SLAB allocators is
- * 32 megabyte (2^25) or the maximum allocatable page order if that is
- * less than 32 MB.
- *
- * WARNING: Its not easy to increase this value since the allocators have
- * to do various tricks to work around compiler limitations in order to
- * ensure proper constant folding.
+ * SLAB and SLUB directly allocates requests fitting in to an order-1 page
+ * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
*/
-#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
- (MAX_ORDER + PAGE_SHIFT - 1) : 25)
-#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH
+#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
+#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 5
#endif
#endif

#ifdef CONFIG_SLUB
-/*
- * SLUB directly allocates requests fitting in to an order-1 page
- * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
- */
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
@@ -415,10 +405,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
if (size <= 512 * 1024) return 19;
if (size <= 1024 * 1024) return 20;
if (size <= 2 * 1024 * 1024) return 21;
- if (size <= 4 * 1024 * 1024) return 22;
- if (size <= 8 * 1024 * 1024) return 23;
- if (size <= 16 * 1024 * 1024) return 24;
- if (size <= 32 * 1024 * 1024) return 25;

if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
@@ -428,6 +414,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
/* Will never be reached. Needed because the compiler may complain */
return -1;
}
+static_assert(PAGE_SHIFT <= 20);
#define kmalloc_index(s) __kmalloc_index(s, true)
#endif /* !CONFIG_SLOB */

diff --git a/mm/slab.c b/mm/slab.c
index 8c08d7f3dead..10c9af904410 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3585,11 +3585,19 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
struct kmem_cache *cachep;
void *ret;

- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ ret = kmalloc_large_node_notrace(size, flags, node);
+
+ trace_kmalloc_node(caller, ret, NULL, size,
+ PAGE_SIZE << get_order(size),
+ flags, node);
+ return ret;
+ }
+
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
+
ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
ret = kasan_kmalloc(cachep, ret, size, flags);

@@ -3664,17 +3672,27 @@ EXPORT_SYMBOL(kmem_cache_free);

void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
{
- struct kmem_cache *s;
- size_t i;

local_irq_disable();
- for (i = 0; i < size; i++) {
+ for (int i = 0; i < size; i++) {
void *objp = p[i];
+ struct kmem_cache *s;

- if (!orig_s) /* called via kfree_bulk */
- s = virt_to_cache(objp);
- else
+ if (!orig_s) {
+ struct folio *folio = virt_to_folio(objp);
+
+ /* called via kfree_bulk */
+ if (!folio_test_slab(folio)) {
+ local_irq_enable();
+ free_large_kmalloc(folio, objp);
+ local_irq_disable();
+ continue;
+ }
+ s = folio_slab(folio)->slab_cache;
+ } else {
s = cache_from_obj(orig_s, objp);
+ }
+
if (!s)
continue;

@@ -3703,20 +3721,24 @@ void kfree(const void *objp)
{
struct kmem_cache *c;
unsigned long flags;
+ struct folio *folio;

trace_kfree(_RET_IP_, objp);

if (unlikely(ZERO_OR_NULL_PTR(objp)))
return;
- local_irq_save(flags);
- kfree_debugcheck(objp);
- c = virt_to_cache(objp);
- if (!c) {
- local_irq_restore(flags);
+
+ folio = virt_to_folio(objp);
+ if (!folio_test_slab(folio)) {
+ free_large_kmalloc(folio, (void *)objp);
return;
}
- debug_check_no_locks_freed(objp, c->object_size);

+ c = folio_slab(folio)->slab_cache;
+
+ local_irq_save(flags);
+ kfree_debugcheck(objp);
+ debug_check_no_locks_freed(objp, c->object_size);
debug_check_no_obj_freed(objp, c->object_size);
__cache_free(c, (void *)objp, _RET_IP_);
local_irq_restore(flags);
@@ -4138,15 +4160,17 @@ void __check_heap_object(const void *ptr, unsigned long n,
size_t __ksize(const void *objp)
{
struct kmem_cache *c;
- size_t size;
+ struct folio *folio;

BUG_ON(!objp);
if (unlikely(objp == ZERO_SIZE_PTR))
return 0;

- c = virt_to_cache(objp);
- size = c ? c->object_size : 0;
+ folio = virt_to_folio(objp);
+ if (!folio_test_slab(folio))
+ return folio_size(folio);

- return size;
+ c = folio_slab(folio)->slab_cache;
+ return c->object_size;
}
EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab.h b/mm/slab.h
index 40322bcf07be..381ba3e6b2a1 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -660,6 +660,9 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
print_tracking(cachep, x);
return cachep;
}
+
+void free_large_kmalloc(struct folio *folio, void *object);
+
#endif /* CONFIG_SLOB */

static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 51ccd0545816..5a2e81f42ee9 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -744,8 +744,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)

/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
- * kmalloc-32M.
+ * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
+ * kmalloc-2M.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(0, 0),
@@ -769,11 +769,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(262144, 256k),
INIT_KMALLOC_INFO(524288, 512k),
INIT_KMALLOC_INFO(1048576, 1M),
- INIT_KMALLOC_INFO(2097152, 2M),
- INIT_KMALLOC_INFO(4194304, 4M),
- INIT_KMALLOC_INFO(8388608, 8M),
- INIT_KMALLOC_INFO(16777216, 16M),
- INIT_KMALLOC_INFO(33554432, 32M)
+ INIT_KMALLOC_INFO(2097152, 2M)
};

/*
@@ -886,6 +882,21 @@ void __init create_kmalloc_caches(slab_flags_t flags)
/* Kmalloc array is now usable */
slab_state = UP;
}
+
+void free_large_kmalloc(struct folio *folio, void *object)
+{
+ unsigned int order = folio_order(folio);
+
+ if (WARN_ON_ONCE(order == 0))
+ pr_warn_once("object pointer: 0x%p\n", object);
+
+ kmemleak_free(object);
+ kasan_kfree_large(object);
+
+ mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(folio_page(folio, 0), order);
+}
#endif /* !CONFIG_SLOB */

gfp_t kmalloc_fix_flags(gfp_t flags)
diff --git a/mm/slub.c b/mm/slub.c
index 165fe87af204..a659874c5d44 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1704,12 +1704,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static __always_inline void kfree_hook(void *x)
-{
- kmemleak_free(x);
- kasan_kfree_large(x);
-}
-
static __always_inline bool slab_free_hook(struct kmem_cache *s,
void *x, bool init)
{
@@ -3550,19 +3544,6 @@ struct detached_freelist {
struct kmem_cache *s;
};

-static inline void free_large_kmalloc(struct folio *folio, void *object)
-{
- unsigned int order = folio_order(folio);
-
- if (WARN_ON_ONCE(order == 0))
- pr_warn_once("object pointer: 0x%p\n", object);
-
- kfree_hook(object);
- mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
- __free_pages(folio_page(folio, 0), order);
-}
-
/*
* This function progressively scans the array with free objects (with
* a limited look ahead) and extract objects belonging to the same
--
2.32.0
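
To make the new boundary concrete, here is a minimal userspace sketch
of which path a request takes once sizes above KMALLOC_MAX_CACHE_SIZE
go to the page allocator. It is illustrative only: the 4K page size,
the order_for() helper and the sample sizes are assumptions for the
example, not kernel code.

#include <stdio.h>

/* Assumed 4K pages; mirrors KMALLOC_SHIFT_HIGH = PAGE_SHIFT + 1 above */
#define PAGE_SHIFT		12
#define PAGE_SIZE		(1UL << PAGE_SHIFT)
#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
#define KMALLOC_MAX_CACHE_SIZE	(1UL << KMALLOC_SHIFT_HIGH)

/* Rough stand-in for get_order(): smallest order whose pages cover size */
static unsigned int order_for(unsigned long size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned long sizes[] = { 64, 4096, 8192, 8193, 1UL << 20 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		if (sizes[i] > KMALLOC_MAX_CACHE_SIZE)
			printf("%8lu bytes -> page allocator, order %u\n",
			       sizes[i], order_for(sizes[i]));
		else
			printf("%8lu bytes -> kmalloc cache\n", sizes[i]);
	}
	return 0;
}

With 4K pages the cut-off sits at 8192 bytes; kmalloc_info[] above still
ends at kmalloc-2M because PAGE_SHIFT may be as large as 20 (see the
static_assert), making 2^21 the largest possible kmalloc cache.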


2022-10-14 21:52:57

by Guenter Roeck

Subject: Re: [PATCH v4 10/17] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator

Hi,

On Wed, Aug 17, 2022 at 07:18:19PM +0900, Hyeonggon Yoo wrote:
> There is not much benefit to serving large objects in kmalloc().
> Let's pass large requests to the page allocator like SLUB does, for
> better maintenance of common code.
>
> Signed-off-by: Hyeonggon Yoo <[email protected]>
> Reviewed-by: Vlastimil Babka <[email protected]>
> ---

This patch results in a WARNING backtrace in all mips and sparc64
emulations.

------------[ cut here ]------------
WARNING: CPU: 0 PID: 0 at mm/slab_common.c:729 kmalloc_slab+0xc0/0xdc
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 6.0.0-11990-g9c9155a3509a #1
Stack : ffffffff 801b2a18 80dd0000 00000004 00000000 00000000 81023cd4 00000000
81040000 811a9930 81040000 8104a628 81101833 00000001 81023c78 00000000
00000000 00000000 80f5d858 81023b98 00000001 00000023 00000000 ffffffff
00000000 00000064 00000002 81040000 81040000 00000001 80f5d858 000002d9
00000000 00000000 80000000 80002000 00000000 00000000 00000000 00000000
...
Call Trace:
[<8010a2bc>] show_stack+0x38/0x118
[<80cf5f7c>] dump_stack_lvl+0xac/0x104
[<80130d7c>] __warn+0xe0/0x224
[<80cdba5c>] warn_slowpath_fmt+0x64/0xb8
[<8028c058>] kmalloc_slab+0xc0/0xdc

irq event stamp: 0
hardirqs last enabled at (0): [<00000000>] 0x0
hardirqs last disabled at (0): [<00000000>] 0x0
softirqs last enabled at (0): [<00000000>] 0x0
softirqs last disabled at (0): [<00000000>] 0x0
---[ end trace 0000000000000000 ]---

Guenter

2022-10-14 23:55:22

by Hyeonggon Yoo

Subject: Re: [PATCH v4 10/17] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator

On Fri, Oct 14, 2022 at 01:58:18PM -0700, Guenter Roeck wrote:
> Hi,
>
> On Wed, Aug 17, 2022 at 07:18:19PM +0900, Hyeonggon Yoo wrote:
> > There is not much benefit to serving large objects in kmalloc().
> > Let's pass large requests to the page allocator like SLUB does, for
> > better maintenance of common code.
> >
> > Signed-off-by: Hyeonggon Yoo <[email protected]>
> > Reviewed-by: Vlastimil Babka <[email protected]>
> > ---
>
> This patch results in a WARNING backtrace in all mips and sparc64
> emulations.
>
> ------------[ cut here ]------------
> WARNING: CPU: 0 PID: 0 at mm/slab_common.c:729 kmalloc_slab+0xc0/0xdc
> Modules linked in:
> CPU: 0 PID: 0 Comm: swapper Not tainted 6.0.0-11990-g9c9155a3509a #1
> Stack : ffffffff 801b2a18 80dd0000 00000004 00000000 00000000 81023cd4 00000000
> 81040000 811a9930 81040000 8104a628 81101833 00000001 81023c78 00000000
> 00000000 00000000 80f5d858 81023b98 00000001 00000023 00000000 ffffffff
> 00000000 00000064 00000002 81040000 81040000 00000001 80f5d858 000002d9
> 00000000 00000000 80000000 80002000 00000000 00000000 00000000 00000000
> ...
> Call Trace:
> [<8010a2bc>] show_stack+0x38/0x118
> [<80cf5f7c>] dump_stack_lvl+0xac/0x104
> [<80130d7c>] __warn+0xe0/0x224
> [<80cdba5c>] warn_slowpath_fmt+0x64/0xb8
> [<8028c058>] kmalloc_slab+0xc0/0xdc
>
> irq event stamp: 0
> hardirqs last enabled at (0): [<00000000>] 0x0
> hardirqs last disabled at (0): [<00000000>] 0x0
> softirqs last enabled at (0): [<00000000>] 0x0
> softirqs last disabled at (0): [<00000000>] 0x0
> ---[ end trace 0000000000000000 ]---
>
> Guenter

Hi.

Thank you so much for this report!

Hmm, so SLAB tries to find a kmalloc cache for the freelist index array
using kmalloc_slab() directly, and that becomes problematic when the
size of the array is larger than PAGE_SIZE * 2.

Will send a fix soon.

--
Thanks,
Hyeonggon

2022-10-15 05:42:09

by Hyeonggon Yoo

Subject: [PATCH] mm/slab: use kmalloc_node() for off slab freelist_idx_t array allocation

After commit d6a71648dbc0 ("mm/slab: kmalloc: pass requests larger than
order-1 page to page allocator"), SLAB passes large (> PAGE_SIZE * 2)
requests to the buddy allocator like SLUB does.

SLAB has been using kmalloc caches to allocate the freelist_idx_t array
for off-slab caches. But after that commit, freelist_size can be bigger
than KMALLOC_MAX_CACHE_SIZE.

Instead of keeping a pointer to the kmalloc cache, use kmalloc_node()
and only check whether the kmalloc cache is off-slab during
calculate_slab_order(). If freelist_size > KMALLOC_MAX_CACHE_SIZE, no
looping condition can occur, because the freelist_idx_t array is then
allocated directly from the buddy allocator.

Reported-by: Guenter Roeck <[email protected]>
Fixes: d6a71648dbc0 ("mm/slab: kmalloc: pass requests larger than order-1 page to page allocator")
Signed-off-by: Hyeonggon Yoo <[email protected]>
---

@Guenter:
This fixes the issue in my emulation.
Can you please test it in your environment?

include/linux/slab_def.h | 1 -
mm/slab.c | 37 +++++++++++++++++++------------------
2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index e24c9aff6fed..f0ffad6a3365 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -33,7 +33,6 @@ struct kmem_cache {

size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
- struct kmem_cache *freelist_cache;
unsigned int freelist_size;

/* constructor func */
diff --git a/mm/slab.c b/mm/slab.c
index a5486ff8362a..d1f6e2c64c2e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1619,7 +1619,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slab)
* although actual page can be freed in rcu context
*/
if (OFF_SLAB(cachep))
- kmem_cache_free(cachep->freelist_cache, freelist);
+ kfree(freelist);
}

/*
@@ -1671,21 +1671,27 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (flags & CFLGS_OFF_SLAB) {
struct kmem_cache *freelist_cache;
size_t freelist_size;
+ size_t freelist_cache_size;

freelist_size = num * sizeof(freelist_idx_t);
- freelist_cache = kmalloc_slab(freelist_size, 0u);
- if (!freelist_cache)
- continue;
-
- /*
- * Needed to avoid possible looping condition
- * in cache_grow_begin()
- */
- if (OFF_SLAB(freelist_cache))
- continue;
+ if (freelist_size > KMALLOC_MAX_CACHE_SIZE) {
+ freelist_cache_size = PAGE_SIZE << get_order(freelist_size);
+ } else {
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+ freelist_cache_size = freelist_cache->size;
+
+ /*
+ * Needed to avoid possible looping condition
+ * in cache_grow_begin()
+ */
+ if (OFF_SLAB(freelist_cache))
+ continue;
+ }

/* check if off slab has enough benefit */
- if (freelist_cache->size > cachep->size / 2)
+ if (freelist_cache_size > cachep->size / 2)
continue;
}

@@ -2061,11 +2067,6 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif

- if (OFF_SLAB(cachep)) {
- cachep->freelist_cache =
- kmalloc_slab(cachep->freelist_size, 0u);
- }
-
err = setup_cpu_cache(cachep, gfp);
if (err) {
__kmem_cache_release(cachep);
@@ -2292,7 +2293,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
freelist = NULL;
else if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
- freelist = kmem_cache_alloc_node(cachep->freelist_cache,
+ freelist = kmalloc_node(cachep->freelist_size,
local_flags, nodeid);
} else {
/* We will use last bytes at the slab for freelist */
--
2.32.0
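
As a rough sketch of the sizing decision in calculate_slab_order() after
this fix (illustrative only: the 4K page size, the 32-byte minimum
bucket and the helper names are assumptions, not kernel API), the cost
accounted for an off-slab freelist is either the kmalloc bucket size or
the buddy allocation size:

#include <stdio.h>

#define PAGE_SHIFT		12	/* assumed for the example */
#define PAGE_SIZE		(1UL << PAGE_SHIFT)
#define KMALLOC_MAX_CACHE_SIZE	(PAGE_SIZE * 2)

/* Smallest order whose pages cover size (rough get_order() equivalent) */
static unsigned int order_for(unsigned long size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

/* Next power-of-two bucket >= size: stand-in for the kmalloc cache lookup */
static unsigned long bucket_for(unsigned long size)
{
	unsigned long bucket = 32;	/* assumed smallest bucket */

	while (bucket < size)
		bucket <<= 1;
	return bucket;
}

int main(void)
{
	unsigned long freelist_sizes[] = { 96, 1000, 8192, 8200, 40000 };
	unsigned int i;

	for (i = 0; i < sizeof(freelist_sizes) / sizeof(freelist_sizes[0]); i++) {
		unsigned long fs = freelist_sizes[i];
		unsigned long freelist_cache_size;

		if (fs > KMALLOC_MAX_CACHE_SIZE)	/* goes to buddy */
			freelist_cache_size = PAGE_SIZE << order_for(fs);
		else					/* kmalloc cache */
			freelist_cache_size = bucket_for(fs);

		printf("freelist_size %5lu -> freelist_cache_size %6lu\n",
		       fs, freelist_cache_size);
	}
	return 0;
}

Since slab_destroy() now frees the array with plain kfree(), both cases
end up handled by the same call: after patch 10, kfree() checks whether
the folio is a slab and, if not, returns the pages through
free_large_kmalloc().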

2022-10-15 11:50:24

by Guenter Roeck

Subject: Re: [PATCH] mm/slab: use kmalloc_node() for off slab freelist_idx_t array allocation

On Sat, Oct 15, 2022 at 01:34:29PM +0900, Hyeonggon Yoo wrote:
> After commit d6a71648dbc0 ("mm/slab: kmalloc: pass requests larger than
> order-1 page to page allocator"), SLAB passes large (> PAGE_SIZE * 2)
> requests to the buddy allocator like SLUB does.
>
> SLAB has been using kmalloc caches to allocate the freelist_idx_t array
> for off-slab caches. But after that commit, freelist_size can be bigger
> than KMALLOC_MAX_CACHE_SIZE.
>
> Instead of keeping a pointer to the kmalloc cache, use kmalloc_node()
> and only check whether the kmalloc cache is off-slab during
> calculate_slab_order(). If freelist_size > KMALLOC_MAX_CACHE_SIZE, no
> looping condition can occur, because the freelist_idx_t array is then
> allocated directly from the buddy allocator.
>
> Reported-by: Guenter Roeck <[email protected]>
> Fixes: d6a71648dbc0 ("mm/slab: kmalloc: pass requests larger than order-1 page to page allocator")
> Signed-off-by: Hyeonggon Yoo <[email protected]>
> ---
>
> @Guenter:
> This fixes the issue in my emulation.
> Can you please test it in your environment?

Yes, that fixes the problem for me.

Tested-by: Guenter Roeck <[email protected]>

Thanks,
Guenter

>
> include/linux/slab_def.h | 1 -
> mm/slab.c | 37 +++++++++++++++++++------------------
> 2 files changed, 19 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
> index e24c9aff6fed..f0ffad6a3365 100644
> --- a/include/linux/slab_def.h
> +++ b/include/linux/slab_def.h
> @@ -33,7 +33,6 @@ struct kmem_cache {
>
> size_t colour; /* cache colouring range */
> unsigned int colour_off; /* colour offset */
> - struct kmem_cache *freelist_cache;
> unsigned int freelist_size;
>
> /* constructor func */
> diff --git a/mm/slab.c b/mm/slab.c
> index a5486ff8362a..d1f6e2c64c2e 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -1619,7 +1619,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slab)
> * although actual page can be freed in rcu context
> */
> if (OFF_SLAB(cachep))
> - kmem_cache_free(cachep->freelist_cache, freelist);
> + kfree(freelist);
> }
>
> /*
> @@ -1671,21 +1671,27 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
> if (flags & CFLGS_OFF_SLAB) {
> struct kmem_cache *freelist_cache;
> size_t freelist_size;
> + size_t freelist_cache_size;
>
> freelist_size = num * sizeof(freelist_idx_t);
> - freelist_cache = kmalloc_slab(freelist_size, 0u);
> - if (!freelist_cache)
> - continue;
> -
> - /*
> - * Needed to avoid possible looping condition
> - * in cache_grow_begin()
> - */
> - if (OFF_SLAB(freelist_cache))
> - continue;
> + if (freelist_size > KMALLOC_MAX_CACHE_SIZE) {
> + freelist_cache_size = PAGE_SIZE << get_order(freelist_size);
> + } else {
> + freelist_cache = kmalloc_slab(freelist_size, 0u);
> + if (!freelist_cache)
> + continue;
> + freelist_cache_size = freelist_cache->size;
> +
> + /*
> + * Needed to avoid possible looping condition
> + * in cache_grow_begin()
> + */
> + if (OFF_SLAB(freelist_cache))
> + continue;
> + }
>
> /* check if off slab has enough benefit */
> - if (freelist_cache->size > cachep->size / 2)
> + if (freelist_cache_size > cachep->size / 2)
> continue;
> }
>
> @@ -2061,11 +2067,6 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
> cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
> #endif
>
> - if (OFF_SLAB(cachep)) {
> - cachep->freelist_cache =
> - kmalloc_slab(cachep->freelist_size, 0u);
> - }
> -
> err = setup_cpu_cache(cachep, gfp);
> if (err) {
> __kmem_cache_release(cachep);
> @@ -2292,7 +2293,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
> freelist = NULL;
> else if (OFF_SLAB(cachep)) {
> /* Slab management obj is off-slab. */
> - freelist = kmem_cache_alloc_node(cachep->freelist_cache,
> + freelist = kmalloc_node(cachep->freelist_size,
> local_flags, nodeid);
> } else {
> /* We will use last bytes at the slab for freelist */
> --
> 2.32.0

2022-10-15 20:08:37

by Vlastimil Babka

Subject: Re: [PATCH v4 10/17] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator

On 10/15/22 01:48, Hyeonggon Yoo wrote:
> On Fri, Oct 14, 2022 at 01:58:18PM -0700, Guenter Roeck wrote:
>> Hi,
>>
>> On Wed, Aug 17, 2022 at 07:18:19PM +0900, Hyeonggon Yoo wrote:
>> > There is not much benefit to serving large objects in kmalloc().
>> > Let's pass large requests to the page allocator like SLUB does, for
>> > better maintenance of common code.
>> >
>> > Signed-off-by: Hyeonggon Yoo <[email protected]>
>> > Reviewed-by: Vlastimil Babka <[email protected]>
>> > ---
>>
>> This patch results in a WARNING backtrace in all mips and sparc64
>> emulations.
>>
>> ------------[ cut here ]------------
>> WARNING: CPU: 0 PID: 0 at mm/slab_common.c:729 kmalloc_slab+0xc0/0xdc
>> Modules linked in:
>> CPU: 0 PID: 0 Comm: swapper Not tainted 6.0.0-11990-g9c9155a3509a #1
>> Stack : ffffffff 801b2a18 80dd0000 00000004 00000000 00000000 81023cd4 00000000
>> 81040000 811a9930 81040000 8104a628 81101833 00000001 81023c78 00000000
>> 00000000 00000000 80f5d858 81023b98 00000001 00000023 00000000 ffffffff
>> 00000000 00000064 00000002 81040000 81040000 00000001 80f5d858 000002d9
>> 00000000 00000000 80000000 80002000 00000000 00000000 00000000 00000000
>> ...
>> Call Trace:
>> [<8010a2bc>] show_stack+0x38/0x118
>> [<80cf5f7c>] dump_stack_lvl+0xac/0x104
>> [<80130d7c>] __warn+0xe0/0x224
>> [<80cdba5c>] warn_slowpath_fmt+0x64/0xb8
>> [<8028c058>] kmalloc_slab+0xc0/0xdc
>>
>> irq event stamp: 0
>> hardirqs last enabled at (0): [<00000000>] 0x0
>> hardirqs last disabled at (0): [<00000000>] 0x0
>> softirqs last enabled at (0): [<00000000>] 0x0
>> softirqs last disabled at (0): [<00000000>] 0x0
>> ---[ end trace 0000000000000000 ]---
>>
>> Guenter
>
> Hi.
>
> Thank you so much for this report!
>
> Hmm, so SLAB tries to find a kmalloc cache for the freelist index array
> using kmalloc_slab() directly, and that becomes problematic when the
> size of the array is larger than PAGE_SIZE * 2.

Hmm interesting, did you find out how exactly that can happen in practice,
or what's special about mips and sparc64 here? Because normally
calculate_slab_order() will only go up to slab_max_order, which AFAICS can
only go up to SLAB_MAX_ORDER_HI, thus 1, unless there's a boot command line
override.

And if we have two pages for objects, surely even with small objects they
can't be smaller than freelist_idx_t, so if the number of objects fits into
two pages (order 1), then the freelist array should also fit in two pages?

Thanks,
Vlastimil

> Will send a fix soon.
>

2022-10-16 09:36:19

by Hyeonggon Yoo

Subject: Re: [PATCH v4 10/17] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator

On Sat, Oct 15, 2022 at 09:39:08PM +0200, Vlastimil Babka wrote:
> On 10/15/22 01:48, Hyeonggon Yoo wrote:
> > On Fri, Oct 14, 2022 at 01:58:18PM -0700, Guenter Roeck wrote:
> >> Hi,
> >>
> >> On Wed, Aug 17, 2022 at 07:18:19PM +0900, Hyeonggon Yoo wrote:
> >> > There is not much benefit to serving large objects in kmalloc().
> >> > Let's pass large requests to the page allocator like SLUB does, for
> >> > better maintenance of common code.
> >> >
> >> > Signed-off-by: Hyeonggon Yoo <[email protected]>
> >> > Reviewed-by: Vlastimil Babka <[email protected]>
> >> > ---
> >>
> >> This patch results in a WARNING backtrace in all mips and sparc64
> >> emulations.
> >>
> >> ------------[ cut here ]------------
> >> WARNING: CPU: 0 PID: 0 at mm/slab_common.c:729 kmalloc_slab+0xc0/0xdc
> >> Modules linked in:
> >> CPU: 0 PID: 0 Comm: swapper Not tainted 6.0.0-11990-g9c9155a3509a #1
> >> Stack : ffffffff 801b2a18 80dd0000 00000004 00000000 00000000 81023cd4 00000000
> >> 81040000 811a9930 81040000 8104a628 81101833 00000001 81023c78 00000000
> >> 00000000 00000000 80f5d858 81023b98 00000001 00000023 00000000 ffffffff
> >> 00000000 00000064 00000002 81040000 81040000 00000001 80f5d858 000002d9
> >> 00000000 00000000 80000000 80002000 00000000 00000000 00000000 00000000
> >> ...
> >> Call Trace:
> >> [<8010a2bc>] show_stack+0x38/0x118
> >> [<80cf5f7c>] dump_stack_lvl+0xac/0x104
> >> [<80130d7c>] __warn+0xe0/0x224
> >> [<80cdba5c>] warn_slowpath_fmt+0x64/0xb8
> >> [<8028c058>] kmalloc_slab+0xc0/0xdc
> >>
> >> irq event stamp: 0
> >> hardirqs last enabled at (0): [<00000000>] 0x0
> >> hardirqs last disabled at (0): [<00000000>] 0x0
> >> softirqs last enabled at (0): [<00000000>] 0x0
> >> softirqs last disabled at (0): [<00000000>] 0x0
> >> ---[ end trace 0000000000000000 ]---
> >>
> >> Guenter
> >
> > Hi.
> >
> > Thank you so much for this report!
> >
> > Hmm, so SLAB tries to find a kmalloc cache for the freelist index array
> > using kmalloc_slab() directly, and that becomes problematic when the
> > size of the array is larger than PAGE_SIZE * 2.
>
> Hmm interesting, did you find out how exactly that can happen in practice,

> or what's special about mips and sparc64 here?

IIUC, if the page size is large, the number of objects per slab is
quite large, so the possibility of failing to use an objfreelist slab
is higher, and then it tries to use an off-slab freelist.
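
As a rough illustration with hypothetical numbers (128-byte objects and
a freelist_idx_t of at least one byte; the page sizes are only examples,
not taken from the reported configs):

  64K page, order-0 slab:  num = 65536 / 128 = 512 objects
                           an on-object freelist needs >= 512 bytes,
                           which cannot fit inside one 128-byte object
   4K page, order-0 slab:  num = 4096 / 128 = 32 objects
                           an on-object freelist needs >= 32 bytes,
                           which fits easily inside one 128-byte object

So with large pages the objfreelist check fails more often, and the
cache falls back to an off-slab freelist array.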

> Because normally
> calculate_slab_order() will only go up to slab_max_order, which AFAICS can
> only go up to SLAB_MAX_ORDER_HI, thus 1, unless there's a boot command line
> override.

AFAICS, with the mips default configuration and without setting
slab_max_order, SLAB does not actually end up using a freelist index
array that is too big.

But it hits the warning anyway because of the tricky loop logic.

For example, if the condition is true at

> if (freelist_cache->size > cachep->size / 2)
> continue;

or at (before kmalloc is up, in the case of kmem_cache)

> freelist_cache = kmalloc_slab(freelist_size, 0u);
> if (!freelist_cache)
> continue;

the loop keeps increasing gfporder until 'num' becomes larger than
SLAB_OBJ_MAX_NUM, regardless of slab_max_order.

I think adding below would be more robust.

diff --git a/mm/slab.c b/mm/slab.c
index d1f6e2c64c2e..1321aca1887c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1679,7 +1679,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
} else {
freelist_cache = kmalloc_slab(freelist_size, 0u);
if (!freelist_cache)
- continue;
+ break;
freelist_cache_size = freelist_cache->size;

/*
@@ -1692,7 +1692,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,

/* check if off slab has enough benefit */
if (freelist_cache_size > cachep->size / 2)
- continue;
+ break;
}

/* Found something acceptable - save it away */


> And if we have two pages for objects, surely even with small objects they
> can't be smaller than freelist_idx_t, so if the number of objects fits into
> two pages (order 1), then the freelist array should also fit in two pages?

That's right, but under certain conditions it seems to go larger than
slab_max_order (from code inspection).
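
Spelling out the bound behind that (assuming, as you say, every object
is at least sizeof(freelist_idx_t) bytes, and that in the off-slab case
all of the slab's bytes go to objects):

  num           <= (PAGE_SIZE << gfporder) / object_size
  freelist_size  = num * sizeof(freelist_idx_t)
                <= num * object_size
                <= PAGE_SIZE << gfporder

So an order-1 slab can never need a freelist array larger than
2 * PAGE_SIZE; the warning only becomes reachable once the loop walks
gfporder past slab_max_order through the 'continue' paths above.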

>
> Thanks,
> Vlastimil
>
> > Will send a fix soon.
> >

--
Thanks,
Hyeonggon