2022-03-08 23:59:10

by Hyeonggon Yoo

Subject: [RFC PATCH v1 00/15] common kmalloc subsystem on SLAB/SLUB

Hello, this series is a cleanup of the slab common code.

After this series, the kmalloc subsystem is fully generalized
between SLAB and SLUB.

This series is not small, and some review and discussion may be needed.
But I bet you will like this! :D

Many thanks to Matthew, Marco and Vlastimil, who gave comments
on the previous series.

Any feedback will be appreciated.
Thanks!

========= series description ==========

patch 1 makes slab_alloc_node() in SLAB available for non-NUMA
configurations for further cleanup.

patches 2-8 are a cleanup of unnecessary CONFIG_TRACING/CONFIG_NUMA
ifdefs and duplicate code (and a small optimization of kmalloc_node()).

patch 9 makes SLAB pass requests larger than an order-1 page to the
page allocator. This is useful for further generalization.

patches 10-11 are a cleanup of the tracepoints. Currently there are six
tracepoints in slab: kmalloc, kmalloc_node, kmem_cache_alloc,
kmem_cache_alloc_node, kmem_cache_free and kfree.

patches 10-11 make the tracepoints print the cache name and convert
every allocation/free tracepoint to kmem_cache_alloc_node and
kmem_cache_free respectively.

patch 12 generalizes the whole kmalloc subsystem between SLAB and SLUB.

patch 13 removes kmem_cache_alloc_node_trace(), which became useless
after patch 12.

patches 14-15 are small improvements to __ksize().
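
To give an idea of the end result: after patch 12 the common kmalloc
path looks roughly like the sketch below (simplified), with __kmalloc(),
__kmalloc_node() and kmalloc_track_caller() becoming thin wrappers
around it on both SLAB and SLUB:

/* simplified sketch of the generalized path (patch 12) */
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
                                  int node, unsigned long caller)
{
        struct kmem_cache *s;
        void *ret;

        /* large requests bypass the kmalloc caches entirely */
        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
                return kmalloc_large_node(size, gfpflags, node);

        s = kmalloc_slab(size, gfpflags);
        if (unlikely(ZERO_OR_NULL_PTR(s)))
                return s;

        ret = __kmem_cache_alloc_node(s, gfpflags, node, caller);
        return kasan_kmalloc(s, ret, size, gfpflags);
}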

Hyeonggon Yoo (15):
mm/slab: cleanup slab_alloc() and slab_alloc_node()
mm/sl[auo]b: remove CONFIG_NUMA ifdefs for common functions
mm/sl[au]b: remove CONFIG_TRACING ifdefs for tracing functions
mm/sl[auo]b: fold kmalloc_order() into kmalloc_large()
mm/slub: move kmalloc_large_node() to slab_common.c
mm/slab_common: cleanup kmalloc_large()
mm/sl[au]b: kmalloc_node: pass large requests to page allocator
mm/sl[auo]b: cleanup kmalloc()
mm/slab: kmalloc: pass requests larger than order-1 page to page
allocator
mm/sl[auo]b: print cache name in tracepoints
mm/sl[auo]b: use same tracepoint in kmalloc and normal caches
mm/sl[au]b: generalize kmalloc subsystem
mm/sl[au]b: remove kmem_cache_alloc_node_trace()
mm/sl[auo]b: move definition of __ksize() to mm/slab.h
mm/sl[au]b: check if large object is valid in __ksize()

include/linux/slab.h | 252 +++++++++++---------------
include/trace/events/kmem.h | 107 ++---------
mm/slab.c | 347 ++++++++----------------------------
mm/slab.h | 9 +
mm/slab_common.c | 128 ++++++++++---
mm/slob.c | 69 +++----
mm/slub.c | 237 ++----------------------
7 files changed, 346 insertions(+), 803 deletions(-)

--
2.33.1


2022-03-09 00:29:21

by Hyeonggon Yoo

Subject: [RFC PATCH v1 05/15] mm/slub: move kmalloc_large_node() to slab_common.c

In a later patch, SLAB will also pass requests larger than an order-1
page to the page allocator. Move kmalloc_large_node() to slab_common.c
so that both allocators can share it.

Fold kmalloc_large_node_hook() into kmalloc_large_node(), as it has no
other caller.

Move the tracepoint into kmalloc_large_node().

Also add the GFP_SLAB_BUG_MASK fix-up; it exists in kmalloc_large() but
was omitted from kmalloc_large_node().
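
That is, kmalloc_large_node() now starts with roughly the following
(kmalloc_fix_flags() already exists in slab_common.c; it warns and masks
off the invalid GFP bits):

        /* reject GFP bits that are invalid for slab allocations */
        if (unlikely(flags & GFP_SLAB_BUG_MASK))
                flags = kmalloc_fix_flags(flags);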

Signed-off-by: Hyeonggon Yoo <[email protected]>
---
include/linux/slab.h | 3 +++
mm/slab_common.c | 26 ++++++++++++++++++++++++
mm/slub.c | 47 ++++----------------------------------------
3 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index aa14aba2b068..60d27635c13d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -470,6 +470,9 @@ extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
extern void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment
__alloc_size(1);

+extern void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+ __assume_page_alignment __alloc_size(1);
+
/**
* kmalloc - allocate memory
* @size: how many bytes of memory are required.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1ba479f9d143..f61ac7458829 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -956,6 +956,32 @@ void *kmalloc_large(size_t size, gfp_t flags)
}
EXPORT_SYMBOL(kmalloc_large);

+void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+{
+ struct page *page;
+ void *ptr = NULL;
+ unsigned int order = get_order(size);
+
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
+ flags |= __GFP_COMP;
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
+ ptr = page_address(page);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
+ }
+ ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ptr, size, 1, flags);
+ trace_kmalloc_node(_RET_IP_, ptr, size, PAGE_SIZE << order, flags,
+ node);
+ return ptr;
+
+}
+EXPORT_SYMBOL(kmalloc_large_node);
+
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Randomize a generic freelist */
static void freelist_randomize(struct rnd_state *state, unsigned int *list,
diff --git a/mm/slub.c b/mm/slub.c
index 267f700abac1..cdbbf0e97637 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1678,14 +1678,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
-{
- ptr = kasan_kmalloc_large(ptr, size, flags);
- /* As ptr might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ptr, size, 1, flags);
- return ptr;
-}
-
static __always_inline void kfree_hook(void *x)
{
kmemleak_free(x);
@@ -4387,37 +4379,13 @@ static int __init setup_slub_min_objects(char *str)

__setup("slub_min_objects=", setup_slub_min_objects);

-static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
-{
- struct page *page;
- void *ptr = NULL;
- unsigned int order = get_order(size);
-
- flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, order);
- if (page) {
- ptr = page_address(page);
- mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
- }
-
- return kmalloc_large_node_hook(ptr, size, flags);
-}
-
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
struct kmem_cache *s;
void *ret;

- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, flags, node);
-
- trace_kmalloc_node(_RET_IP_, ret,
- size, PAGE_SIZE << get_order(size),
- flags, node);
-
- return ret;
- }
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return kmalloc_large_node(size, flags, node);

s = kmalloc_slab(size, flags);

@@ -4874,15 +4842,8 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
struct kmem_cache *s;
void *ret;

- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, gfpflags, node);
-
- trace_kmalloc_node(caller, ret,
- size, PAGE_SIZE << get_order(size),
- gfpflags, node);
-
- return ret;
- }
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return kmalloc_large_node(size, gfpflags, node);

s = kmalloc_slab(size, gfpflags);

--
2.33.1

2022-03-09 00:36:38

by Hyeonggon Yoo

Subject: [RFC PATCH v1 14/15] mm/sl[auo]b: move definition of __ksize() to mm/slab.h

__ksize() is only called by KASAN. Remove the export and move the
declaration to mm/slab.h, as we don't want to grow its callers.

[ [email protected]: Move definition to mm/slab.h and reduce comments ]

Signed-off-by: Hyeonggon Yoo <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
---
include/linux/slab.h | 1 -
mm/slab.h | 2 ++
mm/slab_common.c | 11 +----------
mm/slob.c | 1 -
4 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 8da8beff712f..a3f8a103f318 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -182,7 +182,6 @@ int kmem_cache_shrink(struct kmem_cache *s);
void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2);
void kfree(const void *objp);
void kfree_sensitive(const void *objp);
-size_t __ksize(const void *objp);
size_t ksize(const void *objp);
#ifdef CONFIG_PRINTK
bool kmem_valid_obj(void *object);
diff --git a/mm/slab.h b/mm/slab.h
index bfedfe3900bb..4fd4bd7bb4d7 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -673,6 +673,8 @@ void free_large_kmalloc(struct folio *folio, void *object);

#endif /* CONFIG_SLOB */

+size_t __ksize(const void *objp);
+
static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifndef CONFIG_SLUB
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6533026b4a6b..07ed382ed5a9 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -976,15 +976,7 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);

-/**
- * __ksize -- Uninstrumented ksize.
- * @objp: pointer to the object
- *
- * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
- * safety checks as ksize() with KASAN instrumentation enabled.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
+/* Uninstrumented ksize. Only called by KASAN. */
size_t __ksize(const void *object)
{
struct folio *folio;
@@ -999,7 +991,6 @@ size_t __ksize(const void *object)

return slab_ksize(folio_slab(folio)->slab_cache);
}
-EXPORT_SYMBOL(__ksize);
#endif /* !CONFIG_SLOB */

gfp_t kmalloc_fix_flags(gfp_t flags)
diff --git a/mm/slob.c b/mm/slob.c
index 836a7d1ae996..59ddf80e987c 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -576,7 +576,6 @@ size_t __ksize(const void *block)
m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
}
-EXPORT_SYMBOL(__ksize);

int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
{
--
2.33.1

2022-03-09 00:54:48

by Hyeonggon Yoo

Subject: [RFC PATCH v1 12/15] mm/sl[au]b: generalize kmalloc subsystem

Now everything in the kmalloc subsystem can be generalized.
Let's do it!

Generalize __kmalloc_node_track_caller(), kfree() and __ksize(),
and move them to slab_common.c.

Make __kmalloc_node() a wrapper of __kmalloc_node_track_caller(),
as they are duplicates.

To keep the caller address unchanged in the kmalloc/kfree tracepoints,
implement __kmem_cache_{alloc_node,free}(), which take the caller
address as a parameter.
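
The caller-address plumbing boils down to making the public entry points
thin inline wrappers that capture _RET_IP_, roughly:

static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
        return __kmalloc_node_track_caller(size, flags, node, _RET_IP_);
}

static __always_inline void *
kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
        return __kmem_cache_alloc_node(s, gfpflags, node, _RET_IP_);
}

static __always_inline void kmem_cache_free(struct kmem_cache *s, void *x)
{
        __kmem_cache_free(s, x, _RET_IP_);
}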

Signed-off-by: Hyeonggon Yoo <[email protected]>
---
include/linux/slab.h | 79 ++++++++++++++++++-------
mm/slab.c | 135 ++++---------------------------------------
mm/slab_common.c | 75 ++++++++++++++++++++++++
mm/slob.c | 32 +++++-----
mm/slub.c | 105 +++------------------------------
5 files changed, 166 insertions(+), 260 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9ced225a3ea3..6b632137f799 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -401,14 +401,53 @@ static_assert(PAGE_SHIFT <= 20);
#define kmalloc_index(s) __kmalloc_index(s, true)
#endif /* !CONFIG_SLOB */

-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
- __alloc_size(1);
-void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment
- __malloc;
+extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
+ unsigned long caller) __alloc_size(1);
+#define kmalloc_node_track_caller(size, flags, node) \
+ __kmalloc_node_track_caller(size, flags, node, \
+ _RET_IP_)
+/*
+ * kmalloc_track_caller is a special version of kmalloc that records the
+ * calling function of the routine calling it for slab leak tracking instead
+ * of just the calling function (confusing, eh?).
+ * It's useful when the call to kmalloc comes from a widely-used standard
+ * allocator where we care about the real place the memory allocation
+ * request comes from.
+ */
+#define kmalloc_track_caller(size, flags) \
+ __kmalloc_node_track_caller(size, flags, NUMA_NO_NODE, _RET_IP_)
+
+static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __kmalloc_node_track_caller(size, flags, node, _RET_IP_);
+}

static __always_inline void *__kmalloc(size_t size, gfp_t flags)
{
- return __kmalloc_node(size, flags, NUMA_NO_NODE);
+ return __kmalloc_node_track_caller(size, flags, NUMA_NO_NODE, _RET_IP_);
+}
+
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller);
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, unsigned long caller);
+
+/**
+ * kmem_cache_alloc_node - Allocate an object on the specified node
+ * @cachep: The cache to allocate from.
+ * @flags: See kmalloc().
+ * @nodeid: node number of the target node.
+ *
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
+ *
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
+ *
+ * Return: pointer to the new object or %NULL in case of error
+ */
+static __always_inline void *
+kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
+{
+ return __kmem_cache_alloc_node(s, gfpflags, node, _RET_IP_);
}

/**
@@ -423,10 +462,21 @@ static __always_inline void *__kmalloc(size_t size, gfp_t flags)
*/
static __always_inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags)
{
- return kmem_cache_alloc_node(s, flags, NUMA_NO_NODE);
+ return __kmem_cache_alloc_node(s, flags, NUMA_NO_NODE, _RET_IP_);
}

-void kmem_cache_free(struct kmem_cache *s, void *objp);
+/**
+ * kmem_cache_free - Deallocate an object
+ * @cachep: The cache the allocation was from.
+ * @objp: The previously allocated object.
+ *
+ * Free an object which was previously allocated from this
+ * cache.
+ */
+static __always_inline void kmem_cache_free(struct kmem_cache *s, void *x)
+{
+ __kmem_cache_free(s, x, _RET_IP_);
+}

/*
* Bulk allocation and freeing operations. These are accelerated in an
@@ -613,21 +663,6 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t
return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
}

-extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
- unsigned long caller) __alloc_size(1);
-#define kmalloc_node_track_caller(size, flags, node) \
- __kmalloc_node_track_caller(size, flags, node, \
- _RET_IP_)
-/*
- * kmalloc_track_caller is a special version of kmalloc that records the
- * calling function of the routine calling it for slab leak tracking instead
- * of just the calling function (confusing, eh?).
- * It's useful when the call to kmalloc comes from a widely-used standard
- * allocator where we care about the real place the memory allocation
- * request comes from.
- */
-#define kmalloc_track_caller(size, flags) \
- __kmalloc_node_track_caller(size, flags, NUMA_NO_NODE, _RET_IP_)
/*
* Shortcuts
*/
diff --git a/mm/slab.c b/mm/slab.c
index 702a78f64b44..2f4d13bb511b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3519,30 +3519,19 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);

-/**
- * kmem_cache_alloc_node - Allocate an object on the specified node
- * @cachep: The cache to allocate from.
- * @flags: See kmalloc().
- * @nodeid: node number of the target node.
- *
- * Identical to kmem_cache_alloc but it will allocate memory on the given
- * node, which can improve the performance for cpu bound structures.
- *
- * Fallback to other node is possible if __GFP_THISNODE is not set.
- *
- * Return: pointer to the new object or %NULL in case of error
- */
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid, unsigned long caller)
{
- void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
+ void *ret = slab_alloc_node(cachep, flags, nodeid,
+ cachep->object_size, caller);

- trace_kmem_cache_alloc_node(cachep->name, _RET_IP_, ret,
+ trace_kmem_cache_alloc_node(cachep->name, caller, ret,
cachep->object_size, cachep->size,
flags, nodeid);

return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_SYMBOL(__kmem_cache_alloc_node);

void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
gfp_t flags,
@@ -3561,36 +3550,6 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);

-static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
-{
- struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large_node(size, flags, node);
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
- ret = kasan_kmalloc(cachep, ret, size, flags);
-
- return ret;
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- return __do_kmalloc_node(size, flags, node, _RET_IP_);
-}
-EXPORT_SYMBOL(__kmalloc_node);
-
-void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
- int node, unsigned long caller)
-{
- return __do_kmalloc_node(size, flags, node, caller);
-}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-
#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
@@ -3613,30 +3572,23 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
}
#endif

-/**
- * kmem_cache_free - Deallocate an object
- * @cachep: The cache the allocation was from.
- * @objp: The previously allocated object.
- *
- * Free an object which was previously allocated from this
- * cache.
- */
-void kmem_cache_free(struct kmem_cache *cachep, void *objp)
+void __kmem_cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
unsigned long flags;
cachep = cache_from_obj(cachep, objp);
if (!cachep)
return;

- trace_kmem_cache_free(cachep->name, _RET_IP_, objp);
+ trace_kmem_cache_free(cachep->name, caller, objp);
local_irq_save(flags);
debug_check_no_locks_freed(objp, cachep->object_size);
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, cachep->object_size);
- __cache_free(cachep, objp, _RET_IP_);
+ __cache_free(cachep, objp, caller);
local_irq_restore(flags);
}
-EXPORT_SYMBOL(kmem_cache_free);
+EXPORT_SYMBOL(__kmem_cache_free);

void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
{
@@ -3676,44 +3628,6 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
}
EXPORT_SYMBOL(kmem_cache_free_bulk);

-/**
- * kfree - free previously allocated memory
- * @objp: pointer returned by kmalloc.
- *
- * If @objp is NULL, no operation is performed.
- *
- * Don't free memory not originally allocated by kmalloc()
- * or you will run into trouble.
- */
-void kfree(const void *objp)
-{
- struct kmem_cache *c;
- unsigned long flags;
- struct folio *folio;
- void *x = (void *) objp;
-
-
- if (unlikely(ZERO_OR_NULL_PTR(objp)))
- return;
-
- folio = virt_to_folio(objp);
- if (!folio_test_slab(folio)) {
- free_large_kmalloc(folio, x);
- return;
- }
-
- c = folio_slab(folio)->slab_cache;
- trace_kmem_cache_free(c->name, _RET_IP_, objp);
-
- local_irq_save(flags);
- kfree_debugcheck(objp);
- debug_check_no_locks_freed(objp, c->object_size);
- debug_check_no_obj_freed(objp, c->object_size);
- __cache_free(c, (void *)objp, _RET_IP_);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(kfree);
-
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
@@ -4116,30 +4030,3 @@ void __check_heap_object(const void *ptr, unsigned long n,
usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
-
-/**
- * __ksize -- Uninstrumented ksize.
- * @objp: pointer to the object
- *
- * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
- * safety checks as ksize() with KASAN instrumentation enabled.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
-size_t __ksize(const void *objp)
-{
- struct kmem_cache *c;
- struct folio *folio;
-
- BUG_ON(!objp);
- if (unlikely(objp == ZERO_SIZE_PTR))
- return 0;
-
- folio = virt_to_folio(objp);
- if (!folio_test_slab(folio))
- return folio_size(folio);
-
- c = folio_slab(folio)->slab_cache;
- return c->object_size;
-}
-EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 8a8330a777f5..6533026b4a6b 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -925,6 +925,81 @@ void free_large_kmalloc(struct folio *folio, void *object)
-(PAGE_SIZE << order));
__free_pages(folio_page(folio, 0), order);
}
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
+ int node, unsigned long caller)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
+ return kmalloc_large_node(size, gfpflags, node);
+
+ s = kmalloc_slab(size, gfpflags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = __kmem_cache_alloc_node(s, gfpflags, node, caller);
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+
+ return ret;
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+
+/**
+ * kfree - free previously allocated memory
+ * @objp: pointer returned by kmalloc.
+ *
+ * If @objp is NULL, no operation is performed.
+ *
+ * Don't free memory not originally allocated by kmalloc()
+ * or you will run into trouble.
+ */
+void kfree(const void *x)
+{
+ struct folio *folio;
+ void *object = (void *)x;
+ struct kmem_cache *s;
+
+ if (unlikely(ZERO_OR_NULL_PTR(x)))
+ return;
+
+ folio = virt_to_folio(x);
+ if (unlikely(!folio_test_slab(folio))) {
+ free_large_kmalloc(folio, object);
+ return;
+ }
+
+ s = folio_slab(folio)->slab_cache;
+ __kmem_cache_free(s, object, _RET_IP_);
+}
+EXPORT_SYMBOL(kfree);
+
+/**
+ * __ksize -- Uninstrumented ksize.
+ * @objp: pointer to the object
+ *
+ * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
+ * safety checks as ksize() with KASAN instrumentation enabled.
+ *
+ * Return: size of the actual memory used by @objp in bytes
+ */
+size_t __ksize(const void *object)
+{
+ struct folio *folio;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return 0;
+
+ folio = virt_to_folio(object);
+
+ if (unlikely(!folio_test_slab(folio)))
+ return folio_size(folio);
+
+ return slab_ksize(folio_slab(folio)->slab_cache);
+}
+EXPORT_SYMBOL(__ksize);
#endif /* !CONFIG_SLOB */

gfp_t kmalloc_fix_flags(gfp_t flags)
diff --git a/mm/slob.c b/mm/slob.c
index 3726b77a066b..836a7d1ae996 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -588,7 +588,8 @@ int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
return 0;
}

-static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags,
+ int node, unsigned long caller)
{
void *b;

@@ -598,12 +599,12 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)

if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node, 0);
- trace_kmem_cache_alloc_node(c->name, _RET_IP_, b, c->object_size,
+ trace_kmem_cache_alloc_node(c->name, caller, b, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
} else {
b = slob_new_pages(flags, get_order(c->size), node);
- trace_kmem_cache_alloc_node(c->name, _RET_IP_, b, c->object_size,
+ trace_kmem_cache_alloc_node(c->name, caller, b, c->object_size,
PAGE_SIZE << get_order(c->size),
flags, node);
}
@@ -617,19 +618,14 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
return b;
}

-void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp,
+ int node, unsigned long caller)
{
- return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+ return slob_alloc_node(cachep, gfp, node, caller);
}
-EXPORT_SYMBOL(__kmalloc_node);
+EXPORT_SYMBOL(__kmem_cache_alloc_node);

-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
-{
- return slob_alloc_node(cachep, gfp, node);
-}
-EXPORT_SYMBOL(kmem_cache_alloc_node);
-
-static void __kmem_cache_free(void *b, int size)
+static void ___kmem_cache_free(void *b, int size)
{
if (size < PAGE_SIZE)
slob_free(b, size);
@@ -642,23 +638,23 @@ static void kmem_rcu_free(struct rcu_head *head)
struct slob_rcu *slob_rcu = (struct slob_rcu *)head;
void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu));

- __kmem_cache_free(b, slob_rcu->size);
+ ___kmem_cache_free(b, slob_rcu->size);
}

-void kmem_cache_free(struct kmem_cache *c, void *b)
+void __kmem_cache_free(struct kmem_cache *c, void *b, unsigned long caller)
{
kmemleak_free_recursive(b, c->flags);
- trace_kmem_cache_free(c->name, _RET_IP_, b);
+ trace_kmem_cache_free(c->name, caller, b);
if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
struct slob_rcu *slob_rcu;
slob_rcu = b + (c->size - sizeof(struct slob_rcu));
slob_rcu->size = c->size;
call_rcu(&slob_rcu->head, kmem_rcu_free);
} else {
- __kmem_cache_free(b, c->size);
+ ___kmem_cache_free(b, c->size);
}
}
-EXPORT_SYMBOL(kmem_cache_free);
+EXPORT_SYMBOL(__kmem_cache_free);

void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
diff --git a/mm/slub.c b/mm/slub.c
index c2e713bdb26c..f8fdb6b4fbd2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3224,16 +3224,17 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
}

-void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, unsigned long caller)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
+ void *ret = slab_alloc_node(s, gfpflags, node, caller, s->object_size);

- trace_kmem_cache_alloc_node(s->name, _RET_IP_, ret,
+ trace_kmem_cache_alloc_node(s->name, caller, ret,
s->object_size, s->size, gfpflags, node);

return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_SYMBOL(__kmem_cache_alloc_node);

void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
@@ -3477,15 +3478,15 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
}
#endif

-void kmem_cache_free(struct kmem_cache *s, void *x)
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller)
{
s = cache_from_obj(s, x);
if (!s)
return;
- trace_kmem_cache_free(s->name, _RET_IP_, x);
- slab_free(s, virt_to_slab(x), x, NULL, 1, _RET_IP_);
+ trace_kmem_cache_free(s->name, caller, x);
+ slab_free(s, virt_to_slab(x), x, NULL, 1, caller);
}
-EXPORT_SYMBOL(kmem_cache_free);
+EXPORT_SYMBOL(__kmem_cache_free);

struct detached_freelist {
struct slab *slab;
@@ -4351,30 +4352,6 @@ static int __init setup_slub_min_objects(char *str)

__setup("slub_min_objects=", setup_slub_min_objects);

-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large_node(size, flags, node);
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, flags, node, _RET_IP_, size);
-
- trace_kmem_cache_alloc_node(s->name, _RET_IP_, ret, size,
- s->size, flags, node);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_node);
-
#ifdef CONFIG_HARDENED_USERCOPY
/*
* Rejects incorrectly sized objects and objects that are to be copied
@@ -4425,46 +4402,6 @@ void __check_heap_object(const void *ptr, unsigned long n,
}
#endif /* CONFIG_HARDENED_USERCOPY */

-size_t __ksize(const void *object)
-{
- struct folio *folio;
-
- if (unlikely(object == ZERO_SIZE_PTR))
- return 0;
-
- folio = virt_to_folio(object);
-
- if (unlikely(!folio_test_slab(folio)))
- return folio_size(folio);
-
- return slab_ksize(folio_slab(folio)->slab_cache);
-}
-EXPORT_SYMBOL(__ksize);
-
-void kfree(const void *x)
-{
- struct folio *folio;
- struct slab *slab;
- void *object = (void *)x;
- struct kmem_cache *s;
-
- if (unlikely(ZERO_OR_NULL_PTR(x)))
- return;
-
- folio = virt_to_folio(x);
- if (unlikely(!folio_test_slab(folio))) {
- free_large_kmalloc(folio, object);
- return;
- }
-
- slab = folio_slab(folio);
- s = slab->slab_cache;
-
- trace_kmem_cache_free(s->name, _RET_IP_, x);
- slab_free(s, slab, object, NULL, 1, _RET_IP_);
-}
-EXPORT_SYMBOL(kfree);
-
#define SHRINK_PROMOTE_MAX 32

/*
@@ -4812,30 +4749,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
return 0;
}

-void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
- int node, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large_node(size, gfpflags, node);
-
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, gfpflags, node, caller, size);
-
- /* Honor the call site pointer we received. */
- trace_kmem_cache_alloc_node(s->name, caller, ret, size,
- s->size, gfpflags, node);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-
#ifdef CONFIG_SYSFS
static int count_inuse(struct slab *slab)
{
--
2.33.1

2022-03-09 01:05:51

by Hyeonggon Yoo

Subject: [RFC PATCH v1 01/15] mm/slab: cleanup slab_alloc() and slab_alloc_node()

Make slab_alloc_node() available for non-NUMA configurations and make
slab_alloc() a wrapper of slab_alloc_node(). This is necessary for
further cleanup.

Do not check the availability of the node when allocating from locally
cached objects; the check is redundant there.

This patch was tested with both CONFIG_NUMA=y and CONFIG_NUMA=n.
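
In short, slab_alloc() becomes a wrapper and the node check is reduced
to a small helper (see the diff below):

static __always_inline bool node_match(int nodeid, int slab_node)
{
#ifdef CONFIG_NUMA
        if (nodeid != NUMA_NO_NODE && nodeid != slab_node)
                return false;
#endif
        return true;
}

static __always_inline void *
slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size,
           unsigned long caller)
{
        return slab_alloc_node(cachep, flags, NUMA_NO_NODE, orig_size, caller);
}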

Signed-off-by: Hyeonggon Yoo <[email protected]>
---
mm/slab.c | 116 +++++++++++++++++++++++-------------------------------
1 file changed, 50 insertions(+), 66 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index ddf5737c63d9..5d102aaf1629 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3200,60 +3200,6 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
return obj ? obj : fallback_alloc(cachep, flags);
}

-static __always_inline void *
-slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
- unsigned long caller)
-{
- unsigned long save_flags;
- void *ptr;
- int slab_node = numa_mem_id();
- struct obj_cgroup *objcg = NULL;
- bool init = false;
-
- flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
- if (unlikely(!cachep))
- return NULL;
-
- ptr = kfence_alloc(cachep, orig_size, flags);
- if (unlikely(ptr))
- goto out_hooks;
-
- cache_alloc_debugcheck_before(cachep, flags);
- local_irq_save(save_flags);
-
- if (nodeid == NUMA_NO_NODE)
- nodeid = slab_node;
-
- if (unlikely(!get_node(cachep, nodeid))) {
- /* Node not bootstrapped yet */
- ptr = fallback_alloc(cachep, flags);
- goto out;
- }
-
- if (nodeid == slab_node) {
- /*
- * Use the locally cached objects if possible.
- * However ____cache_alloc does not allow fallback
- * to other nodes. It may fail while we still have
- * objects on other nodes available.
- */
- ptr = ____cache_alloc(cachep, flags);
- if (ptr)
- goto out;
- }
- /* ___cache_alloc_node can fall back to other nodes */
- ptr = ____cache_alloc_node(cachep, flags, nodeid);
- out:
- local_irq_restore(save_flags);
- ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- init = slab_want_init_on_alloc(flags, cachep);
-
-out_hooks:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
- return ptr;
-}
-
static __always_inline void *
__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
@@ -3283,14 +3229,24 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
return ____cache_alloc(cachep, flags);
}
-
#endif /* CONFIG_NUMA */

+static __always_inline bool node_match(int nodeid, int slab_node)
+{
+#ifdef CONFIG_NUMA
+ if (nodeid != NUMA_NO_NODE && nodeid != slab_node)
+ return false;
+#endif
+ return true;
+}
+
static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
+slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
+ unsigned long caller)
{
unsigned long save_flags;
- void *objp;
+ void *ptr;
+ int slab_node = numa_mem_id();
struct obj_cgroup *objcg = NULL;
bool init = false;

@@ -3299,21 +3255,49 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
if (unlikely(!cachep))
return NULL;

- objp = kfence_alloc(cachep, orig_size, flags);
- if (unlikely(objp))
- goto out;
+ ptr = kfence_alloc(cachep, orig_size, flags);
+ if (unlikely(ptr))
+ goto out_hooks;

cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
- objp = __do_cache_alloc(cachep, flags);
+
+ if (node_match(nodeid, slab_node)) {
+ /*
+ * Use the locally cached objects if possible.
+ * However ____cache_alloc does not allow fallback
+ * to other nodes. It may fail while we still have
+ * objects on other nodes available.
+ */
+ ptr = ____cache_alloc(cachep, flags);
+ if (ptr)
+ goto out;
+ }
+#ifdef CONFIG_NUMA
+ else if (unlikely(!get_node(cachep, nodeid))) {
+ /* Node not bootstrapped yet */
+ ptr = fallback_alloc(cachep, flags);
+ goto out;
+ }
+
+ /* ___cache_alloc_node can fall back to other nodes */
+ ptr = ____cache_alloc_node(cachep, flags, nodeid);
+#endif
+out:
local_irq_restore(save_flags);
- objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
- prefetchw(objp);
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
+ prefetchw(ptr);
init = slab_want_init_on_alloc(flags, cachep);

-out:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init);
- return objp;
+out_hooks:
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
+ return ptr;
+}
+
+static __always_inline void *
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
+{
+ return slab_alloc_node(cachep, flags, NUMA_NO_NODE, orig_size, caller);
}

/*
--
2.33.1

2022-03-09 01:08:55

by Hyeonggon Yoo

Subject: [RFC PATCH v1 06/15] mm/slab_common: cleanup kmalloc_large()

Now that kmalloc_large() and kmalloc_large_node() do the same job, make
kmalloc_large() a wrapper of kmalloc_large_node().

This makes the slab allocators use the kmalloc_node tracepoint in
kmalloc_large().

Signed-off-by: Hyeonggon Yoo <[email protected]>
---
include/linux/slab.h | 8 +++++---
mm/slab_common.c | 24 ------------------------
2 files changed, 5 insertions(+), 27 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 60d27635c13d..8840b2d55567 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -467,12 +467,14 @@ extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
int node, size_t size) __assume_slab_alignment
__alloc_size(4);

-extern void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment
- __alloc_size(1);
-
extern void *kmalloc_large_node(size_t size, gfp_t flags, int node)
__assume_page_alignment __alloc_size(1);

+static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
+{
+ return kmalloc_large_node(size, flags, NUMA_NO_NODE);
+}
+
/**
* kmalloc - allocate memory
* @size: how many bytes of memory are required.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f61ac7458829..1fe2f2a7326d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -932,30 +932,6 @@ gfp_t kmalloc_fix_flags(gfp_t flags)
* directly to the page allocator. We use __GFP_COMP, because we will need to
* know the allocation order to free the pages properly in kfree.
*/
-void *kmalloc_large(size_t size, gfp_t flags)
-{
- void *ret = NULL;
- struct page *page;
- unsigned int order = get_order(size);
-
- if (unlikely(flags & GFP_SLAB_BUG_MASK))
- flags = kmalloc_fix_flags(flags);
-
- flags |= __GFP_COMP;
- page = alloc_pages(flags, order);
- if (likely(page)) {
- ret = page_address(page);
- mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
- }
- ret = kasan_kmalloc_large(ret, size, flags);
- /* As ret might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ret, size, 1, flags);
- trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
- return ret;
-}
-EXPORT_SYMBOL(kmalloc_large);
-
void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
--
2.33.1

2022-03-09 01:40:22

by Hyeonggon Yoo

Subject: [RFC PATCH v1 13/15] mm/sl[au]b: remove kmem_cache_alloc_node_trace()

kmem_cache_alloc_node_trace() was introduced by commit 4a92379bdfb4
("slub tracing: move trace calls out of always inlined functions to reduce
kernel code size") to avoid inlining tracepoints for inlined kmalloc
function calls.

Now that we use the same tracepoint for kmalloc and normal caches,
kmem_cache_alloc_node_trace() can be replaced with
__kmem_cache_alloc_node() and kasan_kmalloc().
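
That is, for constant sizes the inlined kmalloc_node() fast path becomes
roughly:

        s = kmalloc_caches[kmalloc_type(flags)][kmalloc_index(size)];
        objp = __kmem_cache_alloc_node(s, flags, node, _RET_IP_);
        return kasan_kmalloc(s, objp, size, flags);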

Signed-off-by: Hyeonggon Yoo <[email protected]>
---
include/linux/slab.h | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 6b632137f799..8da8beff712f 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -497,10 +497,6 @@ static __always_inline void kfree_bulk(size_t size, void **p)
kmem_cache_free_bulk(NULL, size, p);
}

-extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
- int node, size_t size) __assume_slab_alignment
- __alloc_size(4);
-
extern void *kmalloc_large_node(size_t size, gfp_t flags, int node)
__assume_page_alignment __alloc_size(1);

@@ -512,6 +508,9 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
#ifndef CONFIG_SLOB
static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node)
{
+ struct kmem_cache *s;
+ void *objp;
+
if (__builtin_constant_p(size)) {
unsigned int index;

@@ -523,9 +522,11 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla
if (!index)
return ZERO_SIZE_PTR;

- return kmem_cache_alloc_node_trace(
- kmalloc_caches[kmalloc_type(flags)][index],
- flags, node, size);
+ s = kmalloc_caches[kmalloc_type(flags)][index];
+
+ objp = __kmem_cache_alloc_node(s, flags, node, _RET_IP_);
+ objp = kasan_kmalloc(s, objp, size, flags);
+ return objp;
}
return __kmalloc_node(size, flags, node);
}
--
2.33.1

2022-03-09 01:47:19

by Hyeonggon Yoo

Subject: [RFC PATCH v1 02/15] mm/sl[auo]b: remove CONFIG_NUMA ifdefs for common functions

Now that slab_alloc_node() is available on SLAB regardless of
CONFIG_NUMA, remove the CONFIG_NUMA ifdefs and make the non-NUMA
versions of these functions wrappers of the NUMA versions.

This makes the slab allocators use the NUMA versions of the
tracepoints. The tracepoints themselves will be cleaned up in a later
patch.

Also remove the now-unused __do_kmalloc() in SLAB.
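
The resulting non-NUMA wrappers are roughly:

static __always_inline void *__kmalloc(size_t size, gfp_t flags)
{
        return __kmalloc_node(size, flags, NUMA_NO_NODE);
}

static __always_inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags)
{
        return kmem_cache_alloc_node(s, flags, NUMA_NO_NODE);
}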

Signed-off-by: Hyeonggon Yoo <[email protected]>
---
include/linux/slab.h | 85 ++++++++++++++++++--------------------------
mm/slab.c | 63 --------------------------------
mm/slob.c | 22 ------------
mm/slub.c | 62 --------------------------------
4 files changed, 35 insertions(+), 197 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 37bde99b74af..df8e5dca00a2 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -414,8 +414,31 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
#define kmalloc_index(s) __kmalloc_index(s, true)
#endif /* !CONFIG_SLOB */

-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
-void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc;
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
+ __alloc_size(1);
+void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment
+ __malloc;
+
+static __always_inline void *__kmalloc(size_t size, gfp_t flags)
+{
+ return __kmalloc_node(size, flags, NUMA_NO_NODE);
+}
+
+/**
+ * kmem_cache_alloc - Allocate an object
+ * @cachep: The cache to allocate from.
+ * @flags: See kmalloc().
+ *
+ * Allocate an object from this cache. The flags are only relevant
+ * if the cache has no available objects.
+ *
+ * Return: pointer to the new object or %NULL in case of error
+ */
+static __always_inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags)
+{
+ return kmem_cache_alloc_node(s, flags, NUMA_NO_NODE);
+}
+
void kmem_cache_free(struct kmem_cache *s, void *objp);

/*
@@ -437,38 +460,13 @@ static __always_inline void kfree_bulk(size_t size, void **p)
kmem_cache_free_bulk(NULL, size, p);
}

-#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
- __alloc_size(1);
-void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment
- __malloc;
-#else
-static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- return __kmalloc(size, flags);
-}
-
-static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node)
-{
- return kmem_cache_alloc(s, flags);
-}
-#endif
-
#ifdef CONFIG_TRACING
extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size)
__assume_slab_alignment __alloc_size(3);

-#ifdef CONFIG_NUMA
extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
int node, size_t size) __assume_slab_alignment
__alloc_size(4);
-#else
-static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
- gfp_t gfpflags, int node, size_t size)
-{
- return kmem_cache_alloc_trace(s, gfpflags, size);
-}
-#endif /* CONFIG_NUMA */

#else /* CONFIG_TRACING */
static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s,
@@ -652,19 +650,6 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag
return kmalloc_array(n, size, flags | __GFP_ZERO);
}

-/*
- * kmalloc_track_caller is a special version of kmalloc that records the
- * calling function of the routine calling it for slab leak tracking instead
- * of just the calling function (confusing, eh?).
- * It's useful when the call to kmalloc comes from a widely-used standard
- * allocator where we care about the real place the memory allocation
- * request comes from.
- */
-extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
- __alloc_size(1);
-#define kmalloc_track_caller(size, flags) \
- __kmalloc_track_caller(size, flags, _RET_IP_)
-
static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
int node)
{
@@ -682,21 +667,21 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t
return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
}

-
-#ifdef CONFIG_NUMA
extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
unsigned long caller) __alloc_size(1);
#define kmalloc_node_track_caller(size, flags, node) \
__kmalloc_node_track_caller(size, flags, node, \
_RET_IP_)
-
-#else /* CONFIG_NUMA */
-
-#define kmalloc_node_track_caller(size, flags, node) \
- kmalloc_track_caller(size, flags)
-
-#endif /* CONFIG_NUMA */
-
+/*
+ * kmalloc_track_caller is a special version of kmalloc that records the
+ * calling function of the routine calling it for slab leak tracking instead
+ * of just the calling function (confusing, eh?).
+ * It's useful when the call to kmalloc comes from a widely-used standard
+ * allocator where we care about the real place the memory allocation
+ * request comes from.
+ */
+#define kmalloc_track_caller(size, flags) \
+ __kmalloc_node_track_caller(size, flags, NUMA_NO_NODE, _RET_IP_)
/*
* Shortcuts
*/
diff --git a/mm/slab.c b/mm/slab.c
index 5d102aaf1629..b41124a1efd9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3468,27 +3468,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
__free_one(ac, objp);
}

-/**
- * kmem_cache_alloc - Allocate an object
- * @cachep: The cache to allocate from.
- * @flags: See kmalloc().
- *
- * Allocate an object from this cache. The flags are only relevant
- * if the cache has no available objects.
- *
- * Return: pointer to the new object or %NULL in case of error
- */
-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
-{
- void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_);
-
- trace_kmem_cache_alloc(_RET_IP_, ret,
- cachep->object_size, cachep->size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc);
-
static __always_inline void
cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
size_t size, void **p, unsigned long caller)
@@ -3556,7 +3535,6 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
EXPORT_SYMBOL(kmem_cache_alloc_trace);
#endif

-#ifdef CONFIG_NUMA
/**
* kmem_cache_alloc_node - Allocate an object on the specified node
* @cachep: The cache to allocate from.
@@ -3630,7 +3608,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
return __do_kmalloc_node(size, flags, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif /* CONFIG_NUMA */

#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
@@ -3654,46 +3631,6 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
}
#endif

-/**
- * __do_kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kmalloc).
- * @caller: function caller for debug tracking of the caller
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
- unsigned long caller)
-{
- struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = slab_alloc(cachep, flags, size, caller);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(caller, ret,
- size, cachep->size, flags);
-
- return ret;
-}
-
-void *__kmalloc(size_t size, gfp_t flags)
-{
- return __do_kmalloc(size, flags, _RET_IP_);
-}
-EXPORT_SYMBOL(__kmalloc);
-
-void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
-{
- return __do_kmalloc(size, flags, caller);
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
diff --git a/mm/slob.c b/mm/slob.c
index 60c5842215f1..c4f9c83900b0 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -522,26 +522,12 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
return ret;
}

-void *__kmalloc(size_t size, gfp_t gfp)
-{
- return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
-}
-EXPORT_SYMBOL(__kmalloc);
-
-void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
-{
- return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
int node, unsigned long caller)
{
return __do_kmalloc_node(size, gfp, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif

void kfree(const void *block)
{
@@ -629,13 +615,6 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
return b;
}

-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
-{
- return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(kmem_cache_alloc);
-
-#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
return __do_kmalloc_node(size, gfp, node, _RET_IP_);
@@ -647,7 +626,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
return slob_alloc_node(cachep, gfp, node);
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#endif

static void __kmem_cache_free(void *b, int size)
{
diff --git a/mm/slub.c b/mm/slub.c
index 261474092e43..74369cadc243 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3238,17 +3238,6 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
}

-void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
-{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);
-
- trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
- s->size, gfpflags);
-
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc);
-
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
@@ -3260,7 +3249,6 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
EXPORT_SYMBOL(kmem_cache_alloc_trace);
#endif

-#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
@@ -3287,7 +3275,6 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
-#endif /* CONFIG_NUMA */

/*
* Slow path handling. This may still be called frequently since objects
@@ -4404,30 +4391,6 @@ static int __init setup_slub_min_objects(char *str)

__setup("slub_min_objects=", setup_slub_min_objects);

-void *__kmalloc(size_t size, gfp_t flags)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, flags);
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, flags, _RET_IP_, size);
-
- trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc);
-
-#ifdef CONFIG_NUMA
static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
@@ -4474,7 +4437,6 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_NUMA */

#ifdef CONFIG_HARDENED_USERCOPY
/*
@@ -4910,29 +4872,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
return 0;
}

-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, gfpflags);
-
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, gfpflags, caller, size);
-
- /* Honor the call site pointer we received. */
- trace_kmalloc(caller, ret, size, s->size, gfpflags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
int node, unsigned long caller)
{
@@ -4962,7 +4901,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return ret;
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif

#ifdef CONFIG_SYSFS
static int count_inuse(struct slab *slab)
--
2.33.1

2022-03-09 01:57:03

by Hyeonggon Yoo

Subject: [RFC PATCH v1 09/15] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator

There is not much benefit in serving large objects from the kmalloc
caches. Pass large requests to the page allocator as SLUB does, for
easier maintenance of the common code.
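
With this, the SLAB kmalloc()/kfree() paths handle large objects roughly
as follows (sketch of the hunks below):

        /* alloc: sizes above KMALLOC_MAX_CACHE_SIZE bypass the kmalloc caches */
        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
                return kmalloc_large_node(size, flags, node);

        /* free: an object without a slab folio came from the page allocator */
        folio = virt_to_folio(objp);
        if (!folio_test_slab(folio)) {
                free_large_kmalloc(folio, (void *)objp);
                return;
        }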

[ [email protected]: Enable and disable irq around free_large_kmalloc().
Do not lose NUMA locality in __do_kmalloc_node().
Use folio_slab(folio)->slab_cache instead of virt_to_cache().
Remove large sizes in __kmalloc_index(). ]

Signed-off-by: Hyeonggon Yoo <[email protected]>
---
include/linux/slab.h | 23 +++++-----------------
mm/slab.c | 45 ++++++++++++++++++++++++++++++--------------
mm/slab.h | 3 +++
mm/slab_common.c | 25 +++++++++++++++++-------
mm/slub.c | 19 -------------------
5 files changed, 57 insertions(+), 58 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index dfcc8301d969..9ced225a3ea3 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -226,27 +226,17 @@ void kmem_dump_obj(void *object);

#ifdef CONFIG_SLAB
/*
- * The largest kmalloc size supported by the SLAB allocators is
- * 32 megabyte (2^25) or the maximum allocatable page order if that is
- * less than 32 MB.
- *
- * WARNING: Its not easy to increase this value since the allocators have
- * to do various tricks to work around compiler limitations in order to
- * ensure proper constant folding.
+ * SLAB and SLUB directly allocates requests fitting in to an order-1 page
+ * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
*/
-#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
- (MAX_ORDER + PAGE_SHIFT - 1) : 25)
-#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH
+#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
+#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW 5
#endif
#endif

#ifdef CONFIG_SLUB
-/*
- * SLUB directly allocates requests fitting in to an order-1 page
- * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
- */
#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
#ifndef KMALLOC_SHIFT_LOW
@@ -398,10 +388,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
if (size <= 512 * 1024) return 19;
if (size <= 1024 * 1024) return 20;
if (size <= 2 * 1024 * 1024) return 21;
- if (size <= 4 * 1024 * 1024) return 22;
- if (size <= 8 * 1024 * 1024) return 23;
- if (size <= 16 * 1024 * 1024) return 24;
- if (size <= 32 * 1024 * 1024) return 25;

if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
@@ -411,6 +397,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
/* Will never be reached. Needed because the compiler may complain */
return -1;
}
+static_assert(PAGE_SHIFT <= 20);
#define kmalloc_index(s) __kmalloc_index(s, true)
#endif /* !CONFIG_SLOB */

diff --git a/mm/slab.c b/mm/slab.c
index 6ebf509bf2de..f0041f0125ba 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3568,7 +3568,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
void *ret;

if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
+ return kmalloc_large_node(size, flags, node);
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
@@ -3642,15 +3642,25 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
{
struct kmem_cache *s;
size_t i;
+ struct folio *folio;

local_irq_disable();
for (i = 0; i < size; i++) {
void *objp = p[i];

- if (!orig_s) /* called via kfree_bulk */
- s = virt_to_cache(objp);
- else
+ if (!orig_s) {
+ folio = virt_to_folio(objp);
+ /* called via kfree_bulk */
+ if (!folio_test_slab(folio)) {
+ local_irq_enable();
+ free_large_kmalloc(folio, objp);
+ local_irq_disable();
+ continue;
+ }
+ s = folio_slab(folio)->slab_cache;
+ } else
s = cache_from_obj(orig_s, objp);
+
if (!s)
continue;

@@ -3679,20 +3689,25 @@ void kfree(const void *objp)
{
struct kmem_cache *c;
unsigned long flags;
+ struct folio *folio;
+ void *x = (void *) objp;

trace_kfree(_RET_IP_, objp);

if (unlikely(ZERO_OR_NULL_PTR(objp)))
return;
- local_irq_save(flags);
- kfree_debugcheck(objp);
- c = virt_to_cache(objp);
- if (!c) {
- local_irq_restore(flags);
+
+ folio = virt_to_folio(objp);
+ if (!folio_test_slab(folio)) {
+ free_large_kmalloc(folio, x);
return;
}
- debug_check_no_locks_freed(objp, c->object_size);

+ c = folio_slab(folio)->slab_cache;
+
+ local_irq_save(flags);
+ kfree_debugcheck(objp);
+ debug_check_no_locks_freed(objp, c->object_size);
debug_check_no_obj_freed(objp, c->object_size);
__cache_free(c, (void *)objp, _RET_IP_);
local_irq_restore(flags);
@@ -4114,15 +4129,17 @@ void __check_heap_object(const void *ptr, unsigned long n,
size_t __ksize(const void *objp)
{
struct kmem_cache *c;
- size_t size;
+ struct folio *folio;

BUG_ON(!objp);
if (unlikely(objp == ZERO_SIZE_PTR))
return 0;

- c = virt_to_cache(objp);
- size = c ? c->object_size : 0;
+ folio = virt_to_folio(objp);
+ if (!folio_test_slab(folio))
+ return folio_size(folio);

- return size;
+ c = folio_slab(folio)->slab_cache;
+ return c->object_size;
}
EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab.h b/mm/slab.h
index c7f2abc2b154..eb6e26784d69 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -664,6 +664,9 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
print_tracking(cachep, x);
return cachep;
}
+
+void free_large_kmalloc(struct folio *folio, void *object);
+
#endif /* CONFIG_SLOB */

static inline size_t slab_ksize(const struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1fe2f2a7326d..af67005a151f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -759,8 +759,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)

/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
- * kmalloc-32M.
+ * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
+ * kmalloc-2M.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(0, 0),
@@ -784,11 +784,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(262144, 256k),
INIT_KMALLOC_INFO(524288, 512k),
INIT_KMALLOC_INFO(1048576, 1M),
- INIT_KMALLOC_INFO(2097152, 2M),
- INIT_KMALLOC_INFO(4194304, 4M),
- INIT_KMALLOC_INFO(8388608, 8M),
- INIT_KMALLOC_INFO(16777216, 16M),
- INIT_KMALLOC_INFO(33554432, 32M)
+ INIT_KMALLOC_INFO(2097152, 2M)
};

/*
@@ -913,6 +909,21 @@ void __init create_kmalloc_caches(slab_flags_t flags)
}
#endif
}
+
+void free_large_kmalloc(struct folio *folio, void *object)
+{
+ unsigned int order = folio_order(folio);
+
+ if (WARN_ON_ONCE(order == 0))
+ pr_warn_once("object pointer: 0x%p\n", object);
+
+ kmemleak_free(object);
+ kasan_kfree_large(object);
+
+ mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(folio_page(folio, 0), order);
+}
#endif /* !CONFIG_SLOB */

gfp_t kmalloc_fix_flags(gfp_t flags)
diff --git a/mm/slub.c b/mm/slub.c
index d8fb987ff7e0..283c4ac92ffe 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1678,12 +1678,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static __always_inline void kfree_hook(void *x)
-{
- kmemleak_free(x);
- kasan_kfree_large(x);
-}
-
static __always_inline bool slab_free_hook(struct kmem_cache *s,
void *x, bool init)
{
@@ -3501,19 +3495,6 @@ struct detached_freelist {
struct kmem_cache *s;
};

-static inline void free_large_kmalloc(struct folio *folio, void *object)
-{
- unsigned int order = folio_order(folio);
-
- if (WARN_ON_ONCE(order == 0))
- pr_warn_once("object pointer: 0x%p\n", object);
-
- kfree_hook(object);
- mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
- __free_pages(folio_page(folio, 0), order);
-}
-
/*
* This function progressively scans the array with free objects (with
* a limited look ahead) and extract objects belonging to the same
--
2.33.1

2022-03-23 21:07:49

by Vlastimil Babka

[permalink] [raw]
Subject: Re: [RFC PATCH v1 01/15] mm/slab: cleanup slab_alloc() and slab_alloc_node()

On 3/8/22 12:41, Hyeonggon Yoo wrote:
> +
> static __always_inline void *
> -slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
> +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
> + unsigned long caller)
> {
> unsigned long save_flags;
> - void *objp;
> + void *ptr;
> + int slab_node = numa_mem_id();
> struct obj_cgroup *objcg = NULL;
> bool init = false;
>
> @@ -3299,21 +3255,49 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
> if (unlikely(!cachep))
> return NULL;
>
> - objp = kfence_alloc(cachep, orig_size, flags);
> - if (unlikely(objp))
> - goto out;
> + ptr = kfence_alloc(cachep, orig_size, flags);
> + if (unlikely(ptr))
> + goto out_hooks;
>
> cache_alloc_debugcheck_before(cachep, flags);
> local_irq_save(save_flags);
> - objp = __do_cache_alloc(cachep, flags);

Looks like after this patch, slab_alloc() (without a node specified)
will not end up in __do_cache_alloc() anymore, so there's no more
possibility of alternate_node_alloc(), which looks like a functional
regression?
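
If so, one way to preserve it might be to special-case the no-preference
case so it still goes through __do_cache_alloc(). Rough, untested sketch;
the exact placement inside slab_alloc_node() is a guess:

	/* untested: keep NUMA_NO_NODE going through __do_cache_alloc(),
	 * which still tries alternate_node_alloc() under mempolicies /
	 * cpuset memory spreading before ____cache_alloc()
	 */
	if (nodeid == NUMA_NO_NODE) {
		ptr = __do_cache_alloc(cachep, flags);
		goto out;
	}
	/* ... existing node_match()/fallback path below ... */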

> +
> + if (node_match(nodeid, slab_node)) {
> + /*
> + * Use the locally cached objects if possible.
> + * However ____cache_alloc does not allow fallback
> + * to other nodes. It may fail while we still have
> + * objects on other nodes available.
> + */
> + ptr = ____cache_alloc(cachep, flags);
> + if (ptr)
> + goto out;
> + }
> +#ifdef CONFIG_NUMA
> + else if (unlikely(!get_node(cachep, nodeid))) {
> + /* Node not bootstrapped yet */
> + ptr = fallback_alloc(cachep, flags);
> + goto out;
> + }
> +
> + /* ___cache_alloc_node can fall back to other nodes */
> + ptr = ____cache_alloc_node(cachep, flags, nodeid);
> +#endif
> +out:
> local_irq_restore(save_flags);
> - objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
> - prefetchw(objp);
> + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
> + prefetchw(ptr);
> init = slab_want_init_on_alloc(flags, cachep);
>
> -out:
> - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init);
> - return objp;
> +out_hooks:
> + slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
> + return ptr;
> +}
> +
> +static __always_inline void *
> +slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
> +{
> + return slab_alloc_node(cachep, flags, NUMA_NO_NODE, orig_size, caller);
> }
>
> /*

2022-03-24 16:43:33

by Hyeonggon Yoo

[permalink] [raw]
Subject: Re: [RFC PATCH v1 01/15] mm/slab: cleanup slab_alloc() and slab_alloc_node()

Vlastimil wrote:
> On 3/8/22 12:41, Hyeonggon Yoo wrote:
> > +
> > static __always_inline void *
> > -slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
> > +slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
> > + unsigned long caller)
> > {
> > unsigned long save_flags;
> > - void *objp;
> > + void *ptr;
> > + int slab_node = numa_mem_id();
> > struct obj_cgroup *objcg = NULL;
> > bool init = false;
> >
> > @@ -3299,21 +3255,49 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
> > if (unlikely(!cachep))
> > return NULL;
> >
> > - objp = kfence_alloc(cachep, orig_size, flags);
> > - if (unlikely(objp))
> > - goto out;
> > + ptr = kfence_alloc(cachep, orig_size, flags);
> > + if (unlikely(ptr))
> > + goto out_hooks;
> >
> > cache_alloc_debugcheck_before(cachep, flags);
> > local_irq_save(save_flags);
> > - objp = __do_cache_alloc(cachep, flags);
>
> Looks like after this patch, slab_alloc() (without a node specified)
> will not end up in __do_cache_alloc() anymore, so there's no more
> possibility of alternate_node_alloc(), which looks like a functional
> regression?
>

Ah, that was not intended. Thank you for catching this!
Will fix in v2.

Thank you so much.

> > +
> > + if (node_match(nodeid, slab_node)) {
> > + /*
> > + * Use the locally cached objects if possible.
> > + * However ____cache_alloc does not allow fallback
> > + * to other nodes. It may fail while we still have
> > + * objects on other nodes available.
> > + */
> > + ptr = ____cache_alloc(cachep, flags);
> > + if (ptr)
> > + goto out;
> > + }
> > +#ifdef CONFIG_NUMA
> > + else if (unlikely(!get_node(cachep, nodeid))) {
> > + /* Node not bootstrapped yet */
> > + ptr = fallback_alloc(cachep, flags);
> > + goto out;
> > + }
> > +
> > + /* ___cache_alloc_node can fall back to other nodes */
> > + ptr = ____cache_alloc_node(cachep, flags, nodeid);
> > +#endif
> > +out:
> > local_irq_restore(save_flags);
> > - objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
> > - prefetchw(objp);
> > + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
> > + prefetchw(ptr);
> > init = slab_want_init_on_alloc(flags, cachep);
> >
> > -out:
> > - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init);
> > - return objp;
> > +out_hooks:
> > + slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
> > + return ptr;
> > +}
> > +
> > +static __always_inline void *
> > +slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
> > +{
> > + return slab_alloc_node(cachep, flags, NUMA_NO_NODE, orig_size, caller);
> > }
> >
> > /*

--
Thank you, You are awesome!
Hyeonggon :-)

2022-03-25 16:04:09

by Vlastimil Babka

[permalink] [raw]
Subject: Re: [RFC PATCH v1 05/15] mm/slub: move kmalloc_large_node() to slab_common.c

On 3/8/22 12:41, Hyeonggon Yoo wrote:
> In later patch SLAB will also pass requests larger than order-1 page
> to page allocator. Move kmalloc_large_node() to slab_common.c.
>
> Fold kmalloc_large_node_hook() into kmalloc_large_node() as there is
> no other caller.
>
> Move tracepoint in kmalloc_large_node().
>
> Add flag fix code. This exist in kmalloc_large() but omitted in
> kmalloc_large_node().
>
> Signed-off-by: Hyeonggon Yoo <[email protected]>
> ---
> include/linux/slab.h | 3 +++
> mm/slab_common.c | 26 ++++++++++++++++++++++++
> mm/slub.c | 47 ++++----------------------------------------
> 3 files changed, 33 insertions(+), 43 deletions(-)
>

<snip>

>
> @@ -4874,15 +4842,8 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
> struct kmem_cache *s;
> void *ret;
>
> - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
> - ret = kmalloc_large_node(size, gfpflags, node);
> -
> - trace_kmalloc_node(caller, ret,
> - size, PAGE_SIZE << get_order(size),
> - gfpflags, node);

Hmm, this throws away the caller for tracing, so it looks like an
unintended functional change.
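
One way to keep it (rough, untested sketch; kmalloc_large_node_notrace()
is an assumed name, not something this patch adds) would be a variant
without the tracepoint, so the _track_caller path keeps reporting its
own caller:

	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
		/* hypothetical _notrace variant of kmalloc_large_node() */
		ret = kmalloc_large_node_notrace(size, gfpflags, node);

		trace_kmalloc_node(caller, ret, size,
				   PAGE_SIZE << get_order(size),
				   gfpflags, node);
		return ret;
	}
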

> -
> - return ret;
> - }
> + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
> + return kmalloc_large_node(size, gfpflags, node);
>
> s = kmalloc_slab(size, gfpflags);
>

2022-03-25 17:35:06

by Vlastimil Babka

[permalink] [raw]
Subject: Re: [RFC PATCH v1 09/15] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator

On 3/8/22 12:41, Hyeonggon Yoo wrote:
> There is not much benefit for serving large objects in kmalloc().
> Let's pass large requests to page allocator like SLUB for better
> maintenance of common code.
>
> [ [email protected]: Enable and disable irq around free_large_kmalloc().
> Do not lose NUMA locality in __do_kmalloc_node().
> Use folio_slab(folio)->slab_cache instead of virt_to_cache().
> Remove large sizes in __kmalloc_index(). ]

Thanks for the mention but that's generally only done like this if I took
your patch and made those changes myself. But I just suggested them. Small
suggested changes like this are usually just mentioned in e.g. v1->v2
changelogs.

> Signed-off-by: Hyeonggon Yoo <[email protected]>
> ---
> include/linux/slab.h | 23 +++++-----------------
> mm/slab.c | 45 ++++++++++++++++++++++++++++++--------------
> mm/slab.h | 3 +++
> mm/slab_common.c | 25 +++++++++++++++++-------
> mm/slub.c | 19 -------------------
> 5 files changed, 57 insertions(+), 58 deletions(-)
>
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index dfcc8301d969..9ced225a3ea3 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -226,27 +226,17 @@ void kmem_dump_obj(void *object);
>
> #ifdef CONFIG_SLAB
> /*
> - * The largest kmalloc size supported by the SLAB allocators is
> - * 32 megabyte (2^25) or the maximum allocatable page order if that is
> - * less than 32 MB.
> - *
> - * WARNING: Its not easy to increase this value since the allocators have
> - * to do various tricks to work around compiler limitations in order to
> - * ensure proper constant folding.
> + * SLAB and SLUB directly allocates requests fitting in to an order-1 page
> + * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
> */
> -#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
> - (MAX_ORDER + PAGE_SHIFT - 1) : 25)
> -#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH
> +#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
> +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
> #ifndef KMALLOC_SHIFT_LOW
> #define KMALLOC_SHIFT_LOW 5
> #endif
> #endif
>
> #ifdef CONFIG_SLUB
> -/*
> - * SLUB directly allocates requests fitting in to an order-1 page
> - * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
> - */
> #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
> #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
> #ifndef KMALLOC_SHIFT_LOW
> @@ -398,10 +388,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
> if (size <= 512 * 1024) return 19;
> if (size <= 1024 * 1024) return 20;
> if (size <= 2 * 1024 * 1024) return 21;
> - if (size <= 4 * 1024 * 1024) return 22;
> - if (size <= 8 * 1024 * 1024) return 23;
> - if (size <= 16 * 1024 * 1024) return 24;
> - if (size <= 32 * 1024 * 1024) return 25;
>
> if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
> BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
> @@ -411,6 +397,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
> /* Will never be reached. Needed because the compiler may complain */
> return -1;
> }
> +static_assert(PAGE_SHIFT <= 20);
> #define kmalloc_index(s) __kmalloc_index(s, true)
> #endif /* !CONFIG_SLOB */
>
> diff --git a/mm/slab.c b/mm/slab.c
> index 6ebf509bf2de..f0041f0125ba 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -3568,7 +3568,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
> void *ret;
>
> if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
> - return NULL;
> + return kmalloc_large_node(size, flags, node);

Similar issue with caller not traced.

> cachep = kmalloc_slab(size, flags);
> if (unlikely(ZERO_OR_NULL_PTR(cachep)))
> return cachep;
> @@ -3642,15 +3642,25 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
> {
> struct kmem_cache *s;
> size_t i;
> + struct folio *folio;
>
> local_irq_disable();
> for (i = 0; i < size; i++) {
> void *objp = p[i];
>
> - if (!orig_s) /* called via kfree_bulk */
> - s = virt_to_cache(objp);
> - else
> + if (!orig_s) {
> + folio = virt_to_folio(objp);
> + /* called via kfree_bulk */
> + if (!folio_test_slab(folio)) {
> + local_irq_enable();
> + free_large_kmalloc(folio, objp);
> + local_irq_disable();
> + continue;
> + }
> + s = folio_slab(folio)->slab_cache;
> + } else
> s = cache_from_obj(orig_s, objp);
> +
> if (!s)
> continue;
>
> @@ -3679,20 +3689,25 @@ void kfree(const void *objp)
> {
> struct kmem_cache *c;
> unsigned long flags;
> + struct folio *folio;
> + void *x = (void *) objp;

I think you don't need to add 'x', just do the cast while calling
free_large_kmalloc(), same as done for __cache_free().
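
I.e. roughly:

	folio = virt_to_folio(objp);
	if (!folio_test_slab(folio)) {
		free_large_kmalloc(folio, (void *)objp);
		return;
	}
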

>
> trace_kfree(_RET_IP_, objp);
>
> if (unlikely(ZERO_OR_NULL_PTR(objp)))
> return;
> - local_irq_save(flags);
> - kfree_debugcheck(objp);
> - c = virt_to_cache(objp);
> - if (!c) {
> - local_irq_restore(flags);
> +
> + folio = virt_to_folio(objp);
> + if (!folio_test_slab(folio)) {
> + free_large_kmalloc(folio, x);
> return;
> }
> - debug_check_no_locks_freed(objp, c->object_size);
>
> + c = folio_slab(folio)->slab_cache;
> +
> + local_irq_save(flags);
> + kfree_debugcheck(objp);
> + debug_check_no_locks_freed(objp, c->object_size);
> debug_check_no_obj_freed(objp, c->object_size);
> __cache_free(c, (void *)objp, _RET_IP_);
> local_irq_restore(flags);

2022-04-22 20:12:26

by Hyeonggon Yoo

[permalink] [raw]
Subject: Re: [RFC PATCH v1 09/15] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator

On Thu, Mar 24, 2022 at 07:08:27PM +0100, Vlastimil Babka wrote:
> On 3/8/22 12:41, Hyeonggon Yoo wrote:
> > There is not much benefit for serving large objects in kmalloc().
> > Let's pass large requests to page allocator like SLUB for better
> > maintenance of common code.
> >
> > [ [email protected]: Enable and disable irq around free_large_kmalloc().
> > Do not lose NUMA locality in __do_kmalloc_node().
> > Use folio_slab(folio)->slab_cache instead of virt_to_cache().
> > Remove large sizes in __kmalloc_index(). ]

A bit late to reply but better late than never...

>
> Thanks for the mention but that's generally only done like this if I took
> your patch and made those changes myself. But I just suggested them. Small
> suggested changes like this are usually just mentioned in e.g. v1->v2
> changelogs.

Ah, okay. I didn't know about the convention. thanks for letting me
know!

> > Signed-off-by: Hyeonggon Yoo <[email protected]>
> > ---
> > include/linux/slab.h | 23 +++++-----------------
> > mm/slab.c | 45 ++++++++++++++++++++++++++++++--------------
> > mm/slab.h | 3 +++
> > mm/slab_common.c | 25 +++++++++++++++++-------
> > mm/slub.c | 19 -------------------
> > 5 files changed, 57 insertions(+), 58 deletions(-)
> >
> > diff --git a/include/linux/slab.h b/include/linux/slab.h
> > index dfcc8301d969..9ced225a3ea3 100644
> > --- a/include/linux/slab.h
> > +++ b/include/linux/slab.h
> > @@ -226,27 +226,17 @@ void kmem_dump_obj(void *object);
> >
> > #ifdef CONFIG_SLAB
> > /*
> > - * The largest kmalloc size supported by the SLAB allocators is
> > - * 32 megabyte (2^25) or the maximum allocatable page order if that is
> > - * less than 32 MB.
> > - *
> > - * WARNING: Its not easy to increase this value since the allocators have
> > - * to do various tricks to work around compiler limitations in order to
> > - * ensure proper constant folding.
> > + * SLAB and SLUB directly allocates requests fitting in to an order-1 page
> > + * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
> > */
> > -#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
> > - (MAX_ORDER + PAGE_SHIFT - 1) : 25)
> > -#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH
> > +#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
> > +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
> > #ifndef KMALLOC_SHIFT_LOW
> > #define KMALLOC_SHIFT_LOW 5
> > #endif
> > #endif
> >
> > #ifdef CONFIG_SLUB
> > -/*
> > - * SLUB directly allocates requests fitting in to an order-1 page
> > - * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
> > - */
> > #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
> > #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1)
> > #ifndef KMALLOC_SHIFT_LOW
> > @@ -398,10 +388,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
> > if (size <= 512 * 1024) return 19;
> > if (size <= 1024 * 1024) return 20;
> > if (size <= 2 * 1024 * 1024) return 21;
> > - if (size <= 4 * 1024 * 1024) return 22;
> > - if (size <= 8 * 1024 * 1024) return 23;
> > - if (size <= 16 * 1024 * 1024) return 24;
> > - if (size <= 32 * 1024 * 1024) return 25;
> >
> > if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
> > BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
> > @@ -411,6 +397,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
> > /* Will never be reached. Needed because the compiler may complain */
> > return -1;
> > }
> > +static_assert(PAGE_SHIFT <= 20);
> > #define kmalloc_index(s) __kmalloc_index(s, true)
> > #endif /* !CONFIG_SLOB */
> >
> > diff --git a/mm/slab.c b/mm/slab.c
> > index 6ebf509bf2de..f0041f0125ba 100644
> > --- a/mm/slab.c
> > +++ b/mm/slab.c
> > @@ -3568,7 +3568,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
> > void *ret;
> >
> > if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
> > - return NULL;
> > + return kmalloc_large_node(size, flags, node);
>
> Similar issue with caller not traced.
>

Actually I moved the tracepoint into kmalloc_large_node(), but I think
the real problem was that I wrote the patches in a way that is hard to
review.

In v2 I split some patches to make them more reviewable. Thanks!!
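
For reference, the moved helper ends up roughly like this (reconstructed
sketch, not the exact v2 code; the body is not shown in the quoted diff).
It also shows why the original caller gets lost: the trace records
kmalloc_large_node()'s immediate caller via _RET_IP_ instead of the
caller passed to __kmalloc_node_track_caller():

void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
	struct page *page;
	void *ptr = NULL;
	unsigned int order = get_order(size);

	if (unlikely(flags & GFP_SLAB_BUG_MASK))
		flags = kmalloc_fix_flags(flags);

	flags |= __GFP_COMP;
	page = alloc_pages_node(node, flags, order);
	if (page) {
		ptr = page_address(page);
		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
				      PAGE_SIZE << order);
	}

	ptr = kasan_kmalloc_large(ptr, size, flags);
	kmemleak_alloc(ptr, size, 1, flags);
	trace_kmalloc_node(_RET_IP_, ptr, size, PAGE_SIZE << order,
			   flags, node);
	return ptr;
}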

> > cachep = kmalloc_slab(size, flags);
> > if (unlikely(ZERO_OR_NULL_PTR(cachep)))
> > return cachep;
> > @@ -3642,15 +3642,25 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
> > {
> > struct kmem_cache *s;
> > size_t i;
> > + struct folio *folio;
> >
> > local_irq_disable();
> > for (i = 0; i < size; i++) {
> > void *objp = p[i];
> >
> > - if (!orig_s) /* called via kfree_bulk */
> > - s = virt_to_cache(objp);
> > - else
> > + if (!orig_s) {
> > + folio = virt_to_folio(objp);
> > + /* called via kfree_bulk */
> > + if (!folio_test_slab(folio)) {
> > + local_irq_enable();
> > + free_large_kmalloc(folio, objp);
> > + local_irq_disable();
> > + continue;
> > + }
> > + s = folio_slab(folio)->slab_cache;
> > + } else
> > s = cache_from_obj(orig_s, objp);
> > +
> > if (!s)
> > continue;
> >
> > @@ -3679,20 +3689,25 @@ void kfree(const void *objp)
> > {
> > struct kmem_cache *c;
> > unsigned long flags;
> > + struct folio *folio;
> > + void *x = (void *) objp;
>
> I think you don't need to add 'x', just do the cast while calling
> free_large_kmalloc(), same as done for __cache_free().
>

In fact SLUB's kfree() also defines x, but your suggestion sounds better.
Anyway, I did it in v2. Thanks!

> >
> > trace_kfree(_RET_IP_, objp);
> >
> > if (unlikely(ZERO_OR_NULL_PTR(objp)))
> > return;
> > - local_irq_save(flags);
> > - kfree_debugcheck(objp);
> > - c = virt_to_cache(objp);
> > - if (!c) {
> > - local_irq_restore(flags);
> > +
> > + folio = virt_to_folio(objp);
> > + if (!folio_test_slab(folio)) {
> > + free_large_kmalloc(folio, x);
> > return;
> > }
> > - debug_check_no_locks_freed(objp, c->object_size);
> >
> > + c = folio_slab(folio)->slab_cache;
> > +
> > + local_irq_save(flags);
> > + kfree_debugcheck(objp);
> > + debug_check_no_locks_freed(objp, c->object_size);
> > debug_check_no_obj_freed(objp, c->object_size);
> > __cache_free(c, (void *)objp, _RET_IP_);
> > local_irq_restore(flags);

--
Thanks,
Hyeonggon