Hi,
The previous version (RFC, no cover letter) is here:
https://lore.kernel.org/all/[email protected]/
Git branch is here:
https://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git/log/?h=slab/for-6.2/fit_rcu_head
(also in linux-next since late last week)
The rationale for doing all this is in patch 3 - I hope there are still
plans for the rcu_head debugging, Joel?
The previous version was in linux-next, where it triggered crash reports
because it caused false-positive __PageMovable() tests. There were several
attempts to deal with that, as explained in patch 2, which is an updated
version of one of those attempts. Patch 2 hasn't been formally posted and
reviewed yet, hence this posting.
Thanks,
Vlastimil
Vlastimil Babka (3):
mm/slub: perform free consistency checks before call_rcu
mm/migrate: make isolate_movable_page() skip slab pages
mm/sl[au]b: rearrange struct slab fields to allow larger rcu_head
mm/migrate.c | 15 ++++++++++++---
mm/slab.c | 6 +++++-
mm/slab.h | 54 +++++++++++++++++++++++++++++++---------------------
mm/slub.c | 26 ++++++++++++++-----------
4 files changed, 64 insertions(+), 37 deletions(-)
--
2.38.0
Joel reports [1] that increasing the rcu_head size for debugging
purposes used to work before struct slab was split from struct page, but
now runs into the various SLAB_MATCH() sanity checks of the layout.
This is because the rcu_head in struct page is in union with large
sub-structures and has space to grow without exceeding their size, while
in struct slab (for SLAB and SLUB) it's in union only with a list_head.
On closer inspection (and after the earlier patch that moves the free
consistency checks before call_rcu) we can put all fields except
slab_cache into a union with rcu_head, as slab_cache is sufficient for
the rcu freeing callbacks to work and the rest can be overwritten by
rcu_head without causing issues.
This is only somewhat complicated by the need to keep SLUB's
freelist+counters aligned for cmpxchg_double. As a result the fields
need to be reordered so that slab_cache is first (after page flags) and
the union with rcu_head follows. For consistency, do that for SLAB as
well, although it is not necessary there.
As a result, the rcu_head fields in struct page and struct slab are no
longer at the same offset, but that doesn't matter as there is no
casting in the slab freeing callbacks that would rely on it, so we can
just drop the respective SLAB_MATCH() check.
Also we need to update the SLAB_MATCH() for compound_head to reflect the
new ordering.
While at it, also add a static_assert to check the alignment needed for
cmpxchg_double so mistakes are found sooner than a runtime GPF.
[1] https://lore.kernel.org/all/[email protected]/
Reported-by: Joel Fernandes <[email protected]>
Signed-off-by: Vlastimil Babka <[email protected]>
Acked-by: Hyeonggon Yoo <[email protected]>
---
mm/slab.h | 54 ++++++++++++++++++++++++++++++++----------------------
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/mm/slab.h b/mm/slab.h
index 0202a8c2f0d2..b373952eef70 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -11,37 +11,43 @@ struct slab {
#if defined(CONFIG_SLAB)
+ struct kmem_cache *slab_cache;
union {
- struct list_head slab_list;
+ struct {
+ struct list_head slab_list;
+ void *freelist; /* array of free object indexes */
+ void *s_mem; /* first object */
+ };
struct rcu_head rcu_head;
};
- struct kmem_cache *slab_cache;
- void *freelist; /* array of free object indexes */
- void *s_mem; /* first object */
unsigned int active;
#elif defined(CONFIG_SLUB)
- union {
- struct list_head slab_list;
- struct rcu_head rcu_head;
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- struct {
- struct slab *next;
- int slabs; /* Nr of slabs left */
- };
-#endif
- };
struct kmem_cache *slab_cache;
- /* Double-word boundary */
- void *freelist; /* first free object */
union {
- unsigned long counters;
struct {
- unsigned inuse:16;
- unsigned objects:15;
- unsigned frozen:1;
+ union {
+ struct list_head slab_list;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct {
+ struct slab *next;
+ int slabs; /* Nr of slabs left */
+ };
+#endif
+ };
+ /* Double-word boundary */
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
};
+ struct rcu_head rcu_head;
};
unsigned int __unused;
@@ -66,9 +72,10 @@ struct slab {
#define SLAB_MATCH(pg, sl) \
static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
SLAB_MATCH(flags, __page_flags);
-SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */
#ifndef CONFIG_SLOB
-SLAB_MATCH(rcu_head, rcu_head);
+SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */
+#else
+SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */
#endif
SLAB_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
@@ -76,6 +83,9 @@ SLAB_MATCH(memcg_data, memcg_data);
#endif
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB)
+static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *)));
+#endif
/**
* folio_slab - Converts from folio to slab.
--
2.38.0
For SLAB_TYPESAFE_BY_RCU caches we use call_rcu to perform empty slab
freeing. The rcu callback rcu_free_slab() calls __free_slab(), which
currently includes checking the slab consistency for caches with the
SLAB_CONSISTENCY_CHECKS flag. This check needs the slab->objects field
to be intact.
Because a later patch in this series will allow rcu_head in struct slab
to become larger in debug configurations, and thus potentially overwrite
more fields through a union than just slab_list, we want to limit the
fields used in rcu_free_slab(). Thus move the consistency checks to
free_slab() before call_rcu(). This can be done safely even for
SLAB_TYPESAFE_BY_RCU caches where accesses to the objects can still
occur after freeing them.
As a result, only the slab->slab_cache field has to be physically
separate from rcu_head for the freeing callback to work. We also save
some cycles in the rcu callback for caches with consistency checks
enabled.
Signed-off-by: Vlastimil Babka <[email protected]>
Reviewed-by: Hyeonggon Yoo <[email protected]>
---
mm/slub.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 157527d7101b..99ba865afc4a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1999,14 +1999,6 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
int order = folio_order(folio);
int pages = 1 << order;
- if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
- void *p;
-
- slab_pad_check(s, slab);
- for_each_object(p, s, slab_address(slab), slab->objects)
- check_object(s, slab, p, SLUB_RED_INACTIVE);
- }
-
__slab_clear_pfmemalloc(slab);
__folio_clear_slab(folio);
folio->mapping = NULL;
@@ -2025,9 +2017,17 @@ static void rcu_free_slab(struct rcu_head *h)
static void free_slab(struct kmem_cache *s, struct slab *slab)
{
- if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
+ void *p;
+
+ slab_pad_check(s, slab);
+ for_each_object(p, s, slab_address(slab), slab->objects)
+ check_object(s, slab, p, SLUB_RED_INACTIVE);
+ }
+
+ if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
call_rcu(&slab->rcu_head, rcu_free_slab);
- } else
+ else
__free_slab(s, slab);
}
--
2.38.0