2022-05-29 09:40:13

by Rongwei Wang

Subject: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

In use cases that allocate and free slab objects frequently, error
messages such as "Left Redzone overwritten" or "First byte 0xbb
instead of 0xcc" can be printed while validating slabs. That is
because an object has already been poisoned with SLAB_RED_INACTIVE
but has not yet been added to the slab's freelist, and slab
validation can run in the window between these two states.

This does not mean the slab cannot work correctly, but these
confusing messages disturb slab debugging.
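Roughly, the interleaving that produces these reports is (CPU A frees
an object from a debug cache while CPU B runs validation):

    CPU A (slab_free)                  CPU B ('slabinfo -v')
    --------------------------------   --------------------------------
    free_debug_processing()
      init_object(SLAB_RED_INACTIVE)
      /* locks dropped: object is
         poisoned as inactive but is
         not yet on slab->freelist */
                                       validate_slab_node()
                                         takes n->list_lock
                                         validate_slab() checks the
                                         object as active (it is not
                                         on the freelist) and reports
                                         "Left Redzone overwritten"
    __slab_free()
      cmpxchg_double_slab()
      /* object finally linked into
         the freelist */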

Signed-off-by: Rongwei Wang <[email protected]>
---
mm/slub.c | 40 +++++++++++++++++-----------------------
1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index ed5c2c03a47a..310e56d99116 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
void *head, void *tail, int bulk_cnt,
unsigned long addr)
{
- struct kmem_cache_node *n = get_node(s, slab_nid(slab));
void *object = head;
int cnt = 0;
- unsigned long flags, flags2;
+ unsigned long flags;
int ret = 0;

- spin_lock_irqsave(&n->list_lock, flags);
- slab_lock(slab, &flags2);
-
+ slab_lock(slab, &flags);
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
if (!check_slab(s, slab))
goto out;
@@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
bulk_cnt, cnt);

- slab_unlock(slab, &flags2);
- spin_unlock_irqrestore(&n->list_lock, flags);
+ slab_unlock(slab, &flags);
if (!ret)
slab_fix(s, "Object at 0x%p not freed", object);
return ret;
@@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,

{
void *prior;
- int was_frozen;
+ int was_frozen, to_take_off = 0;
struct slab new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
@@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
if (kfence_free(head))
return;

+ n = get_node(s, slab_nid(slab));
+ spin_lock_irqsave(&n->list_lock, flags);
+
if (kmem_cache_debug(s) &&
- !free_debug_processing(s, slab, head, tail, cnt, addr))
+ !free_debug_processing(s, slab, head, tail, cnt, addr)) {
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
return;
+ }

do {
- if (unlikely(n)) {
- spin_unlock_irqrestore(&n->list_lock, flags);
- n = NULL;
- }
+ if (unlikely(to_take_off))
+ to_take_off = 0;
prior = slab->freelist;
counters = slab->counters;
set_freepointer(s, tail, prior);
@@ -3343,18 +3343,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
new.frozen = 1;

} else { /* Needs to be taken off a list */
-
- n = get_node(s, slab_nid(slab));
/*
- * Speculatively acquire the list_lock.
* If the cmpxchg does not succeed then we may
- * drop the list_lock without any processing.
- *
- * Otherwise the list_lock will synchronize with
- * other processors updating the list of slabs.
+ * drop this behavior without any processing.
*/
- spin_lock_irqsave(&n->list_lock, flags);
-
+ to_take_off = 1;
}
}

@@ -3363,8 +3356,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
head, new.counters,
"__slab_free"));

- if (likely(!n)) {
+ if (likely(!to_take_off)) {

+ spin_unlock_irqrestore(&n->list_lock, flags);
if (likely(was_frozen)) {
/*
* The list lock was not taken therefore no list
--
2.27.0



2022-05-29 09:40:13

by Rongwei Wang

Subject: [PATCH 3/3] mm/slub: add nr_full count for debugging slub

n->nr_slabs is updated when a slab is actually allocated or freed,
but such a slab is not necessarily on the node's full or partial
list; it may, for example, still be in use as a per-CPU slab or sit
on a per-CPU partial list. That means the total number of slabs on a
node's full and partial lists is not necessarily equal to n->nr_slabs,
even though flush_all() has been called.

As an example, error messages like the following are printed when
'slabinfo -v' is executed:

SLUB: kmemleak_object 4157 slabs counted but counter=4161
SLUB: kmemleak_object 4072 slabs counted but counter=4077
SLUB: kmalloc-2k 19 slabs counted but counter=20
SLUB: kmalloc-2k 12 slabs counted but counter=13
SLUB: kmemleak_object 4205 slabs counted but counter=4209

Here, nr_full is introduced in kmem_cache_node and used instead of
nr_slabs when validating the full list, which eliminates these
confusing messages.

Signed-off-by: Rongwei Wang <[email protected]>
---
mm/slab.h | 1 +
mm/slub.c | 33 +++++++++++++++++++++++++++++++--
2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index 95eb34174c1b..b1190e41a243 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -782,6 +782,7 @@ struct kmem_cache_node {
unsigned long nr_partial;
struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
+ unsigned long nr_full;
atomic_long_t nr_slabs;
atomic_long_t total_objects;
struct list_head full;
diff --git a/mm/slub.c b/mm/slub.c
index bffb95bbb0ee..99e980c8295c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1220,6 +1220,9 @@ static void add_full(struct kmem_cache *s,

lockdep_assert_held(&n->list_lock);
list_add(&slab->slab_list, &n->full);
+#ifdef CONFIG_SLUB_DEBUG
+ n->nr_full++;
+#endif
}

static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
@@ -1229,6 +1232,9 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct

lockdep_assert_held(&n->list_lock);
list_del(&slab->slab_list);
+#ifdef CONFIG_SLUB_DEBUG
+ n->nr_full--;
+#endif
}

/* Tracking of the number of slabs for debugging purposes */
@@ -3880,6 +3886,7 @@ init_kmem_cache_node(struct kmem_cache_node *n)
INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
atomic_long_set(&n->nr_slabs, 0);
+ n->nr_full = 0;
atomic_long_set(&n->total_objects, 0);
INIT_LIST_HEAD(&n->full);
#endif
@@ -4994,9 +5001,30 @@ static int validate_slab_node(struct kmem_cache *s,
unsigned long count = 0;
struct slab *slab;
unsigned long flags;
+ unsigned long nr_cpu_slab = 0, nr_cpu_partial = 0;
+ int cpu;

spin_lock_irqsave(&n->list_lock, flags);

+ for_each_possible_cpu(cpu) {
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ struct slab *slab;
+
+ slab = READ_ONCE(c->slab);
+ if (slab && n == get_node(s, slab_nid(slab)))
+ nr_cpu_slab += 1;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ slab = slub_percpu_partial_read_once(c);
+ if (slab && n == get_node(s, slab_nid(slab)))
+ nr_cpu_partial += slab->slabs;
+#endif
+ }
+ if (nr_cpu_slab || nr_cpu_partial) {
+ pr_err("SLUB %s: %ld cpu slabs and %ld cpu partial slabs counted\n",
+ s->name, nr_cpu_slab, nr_cpu_partial);
+ slab_add_kunit_errors();
+ }
+
list_for_each_entry(slab, &n->partial, slab_list) {
validate_slab(s, slab, obj_map);
count++;
@@ -5010,13 +5038,14 @@ static int validate_slab_node(struct kmem_cache *s,
if (!(s->flags & SLAB_STORE_USER))
goto out;

+ count = 0;
list_for_each_entry(slab, &n->full, slab_list) {
validate_slab(s, slab, obj_map);
count++;
}
- if (count != atomic_long_read(&n->nr_slabs)) {
+ if (count != n->nr_full) {
pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
- s->name, count, atomic_long_read(&n->nr_slabs));
+ s->name, count, n->nr_full);
slab_add_kunit_errors();
}

--
2.27.0


2022-05-29 10:53:41

by Rongwei Wang

Subject: [PATCH 2/3] mm/slub: improve consistency of nr_slabs count

Currently, discard_slab() can change the nr_slabs count without
holding the node's list_lock. This leads to error messages being
printed when scanning the node's partial or full lists, e.g. when
validating all slabs. In effect, it breaks the consistency of the
nr_slabs count.

Here, discard_slab() is removed, and dec_slabs_node() is called
before releasing the node's list_lock. dec_slabs_node() and
free_slab() can then be called separately to keep the nr_slabs
count consistent.

Signed-off-by: Rongwei Wang <[email protected]>
---
mm/slub.c | 26 ++++++++++++++------------
1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 310e56d99116..bffb95bbb0ee 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2039,12 +2039,6 @@ static void free_slab(struct kmem_cache *s, struct slab *slab)
__free_slab(s, slab);
}

-static void discard_slab(struct kmem_cache *s, struct slab *slab)
-{
- dec_slabs_node(s, slab_nid(slab), slab->objects);
- free_slab(s, slab);
-}
-
/*
* Management of partially allocated slabs.
*/
@@ -2413,6 +2407,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,

if (!new.inuse && n->nr_partial >= s->min_partial) {
mode = M_FREE;
+ spin_lock_irqsave(&n->list_lock, flags);
} else if (new.freelist) {
mode = M_PARTIAL;
/*
@@ -2437,7 +2432,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab")) {
- if (mode == M_PARTIAL || mode == M_FULL)
+ if (mode != M_FULL_NOLIST)
spin_unlock_irqrestore(&n->list_lock, flags);
goto redo;
}
@@ -2449,7 +2444,10 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
stat(s, tail);
} else if (mode == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
- discard_slab(s, slab);
+ dec_slabs_node(s, slab_nid(slab), slab->objects);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ free_slab(s, slab);
stat(s, FREE_SLAB);
} else if (mode == M_FULL) {
add_full(s, n, slab);
@@ -2502,6 +2500,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
slab->next = slab_to_discard;
slab_to_discard = slab;
+ dec_slabs_node(s, slab_nid(slab), slab->objects);
} else {
add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
@@ -2516,7 +2515,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
slab_to_discard = slab_to_discard->next;

stat(s, DEACTIVATE_EMPTY);
- discard_slab(s, slab);
+ free_slab(s, slab);
stat(s, FREE_SLAB);
}
}
@@ -3404,9 +3403,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
remove_full(s, n, slab);
}

+ dec_slabs_node(s, slab_nid(slab), slab->objects);
spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
- discard_slab(s, slab);
+ free_slab(s, slab);
}

/*
@@ -4265,6 +4265,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
if (!slab->inuse) {
remove_partial(n, slab);
list_add(&slab->slab_list, &discard);
+ dec_slabs_node(s, slab_nid(slab), slab->objects);
} else {
list_slab_objects(s, slab,
"Objects remaining in %s on __kmem_cache_shutdown()");
@@ -4273,7 +4274,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
spin_unlock_irq(&n->list_lock);

list_for_each_entry_safe(slab, h, &discard, slab_list)
- discard_slab(s, slab);
+ free_slab(s, slab);
}

bool __kmem_cache_empty(struct kmem_cache *s)
@@ -4595,6 +4596,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
if (free == slab->objects) {
list_move(&slab->slab_list, &discard);
n->nr_partial--;
+ dec_slabs_node(s, slab_nid(slab), slab->objects);
} else if (free <= SHRINK_PROMOTE_MAX)
list_move(&slab->slab_list, promote + free - 1);
}
@@ -4610,7 +4612,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)

/* Release empty slabs */
list_for_each_entry_safe(slab, t, &discard, slab_list)
- discard_slab(s, slab);
+ free_slab(s, slab);

if (slabs_node(s, node))
ret = 1;
--
2.27.0


2022-05-29 16:21:43

by Hyeonggon Yoo

Subject: Re: [PATCH 2/3] mm/slub: improve consistency of nr_slabs count

On Sun, May 29, 2022 at 04:15:34PM +0800, Rongwei Wang wrote:
> Currently, discard_slab() can change nr_slabs count
> without holding node's list_lock. This will lead some
> error messages print when scanning node's partial or
> full list, e.g. validate all slabs. Literally, it
> affects the consistency of nr_slabs count.

The validate attribute is for debugging purposes.

I did not take much time on this series, but my quick impression is
that it would be better to deprecate the attribute than to make it
work by affecting the allocation/free paths.

Thanks,
Hyeonggon.

2022-05-30 06:24:03

by Hyeonggon Yoo

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote:
> In use cases where allocating and freeing slab frequently, some
> error messages, such as "Left Redzone overwritten", "First byte
> 0xbb instead of 0xcc" would be printed when validating slabs.
> That's because an object has been filled with SLAB_RED_INACTIVE,
> but has not been added to slab's freelist. And between these
> two states, the behaviour of validating slab is likely to occur.
>
> Actually, it doesn't mean the slab can not work stably. But, these
> confusing messages will disturb slab debugging more or less.
>
> Signed-off-by: Rongwei Wang <[email protected]>

Have you observed it, or is it from code inspection?

> ---
> mm/slub.c | 40 +++++++++++++++++-----------------------
> 1 file changed, 17 insertions(+), 23 deletions(-)
>
> diff --git a/mm/slub.c b/mm/slub.c
> index ed5c2c03a47a..310e56d99116 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
> void *head, void *tail, int bulk_cnt,
> unsigned long addr)
> {
> - struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> void *object = head;
> int cnt = 0;
> - unsigned long flags, flags2;
> + unsigned long flags;
> int ret = 0;
>
> - spin_lock_irqsave(&n->list_lock, flags);
> - slab_lock(slab, &flags2);
> -
> + slab_lock(slab, &flags);
> if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> if (!check_slab(s, slab))
> goto out;
> @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
> slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
> bulk_cnt, cnt);
>
> - slab_unlock(slab, &flags2);
> - spin_unlock_irqrestore(&n->list_lock, flags);
> + slab_unlock(slab, &flags);
> if (!ret)
> slab_fix(s, "Object at 0x%p not freed", object);
> return ret;
> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>
> {
> void *prior;
> - int was_frozen;
> + int was_frozen, to_take_off = 0;
> struct slab new;
> unsigned long counters;
> struct kmem_cache_node *n = NULL;
> @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> if (kfence_free(head))
> return;
>
> + n = get_node(s, slab_nid(slab));
> + spin_lock_irqsave(&n->list_lock, flags);
> +

Oh please don't do this.

The SLUB free slowpath can be hit a lot depending on the workload.

__slab_free() tries its best not to take n->list_lock; it currently
takes n->list_lock only when the slab needs to be taken off a list.

Unconditionally taking n->list_lock will degrade performance.
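For reference, the loop in question currently looks roughly like this
(condensed and slightly simplified from the hunks quoted in this patch):

	do {
		if (unlikely(n)) {
			/* speculative lock from a previous iteration */
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}
		prior = slab->freelist;
		counters = slab->counters;
		set_freepointer(s, tail, prior);
		new.counters = counters;
		was_frozen = new.frozen;
		new.inuse -= cnt;
		if ((!new.inuse || !prior) && !was_frozen) {
			if (kmem_cache_has_cpu_partial(s) && !prior) {
				/* will go to the per-cpu partial list */
				new.frozen = 1;
			} else {
				/* must come off a list: only now take list_lock */
				n = get_node(s, slab_nid(slab));
				spin_lock_irqsave(&n->list_lock, flags);
			}
		}
	} while (!cmpxchg_double_slab(s, slab, prior, counters,
				      head, new.counters, "__slab_free"));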

> if (kmem_cache_debug(s) &&
> - !free_debug_processing(s, slab, head, tail, cnt, addr))
> + !free_debug_processing(s, slab, head, tail, cnt, addr)) {
> +
> + spin_unlock_irqrestore(&n->list_lock, flags);
> return;
> + }
>
> do {
> - if (unlikely(n)) {
> - spin_unlock_irqrestore(&n->list_lock, flags);
> - n = NULL;
> - }
> + if (unlikely(to_take_off))
> + to_take_off = 0;
> prior = slab->freelist;
> counters = slab->counters;
> set_freepointer(s, tail, prior);
> @@ -3343,18 +3343,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> new.frozen = 1;
>
> } else { /* Needs to be taken off a list */
> -
> - n = get_node(s, slab_nid(slab));
> /*
> - * Speculatively acquire the list_lock.
> * If the cmpxchg does not succeed then we may
> - * drop the list_lock without any processing.
> - *
> - * Otherwise the list_lock will synchronize with
> - * other processors updating the list of slabs.
> + * drop this behavior without any processing.
> */
> - spin_lock_irqsave(&n->list_lock, flags);
> -
> + to_take_off = 1;
> }
> }
>
> @@ -3363,8 +3356,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> head, new.counters,
> "__slab_free"));
>
> - if (likely(!n)) {
> + if (likely(!to_take_off)) {
>
> + spin_unlock_irqrestore(&n->list_lock, flags);
> if (likely(was_frozen)) {
> /*
> * The list lock was not taken therefore no list
>
> --
> 2.27.0
>

2022-06-01 18:45:14

by Muchun Song

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Sun, May 29, 2022 at 11:37:06AM +0000, Hyeonggon Yoo wrote:
> On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote:
> > In use cases where allocating and freeing slab frequently, some
> > error messages, such as "Left Redzone overwritten", "First byte
> > 0xbb instead of 0xcc" would be printed when validating slabs.
> > That's because an object has been filled with SLAB_RED_INACTIVE,
> > but has not been added to slab's freelist. And between these
> > two states, the behaviour of validating slab is likely to occur.
> >
> > Actually, it doesn't mean the slab can not work stably. But, these
> > confusing messages will disturb slab debugging more or less.
> >
> > Signed-off-by: Rongwei Wang <[email protected]>
>
> Have you observed it or it's from code inspection?
>
> > ---
> > mm/slub.c | 40 +++++++++++++++++-----------------------
> > 1 file changed, 17 insertions(+), 23 deletions(-)
> >
> > diff --git a/mm/slub.c b/mm/slub.c
> > index ed5c2c03a47a..310e56d99116 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
> > void *head, void *tail, int bulk_cnt,
> > unsigned long addr)
> > {
> > - struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> > void *object = head;
> > int cnt = 0;
> > - unsigned long flags, flags2;
> > + unsigned long flags;
> > int ret = 0;
> >
> > - spin_lock_irqsave(&n->list_lock, flags);
> > - slab_lock(slab, &flags2);
> > -
> > + slab_lock(slab, &flags);
> > if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> > if (!check_slab(s, slab))
> > goto out;
> > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
> > slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
> > bulk_cnt, cnt);
> >
> > - slab_unlock(slab, &flags2);
> > - spin_unlock_irqrestore(&n->list_lock, flags);
> > + slab_unlock(slab, &flags);
> > if (!ret)
> > slab_fix(s, "Object at 0x%p not freed", object);
> > return ret;
> > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> >
> > {
> > void *prior;
> > - int was_frozen;
> > + int was_frozen, to_take_off = 0;
> > struct slab new;
> > unsigned long counters;
> > struct kmem_cache_node *n = NULL;
> > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> > if (kfence_free(head))
> > return;
> >
> > + n = get_node(s, slab_nid(slab));
> > + spin_lock_irqsave(&n->list_lock, flags);
> > +
>
> Oh please don't do this.
>
> SLUB free slowpath can be hit a lot depending on workload.
>
> __slab_free() try its best not to take n->list_lock. currently takes n->list_lock
> only when the slab need to be taken from list.
>
> Unconditionally taking n->list_lock will degrade performance.
>

I can confirm you are right. We have encountered this issue in practice.
We have deployed some HDFS instances on a 256-CPU machine. When there
are lots of IO requests from users, we can see lots of threads contending
on n->list_lock. Lots of the call traces look like the following:

ffffffff810dfbb4 native_queued_spin_lock_slowpath+0x1a4
ffffffff81780ffb _raw_spin_lock+0x1b
ffffffff8127327e get_partial_node.isra.81+0x5e
ffffffff812752d3 ___slab_alloc+0x2f3
ffffffff8127559c __slab_alloc+0x1c
ffffffff81275828 kmem_cache_alloc+0x278
ffffffff812e9e3d alloc_buffer_head+0x1d
ffffffff812e9f74 alloc_page_buffers+0xa4
ffffffff812eb0e9 create_empty_buffers+0x19
ffffffff812eb37d create_page_buffers+0x7d
ffffffff812ed78d __block_write_begin_int+0x9d

I thought it was because there are lots of threads that consume the
local CPU slab cache quickly and then all of them try to get a new slab
from the node partial list. Since there are 256 CPUs, the contention is
heavier and easier to observe.

Any thoughts on this issue (e.g. how to ease the contention)? Comments
are welcome.

Thanks.



2022-06-01 19:05:26

by David Rientjes

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Sun, 29 May 2022, Hyeonggon Yoo wrote:

> > diff --git a/mm/slub.c b/mm/slub.c
> > index ed5c2c03a47a..310e56d99116 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
> > void *head, void *tail, int bulk_cnt,
> > unsigned long addr)
> > {
> > - struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> > void *object = head;
> > int cnt = 0;
> > - unsigned long flags, flags2;
> > + unsigned long flags;
> > int ret = 0;
> >
> > - spin_lock_irqsave(&n->list_lock, flags);
> > - slab_lock(slab, &flags2);
> > -
> > + slab_lock(slab, &flags);
> > if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> > if (!check_slab(s, slab))
> > goto out;
> > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
> > slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
> > bulk_cnt, cnt);
> >
> > - slab_unlock(slab, &flags2);
> > - spin_unlock_irqrestore(&n->list_lock, flags);
> > + slab_unlock(slab, &flags);
> > if (!ret)
> > slab_fix(s, "Object at 0x%p not freed", object);
> > return ret;
> > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> >
> > {
> > void *prior;
> > - int was_frozen;
> > + int was_frozen, to_take_off = 0;
> > struct slab new;
> > unsigned long counters;
> > struct kmem_cache_node *n = NULL;
> > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> > if (kfence_free(head))
> > return;
> >
> > + n = get_node(s, slab_nid(slab));
> > + spin_lock_irqsave(&n->list_lock, flags);
> > +
>
> Oh please don't do this.
>
> SLUB free slowpath can be hit a lot depending on workload.
>
> __slab_free() try its best not to take n->list_lock. currently takes n->list_lock
> only when the slab need to be taken from list.
>
> Unconditionally taking n->list_lock will degrade performance.
>

This is a good point; it would be useful to gather some benchmarks for
workloads that are known to thrash some caches and would hit this path,
such as netperf TCP_RR.

2022-06-01 20:42:21

by Rongwei Wang

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 5/29/22 7:37 PM, Hyeonggon Yoo wrote:
> On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote:
>> In use cases where allocating and freeing slab frequently, some
>> error messages, such as "Left Redzone overwritten", "First byte
>> 0xbb instead of 0xcc" would be printed when validating slabs.
>> That's because an object has been filled with SLAB_RED_INACTIVE,
>> but has not been added to slab's freelist. And between these
>> two states, the behaviour of validating slab is likely to occur.
>>
>> Actually, it doesn't mean the slab can not work stably. But, these
>> confusing messages will disturb slab debugging more or less.
>>
>> Signed-off-by: Rongwei Wang <[email protected]>
>
> Have you observed it or it's from code inspection?
Hi, Hyeonggon

I tried to build a module to trigger the race:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>

#define SLUB_KTHREAD_MAX 1

static struct task_struct *slub_thread[SLUB_KTHREAD_MAX];

/* Repeatedly allocate, touch and free 2048-byte objects to keep
 * kmalloc-2048 busy. */
static int do_slub_alloc(void *data)
{
	char *mm = NULL;

	allow_signal(SIGTERM);

	while (1) {
		mm = kmalloc(2048, GFP_KERNEL);
		if (mm) {
			mm[0x100] = 0x21;
			kfree(mm);
		}

		mm = NULL;
		if (kthread_should_stop())
			break;
	}

	return 0;
}

static int __init mini_init(void)
{
	int i;
	char kth_name[11] = "do_slub_00";

	for (i = 0; i < SLUB_KTHREAD_MAX; i++) {
		kth_name[9] = '0' + i % 10;
		kth_name[8] = '0' + i / 10;
		slub_thread[i] = kthread_run(do_slub_alloc, NULL, kth_name);
	}

	return 0;
}
module_init(mini_init);

static void __exit mini_exit(void)
{
	int i;

	for (i = 0; i < SLUB_KTHREAD_MAX; i++)
		if (!IS_ERR_OR_NULL(slub_thread[i]))
			kthread_stop(slub_thread[i]);
}
module_exit(mini_exit);

MODULE_LICENSE("GPL");

On my system, I added 'slub_debug=UFPZ' to the boot options. The error
messages are then printed when I run "slabinfo -v" or "echo 1 >
/sys/kernel/slab/kmalloc-2048/validate".
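To keep hitting the window, a small userspace loop like the one below
can be kept running alongside the module (just a sketch; the sysfs file
only exists when debugging is enabled for kmalloc-2048, and it needs
root):

	#include <stdio.h>

	int main(void)
	{
		const char *path = "/sys/kernel/slab/kmalloc-2048/validate";

		/* Trigger slab validation over and over so it races with
		 * the module's kfree()s. */
		for (;;) {
			FILE *f = fopen(path, "w");

			if (!f) {
				perror(path);
				return 1;
			}
			fputs("1", f);
			fclose(f);
		}
		return 0;
	}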

>
>> ---
>> mm/slub.c | 40 +++++++++++++++++-----------------------
>> 1 file changed, 17 insertions(+), 23 deletions(-)
>>
>> diff --git a/mm/slub.c b/mm/slub.c
>> index ed5c2c03a47a..310e56d99116 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
>> void *head, void *tail, int bulk_cnt,
>> unsigned long addr)
>> {
>> - struct kmem_cache_node *n = get_node(s, slab_nid(slab));
>> void *object = head;
>> int cnt = 0;
>> - unsigned long flags, flags2;
>> + unsigned long flags;
>> int ret = 0;
>>
>> - spin_lock_irqsave(&n->list_lock, flags);
>> - slab_lock(slab, &flags2);
>> -
>> + slab_lock(slab, &flags);
>> if (s->flags & SLAB_CONSISTENCY_CHECKS) {
>> if (!check_slab(s, slab))
>> goto out;
>> @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
>> slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
>> bulk_cnt, cnt);
>>
>> - slab_unlock(slab, &flags2);
>> - spin_unlock_irqrestore(&n->list_lock, flags);
>> + slab_unlock(slab, &flags);
>> if (!ret)
>> slab_fix(s, "Object at 0x%p not freed", object);
>> return ret;
>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>>
>> {
>> void *prior;
>> - int was_frozen;
>> + int was_frozen, to_take_off = 0;
>> struct slab new;
>> unsigned long counters;
>> struct kmem_cache_node *n = NULL;
>> @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>> if (kfence_free(head))
>> return;
>>
>> + n = get_node(s, slab_nid(slab));
>> + spin_lock_irqsave(&n->list_lock, flags);
>> +
>
> Oh please don't do this.
>
> SLUB free slowpath can be hit a lot depending on workload.
Thanks, your words remind me. Actually, I moved the lock that was
originally inside free_debug_processing() to the outside of it. The
change looks small, but indeed it will degrade performance more or less.

Do you have other ideas? :)

-wrw
>
> __slab_free() try its best not to take n->list_lock. currently takes n->list_lock
> only when the slab need to be taken from list.
>
> Unconditionally taking n->list_lock will degrade performance.
>
>> if (kmem_cache_debug(s) &&
>> - !free_debug_processing(s, slab, head, tail, cnt, addr))
>> + !free_debug_processing(s, slab, head, tail, cnt, addr)) {
>> +
>> + spin_unlock_irqrestore(&n->list_lock, flags);
>> return;
>> + }
>>
>> do {
>> - if (unlikely(n)) {
>> - spin_unlock_irqrestore(&n->list_lock, flags);
>> - n = NULL;
>> - }
>> + if (unlikely(to_take_off))
>> + to_take_off = 0;
>> prior = slab->freelist;
>> counters = slab->counters;
>> set_freepointer(s, tail, prior);
>> @@ -3343,18 +3343,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>> new.frozen = 1;
>>
>> } else { /* Needs to be taken off a list */
>> -
>> - n = get_node(s, slab_nid(slab));
>> /*
>> - * Speculatively acquire the list_lock.
>> * If the cmpxchg does not succeed then we may
>> - * drop the list_lock without any processing.
>> - *
>> - * Otherwise the list_lock will synchronize with
>> - * other processors updating the list of slabs.
>> + * drop this behavior without any processing.
>> */
>> - spin_lock_irqsave(&n->list_lock, flags);
>> -
>> + to_take_off = 1;
>> }
>> }
>>
>> @@ -3363,8 +3356,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>> head, new.counters,
>> "__slab_free"));
>>
>> - if (likely(!n)) {
>> + if (likely(!to_take_off)) {
>>
>> + spin_unlock_irqrestore(&n->list_lock, flags);
>> if (likely(was_frozen)) {
>> /*
>> * The list lock was not taken therefore no list
>>
>> --
>> 2.27.0
>>

2022-06-02 19:05:49

by Christoph Lameter

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Mon, 30 May 2022, David Rientjes wrote:

> > Unconditionally taking n->list_lock will degrade performance.
>
> This is a good point, it would be useful to gather some benchmarks for
> workloads that are known to thrash some caches and would hit this path
> such as netperf TCP_RR.

It's obvious that adding new spinlocks to some of the hottest functions in
the kernel will degrade performance. This goes against the basic design of
these functions to be as efficient as possible.


2022-06-03 17:44:25

by Rongwei Wang

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

Hi, Christoph, David, Muchun and Hyeonggon

Thanks for your time.

Recently, I have also been looking for other ways to solve this. The
case provided by Muchun is useful (thanks, Muchun!). Indeed, it seems
that using n->list_lock here is unwise. Actually, I'm not sure whether
you acknowledge the existence of such a race? If everyone agrees the
race exists, then the next question may be: do we want to solve this
problem, or, as suggested earlier in the thread, would it be better to
simply deprecate the validate attribute? I have no strong opinion and
would like to rely on your experience.

In fact, I mainly want to collect your views on whether and how to fix
this bug. Thanks!

Thanks again for your time:).
-wrw

On 6/2/22 11:14 PM, Christoph Lameter wrote:
> On Mon, 30 May 2022, David Rientjes wrote:
>
>>> Unconditionally taking n->list_lock will degrade performance.
>>
>> This is a good point, it would be useful to gather some benchmarks for
>> workloads that are known to thrash some caches and would hit this path
>> such as netperf TCP_RR.
>
> Its obvious that adding new spinlocks to some of the hottest functions in
> the kernel will degrade performance. This goes against the basic design of
> these functions to be as efficient as possible.

2022-06-06 03:53:59

by Hyeonggon Yoo

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Tue, May 31, 2022 at 11:47:22AM +0800, Muchun Song wrote:
> On Sun, May 29, 2022 at 11:37:06AM +0000, Hyeonggon Yoo wrote:
> > On Sun, May 29, 2022 at 04:15:33PM +0800, Rongwei Wang wrote:
> > > In use cases where allocating and freeing slab frequently, some
> > > error messages, such as "Left Redzone overwritten", "First byte
> > > 0xbb instead of 0xcc" would be printed when validating slabs.
> > > That's because an object has been filled with SLAB_RED_INACTIVE,
> > > but has not been added to slab's freelist. And between these
> > > two states, the behaviour of validating slab is likely to occur.
> > >
> > > Actually, it doesn't mean the slab can not work stably. But, these
> > > confusing messages will disturb slab debugging more or less.
> > >
> > > Signed-off-by: Rongwei Wang <[email protected]>
> >
> > Have you observed it or it's from code inspection?
> >
> > > ---
> > > mm/slub.c | 40 +++++++++++++++++-----------------------
> > > 1 file changed, 17 insertions(+), 23 deletions(-)
> > >
> > > diff --git a/mm/slub.c b/mm/slub.c
> > > index ed5c2c03a47a..310e56d99116 100644
> > > --- a/mm/slub.c
> > > +++ b/mm/slub.c
> > > @@ -1374,15 +1374,12 @@ static noinline int free_debug_processing(
> > > void *head, void *tail, int bulk_cnt,
> > > unsigned long addr)
> > > {
> > > - struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> > > void *object = head;
> > > int cnt = 0;
> > > - unsigned long flags, flags2;
> > > + unsigned long flags;
> > > int ret = 0;
> > >
> > > - spin_lock_irqsave(&n->list_lock, flags);
> > > - slab_lock(slab, &flags2);
> > > -
> > > + slab_lock(slab, &flags);
> > > if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> > > if (!check_slab(s, slab))
> > > goto out;
> > > @@ -1414,8 +1411,7 @@ static noinline int free_debug_processing(
> > > slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
> > > bulk_cnt, cnt);
> > >
> > > - slab_unlock(slab, &flags2);
> > > - spin_unlock_irqrestore(&n->list_lock, flags);
> > > + slab_unlock(slab, &flags);
> > > if (!ret)
> > > slab_fix(s, "Object at 0x%p not freed", object);
> > > return ret;
> > > @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> > >
> > > {
> > > void *prior;
> > > - int was_frozen;
> > > + int was_frozen, to_take_off = 0;
> > > struct slab new;
> > > unsigned long counters;
> > > struct kmem_cache_node *n = NULL;
> > > @@ -3315,15 +3311,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> > > if (kfence_free(head))
> > > return;
> > >
> > > + n = get_node(s, slab_nid(slab));
> > > + spin_lock_irqsave(&n->list_lock, flags);
> > > +
> >
> > Oh please don't do this.
> >
> > SLUB free slowpath can be hit a lot depending on workload.
> >
> > __slab_free() try its best not to take n->list_lock. currently takes n->list_lock
> > only when the slab need to be taken from list.
> >
> > Unconditionally taking n->list_lock will degrade performance.
> >
>
> I can confirm you are right. We have encountered this issue in practise.
> We have deployed somen HDFS instance on a 256-CPU machine. When there
> are lots of IO requests from users, we can see lots of threads are contended
> on n->list_lock. Lots of call traces are like following:
>
> ffffffff810dfbb4 native_queued_spin_lock_slowpath+0x1a4
> ffffffff81780ffb _raw_spin_lock+0x1b
> ffffffff8127327e get_partial_node.isra.81+0x5e
> ffffffff812752d3 ___slab_alloc+0x2f3
> ffffffff8127559c __slab_alloc+0x1c
> ffffffff81275828 kmem_cache_alloc+0x278
> ffffffff812e9e3d alloc_buffer_head+0x1d
> ffffffff812e9f74 alloc_page_buffers+0xa4
> ffffffff812eb0e9 create_empty_buffers+0x19
> ffffffff812eb37d create_page_buffers+0x7d
> ffffffff812ed78d __block_write_begin_int+0x9d
>
> I thought it was because there are lots of threads which consume local
> CPU slab cache quickly and then both of them try to get a new slab
> from node partial list. Since there are 256 CPUs, the contention
> is more competitive and easy to be visible.
>
> Any thoughts on this issue (e.e. how to ease contention)? Comments
> are welcome.

How does increasing the number of per-cpu partial slabs affect your
situation? (increasing /sys/kernel/slab/<cache name>/cpu_partial)
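Something like the small helper below could be used to bump that knob
per cache (just a sketch; whether the attribute accepts a new value
depends on the cache and kernel configuration):

	#include <stdio.h>

	/* Sketch: e.g. ./tune_cpu_partial kmalloc-2048 64 */
	int main(int argc, char **argv)
	{
		char path[256];
		FILE *f;

		if (argc != 3) {
			fprintf(stderr, "usage: %s <cache> <cpu_partial>\n", argv[0]);
			return 1;
		}

		snprintf(path, sizeof(path), "/sys/kernel/slab/%s/cpu_partial", argv[1]);
		f = fopen(path, "w");
		if (!f) {
			perror(path);
			return 1;
		}
		fputs(argv[2], f);
		fclose(f);
		return 0;
	}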

> Thanks.
>
>

2022-06-08 03:05:26

by Christoph Lameter

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Fri, 3 Jun 2022, Rongwei Wang wrote:

> Recently, I am also find other ways to solve this. That case was provided by
> Muchun is useful (Thanks Muchun!). Indeed, it seems that use n->list_lock here
> is unwise. Actually, I'm not sure if you recognize the existence of such race?
> If all agrees this race, then the next question may be: do we want to solve
> this problem? or as David said, it would be better to deprecate validate
> attribute directly. I have no idea about it, hope to rely on your experience.
>
> In fact, I mainly want to collect your views on whether or how to fix this bug
> here. Thanks!


Well, validate_slab() is rarely used and should not cause the hot paths
to incur performance penalties. Fix it in the validation logic somehow?
Or document the issue and warn that validation may not be correct if
there are concurrent operations on the slab being validated.


2022-06-08 12:50:11

by Christoph Lameter

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Wed, 8 Jun 2022, Rongwei Wang wrote:

> If available, I think document the issue and warn this incorrect behavior is
> OK. But it still prints a large amount of confusing messages, and disturbs us?

Correct, it would be great if you could fix this in a way that does not
impact performance.

> > are current operations on the slab being validated.
> And I am trying to fix it in following way. In a short, these changes only
> works under the slub debug mode, and not affects the normal mode (I'm not
> sure). It looks not elegant enough. And if all approve of this way, I can
> submit the next version.


>
> Anyway, thanks for your time:).
> -wrw
>
> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
struct
> slab *slab,
>
> {
> void *prior;
> - int was_frozen;
> + int was_frozen, to_take_off = 0;
> struct slab new;

to_take_off has the role of !n ? Why is that needed?

> - do {
> - if (unlikely(n)) {
> + spin_lock_irqsave(&n->list_lock, flags);
> + ret = free_debug_processing(s, slab, head, tail, cnt, addr);

Ok so the idea is to take the lock only if kmem_cache_debug. That looks
ok. But it still adds a number of new branches etc to the free loop.

Some performance tests would be useful.

2022-06-11 09:30:51

by Rongwei Wang

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 6/8/22 8:23 PM, Christoph Lameter wrote:
> On Wed, 8 Jun 2022, Rongwei Wang wrote:
>
>> If available, I think document the issue and warn this incorrect behavior is
>> OK. But it still prints a large amount of confusing messages, and disturbs us?
>
> Correct it would be great if you could fix this in a way that does not
> impact performance.
>
>>> are current operations on the slab being validated.
>> And I am trying to fix it in following way. In a short, these changes only
>> works under the slub debug mode, and not affects the normal mode (I'm not
>> sure). It looks not elegant enough. And if all approve of this way, I can
>> submit the next version.
>
>
>>
>> Anyway, thanks for your time:).
>> -wrw
>>
>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
> struct
>> slab *slab,
>>
>> {
>> void *prior;
>> - int was_frozen;
>> + int was_frozen, to_take_off = 0;
>> struct slab new;
>
> to_take_off has the role of !n ? Why is that needed?
>
>> - do {
>> - if (unlikely(n)) {
>> + spin_lock_irqsave(&n->list_lock, flags);
>> + ret = free_debug_processing(s, slab, head, tail, cnt, addr);
>
> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
> ok. But it still adds a number of new branches etc to the free loop.
>
> Some performance tests would be useful.
Hi Christoph

Thanks for your time!
Do you have any advice on which benchmarks I should test with? I found
that hackbench and lkp are used frequently in mm/slub.c commits [1,2],
but I have no idea how to use these two benchmarks to cover the above
changes. Can you give some examples? Thanks very much!

Sorry for the late reply!

[1]https://lore.kernel.org/lkml/20210301080404.GF12822@xsang-OptiPlex-9020/
[2]https://lore.kernel.org/linux-mm/[email protected]/

2022-06-13 17:22:21

by Christoph Lameter

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Sat, 11 Jun 2022, Rongwei Wang wrote:

> > Ok so the idea is to take the lock only if kmem_cache_debug. That looks
> > ok. But it still adds a number of new branches etc to the free loop.
> >
> > Some performance tests would be useful.
> Hi Christoph
>
> Thanks for your time!
> Do you have some advice in benchmarks that need me to test? And I find that
> hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no
> idea how to use these two benchmarks test to cover the above changes. Can you
> give some examples? Thanks very much!


Hi Rongwei,

Well, run hackbench with and without the change.

There are also synthetic benchmarks available at
https://gentwo.org/christoph/slub/tests/

These measure the cycles that slab operations take. However, they are a
bit old and I think Pekka may have a newer version of these
patches.

Greetings,
Christoph

2022-06-14 03:07:00

by Rongwei Wang

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 6/13/22 9:50 PM, Christoph Lameter wrote:
> On Sat, 11 Jun 2022, Rongwei Wang wrote:
>
>>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>>> ok. But it still adds a number of new branches etc to the free loop.
>>>
>>> Some performance tests would be useful.
>> Hi Christoph
>>
>> Thanks for your time!
>> Do you have some advice in benchmarks that need me to test? And I find that
>> hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no
>> idea how to use these two benchmarks test to cover the above changes. Can you
>> give some examples? Thanks very much!
>
>
> Hi Rongwei,
>
> Well run hackbench with an without the change.
OK
>
> There are also synthetic benchmarks available at
> https://gentwo.org/christoph/slub/tests/
That's great, thanks very much!
>
> These measure the cycles that slab operations take. However, they are a
> bit old and I think Pekka may have a newer version of these
> patches.
>
> Greetings,
> Christoph

2022-06-17 08:15:01

by Rongwei Wang

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 6/13/22 9:50 PM, Christoph Lameter wrote:
> On Sat, 11 Jun 2022, Rongwei Wang wrote:
>
>>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>>> ok. But it still adds a number of new branches etc to the free loop.
>>>
>>> Some performance tests would be useful.
>> Hi Christoph
>>
>> Thanks for your time!
>> Do you have some advice in benchmarks that need me to test? And I find that
>> hackbench and lkp was used frequently in mm/slub.c commits[1,2]. But I have no
>> idea how to use these two benchmarks test to cover the above changes. Can you
>> give some examples? Thanks very much!
>
>
> Hi Rongwei,
>
> Well run hackbench with an without the change.
>
> There are also synthetic benchmarks available at
> https://gentwo.org/christoph/slub/tests/
Christoph, I used [1] to collect the data below. The slub_test case is
the same as the one you provided. Here are its results (the baseline is
the data of the upstream kernel, and fix is the result of the patched
kernel).

My test environment: arm64 VM (32 cores and 128G memory)

I removed 'slub_debug=UFPZ' from the cmdline before collecting the
following two groups of data.

[1]https://lore.kernel.org/linux-mm/20200527103545.4348ac10@carbon/

Single thread testing

1. Kmalloc: Repeatedly allocate then free test

before (baseline) fix
kmalloc kfree kmalloc kfree
10000 times 8 7 cycles 8 cycles 5 cycles 7 cycles
10000 times 16 4 cycles 8 cycles 3 cycles 6 cycles
10000 times 32 4 cycles 8 cycles 3 cycles 6 cycles
10000 times 64 3 cycles 8 cycles 3 cycles 6 cycles
10000 times 128 3 cycles 8 cycles 3 cycles 6 cycles
10000 times 256 12 cycles 8 cycles 11 cycles 7 cycles
10000 times 512 27 cycles 10 cycles 23 cycles 11 cycles
10000 times 1024 18 cycles 9 cycles 20 cycles 10 cycles
10000 times 2048 54 cycles 12 cycles 54 cycles 12 cycles
10000 times 4096 105 cycles 20 cycles 105 cycles 25 cycles
10000 times 8192 210 cycles 35 cycles 212 cycles 39 cycles
10000 times 16384 133 cycles 45 cycles 119 cycles 46 cycles


2. Kmalloc: alloc/free test

before (base) fix
10000 times kmalloc(8)/kfree 3 cycles 3 cycles
10000 times kmalloc(16)/kfree 3 cycles 3 cycles
10000 times kmalloc(32)/kfree 3 cycles 3 cycles
10000 times kmalloc(64)/kfree 3 cycles 3 cycles
10000 times kmalloc(128)/kfree 3 cycles 3 cycles
10000 times kmalloc(256)/kfree 3 cycles 3 cycles
10000 times kmalloc(512)/kfree 3 cycles 3 cycles
10000 times kmalloc(1024)/kfree 3 cycles 3 cycles
10000 times kmalloc(2048)/kfree 3 cycles 3 cycles
10000 times kmalloc(4096)/kfree 3 cycles 3 cycles
10000 times kmalloc(8192)/kfree 3 cycles 3 cycles
10000 times kmalloc(16384)/kfree 33 cycles 33 cycles


Concurrent allocs

before (baseline) fix
Kmalloc N*alloc N*free(8) Average=17/18 Average=11/11
Kmalloc N*alloc N*free(16) Average=15/49 Average=9/11
Kmalloc N*alloc N*free(32) Average=15/40 Average=9/11
Kmalloc N*alloc N*free(64) Average=15/44 Average=9/10
Kmalloc N*alloc N*free(128) Average=15/42 Average=10/10
Kmalloc N*alloc N*free(256) Average=128/28 Average=71/22
Kmalloc N*alloc N*free(512) Average=206/34 Average=178/26
Kmalloc N*alloc N*free(1024) Average=762/37 Average=369/27
Kmalloc N*alloc N*free(2048) Average=327/58 Average=339/33
Kmalloc N*alloc N*free(4096) Average=2255/128 Average=1813/64

before (baseline) fix
Kmalloc N*(alloc free)(8) Average=3 Average=3
Kmalloc N*(alloc free)(16) Average=3 Average=3
Kmalloc N*(alloc free)(32) Average=3 Average=3
Kmalloc N*(alloc free)(64) Average=3 Average=3
Kmalloc N*(alloc free)(128) Average=3 Average=3
Kmalloc N*(alloc free)(256) Average=3 Average=3
Kmalloc N*(alloc free)(512) Average=3 Average=3
Kmalloc N*(alloc free)(1024) Average=3 Average=3
Kmalloc N*(alloc free)(2048) Average=3 Average=3
Kmalloc N*(alloc free)(4096) Average=3 Average=3

According to the above data, there seems to be no significant
performance degradation in the patched kernel. Plus, in the concurrent
allocs test, e.g. Kmalloc N*alloc N*free(1024), the 'fix' column is
better than the baseline (less is better; if I am wrong, please let me
know). If you have other suggestions, I can try to test more data.

Thanks for your time!
-wrw
>
> These measure the cycles that slab operations take. However, they are a
> bit old and I think Pekka may have a newer version of these
> patches.
>
> Greetings,
> Christoph

2022-06-17 10:24:49

by Vlastimil Babka

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On 6/8/22 14:23, Christoph Lameter wrote:
> On Wed, 8 Jun 2022, Rongwei Wang wrote:
>
>> If available, I think document the issue and warn this incorrect behavior is
>> OK. But it still prints a large amount of confusing messages, and disturbs us?
>
> Correct it would be great if you could fix this in a way that does not
> impact performance.
>
>> > are current operations on the slab being validated.
>> And I am trying to fix it in following way. In a short, these changes only
>> works under the slub debug mode, and not affects the normal mode (I'm not
>> sure). It looks not elegant enough. And if all approve of this way, I can
>> submit the next version.
>
>
>>
>> Anyway, thanks for your time:).
>> -wrw
>>
>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
> struct
>> slab *slab,
>>
>> {
>> void *prior;
>> - int was_frozen;
>> + int was_frozen, to_take_off = 0;
>> struct slab new;
>
> to_take_off has the role of !n ? Why is that needed?
>
>> - do {
>> - if (unlikely(n)) {
>> + spin_lock_irqsave(&n->list_lock, flags);
>> + ret = free_debug_processing(s, slab, head, tail, cnt, addr);
>
> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
> ok. But it still adds a number of new branches etc to the free loop.

It also further complicates the already tricky code. I wonder if we should
make more benefit from the fact that for kmem_cache_debug() caches we don't
leave any slabs on percpu or percpu partial lists, and also in
free_debug_processing() we aready take both list_lock and slab_lock. If we
just did the freeing immediately there under those locks, we would be
protected against other freeing cpus by that list_lock and don't need the
double cmpxchg tricks.

What about against allocating cpus? More tricky as those will currently end
up privatizing the freelist via get_partial(), only to deactivate it again,
so our list_lock+slab_lock in freeing path would not protect in the
meanwhile. But the allocation is currently very inefficient for debug
caches, as in get_partial() it will take the list_lock to take the slab from
partial list and then in most cases again in deactivate_slab() to return it.

If instead the allocation path for kmem_cache_debug() cache would take a
single object from the partial list (not whole freelist) under list_lock, it
would be ultimately more efficient, and protect against freeing using
list_lock. Sounds like an idea worth trying to me?
And of course we would stop creating the 'validate' sysfs files for
non-debug caches.
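A rough sketch of the free side of that idea (illustrative only:
free_debug_slab_locked() is not an existing function, and it assumes
free_debug_processing() no longer takes list_lock itself, as in this
patch):

	static void free_debug_slab_locked(struct kmem_cache *s, struct slab *slab,
					   void *head, void *tail, int cnt,
					   unsigned long addr)
	{
		struct kmem_cache_node *n = get_node(s, slab_nid(slab));
		unsigned long flags;

		spin_lock_irqsave(&n->list_lock, flags);

		if (free_debug_processing(s, slab, head, tail, cnt, addr)) {
			/* Link the object(s) back while list_lock is still held. */
			set_freepointer(s, tail, slab->freelist);
			slab->freelist = head;
			slab->inuse -= cnt;
			/*
			 * Move the slab between n->full, n->partial and the
			 * discard path here as needed, still under list_lock,
			 * instead of relying on the cmpxchg_double loop in
			 * __slab_free().
			 */
		}

		spin_unlock_irqrestore(&n->list_lock, flags);
	}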

2022-06-17 14:24:44

by Christoph Lameter

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Fri, 17 Jun 2022, Rongwei Wang wrote:

> Christoph, I refer [1] to test some data below. The slub_test case is same to
> your provided. And here you the result of its test (the baseline is the data
> of upstream kernel, and fix is results of patched kernel).

Ah good.
> Single thread testing
>
> 1. Kmalloc: Repeatedly allocate then free test
>
> before (baseline) fix
> kmalloc kfree kmalloc kfree
> 10000 times 8 7 cycles 8 cycles 5 cycles 7 cycles
> 10000 times 16 4 cycles 8 cycles 3 cycles 6 cycles
> 10000 times 32 4 cycles 8 cycles 3 cycles 6 cycles

Well, the cycle reduction is strange. Were the tests not done in the same
environment? It may be good to not use NUMA, or to bind to the same CPU.

> 10000 times 64 3 cycles 8 cycles 3 cycles 6 cycles
> 10000 times 128 3 cycles 8 cycles 3 cycles 6 cycles
> 10000 times 256 12 cycles 8 cycles 11 cycles 7 cycles
> 10000 times 512 27 cycles 10 cycles 23 cycles 11 cycles
> 10000 times 1024 18 cycles 9 cycles 20 cycles 10 cycles
> 10000 times 2048 54 cycles 12 cycles 54 cycles 12 cycles
> 10000 times 4096 105 cycles 20 cycles 105 cycles 25 cycles
> 10000 times 8192 210 cycles 35 cycles 212 cycles 39 cycles
> 10000 times 16384 133 cycles 45 cycles 119 cycles 46 cycles


Seems to be different environments.

> According to the above data, It seems that no significant performance
> degradation in patched kernel. Plus, in concurrent allocs test, likes Kmalloc
> N*alloc N*free(1024), the data of 'fix' column is better than baseline (it
> looks less is better, if I am wrong, please let me know). And if you have
> other suggestions, I can try to test more data.

Well, can you explain the cycle reduction?

2022-06-18 03:22:17

by Rongwei Wang

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 6/17/22 10:19 PM, Christoph Lameter wrote:
> On Fri, 17 Jun 2022, Rongwei Wang wrote:
>
>> Christoph, I refer [1] to test some data below. The slub_test case is same to
>> your provided. And here you the result of its test (the baseline is the data
>> of upstream kernel, and fix is results of patched kernel).
>
> Ah good.
>> Single thread testing
>>
>> 1. Kmalloc: Repeatedly allocate then free test
>>
>> before (baseline) fix
>> kmalloc kfree kmalloc kfree
>> 10000 times 8 7 cycles 8 cycles 5 cycles 7 cycles
>> 10000 times 16 4 cycles 8 cycles 3 cycles 6 cycles
>> 10000 times 32 4 cycles 8 cycles 3 cycles 6 cycles
>
> Well the cycle reduction is strange. Tests are not done in the same
> environment? Maybe good to not use NUMA or bind to the same cpu
It's the same environment, I'm sure. There are four nodes (32G
per-node and 8 cores per-node) in my test environment. Should I test
within one node? If so, I can try.
>
>> 10000 times 64 3 cycles 8 cycles 3 cycles 6 cycles
>> 10000 times 128 3 cycles 8 cycles 3 cycles 6 cycles
>> 10000 times 256 12 cycles 8 cycles 11 cycles 7 cycles
>> 10000 times 512 27 cycles 10 cycles 23 cycles 11 cycles
>> 10000 times 1024 18 cycles 9 cycles 20 cycles 10 cycles
>> 10000 times 2048 54 cycles 12 cycles 54 cycles 12 cycles
>> 10000 times 4096 105 cycles 20 cycles 105 cycles 25 cycles
>> 10000 times 8192 210 cycles 35 cycles 212 cycles 39 cycles
>> 10000 times 16384 133 cycles 45 cycles 119 cycles 46 cycles
>
>
> Seems to be different environments.
>
>> According to the above data, It seems that no significant performance
>> degradation in patched kernel. Plus, in concurrent allocs test, likes Kmalloc
>> N*alloc N*free(1024), the data of 'fix' column is better than baseline (it
>> looks less is better, if I am wrong, please let me know). And if you have
>> other suggestions, I can try to test more data.
>
> Well can you explain the cycle reduction?
Maybe because there are four nodes in my system, or only 8 cores (very
few) in each node? Thanks, you remind me that I need to increase the
number of cores in each node or change the number of nodes to compare
the results.

Thanks!


2022-06-20 12:53:37

by Christoph Lameter

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On Sat, 18 Jun 2022, Rongwei Wang wrote:

> > Well the cycle reduction is strange. Tests are not done in the same
> > environment? Maybe good to not use NUMA or bind to the same cpu
> It's the same environment. I can sure. And there are four nodes (32G per-node
> and 8 cores per-node) in my test environment. whether I need to test in one
> node? If right, I can try.

Ok, in a NUMA environment the memory allocation is randomized on bootup.
You may get different numbers after you reboot the system. Try to switch
NUMA off. Use a single node to get consistent numbers.

It may be useful to figure out what memory structure causes the increase
in latency in a NUMA environment. If you can figure that out and properly
allocate the memory structure that causes the increase in latency, then
you may be able to increase the performance of the allocator.

2022-06-26 16:55:08

by Rongwei Wang

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 6/20/22 7:57 PM, Christoph Lameter wrote:
> On Sat, 18 Jun 2022, Rongwei Wang wrote:
>
>>> Well the cycle reduction is strange. Tests are not done in the same
>>> environment? Maybe good to not use NUMA or bind to the same cpu
>> It's the same environment. I can sure. And there are four nodes (32G per-node
>> and 8 cores per-node) in my test environment. whether I need to test in one
>> node? If right, I can try.
>
> Ok in a NUMA environment the memory allocation is randomized on bootup.
> You may get different numbers after you reboot the system. Try to switch
> NUMA off. Use s a single node to get consistent numbers.
Sorry for the late reply.

First, let me share my test environment: an arm64 VM (32 cores and 128G
memory), with only one node configured for this VM.
Plus, I used 'numactl -N 0 -m 0 qemu-kvm ...' to start this VM, mainly
to avoid data jitter.

And I can confirm that the physical machine hosting this VM had the same
configuration when I tested the baseline kernel and the patched kernel.
If the above test environment has any problems, please let me know.

The following is the latest data:

Single thread testing
1. Kmalloc: Repeatedly allocate then free test
before fix
kmalloc kfree kmalloc kfree
10000 times 8 4 cycles 5 cycles 4 cycles 5 cycles
10000 times 16 3 cycles 5 cycles 3 cycles 5 cycles
10000 times 32 3 cycles 5 cycles 3 cycles 5 cycles
10000 times 64 3 cycles 5 cycles 3 cycles 5 cycles
10000 times 128 3 cycles 5 cycles 3 cycles 5 cycles
10000 times 256 14 cycles 9 cycles 6 cycles 8 cycles
10000 times 512 9 cycles 8 cycles 9 cycles 10 cycles
10000 times 1024 48 cycles 10 cycles 6 cycles 10 cycles
10000 times 2048 31 cycles 12 cycles 35 cycles 13 cycles
10000 times 4096 96 cycles 17 cycles 96 cycles 18 cycles
10000 times 8192 188 cycles 27 cycles 190 cycles 27 cycles
10000 times 16384 117 cycles 38 cycles 115 cycles 38 cycles

2. Kmalloc: alloc/free test
before fix
10000 times kmalloc(8)/kfree 3 cycles 3 cycles
10000 times kmalloc(16)/kfree 3 cycles 3 cycles
10000 times kmalloc(32)/kfree 3 cycles 3 cycles
10000 times kmalloc(64)/kfree 3 cycles 3 cycles
10000 times kmalloc(128)/kfree 3 cycles 3 cycles
10000 times kmalloc(256)/kfree 3 cycles 3 cycles
10000 times kmalloc(512)/kfree 3 cycles 3 cycles
10000 times kmalloc(1024)/kfree 3 cycles 3 cycles
10000 times kmalloc(2048)/kfree 3 cycles 3 cycles
10000 times kmalloc(4096)/kfree 3 cycles 3 cycles
10000 times kmalloc(8192)/kfree 3 cycles 3 cycles
10000 times kmalloc(16384)/kfree 33 cycles 33 cycles

Concurrent allocs
before fix
Kmalloc N*alloc N*free(8) Average=13/14 Average=14/15
Kmalloc N*alloc N*free(16) Average=13/15 Average=13/15
Kmalloc N*alloc N*free(32) Average=13/15 Average=13/15
Kmalloc N*alloc N*free(64) Average=13/15 Average=13/15
Kmalloc N*alloc N*free(128) Average=13/15 Average=13/15
Kmalloc N*alloc N*free(256) Average=137/29 Average=134/39
Kmalloc N*alloc N*free(512) Average=61/29 Average=64/28
Kmalloc N*alloc N*free(1024) Average=465/50 Average=656/55
Kmalloc N*alloc N*free(2048) Average=503/97 Average=422/97
Kmalloc N*alloc N*free(4096) Average=1592/206 Average=1624/207

Kmalloc N*(alloc free)(8) Average=3 Average=3
Kmalloc N*(alloc free)(16) Average=3 Average=3
Kmalloc N*(alloc free)(32) Average=3 Average=3
Kmalloc N*(alloc free)(64) Average=3 Average=3
Kmalloc N*(alloc free)(128) Average=3 Average=3
Kmalloc N*(alloc free)(256) Average=3 Average=3
Kmalloc N*(alloc free)(512) Average=3 Average=3
Kmalloc N*(alloc free)(1024) Average=3 Average=3
Kmalloc N*(alloc free)(2048) Average=3 Average=3
Kmalloc N*(alloc free)(4096) Average=3 Average=3

Does the above data indicate that this modification (which only takes
effect when kmem_cache_debug(s) is true) does not introduce a
significant performance impact?

Thanks for your time.
>
> It maybe useful to figure out what memory structure causes the increase in
> latency in a NUMA environment. If you can figure that out and properly
> allocate the memory structure that causes the increases in latency then
> you may be able to increase the performance of the allocator.

2022-07-15 08:51:19

by Rongwei Wang

Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 6/17/22 5:40 PM, Vlastimil Babka wrote:
> On 6/8/22 14:23, Christoph Lameter wrote:
>> On Wed, 8 Jun 2022, Rongwei Wang wrote:
>>
>>> If available, I think document the issue and warn this incorrect behavior is
>>> OK. But it still prints a large amount of confusing messages, and disturbs us?
>>
>> Correct it would be great if you could fix this in a way that does not
>> impact performance.
>>
>>>> are current operations on the slab being validated.
>>> And I am trying to fix it in following way. In a short, these changes only
>>> works under the slub debug mode, and not affects the normal mode (I'm not
>>> sure). It looks not elegant enough. And if all approve of this way, I can
>>> submit the next version.
>>
>>
>>>
>>> Anyway, thanks for your time:).
>>> -wrw
>>>
>>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
>> struct
>>> slab *slab,
>>>
>>> {
>>> void *prior;
>>> - int was_frozen;
>>> + int was_frozen, to_take_off = 0;
>>> struct slab new;
>>
>> to_take_off has the role of !n ? Why is that needed?
>>
>>> - do {
>>> - if (unlikely(n)) {
>>> + spin_lock_irqsave(&n->list_lock, flags);
>>> + ret = free_debug_processing(s, slab, head, tail, cnt, addr);
>>
>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>> ok. But it still adds a number of new branches etc to the free loop.
>
Hi, Vlastimil, sorry for missing your message for so long.
> It also further complicates the already tricky code. I wonder if we should
> make more benefit from the fact that for kmem_cache_debug() caches we don't
> leave any slabs on percpu or percpu partial lists, and also in
> free_debug_processing() we aready take both list_lock and slab_lock. If we
> just did the freeing immediately there under those locks, we would be
> protected against other freeing cpus by that list_lock and don't need the
> double cmpxchg tricks.
Hmm, I'm not sure I completely understand what you mean by "don't need
the double cmpxchg tricks". Do you mean replacing cmpxchg_double_slab()
here with the following code when kmem_cache_debug(s)?

__slab_lock(slab);
if (slab->freelist == freelist_old && slab->counters == counters_old){
slab->freelist = freelist_new;
slab->counters = counters_new;
__slab_unlock(slab);
local_irq_restore(flags);
return true;
}
__slab_unlock(slab);

If I have misunderstood your words, please let me know.
Thanks!
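
To make that idea concrete, here is a rough user-space sketch (not the
kernel code; struct fake_slab and locked_cmpxchg_double() are invented
names for illustration): once the caller holds a lock that excludes every
other writer of the freelist/counters pair, a plain compare-and-update
under that lock does the same job as the double-width cmpxchg.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_slab {
        void *freelist;
        unsigned long counters;
        pthread_mutex_t lock;           /* stands in for slab_lock()/list_lock */
};

/* compare-and-update done entirely under the lock, no cmpxchg_double */
static bool locked_cmpxchg_double(struct fake_slab *slab,
                                  void *freelist_old, unsigned long counters_old,
                                  void *freelist_new, unsigned long counters_new)
{
        bool ok = false;

        pthread_mutex_lock(&slab->lock);
        if (slab->freelist == freelist_old && slab->counters == counters_old) {
                slab->freelist = freelist_new;
                slab->counters = counters_new;
                ok = true;
        }
        pthread_mutex_unlock(&slab->lock);
        return ok;
}

int main(void)
{
        struct fake_slab slab = {
                .freelist = NULL,
                .counters = 0,
                .lock = PTHREAD_MUTEX_INITIALIZER,
        };
        int obj;

        /* "free" obj onto the empty list: expected old state is NULL/0 */
        if (locked_cmpxchg_double(&slab, NULL, 0, &obj, 1))
                printf("freelist updated, counters=%lu\n", slab.counters);
        return 0;
}

Built with 'gcc -pthread', this prints "freelist updated, counters=1";
whether SLUB would keep cmpxchg_double_slab() or open-code the update
like this is exactly the trade-off discussed here.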
>
> What about against allocating cpus? More tricky as those will currently end
> up privatizing the freelist via get_partial(), only to deactivate it again,
> so our list_lock+slab_lock in freeing path would not protect in the
> meanwhile. But the allocation is currently very inefficient for debug
> caches, as in get_partial() it will take the list_lock to take the slab from
> partial list and then in most cases again in deactivate_slab() to return it.
It seems that I need to spend some time digesting these words. Anyway, thanks.
>
> If instead the allocation path for kmem_cache_debug() cache would take a
> single object from the partial list (not whole freelist) under list_lock, it
> would be ultimately more efficient, and protect against freeing using
> list_lock. Sounds like an idea worth trying to me?

Hyeonggon had similar advice: split the slab freeing and allocation paths
for debug caches out from the normal ones, like below:


__slab_alloc() {
        if (kmem_cache_debug(s))
                slab_alloc_debug()
        else
                ___slab_alloc()
}

I guess the above code aims to solve the problem (idea) you mentioned?

slab_free() {
        if (kmem_cache_debug(s))
                slab_free_debug()
        else
                __do_slab_free()
}

Currently, I only modify the slab freeing code to fix the confusing
messages from "slabinfo -v". If you agree, I can try to implement the
slab_alloc_debug() code mentioned above. Maybe it's also a challenge for me.

Thanks for your time.
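
For the allocation-side idea quoted above, a similarly rough user-space
sketch (again not kernel code; fake_slab, fake_object and alloc_single()
are invented names) of taking a single object off a partial slab's
freelist while the list lock is held, instead of detaching the whole
freelist and deactivating the slab again later:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct fake_object {
        struct fake_object *next;
};

struct fake_slab {
        struct fake_object *freelist;   /* linked list of free objects */
        unsigned int inuse;             /* allocated objects in this slab */
        unsigned int objects;           /* total objects in this slab */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* debug-style allocation: grab exactly one object under the lock */
static struct fake_object *alloc_single(struct fake_slab *slab)
{
        struct fake_object *object;

        pthread_mutex_lock(&list_lock);
        object = slab->freelist;
        if (object) {
                slab->freelist = object->next;
                slab->inuse++;
                /* the real code would run its consistency checks here */
        }
        pthread_mutex_unlock(&list_lock);
        return object;
}

int main(void)
{
        struct fake_object objs[4] = {
                { &objs[1] }, { &objs[2] }, { &objs[3] }, { NULL }
        };
        struct fake_slab slab = { .freelist = &objs[0], .inuse = 0, .objects = 4 };

        while (alloc_single(&slab))
                ;
        printf("allocated %u of %u objects one at a time\n",
               slab.inuse, slab.objects);
        return 0;
}

The appeal, as described in the quoted idea, is that allocation, freeing
and validation would then all serialize on the same list lock.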

> And of course we would stop creating the 'validate' sysfs files for
> non-debug caches.

2022-07-15 11:13:56

by Rongwei Wang

[permalink] [raw]
Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 7/15/22 6:33 PM, Vlastimil Babka wrote:
> On 7/15/22 10:05, Rongwei Wang wrote:
>>
>>
>> On 6/17/22 5:40 PM, Vlastimil Babka wrote:
>>> On 6/8/22 14:23, Christoph Lameter wrote:
>>>> On Wed, 8 Jun 2022, Rongwei Wang wrote:
>>>>
>>>>> If available, I think document the issue and warn this incorrect
>>>>> behavior is
>>>>> OK. But it still prints a large amount of confusing messages, and
>>>>> disturbs us?
>>>>
>>>> Correct it would be great if you could fix this in a way that does not
>>>> impact performance.
>>>>
>>>>>> are current operations on the slab being validated.
>>>>> And I am trying to fix it in following way. In a short, these changes only
>>>>> works under the slub debug mode, and not affects the normal mode (I'm not
>>>>> sure). It looks not elegant enough. And if all approve of this way, I can
>>>>> submit the next version.
>>>>
>>>>
>>>>>
>>>>> Anyway, thanks for your time:).
>>>>> -wrw
>>>>>
>>>>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
>>>> struct
>>>>> slab *slab,
>>>>>
>>>>>   {
>>>>>          void *prior;
>>>>> -       int was_frozen;
>>>>> +       int was_frozen, to_take_off = 0;
>>>>>          struct slab new;
>>>>
>>>> to_take_off has the role of !n ? Why is that needed?
>>>>
>>>>> -       do {
>>>>> -               if (unlikely(n)) {
>>>>> +               spin_lock_irqsave(&n->list_lock, flags);
>>>>> +               ret = free_debug_processing(s, slab, head, tail, cnt,
>>>>> addr);
>>>>
>>>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>>>> ok. But it still adds a number of new branches etc to the free loop.
>>>
>> Hi, Vlastimil, sorry for missing your message long time.
>
> Hi, no problem.
>
>>> It also further complicates the already tricky code. I wonder if we should
>>> make more benefit from the fact that for kmem_cache_debug() caches we don't
>>> leave any slabs on percpu or percpu partial lists, and also in
>>> free_debug_processing() we aready take both list_lock and slab_lock. If we
>>> just did the freeing immediately there under those locks, we would be
>>> protected against other freeing cpus by that list_lock and don't need the
>>> double cmpxchg tricks.
>> enen, I'm not sure get your "don't need the double cmpxchg tricks" means
>> completely. What you want to say is that replace cmpxchg_double_slab() here
>> with following code when kmem_cache_debug(s)?
>>
>> __slab_lock(slab);
>> if (slab->freelist == freelist_old && slab->counters == counters_old){
>>     slab->freelist = freelist_new;
>>     slab->counters = counters_new;
>>     __slab_unlock(slab);
>>     local_irq_restore(flags);
>>     return true;
>> }
>> __slab_unlock(slab);
>
> Pretty much, but it's more complicated.
Yes, actually, I think reusing cmpxchg_double_slab() here is more concise
in code. I have already finished this part of the code, but I am hesitating
about whether to replace cmpxchg_double_slab().
>
>> If I make mistakes for your words, please let me know.
>> Thanks!
>>>
>>> What about against allocating cpus? More tricky as those will currently end
>>> up privatizing the freelist via get_partial(), only to deactivate it again,
>>> so our list_lock+slab_lock in freeing path would not protect in the
>>> meanwhile. But the allocation is currently very inefficient for debug
>>> caches, as in get_partial() it will take the list_lock to take the slab from
>>> partial list and then in most cases again in deactivate_slab() to return it.
>> It seems that I need speed some time to eat these words. Anyway, thanks.
>>>
>>> If instead the allocation path for kmem_cache_debug() cache would take a
>>> single object from the partial list (not whole freelist) under list_lock, it
>>> would be ultimately more efficient, and protect against freeing using
>>> list_lock. Sounds like an idea worth trying to me?
>>
>> Hyeonggon had a similar advice that split freeing and allocating slab from
>> debugging, likes below:
>>
>>
>> __slab_alloc() {
>>     if (kmem_cache_debug(s))
>>         slab_alloc_debug()
>>     else
>>         ___slab_alloc()
>> }
>>
>> I guess that above code aims to solve your mentioned problem (idea)?
>>
>> slab_free() {
>>     if (kmem_cache_debug(s))
>>         slab_free_debug()
>>     else
>>         __do_slab_free()
>> }
>>
>> Currently, I only modify the code of freeing slab to fix the confusing
>> messages of "slabinfo -v". If you agree, I can try to realize above
>> mentioned slab_alloc_debug() code. Maybe it's also a challenge to me.
>
> I already started working on this approach and hope to post a RFC soon.
OK, that's great.
>
>> Thanks for your time.
>>
>>> And of course we would stop creating the 'validate' sysfs files for
>>> non-debug caches.

2022-07-15 11:26:34

by Vlastimil Babka

[permalink] [raw]
Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On 7/15/22 10:05, Rongwei Wang wrote:
>
>
> On 6/17/22 5:40 PM, Vlastimil Babka wrote:
>> On 6/8/22 14:23, Christoph Lameter wrote:
>>> On Wed, 8 Jun 2022, Rongwei Wang wrote:
>>>
>>>> If available, I think document the issue and warn this incorrect
>>>> behavior is
>>>> OK. But it still prints a large amount of confusing messages, and
>>>> disturbs us?
>>>
>>> Correct it would be great if you could fix this in a way that does not
>>> impact performance.
>>>
>>>>> are current operations on the slab being validated.
>>>> And I am trying to fix it in following way. In a short, these changes only
>>>> works under the slub debug mode, and not affects the normal mode (I'm not
>>>> sure). It looks not elegant enough. And if all approve of this way, I can
>>>> submit the next version.
>>>
>>>
>>>>
>>>> Anyway, thanks for your time:).
>>>> -wrw
>>>>
>>>> @@ -3304,7 +3300,7 @@ static void __slab_free(struct kmem_cache *s,
>>> struct
>>>> slab *slab,
>>>>
>>>>   {
>>>>          void *prior;
>>>> -       int was_frozen;
>>>> +       int was_frozen, to_take_off = 0;
>>>>          struct slab new;
>>>
>>> to_take_off has the role of !n ? Why is that needed?
>>>
>>>> -       do {
>>>> -               if (unlikely(n)) {
>>>> +               spin_lock_irqsave(&n->list_lock, flags);
>>>> +               ret = free_debug_processing(s, slab, head, tail, cnt,
>>>> addr);
>>>
>>> Ok so the idea is to take the lock only if kmem_cache_debug. That looks
>>> ok. But it still adds a number of new branches etc to the free loop.
>>
> Hi, Vlastimil, sorry for missing your message long time.

Hi, no problem.

>> It also further complicates the already tricky code. I wonder if we should
>> make more benefit from the fact that for kmem_cache_debug() caches we don't
>> leave any slabs on percpu or percpu partial lists, and also in
>> free_debug_processing() we aready take both list_lock and slab_lock. If we
>> just did the freeing immediately there under those locks, we would be
>> protected against other freeing cpus by that list_lock and don't need the
>> double cmpxchg tricks.
> enen, I'm not sure get your "don't need the double cmpxchg tricks" means
> completely. What you want to say is that replace cmpxchg_double_slab() here
> with following code when kmem_cache_debug(s)?
>
> __slab_lock(slab);
> if (slab->freelist == freelist_old && slab->counters == counters_old){
>     slab->freelist = freelist_new;
>     slab->counters = counters_new;
>     __slab_unlock(slab);
>     local_irq_restore(flags);
>     return true;
> }
> __slab_unlock(slab);

Pretty much, but it's more complicated.

> If I make mistakes for your words, please let me know.
> Thanks!
>>
>> What about against allocating cpus? More tricky as those will currently end
>> up privatizing the freelist via get_partial(), only to deactivate it again,
>> so our list_lock+slab_lock in freeing path would not protect in the
>> meanwhile. But the allocation is currently very inefficient for debug
>> caches, as in get_partial() it will take the list_lock to take the slab from
>> partial list and then in most cases again in deactivate_slab() to return it.
> It seems that I need speed some time to eat these words. Anyway, thanks.
>>
>> If instead the allocation path for kmem_cache_debug() cache would take a
>> single object from the partial list (not whole freelist) under list_lock, it
>> would be ultimately more efficient, and protect against freeing using
>> list_lock. Sounds like an idea worth trying to me?
>
> Hyeonggon had a similar advice that split freeing and allocating slab from
> debugging, likes below:
>
>
> __slab_alloc() {
>     if (kmem_cache_debug(s))
>         slab_alloc_debug()
>     else
>         ___slab_alloc()
> }
>
> I guess that above code aims to solve your mentioned problem (idea)?
>
> slab_free() {
>     if (kmem_cache_debug(s))
>         slab_free_debug()
>     else
>         __do_slab_free()
> }
>
> Currently, I only modify the code of freeing slab to fix the confusing
> messages of "slabinfo -v". If you agree, I can try to realize above
> mentioned slab_alloc_debug() code. Maybe it's also a challenge to me.

I already started working on this approach and hope to post a RFC soon.

> Thanks for your time.
>
>> And of course we would stop creating the 'validate' sysfs files for
>> non-debug caches.

2022-07-18 11:50:06

by Vlastimil Babka

[permalink] [raw]
Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free


On 5/29/22 10:15, Rongwei Wang wrote:
> In use cases where allocating and freeing slab frequently, some
> error messages, such as "Left Redzone overwritten", "First byte
> 0xbb instead of 0xcc" would be printed when validating slabs.
> That's because an object has been filled with SLAB_RED_INACTIVE,
> but has not been added to slab's freelist. And between these
> two states, the behaviour of validating slab is likely to occur.
>
> Actually, it doesn't mean the slab can not work stably. But, these
> confusing messages will disturb slab debugging more or less.
>
> Signed-off-by: Rongwei Wang <[email protected]>

As I've said in the sub-thread I had the following kind of fix in mind. I
think it should cover the cases from your patches 1/3 and 3/3.

----8<----
From c35fe2a781a7bc4ef37ef3ded289f4ced82562bd Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <[email protected]>
Date: Mon, 4 Jul 2022 14:09:09 +0200
Subject: [RFC] mm, slub: restrict sysfs validation to debug caches and make
it safe

Rongwei Wang reports [1] that cache validation triggered by writing to
/sys/kernel/slab/<cache>/validate is racy against normal cache
operations (e.g. freeing) in a way that can cause false positive
inconsistency reports for caches with debugging enabled. The problem is
that debugging actions that mark object free or active and actual
freelist operations are not atomic, and the validation can see an
inconsistent state.

For caches that don't have debugging enabled, other races are possible
that result in false reports of wrong slab counts.

This patch attempts to solve these issues while not adding overhead to
normal (especially fastpath) operations for caches that do not have
debugging enabled, just to make possible userspace-triggered validation
safe. Instead, disable the validation for caches that don't have
debugging enabled and make the sysfs handler return -EINVAL.

For caches that do have debugging enabled, we can instead extend the
existing approach of not using percpu freelists to force all operations
to the slow paths where debugging is checked for and processed.

The processing on free in free_debug_processing() already happens under
n->list_lock and slab_lock() so we can extend it to actually do the
freeing as well and thus make it atomic against concurrent validation.

The processing on alloc in alloc_debug_processing() currently doesn't
take any locks, but we have to first allocate the object from a slab on
the partial list (as percpu slabs are always non-existent) and thus take
n->list_lock. Add a function alloc_single_from_partial() that
additionally takes slab_lock() for the debug processing and then grabs
just the allocated object instead of the whole freelist. This again
makes it atomic against validation and it is also ultimately more
efficient than the current grabbing of freelist immediately followed by
slab deactivation.

Neither of these changes affect the fast paths.

The function free_debug_processing() was moved so that it is placed
later than the definitions of add_partial(), remove_partial() and
discard_slab().

[1] https://lore.kernel.org/all/[email protected]/

Reported-by: Rongwei Wang <[email protected]>
Signed-off-by: Vlastimil Babka <[email protected]>
---
mm/slub.c | 250 +++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 174 insertions(+), 76 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index b1281b8654bd..954fe7ad5ee1 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1324,17 +1324,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
}

static noinline int alloc_debug_processing(struct kmem_cache *s,
- struct slab *slab,
- void *object, unsigned long addr)
+ struct slab *slab, void *object)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
if (!alloc_consistency_checks(s, slab, object))
goto bad;
}

- /* Success perform special debug activities for allocs */
- if (s->flags & SLAB_STORE_USER)
- set_track(s, object, TRACK_ALLOC, addr);
+ /* Success. Perform special debug activities for allocs */
trace(s, slab, object, 1);
init_object(s, object, SLUB_RED_ACTIVE);
return 1;
@@ -1385,63 +1382,6 @@ static inline int free_consistency_checks(struct kmem_cache *s,
return 1;
}

-/* Supports checking bulk free of a constructed freelist */
-static noinline int free_debug_processing(
- struct kmem_cache *s, struct slab *slab,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr)
-{
- struct kmem_cache_node *n = get_node(s, slab_nid(slab));
- void *object = head;
- int cnt = 0;
- unsigned long flags, flags2;
- int ret = 0;
- depot_stack_handle_t handle = 0;
-
- if (s->flags & SLAB_STORE_USER)
- handle = set_track_prepare();
-
- spin_lock_irqsave(&n->list_lock, flags);
- slab_lock(slab, &flags2);
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!check_slab(s, slab))
- goto out;
- }
-
-next_object:
- cnt++;
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!free_consistency_checks(s, slab, object, addr))
- goto out;
- }
-
- if (s->flags & SLAB_STORE_USER)
- set_track_update(s, object, TRACK_FREE, addr, handle);
- trace(s, slab, object, 0);
- /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
- init_object(s, object, SLUB_RED_INACTIVE);
-
- /* Reached end of constructed freelist yet? */
- if (object != tail) {
- object = get_freepointer(s, object);
- goto next_object;
- }
- ret = 1;
-
-out:
- if (cnt != bulk_cnt)
- slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
- bulk_cnt, cnt);
-
- slab_unlock(slab, &flags2);
- spin_unlock_irqrestore(&n->list_lock, flags);
- if (!ret)
- slab_fix(s, "Object at 0x%p not freed", object);
- return ret;
-}
-
/*
* Parse a block of slub_debug options. Blocks are delimited by ';'
*
@@ -1661,7 +1601,7 @@ static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}

static inline int alloc_debug_processing(struct kmem_cache *s,
- struct slab *slab, void *object, unsigned long addr) { return 0; }
+ struct slab *slab, void *object) { return 0; }

static inline int free_debug_processing(
struct kmem_cache *s, struct slab *slab,
@@ -2102,6 +2042,42 @@ static inline void remove_partial(struct kmem_cache_node *n,
n->nr_partial--;
}

+/*
+ * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
+ * slab from the n->partial list. Removes only a single object from the slab
+ * under slab_lock(), does the alloc_debug_processing() checks and leaves the
+ * slab on the list, or moves it to full list if it was the last object.
+ */
+static void *alloc_single_from_partial(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct slab *slab)
+{
+ void *object;
+ unsigned long flags;
+
+ lockdep_assert_held(&n->list_lock);
+
+ slab_lock(slab, &flags);
+
+ object = slab->freelist;
+ slab->freelist = get_freepointer(s, object);
+ slab->inuse++;
+
+ if (!alloc_debug_processing(s, slab, object)) {
+ remove_partial(n, slab);
+ slab_unlock(slab, &flags);
+ return NULL;
+ }
+
+ if (slab->inuse == slab->objects) {
+ remove_partial(n, slab);
+ add_full(s, n, slab);
+ }
+
+ slab_unlock(slab, &flags);
+
+ return object;
+}
+
/*
* Remove slab from the partial list, freeze it and
* return the pointer to the freelist.
@@ -2182,6 +2158,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
if (!pfmemalloc_match(slab, gfpflags))
continue;

+ if (kmem_cache_debug(s)) {
+ object = alloc_single_from_partial(s, n, slab);
+ if (object)
+ break;
+ continue;
+ }
+
t = acquire_slab(s, n, slab, object == NULL);
if (!t)
break;
@@ -2788,6 +2771,104 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
return atomic_long_read(&n->total_objects);
}
+
+/* Supports checking bulk free of a constructed freelist */
+static noinline int free_debug_processing(
+ struct kmem_cache *s, struct slab *slab,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
+{
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+ struct slab *slab_to_discard = NULL;
+ void *object = head;
+ int cnt = 0;
+ unsigned long flags, flags2;
+ int ret = 0;
+ depot_stack_handle_t handle = 0;
+
+ if (s->flags & SLAB_STORE_USER)
+ handle = set_track_prepare();
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ slab_lock(slab, &flags2);
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, slab))
+ goto out;
+ }
+
+ if (slab->inuse < bulk_cnt) {
+ slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
+ slab->inuse, bulk_cnt);
+ goto out;
+ }
+
+next_object:
+
+ if (++cnt > bulk_cnt)
+ goto out_cnt;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, slab, object, addr))
+ goto out;
+ }
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track_update(s, object, TRACK_FREE, addr, handle);
+ trace(s, slab, object, 0);
+ /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
+ init_object(s, object, SLUB_RED_INACTIVE);
+
+ /* Reached end of constructed freelist yet? */
+ if (object != tail) {
+ object = get_freepointer(s, object);
+ goto next_object;
+ }
+ ret = 1;
+
+out_cnt:
+ if (cnt != bulk_cnt)
+ slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
+ bulk_cnt, cnt);
+
+out:
+ if (ret) {
+ void *prior = slab->freelist;
+
+ /* Perform the actual freeing while we still hold the locks */
+ slab->inuse -= cnt;
+ set_freepointer(s, tail, prior);
+ slab->freelist = head;
+
+ /* Do we need to remove the slab from full or partial list? */
+ if (!prior) {
+ remove_full(s, n, slab);
+ } else if (slab->inuse == 0) {
+ remove_partial(n, slab);
+ stat(s, FREE_REMOVE_PARTIAL);
+ }
+
+ /* Do we need to discard the slab or add to partial list? */
+ if (slab->inuse == 0) {
+ slab_to_discard = slab;
+ } else if (!prior) {
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+
+ }
+
+ slab_unlock(slab, &flags2);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ if (!ret)
+ slab_fix(s, "Object at 0x%p not freed", object);
+ if (slab_to_discard) {
+ stat(s, FREE_SLAB);
+ discard_slab(s, slab);
+ }
+
+ return ret;
+}
#endif /* CONFIG_SLUB_DEBUG */

#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
@@ -3045,19 +3126,35 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,

stat(s, ALLOC_SLAB);

-check_new_slab:
-
if (kmem_cache_debug(s)) {
- if (!alloc_debug_processing(s, slab, freelist, addr)) {
- /* Slab failed checks. Next slab needed */
- goto new_slab;
- } else {
+ if (!alloc_debug_processing(s, slab, freelist))
/*
- * For debug case, we don't load freelist so that all
- * allocations go through alloc_debug_processing()
+ * It's not really expected that this would fail on a
+ * freshly allocated slab, but a concurrent memory
+ * corruption in theory could cause that.
*/
- goto return_single;
- }
+ goto new_slab;
+
+ /*
+ * For debug caches we don't load the freelist so that all
+ * allocations have to go through alloc_debug_processing()
+ */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+ goto return_single;
+ }
+
+check_new_slab:
+
+ if (kmem_cache_debug(s)) {
+ /*
+ * For debug caches here we had to go through
+ * alloc_single_from_partial() so just store the tracking info
+ * and return the object
+ */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+ return freelist;
}

if (unlikely(!pfmemalloc_match(slab, gfpflags)))
@@ -3341,9 +3438,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
if (kfence_free(head))
return;

- if (kmem_cache_debug(s) &&
- !free_debug_processing(s, slab, head, tail, cnt, addr))
+ if (kmem_cache_debug(s)) {
+ free_debug_processing(s, slab, head, tail, cnt, addr);
return;
+ }

do {
if (unlikely(n)) {
@@ -5625,7 +5723,7 @@ static ssize_t validate_store(struct kmem_cache *s,
{
int ret = -EINVAL;

- if (buf[0] == '1') {
+ if (buf[0] == '1' && kmem_cache_debug(s)) {
ret = validate_slab_cache(s);
if (ret >= 0)
ret = length;
--
2.36.1

2022-07-19 14:33:43

by Rongwei Wang

[permalink] [raw]
Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 7/18/22 7:09 PM, Vlastimil Babka wrote:
>
> On 5/29/22 10:15, Rongwei Wang wrote:
>> In use cases where allocating and freeing slab frequently, some
>> error messages, such as "Left Redzone overwritten", "First byte
>> 0xbb instead of 0xcc" would be printed when validating slabs.
>> That's because an object has been filled with SLAB_RED_INACTIVE,
>> but has not been added to slab's freelist. And between these
>> two states, the behaviour of validating slab is likely to occur.
>>
>> Actually, it doesn't mean the slab can not work stably. But, these
>> confusing messages will disturb slab debugging more or less.
>>
>> Signed-off-by: Rongwei Wang <[email protected]>
>
> As I've said in the sub-thread I had the following kind of fix in mind. I
> think it should cover the cases from your patches 1/3 and 3/3.
>
> ----8<----
> From c35fe2a781a7bc4ef37ef3ded289f4ced82562bd Mon Sep 17 00:00:00 2001
> From: Vlastimil Babka <[email protected]>
> Date: Mon, 4 Jul 2022 14:09:09 +0200
> Subject: [RFC] mm, slub: restrict sysfs validation to debug caches and make
> it safe
>
> Rongwei Wang reports [1] that cache validation triggered by writing to
> /sys/kernel/slab/<cache>/validate is racy against normal cache
> operations (e.g. freeing) in a way that can cause false positive
> inconsistency reports for caches with debugging enabled. The problem is
> that debugging actions that mark object free or active and actual
> freelist operations are not atomic, and the validation can see an
> inconsistent state.
>
> For caches that don't have debugging enabled, other races are possible
> that result in false reports of wrong slab counts.
>
> This patch attempts to solve these issues while not adding overhead to
> normal (especially fastpath) operations for caches that do not have
> debugging enabled, just to make possible userspace-triggered validation
> safe. Instead, disable the validation for caches that don't have
> debugging enabled and make the sysfs handler return -EINVAL.
>
> For caches that do have debugging enabled, we can instead extend the
> existing approach of not using percpu freelists to force all operations
> to the slow paths where debugging is checked for and processed.
>
> The processing on free in free_debug_processing() already happens under
> n->list_lock and slab_lock() so we can extend it to actually do the
> freeing as well and thus make it atomic against concurrent validation.
>
> The processing on alloc in alloc_debug_processing() currently doesn't
> take any locks, but we have to first allocate the object from a slab on
> the partial list (as percpu slabs are always non-existent) and thus take
> n->list_lock. Add a function alloc_single_from_partial() that
> additionally takes slab_lock() for the debug processing and then grabs
> just the allocated object instead of the whole freelist. This again
> makes it atomic against validation and it is also ultimately more
> efficient than the current grabbing of freelist immediately followed by
> slab deactivation.
>
> Neither of these changes affect the fast paths.
>
> The function free_debug_processing() was moved so that it is placed
> later than the definitions of add_partial(), remove_partial() and
> discard_slab().
>
> [1] https://lore.kernel.org/all/[email protected]/
>
> Reported-by: Rongwei Wang <[email protected]>
> Signed-off-by: Vlastimil Babka <[email protected]>
> ---
> mm/slub.c | 250 +++++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 174 insertions(+), 76 deletions(-)
>
> diff --git a/mm/slub.c b/mm/slub.c
> index b1281b8654bd..954fe7ad5ee1 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1324,17 +1324,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
> }
>
> static noinline int alloc_debug_processing(struct kmem_cache *s,
> - struct slab *slab,
> - void *object, unsigned long addr)
> + struct slab *slab, void *object)
> {
> if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> if (!alloc_consistency_checks(s, slab, object))
> goto bad;
> }
>
> - /* Success perform special debug activities for allocs */
> - if (s->flags & SLAB_STORE_USER)
> - set_track(s, object, TRACK_ALLOC, addr);
> + /* Success. Perform special debug activities for allocs */
> trace(s, slab, object, 1);
> init_object(s, object, SLUB_RED_ACTIVE);
> return 1;
> @@ -1385,63 +1382,6 @@ static inline int free_consistency_checks(struct kmem_cache *s,
> return 1;
> }
>
> -/* Supports checking bulk free of a constructed freelist */
> -static noinline int free_debug_processing(
> - struct kmem_cache *s, struct slab *slab,
> - void *head, void *tail, int bulk_cnt,
> - unsigned long addr)
> -{
> - struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> - void *object = head;
> - int cnt = 0;
> - unsigned long flags, flags2;
> - int ret = 0;
> - depot_stack_handle_t handle = 0;
> -
> - if (s->flags & SLAB_STORE_USER)
> - handle = set_track_prepare();
> -
> - spin_lock_irqsave(&n->list_lock, flags);
> - slab_lock(slab, &flags2);
> -
> - if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> - if (!check_slab(s, slab))
> - goto out;
> - }
> -
> -next_object:
> - cnt++;
> -
> - if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> - if (!free_consistency_checks(s, slab, object, addr))
> - goto out;
> - }
> -
> - if (s->flags & SLAB_STORE_USER)
> - set_track_update(s, object, TRACK_FREE, addr, handle);
> - trace(s, slab, object, 0);
> - /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
> - init_object(s, object, SLUB_RED_INACTIVE);
> -
> - /* Reached end of constructed freelist yet? */
> - if (object != tail) {
> - object = get_freepointer(s, object);
> - goto next_object;
> - }
> - ret = 1;
> -
> -out:
> - if (cnt != bulk_cnt)
> - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
> - bulk_cnt, cnt);
> -
> - slab_unlock(slab, &flags2);
> - spin_unlock_irqrestore(&n->list_lock, flags);
> - if (!ret)
> - slab_fix(s, "Object at 0x%p not freed", object);
> - return ret;
> -}
> -
> /*
> * Parse a block of slub_debug options. Blocks are delimited by ';'
> *
> @@ -1661,7 +1601,7 @@ static inline
> void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
>
> static inline int alloc_debug_processing(struct kmem_cache *s,
> - struct slab *slab, void *object, unsigned long addr) { return 0; }
> + struct slab *slab, void *object) { return 0; }
>
> static inline int free_debug_processing(
> struct kmem_cache *s, struct slab *slab,
> @@ -2102,6 +2042,42 @@ static inline void remove_partial(struct kmem_cache_node *n,
> n->nr_partial--;
> }
>
> +/*
> + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
> + * slab from the n->partial list. Removes only a single object from the slab
> + * under slab_lock(), does the alloc_debug_processing() checks and leaves the
> + * slab on the list, or moves it to full list if it was the last object.
> + */
> +static void *alloc_single_from_partial(struct kmem_cache *s,
> + struct kmem_cache_node *n, struct slab *slab)
> +{
> + void *object;
> + unsigned long flags;
> +
> + lockdep_assert_held(&n->list_lock);
> +
> + slab_lock(slab, &flags);
> +
> + object = slab->freelist;
> + slab->freelist = get_freepointer(s, object);
> + slab->inuse++;
> +
> + if (!alloc_debug_processing(s, slab, object)) {
> + remove_partial(n, slab);
> + slab_unlock(slab, &flags);
> + return NULL;
> + }
> +
> + if (slab->inuse == slab->objects) {
> + remove_partial(n, slab);
> + add_full(s, n, slab);
> + }
> +
> + slab_unlock(slab, &flags);
> +
> + return object;
> +}
> +
> /*
> * Remove slab from the partial list, freeze it and
> * return the pointer to the freelist.
> @@ -2182,6 +2158,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
> if (!pfmemalloc_match(slab, gfpflags))
> continue;
>
> + if (kmem_cache_debug(s)) {
> + object = alloc_single_from_partial(s, n, slab);
> + if (object)
> + break;
> + continue;
> + }
> +
> t = acquire_slab(s, n, slab, object == NULL);
> if (!t)
> break;
> @@ -2788,6 +2771,104 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
> {
> return atomic_long_read(&n->total_objects);
> }
> +
> +/* Supports checking bulk free of a constructed freelist */
> +static noinline int free_debug_processing(
> + struct kmem_cache *s, struct slab *slab,
> + void *head, void *tail, int bulk_cnt,
> + unsigned long addr)
> +{
> + struct kmem_cache_node *n = get_node(s, slab_nid(slab));
> + struct slab *slab_to_discard = NULL;
> + void *object = head;
> + int cnt = 0;
> + unsigned long flags, flags2;
> + int ret = 0;
> + depot_stack_handle_t handle = 0;
> +
> + if (s->flags & SLAB_STORE_USER)
> + handle = set_track_prepare();
> +
> + spin_lock_irqsave(&n->list_lock, flags);
> + slab_lock(slab, &flags2);
> +
> + if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> + if (!check_slab(s, slab))
> + goto out;
> + }
> +
> + if (slab->inuse < bulk_cnt) {
> + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
> + slab->inuse, bulk_cnt);
> + goto out;
> + }
> +
> +next_object:
> +
> + if (++cnt > bulk_cnt)
> + goto out_cnt;
> +
> + if (s->flags & SLAB_CONSISTENCY_CHECKS) {
> + if (!free_consistency_checks(s, slab, object, addr))
> + goto out;
> + }
> +
> + if (s->flags & SLAB_STORE_USER)
> + set_track_update(s, object, TRACK_FREE, addr, handle);
> + trace(s, slab, object, 0);
> + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
> + init_object(s, object, SLUB_RED_INACTIVE);
> +
> + /* Reached end of constructed freelist yet? */
> + if (object != tail) {
> + object = get_freepointer(s, object);
> + goto next_object;
> + }
> + ret = 1;
> +
> +out_cnt:
> + if (cnt != bulk_cnt)
> + slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
> + bulk_cnt, cnt);
> +
> +out:
> + if (ret) {
> + void *prior = slab->freelist;
> +
> + /* Perform the actual freeing while we still hold the locks */
> + slab->inuse -= cnt;
> + set_freepointer(s, tail, prior);
> + slab->freelist = head;
> +
> + /* Do we need to remove the slab from full or partial list? */
> + if (!prior) {
> + remove_full(s, n, slab);
> + } else if (slab->inuse == 0) {
> + remove_partial(n, slab);
> + stat(s, FREE_REMOVE_PARTIAL);
> + }
> +
> + /* Do we need to discard the slab or add to partial list? */
> + if (slab->inuse == 0) {
> + slab_to_discard = slab;
> + } else if (!prior) {
> + add_partial(n, slab, DEACTIVATE_TO_TAIL);
> + stat(s, FREE_ADD_PARTIAL);
> + }
> +
> + }
> +
> + slab_unlock(slab, &flags2);
> + spin_unlock_irqrestore(&n->list_lock, flags);
> + if (!ret)
> + slab_fix(s, "Object at 0x%p not freed", object);
> + if (slab_to_discard) {
> + stat(s, FREE_SLAB);
> + discard_slab(s, slab);
> + }
> +
> + return ret;
> +}
I have tested this patch, and it indeed deals with the bug I described.
Though I have also prepared this part of the code, your code is OK to me.

-wrw

> #endif /* CONFIG_SLUB_DEBUG */
>
> #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
> @@ -3045,19 +3126,35 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>
> stat(s, ALLOC_SLAB);
>
> -check_new_slab:
> -
> if (kmem_cache_debug(s)) {
> - if (!alloc_debug_processing(s, slab, freelist, addr)) {
> - /* Slab failed checks. Next slab needed */
> - goto new_slab;
> - } else {
> + if (!alloc_debug_processing(s, slab, freelist))
> /*
> - * For debug case, we don't load freelist so that all
> - * allocations go through alloc_debug_processing()
> + * It's not really expected that this would fail on a
> + * freshly allocated slab, but a concurrent memory
> + * corruption in theory could cause that.
> */
> - goto return_single;
> - }
> + goto new_slab;
> +
> + /*
> + * For debug caches we don't load the freelist so that all
> + * allocations have to go through alloc_debug_processing()
> + */
> + if (s->flags & SLAB_STORE_USER)
> + set_track(s, freelist, TRACK_ALLOC, addr);
> + goto return_single;
> + }
> +
> +check_new_slab:
> +
> + if (kmem_cache_debug(s)) {
> + /*
> + * For debug caches here we had to go through
> + * alloc_single_from_partial() so just store the tracking info
> + * and return the object
> + */
> + if (s->flags & SLAB_STORE_USER)
> + set_track(s, freelist, TRACK_ALLOC, addr);
> + return freelist;
> }
>
> if (unlikely(!pfmemalloc_match(slab, gfpflags)))
> @@ -3341,9 +3438,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
> if (kfence_free(head))
> return;
>
> - if (kmem_cache_debug(s) &&
> - !free_debug_processing(s, slab, head, tail, cnt, addr))
> + if (kmem_cache_debug(s)) {
> + free_debug_processing(s, slab, head, tail, cnt, addr);
> return;
> + }
>
> do {
> if (unlikely(n)) {
> @@ -5625,7 +5723,7 @@ static ssize_t validate_store(struct kmem_cache *s,
> {
> int ret = -EINVAL;
>
> - if (buf[0] == '1') {
> + if (buf[0] == '1' && kmem_cache_debug(s)) {
> ret = validate_slab_cache(s);
> if (ret >= 0)
> ret = length;

2022-07-19 14:55:02

by Rongwei Wang

[permalink] [raw]
Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free



On 7/19/22 10:21 PM, Vlastimil Babka wrote:
> On 7/19/22 16:15, Rongwei Wang wrote:
>>
> ...
>>> +
>>> +    slab_unlock(slab, &flags2);
>>> +    spin_unlock_irqrestore(&n->list_lock, flags);
>>> +    if (!ret)
>>> +        slab_fix(s, "Object at 0x%p not freed", object);
>>> +    if (slab_to_discard) {
>>> +        stat(s, FREE_SLAB);
>>> +        discard_slab(s, slab);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>> I had test this patch, and it indeed deal with the bug that I described.
>
> Thanks.
>
>> Though I am also has prepared this part of code, your code is ok to me.
>
> Aha, feel free to post your version, maybe it's simpler? We can compare.
My code only includes the part corresponding to your free_debug_processing();
its structure looks like this:

slab_free() {
        if (kmem_cache_debug(s))
                slab_free_debug();
        else
                __do_slab_free();
}

The __slab_free_debug() here is similar to your free_debug_processing().

+/*
+ * Slow path handling for debugging.
+ */
+static void __slab_free_debug(struct kmem_cache *s, struct slab *slab,
+                              void *head, void *tail, int cnt,
+                              unsigned long addr)
+{
+        void *prior;
+        int was_frozen;
+        struct slab new;
+        unsigned long counters;
+        struct kmem_cache_node *n = NULL;
+        unsigned long flags;
+        int ret;
+
+        stat(s, FREE_SLOWPATH);
+
+        if (kfence_free(head))
+                return;
+
+        n = get_node(s, slab_nid(slab));
+
+        spin_lock_irqsave(&n->list_lock, flags);
+        ret = free_debug_processing(s, slab, head, tail, cnt, addr);
+        if (!ret) {
+                spin_unlock_irqrestore(&n->list_lock, flags);
+                return;
+        }
+
+        do {
+                prior = slab->freelist;
+                counters = slab->counters;
+                set_freepointer(s, tail, prior);
+                new.counters = counters;
+                was_frozen = new.frozen;
+                new.inuse -= cnt;
+        } while (!cmpxchg_double_slab(s, slab,
+                                      prior, counters,
+                                      head, new.counters,
+                                      "__slab_free"));
+
+        if ((new.inuse && prior) || was_frozen) {
+                spin_unlock_irqrestore(&n->list_lock, flags);
+                if (likely(was_frozen)) {
+                        stat(s, FREE_FROZEN);
+                }
+
+                return;
+        }
+
+        if (!new.inuse && n->nr_partial >= s->min_partial) {
+                /* Indicate no user in this slab, discarding it naturally. */
+                if (prior) {
+                        /* Slab on the partial list. */
+                        remove_partial(n, slab);
+                        stat(s, FREE_REMOVE_PARTIAL);
+                } else {
+                        /* Slab must be on the full list */
+                        remove_full(s, n, slab);
+                }
+
+                spin_unlock_irqrestore(&n->list_lock, flags);
+                stat(s, FREE_SLAB);
+                discard_slab(s, slab);
+                return;
+        }
+
+        /*
+         * Objects left in the slab. If it was not on the partial list before
+         * then add it.
+         */
+        if (!prior) {
+                remove_full(s, n, slab);
+                add_partial(n, slab, DEACTIVATE_TO_TAIL);
+                stat(s, FREE_ADD_PARTIAL);
+        }
+        spin_unlock_irqrestore(&n->list_lock, flags);
+
+        return;
+}




2022-07-19 15:04:31

by Vlastimil Babka

[permalink] [raw]
Subject: Re: [PATCH 1/3] mm/slub: fix the race between validate_slab and slab_free

On 7/19/22 16:15, Rongwei Wang wrote:
>
...
>> +
>> +    slab_unlock(slab, &flags2);
>> +    spin_unlock_irqrestore(&n->list_lock, flags);
>> +    if (!ret)
>> +        slab_fix(s, "Object at 0x%p not freed", object);
>> +    if (slab_to_discard) {
>> +        stat(s, FREE_SLAB);
>> +        discard_slab(s, slab);
>> +    }
>> +
>> +    return ret;
>> +}
> I had test this patch, and it indeed deal with the bug that I described.

Thanks.

> Though I am also has prepared this part of code, your code is ok to me.

Aha, feel free to post your version, maybe it's simpler? We can compare.