v1->v2:
- Improved changelog and variable naming for PATCH 1~2.
- PATCH 3 adds a per-cpu counter to avoid a performance regression
in concurrent __slab_free().
[Testing]
On my 32-cpu 2-socket physical machine:
Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
perf stat --null --repeat 10 -- hackbench 20 thread 20000
== original, not patched
19.211637055 seconds time elapsed ( +- 0.57% )
== patched with patch1~2
Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
21.731833146 seconds time elapsed ( +- 0.17% )
== patched with patch1~3
Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
19.112106847 seconds time elapsed ( +- 0.64% )
Xunlei Pang (3):
mm/slub: Introduce two counters for partial objects
mm/slub: Get rid of count_partial()
mm/slub: Use percpu partial free counter
mm/slab.h | 2 +
mm/slub.c | 124 +++++++++++++++++++++++++++++++++++++++++++-------------------
2 files changed, 89 insertions(+), 37 deletions(-)
--
1.8.3.1
count_partial() can spend a long time iterating under the node list_lock
when the partial page lists are large, which can cause a thundering herd
effect on list_lock contention.
We have HSF RT (High-speed Service Framework Response-Time) monitors whose
RT figures fluctuated randomly, so we deployed a tool that detects "irq off"
and "preempt off" periods and dumps the culprit's call trace. It captured
the list_lock being held for nearly 100ms with irqs off, triggered by "ss",
which also caused network timeouts.
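For reference, the count_partial() in question (removed later in this
series by "mm/slub: Get rid of count_partial()") walks the whole partial
list under list_lock with irqs disabled, so its cost grows with the
length of the list:

static unsigned long count_partial(struct kmem_cache_node *n,
                                   int (*get_count)(struct page *))
{
        unsigned long flags;
        unsigned long x = 0;
        struct page *page;

        spin_lock_irqsave(&n->list_lock, flags);
        list_for_each_entry(page, &n->partial, slab_list)
                x += get_count(page);
        spin_unlock_irqrestore(&n->list_lock, flags);
        return x;
}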
This patch introduces two counters that maintain the actual number
of partial objects dynamically, instead of iterating the partial
page lists with list_lock held.
New counters of kmem_cache_node: partial_free_objs, partial_total_objs.
The main update operations are in the slow path under list_lock, so the
performance impact should be minimal, except for the __slab_free() path,
which is addressed later in the series.
Acked-by: Pekka Enberg <[email protected]>
Co-developed-by: Wen Yang <[email protected]>
Signed-off-by: Xunlei Pang <[email protected]>
---
mm/slab.h | 2 ++
mm/slub.c | 37 ++++++++++++++++++++++++++++++++++++-
2 files changed, 38 insertions(+), 1 deletion(-)
diff --git a/mm/slab.h b/mm/slab.h
index 7e94700..c85e2fa 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -616,6 +616,8 @@ struct kmem_cache_node {
#ifdef CONFIG_SLUB
unsigned long nr_partial;
struct list_head partial;
+ atomic_long_t partial_free_objs;
+ atomic_long_t partial_total_objs;
#ifdef CONFIG_SLUB_DEBUG
atomic_long_t nr_slabs;
atomic_long_t total_objects;
diff --git a/mm/slub.c b/mm/slub.c
index 6589b41..6708d96 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1775,10 +1775,24 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
/*
* Management of partially allocated slabs.
*/
+
+static inline void
+__update_partial_free(struct kmem_cache_node *n, long delta)
+{
+ atomic_long_add(delta, &n->partial_free_objs);
+}
+
+static inline void
+__update_partial_total(struct kmem_cache_node *n, long delta)
+{
+ atomic_long_add(delta, &n->partial_total_objs);
+}
+
static inline void
__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
{
n->nr_partial++;
+ __update_partial_total(n, page->objects);
if (tail == DEACTIVATE_TO_TAIL)
list_add_tail(&page->slab_list, &n->partial);
else
@@ -1798,6 +1812,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
lockdep_assert_held(&n->list_lock);
list_del(&page->slab_list);
n->nr_partial--;
+ __update_partial_total(n, -page->objects);
}
/*
@@ -1842,6 +1857,7 @@ static inline void *acquire_slab(struct kmem_cache *s,
return NULL;
remove_partial(n, page);
+ __update_partial_free(n, -*objects);
WARN_ON(!freelist);
return freelist;
}
@@ -2174,8 +2190,11 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
"unfreezing slab"))
goto redo;
- if (lock)
+ if (lock) {
+ if (m == M_PARTIAL)
+ __update_partial_free(n, page->objects - page->inuse);
spin_unlock(&n->list_lock);
+ }
if (m == M_PARTIAL)
stat(s, tail);
@@ -2241,6 +2260,7 @@ static void unfreeze_partials(struct kmem_cache *s,
discard_page = page;
} else {
add_partial(n, page, DEACTIVATE_TO_TAIL);
+ __update_partial_free(n, page->objects - page->inuse);
stat(s, FREE_ADD_PARTIAL);
}
}
@@ -2915,6 +2935,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
head, new.counters,
"__slab_free"));
+ if (!was_frozen && prior) {
+ if (n)
+ __update_partial_free(n, cnt);
+ else
+ __update_partial_free(get_node(s, page_to_nid(page)), cnt);
+ }
+
if (likely(!n)) {
/*
@@ -2944,6 +2971,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
remove_full(s, n, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
+ __update_partial_free(n, page->objects - page->inuse);
stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2955,6 +2983,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* Slab on the partial list.
*/
remove_partial(n, page);
+ __update_partial_free(n, page->inuse - page->objects);
stat(s, FREE_REMOVE_PARTIAL);
} else {
/* Slab must be on the full list */
@@ -3364,6 +3393,8 @@ static inline int calculate_order(unsigned int size)
n->nr_partial = 0;
spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
+ atomic_long_set(&n->partial_free_objs, 0);
+ atomic_long_set(&n->partial_total_objs, 0);
#ifdef CONFIG_SLUB_DEBUG
atomic_long_set(&n->nr_slabs, 0);
atomic_long_set(&n->total_objects, 0);
@@ -3437,6 +3468,7 @@ static void early_kmem_cache_node_alloc(int node)
* initialized and there is no concurrent access.
*/
__add_partial(n, page, DEACTIVATE_TO_HEAD);
+ __update_partial_free(n, page->objects - page->inuse);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3747,6 +3779,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
list_for_each_entry_safe(page, h, &n->partial, slab_list) {
if (!page->inuse) {
remove_partial(n, page);
+ __update_partial_free(n, page->objects - page->inuse);
list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
@@ -4045,6 +4078,8 @@ int __kmem_cache_shrink(struct kmem_cache *s)
if (free == page->objects) {
list_move(&page->slab_list, &discard);
n->nr_partial--;
+ __update_partial_free(n, -free);
+ __update_partial_total(n, -free);
} else if (free <= SHRINK_PROMOTE_MAX)
list_move(&page->slab_list, promote + free - 1);
}
--
1.8.3.1
The only concern with introducing the partial counters is that
partial_free_objs may suffer atomic-operation contention when
__slab_free() runs concurrently on the same SLUB cache.
This patch changes it to a per-cpu counter to avoid that.
Co-developed-by: Wen Yang <[email protected]>
Signed-off-by: Xunlei Pang <[email protected]>
---
mm/slab.h | 2 +-
mm/slub.c | 38 +++++++++++++++++++++++++++++++-------
2 files changed, 32 insertions(+), 8 deletions(-)
diff --git a/mm/slab.h b/mm/slab.h
index c85e2fa..a709a70 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -616,7 +616,7 @@ struct kmem_cache_node {
#ifdef CONFIG_SLUB
unsigned long nr_partial;
struct list_head partial;
- atomic_long_t partial_free_objs;
+ atomic_long_t __percpu *partial_free_objs;
atomic_long_t partial_total_objs;
#ifdef CONFIG_SLUB_DEBUG
atomic_long_t nr_slabs;
diff --git a/mm/slub.c b/mm/slub.c
index 25a4421..f6fc60b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1775,11 +1775,21 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
/*
* Management of partially allocated slabs.
*/
+static inline long get_partial_free(struct kmem_cache_node *n)
+{
+ long nr = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ nr += atomic_long_read(per_cpu_ptr(n->partial_free_objs, cpu));
+
+ return nr;
+}
static inline void
__update_partial_free(struct kmem_cache_node *n, long delta)
{
- atomic_long_add(delta, &n->partial_free_objs);
+ atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));
}
static inline void
@@ -2429,12 +2439,12 @@ static unsigned long partial_counter(struct kmem_cache_node *n,
unsigned long ret = 0;
if (item == PARTIAL_FREE) {
- ret = atomic_long_read(&n->partial_free_objs);
+ ret = get_partial_free(n);
} else if (item == PARTIAL_TOTAL) {
ret = atomic_long_read(&n->partial_total_objs);
} else if (item == PARTIAL_INUSE) {
ret = atomic_long_read(&n->partial_total_objs) -
- atomic_long_read(&n->partial_free_objs);
+ get_partial_free(n);
if ((long)ret < 0)
ret = 0;
}
@@ -3390,19 +3400,28 @@ static inline int calculate_order(unsigned int size)
return -ENOSYS;
}
-static void
+static int
init_kmem_cache_node(struct kmem_cache_node *n)
{
+ int cpu;
+
n->nr_partial = 0;
spin_lock_init(&n->list_lock);
INIT_LIST_HEAD(&n->partial);
- atomic_long_set(&n->partial_free_objs, 0);
+
+ n->partial_free_objs = alloc_percpu(atomic_long_t);
+ if (!n->partial_free_objs)
+ return -ENOMEM;
+ for_each_possible_cpu(cpu)
+ atomic_long_set(per_cpu_ptr(n->partial_free_objs, cpu), 0);
atomic_long_set(&n->partial_total_objs, 0);
#ifdef CONFIG_SLUB_DEBUG
atomic_long_set(&n->nr_slabs, 0);
atomic_long_set(&n->total_objects, 0);
INIT_LIST_HEAD(&n->full);
#endif
+
+ return 0;
}
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
@@ -3463,7 +3482,7 @@ static void early_kmem_cache_node_alloc(int node)
page->inuse = 1;
page->frozen = 0;
kmem_cache_node->node[node] = n;
- init_kmem_cache_node(n);
+ BUG_ON(init_kmem_cache_node(n) < 0);
inc_slabs_node(kmem_cache_node, node, page->objects);
/*
@@ -3481,6 +3500,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
for_each_kmem_cache_node(s, node, n) {
s->node[node] = NULL;
+ free_percpu(n->partial_free_objs);
kmem_cache_free(kmem_cache_node, n);
}
}
@@ -3511,7 +3531,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
return 0;
}
- init_kmem_cache_node(n);
+ if (init_kmem_cache_node(n) < 0) {
+ free_kmem_cache_nodes(s);
+ return 0;
+ }
+
s->node[node] = n;
}
return 1;
--
1.8.3.1
Now that the partial counters are in place, use them directly
and get rid of count_partial().
Co-developed-by: Wen Yang <[email protected]>
Signed-off-by: Xunlei Pang <[email protected]>
---
mm/slub.c | 57 ++++++++++++++++++++++++---------------------------------
1 file changed, 24 insertions(+), 33 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 6708d96..25a4421 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2414,11 +2414,6 @@ static inline int node_match(struct page *page, int node)
}
#ifdef CONFIG_SLUB_DEBUG
-static int count_free(struct page *page)
-{
- return page->objects - page->inuse;
-}
-
static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
return atomic_long_read(&n->total_objects);
@@ -2426,19 +2421,27 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
#endif /* CONFIG_SLUB_DEBUG */
#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
-static unsigned long count_partial(struct kmem_cache_node *n,
- int (*get_count)(struct page *))
-{
- unsigned long flags;
- unsigned long x = 0;
- struct page *page;
+enum partial_item { PARTIAL_FREE, PARTIAL_INUSE, PARTIAL_TOTAL };
+
+static unsigned long partial_counter(struct kmem_cache_node *n,
+ enum partial_item item)
+{
+ unsigned long ret = 0;
+
+ if (item == PARTIAL_FREE) {
+ ret = atomic_long_read(&n->partial_free_objs);
+ } else if (item == PARTIAL_TOTAL) {
+ ret = atomic_long_read(&n->partial_total_objs);
+ } else if (item == PARTIAL_INUSE) {
+ ret = atomic_long_read(&n->partial_total_objs) -
+ atomic_long_read(&n->partial_free_objs);
+ if ((long)ret < 0)
+ ret = 0;
+ }
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, slab_list)
- x += get_count(page);
- spin_unlock_irqrestore(&n->list_lock, flags);
- return x;
+ return ret;
}
+
#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
static noinline void
@@ -2468,7 +2471,7 @@ static unsigned long count_partial(struct kmem_cache_node *n,
unsigned long nr_objs;
unsigned long nr_free;
- nr_free = count_partial(n, count_free);
+ nr_free = partial_counter(n, PARTIAL_FREE);
nr_slabs = node_nr_slabs(n);
nr_objs = node_nr_objs(n);
@@ -4444,18 +4447,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
}
#endif
-#ifdef CONFIG_SYSFS
-static int count_inuse(struct page *page)
-{
- return page->inuse;
-}
-
-static int count_total(struct page *page)
-{
- return page->objects;
-}
-#endif
-
#ifdef CONFIG_SLUB_DEBUG
static void validate_slab(struct kmem_cache *s, struct page *page)
{
@@ -4912,7 +4903,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x = atomic_long_read(&n->total_objects);
else if (flags & SO_OBJECTS)
x = atomic_long_read(&n->total_objects) -
- count_partial(n, count_free);
+ partial_counter(n, PARTIAL_FREE);
else
x = atomic_long_read(&n->nr_slabs);
total += x;
@@ -4926,9 +4917,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
for_each_kmem_cache_node(s, node, n) {
if (flags & SO_TOTAL)
- x = count_partial(n, count_total);
+ x = partial_counter(n, PARTIAL_TOTAL);
else if (flags & SO_OBJECTS)
- x = count_partial(n, count_inuse);
+ x = partial_counter(n, PARTIAL_INUSE);
else
x = n->nr_partial;
total += x;
@@ -5961,7 +5952,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
for_each_kmem_cache_node(s, node, n) {
nr_slabs += node_nr_slabs(n);
nr_objs += node_nr_objs(n);
- nr_free += count_partial(n, count_free);
+ nr_free += partial_counter(n, PARTIAL_FREE);
}
sinfo->active_objs = nr_objs - nr_free;
--
1.8.3.1
On Mon, Aug 10, 2020 at 3:18 PM Xunlei Pang <[email protected]> wrote:
>
> v1->v2:
> - Improved changelog and variable naming for PATCH 1~2.
> - PATCH3 adds per-cpu counter to avoid performance regression
> in concurrent __slab_free().
>
> [Testing]
> On my 32-cpu 2-socket physical machine:
> Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
> perf stat --null --repeat 10 -- hackbench 20 thread 20000
>
> == original, no patched
> 19.211637055 seconds time elapsed ( +- 0.57% )
>
> == patched with patch1~2
> Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
>
> 21.731833146 seconds time elapsed ( +- 0.17% )
>
> == patched with patch1~3
> Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
>
> 19.112106847 seconds time elapsed ( +- 0.64% )
>
>
> Xunlei Pang (3):
> mm/slub: Introduce two counters for partial objects
> mm/slub: Get rid of count_partial()
> mm/slub: Use percpu partial free counter
>
> mm/slab.h | 2 +
> mm/slub.c | 124 +++++++++++++++++++++++++++++++++++++++++++-------------------
> 2 files changed, 89 insertions(+), 37 deletions(-)
We probably need to wrap the counters under CONFIG_SLUB_DEBUG because
AFAICT all the code that uses them is also wrapped under it.
An alternative approach for this patch would be to somehow make the
lock in count_partial() more granular, but I don't know how feasible
that actually is.
Anyway, I am OK with this approach:
Reviewed-by: Pekka Enberg <[email protected]>
You still need to convince Christoph, though, because he had
objections over this approach.
- Pekka
On 2020/8/20 at 10:02 PM, Pekka Enberg wrote:
> On Mon, Aug 10, 2020 at 3:18 PM Xunlei Pang <[email protected]> wrote:
>>
>> v1->v2:
>> - Improved changelog and variable naming for PATCH 1~2.
>> - PATCH3 adds per-cpu counter to avoid performance regression
>> in concurrent __slab_free().
>>
>> [Testing]
>> On my 32-cpu 2-socket physical machine:
>> Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
>> perf stat --null --repeat 10 -- hackbench 20 thread 20000
>>
>> == original, no patched
>> 19.211637055 seconds time elapsed ( +- 0.57% )
>>
>> == patched with patch1~2
>> Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
>>
>> 21.731833146 seconds time elapsed ( +- 0.17% )
>>
>> == patched with patch1~3
>> Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
>>
>> 19.112106847 seconds time elapsed ( +- 0.64% )
>>
>>
>> Xunlei Pang (3):
>> mm/slub: Introduce two counters for partial objects
>> mm/slub: Get rid of count_partial()
>> mm/slub: Use percpu partial free counter
>>
>> mm/slab.h | 2 +
>> mm/slub.c | 124 +++++++++++++++++++++++++++++++++++++++++++-------------------
>> 2 files changed, 89 insertions(+), 37 deletions(-)
>
> We probably need to wrap the counters under CONFIG_SLUB_DEBUG because
> AFAICT all the code that uses them is also wrapped under it.
/sys/kernel/slab/***/partial sysfs also uses it, I can wrap it with
CONFIG_SLUB_DEBUG or CONFIG_SYSFS for backward compatibility.
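E.g. something like this in mm/slab.h (just a sketch of the idea; mm/slub.c
already has a "#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)"
section around partial_counter()):

#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
        atomic_long_t __percpu *partial_free_objs;
        atomic_long_t partial_total_objs;
#endif

The __update_partial_free()/__update_partial_total() helpers would then
need empty stubs for the disabled case.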
>
> An alternative approach for this patch would be to somehow make the
> lock in count_partial() more granular, but I don't know how feasible
> that actually is.
>
> Anyway, I am OK with this approach:
>
> Reviewed-by: Pekka Enberg <[email protected]>
Thanks!
>
> You still need to convince Christoph, though, because he had
> objections over this approach.
Christoph, what do you think, or any better suggestion to address this
*in production* issue?
>
> - Pekka
>
Any progress on this? The problem addressed by this patch has also
caused annoying jitter in our online apps.
On Mon, Aug 24, 2020 at 6:05 PM xunlei <[email protected]> wrote:
>
> On 2020/8/20 at 10:02 PM, Pekka Enberg wrote:
> > On Mon, Aug 10, 2020 at 3:18 PM Xunlei Pang <[email protected]> wrote:
> >>
> >> v1->v2:
> >> - Improved changelog and variable naming for PATCH 1~2.
> >> - PATCH3 adds per-cpu counter to avoid performance regression
> >> in concurrent __slab_free().
> >>
> >> [Testing]
> >> On my 32-cpu 2-socket physical machine:
> >> Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
> >> perf stat --null --repeat 10 -- hackbench 20 thread 20000
> >>
> >> == original, no patched
> >> 19.211637055 seconds time elapsed ( +- 0.57% )
> >>
> >> == patched with patch1~2
> >> Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
> >>
> >> 21.731833146 seconds time elapsed ( +- 0.17% )
> >>
> >> == patched with patch1~3
> >> Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
> >>
> >> 19.112106847 seconds time elapsed ( +- 0.64% )
> >>
> >>
> >> Xunlei Pang (3):
> >> mm/slub: Introduce two counters for partial objects
> >> mm/slub: Get rid of count_partial()
> >> mm/slub: Use percpu partial free counter
> >>
> >> mm/slab.h | 2 +
> >> mm/slub.c | 124 +++++++++++++++++++++++++++++++++++++++++++-------------------
> >> 2 files changed, 89 insertions(+), 37 deletions(-)
> >
> > We probably need to wrap the counters under CONFIG_SLUB_DEBUG because
> > AFAICT all the code that uses them is also wrapped under it.
>
> /sys/kernel/slab/***/partial sysfs also uses it, I can wrap it with
> CONFIG_SLUB_DEBUG or CONFIG_SYSFS for backward compatibility.
>
> >
> > An alternative approach for this patch would be to somehow make the
> > lock in count_partial() more granular, but I don't know how feasible
> > that actually is.
> >
> > Anyway, I am OK with this approach:
> >
> > Reviewed-by: Pekka Enberg <[email protected]>
>
> Thanks!
>
> >
> > You still need to convince Christoph, though, because he had
> > objections over this approach.
>
> Christoph, what do you think, or any better suggestion to address this
> *in production* issue?
>
> >
> > - Pekka
> >
On Mon, Aug 10, 2020 at 8:22 PM Xunlei Pang <[email protected]> wrote:
> static inline void
> @@ -2429,12 +2439,12 @@ static unsigned long partial_counter(struct kmem_cache_node *n,
> unsigned long ret = 0;
>
> if (item == PARTIAL_FREE) {
> - ret = atomic_long_read(&n->partial_free_objs);
> + ret = get_partial_free(n);
> } else if (item == PARTIAL_TOTAL) {
> ret = atomic_long_read(&n->partial_total_objs);
> } else if (item == PARTIAL_INUSE) {
> ret = atomic_long_read(&n->partial_total_objs) -
> - atomic_long_read(&n->partial_free_objs);
> + get_partial_free(n);
Is it "ret = get_partial_free(n);" above?
> if ((long)ret < 0)
> ret = 0;
> }
On Mon, 10 Aug 2020, Xunlei Pang wrote:
>
> diff --git a/mm/slab.h b/mm/slab.h
> index c85e2fa..a709a70 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -616,7 +616,7 @@ struct kmem_cache_node {
> #ifdef CONFIG_SLUB
> unsigned long nr_partial;
> struct list_head partial;
> - atomic_long_t partial_free_objs;
> + atomic_long_t __percpu *partial_free_objs;
A percpu counter is never atomic. Just use unsigned long and use this_cpu
operations for this thing. That should cut down further on the overhead.
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1775,11 +1775,21 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
> /*
> * Management of partially allocated slabs.
> */
> +static inline long get_partial_free(struct kmem_cache_node *n)
> +{
> + long nr = 0;
> + int cpu;
> +
> + for_each_possible_cpu(cpu)
> + nr += atomic_long_read(per_cpu_ptr(n->partial_free_objs, cpu));
this_cpu_read(*n->partial_free_objs)
> static inline void
> __update_partial_free(struct kmem_cache_node *n, long delta)
> {
> - atomic_long_add(delta, &n->partial_free_objs);
> + atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));
this_cpu_add()
and so on.
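I.e., roughly (untested sketch against this patch):

-	atomic_long_t __percpu *partial_free_objs;
+	unsigned long __percpu *partial_free_objs;

static inline void
__update_partial_free(struct kmem_cache_node *n, long delta)
{
-	atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));
+	this_cpu_add(*n->partial_free_objs, delta);
}

this_cpu_add() folds the per-cpu address calculation and the add into one
preemption-safe operation, so there is no atomic RMW left on this path.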
On 3/1/21 6:31 PM, Shu Ming wrote:
> Any progress on this? The problem addressed by this patch has also
> caused annoying jitter in our online apps.
>
Thanks for the attention.
There are some further improvements on top of v2; I'm going to send v3 out later.
> On Mon, Aug 24, 2020 at 6:05 PM xunlei <[email protected]> wrote:
>>
>> On 2020/8/20 at 10:02 PM, Pekka Enberg wrote:
>>> On Mon, Aug 10, 2020 at 3:18 PM Xunlei Pang <[email protected]> wrote:
>>>>
>>>> v1->v2:
>>>> - Improved changelog and variable naming for PATCH 1~2.
>>>> - PATCH3 adds per-cpu counter to avoid performance regression
>>>> in concurrent __slab_free().
>>>>
>>>> [Testing]
>>>> On my 32-cpu 2-socket physical machine:
>>>> Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
>>>> perf stat --null --repeat 10 -- hackbench 20 thread 20000
>>>>
>>>> == original, no patched
>>>> 19.211637055 seconds time elapsed ( +- 0.57% )
>>>>
>>>> == patched with patch1~2
>>>> Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
>>>>
>>>> 21.731833146 seconds time elapsed ( +- 0.17% )
>>>>
>>>> == patched with patch1~3
>>>> Performance counter stats for 'hackbench 20 thread 20000' (10 runs):
>>>>
>>>> 19.112106847 seconds time elapsed ( +- 0.64% )
>>>>
>>>>
>>>> Xunlei Pang (3):
>>>> mm/slub: Introduce two counters for partial objects
>>>> mm/slub: Get rid of count_partial()
>>>> mm/slub: Use percpu partial free counter
>>>>
>>>> mm/slab.h | 2 +
>>>> mm/slub.c | 124 +++++++++++++++++++++++++++++++++++++++++++-------------------
>>>> 2 files changed, 89 insertions(+), 37 deletions(-)
>>>
>>> We probably need to wrap the counters under CONFIG_SLUB_DEBUG because
>>> AFAICT all the code that uses them is also wrapped under it.
>>
>> /sys/kernel/slab/***/partial sysfs also uses it, I can wrap it with
>> CONFIG_SLUB_DEBUG or CONFIG_SYSFS for backward compatibility.
>>
>>>
>>> An alternative approach for this patch would be to somehow make the
>>> lock in count_partial() more granular, but I don't know how feasible
>>> that actually is.
>>>
>>> Anyway, I am OK with this approach:
>>>
>>> Reviewed-by: Pekka Enberg <[email protected]>
>>
>> Thanks!
>>
>>>
>>> You still need to convince Christoph, though, because he had
>>> objections over this approach.
>>
>> Christoph, what do you think, or any better suggestion to address this
>> *in production* issue?
>>
>>>
>>> - Pekka
>>>
On Tue, Mar 02, 2021 at 10:14:53AM +0100, Christoph Lameter wrote:
> On Mon, 10 Aug 2020, Xunlei Pang wrote:
> > - atomic_long_t partial_free_objs;
> > + atomic_long_t __percpu *partial_free_objs;
>
> A percpu counter is never atomic. Just use unsigned long and use this_cpu
> operations for this thing. That should cut down further on the overhead.
What about allocations from interrupt context? Should this be a local_t
instead?
On Wed, 3 Mar 2021, Matthew Wilcox wrote:
> On Tue, Mar 02, 2021 at 10:14:53AM +0100, Christoph Lameter wrote:
> > On Mon, 10 Aug 2020, Xunlei Pang wrote:
> > > - atomic_long_t partial_free_objs;
> > > + atomic_long_t __percpu *partial_free_objs;
> >
> > A percpu counter is never atomic. Just use unsigned long and use this_cpu
> > operations for this thing. That should cut down further on the overhead.
>
> What about allocations from interrupt context? Should this be a local_t
> instead?
Can this be allocated in an interrupt context?
And I am not sure how local_t relates to that? Percpu counters can be used
in an interrupt context without the overhead of the address calculations
that are required by a local_t.
On Wed, Mar 03, 2021 at 08:15:58PM +0100, Christoph Lameter wrote:
> On Wed, 3 Mar 2021, Matthew Wilcox wrote:
>
> > On Tue, Mar 02, 2021 at 10:14:53AM +0100, Christoph Lameter wrote:
> > > On Mon, 10 Aug 2020, Xunlei Pang wrote:
> > > > - atomic_long_t partial_free_objs;
> > > > + atomic_long_t __percpu *partial_free_objs;
> > >
> > > A percpu counter is never atomic. Just use unsigned long and use this_cpu
> > > operations for this thing. That should cut down further on the overhead.
> >
> > What about allocations from interrupt context? Should this be a local_t
> > instead?
>
> Can this be allocated in an interrupt context?
>
> And I am not sure how local_t relates to that? Percpu counters can be used
> in an interrupt context without the overhead of the address calculations
> that are required by a local_t.
As I understand the patch, this counts the number of partially free slabs.
So if we start to free an object from a completely full slab in process
context, as "load x, add one to x, store x" and take an interrupt
between loading x and adding one to x, that interrupt handler might
free a different object from another completely full slab. that would
also load the same x, add one to it and store x, but then the process
context would add one to the old x, overwriting the updated value from
interrupt context.
It's not the likeliest of races, and I don't know how important it is
that these counters remain accurate. But using a local_t instead of
a percpu long would fix the problem. I don't know why you think that
a local_t needs "address calculations". Perhaps you've misremembered
what a local_t is?
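Something like this is what I have in mind (sketch only, reusing the field
from this patch):

	local_t __percpu *partial_free_objs;

static inline void
__update_partial_free(struct kmem_cache_node *n, long delta)
{
	/* a single IRQ-safe read-modify-write on this CPU's counter */
	local_add(delta, this_cpu_ptr(n->partial_free_objs));
}

The read side would sum local_read() over the possible CPUs, the same way
get_partial_free() already does.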
On Wed, 3 Mar 2021, Matthew Wilcox wrote:
> > Can this be allocated in an interrupt context?
> >
> > And I am not sure how local_t relates to that? Percpu counters can be used
> > in an interrupt context without the overhead of the address calculations
> > that are required by a local_t.
>
> As I understand the patch, this counts the number of partially free slabs.
> So if we start to free an object from a completely full slab in process
> context, as "load x, add one to x, store x" and take an interrupt
> between loading x and adding one to x, that interrupt handler might
> free a different object from another completely full slab. that would
> also load the same x, add one to it and store x, but then the process
> context would add one to the old x, overwriting the updated value from
> interrupt context.
this_cpu operations are "atomic" vs. preemption but on some platforms not
vs interrupts. That could be an issue in kmem_cache_free(). This would
need a modification to the relevant this_cpu ops so that interrupts are
disabled on those platforms.
Like this_cpu_inc_irq() or so?
> it's not the likeliest of races, and i don't know how important it is
> that these counters remain accurate. but using a local_t instead of
> a percpu long would fix the problem. i don't know why you think that
> a local_t needs "address calculations". perhaps you've misremembered
> what a local_t is?
local_t does not include the address arithmetic that the this_cpu
operations can perform implicitly, e.g. via a segment register on x86 or
another register on another platform, thereby avoiding the need to
disable preemption or interrupts.
Therefore the target address for a local_t operation needs to be
calculated manually beforehand, which usually means disabling interrupts
and/or preemption for that code segment. Otherwise we may end up on a
different processor due to the scheduler or other interruptions and use
the percpu counter value of a different processor, which could be racy.
On Wed, Mar 03, 2021 at 08:55:48PM +0100, Christoph Lameter wrote:
> On Wed, 3 Mar 2021, Matthew Wilcox wrote:
>
> > > Can this be allocated in an interrupt context?
> > >
> > > And I am not sure how local_t relates to that? Percpu counters can be used
> > > in an interrupt context without the overhead of the address calculations
> > > that are required by a local_t.
> >
> > As I understand the patch, this counts the number of partially free slabs.
> > So if we start to free an object from a completely full slab in process
> > context, as "load x, add one to x, store x" and take an interrupt
> > between loading x and adding one to x, that interrupt handler might
> > free a different object from another completely full slab. that would
> > also load the same x, add one to it and store x, but then the process
> > context would add one to the old x, overwriting the updated value from
> > interrupt context.
>
> this_cpu operations are "atomic" vs. preemption but on some platforms not
> vs interrupts. That could be an issue in kmem_cache_free(). This would
> need a modification to the relevant this_cpu ops so that interrupts are
> disabled on those platforms.
Hmmmm ... re-reading the documentation, it says that this_cpu_x is
atomic against interrupts:
These operations can be used without worrying about
preemption and interrupts::
[...]
this_cpu_add(pcp, val)
this_cpu_inc(pcp)
...
On 3/2/21 5:14 PM, Christoph Lameter wrote:
> On Mon, 10 Aug 2020, Xunlei Pang wrote:
>
>>
>> diff --git a/mm/slab.h b/mm/slab.h
>> index c85e2fa..a709a70 100644
>> --- a/mm/slab.h
>> +++ b/mm/slab.h
>> @@ -616,7 +616,7 @@ struct kmem_cache_node {
>> #ifdef CONFIG_SLUB
>> unsigned long nr_partial;
>> struct list_head partial;
>> - atomic_long_t partial_free_objs;
>> + atomic_long_t __percpu *partial_free_objs;
>
> A percpu counter is never atomic. Just use unsigned long and use this_cpu
> operations for this thing. That should cut down further on the overhead.
>
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -1775,11 +1775,21 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
>> /*
>> * Management of partially allocated slabs.
>> */
>> +static inline long get_partial_free(struct kmem_cache_node *n)
>> +{
>> + long nr = 0;
>> + int cpu;
>> +
>> + for_each_possible_cpu(cpu)
>> + nr += atomic_long_read(per_cpu_ptr(n->partial_free_objs, cpu));
>
> this_cpu_read(*n->partial_free_objs)
>
>> static inline void
>> __update_partial_free(struct kmem_cache_node *n, long delta)
>> {
>> - atomic_long_add(delta, &n->partial_free_objs);
>> + atomic_long_add(delta, this_cpu_ptr(n->partial_free_objs));
>
> this_cpu_add()
>
> and so on.
>
Thanks, I changed them both to use "unsigned long", and will send v3 out
after our internal performance regression test passes.
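Roughly, the v3 direction (untested sketch): the update side becomes
this_cpu_add() as Christoph suggested, and the read side sums plain
per-cpu longs:

static inline long get_partial_free(struct kmem_cache_node *n)
{
	long nr = 0;
	int cpu;

	/* n->partial_free_objs is now an unsigned long __percpu * */
	for_each_possible_cpu(cpu)
		nr += *per_cpu_ptr(n->partial_free_objs, cpu);

	return nr;
}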