With kmem cgroup support, high memcgs churn can leave behind a lot of
empty kmem_caches. Usually such kmem_caches will be destroyed when the
corresponding memcg gets released but the memcg release can be
arbitrarily delayed. These empty kmem_caches wastes cache_reaper's time.
So, the reaper should destroy such empty offlined kmem_caches.
Signed-off-by: Shakeel Butt <[email protected]>
---
mm/slab.c | 18 ++++++++++++++++--
mm/slab.h | 15 +++++++++++++++
mm/slab_common.c | 2 +-
3 files changed, 32 insertions(+), 3 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
index 66f2db98f026..9c174a799ffb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4004,6 +4004,16 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
slabs_destroy(cachep, &list);
}
+static bool is_slab_active(struct kmem_cache *cachep)
+{
+ int node;
+ struct kmem_cache_node *n;
+
+ for_each_kmem_cache_node(cachep, node, n)
+ if (READ_ONCE(n->total_slabs) - n->free_slabs)
+ return true;
+ return false;
+}
/**
* cache_reap - Reclaim memory from caches.
* @w: work descriptor
@@ -4018,7 +4028,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
*/
static void cache_reap(struct work_struct *w)
{
- struct kmem_cache *searchp;
+ struct kmem_cache *searchp, *tmp;
struct kmem_cache_node *n;
int node = numa_mem_id();
struct delayed_work *work = to_delayed_work(w);
@@ -4027,7 +4037,7 @@ static void cache_reap(struct work_struct *w)
/* Give up. Setup the next iteration. */
goto out;
- list_for_each_entry(searchp, &slab_caches, list) {
+ list_for_each_entry_safe(searchp, tmp, &slab_caches, list) {
check_irq_on();
/*
@@ -4061,6 +4071,10 @@ static void cache_reap(struct work_struct *w)
5 * searchp->num - 1) / (5 * searchp->num));
STATS_ADD_REAPED(searchp, freed);
}
+
+ /* Eagerly delete inactive kmem_cache of an offlined memcg. */
+ if (!is_memcg_online(searchp) && !is_slab_active(searchp))
+ shutdown_cache(searchp);
next:
cond_resched();
}
diff --git a/mm/slab.h b/mm/slab.h
index e8981e811c45..e911b10efae7 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -166,6 +166,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
SLAB_TEMPORARY | \
SLAB_ACCOUNT)
+int shutdown_cache(struct kmem_cache *s);
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
@@ -290,6 +291,15 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
memcg_kmem_uncharge(page, order);
}
+static __always_inline bool is_memcg_online(struct kmem_cache *s)
+{
+ if (!memcg_kmem_enabled())
+ return true;
+ if (is_root_cache(s))
+ return true;
+ return mem_cgroup_online(s->memcg_params.memcg);
+}
+
extern void slab_init_memcg_params(struct kmem_cache *);
extern void memcg_link_cache(struct kmem_cache *s);
extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
@@ -342,6 +352,11 @@ static inline void memcg_uncharge_slab(struct page *page, int order,
{
}
+static inline bool is_memcg_online(struct kmem_cache *s)
+{
+ return true;
+}
+
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 61ab2ca8bea7..d197e878636b 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -573,7 +573,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
}
}
-static int shutdown_cache(struct kmem_cache *s)
+int shutdown_cache(struct kmem_cache *s)
{
/* free asan quarantined objects */
kasan_cache_shutdown(s);
--
2.17.0.rc0.231.g781580f067-goog
[Cc Vladimir]
On Wed 21-03-18 15:43:01, Shakeel Butt wrote:
> With kmem cgroup support, high memcgs churn can leave behind a lot of
> empty kmem_caches. Usually such kmem_caches will be destroyed when the
> corresponding memcg gets released but the memcg release can be
> arbitrarily delayed. These empty kmem_caches wastes cache_reaper's time.
> So, the reaper should destroy such empty offlined kmem_caches.
>
> Signed-off-by: Shakeel Butt <[email protected]>
> ---
> mm/slab.c | 18 ++++++++++++++++--
> mm/slab.h | 15 +++++++++++++++
> mm/slab_common.c | 2 +-
> 3 files changed, 32 insertions(+), 3 deletions(-)
>
> diff --git a/mm/slab.c b/mm/slab.c
> index 66f2db98f026..9c174a799ffb 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -4004,6 +4004,16 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
> slabs_destroy(cachep, &list);
> }
>
> +static bool is_slab_active(struct kmem_cache *cachep)
> +{
> + int node;
> + struct kmem_cache_node *n;
> +
> + for_each_kmem_cache_node(cachep, node, n)
> + if (READ_ONCE(n->total_slabs) - n->free_slabs)
> + return true;
> + return false;
> +}
> /**
> * cache_reap - Reclaim memory from caches.
> * @w: work descriptor
> @@ -4018,7 +4028,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
> */
> static void cache_reap(struct work_struct *w)
> {
> - struct kmem_cache *searchp;
> + struct kmem_cache *searchp, *tmp;
> struct kmem_cache_node *n;
> int node = numa_mem_id();
> struct delayed_work *work = to_delayed_work(w);
> @@ -4027,7 +4037,7 @@ static void cache_reap(struct work_struct *w)
> /* Give up. Setup the next iteration. */
> goto out;
>
> - list_for_each_entry(searchp, &slab_caches, list) {
> + list_for_each_entry_safe(searchp, tmp, &slab_caches, list) {
> check_irq_on();
>
> /*
> @@ -4061,6 +4071,10 @@ static void cache_reap(struct work_struct *w)
> 5 * searchp->num - 1) / (5 * searchp->num));
> STATS_ADD_REAPED(searchp, freed);
> }
> +
> + /* Eagerly delete inactive kmem_cache of an offlined memcg. */
> + if (!is_memcg_online(searchp) && !is_slab_active(searchp))
> + shutdown_cache(searchp);
> next:
> cond_resched();
> }
> diff --git a/mm/slab.h b/mm/slab.h
> index e8981e811c45..e911b10efae7 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -166,6 +166,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
> SLAB_TEMPORARY | \
> SLAB_ACCOUNT)
>
> +int shutdown_cache(struct kmem_cache *s);
> int __kmem_cache_shutdown(struct kmem_cache *);
> void __kmem_cache_release(struct kmem_cache *);
> int __kmem_cache_shrink(struct kmem_cache *);
> @@ -290,6 +291,15 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order,
> memcg_kmem_uncharge(page, order);
> }
>
> +static __always_inline bool is_memcg_online(struct kmem_cache *s)
> +{
> + if (!memcg_kmem_enabled())
> + return true;
> + if (is_root_cache(s))
> + return true;
> + return mem_cgroup_online(s->memcg_params.memcg);
> +}
> +
> extern void slab_init_memcg_params(struct kmem_cache *);
> extern void memcg_link_cache(struct kmem_cache *s);
> extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
> @@ -342,6 +352,11 @@ static inline void memcg_uncharge_slab(struct page *page, int order,
> {
> }
>
> +static inline bool is_memcg_online(struct kmem_cache *s)
> +{
> + return true;
> +}
> +
> static inline void slab_init_memcg_params(struct kmem_cache *s)
> {
> }
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index 61ab2ca8bea7..d197e878636b 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -573,7 +573,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
> }
> }
>
> -static int shutdown_cache(struct kmem_cache *s)
> +int shutdown_cache(struct kmem_cache *s)
> {
> /* free asan quarantined objects */
> kasan_cache_shutdown(s);
> --
> 2.17.0.rc0.231.g781580f067-goog
--
Michal Hocko
SUSE Labs
Hello Shakeel,
The patch makes sense to me, but I have a concern about synchronization
of cache destruction vs concurrent kmem_cache_free. Please, see my
comments inline.
On Wed, Mar 21, 2018 at 03:43:01PM -0700, Shakeel Butt wrote:
> With kmem cgroup support, high memcgs churn can leave behind a lot of
> empty kmem_caches. Usually such kmem_caches will be destroyed when the
> corresponding memcg gets released but the memcg release can be
> arbitrarily delayed. These empty kmem_caches wastes cache_reaper's time.
> So, the reaper should destroy such empty offlined kmem_caches.
> diff --git a/mm/slab.c b/mm/slab.c
> index 66f2db98f026..9c174a799ffb 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -4004,6 +4004,16 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
> slabs_destroy(cachep, &list);
> }
>
> +static bool is_slab_active(struct kmem_cache *cachep)
> +{
> + int node;
> + struct kmem_cache_node *n;
> +
> + for_each_kmem_cache_node(cachep, node, n)
> + if (READ_ONCE(n->total_slabs) - n->free_slabs)
Why READ_ONCE total_slabs, but not free_slabs?
Anyway, AFAIU there's no guarantee that this CPU sees the two fields
updated in the same order as they were actually updated on another CPU.
For example, suppose total_slabs is 2 and free_slabs is 1, and another
CPU is freeing a slab page concurrently from kmem_cache_free, i.e.
subtracting 1 from both total_slabs and free_slabs. Then this CPU might
see a transient state, when total_slabs is already updated (set to 1),
but free_slabs is not (still equals 1), and decide that it's safe to
destroy this slab cache while in fact it isn't.
Such a race will probably not result in any serious problems, because
shutdown_cache() checks that the cache is empty and does nothing if it
isn't, but still it looks suspicious and at least deserves a comment.
To eliminate the race, we should check total_slabs vs free_slabs with
kmem_cache_node->list_lock held. Alternatively, I think we could just
check if total_slabs is 0 - sooner or later cache_reap() will release
all empty slabs anyway.
> + return true;
> + return false;
> +}
> @@ -4061,6 +4071,10 @@ static void cache_reap(struct work_struct *w)
> 5 * searchp->num - 1) / (5 * searchp->num));
> STATS_ADD_REAPED(searchp, freed);
> }
> +
> + /* Eagerly delete inactive kmem_cache of an offlined memcg. */
> + if (!is_memcg_online(searchp) && !is_slab_active(searchp))
I don't think we need to define is_memcg_online in generic code.
I would merge is_memcg_online and is_slab_active, and call the
resulting function cache_is_active.
> + shutdown_cache(searchp);
> next:
> cond_resched();
> }
+Tejun, Johannes
Hi Vladimir,
On Sat, Mar 24, 2018 at 6:11 AM, Vladimir Davydov
<[email protected]> wrote:
> Hello Shakeel,
>
> The patch makes sense to me, but I have a concern about synchronization
> of cache destruction vs concurrent kmem_cache_free. Please, see my
> comments inline.
>
> On Wed, Mar 21, 2018 at 03:43:01PM -0700, Shakeel Butt wrote:
>> With kmem cgroup support, high memcgs churn can leave behind a lot of
>> empty kmem_caches. Usually such kmem_caches will be destroyed when the
>> corresponding memcg gets released but the memcg release can be
>> arbitrarily delayed. These empty kmem_caches wastes cache_reaper's time.
>> So, the reaper should destroy such empty offlined kmem_caches.
>
>> diff --git a/mm/slab.c b/mm/slab.c
>> index 66f2db98f026..9c174a799ffb 100644
>> --- a/mm/slab.c
>> +++ b/mm/slab.c
>> @@ -4004,6 +4004,16 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
>> slabs_destroy(cachep, &list);
>> }
>>
>> +static bool is_slab_active(struct kmem_cache *cachep)
>> +{
>> + int node;
>> + struct kmem_cache_node *n;
>> +
>> + for_each_kmem_cache_node(cachep, node, n)
>> + if (READ_ONCE(n->total_slabs) - n->free_slabs)
>
> Why READ_ONCE total_slabs, but not free_slabs?
>
> Anyway, AFAIU there's no guarantee that this CPU sees the two fields
> updated in the same order as they were actually updated on another CPU.
> For example, suppose total_slabs is 2 and free_slabs is 1, and another
> CPU is freeing a slab page concurrently from kmem_cache_free, i.e.
> subtracting 1 from both total_slabs and free_slabs. Then this CPU might
> see a transient state, when total_slabs is already updated (set to 1),
> but free_slabs is not (still equals 1), and decide that it's safe to
> destroy this slab cache while in fact it isn't.
>
> Such a race will probably not result in any serious problems, because
> shutdown_cache() checks that the cache is empty and does nothing if it
> isn't, but still it looks suspicious and at least deserves a comment.
> To eliminate the race, we should check total_slabs vs free_slabs with
> kmem_cache_node->list_lock held. Alternatively, I think we could just
> check if total_slabs is 0 - sooner or later cache_reap() will release
> all empty slabs anyway.
>
Checking total_slabs is 0 seems much simpler, I will test that.
>> + return true;
>> + return false;
>> +}
>
>> @@ -4061,6 +4071,10 @@ static void cache_reap(struct work_struct *w)
>> 5 * searchp->num - 1) / (5 * searchp->num));
>> STATS_ADD_REAPED(searchp, freed);
>> }
>> +
>> + /* Eagerly delete inactive kmem_cache of an offlined memcg. */
>> + if (!is_memcg_online(searchp) && !is_slab_active(searchp))
>
> I don't think we need to define is_memcg_online in generic code.
> I would merge is_memcg_online and is_slab_active, and call the
> resulting function cache_is_active.
>
Ack.
>> + shutdown_cache(searchp);
>> next:
>> cond_resched();
>> }
Currently I am holding off this patch as Greg Thelen has pointed out
(offline) a race condition this patch will introduce between
memcg_kmem_get_cache and the cache reaper. The memcg of the cache
returned by memcg_kmem_get_cache() can get offline while the
allocation is happening on that cache (allocation can take long time
due to reclaim or memory pressure). The reaper will see that the memcg
of this cache is offlined and let's say at the moment s->total_slabs
is 0, the reaper will delete the cache while parallel allocation is
going on.
I was thinking of adding an API to force a memcg to be online (or
rather delay the call to css_offline), something like
css_tryget_stay_online()/css_put_online() and use it in
memcg_kmem_get_cache() and memcg_kmem_put_cache(). However Tejun has
advised to not go through that route, more specifically not to tie
on/offling a css with accounting artifacts.
I am still exploring more solutions.
thanks,
Shakeel