mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
pages on each iteration. This makes it practically impossible to decrease
the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
so we can't reduce memory usage, and once retry_count reaches zero we return
-EBUSY.
It's easy to reproduce the problem by running the following commands:
mkdir /sys/fs/cgroup/memory/test
echo $$ >> /sys/fs/cgroup/memory/test/tasks
cat big_file > /dev/null &
sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
-bash: echo: write error: Device or resource busy
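For scale, a back-of-the-envelope illustration of why this happens (assuming
4K pages; the numbers are hypothetical but representative):

	new limit:      100M = 25600 pages of allowed usage
	per iteration:  at most SWAP_CLUSTER_MAX = 32 pages reclaimed
	retry budget:   MEM_CGROUP_RECLAIM_RETRIES (5) * nr children (1) = 5

	'cat big_file' repopulates the page cache at disk speed, easily more
	than 32 pages between iterations, so usage never drops below oldusage,
	retry_count reaches zero, and the write fails with -EBUSY.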
Instead of trying to free a small number of pages, it's much more
reasonable to free 'usage - limit' pages.
Signed-off-by: Andrey Ryabinin <[email protected]>
---
mm/memcontrol.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f40b5ad3f959..09ee052cf684 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2476,7 +2476,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
retry_count = MEM_CGROUP_RECLAIM_RETRIES *
mem_cgroup_count_children(memcg);
- oldusage = page_counter_read(&memcg->memory);
+ curusage = oldusage = page_counter_read(&memcg->memory);
do {
if (signal_pending(current)) {
@@ -2498,7 +2498,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
+ try_to_free_mem_cgroup_pages(memcg, curusage - limit,
+ GFP_KERNEL, true);
curusage = page_counter_read(&memcg->memory);
/* Usage is reduced ? */
@@ -2527,7 +2528,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
retry_count = MEM_CGROUP_RECLAIM_RETRIES *
mem_cgroup_count_children(memcg);
- oldusage = page_counter_read(&memcg->memsw);
+ curusage = oldusage = page_counter_read(&memcg->memsw);
do {
if (signal_pending(current)) {
@@ -2549,7 +2550,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
+ try_to_free_mem_cgroup_pages(memcg, curusage - limit,
+ GFP_KERNEL, false);
curusage = page_counter_read(&memcg->memsw);
/* Usage is reduced ? */
--
2.13.6
mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit() are almost
identical functions. Instead of having two of them, we could pass an
additional argument to mem_cgroup_resize_limit() and use it to
consolidate all the code in a single function.
Signed-off-by: Andrey Ryabinin <[email protected]>
---
mm/memcontrol.c | 77 +++++++++++++--------------------------------------------
1 file changed, 17 insertions(+), 60 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09ee052cf684..b263500626fe 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2459,9 +2459,17 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
static DEFINE_MUTEX(memcg_limit_mutex);
-static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
+static bool invalid_mem_limit(struct mem_cgroup *memcg, bool memsw,
+ unsigned long limit)
+{
+ return (!memsw && limit > memcg->memsw.limit) ||
+ (memsw && limit < memcg->memory.limit);
+}
+
+static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, bool memsw,
unsigned long limit)
{
+ struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
unsigned long curusage;
unsigned long oldusage;
bool enlarge = false;
@@ -2476,7 +2484,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
retry_count = MEM_CGROUP_RECLAIM_RETRIES *
mem_cgroup_count_children(memcg);
- curusage = oldusage = page_counter_read(&memcg->memory);
+ curusage = oldusage = page_counter_read(counter);
do {
if (signal_pending(current)) {
@@ -2485,75 +2493,24 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
}
mutex_lock(&memcg_limit_mutex);
- if (limit > memcg->memsw.limit) {
+ if (invalid_mem_limit(memcg, memsw, limit)) {
mutex_unlock(&memcg_limit_mutex);
ret = -EINVAL;
break;
}
- if (limit > memcg->memory.limit)
- enlarge = true;
- ret = page_counter_limit(&memcg->memory, limit);
- mutex_unlock(&memcg_limit_mutex);
-
- if (!ret)
- break;
-
- try_to_free_mem_cgroup_pages(memcg, curusage - limit,
- GFP_KERNEL, true);
-
- curusage = page_counter_read(&memcg->memory);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
-
- if (!ret && enlarge)
- memcg_oom_recover(memcg);
-
- return ret;
-}
-static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
- unsigned long limit)
-{
- unsigned long curusage;
- unsigned long oldusage;
- bool enlarge = false;
- int retry_count;
- int ret;
-
- /* see mem_cgroup_resize_res_limit */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- curusage = oldusage = page_counter_read(&memcg->memsw);
-
- do {
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- mutex_lock(&memcg_limit_mutex);
- if (limit < memcg->memory.limit) {
- mutex_unlock(&memcg_limit_mutex);
- ret = -EINVAL;
- break;
- }
- if (limit > memcg->memsw.limit)
+ if (limit > counter->limit)
enlarge = true;
- ret = page_counter_limit(&memcg->memsw, limit);
+ ret = page_counter_limit(counter, limit);
mutex_unlock(&memcg_limit_mutex);
if (!ret)
break;
try_to_free_mem_cgroup_pages(memcg, curusage - limit,
- GFP_KERNEL, false);
+ GFP_KERNEL, !memsw);
- curusage = page_counter_read(&memcg->memsw);
+ curusage = page_counter_read(counter);
/* Usage is reduced ? */
if (curusage >= oldusage)
retry_count--;
@@ -3233,10 +3190,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
case _MEM:
- ret = mem_cgroup_resize_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, false, nr_pages);
break;
case _MEMSWAP:
- ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, true, nr_pages);
break;
case _KMEM:
ret = memcg_update_kmem_limit(memcg, nr_pages);
--
2.13.6
On Wed 20-12-17 13:24:28, Andrey Ryabinin wrote:
> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
> pages on each iteration. This makes it practically impossible to decrease
> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
> so we can't reduce memory usage, and once retry_count reaches zero we return
> -EBUSY.
>
> It's easy to reproduce the problem by running the following commands:
>
> mkdir /sys/fs/cgroup/memory/test
> echo $$ >> /sys/fs/cgroup/memory/test/tasks
> cat big_file > /dev/null &
> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> -bash: echo: write error: Device or resource busy
>
> Instead of trying to free a small number of pages, it's much more
> reasonable to free 'usage - limit' pages.
But that only makes the issue less probable. It doesn't fix it, because
if (curusage >= oldusage)
retry_count--;
can still be true because the allocator might be faster than the reclaimer.
Wouldn't it be more reasonable to simply remove the retry count and keep
trying until interrupted or we manage to update the limit? Another
option would be to commit the new limit and allow temporary overcommit
of the hard limit. New allocations and the limit update paths would
reclaim to the hard limit.
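A rough sketch of that second option (hypothetical code, not a patch from
this thread; this is roughly the shape of cgroup-v2's memory.max write path):

	static int resize_commit_then_reclaim(struct mem_cgroup *memcg,
					      unsigned long limit)
	{
		/* install the new limit unconditionally; usage may now
		 * temporarily exceed it */
		xchg(&memcg->memory.limit, limit);

		/* reclaim the overshoot from the limit update path */
		while (page_counter_read(&memcg->memory) > limit) {
			if (signal_pending(current))
				return -EINTR;

			/* stop once reclaim makes no progress */
			if (!try_to_free_mem_cgroup_pages(memcg,
					page_counter_read(&memcg->memory) - limit,
					GFP_KERNEL, true))
				break;
		}
		return 0;
	}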
> Signed-off-by: Andrey Ryabinin <[email protected]>
> ---
> mm/memcontrol.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f40b5ad3f959..09ee052cf684 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2476,7 +2476,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> retry_count = MEM_CGROUP_RECLAIM_RETRIES *
> mem_cgroup_count_children(memcg);
>
> - oldusage = page_counter_read(&memcg->memory);
> + curusage = oldusage = page_counter_read(&memcg->memory);
>
> do {
> if (signal_pending(current)) {
> @@ -2498,7 +2498,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
> + try_to_free_mem_cgroup_pages(memcg, curusage - limit,
> + GFP_KERNEL, true);
>
> curusage = page_counter_read(&memcg->memory);
> /* Usage is reduced ? */
> @@ -2527,7 +2528,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
> retry_count = MEM_CGROUP_RECLAIM_RETRIES *
> mem_cgroup_count_children(memcg);
>
> - oldusage = page_counter_read(&memcg->memsw);
> + curusage = oldusage = page_counter_read(&memcg->memsw);
>
> do {
> if (signal_pending(current)) {
> @@ -2549,7 +2550,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
> + try_to_free_mem_cgroup_pages(memcg, curusage - limit,
> + GFP_KERNEL, false);
>
> curusage = page_counter_read(&memcg->memsw);
> /* Usage is reduced ? */
> --
> 2.13.6
>
--
Michal Hocko
SUSE Labs
On 12/20/2017 01:33 PM, Michal Hocko wrote:
> On Wed 20-12-17 13:24:28, Andrey Ryabinin wrote:
>> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
>> pages on each iteration. This makes it practically impossible to decrease
>> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
>> so we can't reduce memory usage, and once retry_count reaches zero we return
>> -EBUSY.
>>
>> It's easy to reproduce the problem by running the following commands:
>>
>> mkdir /sys/fs/cgroup/memory/test
>> echo $$ >> /sys/fs/cgroup/memory/test/tasks
>> cat big_file > /dev/null &
>> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>> -bash: echo: write error: Device or resource busy
>>
>> Instead of trying to free a small number of pages, it's much more
>> reasonable to free 'usage - limit' pages.
>
> But that only makes the issue less probable. It doesn't fix it, because
> if (curusage >= oldusage)
> retry_count--;
> can still be true because the allocator might be faster than the reclaimer.
> Wouldn't it be more reasonable to simply remove the retry count and keep
> trying until interrupted or we manage to update the limit?
But does it make sense to continue reclaiming even if the reclaimer can't make any
progress? I'd say no. "Allocator is faster than reclaimer" may not be the only reason
for a failed reclaim. E.g. we could try to set the limit lower than the amount of mlock()ed memory
in the cgroup; retrying reclaim would just be a waste of the machine's resources.
Or we simply don't have any swap, and anon > new_limit. Should we burn the CPU in that case?
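For instance (a hypothetical sequence; ./lock200m stands in for any program
that mlock()s 200M and sleeps):

	mkdir /sys/fs/cgroup/memory/test
	echo $$ >> /sys/fs/cgroup/memory/test/tasks
	./lock200m &
	echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
	# mlock()ed pages are unevictable, so reclaim can never bring usage
	# below the new limit no matter how many times we retry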
> Another option would be to commit the new limit and allow temporary overcommit
> of the hard limit. New allocations and the limit update paths would
> reclaim to the hard limit.
>
It sounds a bit fragile and tricky to me. I wouldn't go that way unless we have a very good reason for this.
On Wed 20-12-17 14:32:19, Andrey Ryabinin wrote:
> On 12/20/2017 01:33 PM, Michal Hocko wrote:
> > On Wed 20-12-17 13:24:28, Andrey Ryabinin wrote:
> >> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
> >> pages on each iteration. This makes it practically impossible to decrease
> >> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
> >> so we can't reduce memory usage, and once retry_count reaches zero we return
> >> -EBUSY.
> >>
> >> It's easy to reproduce the problem by running the following commands:
> >>
> >> mkdir /sys/fs/cgroup/memory/test
> >> echo $$ >> /sys/fs/cgroup/memory/test/tasks
> >> cat big_file > /dev/null &
> >> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> >> -bash: echo: write error: Device or resource busy
> >>
> >> Instead of trying to free a small number of pages, it's much more
> >> reasonable to free 'usage - limit' pages.
> >
> > But that only makes the issue less probable. It doesn't fix it, because
> > if (curusage >= oldusage)
> > retry_count--;
> > can still be true because the allocator might be faster than the reclaimer.
> > Wouldn't it be more reasonable to simply remove the retry count and keep
> > trying until interrupted or we manage to update the limit?
>
> But does it make sense to continue reclaiming even if the reclaimer can't
> make any progress? I'd say no. "Allocator is faster than reclaimer"
> may not be the only reason for a failed reclaim. E.g. we could try to
> set the limit lower than the amount of mlock()ed memory in the cgroup;
> retrying reclaim would just be a waste of the machine's resources. Or we
> simply don't have any swap, and anon > new_limit. Should we burn the
> CPU in that case?
We can check the number of reclaimed pages and go EBUSY if it is 0.
> > Another option would be to commit the new limit and allow temporary overcommit
> > of the hard limit. New allocations and the limit update paths would
> > reclaim to the hard limit.
> >
>
> It sounds a bit fragile and tricky to me. I wouldn't go that way
> unless we have a very good reason for this.
I haven't explored this, to be honest, so there may be dragons that way.
I've just mentioned that option for completeness.
--
Michal Hocko
SUSE Labs
mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
pages on each iteration. This makes it practically impossible to decrease
the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
so we can't reduce memory usage, and once retry_count reaches zero we return
-EBUSY.
It's easy to reproduce the problem by running the following commands:
mkdir /sys/fs/cgroup/memory/test
echo $$ >> /sys/fs/cgroup/memory/test/tasks
cat big_file > /dev/null &
sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
-bash: echo: write error: Device or resource busy
Instead of relying on retry_count, keep trying to free the required amount of pages
until the reclaimer makes any progress.
Signed-off-by: Andrey Ryabinin <[email protected]>
---
mm/memcontrol.c | 70 +++++++++++++--------------------------------------------
1 file changed, 16 insertions(+), 54 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f40b5ad3f959..0d26db9a665d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1176,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
}
/*
- * This function returns the number of memcg under hierarchy tree. Returns
- * 1(self count) if no children.
- */
-static int mem_cgroup_count_children(struct mem_cgroup *memcg)
-{
- int num = 0;
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg)
- num++;
- return num;
-}
-
-/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
@@ -2462,22 +2448,10 @@ static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- unsigned long curusage;
- unsigned long oldusage;
+ unsigned long usage;
bool enlarge = false;
- int retry_count;
int ret;
- /*
- * For keeping hierarchical_reclaim simple, how long we should retry
- * is depends on callers. We set our retry-count to be function
- * of # of children which we should visit in this loop.
- */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(&memcg->memory);
-
do {
if (signal_pending(current)) {
ret = -EINTR;
@@ -2498,15 +2472,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
-
- curusage = page_counter_read(&memcg->memory);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
+ usage = page_counter_read(&memcg->memory);
+ if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
+ GFP_KERNEL, true)) {
+ ret = -EBUSY;
+ break;
+ }
+ } while (true);
if (!ret && enlarge)
memcg_oom_recover(memcg);
@@ -2517,18 +2489,10 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- unsigned long curusage;
- unsigned long oldusage;
+ unsigned long usage;
bool enlarge = false;
- int retry_count;
int ret;
- /* see mem_cgroup_resize_res_limit */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(&memcg->memsw);
-
do {
if (signal_pending(current)) {
ret = -EINTR;
@@ -2549,15 +2513,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
-
- curusage = page_counter_read(&memcg->memsw);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
+ usage = page_counter_read(&memcg->memsw);
+ if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
+ GFP_KERNEL, false)) {
+ ret = -EBUSY;
+ break;
+ }
+ } while (true);
if (!ret && enlarge)
memcg_oom_recover(memcg);
--
2.13.6
mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit() are almost
identical functions. Instead of having two of them, we could pass an
additional argument to mem_cgroup_resize_limit() and use it to
consolidate all the code in a single function.
Signed-off-by: Andrey Ryabinin <[email protected]>
---
mm/memcontrol.c | 61 +++++++++++++--------------------------------------------
1 file changed, 14 insertions(+), 47 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0d26db9a665d..f6253c80a5c8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2445,50 +2445,17 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
static DEFINE_MUTEX(memcg_limit_mutex);
-static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static bool invalid_mem_limit(struct mem_cgroup *memcg, bool memsw,
+ unsigned long limit)
{
- unsigned long usage;
- bool enlarge = false;
- int ret;
-
- do {
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- mutex_lock(&memcg_limit_mutex);
- if (limit > memcg->memsw.limit) {
- mutex_unlock(&memcg_limit_mutex);
- ret = -EINVAL;
- break;
- }
- if (limit > memcg->memory.limit)
- enlarge = true;
- ret = page_counter_limit(&memcg->memory, limit);
- mutex_unlock(&memcg_limit_mutex);
-
- if (!ret)
- break;
-
- usage = page_counter_read(&memcg->memory);
- if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
- GFP_KERNEL, true)) {
- ret = -EBUSY;
- break;
- }
- } while (true);
-
- if (!ret && enlarge)
- memcg_oom_recover(memcg);
-
- return ret;
+ return (!memsw && limit > memcg->memsw.limit) ||
+ (memsw && limit < memcg->memory.limit);
}
-static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, bool memsw,
+ unsigned long limit)
{
+ struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
unsigned long usage;
bool enlarge = false;
int ret;
@@ -2500,22 +2467,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
}
mutex_lock(&memcg_limit_mutex);
- if (limit < memcg->memory.limit) {
+ if (invalid_mem_limit(memcg, memsw, limit)) {
mutex_unlock(&memcg_limit_mutex);
ret = -EINVAL;
break;
}
- if (limit > memcg->memsw.limit)
+ if (limit > counter->limit)
enlarge = true;
- ret = page_counter_limit(&memcg->memsw, limit);
+ ret = page_counter_limit(counter, limit);
mutex_unlock(&memcg_limit_mutex);
if (!ret)
break;
- usage = page_counter_read(&memcg->memsw);
+ usage = page_counter_read(counter);
if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
- GFP_KERNEL, false)) {
+ GFP_KERNEL, !memsw)) {
ret = -EBUSY;
break;
}
@@ -3193,10 +3160,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
case _MEM:
- ret = mem_cgroup_resize_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, false, nr_pages);
break;
case _MEMSWAP:
- ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, true, nr_pages);
break;
case _KMEM:
ret = memcg_update_kmem_limit(memcg, nr_pages);
--
2.13.6
On Wed 20-12-17 16:21:13, Andrey Ryabinin wrote:
> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
> pages on each iteration. This makes it practically impossible to decrease
> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
> so we can't reduce memory usage, and once retry_count reaches zero we return
> -EBUSY.
>
> It's easy to reproduce the problem by running the following commands:
>
> mkdir /sys/fs/cgroup/memory/test
> echo $$ >> /sys/fs/cgroup/memory/test/tasks
> cat big_file > /dev/null &
> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> -bash: echo: write error: Device or resource busy
>
> Instead of relying on retry_count, keep trying to free the required amount of pages
> until the reclaimer makes any progress.
The wording of the changelog has some room for improvement. The last
sentence should read something like "Instead of relying on retry_count,
keep retrying the reclaim until the desired limit is reached or fail
if the reclaim doesn't make any progress or a signal is pending."
I am busy as hell today so I will look closer tomorrow or on Friday.
But from a very quick glance the patch seems reasonable.
> Signed-off-by: Andrey Ryabinin <[email protected]>
> ---
> mm/memcontrol.c | 70 +++++++++++++--------------------------------------------
> 1 file changed, 16 insertions(+), 54 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f40b5ad3f959..0d26db9a665d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1176,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
> }
>
> /*
> - * This function returns the number of memcg under hierarchy tree. Returns
> - * 1(self count) if no children.
> - */
> -static int mem_cgroup_count_children(struct mem_cgroup *memcg)
> -{
> - int num = 0;
> - struct mem_cgroup *iter;
> -
> - for_each_mem_cgroup_tree(iter, memcg)
> - num++;
> - return num;
> -}
> -
> -/*
> * Return the memory (and swap, if configured) limit for a memcg.
> */
> unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
> @@ -2462,22 +2448,10 @@ static DEFINE_MUTEX(memcg_limit_mutex);
> static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> unsigned long limit)
> {
> - unsigned long curusage;
> - unsigned long oldusage;
> + unsigned long usage;
> bool enlarge = false;
> - int retry_count;
> int ret;
>
> - /*
> - * For keeping hierarchical_reclaim simple, how long we should retry
> - * is depends on callers. We set our retry-count to be function
> - * of # of children which we should visit in this loop.
> - */
> - retry_count = MEM_CGROUP_RECLAIM_RETRIES *
> - mem_cgroup_count_children(memcg);
> -
> - oldusage = page_counter_read(&memcg->memory);
> -
> do {
> if (signal_pending(current)) {
> ret = -EINTR;
> @@ -2498,15 +2472,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
> -
> - curusage = page_counter_read(&memcg->memory);
> - /* Usage is reduced ? */
> - if (curusage >= oldusage)
> - retry_count--;
> - else
> - oldusage = curusage;
> - } while (retry_count);
> + usage = page_counter_read(&memcg->memory);
> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
> + GFP_KERNEL, true)) {
> + ret = -EBUSY;
> + break;
> + }
> + } while (true);
>
> if (!ret && enlarge)
> memcg_oom_recover(memcg);
> @@ -2517,18 +2489,10 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
> unsigned long limit)
> {
> - unsigned long curusage;
> - unsigned long oldusage;
> + unsigned long usage;
> bool enlarge = false;
> - int retry_count;
> int ret;
>
> - /* see mem_cgroup_resize_res_limit */
> - retry_count = MEM_CGROUP_RECLAIM_RETRIES *
> - mem_cgroup_count_children(memcg);
> -
> - oldusage = page_counter_read(&memcg->memsw);
> -
> do {
> if (signal_pending(current)) {
> ret = -EINTR;
> @@ -2549,15 +2513,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
> -
> - curusage = page_counter_read(&memcg->memsw);
> - /* Usage is reduced ? */
> - if (curusage >= oldusage)
> - retry_count--;
> - else
> - oldusage = curusage;
> - } while (retry_count);
> + usage = page_counter_read(&memcg->memsw);
> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
> + GFP_KERNEL, false)) {
> + ret = -EBUSY;
> + break;
> + }
> + } while (true);
>
> if (!ret && enlarge)
> memcg_oom_recover(memcg);
> --
> 2.13.6
>
--
Michal Hocko
SUSE Labs
On Wed, Dec 20, 2017 at 3:34 AM, Michal Hocko <[email protected]> wrote:
> On Wed 20-12-17 14:32:19, Andrey Ryabinin wrote:
>> On 12/20/2017 01:33 PM, Michal Hocko wrote:
>> > On Wed 20-12-17 13:24:28, Andrey Ryabinin wrote:
>> >> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
>> >> pages on each iteration. This makes it practically impossible to decrease
>> >> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
>> >> so we can't reduce memory usage, and once retry_count reaches zero we return
>> >> -EBUSY.
>> >>
>> >> It's easy to reproduce the problem by running the following commands:
>> >>
>> >> mkdir /sys/fs/cgroup/memory/test
>> >> echo $$ >> /sys/fs/cgroup/memory/test/tasks
>> >> cat big_file > /dev/null &
>> >> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>> >> -bash: echo: write error: Device or resource busy
>> >>
>> >> Instead of trying to free a small number of pages, it's much more
>> >> reasonable to free 'usage - limit' pages.
>> >
>> > But that only makes the issue less probable. It doesn't fix it, because
>> > if (curusage >= oldusage)
>> > retry_count--;
>> > can still be true because the allocator might be faster than the reclaimer.
>> > Wouldn't it be more reasonable to simply remove the retry count and keep
>> > trying until interrupted or we manage to update the limit?
>>
>> But does it make sense to continue reclaiming even if the reclaimer can't
>> make any progress? I'd say no. "Allocator is faster than reclaimer"
>> may not be the only reason for a failed reclaim. E.g. we could try to
>> set the limit lower than the amount of mlock()ed memory in the cgroup;
>> retrying reclaim would just be a waste of the machine's resources. Or we
>> simply don't have any swap, and anon > new_limit. Should we burn the
>> CPU in that case?
>
> We can check the number of reclaimed pages and go EBUSY if it is 0.
>
>> > Another option would be to commit the new limit and allow temporary overcommit
>> > of the hard limit. New allocations and the limit update paths would
>> > reclaim to the hard limit.
>> >
>>
>> It sounds a bit fragile and tricky to me. I wouldn't go that way
>> unless we have a very good reason for this.
>
> I haven't explored this, to be honest, so there may be dragons that way.
> I've just mentioned that option for completeness.
>
We already do this for cgroup-v2's memory.max. So, I don't think it is
fragile or tricky.
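(For reference, the cgroup-v2 interface being referred to, assuming cgroup2
is mounted at /sys/fs/cgroup:

	echo 100M > /sys/fs/cgroup/test/memory.max

The write commits the new limit first, then reclaims the excess, and falls
back to the OOM killer if reclaim cannot bring usage below the new limit.)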
On 12/20/2017 09:15 PM, Shakeel Butt wrote:
> On Wed, Dec 20, 2017 at 3:34 AM, Michal Hocko <[email protected]> wrote:
>> On Wed 20-12-17 14:32:19, Andrey Ryabinin wrote:
>>> On 12/20/2017 01:33 PM, Michal Hocko wrote:
>>>> On Wed 20-12-17 13:24:28, Andrey Ryabinin wrote:
>>>>> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
>>>>> pages on each iteration. This makes it practically impossible to decrease
>>>>> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
>>>>> so we can't reduce memory usage, and once retry_count reaches zero we return
>>>>> -EBUSY.
>>>>>
>>>>> It's easy to reproduce the problem by running the following commands:
>>>>>
>>>>> mkdir /sys/fs/cgroup/memory/test
>>>>> echo $$ >> /sys/fs/cgroup/memory/test/tasks
>>>>> cat big_file > /dev/null &
>>>>> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>>>>> -bash: echo: write error: Device or resource busy
>>>>>
>>>>> Instead of trying to free a small number of pages, it's much more
>>>>> reasonable to free 'usage - limit' pages.
>>>>
>>>> But that only makes the issue less probable. It doesn't fix it, because
>>>> if (curusage >= oldusage)
>>>> retry_count--;
>>>> can still be true because the allocator might be faster than the reclaimer.
>>>> Wouldn't it be more reasonable to simply remove the retry count and keep
>>>> trying until interrupted or we manage to update the limit?
>>>
>>> But does it make sense to continue reclaiming even if the reclaimer can't
>>> make any progress? I'd say no. "Allocator is faster than reclaimer"
>>> may not be the only reason for a failed reclaim. E.g. we could try to
>>> set the limit lower than the amount of mlock()ed memory in the cgroup;
>>> retrying reclaim would just be a waste of the machine's resources. Or we
>>> simply don't have any swap, and anon > new_limit. Should we burn the
>>> CPU in that case?
>>
>> We can check the number of reclaimed pages and go EBUSY if it is 0.
>>
>>>> Another option would be to commit the new limit and allow temporary overcommit
>>>> of the hard limit. New allocations and the limit update paths would
>>>> reclaim to the hard limit.
>>>>
>>>
>>> It sounds a bit fragile and tricky to me. I wouldn't go that way
>>> unless we have a very good reason for this.
>>
>> I haven't explored this, to be honest, so there may be dragons that way.
>> I've just mentioned that option for completeness.
>>
>
> We already do this for cgroup-v2's memory.max. So, I don't think it is
> fragile or tricky.
>
It has the potential to break userspace expectations. Userspace might expect that lowering
limit_in_bytes too much fails with -EBUSY and doesn't trigger the OOM killer.
mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
pages on each iteration. This makes it practically impossible to decrease
the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
so we can't reduce memory usage, and once retry_count reaches zero we return
-EBUSY.
It's easy to reproduce the problem by running the following commands:
mkdir /sys/fs/cgroup/memory/test
echo $$ >> /sys/fs/cgroup/memory/test/tasks
cat big_file > /dev/null &
sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
-bash: echo: write error: Device or resource busy
Instead of relying on retry_count, keep retrying the reclaim until
the desired limit is reached or fail if the reclaim doesn't make
any progress or a signal is pending.
Signed-off-by: Andrey Ryabinin <[email protected]>
---
Changes since v2:
- Changelog wording per mhocko@
mm/memcontrol.c | 70 +++++++++++++--------------------------------------------
1 file changed, 16 insertions(+), 54 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f40b5ad3f959..0d26db9a665d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1176,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
}
/*
- * This function returns the number of memcg under hierarchy tree. Returns
- * 1(self count) if no children.
- */
-static int mem_cgroup_count_children(struct mem_cgroup *memcg)
-{
- int num = 0;
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg)
- num++;
- return num;
-}
-
-/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
@@ -2462,22 +2448,10 @@ static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- unsigned long curusage;
- unsigned long oldusage;
+ unsigned long usage;
bool enlarge = false;
- int retry_count;
int ret;
- /*
- * For keeping hierarchical_reclaim simple, how long we should retry
- * is depends on callers. We set our retry-count to be function
- * of # of children which we should visit in this loop.
- */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(&memcg->memory);
-
do {
if (signal_pending(current)) {
ret = -EINTR;
@@ -2498,15 +2472,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
-
- curusage = page_counter_read(&memcg->memory);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
+ usage = page_counter_read(&memcg->memory);
+ if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
+ GFP_KERNEL, true)) {
+ ret = -EBUSY;
+ break;
+ }
+ } while (true);
if (!ret && enlarge)
memcg_oom_recover(memcg);
@@ -2517,18 +2489,10 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- unsigned long curusage;
- unsigned long oldusage;
+ unsigned long usage;
bool enlarge = false;
- int retry_count;
int ret;
- /* see mem_cgroup_resize_res_limit */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(&memcg->memsw);
-
do {
if (signal_pending(current)) {
ret = -EINTR;
@@ -2549,15 +2513,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
-
- curusage = page_counter_read(&memcg->memsw);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
+ usage = page_counter_read(&memcg->memsw);
+ if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
+ GFP_KERNEL, false)) {
+ ret = -EBUSY;
+ break;
+ }
+ } while (true);
if (!ret && enlarge)
memcg_oom_recover(memcg);
--
2.13.6
mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit() are almost
identical functions. Instead of having two of them, we could pass an
additional argument to mem_cgroup_resize_limit() and use it to
consolidate all the code in a single function.
Signed-off-by: Andrey Ryabinin <[email protected]>
---
mm/memcontrol.c | 61 +++++++++++++--------------------------------------------
1 file changed, 14 insertions(+), 47 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0d26db9a665d..f6253c80a5c8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2445,50 +2445,17 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
static DEFINE_MUTEX(memcg_limit_mutex);
-static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static bool invalid_mem_limit(struct mem_cgroup *memcg, bool memsw,
+ unsigned long limit)
{
- unsigned long usage;
- bool enlarge = false;
- int ret;
-
- do {
- if (signal_pending(current)) {
- ret = -EINTR;
- break;
- }
-
- mutex_lock(&memcg_limit_mutex);
- if (limit > memcg->memsw.limit) {
- mutex_unlock(&memcg_limit_mutex);
- ret = -EINVAL;
- break;
- }
- if (limit > memcg->memory.limit)
- enlarge = true;
- ret = page_counter_limit(&memcg->memory, limit);
- mutex_unlock(&memcg_limit_mutex);
-
- if (!ret)
- break;
-
- usage = page_counter_read(&memcg->memory);
- if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
- GFP_KERNEL, true)) {
- ret = -EBUSY;
- break;
- }
- } while (true);
-
- if (!ret && enlarge)
- memcg_oom_recover(memcg);
-
- return ret;
+ return (!memsw && limit > memcg->memsw.limit) ||
+ (memsw && limit < memcg->memory.limit);
}
-static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, bool memsw,
+ unsigned long limit)
{
+ struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
unsigned long usage;
bool enlarge = false;
int ret;
@@ -2500,22 +2467,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
}
mutex_lock(&memcg_limit_mutex);
- if (limit < memcg->memory.limit) {
+ if (invalid_mem_limit(memcg, memsw, limit)) {
mutex_unlock(&memcg_limit_mutex);
ret = -EINVAL;
break;
}
- if (limit > memcg->memsw.limit)
+ if (limit > counter->limit)
enlarge = true;
- ret = page_counter_limit(&memcg->memsw, limit);
+ ret = page_counter_limit(counter, limit);
mutex_unlock(&memcg_limit_mutex);
if (!ret)
break;
- usage = page_counter_read(&memcg->memsw);
+ usage = page_counter_read(counter);
if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
- GFP_KERNEL, false)) {
+ GFP_KERNEL, !memsw)) {
ret = -EBUSY;
break;
}
@@ -3193,10 +3160,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
case _MEM:
- ret = mem_cgroup_resize_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, false, nr_pages);
break;
case _MEMSWAP:
- ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
+ ret = mem_cgroup_resize_limit(memcg, true, nr_pages);
break;
case _KMEM:
ret = memcg_update_kmem_limit(memcg, nr_pages);
--
2.13.6
On 01/09/2018 07:58 PM, Andrey Ryabinin wrote:
> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
> pages on each iteration. This makes it practically impossible to decrease
> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
> so we can't reduce memory usage, and once retry_count reaches zero we return
> -EBUSY.
>
> It's easy to reproduce the problem by running the following commands:
>
> mkdir /sys/fs/cgroup/memory/test
> echo $$ >> /sys/fs/cgroup/memory/test/tasks
> cat big_file > /dev/null &
> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> -bash: echo: write error: Device or resource busy
>
> Instead of relying on retry_count, keep retrying the reclaim until
> the desired limit is reached or fail if the reclaim doesn't make
> any progress or a signal is pending.
>
> Signed-off-by: Andrey Ryabinin <[email protected]>
> ---
>
> Changes since v2:
> - Changelog wording per mhocko@
>
Ugh, sorry, I forgot to +Cc Michal this time.
The changelog is the only thing that changed between v2 and v3.
> mm/memcontrol.c | 70 +++++++++++++--------------------------------------------
> 1 file changed, 16 insertions(+), 54 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f40b5ad3f959..0d26db9a665d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1176,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
> }
>
> /*
> - * This function returns the number of memcg under hierarchy tree. Returns
> - * 1(self count) if no children.
> - */
> -static int mem_cgroup_count_children(struct mem_cgroup *memcg)
> -{
> - int num = 0;
> - struct mem_cgroup *iter;
> -
> - for_each_mem_cgroup_tree(iter, memcg)
> - num++;
> - return num;
> -}
> -
> -/*
> * Return the memory (and swap, if configured) limit for a memcg.
> */
> unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
> @@ -2462,22 +2448,10 @@ static DEFINE_MUTEX(memcg_limit_mutex);
> static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> unsigned long limit)
> {
> - unsigned long curusage;
> - unsigned long oldusage;
> + unsigned long usage;
> bool enlarge = false;
> - int retry_count;
> int ret;
>
> - /*
> - * For keeping hierarchical_reclaim simple, how long we should retry
> - * is depends on callers. We set our retry-count to be function
> - * of # of children which we should visit in this loop.
> - */
> - retry_count = MEM_CGROUP_RECLAIM_RETRIES *
> - mem_cgroup_count_children(memcg);
> -
> - oldusage = page_counter_read(&memcg->memory);
> -
> do {
> if (signal_pending(current)) {
> ret = -EINTR;
> @@ -2498,15 +2472,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
> -
> - curusage = page_counter_read(&memcg->memory);
> - /* Usage is reduced ? */
> - if (curusage >= oldusage)
> - retry_count--;
> - else
> - oldusage = curusage;
> - } while (retry_count);
> + usage = page_counter_read(&memcg->memory);
> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
> + GFP_KERNEL, true)) {
> + ret = -EBUSY;
> + break;
> + }
> + } while (true);
>
> if (!ret && enlarge)
> memcg_oom_recover(memcg);
> @@ -2517,18 +2489,10 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
> unsigned long limit)
> {
> - unsigned long curusage;
> - unsigned long oldusage;
> + unsigned long usage;
> bool enlarge = false;
> - int retry_count;
> int ret;
>
> - /* see mem_cgroup_resize_res_limit */
> - retry_count = MEM_CGROUP_RECLAIM_RETRIES *
> - mem_cgroup_count_children(memcg);
> -
> - oldusage = page_counter_read(&memcg->memsw);
> -
> do {
> if (signal_pending(current)) {
> ret = -EINTR;
> @@ -2549,15 +2513,13 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
> -
> - curusage = page_counter_read(&memcg->memsw);
> - /* Usage is reduced ? */
> - if (curusage >= oldusage)
> - retry_count--;
> - else
> - oldusage = curusage;
> - } while (retry_count);
> + usage = page_counter_read(&memcg->memsw);
> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
> + GFP_KERNEL, false)) {
> + ret = -EBUSY;
> + break;
> + }
> + } while (true);
>
> if (!ret && enlarge)
> memcg_oom_recover(memcg);
>
On Tue, Jan 9, 2018 at 8:58 AM, Andrey Ryabinin <[email protected]> wrote:
> mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit() are almost
> identical functions. Instead of having two of them, we could pass an
> additional argument to mem_cgroup_resize_limit() and use it to
> consolidate all the code in a single function.
>
> Signed-off-by: Andrey Ryabinin <[email protected]>
I think this is already proposed and Acked.
https://patchwork.kernel.org/patch/10150719/
On Tue, Jan 9, 2018 at 8:58 AM, Andrey Ryabinin <[email protected]> wrote:
> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
> pages on each iteration. This makes it practically impossible to decrease
> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
> so we can't reduce memory usage, and once retry_count reaches zero we return
> -EBUSY.
>
> It's easy to reproduce the problem by running the following commands:
>
> mkdir /sys/fs/cgroup/memory/test
> echo $$ >> /sys/fs/cgroup/memory/test/tasks
> cat big_file > /dev/null &
> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> -bash: echo: write error: Device or resource busy
>
> Instead of relying on retry_count, keep retrying the reclaim until
> the desired limit is reached or fail if the reclaim doesn't make
> any progress or a signal is pending.
>
> Signed-off-by: Andrey Ryabinin <[email protected]>
Reviewed-by: Shakeel Butt <[email protected]>
On 01/09/2018 08:10 PM, Shakeel Butt wrote:
> On Tue, Jan 9, 2018 at 8:58 AM, Andrey Ryabinin <[email protected]> wrote:
>> mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit() are almost
>> identical functions. Instead of having two of them, we could pass an
>> additional argument to mem_cgroup_resize_limit() and use it to
>> consolidate all the code in a single function.
>>
>> Signed-off-by: Andrey Ryabinin <[email protected]>
>
> I think this is already proposed and Acked.
>
> https://patchwork.kernel.org/patch/10150719/
>
Indeed. I'll rebase patch 1/2 on top, if it gets applied first.
On Tue, 9 Jan 2018 20:26:33 +0300 Andrey Ryabinin <[email protected]> wrote:
> On 01/09/2018 08:10 PM, Shakeel Butt wrote:
> > On Tue, Jan 9, 2018 at 8:58 AM, Andrey Ryabinin <[email protected]> wrote:
> >> mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit() are almost
> >> identical functions. Instead of having two of them, we could pass an
> >> additional argument to mem_cgroup_resize_limit() and use it to
> >> consolidate all the code in a single function.
> >>
> >> Signed-off-by: Andrey Ryabinin <[email protected]>
> >
> > I think this is already proposed and Acked.
> >
> > https://patchwork.kernel.org/patch/10150719/
> >
>
> Indeed. I'll rebase patch 1/2 on top, if it gets applied first.
Yes please.
mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
pages on each iteration. This makes it practically impossible to decrease
the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
so we can't reduce memory usage, and once retry_count reaches zero we return
-EBUSY.
It's easy to reproduce the problem by running the following commands:
mkdir /sys/fs/cgroup/memory/test
echo $$ >> /sys/fs/cgroup/memory/test/tasks
cat big_file > /dev/null &
sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
-bash: echo: write error: Device or resource busy
Instead of relying on retry_count, keep retrying the reclaim until
the desired limit is reached or fail if the reclaim doesn't make
any progress or a signal is pending.
Signed-off-by: Andrey Ryabinin <[email protected]>
Reviewed-by: Shakeel Butt <[email protected]>
---
Changes since v3:
- Rebase
Changes since v2:
- Changelog wording per mhocko@
mm/memcontrol.c | 44 ++++++++------------------------------------
1 file changed, 8 insertions(+), 36 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13aeccf32c2e..c3d1eaef752d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1176,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
}
/*
- * This function returns the number of memcg under hierarchy tree. Returns
- * 1(self count) if no children.
- */
-static int mem_cgroup_count_children(struct mem_cgroup *memcg)
-{
- int num = 0;
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg)
- num++;
- return num;
-}
-
-/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
@@ -2462,24 +2448,12 @@ static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long limit, bool memsw)
{
- unsigned long curusage;
- unsigned long oldusage;
+ unsigned long usage;
bool enlarge = false;
- int retry_count;
int ret;
bool limits_invariant;
struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
- /*
- * For keeping hierarchical_reclaim simple, how long we should retry
- * is depends on callers. We set our retry-count to be function
- * of # of children which we should visit in this loop.
- */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(counter);
-
do {
if (signal_pending(current)) {
ret = -EINTR;
@@ -2506,15 +2480,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw);
-
- curusage = page_counter_read(counter);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
+ usage = page_counter_read(counter);
+ if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
+ GFP_KERNEL, !memsw)) {
+ ret = -EBUSY;
+ break;
+ }
+ } while (true);
if (!ret && enlarge)
memcg_oom_recover(memcg);
--
2.13.6
On Wed, 10 Jan 2018 15:43:17 +0300 Andrey Ryabinin <[email protected]> wrote:
> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
> pages on each iteration. This makes it practically impossible to decrease
> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
> so we can't reduce memory usage, and once retry_count reaches zero we return
> -EBUSY.
>
> It's easy to reproduce the problem by running the following commands:
>
> mkdir /sys/fs/cgroup/memory/test
> echo $$ >> /sys/fs/cgroup/memory/test/tasks
> cat big_file > /dev/null &
> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> -bash: echo: write error: Device or resource busy
>
> Instead of relying on retry_count, keep retrying the reclaim until
> the desired limit is reached or fail if the reclaim doesn't make
> any progress or a signal is pending.
>
Is there any situation under which mem_cgroup_resize_limit() can
get stuck semi-indefinitely in a livelockish state? It isn't very
obvious that we're protected from this, so perhaps it would help to
have a comment which describes how loop termination is assured?
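Such a comment might read, for example (a sketch against the loop as posted,
not something from the patch itself):

	/*
	 * Loop termination: each pass either updates the limit
	 * successfully (break), bails out on a pending signal (-EINTR),
	 * or calls try_to_free_mem_cgroup_pages(); if that reclaims
	 * nothing we return -EBUSY.  The loop therefore only keeps
	 * running while reclaim keeps making forward progress, which a
	 * task allocating faster than we reclaim can sustain.
	 */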
On Wed 10-01-18 15:43:17, Andrey Ryabinin wrote:
[...]
> @@ -2506,15 +2480,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw);
> -
> - curusage = page_counter_read(counter);
> - /* Usage is reduced ? */
> - if (curusage >= oldusage)
> - retry_count--;
> - else
> - oldusage = curusage;
> - } while (retry_count);
> + usage = page_counter_read(counter);
> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
> + GFP_KERNEL, !memsw)) {
If the usage drops below the limit in the meantime then you get an underflow
and reclaim the whole memcg. I do not think this is a good idea. This
can also lead to over-reclaim. Why don't you simply stick with the
original SWAP_CLUSTER_MAX (aka 1 for try_to_free_mem_cgroup_pages)?
> + ret = -EBUSY;
> + break;
> + }
> + } while (true);
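A standalone illustration of the underflow (hypothetical numbers; usage and
limit are both unsigned long):

	#include <stdio.h>

	int main(void)
	{
		unsigned long limit = 25600;	/* new limit: 100M in 4K pages */
		unsigned long usage = 25000;	/* dropped below limit meanwhile */

		/* wraps around instead of going negative, so the reclaim
		 * target becomes effectively "the whole memcg" */
		printf("%lu\n", usage - limit);	/* 18446744073709551016 on 64-bit */
		return 0;
	}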
--
Michal Hocko
SUSE Labs
On 01/11/2018 01:31 AM, Andrew Morton wrote:
> On Wed, 10 Jan 2018 15:43:17 +0300 Andrey Ryabinin <[email protected]> wrote:
>
>> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
>> pages on each iteration. This makes it practically impossible to decrease
>> the limit of a memory cgroup. Tasks could easily allocate back those 32 pages,
>> so we can't reduce memory usage, and once retry_count reaches zero we return
>> -EBUSY.
>>
>> It's easy to reproduce the problem by running the following commands:
>>
>> mkdir /sys/fs/cgroup/memory/test
>> echo $$ >> /sys/fs/cgroup/memory/test/tasks
>> cat big_file > /dev/null &
>> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>> -bash: echo: write error: Device or resource busy
>>
>> Instead of relying on retry_count, keep retrying the reclaim until
>> the desired limit is reached or fail if the reclaim doesn't make
>> any progress or a signal is pending.
>>
>
> Is there any situation under which mem_cgroup_resize_limit() can
> get stuck semi-indefinitely in a livelockish state? It isn't very
> obvious that we're protected from this, so perhaps it would help to
> have a comment which describes how loop termination is assured?
>
We are not protected from this. If tasks in the cgroup *indefinitely* generate reclaimable memory at a high rate
and the user asks to set an unreachable limit, like 'echo 4096 > memory.limit_in_bytes', then
try_to_free_mem_cgroup_pages() will return non-zero indefinitely.
Is that a big deal? At least the loop can be interrupted by a signal, and we don't hold any locks here.
On 01/11/2018 01:42 PM, Michal Hocko wrote:
> On Wed 10-01-18 15:43:17, Andrey Ryabinin wrote:
> [...]
>> @@ -2506,15 +2480,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>> if (!ret)
>> break;
>>
>> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw);
>> -
>> - curusage = page_counter_read(counter);
>> - /* Usage is reduced ? */
>> - if (curusage >= oldusage)
>> - retry_count--;
>> - else
>> - oldusage = curusage;
>> - } while (retry_count);
>> + usage = page_counter_read(counter);
>> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
>> + GFP_KERNEL, !memsw)) {
>
> If the usage drops below the limit in the meantime then you get an underflow
> and reclaim the whole memcg. I do not think this is a good idea. This
> can also lead to over-reclaim. Why don't you simply stick with the
> original SWAP_CLUSTER_MAX (aka 1 for try_to_free_mem_cgroup_pages)?
>
Because, if the new limit is gigabytes below the current usage, retrying to set
the new limit after reclaiming only 32 pages seems unreasonable.
So, I made this:
From: Andrey Ryabinin <[email protected]>
Subject: mm-memcg-try-harder-to-decrease-limit_in_bytes-fix
Protect from over-reclaim if usage becomes lower than the limit.
Signed-off-by: Andrey Ryabinin <[email protected]>
---
mm/memcontrol.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4671ae8a8b1a..6120bb619547 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2455,7 +2455,7 @@ static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long limit, bool memsw)
{
- unsigned long usage;
+ unsigned long nr_pages;
bool enlarge = false;
int ret;
bool limits_invariant;
@@ -2487,8 +2487,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- usage = page_counter_read(counter);
- if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
+ nr_pages = max_t(long, 1, page_counter_read(counter) - limit);
+ if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
GFP_KERNEL, !memsw)) {
ret = -EBUSY;
break;
--
2.13.6
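A quick standalone check of the clamp with the same hypothetical numbers
(the macro below is a simplified stand-in for the kernel's max_t(), which
casts both arguments to the given type before comparing):

	#include <stdio.h>

	#define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

	int main(void)
	{
		unsigned long limit = 25600, usage = 25000;

		/* usage - limit wraps as unsigned long, but cast to long
		 * it is -600, so the reclaim target clamps to 1 */
		printf("%ld\n", max_t(long, 1, usage - limit));	/* prints 1 */
		return 0;
	}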
On Thu 11-01-18 15:21:33, Andrey Ryabinin wrote:
>
>
> On 01/11/2018 01:42 PM, Michal Hocko wrote:
> > On Wed 10-01-18 15:43:17, Andrey Ryabinin wrote:
> > [...]
> >> @@ -2506,15 +2480,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> >> if (!ret)
> >> break;
> >>
> >> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw);
> >> -
> >> - curusage = page_counter_read(counter);
> >> - /* Usage is reduced ? */
> >> - if (curusage >= oldusage)
> >> - retry_count--;
> >> - else
> >> - oldusage = curusage;
> >> - } while (retry_count);
> >> + usage = page_counter_read(counter);
> >> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
> >> + GFP_KERNEL, !memsw)) {
> >
> > If the usage drops below the limit in the meantime then you get an underflow
> > and reclaim the whole memcg. I do not think this is a good idea. This
> > can also lead to over-reclaim. Why don't you simply stick with the
> > original SWAP_CLUSTER_MAX (aka 1 for try_to_free_mem_cgroup_pages)?
> >
>
> Because, if the new limit is gigabytes below the current usage, retrying to set
> the new limit after reclaiming only 32 pages seems unreasonable.
Who would do insanity like that?
> @@ -2487,8 +2487,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - usage = page_counter_read(counter);
> - if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
> + nr_pages = max_t(long, 1, page_counter_read(counter) - limit);
> + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
> GFP_KERNEL, !memsw)) {
> ret = -EBUSY;
> break;
How does this address the over-reclaim concern?
--
Michal Hocko
SUSE Labs
On 01/11/2018 03:46 PM, Michal Hocko wrote:
> On Thu 11-01-18 15:21:33, Andrey Ryabinin wrote:
>>
>>
>> On 01/11/2018 01:42 PM, Michal Hocko wrote:
>>> On Wed 10-01-18 15:43:17, Andrey Ryabinin wrote:
>>> [...]
>>>> @@ -2506,15 +2480,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>>> if (!ret)
>>>> break;
>>>>
>>>> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw);
>>>> -
>>>> - curusage = page_counter_read(counter);
>>>> - /* Usage is reduced ? */
>>>> - if (curusage >= oldusage)
>>>> - retry_count--;
>>>> - else
>>>> - oldusage = curusage;
>>>> - } while (retry_count);
>>>> + usage = page_counter_read(counter);
>>>> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
>>>> + GFP_KERNEL, !memsw)) {
>>>
>>> If the usage drops below the limit in the meantime then you get an underflow
>>> and reclaim the whole memcg. I do not think this is a good idea. This
>>> can also lead to over-reclaim. Why don't you simply stick with the
>>> original SWAP_CLUSTER_MAX (aka 1 for try_to_free_mem_cgroup_pages)?
>>>
>>
>> Because, if the new limit is gigabytes below the current usage, retrying to set
>> the new limit after reclaiming only 32 pages seems unreasonable.
>
> Who would do insanity like that?
>
What's insane about that?
>> @@ -2487,8 +2487,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>> if (!ret)
>> break;
>>
>> - usage = page_counter_read(counter);
>> - if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
>> + nr_pages = max_t(long, 1, page_counter_read(counter) - limit);
>> + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
>> GFP_KERNEL, !memsw)) {
>> ret = -EBUSY;
>> break;
>
> How does this address the over-reclaim concern?
It protects from over-reclaim due to underflow.
Assuming that your over-reclaim concern is a situation like this:
Task A                                              Task B
mem_cgroup_resize_limit(new_limit):
    ...
    do {
        ...
        try_to_free_mem_cgroup_pages():
            // start reclaim
                                                    free memory => drop down usage below new_limit
            // end reclaim
        ...
    } while(true)
then I don't understand why this is a problem at all, or how try_to_free_mem_cgroup_pages(1) is supposed to solve it.
First of all, this is a highly unlikely situation. Decreasing the limit is not something that happens very often.
I imagine that freeing large amounts of memory is also not a very frequent operation; workloads mostly consume/use
resources. So this is something that should almost never happen, and when it does, who would notice, and how?
I mean, that 'problem' has no user-visible effect.
Secondly, how can try_to_free_mem_cgroup_pages(1) help here? Task B could simply free() right after the limit
is successfully set. So it suddenly doesn't matter whether the memory was reclaimed in baby steps or in one go;
we 'over reclaimed' memory that B freed. Basically, your suggestion sounds like "let's slowly reclaim in baby
steps, and check the limit after each step in the hope that tasks in the cgroup did some of our job and freed some memory".
So the only way to completely avoid such over reclaim would be to not reclaim at all, and simply wait until the memory
usage goes down by itself. But we are not crazy enough to do that, right?
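The disagreement above can be put in numbers with a toy userspace model (made-up values, not kernel code): whatever Task B frees after Task A has computed its reclaim target ends up reclaimed on top of that target, regardless of how the reclaim is batched.

#include <stdio.h>

int main(void)
{
        unsigned long limit = 100;              /* new limit, in pages */
        unsigned long usage = 500;

        unsigned long target = usage - limit;   /* Task A: reclaim 400 pages */
        usage -= 100;                           /* Task B frees 100 pages meanwhile */
        usage -= target > usage ? usage : target;       /* Task A reclaims */

        printf("usage %lu vs. limit %lu: over reclaimed by %lu pages\n",
               usage, limit, limit - usage);
        return 0;
}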
On Thu 11-01-18 18:23:57, Andrey Ryabinin wrote:
> On 01/11/2018 03:46 PM, Michal Hocko wrote:
> > On Thu 11-01-18 15:21:33, Andrey Ryabinin wrote:
> >>
> >>
> >> On 01/11/2018 01:42 PM, Michal Hocko wrote:
> >>> On Wed 10-01-18 15:43:17, Andrey Ryabinin wrote:
> >>> [...]
> >>>> @@ -2506,15 +2480,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> >>>> if (!ret)
> >>>> break;
> >>>>
> >>>> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw);
> >>>> -
> >>>> - curusage = page_counter_read(counter);
> >>>> - /* Usage is reduced ? */
> >>>> - if (curusage >= oldusage)
> >>>> - retry_count--;
> >>>> - else
> >>>> - oldusage = curusage;
> >>>> - } while (retry_count);
> >>>> + usage = page_counter_read(counter);
> >>>> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
> >>>> + GFP_KERNEL, !memsw)) {
> >>>
> >>> If the usage drops below limit in the meantime then you get underflow
> >>> and reclaim the whole memcg. I do not think this is a good idea. This
> >>> can also lead to over reclaim. Why don't you simply stick with the
> >>> original SWAP_CLUSTER_MAX (aka 1 for try_to_free_mem_cgroup_pages)?
> >>>
> >>
> >> Because, if the new limit is gigabytes below the current usage, retrying to set
> >> the new limit after reclaiming only 32 pages seems unreasonable.
> >
> > Who would do insanity like that?
> >
>
> What's insane about that?
I haven't seen this being done in practice. Why would you want to
reclaim GBs of memory from a cgroup? Anyway, if you believe this is
really needed then simply do it in a separate patch.
> >> @@ -2487,8 +2487,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> >> if (!ret)
> >> break;
> >>
> >> - usage = page_counter_read(counter);
> >> - if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
> >> + nr_pages = max_t(long, 1, page_counter_read(counter) - limit);
> >> + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
> >> GFP_KERNEL, !memsw)) {
> >> ret = -EBUSY;
> >> break;
> >
> > How does this address the over reclaim concern?
>
> It protects from over reclaim due to underflow.
I do not think so. Consider that this reclaim races with other
reclaimers. Now you are reclaiming a large chunk so you might end up
reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
reclaim to be negligible.
--
Michal Hocko
SUSE Labs
On 01/11/2018 07:29 PM, Michal Hocko wrote:
> On Thu 11-01-18 18:23:57, Andrey Ryabinin wrote:
>> On 01/11/2018 03:46 PM, Michal Hocko wrote:
>>> On Thu 11-01-18 15:21:33, Andrey Ryabinin wrote:
>>>>
>>>>
>>>> On 01/11/2018 01:42 PM, Michal Hocko wrote:
>>>>> On Wed 10-01-18 15:43:17, Andrey Ryabinin wrote:
>>>>> [...]
>>>>>> @@ -2506,15 +2480,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>>>>> if (!ret)
>>>>>> break;
>>>>>>
>>>>>> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw);
>>>>>> -
>>>>>> - curusage = page_counter_read(counter);
>>>>>> - /* Usage is reduced ? */
>>>>>> - if (curusage >= oldusage)
>>>>>> - retry_count--;
>>>>>> - else
>>>>>> - oldusage = curusage;
>>>>>> - } while (retry_count);
>>>>>> + usage = page_counter_read(counter);
>>>>>> + if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
>>>>>> + GFP_KERNEL, !memsw)) {
>>>>>
>>>>> If the usage drops below limit in the meantime then you get underflow
>>>>> and reclaim the whole memcg. I do not think this is a good idea. This
>>>>> can also lead to over reclaim. Why don't you simply stick with the
>>>>> original SWAP_CLUSTER_MAX (aka 1 for try_to_free_mem_cgroup_pages)?
>>>>>
>>>>
>>>> Because, if the new limit is gigabytes below the current usage, retrying to set
>>>> the new limit after reclaiming only 32 pages seems unreasonable.
>>>
>>> Who would do insanity like that?
>>>
>>
>> What's insane about that?
>
> I haven't seen this being done in practice. Why would you want to
> reclaim GBs of memory from a cgroup? Anyway, if you believe this is
> really needed then simply do it in a separate patch.
>
For the same reason anyone would want to set a memory limit on some job that generates
too much pressure and disrupts others. Whether this is GBs or MBs is just a matter of scale.
A more concrete example is a workload that generates lots of page cache. Without a limit (or with too high a limit)
it wakes up kswapd, which starts thrashing all other cgroups. That's pretty bad for anon-mostly cgroups,
as we may constantly swap hot data back and forth.
>>>> @@ -2487,8 +2487,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>>> if (!ret)
>>>> break;
>>>>
>>>> - usage = page_counter_read(counter);
>>>> - if (!try_to_free_mem_cgroup_pages(memcg, usage - limit,
>>>> + nr_pages = max_t(long, 1, page_counter_read(counter) - limit);
>>>> + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
>>>> GFP_KERNEL, !memsw)) {
>>>> ret = -EBUSY;
>>>> break;
>>>
>>> How does this address the over reclaim concern?
>>
>> It protects from over reclaim due to underflow.
>
> I do not think so. Consider that this reclaim races with other
> reclaimers. Now you are reclaiming a large chunk so you might end up
> reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
> reclaim to be negligible.
>
I did consider this. And I think I already explained that sort of race in a previous email.
Whether "Task B" is really a task in the cgroup or actually a bunch of reclaimers
doesn't matter. That doesn't change anything.
On Thu, 11 Jan 2018 14:59:23 +0300 Andrey Ryabinin <[email protected]> wrote:
> On 01/11/2018 01:31 AM, Andrew Morton wrote:
> > On Wed, 10 Jan 2018 15:43:17 +0300 Andrey Ryabinin <[email protected]> wrote:
> >
> >> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
> >> pages on each iteration. This makes it practically impossible to decrease
> >> the limit of a memory cgroup. Tasks could easily allocate back 32 pages,
> >> so we can't reduce memory usage, and once retry_count reaches zero we return
> >> -EBUSY.
> >>
> >> Easy to reproduce the problem by running the following commands:
> >>
> >> mkdir /sys/fs/cgroup/memory/test
> >> echo $$ >> /sys/fs/cgroup/memory/test/tasks
> >> cat big_file > /dev/null &
> >> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> >> -bash: echo: write error: Device or resource busy
> >>
> >> Instead of relying on retry_count, keep retrying the reclaim until
> >> the desired limit is reached or fail if the reclaim doesn't make
> >> any progress or a signal is pending.
> >>
> >
> > Is there any situation under which that mem_cgroup_resize_limit() can
> > get stuck semi-indefinitely in a livelockish state? It isn't very
> > obvious that we're protected from this, so perhaps it would help to
> > have a comment which describes how loop termination is assured?
> >
>
> We are not protected from this. If tasks in the cgroup *indefinitely* generate reclaimable memory at a high rate
> and the user asks to set an unreachable limit, like 'echo 4096 > memory.limit_in_bytes', then
> try_to_free_mem_cgroup_pages() will return non-zero indefinitely.
>
> Is that a big deal? At least the loop can be interrupted by a signal, and we don't hold any locks here.
It may be better to detect this condition, give up and return an error?
On 01/12/2018 03:21 AM, Andrew Morton wrote:
> On Thu, 11 Jan 2018 14:59:23 +0300 Andrey Ryabinin <[email protected]> wrote:
>
>> On 01/11/2018 01:31 AM, Andrew Morton wrote:
>>> On Wed, 10 Jan 2018 15:43:17 +0300 Andrey Ryabinin <[email protected]> wrote:
>>>
>>>> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
>>>> pages on each iteration. This makes it practically impossible to decrease
>>>> the limit of a memory cgroup. Tasks could easily allocate back 32 pages,
>>>> so we can't reduce memory usage, and once retry_count reaches zero we return
>>>> -EBUSY.
>>>>
>>>> Easy to reproduce the problem by running the following commands:
>>>>
>>>> mkdir /sys/fs/cgroup/memory/test
>>>> echo $$ >> /sys/fs/cgroup/memory/test/tasks
>>>> cat big_file > /dev/null &
>>>> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>>>> -bash: echo: write error: Device or resource busy
>>>>
>>>> Instead of relying on retry_count, keep retrying the reclaim until
>>>> the desired limit is reached or fail if the reclaim doesn't make
>>>> any progress or a signal is pending.
>>>>
>>>
>>> Is there any situation under which that mem_cgroup_resize_limit() can
>>> get stuck semi-indefinitely in a livelockish state? It isn't very
>>> obvious that we're protected from this, so perhaps it would help to
>>> have a comment which describes how loop termination is assured?
>>>
>>
>> We are not protected from this. If tasks in the cgroup *indefinitely* generate reclaimable memory at a high rate
>> and the user asks to set an unreachable limit, like 'echo 4096 > memory.limit_in_bytes', then
>> try_to_free_mem_cgroup_pages() will return non-zero indefinitely.
>>
>> Is that a big deal? At least the loop can be interrupted by a signal, and we don't hold any locks here.
>
> It may be better to detect this condition, give up and return an error?
>
That's basically how v1 worked; "if (curusage >= oldusage)" used to be
the way to detect this potential livelock.
So we can just go back to it?
On Fri 12-01-18 00:59:38, Andrey Ryabinin wrote:
> On 01/11/2018 07:29 PM, Michal Hocko wrote:
[...]
> > I do not think so. Consider that this reclaim races with other
> > reclaimers. Now you are reclaiming a large chunk so you might end up
> > reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
> > reclaim to be negligible.
> >
>
> I did consider this. And I think I already explained that sort of race in a previous email.
> Whether "Task B" is really a task in the cgroup or actually a bunch of reclaimers
> doesn't matter. That doesn't change anything.
I would _really_ prefer two patches here. The first one removing the
hard coded reclaim count. That thing is just dubious at best. If you
_really_ think that the higher reclaim target is meaningful then make
it a separate patch. I am not convinced but I will not nack it either.
But it will make our life much easier if my over reclaim concern is
right and we will need to revert it. Conceptually those two changes are
independent anyway.
--
Michal Hocko
SUSE Labs
On Fri, Jan 12, 2018 at 4:24 AM, Michal Hocko <[email protected]> wrote:
> On Fri 12-01-18 00:59:38, Andrey Ryabinin wrote:
>> On 01/11/2018 07:29 PM, Michal Hocko wrote:
> [...]
>> > I do not think so. Consider that this reclaim races with other
>> > reclaimers. Now you are reclaiming a large chunk so you might end up
>> > reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
>> > reclaim to be negligible.
>> >
>>
>> I did consider this. And I think I already explained that sort of race in a previous email.
>> Whether "Task B" is really a task in the cgroup or actually a bunch of reclaimers
>> doesn't matter. That doesn't change anything.
>
> I would _really_ prefer two patches here. The first one removing the
> hard coded reclaim count. That thing is just dubious at best. If you
> _really_ think that the higher reclaim target is meaningful then make
> it a separate patch. I am not convinced but I will not nack it either.
> But it will make our life much easier if my over reclaim concern is
> right and we will need to revert it. Conceptually those two changes are
> independent anyway.
>
Personally I feel that the cgroup-v2 semantics are much cleaner for
setting limit. There is no race with the allocators in the memcg,
though oom-killer can be triggered. For cgroup-v1, the user does not
expect OOM killer and EBUSY is expected on unsuccessful reclaim. How
about we do something similar here and make sure oom killer can not be
triggered for the given memcg?
// pseudo code
disable_oom(memcg)
old = xchg(&memcg->memory.limit, requested_limit)

reclaim memory until usage gets below new limit or retries are exhausted

if (unsuccessful) {
        reset_limit(memcg, old)
        ret = EBUSY
} else
        ret = 0;
enable_oom(memcg)
This way there is no race with the allocators and oom killer will not
be triggered. The processes in the memcg can suffer but that should be
within the expectation of the user. One disclaimer though, disabling
oom for memcg needs more thought.
Shakeel
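For illustration only, the pseudo code can be mocked up as a runnable userspace sketch. struct memcg, disable_oom(), enable_oom() and resize() below are stand-ins invented here, not kernel interfaces; the xchg() is modeled by a plain assignment, and the open question of how to actually disable oom per-memcg is left untouched.

#include <stdio.h>
#include <errno.h>

struct memcg { unsigned long limit, usage; int oom_disabled; };

static void disable_oom(struct memcg *m) { m->oom_disabled = 1; }
static void enable_oom(struct memcg *m)  { m->oom_disabled = 0; }

static int resize(struct memcg *m, unsigned long new_limit)
{
        unsigned long old = m->limit;   /* models the xchg() */
        int retries = 5;
        int ret = 0;

        disable_oom(m);                 /* allocations "cannot fail" now */
        m->limit = new_limit;

        /* reclaim until usage fits under the new limit or retries run out */
        while (m->usage > m->limit && retries--)
                m->usage -= m->usage > 32 ? 32 : m->usage;

        if (m->usage > m->limit) {      /* unsuccessful: roll the limit back */
                m->limit = old;
                ret = -EBUSY;
        }
        enable_oom(m);
        return ret;
}

int main(void)
{
        struct memcg m = { .limit = 1024, .usage = 512 };

        printf("resize to 400 -> %d (usage=%lu, limit=%lu)\n",
               resize(&m, 400), m.usage, m.limit);
        printf("resize to 100 -> %d (usage=%lu, limit=%lu)\n",
               resize(&m, 100), m.usage, m.limit);
        return 0;
}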
On 01/13/2018 01:57 AM, Shakeel Butt wrote:
> On Fri, Jan 12, 2018 at 4:24 AM, Michal Hocko <[email protected]> wrote:
>> On Fri 12-01-18 00:59:38, Andrey Ryabinin wrote:
>>> On 01/11/2018 07:29 PM, Michal Hocko wrote:
>> [...]
>>>> I do not think so. Consider that this reclaim races with other
>>>> reclaimers. Now you are reclaiming a large chunk so you might end up
>>>> reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
>>>> reclaim to be negligible.
>>>>
>>>
>>> I did consider this. And I think I already explained that sort of race in a previous email.
>>> Whether "Task B" is really a task in the cgroup or actually a bunch of reclaimers
>>> doesn't matter. That doesn't change anything.
>>
>> I would _really_ prefer two patches here. The first one removing the
>> hard coded reclaim count. That thing is just dubious at best. If you
>> _really_ think that the higher reclaim target is meaningful then make
>> it a separate patch. I am not convinced but I will not nack it either.
>> But it will make our life much easier if my over reclaim concern is
>> right and we will need to revert it. Conceptually those two changes are
>> independent anyway.
>>
>
> Personally I feel that the cgroup-v2 semantics are much cleaner for
> setting limit. There is no race with the allocators in the memcg,
> though oom-killer can be triggered. For cgroup-v1, the user does not
> expect OOM killer and EBUSY is expected on unsuccessful reclaim. How
> about we do something similar here and make sure oom killer can not be
> triggered for the given memcg?
>
> // pseudo code
> disable_oom(memcg)
> old = xchg(&memcg->memory.limit, requested_limit)
>
> reclaim memory until usage gets below new limit or retries are exhausted
>
> if (unsuccessful) {
>         reset_limit(memcg, old)
>         ret = EBUSY
> } else
>         ret = 0;
> enable_oom(memcg)
>
> This way there is no race with the allocators and oom killer will not
> be triggered. The processes in the memcg can suffer but that should be
> within the expectation of the user. One disclaimer though, disabling
> oom for memcg needs more thought.
That might be worse. If the limit is too low, all allocations (except __GFP_NOFAIL of course) will start
failing. And the kernel is not always careful enough in -ENOMEM handling.
Also, it's not much different from oom killing everything; the end result is almost the same -
nothing will work in that cgroup.
> Shakeel
>
On 01/12/2018 03:24 PM, Michal Hocko wrote:
> On Fri 12-01-18 00:59:38, Andrey Ryabinin wrote:
>> On 01/11/2018 07:29 PM, Michal Hocko wrote:
> [...]
>>> I do not think so. Consider that this reclaim races with other
>>> reclaimers. Now you are reclaiming a large chunk so you might end up
>>> reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
>>> reclaim to be negligible.
>>>
>>
>> I did consider this. And I think I already explained that sort of race in a previous email.
>> Whether "Task B" is really a task in the cgroup or actually a bunch of reclaimers
>> doesn't matter. That doesn't change anything.
>
> I would _really_ prefer two patches here. The first one removing the
> hard coded reclaim count. That thing is just dubious at best. If you
> _really_ think that the higher reclaim target is meaningful then make
> it a separate patch. I am not convinced but I will not nack it either.
> But it will make our life much easier if my over reclaim concern is
> right and we will need to revert it. Conceptually those two changes are
> independent anyway.
>
Ok, fair point. But what about the livelock then? Don't you think that we should
go back to something like the V1 patch to prevent it?
On 01/15/2018 03:46 PM, Michal Hocko wrote:
> On Mon 15-01-18 15:30:59, Andrey Ryabinin wrote:
>>
>>
>> On 01/12/2018 03:24 PM, Michal Hocko wrote:
>>> On Fri 12-01-18 00:59:38, Andrey Ryabinin wrote:
>>>> On 01/11/2018 07:29 PM, Michal Hocko wrote:
>>> [...]
>>>>> I do not think so. Consider that this reclaim races with other
>>>>> reclaimers. Now you are reclaiming a large chunk so you might end up
>>>>> reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
>>>>> reclaim to be negligible.
>>>>>
>>>>
> >>>> I did consider this. And I think I already explained that sort of race in a previous email.
> >>>> Whether "Task B" is really a task in the cgroup or actually a bunch of reclaimers
> >>>> doesn't matter. That doesn't change anything.
>>>
>>> I would _really_ prefer two patches here. The first one removing the
>>> hard coded reclaim count. That thing is just dubious at best. If you
>>> _really_ think that the higher reclaim target is meaningful then make
>>> it a separate patch. I am not convinced but I will not nack it either.
>>> But it will make our life much easier if my over reclaim concern is
>>> right and we will need to revert it. Conceptually those two changes are
>>> independent anyway.
>>>
>>
>> Ok, fair point. But what about the livelock then? Don't you think that we should
>> go back to something like the V1 patch to prevent it?
>
> I am not sure what you mean by the livelock here.
>
Livelock is when tasks in the cgroup constantly allocate reclaimable memory at a high rate,
and the user asks to set a too-low, unreachable limit, e.g. 'echo 4096 > memory.limit_in_bytes'.
We will loop indefinitely in mem_cgroup_resize_limit(), because try_to_free_mem_cgroup_pages() != 0
(as long as the cgroup tasks generate new reclaimable pages fast enough).
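The scenario can be modeled in a few lines of userspace C (a toy with made-up numbers that mirrors only the loop structure of the reworked mem_cgroup_resize_limit()); the iteration cap stands in for the pending signal that would end the real loop:

#include <stdio.h>

static unsigned long usage = 1000;      /* pages charged to the cgroup */

/* "Reclaim" up to nr pages; a nonzero return means progress was made. */
static unsigned long toy_reclaim(unsigned long nr)
{
        unsigned long done = nr < usage ? nr : usage;

        usage -= done;
        return done;
}

int main(void)
{
        unsigned long limit = 1;        /* the unreachable target */
        int iter;

        /* Same shape as the reworked loop: succeed, or reclaim, or fail
         * with -EBUSY on no progress.  The producer refills everything,
         * so neither exit ever fires. */
        for (iter = 0; iter < 1000; iter++) {
                if (usage <= limit) {
                        printf("limit set after %d iterations\n", iter);
                        return 0;
                }
                if (!toy_reclaim(32)) {
                        printf("-EBUSY: reclaim made no progress\n");
                        return 1;
                }
                usage += 32;            /* tasks allocate it all back */
        }
        printf("still looping: usage=%lu > limit=%lu; in the kernel only\n"
               "a pending signal would break out\n", usage, limit);
        return 0;
}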
On Mon 15-01-18 15:53:35, Andrey Ryabinin wrote:
>
>
> On 01/15/2018 03:46 PM, Michal Hocko wrote:
> > On Mon 15-01-18 15:30:59, Andrey Ryabinin wrote:
> >>
> >>
> >> On 01/12/2018 03:24 PM, Michal Hocko wrote:
> >>> On Fri 12-01-18 00:59:38, Andrey Ryabinin wrote:
> >>>> On 01/11/2018 07:29 PM, Michal Hocko wrote:
> >>> [...]
> >>>>> I do not think so. Consider that this reclaim races with other
> >>>>> reclaimers. Now you are reclaiming a large chunk so you might end up
> >>>>> reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
> >>>>> reclaim to be negligible.
> >>>>>
> >>>>
> >>>> I did consider this. And I think I already explained that sort of race in a previous email.
> >>>> Whether "Task B" is really a task in the cgroup or actually a bunch of reclaimers
> >>>> doesn't matter. That doesn't change anything.
> >>>
> >>> I would _really_ prefer two patches here. The first one removing the
> >>> hard coded reclaim count. That thing is just dubious at best. If you
> >>> _really_ think that the higher reclaim target is meaningful then make
> >>> it a separate patch. I am not convinced but I will not nack it either.
> >>> But it will make our life much easier if my over reclaim concern is
> >>> right and we will need to revert it. Conceptually those two changes are
> >>> independent anyway.
> >>>
> >>
> >> Ok, fair point. But what about the livelock then? Don't you think that we should
> >> go back to something like the V1 patch to prevent it?
> >
> > I am not sure what you mean by the livelock here.
> >
>
> Livelock is when tasks in the cgroup constantly allocate reclaimable memory at a high rate,
> and the user asks to set a too-low, unreachable limit, e.g. 'echo 4096 > memory.limit_in_bytes'.
OK, I wasn't sure. The reclaim target, however, doesn't have a direct
influence on this.
> We will loop indefinitely in mem_cgroup_resize_limit(), because try_to_free_mem_cgroup_pages() != 0
> (as long as the cgroup tasks generate new reclaimable pages fast enough).
I do not think this is a real problem. The context is interruptible and
I would even consider it safer to keep retrying than simply failing
prematurely. My experience tells me that basically any hard coded retry
loop in the kernel is wrong.
--
Michal Hocko
SUSE Labs
On Mon 15-01-18 15:30:59, Andrey Ryabinin wrote:
>
>
> On 01/12/2018 03:24 PM, Michal Hocko wrote:
> > On Fri 12-01-18 00:59:38, Andrey Ryabinin wrote:
> >> On 01/11/2018 07:29 PM, Michal Hocko wrote:
> > [...]
> >>> I do not think so. Consider that this reclaim races with other
> >>> reclaimers. Now you are reclaiming a large chunk so you might end up
> >>> reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
> >>> reclaim to be negligible.
> >>>
> >>
> >> I did consider this. And I think I already explained that sort of race in a previous email.
> >> Whether "Task B" is really a task in the cgroup or actually a bunch of reclaimers
> >> doesn't matter. That doesn't change anything.
> >
> > I would _really_ prefer two patches here. The first one removing the
> > hard coded reclaim count. That thing is just dubious at best. If you
> > _really_ think that the higher reclaim target is meaningful then make
> > it a separate patch. I am not convinced but I will not nack it either.
> > But it will make our life much easier if my over reclaim concern is
> > right and we will need to revert it. Conceptually those two changes are
> > independent anyway.
> >
>
> Ok, fair point. But what about the livelock then? Don't you think that we should
> go back to something like the V1 patch to prevent it?
I am not sure what you mean by the livelock here.
--
Michal Hocko
SUSE Labs
On Mon, Jan 15, 2018 at 4:29 AM, Andrey Ryabinin
<[email protected]> wrote:
>
>
> On 01/13/2018 01:57 AM, Shakeel Butt wrote:
>> On Fri, Jan 12, 2018 at 4:24 AM, Michal Hocko <[email protected]> wrote:
>>> On Fri 12-01-18 00:59:38, Andrey Ryabinin wrote:
>>>> On 01/11/2018 07:29 PM, Michal Hocko wrote:
>>> [...]
>>>>> I do not think so. Consider that this reclaim races with other
>>>>> reclaimers. Now you are reclaiming a large chunk so you might end up
>>>>> reclaiming more than necessary. SWAP_CLUSTER_MAX would reduce the over
>>>>> reclaim to be negligible.
>>>>>
>>>>
>>>> I did consider this. And I think I already explained that sort of race in a previous email.
>>>> Whether "Task B" is really a task in the cgroup or actually a bunch of reclaimers
>>>> doesn't matter. That doesn't change anything.
>>>
>>> I would _really_ prefer two patches here. The first one removing the
>>> hard coded reclaim count. That thing is just dubious at best. If you
>>> _really_ think that the higher reclaim target is meaningful then make
>>> it a separate patch. I am not convinced but I will not nack it either.
>>> But it will make our life much easier if my over reclaim concern is
>>> right and we will need to revert it. Conceptually those two changes are
>>> independent anyway.
>>>
>>
>> Personally I feel that the cgroup-v2 semantics are much cleaner for
>> setting limit. There is no race with the allocators in the memcg,
>> though oom-killer can be triggered. For cgroup-v1, the user does not
>> expect OOM killer and EBUSY is expected on unsuccessful reclaim. How
>> about we do something similar here and make sure oom killer can not be
>> triggered for the given memcg?
>>
>> // pseudo code
>> disable_oom(memcg)
>> old = xchg(&memcg->memory.limit, requested_limit)
>>
>> reclaim memory until usage gets below new limit or retries are exhausted
>>
>> if (unsuccessful) {
>>         reset_limit(memcg, old)
>>         ret = EBUSY
>> } else
>>         ret = 0;
>> enable_oom(memcg)
>>
>> This way there is no race with the allocators and oom killer will not
>> be triggered. The processes in the memcg can suffer but that should be
>> within the expectation of the user. One disclaimer though, disabling
>> oom for memcg needs more thought.
>
> That might be worse. If the limit is too low, all allocations (except __GFP_NOFAIL of course) will start
> failing. And the kernel is not always careful enough in -ENOMEM handling.
> Also, it's not much different from oom killing everything; the end result is almost the same -
> nothing will work in that cgroup.
>
By disabling memcg oom, I meant to treat all allocations from that
memcg as __GFP_NOFAIL while the oom is disabled. I will see if I can
convert this into actual code.
>
>> Shakeel
>>
mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
pages on each iteration. This makes it practically impossible to decrease
the limit of a memory cgroup. Tasks could easily allocate back 32 pages, so we
can't reduce memory usage, and once retry_count reaches zero we return
-EBUSY.
Easy to reproduce the problem by running the following commands:
mkdir /sys/fs/cgroup/memory/test
echo $$ >> /sys/fs/cgroup/memory/test/tasks
cat big_file > /dev/null &
sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
-bash: echo: write error: Device or resource busy
Instead of relying on retry_count, keep retrying the reclaim until the
desired limit is reached or fail if the reclaim doesn't make any progress
or a signal is pending.
Signed-off-by: Andrey Ryabinin <[email protected]>
Cc: Shakeel Butt <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Vladimir Davydov <[email protected]>
---
mm/memcontrol.c | 42 ++++++------------------------------------
1 file changed, 6 insertions(+), 36 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13aeccf32c2e..9d987f3e79dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1176,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
}
/*
- * This function returns the number of memcg under hierarchy tree. Returns
- * 1(self count) if no children.
- */
-static int mem_cgroup_count_children(struct mem_cgroup *memcg)
-{
- int num = 0;
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg)
- num++;
- return num;
-}
-
-/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
@@ -2462,24 +2448,11 @@ static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long limit, bool memsw)
{
- unsigned long curusage;
- unsigned long oldusage;
bool enlarge = false;
- int retry_count;
int ret;
bool limits_invariant;
struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
- /*
- * For keeping hierarchical_reclaim simple, how long we should retry
- * is depends on callers. We set our retry-count to be function
- * of # of children which we should visit in this loop.
- */
- retry_count = MEM_CGROUP_RECLAIM_RETRIES *
- mem_cgroup_count_children(memcg);
-
- oldusage = page_counter_read(counter);
-
do {
if (signal_pending(current)) {
ret = -EINTR;
@@ -2506,15 +2479,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw);
-
- curusage = page_counter_read(counter);
- /* Usage is reduced ? */
- if (curusage >= oldusage)
- retry_count--;
- else
- oldusage = curusage;
- } while (retry_count);
+ if (!try_to_free_mem_cgroup_pages(memcg, 1,
+ GFP_KERNEL, !memsw)) {
+ ret = -EBUSY;
+ break;
+ }
+ } while (true);
if (!ret && enlarge)
memcg_oom_recover(memcg);
--
2.13.6
Currently mem_cgroup_resize_limit() retries to set the limit after reclaiming
32 pages. It makes more sense to reclaim the needed amount of pages right away.
This works noticeably faster, especially if 'usage - limit' is big.
E.g. bringing down limit from 4G to 50M:
Before:
# perf stat echo 50M > memory.limit_in_bytes
Performance counter stats for 'echo 50M':
386.582382 task-clock (msec) # 0.835 CPUs utilized
2,502 context-switches # 0.006 M/sec
0.463244382 seconds time elapsed
After:
# perf stat echo 50M > memory.limit_in_bytes
Performance counter stats for 'echo 50M':
169.403906 task-clock (msec) # 0.849 CPUs utilized
14 context-switches # 0.083 K/sec
0.199536900 seconds time elapsed
Signed-off-by: Andrey Ryabinin <[email protected]>
Cc: Shakeel Butt <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Vladimir Davydov <[email protected]>
---
mm/memcontrol.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9d987f3e79dc..09bac2df2f12 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2448,6 +2448,7 @@ static DEFINE_MUTEX(memcg_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long limit, bool memsw)
{
+ unsigned long nr_pages;
bool enlarge = false;
int ret;
bool limits_invariant;
@@ -2479,8 +2480,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret)
break;
- if (!try_to_free_mem_cgroup_pages(memcg, 1,
- GFP_KERNEL, !memsw)) {
+ nr_pages = max_t(long, 1, page_counter_read(counter) - limit);
+ if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
+ GFP_KERNEL, !memsw)) {
ret = -EBUSY;
break;
}
--
2.13.6
On Fri 19-01-18 16:25:43, Andrey Ryabinin wrote:
> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
> pages on each iteration. This makes it practically impossible to decrease
> the limit of a memory cgroup. Tasks could easily allocate back 32 pages, so we
> can't reduce memory usage, and once retry_count reaches zero we return
> -EBUSY.
>
> Easy to reproduce the problem by running the following commands:
>
> mkdir /sys/fs/cgroup/memory/test
> echo $$ >> /sys/fs/cgroup/memory/test/tasks
> cat big_file > /dev/null &
> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> -bash: echo: write error: Device or resource busy
>
> Instead of relying on retry_count, keep retrying the reclaim until the
> desired limit is reached or fail if the reclaim doesn't make any progress
> or a signal is pending.
Thanks for splitting the original patch. I am OK with this part.
> Signed-off-by: Andrey Ryabinin <[email protected]>
> Cc: Shakeel Butt <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
Acked-by: Michal Hocko <[email protected]>
> ---
> mm/memcontrol.c | 42 ++++++------------------------------------
> 1 file changed, 6 insertions(+), 36 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 13aeccf32c2e..9d987f3e79dc 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1176,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
> }
>
> /*
> - * This function returns the number of memcg under hierarchy tree. Returns
> - * 1(self count) if no children.
> - */
> -static int mem_cgroup_count_children(struct mem_cgroup *memcg)
> -{
> - int num = 0;
> - struct mem_cgroup *iter;
> -
> - for_each_mem_cgroup_tree(iter, memcg)
> - num++;
> - return num;
> -}
> -
> -/*
> * Return the memory (and swap, if configured) limit for a memcg.
> */
> unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
> @@ -2462,24 +2448,11 @@ static DEFINE_MUTEX(memcg_limit_mutex);
> static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> unsigned long limit, bool memsw)
> {
> - unsigned long curusage;
> - unsigned long oldusage;
> bool enlarge = false;
> - int retry_count;
> int ret;
> bool limits_invariant;
> struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
>
> - /*
> - * For keeping hierarchical_reclaim simple, how long we should retry
> - * is depends on callers. We set our retry-count to be function
> - * of # of children which we should visit in this loop.
> - */
> - retry_count = MEM_CGROUP_RECLAIM_RETRIES *
> - mem_cgroup_count_children(memcg);
> -
> - oldusage = page_counter_read(counter);
> -
> do {
> if (signal_pending(current)) {
> ret = -EINTR;
> @@ -2506,15 +2479,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw);
> -
> - curusage = page_counter_read(counter);
> - /* Usage is reduced ? */
> - if (curusage >= oldusage)
> - retry_count--;
> - else
> - oldusage = curusage;
> - } while (retry_count);
> + if (!try_to_free_mem_cgroup_pages(memcg, 1,
> + GFP_KERNEL, !memsw)) {
> + ret = -EBUSY;
> + break;
> + }
> + } while (true);
>
> if (!ret && enlarge)
> memcg_oom_recover(memcg);
> --
> 2.13.6
>
> --
> To unsubscribe from this list: send the line "unsubscribe cgroups" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
Michal Hocko
SUSE Labs
On Fri 19-01-18 16:25:44, Andrey Ryabinin wrote:
> Currently mem_cgroup_resize_limit() retries to set the limit after reclaiming
> 32 pages. It makes more sense to reclaim the needed amount of pages right away.
>
> This works noticeably faster, especially if 'usage - limit' is big.
> E.g. bringing down limit from 4G to 50M:
>
> Before:
> # perf stat echo 50M > memory.limit_in_bytes
>
> Performance counter stats for 'echo 50M':
>
> 386.582382 task-clock (msec) # 0.835 CPUs utilized
> 2,502 context-switches # 0.006 M/sec
>
> 0.463244382 seconds time elapsed
>
> After:
> # perf stat echo 50M > memory.limit_in_bytes
>
> Performance counter stats for 'echo 50M':
>
> 169.403906 task-clock (msec) # 0.849 CPUs utilized
> 14 context-switches # 0.083 K/sec
>
> 0.199536900 seconds time elapsed
But I am not going to ack this one. As already stated, this has a risk
of over-reclaim if a lot of charges are freed along with this
shrinking. This is more of a theoretical concern so I am _not_ going to
nack. If we ever see such a problem then reverting this patch should be
pretty straightforward.
> Signed-off-by: Andrey Ryabinin <[email protected]>
> Cc: Shakeel Butt <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
> ---
> mm/memcontrol.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 9d987f3e79dc..09bac2df2f12 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2448,6 +2448,7 @@ static DEFINE_MUTEX(memcg_limit_mutex);
> static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> unsigned long limit, bool memsw)
> {
> + unsigned long nr_pages;
> bool enlarge = false;
> int ret;
> bool limits_invariant;
> @@ -2479,8 +2480,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - if (!try_to_free_mem_cgroup_pages(memcg, 1,
> - GFP_KERNEL, !memsw)) {
> + nr_pages = max_t(long, 1, page_counter_read(counter) - limit);
> + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
> + GFP_KERNEL, !memsw)) {
> ret = -EBUSY;
> break;
> }
> --
> 2.13.6
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
--
Michal Hocko
SUSE Labs
On Fri, Jan 19, 2018 at 5:35 AM, Michal Hocko <[email protected]> wrote:
> On Fri 19-01-18 16:25:44, Andrey Ryabinin wrote:
>> Currently mem_cgroup_resize_limit() retries to set the limit after reclaiming
>> 32 pages. It makes more sense to reclaim the needed amount of pages right away.
>>
>> This works noticeably faster, especially if 'usage - limit' is big.
>> E.g. bringing down limit from 4G to 50M:
>>
>> Before:
>> # perf stat echo 50M > memory.limit_in_bytes
>>
>> Performance counter stats for 'echo 50M':
>>
>> 386.582382 task-clock (msec) # 0.835 CPUs utilized
>> 2,502 context-switches # 0.006 M/sec
>>
>> 0.463244382 seconds time elapsed
>>
>> After:
>> # perf stat echo 50M > memory.limit_in_bytes
>>
>> Performance counter stats for 'echo 50M':
>>
>> 169.403906 task-clock (msec) # 0.849 CPUs utilized
>> 14 context-switches # 0.083 K/sec
>>
>> 0.199536900 seconds time elapsed
>
> But I am not going to ack this one. As already stated, this has a risk
> of over-reclaim if a lot of charges are freed along with this
> shrinking. This is more of a theoretical concern so I am _not_ going to
If you don't mind, can you explain why over-reclaim is a concern at
all? The only side effect of over reclaim I can think of is the job
might suffer a bit (more swapins & pageins). Shouldn't this be
within the expectation of the user decreasing the limits?
> nack. If we ever see such a problem then reverting this patch should be
> pretty straightforward.
>
>> Signed-off-by: Andrey Ryabinin <[email protected]>
Reviewed-by: Shakeel Butt <[email protected]>
>> Cc: Shakeel Butt <[email protected]>
>> Cc: Michal Hocko <[email protected]>
>> Cc: Johannes Weiner <[email protected]>
>> Cc: Vladimir Davydov <[email protected]>
>> ---
>> mm/memcontrol.c | 6 ++++--
>> 1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 9d987f3e79dc..09bac2df2f12 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -2448,6 +2448,7 @@ static DEFINE_MUTEX(memcg_limit_mutex);
>> static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>> unsigned long limit, bool memsw)
>> {
>> + unsigned long nr_pages;
>> bool enlarge = false;
>> int ret;
>> bool limits_invariant;
>> @@ -2479,8 +2480,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>> if (!ret)
>> break;
>>
>> - if (!try_to_free_mem_cgroup_pages(memcg, 1,
>> - GFP_KERNEL, !memsw)) {
>> + nr_pages = max_t(long, 1, page_counter_read(counter) - limit);
>> + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages,
>> + GFP_KERNEL, !memsw)) {
>> ret = -EBUSY;
>> break;
>> }
>> --
>> 2.13.6
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to [email protected]. For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
>
> --
> Michal Hocko
> SUSE Labs
On Fri 19-01-18 06:49:29, Shakeel Butt wrote:
> On Fri, Jan 19, 2018 at 5:35 AM, Michal Hocko <[email protected]> wrote:
> > On Fri 19-01-18 16:25:44, Andrey Ryabinin wrote:
> >> Currently mem_cgroup_resize_limit() retries to set the limit after reclaiming
> >> 32 pages. It makes more sense to reclaim the needed amount of pages right away.
> >>
> >> This works noticeably faster, especially if 'usage - limit' is big.
> >> E.g. bringing down limit from 4G to 50M:
> >>
> >> Before:
> >> # perf stat echo 50M > memory.limit_in_bytes
> >>
> >> Performance counter stats for 'echo 50M':
> >>
> >> 386.582382 task-clock (msec) # 0.835 CPUs utilized
> >> 2,502 context-switches # 0.006 M/sec
> >>
> >> 0.463244382 seconds time elapsed
> >>
> >> After:
> >> # perf stat echo 50M > memory.limit_in_bytes
> >>
> >> Performance counter stats for 'echo 50M':
> >>
> >> 169.403906 task-clock (msec) # 0.849 CPUs utilized
> >> 14 context-switches # 0.083 K/sec
> >>
> >> 0.199536900 seconds time elapsed
> >
> > But I am not going to ack this one. As already stated, this has a risk
> > of over-reclaim if a lot of charges are freed along with this
> > shrinking. This is more of a theoretical concern so I am _not_ going to
>
> If you don't mind, can you explain why over-reclaim is a concern at
> all? The only side effect of over reclaim I can think of is the job
> might suffer a bit (more swapins & pageins). Shouldn't this be
> within the expectation of the user decreasing the limits?
It is not a disaster. But it is an unexpected side effect of the
implementation. If you have a 1GB limit and want to reduce it to 500MB
then it would be quite surprising to land at 200MB just because somebody
was freeing 300MB in parallel. Is this likely? Probably not, but the more
often the limit is touched and the larger the differences are, the more likely
it is. Keep retrying in smaller amounts and you will not see the
above happening.
And to be honest, I do not really see why retrying from
mem_cgroup_resize_limit should be so much faster than retrying from
the direct reclaim path. We are doing SWAP_CLUSTER_MAX batches anyway.
mem_cgroup_resize_limit loop adds _some_ overhead but I am not really
sure why it should be that large.
--
Michal Hocko
SUSE Labs
On Fri, Jan 19, 2018 at 7:11 AM, Michal Hocko <[email protected]> wrote:
> On Fri 19-01-18 06:49:29, Shakeel Butt wrote:
>> On Fri, Jan 19, 2018 at 5:35 AM, Michal Hocko <[email protected]> wrote:
>> > On Fri 19-01-18 16:25:44, Andrey Ryabinin wrote:
>> >> Currently mem_cgroup_resize_limit() retries to set the limit after reclaiming
>> >> 32 pages. It makes more sense to reclaim the needed amount of pages right away.
>> >>
>> >> This works noticeably faster, especially if 'usage - limit' is big.
>> >> E.g. bringing down limit from 4G to 50M:
>> >>
>> >> Before:
>> >> # perf stat echo 50M > memory.limit_in_bytes
>> >>
>> >> Performance counter stats for 'echo 50M':
>> >>
>> >> 386.582382 task-clock (msec) # 0.835 CPUs utilized
>> >> 2,502 context-switches # 0.006 M/sec
>> >>
>> >> 0.463244382 seconds time elapsed
>> >>
>> >> After:
>> >> # perf stat echo 50M > memory.limit_in_bytes
>> >>
>> >> Performance counter stats for 'echo 50M':
>> >>
>> >> 169.403906 task-clock (msec) # 0.849 CPUs utilized
>> >> 14 context-switches # 0.083 K/sec
>> >>
>> >> 0.199536900 seconds time elapsed
>> >
>> > But I am not going to ack this one. As already stated, this has a risk
>> > of over-reclaim if a lot of charges are freed along with this
>> > shrinking. This is more of a theoretical concern so I am _not_ going to
>>
>> If you don't mind, can you explain why over-reclaim is a concern at
>> all? The only side effect of over reclaim I can think of is the job
>> might suffer a bit (more swapins & pageins). Shouldn't this be
>> within the expectation of the user decreasing the limits?
>
> It is not a disaster. But it is an unexpected side effect of the
> implementation. If you have a 1GB limit and want to reduce it to 500MB
> then it would be quite surprising to land at 200MB just because somebody
> was freeing 300MB in parallel. Is this likely? Probably not, but the more
> often the limit is touched and the larger the differences are, the more likely
> it is. Keep retrying in smaller amounts and you will not see the
> above happening.
>
> And to be honest, I do not really see why retrying from
> mem_cgroup_resize_limit should be so much faster than retrying from
> the direct reclaim path. We are doing SWAP_CLUSTER_MAX batches anyway.
> mem_cgroup_resize_limit loop adds _some_ overhead but I am not really
> sure why it should be that large.
>
Thanks for the explanation. Another query, we do not call
drain_all_stock() in mem_cgroup_resize_limit() but memory_max_write()
does call drain_all_stock(). Was this intentional or missed
accidentally?
On Fri 19-01-18 07:24:08, Shakeel Butt wrote:
[...]
> Thanks for the explanation. Another query, we do not call
> drain_all_stock() in mem_cgroup_resize_limit() but memory_max_write()
> does call drain_all_stock(). Was this intentional or missed
> accidentally?
I think it is just an omission. I would have to look closer but I am
just leaving now and will be back on Tuesday. This is unrelated so I
would rather discuss it in a separate email thread.
--
Michal Hocko
SUSE Labs
On 01/19/2018 04:25 PM, Andrey Ryabinin wrote:
> mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX)
> pages on each iteration. This makes it practically impossible to decrease
> the limit of a memory cgroup. Tasks could easily allocate back 32 pages, so we
> can't reduce memory usage, and once retry_count reaches zero we return
> -EBUSY.
>
> Easy to reproduce the problem by running the following commands:
>
> mkdir /sys/fs/cgroup/memory/test
> echo $$ >> /sys/fs/cgroup/memory/test/tasks
> cat big_file > /dev/null &
> sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> -bash: echo: write error: Device or resource busy
>
> Instead of relying on retry_count, keep retrying the reclaim until the
> desired limit is reached or fail if the reclaim doesn't make any progress
> or a signal is pending.
>
> Signed-off-by: Andrey Ryabinin <[email protected]>
> Cc: Shakeel Butt <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
Andrew, are you ok to pick up the patch?
On Fri, 19 Jan 2018 16:11:18 +0100 Michal Hocko <[email protected]> wrote:
> And to be honest, I do not really see why retrying from
> mem_cgroup_resize_limit should be so much faster than retrying from
> the direct reclaim path. We are doing SWAP_CLUSTER_MAX batches anyway.
> mem_cgroup_resize_limit loop adds _some_ overhead but I am not really
> sure why it should be that large.
Maybe restarting the scan lots of times results in rescanning lots of
ineligible pages at the start of the list before doing useful work?
Andrey, are you able to determine where all that CPU time is being spent?
Thanks.
On 02/21/2018 11:17 PM, Andrew Morton wrote:
> On Fri, 19 Jan 2018 16:11:18 +0100 Michal Hocko <[email protected]> wrote:
>
>> And to be honest, I do not really see why retrying from
>> mem_cgroup_resize_limit should be so much faster than retrying from
>> the direct reclaim path. We are doing SWAP_CLUSTER_MAX batches anyway.
>> mem_cgroup_resize_limit loop adds _some_ overhead but I am not really
>> sure why it should be that large.
>
> Maybe restarting the scan lots of times results in rescanning lots of
> ineligible pages at the start of the list before doing useful work?
>
> Andrey, are you able to determine where all that CPU time is being spent?
>
I should have been more specific about the test I did. The full script looks like this:
mkdir -p /sys/fs/cgroup/memory/test
echo $$ > /sys/fs/cgroup/memory/test/tasks
cat 4G_file > /dev/null
while true; do cat 4G_file > /dev/null; done &
loop_pid=$!
perf stat echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
echo -1 > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
kill $loop_pid
I think the additional loops add some overhead and it's not that big by itself, but
this small overhead allows the task to refill slightly more pages, increasing
the total amount of pages that mem_cgroup_resize_limit() needs to reclaim.
By using the following commands to show the amount of reclaimed pages:
perf record -e vmscan:mm_vmscan_memcg_reclaim_end echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
perf script|cut -d '=' -f 2| paste -sd+ |bc
I've got 1259841 pages (4.9G) with the patch vs 1394312 pages (5.4G) without it.
On Thu 22-02-18 16:50:33, Andrey Ryabinin wrote:
> On 02/21/2018 11:17 PM, Andrew Morton wrote:
> > On Fri, 19 Jan 2018 16:11:18 +0100 Michal Hocko <[email protected]> wrote:
> >
> >> And to be honest, I do not really see why retrying from
> >> mem_cgroup_resize_limit should be so much faster than retrying from
> >> the direct reclaim path. We are doing SWAP_CLUSTER_MAX batches anyway.
> >> mem_cgroup_resize_limit loop adds _some_ overhead but I am not really
> >> sure why it should be that large.
> >
> > Maybe restarting the scan lots of times results in rescanning lots of
> > ineligible pages at the start of the list before doing useful work?
> >
> > Andrey, are you able to determine where all that CPU time is being spent?
> >
>
> I should have been more specific about the test I did. The full script looks like this:
>
> mkdir -p /sys/fs/cgroup/memory/test
> echo $$ > /sys/fs/cgroup/memory/test/tasks
> cat 4G_file > /dev/null
> while true; do cat 4G_file > /dev/null; done &
> loop_pid=$!
> perf stat echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> echo -1 > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> kill $loop_pid
>
>
> I think the additional loops add some overhead and it's not that big by itself, but
> this small overhead allows the task to refill slightly more pages, increasing
> the total amount of pages that mem_cgroup_resize_limit() needs to reclaim.
>
> By using the following commands to show the amount of reclaimed pages:
> perf record -e vmscan:mm_vmscan_memcg_reclaim_end echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> perf script|cut -d '=' -f 2| paste -sd+ |bc
>
> I've got 1259841 pages (4.9G) with the patch vs 1394312 pages (5.4G) without it.
So how does the picture change if you have multiple producers?
--
Michal Hocko
SUSE Labs
On 02/22/2018 05:09 PM, Michal Hocko wrote:
> On Thu 22-02-18 16:50:33, Andrey Ryabinin wrote:
>> On 02/21/2018 11:17 PM, Andrew Morton wrote:
>>> On Fri, 19 Jan 2018 16:11:18 +0100 Michal Hocko <[email protected]> wrote:
>>>
>>>> And to be honest, I do not really see why retrying from
>>>> mem_cgroup_resize_limit should be so much faster than retrying from
>>>> the direct reclaim path. We are doing SWAP_CLUSTER_MAX batches anyway.
>>>> mem_cgroup_resize_limit loop adds _some_ overhead but I am not really
>>>> sure why it should be that large.
>>>
>>> Maybe restarting the scan lots of times results in rescanning lots of
>>> ineligible pages at the start of the list before doing useful work?
>>>
>>> Andrey, are you able to determine where all that CPU time is being spent?
>>>
>>
>> I should have been more specific about the test I did. The full script looks like this:
>>
>> mkdir -p /sys/fs/cgroup/memory/test
>> echo $$ > /sys/fs/cgroup/memory/test/tasks
>> cat 4G_file > /dev/null
>> while true; do cat 4G_file > /dev/null; done &
>> loop_pid=$!
>> perf stat echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>> echo -1 > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>> kill $loop_pid
>>
>>
>> I think the additional loops add some overhead and it's not that big by itself, but
>> this small overhead allows the task to refill slightly more pages, increasing
>> the total amount of pages that mem_cgroup_resize_limit() needs to reclaim.
>>
>> By using the following commands to show the amount of reclaimed pages:
>> perf record -e vmscan:mm_vmscan_memcg_reclaim_end echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>> perf script|cut -d '=' -f 2| paste -sd+ |bc
>>
>> I've got 1259841 pages (4.9G) with the patch vs 1394312 pages (5.4G) without it.
>
> So how does the picture change if you have multiple producers?
>
Drastically, in favor of the patch. But the numbers are *very* fickle from run to run.
Inside 5G vm with 4 cpus (qemu -m 5G -smp 4) and 4 processes in cgroup reading 1G files:
"while true; do cat /1g_f$i > /dev/null; done &"
with the patch:
best: 1.04 secs, 9.7G reclaimed
worst: 2.2 secs, 16G reclaimed.
without:
best: 5.4 sec, 35G reclaimed
worst: 22.2 sec, 136G reclaimed
On Thu 22-02-18 18:13:11, Andrey Ryabinin wrote:
>
>
> On 02/22/2018 05:09 PM, Michal Hocko wrote:
> > On Thu 22-02-18 16:50:33, Andrey Ryabinin wrote:
> >> On 02/21/2018 11:17 PM, Andrew Morton wrote:
> >>> On Fri, 19 Jan 2018 16:11:18 +0100 Michal Hocko <[email protected]> wrote:
> >>>
> >>>> And to be honest, I do not really see why retrying from
> >>>> mem_cgroup_resize_limit should be so much faster than retrying from
> >>>> the direct reclaim path. We are doing SWAP_CLUSTER_MAX batches anyway.
> >>>> mem_cgroup_resize_limit loop adds _some_ overhead but I am not really
> >>>> sure why it should be that large.
> >>>
> >>> Maybe restarting the scan lots of times results in rescanning lots of
> >>> ineligible pages at the start of the list before doing useful work?
> >>>
> >>> Andrey, are you able to determine where all that CPU time is being spent?
> >>>
> >>
> >> I should have been more specific about the test I did. The full script looks like this:
> >>
> >> mkdir -p /sys/fs/cgroup/memory/test
> >> echo $$ > /sys/fs/cgroup/memory/test/tasks
> >> cat 4G_file > /dev/null
> >> while true; do cat 4G_file > /dev/null; done &
> >> loop_pid=$!
> >> perf stat echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> >> echo -1 > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> >> kill $loop_pid
> >>
> >>
> >> I think the additional loops add some overhead and it's not that big by itself, but
> >> this small overhead allows the task to refill slightly more pages, increasing
> >> the total amount of pages that mem_cgroup_resize_limit() needs to reclaim.
> >>
> >> By using the following commands to show the amount of reclaimed pages:
> >> perf record -e vmscan:mm_vmscan_memcg_reclaim_end echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> >> perf script|cut -d '=' -f 2| paste -sd+ |bc
> >>
> >> I've got 1259841 pages (4.9G) with the patch vs 1394312 pages (5.4G) without it.
> >
> > So how does the picture change if you have multiple producers?
> >
>
> Drastically, in favor of the patch. But the numbers are *very* fickle from run to run.
>
> Inside 5G vm with 4 cpus (qemu -m 5G -smp 4) and 4 processes in cgroup reading 1G files:
> "while true; do cat /1g_f$i > /dev/null; done &"
>
> with the patch:
> best: 1.04 secs, 9.7G reclaimed
> worst: 2.2 secs, 16G reclaimed.
>
> without:
> best: 5.4 sec, 35G reclaimed
> worst: 22.2 sec, 136G reclaimed
Could you also compare how much memory we reclaim with/without the
patch?
--
Michal Hocko
SUSE Labs
On 02/22/2018 06:33 PM, Michal Hocko wrote:
> On Thu 22-02-18 18:13:11, Andrey Ryabinin wrote:
>>
>>
>> On 02/22/2018 05:09 PM, Michal Hocko wrote:
>>> On Thu 22-02-18 16:50:33, Andrey Ryabinin wrote:
>>>> On 02/21/2018 11:17 PM, Andrew Morton wrote:
>>>>> On Fri, 19 Jan 2018 16:11:18 +0100 Michal Hocko <[email protected]> wrote:
>>>>>
>>>>>> And to be honest, I do not really see why retrying from
>>>>>> mem_cgroup_resize_limit should be so much faster than retrying from
>>>>>> the direct reclaim path. We are doing SWAP_CLUSTER_MAX batches anyway.
>>>>>> mem_cgroup_resize_limit loop adds _some_ overhead but I am not really
>>>>>> sure why it should be that large.
>>>>>
>>>>> Maybe restarting the scan lots of times results in rescanning lots of
>>>>> ineligible pages at the start of the list before doing useful work?
>>>>>
>>>>> Andrey, are you able to determine where all that CPU time is being spent?
>>>>>
>>>>
>>>> I should have been more specific about the test I did. The full script looks like this:
>>>>
>>>> mkdir -p /sys/fs/cgroup/memory/test
>>>> echo $$ > /sys/fs/cgroup/memory/test/tasks
>>>> cat 4G_file > /dev/null
>>>> while true; do cat 4G_file > /dev/null; done &
>>>> loop_pid=$!
>>>> perf stat echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>>>> echo -1 > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>>>> kill $loop_pid
>>>>
>>>>
>>>> I think the additional loops add some overhead and it's not that big by itself, but
>>>> this small overhead allows the task to refill slightly more pages, increasing
>>>> the total amount of pages that mem_cgroup_resize_limit() needs to reclaim.
>>>>
>>>> By using the following commands to show the amount of reclaimed pages:
>>>> perf record -e vmscan:mm_vmscan_memcg_reclaim_end echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
>>>> perf script|cut -d '=' -f 2| paste -sd+ |bc
>>>>
>>>> I've got 1259841 pages (4.9G) with the patch vs 1394312 pages (5.4G) without it.
>>>
>>> So how does the picture change if you have multiple producers?
>>>
>>
>> Drastically, in favor of the patch. But the numbers are *very* fickle from run to run.
>>
>> Inside 5G vm with 4 cpus (qemu -m 5G -smp 4) and 4 processes in cgroup reading 1G files:
>> "while true; do cat /1g_f$i > /dev/null; done &"
>>
>> with the patch:
>> best: 1.04 secs, 9.7G reclaimed
>> worst: 2.2 secs, 16G reclaimed.
>>
>> without:
>> best: 5.4 sec, 35G reclaimed
>> worst: 22.2 sec, 136G reclaimed
>
> Could you also compare how much memory we reclaim with/without the
> patch?
>
I did and I wrote the results. Please look again.
On Thu 22-02-18 18:38:11, Andrey Ryabinin wrote:
>
>
> On 02/22/2018 06:33 PM, Michal Hocko wrote:
> > On Thu 22-02-18 18:13:11, Andrey Ryabinin wrote:
> >> [...]
> >> Drastically, in favor of the patch. But the numbers are *very* fickle from run to run.
> >>
> >> Inside a 5G VM with 4 CPUs (qemu -m 5G -smp 4) and 4 processes in the cgroup reading 1G files:
> >> "while true; do cat /1g_f$i > /dev/null; done &"
> >>
> >> with the patch:
> >> best: 1.04 secs, 9.7G reclaimed
> >> worst: 2.2 secs, 16G reclaimed.
> >>
> >> without:
> >> best: 5.4 sec, 35G reclaimed
> >> worst: 22.2 sec, 136G reclaimed
> >
> > Could you also compare how much memory we reclaim with/without the
> > patch?
> >
>
> I did and I wrote the results. Please look again.
I must have forgotten. Care to point me to the message-id?
[email protected] doesn't contain this
information and a quick glance over the follow up thread doesn't have
anything as well. Ideally, this should be in the patch changelog, btw.
--
Michal Hocko
SUSE Labs
On 02/22/2018 06:44 PM, Michal Hocko wrote:
> On Thu 22-02-18 18:38:11, Andrey Ryabinin wrote:
>>>>
>>>> with the patch:
>>>> best: 1.04 secs, 9.7G reclaimed
>>>> worst: 2.2 secs, 16G reclaimed.
>>>>
>>>> without:
>>>> best: 5.4 sec, 35G reclaimed
>>>> worst: 22.2 sec, 136G reclaimed
>>>
>>> Could you also compare how much memory we reclaim with/without the
>>> patch?
>>>
>>
>> I did and I wrote the results. Please look again.
>
> I must have forgotten. Care to point me to the message-id?
The results are quoted right above, literally above; raise your eyes up.
The message-id is [email protected].
I'll write them here again:
with the patch:
best: 9.7G reclaimed
worst: 16G reclaimed
without:
best: 35G reclaimed
worst: 136G reclaimed
Or are you asking about something else? If so, I don't understand what you want.
> [email protected] doesn't contain this
> information and a quick glance over the follow up thread doesn't have
> anything as well. Ideally, this should be in the patch changelog, btw.
>
Well, I did these measurements only today, and I don't have a time machine.
On Thu 22-02-18 19:01:58, Andrey Ryabinin wrote:
>
>
> On 02/22/2018 06:44 PM, Michal Hocko wrote:
> > On Thu 22-02-18 18:38:11, Andrey Ryabinin wrote:
>
> >>>> [...]
> >>>
> >>> Could you also compare how much memory we reclaim with/without the
> >>> patch?
> >>>
> >>
> >> I did and I wrote the results. Please look again.
> >
> > I must have forgotten. Care to point me to the message-id?
>
> The results are quoted right above, literally above; raise your eyes
> up. The message-id is [email protected].
OK, I see. We were talking about two different things, I guess.
> I'll write them here again:
>
> with the patch:
> best: 9.7G reclaimed
> worst: 16G reclaimed
>
> without:
> best: 35G reclaimed
> worst: 136G reclaimed
>
> Or are you asking about something else? If so, I don't understand what
> you want.
Well, those numbers do not tell us much, right? You have 4 concurrent
readers, each reading its own 1G file in a loop. The longer you keep that
running, the more pages you reclaim, of course. But you are not comparing
the same amount of work.
My main concern about the patch is that it might over-reclaim a lot if
we have a workload which also frees memory rather than constantly adding
more easily reclaimable page cache. I realize such a test is not easy
to construct.
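
A hypothetical workload in that spirit, for illustration only: a producer
that uncharges its own pages by cycling a file on tmpfs, so the cgroup's
usage also drops without reclaim (assumes /tmp is tmpfs and the shell sits
in the test cgroup):

while true; do
	head -c 256M /dev/zero > /tmp/ballast   # charges ~256M to the memcg
	rm /tmp/ballast                         # the workload uncharges it again
done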
I have already said that I will not block the patch, but it should at
least be explained why a larger batch makes a difference.
--
Michal Hocko
SUSE Labs