2018-04-05 19:02:28

by Roman Gushchin

Subject: [PATCH v3 1/4] mm: rename page_counter's count/limit into usage/max

This patch renames struct page_counter fields:
count -> usage
limit -> max

and the corresponding functions:
page_counter_limit() -> page_counter_set_max()
mem_cgroup_get_limit() -> mem_cgroup_get_max()
mem_cgroup_resize_limit() -> mem_cgroup_resize_max()
memcg_update_kmem_limit() -> memcg_update_kmem_max()
memcg_update_tcp_limit() -> memcg_update_tcp_max()

The idea behind this renaming is to have a direct match between
the memory cgroup knobs (low, high, max) and the page_counter API.

This is a pure rename; the patch doesn't bring any functional change.
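
For illustration, a hypothetical call site under the new naming might
look like this (a sketch only; example_cap() is not part of the patch):

    #include <linux/page_counter.h>
    #include <linux/printk.h>

    /*
     * Cap a counter at 1024 pages; same return convention as the old
     * page_counter_limit(): 0 on success, -EBUSY if usage is already
     * above the new maximum.
     */
    static int example_cap(struct page_counter *counter)
    {
            int err;

            /* was: page_counter_limit(counter, 1024); */
            err = page_counter_set_max(counter, 1024);
            if (err)
                    return err;

            pr_info("usage: %lu pages\n", page_counter_read(counter));
            return 0;
    }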

Signed-off-by: Roman Gushchin <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
include/linux/memcontrol.h | 4 +-
include/linux/page_counter.h | 12 ++---
mm/hugetlb_cgroup.c | 6 +--
mm/memcontrol.c | 112 +++++++++++++++++++++----------------------
mm/oom_kill.c | 2 +-
mm/page_counter.c | 28 +++++------
6 files changed, 82 insertions(+), 82 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 44422e1d3def..caa31cc09e7e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -475,7 +475,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,

void mem_cgroup_handle_over_high(void);

-unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg);
+unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);

void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p);
@@ -882,7 +882,7 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
return 0;
}

-static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
return 0;
}
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index c15ab80ad32d..94029dad9317 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -7,8 +7,8 @@
#include <asm/page.h>

struct page_counter {
- atomic_long_t count;
- unsigned long limit;
+ atomic_long_t usage;
+ unsigned long max;
struct page_counter *parent;

/* legacy */
@@ -25,14 +25,14 @@ struct page_counter {
static inline void page_counter_init(struct page_counter *counter,
struct page_counter *parent)
{
- atomic_long_set(&counter->count, 0);
- counter->limit = PAGE_COUNTER_MAX;
+ atomic_long_set(&counter->usage, 0);
+ counter->max = PAGE_COUNTER_MAX;
counter->parent = parent;
}

static inline unsigned long page_counter_read(struct page_counter *counter)
{
- return atomic_long_read(&counter->count);
+ return atomic_long_read(&counter->usage);
}

void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
@@ -41,7 +41,7 @@ bool page_counter_try_charge(struct page_counter *counter,
unsigned long nr_pages,
struct page_counter **fail);
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
-int page_counter_limit(struct page_counter *counter, unsigned long limit);
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
int page_counter_memparse(const char *buf, const char *max,
unsigned long *nr_pages);

diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index eec1150125b9..68c2f2f3c05b 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,

limit = round_down(PAGE_COUNTER_MAX,
1 << huge_page_order(&hstates[idx]));
- ret = page_counter_limit(counter, limit);
+ ret = page_counter_set_max(counter, limit);
VM_BUG_ON(ret);
}
}
@@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
case RES_USAGE:
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->limit * PAGE_SIZE;
+ return (u64)counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
@@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_LIMIT:
mutex_lock(&hugetlb_limit_mutex);
- ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
+ ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
mutex_unlock(&hugetlb_limit_mutex);
break;
default:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 18dda3f113bd..50d1ad6a8fdb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1033,13 +1033,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
unsigned long limit;

count = page_counter_read(&memcg->memory);
- limit = READ_ONCE(memcg->memory.limit);
+ limit = READ_ONCE(memcg->memory.max);
if (count < limit)
margin = limit - count;

if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
- limit = READ_ONCE(memcg->memsw.limit);
+ limit = READ_ONCE(memcg->memsw.max);
if (count <= limit)
margin = min(margin, limit - count);
else
@@ -1147,13 +1147,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)

pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)memcg->memory.limit), memcg->memory.failcnt);
+ K((u64)memcg->memory.max), memcg->memory.failcnt);
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);

for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats for ");
@@ -1178,21 +1178,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
-unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long limit;
+ unsigned long max;

- limit = memcg->memory.limit;
+ max = memcg->memory.max;
if (mem_cgroup_swappiness(memcg)) {
- unsigned long memsw_limit;
- unsigned long swap_limit;
+ unsigned long memsw_max;
+ unsigned long swap_max;

- memsw_limit = memcg->memsw.limit;
- swap_limit = memcg->swap.limit;
- swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
- limit = min(limit + swap_limit, memsw_limit);
+ memsw_max = memcg->memsw.max;
+ swap_max = memcg->swap.max;
+ swap_max = min(swap_max, (unsigned long)total_swap_pages);
+ max = min(max + swap_max, memsw_max);
}
- return limit;
+ return max;
}

static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -2458,10 +2458,10 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif

-static DEFINE_MUTEX(memcg_limit_mutex);
+static DEFINE_MUTEX(memcg_max_mutex);

-static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
- unsigned long limit, bool memsw)
+static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
+ unsigned long max, bool memsw)
{
bool enlarge = false;
int ret;
@@ -2474,22 +2474,22 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
break;
}

- mutex_lock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
/*
* Make sure that the new limit (memsw or memory limit) doesn't
- * break our basic invariant rule memory.limit <= memsw.limit.
+ * break our basic invariant rule memory.max <= memsw.max.
*/
- limits_invariant = memsw ? limit >= memcg->memory.limit :
- limit <= memcg->memsw.limit;
+ limits_invariant = memsw ? max >= memcg->memory.max :
+ max <= memcg->memsw.max;
if (!limits_invariant) {
- mutex_unlock(&memcg_limit_mutex);
+ mutex_unlock(&memcg_max_mutex);
ret = -EINVAL;
break;
}
- if (limit > counter->limit)
+ if (max > counter->max)
enlarge = true;
- ret = page_counter_limit(counter, limit);
- mutex_unlock(&memcg_limit_mutex);
+ ret = page_counter_set_max(counter, max);
+ mutex_unlock(&memcg_max_mutex);

if (!ret)
break;
@@ -2989,7 +2989,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->limit * PAGE_SIZE;
+ return (u64)counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
@@ -3103,24 +3103,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
#endif /* !CONFIG_SLOB */

-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int memcg_update_kmem_max(struct mem_cgroup *memcg,
+ unsigned long max)
{
int ret;

- mutex_lock(&memcg_limit_mutex);
- ret = page_counter_limit(&memcg->kmem, limit);
- mutex_unlock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
+ ret = page_counter_set_max(&memcg->kmem, max);
+ mutex_unlock(&memcg_max_mutex);
return ret;
}

-static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
+static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
int ret;

- mutex_lock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);

- ret = page_counter_limit(&memcg->tcpmem, limit);
+ ret = page_counter_set_max(&memcg->tcpmem, max);
if (ret)
goto out;

@@ -3145,7 +3145,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
memcg->tcpmem_active = true;
}
out:
- mutex_unlock(&memcg_limit_mutex);
+ mutex_unlock(&memcg_max_mutex);
return ret;
}

@@ -3173,16 +3173,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
case _MEM:
- ret = mem_cgroup_resize_limit(memcg, nr_pages, false);
+ ret = mem_cgroup_resize_max(memcg, nr_pages, false);
break;
case _MEMSWAP:
- ret = mem_cgroup_resize_limit(memcg, nr_pages, true);
+ ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
- ret = memcg_update_kmem_limit(memcg, nr_pages);
+ ret = memcg_update_kmem_max(memcg, nr_pages);
break;
case _TCP:
- ret = memcg_update_tcp_limit(memcg, nr_pages);
+ ret = memcg_update_tcp_max(memcg, nr_pages);
break;
}
break;
@@ -3358,8 +3358,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
- memory = min(memory, mi->memory.limit);
- memsw = min(memsw, mi->memsw.limit);
+ memory = min(memory, mi->memory.max);
+ memsw = min(memsw, mi->memsw.max);
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
@@ -3858,7 +3858,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pheadroom = PAGE_COUNTER_MAX;

while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+ unsigned long ceiling = min(memcg->memory.max, memcg->high);
unsigned long used = page_counter_read(&memcg->memory);

*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4548,12 +4548,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

- page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
memcg->low = 0;
+ page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
memcg_wb_domain_size_changed(memcg);
@@ -5360,7 +5360,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
static int memory_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->memory.limit);
+ unsigned long max = READ_ONCE(memcg->memory.max);

if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5384,7 +5384,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;

- xchg(&memcg->memory.limit, max);
+ xchg(&memcg->memory.max, max);

for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -6331,7 +6331,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
return nr_swap_pages;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
- READ_ONCE(memcg->swap.limit) -
+ READ_ONCE(memcg->swap.max) -
page_counter_read(&memcg->swap));
return nr_swap_pages;
}
@@ -6352,7 +6352,7 @@ bool mem_cgroup_swap_full(struct page *page)
return false;

for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
+ if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
return true;

return false;
@@ -6386,7 +6386,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
static int swap_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->swap.limit);
+ unsigned long max = READ_ONCE(memcg->swap.max);

if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -6408,9 +6408,9 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
if (err)
return err;

- mutex_lock(&memcg_limit_mutex);
- err = page_counter_limit(&memcg->swap, max);
- mutex_unlock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
+ err = page_counter_set_max(&memcg->swap, max);
+ mutex_unlock(&memcg_max_mutex);
if (err)
return err;

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dfd370526909..a420a703b8d5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
int nid;

if (is_memcg_oom(oc)) {
- oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
+ oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
return CONSTRAINT_MEMCG;
}

diff --git a/mm/page_counter.c b/mm/page_counter.c
index 2a8df3ad60a4..41937c9a9d11 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -22,7 +22,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
long new;

- new = atomic_long_sub_return(nr_pages, &counter->count);
+ new = atomic_long_sub_return(nr_pages, &counter->usage);
/* More uncharges than charges? */
WARN_ON_ONCE(new < 0);
}
@@ -41,7 +41,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
for (c = counter; c; c = c->parent) {
long new;

- new = atomic_long_add_return(nr_pages, &c->count);
+ new = atomic_long_add_return(nr_pages, &c->usage);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -82,9 +82,9 @@ bool page_counter_try_charge(struct page_counter *counter,
* we either see the new limit or the setter sees the
* counter has changed and retries.
*/
- new = atomic_long_add_return(nr_pages, &c->count);
- if (new > c->limit) {
- atomic_long_sub(nr_pages, &c->count);
+ new = atomic_long_add_return(nr_pages, &c->usage);
+ if (new > c->max) {
+ atomic_long_sub(nr_pages, &c->usage);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -123,20 +123,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
}

/**
- * page_counter_limit - limit the number of pages allowed
+ * page_counter_set_max - set the maximum number of pages allowed
* @counter: counter
- * @limit: limit to set
+ * @nr_pages: limit to set
*
* Returns 0 on success, -EBUSY if the current number of pages on the
* counter already exceeds the specified limit.
*
* The caller must serialize invocations on the same counter.
*/
-int page_counter_limit(struct page_counter *counter, unsigned long limit)
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
for (;;) {
unsigned long old;
- long count;
+ long usage;

/*
* Update the limit while making sure that it's not
@@ -149,17 +149,17 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit)
* the limit, so if it sees the old limit, we see the
* modified counter and retry.
*/
- count = atomic_long_read(&counter->count);
+ usage = atomic_long_read(&counter->usage);

- if (count > limit)
+ if (usage > nr_pages)
return -EBUSY;

- old = xchg(&counter->limit, limit);
+ old = xchg(&counter->max, nr_pages);

- if (atomic_long_read(&counter->count) <= count)
+ if (atomic_long_read(&counter->usage) <= usage)
return 0;

- counter->limit = old;
+ counter->max = old;
cond_resched();
}
}
--
2.14.3



2018-04-05 19:01:45

by Roman Gushchin

Subject: [PATCH v3 4/4] mm/docs: describe memory.low refinements

Refine cgroup v2 docs after latest memory.low changes.

Signed-off-by: Roman Gushchin <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
Documentation/cgroup-v2.txt | 28 +++++++++++++---------------
1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index f728e55602b2..7ee462b8a6ac 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1006,10 +1006,17 @@ PAGE_SIZE multiple when read back.
A read-write single value file which exists on non-root
cgroups. The default is "0".

- Best-effort memory protection. If the memory usages of a
- cgroup and all its ancestors are below their low boundaries,
- the cgroup's memory won't be reclaimed unless memory can be
- reclaimed from unprotected cgroups.
+ Best-effort memory protection. If the memory usage of a
+ cgroup is within its effective low boundary, the cgroup's
+ memory won't be reclaimed unless memory can be reclaimed
+ from unprotected cgroups.
+
+ Effective low boundary is limited by the memory.low values of
+ all ancestor cgroups. If there is memory.low overcommitment
+ (the child cgroup or cgroups require more protected memory
+ than the parent will allow), then each child cgroup gets
+ a part of the parent's protection proportional to its
+ actual memory usage below memory.low.

Putting more memory than generally available under this
protection is discouraged.
@@ -2008,17 +2015,8 @@ system performance due to overreclaim, to the point where the feature
becomes self-defeating.

The memory.low boundary on the other hand is a top-down allocated
-reserve. A cgroup enjoys reclaim protection when it and all its
-ancestors are below their low boundaries, which makes delegation of
-subtrees possible. Secondly, new cgroups have no reserve per default
-and in the common case most cgroups are eligible for the preferred
-reclaim pass. This allows the new low boundary to be efficiently
-implemented with just a minor addition to the generic reclaim code,
-without the need for out-of-band data structures and reclaim passes.
-Because the generic reclaim code considers all cgroups except for the
-ones running low in the preferred first reclaim pass, overreclaim of
-individual groups is eliminated as well, resulting in much better
-overall workload performance.
+reserve. A cgroup enjoys reclaim protection when it's within its
+effective low boundary, which makes delegation of subtrees possible.

The original high boundary, the hard limit, is defined as a strict
limit that can not budge, even if the OOM killer has to be called.
--
2.14.3


2018-04-05 19:03:44

by Roman Gushchin

Subject: [PATCH v3 3/4] mm: treat memory.low value inclusive

If memcg's usage is equal to the memory.low value, avoid reclaiming
from this cgroup while there is a surplus of reclaimable memory.

This sounds more logical and also matches memory.high and memory.max
behavior: both are inclusive.
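
A minimal userspace sketch of the boundary change (illustrative only,
not the kernel code):

    #include <stdio.h>
    #include <stdbool.h>

    /* old semantics: usage exactly at the boundary is not protected */
    static bool low_exclusive(unsigned long usage, unsigned long elow)
    {
            return usage < elow;
    }

    /* new semantics: usage exactly at the boundary is protected */
    static bool low_inclusive(unsigned long usage, unsigned long elow)
    {
            return usage <= elow;
    }

    int main(void)
    {
            unsigned long nr = 1UL << 18;   /* usage == memory.low, in pages */

            printf("exclusive: %d\n", low_exclusive(nr, nr)); /* 0: reclaimed */
            printf("inclusive: %d\n", low_inclusive(nr, nr)); /* 1: protected */
            return 0;
    }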

Signed-off-by: Roman Gushchin <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
mm/memcontrol.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 78cf21f2a943..1cd6e9bf24f2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5608,14 +5608,14 @@ struct cgroup_subsys memory_cgrp_subsys = {
};

/**
- * mem_cgroup_low - check if memory consumption is below the normal range
+ * mem_cgroup_low - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
*
- * Returns %true if memory consumption of @memcg is below the normal range.
+ * Returns %true if memory consumption of @memcg is in the normal range.
*
* @root is exclusive; it is never low when looked at directly
*
@@ -5709,7 +5709,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
elow = min(elow, parent_elow * low_usage / siblings_low_usage);
exit:
memcg->memory.elow = elow;
- return usage < elow;
+ return usage <= elow;
}

/**
--
2.14.3


2018-04-05 19:05:34

by Roman Gushchin

Subject: [PATCH v3 2/4] mm: memory.low hierarchical behavior

This patch addresses an issue in the current memory.low semantics
that makes it hard to use in a hierarchy where some leaf memory
cgroups are more valuable than others.

For example, there are memcgs A, A/B, A/C, A/D and A/E:

    A      A/memory.low = 2G, A/memory.current = 6G
   //\\
  BC  DE   B/memory.low = 3G  B/memory.current = 2G
           C/memory.low = 1G  C/memory.current = 2G
           D/memory.low = 0   D/memory.current = 2G
           E/memory.low = 10G E/memory.current = 0

If we apply memory pressure, B, C and D are reclaimed at
the same pace while A's usage exceeds 2G.
This is obviously wrong, as B's usage is fully below B's memory.low,
and C has 1G of protection as well.
Also, A is pushed below its 2G memory.low, which is also wrong.

A simple bash script (provided below) can be used to reproduce
the problem. Current results are:
A: 1430097920
A/B: 711929856
A/C: 717426688
A/D: 741376
A/E: 0

To address the issue, a concept of effective memory.low is introduced.
Effective memory.low is always less than or equal to the original
memory.low. When there is no memory.low overcommitment (and also for
top-level cgroups), these two values are equal.
Otherwise it's a share of the parent's effective memory.low, calculated
as the cgroup's memory.low usage divided by the sum of the siblings'
memory.low usages (where memory.low usage means the size of actually
protected memory: memory.current if memory.current < memory.low,
0 otherwise).
It's necessary to track the actual usage, because otherwise an empty
cgroup with memory.low set (A/E in the example above) would affect the
actual memory distribution, which makes no sense. To avoid traversing
the cgroup tree twice, the page_counter code is reused.

Calculating effective memory.low can be done in the reclaim path, as
we are already traversing the cgroup tree from top to bottom and
checking memory.low at each level. So it's a natural place to calculate
the effective memory.low and save it for use by the children cgroups.

This also eliminates the need to traverse the cgroup tree from bottom
to top each time to check whether the parent's guarantee is exceeded.

Setting/resetting effective memory.low is intentionally racy, but
it's fine and shouldn't lead to any significant differences in
actual memory distribution.

With this patch applied, the results match the expectations:
A: 2147930112
A/B: 1428721664
A/C: 718393344
A/D: 815104
A/E: 0
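
A quick userspace sanity check (an illustrative sketch, not part of the
patch) confirms that these steady-state numbers match the elow
definition; the min() against memory.low is omitted since both children
end up below their own memory.low:

    #include <stdio.h>

    int main(void)
    {
            const double G = 1UL << 30;
            double parent_elow = 2 * G;  /* A is top-level: elow == memory.low */
            double usage_b = 1428721664; /* below B/memory.low = 3G */
            double usage_c = 718393344;  /* below C/memory.low = 1G */
            /* low_usage == usage for B and C; D and E contribute ~0 */
            double siblings_low_usage = usage_b + usage_c;

            /* elow = min(memory.low, parent_elow * low_usage / siblings_low_usage) */
            printf("B: elow ~ %.2fG vs usage %.2fG\n",
                   parent_elow * usage_b / siblings_low_usage / G, usage_b / G);
            printf("C: elow ~ %.2fG vs usage %.2fG\n",
                   parent_elow * usage_c / siblings_low_usage / G, usage_c / G);
            return 0;
    }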

Test script:
#!/bin/bash

CGPATH="/sys/fs/cgroup"

truncate /file1 --size 2G
truncate /file2 --size 2G
truncate /file3 --size 2G
truncate /file4 --size 50G

mkdir "${CGPATH}/A"
echo "+memory" > "${CGPATH}/A/cgroup.subtree_control"
mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"

echo 2G > "${CGPATH}/A/memory.low"
echo 3G > "${CGPATH}/A/B/memory.low"
echo 1G > "${CGPATH}/A/C/memory.low"
echo 0 > "${CGPATH}/A/D/memory.low"
echo 10G > "${CGPATH}/A/E/memory.low"

echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1
echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2
echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3
echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4

echo "A: " `cat "${CGPATH}/A/memory.current"`
echo "A/B: " `cat "${CGPATH}/A/B/memory.current"`
echo "A/C: " `cat "${CGPATH}/A/C/memory.current"`
echo "A/D: " `cat "${CGPATH}/A/D/memory.current"`
echo "A/E: " `cat "${CGPATH}/A/E/memory.current"`

rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
rmdir "${CGPATH}/A"
rm /file1 /file2 /file3 /file4

Signed-off-by: Roman Gushchin <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
include/linux/memcontrol.h | 3 +-
include/linux/page_counter.h | 7 +++
mm/memcontrol.c | 112 ++++++++++++++++++++++++++++++++-----------
mm/page_counter.c | 43 +++++++++++++++++
4 files changed, 134 insertions(+), 31 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index caa31cc09e7e..0dfda3ac6e70 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -178,8 +178,7 @@ struct mem_cgroup {
struct page_counter kmem;
struct page_counter tcpmem;

- /* Normal memory consumption range */
- unsigned long low;
+ /* Upper bound of normal memory consumption range */
unsigned long high;

/* Range enforcement for interrupt charges */
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 94029dad9317..7902a727d3b6 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -9,8 +9,14 @@
struct page_counter {
atomic_long_t usage;
unsigned long max;
+ unsigned long low;
struct page_counter *parent;

+ /* effective memory.low and memory.low usage tracking */
+ unsigned long elow;
+ atomic_long_t low_usage;
+ atomic_long_t children_low_usage;
+
/* legacy */
unsigned long watermark;
unsigned long failcnt;
@@ -42,6 +48,7 @@ bool page_counter_try_charge(struct page_counter *counter,
struct page_counter **fail);
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
int page_counter_memparse(const char *buf, const char *max,
unsigned long *nr_pages);

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 50d1ad6a8fdb..78cf21f2a943 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4499,7 +4499,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);

- memcg->low = 0;
+ page_counter_set_low(&memcg->memory, 0);

memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
@@ -4548,12 +4548,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

- memcg->low = 0;
page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+ page_counter_set_low(&memcg->memory, 0);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
memcg_wb_domain_size_changed(memcg);
@@ -5293,7 +5293,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = READ_ONCE(memcg->low);
+ unsigned long low = READ_ONCE(memcg->memory.low);

if (low == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5315,7 +5315,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
if (err)
return err;

- memcg->low = low;
+ page_counter_set_low(&memcg->memory, low);

return nbytes;
}
@@ -5612,36 +5612,72 @@ struct cgroup_subsys memory_cgrp_subsys = {
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
- * Returns %true if memory consumption of @memcg, and that of all
- * ancestors up to (but not including) @root, is below the normal range.
+ * WARNING: This function is not stateless! It can only be used as part
+ * of a top-down tree iteration, not for isolated queries.
*
- * @root is exclusive; it is never low when looked at directly and isn't
- * checked when traversing the hierarchy.
+ * Returns %true if memory consumption of @memcg is below the normal range.
*
- * Excluding @root enables using memory.low to prioritize memory usage
- * between cgroups within a subtree of the hierarchy that is limited by
- * memory.high or memory.max.
+ * @root is exclusive; it is never low when looked at directly
*
- * For example, given cgroup A with children B and C:
+ * To provide proper hierarchical behavior, the effective memory.low
+ * value is used.
*
- * A
- * / \
- * B C
+ * Effective memory.low is always less than or equal to the original
+ * memory.low. If there is no memory.low overcommitment (which is always
+ * true for top-level memory cgroups), these two values are equal.
+ * Otherwise, it's a share of the parent's effective memory.low,
+ * calculated as the cgroup's memory.low usage divided by the sum of the
+ * siblings' memory.low usages, where memory.low usage is the size of
+ * actually protected memory.
*
- * and
+ *                                             low_usage
+ * elow = min( memory.low, parent->elow * ------------------ ),
+ *                                        siblings_low_usage
*
- * 1. A/memory.current > A/memory.high
- * 2. A/B/memory.current < A/B/memory.low
- * 3. A/C/memory.current >= A/C/memory.low
+ *             | memory.current, if memory.current < memory.low
+ * low_usage = |
+ *             | 0, otherwise.
*
- * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
- * should reclaim from 'C' until 'A' is no longer high or until we can
- * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
- * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered
- * low and we will reclaim indiscriminately from both 'B' and 'C'.
+ *
+ * Such a definition of effective memory.low provides the expected
+ * hierarchical behavior: the parent's memory.low value limits its
+ * children, unprotected memory is reclaimed first, and cgroups
+ * which are not using their guarantee do not affect the actual
+ * memory distribution.
+ *
+ * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
+ *
+ *     A      A/memory.low = 2G, A/memory.current = 6G
+ *    //\\
+ *   BC  DE   B/memory.low = 3G  B/memory.current = 2G
+ *            C/memory.low = 1G  C/memory.current = 2G
+ *            D/memory.low = 0   D/memory.current = 2G
+ *            E/memory.low = 10G E/memory.current = 0
+ *
+ * and memory pressure is applied, the following memory distribution
+ * is expected (approximately):
+ *
+ * A/memory.current = 2G
+ *
+ * B/memory.current = 1.3G
+ * C/memory.current = 0.6G
+ * D/memory.current = 0
+ * E/memory.current = 0
+ *
+ * These calculations require constant tracking of the actual low usages
+ * (see propagate_low_usage()), as well as recursive calculation of
+ * effective memory.low values. But as we call mem_cgroup_low()
+ * top-down for each memory cgroup during reclaim, it's possible to
+ * optimize this part and save the calculated elow for the next use.
+ * This part is intentionally racy, but that's ok, as memory.low is
+ * a best-effort mechanism.
*/
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
+ unsigned long usage, low_usage, siblings_low_usage;
+ unsigned long elow, parent_elow;
+ struct mem_cgroup *parent;
+
if (mem_cgroup_disabled())
return false;

@@ -5650,12 +5686,30 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
if (memcg == root)
return false;

- for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
- if (page_counter_read(&memcg->memory) >= memcg->low)
- return false;
- }
+ elow = memcg->memory.low;
+ usage = page_counter_read(&memcg->memory);
+ parent = parent_mem_cgroup(memcg);

- return true;
+ if (parent == root)
+ goto exit;
+
+ parent_elow = READ_ONCE(parent->memory.elow);
+ elow = min(elow, parent_elow);
+
+ if (!elow || !parent_elow)
+ goto exit;
+
+ low_usage = min(usage, memcg->memory.low);
+ siblings_low_usage = atomic_long_read(
+ &parent->memory.children_low_usage);
+
+ if (!low_usage || !siblings_low_usage)
+ goto exit;
+
+ elow = min(elow, parent_elow * low_usage / siblings_low_usage);
+exit:
+ memcg->memory.elow = elow;
+ return usage < elow;
}

/**
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 41937c9a9d11..a5ff4cbc355a 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,6 +13,28 @@
#include <linux/bug.h>
#include <asm/page.h>

+static void propagate_low_usage(struct page_counter *c, unsigned long usage)
+{
+ unsigned long low_usage, old;
+ long delta;
+
+ if (!c->parent)
+ return;
+
+ if (!c->low && !atomic_long_read(&c->low_usage))
+ return;
+
+ if (usage <= c->low)
+ low_usage = usage;
+ else
+ low_usage = 0;
+
+ old = atomic_long_xchg(&c->low_usage, low_usage);
+ delta = low_usage - old;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_low_usage);
+}
+
/**
* page_counter_cancel - take pages out of the local counter
* @counter: counter
@@ -23,6 +45,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
long new;

new = atomic_long_sub_return(nr_pages, &counter->usage);
+ propagate_low_usage(counter, new);
/* More uncharges than charges? */
WARN_ON_ONCE(new < 0);
}
@@ -42,6 +65,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
long new;

new = atomic_long_add_return(nr_pages, &c->usage);
+ propagate_low_usage(counter, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -85,6 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
+ propagate_low_usage(counter, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -93,6 +118,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
+ propagate_low_usage(counter, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
@@ -164,6 +190,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
}
}

+/**
+ * page_counter_set_low - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ counter->low = nr_pages;
+
+ for (c = counter; c; c = c->parent)
+ propagate_low_usage(c, atomic_long_read(&c->usage));
+}
+
/**
* page_counter_memparse - memparse() for page counter limits
* @buf: string to parse
--
2.14.3


2018-04-05 19:33:55

by Johannes Weiner

Subject: Re: [PATCH v3 1/4] mm: rename page_counter's count/limit into usage/max

On Thu, Apr 05, 2018 at 07:59:18PM +0100, Roman Gushchin wrote:
> This patch renames struct page_counter fields:
> count -> usage
> limit -> max
>
> and the corresponding functions:
> page_counter_limit() -> page_counter_set_max()
> mem_cgroup_get_limit() -> mem_cgroup_get_max()
> mem_cgroup_resize_limit() -> mem_cgroup_resize_max()
> memcg_update_kmem_limit() -> memcg_update_kmem_max()
> memcg_update_tcp_limit() -> memcg_update_tcp_max()
>
> The idea behind this renaming is to have a direct match between
> the memory cgroup knobs (low, high, max) and the page_counter API.
>
> This is a pure rename; the patch doesn't bring any functional change.
>
> Signed-off-by: Roman Gushchin <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]

Acked-by: Johannes Weiner <[email protected]>

2018-04-05 19:38:16

by Johannes Weiner

Subject: Re: [PATCH v3 2/4] mm: memory.low hierarchical behavior

On Thu, Apr 05, 2018 at 07:59:19PM +0100, Roman Gushchin wrote:
> This patch addresses an issue in the current memory.low semantics
> that makes it hard to use in a hierarchy where some leaf memory
> cgroups are more valuable than others.
>
> For example, there are memcgs A, A/B, A/C, A/D and A/E:
>
>     A      A/memory.low = 2G, A/memory.current = 6G
>    //\\
>   BC  DE   B/memory.low = 3G  B/memory.current = 2G
>            C/memory.low = 1G  C/memory.current = 2G
>            D/memory.low = 0   D/memory.current = 2G
>            E/memory.low = 10G E/memory.current = 0
>
> If we apply memory pressure, B, C and D are reclaimed at
> the same pace while A's usage exceeds 2G.
> This is obviously wrong, as B's usage is fully below B's memory.low,
> and C has 1G of protection as well.
> Also, A is pushed below its 2G memory.low, which is also wrong.
>
> A simple bash script (provided below) can be used to reproduce
> the problem. Current results are:
> A: 1430097920
> A/B: 711929856
> A/C: 717426688
> A/D: 741376
> A/E: 0
>
> To address the issue, a concept of effective memory.low is introduced.
> Effective memory.low is always less than or equal to the original
> memory.low. When there is no memory.low overcommitment (and also for
> top-level cgroups), these two values are equal.
> Otherwise it's a share of the parent's effective memory.low, calculated
> as the cgroup's memory.low usage divided by the sum of the siblings'
> memory.low usages (where memory.low usage means the size of actually
> protected memory: memory.current if memory.current < memory.low,
> 0 otherwise).
> It's necessary to track the actual usage, because otherwise an empty
> cgroup with memory.low set (A/E in the example above) would affect the
> actual memory distribution, which makes no sense. To avoid traversing
> the cgroup tree twice, the page_counter code is reused.
>
> Calculating effective memory.low can be done in the reclaim path, as
> we are already traversing the cgroup tree from top to bottom and
> checking memory.low at each level. So it's a natural place to calculate
> the effective memory.low and save it for use by the children cgroups.
>
> This also eliminates the need to traverse the cgroup tree from bottom
> to top each time to check whether the parent's guarantee is exceeded.
>
> Setting/resetting effective memory.low is intentionally racy, but
> it's fine and shouldn't lead to any significant differences in
> actual memory distribution.
>
> With this patch applied, the results match the expectations:
> A: 2147930112
> A/B: 1428721664
> A/C: 718393344
> A/D: 815104
> A/E: 0
>
> Test script:
> #!/bin/bash
>
> CGPATH="/sys/fs/cgroup"
>
> truncate /file1 --size 2G
> truncate /file2 --size 2G
> truncate /file3 --size 2G
> truncate /file4 --size 50G
>
> mkdir "${CGPATH}/A"
> echo "+memory" > "${CGPATH}/A/cgroup.subtree_control"
> mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
>
> echo 2G > "${CGPATH}/A/memory.low"
> echo 3G > "${CGPATH}/A/B/memory.low"
> echo 1G > "${CGPATH}/A/C/memory.low"
> echo 0 > "${CGPATH}/A/D/memory.low"
> echo 10G > "${CGPATH}/A/E/memory.low"
>
> echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1
> echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2
> echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3
> echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4
>
> echo "A: " `cat "${CGPATH}/A/memory.current"`
> echo "A/B: " `cat "${CGPATH}/A/B/memory.current"`
> echo "A/C: " `cat "${CGPATH}/A/C/memory.current"`
> echo "A/D: " `cat "${CGPATH}/A/D/memory.current"`
> echo "A/E: " `cat "${CGPATH}/A/E/memory.current"`
>
> rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E"
> rmdir "${CGPATH}/A"
> rm /file1 /file2 /file3 /file4
>
> Signed-off-by: Roman Gushchin <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]

Acked-by: Johannes Weiner <[email protected]>

2018-04-05 19:47:22

by Johannes Weiner

Subject: Re: [PATCH v3 3/4] mm: treat memory.low value inclusive

On Thu, Apr 05, 2018 at 07:59:20PM +0100, Roman Gushchin wrote:
> If memcg's usage is equal to the memory.low value, avoid reclaiming
> from this cgroup while there is a surplus of reclaimable memory.
>
> This sounds more logical and also matches memory.high and memory.max
> behavior: both are inclusive.

I was trying to figure out why we did it this way in the first place
and found this patch:

commit 4e54dede38b45052a941bcf709f7d29f2e18174d
Author: Michal Hocko <[email protected]>
Date: Fri Feb 27 15:51:46 2015 -0800

memcg: fix low limit calculation

A memcg is considered low limited even when the current usage is equal to
the low limit. This leads to interesting side effects e.g.
groups/hierarchies with no memory accounted are considered protected and
so the reclaim will emit MEMCG_LOW event when encountering them.

Another and much bigger issue was reported by Joonsoo Kim. He has hit a
NULL ptr dereference with the legacy cgroup API which even doesn't have
low limit exposed. The limit is 0 by default but the initial check fails
for memcg with 0 consumption and parent_mem_cgroup() would return NULL if
use_hierarchy is 0 and so page_counter_read would try to dereference NULL.

I suppose that the current implementation is just an overlook because the
documentation in Documentation/cgroups/unified-hierarchy.txt says:

"The memory.low boundary on the other hand is a top-down allocated
reserve. A cgroup enjoys reclaim protection when it and all its
ancestors are below their low boundaries"

Fix the usage and the low limit comparison in mem_cgroup_low accordingly.

> @@ -5709,7 +5709,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
> elow = min(elow, parent_elow * low_usage / siblings_low_usage);
> exit:
> memcg->memory.elow = elow;
> - return usage < elow;
> + return usage <= elow;

So I think this needs to be usage && usage <= elow to not emit
MEMCG_LOW events in case usage == elow == 0.
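
That is, for an empty cgroup with no protection configured
(usage == elow == 0):

    usage <= elow           /* true:  spurious MEMCG_LOW event    */
    usage && usage <= elow  /* false: the empty cgroup is ignored */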

2018-04-05 19:47:57

by Johannes Weiner

Subject: Re: [PATCH v3 4/4] mm/docs: describe memory.low refinements

On Thu, Apr 05, 2018 at 07:59:21PM +0100, Roman Gushchin wrote:
> Refine cgroup v2 docs after latest memory.low changes.
>
> Signed-off-by: Roman Gushchin <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]

Acked-by: Johannes Weiner <[email protected]>

2018-04-06 12:24:11

by Roman Gushchin

Subject: Re: [PATCH v3 3/4] mm: treat memory.low value inclusive

On Thu, Apr 05, 2018 at 03:45:26PM -0400, Johannes Weiner wrote:
> On Thu, Apr 05, 2018 at 07:59:20PM +0100, Roman Gushchin wrote:
> > If memcg's usage is equal to the memory.low value, avoid reclaiming
> > from this cgroup while there is a surplus of reclaimable memory.
> >
> > This sounds more logical and also matches memory.high and memory.max
> > behavior: both are inclusive.
>
> I was trying to figure out why we did it this way in the first place
> and found this patch:
>
> commit 4e54dede38b45052a941bcf709f7d29f2e18174d
> Author: Michal Hocko <[email protected]>
> Date: Fri Feb 27 15:51:46 2015 -0800
>
> memcg: fix low limit calculation
>
> A memcg is considered low limited even when the current usage is equal to
> the low limit. This leads to interesting side effects e.g.
> groups/hierarchies with no memory accounted are considered protected and
> so the reclaim will emit MEMCG_LOW event when encountering them.
>
> Another and much bigger issue was reported by Joonsoo Kim. He has hit a
> NULL ptr dereference with the legacy cgroup API which even doesn't have
> low limit exposed. The limit is 0 by default but the initial check fails
> for memcg with 0 consumption and parent_mem_cgroup() would return NULL if
> use_hierarchy is 0 and so page_counter_read would try to dereference NULL.
>
> I suppose that the current implementation is just an overlook because the
> documentation in Documentation/cgroups/unified-hierarchy.txt says:
>
> "The memory.low boundary on the other hand is a top-down allocated
> reserve. A cgroup enjoys reclaim protection when it and all its
> ancestors are below their low boundaries"
>
> Fix the usage and the low limit comparison in mem_cgroup_low accordingly.
>
> > @@ -5709,7 +5709,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
> > elow = min(elow, parent_elow * low_usage / siblings_low_usage);
> > exit:
> > memcg->memory.elow = elow;
> > - return usage < elow;
> > + return usage <= elow;
>
> So I think this needs to be usage && usage <= elow to not emit
> MEMCG_LOW events in case usage == elow == 0.

Perfect catch! Thanks, Johannes!

Updated version below.

--

From 466c35c36cae392cfee5e54a2884792972e789ee Mon Sep 17 00:00:00 2001
From: Roman Gushchin <[email protected]>
Date: Thu, 5 Apr 2018 19:31:35 +0100
Subject: [PATCH v4 3/4] mm: treat memory.low value inclusive

If memcg's usage is equal to the memory.low value, avoid reclaiming
from this cgroup while there is a surplus of reclaimable memory.

This sounds more logical and also matches memory.high and memory.max
behavior: both are inclusive.

Empty cgroups are not considered protected, so MEMCG_LOW events
are not emitted for empty cgroups if there is no more reclaimable
memory in the system.

Signed-off-by: Roman Gushchin <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
mm/memcontrol.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 78cf21f2a943..3d039fa1a8f5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5608,14 +5608,14 @@ struct cgroup_subsys memory_cgrp_subsys = {
};

/**
- * mem_cgroup_low - check if memory consumption is below the normal range
+ * mem_cgroup_low - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
*
- * Returns %true if memory consumption of @memcg is below the normal range.
+ * Returns %true if memory consumption of @memcg is in the normal range.
*
* @root is exclusive; it is never low when looked at directly
*
@@ -5709,7 +5709,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
elow = min(elow, parent_elow * low_usage / siblings_low_usage);
exit:
memcg->memory.elow = elow;
- return usage < elow;
+ return usage && usage <= elow;
}

/**
--
2.14.3


2018-04-06 16:43:07

by Johannes Weiner

Subject: Re: [PATCH v3 3/4] mm: treat memory.low value inclusive

On Fri, Apr 06, 2018 at 01:21:38PM +0100, Roman Gushchin wrote:
> Updated version below.
>
> --
>
> From 466c35c36cae392cfee5e54a2884792972e789ee Mon Sep 17 00:00:00 2001
> From: Roman Gushchin <[email protected]>
> Date: Thu, 5 Apr 2018 19:31:35 +0100
> Subject: [PATCH v4 3/4] mm: treat memory.low value inclusive
>
> If memcg's usage is equal to the memory.low value, avoid reclaiming
> from this cgroup while there is a surplus of reclaimable memory.
>
> This sounds more logical and also matches memory.high and memory.max
> behavior: both are inclusive.
>
> Empty cgroups are not considered protected, so MEMCG_LOW events
> are not emitted for empty cgroups if there is no more reclaimable
> memory in the system.
>
> Signed-off-by: Roman Gushchin <[email protected]>
> Cc: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]
> Cc: [email protected]

Looks good, thanks!

Acked-by: Johannes Weiner <[email protected]>

2018-04-17 19:04:22

by Roman Gushchin

Subject: Re: [PATCH v3 3/4] mm: treat memory.low value inclusive

Hello, Andrew!

Can you please pull this patchset?

Thanks!

Roman

On Fri, Apr 06, 2018 at 12:38:02PM -0400, Johannes Weiner wrote:
> On Fri, Apr 06, 2018 at 01:21:38PM +0100, Roman Gushchin wrote:
> >
> > From 466c35c36cae392cfee5e54a2884792972e789ee Mon Sep 17 00:00:00 2001
> > From: Roman Gushchin <[email protected]>
> > Date: Thu, 5 Apr 2018 19:31:35 +0100
> > Subject: [PATCH v4 3/4] mm: treat memory.low value inclusive
> >
> > If memcg's usage is equal to the memory.low value, avoid reclaiming
> > from this cgroup while there is a surplus of reclaimable memory.
> >
> > This sounds more logical and also matches memory.high and memory.max
> > behavior: both are inclusive.
> >
> > Empty cgroups are not considered protected, so MEMCG_LOW events
> > are not emitted for empty cgroups if there is no more reclaimable
> > memory in the system.
> >
> > Signed-off-by: Roman Gushchin <[email protected]>
> > Cc: Andrew Morton <[email protected]>
> > Cc: Johannes Weiner <[email protected]>
> > Cc: Michal Hocko <[email protected]>
> > Cc: Vladimir Davydov <[email protected]>
> > Cc: Tejun Heo <[email protected]>
> > Cc: [email protected]
> > Cc: [email protected]
> > Cc: [email protected]
> > Cc: [email protected]
>
> Looks good, thanks!
>
> Acked-by: Johannes Weiner <[email protected]>
>