2018-04-20 16:38:51

by Roman Gushchin

[permalink] [raw]
Subject: [PATCH 1/2] mm: introduce memory.min

Memory controller implements the memory.low best-effort memory
protection mechanism, which works perfectly in many cases and
allows protecting working sets of important workloads from
sudden reclaim.

But it's semantics has a significant limitation: it works
only until there is a supply of reclaimable memory.
This makes it pretty useless against any sort of slow memory
leaks or memory usage increases. This is especially true
for swapless systems. If swap is enabled, memory soft protection
effectively postpones problems, allowing a leaking application
to fill all swap area, which makes no sense.
The only effective way to guarantee the memory protection
in this case is to invoke the OOM killer.

This patch introduces the memory.min interface for cgroup v2
memory controller. It works very similarly to memory.low
(sharing the same hierarchical behavior), except that it's
not disabled if there is no more reclaimable memory in the system.

Signed-off-by: Roman Gushchin <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Tejun Heo <[email protected]>
---
Documentation/cgroup-v2.txt | 20 +++++++++
include/linux/memcontrol.h | 15 ++++++-
include/linux/page_counter.h | 11 ++++-
mm/memcontrol.c | 99 ++++++++++++++++++++++++++++++++++++--------
mm/page_counter.c | 63 ++++++++++++++++++++--------
mm/vmscan.c | 19 ++++++++-
6 files changed, 189 insertions(+), 38 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 657fe1769c75..49c846020f96 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1002,6 +1002,26 @@ PAGE_SIZE multiple when read back.
The total amount of memory currently being used by the cgroup
and its descendants.

+ memory.min
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ Hard memory protection. If the memory usage of a cgroup
+ is within its effectife min boundary, the cgroup's memory
+ won't be reclaimed under any conditions. If there is no
+ unprotected reclaimable memory available, OOM killer
+ is invoked.
+
+ Effective low boundary is limited by memory.min values of
+ all ancestor cgroups. If there is memory.mn overcommitment
+ (child cgroup or cgroups are requiring more protected memory,
+ than parent will allow), then each child cgroup will get
+ the part of parent's protection proportional to the its
+ actual memory usage below memory.min.
+
+ Putting more memory than generally available under this
+ protection is discouraged and may lead to constant OOMs.
+
memory.low
A read-write single value file which exists on non-root
cgroups. The default is "0".
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab60ff55bdb3..6ee19532f567 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -297,7 +297,14 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}

-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
+enum mem_cgroup_protection {
+ MEM_CGROUP_UNPROTECTED,
+ MEM_CGROUP_PROTECTED_LOW,
+ MEM_CGROUP_PROTECTED_MIN,
+};
+
+enum mem_cgroup_protection
+mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg);

int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcgp,
@@ -756,6 +763,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
{
}

+static inline bool mem_cgroup_min(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
+{
+ return false;
+}
+
static inline bool mem_cgroup_low(struct mem_cgroup *root,
struct mem_cgroup *memcg)
{
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 7902a727d3b6..bab7e57f659b 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -8,10 +8,16 @@

struct page_counter {
atomic_long_t usage;
- unsigned long max;
+ unsigned long min;
unsigned long low;
+ unsigned long max;
struct page_counter *parent;

+ /* effective memory.min and memory.min usage tracking */
+ unsigned long emin;
+ atomic_long_t min_usage;
+ atomic_long_t children_min_usage;
+
/* effective memory.low and memory.low usage tracking */
unsigned long elow;
atomic_long_t low_usage;
@@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter,
unsigned long nr_pages,
struct page_counter **fail);
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
-int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
int page_counter_memparse(const char *buf, const char *max,
unsigned long *nr_pages);

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 25b148c2d222..9c65de7937d0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4508,6 +4508,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);

+ page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);

memcg_offline_kmem(memcg);
@@ -4562,6 +4563,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+ page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -5299,6 +5301,36 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}

+static int memory_min_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long min = READ_ONCE(memcg->memory.min);
+
+ if (min == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
+
+ return 0;
+}
+
+static ssize_t memory_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long min;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &min);
+ if (err)
+ return err;
+
+ page_counter_set_min(&memcg->memory, min);
+
+ return nbytes;
+}
+
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5566,6 +5598,12 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_current_read,
},
+ {
+ .name = "min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_min_show,
+ .write = memory_min_write,
+ },
{
.name = "low",
.flags = CFTYPE_NOT_ON_ROOT,
@@ -5685,44 +5723,71 @@ struct cgroup_subsys memory_cgrp_subsys = {
* for next usage. This part is intentionally racy, but it's ok,
* as memory.low is a best-effort mechanism.
*/
-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
+enum mem_cgroup_protection
+mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
- unsigned long usage, low_usage, siblings_low_usage;
- unsigned long elow, parent_elow;
struct mem_cgroup *parent;
+ unsigned long emin, parent_emin;
+ unsigned long elow, parent_elow;
+ unsigned long usage;

if (mem_cgroup_disabled())
- return false;
+ return MEM_CGROUP_UNPROTECTED;

if (!root)
root = root_mem_cgroup;
if (memcg == root)
- return false;
+ return MEM_CGROUP_UNPROTECTED;

- elow = memcg->memory.low;
usage = page_counter_read(&memcg->memory);
- parent = parent_mem_cgroup(memcg);
+ if (!usage)
+ return MEM_CGROUP_UNPROTECTED;
+
+ emin = memcg->memory.min;
+ elow = memcg->memory.low;

+ parent = parent_mem_cgroup(memcg);
if (parent == root)
goto exit;

+ parent_emin = READ_ONCE(parent->memory.emin);
+ emin = min(emin, parent_emin);
+ if (emin && parent_emin) {
+ unsigned long min_usage, siblings_min_usage;
+
+ min_usage = min(usage, memcg->memory.min);
+ siblings_min_usage = atomic_long_read(
+ &parent->memory.children_min_usage);
+
+ if (min_usage && siblings_min_usage)
+ emin = min(emin, parent_emin * min_usage /
+ siblings_min_usage);
+ }
+
parent_elow = READ_ONCE(parent->memory.elow);
elow = min(elow, parent_elow);
+ if (elow && parent_elow) {
+ unsigned long low_usage, siblings_low_usage;

- if (!elow || !parent_elow)
- goto exit;
+ low_usage = min(usage, memcg->memory.low);
+ siblings_low_usage = atomic_long_read(
+ &parent->memory.children_low_usage);

- low_usage = min(usage, memcg->memory.low);
- siblings_low_usage = atomic_long_read(
- &parent->memory.children_low_usage);
-
- if (!low_usage || !siblings_low_usage)
- goto exit;
+ if (low_usage && siblings_low_usage)
+ elow = min(elow, parent_elow * low_usage /
+ siblings_low_usage);
+ }

- elow = min(elow, parent_elow * low_usage / siblings_low_usage);
exit:
+ memcg->memory.emin = emin;
memcg->memory.elow = elow;
- return usage && usage <= elow;
+
+ if (usage <= emin)
+ return MEM_CGROUP_PROTECTED_MIN;
+ else if (usage <= elow)
+ return MEM_CGROUP_PROTECTED_LOW;
+ else
+ return MEM_CGROUP_UNPROTECTED;
}

/**
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a5ff4cbc355a..de31470655f6 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,26 +13,38 @@
#include <linux/bug.h>
#include <asm/page.h>

-static void propagate_low_usage(struct page_counter *c, unsigned long usage)
+static void propagate_protected_usage(struct page_counter *c,
+ unsigned long usage)
{
- unsigned long low_usage, old;
+ unsigned long protected, old_protected;
long delta;

if (!c->parent)
return;

- if (!c->low && !atomic_long_read(&c->low_usage))
- return;
+ if (c->min || atomic_long_read(&c->min_usage)) {
+ if (usage <= c->min)
+ protected = usage;
+ else
+ protected = 0;
+
+ old_protected = atomic_long_xchg(&c->min_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_min_usage);
+ }

- if (usage <= c->low)
- low_usage = usage;
- else
- low_usage = 0;
+ if (c->low || atomic_long_read(&c->low_usage)) {
+ if (usage <= c->low)
+ protected = usage;
+ else
+ protected = 0;

- old = atomic_long_xchg(&c->low_usage, low_usage);
- delta = low_usage - old;
- if (delta)
- atomic_long_add(delta, &c->parent->children_low_usage);
+ old_protected = atomic_long_xchg(&c->low_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_low_usage);
+ }
}

/**
@@ -45,7 +57,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
long new;

new = atomic_long_sub_return(nr_pages, &counter->usage);
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/* More uncharges than charges? */
WARN_ON_ONCE(new < 0);
}
@@ -65,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
long new;

new = atomic_long_add_return(nr_pages, &c->usage);
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -109,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -118,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
@@ -190,6 +202,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
}
}

+/**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ counter->min = nr_pages;
+
+ for (c = counter; c; c = c->parent)
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
/**
* page_counter_set_low - set the amount of protected memory
* @counter: counter
@@ -204,7 +233,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
counter->low = nr_pages;

for (c = counter; c; c = c->parent)
- propagate_low_usage(c, atomic_long_read(&c->usage));
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8b920ce3ae02..39ef143cf781 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2525,12 +2525,29 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long reclaimed;
unsigned long scanned;

- if (mem_cgroup_low(root, memcg)) {
+ switch (mem_cgroup_protected(root, memcg)) {
+ case MEM_CGROUP_PROTECTED_MIN:
+ /*
+ * Hard protection.
+ * If there is no reclaimable memory, OOM.
+ */
+ continue;
+
+ case MEM_CGROUP_PROTECTED_LOW:
+ /*
+ * Soft protection.
+ * Respect the protection only until there is
+ * a supply of reclaimable memory.
+ */
if (!sc->memcg_low_reclaim) {
sc->memcg_low_skipped = 1;
continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
+ break;
+
+ case MEM_CGROUP_UNPROTECTED:
+ break;
}

reclaimed = sc->nr_reclaimed;
--
2.14.3



2018-04-20 16:40:18

by Roman Gushchin

[permalink] [raw]
Subject: [PATCH 2/2] mm: move the high field from struct mem_cgroup to page_counter

We do store memory.min, memory.low and memory.max actual values
in struct page_counter fields, while memory.high value is located
in the struct mem_cgroup directly, which is not very consistent.

This patch moves the high field from struct mem_cgroup to
struct page_counter to simplify the code and make handling
of all limits/boundaries clearer.

Signed-off-by: Roman Gushchin <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Tejun Heo <[email protected]>
---
include/linux/memcontrol.h | 3 ---
include/linux/page_counter.h | 3 +++
mm/memcontrol.c | 18 +++++++++---------
mm/page_counter.c | 12 ++++++++++++
4 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6ee19532f567..b89e060d0283 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -182,9 +182,6 @@ struct mem_cgroup {
struct page_counter kmem;
struct page_counter tcpmem;

- /* Upper bound of normal memory consumption range */
- unsigned long high;
-
/* Range enforcement for interrupt charges */
struct work_struct high_work;

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index bab7e57f659b..83999441a43e 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -10,6 +10,7 @@ struct page_counter {
atomic_long_t usage;
unsigned long min;
unsigned long low;
+ unsigned long high;
unsigned long max;
struct page_counter *parent;

@@ -38,6 +39,7 @@ static inline void page_counter_init(struct page_counter *counter,
struct page_counter *parent)
{
atomic_long_set(&counter->usage, 0);
+ counter->high = PAGE_COUNTER_MAX;
counter->max = PAGE_COUNTER_MAX;
counter->parent = parent;
}
@@ -55,6 +57,7 @@ bool page_counter_try_charge(struct page_counter *counter,
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_high(struct page_counter *counter, unsigned long nr_pages);
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
int page_counter_memparse(const char *buf, const char *max,
unsigned long *nr_pages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9c65de7937d0..f9724dea017c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1869,7 +1869,7 @@ static void reclaim_high(struct mem_cgroup *memcg,
gfp_t gfp_mask)
{
do {
- if (page_counter_read(&memcg->memory) <= memcg->high)
+ if (page_counter_read(&memcg->memory) <= memcg->memory.high)
continue;
memcg_memory_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
@@ -2040,7 +2040,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
* reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) > memcg->high) {
+ if (page_counter_read(&memcg->memory) > memcg->memory.high) {
/* Don't bother a random interrupted task */
if (in_interrupt()) {
schedule_work(&memcg->high_work);
@@ -3857,7 +3857,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pheadroom = PAGE_COUNTER_MAX;

while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(memcg->memory.max, memcg->high);
+ unsigned long ceiling = min(memcg->memory.max,
+ memcg->memory.high);
unsigned long used = page_counter_read(&memcg->memory);

*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4433,7 +4434,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (!memcg)
return ERR_PTR(error);

- memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
@@ -4558,14 +4558,14 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

+ page_counter_set_min(&memcg->memory, 0);
+ page_counter_set_low(&memcg->memory, 0);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
- page_counter_set_min(&memcg->memory, 0);
- page_counter_set_low(&memcg->memory, 0);
- memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
memcg_wb_domain_size_changed(memcg);
}
@@ -5364,7 +5364,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = READ_ONCE(memcg->high);
+ unsigned long high = READ_ONCE(memcg->memory.high);

if (high == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5387,7 +5387,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;

- memcg->high = high;
+ page_counter_set_high(&memcg->memory, high);

nr_pages = page_counter_read(&memcg->memory);
if (nr_pages > high)
diff --git a/mm/page_counter.c b/mm/page_counter.c
index de31470655f6..7f0013304afd 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -202,6 +202,18 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
}
}

+/**
+ * page_counter_set_high - set the upper soft boundary of pages allowed
+ * @counter: counter
+ * @nr_pages: limit to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_high(struct page_counter *counter, unsigned long nr_pages)
+{
+ counter->high = nr_pages;
+}
+
/**
* page_counter_set_min - set the amount of protected memory
* @counter: counter
--
2.14.3


2018-04-20 17:02:56

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: introduce memory.min

On 04/20/18 09:36, Roman Gushchin wrote:

> ---
> Documentation/cgroup-v2.txt | 20 +++++++++
> include/linux/memcontrol.h | 15 ++++++-
> include/linux/page_counter.h | 11 ++++-
> mm/memcontrol.c | 99 ++++++++++++++++++++++++++++++++++++--------
> mm/page_counter.c | 63 ++++++++++++++++++++--------
> mm/vmscan.c | 19 ++++++++-
> 6 files changed, 189 insertions(+), 38 deletions(-)
>
> diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
> index 657fe1769c75..49c846020f96 100644
> --- a/Documentation/cgroup-v2.txt
> +++ b/Documentation/cgroup-v2.txt
> @@ -1002,6 +1002,26 @@ PAGE_SIZE multiple when read back.
> The total amount of memory currently being used by the cgroup
> and its descendants.
>
> + memory.min
> + A read-write single value file which exists on non-root
> + cgroups. The default is "0".
> +
> + Hard memory protection. If the memory usage of a cgroup
> + is within its effectife min boundary, the cgroup's memory

effective

> + won't be reclaimed under any conditions. If there is no
> + unprotected reclaimable memory available, OOM killer
> + is invoked.
> +
> + Effective low boundary is limited by memory.min values of
> + all ancestor cgroups. If there is memory.mn overcommitment

memory.min ? overcommit

> + (child cgroup or cgroups are requiring more protected memory,

drop ending ',' ^^

> + than parent will allow), then each child cgroup will get
> + the part of parent's protection proportional to the its

to its

> + actual memory usage below memory.min.
> +
> + Putting more memory than generally available under this
> + protection is discouraged and may lead to constant OOMs.
> +
> memory.low
> A read-write single value file which exists on non-root
> cgroups. The default is "0".


--
~Randy

2018-04-20 17:24:00

by Roman Gushchin

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: introduce memory.min

On Fri, Apr 20, 2018 at 10:01:04AM -0700, Randy Dunlap wrote:
> On 04/20/18 09:36, Roman Gushchin wrote:
>
> > ---
> > Documentation/cgroup-v2.txt | 20 +++++++++
> > include/linux/memcontrol.h | 15 ++++++-
> > include/linux/page_counter.h | 11 ++++-
> > mm/memcontrol.c | 99 ++++++++++++++++++++++++++++++++++++--------
> > mm/page_counter.c | 63 ++++++++++++++++++++--------
> > mm/vmscan.c | 19 ++++++++-
> > 6 files changed, 189 insertions(+), 38 deletions(-)
> >
> > diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
> > index 657fe1769c75..49c846020f96 100644
> > --- a/Documentation/cgroup-v2.txt
> > +++ b/Documentation/cgroup-v2.txt
> > @@ -1002,6 +1002,26 @@ PAGE_SIZE multiple when read back.
> > The total amount of memory currently being used by the cgroup
> > and its descendants.
> >
> > + memory.min
> > + A read-write single value file which exists on non-root
> > + cgroups. The default is "0".
> > +
> > + Hard memory protection. If the memory usage of a cgroup
> > + is within its effectife min boundary, the cgroup's memory
>
> effective
>
> > + won't be reclaimed under any conditions. If there is no
> > + unprotected reclaimable memory available, OOM killer
> > + is invoked.
> > +
> > + Effective low boundary is limited by memory.min values of
> > + all ancestor cgroups. If there is memory.mn overcommitment
>
> memory.min ? overcommit
>
> > + (child cgroup or cgroups are requiring more protected memory,
>
> drop ending ',' ^^
>
> > + than parent will allow), then each child cgroup will get
> > + the part of parent's protection proportional to the its
>
> to its
>
> > + actual memory usage below memory.min.
> > +
> > + Putting more memory than generally available under this
> > + protection is discouraged and may lead to constant OOMs.
> > +
> > memory.low
> > A read-write single value file which exists on non-root
> > cgroups. The default is "0".
>
>
> --
> ~Randy


Hi, Randy!

An updated version below.

Thanks!

------------------------------------------------------------


From 2225fa0b3400431dd803f206b20a9344f0dfcd0a Mon Sep 17 00:00:00 2001
From: Roman Gushchin <[email protected]>
Date: Fri, 20 Apr 2018 15:24:44 +0100
Subject: [PATCH 1/2] mm: introduce memory.min

Memory controller implements the memory.low best-effort memory
protection mechanism, which works perfectly in many cases and
allows protecting working sets of important workloads from
sudden reclaim.

But it's semantics has a significant limitation: it works
only until there is a supply of reclaimable memory.
This makes it pretty useless against any sort of slow memory
leaks or memory usage increases. This is especially true
for swapless systems. If swap is enabled, memory soft protection
effectively postpones problems, allowing a leaking application
to fill all swap area, which makes no sense.
The only effective way to guarantee the memory protection
in this case is to invoke the OOM killer.

This patch introduces the memory.min interface for cgroup v2
memory controller. It works very similarly to memory.low
(sharing the same hierarchical behavior), except that it's
not disabled if there is no more reclaimable memory in the system.

Signed-off-by: Roman Gushchin <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Tejun Heo <[email protected]>
---
Documentation/cgroup-v2.txt | 24 ++++++++++-
include/linux/memcontrol.h | 15 ++++++-
include/linux/page_counter.h | 11 ++++-
mm/memcontrol.c | 99 ++++++++++++++++++++++++++++++++++++--------
mm/page_counter.c | 63 ++++++++++++++++++++--------
mm/vmscan.c | 19 ++++++++-
6 files changed, 191 insertions(+), 40 deletions(-)

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 657fe1769c75..a413118b9c29 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1002,6 +1002,26 @@ PAGE_SIZE multiple when read back.
The total amount of memory currently being used by the cgroup
and its descendants.

+ memory.min
+ A read-write single value file which exists on non-root
+ cgroups. The default is "0".
+
+ Hard memory protection. If the memory usage of a cgroup
+ is within its effective min boundary, the cgroup's memory
+ won't be reclaimed under any conditions. If there is no
+ unprotected reclaimable memory available, OOM killer
+ is invoked.
+
+ Effective low boundary is limited by memory.min values of
+ all ancestor cgroups. If there is memory.min overcommitment
+ (child cgroup or cgroups are requiring more protected memory
+ than parent will allow), then each child cgroup will get
+ the part of parent's protection proportional to its
+ actual memory usage below memory.min.
+
+ Putting more memory than generally available under this
+ protection is discouraged and may lead to constant OOMs.
+
memory.low
A read-write single value file which exists on non-root
cgroups. The default is "0".
@@ -1013,9 +1033,9 @@ PAGE_SIZE multiple when read back.

Effective low boundary is limited by memory.low values of
all ancestor cgroups. If there is memory.low overcommitment
- (child cgroup or cgroups are requiring more protected memory,
+ (child cgroup or cgroups are requiring more protected memory
than parent will allow), then each child cgroup will get
- the part of parent's protection proportional to the its
+ the part of parent's protection proportional to its
actual memory usage below memory.low.

Putting more memory than generally available under this
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab60ff55bdb3..6ee19532f567 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -297,7 +297,14 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}

-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
+enum mem_cgroup_protection {
+ MEM_CGROUP_UNPROTECTED,
+ MEM_CGROUP_PROTECTED_LOW,
+ MEM_CGROUP_PROTECTED_MIN,
+};
+
+enum mem_cgroup_protection
+mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg);

int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, struct mem_cgroup **memcgp,
@@ -756,6 +763,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
{
}

+static inline bool mem_cgroup_min(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
+{
+ return false;
+}
+
static inline bool mem_cgroup_low(struct mem_cgroup *root,
struct mem_cgroup *memcg)
{
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 7902a727d3b6..bab7e57f659b 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -8,10 +8,16 @@

struct page_counter {
atomic_long_t usage;
- unsigned long max;
+ unsigned long min;
unsigned long low;
+ unsigned long max;
struct page_counter *parent;

+ /* effective memory.min and memory.min usage tracking */
+ unsigned long emin;
+ atomic_long_t min_usage;
+ atomic_long_t children_min_usage;
+
/* effective memory.low and memory.low usage tracking */
unsigned long elow;
atomic_long_t low_usage;
@@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter,
unsigned long nr_pages,
struct page_counter **fail);
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
-int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
int page_counter_memparse(const char *buf, const char *max,
unsigned long *nr_pages);

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 25b148c2d222..9c65de7937d0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4508,6 +4508,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);

+ page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);

memcg_offline_kmem(memcg);
@@ -4562,6 +4563,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+ page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -5299,6 +5301,36 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}

+static int memory_min_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long min = READ_ONCE(memcg->memory.min);
+
+ if (min == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
+
+ return 0;
+}
+
+static ssize_t memory_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long min;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &min);
+ if (err)
+ return err;
+
+ page_counter_set_min(&memcg->memory, min);
+
+ return nbytes;
+}
+
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5566,6 +5598,12 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_current_read,
},
+ {
+ .name = "min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_min_show,
+ .write = memory_min_write,
+ },
{
.name = "low",
.flags = CFTYPE_NOT_ON_ROOT,
@@ -5685,44 +5723,71 @@ struct cgroup_subsys memory_cgrp_subsys = {
* for next usage. This part is intentionally racy, but it's ok,
* as memory.low is a best-effort mechanism.
*/
-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
+enum mem_cgroup_protection
+mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
- unsigned long usage, low_usage, siblings_low_usage;
- unsigned long elow, parent_elow;
struct mem_cgroup *parent;
+ unsigned long emin, parent_emin;
+ unsigned long elow, parent_elow;
+ unsigned long usage;

if (mem_cgroup_disabled())
- return false;
+ return MEM_CGROUP_UNPROTECTED;

if (!root)
root = root_mem_cgroup;
if (memcg == root)
- return false;
+ return MEM_CGROUP_UNPROTECTED;

- elow = memcg->memory.low;
usage = page_counter_read(&memcg->memory);
- parent = parent_mem_cgroup(memcg);
+ if (!usage)
+ return MEM_CGROUP_UNPROTECTED;
+
+ emin = memcg->memory.min;
+ elow = memcg->memory.low;

+ parent = parent_mem_cgroup(memcg);
if (parent == root)
goto exit;

+ parent_emin = READ_ONCE(parent->memory.emin);
+ emin = min(emin, parent_emin);
+ if (emin && parent_emin) {
+ unsigned long min_usage, siblings_min_usage;
+
+ min_usage = min(usage, memcg->memory.min);
+ siblings_min_usage = atomic_long_read(
+ &parent->memory.children_min_usage);
+
+ if (min_usage && siblings_min_usage)
+ emin = min(emin, parent_emin * min_usage /
+ siblings_min_usage);
+ }
+
parent_elow = READ_ONCE(parent->memory.elow);
elow = min(elow, parent_elow);
+ if (elow && parent_elow) {
+ unsigned long low_usage, siblings_low_usage;

- if (!elow || !parent_elow)
- goto exit;
+ low_usage = min(usage, memcg->memory.low);
+ siblings_low_usage = atomic_long_read(
+ &parent->memory.children_low_usage);

- low_usage = min(usage, memcg->memory.low);
- siblings_low_usage = atomic_long_read(
- &parent->memory.children_low_usage);
-
- if (!low_usage || !siblings_low_usage)
- goto exit;
+ if (low_usage && siblings_low_usage)
+ elow = min(elow, parent_elow * low_usage /
+ siblings_low_usage);
+ }

- elow = min(elow, parent_elow * low_usage / siblings_low_usage);
exit:
+ memcg->memory.emin = emin;
memcg->memory.elow = elow;
- return usage && usage <= elow;
+
+ if (usage <= emin)
+ return MEM_CGROUP_PROTECTED_MIN;
+ else if (usage <= elow)
+ return MEM_CGROUP_PROTECTED_LOW;
+ else
+ return MEM_CGROUP_UNPROTECTED;
}

/**
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a5ff4cbc355a..de31470655f6 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,26 +13,38 @@
#include <linux/bug.h>
#include <asm/page.h>

-static void propagate_low_usage(struct page_counter *c, unsigned long usage)
+static void propagate_protected_usage(struct page_counter *c,
+ unsigned long usage)
{
- unsigned long low_usage, old;
+ unsigned long protected, old_protected;
long delta;

if (!c->parent)
return;

- if (!c->low && !atomic_long_read(&c->low_usage))
- return;
+ if (c->min || atomic_long_read(&c->min_usage)) {
+ if (usage <= c->min)
+ protected = usage;
+ else
+ protected = 0;
+
+ old_protected = atomic_long_xchg(&c->min_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_min_usage);
+ }

- if (usage <= c->low)
- low_usage = usage;
- else
- low_usage = 0;
+ if (c->low || atomic_long_read(&c->low_usage)) {
+ if (usage <= c->low)
+ protected = usage;
+ else
+ protected = 0;

- old = atomic_long_xchg(&c->low_usage, low_usage);
- delta = low_usage - old;
- if (delta)
- atomic_long_add(delta, &c->parent->children_low_usage);
+ old_protected = atomic_long_xchg(&c->low_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_low_usage);
+ }
}

/**
@@ -45,7 +57,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
long new;

new = atomic_long_sub_return(nr_pages, &counter->usage);
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/* More uncharges than charges? */
WARN_ON_ONCE(new < 0);
}
@@ -65,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
long new;

new = atomic_long_add_return(nr_pages, &c->usage);
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -109,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -118,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
- propagate_low_usage(counter, new);
+ propagate_protected_usage(counter, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
@@ -190,6 +202,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
}
}

+/**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ counter->min = nr_pages;
+
+ for (c = counter; c; c = c->parent)
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
/**
* page_counter_set_low - set the amount of protected memory
* @counter: counter
@@ -204,7 +233,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
counter->low = nr_pages;

for (c = counter; c; c = c->parent)
- propagate_low_usage(c, atomic_long_read(&c->usage));
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
}

/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8b920ce3ae02..39ef143cf781 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2525,12 +2525,29 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long reclaimed;
unsigned long scanned;

- if (mem_cgroup_low(root, memcg)) {
+ switch (mem_cgroup_protected(root, memcg)) {
+ case MEM_CGROUP_PROTECTED_MIN:
+ /*
+ * Hard protection.
+ * If there is no reclaimable memory, OOM.
+ */
+ continue;
+
+ case MEM_CGROUP_PROTECTED_LOW:
+ /*
+ * Soft protection.
+ * Respect the protection only until there is
+ * a supply of reclaimable memory.
+ */
if (!sc->memcg_low_reclaim) {
sc->memcg_low_skipped = 1;
continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
+ break;
+
+ case MEM_CGROUP_UNPROTECTED:
+ break;
}

reclaimed = sc->nr_reclaimed;
--
2.14.3


2018-04-20 17:30:47

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: introduce memory.min

On 04/20/18 10:20, Roman Gushchin wrote:
>
> Hi, Randy!
>
> An updated version below.
>
> Thanks!

OK, looks good now. Thanks.

FWIW:
Reviewed-by: Randy Dunlap <[email protected]> # for Documentation/ only.

> ------------------------------------------------------------
>
>
> From 2225fa0b3400431dd803f206b20a9344f0dfcd0a Mon Sep 17 00:00:00 2001
> From: Roman Gushchin <[email protected]>
> Date: Fri, 20 Apr 2018 15:24:44 +0100
> Subject: [PATCH 1/2] mm: introduce memory.min
>
> Memory controller implements the memory.low best-effort memory
> protection mechanism, which works perfectly in many cases and
> allows protecting working sets of important workloads from
> sudden reclaim.
>
> But it's semantics has a significant limitation: it works
> only until there is a supply of reclaimable memory.
> This makes it pretty useless against any sort of slow memory
> leaks or memory usage increases. This is especially true
> for swapless systems. If swap is enabled, memory soft protection
> effectively postpones problems, allowing a leaking application
> to fill all swap area, which makes no sense.
> The only effective way to guarantee the memory protection
> in this case is to invoke the OOM killer.
>
> This patch introduces the memory.min interface for cgroup v2
> memory controller. It works very similarly to memory.low
> (sharing the same hierarchical behavior), except that it's
> not disabled if there is no more reclaimable memory in the system.
>
> Signed-off-by: Roman Gushchin <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Vladimir Davydov <[email protected]>
> Cc: Tejun Heo <[email protected]>
> ---
> Documentation/cgroup-v2.txt | 24 ++++++++++-
> include/linux/memcontrol.h | 15 ++++++-
> include/linux/page_counter.h | 11 ++++-
> mm/memcontrol.c | 99 ++++++++++++++++++++++++++++++++++++--------
> mm/page_counter.c | 63 ++++++++++++++++++++--------
> mm/vmscan.c | 19 ++++++++-
> 6 files changed, 191 insertions(+), 40 deletions(-)
>
> diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
> index 657fe1769c75..a413118b9c29 100644
> --- a/Documentation/cgroup-v2.txt
> +++ b/Documentation/cgroup-v2.txt
> @@ -1002,6 +1002,26 @@ PAGE_SIZE multiple when read back.
> The total amount of memory currently being used by the cgroup
> and its descendants.
>
> + memory.min
> + A read-write single value file which exists on non-root
> + cgroups. The default is "0".
> +
> + Hard memory protection. If the memory usage of a cgroup
> + is within its effective min boundary, the cgroup's memory
> + won't be reclaimed under any conditions. If there is no
> + unprotected reclaimable memory available, OOM killer
> + is invoked.
> +
> + Effective low boundary is limited by memory.min values of
> + all ancestor cgroups. If there is memory.min overcommitment
> + (child cgroup or cgroups are requiring more protected memory
> + than parent will allow), then each child cgroup will get
> + the part of parent's protection proportional to its
> + actual memory usage below memory.min.
> +
> + Putting more memory than generally available under this
> + protection is discouraged and may lead to constant OOMs.
> +
> memory.low
> A read-write single value file which exists on non-root
> cgroups. The default is "0".
> @@ -1013,9 +1033,9 @@ PAGE_SIZE multiple when read back.
>
> Effective low boundary is limited by memory.low values of
> all ancestor cgroups. If there is memory.low overcommitment
> - (child cgroup or cgroups are requiring more protected memory,
> + (child cgroup or cgroups are requiring more protected memory
> than parent will allow), then each child cgroup will get
> - the part of parent's protection proportional to the its
> + the part of parent's protection proportional to its
> actual memory usage below memory.low.
>
> Putting more memory than generally available under this



--
~Randy

2018-04-20 20:47:47

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: introduce memory.min

Hi Roman,

this looks cool, and the implementation makes sense to me, so the
feedback I have is just smaller stuff.

On Fri, Apr 20, 2018 at 05:36:31PM +0100, Roman Gushchin wrote:
> Memory controller implements the memory.low best-effort memory
> protection mechanism, which works perfectly in many cases and
> allows protecting working sets of important workloads from
> sudden reclaim.
>
> But it's semantics has a significant limitation: it works

its

> only until there is a supply of reclaimable memory.

s/until/as long as/

Maybe "as long as there is an unprotected supply of reclaimable memory
from other groups"?

> This makes it pretty useless against any sort of slow memory
> leaks or memory usage increases. This is especially true
> for swapless systems. If swap is enabled, memory soft protection
> effectively postpones problems, allowing a leaking application
> to fill all swap area, which makes no sense.
> The only effective way to guarantee the memory protection
> in this case is to invoke the OOM killer.

This makes it sound like it has no purpose at all. But like with
memory.high, the point is to avoid the kernel OOM killer, which knows
jack about how the system is structured, and instead allow userspace
management software to monitor MEMCG_LOW and make informed decisions.

memory.min again is the fail-safe for memory.low, not the primary way
of implementing guarantees.

> @@ -297,7 +297,14 @@ static inline bool mem_cgroup_disabled(void)
> return !cgroup_subsys_enabled(memory_cgrp_subsys);
> }
>
> -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
> +enum mem_cgroup_protection {
> + MEM_CGROUP_UNPROTECTED,
> + MEM_CGROUP_PROTECTED_LOW,
> + MEM_CGROUP_PROTECTED_MIN,
> +};

These look a bit unwieldy. How about

MEMCG_PROT_NONE,
MEMCG_PROT_LOW,
MEMCG_PROT_HIGH,

> +enum mem_cgroup_protection
> +mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg);

Please don't wrap at the return type, wrap the parameter list instead.

> int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> gfp_t gfp_mask, struct mem_cgroup **memcgp,
> @@ -756,6 +763,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
> {
> }
>
> +static inline bool mem_cgroup_min(struct mem_cgroup *root,
> + struct mem_cgroup *memcg)
> +{
> + return false;
> +}
> +
> static inline bool mem_cgroup_low(struct mem_cgroup *root,
> struct mem_cgroup *memcg)
> {

The real implementation has these merged into the single
mem_cgroup_protected(). Probably a left-over from earlier versions?

It's always a good idea to build test the !CONFIG_MEMCG case too.

> @@ -5685,44 +5723,71 @@ struct cgroup_subsys memory_cgrp_subsys = {
> * for next usage. This part is intentionally racy, but it's ok,
> * as memory.low is a best-effort mechanism.
> */
> -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
> +enum mem_cgroup_protection
> +mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg)

Please fix the wrapping here too.

> @@ -2525,12 +2525,29 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> unsigned long reclaimed;
> unsigned long scanned;
>
> - if (mem_cgroup_low(root, memcg)) {
> + switch (mem_cgroup_protected(root, memcg)) {
> + case MEM_CGROUP_PROTECTED_MIN:
> + /*
> + * Hard protection.
> + * If there is no reclaimable memory, OOM.
> + */
> + continue;
> +
> + case MEM_CGROUP_PROTECTED_LOW:

Please drop that newline after continue.

> + /*
> + * Soft protection.
> + * Respect the protection only until there is
> + * a supply of reclaimable memory.

Same feedback here as in the changelog:

s/until/as long as/

Maybe "as long as there is an unprotected supply of reclaimable memory
from other groups"?

> + */
> if (!sc->memcg_low_reclaim) {
> sc->memcg_low_skipped = 1;
> continue;
> }
> memcg_memory_event(memcg, MEMCG_LOW);
> + break;
> +
> + case MEM_CGROUP_UNPROTECTED:

Please drop that newline after break, too.

Thanks!
Johannes

2018-04-20 20:58:14

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: move the high field from struct mem_cgroup to page_counter

On Fri, Apr 20, 2018 at 05:36:32PM +0100, Roman Gushchin wrote:
> We do store memory.min, memory.low and memory.max actual values
> in struct page_counter fields, while memory.high value is located
> in the struct mem_cgroup directly, which is not very consistent.
>
> This patch moves the high field from struct mem_cgroup to
> struct page_counter to simplify the code and make handling
> of all limits/boundaries clearer.

I would prefer not doing this.

Yes, it looks a bit neater if all these things are next to each other
in the struct, but on the other hand it separates the high variable
from high_work, and it adds an unnecessary setter function as well.

Plus, nothing in the page_counter code actually uses the value, it
really isn't part of that abstraction layer.

2018-04-23 12:44:12

by Roman Gushchin

[permalink] [raw]
Subject: Re: [PATCH 2/2] mm: move the high field from struct mem_cgroup to page_counter

On Fri, Apr 20, 2018 at 04:54:50PM -0400, Johannes Weiner wrote:
> On Fri, Apr 20, 2018 at 05:36:32PM +0100, Roman Gushchin wrote:
> > We do store memory.min, memory.low and memory.max actual values
> > in struct page_counter fields, while memory.high value is located
> > in the struct mem_cgroup directly, which is not very consistent.
> >
> > This patch moves the high field from struct mem_cgroup to
> > struct page_counter to simplify the code and make handling
> > of all limits/boundaries clearer.
>
> I would prefer not doing this.
>
> Yes, it looks a bit neater if all these things are next to each other
> in the struct, but on the other hand it separates the high variable
> from high_work, and it adds an unnecessary setter function as well.
>
> Plus, nothing in the page_counter code actually uses the value, it
> really isn't part of that abstraction layer.
>

Ok, not a problem.
It's nice to have all 4 limits in one place, but separating
high and high_work isn't good, I agree. Let's leave it as it is.

Thanks!

2018-04-23 12:47:45

by Roman Gushchin

[permalink] [raw]
Subject: Re: [PATCH 1/2] mm: introduce memory.min

Hi Johannes!

Thanks for the review. I've just sent v2, which closely
follows your advice, really nothing contradictory.

Plus fixed some comments on top of mem_cgroup_protected
and fixed !CONFIG_MEMCG build.

Thank you!

Roman

On Fri, Apr 20, 2018 at 04:44:29PM -0400, Johannes Weiner wrote:
> Hi Roman,
>
> this looks cool, and the implementation makes sense to me, so the
> feedback I have is just smaller stuff.
>
> On Fri, Apr 20, 2018 at 05:36:31PM +0100, Roman Gushchin wrote:
> > Memory controller implements the memory.low best-effort memory
> > protection mechanism, which works perfectly in many cases and
> > allows protecting working sets of important workloads from
> > sudden reclaim.
> >
> > But it's semantics has a significant limitation: it works
>
> its
>
> > only until there is a supply of reclaimable memory.
>
> s/until/as long as/
>
> Maybe "as long as there is an unprotected supply of reclaimable memory
> from other groups"?
>
> > This makes it pretty useless against any sort of slow memory
> > leaks or memory usage increases. This is especially true
> > for swapless systems. If swap is enabled, memory soft protection
> > effectively postpones problems, allowing a leaking application
> > to fill all swap area, which makes no sense.
> > The only effective way to guarantee the memory protection
> > in this case is to invoke the OOM killer.
>
> This makes it sound like it has no purpose at all. But like with
> memory.high, the point is to avoid the kernel OOM killer, which knows
> jack about how the system is structured, and instead allow userspace
> management software to monitor MEMCG_LOW and make informed decisions.
>
> memory.min again is the fail-safe for memory.low, not the primary way
> of implementing guarantees.
>
> > @@ -297,7 +297,14 @@ static inline bool mem_cgroup_disabled(void)
> > return !cgroup_subsys_enabled(memory_cgrp_subsys);
> > }
> >
> > -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
> > +enum mem_cgroup_protection {
> > + MEM_CGROUP_UNPROTECTED,
> > + MEM_CGROUP_PROTECTED_LOW,
> > + MEM_CGROUP_PROTECTED_MIN,
> > +};
>
> These look a bit unwieldy. How about
>
> MEMCG_PROT_NONE,
> MEMCG_PROT_LOW,
> MEMCG_PROT_HIGH,
>
> > +enum mem_cgroup_protection
> > +mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg);
>
> Please don't wrap at the return type, wrap the parameter list instead.
>
> > int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> > gfp_t gfp_mask, struct mem_cgroup **memcgp,
> > @@ -756,6 +763,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
> > {
> > }
> >
> > +static inline bool mem_cgroup_min(struct mem_cgroup *root,
> > + struct mem_cgroup *memcg)
> > +{
> > + return false;
> > +}
> > +
> > static inline bool mem_cgroup_low(struct mem_cgroup *root,
> > struct mem_cgroup *memcg)
> > {
>
> The real implementation has these merged into the single
> mem_cgroup_protected(). Probably a left-over from earlier versions?
>
> It's always a good idea to build test the !CONFIG_MEMCG case too.
>
> > @@ -5685,44 +5723,71 @@ struct cgroup_subsys memory_cgrp_subsys = {
> > * for next usage. This part is intentionally racy, but it's ok,
> > * as memory.low is a best-effort mechanism.
> > */
> > -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
> > +enum mem_cgroup_protection
> > +mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg)
>
> Please fix the wrapping here too.
>
> > @@ -2525,12 +2525,29 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> > unsigned long reclaimed;
> > unsigned long scanned;
> >
> > - if (mem_cgroup_low(root, memcg)) {
> > + switch (mem_cgroup_protected(root, memcg)) {
> > + case MEM_CGROUP_PROTECTED_MIN:
> > + /*
> > + * Hard protection.
> > + * If there is no reclaimable memory, OOM.
> > + */
> > + continue;
> > +
> > + case MEM_CGROUP_PROTECTED_LOW:
>
> Please drop that newline after continue.
>
> > + /*
> > + * Soft protection.
> > + * Respect the protection only until there is
> > + * a supply of reclaimable memory.
>
> Same feedback here as in the changelog:
>
> s/until/as long as/
>
> Maybe "as long as there is an unprotected supply of reclaimable memory
> from other groups"?
>
> > + */
> > if (!sc->memcg_low_reclaim) {
> > sc->memcg_low_skipped = 1;
> > continue;
> > }
> > memcg_memory_event(memcg, MEMCG_LOW);
> > + break;
> > +
> > + case MEM_CGROUP_UNPROTECTED:
>
> Please drop that newline after break, too.
>
> Thanks!
> Johannes