2015-12-10 11:39:31

by Vladimir Davydov

Subject: [PATCH 0/7] Add swap accounting to cgroup2

Hi,

This patch set adds swap accounting to cgroup2. In contrast to the
legacy hierarchy, actual swap usage is accounted. It can be controlled
and monitored using new files, memory.swap.current and memory.swap.max.
For more details, please see patch 1 of the series, which introduces the
new counter. Patches 2-6 make memcg reclaim follow the heuristics used by
global reclaim for handling anon/swap. Patch 7 updates the documentation.
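
For illustration, a minimal userspace sketch of the new interface follows,
assuming the memory controller is enabled on the unified hierarchy and a
cgroup directory /sys/fs/cgroup/test exists (hypothetical path); it caps the
group's swap usage at 64M and reads back the current usage:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	/* Cap swap usage at 64M; writing "max" removes the limit. */
	fd = open("/sys/fs/cgroup/test/memory.swap.max", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "67108864", 8) != 8)
		perror("write memory.swap.max");
	close(fd);

	/* Read the current swap usage (bytes, a PAGE_SIZE multiple). */
	fd = open("/sys/fs/cgroup/test/memory.swap.current", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("swap usage: %s", buf);
	}
	close(fd);
	return 0;
}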

Thanks,

Vladimir Davydov (7):
mm: memcontrol: charge swap to cgroup2
mm: vmscan: pass memcg to get_scan_count()
mm: memcontrol: replace mem_cgroup_lruvec_online with
mem_cgroup_online
swap.h: move memcg related stuff to the end of the file
mm: vmscan: do not scan anon pages if memcg swap limit is hit
mm: free swap cache aggressively if memcg swap is full
Documentation: cgroup: add memory.swap.{current,max} description

Documentation/cgroup.txt | 16 +++++
include/linux/memcontrol.h | 28 ++++----
include/linux/swap.h | 75 +++++++++++++--------
mm/memcontrol.c | 159 ++++++++++++++++++++++++++++++++++++++++++---
mm/memory.c | 3 +-
mm/shmem.c | 4 ++
mm/swap_state.c | 5 ++
mm/swapfile.c | 2 +-
mm/vmscan.c | 26 ++++----
9 files changed, 249 insertions(+), 69 deletions(-)

--
2.1.4


2015-12-10 11:39:36

by Vladimir Davydov

Subject: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

In the legacy hierarchy we charge memsw, which is dubious, because:

- memsw.limit must be >= memory.limit, so it is impossible to limit
swap usage less than memory usage. Taking into account the fact that
the primary limiting mechanism in the unified hierarchy is
memory.high while memory.limit is either left unset or set to a very
large value, moving memsw.limit knob to the unified hierarchy would
effectively make it impossible to limit swap usage according to the
user preference.

- memsw.usage != memory.usage + swap.usage, because a page occupying
both swap entry and a swap cache page is charged only once to memsw
counter. As a result, it is possible to effectively eat up to
memory.limit of memory pages *and* memsw.limit of swap entries, which
looks unexpected.

That said, we should provide a different swap limiting mechanism for
cgroup2.

This patch adds mem_cgroup->swap counter, which charges the actual
number of swap entries used by a cgroup. It is only charged in the
unified hierarchy, while the legacy hierarchy memsw logic is left
intact.

The swap usage can be monitored using new memory.swap.current file and
limited using memory.swap.max.

Signed-off-by: Vladimir Davydov <[email protected]>
---
include/linux/memcontrol.h | 1 +
include/linux/swap.h | 5 ++
mm/memcontrol.c | 123 +++++++++++++++++++++++++++++++++++++++++----
mm/shmem.c | 4 ++
mm/swap_state.c | 5 ++
5 files changed, 129 insertions(+), 9 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c6a5ed2f2744..993c9a26b637 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -169,6 +169,7 @@ struct mem_cgroup {

/* Accounted resources */
struct page_counter memory;
+ struct page_counter swap;
struct page_counter memsw;
struct page_counter kmem;

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 457181844b6e..f4b3ccdcba91 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -368,11 +368,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
#endif
#ifdef CONFIG_MEMCG_SWAP
extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
+extern int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry);
extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
#else
static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
}
+static inline int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
+{
+ return 0;
+}
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7f5c6abf5421..9d10e2819ec4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1212,7 +1212,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont(":");

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
K(mem_cgroup_read_stat(iter, i)));
@@ -1248,12 +1248,15 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
unsigned long limit;

- limit = memcg->memory.limit;
+ limit = READ_ONCE(memcg->memory.limit);
if (mem_cgroup_swappiness(memcg)) {
unsigned long memsw_limit;
+ unsigned long swap_limit;

- memsw_limit = memcg->memsw.limit;
- limit = min(limit + total_swap_pages, memsw_limit);
+ memsw_limit = READ_ONCE(memcg->memsw.limit);
+ swap_limit = min(READ_ONCE(memcg->swap.limit),
+ (unsigned long)total_swap_pages);
+ limit = min(limit + swap_limit, memsw_limit);
}
return limit;
}
@@ -4226,6 +4229,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->memory, NULL);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
}
@@ -4276,6 +4280,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
page_counter_init(&memcg->memory, &parent->memory);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
@@ -4291,6 +4296,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
page_counter_init(&memcg->memory, NULL);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
#if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
@@ -5291,7 +5297,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
if (page->mem_cgroup)
goto out;

- if (do_memsw_account()) {
+ if (do_swap_account) {
swp_entry_t ent = { .val = page_private(page), };
unsigned short id = lookup_swap_cgroup_id(ent);

@@ -5754,26 +5760,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
memcg_check_events(memcg, page);
}

+/*
+ * mem_cgroup_charge_swap - charge a swap entry
+ * @page: page being added to swap
+ * @entry: swap entry to charge
+ *
+ * Try to charge @entry to the memcg that @page belongs to.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ struct page_counter *counter;
+ unsigned short oldid;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+ return 0;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return 0;
+
+ if (!mem_cgroup_is_root(memcg) &&
+ !page_counter_try_charge(&memcg->swap, 1, &counter))
+ return -ENOMEM;
+
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ VM_BUG_ON_PAGE(oldid, page);
+ mem_cgroup_swap_statistics(memcg, true);
+
+ css_get(&memcg->css);
+ return 0;
+}
+
/**
* mem_cgroup_uncharge_swap - uncharge a swap entry
* @entry: swap entry to uncharge
*
- * Drop the memsw charge associated with @entry.
+ * Drop the swap charge associated with @entry.
*/
void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
struct mem_cgroup *memcg;
unsigned short id;

- if (!do_memsw_account())
+ if (!do_swap_account)
return;

id = swap_cgroup_record(entry, 0);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memsw, 1);
+ if (!mem_cgroup_is_root(memcg)) {
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->swap, 1);
+ else
+ page_counter_uncharge(&memcg->memsw, 1);
+ }
mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css);
}
@@ -5797,6 +5843,63 @@ static int __init enable_swap_account(char *s)
}
__setup("swapaccount=", enable_swap_account);

+static u64 swap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
+}
+
+static int swap_max_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long max = READ_ONCE(memcg->swap.limit);
+
+ if (max == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
+
+ return 0;
+}
+
+static ssize_t swap_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ mutex_lock(&memcg_limit_mutex);
+ err = page_counter_limit(&memcg->swap, max);
+ mutex_unlock(&memcg_limit_mutex);
+ if (err)
+ return err;
+
+ return nbytes;
+}
+
+static struct cftype swap_files[] = {
+ {
+ .name = "swap.current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_current_read,
+ },
+ {
+ .name = "swap.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_max_show,
+ .write = swap_max_write,
+ },
+ { } /* terminate */
+};
+
static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
@@ -5828,6 +5931,8 @@ static int __init mem_cgroup_swap_init(void)
{
if (!mem_cgroup_disabled() && really_do_swap_account) {
do_swap_account = 1;
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
+ swap_files));
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
memsw_cgroup_files));
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 9b051115a100..659a90d8305c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -912,6 +912,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (!swap.val)
goto redirty;

+ if (mem_cgroup_charge_swap(page, swap))
+ goto free_swap;
+
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the page is
@@ -940,6 +943,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}

mutex_unlock(&shmem_swaplist_mutex);
+free_swap:
swapcache_free(swap);
redirty:
set_page_dirty(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d783872d746c..dea39cb03967 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list)
if (!entry.val)
return 0;

+ if (mem_cgroup_charge_swap(page, entry)) {
+ swapcache_free(entry);
+ return 0;
+ }
+
if (unlikely(PageTransHuge(page)))
if (unlikely(split_huge_page_to_list(page, list))) {
swapcache_free(entry);
--
2.1.4

2015-12-10 11:39:40

by Vladimir Davydov

Subject: [PATCH 2/7] mm: vmscan: pass memcg to get_scan_count()

memcg will come in handy in get_scan_count(). It can already be used for
getting swappiness immediately in get_scan_count() instead of passing it
around. The following patches will add more memcg-related values, which
will be used there.

Signed-off-by: Vladimir Davydov <[email protected]>
---
mm/vmscan.c | 20 ++++++++------------
1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index bb01b04154ad..acc6bff84e26 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1957,10 +1957,11 @@ enum scan_balance {
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
-static void get_scan_count(struct lruvec *lruvec, int swappiness,
+static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *nr,
unsigned long *lru_pages)
{
+ int swappiness = mem_cgroup_swappiness(memcg);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
@@ -2184,9 +2185,10 @@ static inline void init_tlb_ubc(void)
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
- struct scan_control *sc, unsigned long *lru_pages)
+static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long *lru_pages)
{
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
@@ -2196,7 +2198,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
struct blk_plug plug;
bool scan_adjusted;

- get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
+ get_scan_count(lruvec, memcg, sc, nr, lru_pages);

/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2400,8 +2402,6 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
unsigned long lru_pages;
unsigned long reclaimed;
unsigned long scanned;
- struct lruvec *lruvec;
- int swappiness;

if (mem_cgroup_low(root, memcg)) {
if (!sc->may_thrash)
@@ -2409,12 +2409,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
mem_cgroup_events(memcg, MEMCG_LOW, 1);
}

- lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- swappiness = mem_cgroup_swappiness(memcg);
reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;

- shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
+ shrink_zone_memcg(zone, memcg, sc, &lru_pages);
zone_lru_pages += lru_pages;

if (memcg && is_classzone)
@@ -2884,8 +2882,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !noswap,
};
- struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- int swappiness = mem_cgroup_swappiness(memcg);
unsigned long lru_pages;

sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2902,7 +2898,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
+ shrink_zone_memcg(zone, memcg, &sc, &lru_pages);

trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

--
2.1.4

2015-12-10 11:39:42

by Vladimir Davydov

Subject: [PATCH 3/7] mm: memcontrol: replace mem_cgroup_lruvec_online with mem_cgroup_online

mem_cgroup_lruvec_online() takes lruvec, but it only needs memcg. Since
get_scan_count(), which is the only user of this function, now possesses a
pointer to memcg, let's pass memcg directly to mem_cgroup_online()
instead of picking it out of lruvec, and rename the function accordingly.

Signed-off-by: Vladimir Davydov <[email protected]>
---
include/linux/memcontrol.h | 27 ++++++++++-----------------
mm/vmscan.c | 2 +-
2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 993c9a26b637..c9a14e1eab62 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -361,6 +361,13 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}

+static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
+{
+ if (mem_cgroup_disabled())
+ return true;
+ return !!(memcg->css.flags & CSS_ONLINE);
+}
+
/*
* For memory reclaim.
*/
@@ -369,20 +376,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages);

-static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
- struct mem_cgroup_per_zone *mz;
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return true;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- memcg = mz->memcg;
-
- return !!(memcg->css.flags & CSS_ONLINE);
-}
-
static inline
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
@@ -706,13 +699,13 @@ static inline bool mem_cgroup_disabled(void)
return true;
}

-static inline bool
-mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
return true;
}

-static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+static inline bool
+mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
return true;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index acc6bff84e26..b220e6cda25d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1988,7 +1988,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
if (current_is_kswapd()) {
if (!zone_reclaimable(zone))
force_scan = true;
- if (!mem_cgroup_lruvec_online(lruvec))
+ if (!mem_cgroup_online(memcg))
force_scan = true;
}
if (!global_reclaim(sc))
--
2.1.4

2015-12-10 11:39:47

by Vladimir Davydov

Subject: [PATCH 4/7] swap.h: move memcg related stuff to the end of the file

The following patches will add more functions to the memcg section of
include/linux/swap.h. Some of them will need values defined below the
current location of the section. So let's move the section to the end of
the file. No functional changes intended.

Signed-off-by: Vladimir Davydov <[email protected]>
---
include/linux/swap.h | 68 ++++++++++++++++++++++++++++------------------------
1 file changed, 37 insertions(+), 31 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index f4b3ccdcba91..66ea62cf256d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -350,38 +350,7 @@ extern void check_move_unevictable_pages(struct page **, int nr_pages);

extern int kswapd_run(int nid);
extern void kswapd_stop(int nid);
-#ifdef CONFIG_MEMCG
-static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
- /* root ? */
- if (mem_cgroup_disabled() || !memcg->css.parent)
- return vm_swappiness;
-
- return memcg->swappiness;
-}

-#else
-static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
-{
- return vm_swappiness;
-}
-#endif
-#ifdef CONFIG_MEMCG_SWAP
-extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
-extern int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry);
-extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
-#else
-static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
-{
-}
-static inline int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
-{
- return 0;
-}
-static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
-{
-}
-#endif
#ifdef CONFIG_SWAP
/* linux/mm/page_io.c */
extern int swap_readpage(struct page *);
@@ -560,5 +529,42 @@ static inline swp_entry_t get_swap_page(void)
}

#endif /* CONFIG_SWAP */
+
+#ifdef CONFIG_MEMCG
+static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+{
+ /* root ? */
+ if (mem_cgroup_disabled() || !memcg->css.parent)
+ return vm_swappiness;
+
+ return memcg->swappiness;
+}
+
+#else
+static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
+{
+ return vm_swappiness;
+}
+#endif
+
+#ifdef CONFIG_MEMCG_SWAP
+extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
+extern int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry);
+extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
+#else
+static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+}
+
+static inline int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
+{
+ return 0;
+}
+
+static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+}
+#endif
+
#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */
--
2.1.4

2015-12-10 11:39:51

by Vladimir Davydov

Subject: [PATCH 5/7] mm: vmscan: do not scan anon pages if memcg swap limit is hit

We don't scan anonymous memory if we ran out of swap, and neither should
we when the memcg swap limit is hit, because swapping out is impossible
anyway.

Signed-off-by: Vladimir Davydov <[email protected]>
---
include/linux/swap.h | 6 ++++++
mm/memcontrol.c | 13 +++++++++++++
mm/vmscan.c | 2 +-
3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 66ea62cf256d..e3344d8ca2e9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -551,6 +551,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
extern int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry);
extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
+extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
#else
static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
@@ -564,6 +565,11 @@ static inline int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
}
+
+static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
+{
+ return get_nr_swap_pages();
+}
#endif

#endif /* __KERNEL__*/
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9d10e2819ec4..2ee823d62f80 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5826,6 +5826,19 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
rcu_read_unlock();
}

+long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
+{
+ long nr_swap_pages = get_nr_swap_pages();
+
+ if (!do_swap_account)
+ return nr_swap_pages;
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ nr_swap_pages = min_t(long, nr_swap_pages,
+ READ_ONCE(memcg->swap.limit) -
+ page_counter_read(&memcg->swap));
+ return nr_swap_pages;
+}
+
/* for remember boot option*/
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b220e6cda25d..ab52d865d922 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1995,7 +1995,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
force_scan = true;

/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
scan_balance = SCAN_FILE;
goto out;
}
--
2.1.4

2015-12-10 11:39:56

by Vladimir Davydov

Subject: [PATCH 6/7] mm: free swap cache aggressively if memcg swap is full

Swap cache pages are freed aggressively if swap is nearly full (>50%
currently), because otherwise we are likely to stop scanning anonymous
pages when we near the swap limit even if there are plenty of freeable
swap cache pages. We should follow the same trend in the case of a memory
cgroup, which has its own swap limit.

Signed-off-by: Vladimir Davydov <[email protected]>
---
include/linux/swap.h | 6 ++++++
mm/memcontrol.c | 23 +++++++++++++++++++++++
mm/memory.c | 3 ++-
mm/swapfile.c | 2 +-
mm/vmscan.c | 2 +-
5 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index e3344d8ca2e9..1d708860be97 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -552,6 +552,7 @@ extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
extern int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry);
extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
+extern bool mem_cgroup_swap_full(struct page *page);
#else
static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
@@ -570,6 +571,11 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
return get_nr_swap_pages();
}
+
+static inline bool mem_cgroup_swap_full(struct page *page)
+{
+ return vm_swap_full();
+}
#endif

#endif /* __KERNEL__*/
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2ee823d62f80..e5bd43340cd8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5839,6 +5839,29 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
return nr_swap_pages;
}

+bool mem_cgroup_swap_full(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (vm_swap_full())
+ return true;
+ if (!do_swap_account || !PageSwapCache(page))
+ return false;
+
+ memcg = page->mem_cgroup;
+ if (!memcg)
+ return false;
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ if (page_counter_read(&memcg->swap) * 2 >=
+ READ_ONCE(memcg->swap.limit))
+ return true;
+ }
+ return false;
+}
+
/* for remember boot option*/
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
diff --git a/mm/memory.c b/mm/memory.c
index 3b115dcaa26e..2bd6a78c142b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2563,7 +2563,8 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}

swap_free(entry);
- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ if (mem_cgroup_swap_full(page) ||
+ (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7073faecb38f..c0aba04f7a59 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1011,7 +1011,7 @@ int free_swap_and_cache(swp_entry_t entry)
* Also recheck PageSwapCache now page is locked (above).
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || vm_swap_full())) {
+ (!page_mapped(page) || mem_cgroup_swap_full(page))) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ab52d865d922..1cd88e9b0383 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1206,7 +1206,7 @@ cull_mlocked:

activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && vm_swap_full())
+ if (PageSwapCache(page) && mem_cgroup_swap_full(page))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
--
2.1.4

2015-12-10 11:39:59

by Vladimir Davydov

Subject: [PATCH 7/7] Documentation: cgroup: add memory.swap.{current,max} description

Signed-off-by: Vladimir Davydov <[email protected]>
---
Documentation/cgroup.txt | 16 ++++++++++++++++
1 file changed, 16 insertions(+)

diff --git a/Documentation/cgroup.txt b/Documentation/cgroup.txt
index 31d1f7bf12a1..21c6c013c339 100644
--- a/Documentation/cgroup.txt
+++ b/Documentation/cgroup.txt
@@ -819,6 +819,22 @@ PAGE_SIZE multiple when read back.
the cgroup. This may not exactly match the number of
processes killed but should generally be close.

+ memory.swap.current
+
+ A read-only single value file which exists on non-root
+ cgroups.
+
+ The total amount of swap currently being used by the cgroup
+ and its descendants.
+
+ memory.swap.max
+
+ A read-write single value file which exists on non-root
+ cgroups. The default is "max".
+
+ Swap usage hard limit. If a cgroup's swap usage reaches this
+ limit, anonymous memory of the cgroup will not be swapped out.
+

5-2-2. General Usage

--
2.1.4

2015-12-10 16:00:41

by Johannes Weiner

Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Thu, Dec 10, 2015 at 02:39:14PM +0300, Vladimir Davydov wrote:
> In the legacy hierarchy we charge memsw, which is dubious, because:
>
> - memsw.limit must be >= memory.limit, so it is impossible to limit
> swap usage less than memory usage. Taking into account the fact that
> the primary limiting mechanism in the unified hierarchy is
> memory.high while memory.limit is either left unset or set to a very
> large value, moving memsw.limit knob to the unified hierarchy would
> effectively make it impossible to limit swap usage according to the
> user preference.
>
> - memsw.usage != memory.usage + swap.usage, because a page occupying
> both swap entry and a swap cache page is charged only once to memsw
> counter. As a result, it is possible to effectively eat up to
> memory.limit of memory pages *and* memsw.limit of swap entries, which
> looks unexpected.
>
> That said, we should provide a different swap limiting mechanism for
> cgroup2.
>
> This patch adds mem_cgroup->swap counter, which charges the actual
> number of swap entries used by a cgroup. It is only charged in the
> unified hierarchy, while the legacy hierarchy memsw logic is left
> intact.
>
> The swap usage can be monitored using new memory.swap.current file and
> limited using memory.swap.max.
>
> Signed-off-by: Vladimir Davydov <[email protected]>

This looks great!

Acked-by: Johannes Weiner <[email protected]>

I have a few questions, but none of them show-stoppers:

> ---
> include/linux/memcontrol.h | 1 +
> include/linux/swap.h | 5 ++
> mm/memcontrol.c | 123 +++++++++++++++++++++++++++++++++++++++++----
> mm/shmem.c | 4 ++
> mm/swap_state.c | 5 ++
> 5 files changed, 129 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index c6a5ed2f2744..993c9a26b637 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -169,6 +169,7 @@ struct mem_cgroup {
>
> /* Accounted resources */
> struct page_counter memory;
> + struct page_counter swap;
> struct page_counter memsw;
> struct page_counter kmem;

We should probably separate this to differentiate the new counters
from the old ones. Only memory and swap are actual resources; the
memsw and kmem counters are consumer-oriented.
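
A sketch of the kind of grouping being suggested (fields abbreviated; this
is not the posted patch, just an illustration of the split):

struct mem_cgroup {
	/* ... */

	/* Accounted resources */
	struct page_counter memory;
	struct page_counter swap;

	/* Legacy consumer-oriented counters */
	struct page_counter memsw;
	struct page_counter kmem;

	/* ... */
};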

> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 457181844b6e..f4b3ccdcba91 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -368,11 +368,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
> #endif
> #ifdef CONFIG_MEMCG_SWAP
> extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
> +extern int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry);

Should this be mem_cgroup_try_charge_swap() to keep in line with the page
counter terminology? So it's clear this is not forcing a charge.

> @@ -1248,12 +1248,15 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
> {
> unsigned long limit;
>
> - limit = memcg->memory.limit;
> + limit = READ_ONCE(memcg->memory.limit);
> if (mem_cgroup_swappiness(memcg)) {
> unsigned long memsw_limit;
> + unsigned long swap_limit;
>
> - memsw_limit = memcg->memsw.limit;
> - limit = min(limit + total_swap_pages, memsw_limit);
> + memsw_limit = READ_ONCE(memcg->memsw.limit);
> + swap_limit = min(READ_ONCE(memcg->swap.limit),
> + (unsigned long)total_swap_pages);
> + limit = min(limit + swap_limit, memsw_limit);
> }
> return limit;

This is taking a racy snapshot, so we don't rely on 100% accuracy. Can
we do without the READ_ONCE()?

> @@ -5754,26 +5760,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
> memcg_check_events(memcg, page);
> }
>
> +/*
> + * mem_cgroup_charge_swap - charge a swap entry
> + * @page: page being added to swap
> + * @entry: swap entry to charge
> + *
> + * Try to charge @entry to the memcg that @page belongs to.
> + *
> + * Returns 0 on success, -ENOMEM on failure.
> + */
> +int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
> +{
> + struct mem_cgroup *memcg;
> + struct page_counter *counter;
> + unsigned short oldid;
> +
> + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
> + return 0;
> +
> + memcg = page->mem_cgroup;
> +
> + /* Readahead page, never charged */
> + if (!memcg)
> + return 0;
> +
> + if (!mem_cgroup_is_root(memcg) &&
> + !page_counter_try_charge(&memcg->swap, 1, &counter))
> + return -ENOMEM;
> +
> + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
> + VM_BUG_ON_PAGE(oldid, page);
> + mem_cgroup_swap_statistics(memcg, true);
> +
> + css_get(&memcg->css);

I think we don't have to duplicate the swap record code. Both cgroup1
and cgroup2 could run this function to handle the swapout record and
statistics, and then mem_cgroup_swapout() would simply uncharge memsw.
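
A rough sketch of that direction, reusing only the helpers visible in the
hunk above (illustrative only, not the posted patch; locking and the
corresponding mem_cgroup_swapout()/uncharge side are elided):

int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
{
	struct mem_cgroup *memcg = page->mem_cgroup;
	struct page_counter *counter;
	unsigned short oldid;

	if (!do_swap_account)
		return 0;

	/* Readahead page, never charged */
	if (!memcg)
		return 0;

	/* Only the unified hierarchy has a dedicated swap counter. */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
	    !mem_cgroup_is_root(memcg) &&
	    !page_counter_try_charge(&memcg->swap, 1, &counter))
		return -ENOMEM;

	/* Record the entry and bump statistics for both hierarchies, */
	/* so mem_cgroup_swapout() would only have to uncharge memsw.  */
	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
	VM_BUG_ON_PAGE(oldid, page);
	mem_cgroup_swap_statistics(memcg, true);
	css_get(&memcg->css);
	return 0;
}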

> @@ -5828,6 +5931,8 @@ static int __init mem_cgroup_swap_init(void)
> {
> if (!mem_cgroup_disabled() && really_do_swap_account) {
> do_swap_account = 1;
> + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
> + swap_files));
> WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
> memsw_cgroup_files));

I guess we could also support cgroup.memory=noswap.

2015-12-10 17:00:36

by Vladimir Davydov

Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Thu, Dec 10, 2015 at 11:00:27AM -0500, Johannes Weiner wrote:
> On Thu, Dec 10, 2015 at 02:39:14PM +0300, Vladimir Davydov wrote:
...
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index c6a5ed2f2744..993c9a26b637 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -169,6 +169,7 @@ struct mem_cgroup {
> >
> > /* Accounted resources */
> > struct page_counter memory;
> > + struct page_counter swap;
> > struct page_counter memsw;
> > struct page_counter kmem;
>
> We should probably separate this to differentiate the new counters
> from the old ones. Only memory and swap are actual resources; the
> memsw and kmem counters are consumer-oriented.

Yeah, but we'd better do it in a separate patch.

>
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 457181844b6e..f4b3ccdcba91 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -368,11 +368,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
> > #endif
> > #ifdef CONFIG_MEMCG_SWAP
> > extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
> > +extern int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry);
>
> Should this be mem_cgroup_try_charge_swap() to keep in line with the page
> counter terminology? So it's clear this is not forcing a charge.

Hmm, I thought we only use the try_charge name in memcontrol.c if there is
a commit stage, e.g. we have memcg_kmem_charge, not memcg_kmem_try_charge.
This conflicts with the page_counter semantics though, so we might want to
rename it.

>
> > @@ -1248,12 +1248,15 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
> > {
> > unsigned long limit;
> >
> > - limit = memcg->memory.limit;
> > + limit = READ_ONCE(memcg->memory.limit);
> > if (mem_cgroup_swappiness(memcg)) {
> > unsigned long memsw_limit;
> > + unsigned long swap_limit;
> >
> > - memsw_limit = memcg->memsw.limit;
> > - limit = min(limit + total_swap_pages, memsw_limit);
> > + memsw_limit = READ_ONCE(memcg->memsw.limit);
> > + swap_limit = min(READ_ONCE(memcg->swap.limit),
> > + (unsigned long)total_swap_pages);
> > + limit = min(limit + swap_limit, memsw_limit);
> > }
> > return limit;
>
> This is taking a racy snapshot, so we don't rely on 100% accuracy. Can
> we do without the READ_ONCE()?

Well, I suppose we can, but passing a concurrently modified value to the
min macro looks a bit scary to me. What if swap_limit is changed from a
finite value less than total_swap_pages to infinity while we are there? We
might use an infinite memory size in the OOM path, which would screw up
the OOM scores, AFAIU. Unlikely, but still.
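
As a userspace-only illustration of the concern: with a naive two-evaluation
MIN macro the hazard is easy to see (the kernel's min() evaluates each
argument once, but without READ_ONCE() the compiler is still free to reload
a plain memory location):

#include <limits.h>
#include <pthread.h>
#include <stdio.h>

/* Naive macro: evaluates each argument twice. */
#define NAIVE_MIN(a, b) ((a) < (b) ? (a) : (b))

static volatile unsigned long swap_limit = 100;	/* updated concurrently */

static void *raise_limit(void *arg)
{
	(void)arg;
	swap_limit = ULONG_MAX;		/* limit switched to "infinity" */
	return NULL;
}

int main(void)
{
	unsigned long total_swap_pages = 1000;
	pthread_t t;

	pthread_create(&t, NULL, raise_limit, NULL);
	/*
	 * The comparison may read swap_limit as 100 while the returned
	 * value re-reads it as ULONG_MAX, yielding a result larger than
	 * either argument as observed individually.  Snapshotting the
	 * limit into a local first (which READ_ONCE() guarantees even
	 * against compiler reloads) closes that window.
	 */
	printf("min = %lu\n", NAIVE_MIN(swap_limit, total_swap_pages));
	pthread_join(t, NULL);
	return 0;
}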

>
> > @@ -5754,26 +5760,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
> > memcg_check_events(memcg, page);
> > }
> >
> > +/*
> > + * mem_cgroup_charge_swap - charge a swap entry
> > + * @page: page being added to swap
> > + * @entry: swap entry to charge
> > + *
> > + * Try to charge @entry to the memcg that @page belongs to.
> > + *
> > + * Returns 0 on success, -ENOMEM on failure.
> > + */
> > +int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
> > +{
> > + struct mem_cgroup *memcg;
> > + struct page_counter *counter;
> > + unsigned short oldid;
> > +
> > + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
> > + return 0;
> > +
> > + memcg = page->mem_cgroup;
> > +
> > + /* Readahead page, never charged */
> > + if (!memcg)
> > + return 0;
> > +
> > + if (!mem_cgroup_is_root(memcg) &&
> > + !page_counter_try_charge(&memcg->swap, 1, &counter))
> > + return -ENOMEM;
> > +
> > + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
> > + VM_BUG_ON_PAGE(oldid, page);
> > + mem_cgroup_swap_statistics(memcg, true);
> > +
> > + css_get(&memcg->css);
>
> I think we don't have to duplicate the swap record code. Both cgroup1
> and cgroup2 could run this function to handle the swapout record and
> statistics, and then mem_cgroup_swapout() would simply uncharge memsw.

Well, maybe. I'm afraid this might make mem_cgroup_charge_swap look a
bit messy due to the need to check cgroup_subsys_on_dfl in the middle
of it, but I'll give it a try.

>
> > @@ -5828,6 +5931,8 @@ static int __init mem_cgroup_swap_init(void)
> > {
> > if (!mem_cgroup_disabled() && really_do_swap_account) {
> > do_swap_account = 1;
> > + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
> > + swap_files));
> > WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
> > memsw_cgroup_files));
>
> I guess we could also support cgroup.memory=noswap.
>

Yeah, that would be cleaner. I wonder if we could then drop the swapaccount
boot param so as not to clutter the API.

Thanks for the review!

2015-12-11 07:39:52

by Vladimir Davydov

Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Fri, Dec 11, 2015 at 11:48:57AM +0900, Kamezawa Hiroyuki wrote:
> On 2015/12/10 20:39, Vladimir Davydov wrote:
> > In the legacy hierarchy we charge memsw, which is dubious, because:
> >
> > - memsw.limit must be >= memory.limit, so it is impossible to limit
> > swap usage less than memory usage. Taking into account the fact that
> > the primary limiting mechanism in the unified hierarchy is
> > memory.high while memory.limit is either left unset or set to a very
> > large value, moving memsw.limit knob to the unified hierarchy would
> > effectively make it impossible to limit swap usage according to the
> > user preference.
> >
> > - memsw.usage != memory.usage + swap.usage, because a page occupying
> > both swap entry and a swap cache page is charged only once to memsw
> > counter. As a result, it is possible to effectively eat up to
> > memory.limit of memory pages *and* memsw.limit of swap entries, which
> > looks unexpected.
> >
> > That said, we should provide a different swap limiting mechanism for
> > cgroup2.
> >
> > This patch adds mem_cgroup->swap counter, which charges the actual
> > number of swap entries used by a cgroup. It is only charged in the
> > unified hierarchy, while the legacy hierarchy memsw logic is left
> > intact.
> >
> > The swap usage can be monitored using new memory.swap.current file and
> > limited using memory.swap.max.
> >
> > Signed-off-by: Vladimir Davydov <[email protected]>
>
> setting swap.max=0 will work like mlock?

For anonymous memory - yes.

Thanks,
Vladimir

2015-12-11 19:25:08

by Johannes Weiner

Subject: Re: [PATCH 2/7] mm: vmscan: pass memcg to get_scan_count()

On Thu, Dec 10, 2015 at 02:39:15PM +0300, Vladimir Davydov wrote:
> memcg will come in handy in get_scan_count(). It can already be used for
> getting swappiness immediately in get_scan_count() instead of passing it
> around. The following patches will add more memcg-related values, which
> will be used there.
>
> Signed-off-by: Vladimir Davydov <[email protected]>

Acked-by: Johannes Weiner <[email protected]>

2015-12-11 19:25:39

by Johannes Weiner

Subject: Re: [PATCH 3/7] mm: memcontrol: replace mem_cgroup_lruvec_online with mem_cgroup_online

On Thu, Dec 10, 2015 at 02:39:16PM +0300, Vladimir Davydov wrote:
> mem_cgroup_lruvec_online() takes lruvec, but it only needs memcg. Since
> get_scan_count(), which is the only user of this function, now possesses a
> pointer to memcg, let's pass memcg directly to mem_cgroup_online()
> instead of picking it out of lruvec, and rename the function accordingly.
>
> Signed-off-by: Vladimir Davydov <[email protected]>

Acked-by: Johannes Weiner <[email protected]>

2015-12-11 19:26:08

by Johannes Weiner

Subject: Re: [PATCH 4/7] swap.h: move memcg related stuff to the end of the file

On Thu, Dec 10, 2015 at 02:39:17PM +0300, Vladimir Davydov wrote:
> The following patches will add more functions to the memcg section of
> include/linux/swap.h. Some of them will need values defined below the
> current location of the section. So let's move the section to the end of
> the file. No functional changes intended.
>
> Signed-off-by: Vladimir Davydov <[email protected]>

Acked-by: Johannes Weiner <[email protected]>

2015-12-11 19:27:37

by Johannes Weiner

Subject: Re: [PATCH 5/7] mm: vmscan: do not scan anon pages if memcg swap limit is hit

On Thu, Dec 10, 2015 at 02:39:18PM +0300, Vladimir Davydov wrote:
> We don't scan anonymous memory if we ran out of swap, and neither should
> we when the memcg swap limit is hit, because swapping out is impossible
> anyway.
>
> Signed-off-by: Vladimir Davydov <[email protected]>

Acked-by: Johannes Weiner <[email protected]>

2015-12-11 19:34:13

by Johannes Weiner

Subject: Re: [PATCH 6/7] mm: free swap cache aggressively if memcg swap is full

On Thu, Dec 10, 2015 at 02:39:19PM +0300, Vladimir Davydov wrote:
> Swap cache pages are freed aggressively if swap is nearly full (>50%
> currently), because otherwise we are likely to stop scanning anonymous
> pages when we near the swap limit even if there are plenty of freeable
> swap cache pages. We should follow the same trend in the case of a memory
> cgroup, which has its own swap limit.
>
> Signed-off-by: Vladimir Davydov <[email protected]>

Acked-by: Johannes Weiner <[email protected]>

One note:

> @@ -5839,6 +5839,29 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
> return nr_swap_pages;
> }
>
> +bool mem_cgroup_swap_full(struct page *page)
> +{
> + struct mem_cgroup *memcg;
> +
> + VM_BUG_ON_PAGE(!PageLocked(page), page);
> +
> + if (vm_swap_full())
> + return true;
> + if (!do_swap_account || !PageSwapCache(page))
> + return false;

The callers establish PageSwapCache() under the page lock, which makes
sense since they only inquire about the swap state when deciding what
to do with a swapcache page at hand. So this check seems unnecessary.

2015-12-11 19:43:10

by Johannes Weiner

Subject: Re: [PATCH 7/7] Documentation: cgroup: add memory.swap.{current,max} description

On Thu, Dec 10, 2015 at 02:39:20PM +0300, Vladimir Davydov wrote:
> Signed-off-by: Vladimir Davydov <[email protected]>

Acked-by: Johannes Weiner <[email protected]>

Can we include a blurb for R-5-1 of cgroups.txt as well to explain why
cgroup2 has a new swap interface? I had already written something up
in the past, pasted below, feel free to use it if you like. Otherwise,
you had pretty good reasons in your swap controller changelog as well.

---

- The combined memory+swap accounting and limiting is replaced by real
control over swap space.

memory.swap.current

The amount of memory of this subtree that has been swapped to
disk.

memory.swap.max

The maximum amount of memory this subtree is allowed to swap
to disk.

The main argument for a combined memory+swap facility in the
original cgroup design was that global or parental pressure would
always be able to swap all anonymous memory of a child group,
regardless of the child's own (possibly untrusted) configuration.
However, untrusted groups can sabotage swapping by other means--such
as referencing their anonymous memory in a tight loop--and an admin
cannot assume full swappability when overcommitting untrusted jobs.

For trusted jobs, on the other hand, a combined counter is not an
intuitive userspace interface, and it flies in the face of the idea
that cgroup controllers should account and limit specific physical
resources. Swap space is a resource like all others in the system,
and that's why the unified hierarchy allows distributing it separately.

2015-12-12 16:19:09

by Vladimir Davydov

Subject: Re: [PATCH 6/7] mm: free swap cache aggressively if memcg swap is full

On Fri, Dec 11, 2015 at 02:33:58PM -0500, Johannes Weiner wrote:
> On Thu, Dec 10, 2015 at 02:39:19PM +0300, Vladimir Davydov wrote:
> > Swap cache pages are freed aggressively if swap is nearly full (>50%
> > currently), because otherwise we are likely to stop scanning anonymous
> > pages when we near the swap limit even if there are plenty of freeable
> > swap cache pages. We should follow the same trend in the case of a memory
> > cgroup, which has its own swap limit.
> >
> > Signed-off-by: Vladimir Davydov <[email protected]>
>
> Acked-by: Johannes Weiner <[email protected]>
>
> One note:
>
> > @@ -5839,6 +5839,29 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
> > return nr_swap_pages;
> > }
> >
> > +bool mem_cgroup_swap_full(struct page *page)
> > +{
> > + struct mem_cgroup *memcg;
> > +
> > + VM_BUG_ON_PAGE(!PageLocked(page), page);
> > +
> > + if (vm_swap_full())
> > + return true;
> > + if (!do_swap_account || !PageSwapCache(page))
> > + return false;
>
> The callers establish PageSwapCache() under the page lock, which makes
> sense since they only inquire about the swap state when deciding what
> to do with a swapcache page at hand. So this check seems unnecessary.

Yeah, you're right, we don't need it here. Will remove it in v2.

Besides, I think I should have inserted a cgroup_subsys_on_dfl check in
this function so that it wouldn't check the memcg->swap limit when the
legacy hierarchy is used. Will do.

Thanks,
Vladimir

2015-12-12 16:19:54

by Vladimir Davydov

Subject: Re: [PATCH 7/7] Documentation: cgroup: add memory.swap.{current,max} description

On Fri, Dec 11, 2015 at 02:42:54PM -0500, Johannes Weiner wrote:
> On Thu, Dec 10, 2015 at 02:39:20PM +0300, Vladimir Davydov wrote:
> > Signed-off-by: Vladimir Davydov <[email protected]>
>
> Acked-by: Johannes Weiner <[email protected]>
>
> Can we include a blurb for R-5-1 of cgroups.txt as well to explain why
> cgroup2 has a new swap interface? I had already written something up
> in the past, pasted below, feel free to use it if you like. Otherwise,
> you had pretty good reasons in your swap controller changelog as well.

Will do in v2.

Thanks!

2015-12-14 15:31:23

by Michal Hocko

Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
> In the legacy hierarchy we charge memsw, which is dubious, because:
>
> - memsw.limit must be >= memory.limit, so it is impossible to limit
> swap usage less than memory usage. Taking into account the fact that
> the primary limiting mechanism in the unified hierarchy is
> memory.high while memory.limit is either left unset or set to a very
> large value, moving memsw.limit knob to the unified hierarchy would
> effectively make it impossible to limit swap usage according to the
> user preference.
>
> - memsw.usage != memory.usage + swap.usage, because a page occupying
> both swap entry and a swap cache page is charged only once to memsw
> counter. As a result, it is possible to effectively eat up to
> memory.limit of memory pages *and* memsw.limit of swap entries, which
> looks unexpected.
>
> That said, we should provide a different swap limiting mechanism for
> cgroup2.
> This patch adds mem_cgroup->swap counter, which charges the actual
> number of swap entries used by a cgroup. It is only charged in the
> unified hierarchy, while the legacy hierarchy memsw logic is left
> intact.

I agree that the previous semantic was awkward. The problem I can see
with this approach is that once the swap limit is reached the anon
memory pressure might spill over to other and unrelated memcgs during
the global memory pressure. I guess this is what Kame referred to as
anon would become mlocked basically. This would be even more of an issue
with resource delegation to sub-hierarchies because nobody will prevent
setting the swap amount to a small value and use that as an anon memory
protection.

I guess this was the reason why this approach hasn't been chosen before
but I think we can come up with a way to stop the runaway consumption
even when the swap is accounted separately. All of them are quite nasty
but let me try.

We could allow charges to fail even for the high limit if the excess is
way above the amount of reclaimable memory in the given memcg/hierarchy.
A runaway load would be stopped before it can cause a considerable
damage outside of its hierarchy this way even when the swap limit
is configured small.
Now that goes against the high limit semantic which should only throttle
the consumer and shouldn't cause any functional failures but maybe this
is acceptable for the overall system stability. An alternative would
be to throttle in the high limit reclaim context proportionally to
the excess. This is normally done by the reclaim itself but with no
reclaimable memory this wouldn't work that way.

Another option would be to ignore the swap limit during the global
reclaim. This wouldn't stop the runaway loads but they would at least
see their fair share of the reclaim. The swap excess could be then used
as a "handicap" for a more aggressive throttling during high limit reclaim
or to trigger hard limit sooner.

Or we could teach the global OOM killer to select abusive anon memory
users with restricted swap. That would require iterating through all
memcgs and checking whether their anon consumption is in large excess of
their swap limit, and falling back to the memcg OOM victim selection if that
is the case. This adds more complexity to the OOM killer path so I am
not sure this is generally acceptable, though.

My question now is. Is the knob usable/useful even without additional
heuristics? Do we want to protect swap space so rigidly that a swap
limited memcg can cause bigger problems than without the swap limit
globally?

> The swap usage can be monitored using new memory.swap.current file and
> limited using memory.swap.max.
>
> Signed-off-by: Vladimir Davydov <[email protected]>
> ---
> include/linux/memcontrol.h | 1 +
> include/linux/swap.h | 5 ++
> mm/memcontrol.c | 123 +++++++++++++++++++++++++++++++++++++++++----
> mm/shmem.c | 4 ++
> mm/swap_state.c | 5 ++
> 5 files changed, 129 insertions(+), 9 deletions(-)

[...]
--
Michal Hocko
SUSE Labs

2015-12-14 15:48:51

by Johannes Weiner

Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Mon, Dec 14, 2015 at 04:30:37PM +0100, Michal Hocko wrote:
> On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
> > In the legacy hierarchy we charge memsw, which is dubious, because:
> >
> > - memsw.limit must be >= memory.limit, so it is impossible to limit
> > swap usage less than memory usage. Taking into account the fact that
> > the primary limiting mechanism in the unified hierarchy is
> > memory.high while memory.limit is either left unset or set to a very
> > large value, moving memsw.limit knob to the unified hierarchy would
> > effectively make it impossible to limit swap usage according to the
> > user preference.
> >
> > - memsw.usage != memory.usage + swap.usage, because a page occupying
> > both swap entry and a swap cache page is charged only once to memsw
> > counter. As a result, it is possible to effectively eat up to
> > memory.limit of memory pages *and* memsw.limit of swap entries, which
> > looks unexpected.
> >
> > That said, we should provide a different swap limiting mechanism for
> > cgroup2.
> > This patch adds mem_cgroup->swap counter, which charges the actual
> > number of swap entries used by a cgroup. It is only charged in the
> > unified hierarchy, while the legacy hierarchy memsw logic is left
> > intact.
>
> I agree that the previous semantic was awkward. The problem I can see
> with this approach is that once the swap limit is reached the anon
> memory pressure might spill over to other and unrelated memcgs during
> the global memory pressure. I guess this is what Kame referred to as
> anon would become mlocked basically. This would be even more of an issue
> with resource delegation to sub-hierarchies because nobody will prevent
> setting the swap amount to a small value and use that as an anon memory
> protection.

Overcommitting untrusted workloads is already problematic because
reclaim is based on heuristics and references, and a malicious
workload can already interfere with it and create pressure on the
system or its neighboring groups. This patch doesn't make it better,
but it's not a new problem.

If you don't trust subhierarchies, don't give them more memory than
you can handle them taking. Then the swap you give them is a resource
for them to use on top of that memory, not for you at the top level.

2015-12-14 19:43:14

by Vladimir Davydov

Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Mon, Dec 14, 2015 at 04:30:37PM +0100, Michal Hocko wrote:
> On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
> > In the legacy hierarchy we charge memsw, which is dubious, because:
> >
> > - memsw.limit must be >= memory.limit, so it is impossible to limit
> > swap usage less than memory usage. Taking into account the fact that
> > the primary limiting mechanism in the unified hierarchy is
> > memory.high while memory.limit is either left unset or set to a very
> > large value, moving memsw.limit knob to the unified hierarchy would
> > effectively make it impossible to limit swap usage according to the
> > user preference.
> >
> > - memsw.usage != memory.usage + swap.usage, because a page occupying
> > both swap entry and a swap cache page is charged only once to memsw
> > counter. As a result, it is possible to effectively eat up to
> > memory.limit of memory pages *and* memsw.limit of swap entries, which
> > looks unexpected.
> >
> > That said, we should provide a different swap limiting mechanism for
> > cgroup2.
> > This patch adds mem_cgroup->swap counter, which charges the actual
> > number of swap entries used by a cgroup. It is only charged in the
> > unified hierarchy, while the legacy hierarchy memsw logic is left
> > intact.
>
> I agree that the previous semantic was awkward. The problem I can see
> with this approach is that once the swap limit is reached the anon
> memory pressure might spill over to other and unrelated memcgs during
> the global memory pressure. I guess this is what Kame referred to as
> anon would become mlocked basically. This would be even more of an issue
> with resource delegation to sub-hierarchies because nobody will prevent
> setting the swap amount to a small value and use that as an anon memory
> protection.

AFAICS such anon memory protection has a side-effect: real-life
workloads need page cache to run smoothly (at least for mapping
executables). Disabling swapping would switch pressure to page caches,
resulting in performance degradation. So, I don't think per memcg swap
limit can be abused to boost your workload on an overcommitted system.

If you mean malicious users, well, they already have plenty ways to eat
all available memory up to the hard limit by creating unreclaimable
kernel objects.

Anyway, if you don't trust a container you'd better set the hard memory
limit so that it can't hurt others no matter what it runs and how it
tweaks its sub-tree knobs.

...
> My question now is. Is the knob usable/useful even without additional
> heuristics? Do we want to protect swap space so rigidly that a swap
> limited memcg can cause bigger problems than without the swap limit
> globally?

Hmm, I don't see why problems might get bigger with per memcg swap limit
than w/o it. W/o swap limit, a memcg can eat all swap space on the host
and disable swapping for everyone, not just for itself alone.

Thanks,
Vladimir

2015-12-14 19:53:00

by Alan Cox

Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

> Anyway, if you don't trust a container you'd better set the hard memory
> limit so that it can't hurt others no matter what it runs and how it
> tweaks its sub-tree knobs.

If you don't trust it, put it in a VM. If it's got access to GEM graphics
ioctls/nodes or some other kernel interfaces then it can blow up the
kernel without trying hard unless it's constrained within a VM. VMs can
be extremely lightweight if you avoid KVM emulating an entire PC.

Alan

2015-12-15 03:23:24

by Kamezawa Hiroyuki

Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On 2015/12/15 0:30, Michal Hocko wrote:
> On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
>> In the legacy hierarchy we charge memsw, which is dubious, because:
>>
>> - memsw.limit must be >= memory.limit, so it is impossible to limit
>> swap usage less than memory usage. Taking into account the fact that
>> the primary limiting mechanism in the unified hierarchy is
>> memory.high while memory.limit is either left unset or set to a very
>> large value, moving memsw.limit knob to the unified hierarchy would
>> effectively make it impossible to limit swap usage according to the
>> user preference.
>>
>> - memsw.usage != memory.usage + swap.usage, because a page occupying
>> both swap entry and a swap cache page is charged only once to memsw
>> counter. As a result, it is possible to effectively eat up to
>> memory.limit of memory pages *and* memsw.limit of swap entries, which
>> looks unexpected.
>>
>> That said, we should provide a different swap limiting mechanism for
>> cgroup2.
>> This patch adds mem_cgroup->swap counter, which charges the actual
>> number of swap entries used by a cgroup. It is only charged in the
>> unified hierarchy, while the legacy hierarchy memsw logic is left
>> intact.
>
> I agree that the previous semantic was awkward. The problem I can see
> with this approach is that once the swap limit is reached the anon
> memory pressure might spill over to other and unrelated memcgs during
> the global memory pressure. I guess this is what Kame referred to as
> anon would become mlocked basically. This would be even more of an issue
> with resource delegation to sub-hierarchies because nobody will prevent
> setting the swap amount to a small value and use that as an anon memory
> protection.
>
> I guess this was the reason why this approach hasn't been chosen before

Yes. At that age, "never break global VM" was the policy. And "mlock" can be
used for attacking system.

> but I think we can come up with a way to stop the run away consumption
> even when the swap is accounted separately. All of them are quite nasty
> but let me try.
>
> We could allow charges to fail even for the high limit if the excess is
> way above the amount of reclaimable memory in the given memcg/hierarchy.
> A runaway load would be stopped before it can cause a considerable
> damage outside of its hierarchy this way even when the swap limit
> is configured small.
> Now that goes against the high limit semantic which should only throttle
> the consumer and shouldn't cause any functional failures but maybe this
> is acceptable for the overall system stability. An alternative would
> be to throttle in the high limit reclaim context proportionally to
> the excess. This is normally done by the reclaim itself but with no
> reclaimable memory this wouldn't work that way.
>
This seems hard to use for users who want to control resources precisely,
even if stability is good.

> Another option would be to ignore the swap limit during the global
> reclaim. This wouldn't stop the runaway loads but they would at least
> see their fair share of the reclaim. The swap excess could be then used
> as a "handicap" for a more aggressive throttling during high limit reclaim
> or to trigger hard limit sooner.
>
This seems to work. But users need to understand that the swap limit can be exceeded.
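
For concreteness, a minimal sketch (not part of this series) of how such a
fallback could look, modeled on the mem_cgroup_charge_swap() added by patch 1;
the force parameter, and the idea that only global-reclaim callers would pass
force=true, are assumptions:

/*
 * Hypothetical variant of mem_cgroup_charge_swap(): callers doing global
 * reclaim would pass force=true, so the swap counter may exceed swap.max
 * instead of failing the charge.
 */
int mem_cgroup_charge_swap_force(struct page *page, swp_entry_t entry,
				 bool force)
{
	struct mem_cgroup *memcg;
	struct page_counter *counter;
	unsigned short oldid;

	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
		return 0;

	memcg = page->mem_cgroup;

	/* Readahead page, never charged */
	if (!memcg)
		return 0;

	if (!mem_cgroup_is_root(memcg) &&
	    !page_counter_try_charge(&memcg->swap, 1, &counter)) {
		if (!force)
			return -ENOMEM;
		/* Global reclaim: breach swap.max rather than fail. */
		page_counter_charge(&memcg->swap, 1);
	}

	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
	VM_BUG_ON_PAGE(oldid, page);
	mem_cgroup_swap_statistics(memcg, true);

	css_get(&memcg->css);
	return 0;
}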

> Or we could teach the global OOM killer to select abusive anon memory
> users with restricted swap. That would require to iterate through all
> memcgs and checks whether their anon consumption is in a large excess to
> their swap limit and fallback to the memcg OOM victim selection if that
> is the case. This adds more complexity to the OOM killer path so I am
> not sure this is generally acceptable, though.
>

I think this is not acceptable.

> My question now is. Is the knob usable/useful even without additional
> heuristics? Do we want to protect swap space so rigidly that a swap
> limited memcg can cause bigger problems than without the swap limit
> globally?
>

Swap requires some limit. If not, an application can eat up all swap,
and it will never be freed until the application accesses it or
swapoff runs.

Thanks,
-Kame

>> The swap usage can be monitored using new memory.swap.current file and
>> limited using memory.swap.max.
>>
>> Signed-off-by: Vladimir Davydov <[email protected]>
>> ---
>> include/linux/memcontrol.h | 1 +
>> include/linux/swap.h | 5 ++
>> mm/memcontrol.c | 123 +++++++++++++++++++++++++++++++++++++++++----
>> mm/shmem.c | 4 ++
>> mm/swap_state.c | 5 ++
>> 5 files changed, 129 insertions(+), 9 deletions(-)
>
> [...]
>

2015-12-15 03:23:08

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On 2015/12/15 4:42, Vladimir Davydov wrote:
> On Mon, Dec 14, 2015 at 04:30:37PM +0100, Michal Hocko wrote:
>> On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
>>> In the legacy hierarchy we charge memsw, which is dubious, because:
>>>
>>> - memsw.limit must be >= memory.limit, so it is impossible to limit
>>> swap usage less than memory usage. Taking into account the fact that
>>> the primary limiting mechanism in the unified hierarchy is
>>> memory.high while memory.limit is either left unset or set to a very
>>> large value, moving memsw.limit knob to the unified hierarchy would
>>> effectively make it impossible to limit swap usage according to the
>>> user preference.
>>>
>>> - memsw.usage != memory.usage + swap.usage, because a page occupying
>>> both swap entry and a swap cache page is charged only once to memsw
>>> counter. As a result, it is possible to effectively eat up to
>>> memory.limit of memory pages *and* memsw.limit of swap entries, which
>>> looks unexpected.
>>>
>>> That said, we should provide a different swap limiting mechanism for
>>> cgroup2.
>>> This patch adds mem_cgroup->swap counter, which charges the actual
>>> number of swap entries used by a cgroup. It is only charged in the
>>> unified hierarchy, while the legacy hierarchy memsw logic is left
>>> intact.
>>
>> I agree that the previous semantic was awkward. The problem I can see
>> with this approach is that once the swap limit is reached the anon
>> memory pressure might spill over to other and unrelated memcgs during
>> the global memory pressure. I guess this is what Kame referred to as
>> anon would become mlocked basically. This would be even more of an issue
>> with resource delegation to sub-hierarchies because nobody will prevent
>> setting the swap amount to a small value and use that as an anon memory
>> protection.
>
> AFAICS such anon memory protection has a side-effect: real-life
> workloads need page cache to run smoothly (at least for mapping
> executables). Disabling swapping would switch pressure to page caches,
> resulting in performance degradation. So, I don't think per memcg swap
> limit can be abused to boost your workload on an overcommitted system.
>
> If you mean malicious users, well, they already have plenty ways to eat
> all available memory up to the hard limit by creating unreclaimable
> kernel objects.
>
"protect anon" user's malicious degree is far lower than such cracker like users.

> Anyway, if you don't trust a container you'd better set the hard memory
> limit so that it can't hurt others no matter what it runs and how it
> tweaks its sub-tree knobs.
>

Limiting swap can easily cause "OOM-Killer even while there are available swap"
with easy mistake. Can't you add "swap excess" switch to sysctl to allow global
memory reclaim can ignore swap limitation ?

Regards,
-Kame

2015-12-15 08:30:25

by Vladimir Davydov

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Tue, Dec 15, 2015 at 12:12:40PM +0900, Kamezawa Hiroyuki wrote:
> On 2015/12/15 0:30, Michal Hocko wrote:
> >On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
> >>In the legacy hierarchy we charge memsw, which is dubious, because:
> >>
> >> - memsw.limit must be >= memory.limit, so it is impossible to limit
> >> swap usage less than memory usage. Taking into account the fact that
> >> the primary limiting mechanism in the unified hierarchy is
> >> memory.high while memory.limit is either left unset or set to a very
> >> large value, moving memsw.limit knob to the unified hierarchy would
> >> effectively make it impossible to limit swap usage according to the
> >> user preference.
> >>
> >> - memsw.usage != memory.usage + swap.usage, because a page occupying
> >> both swap entry and a swap cache page is charged only once to memsw
> >> counter. As a result, it is possible to effectively eat up to
> >> memory.limit of memory pages *and* memsw.limit of swap entries, which
> >> looks unexpected.
> >>
> >>That said, we should provide a different swap limiting mechanism for
> >>cgroup2.
> >>This patch adds mem_cgroup->swap counter, which charges the actual
> >>number of swap entries used by a cgroup. It is only charged in the
> >>unified hierarchy, while the legacy hierarchy memsw logic is left
> >>intact.
> >
> >I agree that the previous semantic was awkward. The problem I can see
> >with this approach is that once the swap limit is reached the anon
> >memory pressure might spill over to other and unrelated memcgs during
> >the global memory pressure. I guess this is what Kame referred to as
> >anon would become mlocked basically. This would be even more of an issue
> >with resource delegation to sub-hierarchies because nobody will prevent
> >setting the swap amount to a small value and use that as an anon memory
> >protection.
> >
> >I guess this was the reason why this approach hasn't been chosen before
>
> Yes. At that age, "never break global VM" was the policy. And "mlock" can be
> used for attacking system.

If we are talking about "attacking system" from inside a container,
there are much easier and disruptive ways, e.g. running a fork-bomb or
creating pipes - such memory can't be reclaimed and global OOM killer
won't help.

Thanks,
Vladimir

2015-12-15 09:30:27

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On 2015/12/15 17:30, Vladimir Davydov wrote:
> On Tue, Dec 15, 2015 at 12:12:40PM +0900, Kamezawa Hiroyuki wrote:
>> On 2015/12/15 0:30, Michal Hocko wrote:
>>> On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
>>>> In the legacy hierarchy we charge memsw, which is dubious, because:
>>>>
>>>> - memsw.limit must be >= memory.limit, so it is impossible to limit
>>>> swap usage less than memory usage. Taking into account the fact that
>>>> the primary limiting mechanism in the unified hierarchy is
>>>> memory.high while memory.limit is either left unset or set to a very
>>>> large value, moving memsw.limit knob to the unified hierarchy would
>>>> effectively make it impossible to limit swap usage according to the
>>>> user preference.
>>>>
>>>> - memsw.usage != memory.usage + swap.usage, because a page occupying
>>>> both swap entry and a swap cache page is charged only once to memsw
>>>> counter. As a result, it is possible to effectively eat up to
>>>> memory.limit of memory pages *and* memsw.limit of swap entries, which
>>>> looks unexpected.
>>>>
>>>> That said, we should provide a different swap limiting mechanism for
>>>> cgroup2.
>>>> This patch adds mem_cgroup->swap counter, which charges the actual
>>>> number of swap entries used by a cgroup. It is only charged in the
>>>> unified hierarchy, while the legacy hierarchy memsw logic is left
>>>> intact.
>>>
>>> I agree that the previous semantic was awkward. The problem I can see
>>> with this approach is that once the swap limit is reached the anon
>>> memory pressure might spill over to other and unrelated memcgs during
>>> the global memory pressure. I guess this is what Kame referred to as
>>> anon would become mlocked basically. This would be even more of an issue
>>> with resource delegation to sub-hierarchies because nobody will prevent
>>> setting the swap amount to a small value and use that as an anon memory
>>> protection.
>>>
>>> I guess this was the reason why this approach hasn't been chosen before
>>
>> Yes. At that age, "never break global VM" was the policy. And "mlock" can be
>> used for attacking system.
>
> If we are talking about "attacking system" from inside a container,
> there are much easier and disruptive ways, e.g. running a fork-bomb or
> creating pipes - such memory can't be reclaimed and global OOM killer
> won't help.

You're right. We just wanted to avoid affecting global memory reclaim through
each cgroup's settings.

Thanks,
-Kame


2015-12-15 11:02:35

by Vladimir Davydov

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Tue, Dec 15, 2015 at 12:22:41PM +0900, Kamezawa Hiroyuki wrote:
> On 2015/12/15 4:42, Vladimir Davydov wrote:
> >On Mon, Dec 14, 2015 at 04:30:37PM +0100, Michal Hocko wrote:
> >>On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
> >>>In the legacy hierarchy we charge memsw, which is dubious, because:
> >>>
> >>> - memsw.limit must be >= memory.limit, so it is impossible to limit
> >>> swap usage less than memory usage. Taking into account the fact that
> >>> the primary limiting mechanism in the unified hierarchy is
> >>> memory.high while memory.limit is either left unset or set to a very
> >>> large value, moving memsw.limit knob to the unified hierarchy would
> >>> effectively make it impossible to limit swap usage according to the
> >>> user preference.
> >>>
> >>> - memsw.usage != memory.usage + swap.usage, because a page occupying
> >>> both swap entry and a swap cache page is charged only once to memsw
> >>> counter. As a result, it is possible to effectively eat up to
> >>> memory.limit of memory pages *and* memsw.limit of swap entries, which
> >>> looks unexpected.
> >>>
> >>>That said, we should provide a different swap limiting mechanism for
> >>>cgroup2.
> >>>This patch adds mem_cgroup->swap counter, which charges the actual
> >>>number of swap entries used by a cgroup. It is only charged in the
> >>>unified hierarchy, while the legacy hierarchy memsw logic is left
> >>>intact.
> >>
> >>I agree that the previous semantic was awkward. The problem I can see
> >>with this approach is that once the swap limit is reached the anon
> >>memory pressure might spill over to other and unrelated memcgs during
> >>the global memory pressure. I guess this is what Kame referred to as
> >>anon would become mlocked basically. This would be even more of an issue
> >>with resource delegation to sub-hierarchies because nobody will prevent
> >>setting the swap amount to a small value and use that as an anon memory
> >>protection.
> >
> >AFAICS such anon memory protection has a side-effect: real-life
> >workloads need page cache to run smoothly (at least for mapping
> >executables). Disabling swapping would switch pressure to page caches,
> >resulting in performance degradation. So, I don't think per memcg swap
> >limit can be abused to boost your workload on an overcommitted system.
> >
> >If you mean malicious users, well, they already have plenty ways to eat
> >all available memory up to the hard limit by creating unreclaimable
> >kernel objects.
> >
> "protect anon" user's malicious degree is far lower than such cracker like users.

What do you mean by "malicious degree"? What is such a user trying to
achieve? Killing the system? Well, there are much more effective ways to
do so. Or does it want to exploit a system specific feature to get
benefit for itself? If so, it will hardly win by mlocking all anonymous
memory, because this will result in higher pressure exerted upon its
page cache and dcache, which normal workloads just can't get along
without.

>
> >Anyway, if you don't trust a container you'd better set the hard memory
> >limit so that it can't hurt others no matter what it runs and how it
> >tweaks its sub-tree knobs.
> >
>
> Limiting swap can easily cause "OOM-Killer even while there are
> available swap" with easy mistake.

What do you mean by "easy mistake"? Misconfiguration? If so, it's a lame
excuse IMO. Admin should take system configuration seriously. If the
host is not overcommitted, it's trivial. Otherwise, there's always a
chance that things will go south, so it's not going to be easy. It's up
to admin to analyze risks and set limits accordingly. Exporting knobs
with clear meaning is the best we can do here. swap.max is one such knob:
it defines the maximal usage of the swap resource. Allowing it to be breached
just does not add up.

> Can't you add "swap excess" switch to sysctl to allow global memory
> reclaim can ignore swap limitation ?

I'd be opposed to it, because this would obscure the user API. OTOH, a
kind of swap soft limit (swap.high?) might be considered. I'm not sure
if it's really necessary though, because all arguments for it do not
look convincing to me for now. So, personally, I would refrain from
implementing it until it is really called for by users of cgroup v2.
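
If it ever were, a minimal sketch of such a knob could mirror the swap.max
handlers added by this patch; note that the memcg->swap_high field below is
purely hypothetical, and what reclaim would do once the usage exceeds it is
left open:

/*
 * Hypothetical memory.swap.high handlers, mirroring swap_max_show/write();
 * memcg->swap_high (in pages, defaulting to PAGE_COUNTER_MAX) does not
 * exist in this series.
 */
static int swap_high_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long high = READ_ONCE(memcg->swap_high);

	if (high == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);

	return 0;
}

static ssize_t swap_high_write(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	/* Only record the soft limit; reclaim heuristics would act on it. */
	WRITE_ONCE(memcg->swap_high, high);
	return nbytes;
}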

Thanks,
Vladimir

2015-12-15 14:50:28

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Tue, Dec 15, 2015 at 12:22:41PM +0900, Kamezawa Hiroyuki wrote:
> On 2015/12/15 4:42, Vladimir Davydov wrote:
> >Anyway, if you don't trust a container you'd better set the hard memory
> >limit so that it can't hurt others no matter what it runs and how it
> >tweaks its sub-tree knobs.
>
> Limiting swap can easily cause "OOM-Killer even while there are available swap"
> with easy mistake. Can't you add "swap excess" switch to sysctl to allow global
> memory reclaim can ignore swap limitation ?

That never worked with a combined memory+swap limit, either. How could
it? The parent might swap you out under pressure, but simply touching
a few of your anon pages causes them to get swapped back in, thrashing
with whatever the parent was trying to do. Your ability to swap it out
is simply no protection against a group touching its pages.

Allowing the parent to exceed swap with separate counters makes even
less sense, because every page swapped out frees up a page of memory
that the child can reuse. For every swap page that exceeds the limit,
the child gets a free memory page! The child doesn't even have to
cause swapin, it can just steal whatever the parent tried to free up,
and meanwhile its combined memory & swap footprint explodes.
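
To put made-up numbers on it: say the child has memory.max = 100 pages and
swap.max = 50 pages, and the parent is allowed to exceed the swap limit by 20
pages under pressure:

	intended footprint           <= 100 (memory) + 50 (swap) = 150 pages
	swap after the excess         = 50 + 20 = 70 pages
	memory freed by that excess   = 20 pages, immediately reusable by the child
	resulting combined footprint  = 100 + 70 = 170 pages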

The answer is and always should have been: don't overcommit untrusted
cgroups. Think of swap as a resource you distribute, not as breathing
room for the parents to rely on. Because it can't and could never.

And the new separate swap counter makes this explicit.
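
As a usage illustration only, a minimal userspace sketch of distributing and
monitoring swap with the new files; the /sys/fs/cgroup mount point and the
"job" group name are assumptions, not anything mandated by the patch:

#include <stdio.h>

/* Write a value to a cgroup2 control file; returns 0 on success. */
static int write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char buf[64];
	FILE *f;

	/* Distribute swap: this group may use at most 2G worth of swap entries. */
	if (write_file("/sys/fs/cgroup/job/memory.swap.max", "2G"))
		perror("memory.swap.max");

	/* Monitor actual swap usage, reported in bytes. */
	f = fopen("/sys/fs/cgroup/job/memory.swap.current", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("swap usage: %s", buf);
		fclose(f);
	}
	return 0;
}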

2015-12-15 17:21:34

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Mon 14-12-15 22:42:58, Vladimir Davydov wrote:
> On Mon, Dec 14, 2015 at 04:30:37PM +0100, Michal Hocko wrote:
> > On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
> > > In the legacy hierarchy we charge memsw, which is dubious, because:
> > >
> > > - memsw.limit must be >= memory.limit, so it is impossible to limit
> > > swap usage less than memory usage. Taking into account the fact that
> > > the primary limiting mechanism in the unified hierarchy is
> > > memory.high while memory.limit is either left unset or set to a very
> > > large value, moving memsw.limit knob to the unified hierarchy would
> > > effectively make it impossible to limit swap usage according to the
> > > user preference.
> > >
> > > - memsw.usage != memory.usage + swap.usage, because a page occupying
> > > both swap entry and a swap cache page is charged only once to memsw
> > > counter. As a result, it is possible to effectively eat up to
> > > memory.limit of memory pages *and* memsw.limit of swap entries, which
> > > looks unexpected.
> > >
> > > That said, we should provide a different swap limiting mechanism for
> > > cgroup2.
> > > This patch adds mem_cgroup->swap counter, which charges the actual
> > > number of swap entries used by a cgroup. It is only charged in the
> > > unified hierarchy, while the legacy hierarchy memsw logic is left
> > > intact.
> >
> > I agree that the previous semantic was awkward. The problem I can see
> > with this approach is that once the swap limit is reached the anon
> > memory pressure might spill over to other and unrelated memcgs during
> > the global memory pressure. I guess this is what Kame referred to as
> > anon would become mlocked basically. This would be even more of an issue
> > with resource delegation to sub-hierarchies because nobody will prevent
> > setting the swap amount to a small value and use that as an anon memory
> > protection.
>
> AFAICS such anon memory protection has a side-effect: real-life
> workloads need page cache to run smoothly (at least for mapping
> executables). Disabling swapping would switch pressure to page caches,
> resulting in performance degradation. So, I don't think per memcg swap
> limit can be abused to boost your workload on an overcommitted system.

Well, you can thrash on the page cache, which could slow down the workload,
but the executable pages get additional protection, so this might not be
sufficient and could still trigger a massive disruption at the global level.

> If you mean malicious users, well, they already have plenty ways to eat
> all available memory up to the hard limit by creating unreclaimable
> kernel objects.
>
> Anyway, if you don't trust a container you'd better set the hard memory
> limit so that it can't hurt others no matter what it runs and how it
> tweaks its sub-tree knobs.

I completely agree that malicious/untrusted users absolutely have to
be capped by the hard limit. Then the separate swap limit would work
for sure. But I am less convinced about usefulness of the rigid (to
the global memory pressure) swap limit without the hard limit. All the
memory that could have been swapped out will make a memory pressure to
the rest of the system without being punished for it too much. Memcg
is allowed to grow over the high limit (in the current implementation)
without any way to shrink back in other words.

My understanding was that the primary use case for the swap limit is to
handle potential (not only malicious but also unexpectedly misbehaving
application) anon memory consumption runaways more gracefully without
the massive disruption on the global level. I simply didn't see swap
space partitioning as important enough because an alternative to swap
usage is to consume primary memory which is a more precious resource
IMO. Swap storage is really cheap and runtime expandable resource which
is not the case for the primary memory in general. Maybe there are other
use cases I am not aware of, though. Do you want to guarantee the swap
availability?

Just to make it clear. I am not against the new way of the swap
accounting. It is much clearer than the previous one. I am just
worried it allows for an easy misconfiguration and we do not have any
measures to help the global system healthiness. I am OK with the patch
if we document the risk for now. I still think we will end up doing some
heuristic to throttle for a large unreclaimable high limit excess in the
future but I agree this shouldn't be the prerequisite.

> ...
> > My question now is. Is the knob usable/useful even without additional
> > heuristics? Do we want to protect swap space so rigidly that a swap
> > limited memcg can cause bigger problems than without the swap limit
> > globally?
>
> Hmm, I don't see why problems might get bigger with per memcg swap limit
> than w/o it.

because the reclaim fairness between different memcg hierarchies will be
severely affected.

> W/o swap limit, a memcg can eat all swap space on the host
> and disable swapping for everyone, not just for itself alone.

this is true of course but this is not very much different from pushing
everybody else to the swap while eating the unreclaimable anonymous
memory and eventually hitting the OOM killer.

--
Michal Hocko
SUSE Labs

2015-12-15 20:22:58

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Tue, Dec 15, 2015 at 06:21:28PM +0100, Michal Hocko wrote:
> > AFAICS such anon memory protection has a side-effect: real-life
> > workloads need page cache to run smoothly (at least for mapping
> > executables). Disabling swapping would switch pressure to page caches,
> > resulting in performance degradation. So, I don't think per memcg swap
> > limit can be abused to boost your workload on an overcommitted system.
>
> Well, you can thrash on the page cache, which could slow down the workload,
> but the executable pages get additional protection, so this might not be
> sufficient and could still trigger a massive disruption at the global level.

No, this is a real consequence. If you fill your available memory with
mostly unreclaimable memory and your executables start thrashing you
might not make forward progress for hours. We don't have a swap token
for page cache.

> Just to make it clear. I am not against the new way of the swap
> accounting. It is much clearer than the previous one. I am just
> worried it allows for an easy misconfiguration and we do not have any
> measures to help the global system healthiness. I am OK with the patch
> if we document the risk for now. I still think we will end up doing some
> heuristic to throttle for a large unreclaimable high limit excess in the
> future but I agree this shouldn't be the prerequisite.

It's unclear to me how the previous memory+swap counters did anything
tangible for global system health with malicious/buggy workloads. If
anything, the previous model seems to encourage blatant overcommit of
workloads on the flawed assumption that global pressure could always
claw back memory, including anonymous pages of untrusted workloads,
which does not actually work in practice. So I'm not sure what new
risk you are referring to here.

As far as the high limit goes, its job is to contain cache growth and
throttle applications during somewhat higher-than-expected consumption
peaks; not to contain "large unreclaimable high limit excess" from
buggy or malicious applications, that's what the hard limit is for.

All in all, it seems to me we should leave this discussion to actual
problems arising in the real world. There is a lot of unfocussed
speculation in this thread about things that might go wrong, without
much thought put into whether these scenarios are even meaningful or
real or whether they are new problems that come with the swap limit.

2015-12-16 02:44:38

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On 2015/12/15 20:02, Vladimir Davydov wrote:
> On Tue, Dec 15, 2015 at 12:22:41PM +0900, Kamezawa Hiroyuki wrote:
>> On 2015/12/15 4:42, Vladimir Davydov wrote:
>>> On Mon, Dec 14, 2015 at 04:30:37PM +0100, Michal Hocko wrote:
>>>> On Thu 10-12-15 14:39:14, Vladimir Davydov wrote:
>>>>> In the legacy hierarchy we charge memsw, which is dubious, because:
>>>>>
>>>>> - memsw.limit must be >= memory.limit, so it is impossible to limit
>>>>> swap usage less than memory usage. Taking into account the fact that
>>>>> the primary limiting mechanism in the unified hierarchy is
>>>>> memory.high while memory.limit is either left unset or set to a very
>>>>> large value, moving memsw.limit knob to the unified hierarchy would
>>>>> effectively make it impossible to limit swap usage according to the
>>>>> user preference.
>>>>>
>>>>> - memsw.usage != memory.usage + swap.usage, because a page occupying
>>>>> both swap entry and a swap cache page is charged only once to memsw
>>>>> counter. As a result, it is possible to effectively eat up to
>>>>> memory.limit of memory pages *and* memsw.limit of swap entries, which
>>>>> looks unexpected.
>>>>>
>>>>> That said, we should provide a different swap limiting mechanism for
>>>>> cgroup2.
>>>>> This patch adds mem_cgroup->swap counter, which charges the actual
>>>>> number of swap entries used by a cgroup. It is only charged in the
>>>>> unified hierarchy, while the legacy hierarchy memsw logic is left
>>>>> intact.
>>>>
>>>> I agree that the previous semantic was awkward. The problem I can see
>>>> with this approach is that once the swap limit is reached the anon
>>>> memory pressure might spill over to other and unrelated memcgs during
>>>> the global memory pressure. I guess this is what Kame referred to as
>>>> anon would become mlocked basically. This would be even more of an issue
>>>> with resource delegation to sub-hierarchies because nobody will prevent
>>>> setting the swap amount to a small value and use that as an anon memory
>>>> protection.
>>>
>>> AFAICS such anon memory protection has a side-effect: real-life
>>> workloads need page cache to run smoothly (at least for mapping
>>> executables). Disabling swapping would switch pressure to page caches,
>>> resulting in performance degradation. So, I don't think per memcg swap
>>> limit can be abused to boost your workload on an overcommitted system.
>>>
>>> If you mean malicious users, well, they already have plenty ways to eat
>>> all available memory up to the hard limit by creating unreclaimable
>>> kernel objects.
>>>
>> "protect anon" user's malicious degree is far lower than such cracker like users.
>
> What do you mean by "malicious degree"? What is such a user trying to
> achieve? Killing the system? Well, there are much more effective ways to
> do so. Or does it want to exploit a system specific feature to get
> benefit for itself? If so, it will hardly win by mlocking all anonymous
> memory, because this will result in higher pressure exerted upon its
> page cache and dcache, which normal workloads just can't get along
> without.
>

I wanted to say that almost all application developers would want to set swap.limit=0 if allowed.
So, it's ordinary users who can kill the system if such a swap imbalance is allowed.

>>
>>> Anyway, if you don't trust a container you'd better set the hard memory
>>> limit so that it can't hurt others no matter what it runs and how it
>>> tweaks its sub-tree knobs.
>>>
>>
>> Limiting swap can easily cause "OOM-Killer even while there are
>> available swap" with easy mistake.
>
> What do you mean by "easy mistake"? Misconfiguration? If so, it's a lame
> excuse IMO. Admin should take system configuration seriously. If the
> host is not overcommitted, it's trivial. Otherwise, there's always a
> chance that things will go south, so it's not going to be easy. It's up
> to admin to analyze risks and set limits accordingly. Exporting knobs
> with clear meaning is the best we can do here. swap.max is one such knob:
> it defines the maximal usage of the swap resource. Allowing it to be breached
> just does not add up.
>
>> Can't you add "swap excess" switch to sysctl to allow global memory
>> reclaim can ignore swap limitation ?
>
> I'd be opposed to it, because this would obscure the user API. OTOH, a
> kind of swap soft limit (swap.high?) might be considered. I'm not sure
> if it's really necessary though, because all arguments for it do not
> look convincing to me for now. So, personally, I would refrain from
> implementing it until it is really called for by users of cgroup v2.
>

Considering my customers, running the OOM-Killer while there is free swap space is
a system error rather than their misconfiguration.

BTW, mlock() requires CAP_IPC_LOCK.
Please set the default to unlimited and check the capability when setting the swap limit, at least.

Thanks,
-Kame


> Thanks,
> Vladimir

2015-12-16 03:19:06

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On 2015/12/15 23:50, Johannes Weiner wrote:
> On Tue, Dec 15, 2015 at 12:22:41PM +0900, Kamezawa Hiroyuki wrote:
>> On 2015/12/15 4:42, Vladimir Davydov wrote:
>>> Anyway, if you don't trust a container you'd better set the hard memory
>>> limit so that it can't hurt others no matter what it runs and how it
>>> tweaks its sub-tree knobs.
>>
>> Limiting swap can easily cause "OOM-Killer even while there are available swap"
>> with easy mistake. Can't you add "swap excess" switch to sysctl to allow global
>> memory reclaim can ignore swap limitation ?
>
> That never worked with a combined memory+swap limit, either. How could
> it? The parent might swap you out under pressure, but simply touching
> a few of your anon pages causes them to get swapped back in, thrashing
> with whatever the parent was trying to do. Your ability to swap it out
> is simply no protection against a group touching its pages.
>
> Allowing the parent to exceed swap with separate counters makes even
> less sense, because every page swapped out frees up a page of memory
> that the child can reuse. For every swap page that exceeds the limit,
> the child gets a free memory page! The child doesn't even have to
> cause swapin, it can just steal whatever the parent tried to free up,
> and meanwhile its combined memory & swap footprint explodes.
>
Sure.

> The answer is and always should have been: don't overcommit untrusted
> cgroups. Think of swap as a resource you distribute, not as breathing
> room for the parents to rely on. Because it can't and could never.
>
ok, don't overcommit.

> And the new separate swap counter makes this explicit.
>
Hmm, my requests are
- set the same capabilities as mlock() to set swap.limit=0
- swap-full notification via vmpressure or something mechanism.
- OOM-Killer's available memory calculation may be corrupted, please check.
- force swap-in at reducing swap.limit

Thanks,
-Kame

2015-12-16 03:58:31

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On 2015/12/16 2:21, Michal Hocko wrote:

> I completely agree that malicious/untrusted users absolutely have to
> be capped by the hard limit. Then the separate swap limit would work
> for sure. But I am less convinced about usefulness of the rigid (to
> the global memory pressure) swap limit without the hard limit. All the
> memory that could have been swapped out will make a memory pressure to
> the rest of the system without being punished for it too much. Memcg
> is allowed to grow over the high limit (in the current implementation)
> without any way to shrink back in other words.
>
> My understanding was that the primary use case for the swap limit is to
> handle potential (not only malicious but also unexpectedly misbehaving
> application) anon memory consumption runaways more gracefully without
> the massive disruption on the global level. I simply didn't see swap
> space partitioning as important enough because an alternative to swap
> usage is to consume primary memory which is a more precious resource
> IMO. Swap storage is really cheap and runtime expandable resource which
> is not the case for the primary memory in general. Maybe there are other
> use cases I am not aware of, though. Do you want to guarantee the swap
> availability?
>

At the time of the first implementation, an NEC guy explained their use case in the HPC area.
At that time, there was no swap support.

Consider 2 workloads partitioned into groups A and B; total swap was 100GB.
A: memory.limit = 40G
B: memory.limit = 40G

The job scheduler runs applications in A and B in turn: apps in A stop while apps in B are running.

If App-A requires 120GB of anonymous memory, it uses 80GB of swap, so App-B can use only
20GB of swap. This can cause trouble if App-B needs 100GB of anonymous memory.
They need some knob to control the amount of swap per cgroup.
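
With a per-cgroup swap limit that knob exists; one possible (made-up) split of
the 100GB for the case above:

	A: memory.limit = 40G, memory.swap.max = 40G  -> A's total anon is capped at 80G,
	                                                 so its runaway stays inside A
	B: memory.limit = 40G, memory.swap.max = 60G  -> at least 60G of swap remains for B,
	                                                 enough for its 100G of anon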

The point is, at least for their customer, swap is a "resource" which should be under
control. In their use case, memory usage and swap usage have the same meaning, so the
mem+swap limit doesn't cause trouble.

Thanks,
-Kame



2015-12-16 11:09:31

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Wed, Dec 16, 2015 at 12:18:30PM +0900, Kamezawa Hiroyuki wrote:
> Hmm, my requests are
> - set the same capabilities as mlock() to set swap.limit=0

Setting swap.max is already a privileged operation.

> - swap-full notification via vmpressure or something mechanism.

Why?

> - OOM-Killer's available memory calculation may be corrupted, please check.

Vladimir updated mem_cgroup_get_limit().

> - force swap-in at reducing swap.limit

Why?

2015-12-17 02:47:00

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On 2015/12/16 20:09, Johannes Weiner wrote:
> On Wed, Dec 16, 2015 at 12:18:30PM +0900, Kamezawa Hiroyuki wrote:
>> Hmm, my requests are
>> - set the same capabilities as mlock() to set swap.limit=0
>
> Setting swap.max is already a privileged operation.
>
Sure.

>> - swap-full notification via vmpressure or something mechanism.
>
> Why?
>

I think it's a sign of an unhealthy condition, where the file cache drop rate starts to rise.
But I forgot that there is a resource threshold notifier already. Does the notifier work
for swap.usage?

>> - OOM-Killer's available memory calculation may be corrupted, please check.
>
> Vladimir updated mem_cgroup_get_limit().
>
I'll check it.

>> - force swap-in at reducing swap.limit
>
> Why?
>
If full, swap.limit cannot be reduced even if there is available memory in the cgroup.
Another cgroup cannot make use of the swap resource while it's occupied by the other cgroup.
The job scheduler should have a chance to fix the situation.

Thanks,
-Kame

2015-12-17 03:32:22

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On Thu, Dec 17, 2015 at 11:46:27AM +0900, Kamezawa Hiroyuki wrote:
> On 2015/12/16 20:09, Johannes Weiner wrote:
> >On Wed, Dec 16, 2015 at 12:18:30PM +0900, Kamezawa Hiroyuki wrote:
> >> - swap-full notification via vmpressure or something mechanism.
> >
> >Why?
> >
>
> I think it's a sign of an unhealthy condition, where the file cache drop rate starts to rise.
> But I forgot that there is a resource threshold notifier already. Does the notifier work
> for swap.usage?

That will be reflected in vmpressure or other distress mechanisms. I'm
not convinced "ran out of swap space" needs special casing in any way.

> >> - force swap-in at reducing swap.limit
> >
> >Why?
> >
> If full, swap.limit cannot be reduced even if there is available memory in the cgroup.
> Another cgroup cannot make use of the swap resource while it's occupied by the other cgroup.
> The job scheduler should have a chance to fix the situation.

I don't see why swap space allowance would need to be as dynamically
adjustable as the memory allowance. There is usually no need to be as
tight with swap space as with memory, and the performance penalty of
swapping, even with flash drives, is high enough that swap space acts
as an overflow vessel rather than part of the regular backing of
the anonymous/shmem working set. It really is NOT obvious that swap
space would need to be adjusted on the fly, and that it's important
that reducing the limit will be reflected in consumption right away.

We shouldn't be adding hundreds of lines of likely terrible heuristics
code* on speculation that somebody MIGHT find this useful in real life.
We should wait until we are presented with a real usecase that applies
to a whole class of users, and then see what the true requirements are.

* If a group has 200M swapped out and the swap limit is reduced by 10M
below the current consumption, which pages would you swap in? There is
no LRU list for swap space.

2015-12-17 04:30:53

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On 2015/12/17 12:32, Johannes Weiner wrote:
> On Thu, Dec 17, 2015 at 11:46:27AM +0900, Kamezawa Hiroyuki wrote:
>> On 2015/12/16 20:09, Johannes Weiner wrote:
>>> On Wed, Dec 16, 2015 at 12:18:30PM +0900, Kamezawa Hiroyuki wrote:
>>>> - swap-full notification via vmpressure or something mechanism.
>>>
>>> Why?
>>>
>>
>> I think it's a sign of unhealthy condition, starting file cache drop rate to rise.
>> But I forgot that there are resource threshold notifier already. Does the notifier work
>> for swap.usage ?
>
> That will be reflected in vmpressure or other distress mechanisms. I'm
> not convinced "ran out of swap space" needs special casing in any way.
>
Most users check swap space shortage as a "system alarm" in enterprise systems.
At least, our customers check for swap-full.

>>>> - force swap-in at reducing swap.limit
>>>
>>> Why?
>>>
>> If full, swap.limit cannot be reduced even if there are available memory in a cgroup.
>> Another cgroup cannot make use of the swap resource while it's occupied by other cgroup.
>> The job scheduler should have a chance to fix the situation.
>
> I don't see why swap space allowance would need to be as dynamically
> adjustable as the memory allowance. There is usually no need to be as
> tight with swap space as with memory, and the performance penalty of
> swapping, even with flash drives, is high enough that swap space acts
> as an overflow vessel rather than part of the regular backing of
> the anonymous/shmem working set. It really is NOT obvious that swap
> space would need to be adjusted on the fly, and that it's important
> that reducing the limit will be reflected in consumption right away.
>

In my OS support experience, some customers consider swap space a resource.


> We shouldn't be adding hundreds of lines of likely terrible heuristics
> code* on speculation that somebody MIGHT find this useful in real life.
> We should wait until we are presented with a real usecase that applies
> to a whole class of users, and then see what the true requirements are.
>
ok, we should wait. I'm just guessing (Japanese) HPC people will want the
feature for their job control. I hear many programs rely on swap.

> * If a group has 200M swapped out and the swap limit is reduced by 10M
> below the current consumption, which pages would you swap in? There is
> no LRU list for swap space.
>
If rotation can happen on a swap-in caused by a real page fault, random swap-in
when reducing swap.limit will work well enough.

Thanks,
-Kame


2015-12-11 02:59:44

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 1/7] mm: memcontrol: charge swap to cgroup2

On 2015/12/10 20:39, Vladimir Davydov wrote:
> In the legacy hierarchy we charge memsw, which is dubious, because:
>
> - memsw.limit must be >= memory.limit, so it is impossible to limit
> swap usage less than memory usage. Taking into account the fact that
> the primary limiting mechanism in the unified hierarchy is
> memory.high while memory.limit is either left unset or set to a very
> large value, moving memsw.limit knob to the unified hierarchy would
> effectively make it impossible to limit swap usage according to the
> user preference.
>
> - memsw.usage != memory.usage + swap.usage, because a page occupying
> both swap entry and a swap cache page is charged only once to memsw
> counter. As a result, it is possible to effectively eat up to
> memory.limit of memory pages *and* memsw.limit of swap entries, which
> looks unexpected.
>
> That said, we should provide a different swap limiting mechanism for
> cgroup2.
>
> This patch adds mem_cgroup->swap counter, which charges the actual
> number of swap entries used by a cgroup. It is only charged in the
> unified hierarchy, while the legacy hierarchy memsw logic is left
> intact.
>
> The swap usage can be monitored using new memory.swap.current file and
> limited using memory.swap.max.
>
> Signed-off-by: Vladimir Davydov <[email protected]>

Setting swap.max=0 will work like mlock?

Thanks,
-Kame


> ---
> include/linux/memcontrol.h | 1 +
> include/linux/swap.h | 5 ++
> mm/memcontrol.c | 123 +++++++++++++++++++++++++++++++++++++++++----
> mm/shmem.c | 4 ++
> mm/swap_state.c | 5 ++
> 5 files changed, 129 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index c6a5ed2f2744..993c9a26b637 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -169,6 +169,7 @@ struct mem_cgroup {
>
> /* Accounted resources */
> struct page_counter memory;
> + struct page_counter swap;
> struct page_counter memsw;
> struct page_counter kmem;
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 457181844b6e..f4b3ccdcba91 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -368,11 +368,16 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
> #endif
> #ifdef CONFIG_MEMCG_SWAP
> extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
> +extern int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry);
> extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
> #else
> static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
> {
> }
> +static inline int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
> +{
> + return 0;
> +}
> static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
> {
> }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7f5c6abf5421..9d10e2819ec4 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1212,7 +1212,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
> pr_cont(":");
>
> for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
> - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
> + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
> continue;
> pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
> K(mem_cgroup_read_stat(iter, i)));
> @@ -1248,12 +1248,15 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
> {
> unsigned long limit;
>
> - limit = memcg->memory.limit;
> + limit = READ_ONCE(memcg->memory.limit);
> if (mem_cgroup_swappiness(memcg)) {
> unsigned long memsw_limit;
> + unsigned long swap_limit;
>
> - memsw_limit = memcg->memsw.limit;
> - limit = min(limit + total_swap_pages, memsw_limit);
> + memsw_limit = READ_ONCE(memcg->memsw.limit);
> + swap_limit = min(READ_ONCE(memcg->swap.limit),
> + (unsigned long)total_swap_pages);
> + limit = min(limit + swap_limit, memsw_limit);
> }
> return limit;
> }
> @@ -4226,6 +4229,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
> page_counter_init(&memcg->memory, NULL);
> memcg->high = PAGE_COUNTER_MAX;
> memcg->soft_limit = PAGE_COUNTER_MAX;
> + page_counter_init(&memcg->swap, NULL);
> page_counter_init(&memcg->memsw, NULL);
> page_counter_init(&memcg->kmem, NULL);
> }
> @@ -4276,6 +4280,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
> page_counter_init(&memcg->memory, &parent->memory);
> memcg->high = PAGE_COUNTER_MAX;
> memcg->soft_limit = PAGE_COUNTER_MAX;
> + page_counter_init(&memcg->swap, &parent->swap);
> page_counter_init(&memcg->memsw, &parent->memsw);
> page_counter_init(&memcg->kmem, &parent->kmem);
> #if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
> @@ -4291,6 +4296,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
> page_counter_init(&memcg->memory, NULL);
> memcg->high = PAGE_COUNTER_MAX;
> memcg->soft_limit = PAGE_COUNTER_MAX;
> + page_counter_init(&memcg->swap, NULL);
> page_counter_init(&memcg->memsw, NULL);
> page_counter_init(&memcg->kmem, NULL);
> #if defined(CONFIG_MEMCG_LEGACY_KMEM) && defined(CONFIG_INET)
> @@ -5291,7 +5297,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
> if (page->mem_cgroup)
> goto out;
>
> - if (do_memsw_account()) {
> + if (do_swap_account) {
> swp_entry_t ent = { .val = page_private(page), };
> unsigned short id = lookup_swap_cgroup_id(ent);
>
> @@ -5754,26 +5760,66 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
> memcg_check_events(memcg, page);
> }
>
> +/*
> + * mem_cgroup_charge_swap - charge a swap entry
> + * @page: page being added to swap
> + * @entry: swap entry to charge
> + *
> + * Try to charge @entry to the memcg that @page belongs to.
> + *
> + * Returns 0 on success, -ENOMEM on failure.
> + */
> +int mem_cgroup_charge_swap(struct page *page, swp_entry_t entry)
> +{
> + struct mem_cgroup *memcg;
> + struct page_counter *counter;
> + unsigned short oldid;
> +
> + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
> + return 0;
> +
> + memcg = page->mem_cgroup;
> +
> + /* Readahead page, never charged */
> + if (!memcg)
> + return 0;
> +
> + if (!mem_cgroup_is_root(memcg) &&
> + !page_counter_try_charge(&memcg->swap, 1, &counter))
> + return -ENOMEM;
> +
> + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
> + VM_BUG_ON_PAGE(oldid, page);
> + mem_cgroup_swap_statistics(memcg, true);
> +
> + css_get(&memcg->css);
> + return 0;
> +}
> +
> /**
> * mem_cgroup_uncharge_swap - uncharge a swap entry
> * @entry: swap entry to uncharge
> *
> - * Drop the memsw charge associated with @entry.
> + * Drop the swap charge associated with @entry.
> */
> void mem_cgroup_uncharge_swap(swp_entry_t entry)
> {
> struct mem_cgroup *memcg;
> unsigned short id;
>
> - if (!do_memsw_account())
> + if (!do_swap_account)
> return;
>
> id = swap_cgroup_record(entry, 0);
> rcu_read_lock();
> memcg = mem_cgroup_from_id(id);
> if (memcg) {
> - if (!mem_cgroup_is_root(memcg))
> - page_counter_uncharge(&memcg->memsw, 1);
> + if (!mem_cgroup_is_root(memcg)) {
> + if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
> + page_counter_uncharge(&memcg->swap, 1);
> + else
> + page_counter_uncharge(&memcg->memsw, 1);
> + }
> mem_cgroup_swap_statistics(memcg, false);
> css_put(&memcg->css);
> }
> @@ -5797,6 +5843,63 @@ static int __init enable_swap_account(char *s)
> }
> __setup("swapaccount=", enable_swap_account);
>
> +static u64 swap_current_read(struct cgroup_subsys_state *css,
> + struct cftype *cft)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +
> + return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
> +}
> +
> +static int swap_max_show(struct seq_file *m, void *v)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
> + unsigned long max = READ_ONCE(memcg->swap.limit);
> +
> + if (max == PAGE_COUNTER_MAX)
> + seq_puts(m, "max\n");
> + else
> + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
> +
> + return 0;
> +}
> +
> +static ssize_t swap_max_write(struct kernfs_open_file *of,
> + char *buf, size_t nbytes, loff_t off)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> + unsigned long max;
> + int err;
> +
> + buf = strstrip(buf);
> + err = page_counter_memparse(buf, "max", &max);
> + if (err)
> + return err;
> +
> + mutex_lock(&memcg_limit_mutex);
> + err = page_counter_limit(&memcg->swap, max);
> + mutex_unlock(&memcg_limit_mutex);
> + if (err)
> + return err;
> +
> + return nbytes;
> +}
> +
> +static struct cftype swap_files[] = {
> + {
> + .name = "swap.current",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .read_u64 = swap_current_read,
> + },
> + {
> + .name = "swap.max",
> + .flags = CFTYPE_NOT_ON_ROOT,
> + .seq_show = swap_max_show,
> + .write = swap_max_write,
> + },
> + { } /* terminate */
> +};
> +
> static struct cftype memsw_cgroup_files[] = {
> {
> .name = "memsw.usage_in_bytes",
> @@ -5828,6 +5931,8 @@ static int __init mem_cgroup_swap_init(void)
> {
> if (!mem_cgroup_disabled() && really_do_swap_account) {
> do_swap_account = 1;
> + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
> + swap_files));
> WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
> memsw_cgroup_files));
> }
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9b051115a100..659a90d8305c 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -912,6 +912,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
> if (!swap.val)
> goto redirty;
>
> + if (mem_cgroup_charge_swap(page, swap))
> + goto free_swap;
> +
> /*
> * Add inode to shmem_unuse()'s list of swapped-out inodes,
> * if it's not already there. Do it now before the page is
> @@ -940,6 +943,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
> }
>
> mutex_unlock(&shmem_swaplist_mutex);
> +free_swap:
> swapcache_free(swap);
> redirty:
> set_page_dirty(page);
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index d783872d746c..dea39cb03967 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list)
> if (!entry.val)
> return 0;
>
> + if (mem_cgroup_charge_swap(page, entry)) {
> + swapcache_free(entry);
> + return 0;
> + }
> +
> if (unlikely(PageTransHuge(page)))
> if (unlikely(split_huge_page_to_list(page, list))) {
> swapcache_free(entry);
>