2019-03-07 16:57:56

by Greg Thelen

[permalink] [raw]
Subject: [PATCH] writeback: sum memcg dirty counters as needed

Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting") memcg dirty and writeback counters are managed
as:
1) per-memcg per-cpu values in range of [-32..32]
2) per-memcg atomic counter
When a per-cpu counter cannot fit in [-32..32] it's flushed to the
atomic. Stat readers only check the atomic.
Thus readers such as balance_dirty_pages() may see a nontrivial error
margin: 32 pages per cpu.
Assuming 100 cpus:
4k x86 page_size: 13 MiB error per memcg
64k ppc page_size: 200 MiB error per memcg
Considering that dirty+writeback are used together for some decisions
the errors double.

This inaccuracy can lead to undeserved oom kills. One nasty case is
when all per-cpu counters hold positive values offsetting an atomic
negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
balance_dirty_pages() only consults the atomic and does not consider
throttling the next n_cpu*32 dirty pages. If the file_lru is in the
13..200 MiB range then there's absolutely no dirty throttling, which
burdens vmscan with only dirty+writeback pages thus resorting to oom
kill.

It could be argued that tiny containers are not supported, but it's more
subtle. It's the amount the space available for file lru that matters.
If a container has memory.max-200MiB of non reclaimable memory, then it
will also suffer such oom kills on a 100 cpu machine.

The following test reliably ooms without this patch. This patch avoids
oom kills.

$ cat test
mount -t cgroup2 none /dev/cgroup
cd /dev/cgroup
echo +io +memory > cgroup.subtree_control
mkdir test
cd test
echo 10M > memory.max
(echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo)
(echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100)

$ cat memcg-writeback-stress.c
/*
* Dirty pages from all but one cpu.
* Clean pages from the non dirtying cpu.
* This is to stress per cpu counter imbalance.
* On a 100 cpu machine:
* - per memcg per cpu dirty count is 32 pages for each of 99 cpus
* - per memcg atomic is -99*32 pages
* - thus the complete dirty limit: sum of all counters 0
* - balance_dirty_pages() only sees atomic count -99*32 pages, which
* it max()s to 0.
* - So a workload can dirty -99*32 pages before balance_dirty_pages()
* cares.
*/
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <unistd.h>

static char *buf;
static int bufSize;

static void set_affinity(int cpu)
{
cpu_set_t affinity;

CPU_ZERO(&affinity);
CPU_SET(cpu, &affinity);
if (sched_setaffinity(0, sizeof(affinity), &affinity))
err(1, "sched_setaffinity");
}

static void dirty_on(int output_fd, int cpu)
{
int i, wrote;

set_affinity(cpu);
for (i = 0; i < 32; i++) {
for (wrote = 0; wrote < bufSize; ) {
int ret = write(output_fd, buf+wrote, bufSize-wrote);
if (ret == -1)
err(1, "write");
wrote += ret;
}
}
}

int main(int argc, char **argv)
{
int cpu, flush_cpu = 1, output_fd;
const char *output;

if (argc != 2)
errx(1, "usage: output_file");

output = argv[1];
bufSize = getpagesize();
buf = malloc(getpagesize());
if (buf == NULL)
errx(1, "malloc failed");

output_fd = open(output, O_CREAT|O_RDWR);
if (output_fd == -1)
err(1, "open(%s)", output);

for (cpu = 0; cpu < get_nprocs(); cpu++) {
if (cpu != flush_cpu)
dirty_on(output_fd, cpu);
}

set_affinity(flush_cpu);
if (fsync(output_fd))
err(1, "fsync(%s)", output);
if (close(output_fd))
err(1, "close(%s)", output);
free(buf);
}

Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
collect exact per memcg counters when a memcg is close to the
throttling/writeback threshold. This avoids the aforementioned oom
kills.

This does not affect the overhead of memory.stat, which still reads the
single atomic counter.

Why not use percpu_counter? memcg already handles cpus going offline,
so no need for that overhead from percpu_counter. And the
percpu_counter spinlocks are more heavyweight than is required.

It probably also makes sense to include exact dirty and writeback
counters in memcg oom reports. But that is saved for later.

Signed-off-by: Greg Thelen <[email protected]>
---
include/linux/memcontrol.h | 33 +++++++++++++++++++++++++--------
mm/memcontrol.c | 26 ++++++++++++++++++++------
mm/page-writeback.c | 27 +++++++++++++++++++++------
3 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 83ae11cbd12c..6a133c90138c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -573,6 +573,22 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
return x;
}

+/* idx can be of type enum memcg_stat_item or node_stat_item */
+static inline unsigned long
+memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = atomic_long_read(&memcg->stat[idx]);
+#ifdef CONFIG_SMP
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
@@ -1222,9 +1238,10 @@ static inline void dec_lruvec_page_state(struct page *page,
#ifdef CONFIG_CGROUP_WRITEBACK

struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
- unsigned long *pheadroom, unsigned long *pdirty,
- unsigned long *pwriteback);
+unsigned long
+mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+ unsigned long *pheadroom, unsigned long *pdirty,
+ unsigned long *pwriteback, bool exact);

#else /* CONFIG_CGROUP_WRITEBACK */

@@ -1233,12 +1250,12 @@ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
return NULL;
}

-static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
- unsigned long *pfilepages,
- unsigned long *pheadroom,
- unsigned long *pdirty,
- unsigned long *pwriteback)
+static inline unsigned long
+mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+ unsigned long *pheadroom, unsigned long *pdirty,
+ unsigned long *pwriteback, bool exact)
{
+ return 0;
}

#endif /* CONFIG_CGROUP_WRITEBACK */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index af7f18b32389..0a50f1c523bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3880,6 +3880,7 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
* @pheadroom: out parameter for number of allocatable pages according to memcg
* @pdirty: out parameter for number of dirty pages
* @pwriteback: out parameter for number of pages under writeback
+ * @exact: determines exact counters are required, indicates more work.
*
* Determine the numbers of file, headroom, dirty, and writeback pages in
* @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
@@ -3890,18 +3891,29 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
* ancestors. Note that this doesn't consider the actual amount of
* available memory in the system. The caller should further cap
* *@pheadroom accordingly.
+ *
+ * Return value is the error precision associated with *@pdirty
+ * and *@pwriteback. When @exact is set this a minimal value.
*/
-void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
- unsigned long *pheadroom, unsigned long *pdirty,
- unsigned long *pwriteback)
+unsigned long
+mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+ unsigned long *pheadroom, unsigned long *pdirty,
+ unsigned long *pwriteback, bool exact)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
+ unsigned long precision;

- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
-
+ if (exact) {
+ precision = 0;
+ *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
+ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
+ } else {
+ precision = MEMCG_CHARGE_BATCH * num_online_cpus();
+ *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+ }
/* this should eventually include NR_UNSTABLE_NFS */
- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
(1 << LRU_ACTIVE_FILE));
*pheadroom = PAGE_COUNTER_MAX;
@@ -3913,6 +3925,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
memcg = parent;
}
+
+ return precision;
}

#else /* CONFIG_CGROUP_WRITEBACK */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7d1010453fb9..2c1c855db4b7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1612,14 +1612,17 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
}

if (mdtc) {
- unsigned long filepages, headroom, writeback;
+ bool exact = false;
+ unsigned long precision, filepages, headroom, writeback;

/*
* If @wb belongs to !root memcg, repeat the same
* basic calculations for the memcg domain.
*/
- mem_cgroup_wb_stats(wb, &filepages, &headroom,
- &mdtc->dirty, &writeback);
+memcg_stats:
+ precision = mem_cgroup_wb_stats(wb, &filepages,
+ &headroom, &mdtc->dirty,
+ &writeback, exact);
mdtc->dirty += writeback;
mdtc_calc_avail(mdtc, filepages, headroom);

@@ -1634,6 +1637,10 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
m_dirty = mdtc->dirty;
m_thresh = mdtc->thresh;
m_bg_thresh = mdtc->bg_thresh;
+ if (abs(m_dirty - mdtc->thresh) < precision) {
+ exact = true;
+ goto memcg_stats;
+ }
}
}

@@ -1945,10 +1952,13 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
return true;

if (mdtc) {
- unsigned long filepages, headroom, writeback;
+ bool exact = false;
+ unsigned long precision, filepages, headroom, writeback;

- mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
- &writeback);
+memcg_stats:
+ precision = mem_cgroup_wb_stats(wb, &filepages, &headroom,
+ &mdtc->dirty, &writeback,
+ exact);
mdtc_calc_avail(mdtc, filepages, headroom);
domain_dirty_limits(mdtc); /* ditto, ignore writeback */

@@ -1958,6 +1968,11 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (wb_stat(wb, WB_RECLAIMABLE) >
wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
return true;
+
+ if (abs(mdtc->dirty - mdtc->bg_thresh) < precision) {
+ exact = true;
+ goto memcg_stats;
+ }
}

return false;
--
2.21.0.352.gf09ad66450-goog



2019-03-21 23:46:01

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] writeback: sum memcg dirty counters as needed

On Thu, 7 Mar 2019 08:56:32 -0800 Greg Thelen <[email protected]> wrote:

> Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
> memory.stat reporting") memcg dirty and writeback counters are managed
> as:
> 1) per-memcg per-cpu values in range of [-32..32]
> 2) per-memcg atomic counter
> When a per-cpu counter cannot fit in [-32..32] it's flushed to the
> atomic. Stat readers only check the atomic.
> Thus readers such as balance_dirty_pages() may see a nontrivial error
> margin: 32 pages per cpu.
> Assuming 100 cpus:
> 4k x86 page_size: 13 MiB error per memcg
> 64k ppc page_size: 200 MiB error per memcg
> Considering that dirty+writeback are used together for some decisions
> the errors double.
>
> This inaccuracy can lead to undeserved oom kills. One nasty case is
> when all per-cpu counters hold positive values offsetting an atomic
> negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
> balance_dirty_pages() only consults the atomic and does not consider
> throttling the next n_cpu*32 dirty pages. If the file_lru is in the
> 13..200 MiB range then there's absolutely no dirty throttling, which
> burdens vmscan with only dirty+writeback pages thus resorting to oom
> kill.
>
> It could be argued that tiny containers are not supported, but it's more
> subtle. It's the amount the space available for file lru that matters.
> If a container has memory.max-200MiB of non reclaimable memory, then it
> will also suffer such oom kills on a 100 cpu machine.
>
> ...
>
> Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
> collect exact per memcg counters when a memcg is close to the
> throttling/writeback threshold. This avoids the aforementioned oom
> kills.
>
> This does not affect the overhead of memory.stat, which still reads the
> single atomic counter.
>
> Why not use percpu_counter? memcg already handles cpus going offline,
> so no need for that overhead from percpu_counter. And the
> percpu_counter spinlocks are more heavyweight than is required.
>
> It probably also makes sense to include exact dirty and writeback
> counters in memcg oom reports. But that is saved for later.

Nice changelog, thanks.

> Signed-off-by: Greg Thelen <[email protected]>

Did you consider cc:stable for this? We may as well - the stablebots
backport everything which might look slightly like a fix anyway :(

> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -573,6 +573,22 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
> return x;
> }
>
> +/* idx can be of type enum memcg_stat_item or node_stat_item */
> +static inline unsigned long
> +memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
> +{
> + long x = atomic_long_read(&memcg->stat[idx]);
> +#ifdef CONFIG_SMP
> + int cpu;
> +
> + for_each_online_cpu(cpu)
> + x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
> + if (x < 0)
> + x = 0;
> +#endif
> + return x;
> +}

This looks awfully heavyweight for an inline function. Why not make it
a regular function and avoid the bloat and i-cache consumption?

Also, did you instead consider making this spill the percpu counters
into memcg->stat[idx]? That might be more useful for potential future
callers. It would become a little more expensive though.

2019-03-22 18:16:35

by Roman Gushchin

[permalink] [raw]
Subject: Re: [PATCH] writeback: sum memcg dirty counters as needed

On Thu, Mar 07, 2019 at 08:56:32AM -0800, Greg Thelen wrote:
> Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
> memory.stat reporting") memcg dirty and writeback counters are managed
> as:
> 1) per-memcg per-cpu values in range of [-32..32]
> 2) per-memcg atomic counter
> When a per-cpu counter cannot fit in [-32..32] it's flushed to the
> atomic. Stat readers only check the atomic.
> Thus readers such as balance_dirty_pages() may see a nontrivial error
> margin: 32 pages per cpu.
> Assuming 100 cpus:
> 4k x86 page_size: 13 MiB error per memcg
> 64k ppc page_size: 200 MiB error per memcg
> Considering that dirty+writeback are used together for some decisions
> the errors double.
>
> This inaccuracy can lead to undeserved oom kills. One nasty case is
> when all per-cpu counters hold positive values offsetting an atomic
> negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
> balance_dirty_pages() only consults the atomic and does not consider
> throttling the next n_cpu*32 dirty pages. If the file_lru is in the
> 13..200 MiB range then there's absolutely no dirty throttling, which
> burdens vmscan with only dirty+writeback pages thus resorting to oom
> kill.
>
> It could be argued that tiny containers are not supported, but it's more
> subtle. It's the amount the space available for file lru that matters.
> If a container has memory.max-200MiB of non reclaimable memory, then it
> will also suffer such oom kills on a 100 cpu machine.
>
> The following test reliably ooms without this patch. This patch avoids
> oom kills.
>
> ...
>
> Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
> collect exact per memcg counters when a memcg is close to the
> throttling/writeback threshold. This avoids the aforementioned oom
> kills.
>
> This does not affect the overhead of memory.stat, which still reads the
> single atomic counter.
>
> Why not use percpu_counter? memcg already handles cpus going offline,
> so no need for that overhead from percpu_counter. And the
> percpu_counter spinlocks are more heavyweight than is required.
>
> It probably also makes sense to include exact dirty and writeback
> counters in memcg oom reports. But that is saved for later.
>
> Signed-off-by: Greg Thelen <[email protected]>
> ---
> include/linux/memcontrol.h | 33 +++++++++++++++++++++++++--------
> mm/memcontrol.c | 26 ++++++++++++++++++++------
> mm/page-writeback.c | 27 +++++++++++++++++++++------
> 3 files changed, 66 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 83ae11cbd12c..6a133c90138c 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -573,6 +573,22 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
> return x;
> }

Hi Greg!

Thank you for the patch, definitely a good problem to be fixed!

>
> +/* idx can be of type enum memcg_stat_item or node_stat_item */
> +static inline unsigned long
> +memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
> +{
> + long x = atomic_long_read(&memcg->stat[idx]);
> +#ifdef CONFIG_SMP

I doubt that this #ifdef is correct without corresponding changes
in __mod_memcg_state(). As now, we do use per-cpu buffer which spills
to an atomic value event if !CONFIG_SMP. It's probably something
that we want to change, but as now, #ifdef CONFIG_SMP should protect
only "if (x < 0)" part.


> + int cpu;
> +
> + for_each_online_cpu(cpu)
> + x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
> + if (x < 0)
> + x = 0;
> +#endif
> + return x;
> +}

Also, isn't it worth it to generalize memcg_page_state() instead?
By adding an bool exact argument? I believe dirty balance is not
the only place, where we need a better accuracy.

Thanks!

2019-03-27 22:32:24

by Greg Thelen

[permalink] [raw]
Subject: Re: [PATCH] writeback: sum memcg dirty counters as needed

On Fri, Mar 22, 2019 at 11:15 AM Roman Gushchin <[email protected]> wrote:
>
> On Thu, Mar 07, 2019 at 08:56:32AM -0800, Greg Thelen wrote:
> > Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
> > memory.stat reporting") memcg dirty and writeback counters are managed
> > as:
> > 1) per-memcg per-cpu values in range of [-32..32]
> > 2) per-memcg atomic counter
> > When a per-cpu counter cannot fit in [-32..32] it's flushed to the
> > atomic. Stat readers only check the atomic.
> > Thus readers such as balance_dirty_pages() may see a nontrivial error
> > margin: 32 pages per cpu.
> > Assuming 100 cpus:
> > 4k x86 page_size: 13 MiB error per memcg
> > 64k ppc page_size: 200 MiB error per memcg
> > Considering that dirty+writeback are used together for some decisions
> > the errors double.
> >
> > This inaccuracy can lead to undeserved oom kills. One nasty case is
> > when all per-cpu counters hold positive values offsetting an atomic
> > negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
> > balance_dirty_pages() only consults the atomic and does not consider
> > throttling the next n_cpu*32 dirty pages. If the file_lru is in the
> > 13..200 MiB range then there's absolutely no dirty throttling, which
> > burdens vmscan with only dirty+writeback pages thus resorting to oom
> > kill.
> >
> > It could be argued that tiny containers are not supported, but it's more
> > subtle. It's the amount the space available for file lru that matters.
> > If a container has memory.max-200MiB of non reclaimable memory, then it
> > will also suffer such oom kills on a 100 cpu machine.
> >
> > The following test reliably ooms without this patch. This patch avoids
> > oom kills.
> >
> > ...
> >
> > Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
> > collect exact per memcg counters when a memcg is close to the
> > throttling/writeback threshold. This avoids the aforementioned oom
> > kills.
> >
> > This does not affect the overhead of memory.stat, which still reads the
> > single atomic counter.
> >
> > Why not use percpu_counter? memcg already handles cpus going offline,
> > so no need for that overhead from percpu_counter. And the
> > percpu_counter spinlocks are more heavyweight than is required.
> >
> > It probably also makes sense to include exact dirty and writeback
> > counters in memcg oom reports. But that is saved for later.
> >
> > Signed-off-by: Greg Thelen <[email protected]>
> > ---
> > include/linux/memcontrol.h | 33 +++++++++++++++++++++++++--------
> > mm/memcontrol.c | 26 ++++++++++++++++++++------
> > mm/page-writeback.c | 27 +++++++++++++++++++++------
> > 3 files changed, 66 insertions(+), 20 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 83ae11cbd12c..6a133c90138c 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -573,6 +573,22 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
> > return x;
> > }
>
> Hi Greg!
>
> Thank you for the patch, definitely a good problem to be fixed!
>
> >
> > +/* idx can be of type enum memcg_stat_item or node_stat_item */
> > +static inline unsigned long
> > +memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
> > +{
> > + long x = atomic_long_read(&memcg->stat[idx]);
> > +#ifdef CONFIG_SMP
>
> I doubt that this #ifdef is correct without corresponding changes
> in __mod_memcg_state(). As now, we do use per-cpu buffer which spills
> to an atomic value event if !CONFIG_SMP. It's probably something
> that we want to change, but as now, #ifdef CONFIG_SMP should protect
> only "if (x < 0)" part.

Ack. I'll fix it.

> > + int cpu;
> > +
> > + for_each_online_cpu(cpu)
> > + x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
> > + if (x < 0)
> > + x = 0;
> > +#endif
> > + return x;
> > +}
>
> Also, isn't it worth it to generalize memcg_page_state() instead?
> By adding an bool exact argument? I believe dirty balance is not
> the only place, where we need a better accuracy.

Nod. I'll provide a more general version of memcg_page_state(). I'm
testing updated (forthcoming v2) patch set now with feedback from
Andrew and Roman.

2019-03-28 14:07:08

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH] writeback: sum memcg dirty counters as needed

On Wed, Mar 27, 2019 at 03:29:47PM -0700, Greg Thelen wrote:
> On Fri, Mar 22, 2019 at 11:15 AM Roman Gushchin <[email protected]> wrote:
> > On Thu, Mar 07, 2019 at 08:56:32AM -0800, Greg Thelen wrote:
> > > + int cpu;
> > > +
> > > + for_each_online_cpu(cpu)
> > > + x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
> > > + if (x < 0)
> > > + x = 0;
> > > +#endif
> > > + return x;
> > > +}
> >
> > Also, isn't it worth it to generalize memcg_page_state() instead?
> > By adding an bool exact argument? I believe dirty balance is not
> > the only place, where we need a better accuracy.
>
> Nod. I'll provide a more general version of memcg_page_state(). I'm
> testing updated (forthcoming v2) patch set now with feedback from
> Andrew and Roman.

I'm working on a patch series that reworks the memcg_page_state() API
and by far the most callers do NOT need the exact numbers. So I'd ask
to please keep this a separate function so I don't have to update tens
of callsites to pass "false". Thanks!

2019-03-28 14:22:57

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH] writeback: sum memcg dirty counters as needed

On Thu, Mar 07, 2019 at 08:56:32AM -0800, Greg Thelen wrote:
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3880,6 +3880,7 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
> * @pheadroom: out parameter for number of allocatable pages according to memcg
> * @pdirty: out parameter for number of dirty pages
> * @pwriteback: out parameter for number of pages under writeback
> + * @exact: determines exact counters are required, indicates more work.
> *
> * Determine the numbers of file, headroom, dirty, and writeback pages in
> * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
> @@ -3890,18 +3891,29 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
> * ancestors. Note that this doesn't consider the actual amount of
> * available memory in the system. The caller should further cap
> * *@pheadroom accordingly.
> + *
> + * Return value is the error precision associated with *@pdirty
> + * and *@pwriteback. When @exact is set this a minimal value.
> */
> -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> - unsigned long *pheadroom, unsigned long *pdirty,
> - unsigned long *pwriteback)
> +unsigned long
> +mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> + unsigned long *pheadroom, unsigned long *pdirty,
> + unsigned long *pwriteback, bool exact)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
> struct mem_cgroup *parent;
> + unsigned long precision;
>
> - *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
> -
> + if (exact) {
> + precision = 0;
> + *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
> + *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
> + } else {
> + precision = MEMCG_CHARGE_BATCH * num_online_cpus();
> + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
> + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
> + }
> /* this should eventually include NR_UNSTABLE_NFS */
> - *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
> *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
> (1 << LRU_ACTIVE_FILE));
> *pheadroom = PAGE_COUNTER_MAX;
> @@ -3913,6 +3925,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
> *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
> memcg = parent;
> }
> +
> + return precision;

Have you considered unconditionally using the exact version here?

It does for_each_online_cpu(), but until very, very recently we did
this per default for all stats, for years. It only became a problem in
conjunction with the for_each_memcg loops when frequently reading
memory stats at the top of a very large hierarchy.

balance_dirty_pages() is called against memcgs that actually own the
inodes/memory and doesn't do the additional recursive tree collection.

It's also not *that* hot of a function, and in the io path...

It would simplify this patch immensely.

2019-03-29 17:48:49

by Greg Thelen

[permalink] [raw]
Subject: Re: [PATCH] writeback: sum memcg dirty counters as needed

Andrew Morton <[email protected]> wrote:

> On Thu, 7 Mar 2019 08:56:32 -0800 Greg Thelen <[email protected]> wrote:
>
>> Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
>> memory.stat reporting") memcg dirty and writeback counters are managed
>> as:
>> 1) per-memcg per-cpu values in range of [-32..32]
>> 2) per-memcg atomic counter
>> When a per-cpu counter cannot fit in [-32..32] it's flushed to the
>> atomic. Stat readers only check the atomic.
>> Thus readers such as balance_dirty_pages() may see a nontrivial error
>> margin: 32 pages per cpu.
>> Assuming 100 cpus:
>> 4k x86 page_size: 13 MiB error per memcg
>> 64k ppc page_size: 200 MiB error per memcg
>> Considering that dirty+writeback are used together for some decisions
>> the errors double.
>>
>> This inaccuracy can lead to undeserved oom kills. One nasty case is
>> when all per-cpu counters hold positive values offsetting an atomic
>> negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32).
>> balance_dirty_pages() only consults the atomic and does not consider
>> throttling the next n_cpu*32 dirty pages. If the file_lru is in the
>> 13..200 MiB range then there's absolutely no dirty throttling, which
>> burdens vmscan with only dirty+writeback pages thus resorting to oom
>> kill.
>>
>> It could be argued that tiny containers are not supported, but it's more
>> subtle. It's the amount the space available for file lru that matters.
>> If a container has memory.max-200MiB of non reclaimable memory, then it
>> will also suffer such oom kills on a 100 cpu machine.
>>
>> ...
>>
>> Make balance_dirty_pages() and wb_over_bg_thresh() work harder to
>> collect exact per memcg counters when a memcg is close to the
>> throttling/writeback threshold. This avoids the aforementioned oom
>> kills.
>>
>> This does not affect the overhead of memory.stat, which still reads the
>> single atomic counter.
>>
>> Why not use percpu_counter? memcg already handles cpus going offline,
>> so no need for that overhead from percpu_counter. And the
>> percpu_counter spinlocks are more heavyweight than is required.
>>
>> It probably also makes sense to include exact dirty and writeback
>> counters in memcg oom reports. But that is saved for later.
>
> Nice changelog, thanks.
>
>> Signed-off-by: Greg Thelen <[email protected]>
>
> Did you consider cc:stable for this? We may as well - the stablebots
> backport everything which might look slightly like a fix anyway :(

Good idea. Done in -v2 of the patch.

>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -573,6 +573,22 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
>> return x;
>> }
>>
>> +/* idx can be of type enum memcg_stat_item or node_stat_item */
>> +static inline unsigned long
>> +memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
>> +{
>> + long x = atomic_long_read(&memcg->stat[idx]);
>> +#ifdef CONFIG_SMP
>> + int cpu;
>> +
>> + for_each_online_cpu(cpu)
>> + x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
>> + if (x < 0)
>> + x = 0;
>> +#endif
>> + return x;
>> +}
>
> This looks awfully heavyweight for an inline function. Why not make it
> a regular function and avoid the bloat and i-cache consumption?

Done in -v2.

> Also, did you instead consider making this spill the percpu counters
> into memcg->stat[idx]? That might be more useful for potential future
> callers. It would become a little more expensive though.

I looked at that approach, but couldn't convince myself it was safe. I
kept staring at "Remote [...] Write accesses can cause unique problems
due to the relaxed synchronization requirements for this_cpu
operations." from this_cpu_ops.txt. So I'd like to delay this possible
optimization for a later patch.

2019-03-29 17:53:30

by Greg Thelen

[permalink] [raw]
Subject: Re: [PATCH] writeback: sum memcg dirty counters as needed

Johannes Weiner <[email protected]> wrote:

> On Thu, Mar 07, 2019 at 08:56:32AM -0800, Greg Thelen wrote:
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3880,6 +3880,7 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
>> * @pheadroom: out parameter for number of allocatable pages according to memcg
>> * @pdirty: out parameter for number of dirty pages
>> * @pwriteback: out parameter for number of pages under writeback
>> + * @exact: determines exact counters are required, indicates more work.
>> *
>> * Determine the numbers of file, headroom, dirty, and writeback pages in
>> * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom
>> @@ -3890,18 +3891,29 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
>> * ancestors. Note that this doesn't consider the actual amount of
>> * available memory in the system. The caller should further cap
>> * *@pheadroom accordingly.
>> + *
>> + * Return value is the error precision associated with *@pdirty
>> + * and *@pwriteback. When @exact is set this a minimal value.
>> */
>> -void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>> - unsigned long *pheadroom, unsigned long *pdirty,
>> - unsigned long *pwriteback)
>> +unsigned long
>> +mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>> + unsigned long *pheadroom, unsigned long *pdirty,
>> + unsigned long *pwriteback, bool exact)
>> {
>> struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
>> struct mem_cgroup *parent;
>> + unsigned long precision;
>>
>> - *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
>> -
>> + if (exact) {
>> + precision = 0;
>> + *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
>> + *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
>> + } else {
>> + precision = MEMCG_CHARGE_BATCH * num_online_cpus();
>> + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
>> + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
>> + }
>> /* this should eventually include NR_UNSTABLE_NFS */
>> - *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
>> *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
>> (1 << LRU_ACTIVE_FILE));
>> *pheadroom = PAGE_COUNTER_MAX;
>> @@ -3913,6 +3925,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
>> *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
>> memcg = parent;
>> }
>> +
>> + return precision;
>
> Have you considered unconditionally using the exact version here?
>
> It does for_each_online_cpu(), but until very, very recently we did
> this per default for all stats, for years. It only became a problem in
> conjunction with the for_each_memcg loops when frequently reading
> memory stats at the top of a very large hierarchy.
>
> balance_dirty_pages() is called against memcgs that actually own the
> inodes/memory and doesn't do the additional recursive tree collection.
>
> It's also not *that* hot of a function, and in the io path...
>
> It would simplify this patch immensely.

Good idea. Done in -v2 of the patch:
https://lore.kernel.org/lkml/[email protected]/