2015-12-12 13:34:15

by Vladimir Davydov

[permalink] [raw]
Subject: [PATCH] mm: memcontrol: fix possible memcg leak due to interrupted reclaim

Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break()
once enough pages have been reclaimed, in which case, in contrast to a
full round-trip over a cgroup sub-tree, the current position stored in
mem_cgroup_reclaim_iter of the target cgroup does not get invalidated
and so is left holding the reference to the last scanned cgroup. If the
target cgroup does not get scanned again (we might have just reclaimed
the last page or all processes might exit and free their memory
voluntary), we will leak it, because there is nobody to put the
reference held by the iterator.

The problem is easy to reproduce by running the following command
sequence in a loop:

mkdir /sys/fs/cgroup/memory/test
echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
memhog 150M
echo $$ > /sys/fs/cgroup/memory/cgroup.procs
rmdir test

The cgroups generated by it will never get freed.

This patch fixes this issue by making mem_cgroup_iter_break() clear
mem_cgroup_reclaim_iter->position in case it points to the memory cgroup
we interrupted reclaim on.

Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
Signed-off-by: Vladimir Davydov <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: <[email protected]> # 3.19+
---
include/linux/memcontrol.h | 8 +++++---
mm/memcontrol.c | 28 +++++++++++++++++++++++-----
mm/vmscan.c | 2 +-
3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2bb14d021cd0..6000fadc6100 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -313,7 +313,8 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
struct mem_cgroup *,
struct mem_cgroup_reclaim_cookie *);
-void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *,
+ struct mem_cgroup_reclaim_cookie *);

/**
* parent_mem_cgroup - find the accounting parent of a memcg
@@ -585,8 +586,9 @@ mem_cgroup_iter(struct mem_cgroup *root,
return NULL;
}

-static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
- struct mem_cgroup *prev)
+static inline void
+mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev,
+ struct mem_cgroup_reclaim_cookie *reclaim)
{
}

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fae68111876d..6751ff4f4507 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -945,14 +945,32 @@ out:
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
* @root: hierarchy root
* @prev: last visited hierarchy member as returned by mem_cgroup_iter()
+ * @reclaim: cookie for shared reclaim walks, NULL for full walks
*/
void mem_cgroup_iter_break(struct mem_cgroup *root,
- struct mem_cgroup *prev)
+ struct mem_cgroup *prev,
+ struct mem_cgroup_reclaim_cookie *reclaim)
{
if (!root)
root = root_mem_cgroup;
if (prev && prev != root)
css_put(&prev->css);
+ if (prev && reclaim) {
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_reclaim_iter *iter;
+
+ mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
+ iter = &mz->iter[reclaim->priority];
+
+ /*
+ * There is no guarantee that root will ever get scanned again
+ * so we must put reference to prev held by the iterator so as
+ * not to risk pinning it forever.
+ */
+ if (reclaim->generation == iter->generation &&
+ cmpxchg(&iter->position, prev, NULL) == prev)
+ css_put(&prev->css);
+ }
}

/*
@@ -1308,7 +1326,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
continue;
case OOM_SCAN_ABORT:
css_task_iter_end(&it);
- mem_cgroup_iter_break(memcg, iter);
+ mem_cgroup_iter_break(memcg, iter, NULL);
if (chosen)
put_task_struct(chosen);
goto unlock;
@@ -1485,7 +1503,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
if (!soft_limit_excess(root_memcg))
break;
}
- mem_cgroup_iter_break(root_memcg, victim);
+ mem_cgroup_iter_break(root_memcg, victim, NULL);
return total;
}

@@ -1514,7 +1532,7 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
* so we cannot give a lock.
*/
failed = iter;
- mem_cgroup_iter_break(memcg, iter);
+ mem_cgroup_iter_break(memcg, iter, NULL);
break;
} else
iter->oom_lock = true;
@@ -1527,7 +1545,7 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
*/
for_each_mem_cgroup_tree(iter, memcg) {
if (iter == failed) {
- mem_cgroup_iter_break(memcg, iter);
+ mem_cgroup_iter_break(memcg, iter, NULL);
break;
}
iter->oom_lock = false;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bb01b04154ad..4313495f9bd0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2439,7 +2439,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
*/
if (!global_reclaim(sc) &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
- mem_cgroup_iter_break(root, memcg);
+ mem_cgroup_iter_break(root, memcg, &reclaim);
break;
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
--
2.1.4


2015-12-12 16:45:53

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH] mm: memcontrol: fix possible memcg leak due to interrupted reclaim

On Sat, Dec 12, 2015 at 04:34:02PM +0300, Vladimir Davydov wrote:
> Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break()
> once enough pages have been reclaimed, in which case, in contrast to a
> full round-trip over a cgroup sub-tree, the current position stored in
> mem_cgroup_reclaim_iter of the target cgroup does not get invalidated
> and so is left holding the reference to the last scanned cgroup. If the
> target cgroup does not get scanned again (we might have just reclaimed
> the last page or all processes might exit and free their memory
> voluntary), we will leak it, because there is nobody to put the
> reference held by the iterator.
>
> The problem is easy to reproduce by running the following command
> sequence in a loop:
>
> mkdir /sys/fs/cgroup/memory/test
> echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
> memhog 150M
> echo $$ > /sys/fs/cgroup/memory/cgroup.procs
> rmdir test
>
> The cgroups generated by it will never get freed.
>
> This patch fixes this issue by making mem_cgroup_iter_break() clear
> mem_cgroup_reclaim_iter->position in case it points to the memory cgroup
> we interrupted reclaim on.
>
> Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
> Signed-off-by: Vladimir Davydov <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: <[email protected]> # 3.19+

Good catch!

The downside of not remembering the last position across incomplete
reclaim cycles is that we always restart at the same position. If a
cgroup has a certain number of children, it's conceivable that we
might never get to the youngest cgroups in the subtree anymore.

So I'd be more comfortable removing incomplete reclaim walks. It was a
nice little optimization we could do while supporting interleave walks,
but it's not necessary: when global reclaim can walk the entire system,
limit reclaim should be okay walking subtrees exhaustively.

How about this?

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 34f4a14..62a4731 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -933,6 +933,9 @@ out:
* mem_cgroup_iter_break - abort a hierarchy walk prematurely
* @root: hierarchy root
* @prev: last visited hierarchy member as returned by mem_cgroup_iter()
+ *
+ * Note: do not use this from a shared iterator walk (when using a
+ * reclaim cookie), the iterator state may leak the css reference!
*/
void mem_cgroup_iter_break(struct mem_cgroup *root,
struct mem_cgroup *prev)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2dbc679..41b07b2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2425,21 +2425,6 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);

- /*
- * Direct reclaim and kswapd have to scan all memory
- * cgroups to fulfill the overall scan target for the
- * zone.
- *
- * Limit reclaim, on the other hand, only cares about
- * nr_to_reclaim pages to be reclaimed and it will
- * retry with decreasing priority if one round over the
- * whole hierarchy is not sufficient.
- */
- if (!global_reclaim(sc) &&
- sc->nr_reclaimed >= sc->nr_to_reclaim) {
- mem_cgroup_iter_break(root, memcg);
- break;
- }
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));

/*

2015-12-12 19:19:10

by Vladimir Davydov

[permalink] [raw]
Subject: Re: [PATCH] mm: memcontrol: fix possible memcg leak due to interrupted reclaim

On Sat, Dec 12, 2015 at 11:45:40AM -0500, Johannes Weiner wrote:
> On Sat, Dec 12, 2015 at 04:34:02PM +0300, Vladimir Davydov wrote:
> > Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break()
> > once enough pages have been reclaimed, in which case, in contrast to a
> > full round-trip over a cgroup sub-tree, the current position stored in
> > mem_cgroup_reclaim_iter of the target cgroup does not get invalidated
> > and so is left holding the reference to the last scanned cgroup. If the
> > target cgroup does not get scanned again (we might have just reclaimed
> > the last page or all processes might exit and free their memory
> > voluntary), we will leak it, because there is nobody to put the
> > reference held by the iterator.
> >
> > The problem is easy to reproduce by running the following command
> > sequence in a loop:
> >
> > mkdir /sys/fs/cgroup/memory/test
> > echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> > echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
> > memhog 150M
> > echo $$ > /sys/fs/cgroup/memory/cgroup.procs
> > rmdir test
> >
> > The cgroups generated by it will never get freed.
> >
> > This patch fixes this issue by making mem_cgroup_iter_break() clear
> > mem_cgroup_reclaim_iter->position in case it points to the memory cgroup
> > we interrupted reclaim on.
> >
> > Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
> > Signed-off-by: Vladimir Davydov <[email protected]>
> > Cc: Johannes Weiner <[email protected]>
> > Cc: Michal Hocko <[email protected]>
> > Cc: <[email protected]> # 3.19+
>
> Good catch!
>
> The downside of not remembering the last position across incomplete
> reclaim cycles is that we always restart at the same position. If a
> cgroup has a certain number of children, it's conceivable that we
> might never get to the youngest cgroups in the subtree anymore.

That's true, scratch this patch.

>
> So I'd be more comfortable removing incomplete reclaim walks. It was a
> nice little optimization we could do while supporting interleave walks,
> but it's not necessary: when global reclaim can walk the entire system,
> limit reclaim should be okay walking subtrees exhaustively.
>
> How about this?
...
> @@ -2425,21 +2425,6 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
> sc->nr_scanned - scanned,
> sc->nr_reclaimed - reclaimed);
>
> - /*
> - * Direct reclaim and kswapd have to scan all memory
> - * cgroups to fulfill the overall scan target for the
> - * zone.
> - *
> - * Limit reclaim, on the other hand, only cares about
> - * nr_to_reclaim pages to be reclaimed and it will
> - * retry with decreasing priority if one round over the
> - * whole hierarchy is not sufficient.
> - */
> - if (!global_reclaim(sc) &&
> - sc->nr_reclaimed >= sc->nr_to_reclaim) {
> - mem_cgroup_iter_break(root, memcg);
> - break;
> - }

Dunno. I like it, because it's simple and clean, but I'm unsure: can't
it result in lags when performing memcg reclaim for deep hierarchies?
For global reclaim we have kswapd, which tries to keep the system within
bounds so as to avoid direct reclaim at all. Memcg lacks such thing, and
interleave walks looks like a good compensation for it.

Alternatively, we could avoid taking reference to iter->position and
make use of css_released cgroup callback to invalidate reclaim
iterators. With this approach, upper level cgroups shouldn't receive
unfairly high pressure in comparison to their children. Something like
this, maybe?

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 87af26a24491..fcc5133210a0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -859,14 +859,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (prev && reclaim->generation != iter->generation)
goto out_unlock;

- do {
+ while (1) {
pos = READ_ONCE(iter->position);
- /*
- * A racing update may change the position and
- * put the last reference, hence css_tryget(),
- * or retry to see the updated position.
- */
- } while (pos && !css_tryget(&pos->css));
+ if (!pos || css_tryget(&pos->css))
+ break;
+ cmpxchg(&iter->position, pos, NULL);
+ }
}

if (pos)
@@ -912,12 +910,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
}

if (reclaim) {
- if (cmpxchg(&iter->position, pos, memcg) == pos) {
- if (memcg)
- css_get(&memcg->css);
- if (pos)
- css_put(&pos->css);
- }
+ cmpxchg(&iter->position, pos, memcg);

/*
* pairs with css_tryget when dereferencing iter->position
@@ -955,6 +948,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
css_put(&prev->css);
}

+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup_reclaim_iter *iter;
+ struct mem_cgroup_per_zone *mz;
+ int nid, zid;
+ int i;
+
+ while ((memcg = parent_mem_cgroup(memcg))) {
+ for_each_node(nid) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
+ }
+ }
+ }
+ }
+}
+
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -4375,6 +4390,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
wb_memcg_offline(memcg);
}

+static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ invalidate_reclaim_iterators(memcg);
+}
+
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5229,6 +5251,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
+ .css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,

2015-12-14 15:19:22

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH] mm: memcontrol: fix possible memcg leak due to interrupted reclaim

On Sat, Dec 12, 2015 at 10:18:55PM +0300, Vladimir Davydov wrote:
> On Sat, Dec 12, 2015 at 11:45:40AM -0500, Johannes Weiner wrote:
> > @@ -2425,21 +2425,6 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
> > sc->nr_scanned - scanned,
> > sc->nr_reclaimed - reclaimed);
> >
> > - /*
> > - * Direct reclaim and kswapd have to scan all memory
> > - * cgroups to fulfill the overall scan target for the
> > - * zone.
> > - *
> > - * Limit reclaim, on the other hand, only cares about
> > - * nr_to_reclaim pages to be reclaimed and it will
> > - * retry with decreasing priority if one round over the
> > - * whole hierarchy is not sufficient.
> > - */
> > - if (!global_reclaim(sc) &&
> > - sc->nr_reclaimed >= sc->nr_to_reclaim) {
> > - mem_cgroup_iter_break(root, memcg);
> > - break;
> > - }
>
> Dunno. I like it, because it's simple and clean, but I'm unsure: can't
> it result in lags when performing memcg reclaim for deep hierarchies?
> For global reclaim we have kswapd, which tries to keep the system within
> bounds so as to avoid direct reclaim at all. Memcg lacks such thing, and
> interleave walks looks like a good compensation for it.
>
> Alternatively, we could avoid taking reference to iter->position and
> make use of css_released cgroup callback to invalidate reclaim
> iterators. With this approach, upper level cgroups shouldn't receive
> unfairly high pressure in comparison to their children. Something like
> this, maybe?

This is surprisingly simple, to the point where I'm asking myself if I
miss something in this patch or if I missed something when I did weak
references the last time. But I think the last time we didn't want to
go through all iterator positions like we do here. It doesn't really
matter, though, that's even performed from a work item.

> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 87af26a24491..fcc5133210a0 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -859,14 +859,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
> if (prev && reclaim->generation != iter->generation)
> goto out_unlock;
>
> - do {
> + while (1) {
> pos = READ_ONCE(iter->position);
> - /*
> - * A racing update may change the position and
> - * put the last reference, hence css_tryget(),
> - * or retry to see the updated position.
> - */
> - } while (pos && !css_tryget(&pos->css));
> + if (!pos || css_tryget(&pos->css))
> + break;
> + cmpxchg(&iter->position, pos, NULL);
> + }

This cmpxchg() looks a little strange. Once tryget fails, the iterator
should be clear soon enough, no? If not, a comment would be good here.

> @@ -912,12 +910,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
> }
>
> if (reclaim) {
> - if (cmpxchg(&iter->position, pos, memcg) == pos) {
> - if (memcg)
> - css_get(&memcg->css);
> - if (pos)
> - css_put(&pos->css);
> - }
> + cmpxchg(&iter->position, pos, memcg);

This looks correct. The next iteration or break will put the memcg,
potentially free it, which will clear it from the iterator and then
rcu-free the css. Anybody who sees a pointer set under the RCU lock
can safely run css_tryget() against it. Awesome!

Care to resend this with changelog?

2015-12-14 18:58:38

by Vladimir Davydov

[permalink] [raw]
Subject: Re: [PATCH] mm: memcontrol: fix possible memcg leak due to interrupted reclaim

On Mon, Dec 14, 2015 at 10:19:01AM -0500, Johannes Weiner wrote:
...
> > @@ -859,14 +859,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
> > if (prev && reclaim->generation != iter->generation)
> > goto out_unlock;
> >
> > - do {
> > + while (1) {
> > pos = READ_ONCE(iter->position);
> > - /*
> > - * A racing update may change the position and
> > - * put the last reference, hence css_tryget(),
> > - * or retry to see the updated position.
> > - */
> > - } while (pos && !css_tryget(&pos->css));
> > + if (!pos || css_tryget(&pos->css))
> > + break;
> > + cmpxchg(&iter->position, pos, NULL);
> > + }
>
> This cmpxchg() looks a little strange. Once tryget fails, the iterator
> should be clear soon enough, no? If not, a comment would be good here.

If we are running on an unpreemptible UP system, busy-waiting might
block the ->css_free work, which is supposed to clear iter->position,
resulting in a dead lock. I guess it might happen on SMP if RT scheduler
is used. Will add a comment here.

>
> > @@ -912,12 +910,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
> > }
> >
> > if (reclaim) {
> > - if (cmpxchg(&iter->position, pos, memcg) == pos) {
> > - if (memcg)
> > - css_get(&memcg->css);
> > - if (pos)
> > - css_put(&pos->css);
> > - }
> > + cmpxchg(&iter->position, pos, memcg);
>
> This looks correct. The next iteration or break will put the memcg,
> potentially free it, which will clear it from the iterator and then
> rcu-free the css. Anybody who sees a pointer set under the RCU lock
> can safely run css_tryget() against it. Awesome!
>
> Care to resend this with changelog?

Will do.

Thanks,
Vladimir

2015-12-15 09:58:50

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH] mm: memcontrol: fix possible memcg leak due to interrupted reclaim

On Sat 12-12-15 22:18:55, Vladimir Davydov wrote:
> On Sat, Dec 12, 2015 at 11:45:40AM -0500, Johannes Weiner wrote:
> > On Sat, Dec 12, 2015 at 04:34:02PM +0300, Vladimir Davydov wrote:
> > > Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break()
> > > once enough pages have been reclaimed, in which case, in contrast to a
> > > full round-trip over a cgroup sub-tree, the current position stored in
> > > mem_cgroup_reclaim_iter of the target cgroup does not get invalidated
> > > and so is left holding the reference to the last scanned cgroup. If the
> > > target cgroup does not get scanned again (we might have just reclaimed
> > > the last page or all processes might exit and free their memory
> > > voluntary), we will leak it, because there is nobody to put the
> > > reference held by the iterator.
> > >
> > > The problem is easy to reproduce by running the following command
> > > sequence in a loop:
> > >
> > > mkdir /sys/fs/cgroup/memory/test
> > > echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
> > > echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs
> > > memhog 150M
> > > echo $$ > /sys/fs/cgroup/memory/cgroup.procs
> > > rmdir test
> > >
> > > The cgroups generated by it will never get freed.
> > >
> > > This patch fixes this issue by making mem_cgroup_iter_break() clear
> > > mem_cgroup_reclaim_iter->position in case it points to the memory cgroup
> > > we interrupted reclaim on.
> > >
> > > Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting")
> > > Signed-off-by: Vladimir Davydov <[email protected]>
> > > Cc: Johannes Weiner <[email protected]>
> > > Cc: Michal Hocko <[email protected]>
> > > Cc: <[email protected]> # 3.19+
> >
> > Good catch!

Indeed!

[...]
> > @@ -2425,21 +2425,6 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
> > sc->nr_scanned - scanned,
> > sc->nr_reclaimed - reclaimed);
> >
> > - /*
> > - * Direct reclaim and kswapd have to scan all memory
> > - * cgroups to fulfill the overall scan target for the
> > - * zone.
> > - *
> > - * Limit reclaim, on the other hand, only cares about
> > - * nr_to_reclaim pages to be reclaimed and it will
> > - * retry with decreasing priority if one round over the
> > - * whole hierarchy is not sufficient.
> > - */
> > - if (!global_reclaim(sc) &&
> > - sc->nr_reclaimed >= sc->nr_to_reclaim) {
> > - mem_cgroup_iter_break(root, memcg);
> > - break;
> > - }
>
> Dunno. I like it, because it's simple and clean, but I'm unsure: can't
> it result in lags when performing memcg reclaim for deep hierarchies?

Yes I think we want to preserve this.

> For global reclaim we have kswapd, which tries to keep the system within
> bounds so as to avoid direct reclaim at all. Memcg lacks such thing, and
> interleave walks looks like a good compensation for it.

Agreed

> Alternatively, we could avoid taking reference to iter->position and
> make use of css_released cgroup callback to invalidate reclaim
> iterators. With this approach, upper level cgroups shouldn't receive
> unfairly high pressure in comparison to their children. Something like
> this, maybe?
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 87af26a24491..fcc5133210a0 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -859,14 +859,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
> if (prev && reclaim->generation != iter->generation)
> goto out_unlock;
>
> - do {
> + while (1) {
> pos = READ_ONCE(iter->position);
> - /*
> - * A racing update may change the position and
> - * put the last reference, hence css_tryget(),
> - * or retry to see the updated position.
> - */
> - } while (pos && !css_tryget(&pos->css));
> + if (!pos || css_tryget(&pos->css))
> + break;
> + cmpxchg(&iter->position, pos, NULL);

This really deserves a comment.

> + }
> }
>
> if (pos)
> @@ -912,12 +910,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
> }
>
> if (reclaim) {
> - if (cmpxchg(&iter->position, pos, memcg) == pos) {
> - if (memcg)
> - css_get(&memcg->css);
> - if (pos)
> - css_put(&pos->css);
> - }
> + cmpxchg(&iter->position, pos, memcg);
>
> /*
> * pairs with css_tryget when dereferencing iter->position
> @@ -955,6 +948,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
> css_put(&prev->css);
> }
>
> +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
> +{
> + struct mem_cgroup *memcg = dead_memcg;
> + struct mem_cgroup_reclaim_iter *iter;
> + struct mem_cgroup_per_zone *mz;
> + int nid, zid;
> + int i;
> +
> + while ((memcg = parent_mem_cgroup(memcg))) {
> + for_each_node(nid) {
> + for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> + mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
> + for (i = 0; i <= DEF_PRIORITY; i++) {
> + iter = &mz->iter[i];
> + cmpxchg(&iter->position,
> + dead_memcg, NULL);
> + }
> + }
> + }
> + }

An earlier attempts (I cannot find the link right now) were doing this
walk but the patch was dropping a reference and this looks much better.

> +}
> +
> /*
> * Iteration constructs for visiting all cgroups (under a tree). If
> * loops are exited prematurely (break), mem_cgroup_iter_break() must
> @@ -4375,6 +4390,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> wb_memcg_offline(memcg);
> }
>
> +static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
> +{
> + struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +
> + invalidate_reclaim_iterators(memcg);
> +}
> +
> static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> @@ -5229,6 +5251,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
> .css_alloc = mem_cgroup_css_alloc,
> .css_online = mem_cgroup_css_online,
> .css_offline = mem_cgroup_css_offline,
> + .css_released = mem_cgroup_css_released,
> .css_free = mem_cgroup_css_free,
> .css_reset = mem_cgroup_css_reset,
> .can_attach = mem_cgroup_can_attach,

I was not aware of css_released callback but from my reading of
7d172cc89b85 ("cgroup: add cgroup_subsys->css_released()") this looks
correct.

Feel free to add
Acked-by: Michal Hocko <[email protected]>

Thanks!
--
Michal Hocko
SUSE Labs