Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752736AbbLLNeP (ORCPT ); Sat, 12 Dec 2015 08:34:15 -0500 Received: from mx2.parallels.com ([199.115.105.18]:51388 "EHLO mx2.parallels.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751055AbbLLNeN (ORCPT ); Sat, 12 Dec 2015 08:34:13 -0500 From: Vladimir Davydov To: Andrew Morton CC: Johannes Weiner , Michal Hocko , , , Subject: [PATCH] mm: memcontrol: fix possible memcg leak due to interrupted reclaim Date: Sat, 12 Dec 2015 16:34:02 +0300 Message-ID: <1449927242-9608-1-git-send-email-vdavydov@virtuozzo.com> X-Mailer: git-send-email 2.1.4 MIME-Version: 1.0 Content-Type: text/plain X-ClientProxiedBy: US-EXCH2.sw.swsoft.com (10.255.249.46) To US-EXCH.sw.swsoft.com (10.255.249.47) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 5627 Lines: 158 Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break() once enough pages have been reclaimed, in which case, in contrast to a full round-trip over a cgroup sub-tree, the current position stored in mem_cgroup_reclaim_iter of the target cgroup does not get invalidated and so is left holding the reference to the last scanned cgroup. If the target cgroup does not get scanned again (we might have just reclaimed the last page or all processes might exit and free their memory voluntary), we will leak it, because there is nobody to put the reference held by the iterator. The problem is easy to reproduce by running the following command sequence in a loop: mkdir /sys/fs/cgroup/memory/test echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs memhog 150M echo $$ > /sys/fs/cgroup/memory/cgroup.procs rmdir test The cgroups generated by it will never get freed. This patch fixes this issue by making mem_cgroup_iter_break() clear mem_cgroup_reclaim_iter->position in case it points to the memory cgroup we interrupted reclaim on. Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting") Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: # 3.19+ --- include/linux/memcontrol.h | 8 +++++--- mm/memcontrol.c | 28 +++++++++++++++++++++++----- mm/vmscan.c | 2 +- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2bb14d021cd0..6000fadc6100 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -313,7 +313,8 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); -void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); +void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *, + struct mem_cgroup_reclaim_cookie *); /** * parent_mem_cgroup - find the accounting parent of a memcg @@ -585,8 +586,9 @@ mem_cgroup_iter(struct mem_cgroup *root, return NULL; } -static inline void mem_cgroup_iter_break(struct mem_cgroup *root, - struct mem_cgroup *prev) +static inline void +mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fae68111876d..6751ff4f4507 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -945,14 +945,32 @@ out: * mem_cgroup_iter_break - abort a hierarchy walk prematurely * @root: hierarchy root * @prev: last visited hierarchy member as returned by mem_cgroup_iter() + * @reclaim: cookie for shared reclaim walks, NULL for full walks */ void mem_cgroup_iter_break(struct mem_cgroup *root, - struct mem_cgroup *prev) + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) { if (!root) root = root_mem_cgroup; if (prev && prev != root) css_put(&prev->css); + if (prev && reclaim) { + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_reclaim_iter *iter; + + mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); + iter = &mz->iter[reclaim->priority]; + + /* + * There is no guarantee that root will ever get scanned again + * so we must put reference to prev held by the iterator so as + * not to risk pinning it forever. + */ + if (reclaim->generation == iter->generation && + cmpxchg(&iter->position, prev, NULL) == prev) + css_put(&prev->css); + } } /* @@ -1308,7 +1326,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, continue; case OOM_SCAN_ABORT: css_task_iter_end(&it); - mem_cgroup_iter_break(memcg, iter); + mem_cgroup_iter_break(memcg, iter, NULL); if (chosen) put_task_struct(chosen); goto unlock; @@ -1485,7 +1503,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, if (!soft_limit_excess(root_memcg)) break; } - mem_cgroup_iter_break(root_memcg, victim); + mem_cgroup_iter_break(root_memcg, victim, NULL); return total; } @@ -1514,7 +1532,7 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) * so we cannot give a lock. */ failed = iter; - mem_cgroup_iter_break(memcg, iter); + mem_cgroup_iter_break(memcg, iter, NULL); break; } else iter->oom_lock = true; @@ -1527,7 +1545,7 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) */ for_each_mem_cgroup_tree(iter, memcg) { if (iter == failed) { - mem_cgroup_iter_break(memcg, iter); + mem_cgroup_iter_break(memcg, iter, NULL); break; } iter->oom_lock = false; diff --git a/mm/vmscan.c b/mm/vmscan.c index bb01b04154ad..4313495f9bd0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2439,7 +2439,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, */ if (!global_reclaim(sc) && sc->nr_reclaimed >= sc->nr_to_reclaim) { - mem_cgroup_iter_break(root, memcg); + mem_cgroup_iter_break(root, memcg, &reclaim); break; } } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/