Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755203AbYLCFMZ (ORCPT ); Wed, 3 Dec 2008 00:12:25 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1751182AbYLCFMM (ORCPT ); Wed, 3 Dec 2008 00:12:12 -0500 Received: from fgwmail7.fujitsu.co.jp ([192.51.44.37]:49044 "EHLO fgwmail7.fujitsu.co.jp" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750839AbYLCFMJ (ORCPT ); Wed, 3 Dec 2008 00:12:09 -0500 Date: Wed, 3 Dec 2008 14:11:17 +0900 From: KAMEZAWA Hiroyuki To: KAMEZAWA Hiroyuki Cc: "linux-mm@kvack.org" , "balbir@linux.vnet.ibm.com" , "nishimura@mxp.nes.nec.co.jp" , "kosaki.motohiro@jp.fujitsu.com" , "linux-kernel@vger.kernel.org" , "lizf@cn.fujitsu.com" Subject: [Experimental][PATCH 19/21] memcg-fix-pre-destroy.patch Message-Id: <20081203141117.d3685413.kamezawa.hiroyu@jp.fujitsu.com> In-Reply-To: <20081203134718.6b60986f.kamezawa.hiroyu@jp.fujitsu.com> References: <20081203134718.6b60986f.kamezawa.hiroyu@jp.fujitsu.com> Organization: FUJITSU Co. LTD. X-Mailer: Sylpheed 2.5.0 (GTK+ 2.10.14; i686-pc-mingw32) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 9613 Lines: 305 still under development. == Now, final check of refcnt is done after pre_destroy(), so rmdir() can fail after pre_destroy(). memcg set mem->obsolete to be 1 at pre_destroy and this is buggy.. Several ways to fix this can be considered. This is an idea. Fortunately, the user of css_get()/css_put() is only memcg, now. I'd like to reuse it. This patch changes this css->refcnt usage and action as following - css->refcnt is initialized to 1. - after pre_destroy, before destroy(), try to drop css->refcnt to 0. - css_tryget() is added. This only success when css->refcnt > 0. - css_is_removed() is added. This checks css->refcnt == 0 and means this cgroup is under destroy() or not. - css_put() is changed not to call notify_on_release(). From documentation, notify_on_release() is called when there is no tasks/children in cgroup. On implementation, notify_on_release is not called if css->refcnt > 0. This is problematic. memcg has css->refcnt by each page even when there are no tasks. release handler will be never called. But, now, rmdir()/pre_destroy() of memcg works well and checking checking css->ref is not (and shouldn't be) necessary for notifying. Signed-off-by: KAMEZAWA Hiroyuki include/linux/cgroup.h | 21 +++++++++++++++++-- kernel/cgroup.c | 53 +++++++++++++++++++++++++++++++++++-------------- mm/memcontrol.c | 40 +++++++++++++++++++++++++----------- 3 files changed, 85 insertions(+), 29 deletions(-) Index: mmotm-2.6.28-Dec02/include/linux/cgroup.h =================================================================== --- mmotm-2.6.28-Dec02.orig/include/linux/cgroup.h +++ mmotm-2.6.28-Dec02/include/linux/cgroup.h @@ -54,7 +54,9 @@ struct cgroup_subsys_state { /* State maintained by the cgroup system to allow * subsystems to be "busy". Should be accessed via css_get() - * and css_put() */ + * and css_put(). If this value is 0, css is now under removal and + * destroy() will be called soon. (and there is no roll-back.) + */ atomic_t refcnt; @@ -86,7 +88,22 @@ extern void __css_put(struct cgroup_subs static inline void css_put(struct cgroup_subsys_state *css) { if (!test_bit(CSS_ROOT, &css->flags)) - __css_put(css); + atomic_dec(&css->refcnt); +} + +/* returns not-zero if success */ +static inline int css_tryget(struct cgroup_subsys_state *css) +{ + if (!test_bit(CSS_ROOT, &css->flags)) + return atomic_inc_not_zero(&css->refcnt); + return 1; +} + +static inline bool css_under_removal(struct cgroup_subsys_state *css) +{ + if (test_bit(CSS_ROOT, &css->flags)) + return false; + return atomic_read(&css->refcnt) == 0; } /* bits in struct cgroup flags field */ Index: mmotm-2.6.28-Dec02/kernel/cgroup.c =================================================================== --- mmotm-2.6.28-Dec02.orig/kernel/cgroup.c +++ mmotm-2.6.28-Dec02/kernel/cgroup.c @@ -589,6 +589,32 @@ static void cgroup_call_pre_destroy(stru return; } +/* + * Try to set all subsys's refcnt to be 0. + * css->refcnt==0 means this subsys will be destroy()'d. + */ +static bool cgroup_set_subsys_removed(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + struct cgroup_subsys_state *css, *tmp; + + for_each_subsys(cgrp->root, ss) { + css = cgrp->subsys[ss->subsys_id]; + if (!atomic_dec_and_test(&css->refcnt)) + goto rollback; + } + return true; +rollback: + for_each_subsys(cgrp->root, ss) { + tmp = cgrp->subsys[ss->subsys_id]; + atomic_inc(&tmp->refcnt); + if (tmp == css) + break; + } + return false; +} + + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -2310,7 +2336,7 @@ static void init_cgroup_css(struct cgrou struct cgroup *cgrp) { css->cgroup = cgrp; - atomic_set(&css->refcnt, 0); + atomic_set(&css->refcnt, 1); css->flags = 0; if (cgrp == dummytop) set_bit(CSS_ROOT, &css->flags); @@ -2438,7 +2464,7 @@ static int cgroup_has_css_refs(struct cg * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) + if (css && (atomic_read(&css->refcnt) > 1)) return 1; } return 0; @@ -2465,7 +2491,8 @@ static int cgroup_rmdir(struct inode *un /* * Call pre_destroy handlers of subsys. Notify subsystems - * that rmdir() request comes. + * that rmdir() request comes. pre_destroy() is expected to drop all + * extra refcnt to css. (css->refcnt == 1) */ cgroup_call_pre_destroy(cgrp); @@ -2479,8 +2506,15 @@ static int cgroup_rmdir(struct inode *un return -EBUSY; } + /* last check ! */ + if (!cgroup_set_subsys_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); + if (!list_empty(&cgrp->release_list)) list_del(&cgrp->release_list); spin_unlock(&release_list_lock); @@ -3003,7 +3037,7 @@ static void check_for_release(struct cgr /* All of these checks rely on RCU to keep the cgroup * structure alive */ if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) - && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { + && list_empty(&cgrp->children)) { /* Control Group is currently removeable. If it's not * already queued for a userspace notification, queue * it now */ @@ -3020,17 +3054,6 @@ static void check_for_release(struct cgr } } -void __css_put(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - rcu_read_lock(); - if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } - rcu_read_unlock(); -} - /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path Index: mmotm-2.6.28-Dec02/mm/memcontrol.c =================================================================== --- mmotm-2.6.28-Dec02.orig/mm/memcontrol.c +++ mmotm-2.6.28-Dec02/mm/memcontrol.c @@ -161,7 +161,6 @@ struct mem_cgroup { */ bool use_hierarchy; unsigned long last_oom_jiffies; - int obsolete; atomic_t refcnt; unsigned int swappiness; @@ -590,8 +589,14 @@ mem_cgroup_get_first_node(struct mem_cgr { struct cgroup *cgroup; struct mem_cgroup *ret; - bool obsolete = (root_mem->last_scanned_child && - root_mem->last_scanned_child->obsolete); + struct mem_cgroup *last_scan = root_mem->last_scanned_child; + bool obsolete = false; + + if (last_scan) { + if (css_under_removal(&last_scan->css)) + obsolete = true; + } else + obsolete = true; /* * Scan all children under the mem_cgroup mem @@ -679,7 +684,7 @@ static int mem_cgroup_hierarchical_recla next_mem = mem_cgroup_get_first_node(root_mem); while (next_mem != root_mem) { - if (next_mem->obsolete) { + if (css_under_removal(&next_mem->css)) { mem_cgroup_put(next_mem); cgroup_lock(); next_mem = mem_cgroup_get_first_node(root_mem); @@ -1063,6 +1068,7 @@ int mem_cgroup_try_charge_swapin(struct { struct mem_cgroup *mem; swp_entry_t ent; + int ret; if (mem_cgroup_disabled()) return 0; @@ -1081,10 +1087,18 @@ int mem_cgroup_try_charge_swapin(struct ent.val = page_private(page); mem = lookup_swap_cgroup(ent); - if (!mem || mem->obsolete) + /* + * Because we can't assume "mem" is alive now, use tryget() and + * drop extra count later + */ + if (!mem || !css_tryget(&mem->css)) goto charge_cur_mm; *ptr = mem; - return __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + /* drop extra count */ + css_put(&mem->css); + + return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; @@ -1115,14 +1129,16 @@ int mem_cgroup_cache_charge_swapin(struc ent.val = page_private(page); if (do_swap_account) { mem = lookup_swap_cgroup(ent); - if (mem && mem->obsolete) + if (mem && !css_tryget(&mem->css)) mem = NULL; if (mem) mm = NULL; } ret = mem_cgroup_charge_common(page, mm, mask, MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); - + /* drop extra ref */ + if (mem) + css_put(&mem->css); if (!ret && do_swap_account) { /* avoid double counting */ mem = swap_cgroup_record(ent, NULL); @@ -2065,8 +2081,8 @@ static struct mem_cgroup *mem_cgroup_all * the number of reference from swap_cgroup and free mem_cgroup when * it goes down to 0. * - * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and - * entry which points to this memcg will be ignore at swapin. + * When mem_cgroup is destroyed, css_under_removal() is true and entry which + * points to this memcg will be ignore at swapin. * * Removal of cgroup itself succeeds regardless of refs from swap. */ @@ -2096,7 +2112,7 @@ static void mem_cgroup_get(struct mem_cg static void mem_cgroup_put(struct mem_cgroup *mem) { if (atomic_dec_and_test(&mem->refcnt)) { - if (!mem->obsolete) + if (!css_under_removal(&mem->css)) return; mem_cgroup_free(mem); } @@ -2163,7 +2179,7 @@ static void mem_cgroup_pre_destroy(struc struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - mem->obsolete = 1; + /* dentry's mutex makes this safe. */ mem_cgroup_force_empty(mem, false); } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/