From: Colin Cross
To: linux-kernel@vger.kernel.org
Cc: Colin Cross, Paul Menage, Li Zefan, containers@lists.linux-foundation.org
Subject: [PATCH] cgroup: Convert synchronize_rcu to call_rcu in cgroup_attach_task
Date: Sun, 21 Nov 2010 20:06:07 -0800
Message-Id: <1290398767-15230-1-git-send-email-ccross@android.com>
X-Mailer: git-send-email 1.7.3.1

The synchronize_rcu call in cgroup_attach_task can be very expensive.
All fastpath accesses to task->cgroups that expect task->cgroups not to
change already use task_lock() or cgroup_lock() to protect against
updates, and, in cgroup.c, only the CGROUP_DEBUG files have RCU
read-side critical sections.

sched.c uses RCU read-side critical sections on task->cgroups, but only
to ensure that a dereference of task->cgroups does not become invalid,
not that it doesn't change.

This patch adds a function put_css_set_rcu, which delays the put until
after a grace period has elapsed.  This ensures that any RCU read-side
critical sections that dereferenced task->cgroups in sched.c have
completed before the css_set is deleted.  The synchronize_rcu()/
put_css_set() combo in cgroup_attach_task() can then be replaced with
put_css_set_rcu().

Also converts the CGROUP_DEBUG files that access current->cgroups to
use task_lock(current) instead of rcu_read_lock().

Signed-off-by: Colin Cross
---

This version fixes the problems with the previous patch by keeping the
use of RCU in cgroup_attach_task, but allowing cgroup_attach_task to
return immediately by deferring the final put_css_set to an rcu
callback.
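To make the new path easier to review, here is the deferred-put flow in
condensed form, in execution order.  The code is taken from the
kernel/cgroup.c hunks below, with only whitespace and declarations
compressed; the execution-context notes are the point of interest:

  /* Caller side (cgroup_attach_task): only the put that takes the
   * counter from 0 to 1 arms an RCU callback; later puts that arrive
   * while a callback is pending just bump the counter. */
  static inline void put_css_set_rcu(struct css_set *cg)
  {
          if (atomic_inc_return(&cg->delayed_put_count) == 1)
                  call_rcu(&cg->rcu_head, _put_css_set_rcu);
  }

  /* RCU callback: runs in softirq context after the grace period.
   * css_set_lock is not irq safe, so bounce to process context. */
  static void _put_css_set_rcu(struct rcu_head *obj)
  {
          struct css_set *cg = container_of(obj, struct css_set, rcu_head);

          schedule_work(&cg->delayed_put_work);
  }

  /* Work function: process context.  Drains every put that was queued
   * while the callback was pending. */
  static void __put_css_set_rcu(struct work_struct *work)
  {
          struct css_set *cg;

          cg = container_of(work, struct css_set, delayed_put_work);
          while (atomic_add_unless(&cg->delayed_put_count, -1, 0))
                  put_css_set(cg);
  }

The net effect is that cgroup_attach_task() returns immediately; the
final put of the old css_set happens in process context once a grace
period has passed, by which time any RCU readers in sched.c that could
have dereferenced the old task->cgroups have finished.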
 include/linux/cgroup.h |    4 +++
 kernel/cgroup.c        |   58 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ed4ba11..fd26218 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -287,6 +287,10 @@ struct css_set {
 
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
+
+	/* For RCU-delayed puts */
+	atomic_t delayed_put_count;
+	struct work_struct delayed_put_work;
 };
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66a416b..c7348e7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -298,7 +298,8 @@ static int cgroup_init_idr(struct cgroup_subsys *ss,
 
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set.  Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
+ * due to cgroup_iter_start().  Never locked in irq context, so
+ * the non-irq variants of write_lock and read_lock are used. */
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;
 
@@ -396,6 +397,39 @@ static inline void put_css_set_taskexit(struct css_set *cg)
 	__put_css_set(cg, 1);
 }
 
+/* work function, executes in process context */
+static void __put_css_set_rcu(struct work_struct *work)
+{
+	struct css_set *cg;
+	cg = container_of(work, struct css_set, delayed_put_work);
+
+	while (atomic_add_unless(&cg->delayed_put_count, -1, 0))
+		put_css_set(cg);
+}
+
+/* rcu callback, executes in softirq context */
+static void _put_css_set_rcu(struct rcu_head *obj)
+{
+	struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+
+	/* the rcu callback happens in softirq context, but css_set_lock
+	 * is not irq safe, so bounce to process context.
+	 */
+	schedule_work(&cg->delayed_put_work);
+}
+
+/* put_css_set_rcu - helper function to delay a put until after an rcu
+ * grace period
+ *
+ * free_css_set_rcu can never be called if there are outstanding
+ * put_css_set_rcu calls, so we can reuse cg->rcu_head.
+ */
+static inline void put_css_set_rcu(struct css_set *cg)
+{
+	if (atomic_inc_return(&cg->delayed_put_count) == 1)
+		call_rcu(&cg->rcu_head, _put_css_set_rcu);
+}
+
 /*
  * compare_css_sets - helper function for find_existing_css_set().
  * @cg: candidate css_set being tested
@@ -620,9 +654,11 @@ static struct css_set *find_css_set(
 	}
 
 	atomic_set(&res->refcount, 1);
+	atomic_set(&res->delayed_put_count, 0);
 	INIT_LIST_HEAD(&res->cg_links);
 	INIT_LIST_HEAD(&res->tasks);
 	INIT_HLIST_NODE(&res->hlist);
+	INIT_WORK(&res->delayed_put_work, __put_css_set_rcu);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
@@ -725,9 +761,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
  * another.  It does so using cgroup_mutex, however there are
  * several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
+ * task->cgroups without the expense of grabbing a system global
  * mutex.  Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroups pointer we use
  * task_lock(), which acts on a spinlock (task->alloc_lock) already in
  * the task_struct routinely used for such matters.
  *
@@ -1802,8 +1838,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 			ss->attach(ss, cgrp, oldcgrp, tsk, false);
 	}
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
-	synchronize_rcu();
-	put_css_set(cg);
+	put_css_set_rcu(cg);
 
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -3900,6 +3935,7 @@ int __init cgroup_init_early(void)
 	INIT_LIST_HEAD(&init_css_set.cg_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
 	INIT_HLIST_NODE(&init_css_set.hlist);
+	INIT_WORK(&init_css_set.delayed_put_work, __put_css_set_rcu);
 	css_set_count = 1;
 	init_cgroup_root(&rootnode);
 	root_count = 1;
@@ -4827,9 +4863,9 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
 {
 	u64 count;
 
-	rcu_read_lock();
+	task_lock(current);
 	count = atomic_read(&current->cgroups->refcount);
-	rcu_read_unlock();
+	task_unlock(current);
 	return count;
 }
 
@@ -4838,12 +4874,10 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
 					 struct seq_file *seq)
 {
 	struct cg_cgroup_link *link;
-	struct css_set *cg;
 
 	read_lock(&css_set_lock);
-	rcu_read_lock();
-	cg = rcu_dereference(current->cgroups);
-	list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+	task_lock(current);
+	list_for_each_entry(link, &current->cgroups->cg_links, cg_link_list) {
 		struct cgroup *c = link->cgrp;
 		const char *name;
 
@@ -4854,7 +4888,7 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
 		seq_printf(seq, "Root %d group %s\n",
 			   c->root->hierarchy_id, name);
 	}
-	rcu_read_unlock();
+	task_unlock(current);
 	read_unlock(&css_set_lock);
 	return 0;
 }
-- 
1.7.3.1