Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751835Ab0LOJfR (ORCPT ); Wed, 15 Dec 2010 04:35:17 -0500 Received: from cn.fujitsu.com ([222.73.24.84]:58516 "EHLO song.cn.fujitsu.com" rhost-flags-OK-FAIL-OK-OK) by vger.kernel.org with ESMTP id S1750922Ab0LOJfO (ORCPT ); Wed, 15 Dec 2010 04:35:14 -0500 Message-ID: <4D088BD4.7080103@cn.fujitsu.com> Date: Wed, 15 Dec 2010 17:35:16 +0800 From: Li Zefan User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1b3pre) Gecko/20090513 Fedora/3.0-2.3.beta2.fc11 Thunderbird/3.0b2 MIME-Version: 1.0 To: Andrew Morton CC: Paul Menage , Peter Zijlstra , Hiroyuki KAMEZAWA , Matt Helsley , Stephane Eranian , LKML , containers@lists.linux-foundation.org Subject: [PATCH v2 2/6] cgroups: Allow to bind a subsystem to a cgroup hierarchy References: <4D088BB5.30903@cn.fujitsu.com> In-Reply-To: <4D088BB5.30903@cn.fujitsu.com> X-MIMETrack: Itemize by SMTP Server on mailserver/fnst(Release 8.5.1FP4|July 25, 2010) at 2010-12-15 17:35:09, Serialize by Router on mailserver/fnst(Release 8.5.1FP4|July 25, 2010) at 2010-12-15 17:35:10, Serialize complete at 2010-12-15 17:35:10 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset=UTF-8 Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 12798 Lines: 445 Stephane posted a patchset to add perf_cgroup subsystem, so perf can be used to monitor all threads belonging to a cgroup. But if you already mounted a cgroup hierarchy but without perf_cgroup and the hierarchy has sub-cgroups, you can't bind perf_cgroup to it, and thus you're not able to use per-cgroup perf feature. This patch alleviates the pain, and then a subsystem can be bound to a hierarchy which has sub-cgroups in it. Matt also commented that users will appreciate this feature. For a cgroup subsystem to become bindable, the bindable flag of struct cgroup_subsys should be set. 
But for some constraints, not all subsystems can take advantage of this patch. For example, we can't decide a cgroup's cpuset.mems and cpuset.cpus automatically, so cpuset is not bindable. Usage: # mount -t cgroup -o cpuset xxx /mnt # mkdir /mnt/tmp # echo $$ > /mnt/tmp/tasks (assume cpuacct is bindable, and we add cpuacct to the hierarchy) # mount -o remount,cpuset,cpuacct xxx /mnt Changelog v2: - Add more code comments. - Use rcu_assign_pointer in hierarchy_update_css_sets(). - Fix to nullify css pointers in hierarchy_attach_css_failed(). - Fix to call post_clone() for newly-created css. Signed-off-by: Li Zefan --- include/linux/cgroup.h | 5 + kernel/cgroup.c | 273 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 221 insertions(+), 57 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 63d953d..d8c4e22 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -490,6 +490,11 @@ struct cgroup_subsys { * (not available in early_init time.) */ bool use_id:1; + /* + * Indicate if this subsystem can be bound to a cgroup hierarchy + * which has child cgroups. 
+ */ + bool bindable:1; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66a416b..caac80f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,6 +57,7 @@ #include /* TODO: replace with more sophisticated array */ #include #include +#include #include @@ -871,18 +872,13 @@ static void remove_dir(struct dentry *d) static void cgroup_clear_directory(struct dentry *dentry) { - struct list_head *node; + struct dentry *d, *tmp; BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); spin_lock(&dcache_lock); - node = dentry->d_subdirs.next; - while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_u.d_child); - list_del_init(node); - if (d->d_inode) { - /* This should never be called on a cgroup - * directory with child cgroups */ - BUG_ON(d->d_inode->i_mode & S_IFDIR); + list_for_each_entry_safe(d, tmp, &dentry->d_subdirs, d_u.d_child) { + if (d->d_inode && !(d->d_inode->i_mode & S_IFDIR)) { + list_del_init(&d->d_u.d_child); d = dget_locked(d); spin_unlock(&dcache_lock); d_delete(d); @@ -890,7 +886,6 @@ static void cgroup_clear_directory(struct dentry *dentry) dput(d); spin_lock(&dcache_lock); } - node = dentry->d_subdirs.next; } spin_unlock(&dcache_lock); } @@ -935,6 +930,171 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) css_put(css); } +static void init_cgroup_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, + struct cgroup *cgrp) +{ + css->cgroup = cgrp; + atomic_set(&css->refcnt, 1); + css->flags = 0; + css->id = NULL; + if (cgrp == dummytop) + set_bit(CSS_ROOT, &css->flags); + BUG_ON(cgrp->subsys[ss->subsys_id]); + cgrp->subsys[ss->subsys_id] = css; +} + +static int cgroup_attach_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cgroup_subsys_state *css; + int ret; + + css = ss->create(ss, cgrp); + if (IS_ERR(css)) + return PTR_ERR(css); + init_cgroup_css(css, ss, cgrp); + + if (ss->use_id) { + ret = alloc_css_id(ss, 
cgrp->parent, cgrp); + if (ret) + return ret; + } + /* At error, ->destroy() callback has to free assigned ID. */ + + if (clone_children(cgrp->parent) && ss->post_clone) + ss->post_clone(ss, cgrp); + + return 0; +} + +/* + * cgroup_walk_hierarchy - iterate through a cgroup hierarchy + * @process_cgroup: callback called on each cgroup in the hierarchy + * @data: will be passed to @process_cgroup + * @top_cgrp: the root cgroup of the hierarchy + * + * It's a pre-order traversal, so a parent cgroup will be processed before + * its children. + */ +static int cgroup_walk_hierarchy(int (*process_cgroup)(struct cgroup *, void *), + void *data, struct cgroup *top_cgrp) +{ + struct cgroup *parent = top_cgrp; + struct cgroup *child; + struct list_head *node; + int ret; + + node = parent->children.next; +repeat: + while (node != &parent->children) { + child = list_entry(node, struct cgroup, sibling); + + /* Process this cgroup */ + ret = process_cgroup(child, data); + if (ret) + return ret; + + /* Process its children */ + if (!list_empty(&child->children)) { + parent = child; + node = parent->children.next; + goto repeat; + } else + node = node->next; + } + + /* Process its siblings */ + if (parent != top_cgrp) { + child = parent; + parent = child->parent; + node = child->sibling.next; + goto repeat; + } + + return 0; +} + +/* + * If hierarchy_attach_css() failed, do some cleanup. + */ +static int hierarchy_attach_css_failed(struct cgroup *cgrp, void *data) +{ + unsigned long added_bits = (unsigned long)data; + int i; + + for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT) { + if (cgrp->subsys[i]) { + subsys[i]->destroy(subsys[i], cgrp); + cgrp->subsys[i] = NULL; + } + } + + return 0; +} + +/* + * Allocate css objects of added subsystems, and attach them to the + * existing cgroup. 
+ */ +static int hierarchy_attach_css(struct cgroup *cgrp, void *data) +{ + unsigned long added_bits = (unsigned long)data; + int i; + int ret = 0; + + for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT) { + ret = cgroup_attach_css(subsys[i], cgrp); + if (ret) + break; + } + + if (ret) + cgroup_walk_hierarchy(hierarchy_attach_css_failed, data, + cgrp->top_cgroup); + return ret; +} + +/* + * After attaching new css objects to the cgroup, we need to entangle + * them into the existing css_sets. + */ +static int hierarchy_update_css_sets(struct cgroup *cgrp, void *data) +{ + unsigned long added_bits = (unsigned long)data; + int i; + struct cg_cgroup_link *link; + + write_lock(&css_set_lock); + list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + struct hlist_head *hhead; + + for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT) + rcu_assign_pointer(cg->subsys[i], cgrp->subsys[i]); + + /* rehash */ + hlist_del(&cg->hlist); + hhead = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, hhead); + } + write_unlock(&css_set_lock); + + return 0; +} + +/* + * Re-populate each cgroup directory. + * + * Note root cgroup's inode mutex is held. + */ +static int hierarchy_populate_dir(struct cgroup *cgrp, void *data) +{ + mutex_lock_nested(&cgrp->dentry->d_inode->i_mutex, I_MUTEX_CHILD); + cgroup_populate_dir(cgrp); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + return 0; +} + /* * Call with cgroup_mutex held. Drops reference counts on modules, including * any duplicate ones that parse_cgroupfs_options took. 
If this function @@ -946,36 +1106,59 @@ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_bits, removed_bits; struct cgroup *cgrp = &root->top_cgroup; int i; + int err; BUG_ON(!mutex_is_locked(&cgroup_mutex)); removed_bits = root->actual_subsys_bits & ~final_bits; added_bits = final_bits & ~root->actual_subsys_bits; + /* Check that any added subsystems are currently free */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long bit = 1UL << i; - struct cgroup_subsys *ss = subsys[i]; - if (!(bit & added_bits)) - continue; + for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT) { /* * Nobody should tell us to do a subsys that doesn't exist: * parse_cgroupfs_options should catch that case and refcounts * ensure that subsystems won't disappear once selected. */ - BUG_ON(ss == NULL); - if (ss->root != &rootnode) { + BUG_ON(subsys[i] == NULL); + if (subsys[i]->root != &rootnode) { /* Subsystem isn't free */ return -EBUSY; } } - /* Currently we don't handle adding/removing subsystems when - * any child cgroups exist. 
This is theoretically supportable - * but involves complex error handling, so it's being left until - * later */ - if (root->number_of_cgroups > 1) + /* Removing will be supported later */ + if (root->number_of_cgroups > 1 && removed_bits) return -EBUSY; + /* + * For non-trivial hierarchy, check that added subsystems + * are all bindable + */ + if (root->number_of_cgroups > 1) { + for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT) + if (!subsys[i]->bindable) + return -EBUSY; + } + + /* Attach css objects to the top cgroup */ + for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT) { + BUG_ON(cgrp->subsys[i]); + BUG_ON(!dummytop->subsys[i]); + BUG_ON(dummytop->subsys[i]->cgroup != dummytop); + + cgrp->subsys[i] = dummytop->subsys[i]; + cgrp->subsys[i]->cgroup = cgrp; + } + + err = cgroup_walk_hierarchy(hierarchy_attach_css, + (void *)added_bits, cgrp); + if (err) + goto failed; + + cgroup_walk_hierarchy(hierarchy_update_css_sets, + (void *)added_bits, cgrp); + /* Process each subsystem */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -983,12 +1166,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (bit & added_bits) { /* We're binding this subsystem to this hierarchy */ BUG_ON(ss == NULL); - BUG_ON(cgrp->subsys[i]); - BUG_ON(!dummytop->subsys[i]); - BUG_ON(dummytop->subsys[i]->cgroup != dummytop); mutex_lock(&ss->hierarchy_mutex); - cgrp->subsys[i] = dummytop->subsys[i]; - cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) @@ -1001,10 +1179,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); - if (ss->bind) - ss->bind(ss, dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; + if (ss->bind) + ss->bind(ss, dummytop); subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); 
mutex_unlock(&ss->hierarchy_mutex); @@ -1031,6 +1209,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, synchronize_rcu(); return 0; + +failed: + for_each_set_bit(i, &added_bits, CGROUP_SUBSYS_COUNT) + cgrp->subsys[i] = NULL; + + return err; } static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) @@ -1286,6 +1470,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) /* (re)populate subsystem files */ cgroup_populate_dir(cgrp); + cgroup_walk_hierarchy(hierarchy_populate_dir, NULL, cgrp); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -3313,20 +3498,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp) return 0; } -static void init_cgroup_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, - struct cgroup *cgrp) -{ - css->cgroup = cgrp; - atomic_set(&css->refcnt, 1); - css->flags = 0; - css->id = NULL; - if (cgrp == dummytop) - set_bit(CSS_ROOT, &css->flags); - BUG_ON(cgrp->subsys[ss->subsys_id]); - cgrp->subsys[ss->subsys_id] = css; -} - static void cgroup_lock_hierarchy(struct cgroupfs_root *root) { /* We need to take each hierarchy_mutex in a consistent order */ @@ -3401,21 +3572,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); - - if (IS_ERR(css)) { - err = PTR_ERR(css); + err = cgroup_attach_css(ss, cgrp); + if (err) goto err_destroy; - } - init_cgroup_css(css, ss, cgrp); - if (ss->use_id) { - err = alloc_css_id(ss, parent, cgrp); - if (err) - goto err_destroy; - } - /* At error, ->destroy() callback has to free assigned ID. 
*/ - if (clone_children(parent) && ss->post_clone) - ss->post_clone(ss, cgrp); } cgroup_lock_hierarchy(root); -- 1.6.3 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/