2009-07-28 23:26:41

by Paul Menage

[permalink] [raw]
Subject: [PATCH 0/4] CGroup: Support for named and empty hierarchies

The following series implements support for named cgroup hierarchies,
and for cgroup hierarchies that have no bound subsystems.

Signed-off-by: Paul Menage <[email protected]>

---

Paul Menage (4):
Allow cgroup hierarchies to be created with no bound subsystems
Add a back-pointer from struct cg_cgroup_link to struct cgroup
Move the cgroup debug subsys into cgroup.c to access internal state
Support named cgroups hierarchies


Documentation/cgroups/cgroups.txt | 20 +
kernel/Makefile | 1
kernel/cgroup.c | 659 +++++++++++++++++++++++++++++--------
kernel/cgroup_debug.c | 105 ------
4 files changed, 533 insertions(+), 252 deletions(-)
delete mode 100644 kernel/cgroup_debug.c


2009-07-28 23:27:32

by Paul Menage

[permalink] [raw]
Subject: [PATCH 2/4] Move the cgroup debug subsys into cgroup.c to access internal state

Move the cgroup debug subsys into cgroup.c to access internal state

While it's architecturally clean to have the cgroup debug subsystem be
completely independent of the cgroups framework, it limits its
usefulness for debugging the contents of internal data structures.
Move the debug subsystem code into the scope of all the cgroups data
structures to make more detailed debugging possible.

Signed-off-by: Paul Menage <[email protected]>
Reviewed-by: Li Zefan <[email protected]>
Reviewed-by: KAMEZAWA Hiroyuki <[email protected]>

---

kernel/Makefile | 1
kernel/cgroup.c | 88 +++++++++++++++++++++++++++++++++++++++++
kernel/cgroup_debug.c | 105 -------------------------------------------------
3 files changed, 88 insertions(+), 106 deletions(-)
delete mode 100644 kernel/cgroup_debug.c

diff --git a/kernel/Makefile b/kernel/Makefile
index ecbd483..251adfe 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 85573e8..9491dbb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3761,3 +3761,91 @@ css_get_next(struct cgroup_subsys *ss, int id,
return ret;
}

+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ kfree(cont->subsys[debug_subsys_id]);
+}
+
+static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+{
+ return atomic_read(&cont->count);
+}
+
+static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+{
+ return cgroup_task_count(cont);
+}
+
+static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+{
+ return (u64)(unsigned long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup *cont,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = atomic_read(&current->cgroups->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+}
+
+static struct cftype debug_files[] = {
+ {
+ .name = "cgroup_refcount",
+ .read_u64 = cgroup_refcount_read,
+ },
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .read_u64 = current_css_set_read,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+};
+
+static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ return cgroup_add_files(cont, ss, debug_files,
+ ARRAY_SIZE(debug_files));
+}
+
+struct cgroup_subsys debug_subsys = {
+ .name = "debug",
+ .create = debug_create,
+ .destroy = debug_destroy,
+ .populate = debug_populate,
+ .subsys_id = debug_subsys_id,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d79..0000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * kernel/cgroup_debug.c - Example cgroup subsystem that
- * exposes debug info
- *
- * Copyright (C) Google Inc, 2007
- *
- * Developed by Paul Menage ([email protected])
- *
- */
-
-#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-
-#include <asm/atomic.h>
-
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
- struct cgroup *cont)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
-{
- return atomic_read(&cont->count);
-}
-
-static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
-{
- u64 count;
-
- count = cgroup_task_count(cont);
- return count;
-}
-
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
-{
- return (u64)(long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup *cont,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = atomic_read(&current->cgroups->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
-{
- return test_bit(CGRP_RELEASABLE, &cgrp->flags);
-}
-
-static struct cftype files[] = {
- {
- .name = "cgroup_refcount",
- .read_u64 = cgroup_refcount_read,
- },
- {
- .name = "taskcount",
- .read_u64 = taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-};
-
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-}
-
-struct cgroup_subsys debug_subsys = {
- .name = "debug",
- .create = debug_create,
- .destroy = debug_destroy,
- .populate = debug_populate,
- .subsys_id = debug_subsys_id,
-};

2009-07-28 23:27:37

by Paul Menage

[permalink] [raw]
Subject: [PATCH 1/4] Support named cgroups hierarchies

Support named cgroups hierarchies

To simplify referring to cgroup hierarchies in mount statements, and
to allow disambiguation in the presence of empty hierarchies and
multiply-bindable subsystems this patch adds support for naming a new
cgroup hierarchy via the "name=" mount option

A pre-existing hierarchy may be specified by either name or by
subsystems; a hierarchy's name cannot be changed by a remount
operation.

Example usage:

# To create a hierarchy called "foo" containing the "cpu" subsystem
mount -t cgroup -oname=foo,cpu cgroup /mnt/cgroup1

# To mount the "foo" hierarchy on a second location
mount -t cgroup -oname=foo cgroup /mnt/cgroup2


Signed-off-by: Paul Menage <[email protected]>
Reviewed-by: Li Zefan <[email protected]>

---

Documentation/cgroups/cgroups.txt | 20 ++++
kernel/cgroup.c | 185 +++++++++++++++++++++++++++----------
2 files changed, 157 insertions(+), 48 deletions(-)

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 6eb1a97..4bccfc1 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -408,6 +408,26 @@ You can attach the current shell task by echoing 0:

# echo 0 > tasks

+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy. This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems. Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a subsystem a name.
+
+The name of the subsystem appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroups.
+
+
3. Kernel API
=============

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 18acba7..85573e8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
*/

#include <linux/cgroup.h>
+#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/kernel.h>
@@ -60,6 +61,8 @@ static struct cgroup_subsys *subsys[] = {
#include <linux/cgroup_subsys.h>
};

+#define MAX_CGROUP_ROOT_NAMELEN 64
+
/*
* A cgroupfs_root represents the root of a cgroup hierarchy,
* and may be associated with a superblock to form an active
@@ -94,6 +97,9 @@ struct cgroupfs_root {

/* The path to use for release notifications. */
char release_agent_path[PATH_MAX];
+
+ /* The name for this hierarchy - may be empty */
+ char name[MAX_CGROUP_ROOT_NAMELEN];
};

/*
@@ -829,6 +835,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",noprefix");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ if (strlen(root->name))
+ seq_printf(seq, ",name=%s", root->name);
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -837,6 +845,9 @@ struct cgroup_sb_opts {
unsigned long subsys_bits;
unsigned long flags;
char *release_agent;
+ char *name;
+
+ struct cgroupfs_root *new_root;
};

/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -851,9 +862,7 @@ static int parse_cgroupfs_options(char *data,
mask = ~(1UL << cpuset_subsys_id);
#endif

- opts->subsys_bits = 0;
- opts->flags = 0;
- opts->release_agent = NULL;
+ memset(opts, 0, sizeof(*opts));

while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
@@ -873,11 +882,33 @@ static int parse_cgroupfs_options(char *data,
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
- opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+ opts->release_agent =
+ kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
- strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
- opts->release_agent[PATH_MAX - 1] = 0;
+ } else if (!strncmp(token, "name=", 5)) {
+ int i;
+ const char *name = token + 5;
+ /* Can't specify an empty name */
+ if (!strlen(name))
+ return -EINVAL;
+ /* Must match [\w.-]+ */
+ for (i = 0; i < strlen(name); i++) {
+ char c = name[i];
+ if (isalnum(c))
+ continue;
+ if ((c == '.') || (c == '-') || (c == '_'))
+ continue;
+ return -EINVAL;
+ }
+ /* Specifying two names is forbidden */
+ if (opts->name)
+ return -EINVAL;
+ opts->name = kstrndup(name,
+ MAX_CGROUP_ROOT_NAMELEN,
+ GFP_KERNEL);
+ if (!opts->name)
+ return -ENOMEM;
} else {
struct cgroup_subsys *ss;
int i;
@@ -904,7 +935,7 @@ static int parse_cgroupfs_options(char *data,
return -EINVAL;

/* We can't have an empty hierarchy */
- if (!opts->subsys_bits)
+ if (!opts->subsys_bits && !opts->name)
return -EINVAL;

return 0;
@@ -932,6 +963,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
goto out_unlock;
}

+ /* Don't allow name to change at remount */
+ if (opts.name && strcmp(opts.name, root->name)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
ret = rebind_subsystems(root, opts.subsys_bits);
if (ret)
goto out_unlock;
@@ -943,6 +980,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
strcpy(root->release_agent_path, opts.release_agent);
out_unlock:
kfree(opts.release_agent);
+ kfree(opts.name);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
unlock_kernel();
@@ -965,6 +1003,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->pids_list);
init_rwsem(&cgrp->pids_mutex);
}
+
static void init_cgroup_root(struct cgroupfs_root *root)
{
struct cgroup *cgrp = &root->top_cgroup;
@@ -978,31 +1017,59 @@ static void init_cgroup_root(struct cgroupfs_root *root)

static int cgroup_test_super(struct super_block *sb, void *data)
{
- struct cgroupfs_root *new = data;
+ struct cgroup_sb_opts *opts = data;
struct cgroupfs_root *root = sb->s_fs_info;

- /* First check subsystems */
- if (new->subsys_bits != root->subsys_bits)
- return 0;
+ /* If we asked for a name then it must match */
+ if (opts->name && strcmp(opts->name, root->name))
+ return 0;

- /* Next check flags */
- if (new->flags != root->flags)
+ /* If we asked for subsystems then they must match */
+ if (opts->subsys_bits && (opts->subsys_bits != root->subsys_bits))
return 0;

return 1;
}

+static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
+{
+ struct cgroupfs_root *root;
+
+ /* Empty hierarchies aren't supported */
+ if (!opts->subsys_bits)
+ return NULL;
+
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ init_cgroup_root(root);
+ root->subsys_bits = opts->subsys_bits;
+ root->flags = opts->flags;
+ if (opts->release_agent)
+ strcpy(root->release_agent_path, opts->release_agent);
+ if (opts->name)
+ strcpy(root->name, opts->name);
+ return root;
+}
+
static int cgroup_set_super(struct super_block *sb, void *data)
{
int ret;
- struct cgroupfs_root *root = data;
+ struct cgroup_sb_opts *opts = data;
+
+ /* If we don't have a new root, we can't set up a new sb */
+ if (!opts->new_root)
+ return -EINVAL;
+
+ BUG_ON(!opts->subsys_bits);

ret = set_anon_super(sb, NULL);
if (ret)
return ret;

- sb->s_fs_info = root;
- root->sb = sb;
+ sb->s_fs_info = opts->new_root;
+ opts->new_root->sb = sb;

sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1039,48 +1106,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
void *data, struct vfsmount *mnt)
{
struct cgroup_sb_opts opts;
+ struct cgroupfs_root *root;
int ret = 0;
struct super_block *sb;
- struct cgroupfs_root *root;
- struct list_head tmp_cg_links;
+ struct cgroupfs_root *new_root;

/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
- if (ret) {
- kfree(opts.release_agent);
- return ret;
- }
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- kfree(opts.release_agent);
- return -ENOMEM;
- }
+ if (ret)
+ goto out_err;

- init_cgroup_root(root);
- root->subsys_bits = opts.subsys_bits;
- root->flags = opts.flags;
- if (opts.release_agent) {
- strcpy(root->release_agent_path, opts.release_agent);
- kfree(opts.release_agent);
+ /*
+ * Allocate a new cgroup root. We may not need it if we're
+ * reusing an existing hierarchy.
+ */
+ new_root = cgroup_root_from_opts(&opts);
+ if (IS_ERR(new_root)) {
+ ret = PTR_ERR(new_root);
+ goto out_err;
}
+ opts.new_root = new_root;

- sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
-
+ /* Locate an existing or new sb for this hierarchy */
+ sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
if (IS_ERR(sb)) {
- kfree(root);
- return PTR_ERR(sb);
+ ret = PTR_ERR(sb);
+ kfree(opts.new_root);
+ goto out_err;
}

- if (sb->s_fs_info != root) {
- /* Reusing an existing superblock */
- BUG_ON(sb->s_root == NULL);
- kfree(root);
- root = NULL;
- } else {
- /* New superblock */
+ root = sb->s_fs_info;
+ BUG_ON(!root);
+ if (root == opts.new_root) {
+ /* We used the new root structure, so this is a new hierarchy */
+ struct list_head tmp_cg_links;
struct cgroup *root_cgrp = &root->top_cgroup;
struct inode *inode;
+ struct cgroupfs_root *existing_root;
int i;

BUG_ON(sb->s_root != NULL);
@@ -1093,6 +1155,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);

+ if (strlen(root->name)) {
+ /* Check for name clashes with existing mounts */
+ for_each_active_root(existing_root) {
+ if (!strcmp(existing_root->name, root->name)) {
+ ret = -EBUSY;
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
+ goto drop_new_super;
+ }
+ }
+ }
+
/*
* We're accessing css_set_count without locking
* css_set_lock here, but that's OK - it can only be
@@ -1111,7 +1185,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
if (ret == -EBUSY) {
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
- goto free_cg_links;
+ free_cg_links(&tmp_cg_links);
+ goto drop_new_super;
}

/* EBUSY should be the only error here */
@@ -1145,15 +1220,26 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
cgroup_populate_dir(root_cgrp);
mutex_unlock(&inode->i_mutex);
mutex_unlock(&cgroup_mutex);
+ } else {
+ /*
+ * We re-used an existing hierarchy - the new root (if
+ * any) is not needed
+ */
+ kfree(opts.new_root);
}

simple_set_mnt(mnt, sb);
+ kfree(opts.release_agent);
+ kfree(opts.name);
return 0;

- free_cg_links:
- free_cg_links(&tmp_cg_links);
drop_new_super:
deactivate_locked_super(sb);
+
+ out_err:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+
return ret;
}

@@ -2971,6 +3057,9 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
seq_printf(m, "%lu:", root->subsys_bits);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ if (strlen(root->name))
+ seq_printf(m, "%sname=%s", count ? "," : "",
+ root->name);
seq_putc(m, ':');
get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
cgrp = task_cgroup(tsk, subsys_id);

2009-07-28 23:27:45

by Paul Menage

[permalink] [raw]
Subject: [PATCH 4/4] Allow cgroup hierarchies to be created with no bound subsystems

Allow cgroup hierarchies to be created with no bound subsystems

This patch removes the restriction that a cgroup hierarchy must have
at least one bound subsystem. The mount option "none" is treated as
an explicit request for no bound subsystems.

A hierarchy with no subsystems can be useful for plain task tracking,
and is also a step towards the support for multiply-bindable
subsystems.

As part of this change, the hierarchy id is no longer calculated from
the bitmask of subsystems in the hierarchy (since this is not
guaranteed to be unique) but is allocated via an ida. Reference
counts on cgroups from css_set objects are now taken explicitly one
per hierarchy, rather than one per subsystem.

Example usage:

mount -t cgroup -o none,name=foo cgroup /mnt/cgroup

Based on the "no-op"/"none" subsystem concept proposed by
[email protected]

Signed-off-by: Paul Menage <[email protected]>
Reviewed-by: Li Zefan <[email protected]>

---

kernel/cgroup.c | 158 ++++++++++++++++++++++++++++++++++---------------------
1 files changed, 99 insertions(+), 59 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9972814..6af0650 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -49,6 +49,7 @@
#include <linux/namei.h>
#include <linux/smp_lock.h>
#include <linux/pid_namespace.h>
+#include <linux/idr.h>

#include <asm/atomic.h>

@@ -77,6 +78,9 @@ struct cgroupfs_root {
*/
unsigned long subsys_bits;

+ /* Unique id for this hierarchy. */
+ int hierarchy_id;
+
/* The bitmask of subsystems currently attached to this hierarchy */
unsigned long actual_subsys_bits;

@@ -147,6 +151,10 @@ struct css_id {
static LIST_HEAD(roots);
static int root_count;

+static DEFINE_IDA(hierarchy_ida);
+static int next_hierarchy_id;
+static DEFINE_SPINLOCK(hierarchy_id_lock);
+
/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)

@@ -264,42 +272,10 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
* compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;

-/* When we create or destroy a css_set, the operation simply
- * takes/releases a reference count on all the cgroups referenced
- * by subsystems in this css_set. This can end up multiple-counting
- * some cgroups, but that's OK - the ref-count is just a
- * busy/not-busy indicator; ensuring that we only count each cgroup
- * once would require taking a global lock to ensure that no
- * subsystems moved between hierarchies while we were doing so.
- *
- * Possible TODO: decide at boot time based on the number of
- * registered subsystems and the number of CPUs or NUMA nodes whether
- * it's better for performance to ref-count every subsystem, or to
- * take a global lock and only add one ref count to each hierarchy.
- */
-
-/*
- * unlink a css_set from the list and free it
- */
-static void unlink_css_set(struct css_set *cg)
+static void __put_css_set(struct css_set *cg, int taskexit)
{
struct cg_cgroup_link *link;
struct cg_cgroup_link *saved_link;
-
- hlist_del(&cg->hlist);
- css_set_count--;
-
- list_for_each_entry_safe(link, saved_link, &cg->cg_links,
- cg_link_list) {
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- kfree(link);
- }
-}
-
-static void __put_css_set(struct css_set *cg, int taskexit)
-{
- int i;
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
@@ -312,20 +288,27 @@ static void __put_css_set(struct css_set *cg, int taskexit)
write_unlock(&css_set_lock);
return;
}
- unlink_css_set(cg);
- write_unlock(&css_set_lock);

- rcu_read_lock();
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
+ /* This css_set is dead. unlink it and release cgroup refcounts */
+ hlist_del(&cg->hlist);
+ css_set_count--;
+
+ list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+ cg_link_list) {
+ struct cgroup *cgrp = link->cgrp;
+ list_del(&link->cg_link_list);
+ list_del(&link->cgrp_link_list);
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
+
+ kfree(link);
}
- rcu_read_unlock();
+
+ write_unlock(&css_set_lock);
kfree(cg);
}

@@ -519,6 +502,7 @@ static void link_css_set(struct list_head *tmp_cg_links,
cgrp_link_list);
link->cg = cg;
link->cgrp = cgrp;
+ atomic_inc(&cgrp->count);
list_move(&link->cgrp_link_list, &cgrp->css_sets);
/*
* Always add links to the tail of the list so that the list
@@ -539,7 +523,6 @@ static struct css_set *find_css_set(
{
struct css_set *res;
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
- int i;

struct list_head tmp_cg_links;

@@ -578,10 +561,6 @@ static struct css_set *find_css_set(

write_lock(&css_set_lock);
/* Add reference counts and links from the new css_set. */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup *cgrp = res->subsys[i]->cgroup;
- atomic_inc(&cgrp->count);
- }
list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
struct cgroup *c = link->cgrp;
if (c->root == cgrp->root)
@@ -960,8 +939,11 @@ struct cgroup_sb_opts {
unsigned long flags;
char *release_agent;
char *name;
+ /* User explicitly requested empty subsystem */
+ bool none;

struct cgroupfs_root *new_root;
+
};

/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -990,6 +972,9 @@ static int parse_cgroupfs_options(char *data,
if (!ss->disabled)
opts->subsys_bits |= 1ul << i;
}
+ } else if (!strcmp(token, "none")) {
+ /* Explicitly have no subsystems */
+ opts->none = true;
} else if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
} else if (!strncmp(token, "release_agent=", 14)) {
@@ -1039,6 +1024,8 @@ static int parse_cgroupfs_options(char *data,
}
}

+ /* Consistency checks */
+
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
@@ -1048,7 +1035,15 @@ static int parse_cgroupfs_options(char *data,
(opts->subsys_bits & mask))
return -EINVAL;

- /* We can't have an empty hierarchy */
+
+ /* Can't specify "none" and some subsystems */
+ if (opts->subsys_bits && opts->none)
+ return -EINVAL;
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
if (!opts->subsys_bits && !opts->name)
return -EINVAL;

@@ -1129,6 +1124,31 @@ static void init_cgroup_root(struct cgroupfs_root *root)
init_cgroup_housekeeping(cgrp);
}

+static bool init_root_id(struct cgroupfs_root *root)
+{
+ int ret = 0;
+
+ do {
+ if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
+ return false;
+ spin_lock(&hierarchy_id_lock);
+ /* Try to allocate the next unused ID */
+ ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
+ &root->hierarchy_id);
+ if (ret == -ENOSPC)
+ /* Try again starting from 0 */
+ ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
+ if (!ret) {
+ next_hierarchy_id = root->hierarchy_id + 1;
+ } else if (ret != -EAGAIN) {
+ /* Can only get here if the 31-bit IDR is full ... */
+ BUG_ON(ret);
+ }
+ spin_unlock(&hierarchy_id_lock);
+ } while (ret);
+ return true;
+}
+
static int cgroup_test_super(struct super_block *sb, void *data)
{
struct cgroup_sb_opts *opts = data;
@@ -1138,8 +1158,12 @@ static int cgroup_test_super(struct super_block *sb, void *data)
if (opts->name && strcmp(opts->name, root->name))
return 0;

- /* If we asked for subsystems then they must match */
- if (opts->subsys_bits && (opts->subsys_bits != root->subsys_bits))
+ /*
+ * If we asked for subsystems (or explicitly for no
+ * subsystems) then they must match
+ */
+ if ((opts->subsys_bits || opts->none)
+ && (opts->subsys_bits != root->subsys_bits))
return 0;

return 1;
@@ -1149,15 +1173,19 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
struct cgroupfs_root *root;

- /* Empty hierarchies aren't supported */
- if (!opts->subsys_bits)
+ if (!opts->subsys_bits && !opts->none)
return NULL;

root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);

+ if (!init_root_id(root)) {
+ kfree(root);
+ return ERR_PTR(-ENOMEM);
+ }
init_cgroup_root(root);
+
root->subsys_bits = opts->subsys_bits;
root->flags = opts->flags;
if (opts->release_agent)
@@ -1167,6 +1195,18 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
return root;
}

+static void cgroup_drop_root(struct cgroupfs_root *root)
+{
+ if (!root)
+ return;
+
+ BUG_ON(!root->hierarchy_id);
+ spin_lock(&hierarchy_id_lock);
+ ida_remove(&hierarchy_ida, root->hierarchy_id);
+ spin_unlock(&hierarchy_id_lock);
+ kfree(root);
+}
+
static int cgroup_set_super(struct super_block *sb, void *data)
{
int ret;
@@ -1176,7 +1216,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
if (!opts->new_root)
return -EINVAL;

- BUG_ON(!opts->subsys_bits);
+ BUG_ON(!opts->subsys_bits && !opts->none);

ret = set_anon_super(sb, NULL);
if (ret)
@@ -1245,7 +1285,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
- kfree(opts.new_root);
+ cgroup_drop_root(opts.new_root);
goto out_err;
}

@@ -1339,7 +1379,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
* We re-used an existing hierarchy - the new root (if
* any) is not needed
*/
- kfree(opts.new_root);
+ cgroup_drop_root(opts.new_root);
}

simple_set_mnt(mnt, sb);
@@ -1399,7 +1439,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
mutex_unlock(&cgroup_mutex);

kill_litter_super(sb);
- kfree(root);
+ cgroup_drop_root(root);
}

static struct file_system_type cgroup_fs_type = {
@@ -3089,7 +3129,7 @@ int __init cgroup_init(void)
/* Add init_css_set to the hash table */
hhead = css_set_hash(init_css_set.subsys);
hlist_add_head(&init_css_set.hlist, hhead);
-
+ BUG_ON(!init_root_id(&rootnode));
err = register_filesystem(&cgroup_fs_type);
if (err < 0)
goto out;
@@ -3144,7 +3184,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
struct cgroup *cgrp;
int count = 0;

- seq_printf(m, "%lu:", root->subsys_bits);
+ seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
@@ -3190,8 +3230,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
- seq_printf(m, "%s\t%lu\t%d\t%d\n",
- ss->name, ss->root->subsys_bits,
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->name, ss->root->hierarchy_id,
ss->root->number_of_cgroups, !ss->disabled);
}
mutex_unlock(&cgroup_mutex);
@@ -3909,8 +3949,8 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
name = c->dentry->d_name.name;
else
name = "?";
- seq_printf(seq, "Root %lu group %s\n",
- c->root->subsys_bits, name);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name);
}
rcu_read_unlock();
read_unlock(&css_set_lock);

2009-07-28 23:28:00

by Paul Menage

[permalink] [raw]
Subject: [PATCH 3/4] Add a back-pointer from struct cg_cgroup_link to struct cgroup

Add a back-pointer from struct cg_cgroup_link to struct cgroup

Currently the cgroups code makes the assumption that the subsystem
pointers in a struct css_set uniquely identify the hierarchy->cgroup
mappings associated with the css_set; and there's no way to directly
identify the associated set of cgroups other than by indirecting
through the appropriate subsystem state pointers.

This patch removes the need for that assumption by adding a
back-pointer from struct cg_cgroup_link object to its associated
cgroup; this allows the set of cgroups to be determined by traversing
the cg_links list in the struct css_set.

Signed-off-by: Paul Menage <[email protected]>
Reviewed-by: Li Zefan <[email protected]>

---

kernel/cgroup.c | 248 ++++++++++++++++++++++++++++++++++++++++++++-----------
1 files changed, 199 insertions(+), 49 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9491dbb..9972814 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -207,6 +207,7 @@ struct cg_cgroup_link {
* cgroup, anchored on cgroup->css_sets
*/
struct list_head cgrp_link_list;
+ struct cgroup *cgrp;
/*
* List running through cg_cgroup_links pointing at a
* single css_set object, anchored on css_set->cg_links
@@ -233,8 +234,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

-/* hash table for cgroup groups. This improves the performance to
- * find an existing css_set */
+/*
+ * hash table for cgroup groups. This improves the performance to find
+ * an existing css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
#define CSS_SET_HASH_BITS 7
#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -344,6 +348,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
}

/*
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cg: candidate css_set being tested
+ * @old_cg: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cg,
+ struct css_set *old_cg,
+ struct cgroup *new_cgrp,
+ struct cgroup_subsys_state *template[])
+{
+ struct list_head *l1, *l2;
+
+ if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+ /* Not all subsystems matched */
+ return false;
+ }
+
+ /*
+ * Compare cgroup pointers in order to distinguish between
+ * different cgroups in heirarchies with no subsystems. We
+ * could get by with just this check alone (and skip the
+ * memcmp above) but on most setups the memcmp check will
+ * avoid the need for this more expensive check on almost all
+ * candidates.
+ */
+
+ l1 = &cg->cg_links;
+ l2 = &old_cg->cg_links;
+ while (1) {
+ struct cg_cgroup_link *cgl1, *cgl2;
+ struct cgroup *cg1, *cg2;
+
+ l1 = l1->next;
+ l2 = l2->next;
+ /* See if we reached the end - both lists are equal length. */
+ if (l1 == &cg->cg_links) {
+ BUG_ON(l2 != &old_cg->cg_links);
+ break;
+ } else {
+ BUG_ON(l2 == &old_cg->cg_links);
+ }
+ /* Locate the cgroups associated with these links. */
+ cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
+ cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
+ cg1 = cgl1->cgrp;
+ cg2 = cgl2->cgrp;
+ /* Hierarchies should be linked in the same order. */
+ BUG_ON(cg1->root != cg2->root);
+
+ /*
+ * If this hierarchy is the hierarchy of the cgroup
+ * that's changing, then we need to check that this
+ * css_set points to the new cgroup; if it's any other
+ * hierarchy, then this css_set should point to the
+ * same cgroup as the old css_set.
+ */
+ if (cg1->root == new_cgrp->root) {
+ if (cg1 != new_cgrp)
+ return false;
+ } else {
+ if (cg1 != cg2)
+ return false;
+ }
+ }
+ return true;
+}
+
+/*
* find_existing_css_set() is a helper for
* find_css_set(), and checks to see whether an existing
* css_set is suitable.
@@ -384,10 +460,11 @@ static struct css_set *find_existing_css_set(

hhead = css_set_hash(template);
hlist_for_each_entry(cg, node, hhead, hlist) {
- if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
- /* All subsystems matched */
- return cg;
- }
+ if (!compare_css_sets(cg, oldcg, cgrp, template))
+ continue;
+
+ /* This css_set matches what we need */
+ return cg;
}

/* No existing cgroup group matched */
@@ -441,8 +518,13 @@ static void link_css_set(struct list_head *tmp_cg_links,
link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
cgrp_link_list);
link->cg = cg;
+ link->cgrp = cgrp;
list_move(&link->cgrp_link_list, &cgrp->css_sets);
- list_add(&link->cg_link_list, &cg->cg_links);
+ /*
+ * Always add links to the tail of the list so that the list
+ * is sorted by order of hierarchy creation
+ */
+ list_add_tail(&link->cg_link_list, &cg->cg_links);
}

/*
@@ -462,6 +544,7 @@ static struct css_set *find_css_set(
struct list_head tmp_cg_links;

struct hlist_head *hhead;
+ struct cg_cgroup_link *link;

/* First see if we already have a cgroup group that matches
* the desired set */
@@ -497,18 +580,14 @@ static struct css_set *find_css_set(
/* Add reference counts and links from the new css_set. */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup *cgrp = res->subsys[i]->cgroup;
- struct cgroup_subsys *ss = subsys[i];
atomic_inc(&cgrp->count);
- /*
- * We want to add a link once per cgroup, so we
- * only do it for the first subsystem in each
- * hierarchy
- */
- if (ss->root->subsys_list.next == &ss->sibling)
- link_css_set(&tmp_cg_links, res, cgrp);
}
- if (list_empty(&rootnode.subsys_list))
- link_css_set(&tmp_cg_links, res, dummytop);
+ list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+ struct cgroup *c = link->cgrp;
+ if (c->root == cgrp->root)
+ c = cgrp;
+ link_css_set(&tmp_cg_links, res, c);
+ }

BUG_ON(!list_empty(&tmp_cg_links));

@@ -524,6 +603,41 @@ static struct css_set *find_css_set(
}

/*
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroupfs_root *root)
+{
+ struct css_set *css;
+ struct cgroup *res = NULL;
+
+ BUG_ON(!mutex_is_locked(&cgroup_mutex));
+ read_lock(&css_set_lock);
+ /*
+ * No need to lock the task - since we hold cgroup_mutex the
+ * task can't change groups, so the only thing that can happen
+ * is that it exits and its css is set back to init_css_set.
+ */
+ css = task->cgroups;
+ if (css == &init_css_set) {
+ res = &root->top_cgroup;
+ } else {
+ struct cg_cgroup_link *link;
+ list_for_each_entry(link, &css->cg_links, cg_link_list) {
+ struct cgroup *c = link->cgrp;
+ if (c->root == root) {
+ res = c;
+ break;
+ }
+ }
+ }
+ read_unlock(&css_set_lock);
+ BUG_ON(!res);
+ return res;
+}
+
+/*
* There is one global cgroup mutex. We also require taking
* task_lock() when dereferencing a task's cgroup subsys pointers.
* See "The task_lock() exception", at the end of this comment.
@@ -1350,27 +1464,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
return 0;
}

-/*
- * Return the first subsystem attached to a cgroup's hierarchy, and
- * its subsystem id.
- */
-
-static void get_first_subsys(const struct cgroup *cgrp,
- struct cgroup_subsys_state **css, int *subsys_id)
-{
- const struct cgroupfs_root *root = cgrp->root;
- const struct cgroup_subsys *test_ss;
- BUG_ON(list_empty(&root->subsys_list));
- test_ss = list_entry(root->subsys_list.next,
- struct cgroup_subsys, sibling);
- if (css) {
- *css = cgrp->subsys[test_ss->subsys_id];
- BUG_ON(!*css);
- }
- if (subsys_id)
- *subsys_id = test_ss->subsys_id;
-}
-
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1387,12 +1480,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
struct css_set *cg;
struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
- int subsys_id;
-
- get_first_subsys(cgrp, NULL, &subsys_id);

/* Nothing to do if the task is already in that cgroup */
- oldcgrp = task_cgroup(tsk, subsys_id);
+ oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
return 0;

@@ -1950,7 +2040,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
* the start of a css_set
*/
static void cgroup_advance_iter(struct cgroup *cgrp,
- struct cgroup_iter *it)
+ struct cgroup_iter *it)
{
struct list_head *l = it->cg_link;
struct cg_cgroup_link *link;
@@ -2944,6 +3034,7 @@ int __init cgroup_init_early(void)
init_task.cgroups = &init_css_set;

init_css_set_link.cg = &init_css_set;
+ init_css_set_link.cgrp = dummytop;
list_add(&init_css_set_link.cgrp_link_list,
&rootnode.top_cgroup.css_sets);
list_add(&init_css_set_link.cg_link_list,
@@ -3051,7 +3142,6 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
for_each_active_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
- int subsys_id;
int count = 0;

seq_printf(m, "%lu:", root->subsys_bits);
@@ -3061,8 +3151,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
- get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
- cgrp = task_cgroup(tsk, subsys_id);
+ cgrp = task_cgroup_from_root(tsk, root);
retval = cgroup_path(cgrp, buf, PAGE_SIZE);
if (retval < 0)
goto out_unlock;
@@ -3388,13 +3477,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
{
int ret;
struct cgroup *target;
- int subsys_id;

if (cgrp == dummytop)
return 1;

- get_first_subsys(cgrp, NULL, &subsys_id);
- target = task_cgroup(task, subsys_id);
+ target = task_cgroup_from_root(task, cgrp->root);
while (cgrp != target && cgrp!= cgrp->top_cgroup)
cgrp = cgrp->parent;
ret = (cgrp == target);
@@ -3804,6 +3891,59 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
return count;
}

+static int current_css_set_cg_links_read(struct cgroup *cont,
+ struct cftype *cft,
+ struct seq_file *seq)
+{
+ struct cg_cgroup_link *link;
+ struct css_set *cg;
+
+ read_lock(&css_set_lock);
+ rcu_read_lock();
+ cg = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+ struct cgroup *c = link->cgrp;
+ const char *name;
+
+ if (c->dentry)
+ name = c->dentry->d_name.name;
+ else
+ name = "?";
+ seq_printf(seq, "Root %lu group %s\n",
+ c->root->subsys_bits, name);
+ }
+ rcu_read_unlock();
+ read_unlock(&css_set_lock);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct cgroup *cont,
+ struct cftype *cft,
+ struct seq_file *seq)
+{
+ struct cg_cgroup_link *link;
+
+ read_lock(&css_set_lock);
+ list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
+ struct css_set *cg = link->cg;
+ struct task_struct *task;
+ int count = 0;
+ seq_printf(seq, "css_set %p\n", cg);
+ list_for_each_entry(task, &cg->tasks, cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
+ seq_puts(seq, " ...\n");
+ break;
+ } else {
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+ }
+ }
+ read_unlock(&css_set_lock);
+ return 0;
+}
+
static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
{
return test_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3830,6 +3970,16 @@ static struct cftype debug_files[] = {
},

{
+ .name = "current_css_set_cg_links",
+ .read_seq_string = current_css_set_cg_links_read,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .read_seq_string = cgroup_css_links_read,
+ },
+
+ {
.name = "releasable",
.read_u64 = releasable_read,
},

2009-07-29 07:32:15

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH 0/4] CGroup: Support for named and empty hierarchies

Paul Menage wrote:
> The following series implements support for named cgroup hierarchies,
> and for cgroup hierarchies that have no bound subsystems.
>
> Signed-off-by: Paul Menage <[email protected]>
>

Looks nice to me!

2009-07-29 10:44:24

by Dhaval Giani

[permalink] [raw]
Subject: Re: [PATCH 1/4] Support named cgroups hierarchies

On Tue, Jul 28, 2009 at 04:26:21PM -0700, Paul Menage wrote:
> Support named cgroups hierarchies
>
> To simplify referring to cgroup hierarchies in mount statements, and
> to allow disambiguation in the presence of empty hierarchies and
> multiply-bindable subsystems this patch adds support for naming a new
> cgroup hierarchy via the "name=" mount option
>
> A pre-existing hierarchy may be specified by either name or by
> subsystems; a hierarchy's name cannot be changed by a remount
> operation.
>
> Example usage:
>
> # To create a hierarchy called "foo" containing the "cpu" subsystem
> mount -t cgroup -oname=foo,cpu cgroup /mnt/cgroup1
>
> # To mount the "foo" hierarchy on a second location
> mount -t cgroup -oname=foo cgroup /mnt/cgroup2
>
>
> Signed-off-by: Paul Menage <[email protected]>
> Reviewed-by: Li Zefan <[email protected]>
>
> ---
>
> Documentation/cgroups/cgroups.txt | 20 ++++
> kernel/cgroup.c | 185 +++++++++++++++++++++++++++----------
> 2 files changed, 157 insertions(+), 48 deletions(-)
>
> diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
> index 6eb1a97..4bccfc1 100644
> --- a/Documentation/cgroups/cgroups.txt
> +++ b/Documentation/cgroups/cgroups.txt
> @@ -408,6 +408,26 @@ You can attach the current shell task by echoing 0:
>
> # echo 0 > tasks
>
> +2.3 Mounting hierarchies by name
> +--------------------------------
> +
> +Passing the name=<x> option when mounting a cgroups hierarchy
> +associates the given name with the hierarchy. This can be used when
> +mounting a pre-existing hierarchy, in order to refer to it by name
> +rather than by its set of active subsystems. Each hierarchy is either
> +nameless, or has a unique name.
> +
> +The name should match [\w.-]+
> +
> +When passing a name=<x> option for a new hierarchy, you need to
> +specify subsystems manually; the legacy behaviour of mounting all
> +subsystems when none are explicitly specified is not supported when
> +you give a subsystem a name.
> +
> +The name of the subsystem appears as part of the hierarchy description
> +in /proc/mounts and /proc/<pid>/cgroups.
> +
> +
> 3. Kernel API
> =============
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 18acba7..85573e8 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -23,6 +23,7 @@
> */
>
> #include <linux/cgroup.h>
> +#include <linux/ctype.h>
> #include <linux/errno.h>
> #include <linux/fs.h>
> #include <linux/kernel.h>
> @@ -60,6 +61,8 @@ static struct cgroup_subsys *subsys[] = {
> #include <linux/cgroup_subsys.h>
> };
>
> +#define MAX_CGROUP_ROOT_NAMELEN 64
> +
> /*
> * A cgroupfs_root represents the root of a cgroup hierarchy,
> * and may be associated with a superblock to form an active
> @@ -94,6 +97,9 @@ struct cgroupfs_root {
>
> /* The path to use for release notifications. */
> char release_agent_path[PATH_MAX];
> +
> + /* The name for this hierarchy - may be empty */
> + char name[MAX_CGROUP_ROOT_NAMELEN];
> };
>
> /*
> @@ -829,6 +835,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
> seq_puts(seq, ",noprefix");
> if (strlen(root->release_agent_path))
> seq_printf(seq, ",release_agent=%s", root->release_agent_path);
> + if (strlen(root->name))
> + seq_printf(seq, ",name=%s", root->name);
> mutex_unlock(&cgroup_mutex);
> return 0;
> }
> @@ -837,6 +845,9 @@ struct cgroup_sb_opts {
> unsigned long subsys_bits;
> unsigned long flags;
> char *release_agent;
> + char *name;
> +
> + struct cgroupfs_root *new_root;
> };
>
> /* Convert a hierarchy specifier into a bitmask of subsystems and
> @@ -851,9 +862,7 @@ static int parse_cgroupfs_options(char *data,
> mask = ~(1UL << cpuset_subsys_id);
> #endif
>
> - opts->subsys_bits = 0;
> - opts->flags = 0;
> - opts->release_agent = NULL;
> + memset(opts, 0, sizeof(*opts));
>
> while ((token = strsep(&o, ",")) != NULL) {
> if (!*token)
> @@ -873,11 +882,33 @@ static int parse_cgroupfs_options(char *data,
> /* Specifying two release agents is forbidden */
> if (opts->release_agent)
> return -EINVAL;
> - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
> + opts->release_agent =
> + kstrndup(token + 14, PATH_MAX, GFP_KERNEL);

I am not sure how it can be acheived, but can we avoid using 14 here (it
took me a moment before I realized it was strlen("release_agent")

> if (!opts->release_agent)
> return -ENOMEM;
> - strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
> - opts->release_agent[PATH_MAX - 1] = 0;
> + } else if (!strncmp(token, "name=", 5)) {
> + int i;
> + const char *name = token + 5;

similarly here as well

> + /* Can't specify an empty name */
> + if (!strlen(name))
> + return -EINVAL;
> + /* Must match [\w.-]+ */
> + for (i = 0; i < strlen(name); i++) {
> + char c = name[i];
> + if (isalnum(c))
> + continue;
> + if ((c == '.') || (c == '-') || (c == '_'))
> + continue;
> + return -EINVAL;
> + }
> + /* Specifying two names is forbidden */
> + if (opts->name)
> + return -EINVAL;
> + opts->name = kstrndup(name,
> + MAX_CGROUP_ROOT_NAMELEN,
> + GFP_KERNEL);
> + if (!opts->name)
> + return -ENOMEM;
> } else {
> struct cgroup_subsys *ss;
> int i;
> @@ -904,7 +935,7 @@ static int parse_cgroupfs_options(char *data,
> return -EINVAL;
>
> /* We can't have an empty hierarchy */
> - if (!opts->subsys_bits)
> + if (!opts->subsys_bits && !opts->name)
> return -EINVAL;
>
> return 0;
> @@ -932,6 +963,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> goto out_unlock;
> }
>
> + /* Don't allow name to change at remount */
> + if (opts.name && strcmp(opts.name, root->name)) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> ret = rebind_subsystems(root, opts.subsys_bits);
> if (ret)
> goto out_unlock;
> @@ -943,6 +980,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> strcpy(root->release_agent_path, opts.release_agent);
> out_unlock:
> kfree(opts.release_agent);
> + kfree(opts.name);
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
> unlock_kernel();
> @@ -965,6 +1003,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> INIT_LIST_HEAD(&cgrp->pids_list);
> init_rwsem(&cgrp->pids_mutex);
> }
> +
> static void init_cgroup_root(struct cgroupfs_root *root)
> {
> struct cgroup *cgrp = &root->top_cgroup;
> @@ -978,31 +1017,59 @@ static void init_cgroup_root(struct cgroupfs_root *root)
>
> static int cgroup_test_super(struct super_block *sb, void *data)
> {
> - struct cgroupfs_root *new = data;
> + struct cgroup_sb_opts *opts = data;
> struct cgroupfs_root *root = sb->s_fs_info;
>
> - /* First check subsystems */
> - if (new->subsys_bits != root->subsys_bits)
> - return 0;
> + /* If we asked for a name then it must match */
> + if (opts->name && strcmp(opts->name, root->name))
> + return 0;
>
> - /* Next check flags */
> - if (new->flags != root->flags)
> + /* If we asked for subsystems then they must match */
> + if (opts->subsys_bits && (opts->subsys_bits != root->subsys_bits))
> return 0;
>
> return 1;
> }
>
> +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
> +{
> + struct cgroupfs_root *root;
> +
> + /* Empty hierarchies aren't supported */
> + if (!opts->subsys_bits)
> + return NULL;
> +
> + root = kzalloc(sizeof(*root), GFP_KERNEL);
> + if (!root)
> + return ERR_PTR(-ENOMEM);
> +
> + init_cgroup_root(root);
> + root->subsys_bits = opts->subsys_bits;
> + root->flags = opts->flags;
> + if (opts->release_agent)
> + strcpy(root->release_agent_path, opts->release_agent);
> + if (opts->name)
> + strcpy(root->name, opts->name);
> + return root;
> +}
> +
> static int cgroup_set_super(struct super_block *sb, void *data)
> {
> int ret;
> - struct cgroupfs_root *root = data;
> + struct cgroup_sb_opts *opts = data;
> +
> + /* If we don't have a new root, we can't set up a new sb */
> + if (!opts->new_root)
> + return -EINVAL;
> +
> + BUG_ON(!opts->subsys_bits);
>
> ret = set_anon_super(sb, NULL);
> if (ret)
> return ret;
>
> - sb->s_fs_info = root;
> - root->sb = sb;
> + sb->s_fs_info = opts->new_root;
> + opts->new_root->sb = sb;
>
> sb->s_blocksize = PAGE_CACHE_SIZE;
> sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
> @@ -1039,48 +1106,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> void *data, struct vfsmount *mnt)
> {
> struct cgroup_sb_opts opts;
> + struct cgroupfs_root *root;
> int ret = 0;
> struct super_block *sb;
> - struct cgroupfs_root *root;
> - struct list_head tmp_cg_links;
> + struct cgroupfs_root *new_root;
>
> /* First find the desired set of subsystems */
> ret = parse_cgroupfs_options(data, &opts);
> - if (ret) {
> - kfree(opts.release_agent);
> - return ret;
> - }
> -
> - root = kzalloc(sizeof(*root), GFP_KERNEL);
> - if (!root) {
> - kfree(opts.release_agent);
> - return -ENOMEM;
> - }
> + if (ret)
> + goto out_err;
>
> - init_cgroup_root(root);
> - root->subsys_bits = opts.subsys_bits;
> - root->flags = opts.flags;
> - if (opts.release_agent) {
> - strcpy(root->release_agent_path, opts.release_agent);
> - kfree(opts.release_agent);
> + /*
> + * Allocate a new cgroup root. We may not need it if we're
> + * reusing an existing hierarchy.
> + */
> + new_root = cgroup_root_from_opts(&opts);
> + if (IS_ERR(new_root)) {
> + ret = PTR_ERR(new_root);
> + goto out_err;
> }
> + opts.new_root = new_root;
>
> - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
> -
> + /* Locate an existing or new sb for this hierarchy */
> + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
> if (IS_ERR(sb)) {
> - kfree(root);
> - return PTR_ERR(sb);
> + ret = PTR_ERR(sb);
> + kfree(opts.new_root);
> + goto out_err;
> }
>
> - if (sb->s_fs_info != root) {
> - /* Reusing an existing superblock */
> - BUG_ON(sb->s_root == NULL);
> - kfree(root);
> - root = NULL;
> - } else {
> - /* New superblock */
> + root = sb->s_fs_info;
> + BUG_ON(!root);
> + if (root == opts.new_root) {
> + /* We used the new root structure, so this is a new hierarchy */
> + struct list_head tmp_cg_links;
> struct cgroup *root_cgrp = &root->top_cgroup;
> struct inode *inode;
> + struct cgroupfs_root *existing_root;
> int i;
>
> BUG_ON(sb->s_root != NULL);
> @@ -1093,6 +1155,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> mutex_lock(&inode->i_mutex);
> mutex_lock(&cgroup_mutex);
>
> + if (strlen(root->name)) {
> + /* Check for name clashes with existing mounts */
> + for_each_active_root(existing_root) {
> + if (!strcmp(existing_root->name, root->name)) {
> + ret = -EBUSY;
> + mutex_unlock(&cgroup_mutex);
> + mutex_unlock(&inode->i_mutex);
> + goto drop_new_super;
> + }
> + }
> + }
> +
> /*
> * We're accessing css_set_count without locking
> * css_set_lock here, but that's OK - it can only be
> @@ -1111,7 +1185,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> if (ret == -EBUSY) {
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&inode->i_mutex);
> - goto free_cg_links;
> + free_cg_links(&tmp_cg_links);
> + goto drop_new_super;
> }
>
> /* EBUSY should be the only error here */
> @@ -1145,15 +1220,26 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> cgroup_populate_dir(root_cgrp);
> mutex_unlock(&inode->i_mutex);
> mutex_unlock(&cgroup_mutex);
> + } else {
> + /*
> + * We re-used an existing hierarchy - the new root (if
> + * any) is not needed
> + */
> + kfree(opts.new_root);
> }
>
> simple_set_mnt(mnt, sb);
> + kfree(opts.release_agent);
> + kfree(opts.name);
> return 0;
>
> - free_cg_links:
> - free_cg_links(&tmp_cg_links);
> drop_new_super:
> deactivate_locked_super(sb);
> +
> + out_err:
> + kfree(opts.release_agent);
> + kfree(opts.name);
> +
> return ret;
> }
>
> @@ -2971,6 +3057,9 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
> seq_printf(m, "%lu:", root->subsys_bits);
> for_each_subsys(root, ss)
> seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
> + if (strlen(root->name))
> + seq_printf(m, "%sname=%s", count ? "," : "",
> + root->name);
> seq_putc(m, ':');
> get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
> cgrp = task_cgroup(tsk, subsys_id);
>

--
regards,
Dhaval