The following series adds support for:
- named cgroup hierarchies
- cgroup hierarchies with no bound subsystems
- cgroup subsystems that can be bound to multiple hierarchies
This allows more flexibility when constructing/mounting cgroups
hierarchies, and allows functionality that's not tied to specific
external resources to be made available on multiple cgroups
hierarchies rather than just one.
A few simple example cgroups subsystems that are multi-bindable are
included in the patch series.
---
Paul Menage (9):
[RFC] Support named cgroups hierarchies
[RFC] Move the cgroup debug subsys into cgroup.c to access internal state
[RFC] Add a back-pointer from struct cg_cgroup_link to struct cgroup
[RFC] Allow cgroup hierarchies to be created with no bound subsystems
[RFC] Remove cgroup_subsys.root pointer
[RFC] Remove the cgroup_subsys.bind callback
[RFC] Support multiply-bindable cgroup subsystems
[RFC] Example multi-bindable subsystem: a per-cgroup notes field
[RFC] Example multi-bindable subsystem: a max-depth controller
Documentation/cgroups/cgroups.txt | 8
include/linux/cgroup.h | 55 ++
include/linux/cgroup_subsys.h | 14 +
init/Kconfig | 17 +
kernel/Makefile | 3
kernel/cgroup.c | 905 +++++++++++++++++++++++++++----------
kernel/cgroup_debug.c | 105 ----
kernel/info_cgroup.c | 133 +++++
kernel/maxdepth_cgroup.c | 80 +++
9 files changed, 938 insertions(+), 382 deletions(-)
delete mode 100644 kernel/cgroup_debug.c
create mode 100644 kernel/info_cgroup.c
create mode 100644 kernel/maxdepth_cgroup.c
[RFC]Move the cgroup debug subsys into cgroup.c to access internal state
While it's architecturally clean to have the cgroup debug subsystem be
completely independent of the cgroups framework, it limits its
usefulness for debugging the contents of internal data structures.
Move the debug subsystem code into the scope of all the cgroups data
structures to make more detailed debugging possible.
Signed-off-by: Paul Menage <[email protected]>
---
kernel/Makefile | 1
kernel/cgroup.c | 88 +++++++++++++++++++++++++++++++++++++++++
kernel/cgroup_debug.c | 105 -------------------------------------------------
3 files changed, 88 insertions(+), 106 deletions(-)
delete mode 100644 kernel/cgroup_debug.c
diff --git a/kernel/Makefile b/kernel/Makefile
index 51e51fe..7ffdc16 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
-obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 940f28d..6788c97 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3664,3 +3664,91 @@ css_get_next(struct cgroup_subsys *ss, int id,
return ret;
}
+#ifdef CONFIG_CGROUP_DEBUG
+static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ kfree(cont->subsys[debug_subsys_id]);
+}
+
+static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+{
+ return atomic_read(&cont->count);
+}
+
+static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+{
+ return cgroup_task_count(cont);
+}
+
+static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+{
+ return (u64)(long)current->cgroups;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup *cont,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = atomic_read(¤t->cgroups->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+}
+
+static struct cftype debug_files[] = {
+ {
+ .name = "cgroup_refcount",
+ .read_u64 = cgroup_refcount_read,
+ },
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .read_u64 = current_css_set_read,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+};
+
+static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ return cgroup_add_files(cont, ss, debug_files,
+ ARRAY_SIZE(debug_files));
+}
+
+struct cgroup_subsys debug_subsys = {
+ .name = "debug",
+ .create = debug_create,
+ .destroy = debug_destroy,
+ .populate = debug_populate,
+ .subsys_id = debug_subsys_id,
+};
+#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d79..0000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * kernel/cgroup_debug.c - Example cgroup subsystem that
- * exposes debug info
- *
- * Copyright (C) Google Inc, 2007
- *
- * Developed by Paul Menage ([email protected])
- *
- */
-
-#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-
-#include <asm/atomic.h>
-
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
- struct cgroup *cont)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
-{
- return atomic_read(&cont->count);
-}
-
-static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
-{
- u64 count;
-
- count = cgroup_task_count(cont);
- return count;
-}
-
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
-{
- return (u64)(long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup *cont,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = atomic_read(¤t->cgroups->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
-{
- return test_bit(CGRP_RELEASABLE, &cgrp->flags);
-}
-
-static struct cftype files[] = {
- {
- .name = "cgroup_refcount",
- .read_u64 = cgroup_refcount_read,
- },
- {
- .name = "taskcount",
- .read_u64 = taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-};
-
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
- return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-}
-
-struct cgroup_subsys debug_subsys = {
- .name = "debug",
- .create = debug_create,
- .destroy = debug_destroy,
- .populate = debug_populate,
- .subsys_id = debug_subsys_id,
-};
[RFC] Add a back-pointer from struct cg_cgroup_link to struct cgroup
Currently the cgroups code makes the assumption that the subsystem
pointers in a struct css_set uniquely identify the hierarchy->cgroup
mappings associated with the css_set; and there's no way to directly
identify the associated set of cgroups other than by indirecting
through the appropriate subsystem state pointers.
This patch removes the need for that assumption by adding a
back-pointer from struct cg_cgroup_link object to its associated
cgroup; this allows the set of cgroups to be determined by traversing
the cg_links list in the struct css_set.
Signed-off-by: Paul Menage <[email protected]>
---
kernel/cgroup.c | 217 +++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 168 insertions(+), 49 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6788c97..cd543ec 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -205,6 +205,7 @@ struct cg_cgroup_link {
* cgroup, anchored on cgroup->css_sets
*/
struct list_head cgrp_link_list;
+ struct cgroup *cgrp;
/*
* List running through cg_cgroup_links pointing at a
* single css_set object, anchored on css_set->cg_links
@@ -231,8 +232,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;
-/* hash table for cgroup groups. This improves the performance to
- * find an existing css_set */
+/*
+ * hash table for cgroup groups. This improves the performance to find
+ * an existing css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
#define CSS_SET_HASH_BITS 7
#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -342,6 +346,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
}
/*
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cg: candidate css_set being tested
+ * @old_cg: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cg,
+ struct css_set *old_cg,
+ struct cgroup *new_cgrp,
+ struct cgroup_subsys_state *template[])
+{
+ struct list_head *l1, *l2;
+
+ if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+ /* Not all subsystems matched */
+ return false;
+ }
+
+ /*
+ * Compare cgroup pointers in order to distinguish between
+ * different cgroups in heirarchies with no subsystems. We
+ * could get by with just this check alone (and skip the
+ * memcmp above) but on most setups the memcmp check will
+ * avoid the need for this more expensive check on almost all
+ * candidates.
+ */
+
+ l1 = &cg->cg_links;
+ l2 = &old_cg->cg_links;
+ while (1) {
+ struct cg_cgroup_link *cgl1, *cgl2;
+ struct cgroup *cg1, *cg2;
+
+ l1 = l1->next;
+ l2 = l2->next;
+ /* See if we reached the end - both lists are equal length. */
+ if (l1 == &cg->cg_links) {
+ BUG_ON(l2 != &old_cg->cg_links);
+ break;
+ } else {
+ BUG_ON(l2 == &old_cg->cg_links);
+ }
+ /* Locate the cgroups associated with these links. */
+ cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
+ cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
+ cg1 = cgl1->cgrp;
+ cg2 = cgl2->cgrp;
+ /* Hierarchies should be linked in the same order. */
+ BUG_ON(cg1->root != cg2->root);
+
+ /*
+ * If this hierarchy is the hierarchy of the cgroup
+ * that's changing, then we need to check that this
+ * css_set points to the new cgroup; if it's any other
+ * hierarchy, then this css_set should point to the
+ * same cgroup as the old css_set.
+ */
+ if (cg1->root == new_cgrp->root) {
+ if (cg1 != new_cgrp)
+ return false;
+ } else {
+ if (cg1 != cg2)
+ return false;
+ }
+ }
+ return true;
+}
+
+/*
* find_existing_css_set() is a helper for
* find_css_set(), and checks to see whether an existing
* css_set is suitable.
@@ -382,10 +458,11 @@ static struct css_set *find_existing_css_set(
hhead = css_set_hash(template);
hlist_for_each_entry(cg, node, hhead, hlist) {
- if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
- /* All subsystems matched */
- return cg;
- }
+ if (!compare_css_sets(cg, oldcg, cgrp, template))
+ continue;
+
+ /* This css_set matches what we need */
+ return cg;
}
/* No existing cgroup group matched */
@@ -439,8 +516,13 @@ static void link_css_set(struct list_head *tmp_cg_links,
link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
cgrp_link_list);
link->cg = cg;
+ link->cgrp = cgrp;
list_move(&link->cgrp_link_list, &cgrp->css_sets);
- list_add(&link->cg_link_list, &cg->cg_links);
+ /*
+ * Always add links to the tail of the list so that the list
+ * is sorted by order of hierarchy creation
+ */
+ list_add_tail(&link->cg_link_list, &cg->cg_links);
}
/*
@@ -460,6 +542,7 @@ static struct css_set *find_css_set(
struct list_head tmp_cg_links;
struct hlist_head *hhead;
+ struct cg_cgroup_link *link;
/* First see if we already have a cgroup group that matches
* the desired set */
@@ -495,18 +578,14 @@ static struct css_set *find_css_set(
/* Add reference counts and links from the new css_set. */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup *cgrp = res->subsys[i]->cgroup;
- struct cgroup_subsys *ss = subsys[i];
atomic_inc(&cgrp->count);
- /*
- * We want to add a link once per cgroup, so we
- * only do it for the first subsystem in each
- * hierarchy
- */
- if (ss->root->subsys_list.next == &ss->sibling)
- link_css_set(&tmp_cg_links, res, cgrp);
}
- if (list_empty(&rootnode.subsys_list))
- link_css_set(&tmp_cg_links, res, dummytop);
+ list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+ struct cgroup *c = link->cgrp;
+ if (c->root == cgrp->root)
+ c = cgrp;
+ link_css_set(&tmp_cg_links, res, c);
+ }
BUG_ON(!list_empty(&tmp_cg_links));
@@ -522,6 +601,41 @@ static struct css_set *find_css_set(
}
/*
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+ struct cgroupfs_root *root)
+{
+ struct css_set *css;
+ struct cgroup *res = NULL;
+
+ BUG_ON(!mutex_is_locked(&cgroup_mutex));
+ read_lock(&css_set_lock);
+ /*
+ * No need to lock the task - since we hold cgroup_mutex the
+ * task can't change groups, so the only thing that can happen
+ * is that it exits and its css is set back to init_css_set.
+ */
+ css = task->cgroups;
+ if (css == &init_css_set) {
+ res = &root->top_cgroup;
+ } else {
+ struct cg_cgroup_link *link;
+ list_for_each_entry(link, &css->cg_links, cg_link_list) {
+ struct cgroup *c = link->cgrp;
+ if (c->root == root) {
+ res = c;
+ break;
+ }
+ }
+ }
+ read_unlock(&css_set_lock);
+ BUG_ON(!res);
+ return res;
+}
+
+/*
* There is one global cgroup mutex. We also require taking
* task_lock() when dereferencing a task's cgroup subsys pointers.
* See "The task_lock() exception", at the end of this comment.
@@ -1299,27 +1413,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
return 0;
}
-/*
- * Return the first subsystem attached to a cgroup's hierarchy, and
- * its subsystem id.
- */
-
-static void get_first_subsys(const struct cgroup *cgrp,
- struct cgroup_subsys_state **css, int *subsys_id)
-{
- const struct cgroupfs_root *root = cgrp->root;
- const struct cgroup_subsys *test_ss;
- BUG_ON(list_empty(&root->subsys_list));
- test_ss = list_entry(root->subsys_list.next,
- struct cgroup_subsys, sibling);
- if (css) {
- *css = cgrp->subsys[test_ss->subsys_id];
- BUG_ON(!*css);
- }
- if (subsys_id)
- *subsys_id = test_ss->subsys_id;
-}
-
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1336,12 +1429,9 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
struct css_set *cg;
struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
- int subsys_id;
-
- get_first_subsys(cgrp, NULL, &subsys_id);
/* Nothing to do if the task is already in that cgroup */
- oldcgrp = task_cgroup(tsk, subsys_id);
+ oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
return 0;
@@ -1899,7 +1989,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
* the start of a css_set
*/
static void cgroup_advance_iter(struct cgroup *cgrp,
- struct cgroup_iter *it)
+ struct cgroup_iter *it)
{
struct list_head *l = it->cg_link;
struct cg_cgroup_link *link;
@@ -2847,6 +2937,7 @@ int __init cgroup_init_early(void)
init_task.cgroups = &init_css_set;
init_css_set_link.cg = &init_css_set;
+ init_css_set_link.cgrp = dummytop;
list_add(&init_css_set_link.cgrp_link_list,
&rootnode.top_cgroup.css_sets);
list_add(&init_css_set_link.cg_link_list,
@@ -2954,7 +3045,6 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
for_each_active_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
- int subsys_id;
int count = 0;
seq_printf(m, "%lu:", root->subsys_bits);
@@ -2964,8 +3054,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
seq_printf(m, "%sname=%s",
count ? "," : "", root->name);
seq_putc(m, ':');
- get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
- cgrp = task_cgroup(tsk, subsys_id);
+ cgrp = task_cgroup_from_root(tsk, root);
retval = cgroup_path(cgrp, buf, PAGE_SIZE);
if (retval < 0)
goto out_unlock;
@@ -3291,13 +3380,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
{
int ret;
struct cgroup *target;
- int subsys_id;
if (cgrp == dummytop)
return 1;
- get_first_subsys(cgrp, NULL, &subsys_id);
- target = task_cgroup(task, subsys_id);
+ target = task_cgroup_from_root(task, cgrp->root);
while (cgrp != target && cgrp!= cgrp->top_cgroup)
cgrp = cgrp->parent;
ret = (cgrp == target);
@@ -3707,6 +3794,33 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
return count;
}
+static int current_css_set_cg_links_read(struct cgroup *cont,
+ struct cftype *cft,
+ struct seq_file *seq)
+{
+ struct cg_cgroup_link *link, *saved_link;
+ struct css_set *cg;
+ write_lock_irq(&css_set_lock);
+ task_lock(current);
+ cg = current->cgroups;
+ list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+ cg_link_list) {
+ struct cgroup *c = link->cgrp;
+ const char *name;
+ rcu_read_lock();
+ if (c->dentry)
+ name = c->dentry->d_name.name;
+ else
+ name = "?";
+ seq_printf(seq, "Root %lu group %s\n",
+ c->root->subsys_bits, name);
+ rcu_read_unlock();
+ }
+ task_unlock(current);
+ write_unlock_irq(&css_set_lock);
+ return 0;
+}
+
static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
{
return test_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3733,6 +3847,11 @@ static struct cftype debug_files[] = {
},
{
+ .name = "current_css_set_cg_links",
+ .read_seq_string = current_css_set_cg_links_read,
+ },
+
+ {
.name = "releasable",
.read_u64 = releasable_read,
},
[RFC] Support named cgroups hierarchies
To simplify referring to cgroup hierarchies in mount statements, and
to allow disambiguation in the presence of empty hierarchies and
multiply-bindable subsystems (see later patches in series) this patch
adds support for naming a new cgroup hierarchy via the "name=" mount
option
A pre-existing hierarchy may be specified by either name or by
subsystems; a hierarchy's name cannot be changed by a remount
operation.
Example usage:
# To create a hierarchy called "foo" containing the "cpu" subsystem
mount -t cgroup -oname=foo,cpu cgroup /mnt/cgroup1
# To mount the "foo" hierarchy on a second location
mount -t cgroup -oname=foo cgroup /mnt/cgroup2
Open issues:
- should the specification be via a name= option as in this patch, or
should we simply use the "device name" as passed to the mount()
system call? Using the device name is more conceptually clean and
consistent with the filesystem API; however, given that the device
name is currently ignored by cgroups, this would lead to a
user-visible behaviour change.
Signed-off-by: Paul Menage <[email protected]>
---
kernel/cgroup.c | 136 ++++++++++++++++++++++++++++++++++++-------------------
1 files changed, 88 insertions(+), 48 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ea255fe..940f28d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -59,6 +59,8 @@ static struct cgroup_subsys *subsys[] = {
#include <linux/cgroup_subsys.h>
};
+#define MAX_CGROUP_ROOT_NAMELEN 64
+
/*
* A cgroupfs_root represents the root of a cgroup hierarchy,
* and may be associated with a superblock to form an active
@@ -93,6 +95,9 @@ struct cgroupfs_root {
/* The path to use for release notifications. */
char release_agent_path[PATH_MAX];
+
+ /* The name for this hierarchy - may be empty */
+ char name[MAX_CGROUP_ROOT_NAMELEN];
};
/*
@@ -828,6 +833,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",noprefix");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ if (strlen(root->name))
+ seq_printf(seq, ",name=%s", root->name);
mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -836,12 +843,15 @@ struct cgroup_sb_opts {
unsigned long subsys_bits;
unsigned long flags;
char *release_agent;
+ char *name;
+ /* A flag indicating that a root was created from this options block */
+ bool created_root;
};
/* Convert a hierarchy specifier into a bitmask of subsystems and
* flags. */
static int parse_cgroupfs_options(char *data,
- struct cgroup_sb_opts *opts)
+ struct cgroup_sb_opts *opts)
{
char *token, *o = data ?: "all";
unsigned long mask = (unsigned long)-1;
@@ -850,9 +860,7 @@ static int parse_cgroupfs_options(char *data,
mask = ~(1UL << cpuset_subsys_id);
#endif
- opts->subsys_bits = 0;
- opts->flags = 0;
- opts->release_agent = NULL;
+ memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
@@ -872,11 +880,19 @@ static int parse_cgroupfs_options(char *data,
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
- opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+ opts->release_agent =
+ kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
- strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
- opts->release_agent[PATH_MAX - 1] = 0;
+ } else if (!strncmp(token, "name=", 5)) {
+ /* Specifying two names is forbidden */
+ if (opts->name)
+ return -EINVAL;
+ opts->name = kstrndup(token + 5,
+ MAX_CGROUP_ROOT_NAMELEN,
+ GFP_KERNEL);
+ if (!opts->name)
+ return -ENOMEM;
} else {
struct cgroup_subsys *ss;
int i;
@@ -903,7 +919,7 @@ static int parse_cgroupfs_options(char *data,
return -EINVAL;
/* We can't have an empty hierarchy */
- if (!opts->subsys_bits)
+ if (!opts->subsys_bits && !opts->name)
return -EINVAL;
return 0;
@@ -931,6 +947,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
goto out_unlock;
}
+ /* Don't allow name to change at remount */
+ if (opts.name && strcmp(opts.name, root->name)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
ret = rebind_subsystems(root, opts.subsys_bits);
if (ret)
goto out_unlock;
@@ -942,6 +964,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
strcpy(root->release_agent_path, opts.release_agent);
out_unlock:
kfree(opts.release_agent);
+ kfree(opts.name);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
unlock_kernel();
@@ -963,6 +986,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->release_list);
init_rwsem(&cgrp->pids_mutex);
}
+
static void init_cgroup_root(struct cgroupfs_root *root)
{
struct cgroup *cgrp = &root->top_cgroup;
@@ -976,28 +1000,56 @@ static void init_cgroup_root(struct cgroupfs_root *root)
static int cgroup_test_super(struct super_block *sb, void *data)
{
- struct cgroupfs_root *new = data;
+ struct cgroup_sb_opts *new = data;
struct cgroupfs_root *root = sb->s_fs_info;
- /* First check subsystems */
- if (new->subsys_bits != root->subsys_bits)
- return 0;
+ /* If we asked for a name then it must match */
+ if (new->name && strcmp(new->name, root->name))
+ return 0;
- /* Next check flags */
- if (new->flags != root->flags)
+ /* If we asked for subsystems then they must match */
+ if (new->subsys_bits && new->subsys_bits != root->subsys_bits)
return 0;
return 1;
}
+static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
+{
+ struct cgroupfs_root *root;
+
+ if (!opts->subsys_bits)
+ return ERR_PTR(-EINVAL);
+
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ init_cgroup_root(root);
+ root->subsys_bits = opts->subsys_bits;
+ root->flags = opts->flags;
+ if (opts->release_agent)
+ strcpy(root->release_agent_path, opts->release_agent);
+ if (opts->name)
+ strcpy(root->name, opts->name);
+ opts->created_root = true;
+ return root;
+}
+
static int cgroup_set_super(struct super_block *sb, void *data)
{
int ret;
- struct cgroupfs_root *root = data;
+ struct cgroup_sb_opts *opts = data;
+ struct cgroupfs_root *root;
+ root = cgroup_root_from_opts(opts);
+ if (IS_ERR(root))
+ return PTR_ERR(root);
ret = set_anon_super(sb, NULL);
- if (ret)
+ if (ret) {
+ kfree(root);
return ret;
+ }
sb->s_fs_info = root;
root->sb = sb;
@@ -1039,44 +1091,23 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
struct cgroup_sb_opts opts;
int ret = 0;
struct super_block *sb;
- struct cgroupfs_root *root;
- struct list_head tmp_cg_links;
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
- if (ret) {
- kfree(opts.release_agent);
- return ret;
- }
-
- root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root) {
- kfree(opts.release_agent);
- return -ENOMEM;
- }
-
- init_cgroup_root(root);
- root->subsys_bits = opts.subsys_bits;
- root->flags = opts.flags;
- if (opts.release_agent) {
- strcpy(root->release_agent_path, opts.release_agent);
- kfree(opts.release_agent);
- }
+ if (ret)
+ goto out_err;
- sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
+ sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
if (IS_ERR(sb)) {
- kfree(root);
- return PTR_ERR(sb);
+ ret = PTR_ERR(sb);
+ goto out_err;
}
- if (sb->s_fs_info != root) {
- /* Reusing an existing superblock */
- BUG_ON(sb->s_root == NULL);
- kfree(root);
- root = NULL;
- } else {
+ if (opts.created_root) {
/* New superblock */
+ struct cgroupfs_root *root = sb->s_fs_info;
+ struct list_head tmp_cg_links;
struct cgroup *root_cgrp = &root->top_cgroup;
struct inode *inode;
int i;
@@ -1109,7 +1140,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
if (ret == -EBUSY) {
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
- goto free_cg_links;
+ free_cg_links(&tmp_cg_links);
+ goto drop_new_super;
}
/* EBUSY should be the only error here */
@@ -1146,12 +1178,17 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
}
simple_set_mnt(mnt, sb);
+ kfree(opts.release_agent);
+ kfree(opts.name);
return 0;
- free_cg_links:
- free_cg_links(&tmp_cg_links);
drop_new_super:
deactivate_locked_super(sb);
+
+ out_err:
+ kfree(opts.release_agent);
+ kfree(opts.name);
+
return ret;
}
@@ -2923,6 +2960,9 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
seq_printf(m, "%lu:", root->subsys_bits);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ if (strlen(root->name))
+ seq_printf(m, "%sname=%s",
+ count ? "," : "", root->name);
seq_putc(m, ':');
get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
cgrp = task_cgroup(tsk, subsys_id);
[RFC] Allow cgroup hierarchies to be created with no bound subsystems
This patch removes the restriction that a cgroup hierarchy must have
at least one bound subsystem. The mount option "none" is treated as an
explicit request for no bound subsystems.
A hierarchy with no subsystems can be useful for plain task tracking,
and is also a step towards the support for multiply-bindable
subsystems in a later patch in this series.
As part of this change, the hierarchy id is no longer calculated from
the bitmask of subsystems in the hierarchy (since this is not
guaranteed to be unique) but is allocated via an ida. Reference counts
on cgroups from css_set objects are now taken explicitly one per
hierarchy, rather than one per subsystem.
Example usage:
mount -t cgroup -o none,name=foo cgroup /mnt/cgroup
Based on the "no-op"/"none" subsystem concept proposed by
[email protected]
Signed-off-by: Paul Menage <[email protected]>
---
kernel/cgroup.c | 176 ++++++++++++++++++++++++++++++++++++++-----------------
1 files changed, 123 insertions(+), 53 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cd543ec..dede632 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
#include <linux/hash.h>
#include <linux/namei.h>
#include <linux/smp_lock.h>
+#include <linux/idr.h>
#include <asm/atomic.h>
@@ -75,6 +76,9 @@ struct cgroupfs_root {
*/
unsigned long subsys_bits;
+ /* Unique id for this hierarchy. */
+ int hierarchy_id;
+
/* The bitmask of subsystems currently attached to this hierarchy */
unsigned long actual_subsys_bits;
@@ -145,6 +149,10 @@ struct css_id {
static LIST_HEAD(roots);
static int root_count;
+static DEFINE_IDA(hierarchy_ida);
+static int next_hierarchy_id;
+static DEFINE_SPINLOCK(hierarchy_id_lock);
+
/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
@@ -262,42 +270,10 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
* compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;
-/* When we create or destroy a css_set, the operation simply
- * takes/releases a reference count on all the cgroups referenced
- * by subsystems in this css_set. This can end up multiple-counting
- * some cgroups, but that's OK - the ref-count is just a
- * busy/not-busy indicator; ensuring that we only count each cgroup
- * once would require taking a global lock to ensure that no
- * subsystems moved between hierarchies while we were doing so.
- *
- * Possible TODO: decide at boot time based on the number of
- * registered subsystems and the number of CPUs or NUMA nodes whether
- * it's better for performance to ref-count every subsystem, or to
- * take a global lock and only add one ref count to each hierarchy.
- */
-
-/*
- * unlink a css_set from the list and free it
- */
-static void unlink_css_set(struct css_set *cg)
+static void __put_css_set(struct css_set *cg, int taskexit)
{
struct cg_cgroup_link *link;
struct cg_cgroup_link *saved_link;
-
- hlist_del(&cg->hlist);
- css_set_count--;
-
- list_for_each_entry_safe(link, saved_link, &cg->cg_links,
- cg_link_list) {
- list_del(&link->cg_link_list);
- list_del(&link->cgrp_link_list);
- kfree(link);
- }
-}
-
-static void __put_css_set(struct css_set *cg, int taskexit)
-{
- int i;
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
@@ -310,20 +286,27 @@ static void __put_css_set(struct css_set *cg, int taskexit)
write_unlock(&css_set_lock);
return;
}
- unlink_css_set(cg);
- write_unlock(&css_set_lock);
- rcu_read_lock();
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
+ /* This css_set is dead. unlink it and release cgroup refcounts */
+ hlist_del(&cg->hlist);
+ css_set_count--;
+
+ list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+ cg_link_list) {
+ struct cgroup *cgrp = link->cgrp;
+ list_del(&link->cg_link_list);
+ list_del(&link->cgrp_link_list);
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
+
+ kfree(link);
}
- rcu_read_unlock();
+
+ write_unlock(&css_set_lock);
kfree(cg);
}
@@ -517,6 +500,7 @@ static void link_css_set(struct list_head *tmp_cg_links,
cgrp_link_list);
link->cg = cg;
link->cgrp = cgrp;
+ atomic_inc(&cgrp->count);
list_move(&link->cgrp_link_list, &cgrp->css_sets);
/*
* Always add links to the tail of the list so that the list
@@ -537,7 +521,6 @@ static struct css_set *find_css_set(
{
struct css_set *res;
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
- int i;
struct list_head tmp_cg_links;
@@ -576,10 +559,6 @@ static struct css_set *find_css_set(
write_lock(&css_set_lock);
/* Add reference counts and links from the new css_set. */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup *cgrp = res->subsys[i]->cgroup;
- atomic_inc(&cgrp->count);
- }
list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
struct cgroup *c = link->cgrp;
if (c->root == cgrp->root)
@@ -958,6 +937,10 @@ struct cgroup_sb_opts {
unsigned long flags;
char *release_agent;
char *name;
+
+ /* User explicitly requested empty subsystem */
+ bool none;
+
/* A flag indicating that a root was created from this options block */
bool created_root;
};
@@ -988,6 +971,9 @@ static int parse_cgroupfs_options(char *data,
if (!ss->disabled)
opts->subsys_bits |= 1ul << i;
}
+ } else if (!strcmp(token, "none")) {
+ /* Explicitly have no subsystems */
+ opts->none = true;
} else if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
} else if (!strncmp(token, "release_agent=", 14)) {
@@ -1023,6 +1009,8 @@ static int parse_cgroupfs_options(char *data,
}
}
+ /* Consistency checks */
+
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
@@ -1032,6 +1020,16 @@ static int parse_cgroupfs_options(char *data,
(opts->subsys_bits & mask))
return -EINVAL;
+
+ /* Can't specify "none" and some subsystems */
+ if (opts->subsys_bits && opts->none)
+ return -EINVAL;
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+
/* We can't have an empty hierarchy */
if (!opts->subsys_bits && !opts->name)
return -EINVAL;
@@ -1112,6 +1110,31 @@ static void init_cgroup_root(struct cgroupfs_root *root)
init_cgroup_housekeeping(cgrp);
}
+static bool init_root_id(struct cgroupfs_root *root)
+{
+ int ret = 0;
+
+ do {
+ if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
+ return false;
+ spin_lock(&hierarchy_id_lock);
+ /* Try to allocate the next unused ID */
+ ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
+ &root->hierarchy_id);
+ if (ret == -ENOSPC)
+ /* Try again starting from 0 */
+ ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
+ if (!ret) {
+ next_hierarchy_id = root->hierarchy_id + 1;
+ } else if (ret != -EAGAIN) {
+ /* Can only get here if the 31-bit IDR is full ... */
+ BUG_ON(ret);
+ }
+ spin_unlock(&hierarchy_id_lock);
+ } while (ret);
+ return true;
+}
+
static int cgroup_test_super(struct super_block *sb, void *data)
{
struct cgroup_sb_opts *new = data;
@@ -1132,14 +1155,19 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
struct cgroupfs_root *root;
- if (!opts->subsys_bits)
+ if (!opts->subsys_bits && !opts->none)
return ERR_PTR(-EINVAL);
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
+ if (!init_root_id(root)) {
+ kfree(root);
+ return ERR_PTR(-ENOMEM);
+ }
init_cgroup_root(root);
+
root->subsys_bits = opts->subsys_bits;
root->flags = opts->flags;
if (opts->release_agent)
@@ -1150,6 +1178,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
return root;
}
+static void cgroup_drop_root(struct cgroupfs_root *root)
+{
+ BUG_ON(!root->hierarchy_id);
+ spin_lock(&hierarchy_id_lock);
+ ida_remove(&hierarchy_ida, root->hierarchy_id);
+ spin_unlock(&hierarchy_id_lock);
+ kfree(root);
+}
+
static int cgroup_set_super(struct super_block *sb, void *data)
{
int ret;
@@ -1161,7 +1198,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)
return PTR_ERR(root);
ret = set_anon_super(sb, NULL);
if (ret) {
- kfree(root);
+ cgroup_drop_root(root);
return ret;
}
@@ -1348,7 +1385,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
mutex_unlock(&cgroup_mutex);
kill_litter_super(sb);
- kfree(root);
+ cgroup_drop_root(root);
}
static struct file_system_type cgroup_fs_type = {
@@ -2992,7 +3029,7 @@ int __init cgroup_init(void)
/* Add init_css_set to the hash table */
hhead = css_set_hash(init_css_set.subsys);
hlist_add_head(&init_css_set.hlist, hhead);
-
+ BUG_ON(!init_root_id(&rootnode));
err = register_filesystem(&cgroup_fs_type);
if (err < 0)
goto out;
@@ -3047,7 +3084,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
struct cgroup *cgrp;
int count = 0;
- seq_printf(m, "%lu:", root->subsys_bits);
+ seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
@@ -3093,8 +3130,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
- seq_printf(m, "%s\t%lu\t%d\t%d\n",
- ss->name, ss->root->subsys_bits,
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->name, ss->root->hierarchy_id,
ss->root->number_of_cgroups, !ss->disabled);
}
mutex_unlock(&cgroup_mutex);
@@ -3812,8 +3849,8 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
name = c->dentry->d_name.name;
else
name = "?";
- seq_printf(seq, "Root %lu group %s\n",
- c->root->subsys_bits, name);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name);
rcu_read_unlock();
}
task_unlock(current);
@@ -3821,6 +3858,34 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
return 0;
}
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct cgroup *cont,
+ struct cftype *cft,
+ struct seq_file *seq)
+{
+ struct cg_cgroup_link *link, *saved_link;
+ write_lock_irq(&css_set_lock);
+ list_for_each_entry_safe(link, saved_link, &cont->css_sets,
+ cgrp_link_list) {
+ struct css_set *cg = link->cg;
+ struct task_struct *task, *saved_task;
+ int count = 0;
+ seq_printf(seq, "css_set %p\n", cg);
+ list_for_each_entry_safe(task, saved_task, &cg->tasks,
+ cg_list) {
+ if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
+ seq_puts(seq, " ...\n");
+ break;
+ } else {
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+ }
+ }
+ write_unlock_irq(&css_set_lock);
+ return 0;
+}
+
static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
{
return test_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3852,6 +3917,11 @@ static struct cftype debug_files[] = {
},
{
+ .name = "cgroup_css_links",
+ .read_seq_string = cgroup_css_links_read,
+ },
+
+ {
.name = "releasable",
.read_u64 = releasable_read,
},
[RFC] Remove the cgroup_subsys.bind callback
In preparation for supporting cgroup subsystems that can be bound to
multiple hierarchies, remove the "bind" callback. It's not currently
used by any in-tree subsystem and its semantics become fuzzy for
multi-bindable subsystems. It can be re-added with clearer semantics
in the future if necessary.
Signed-off-by: Paul Menage <[email protected]>
---
Documentation/cgroups/cgroups.txt | 8 --------
include/linux/cgroup.h | 1 -
kernel/cgroup.c | 4 ----
3 files changed, 0 insertions(+), 13 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 6eb1a97..d5be8ae 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -544,14 +544,6 @@ initialization which might be required before a task could attach. For
example in cpusets, no task may attach before 'cpus' and 'mems' are set
up.
-void bind(struct cgroup_subsys *ss, struct cgroup *root)
-(cgroup_mutex and ss->hierarchy_mutex held by caller)
-
-Called when a cgroup subsystem is rebound to a different hierarchy
-and root cgroup. Currently this will only involve movement between
-the default hierarchy (which never has sub-cgroups) and a hierarchy
-that is being created/destroyed (and hence has no sub-cgroups).
-
4. Questions
============
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index adf6739..a6bb0ca 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -386,7 +386,6 @@ struct cgroup_subsys {
int (*populate)(struct cgroup_subsys *ss,
struct cgroup *cgrp);
void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
- void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
int subsys_id;
int active;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8b1b92f..74840ea 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -897,8 +897,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
mutex_lock(&ss->hierarchy_mutex);
cgrp->subsys[i] = dummytop->subsys[i];
cgrp->subsys[i]->cgroup = cgrp;
- if (ss->bind)
- ss->bind(ss, cgrp);
rootnode.subsys_bits &= ~bit;
root->subsys_bits |= bit;
mutex_unlock(&ss->hierarchy_mutex);
@@ -908,8 +906,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
BUG_ON(rootnode.subsys_bits & bit);
mutex_lock(&ss->hierarchy_mutex);
- if (ss->bind)
- ss->bind(ss, dummytop);
dummytop->subsys[i]->cgroup = dummytop;
cgrp->subsys[i] = NULL;
root->subsys_bits &= ~bit;
[RFC] Support multiply-bindable cgroup subsystems
This patch allows a cgroup subsystem to be marked as bindable on
multiple cgroup hierarchies independently, when declared in
cgroup_subsys.h via MULTI_SUBSYS() rather than SUBSYS().
The state for such subsystems cannot be accessed directly from a
task->cgroups (since there's no unique mapping for a task) but instead
must be accessed via a particular control group object.
Multiply-bound subsystems are useful in cases where there's no direct
correspondence between the cgroup configuration and some property of
the kernel outside of the cgroups subsystem. So this would not be
applicable to e.g. the CFS cgroup, since there has to a unique mapping
from a task to its CFS run queue.
As an example, the "debug" subsystem is marked multiply-bindable,
since it has no state outside the cgroups framework itself.
Example usage:
mount -t cgroup -o name=foo,debug,cpu cgroup /mnt1
mount -t cgroup -o name=bar,debug,memory cgroup /mnt2
Open Issues:
- in the current version of this patch, mounting a cgroups hierarchy
with no options does *not* get you any of the multi-bindable
subsystems; possibly for consistency it should give you all of the
multi-bindable subsystems as well as all of the single-bindable
subsystems.
- how can we avoid the checkpatch.pl errors due to creative use of
macros to generate enum names?
Signed-off-by: Paul Menage <[email protected]>
---
include/linux/cgroup.h | 38 +++++++-
include/linux/cgroup_subsys.h | 2
kernel/cgroup.c | 192 ++++++++++++++++++++++++++++-------------
3 files changed, 168 insertions(+), 64 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a6bb0ca..02fdea9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -39,10 +39,38 @@ extern int cgroupstats_build(struct cgroupstats *stats,
extern struct file_operations proc_cgroup_operations;
-/* Define the enumeration of all cgroup subsystems */
-#define SUBSYS(_x) _x ## _subsys_id,
+/*
+ * Define the enumeration of all cgroup subsystems. There are two
+ * kinds of subsystems:
+ *
+ * - regular (singleton) subsystems can be only mounted once; these
+ * generally correspond to some system that has substantial state
+ * outside of the cgroups framework, or where the state is being used
+ * to control some external behaviour (e.g. CPU scheduler). Every
+ * task has an associated state pointer (accessed efficiently through
+ * task->cgroups) for each singleton subsystem.
+ *
+ * - multiply-bound subsystems may be mounted on zero or more
+ * hierarchies. Since there's no unique mapping from a task to a
+ * subsystem state pointer for multiply-bound subsystems, the state of
+ * these subsystems cannot be accessed rapidly from a task
+ * pointer. These correspond to subsystems where most or all of the
+ * state is maintained within the subsystem itself, and/or the
+ * subsystem is used for monitoring rather than control.
+ */
enum cgroup_subsys_id {
+#define SUBSYS(_x) _x ## _subsys_id,
+#define MULTI_SUBSYS(_x)
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+#undef MULTI_SUBSYS
+ CGROUP_SINGLETON_SUBSYS_COUNT,
+ CGROUP_DUMMY_ENUM_RESET = CGROUP_SINGLETON_SUBSYS_COUNT - 1,
+#define SUBSYS(_x)
+#define MULTI_SUBSYS(_x) _x ## _subsys_id,
#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+#undef MULTI_SUBSYS
CGROUP_SUBSYS_COUNT
};
#undef SUBSYS
@@ -231,7 +259,7 @@ struct css_set {
* time). Multi-subsystems don't have an entry in here since
* there's no unique state for a given task.
*/
- struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+ struct cgroup_subsys_state *subsys[CGROUP_SINGLETON_SUBSYS_COUNT];
};
/*
@@ -418,8 +446,10 @@ struct cgroup_subsys {
};
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
+#define MULTI_SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS
+#undef MULTI_SUBSYS
static inline struct cgroup_subsys_state *cgroup_subsys_state(
struct cgroup *cgrp, int subsys_id)
@@ -430,6 +460,8 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
static inline struct cgroup_subsys_state *task_subsys_state(
struct task_struct *task, int subsys_id)
{
+ /* This check is optimized out for most callers */
+ BUG_ON(subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
return rcu_dereference(task->cgroups->subsys[subsys_id]);
}
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c8d31b..f78605e 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -14,7 +14,7 @@ SUBSYS(cpuset)
/* */
#ifdef CONFIG_CGROUP_DEBUG
-SUBSYS(debug)
+MULTI_SUBSYS(debug)
#endif
/* */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 74840ea..a527687 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -54,10 +54,18 @@
static DEFINE_MUTEX(cgroup_mutex);
/* Generate an array of cgroup subsystem pointers */
-#define SUBSYS(_x) &_x ## _subsys,
static struct cgroup_subsys *subsys[] = {
+#define SUBSYS(_x) (&_x ## _subsys),
+#define MULTI_SUBSYS(_x)
#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+#undef MULTI_SUBSYS
+#define SUBSYS(_x)
+#define MULTI_SUBSYS(_x) (&_x ## _subsys),
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+#undef MULTI_SUBSYS
};
#define MAX_CGROUP_ROOT_NAMELEN 64
@@ -269,7 +277,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
int index;
unsigned long tmp = 0UL;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+ for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++)
tmp += (unsigned long)css[i];
tmp = (tmp >> 16) ^ tmp;
@@ -440,7 +448,7 @@ static struct css_set *find_existing_css_set(
/* Build the set of subsystem state objects that we want to
* see in the new css_set */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
if (root->subsys_bits & (1UL << i)) {
/* Subsystem is in this hierarchy. So we want
* the subsystem state from the new
@@ -534,7 +542,7 @@ static struct css_set *find_css_set(
struct css_set *oldcg, struct cgroup *cgrp)
{
struct css_set *res;
- struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ struct cgroup_subsys_state *template[CGROUP_SINGLETON_SUBSYS_COUNT];
struct list_head tmp_cg_links;
@@ -857,17 +865,33 @@ static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
wake_up_all(&cgroup_rmdir_waitq);
}
+static void init_cgroup_css(struct cgroup_subsys_state *css,
+ struct cgroup_subsys *ss,
+ struct cgroup *cgrp)
+{
+ css->cgroup = cgrp;
+ atomic_set(&css->refcnt, 1);
+ css->flags = 0;
+ css->id = NULL;
+ if (cgrp == dummytop)
+ set_bit(CSS_ROOT, &css->flags);
+ BUG_ON(cgrp->subsys[ss->subsys_id]);
+ cgrp->subsys[ss->subsys_id] = css;
+}
+
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
- unsigned long added_bits, removed_bits;
+ unsigned long added_bits, removed_bits, rollback_bits;
+ int ret = 0;
struct cgroup *cgrp = &root->top_cgroup;
int i;
removed_bits = root->subsys_bits & ~final_bits;
added_bits = final_bits & ~root->subsys_bits;
+ rollback_bits = 0;
/* Check that any added subsystems are currently free */
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
unsigned long bit = 1UL << i;
if (!(bit & added_bits))
continue;
@@ -884,33 +908,45 @@ static int rebind_subsystems(struct cgroupfs_root *root,
if (root->number_of_cgroups > 1)
return -EBUSY;
- /* Process each subsystem */
+ /*
+ * Process each subsystem. First try to add all new
+ * subsystems, since this may require rollback
+ */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
+ bool singleton = i < CGROUP_SINGLETON_SUBSYS_COUNT;
unsigned long bit = 1UL << i;
if (bit & added_bits) {
+ struct cgroup_subsys_state *css;
/* We're binding this subsystem to this hierarchy */
BUG_ON(cgrp->subsys[i]);
- BUG_ON(!dummytop->subsys[i]);
- BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
- BUG_ON(!(rootnode.subsys_bits & bit));
+ if (singleton) {
+ css = dummytop->subsys[i];
+ BUG_ON(!css);
+
+ BUG_ON(css->cgroup != dummytop);
+ BUG_ON(!(rootnode.subsys_bits & bit));
+ } else {
+ BUG_ON(dummytop->subsys[i]);
+ BUG_ON(rootnode.subsys_bits & bit);
+ css = ss->create(ss, cgrp);
+ if (IS_ERR(css)) {
+ ret = PTR_ERR(css);
+ break;
+ }
+ init_cgroup_css(css, ss, cgrp);
+ }
mutex_lock(&ss->hierarchy_mutex);
- cgrp->subsys[i] = dummytop->subsys[i];
- cgrp->subsys[i]->cgroup = cgrp;
- rootnode.subsys_bits &= ~bit;
+ cgrp->subsys[i] = css;
+ css->cgroup = cgrp;
+ if (singleton)
+ rootnode.subsys_bits &= ~bit;
root->subsys_bits |= bit;
mutex_unlock(&ss->hierarchy_mutex);
+ rollback_bits |= bit;
} else if (bit & removed_bits) {
- /* We're removing this subsystem */
- BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
- BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
- BUG_ON(rootnode.subsys_bits & bit);
- mutex_lock(&ss->hierarchy_mutex);
- dummytop->subsys[i]->cgroup = dummytop;
- cgrp->subsys[i] = NULL;
- root->subsys_bits &= ~bit;
- rootnode.subsys_bits |= bit;
- mutex_unlock(&ss->hierarchy_mutex);
+ /* Deal with this after adds are successful */
+ BUG_ON(!cgrp->subsys[i]);
} else if (bit & final_bits) {
/* Subsystem state should already exist */
BUG_ON(!cgrp->subsys[i]);
@@ -919,9 +955,47 @@ static int rebind_subsystems(struct cgroupfs_root *root,
BUG_ON(cgrp->subsys[i]);
}
}
+ if (ret) {
+ /* We failed to add some subsystem - roll back */
+ removed_bits = rollback_bits;
+ }
+
+ /*
+ * Now either remove the subsystems that were requested to be
+ * removed, or roll back the subsystems that were added before
+ * the error occurred
+ */
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ bool singleton = i < CGROUP_SINGLETON_SUBSYS_COUNT;
+ unsigned long bit = 1UL << i;
+ if (bit & removed_bits) {
+ struct cgroup_subsys_state *css;
+ /* We're removing this subsystem */
+ css = cgrp->subsys[i];
+ BUG_ON(css->cgroup != cgrp);
+ if (singleton) {
+ BUG_ON(css != dummytop->subsys[i]);
+ BUG_ON(rootnode.subsys_bits & bit);
+ }
+ mutex_lock(&ss->hierarchy_mutex);
+ if (singleton) {
+ css->cgroup = dummytop;
+ } else {
+ /* Is this safe? */
+ ss->destroy(ss, cgrp);
+ }
+ cgrp->subsys[i] = NULL;
+ root->subsys_bits &= ~bit;
+ if (singleton)
+ rootnode.subsys_bits |= bit;
+ mutex_unlock(&ss->hierarchy_mutex);
+ }
+ }
+
synchronize_rcu();
- return 0;
+ return ret;
}
static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
@@ -976,7 +1050,7 @@ static int parse_cgroupfs_options(char *data,
/* Add all non-disabled subsystems */
int i;
opts->subsys_bits = 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (!ss->disabled)
opts->subsys_bits |= 1ul << i;
@@ -1296,16 +1370,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
}
ret = rebind_subsystems(root, opts.subsys_bits);
- if (ret == -EBUSY) {
+ if (ret) {
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
free_cg_links(&tmp_cg_links);
goto drop_new_super;
}
- /* EBUSY should be the only error here */
- BUG_ON(ret);
-
list_add(&root->root_list, &roots);
root_count++;
@@ -2623,20 +2694,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
return 0;
}
-static void init_cgroup_css(struct cgroup_subsys_state *css,
- struct cgroup_subsys *ss,
- struct cgroup *cgrp)
-{
- css->cgroup = cgrp;
- atomic_set(&css->refcnt, 1);
- css->flags = 0;
- css->id = NULL;
- if (cgrp == dummytop)
- set_bit(CSS_ROOT, &css->flags);
- BUG_ON(cgrp->subsys[ss->subsys_id]);
- cgrp->subsys[ss->subsys_id] = css;
-}
-
static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
{
/* We need to take each hierarchy_mutex in a consistent order */
@@ -2922,21 +2979,23 @@ again:
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
-
+ bool singleton = ss->subsys_id < CGROUP_SINGLETON_SUBSYS_COUNT;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
- /* Create the top cgroup state for this subsystem */
- css = ss->create(ss, dummytop);
- /* We don't handle early failures gracefully */
- BUG_ON(IS_ERR(css));
- init_cgroup_css(css, ss, dummytop);
-
- /* Update the init_css_set to contain a subsys
- * pointer to this state - since the subsystem is
- * newly registered, all tasks and hence the
- * init_css_set is in the subsystem's top cgroup. */
- init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
+ if (singleton) {
+ /* Create the top cgroup state for this subsystem */
+ css = ss->create(ss, dummytop);
+ /* We don't handle early failures gracefully */
+ BUG_ON(IS_ERR(css));
+ init_cgroup_css(css, ss, dummytop);
+ /* Update the init_css_set to contain a subsys
+ * pointer to this state - since the subsystem is
+ * newly registered, all tasks and hence the
+ * init_css_set is in the subsystem's top cgroup. */
+ init_css_set.subsys[ss->subsys_id] = css;
+ rootnode.subsys_bits |= 1ULL << ss->subsys_id;
+ }
need_forkexit_callback |= ss->fork || ss->exit;
/* At system boot, before all subsystems have been
@@ -2948,7 +3007,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
ss->active = 1;
- rootnode.subsys_bits |= 1ULL << ss->subsys_id;
}
/**
@@ -3125,23 +3183,35 @@ struct file_operations proc_cgroup_operations = {
static void proc_show_subsys(struct seq_file *m, struct cgroupfs_root *root,
struct cgroup_subsys *ss)
{
- seq_printf(m, "%s\t%d\t%d\t%d\n",
+ seq_printf(m, "%s\t%d\t%d\t%d\t%d\n",
ss->name, root->hierarchy_id,
- root->number_of_cgroups, !ss->disabled);
+ root->number_of_cgroups, !ss->disabled,
+ ss->subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
}
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
int i;
- seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+ seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\tmulti\n");
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ bool seen = false;
struct cgroup_subsys *ss = subsys[i];
unsigned long bit = 1ULL << ss->subsys_id;
struct cgroupfs_root *root;
for_each_root(root) {
- if (root->subsys_bits & bit)
+ if (root->subsys_bits & bit) {
proc_show_subsys(m, root, ss);
+ seen = true;
+ }
+ }
+ if (!seen) {
+ BUG_ON(i < CGROUP_SINGLETON_SUBSYS_COUNT);
+ /*
+ * multi-bindable subsystems show up in the
+ * rootnode if they're not bound elsewhere.
+ */
+ proc_show_subsys(m, &rootnode, ss);
}
}
mutex_unlock(&cgroup_mutex);
@@ -3317,6 +3387,8 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
/* We shouldn't be called by an unregistered subsystem */
BUG_ON(!subsys->active);
+ /* We can only support singleton subsystems */
+ BUG_ON(subsys->subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
/* First figure out what hierarchy and cgroup we're dealing
* with, and pin them so we can drop cgroup_mutex */
[RFC] Remove cgroup_subsys.root pointer
In preparation for supporting cgroup subsystems that can be bound to
multiple hierarchies, remove the "root" pointer and associated list
structures. Subsystem hierarchy membership is now determined entirely
through the subsystem bitmasks in struct cgroupfs_root.
Minor changes include:
- root_list now includes the inactive root
- for_each_active_root() -> for_each_root()
- for_each_subsys() is now guaranteed to be in subsys_id order
Signed-off-by: Paul Menage <[email protected]>
---
include/linux/cgroup.h | 16 ++----
kernel/cgroup.c | 128 +++++++++++++++++++++++++++---------------------
2 files changed, 77 insertions(+), 67 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 665fa70..adf6739 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -227,7 +227,9 @@ struct css_set {
/*
* Set of subsystem states, one for each subsystem. This array
* is immutable after creation apart from the init_css_set
- * during subsystem registration (at boot time).
+ * during subsystem registration (at boot
+ * time). Multi-subsystems don't have an entry in here since
+ * there's no unique state for a given task.
*/
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
};
@@ -401,9 +403,9 @@ struct cgroup_subsys {
/*
* Protects sibling/children links of cgroups in this
* hierarchy, plus protects which hierarchy (or none) the
- * subsystem is a part of (i.e. root/sibling). To avoid
- * potential deadlocks, the following operations should not be
- * undertaken while holding any hierarchy_mutex:
+ * subsystem is a part of. To avoid potential deadlocks, the
+ * following operations should not be undertaken while holding
+ * any hierarchy_mutex:
*
* - allocating memory
* - initiating hotplug events
@@ -411,12 +413,6 @@ struct cgroup_subsys {
struct mutex hierarchy_mutex;
struct lock_class_key subsys_key;
- /*
- * Link to parent, and list entry in parent's children.
- * Protected by this->hierarchy_mutex and cgroup_lock()
- */
- struct cgroupfs_root *root;
- struct list_head sibling;
/* used when use_id == true */
struct idr idr;
spinlock_t id_lock;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dede632..8b1b92f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -70,28 +70,19 @@ static struct cgroup_subsys *subsys[] = {
struct cgroupfs_root {
struct super_block *sb;
- /*
- * The bitmask of subsystems intended to be attached to this
- * hierarchy
- */
+ /* The bitmask of subsystems attached to this hierarchy */
unsigned long subsys_bits;
/* Unique id for this hierarchy. */
int hierarchy_id;
- /* The bitmask of subsystems currently attached to this hierarchy */
- unsigned long actual_subsys_bits;
-
- /* A list running through the attached subsystems */
- struct list_head subsys_list;
-
/* The root cgroup for this hierarchy */
struct cgroup top_cgroup;
/* Tracks how many cgroups are currently defined in hierarchy.*/
int number_of_cgroups;
- /* A list running through the active hierarchies */
+ /* A list running through all hierarchies */
struct list_head root_list;
/* Hierarchy-specific flags */
@@ -191,13 +182,36 @@ static int notify_on_release(const struct cgroup *cgrp)
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
*/
+static inline struct cgroup_subsys *nth_ss(int n)
+{
+ return (n >= CGROUP_SUBSYS_COUNT) ? NULL : subsys[n];
+}
#define for_each_subsys(_root, _ss) \
-list_for_each_entry(_ss, &_root->subsys_list, sibling)
+for (_ss = nth_ss(find_first_bit(&(_root)->subsys_bits, CGROUP_SUBSYS_COUNT));\
+ _ss != NULL; \
+ _ss = nth_ss(find_next_bit(&(_root)->subsys_bits, CGROUP_SUBSYS_COUNT, \
+ _ss->subsys_id + 1)))
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
+
+/* for_each_root() allows you to iterate across all hierarchies */
+#define for_each_root(_root) \
list_for_each_entry(_root, &roots, root_list)
+/* Find the root for a given subsystem */
+static struct cgroupfs_root *find_root(struct cgroup_subsys *ss)
+{
+ int id = ss->subsys_id;
+ struct cgroupfs_root *root, *res = NULL;
+ for_each_root(root) {
+ if (root->subsys_bits && (1UL << id)) {
+ BUG_ON(res);
+ res = root;
+ }
+ }
+ BUG_ON(!res);
+ return res;
+}
+
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
@@ -424,7 +438,7 @@ static struct css_set *find_existing_css_set(
struct hlist_node *node;
struct css_set *cg;
- /* Built the set of subsystem state objects that we want to
+ /* Build the set of subsystem state objects that we want to
* see in the new css_set */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
if (root->subsys_bits & (1UL << i)) {
@@ -844,21 +858,20 @@ static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
}
static int rebind_subsystems(struct cgroupfs_root *root,
- unsigned long final_bits)
+ unsigned long final_bits)
{
unsigned long added_bits, removed_bits;
struct cgroup *cgrp = &root->top_cgroup;
int i;
- removed_bits = root->actual_subsys_bits & ~final_bits;
- added_bits = final_bits & ~root->actual_subsys_bits;
+ removed_bits = root->subsys_bits & ~final_bits;
+ added_bits = final_bits & ~root->subsys_bits;
/* Check that any added subsystems are currently free */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
unsigned long bit = 1UL << i;
- struct cgroup_subsys *ss = subsys[i];
if (!(bit & added_bits))
continue;
- if (ss->root != &rootnode) {
+ if (!(rootnode.subsys_bits & bit)) {
/* Subsystem isn't free */
return -EBUSY;
}
@@ -880,25 +893,27 @@ static int rebind_subsystems(struct cgroupfs_root *root,
BUG_ON(cgrp->subsys[i]);
BUG_ON(!dummytop->subsys[i]);
BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
+ BUG_ON(!(rootnode.subsys_bits & bit));
mutex_lock(&ss->hierarchy_mutex);
cgrp->subsys[i] = dummytop->subsys[i];
cgrp->subsys[i]->cgroup = cgrp;
- list_move(&ss->sibling, &root->subsys_list);
- ss->root = root;
if (ss->bind)
ss->bind(ss, cgrp);
+ rootnode.subsys_bits &= ~bit;
+ root->subsys_bits |= bit;
mutex_unlock(&ss->hierarchy_mutex);
} else if (bit & removed_bits) {
/* We're removing this subsystem */
BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+ BUG_ON(rootnode.subsys_bits & bit);
mutex_lock(&ss->hierarchy_mutex);
if (ss->bind)
ss->bind(ss, dummytop);
dummytop->subsys[i]->cgroup = dummytop;
cgrp->subsys[i] = NULL;
- subsys[i]->root = &rootnode;
- list_move(&ss->sibling, &rootnode.subsys_list);
+ root->subsys_bits &= ~bit;
+ rootnode.subsys_bits |= bit;
mutex_unlock(&ss->hierarchy_mutex);
} else if (bit & final_bits) {
/* Subsystem state should already exist */
@@ -908,7 +923,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
BUG_ON(cgrp->subsys[i]);
}
}
- root->subsys_bits = root->actual_subsys_bits = final_bits;
synchronize_rcu();
return 0;
@@ -1102,7 +1116,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
static void init_cgroup_root(struct cgroupfs_root *root)
{
struct cgroup *cgrp = &root->top_cgroup;
- INIT_LIST_HEAD(&root->subsys_list);
INIT_LIST_HEAD(&root->root_list);
root->number_of_cgroups = 1;
cgrp->root = root;
@@ -1168,7 +1181,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
}
init_cgroup_root(root);
- root->subsys_bits = opts->subsys_bits;
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
@@ -1287,7 +1299,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
goto drop_new_super;
}
- ret = rebind_subsystems(root, root->subsys_bits);
+ ret = rebind_subsystems(root, opts.subsys_bits);
if (ret == -EBUSY) {
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
@@ -2632,24 +2644,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
{
/* We need to take each hierarchy_mutex in a consistent order */
- int i;
-
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->root == root)
- mutex_lock(&ss->hierarchy_mutex);
- }
+ struct cgroup_subsys *ss;
+ for_each_subsys(root, ss)
+ mutex_lock(&ss->hierarchy_mutex);
}
static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
{
- int i;
-
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->root == root)
- mutex_unlock(&ss->hierarchy_mutex);
- }
+ struct cgroup_subsys *ss;
+ for_each_subsys(root, ss)
+ mutex_unlock(&ss->hierarchy_mutex);
}
/*
@@ -2766,13 +2770,9 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
* we can be called via check_for_release() with no
* synchronization other than RCU, and the subsystem linked
* list isn't RCU-safe */
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
+ struct cgroup_subsys *ss;
+ for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css;
- /* Skip subsystems not in this hierarchy */
- if (ss->root != cgrp->root)
- continue;
css = cgrp->subsys[ss->subsys_id];
/* When called from check_for_release() it's possible
* that by this point the cgroup has been removed
@@ -2930,8 +2930,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
/* Create the top cgroup state for this subsystem */
- list_add(&ss->sibling, &rootnode.subsys_list);
- ss->root = &rootnode;
css = ss->create(ss, dummytop);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
@@ -2953,6 +2951,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
mutex_init(&ss->hierarchy_mutex);
lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
ss->active = 1;
+
+ rootnode.subsys_bits |= 1ULL << ss->subsys_id;
}
/**
@@ -2970,6 +2970,7 @@ int __init cgroup_init_early(void)
INIT_HLIST_NODE(&init_css_set.hlist);
css_set_count = 1;
init_cgroup_root(&rootnode);
+ list_add(&rootnode.root_list, &roots);
root_count = 1;
init_task.cgroups = &init_css_set;
@@ -3079,11 +3080,14 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
mutex_lock(&cgroup_mutex);
- for_each_active_root(root) {
+ for_each_root(root) {
struct cgroup_subsys *ss;
struct cgroup *cgrp;
int count = 0;
+ if (root == &rootnode)
+ continue;
+
seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -3122,17 +3126,27 @@ struct file_operations proc_cgroup_operations = {
};
/* Display information about each subsystem and each hierarchy */
+static void proc_show_subsys(struct seq_file *m, struct cgroupfs_root *root,
+ struct cgroup_subsys *ss)
+{
+ seq_printf(m, "%s\t%d\t%d\t%d\n",
+ ss->name, root->hierarchy_id,
+ root->number_of_cgroups, !ss->disabled);
+}
+
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
int i;
-
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
- seq_printf(m, "%s\t%d\t%d\t%d\n",
- ss->name, ss->root->hierarchy_id,
- ss->root->number_of_cgroups, !ss->disabled);
+ unsigned long bit = 1ULL << ss->subsys_id;
+ struct cgroupfs_root *root;
+ for_each_root(root) {
+ if (root->subsys_bits & bit)
+ proc_show_subsys(m, root, ss);
+ }
}
mutex_unlock(&cgroup_mutex);
return 0;
@@ -3312,7 +3326,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
* with, and pin them so we can drop cgroup_mutex */
mutex_lock(&cgroup_mutex);
again:
- root = subsys->root;
+ root = find_root(subsys);
if (root == &rootnode) {
mutex_unlock(&cgroup_mutex);
return 0;
@@ -3364,7 +3378,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
* that we're still in the same state that we thought we
* were. */
mutex_lock(&cgroup_mutex);
- if ((root != subsys->root) ||
+ if ((root != find_root(subsys)) ||
(parent != task_cgroup(tsk, subsys->subsys_id))) {
/* Aargh, we raced ... */
mutex_unlock(&inode->i_mutex);
[RFC] Example multi-bindable subsystem: a per-cgroup notes field
As an example of a multiply-bindable subsystem, this patch introduces
the "info" subsystem, which provides a single file, "info.notes", in
which user-space middleware can store an arbitrary (by default up to
one page) binary string representing configuration data about that
cgroup. This reduces the need to keep additional state outside the
cgroup filesystem. The maximum notes size for a hierarchy can be set
by updating the "info.size" file in the root cgroup.
Signed-off-by: Paul Menage <[email protected]>
---
include/linux/cgroup_subsys.h | 6 ++
init/Kconfig | 9 +++
kernel/Makefile | 1
kernel/info_cgroup.c | 133 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 149 insertions(+), 0 deletions(-)
create mode 100644 kernel/info_cgroup.c
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index f78605e..5dfea38 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -60,3 +60,9 @@ SUBSYS(net_cls)
#endif
/* */
+
+#ifdef CONFIG_CGROUP_INFO
+MULTI_SUBSYS(info)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index d904d6c..3bd4685 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -604,6 +604,15 @@ config CGROUP_MEM_RES_CTLR_SWAP
Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
size is 4096bytes, 512k per 1Gbytes of swap.
+config CGROUP_INFO
+ bool "Simple application-specific info cgroup subsystem"
+ depends on CGROUPS
+ help
+ Provides a simple cgroups subsystem with an "info.notes"
+ field, which can be used by middleware to store
+ application-specific configuration data about a cgroup. Can
+ be mounted on multiple hierarchies at once.
+
endif # CGROUPS
config MM_OWNER
diff --git a/kernel/Makefile b/kernel/Makefile
index 7ffdc16..e713a67 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_CGROUP_INFO) += info_cgroup.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/info_cgroup.c b/kernel/info_cgroup.c
new file mode 100644
index 0000000..34cfdb8
--- /dev/null
+++ b/kernel/info_cgroup.c
@@ -0,0 +1,133 @@
+/*
+ * info_cgroup.c - simple cgroup providing a "notes" field
+ */
+
+#include "linux/cgroup.h"
+#include "linux/err.h"
+#include "linux/seq_file.h"
+
+struct info_cgroup {
+ struct cgroup_subsys_state css;
+ /* notes string for this cgroup */
+ const char *notes;
+ size_t len;
+ /*
+ * size limit for notes in this hierarchy. Only relevant for
+ * the root cgroup. Not synchronized since it's a single word
+ * value and writes to it never depend on previously read
+ * values.
+ */
+ size_t max_len;
+ spinlock_t lock;
+};
+
+static inline struct info_cgroup *cg_info(struct cgroup *cg)
+{
+ return container_of(cgroup_subsys_state(cg, info_subsys_id),
+ struct info_cgroup, css);
+}
+
+static struct cgroup_subsys_state *info_create(struct cgroup_subsys *ss,
+ struct cgroup *cg)
+{
+ struct info_cgroup *info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+ spin_lock_init(&info->lock);
+ if (!cg->parent)
+ info->max_len = PAGE_SIZE;
+ return &info->css;
+}
+
+static void info_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct info_cgroup *css = cg_info(cont);
+ kfree(css->notes);
+ kfree(css);
+}
+
+
+static int info_read(struct cgroup *cont,
+ struct cftype *cft,
+ struct seq_file *seq)
+{
+ struct info_cgroup *css = cg_info(cont);
+ spin_lock(&css->lock);
+ if (css->notes)
+ seq_write(seq, css->notes, css->len);
+ spin_unlock(&css->lock);
+ return 0;
+}
+
+/*
+ * Use a custom write function so that we can handle binary data
+ */
+
+static ssize_t info_write(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *unused_ppos) {
+ struct info_cgroup *css = cg_info(cgrp);
+ char *notes = NULL;
+ if (nbytes > cg_info(cgrp->top_cgroup)->max_len)
+ return -E2BIG;
+ if (nbytes) {
+ notes = kmalloc(nbytes, GFP_USER);
+ if (!notes)
+ return -ENOMEM;
+ if (copy_from_user(notes, userbuf, nbytes))
+ return -EFAULT;
+ }
+
+ spin_lock(&css->lock);
+ kfree(css->notes);
+ css->notes = notes;
+ css->len = nbytes;
+ spin_unlock(&css->lock);
+ return nbytes;
+}
+
+static u64 notes_size_read(struct cgroup *cont, struct cftype *cft)
+{
+ struct info_cgroup *css = cg_info(cont);
+ return css->max_len;
+}
+
+static int notes_size_write(struct cgroup *cont, struct cftype *cft, u64 val)
+{
+ struct info_cgroup *css = cg_info(cont);
+ css->max_len = val;
+ return 0;
+}
+
+static struct cftype info_files[] = {
+ {
+ .name = "notes",
+ .read_seq_string = info_read,
+ .write = info_write,
+ },
+};
+
+static struct cftype info_root_files[] = {
+ {
+ .name = "size",
+ .read_u64 = notes_size_read,
+ .write_u64 = notes_size_write,
+ },
+};
+
+static int info_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ if (!cont->parent)
+ cgroup_add_files(cont, ss, info_root_files,
+ ARRAY_SIZE(info_root_files));
+ return cgroup_add_files(cont, ss, info_files, ARRAY_SIZE(info_files));
+}
+
+struct cgroup_subsys info_subsys = {
+ .name = "info",
+ .create = info_create,
+ .destroy = info_destroy,
+ .populate = info_populate,
+ .subsys_id = info_subsys_id,
+};
[RFC] Example multi-bindable subsystem: a max-depth controller
This subsystem introduces a new file, "maxdepth.children", in each
cgroup in its hierarchy. This value defaults to -1, meaning no
limit. If this maxdepth.children is >= 0, then no child cgroup may be
created below this cgroup at a depth of more than maxdepth.children.
The limit is checked at cgroup creation time for all ancestor cgroups.
Signed-off-by: Paul Menage <[email protected]>
---
include/linux/cgroup_subsys.h | 6 +++
init/Kconfig | 8 ++++
kernel/Makefile | 1 +
kernel/maxdepth_cgroup.c | 80 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 95 insertions(+), 0 deletions(-)
create mode 100644 kernel/maxdepth_cgroup.c
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 5dfea38..021bfc1 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -66,3 +66,9 @@ MULTI_SUBSYS(info)
#endif
/* */
+
+#ifdef CONFIG_CGROUP_MAXDEPTH
+MULTI_SUBSYS(maxdepth)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index 3bd4685..69907cc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -613,6 +613,14 @@ config CGROUP_INFO
application-specific configuration data about a cgroup. Can
be mounted on multiple hierarchies at once.
+config CGROUP_MAXDEPTH
+ bool "CGroups controller to limit the max depth of a hierarchy"
+ depends on CGROUPS
+ help
+ Provides a simple cgroups subsystem with an
+ "maxdepth.children" field that limits the maximum number of
+ child generations permitted to a cgroup.
+
endif # CGROUPS
config MM_OWNER
diff --git a/kernel/Makefile b/kernel/Makefile
index e713a67..5712cd5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
obj-$(CONFIG_CGROUP_INFO) += info_cgroup.o
+obj-$(CONFIG_CGROUP_MAXDEPTH) += maxdepth_cgroup.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/maxdepth_cgroup.c b/kernel/maxdepth_cgroup.c
new file mode 100644
index 0000000..2ca92d8
--- /dev/null
+++ b/kernel/maxdepth_cgroup.c
@@ -0,0 +1,80 @@
+/*
+ * maxdepth_cgroup.c - simple cgroup providing a child generation
+ * limit field
+ */
+
+#include "linux/cgroup.h"
+#include "linux/err.h"
+#include "linux/seq_file.h"
+
+struct maxdepth_cgroup {
+ struct cgroup_subsys_state css;
+ int maxdepth;
+};
+
+static inline struct maxdepth_cgroup *cg_md(struct cgroup *cg)
+{
+ return container_of(cgroup_subsys_state(cg, maxdepth_subsys_id),
+ struct maxdepth_cgroup, css);
+}
+
+static struct cgroup_subsys_state *md_create(struct cgroup_subsys *ss,
+ struct cgroup *cg)
+{
+ struct maxdepth_cgroup *md;
+ int depth = 1, maxdepth;
+ while (1) {
+ cg = cg->parent;
+ if (!cg)
+ break;
+ maxdepth = cg_md(cg)->maxdepth;
+ if ((maxdepth >= 0) && (depth > maxdepth))
+ return ERR_PTR(-EINVAL);
+ depth++;
+ }
+
+ md = kzalloc(sizeof(*md), GFP_KERNEL);
+ if (!md)
+ return ERR_PTR(-ENOMEM);
+ md->maxdepth = -1;
+ return &md->css;
+}
+
+static void md_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ kfree(cg_md(cont));
+}
+
+static s64 md_read(struct cgroup *cont, struct cftype *cft)
+{
+ return cg_md(cont)->maxdepth;
+}
+
+static int md_write(struct cgroup *cont, struct cftype *cft, s64 val)
+{
+ if ((val < -1) || (val >= INT_MAX))
+ return -EINVAL;
+ cg_md(cont)->maxdepth = val;
+ return 0;
+}
+
+static struct cftype md_files[] = {
+ {
+ .name = "children",
+ .read_s64 = md_read,
+ .write_s64 = md_write,
+ }
+};
+
+static int md_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ return cgroup_add_files(cont, ss, md_files, ARRAY_SIZE(md_files));
+}
+
+struct cgroup_subsys maxdepth_subsys = {
+ .name = "maxdepth",
+ .create = md_create,
+ .destroy = md_destroy,
+ .populate = md_populate,
+ .subsys_id = maxdepth_subsys_id,
+};
On Wed, 01 Jul 2009 19:10:58 -0700
Paul Menage <[email protected]> wrote:
> [RFC] Support named cgroups hierarchies
>
> To simplify referring to cgroup hierarchies in mount statements, and
> to allow disambiguation in the presence of empty hierarchies and
> multiply-bindable subsystems (see later patches in series) this patch
> adds support for naming a new cgroup hierarchy via the "name=" mount
> option
>
> A pre-existing hierarchy may be specified by either name or by
> subsystems; a hierarchy's name cannot be changed by a remount
> operation.
>
> Example usage:
>
> # To create a hierarchy called "foo" containing the "cpu" subsystem
> mount -t cgroup -oname=foo,cpu cgroup /mnt/cgroup1
>
> # To mount the "foo" hierarchy on a second location
> mount -t cgroup -oname=foo cgroup /mnt/cgroup2
>
> Open issues:
>
> - should the specification be via a name= option as in this patch, or
> should we simply use the "device name" as passed to the mount()
> system call? Using the device name is more conceptually clean and
> consistent with the filesystem API; however, given that the device
> name is currently ignored by cgroups, this would lead to a
> user-visible behaviour change.
>
IMHO, name= option is good because people think device name for pseudo file
system has no meanings. I think just leaving it as "no meaning" is better.
> Signed-off-by: Paul Menage <[email protected]>
>
> ---
>
> kernel/cgroup.c | 136 ++++++++++++++++++++++++++++++++++++-------------------
> 1 files changed, 88 insertions(+), 48 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index ea255fe..940f28d 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -59,6 +59,8 @@ static struct cgroup_subsys *subsys[] = {
> #include <linux/cgroup_subsys.h>
> };
>
> +#define MAX_CGROUP_ROOT_NAMELEN 64
> +
I don't like long names but....isn't this too short ? How about NAME_MAX ?
> /*
> * A cgroupfs_root represents the root of a cgroup hierarchy,
> * and may be associated with a superblock to form an active
> @@ -93,6 +95,9 @@ struct cgroupfs_root {
>
> /* The path to use for release notifications. */
> char release_agent_path[PATH_MAX];
> +
> + /* The name for this hierarchy - may be empty */
> + char name[MAX_CGROUP_ROOT_NAMELEN];
> };
>
If you don't want to make cgroupfs_root bigger,
cgroupfs_root {
......
/* this must be the bottom of struct */
char name[0];
}
Is a choice.
BTW, reading a patch, any kind of charactors are allowed ?
Thanks,
-Kame
> /*
> @@ -828,6 +833,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
> seq_puts(seq, ",noprefix");
> if (strlen(root->release_agent_path))
> seq_printf(seq, ",release_agent=%s", root->release_agent_path);
> + if (strlen(root->name))
> + seq_printf(seq, ",name=%s", root->name);
> mutex_unlock(&cgroup_mutex);
> return 0;
> }
> @@ -836,12 +843,15 @@ struct cgroup_sb_opts {
> unsigned long subsys_bits;
> unsigned long flags;
> char *release_agent;
> + char *name;
> + /* A flag indicating that a root was created from this options block */
> + bool created_root;
> };
>
> /* Convert a hierarchy specifier into a bitmask of subsystems and
> * flags. */
> static int parse_cgroupfs_options(char *data,
> - struct cgroup_sb_opts *opts)
> + struct cgroup_sb_opts *opts)
> {
> char *token, *o = data ?: "all";
> unsigned long mask = (unsigned long)-1;
> @@ -850,9 +860,7 @@ static int parse_cgroupfs_options(char *data,
> mask = ~(1UL << cpuset_subsys_id);
> #endif
>
> - opts->subsys_bits = 0;
> - opts->flags = 0;
> - opts->release_agent = NULL;
> + memset(opts, 0, sizeof(*opts));
>
> while ((token = strsep(&o, ",")) != NULL) {
> if (!*token)
> @@ -872,11 +880,19 @@ static int parse_cgroupfs_options(char *data,
> /* Specifying two release agents is forbidden */
> if (opts->release_agent)
> return -EINVAL;
> - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
> + opts->release_agent =
> + kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
> if (!opts->release_agent)
> return -ENOMEM;
> - strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
> - opts->release_agent[PATH_MAX - 1] = 0;
> + } else if (!strncmp(token, "name=", 5)) {
> + /* Specifying two names is forbidden */
> + if (opts->name)
> + return -EINVAL;
> + opts->name = kstrndup(token + 5,
> + MAX_CGROUP_ROOT_NAMELEN,
> + GFP_KERNEL);
> + if (!opts->name)
> + return -ENOMEM;
> } else {
> struct cgroup_subsys *ss;
> int i;
> @@ -903,7 +919,7 @@ static int parse_cgroupfs_options(char *data,
> return -EINVAL;
>
> /* We can't have an empty hierarchy */
> - if (!opts->subsys_bits)
> + if (!opts->subsys_bits && !opts->name)
> return -EINVAL;
>
> return 0;
> @@ -931,6 +947,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> goto out_unlock;
> }
>
> + /* Don't allow name to change at remount */
> + if (opts.name && strcmp(opts.name, root->name)) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> ret = rebind_subsystems(root, opts.subsys_bits);
> if (ret)
> goto out_unlock;
> @@ -942,6 +964,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> strcpy(root->release_agent_path, opts.release_agent);
> out_unlock:
> kfree(opts.release_agent);
> + kfree(opts.name);
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
> unlock_kernel();
> @@ -963,6 +986,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> INIT_LIST_HEAD(&cgrp->release_list);
> init_rwsem(&cgrp->pids_mutex);
> }
> +
> static void init_cgroup_root(struct cgroupfs_root *root)
> {
> struct cgroup *cgrp = &root->top_cgroup;
> @@ -976,28 +1000,56 @@ static void init_cgroup_root(struct cgroupfs_root *root)
>
> static int cgroup_test_super(struct super_block *sb, void *data)
> {
> - struct cgroupfs_root *new = data;
> + struct cgroup_sb_opts *new = data;
> struct cgroupfs_root *root = sb->s_fs_info;
>
> - /* First check subsystems */
> - if (new->subsys_bits != root->subsys_bits)
> - return 0;
> + /* If we asked for a name then it must match */
> + if (new->name && strcmp(new->name, root->name))
> + return 0;
>
> - /* Next check flags */
> - if (new->flags != root->flags)
> + /* If we asked for subsystems then they must match */
> + if (new->subsys_bits && new->subsys_bits != root->subsys_bits)
> return 0;
>
> return 1;
> }
>
> +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
> +{
> + struct cgroupfs_root *root;
> +
> + if (!opts->subsys_bits)
> + return ERR_PTR(-EINVAL);
> +
> + root = kzalloc(sizeof(*root), GFP_KERNEL);
> + if (!root)
> + return ERR_PTR(-ENOMEM);
> +
> + init_cgroup_root(root);
> + root->subsys_bits = opts->subsys_bits;
> + root->flags = opts->flags;
> + if (opts->release_agent)
> + strcpy(root->release_agent_path, opts->release_agent);
> + if (opts->name)
> + strcpy(root->name, opts->name);
> + opts->created_root = true;
> + return root;
> +}
> +
> static int cgroup_set_super(struct super_block *sb, void *data)
> {
> int ret;
> - struct cgroupfs_root *root = data;
> + struct cgroup_sb_opts *opts = data;
> + struct cgroupfs_root *root;
>
> + root = cgroup_root_from_opts(opts);
> + if (IS_ERR(root))
> + return PTR_ERR(root);
> ret = set_anon_super(sb, NULL);
> - if (ret)
> + if (ret) {
> + kfree(root);
> return ret;
> + }
>
> sb->s_fs_info = root;
> root->sb = sb;
> @@ -1039,44 +1091,23 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> struct cgroup_sb_opts opts;
> int ret = 0;
> struct super_block *sb;
> - struct cgroupfs_root *root;
> - struct list_head tmp_cg_links;
>
> /* First find the desired set of subsystems */
> ret = parse_cgroupfs_options(data, &opts);
> - if (ret) {
> - kfree(opts.release_agent);
> - return ret;
> - }
> -
> - root = kzalloc(sizeof(*root), GFP_KERNEL);
> - if (!root) {
> - kfree(opts.release_agent);
> - return -ENOMEM;
> - }
> -
> - init_cgroup_root(root);
> - root->subsys_bits = opts.subsys_bits;
> - root->flags = opts.flags;
> - if (opts.release_agent) {
> - strcpy(root->release_agent_path, opts.release_agent);
> - kfree(opts.release_agent);
> - }
> + if (ret)
> + goto out_err;
>
> - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
> + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
>
> if (IS_ERR(sb)) {
> - kfree(root);
> - return PTR_ERR(sb);
> + ret = PTR_ERR(sb);
> + goto out_err;
> }
>
> - if (sb->s_fs_info != root) {
> - /* Reusing an existing superblock */
> - BUG_ON(sb->s_root == NULL);
> - kfree(root);
> - root = NULL;
> - } else {
> + if (opts.created_root) {
> /* New superblock */
> + struct cgroupfs_root *root = sb->s_fs_info;
> + struct list_head tmp_cg_links;
> struct cgroup *root_cgrp = &root->top_cgroup;
> struct inode *inode;
> int i;
> @@ -1109,7 +1140,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> if (ret == -EBUSY) {
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&inode->i_mutex);
> - goto free_cg_links;
> + free_cg_links(&tmp_cg_links);
> + goto drop_new_super;
> }
>
> /* EBUSY should be the only error here */
> @@ -1146,12 +1178,17 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> }
>
> simple_set_mnt(mnt, sb);
> + kfree(opts.release_agent);
> + kfree(opts.name);
> return 0;
>
> - free_cg_links:
> - free_cg_links(&tmp_cg_links);
> drop_new_super:
> deactivate_locked_super(sb);
> +
> + out_err:
> + kfree(opts.release_agent);
> + kfree(opts.name);
> +
> return ret;
> }
>
> @@ -2923,6 +2960,9 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
> seq_printf(m, "%lu:", root->subsys_bits);
> for_each_subsys(root, ss)
> seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
> + if (strlen(root->name))
> + seq_printf(m, "%sname=%s",
> + count ? "," : "", root->name);
> seq_putc(m, ':');
> get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
> cgrp = task_cgroup(tsk, subsys_id);
>
>
On Wed, 01 Jul 2009 19:11:28 -0700
Paul Menage <[email protected]> wrote:
> [RFC] Support multiply-bindable cgroup subsystems
>
> This patch allows a cgroup subsystem to be marked as bindable on
> multiple cgroup hierarchies independently, when declared in
> cgroup_subsys.h via MULTI_SUBSYS() rather than SUBSYS().
>
> The state for such subsystems cannot be accessed directly from a
> task->cgroups (since there's no unique mapping for a task) but instead
> must be accessed via a particular control group object.
>
> Multiply-bound subsystems are useful in cases where there's no direct
> correspondence between the cgroup configuration and some property of
> the kernel outside of the cgroups subsystem. So this would not be
> applicable to e.g. the CFS cgroup, since there has to a unique mapping
> from a task to its CFS run queue.
>
> As an example, the "debug" subsystem is marked multiply-bindable,
> since it has no state outside the cgroups framework itself.
>
> Example usage:
>
> mount -t cgroup -o name=foo,debug,cpu cgroup /mnt1
> mount -t cgroup -o name=bar,debug,memory cgroup /mnt2
>
> Open Issues:
>
> - in the current version of this patch, mounting a cgroups hierarchy
> with no options does *not* get you any of the multi-bindable
> subsystems; possibly for consistency it should give you all of the
> multi-bindable subsystems as well as all of the single-bindable
> subsystems.
>
I don't think this is a big problem. Hmm, I wonder there are no people who
uses cgroup without any options (= mounts all subsys at once)...
> - how can we avoid the checkpatch.pl errors due to creative use of
> macros to generate enum names?
>
> Signed-off-by: Paul Menage <[email protected]>
>
> ---
>
> include/linux/cgroup.h | 38 +++++++-
> include/linux/cgroup_subsys.h | 2
> kernel/cgroup.c | 192 ++++++++++++++++++++++++++++-------------
> 3 files changed, 168 insertions(+), 64 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index a6bb0ca..02fdea9 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -39,10 +39,38 @@ extern int cgroupstats_build(struct cgroupstats *stats,
>
> extern struct file_operations proc_cgroup_operations;
>
> -/* Define the enumeration of all cgroup subsystems */
> -#define SUBSYS(_x) _x ## _subsys_id,
> +/*
> + * Define the enumeration of all cgroup subsystems. There are two
> + * kinds of subsystems:
> + *
> + * - regular (singleton) subsystems can be only mounted once; these
> + * generally correspond to some system that has substantial state
> + * outside of the cgroups framework, or where the state is being used
> + * to control some external behaviour (e.g. CPU scheduler). Every
> + * task has an associated state pointer (accessed efficiently through
> + * task->cgroups) for each singleton subsystem.
> + *
> + * - multiply-bound subsystems may be mounted on zero or more
> + * hierarchies. Since there's no unique mapping from a task to a
> + * subsystem state pointer for multiply-bound subsystems, the state of
> + * these subsystems cannot be accessed rapidly from a task
> + * pointer. These correspond to subsystems where most or all of the
> + * state is maintained within the subsystem itself, and/or the
> + * subsystem is used for monitoring rather than control.
> + */
> enum cgroup_subsys_id {
> +#define SUBSYS(_x) _x ## _subsys_id,
> +#define MULTI_SUBSYS(_x)
> +#include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> + CGROUP_SINGLETON_SUBSYS_COUNT,
> + CGROUP_DUMMY_ENUM_RESET = CGROUP_SINGLETON_SUBSYS_COUNT - 1,
> +#define SUBSYS(_x)
> +#define MULTI_SUBSYS(_x) _x ## _subsys_id,
> #include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> CGROUP_SUBSYS_COUNT
> };
Wow...seems complicated. How about adding linux/cgroup_multisubsys.h ?
Thanks,
-Kame
> #undef SUBSYS
> @@ -231,7 +259,7 @@ struct css_set {
> * time). Multi-subsystems don't have an entry in here since
> * there's no unique state for a given task.
> */
> - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
> + struct cgroup_subsys_state *subsys[CGROUP_SINGLETON_SUBSYS_COUNT];
> };
>
> /*
> @@ -418,8 +446,10 @@ struct cgroup_subsys {
> };
>
> #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
> +#define MULTI_SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
> #include <linux/cgroup_subsys.h>
> #undef SUBSYS
> +#undef MULTI_SUBSYS
>
> static inline struct cgroup_subsys_state *cgroup_subsys_state(
> struct cgroup *cgrp, int subsys_id)
> @@ -430,6 +460,8 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
> static inline struct cgroup_subsys_state *task_subsys_state(
> struct task_struct *task, int subsys_id)
> {
> + /* This check is optimized out for most callers */
> + BUG_ON(subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
> return rcu_dereference(task->cgroups->subsys[subsys_id]);
> }
>
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index 9c8d31b..f78605e 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -14,7 +14,7 @@ SUBSYS(cpuset)
> /* */
>
> #ifdef CONFIG_CGROUP_DEBUG
> -SUBSYS(debug)
> +MULTI_SUBSYS(debug)
> #endif
>
> /* */
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 74840ea..a527687 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -54,10 +54,18 @@
> static DEFINE_MUTEX(cgroup_mutex);
>
> /* Generate an array of cgroup subsystem pointers */
> -#define SUBSYS(_x) &_x ## _subsys,
>
> static struct cgroup_subsys *subsys[] = {
> +#define SUBSYS(_x) (&_x ## _subsys),
> +#define MULTI_SUBSYS(_x)
> #include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> +#define SUBSYS(_x)
> +#define MULTI_SUBSYS(_x) (&_x ## _subsys),
> +#include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> };
>
> #define MAX_CGROUP_ROOT_NAMELEN 64
> @@ -269,7 +277,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
> int index;
> unsigned long tmp = 0UL;
>
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
> + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++)
> tmp += (unsigned long)css[i];
> tmp = (tmp >> 16) ^ tmp;
>
> @@ -440,7 +448,7 @@ static struct css_set *find_existing_css_set(
>
> /* Build the set of subsystem state objects that we want to
> * see in the new css_set */
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
> if (root->subsys_bits & (1UL << i)) {
> /* Subsystem is in this hierarchy. So we want
> * the subsystem state from the new
> @@ -534,7 +542,7 @@ static struct css_set *find_css_set(
> struct css_set *oldcg, struct cgroup *cgrp)
> {
> struct css_set *res;
> - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> + struct cgroup_subsys_state *template[CGROUP_SINGLETON_SUBSYS_COUNT];
>
> struct list_head tmp_cg_links;
>
> @@ -857,17 +865,33 @@ static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
> wake_up_all(&cgroup_rmdir_waitq);
> }
>
> +static void init_cgroup_css(struct cgroup_subsys_state *css,
> + struct cgroup_subsys *ss,
> + struct cgroup *cgrp)
> +{
> + css->cgroup = cgrp;
> + atomic_set(&css->refcnt, 1);
> + css->flags = 0;
> + css->id = NULL;
> + if (cgrp == dummytop)
> + set_bit(CSS_ROOT, &css->flags);
> + BUG_ON(cgrp->subsys[ss->subsys_id]);
> + cgrp->subsys[ss->subsys_id] = css;
> +}
> +
> static int rebind_subsystems(struct cgroupfs_root *root,
> unsigned long final_bits)
> {
> - unsigned long added_bits, removed_bits;
> + unsigned long added_bits, removed_bits, rollback_bits;
> + int ret = 0;
> struct cgroup *cgrp = &root->top_cgroup;
> int i;
>
> removed_bits = root->subsys_bits & ~final_bits;
> added_bits = final_bits & ~root->subsys_bits;
> + rollback_bits = 0;
> /* Check that any added subsystems are currently free */
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
> unsigned long bit = 1UL << i;
> if (!(bit & added_bits))
> continue;
> @@ -884,33 +908,45 @@ static int rebind_subsystems(struct cgroupfs_root *root,
> if (root->number_of_cgroups > 1)
> return -EBUSY;
>
> - /* Process each subsystem */
> + /*
> + * Process each subsystem. First try to add all new
> + * subsystems, since this may require rollback
> + */
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> struct cgroup_subsys *ss = subsys[i];
> + bool singleton = i < CGROUP_SINGLETON_SUBSYS_COUNT;
> unsigned long bit = 1UL << i;
> if (bit & added_bits) {
> + struct cgroup_subsys_state *css;
> /* We're binding this subsystem to this hierarchy */
> BUG_ON(cgrp->subsys[i]);
> - BUG_ON(!dummytop->subsys[i]);
> - BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
> - BUG_ON(!(rootnode.subsys_bits & bit));
> + if (singleton) {
> + css = dummytop->subsys[i];
> + BUG_ON(!css);
> +
> + BUG_ON(css->cgroup != dummytop);
> + BUG_ON(!(rootnode.subsys_bits & bit));
> + } else {
> + BUG_ON(dummytop->subsys[i]);
> + BUG_ON(rootnode.subsys_bits & bit);
> + css = ss->create(ss, cgrp);
> + if (IS_ERR(css)) {
> + ret = PTR_ERR(css);
> + break;
> + }
> + init_cgroup_css(css, ss, cgrp);
> + }
> mutex_lock(&ss->hierarchy_mutex);
> - cgrp->subsys[i] = dummytop->subsys[i];
> - cgrp->subsys[i]->cgroup = cgrp;
> - rootnode.subsys_bits &= ~bit;
> + cgrp->subsys[i] = css;
> + css->cgroup = cgrp;
> + if (singleton)
> + rootnode.subsys_bits &= ~bit;
> root->subsys_bits |= bit;
> mutex_unlock(&ss->hierarchy_mutex);
> + rollback_bits |= bit;
> } else if (bit & removed_bits) {
> - /* We're removing this subsystem */
> - BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
> - BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
> - BUG_ON(rootnode.subsys_bits & bit);
> - mutex_lock(&ss->hierarchy_mutex);
> - dummytop->subsys[i]->cgroup = dummytop;
> - cgrp->subsys[i] = NULL;
> - root->subsys_bits &= ~bit;
> - rootnode.subsys_bits |= bit;
> - mutex_unlock(&ss->hierarchy_mutex);
> + /* Deal with this after adds are successful */
> + BUG_ON(!cgrp->subsys[i]);
> } else if (bit & final_bits) {
> /* Subsystem state should already exist */
> BUG_ON(!cgrp->subsys[i]);
> @@ -919,9 +955,47 @@ static int rebind_subsystems(struct cgroupfs_root *root,
> BUG_ON(cgrp->subsys[i]);
> }
> }
> + if (ret) {
> + /* We failed to add some subsystem - roll back */
> + removed_bits = rollback_bits;
> + }
> +
> + /*
> + * Now either remove the subsystems that were requested to be
> + * removed, or roll back the subsystems that were added before
> + * the error occurred
> + */
> + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + struct cgroup_subsys *ss = subsys[i];
> + bool singleton = i < CGROUP_SINGLETON_SUBSYS_COUNT;
> + unsigned long bit = 1UL << i;
> + if (bit & removed_bits) {
> + struct cgroup_subsys_state *css;
> + /* We're removing this subsystem */
> + css = cgrp->subsys[i];
> + BUG_ON(css->cgroup != cgrp);
> + if (singleton) {
> + BUG_ON(css != dummytop->subsys[i]);
> + BUG_ON(rootnode.subsys_bits & bit);
> + }
> + mutex_lock(&ss->hierarchy_mutex);
> + if (singleton) {
> + css->cgroup = dummytop;
> + } else {
> + /* Is this safe? */
> + ss->destroy(ss, cgrp);
> + }
> + cgrp->subsys[i] = NULL;
> + root->subsys_bits &= ~bit;
> + if (singleton)
> + rootnode.subsys_bits |= bit;
> + mutex_unlock(&ss->hierarchy_mutex);
> + }
> + }
> +
> synchronize_rcu();
>
> - return 0;
> + return ret;
> }
>
> static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
> @@ -976,7 +1050,7 @@ static int parse_cgroupfs_options(char *data,
> /* Add all non-disabled subsystems */
> int i;
> opts->subsys_bits = 0;
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
> struct cgroup_subsys *ss = subsys[i];
> if (!ss->disabled)
> opts->subsys_bits |= 1ul << i;
> @@ -1296,16 +1370,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> }
>
> ret = rebind_subsystems(root, opts.subsys_bits);
> - if (ret == -EBUSY) {
> + if (ret) {
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&inode->i_mutex);
> free_cg_links(&tmp_cg_links);
> goto drop_new_super;
> }
>
> - /* EBUSY should be the only error here */
> - BUG_ON(ret);
> -
> list_add(&root->root_list, &roots);
> root_count++;
>
> @@ -2623,20 +2694,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
> return 0;
> }
>
> -static void init_cgroup_css(struct cgroup_subsys_state *css,
> - struct cgroup_subsys *ss,
> - struct cgroup *cgrp)
> -{
> - css->cgroup = cgrp;
> - atomic_set(&css->refcnt, 1);
> - css->flags = 0;
> - css->id = NULL;
> - if (cgrp == dummytop)
> - set_bit(CSS_ROOT, &css->flags);
> - BUG_ON(cgrp->subsys[ss->subsys_id]);
> - cgrp->subsys[ss->subsys_id] = css;
> -}
> -
> static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
> {
> /* We need to take each hierarchy_mutex in a consistent order */
> @@ -2922,21 +2979,23 @@ again:
> static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
> {
> struct cgroup_subsys_state *css;
> -
> + bool singleton = ss->subsys_id < CGROUP_SINGLETON_SUBSYS_COUNT;
> printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
>
> - /* Create the top cgroup state for this subsystem */
> - css = ss->create(ss, dummytop);
> - /* We don't handle early failures gracefully */
> - BUG_ON(IS_ERR(css));
> - init_cgroup_css(css, ss, dummytop);
> -
> - /* Update the init_css_set to contain a subsys
> - * pointer to this state - since the subsystem is
> - * newly registered, all tasks and hence the
> - * init_css_set is in the subsystem's top cgroup. */
> - init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
> + if (singleton) {
> + /* Create the top cgroup state for this subsystem */
> + css = ss->create(ss, dummytop);
> + /* We don't handle early failures gracefully */
> + BUG_ON(IS_ERR(css));
> + init_cgroup_css(css, ss, dummytop);
>
> + /* Update the init_css_set to contain a subsys
> + * pointer to this state - since the subsystem is
> + * newly registered, all tasks and hence the
> + * init_css_set is in the subsystem's top cgroup. */
> + init_css_set.subsys[ss->subsys_id] = css;
> + rootnode.subsys_bits |= 1ULL << ss->subsys_id;
> + }
> need_forkexit_callback |= ss->fork || ss->exit;
>
> /* At system boot, before all subsystems have been
> @@ -2948,7 +3007,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
> lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
> ss->active = 1;
>
> - rootnode.subsys_bits |= 1ULL << ss->subsys_id;
> }
>
> /**
> @@ -3125,23 +3183,35 @@ struct file_operations proc_cgroup_operations = {
> static void proc_show_subsys(struct seq_file *m, struct cgroupfs_root *root,
> struct cgroup_subsys *ss)
> {
> - seq_printf(m, "%s\t%d\t%d\t%d\n",
> + seq_printf(m, "%s\t%d\t%d\t%d\t%d\n",
> ss->name, root->hierarchy_id,
> - root->number_of_cgroups, !ss->disabled);
> + root->number_of_cgroups, !ss->disabled,
> + ss->subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
> }
>
> static int proc_cgroupstats_show(struct seq_file *m, void *v)
> {
> int i;
> - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
> + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\tmulti\n");
> mutex_lock(&cgroup_mutex);
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + bool seen = false;
> struct cgroup_subsys *ss = subsys[i];
> unsigned long bit = 1ULL << ss->subsys_id;
> struct cgroupfs_root *root;
> for_each_root(root) {
> - if (root->subsys_bits & bit)
> + if (root->subsys_bits & bit) {
> proc_show_subsys(m, root, ss);
> + seen = true;
> + }
> + }
> + if (!seen) {
> + BUG_ON(i < CGROUP_SINGLETON_SUBSYS_COUNT);
> + /*
> + * multi-bindable subsystems show up in the
> + * rootnode if they're not bound elsewhere.
> + */
> + proc_show_subsys(m, &rootnode, ss);
> }
> }
> mutex_unlock(&cgroup_mutex);
> @@ -3317,6 +3387,8 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
>
> /* We shouldn't be called by an unregistered subsystem */
> BUG_ON(!subsys->active);
> + /* We can only support singleton subsystems */
> + BUG_ON(subsys->subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
>
> /* First figure out what hierarchy and cgroup we're dealing
> * with, and pin them so we can drop cgroup_mutex */
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
On Wed, Jul 1, 2009 at 7:28 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
>> Open issues:
>>
>> - should the specification be via a name= option as in this patch, or
>> ? should we simply use the "device name" as passed to the mount()
>> ? system call? ?Using the device name is more conceptually clean and
>> ? consistent with the filesystem API; however, given that the device
>> ? name is currently ignored by cgroups, this would lead to a
>> ? user-visible behaviour change.
>>
>
> IMHO, name= option is good because people think device name for pseudo file
> system has no meanings. I think just leaving it as "no meaning" is better.
>
Yes, I guess that makes sense. That was Li Zefan's opinion too.
>>
>> +#define MAX_CGROUP_ROOT_NAMELEN 64
>> +
>
> I don't like long names but....isn't this too short ? How about NAME_MAX ?
>
>
>> ?/*
>> ? * A cgroupfs_root represents the root of a cgroup hierarchy,
>> ? * and may be associated with a superblock to form an active
>> @@ -93,6 +95,9 @@ struct cgroupfs_root {
>>
>> ? ? ? /* The path to use for release notifications. */
>> ? ? ? char release_agent_path[PATH_MAX];
>> +
>> + ? ? /* The name for this hierarchy - may be empty */
>> + ? ? char name[MAX_CGROUP_ROOT_NAMELEN];
>> ?};
>>
> If you don't want to make cgroupfs_root bigger,
>
> ? ? ? ?cgroupfs_root {
> ? ? ? ? ? ? ? ?......
> ? ? ? ? ? ? ? ?/* this must be the bottom of struct */
> ? ? ? ? ? ? ? ?char name[0];
> ? ? ? ?}
>
> Is a choice.
I'd rather avoid something like that since I think it's less readable
- I'd probably just make the name into a pointer in that case.
>
> BTW, reading a patch, any kind of charactors are allowed ?
Yes, other than \000 of course. I guess maybe I should use
seq_escape() to print the name to avoid confusion in the event that
people put whitespace in there, or else just ban whitespace (or maybe
all non-alphanumeric chars).
Paul
On Wed, 01 Jul 2009 19:11:34 -0700
Paul Menage <[email protected]> wrote:
> [RFC] Example multi-bindable subsystem: a per-cgroup notes field
>
> As an example of a multiply-bindable subsystem, this patch introduces
> the "info" subsystem, which provides a single file, "info.notes", in
> which user-space middleware can store an arbitrary (by default up to
> one page) binary string representing configuration data about that
> cgroup. This reduces the need to keep additional state outside the
> cgroup filesystem. The maximum notes size for a hierarchy can be set
> by updating the "info.size" file in the root cgroup.
>
> Signed-off-by: Paul Menage <[email protected]>
>
Hmm, do we need to this "info" file as subsys ? How about making this as
default file set ? (if there are users.)
Thanks,
-Kame
> ---
>
> include/linux/cgroup_subsys.h | 6 ++
> init/Kconfig | 9 +++
> kernel/Makefile | 1
> kernel/info_cgroup.c | 133 +++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 149 insertions(+), 0 deletions(-)
> create mode 100644 kernel/info_cgroup.c
>
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index f78605e..5dfea38 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -60,3 +60,9 @@ SUBSYS(net_cls)
> #endif
>
> /* */
> +
> +#ifdef CONFIG_CGROUP_INFO
> +MULTI_SUBSYS(info)
> +#endif
> +
> +/* */
> diff --git a/init/Kconfig b/init/Kconfig
> index d904d6c..3bd4685 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -604,6 +604,15 @@ config CGROUP_MEM_RES_CTLR_SWAP
> Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
> size is 4096bytes, 512k per 1Gbytes of swap.
>
> +config CGROUP_INFO
> + bool "Simple application-specific info cgroup subsystem"
> + depends on CGROUPS
> + help
> + Provides a simple cgroups subsystem with an "info.notes"
> + field, which can be used by middleware to store
> + application-specific configuration data about a cgroup. Can
> + be mounted on multiple hierarchies at once.
> +
> endif # CGROUPS
>
> config MM_OWNER
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 7ffdc16..e713a67 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -61,6 +61,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
> obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
> obj-$(CONFIG_CPUSETS) += cpuset.o
> obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
> +obj-$(CONFIG_CGROUP_INFO) += info_cgroup.o
> obj-$(CONFIG_UTS_NS) += utsname.o
> obj-$(CONFIG_USER_NS) += user_namespace.o
> obj-$(CONFIG_PID_NS) += pid_namespace.o
> diff --git a/kernel/info_cgroup.c b/kernel/info_cgroup.c
> new file mode 100644
> index 0000000..34cfdb8
> --- /dev/null
> +++ b/kernel/info_cgroup.c
> @@ -0,0 +1,133 @@
> +/*
> + * info_cgroup.c - simple cgroup providing a "notes" field
> + */
> +
> +#include "linux/cgroup.h"
> +#include "linux/err.h"
> +#include "linux/seq_file.h"
> +
> +struct info_cgroup {
> + struct cgroup_subsys_state css;
> + /* notes string for this cgroup */
> + const char *notes;
> + size_t len;
> + /*
> + * size limit for notes in this hierarchy. Only relevant for
> + * the root cgroup. Not synchronized since it's a single word
> + * value and writes to it never depend on previously read
> + * values.
> + */
> + size_t max_len;
> + spinlock_t lock;
> +};
> +
> +static inline struct info_cgroup *cg_info(struct cgroup *cg)
> +{
> + return container_of(cgroup_subsys_state(cg, info_subsys_id),
> + struct info_cgroup, css);
> +}
> +
> +static struct cgroup_subsys_state *info_create(struct cgroup_subsys *ss,
> + struct cgroup *cg)
> +{
> + struct info_cgroup *info = kzalloc(sizeof(*info), GFP_KERNEL);
> + if (!info)
> + return ERR_PTR(-ENOMEM);
> + spin_lock_init(&info->lock);
> + if (!cg->parent)
> + info->max_len = PAGE_SIZE;
> + return &info->css;
> +}
> +
> +static void info_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
> +{
> + struct info_cgroup *css = cg_info(cont);
> + kfree(css->notes);
> + kfree(css);
> +}
> +
> +
> +static int info_read(struct cgroup *cont,
> + struct cftype *cft,
> + struct seq_file *seq)
> +{
> + struct info_cgroup *css = cg_info(cont);
> + spin_lock(&css->lock);
> + if (css->notes)
> + seq_write(seq, css->notes, css->len);
> + spin_unlock(&css->lock);
> + return 0;
> +}
> +
> +/*
> + * Use a custom write function so that we can handle binary data
> + */
> +
> +static ssize_t info_write(struct cgroup *cgrp, struct cftype *cft,
> + struct file *file,
> + const char __user *userbuf,
> + size_t nbytes, loff_t *unused_ppos) {
> + struct info_cgroup *css = cg_info(cgrp);
> + char *notes = NULL;
> + if (nbytes > cg_info(cgrp->top_cgroup)->max_len)
> + return -E2BIG;
> + if (nbytes) {
> + notes = kmalloc(nbytes, GFP_USER);
> + if (!notes)
> + return -ENOMEM;
> + if (copy_from_user(notes, userbuf, nbytes))
> + return -EFAULT;
> + }
> +
> + spin_lock(&css->lock);
> + kfree(css->notes);
> + css->notes = notes;
> + css->len = nbytes;
> + spin_unlock(&css->lock);
> + return nbytes;
> +}
> +
> +static u64 notes_size_read(struct cgroup *cont, struct cftype *cft)
> +{
> + struct info_cgroup *css = cg_info(cont);
> + return css->max_len;
> +}
> +
> +static int notes_size_write(struct cgroup *cont, struct cftype *cft, u64 val)
> +{
> + struct info_cgroup *css = cg_info(cont);
> + css->max_len = val;
> + return 0;
> +}
> +
> +static struct cftype info_files[] = {
> + {
> + .name = "notes",
> + .read_seq_string = info_read,
> + .write = info_write,
> + },
> +};
> +
> +static struct cftype info_root_files[] = {
> + {
> + .name = "size",
> + .read_u64 = notes_size_read,
> + .write_u64 = notes_size_write,
> + },
> +};
> +
> +static int info_populate(struct cgroup_subsys *ss, struct cgroup *cont)
> +{
> + if (!cont->parent)
> + cgroup_add_files(cont, ss, info_root_files,
> + ARRAY_SIZE(info_root_files));
> + return cgroup_add_files(cont, ss, info_files, ARRAY_SIZE(info_files));
> +}
> +
> +struct cgroup_subsys info_subsys = {
> + .name = "info",
> + .create = info_create,
> + .destroy = info_destroy,
> + .populate = info_populate,
> + .subsys_id = info_subsys_id,
> +};
>
>
On Wed, Jul 1, 2009 at 7:45 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
>> - in the current version of this patch, mounting a cgroups hierarchy
>> ? with no options does *not* get you any of the multi-bindable
>> ? subsystems; possibly for consistency it should give you all of the
>> ? multi-bindable subsystems as well as all of the single-bindable
>> ? subsystems.
>>
> I don't think this is a big problem. Hmm, I wonder there are no people who
> uses cgroup without any options (= mounts all subsys at once)...
In practice I suspect that it's a rare usage outside of manual
playing/testing - any real production system is going to want to be
aware of what subsystems are available and decide which to mount on
each hierarchy.
>
> Wow...seems complicated. How about adding linux/cgroup_multisubsys.h ?
I think that the readability benefits in cgroup.c would be outweighed
by having two different subsys include files.
Paul
On Wed, Jul 1, 2009 at 7:48 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
>
> Hmm, do we need to this "info" file as subsys ? How about making this as
> default file set ? (if there are users.)
>
That would certainly be possible, and would be an alternative to
having multi-bindable subsystem support.
The advantage of adding multi-bindable subsystems is that you can
avoid bloating the core cgroups code, by putting individual small
cgroups features in their own code modules, and you get to decide at
mount time which features are actually mounted; if they were part of
the core cgroups files, then there would either need to be special
mount options for each separate feature, or else no way to pick which
features were mounted on each hierarchy.
Paul
On Wed, 1 Jul 2009 19:52:16 -0700
Paul Menage <[email protected]> wrote:
> On Wed, Jul 1, 2009 at 7:45 PM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
> >> - in the current version of this patch, mounting a cgroups hierarchy
> >> Â with no options does *not* get you any of the multi-bindable
> >> Â subsystems; possibly for consistency it should give you all of the
> >> Â multi-bindable subsystems as well as all of the single-bindable
> >> Â subsystems.
> >>
> > I don't think this is a big problem. Hmm, I wonder there are no people who
> > uses cgroup without any options (= mounts all subsys at once)...
>
> In practice I suspect that it's a rare usage outside of manual
> playing/testing - any real production system is going to want to be
> aware of what subsystems are available and decide which to mount on
> each hierarchy.
>
> >
> > Wow...seems complicated. How about adding linux/cgroup_multisubsys.h ?
>
> I think that the readability benefits in cgroup.c would be outweighed
> by having two different subsys include files.
>
Hm, then, moving SUBSYS() macro to linux/cgroup_subsys.h is a sane way, I think.
IMHO, it's not very good habit that cgroup_subsys.h is parsed in different ways in
cgroup.h and cgroup.c
I think cgroup_subsys.h like following is much simpler even if it's not very
sophisticated.
==
#define SUBSYSID(_name) _name ## _subsys_id,
#define SUBSYSP(_name) &_name ## _subsys,
#ifdef CONFIG_CPUSETS
#define CPUSETS_ID SUBSYSID(cpuset)
#define CPUSETS_SUBSYS SUBSYSP(cpuset)
#else
#define CPUSETS_ID
#define CPUSETS_SUBSYS
#endif
#ifdef CONFIG_CPU
#define CPU_ID SUBSYSID(cpu)
#define CPU_SUBSYS SUBSYSP(cpu)
#else
#define CPU_ID
#define CPU_SUBSYS
#endif
#ifdef CONFIG_MEMORY
#define MEMORY_ID SUBSYSID(memory)
#define MEMORY_SUBSYS SUBSYSP(memory)
#else
#define MEMORY_ID
#define MEMORY_SUBSYS
#endif
#define CGROUP_ALL_SUBSYSID CPUSETS_ID CPU_ID MEMORY_ID
#define CGROUP_ALL_SUBSYSP CPUSETS_SUBSYS CPU_SUBSYS MEMORY_SUBSYS
==
But I know I'm not a man who can talk about beauty of codes ;)
Thanks,
-Kame
On Wed, 1 Jul 2009 19:56:28 -0700
Paul Menage <[email protected]> wrote:
> On Wed, Jul 1, 2009 at 7:48 PM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
> >
> > Hmm, do we need to this "info" file as subsys ? How about making this as
> > default file set ? (if there are users.)
> >
>
> That would certainly be possible, and would be an alternative to
> having multi-bindable subsystem support.
>
> The advantage of adding multi-bindable subsystems is that you can
> avoid bloating the core cgroups code, by putting individual small
> cgroups features in their own code modules, and you get to decide at
> mount time which features are actually mounted; if they were part of
> the core cgroups files, then there would either need to be special
> mount options for each separate feature, or else no way to pick which
> features were mounted on each hierarchy.
>
Sure, thanks.
-Kame
> Paul
>
On Wed, Jul 1, 2009 at 8:16 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
>
> Hm, then, moving SUBSYS() macro to linux/cgroup_subsys.h is a sane way, I think.
> IMHO, it's not very good habit that cgroup_subsys.h is parsed in different ways in
> cgroup.h and cgroup.c
>
> I think cgroup_subsys.h like following is much simpler even if it's not very
> sophisticated.
> ==
> #define SUBSYSID(_name) ? ? ? ? _name ## _subsys_id,
> #define SUBSYSP(_name) ? ? ? ? ?&_name ## _subsys,
You'd also need a macro for the extern declaration of each *_subsys
variable. So it would be 8 lines of boilerplate for each subsystem,
plus adding to the CGROUP_ALL_* macros below. Yes, it could be done
that way, but I'm not convinced that it's hugely simpler.
The state of things prior to this patch series is that to do something
at compile time that involves all subsystems (currently we have three
cases - defining the subsys_id enum, declaring the extern subsys
structures, and defining the array of subsys structure pointers) you
just define the macro SUBSYS to expand to the code fragment you want
to repeat and include cgroup_subsys.h. This is very simple from the
subsystem writer's PoV, and not too tricky in the core framework.
I agree that this patch does increase the complexity in the framework
code, since the framework code needs to deal with all the non-multi
subsystems ahead of all multi-subsystems, and so has to do two passes
through the file.
OK, thinking more about this, your earlier suggestion of splitting
cgroup_subsys.h and using a single SUBSYS macro would achieve this
more simply after all. Let me try that and see how it looks.
Paul
On Wed, Jul 1, 2009 at 7:56 PM, Paul Menage<[email protected]> wrote:
>> Hmm, do we need to this "info" file as subsys ? How about making this as
>> default file set ? (if there are users.)
>>
>
> That would certainly be possible, and would be an alternative to
> having multi-bindable subsystem support.
>
> The advantage of adding multi-bindable subsystems is that you can
> avoid bloating the core cgroups code, by putting individual small
> cgroups features in their own code modules, and you get to decide at
> mount time which features are actually mounted; if they were part of
> the core cgroups files, then there would either need to be special
> mount options for each separate feature, or else no way to pick which
> features were mounted on each hierarchy.
BTW, just to give a balanced argument: I agree that these example
multi-bindable subsystems are somewhat weak justifications for the new
feature - they each supply a single control file, they're not
connected to anything in the kernel outside of the core cgroups
framework, and they're almost zero overhead if they're not actively
used, so making them part of the cgroups framework directly wouldn't
be totally unreasonable.
An example of a less-trivial multi-bindable subsystem could be cpuacct
- logically there's no reason that you couldn't track CPU usage in
multiple different hierarchies, keeping totals aggregated in different
ways for the groupings in different hierarchies, and the overhead
associated with tracking would mean that you wouldn't want to
automatically link cpuacct into every hierarchy. The practical problem
with this would be that finding the cgroup for a process would be
slower since there wouldn't be a 1:1 mapping from a task to a cpuacct
cgroup state object.
Instead each task would have multiple such states and to update the
usage accounting on each of them you'd have to do a list traversal
rather than a direct lookup (and worse, right now that list traversal
can only be done while holding cgroup_mutex, which is impossible when
doing cpuacct charging from the guts of the scheduler). I can see how
to extend the multi-bindable support to make it cheaper and to require
less synchronization (i.e. walking an RCU-safe array to find the
various state objects rather than doing a list traversal).
Although before doing that I guess it would be worth asking whether
anyone would actually *want* to aggregate CPU usage different ways for
different hierarchies, even if it makes logical sense to be able to do
so.
Paul
On Wed, Jul 01, 2009 at 07:10:58PM -0700, Paul Menage wrote:
> [RFC] Support named cgroups hierarchies
>
> To simplify referring to cgroup hierarchies in mount statements, and
> to allow disambiguation in the presence of empty hierarchies and
> multiply-bindable subsystems (see later patches in series) this patch
> adds support for naming a new cgroup hierarchy via the "name=" mount
> option
>
> A pre-existing hierarchy may be specified by either name or by
> subsystems; a hierarchy's name cannot be changed by a remount
> operation.
>
> Example usage:
>
> # To create a hierarchy called "foo" containing the "cpu" subsystem
> mount -t cgroup -oname=foo,cpu cgroup /mnt/cgroup1
>
> # To mount the "foo" hierarchy on a second location
> mount -t cgroup -oname=foo cgroup /mnt/cgroup2
>
> Open issues:
>
> - should the specification be via a name= option as in this patch, or
> should we simply use the "device name" as passed to the mount()
> system call? Using the device name is more conceptually clean and
> consistent with the filesystem API; however, given that the device
> name is currently ignored by cgroups, this would lead to a
> user-visible behaviour change.
I did not see anything preventing two hierarchies from having the same
(non empty) name. I guess that in such a case trying to mount a named
hierarchy on a second location is unspecified. Could we just check for
unique (non empty) names?
Thanks,
Louis
>
> Signed-off-by: Paul Menage <[email protected]>
>
> ---
>
> kernel/cgroup.c | 136 ++++++++++++++++++++++++++++++++++++-------------------
> 1 files changed, 88 insertions(+), 48 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index ea255fe..940f28d 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -59,6 +59,8 @@ static struct cgroup_subsys *subsys[] = {
> #include <linux/cgroup_subsys.h>
> };
>
> +#define MAX_CGROUP_ROOT_NAMELEN 64
> +
> /*
> * A cgroupfs_root represents the root of a cgroup hierarchy,
> * and may be associated with a superblock to form an active
> @@ -93,6 +95,9 @@ struct cgroupfs_root {
>
> /* The path to use for release notifications. */
> char release_agent_path[PATH_MAX];
> +
> + /* The name for this hierarchy - may be empty */
> + char name[MAX_CGROUP_ROOT_NAMELEN];
> };
>
> /*
> @@ -828,6 +833,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
> seq_puts(seq, ",noprefix");
> if (strlen(root->release_agent_path))
> seq_printf(seq, ",release_agent=%s", root->release_agent_path);
> + if (strlen(root->name))
> + seq_printf(seq, ",name=%s", root->name);
> mutex_unlock(&cgroup_mutex);
> return 0;
> }
> @@ -836,12 +843,15 @@ struct cgroup_sb_opts {
> unsigned long subsys_bits;
> unsigned long flags;
> char *release_agent;
> + char *name;
> + /* A flag indicating that a root was created from this options block */
> + bool created_root;
> };
>
> /* Convert a hierarchy specifier into a bitmask of subsystems and
> * flags. */
> static int parse_cgroupfs_options(char *data,
> - struct cgroup_sb_opts *opts)
> + struct cgroup_sb_opts *opts)
> {
> char *token, *o = data ?: "all";
> unsigned long mask = (unsigned long)-1;
> @@ -850,9 +860,7 @@ static int parse_cgroupfs_options(char *data,
> mask = ~(1UL << cpuset_subsys_id);
> #endif
>
> - opts->subsys_bits = 0;
> - opts->flags = 0;
> - opts->release_agent = NULL;
> + memset(opts, 0, sizeof(*opts));
>
> while ((token = strsep(&o, ",")) != NULL) {
> if (!*token)
> @@ -872,11 +880,19 @@ static int parse_cgroupfs_options(char *data,
> /* Specifying two release agents is forbidden */
> if (opts->release_agent)
> return -EINVAL;
> - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
> + opts->release_agent =
> + kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
> if (!opts->release_agent)
> return -ENOMEM;
> - strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
> - opts->release_agent[PATH_MAX - 1] = 0;
> + } else if (!strncmp(token, "name=", 5)) {
> + /* Specifying two names is forbidden */
> + if (opts->name)
> + return -EINVAL;
> + opts->name = kstrndup(token + 5,
> + MAX_CGROUP_ROOT_NAMELEN,
> + GFP_KERNEL);
> + if (!opts->name)
> + return -ENOMEM;
> } else {
> struct cgroup_subsys *ss;
> int i;
> @@ -903,7 +919,7 @@ static int parse_cgroupfs_options(char *data,
> return -EINVAL;
>
> /* We can't have an empty hierarchy */
> - if (!opts->subsys_bits)
> + if (!opts->subsys_bits && !opts->name)
> return -EINVAL;
>
> return 0;
> @@ -931,6 +947,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> goto out_unlock;
> }
>
> + /* Don't allow name to change at remount */
> + if (opts.name && strcmp(opts.name, root->name)) {
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> ret = rebind_subsystems(root, opts.subsys_bits);
> if (ret)
> goto out_unlock;
> @@ -942,6 +964,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> strcpy(root->release_agent_path, opts.release_agent);
> out_unlock:
> kfree(opts.release_agent);
> + kfree(opts.name);
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
> unlock_kernel();
> @@ -963,6 +986,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
> INIT_LIST_HEAD(&cgrp->release_list);
> init_rwsem(&cgrp->pids_mutex);
> }
> +
> static void init_cgroup_root(struct cgroupfs_root *root)
> {
> struct cgroup *cgrp = &root->top_cgroup;
> @@ -976,28 +1000,56 @@ static void init_cgroup_root(struct cgroupfs_root *root)
>
> static int cgroup_test_super(struct super_block *sb, void *data)
> {
> - struct cgroupfs_root *new = data;
> + struct cgroup_sb_opts *new = data;
> struct cgroupfs_root *root = sb->s_fs_info;
>
> - /* First check subsystems */
> - if (new->subsys_bits != root->subsys_bits)
> - return 0;
> + /* If we asked for a name then it must match */
> + if (new->name && strcmp(new->name, root->name))
> + return 0;
>
> - /* Next check flags */
> - if (new->flags != root->flags)
> + /* If we asked for subsystems then they must match */
> + if (new->subsys_bits && new->subsys_bits != root->subsys_bits)
> return 0;
>
> return 1;
> }
>
> +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
> +{
> + struct cgroupfs_root *root;
> +
> + if (!opts->subsys_bits)
> + return ERR_PTR(-EINVAL);
> +
> + root = kzalloc(sizeof(*root), GFP_KERNEL);
> + if (!root)
> + return ERR_PTR(-ENOMEM);
> +
> + init_cgroup_root(root);
> + root->subsys_bits = opts->subsys_bits;
> + root->flags = opts->flags;
> + if (opts->release_agent)
> + strcpy(root->release_agent_path, opts->release_agent);
> + if (opts->name)
> + strcpy(root->name, opts->name);
> + opts->created_root = true;
> + return root;
> +}
> +
> static int cgroup_set_super(struct super_block *sb, void *data)
> {
> int ret;
> - struct cgroupfs_root *root = data;
> + struct cgroup_sb_opts *opts = data;
> + struct cgroupfs_root *root;
>
> + root = cgroup_root_from_opts(opts);
> + if (IS_ERR(root))
> + return PTR_ERR(root);
> ret = set_anon_super(sb, NULL);
> - if (ret)
> + if (ret) {
> + kfree(root);
> return ret;
> + }
>
> sb->s_fs_info = root;
> root->sb = sb;
> @@ -1039,44 +1091,23 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> struct cgroup_sb_opts opts;
> int ret = 0;
> struct super_block *sb;
> - struct cgroupfs_root *root;
> - struct list_head tmp_cg_links;
>
> /* First find the desired set of subsystems */
> ret = parse_cgroupfs_options(data, &opts);
> - if (ret) {
> - kfree(opts.release_agent);
> - return ret;
> - }
> -
> - root = kzalloc(sizeof(*root), GFP_KERNEL);
> - if (!root) {
> - kfree(opts.release_agent);
> - return -ENOMEM;
> - }
> -
> - init_cgroup_root(root);
> - root->subsys_bits = opts.subsys_bits;
> - root->flags = opts.flags;
> - if (opts.release_agent) {
> - strcpy(root->release_agent_path, opts.release_agent);
> - kfree(opts.release_agent);
> - }
> + if (ret)
> + goto out_err;
>
> - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
> + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
>
> if (IS_ERR(sb)) {
> - kfree(root);
> - return PTR_ERR(sb);
> + ret = PTR_ERR(sb);
> + goto out_err;
> }
>
> - if (sb->s_fs_info != root) {
> - /* Reusing an existing superblock */
> - BUG_ON(sb->s_root == NULL);
> - kfree(root);
> - root = NULL;
> - } else {
> + if (opts.created_root) {
> /* New superblock */
> + struct cgroupfs_root *root = sb->s_fs_info;
> + struct list_head tmp_cg_links;
> struct cgroup *root_cgrp = &root->top_cgroup;
> struct inode *inode;
> int i;
> @@ -1109,7 +1140,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> if (ret == -EBUSY) {
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&inode->i_mutex);
> - goto free_cg_links;
> + free_cg_links(&tmp_cg_links);
> + goto drop_new_super;
> }
>
> /* EBUSY should be the only error here */
> @@ -1146,12 +1178,17 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> }
>
> simple_set_mnt(mnt, sb);
> + kfree(opts.release_agent);
> + kfree(opts.name);
> return 0;
>
> - free_cg_links:
> - free_cg_links(&tmp_cg_links);
> drop_new_super:
> deactivate_locked_super(sb);
> +
> + out_err:
> + kfree(opts.release_agent);
> + kfree(opts.name);
> +
> return ret;
> }
>
> @@ -2923,6 +2960,9 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
> seq_printf(m, "%lu:", root->subsys_bits);
> for_each_subsys(root, ss)
> seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
> + if (strlen(root->name))
> + seq_printf(m, "%sname=%s",
> + count ? "," : "", root->name);
> seq_putc(m, ':');
> get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
> cgrp = task_cgroup(tsk, subsys_id);
>
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
On Thu, Jul 2, 2009 at 1:09 AM, Louis Rilling<[email protected]> wrote:
>
> I did not see anything preventing two hierarchies from having the same
> (non empty) name.
If you try to mount with a name that matches an already existing
hierarchy, and you leave the subsystem options empty or give options
that match the existing mount, then you get the original mount; if you
specify conflicting options you get an error. I don't think the code
will let you have two hierarchies with the same non-empty name.
Paul
On Thu, Jul 02, 2009 at 01:19:30AM -0700, Paul Menage wrote:
> On Thu, Jul 2, 2009 at 1:09 AM, Louis Rilling<[email protected]> wrote:
> >
> > I did not see anything preventing two hierarchies from having the same
> > (non empty) name.
>
> If you try to mount with a name that matches an already existing
> hierarchy, and you leave the subsystem options empty or give options
> that match the existing mount, then you get the original mount; if you
> specify conflicting options you get an error. I don't think the code
> will let you have two hierarchies with the same non-empty name.
Ah, true. Don't know why I concluded that, even after having read the
relevant portion of the code.
Sorry for the noise.
Louis
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
On Wed, Jul 01, 2009 at 07:11:18PM -0700, Paul Menage wrote:
> [RFC] Remove cgroup_subsys.root pointer
>
> In preparation for supporting cgroup subsystems that can be bound to
> multiple hierarchies, remove the "root" pointer and associated list
> structures. Subsystem hierarchy membership is now determined entirely
> through the subsystem bitmasks in struct cgroupfs_root.
>
> Minor changes include:
> - root_list now includes the inactive root
> - for_each_active_root() -> for_each_root()
> - for_each_subsys() is now guaranteed to be in subsys_id order
>
[...]
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index dede632..8b1b92f 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
[...]
> @@ -191,13 +182,36 @@ static int notify_on_release(const struct cgroup *cgrp)
> * for_each_subsys() allows you to iterate on each subsystem attached to
> * an active hierarchy
> */
> +static inline struct cgroup_subsys *nth_ss(int n)
> +{
> + return (n >= CGROUP_SUBSYS_COUNT) ? NULL : subsys[n];
> +}
> #define for_each_subsys(_root, _ss) \
> -list_for_each_entry(_ss, &_root->subsys_list, sibling)
> +for (_ss = nth_ss(find_first_bit(&(_root)->subsys_bits, CGROUP_SUBSYS_COUNT));\
> + _ss != NULL; \
> + _ss = nth_ss(find_next_bit(&(_root)->subsys_bits, CGROUP_SUBSYS_COUNT, \
> + _ss->subsys_id + 1)))
>
> -/* for_each_active_root() allows you to iterate across the active hierarchies */
> -#define for_each_active_root(_root) \
> +
> +/* for_each_root() allows you to iterate across all hierarchies */
> +#define for_each_root(_root) \
> list_for_each_entry(_root, &roots, root_list)
>
> +/* Find the root for a given subsystem */
> +static struct cgroupfs_root *find_root(struct cgroup_subsys *ss)
> +{
> + int id = ss->subsys_id;
> + struct cgroupfs_root *root, *res = NULL;
> + for_each_root(root) {
> + if (root->subsys_bits && (1UL << id)) {
Should be &, not && ------------------^^
Louis
> + BUG_ON(res);
> + res = root;
> + }
> + }
> + BUG_ON(!res);
> + return res;
> +}
> +
> /* the list of cgroups eligible for automatic release. Protected by
> * release_list_lock */
> static LIST_HEAD(release_list);
[...]
--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes
On Thu, Jul 2, 2009 at 2:04 AM, Louis Rilling<[email protected]> wrote:
>> +/* Find the root for a given subsystem */
>> +static struct cgroupfs_root *find_root(struct cgroup_subsys *ss)
>> +{
>> + ? ? int id = ss->subsys_id;
>> + ? ? struct cgroupfs_root *root, *res = NULL;
>> + ? ? for_each_root(root) {
>> + ? ? ? ? ? ? if (root->subsys_bits && (1UL << id)) {
> Should be &, not && ------------------^^
>
Good catch. This is only used by cgroup_clone() via ns_cgroup, and I
hadn't done any testing with namespace creation and ns_cgroup.
Paul
>>> /*
>>> * A cgroupfs_root represents the root of a cgroup hierarchy,
>>> * and may be associated with a superblock to form an active
>>> @@ -93,6 +95,9 @@ struct cgroupfs_root {
>>>
>>> /* The path to use for release notifications. */
>>> char release_agent_path[PATH_MAX];
>>> +
>>> + /* The name for this hierarchy - may be empty */
>>> + char name[MAX_CGROUP_ROOT_NAMELEN];
>>> };
>>>
>> If you don't want to make cgroupfs_root bigger,
>>
>> cgroupfs_root {
>> ......
>> /* this must be the bottom of struct */
>> char name[0];
>> }
>>
>> Is a choice.
>
> I'd rather avoid something like that since I think it's less readable
> - I'd probably just make the name into a pointer in that case.
>
Whichever choice we make, the length should be limited I think.
>> BTW, reading a patch, any kind of charactors are allowed ?
>
> Yes, other than \000 of course. I guess maybe I should use
> seq_escape() to print the name to avoid confusion in the event that
> people put whitespace in there, or else just ban whitespace (or maybe
> all non-alphanumeric chars).
>
I don't think we need to care about this.
mount -t cgroup -o debug xxx /mnt
"xxx" can be any chars.
> +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
> +{
> + struct cgroupfs_root *root;
> +
> + if (!opts->subsys_bits)
> + return ERR_PTR(-EINVAL);
> +
> + root = kzalloc(sizeof(*root), GFP_KERNEL);
> + if (!root)
> + return ERR_PTR(-ENOMEM);
> +
> + init_cgroup_root(root);
> + root->subsys_bits = opts->subsys_bits;
> + root->flags = opts->flags;
> + if (opts->release_agent)
> + strcpy(root->release_agent_path, opts->release_agent);
> + if (opts->name)
> + strcpy(root->name, opts->name);
> + opts->created_root = true;
> + return root;
> +}
> +
> static int cgroup_set_super(struct super_block *sb, void *data)
> {
> int ret;
> - struct cgroupfs_root *root = data;
> + struct cgroup_sb_opts *opts = data;
> + struct cgroupfs_root *root;
>
> + root = cgroup_root_from_opts(opts);
> + if (IS_ERR(root))
> + return PTR_ERR(root);
> ret = set_anon_super(sb, NULL);
> - if (ret)
> + if (ret) {
> + kfree(root);
> return ret;
> + }
>
> sb->s_fs_info = root;
> root->sb = sb;
> @@ -1039,44 +1091,23 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> struct cgroup_sb_opts opts;
> int ret = 0;
> struct super_block *sb;
> - struct cgroupfs_root *root;
> - struct list_head tmp_cg_links;
>
> /* First find the desired set of subsystems */
> ret = parse_cgroupfs_options(data, &opts);
> - if (ret) {
> - kfree(opts.release_agent);
> - return ret;
> - }
> -
> - root = kzalloc(sizeof(*root), GFP_KERNEL);
> - if (!root) {
> - kfree(opts.release_agent);
> - return -ENOMEM;
> - }
> -
> - init_cgroup_root(root);
> - root->subsys_bits = opts.subsys_bits;
> - root->flags = opts.flags;
> - if (opts.release_agent) {
> - strcpy(root->release_agent_path, opts.release_agent);
> - kfree(opts.release_agent);
> - }
> + if (ret)
> + goto out_err;
>
> - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
> + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
>
cgroup_set_super() is called with spinlock held, so we
can't do kmalloc(GFP_KERNEL).
sget(...)
{
spin_lock(&sb_lock);
...
err = set(s, data);
if (err) {
spin_unlock(&sb_lock);
...
}
> if (IS_ERR(sb)) {
> - kfree(root);
> - return PTR_ERR(sb);
> + ret = PTR_ERR(sb);
> + goto out_err;
> }
>
> - if (sb->s_fs_info != root) {
> - /* Reusing an existing superblock */
> - BUG_ON(sb->s_root == NULL);
> - kfree(root);
> - root = NULL;
> - } else {
> + if (opts.created_root) {
> /* New superblock */
> + struct cgroupfs_root *root = sb->s_fs_info;
> + struct list_head tmp_cg_links;
> struct cgroup *root_cgrp = &root->top_cgroup;
> struct inode *inode;
> int i;
> @@ -1109,7 +1140,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> if (ret == -EBUSY) {
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&inode->i_mutex);
> - goto free_cg_links;
> + free_cg_links(&tmp_cg_links);
> + goto drop_new_super;
> }
>
> /* EBUSY should be the only error here */
> @@ -1146,12 +1178,17 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> }
>
> simple_set_mnt(mnt, sb);
> + kfree(opts.release_agent);
> + kfree(opts.name);
> return 0;
>
> - free_cg_links:
> - free_cg_links(&tmp_cg_links);
> drop_new_super:
> deactivate_locked_super(sb);
> +
> + out_err:
> + kfree(opts.release_agent);
> + kfree(opts.name);
> +
> return ret;
> }
> +static int current_css_set_cg_links_read(struct cgroup *cont,
> + struct cftype *cft,
> + struct seq_file *seq)
> +{
> + struct cg_cgroup_link *link, *saved_link;
> + struct css_set *cg;
call for a newline
> + write_lock_irq(&css_set_lock);
can be read_lock(&css_set_lock);
> + task_lock(current);
> + cg = current->cgroups;
> + list_for_each_entry_safe(link, saved_link, &cg->cg_links,
> + cg_link_list) {
> + struct cgroup *c = link->cgrp;
> + const char *name;
call for a newline
> + rcu_read_lock();
> + if (c->dentry)
> + name = c->dentry->d_name.name;
> + else
> + name = "?";
> + seq_printf(seq, "Root %lu group %s\n",
> + c->root->subsys_bits, name);
> + rcu_read_unlock();
> + }
> + task_unlock(current);
> + write_unlock_irq(&css_set_lock);
> + return 0;
> +}
> +
> +#define MAX_TASKS_SHOWN_PER_CSS 25
> +static int cgroup_css_links_read(struct cgroup *cont,
> + struct cftype *cft,
> + struct seq_file *seq)
> +{
> + struct cg_cgroup_link *link, *saved_link;
need a newline
> + write_lock_irq(&css_set_lock);
can be read_lock(&css_set_lock);
> + list_for_each_entry_safe(link, saved_link, &cont->css_sets,
can be list_for_each_entry()
> + cgrp_link_list) {
> + struct css_set *cg = link->cg;
> + struct task_struct *task, *saved_task;
> + int count = 0;
> + seq_printf(seq, "css_set %p\n", cg);
> + list_for_each_entry_safe(task, saved_task, &cg->tasks,
ditto
> + cg_list) {
> + if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
> + seq_puts(seq, " ...\n");
> + break;
> + } else {
> + seq_printf(seq, " task %d\n",
> + task_pid_vnr(task));
> + }
> + }
> + }
> + write_unlock_irq(&css_set_lock);
> + return 0;
> +}
Paul Menage wrote:
> [RFC] Support multiply-bindable cgroup subsystems
>
> This patch allows a cgroup subsystem to be marked as bindable on
> multiple cgroup hierarchies independently, when declared in
> cgroup_subsys.h via MULTI_SUBSYS() rather than SUBSYS().
>
> The state for such subsystems cannot be accessed directly from a
> task->cgroups (since there's no unique mapping for a task) but instead
> must be accessed via a particular control group object.
>
> Multiply-bound subsystems are useful in cases where there's no direct
> correspondence between the cgroup configuration and some property of
> the kernel outside of the cgroups subsystem. So this would not be
> applicable to e.g. the CFS cgroup, since there has to a unique mapping
> from a task to its CFS run queue.
>
> As an example, the "debug" subsystem is marked multiply-bindable,
> since it has no state outside the cgroups framework itself.
>
Great, this makes the debug subsystem more useful. Sometimes
I want to see some debug info in different hierarchies, but
I can't just because it can only be bound to one hierarchy.
> Example usage:
>
> mount -t cgroup -o name=foo,debug,cpu cgroup /mnt1
> mount -t cgroup -o name=bar,debug,memory cgroup /mnt2
>
> Open Issues:
>
> - in the current version of this patch, mounting a cgroups hierarchy
> with no options does *not* get you any of the multi-bindable
> subsystems; possibly for consistency it should give you all of the
> multi-bindable subsystems as well as all of the single-bindable
> subsystems.
>
Yeah, the latter is preferrable.
> - how can we avoid the checkpatch.pl errors due to creative use of
> macros to generate enum names?
checkpatch.pl can sometimes generate false-positive, let's happily
ignore those "errors". ;)
But it whould be better if those macros can be handled in a cleaner way.
Paul Menage wrote:
> [RFC] Example multi-bindable subsystem: a per-cgroup notes field
>
> As an example of a multiply-bindable subsystem, this patch introduces
> the "info" subsystem, which provides a single file, "info.notes", in
> which user-space middleware can store an arbitrary (by default up to
> one page) binary string representing configuration data about that
> cgroup. This reduces the need to keep additional state outside the
> cgroup filesystem. The maximum notes size for a hierarchy can be set
> by updating the "info.size" file in the root cgroup.
>
> Signed-off-by: Paul Menage <[email protected]>
>
> ---
>
> include/linux/cgroup_subsys.h | 6 ++
> init/Kconfig | 9 +++
> kernel/Makefile | 1
> kernel/info_cgroup.c | 133 +++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 149 insertions(+), 0 deletions(-)
> create mode 100644 kernel/info_cgroup.c
>
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index f78605e..5dfea38 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -60,3 +60,9 @@ SUBSYS(net_cls)
> #endif
>
> /* */
> +
> +#ifdef CONFIG_CGROUP_INFO
> +MULTI_SUBSYS(info)
> +#endif
> +
> +/* */
> diff --git a/init/Kconfig b/init/Kconfig
> index d904d6c..3bd4685 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -604,6 +604,15 @@ config CGROUP_MEM_RES_CTLR_SWAP
> Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
> size is 4096bytes, 512k per 1Gbytes of swap.
>
> +config CGROUP_INFO
> + bool "Simple application-specific info cgroup subsystem"
> + depends on CGROUPS
> + help
> + Provides a simple cgroups subsystem with an "info.notes"
> + field, which can be used by middleware to store
> + application-specific configuration data about a cgroup. Can
> + be mounted on multiple hierarchies at once.
> +
> endif # CGROUPS
>
> config MM_OWNER
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 7ffdc16..e713a67 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -61,6 +61,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
> obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
> obj-$(CONFIG_CPUSETS) += cpuset.o
> obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
> +obj-$(CONFIG_CGROUP_INFO) += info_cgroup.o
> obj-$(CONFIG_UTS_NS) += utsname.o
> obj-$(CONFIG_USER_NS) += user_namespace.o
> obj-$(CONFIG_PID_NS) += pid_namespace.o
> diff --git a/kernel/info_cgroup.c b/kernel/info_cgroup.c
> new file mode 100644
> index 0000000..34cfdb8
> --- /dev/null
> +++ b/kernel/info_cgroup.c
> @@ -0,0 +1,133 @@
> +/*
> + * info_cgroup.c - simple cgroup providing a "notes" field
> + */
> +
> +#include "linux/cgroup.h"
> +#include "linux/err.h"
> +#include "linux/seq_file.h"
> +
#include <linux/xxx>
And I got compile error, because of missing
#include <linux/uaccess.h>
> +struct info_cgroup {
> + struct cgroup_subsys_state css;
> + /* notes string for this cgroup */
> + const char *notes;
> + size_t len;
> + /*
> + * size limit for notes in this hierarchy. Only relevant for
> + * the root cgroup. Not synchronized since it's a single word
> + * value and writes to it never depend on previously read
> + * values.
> + */
> + size_t max_len;
If it's not per cgroup, it can be a global value.
But why not make it per cgroup?
> + spinlock_t lock;
> +};
> +
> +static inline struct info_cgroup *cg_info(struct cgroup *cg)
> +{
> + return container_of(cgroup_subsys_state(cg, info_subsys_id),
> + struct info_cgroup, css);
> +}
> +
> +static struct cgroup_subsys_state *info_create(struct cgroup_subsys *ss,
> + struct cgroup *cg)
> +{
> + struct info_cgroup *info = kzalloc(sizeof(*info), GFP_KERNEL);
newline needed
> + if (!info)
> + return ERR_PTR(-ENOMEM);
> + spin_lock_init(&info->lock);
> + if (!cg->parent)
> + info->max_len = PAGE_SIZE;
> + return &info->css;
> +}
> +
> +static void info_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
> +{
> + struct info_cgroup *css = cg_info(cont);
newline needed
> + kfree(css->notes);
> + kfree(css);
> +}
> +
> +
> +static int info_read(struct cgroup *cont,
> + struct cftype *cft,
> + struct seq_file *seq)
> +{
> + struct info_cgroup *css = cg_info(cont);
newline needed
> + spin_lock(&css->lock);
> + if (css->notes)
> + seq_write(seq, css->notes, css->len);
> + spin_unlock(&css->lock);
> + return 0;
> +}
> +
> +/*
> + * Use a custom write function so that we can handle binary data
> + */
> +
> +static ssize_t info_write(struct cgroup *cgrp, struct cftype *cft,
> + struct file *file,
> + const char __user *userbuf,
> + size_t nbytes, loff_t *unused_ppos) {
> + struct info_cgroup *css = cg_info(cgrp);
> + char *notes = NULL;
newline needed
> + if (nbytes > cg_info(cgrp->top_cgroup)->max_len)
> + return -E2BIG;
> + if (nbytes) {
> + notes = kmalloc(nbytes, GFP_USER);
> + if (!notes)
> + return -ENOMEM;
> + if (copy_from_user(notes, userbuf, nbytes))
missing kfree(notes)
> + return -EFAULT;
> + }
> +
> + spin_lock(&css->lock);
> + kfree(css->notes);
> + css->notes = notes;
> + css->len = nbytes;
> + spin_unlock(&css->lock);
> + return nbytes;
> +}
> +
> +static u64 notes_size_read(struct cgroup *cont, struct cftype *cft)
> +{
> + struct info_cgroup *css = cg_info(cont);
> + return css->max_len;
> +}
> +
> +static int notes_size_write(struct cgroup *cont, struct cftype *cft, u64 val)
> +{
> + struct info_cgroup *css = cg_info(cont);
> + css->max_len = val;
> + return 0;
> +}
> +
> +static struct cftype info_files[] = {
> + {
> + .name = "notes",
> + .read_seq_string = info_read,
> + .write = info_write,
> + },
> +};
> +
> +static struct cftype info_root_files[] = {
> + {
> + .name = "size",
> + .read_u64 = notes_size_read,
> + .write_u64 = notes_size_write,
> + },
> +};
> +
> +static int info_populate(struct cgroup_subsys *ss, struct cgroup *cont)
> +{
> + if (!cont->parent)
> + cgroup_add_files(cont, ss, info_root_files,
> + ARRAY_SIZE(info_root_files));
> + return cgroup_add_files(cont, ss, info_files, ARRAY_SIZE(info_files));
> +}
> +
> +struct cgroup_subsys info_subsys = {
> + .name = "info",
> + .create = info_create,
> + .destroy = info_destroy,
> + .populate = info_populate,
> + .subsys_id = info_subsys_id,
> +};
>
>
>
On Thu, Jul 2, 2009 at 7:32 PM, Li Zefan<[email protected]> wrote:
>
> cgroup_set_super() is called with spinlock held, so we
> can't do kmalloc(GFP_KERNEL).
Good point. I'll have to go back to pre-allocating the root before
calling sget()
Paul
On Fri, Jul 3, 2009 at 1:58 AM, Li Zefan<[email protected]> wrote:
>
> If it's not per cgroup, it can be a global value.
No it can't, since the notes subsystem can be mounted on multiple
hierarchies simultaneously.
> But why not make it per cgroup?
Setting notes.size to a large value allows userspace to create large
kmalloc'd objects; so if it's per-cgroup, then it's not safe to
delegate control of part of a cgroups hierarchy to untrusted users.
(E.g. we want to be able to give an untrusted user process the power
to create sub-cgroups in the CPU scheduler hierarchy, so that it can
give different CPU guarantees to each of its threadpools).
I guess that an alternative would be to have a per-cgroup size field,
and use the min of the cgroup and all its ancestors when doing length
checking.
>> + ? ? if (nbytes > cg_info(cgrp->top_cgroup)->max_len)
>> + ? ? ? ? ? ? return -E2BIG;
>> + ? ? if (nbytes) {
>> + ? ? ? ? ? ? notes = kmalloc(nbytes, GFP_USER);
>> + ? ? ? ? ? ? if (!notes)
>> + ? ? ? ? ? ? ? ? ? ? return -ENOMEM;
>> + ? ? ? ? ? ? if (copy_from_user(notes, userbuf, nbytes))
>
> missing kfree(notes)
>
Good catch, thanks. Fixed (and all the newlines added).
Paul
OK, updated for the next version of these patches.
Thanks,
Paul
On Fri, Jul 3, 2009 at 12:57 AM, Li Zefan<[email protected]> wrote:
>> +#define MAX_TASKS_SHOWN_PER_CSS 25
>> +static int cgroup_css_links_read(struct cgroup *cont,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct cftype *cft,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?struct seq_file *seq)
>> +{
>> + ? ? struct cg_cgroup_link *link, *saved_link;
>
> need a newline
>
>> + ? ? write_lock_irq(&css_set_lock);
>
> can be read_lock(&css_set_lock);
>
>> + ? ? list_for_each_entry_safe(link, saved_link, &cont->css_sets,
>
> can be list_for_each_entry()
>
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?cgrp_link_list) {
>> + ? ? ? ? ? ? struct css_set *cg = link->cg;
>> + ? ? ? ? ? ? struct task_struct *task, *saved_task;
>> + ? ? ? ? ? ? int count = 0;
>> + ? ? ? ? ? ? seq_printf(seq, "css_set %p\n", cg);
>> + ? ? ? ? ? ? list_for_each_entry_safe(task, saved_task, &cg->tasks,
>
> ditto
>
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?cg_list) {
>> + ? ? ? ? ? ? ? ? ? ? if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? seq_puts(seq, " ?...\n");
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? break;
>> + ? ? ? ? ? ? ? ? ? ? } else {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? seq_printf(seq, " ?task %d\n",
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?task_pid_vnr(task));
>> + ? ? ? ? ? ? ? ? ? ? }
>> + ? ? ? ? ? ? }
>> + ? ? }
>> + ? ? write_unlock_irq(&css_set_lock);
>> + ? ? return 0;
>> +}
>
On Fri, Jul 3, 2009 at 12:07 AM, Li Zefan<[email protected]> wrote:
>
>> + ? ? write_lock_irq(&css_set_lock);
>
> can be read_lock(&css_set_lock);
I simplified the whole thing by removing the task_lock(current) and
expanding the RCU section to cover the point where we read
current->cgroups too.
Paul