Hi,
cgroups that use css_id only have this number assigned after ->create()
returns. This means it is not possible to use this number for anything during
cgroup initialization.
There are situations in which it may come handy: in the kmemcg-slab patchset,
for instance, each memcg that has a limit on kernel memory will use its css_id
as an index in an array of per-memcg kmem_caches. This array increases in size
(in bulks, of course) as the maximum possible index grows, so we need to be
aware of that early.
This could be achieved with a hook called after ->create(). Such a hook exists:
this is ->post_clone(). However, this is called only for cgroups that set the
clone_children flag. I believe a more general hook is useful, and the following
two patches generalize post_clone() into a hook that is always called after
create.
As I mentioned, I intend to do memory allocations from that hook so it can fail
(in patch 2, for simplicity).
I consider a general hook acceptable and useful, and is the simplest solution to
the problem I face. Let me know what you guys think of it.
Glauber Costa (2):
generalize post_clone into post_create
allow post_create to fail
Documentation/cgroups/cgroups.txt | 13 +++++++------
include/linux/cgroup.h | 7 ++++++-
kernel/cgroup.c | 12 +++++-------
kernel/cpuset.c | 19 +++++++++++--------
4 files changed, 29 insertions(+), 22 deletions(-)
--
1.7.11.7
When we call create_cgroup(), the css_ids are not yet initialized. The
only creation-time hook that is called with everything already setup is
post_clone().
However, post_clone is too fragile, in the sense that whether or not it
will be called depends on flag that can be switched on or off at
userspace will. So if any cgroup wants to do any mandatory
initialization, this won't hold.
The proposal of this patch is to generalize this into "post_create()",
which will always be called after create. The subsystem may then check
for the clone_children flag itself, and act accordingly. The cpuset
controller is currently the only in-tree user, and is converted.
Signed-off-by: Glauber Costa <[email protected]>
CC: Tejun Heo <[email protected]>
CC: Michal Hocko <[email protected]>
CC: Li Zefan <[email protected]>
---
Documentation/cgroups/cgroups.txt | 13 +++++++------
include/linux/cgroup.h | 7 ++++++-
kernel/cgroup.c | 9 ++-------
kernel/cpuset.c | 15 +++++++++------
4 files changed, 24 insertions(+), 20 deletions(-)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 4a0b64c..5bc08f1 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -298,11 +298,12 @@ a cgroup hierarchy's release_agent path is empty.
1.5 What does clone_children do ?
---------------------------------
-If the clone_children flag is enabled (1) in a cgroup, then all
-cgroups created beneath will call the post_clone callbacks for each
-subsystem of the newly created cgroup. Usually when this callback is
-implemented for a subsystem, it copies the values of the parent
-subsystem, this is the case for the cpuset.
+If the clone_children flag is enabled (1) in a cgroup, the group may perform
+specific initialization on its attributes based on the values in the parent,
+for each subsystem of the newly created cgroup that implements the
+post_create() callback. Usually when this callback is implemented for a
+subsystem, it copies the values of the parent subsystem, this is the case for
+the cpuset.
1.6 How do I use cgroups ?
--------------------------
@@ -634,7 +635,7 @@ void exit(struct task_struct *task)
Called during task exit.
-void post_clone(struct cgroup *cgrp)
+void post_create(struct cgroup *cgrp)
(cgroup_mutex held by caller)
Called during cgroup_create() to do any parameter
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 68e8df7..7f422ba 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -218,6 +218,11 @@ struct cgroup {
spinlock_t event_list_lock;
};
+static inline bool clone_children(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+}
+
/*
* A css_set is a structure holding pointers to a set of
* cgroup_subsys_state objects. This saves space in the task struct
@@ -472,7 +477,7 @@ struct cgroup_subsys {
void (*fork)(struct task_struct *task);
void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
struct task_struct *task);
- void (*post_clone)(struct cgroup *cgrp);
+ void (*post_create)(struct cgroup *cgrp);
void (*bind)(struct cgroup *root);
int subsys_id;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b7d9606..40caab1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -292,11 +292,6 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
-static int clone_children(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -3968,8 +3963,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_destroy;
}
/* At error, ->destroy() callback has to free assigned ID. */
- if (clone_children(parent) && ss->post_clone)
- ss->post_clone(cgrp);
+ if (ss->post_create)
+ ss->post_create(cgrp);
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c715..ca97af8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1784,9 +1784,9 @@ static struct cftype files[] = {
};
/*
- * post_clone() is called during cgroup_create() when the
- * clone_children mount argument was specified. The cgroup
- * can not yet have any tasks.
+ * post_create() is called during cgroup_create() if create succeeds. The
+ * cgroup can not yet have any tasks. In here, we will take action based
+ * on the value of the clone_children flag.
*
* Currently we refuse to set up the cgroup - thereby
* refusing the task to be entered, and as a result refusing
@@ -1794,16 +1794,19 @@ static struct cftype files[] = {
* sibling cpusets have exclusive cpus or mem.
*
* If this becomes a problem for some users who wish to
- * allow that scenario, then cpuset_post_clone() could be
+ * allow that scenario, then cpuset_post_create() could be
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
* (and likewise for mems) to the new cgroup. Called with cgroup_mutex
* held.
*/
-static void cpuset_post_clone(struct cgroup *cgroup)
+static void cpuset_post_create(struct cgroup *cgroup)
{
struct cgroup *parent, *child;
struct cpuset *cs, *parent_cs;
+ if (!clone_children(cgroup))
+ return;
+
parent = cgroup->parent;
list_for_each_entry(child, &parent->children, sibling) {
cs = cgroup_cs(child);
@@ -1882,7 +1885,7 @@ struct cgroup_subsys cpuset_subsys = {
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
.attach = cpuset_attach,
- .post_clone = cpuset_post_clone,
+ .post_create = cpuset_post_create,
.subsys_id = cpuset_subsys_id,
.base_cftypes = files,
.early_init = 1,
--
1.7.11.7
Initialization in post_create can theoretically fail (although it won't
in cpuset). The comment in cgroup.c even seem to indicate that the
possibility of failure was the intention.
It is not terribly complicated, so let us just allow it to fail.
Signed-off-by: Glauber Costa <[email protected]>
CC: Tejun Heo <[email protected]>
CC: Michal Hocko <[email protected]>
CC: Li Zefan <[email protected]>
---
include/linux/cgroup.h | 2 +-
kernel/cgroup.c | 7 +++++--
kernel/cpuset.c | 8 ++++----
3 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7f422ba..34cb906 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -477,7 +477,7 @@ struct cgroup_subsys {
void (*fork)(struct task_struct *task);
void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
struct task_struct *task);
- void (*post_create)(struct cgroup *cgrp);
+ int (*post_create)(struct cgroup *cgrp);
void (*bind)(struct cgroup *root);
int subsys_id;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 40caab1..20a1422 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3963,8 +3963,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_destroy;
}
/* At error, ->destroy() callback has to free assigned ID. */
- if (ss->post_create)
- ss->post_create(cgrp);
+ if (ss->post_create) {
+ err = ss->post_create(cgrp);
+ if (err)
+ goto err_destroy;
+ }
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ca97af8..e7623ae 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1799,19 +1799,19 @@ static struct cftype files[] = {
* (and likewise for mems) to the new cgroup. Called with cgroup_mutex
* held.
*/
-static void cpuset_post_create(struct cgroup *cgroup)
+static int cpuset_post_create(struct cgroup *cgroup)
{
struct cgroup *parent, *child;
struct cpuset *cs, *parent_cs;
if (!clone_children(cgroup))
- return;
+ return 0;
parent = cgroup->parent;
list_for_each_entry(child, &parent->children, sibling) {
cs = cgroup_cs(child);
if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
- return;
+ return 0;
}
cs = cgroup_cs(cgroup);
parent_cs = cgroup_cs(parent);
@@ -1820,7 +1820,7 @@ static void cpuset_post_create(struct cgroup *cgroup)
cs->mems_allowed = parent_cs->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
mutex_unlock(&callback_mutex);
- return;
+ return 0;
}
/*
--
1.7.11.7