Hello,
This patchset contains the following four fixes found during
conversion to kernfs. 0001-0003 are marked with -stable.
0001-cgroup-fix-error-return-value-in-cgroup_mount.patch
0002-cgroup-fix-error-return-from-cgroup_create.patch
0003-cgroup-fix-locking-in-cgroup_cfts_commit.patch
0004-cgroup-move-the-one-off-opts-sanity-check-in-cgroup_.patch
The patches are also available in the following git branch.
git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git review-3.14-fixes
diffstat follows.
kernel/cgroup.c | 46 ++++++++++++++++++----------------------------
1 file changed, 18 insertions(+), 28 deletions(-)
Thanks.
--
tejun
cgroup_create() was returning 0 after allocation failures. Fix it.
Signed-off-by: Tejun Heo <[email protected]>
Cc: [email protected]
---
kernel/cgroup.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 364aeb22..2e9f12a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
struct cgroup *cgrp;
struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
- int ssid, err = 0;
+ int ssid, err;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
@@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
return -ENOMEM;
name = cgroup_alloc_name(dentry);
- if (!name)
+ if (!name) {
+ err = -ENOMEM;
goto err_free_cgrp;
+ }
rcu_assign_pointer(cgrp->name, name);
/*
@@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
* a half-baked cgroup.
*/
cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
- if (cgrp->id < 0)
+ if (cgrp->id < 0) {
+ err = -ENOMEM;
goto err_free_name;
+ }
/*
* Only live parents can have children. Note that the liveliness
--
1.8.5.3
cgroup_root_from_opts() checks whether (!opts->subsys_mask &&
!opts->none) and returns NULL if so. After that, if allocation fails,
returns ERR_PTR(-ENOMEM). The caller, cgroup_mount(), doesn't treat
NULL as an error but set opts.new_root to NULL; however, later on,
cgroup_set_super() fails with -EINVAL if new_root is NULL.
This is one bizarre error handling sequence especially when all other
opts sanity checks including the very close (!opts->subsys_mask &&
!opts->name) check are done in parse_cgroupfs_options().
Let's move the one-off check in cgroup_root_from_opts() to
parse_cgroupfs_options() where it can be combined with the
(!opts->subsys_mask && !opts->name) check. cgroup_root_from_opts() is
updated to return NULL on memory allocation failure.
Signed-off-by: Tejun Heo <[email protected]>
---
kernel/cgroup.c | 19 ++++++-------------
1 file changed, 6 insertions(+), 13 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b0e030f..5c8fe40 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1265,10 +1265,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -EINVAL;
/*
- * We either have to specify by name or by subsystems. (So all
- * empty hierarchies must have a name).
+ * We either have to specify by name or by subsystems. All empty
+ * hierarchies must have a name and also "none" option specified.
*/
- if (!opts->subsys_mask && !opts->name)
+ if (!opts->subsys_mask && (!opts->name || !opts->none))
return -EINVAL;
return 0;
@@ -1417,12 +1417,9 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
struct cgroupfs_root *root;
- if (!opts->subsys_mask && !opts->none)
- return NULL;
-
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root)
- return ERR_PTR(-ENOMEM);
+ return NULL;
init_cgroup_root(root);
@@ -1461,10 +1458,6 @@ static int cgroup_set_super(struct super_block *sb, void *data)
int ret;
struct cgroup_sb_opts *opts = data;
- /* If we don't have a new root, we can't set up a new sb */
- if (!opts->new_root)
- return -EINVAL;
-
BUG_ON(!opts->subsys_mask && !opts->none);
ret = set_anon_super(sb, NULL);
@@ -1532,8 +1525,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
* reusing an existing hierarchy.
*/
new_root = cgroup_root_from_opts(&opts);
- if (IS_ERR(new_root)) {
- ret = PTR_ERR(new_root);
+ if (!new_root) {
+ ret = -ENOMEM;
goto out_err;
}
opts.new_root = new_root;
--
1.8.5.3
cgroup_cfts_commit() walks the cgroup hierarchy that the target
subsystem is attached to and tries to apply the file changes. Due to
the convolution with inode locking, it can't keep cgroup_mutex locked
while iterating. It currently holds only RCU read lock around the
actual iteration and then pins the found cgroup using dget().
Unfortunately, this is incorrect. Although the iteration does check
cgroup_is_dead() before invoking dget(), there's nothing which
prevents the dentry from going away inbetween. Note that this is
different from the usual css iterations where css_tryget() is used to
pin the css - css_tryget() tests whether the css can be pinned and
fails if not.
The problem can be solved by simply holding cgroup_mutex instead of
RCU read lock around the iteration, which actually reduces LOC.
Signed-off-by: Tejun Heo <[email protected]>
Cc: [email protected]
---
kernel/cgroup.c | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2e9f12a..b0e030f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
*/
update_before = cgroup_serial_nr_next;
- mutex_unlock(&cgroup_mutex);
-
/* add/rm files for all cgroups created before */
- rcu_read_lock();
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
struct cgroup *cgrp = css->cgroup;
@@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
inode = cgrp->dentry->d_inode;
dget(cgrp->dentry);
- rcu_read_unlock();
-
dput(prev);
prev = cgrp->dentry;
+ mutex_unlock(&cgroup_mutex);
mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);
if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
ret = cgroup_addrm_files(cgrp, cfts, is_add);
- mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
-
- rcu_read_lock();
if (ret)
break;
}
- rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
dput(prev);
deactivate_super(sb);
return ret;
--
1.8.5.3
When cgroup_mount() fails to allocate an id for the root, it didn't
set ret before jumping to unlock_drop ending up returning 0 after a
failure. Fix it.
Signed-off-by: Tejun Heo <[email protected]>
Cc: [email protected]
---
kernel/cgroup.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2f46ba..364aeb22 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
- root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
- 0, 1, GFP_KERNEL);
- if (root_cgrp->id < 0)
+ ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+ if (ret < 0)
goto unlock_drop;
+ root_cgrp->id = ret;
/* Check for name clashes with existing mounts */
ret = -EBUSY;
--
1.8.5.3
On 2014/1/28 23:32, Tejun Heo wrote:
> When cgroup_mount() fails to allocate an id for the root, it didn't
> set ret before jumping to unlock_drop ending up returning 0 after a
> failure. Fix it.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Cc: [email protected]
Acked-by: Li Zefan <[email protected]>
On 2014/1/28 23:32, Tejun Heo wrote:
> cgroup_create() was returning 0 after allocation failures. Fix it.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Cc: [email protected]
Acked-by: Li Zefan <[email protected]>
On 2014/1/28 23:32, Tejun Heo wrote:
> cgroup_cfts_commit() walks the cgroup hierarchy that the target
> subsystem is attached to and tries to apply the file changes. Due to
> the convolution with inode locking, it can't keep cgroup_mutex locked
> while iterating. It currently holds only RCU read lock around the
> actual iteration and then pins the found cgroup using dget().
>
> Unfortunately, this is incorrect. Although the iteration does check
> cgroup_is_dead() before invoking dget(), there's nothing which
> prevents the dentry from going away inbetween. Note that this is
> different from the usual css iterations where css_tryget() is used to
> pin the css - css_tryget() tests whether the css can be pinned and
> fails if not.
>
> The problem can be solved by simply holding cgroup_mutex instead of
> RCU read lock around the iteration, which actually reduces LOC.
>
> Signed-off-by: Tejun Heo <[email protected]>
> Cc: [email protected]
Good catch!
Acked-by: Li Zefan <[email protected]>
On 2014/1/28 23:32, Tejun Heo wrote:
> cgroup_root_from_opts() checks whether (!opts->subsys_mask &&
> !opts->none) and returns NULL if so. After that, if allocation fails,
> returns ERR_PTR(-ENOMEM). The caller, cgroup_mount(), doesn't treat
> NULL as an error but set opts.new_root to NULL; however, later on,
> cgroup_set_super() fails with -EINVAL if new_root is NULL.
This patch changes mount semantics.
If cgroup_root_from_opts() returns NULL, it means we should be looking
for existing superblock only.
This will fail:
# mount -t cgroup -o name=abc xxx /mnt
But this is ok:
# mount -t cgroup -o none,name=abc xxx /mnt
# mkdir /mnt/sub
# umount /mnt
# mount -t cgroup -o name=abc xxx /mnt <-- this won't work with your patch
>
> This is one bizarre error handling sequence especially when all other
> opts sanity checks including the very close (!opts->subsys_mask &&
> !opts->name) check are done in parse_cgroupfs_options().
>
> Let's move the one-off check in cgroup_root_from_opts() to
> parse_cgroupfs_options() where it can be combined with the
> (!opts->subsys_mask && !opts->name) check. cgroup_root_from_opts() is
> updated to return NULL on memory allocation failure.
>
> Signed-off-by: Tejun Heo <[email protected]>
On Wed, Jan 29, 2014 at 12:02:44PM +0800, Li Zefan wrote:
> On 2014/1/28 23:32, Tejun Heo wrote:
> > cgroup_root_from_opts() checks whether (!opts->subsys_mask &&
> > !opts->none) and returns NULL if so. After that, if allocation fails,
> > returns ERR_PTR(-ENOMEM). The caller, cgroup_mount(), doesn't treat
> > NULL as an error but set opts.new_root to NULL; however, later on,
> > cgroup_set_super() fails with -EINVAL if new_root is NULL.
>
> This patch changes mount semantics.
>
> If cgroup_root_from_opts() returns NULL, it means we should be looking
> for existing superblock only.
>
> This will fail:
>
> # mount -t cgroup -o name=abc xxx /mnt
>
> But this is ok:
>
> # mount -t cgroup -o none,name=abc xxx /mnt
> # mkdir /mnt/sub
> # umount /mnt
> # mount -t cgroup -o name=abc xxx /mnt <-- this won't work with your patch
Ewwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww....
urgghhghghghhh.............
Alright, dropping this patch and will update later patches to maintain
the behavior.
Thanks.
--
tejun
On Tue, Jan 28, 2014 at 10:32:01AM -0500, Tejun Heo wrote:
> Hello,
>
> This patchset contains the following four fixes found during
> conversion to kernfs. 0001-0003 are marked with -stable.
>
> 0001-cgroup-fix-error-return-value-in-cgroup_mount.patch
> 0002-cgroup-fix-error-return-from-cgroup_create.patch
> 0003-cgroup-fix-locking-in-cgroup_cfts_commit.patch
> 0004-cgroup-move-the-one-off-opts-sanity-check-in-cgroup_.patch
Applied 0001-0003 to cgroup/for-3.14-fixes. Thanks!
--
tejun