After running:
# mount -t cgroup cpu xxx /cgroup && mkdir /cgroup/sub && \
rmdir /cgroup/sub && umount /cgroup
I found the cgroup root still existed:
# cat /proc/cgroups
#subsys_name hierarchy num_cgroups enabled
cpuset 0 1 1
cpu 1 1 1
...
It turned out css_has_online_children() is broken.
Signed-off-by: Li Zefan <[email protected]>
---
kernel/cgroup.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 05b8ca4..1c65f24 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3327,7 +3327,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
rcu_read_lock();
css_for_each_child(child, css) {
- if (css->flags & CSS_ONLINE) {
+ if (child->flags & CSS_ONLINE) {
ret = true;
break;
}
--
1.8.0.2
This is used to check if the percpu_ref has been killed.
Signed-off-by: Li Zefan <[email protected]>
---
include/linux/percpu-refcount.h | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index dba35c4..1d5f2b3 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -96,6 +96,17 @@ static inline void percpu_ref_kill(struct percpu_ref *ref)
#define REF_STATUS(count) (((unsigned long) count) & PCPU_STATUS_MASK)
/**
+ * percpu_ref_alive - check if the ref is still alive
+ * @ref: percpu_ref to check
+ *
+ * Return true if percpu_ref_kill() has not been called to drop the initial ref.
+ */
+static inline bool percpu_ref_alive(struct percpu_ref *ref)
+{
+ return !(REF_STATUS(ref->pcpu_count) == PCPU_REF_DEAD);
+}
+
+/**
* percpu_ref_get - increment a percpu refcount
* @ref: percpu_ref to get
*
--
1.8.0.2
# cat test.sh
#! /bin/bash
mount -t cgroup -o cpu xxx /cgroup
umount /cgroup
mount -t cgroup -o cpu,cpuacct xxx /cgroup
umount /cgroup
# ./test.sh
mount: xxx already mounted or /cgroup busy
mount: according to mtab, xxx is already mounted on /cgroup
It's because the cgroupfs_root of the first mount was under destruction
asynchronously.
Fix this by delaying and then retrying mount in this case.
Signed-off-by: Li Zefan <[email protected]>
---
kernel/cgroup.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1c65f24..bd37e8d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
struct dentry *dentry;
int ret;
+ int i;
bool new_sb;
/*
@@ -1677,6 +1679,22 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
+ /*
+ * If some subsystems have been bound to existing cgroup hierarchies,
+ * but those hierarchies are being destroyed, let's wait a little bit
+ * and retry.
+ */
+ for_each_subsys(ss, i) {
+ if (!(opts.subsys_mask & (1 << i)))
+ continue;
+ if (!percpu_ref_alive(&ss->root->cgrp.self.refcnt)) {
+ mutex_unlock(&cgroup_mutex);
+ msleep(10);
+ ret = restart_syscall();
+ goto out_free;
+ }
+ }
+
for_each_root(root) {
bool name_match = false;
--
1.8.0.2
kernfs_pin_sb() tries to get a refcnt of the superblock, while
kernfs_drop_sb() drops this refcnt.
This will be used by cgroupfs.
Signed-off-by: Li Zefan <[email protected]>
---
fs/kernfs/mount.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/kernfs.h | 3 +++
2 files changed, 48 insertions(+)
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f25a7c0..4f924e0 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -210,6 +210,51 @@ void kernfs_kill_sb(struct super_block *sb)
kernfs_put(root_kn);
}
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations.
+ */
+bool kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+ struct kernfs_super_info *info;
+ int ret = false;
+
+ mutex_lock(&kernfs_mutex);
+ list_for_each_entry(info, &root->supers, node) {
+ if (info->ns == ns) {
+ ret = atomic_inc_not_zero(&info->sb->s_active);
+ break;
+ }
+ }
+ mutex_unlock(&kernfs_mutex);
+ return ret;
+}
+
+/**
+ * kernfs_drop_sb: drop the refcnt that we got by kernfs_pin_sb()
+ * @root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * This must be paired with kernfs_pin_sb(). It will acquire sb->s_umount
+ * if the refcnt reaches zero.
+ */
+void kernfs_drop_sb(struct kernfs_root *root, const void *ns)
+{
+ struct kernfs_super_info *info;
+
+ mutex_lock(&kernfs_mutex);
+ list_for_each_entry(info, &root->supers, node) {
+ if (info->ns == ns)
+ break;
+ }
+ mutex_unlock(&kernfs_mutex);
+ deactivate_super(info->sb);
+}
+
void __init kernfs_init(void)
{
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 589318b..1958017 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -288,6 +288,9 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
const void *ns);
void kernfs_kill_sb(struct super_block *sb);
+bool kernfs_pin_sb(struct kernfs_root *root, const void *ns);
+void kernfs_drop_sb(struct kernfs_root *root, const void *ns);
+
void kernfs_init(void);
#else /* CONFIG_KERNFS */
--
1.8.0.2
We've converted cgroup to kernfs so cgroup won't be intertwined with
vfs objects and locking, but there are dark areas.
Run two instances of this script concurrently:
for ((; ;))
{
mount -t cgroup -o cpuacct xxx /cgroup
umount /cgroup
}
After a while, I saw two mount processes were stuck at retrying, because
they were waiting for a subsystem to become free, but the root associated
with this subsystem never got freed.
This can happen, if thread A is in the process of killing superblock but
hasn't called percpu_ref_kill(), and at this time thread B is mounting
the same cgroup root and finds the root in the root list and performs
percpu_ref_tryget_live().
To fix this, we increase the refcnt of the superblock instead of increasing
the percpu refcnt of cgroup root.
Signed-off-by: Li Zefan <[email protected]>
---
A better fix is welcome!
---
kernel/cgroup.c | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bd37e8d..94e1814 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1654,7 +1654,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
struct dentry *dentry;
int ret;
int i;
- bool new_sb;
+ bool sb_pinned = false;
/*
* The first time anyone tries to mount a cgroup, enable the list
@@ -1735,19 +1735,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
}
/*
- * A root's lifetime is governed by its root cgroup.
- * tryget_live failure indicate that the root is being
- * destroyed. Wait for destruction to complete so that the
- * subsystems are free. We can use wait_queue for the wait
- * but this path is super cold. Let's just sleep for a bit
- * and retry.
+ * This may fail for two reasons:
+ * - A concurrent mount is in process. We wait for that mount
+ *   to complete.
+ * - The superblock is being destroyed. We wait for the
+ *   destruction to complete so that the subsystems are free.
+ * We can use wait_queue for the wait but this path is super
+ * cold. Let's just sleep for a bit and retry.
*/
- if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+ if (!kernfs_pin_sb(root->kf_root, NULL)) {
mutex_unlock(&cgroup_mutex);
msleep(10);
ret = restart_syscall();
goto out_free;
}
+ sb_pinned = true;
ret = 0;
goto out_unlock;
@@ -1784,8 +1786,10 @@ out_free:
if (ret)
return ERR_PTR(ret);
- dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
- if (IS_ERR(dentry) || !new_sb)
+ dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL);
+ if (sb_pinned)
+ kernfs_drop_sb(root->kf_root, NULL);
+ if (!sb_pinned && IS_ERR(dentry))
cgroup_put(&root->cgrp);
return dentry;
}
--
1.8.0.2
On Thu, Jun 12, 2014 at 02:31:31PM +0800, Li Zefan wrote:
> After running:
>
> # mount -t cgroup cpu xxx /cgroup && mkdir /cgroup/sub && \
> rmdir /cgroup/sub && umount /cgroup
>
> I found the cgroup root still existed:
>
> # cat /proc/cgroups
> #subsys_name hierarchy num_cgroups enabled
> cpuset 0 1 1
> cpu 1 1 1
> ...
>
> It turned out css_has_online_children() is broken.
>
> Signed-off-by: Li Zefan <[email protected]>
Applied to cgroup/for-3.16-fixes.
Thanks.
--
tejun
On Thu, Jun 12, 2014 at 02:32:13PM +0800, Li Zefan wrote:
> @@ -1677,6 +1679,22 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
> goto out_unlock;
> }
>
> + /*
> + * If some subsystems have been bound to existing cgroup hierarchies,
> + * but those hierachies are being destroyed, let's wait a little bit
> + * and retry.
> + */
> + for_each_subsys(ss, i) {
> + if (!(opts.subsys_mask & (1 << i)))
> + continue;
> + if (!percpu_ref_alive(&ss->root->cgrp.self.refcnt)) {
Can't we just do tryget_live() instead and then put before retrying?
It's not exactly a hot path and the operations are dirt cheap anyway.
Thanks.
--
tejun
Hello, Li.
Sorry about the long delay.
On Thu, Jun 12, 2014 at 02:33:05PM +0800, Li Zefan wrote:
> We've converted cgroup to kernfs so cgroup won't be intertwined with
> vfs objects and locking, but there are dark areas.
>
> Run two instances of this script concurrently:
>
> for ((; ;))
> {
> mount -t cgroup -o cpuacct xxx /cgroup
> umount /cgroup
> }
>
> After a while, I saw two mount processes were stuck at retrying, because
> they were waiting for a subsystem to become free, but the root associated
> with this subsystem never got freed.
>
> This can happen, if thread A is in the process of killing superblock but
> hasn't called percpu_ref_kill(), and at this time thread B is mounting
> the same cgroup root and finds the root in the root list and performs
> percpu_ref_try_get().
>
> To fix this, we increase the refcnt of the superblock instead of increasing
> the percpu refcnt of cgroup root.
Ah, right. Gees, I'm really hating the fact that we have ->mount but
not ->umount. However, can't we make it a bit simpler by just
introducing a mutex protecting looking up and refing up an existing
root and a sb going away? The only problem is that the refcnt being
killed isn't atomic w.r.t. new live ref coming up, right? Why not
just add a mutex around them so that they can't race?
Thanks.
--
tejun
On 2014/6/21 3:10, Tejun Heo wrote:
> On Thu, Jun 12, 2014 at 02:32:13PM +0800, Li Zefan wrote:
>> @@ -1677,6 +1679,22 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
>> goto out_unlock;
>> }
>>
>> + /*
>> + * If some subsystems have been bound to existing cgroup hierarchies,
>> + * but those hierachies are being destroyed, let's wait a little bit
>> + * and retry.
>> + */
>> + for_each_subsys(ss, i) {
>> + if (!(opts.subsys_mask & (1 << i)))
>> + continue;
>> + if (!percpu_ref_alive(&ss->root->cgrp.self.refcnt)) {
>
> Can't we just do tryget_live() instead and then put before retrying?
> It's not exactly a hot path and the operations are dirt cheap anyway.
>
No much difference, though would be a bit more code. I can do that.
On 2014/6/21 3:35, Tejun Heo wrote:
> Hello, Li.
>
> Sorry about the long delay.
>
> On Thu, Jun 12, 2014 at 02:33:05PM +0800, Li Zefan wrote:
>> We've converted cgroup to kernfs so cgroup won't be intertwined with
>> vfs objects and locking, but there are dark areas.
>>
>> Run two instances of this script concurrently:
>>
>> for ((; ;))
>> {
>> mount -t cgroup -o cpuacct xxx /cgroup
>> umount /cgroup
>> }
>>
>> After a while, I saw two mount processes were stuck at retrying, because
>> they were waiting for a subsystem to become free, but the root associated
>> with this subsystem never got freed.
>>
>> This can happen, if thread A is in the process of killing superblock but
>> hasn't called percpu_ref_kill(), and at this time thread B is mounting
>> the same cgroup root and finds the root in the root list and performs
>> percpu_ref_try_get().
>>
>> To fix this, we increase the refcnt of the superblock instead of increasing
>> the percpu refcnt of cgroup root.
>
> Ah, right. Gees, I'm really hating the fact that we have ->mount but
> not ->umount. However, can't we make it a bit simpler by just
> introducing a mutex protecting looking up and refing up an existing
> root and a sb going away? The only problem is that the refcnt being
> killed isn't atomic w.r.t. new live ref coming up, right? Why not
> just add a mutex around them so that they can't race?
>
Well, kill_sb() is called with sb->s_umount held, while kernfs_mount()
returned with sb->s_umount held, so adding a mutex will lead to ABBA
deadlock.
Hello, Li.
On Tue, Jun 24, 2014 at 09:22:00AM +0800, Li Zefan wrote:
> > Ah, right. Gees, I'm really hating the fact that we have ->mount but
> > not ->umount. However, can't we make it a bit simpler by just
> > introducing a mutex protecting looking up and refing up an existing
> > root and a sb going away? The only problem is that the refcnt being
> > killed isn't atomic w.r.t. new live ref coming up, right? Why not
> > just add a mutex around them so that they can't race?
>
> Well, kill_sb() is called with sb->s_umount held, while kernfs_mount()
> returned with sb->s_umount held, so adding a mutex will lead to ABBA
> deadlock.
Hmmm? Why does that matter? The only region in cgroup_mount() which
needs to be put inside such mutex would be root lookup, no?
Thanks.
--
tejun
On 2014/6/25 5:01, Tejun Heo wrote:
> Hello, Li.
>
> On Tue, Jun 24, 2014 at 09:22:00AM +0800, Li Zefan wrote:
>>> Ah, right. Gees, I'm really hating the fact that we have ->mount but
>>> not ->umount. However, can't we make it a bit simpler by just
>>> introducing a mutex protecting looking up and refing up an existing
>>> root and a sb going away? The only problem is that the refcnt being
>>> killed isn't atomic w.r.t. new live ref coming up, right? Why not
>>> just add a mutex around them so that they can't race?
>>
>> Well, kill_sb() is called with sb->s_umount held, while kernfs_mount()
>> returned with sb->s_umount held, so adding a mutex will lead to ABBA
>> deadlock.
>
> Hmmm? Why does that matter? The only region in cgroup_mount() which
> needs to be put inside such mutex would be root lookup, no?
>
unfortunately that won't help. I think what you suggest is:
cgroup_mount()
{
mutex_lock();
lookup_cgroup_root();
mutex_unlock();
kernfs_mount();
}
cgroup_kill_sb()
{
mutex_lock();
percpu_ref_kill();
mutex_Unlock();
kernfs_kill_sb();
}
See, we may still be destroying the superblock after we've succeeded
in getting the refcnt of cgroup root.
Hey,
On Wed, Jun 25, 2014 at 09:56:31AM +0800, Li Zefan wrote:
> > Hmmm? Why does that matter? The only region in cgroup_mount() which
> > needs to be put inside such mutex would be root lookup, no?
>
> unfortunately that won't help. I think what you suggest is:
>
> cgroup_mount()
> {
> mutex_lock();
> lookup_cgroup_root();
> mutex_unlock();
> kernfs_mount();
> }
>
> cgroup_kill_sb()
> {
> mutex_lock();
> percpu_ref_kill();
> mutex_Unlock();
> kernfs_kill_sb();
> }
>
> See, we may still be destroying the superblock after we've succeeded
> in getting the refcnt of cgroup root.
Sure, but now the decision to kill is synchronized so the other side
can interlock with it. e.g.
cgroup_mount()
{
mutex_lock();
lookup_cgroup_root();
if (root isn't killed yet)
root->this_better_stay_alive++;
mutex_unlock();
kernfs_mount();
}
cgroup_kill_sb()
{
mutex_lock();
if (check whether root can be killed)
percpu_ref_kill();
mutex_unlock();
if (the above condition was true)
kernfs_kill_sb();
}
--
tejun
On 2014/6/25 23:00, Tejun Heo wrote:
> Hey,
>
> On Wed, Jun 25, 2014 at 09:56:31AM +0800, Li Zefan wrote:
>>> Hmmm? Why does that matter? The only region in cgroup_mount() which
>>> needs to be put inside such mutex would be root lookup, no?
>>
>> unfortunately that won't help. I think what you suggest is:
>>
>> cgroup_mount()
>> {
>> mutex_lock();
>> lookup_cgroup_root();
>> mutex_unlock();
>> kernfs_mount();
>> }
>>
>> cgroup_kill_sb()
>> {
>> mutex_lock();
>> percpu_ref_kill();
>> mutex_Unlock();
>> kernfs_kill_sb();
>> }
>>
>> See, we may still be destroying the superblock after we've succeeded
>> in getting the refcnt of cgroup root.
>
> Sure, but now the decision to kill is synchronized so the other side
> can interlock with it. e.g.
>
> cgroup_mount()
> {
> mutex_lock();
> lookup_cgroup_root();
> if (root isn't killed yet)
> root->this_better_stay_alive++;
> mutex_unlock();
> kernfs_mount();
> }
>
> cgroup_kill_sb()
> {
> mutex_lock();
> if (check whether root can be killed)
> percpu_ref_kill();
> mutex_unlock();
> if (the above condition was true)
> kernfs_kill_sb();
> }
>
This looks nasty, and I don't think it's correct. If we skip the call
to kernfs_kill_sb(), kernfs_super_info won't be freed but super_block
will, so we will end up either leaking memory or accessing invalid
memory. There are other problems like returning with sb->s_umount still
held.
On Fri, Jun 27, 2014 at 02:32:33PM +0800, Li Zefan wrote:
> > cgroup_mount()
> > {
> > mutex_lock();
> > lookup_cgroup_root();
> > if (root isn't killed yet)
> > root->this_better_stay_alive++;
> > mutex_unlock();
> > kernfs_mount();
> > }
> >
> > cgroup_kill_sb()
> > {
> > mutex_lock();
> > if (check whether root can be killed)
> > percpu_ref_kill();
> > mutex_unlock();
> > if (the above condition was true)
> > kernfs_kill_sb();
> > }
> >
>
> This looks nasty, and I don't think it's correct. If we skip the call
> to kernfs_kill_sb(), kernfs_super_info won't be freed but super_block
> will, so we will end up either leaking memory or accessing invalid
> memory. There are other problems like returning with sb->s_umount still
> held.
Yeah, right, the conditional shouldn't be on kernfs_kill_sb(). It
should only be on percpu_ref_kill(). kernfs mount code will wait out
the dead sb and create a new one; however, this is still not feasible
because we don't have a place to dec ->this_better_stay_alive as
there's no umount callback. :(
Thanks.
--
tejun