cgroup_update_dfl_csses() write-lock the threadgroup_rwsem as updating the
csses can trigger process migrations. However, if the subtree doesn't
contain any tasks, there aren't gonna be any cgroup migrations. This
condition can be trivially detected by testing whether
mgctx.preloaded_src_csets is empty. Elide write-locking threadgroup_rwsem if
the subtree is empty.
After this optimization, the usage pattern of creating a cgroup, enabling
the necessary controllers, and then seeding it with CLONE_INTO_CGROUP and
then removing the cgroup after it becomes empty doesn't need to write-lock
threadgroup_rwsem at all.
Signed-off-by: Tejun Heo <[email protected]>
Cc: Christian Brauner <[email protected]>
Cc: Michal Koutn? <[email protected]>
---
kernel/cgroup/cgroup.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2933,12 +2933,11 @@ static int cgroup_update_dfl_csses(struc
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
@@ -2949,6 +2948,16 @@ static int cgroup_update_dfl_csses(struc
}
spin_unlock_irq(&css_set_lock);
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+ if (has_tasks)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
@@ -2967,7 +2976,8 @@ static int cgroup_update_dfl_csses(struc
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ if (has_tasks)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
We allow modifying these mount options via remount. Let's add "no" prefixed
variants so that they can be turned off too.
Signed-off-by: Tejun Heo <[email protected]>
Cc: Christian Brauner <[email protected]>
Cc: Michal Koutn? <[email protected]>
---
Documentation/admin-guide/cgroup-v2.rst | 6 +++---
kernel/cgroup/cgroup.c | 20 +++++++++++++++-----
2 files changed, 18 insertions(+), 8 deletions(-)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -177,14 +177,14 @@ disabling controllers in v1 and make the
cgroup v2 currently supports the following mount options.
- nsdelegate
+ [no]nsdelegate
Consider cgroup namespaces as delegation boundaries. This
option is system wide and can only be set on mount or modified
through remount from the init namespace. The mount option is
ignored on non-init namespace mounts. Please refer to the
Delegation section for details.
- memory_localevents
+ memory_[no]localevents
Only populate memory.events with data for the current cgroup,
and not any subtrees. This is legacy behaviour, the default
behaviour without this option is to include subtree counts.
@@ -192,7 +192,7 @@ cgroup v2 currently supports the followi
modified through remount from the init namespace. The mount
option is ignored on non-init namespace mounts.
- memory_recursiveprot
+ memory_[no]recursiveprot
Recursively apply memory.min and memory.low protection to
entire subtrees, without requiring explicit downward
propagation into leaf cgroups. This allows protecting entire
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -279,8 +279,6 @@ bool cgroup_ssid_enabled(int ssid)
*
* - When mounting an existing superblock, mount options should match.
*
- * - Remount is disallowed.
- *
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
@@ -1859,16 +1857,19 @@ int cgroup_show_path(struct seq_file *sf
}
enum cgroup2_param {
- Opt_nsdelegate,
- Opt_memory_localevents,
- Opt_memory_recursiveprot,
+ Opt_nsdelegate, Opt_nonsdelegate,
+ Opt_memory_localevents, Opt_memory_nolocalevents,
+ Opt_memory_recursiveprot, Opt_memory_norecursiveprot,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("nonsdelegate", Opt_nonsdelegate),
fsparam_flag("memory_localevents", Opt_memory_localevents),
+ fsparam_flag("memory_nolocalevents", Opt_memory_nolocalevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
+ fsparam_flag("memory_norecursiveprot", Opt_memory_norecursiveprot),
{}
};
@@ -1886,12 +1887,21 @@ static int cgroup2_parse_param(struct fs
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_nonsdelegate:
+ ctx->flags &= ~CGRP_ROOT_NS_DELEGATE;
+ return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
+ case Opt_memory_nolocalevents:
+ ctx->flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ return 0;
case Opt_memory_recursiveprot:
ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
return 0;
+ case Opt_memory_norecursiveprot:
+ ctx->flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+ return 0;
}
return -EINVAL;
}
On Thu, Jul 14, 2022 at 06:38:43PM -1000, Tejun Heo <[email protected]> wrote:
> We allow modifying these mount options via remount. Let's add "no" prefixed
> variants so that they can be turned off too.
They can be turned off:
> // on v5.19-rc?
> :~ # grep cg /proc/mounts
> cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate 0 0
> :~ # mount -t cgroup2 cgroup2 /sys/fs/cgroup/ -oremount
> :~ # grep cg /proc/mounts
> cgroup2 /sys/fs/cgroup cgroup2 rw,relatime 0 0
The mount(2) says about remounting:
> The mountflags and data arguments should match the values used in
> the original mount() call, except for those parameters that are being
> deliberately changed.
Or is this a provision for the fsconfig(2) API?
> + fsparam_flag("memory_nolocalevents", Opt_memory_nolocalevents),
> + fsparam_flag("memory_norecursiveprot", Opt_memory_norecursiveprot),
These are not 'no' prefixes of the option :-)
I.e. it seem more consistent to prefix whole boolean option name (in
accordance with other FS options but I know limited subset of them).
In the end, this should be handled generically for boolean options in
the VFS and not via custom options.
Also, this allows both
'nsdelegate,nonsdelegate'
and
'nonsdelegate,nsdelegate'
(nsdelegate is just an example) where the 'no' always overrides being a
hidden implementation detail.
I find this patch a bit weird.
Thanks,
Michal
Hello.
On Thu, Jul 14, 2022 at 06:38:15PM -1000, Tejun Heo <[email protected]> wrote:
> However, if the subtree doesn't contain any tasks, there aren't gonna
> be any cgroup migrations.
Nice catch.
> This condition can be trivially detected by testing whether
> mgctx.preloaded_src_csets is empty. Elide write-locking
> threadgroup_rwsem if the subtree is empty.
This check is perhaps even more robust than, e.g. cgroup_is_populated()
due to possible zombie cases.
> kernel/cgroup/cgroup.c | 16 +++++++++++++---
> 1 file changed, 13 insertions(+), 3 deletions(-)
Reviewed-by: Michal Koutn? <[email protected]>
Hello,
On Tue, Jul 26, 2022 at 04:32:46PM +0200, Michal Koutn? wrote:
> On Thu, Jul 14, 2022 at 06:38:43PM -1000, Tejun Heo <[email protected]> wrote:
> > We allow modifying these mount options via remount. Let's add "no" prefixed
> > variants so that they can be turned off too.
>
> They can be turned off:
>
> > // on v5.19-rc?
> > :~ # grep cg /proc/mounts
> > cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate 0 0
> > :~ # mount -t cgroup2 cgroup2 /sys/fs/cgroup/ -oremount
> > :~ # grep cg /proc/mounts
> > cgroup2 /sys/fs/cgroup cgroup2 rw,relatime 0 0
>
> The mount(2) says about remounting:
> > The mountflags and data arguments should match the values used in
> > the original mount() call, except for those parameters that are being
> > deliberately changed.
>
> Or is this a provision for the fsconfig(2) API?
It's just me not knowing how these things work. I just looked at other real
filesystems and copied.
> > + fsparam_flag("memory_nolocalevents", Opt_memory_nolocalevents),
> > + fsparam_flag("memory_norecursiveprot", Opt_memory_norecursiveprot),
>
> These are not 'no' prefixes of the option :-)
Oh, I tried that first but nomemory_recursiveprot looked really weird. The
thing is that the underbar is added to separate the subsystem from the
actual option and we're now prepending no to the subsystem part of the name.
I'm not super attached to the current names tho.
> I.e. it seem more consistent to prefix whole boolean option name (in
> accordance with other FS options but I know limited subset of them).
> In the end, this should be handled generically for boolean options in
> the VFS and not via custom options.
>
> Also, this allows both
> 'nsdelegate,nonsdelegate'
> and
> 'nonsdelegate,nsdelegate'
> (nsdelegate is just an example) where the 'no' always overrides being a
> hidden implementation detail.
>
> I find this patch a bit weird.
It is a bit weird. Lemme play a bit with turning off the options and I'll
remove the no options if they can be turned off without explicitly
specifying them.
Thanks.
--
tejun
Hello,
On Tue, Jul 26, 2022 at 10:01:04AM -1000, Tejun Heo wrote:
> On Tue, Jul 26, 2022 at 04:32:46PM +0200, Michal Koutn? wrote:
> > On Thu, Jul 14, 2022 at 06:38:43PM -1000, Tejun Heo <[email protected]> wrote:
> > > We allow modifying these mount options via remount. Let's add "no" prefixed
> > > variants so that they can be turned off too.
> >
> > They can be turned off:
> >
> > > // on v5.19-rc?
> > > :~ # grep cg /proc/mounts
> > > cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate 0 0
> > > :~ # mount -t cgroup2 cgroup2 /sys/fs/cgroup/ -oremount
> > > :~ # grep cg /proc/mounts
> > > cgroup2 /sys/fs/cgroup cgroup2 rw,relatime 0 0
root@test ~# grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot 0 0
root@test ~# mount -o remount,exec /sys/fs/cgroup
root@test ~# grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,relatime,nsdelegate,memory_recursiveprot 0 0
root@test ~# mount -o remount /sys/fs/cgroup
root@test ~# grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,relatime,nsdelegate,memory_recursiveprot 0 0
root@test ~# mount -o remount,nsdelegate,memory_recursiveprot cgroup2 /sys/fs/cgroup
root@test ~# grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,relatime,nsdelegate,memory_recursiveprot 0 0
root@test ~# mount -o remount,memory_recursiveprot cgroup2 /sys/fs/cgroup
root@test ~# grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,relatime,memory_recursiveprot 0 0
root@test ~# mount -o remount cgroup2 /sys/fs/cgroup
root@test ~# grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,relatime 0 0
Man, I had no idea that `mount -o remount,$OPTS $MOUNT_POINT` and `mount -o
remount,$OPTS $SRC $MOUNT_POINT` behave completely differently in how they
handle existing options. I wonder why other filesystems are implementing
explicit no prefixed options.
Anyways, will soon post a patch to remove the no prefixed options.
Thanks for pointing it out.
--
tejun
30312730bd02 ("cgroup: Add "no" prefixed mount options") added "no" prefixed
mount options to allow turning them off and 6a010a49b63a ("cgroup: Make
!percpu threadgroup_rwsem operations optional") added one more "no" prefixed
mount option. However, Michal pointed out that the "no" prefixed options
aren't necessary in allowing mount options to be turned off:
# grep group /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,relatime,nsdelegate,memory_recursiveprot 0 0
# mount -o remount,nsdelegate,memory_recursiveprot none /sys/fs/cgroup
# grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,relatime,nsdelegate,memory_recursiveprot 0 0
Note that this is different from the remount behavior when the mount(1) is
invoked without the device argument - "none":
# grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot 0 0
# mount -o remount,nsdelegate,memory_recursiveprot /sys/fs/cgroup
# grep cgroup /proc/mounts
cgroup2 /sys/fs/cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot 0 0
While a bit confusing, given that there is a way to turn off the options,
there's no reason to have the explicit "no" prefixed options. Let's remove
them.
Signed-off-by: Tejun Heo <[email protected]>
Cc: Michal Koutn? <[email protected]>
---
Documentation/admin-guide/cgroup-v2.rst | 8 ++++----
kernel/cgroup/cgroup.c | 24 ++++--------------------
2 files changed, 8 insertions(+), 24 deletions(-)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -177,14 +177,14 @@ disabling controllers in v1 and make the
cgroup v2 currently supports the following mount options.
- [no]nsdelegate
+ nsdelegate
Consider cgroup namespaces as delegation boundaries. This
option is system wide and can only be set on mount or modified
through remount from the init namespace. The mount option is
ignored on non-init namespace mounts. Please refer to the
Delegation section for details.
- [no]favordynmods
+ favordynmods
Reduce the latencies of dynamic cgroup modifications such as
task migrations and controller on/offs at the cost of making
hot path operations such as forks and exits more expensive.
@@ -192,7 +192,7 @@ cgroup v2 currently supports the followi
controllers, and then seeding it with CLONE_INTO_CGROUP is
not affected by this option.
- memory_[no]localevents
+ memory_localevents
Only populate memory.events with data for the current cgroup,
and not any subtrees. This is legacy behaviour, the default
behaviour without this option is to include subtree counts.
@@ -200,7 +200,7 @@ cgroup v2 currently supports the followi
modified through remount from the init namespace. The mount
option is ignored on non-init namespace mounts.
- memory_[no]recursiveprot
+ memory_recursiveprot
Recursively apply memory.min and memory.low protection to
entire subtrees, without requiring explicit downward
propagation into leaf cgroups. This allows protecting entire
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1872,22 +1872,18 @@ int cgroup_show_path(struct seq_file *sf
}
enum cgroup2_param {
- Opt_nsdelegate, Opt_nonsdelegate,
- Opt_favordynmods, Opt_nofavordynmods,
- Opt_memory_localevents, Opt_memory_nolocalevents,
- Opt_memory_recursiveprot, Opt_memory_norecursiveprot,
+ Opt_nsdelegate,
+ Opt_favordynmods,
+ Opt_memory_localevents,
+ Opt_memory_recursiveprot,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
- fsparam_flag("nonsdelegate", Opt_nonsdelegate),
fsparam_flag("favordynmods", Opt_favordynmods),
- fsparam_flag("nofavordynmods", Opt_nofavordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
- fsparam_flag("memory_nolocalevents", Opt_memory_nolocalevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
- fsparam_flag("memory_norecursiveprot", Opt_memory_norecursiveprot),
{}
};
@@ -1905,27 +1901,15 @@ static int cgroup2_parse_param(struct fs
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
- case Opt_nonsdelegate:
- ctx->flags &= ~CGRP_ROOT_NS_DELEGATE;
- return 0;
case Opt_favordynmods:
ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
return 0;
- case Opt_nofavordynmods:
- ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
- return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
- case Opt_memory_nolocalevents:
- ctx->flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
- return 0;
case Opt_memory_recursiveprot:
ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
return 0;
- case Opt_memory_norecursiveprot:
- ctx->flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
- return 0;
}
return -EINVAL;
}
On Tue, Jul 26, 2022 at 01:48:17PM -1000, Tejun Heo <[email protected]> wrote:
Thanks.
> While a bit confusing, given that there is a way to turn off the options,
> there's no reason to have the explicit "no" prefixed options. Let's remove
> them.
This is sensible...
> Documentation/admin-guide/cgroup-v2.rst | 8 ++++----
> kernel/cgroup/cgroup.c | 24 ++++--------------------
> 2 files changed, 8 insertions(+), 24 deletions(-)
...and cleaner.
Michal
On Wed, Jul 27, 2022 at 11:27:15AM +0200, Michal Koutn? wrote:
> On Tue, Jul 26, 2022 at 01:48:17PM -1000, Tejun Heo <[email protected]> wrote:
> Thanks.
>
> > While a bit confusing, given that there is a way to turn off the options,
> > there's no reason to have the explicit "no" prefixed options. Let's remove
> > them.
>
> This is sensible...
>
> > Documentation/admin-guide/cgroup-v2.rst | 8 ++++----
> > kernel/cgroup/cgroup.c | 24 ++++--------------------
> > 2 files changed, 8 insertions(+), 24 deletions(-)
>
> ...and cleaner.
Alright, applied to cgroup/for-5.20.
Thanks.
--
tejun