The first patch of this series fixes the incorrect cpumask problem
when the CLONE_INTO_CGROUP flag is used to clone a child to a different
cgroup. It also includes 2 other miscellaneous fixes to the cpuset code.
Waiman Long (3):
cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly
cgroup/cpuset: Make cpuset_attach_task() skip subpartitions CPUs for
top_cpuset
cgroup/cpuset: Allow only one active attach operation per cpuset
kernel/cgroup/cpuset.c | 76 ++++++++++++++++++++++++++++--------------
1 file changed, 51 insertions(+), 25 deletions(-)
--
2.31.1
The current cpuset code uses the global cpuset_attach_old_cs variable
to store the old cpuset value between consecutive cpuset_can_attach()
and cpuset_attach() calls. Since a caller of cpuset_can_attach() may
not need to hold the global cgroup_threadgroup_rwsem, parallel cpuset
attach operations are possible.
When there are concurrent cpuset attach operations in progress,
cpuset_attach() may fetch the wrong value from cpuset_attach_old_cs
causing incorrect result. To avoid this problem while still allowing
certain level of parallelism, drop cpuset_attach_old_cs and use a
per-cpuset attach_old_cs value. Also restrict to at most one active
attach operation per cpuset to avoid corrupting the value of the
per-cpuset attach_old_cs value.
Signed-off-by: Waiman Long <[email protected]>
---
kernel/cgroup/cpuset.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2367de611c42..3f925c261513 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -198,6 +198,8 @@ struct cpuset {
/* Handle for cpuset.cpus.partition */
struct cgroup_file partition_file;
+
+ struct cpuset *attach_old_cs;
};
/*
@@ -2456,22 +2458,27 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-static struct cpuset *cpuset_attach_old_cs;
-
/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct cpuset *cs;
+ struct cpuset *cs, *oldcs;
struct task_struct *task;
int ret;
/* used later by cpuset_attach() */
- cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+ oldcs = task_cs(cgroup_taskset_first(tset, &css));
cs = css_cs(css);
percpu_down_write(&cpuset_rwsem);
+ /*
+ * Only one cpuset attach operation is allowed for each cpuset.
+ */
+ ret = -EBUSY;
+ if (cs->attach_in_progress)
+ goto out_unlock;
+
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
if (!is_in_v2_mode() &&
@@ -2498,6 +2505,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
+ cs->attach_old_cs = oldcs;
ret = 0;
out_unlock:
percpu_up_write(&cpuset_rwsem);
@@ -2548,7 +2556,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct task_struct *leader;
struct cgroup_subsys_state *css;
struct cpuset *cs;
- struct cpuset *oldcs = cpuset_attach_old_cs;
+ struct cpuset *oldcs;
bool cpus_updated, mems_updated;
cgroup_taskset_first(tset, &css);
@@ -2556,6 +2564,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
+ oldcs = cs->attach_old_cs;
cpus_updated = !cpumask_equal(cs->effective_cpus,
oldcs->effective_cpus);
mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
--
2.31.1
It is found that attaching a task to the top_cpuset does not currently
ignore CPUs allocated to subpartitions in cpuset_attach_task(). So the
code is changed to fix that.
Signed-off-by: Waiman Long <[email protected]>
---
kernel/cgroup/cpuset.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f6d5614982d7..2367de611c42 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2530,7 +2530,8 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
if (cs != &top_cpuset)
guarantee_online_cpus(task, cpus_attach);
else
- cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
+ cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
+ cs->subparts_cpus);
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
--
2.31.1
By default, the clone(2) syscall spawns a child process into the same
cgroup as its parent. With the use of the CLONE_INTO_CGROUP flag
introduced by commit ef2c41cf38a7 ("clone3: allow spawning processes
into cgroups"), the child will be spawned into a different cgroup which
is somewhat similar to writing the child's tid into "cgroup.threads".
The current cpuset_fork() method does not properly handle the
CLONE_INTO_CGROUP case where the cpuset of the child may be different
from that of its parent. Update the cpuset_fork() method to treat the
CLONE_INTO_CGROUP case similar to cpuset_attach().
Since the newly cloned task has not been running yet, its actual
memory usage isn't known. So it is not necessary to make change to mm
in cpuset_fork().
Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups")
Signed-off-by: Waiman Long <[email protected]>
---
kernel/cgroup/cpuset.c | 56 +++++++++++++++++++++++++++---------------
1 file changed, 36 insertions(+), 20 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index bc4dcfd7bee5..f6d5614982d7 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2516,16 +2516,33 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
}
/*
- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
+static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
+{
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(task, cpus_attach);
+ else
+ cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flags(cs, task);
+}
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_rwsem */
- static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
struct cgroup_subsys_state *css;
@@ -2556,20 +2573,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
- cgroup_taskset_for_each(task, css, tset) {
- if (cs != &top_cpuset)
- guarantee_online_cpus(task, cpus_attach);
- else
- cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
- /*
- * can_attach beforehand should guarantee that this doesn't
- * fail. TODO: have a better way to handle failure here
- */
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
- cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flags(cs, task);
- }
+ cgroup_taskset_for_each(task, css, tset)
+ cpuset_attach_task(cs, task);
/*
* Change mm for all threadgroup leaders. This is expensive and may
@@ -3267,11 +3272,22 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
*/
static void cpuset_fork(struct task_struct *task)
{
- if (task_css_is_root(task, cpuset_cgrp_id))
+ struct cpuset *cs = task_cs(task);
+
+ if (cs == task_cs(current)) {
+ if (cs == &top_cpuset)
+ return;
+
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
+ task->mems_allowed = current->mems_allowed;
return;
+ }
- set_cpus_allowed_ptr(task, current->cpus_ptr);
- task->mems_allowed = current->mems_allowed;
+ /* CLONE_INTO_CGROUP */
+ percpu_down_write(&cpuset_rwsem);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ cpuset_attach_task(cs, task);
+ percpu_up_write(&cpuset_rwsem);
}
struct cgroup_subsys cpuset_cgrp_subsys = {
--
2.31.1
Hi.
On Fri, Mar 31, 2023 at 10:50:44AM -0400, Waiman Long <[email protected]> wrote:
> It is found that attaching a task to the top_cpuset does not currently
> ignore CPUs allocated to subpartitions in cpuset_attach_task(). So the
> code is changed to fix that.
>
> Signed-off-by: Waiman Long <[email protected]>
> ---
> kernel/cgroup/cpuset.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
Reviewed-by: Michal Koutný <[email protected]>
On Fri, Mar 31, 2023 at 10:50:43AM -0400, Waiman Long <[email protected]> wrote:
> By default, the clone(2) syscall spawn a child process into the same
> cgroup as its parent. With the use of the CLONE_INTO_CGROUP flag
> introduced by commit ef2c41cf38a7 ("clone3: allow spawning processes
> into cgroups"), the child will be spawned into a different cgroup which
> is somewhat similar to writing the child's tid into "cgroup.threads".
>
> The current cpuset_fork() method does not properly handle the
> CLONE_INTO_CGROUP case where the cpuset of the child may be different
> from that of its parent. Update the cpuset_fork() method to treat the
> CLONE_INTO_CGROUP case similar to cpuset_attach().
Should .can_fork=cpuset_can_fork in analogy to cpuset_can_attach be also
devised? (Sorry if I missed that in the previous discussion.)
>
> Since the newly cloned task has not been running yet, its actual
> memory usage isn't known. So it is not necessary to make change to mm
> in cpuset_fork().
>
> Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups")
> Signed-off-by: Waiman Long <[email protected]>
Thanks,
Michal
On Fri, Mar 31, 2023 at 10:50:45AM -0400, Waiman Long <[email protected]> wrote:
> The current cpuset code uses the global cpuset_attach_old_cs variable
> to store the old cpuset value between consecutive cpuset_can_attach()
> and cpuset_attach() calls. Since a caller of cpuset_can_attach() may
> not need to hold the global cgroup_threadgroup_rwsem, parallel cpuset
> attach operations are possible.
Do I understand correctly this consequence of the cpuset_attach_task()
on the clone path?
In that particular case (with CLONE_INTO_CGROUP) cgroup_mutex is taken,
so the access to the old_cs variable should still be synchronized with
regular migrations that are also under cgroup_mutex.
> When there are concurrent cpuset attach operations in progress,
> cpuset_attach() may fetch the wrong value from cpuset_attach_old_cs
> causing incorrect result. To avoid this problem while still allowing
> certain level of parallelism, drop cpuset_attach_old_cs and use a
> per-cpuset attach_old_cs value. Also restrict to at most one active
> attach operation per cpuset to avoid corrupting the value of the
> per-cpuset attach_old_cs value.
Secondly, semantically wouldn't a `void *ss_priv[CGROUP_SUBSYS_COUNT]`
in struct cgroup_taskset make it simpler wrt the exclusivity guarantees?
Thirdly, if my initial assumption is right -- I'd suggest ordering this
before the patch `cgroup/cpuset: Make cpuset_fork() handle
CLONE_INTO_CGROUP properly` to spare backporters possible troubles if
this is would be a fixup to that.
Thanks,
Michal
On 4/3/23 12:55, Michal Koutný wrote:
> On Fri, Mar 31, 2023 at 10:50:43AM -0400, Waiman Long <[email protected]> wrote:
>> By default, the clone(2) syscall spawn a child process into the same
>> cgroup as its parent. With the use of the CLONE_INTO_CGROUP flag
>> introduced by commit ef2c41cf38a7 ("clone3: allow spawning processes
>> into cgroups"), the child will be spawned into a different cgroup which
>> is somewhat similar to writing the child's tid into "cgroup.threads".
>>
>> The current cpuset_fork() method does not properly handle the
>> CLONE_INTO_CGROUP case where the cpuset of the child may be different
>> from that of its parent. Update the cpuset_fork() method to treat the
>> CLONE_INTO_CGROUP case similar to cpuset_attach().
> Should .can_fork=cpuset_can_fork in analogy to cpuset_can_attach be also
> devised? (Sorry if I missed that in the previous discussion.)
I have thought about that too.
However, the can_attach method checks only a couple of things:
1) PF_NO_SETAFFINITY flag - which won't be set in the case of fork() as
it is for kthread only.
2) DL bandwidth - Juri has a cpuset outstanding to modify the way this
check is being done. I want to wait until it is settled before tackling
this, if necessary.
3) security_task_setscheduler() - the CLONE_INTO_CGROUP code has already
checked that, we don't need to duplicate the check.
So we don't need a can_fork() check for now.
Cheers,
Longman
On 4/3/23 13:18, Waiman Long wrote:
>
> On 4/3/23 12:55, Michal Koutný wrote:
>> On Fri, Mar 31, 2023 at 10:50:43AM -0400, Waiman Long
>> <[email protected]> wrote:
>>> By default, the clone(2) syscall spawn a child process into the same
>>> cgroup as its parent. With the use of the CLONE_INTO_CGROUP flag
>>> introduced by commit ef2c41cf38a7 ("clone3: allow spawning processes
>>> into cgroups"), the child will be spawned into a different cgroup which
>>> is somewhat similar to writing the child's tid into "cgroup.threads".
>>>
>>> The current cpuset_fork() method does not properly handle the
>>> CLONE_INTO_CGROUP case where the cpuset of the child may be different
>>> from that of its parent. Update the cpuset_fork() method to treat the
>>> CLONE_INTO_CGROUP case similar to cpuset_attach().
>> Should .can_fork=cpuset_can_fork in analogy to cpuset_can_attach be also
>> devised? (Sorry if I missed that in the previous discussion.)
>
> I have thought about that too.
>
> However, the can_attach method checks only a couple of things:
>
> 1) PF_NO_SETAFFINITY flag - which won't be set in the case of fork()
> as it is for kthread only.
> 2) DL bandwidth - Juri has a cpuset outstanding to modify the way this
> check is being done. I want to wait until it is settled before
> tackling this, if necessary.
BTW, even cloning without CLONE_INTO_CGROUP can be an issue WRT DL
bandwidth. So I will let the scheduler folk figure out what to do with that.
Cheers,
Longman
On 4/3/23 12:47, Michal Koutný wrote:
> On Fri, Mar 31, 2023 at 10:50:45AM -0400, Waiman Long <[email protected]> wrote:
>> The current cpuset code uses the global cpuset_attach_old_cs variable
>> to store the old cpuset value between consecutive cpuset_can_attach()
>> and cpuset_attach() calls. Since a caller of cpuset_can_attach() may
>> not need to hold the global cgroup_threadgroup_rwsem, parallel cpuset
>> attach operations are possible.
> Do I understand correctly this consequence of the cpuset_attach_task()
> on the clone path?
> In that particular case (with CLONE_INTO_CGROUP) cgroup_mutex is taken,
> so the access the the old_cs variable should still be synchronized with
> regular migrations that are also under cgroup_mutex.
This patch is actually not related to the CLONE_INTO_CGROUP problem in
patch 1. It is a generic problem when multiple users are moving threads
into cgroup.threads of the same or different cpusets simultaneously.
>> When there are concurrent cpuset attach operations in progress,
>> cpuset_attach() may fetch the wrong value from cpuset_attach_old_cs
>> causing incorrect result. To avoid this problem while still allowing
>> certain level of parallelism, drop cpuset_attach_old_cs and use a
>> per-cpuset attach_old_cs value. Also restrict to at most one active
>> attach operation per cpuset to avoid corrupting the value of the
>> per-cpuset attach_old_cs value.
> Secondly, semantically wouldn't a `void *ss_priv[CGROUP_SUBSYS_COUNT]`
> in struct cgroup_taskset make it simpler wrt the exclusivity guarantees?
I guess we can put the old_cs value into cgroup_taskset. Since the
related attach_in_progress value is in cpuset, so I put the old_cs there
too.
> Thirdly, if my initial assumptino is right -- I'd suggest ordering this
> before the patch `cgroup/cpuset: Make cpuset_fork() handle
> CLONE_INTO_CGROUP properly` to spare backporters possible troubles if
> this is would be a fixup to that.
I don't believe this patch has a dependency on patch 1.
I had thought about adding a fixes tag so it will be backported to
stable. However, this problem should be rare. Let's see what others
think about it.
Cheers,
Longman
Hi.
On Mon, Apr 03, 2023 at 01:41:33PM -0400, Waiman Long <[email protected]> wrote:
> This patch is actually not related to the CLONE_INTO_GROUP problem in patch
> 1. It is a generic problem when multiple users are moving threads into
> cgroup.threads of the same or different cpusets simultaneously.
I meant this:
__cgroup_procs_write
cgroup_kn_lock_live
mutex_lock(&cgroup_mutex)
and (more succinctly)
cgroup_update_dfl_csses
lockdep_assert_held(&cgroup_mutex)
Even the threaded migrations should be synchronized here.
Can you please explain in more detail what's the problematic case?
> I don't believe this patch has a dependency on patch 1.
(I meant the opposite, patch 1 would depend in this 3/3. But maybe this
one isn't need.)
Michal
On Mon, Apr 03, 2023 at 01:18:42PM -0400, Waiman Long <[email protected]> wrote:
> 1) PF_NO_SETAFFINITY flag - which won't be set in the case of fork() as it
> is for kthread only.
> 2) DL bandwidth - Juri has a cpuset outstanding to modify the way this check
> is being done. I want to wait until it is settled before tackling this, if
> necessary.
BTW what about CLONE_INTO_CGROUP where the target cpuset has empty
effective cpuset?
> 3) security_task_setscheduler() - the CLONE_INTO_CGROUP code has already
> checked that, we don't need to duplicate the check.
Not sure what this refers to.
> So we don't need a can_fork() check for now.
Anyway, good breakdown. Could you please add it to the commit message
too?
Regards,
Michal
On 4/4/23 05:19, Michal Koutný wrote:
> On Mon, Apr 03, 2023 at 01:18:42PM -0400, Waiman Long <[email protected]> wrote:
>> 1) PF_NO_SETAFFINITY flag - which won't be set in the case of fork() as it
>> is for kthread only.
>> 2) DL bandwidth - Juri has a cpuset outstanding to modify the way this check
>> is being done. I want to wait until it is settled before tackling this, if
>> necessary.
> BTW what about CLONE_INTO_CGROUP where the target cpuset has empty
> effective cpuset?
Good point. That will require a can_fork() method then. I will look into
that.
>
>> 3) security_task_setscheduler() - the CLONE_INTO_CGROUP code has already
>> checked that, we don't need to duplicate the check.
> Not sure what this refers to.
It is just checking that the task has the privilege of running in that
cgroup.
>
>> So we don't need a can_fork() check for now.
> Anyway, good breakdown. Could you please add it to the commit message
> too?
Yes, I can put that into the commit log.
Cheers,
Longman
> Regards,
> Michal
On 4/4/23 05:07, Michal Koutný wrote:
> Hi.
>
> On Mon, Apr 03, 2023 at 01:41:33PM -0400, Waiman Long <[email protected]> wrote:
>> This patch is actually not related to the CLONE_INTO_GROUP problem in patch
>> 1. It is a generic problem when multiple users are moving threads into
>> cgroup.threads of the same or different cpusets simultaneously.
> I meant this:
> __cgroup_procs_write
> cgroup_kn_lock_live
> mutex_lock(&cgroup_mutex)
>
> and (more succintly)
> cgroup_update_dfl_csses
> lockdep_assert_held(&cgroup_mutex)
>
> Even the threaded migrations should be synchronized here.
> Can you please explain in more detail what's the problematic case?
You are right. I missed the cgroup_mutex synchronization here. So this
patch isn't needed. I will drop it in the next version.
Cheers,
Longman
On 4/4/23 09:52, Waiman Long wrote:
>
> On 4/4/23 05:19, Michal Koutný wrote:
>> On Mon, Apr 03, 2023 at 01:18:42PM -0400, Waiman Long
>> <[email protected]> wrote:
>>> 1) PF_NO_SETAFFINITY flag - which won't be set in the case of fork()
>>> as it
>>> is for kthread only.
>>> 2) DL bandwidth - Juri has a cpuset outstanding to modify the way
>>> this check
>>> is being done. I want to wait until it is settled before tackling
>>> this, if
>>> necessary.
>> BTW what about CLONE_INTO_CGROUP where the target cpuset has empty
>> effective cpuset?
> Good point. That will require a can_fork() method then. I will look
> into that.
>>
>>> 3) security_task_setscheduler() - the CLONE_INTO_CGROUP code has
>>> already
>>> checked that, we don't need to duplicate the check.
>> Not sure what this refers to.
> It is just checking of the task the has privilege of running into that
> cgroup.
>>
>>> So we don't need a can_fork() check for now.
>> Anyway, good breakdown. Could you please add it to the commit message
>> too?
>
> Yes, I can put that into the commit log.
I decided to add the can_fork method to do a pre-check. So I don't think
I need to update the commit log of this patch.
Cheers,
Longman
Hello,
kernel test robot noticed "WARNING:suspicious_RCU_usage" on:
commit: a53ab2ba098e6839db602212831c8b62a38c2956 ("[PATCH 1/3] cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly")
url: https://github.com/intel-lab-lkp/linux/commits/Waiman-Long/cgroup-cpuset-Make-cpuset_fork-handle-CLONE_INTO_CGROUP-properly/20230331-225527
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 62bad54b26db8bc98e28749cd76b2d890edb4258
patch link: https://lore.kernel.org/all/[email protected]/
patch subject: [PATCH 1/3] cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly
in testcase: boot
compiler: gcc-11
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
(please refer to attached dmesg/kmsg for entire log/backtrace)
If you fix the issue, kindly add following tag
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-lkp/[email protected]
[ 2.798551][ T2] WARNING: suspicious RCU usage
[ 2.799473][ T2] 6.3.0-rc4-00162-ga53ab2ba098e #1 Not tainted
[ 2.799901][ T2] -----------------------------
[ 2.800551][ T2] include/linux/cgroup.h:437 suspicious rcu_dereference_check() usage!
[ 2.802044][ T2]
[ 2.802044][ T2] other info that might help us debug this:
[ 2.802044][ T2]
[ 2.803158][ T2]
[ 2.803158][ T2] rcu_scheduler_active = 1, debug_locks = 1
[ 2.804024][ T2] 1 lock held by kthreadd/2:
[ 2.804851][ T2] #0: ffffffff84c38230 (cgroup_threadgroup_rwsem){.+.+}-{0:0}, at: cgroup_can_fork (kernel/cgroup/cgroup.c:6515)
[ 2.806112][ T2]
[ 2.806112][ T2] stack backtrace:
[ 2.806958][ T2] CPU: 0 PID: 2 Comm: kthreadd Not tainted 6.3.0-rc4-00162-ga53ab2ba098e #1
[ 2.807537][ T2] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
[ 2.807537][ T2] Call Trace:
[ 2.807537][ T2] <TASK>
[ 2.807537][ T2] dump_stack_lvl (lib/dump_stack.c:107)
[ 2.807537][ T2] lockdep_rcu_suspicious (include/linux/context_tracking.h:153 kernel/locking/lockdep.c:6600)
[ 2.807537][ T2] cpuset_fork (include/linux/cgroup.h:437 kernel/cgroup/cpuset.c:240 kernel/cgroup/cpuset.c:3262)
[ 2.807537][ T2] cgroup_post_fork (kernel/cgroup/cgroup.c:6635 (discriminator 6))
[ 2.807537][ T2] ? cgroup_cancel_fork (kernel/cgroup/cgroup.c:6573)
[ 2.807537][ T2] ? mark_held_locks (kernel/locking/lockdep.c:4237)
[ 2.807537][ T2] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4529)
[ 2.807537][ T2] copy_process (kernel/fork.c:2499)
[ 2.807537][ T2] ? __cleanup_sighand (kernel/fork.c:2013)
[ 2.807537][ T2] ? __lock_acquire (kernel/locking/lockdep.c:186 kernel/locking/lockdep.c:3836 kernel/locking/lockdep.c:5056)
[ 2.807537][ T2] kernel_clone (include/linux/random.h:26 kernel/fork.c:2680)
[ 2.807537][ T2] ? create_io_thread (kernel/fork.c:2639)
[ 2.807537][ T2] ? mark_usage (kernel/locking/lockdep.c:4914)
[ 2.807537][ T2] ? finish_task_switch+0x21c/0x910
[ 2.807537][ T2] ? __switch_to (arch/x86/include/asm/bitops.h:55 include/asm-generic/bitops/instrumented-atomic.h:29 include/linux/thread_info.h:89 arch/x86/include/asm/fpu/sched.h:65 arch/x86/kernel/process_64.c:623)
[ 2.807537][ T2] kernel_thread (kernel/fork.c:2729)
[ 2.807537][ T2] ? __ia32_sys_clone3 (kernel/fork.c:2729)
[ 2.807537][ T2] ? lock_downgrade (kernel/locking/lockdep.c:5321)
[ 2.807537][ T2] ? kthread_complete_and_exit (kernel/kthread.c:331)
[ 2.807537][ T2] ? do_raw_spin_unlock (arch/x86/include/asm/atomic.h:29 include/linux/atomic/atomic-instrumented.h:28 include/asm-generic/qspinlock.h:57 kernel/locking/spinlock_debug.c:100 kernel/locking/spinlock_debug.c:140)
[ 2.807537][ T2] kthreadd (kernel/kthread.c:400 kernel/kthread.c:746)
[ 2.807537][ T2] ? kthread_is_per_cpu (kernel/kthread.c:719)
[ 2.807537][ T2] ret_from_fork (arch/x86/entry/entry_64.S:314)
[ 2.807537][ T2] </TASK>
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
On Fri, Mar 31, 2023 at 10:50:43AM -0400, Waiman Long wrote:
> By default, the clone(2) syscall spawn a child process into the same
> cgroup as its parent. With the use of the CLONE_INTO_CGROUP flag
> introduced by commit ef2c41cf38a7 ("clone3: allow spawning processes
> into cgroups"), the child will be spawned into a different cgroup which
> is somewhat similar to writing the child's tid into "cgroup.threads".
>
> The current cpuset_fork() method does not properly handle the
> CLONE_INTO_CGROUP case where the cpuset of the child may be different
> from that of its parent. Update the cpuset_fork() method to treat the
> CLONE_INTO_CGROUP case similar to cpuset_attach().
>
> Since the newly cloned task has not been running yet, its actual
> memory usage isn't known. So it is not necessary to make change to mm
> in cpuset_fork().
>
> Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups")
> Signed-off-by: Waiman Long <[email protected]>
> ---
Just two nits. I think this needs a Cc stable and it would be nice if
you could give Giuseppe a "Reported-by".
On Fri, Mar 31, 2023 at 10:50:45AM -0400, Waiman Long wrote:
> The current cpuset code uses the global cpuset_attach_old_cs variable
> to store the old cpuset value between consecutive cpuset_can_attach()
> and cpuset_attach() calls. Since a caller of cpuset_can_attach() may
> not need to hold the global cgroup_threadgroup_rwsem, parallel cpuset
> attach operations are possible.
>
> When there are concurrent cpuset attach operations in progress,
> cpuset_attach() may fetch the wrong value from cpuset_attach_old_cs
> causing incorrect result. To avoid this problem while still allowing
> certain level of parallelism, drop cpuset_attach_old_cs and use a
> per-cpuset attach_old_cs value. Also restrict to at most one active
> attach operation per cpuset to avoid corrupting the value of the
> per-cpuset attach_old_cs value.
>
> Signed-off-by: Waiman Long <[email protected]>
> ---
> kernel/cgroup/cpuset.c | 19 ++++++++++++++-----
> 1 file changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 2367de611c42..3f925c261513 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -198,6 +198,8 @@ struct cpuset {
>
> /* Handle for cpuset.cpus.partition */
> struct cgroup_file partition_file;
> +
> + struct cpuset *attach_old_cs;
> };
>
> /*
> @@ -2456,22 +2458,27 @@ static int fmeter_getrate(struct fmeter *fmp)
> return val;
> }
>
> -static struct cpuset *cpuset_attach_old_cs;
> -
> /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
> static int cpuset_can_attach(struct cgroup_taskset *tset)
> {
> struct cgroup_subsys_state *css;
> - struct cpuset *cs;
> + struct cpuset *cs, *oldcs;
> struct task_struct *task;
> int ret;
>
> /* used later by cpuset_attach() */
> - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
> + oldcs = task_cs(cgroup_taskset_first(tset, &css));
> cs = css_cs(css);
>
> percpu_down_write(&cpuset_rwsem);
>
> + /*
> + * Only one cpuset attach operation is allowed for each cpuset.
> + */
> + ret = -EBUSY;
> + if (cs->attach_in_progress)
> + goto out_unlock;
That'll mean CLONE_INTO_CGROUP becomes even more interestig because it
isn't subject to this restriction in contrast to fork()+migrate, right?
On 4/5/23 22:51, kernel test robot wrote:
> Hello,
>
> kernel test robot noticed "WARNING:suspicious_RCU_usage" on:
>
> commit: a53ab2ba098e6839db602212831c8b62a38c2956 ("[PATCH 1/3] cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly")
> url: https://github.com/intel-lab-lkp/linux/commits/Waiman-Long/cgroup-cpuset-Make-cpuset_fork-handle-CLONE_INTO_CGROUP-properly/20230331-225527
> base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 62bad54b26db8bc98e28749cd76b2d890edb4258
> patch link: https://lore.kernel.org/all/[email protected]/
> patch subject: [PATCH 1/3] cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly
>
> in testcase: boot
>
> compiler: gcc-11
> test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
>
> (please refer to attached dmesg/kmsg for entire log/backtrace)
>
>
> If you fix the issue, kindly add following tag
> | Reported-by: kernel test robot <[email protected]>
> | Link: https://lore.kernel.org/oe-lkp/[email protected]
>
>
> [ 2.798551][ T2] WARNING: suspicious RCU usage
> [ 2.799473][ T2] 6.3.0-rc4-00162-ga53ab2ba098e #1 Not tainted
> [ 2.799901][ T2] -----------------------------
> [ 2.800551][ T2] include/linux/cgroup.h:437 suspicious rcu_dereference_check() usage!
> [ 2.802044][ T2]
> [ 2.802044][ T2] other info that might help us debug this:
> [ 2.802044][ T2]
> [ 2.803158][ T2]
> [ 2.803158][ T2] rcu_scheduler_active = 1, debug_locks = 1
> [ 2.804024][ T2] 1 lock held by kthreadd/2:
> [ 2.804851][ T2] #0: ffffffff84c38230 (cgroup_threadgroup_rwsem){.+.+}-{0:0}, at: cgroup_can_fork (kernel/cgroup/cgroup.c:6515)
> [ 2.806112][ T2]
> [ 2.806112][ T2] stack backtrace:
> [ 2.806958][ T2] CPU: 0 PID: 2 Comm: kthreadd Not tainted 6.3.0-rc4-00162-ga53ab2ba098e #1
> [ 2.807537][ T2] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-5 04/01/2014
> [ 2.807537][ T2] Call Trace:
> [ 2.807537][ T2] <TASK>
> [ 2.807537][ T2] dump_stack_lvl (lib/dump_stack.c:107)
> [ 2.807537][ T2] lockdep_rcu_suspicious (include/linux/context_tracking.h:153 kernel/locking/lockdep.c:6600)
> [ 2.807537][ T2] cpuset_fork (include/linux/cgroup.h:437 kernel/cgroup/cpuset.c:240 kernel/cgroup/cpuset.c:3262)
> [ 2.807537][ T2] cgroup_post_fork (kernel/cgroup/cgroup.c:6635 (discriminator 6))
> [ 2.807537][ T2] ? cgroup_cancel_fork (kernel/cgroup/cgroup.c:6573)
> [ 2.807537][ T2] ? mark_held_locks (kernel/locking/lockdep.c:4237)
> [ 2.807537][ T2] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4529)
> [ 2.807537][ T2] copy_process (kernel/fork.c:2499)
> [ 2.807537][ T2] ? __cleanup_sighand (kernel/fork.c:2013)
> [ 2.807537][ T2] ? __lock_acquire (kernel/locking/lockdep.c:186 kernel/locking/lockdep.c:3836 kernel/locking/lockdep.c:5056)
> [ 2.807537][ T2] kernel_clone (include/linux/random.h:26 kernel/fork.c:2680)
> [ 2.807537][ T2] ? create_io_thread (kernel/fork.c:2639)
> [ 2.807537][ T2] ? mark_usage (kernel/locking/lockdep.c:4914)
> [ 2.807537][ T2] ? finish_task_switch+0x21c/0x910
> [ 2.807537][ T2] ? __switch_to (arch/x86/include/asm/bitops.h:55 include/asm-generic/bitops/instrumented-atomic.h:29 include/linux/thread_info.h:89 arch/x86/include/asm/fpu/sched.h:65 arch/x86/kernel/process_64.c:623)
> [ 2.807537][ T2] kernel_thread (kernel/fork.c:2729)
> [ 2.807537][ T2] ? __ia32_sys_clone3 (kernel/fork.c:2729)
> [ 2.807537][ T2] ? lock_downgrade (kernel/locking/lockdep.c:5321)
> [ 2.807537][ T2] ? kthread_complete_and_exit (kernel/kthread.c:331)
> [ 2.807537][ T2] ? do_raw_spin_unlock (arch/x86/include/asm/atomic.h:29 include/linux/atomic/atomic-instrumented.h:28 include/asm-generic/qspinlock.h:57 kernel/locking/spinlock_debug.c:100 kernel/locking/spinlock_debug.c:140)
> [ 2.807537][ T2] kthreadd (kernel/kthread.c:400 kernel/kthread.c:746)
> [ 2.807537][ T2] ? kthread_is_per_cpu (kernel/kthread.c:719)
> [ 2.807537][ T2] ret_from_fork (arch/x86/entry/entry_64.S:314)
> [ 2.807537][ T2] </TASK>
>
It looks like task_cs() can only be used under rcu_read_lock(). I will
update the patch to add that.
Thanks,
Longman
On 4/6/23 05:45, Christian Brauner wrote:
> On Fri, Mar 31, 2023 at 10:50:43AM -0400, Waiman Long wrote:
>> By default, the clone(2) syscall spawns a child process into the same
>> cgroup as its parent. With the use of the CLONE_INTO_CGROUP flag
>> introduced by commit ef2c41cf38a7 ("clone3: allow spawning processes
>> into cgroups"), the child will be spawned into a different cgroup which
>> is somewhat similar to writing the child's tid into "cgroup.threads".
>>
>> The current cpuset_fork() method does not properly handle the
>> CLONE_INTO_CGROUP case where the cpuset of the child may be different
>> from that of its parent. Update the cpuset_fork() method to treat the
>> CLONE_INTO_CGROUP case similar to cpuset_attach().
>>
>> Since the newly cloned task has not been running yet, its actual
>> memory usage isn't known. So it is not necessary to make change to mm
>> in cpuset_fork().
>>
>> Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups")
>> Signed-off-by: Waiman Long <[email protected]>
>> ---
> Just two nits. I think this needs a Cc stable and it would be nice if
> you could give Giuseppe a "Reported-by".
The presence of a fixes tag should make this patch get picked up by stable.
Yes, I will cc stable when I need to update the patch next time.
Right, I should have added
Reported-by: Giuseppe Scrivano <[email protected]>
Cheers,
Longman
On 4/6/23 05:44, Christian Brauner wrote:
> On Fri, Mar 31, 2023 at 10:50:45AM -0400, Waiman Long wrote:
>> The current cpuset code uses the global cpuset_attach_old_cs variable
>> to store the old cpuset value between consecutive cpuset_can_attach()
>> and cpuset_attach() calls. Since a caller of cpuset_can_attach() may
>> not need to hold the global cgroup_threadgroup_rwsem, parallel cpuset
>> attach operations are possible.
>>
>> When there are concurrent cpuset attach operations in progress,
>> cpuset_attach() may fetch the wrong value from cpuset_attach_old_cs
>> causing incorrect result. To avoid this problem while still allowing
>> certain level of parallelism, drop cpuset_attach_old_cs and use a
>> per-cpuset attach_old_cs value. Also restrict to at most one active
>> attach operation per cpuset to avoid corrupting the value of the
>> per-cpuset attach_old_cs value.
>>
>> Signed-off-by: Waiman Long <[email protected]>
>> ---
>> kernel/cgroup/cpuset.c | 19 ++++++++++++++-----
>> 1 file changed, 14 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index 2367de611c42..3f925c261513 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -198,6 +198,8 @@ struct cpuset {
>>
>> /* Handle for cpuset.cpus.partition */
>> struct cgroup_file partition_file;
>> +
>> + struct cpuset *attach_old_cs;
>> };
>>
>> /*
>> @@ -2456,22 +2458,27 @@ static int fmeter_getrate(struct fmeter *fmp)
>> return val;
>> }
>>
>> -static struct cpuset *cpuset_attach_old_cs;
>> -
>> /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
>> static int cpuset_can_attach(struct cgroup_taskset *tset)
>> {
>> struct cgroup_subsys_state *css;
>> - struct cpuset *cs;
>> + struct cpuset *cs, *oldcs;
>> struct task_struct *task;
>> int ret;
>>
>> /* used later by cpuset_attach() */
>> - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
>> + oldcs = task_cs(cgroup_taskset_first(tset, &css));
>> cs = css_cs(css);
>>
>> percpu_down_write(&cpuset_rwsem);
>>
>> + /*
>> + * Only one cpuset attach operation is allowed for each cpuset.
>> + */
>> + ret = -EBUSY;
>> + if (cs->attach_in_progress)
>> + goto out_unlock;
> That'll mean CLONE_INTO_CGROUP becomes even more interesting because it
> isn't subject to this restriction in contrast to fork()+migrate, right?
This patch is not related to CLONE_INTO_CGROUP. In fact, I have
dropped that in a later version.
Cheers,
Longman
On Fri, Apr 07, 2023 at 11:30:14AM -0400, Waiman Long wrote:
> On 4/6/23 05:44, Christian Brauner wrote:
> > On Fri, Mar 31, 2023 at 10:50:45AM -0400, Waiman Long wrote:
> > > The current cpuset code uses the global cpuset_attach_old_cs variable
> > > to store the old cpuset value between consecutive cpuset_can_attach()
> > > and cpuset_attach() calls. Since a caller of cpuset_can_attach() may
> > > not need to hold the global cgroup_threadgroup_rwsem, parallel cpuset
> > > attach operations are possible.
> > >
> > > When there are concurrent cpuset attach operations in progress,
> > > cpuset_attach() may fetch the wrong value from cpuset_attach_old_cs
> > > causing incorrect result. To avoid this problem while still allowing
> > > certain level of parallelism, drop cpuset_attach_old_cs and use a
> > > per-cpuset attach_old_cs value. Also restrict to at most one active
> > > attach operation per cpuset to avoid corrupting the value of the
> > > per-cpuset attach_old_cs value.
> > >
> > > Signed-off-by: Waiman Long <[email protected]>
> > > ---
> > > kernel/cgroup/cpuset.c | 19 ++++++++++++++-----
> > > 1 file changed, 14 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> > > index 2367de611c42..3f925c261513 100644
> > > --- a/kernel/cgroup/cpuset.c
> > > +++ b/kernel/cgroup/cpuset.c
> > > @@ -198,6 +198,8 @@ struct cpuset {
> > > /* Handle for cpuset.cpus.partition */
> > > struct cgroup_file partition_file;
> > > +
> > > + struct cpuset *attach_old_cs;
> > > };
> > > /*
> > > @@ -2456,22 +2458,27 @@ static int fmeter_getrate(struct fmeter *fmp)
> > > return val;
> > > }
> > > -static struct cpuset *cpuset_attach_old_cs;
> > > -
> > > /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
> > > static int cpuset_can_attach(struct cgroup_taskset *tset)
> > > {
> > > struct cgroup_subsys_state *css;
> > > - struct cpuset *cs;
> > > + struct cpuset *cs, *oldcs;
> > > struct task_struct *task;
> > > int ret;
> > > /* used later by cpuset_attach() */
> > > - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
> > > + oldcs = task_cs(cgroup_taskset_first(tset, &css));
> > > cs = css_cs(css);
> > > percpu_down_write(&cpuset_rwsem);
> > > + /*
> > > + * Only one cpuset attach operation is allowed for each cpuset.
> > > + */
> > > + ret = -EBUSY;
> > > + if (cs->attach_in_progress)
> > > + goto out_unlock;
> > That'll mean CLONE_INTO_CGROUP becomes even more interesting because it
> > isn't subject to this restriction in contrast to fork()+migrate, right?
>
> This patch is not related to the CLONE_INTO_CGROUP. In fact, I have dropped
I know that. My point had been that this patch introduced a new
restriction affecting parallelism in the migration path while the
CLONE_INTO_CGROUP path remained unaffected.