Since commit 8f9ea86fdf99 ("sched: Always preserve the user requested
cpumask"), user provided CPU affinity via sched_setaffinity(2) is
preserved even if the task is moved to a different cpuset. However,
that affinity is also inherited by any subsequently created child
processes, which may not want or be aware of that affinity.
One way to solve this problem is to provide a way to back off from
that user provided CPU affinity. This patch implements such a scheme
by using an empty cpumask to signal a reset of the cpumasks to the
default as allowed by the current cpuset.
Before this patch, passing an empty cpumask to sched_setaffinity(2)
returns an EINVAL error. With this patch, an error is no longer
returned. Instead, the user_cpus_ptr that stores the user provided
affinity, if set, is cleared and the task's CPU affinity is reset to
that of the current cpuset. This reverts the cpumask changes made by
all previous sched_setaffinity(2) calls.
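For illustration, a userspace task could drop its user provided affinity
like this (a minimal sketch, not part of the kernel change, assuming a
kernel with this patch applied; on older kernels the second call fails
with EINVAL):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  int main(void)
  {
          cpu_set_t set;

          /* Pin to CPU 0; the kernel records this in p->user_cpus_ptr. */
          CPU_ZERO(&set);
          CPU_SET(0, &set);
          if (sched_setaffinity(0, sizeof(set), &set))
                  perror("sched_setaffinity(pin)");

          /*
           * Empty mask: clear user_cpus_ptr and fall back to the
           * affinity allowed by the current cpuset.
           */
          CPU_ZERO(&set);
          if (sched_setaffinity(0, sizeof(set), &set))
                  perror("sched_setaffinity(reset)");

          return 0;
  }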
Signed-off-by: Waiman Long <[email protected]>
---
kernel/sched/core.c | 26 +++++++++++++++++++++-----
1 file changed, 21 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c52c2eba7c73..f4806d969fc9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8317,7 +8317,12 @@ __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
}
cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ /* Default to cpus_allowed with NULL new_mask */
+ if (ctx->new_mask)
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+ else
+ cpumask_copy(new_mask, cpus_allowed);
ctx->new_mask = new_mask;
ctx->flags |= SCA_CHECK;
@@ -8366,6 +8371,7 @@ __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
+ bool reset_cpumasks = cpumask_empty(in_mask);
struct affinity_context ac;
struct cpumask *user_mask;
struct task_struct *p;
@@ -8403,13 +8409,23 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_put_task;
/*
- * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
- * alloc_user_cpus_ptr() returns NULL.
+ * If an empty cpumask is passed in, clear user_cpus_ptr, if set,
+ * and reset the current cpu affinity to the default for the
+ * current cpuset.
*/
- user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (reset_cpumasks) {
+ in_mask = NULL; /* To be updated in __sched_setaffinity */
+ user_mask = NULL;
+ } else {
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used
+ * and alloc_user_cpus_ptr() returns NULL.
+ */
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ }
if (user_mask) {
cpumask_copy(user_mask, in_mask);
- } else if (IS_ENABLED(CONFIG_SMP)) {
+ } else if (!reset_cpumasks && IS_ENABLED(CONFIG_SMP)) {
retval = -ENOMEM;
goto out_put_task;
}
--
2.31.1
On Wed, Jun 28, 2023 at 05:16:37PM -0400, Waiman Long wrote:
> Since commit 8f9ea86fdf99 ("sched: Always preserve the user requested
> cpumask"), user provided CPU affinity via sched_setaffinity(2) is
> preserved even if the task is moved to a different cpuset. However,
> that affinity is also inherited by any subsequently created child
> processes, which may not want or be aware of that affinity.
>
> One way to solve this problem is to provide a way to back off from
> that user provided CPU affinity. This patch implements such a scheme
> by using an empty cpumask to signal a reset of the cpumasks to the
> default as allowed by the current cpuset.
>
> Before this patch, passing an empty cpumask to sched_setaffinity(2)
> returns an EINVAL error. With this patch, an error is no longer
> returned. Instead, the user_cpus_ptr that stores the user provided
> affinity, if set, is cleared and the task's CPU affinity is reset to
> that of the current cpuset. This reverts the cpumask changes made by
> all previous sched_setaffinity(2) calls.
>
This is a user visible ABI change -- but with very limited motivation.
Why do we want this? Who will use this?
> Signed-off-by: Waiman Long <[email protected]>
> ---
> kernel/sched/core.c | 26 +++++++++++++++++++++-----
> 1 file changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c52c2eba7c73..f4806d969fc9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -8317,7 +8317,12 @@ __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
> }
>
> cpuset_cpus_allowed(p, cpus_allowed);
> - cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> +
> + /* Default to cpus_allowed with NULL new_mask */
> + if (ctx->new_mask)
> + cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
> + else
> + cpumask_copy(new_mask, cpus_allowed);
>
> ctx->new_mask = new_mask;
> ctx->flags |= SCA_CHECK;
> @@ -8366,6 +8371,7 @@ __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
>
> long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
> {
> + bool reset_cpumasks = cpumask_empty(in_mask);
> struct affinity_context ac;
> struct cpumask *user_mask;
> struct task_struct *p;
> @@ -8403,13 +8409,23 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
> goto out_put_task;
>
> /*
> - * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
> - * alloc_user_cpus_ptr() returns NULL.
> + * If an empty cpumask is passed in, clear user_cpus_ptr, if set,
> + * and reset the current cpu affinity to the default for the
> + * current cpuset.
> */
> - user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
> + if (reset_cpumasks) {
> + in_mask = NULL; /* To be updated in __sched_setaffinity */
> + user_mask = NULL;
> + } else {
> + /*
> + * With non-SMP configs, user_cpus_ptr/user_mask isn't used
> + * and alloc_user_cpus_ptr() returns NULL.
> + */
> + user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
> + }
> if (user_mask) {
> cpumask_copy(user_mask, in_mask);
> - } else if (IS_ENABLED(CONFIG_SMP)) {
> + } else if (!reset_cpumasks && IS_ENABLED(CONFIG_SMP)) {
> retval = -ENOMEM;
> goto out_put_task;
> }
> --
> 2.31.1
>
On 7/3/23 06:26, Peter Zijlstra wrote:
> On Wed, Jun 28, 2023 at 05:16:37PM -0400, Waiman Long wrote:
>> Since commit 8f9ea86fdf99 ("sched: Always preserve the user requested
>> cpumask"), user provided CPU affinity via sched_setaffinity(2) is
>> preserved even if the task is moved to a different cpuset. However,
>> that affinity is also inherited by any subsequently created child
>> processes, which may not want or be aware of that affinity.
>>
>> One way to solve this problem is to provide a way to back off from
>> that user provided CPU affinity. This patch implements such a scheme
>> by using an empty cpumask to signal a reset of the cpumasks to the
>> default as allowed by the current cpuset.
>>
>> Before this patch, passing an empty cpumask to sched_setaffinity(2)
>> returns an EINVAL error. With this patch, an error is no longer
>> returned. Instead, the user_cpus_ptr that stores the user provided
>> affinity, if set, is cleared and the task's CPU affinity is reset to
>> that of the current cpuset. This reverts the cpumask changes made by
>> all previous sched_setaffinity(2) calls.
>>
> This is a user visible ABI change -- but with very limited motivation.
> Why do we want this? Who will use this?
Yes, this is a visible ABI change, but it should be backward compatible,
as I doubt there are applications out there that depend on an empty
cpumask passed to sched_setaffinity() returning an error.
Our OpenShift team has actually hit a problem with the recent persistent
user provided cpu affinity change because they are relying on the fact
that moving a task to a different cpuset will reset its cpu affinity to
the cpuset default, which is no longer true. That is the main reason
behind this patch: to provide a way to reset cpu affinity to the cpuset
default.
I am thinking of requesting a sched_setaffinity(2) manpage update to
document the persistent user provided cpu affinity change and the way to
reset it after this patch is merged upstream.
Cheers,
Longman
On Mon, Jul 03, 2023 at 10:55:02AM -0400, Waiman Long wrote:
> Our OpenShift team has actually hit a problem with the recent persistent
> user provided cpu affinity change because they are relying on the fact
> that moving a task to a different cpuset will reset its cpu affinity to
> the cpuset default, which is no longer true. That is the main reason
> behind this patch: to provide a way to reset cpu affinity to the cpuset
> default.
Where is the sched_setaffinity() in that story?
So somewhere this thing did a sched_setaffinity() and then starts
playing with cpusets. Instead of adding more sched_setaffinity() calls,
can't it just remove some?
On 7/5/23 05:37, Peter Zijlstra wrote:
> On Mon, Jul 03, 2023 at 10:55:02AM -0400, Waiman Long wrote:
>
>> Our OpenShift team has actually hit a problem with the recent persistent
>> user provided cpu affinity change because they are relying on the fact
>> that moving a task to a different cpuset will reset its cpu affinity to
>> the cpuset default, which is no longer true. That is the main reason
>> behind this patch: to provide a way to reset cpu affinity to the cpuset
>> default.
> Where is the sched_setaffinity() in that story?
>
> So somewhere this thing did a sched_setaffinity() and then starts
> playing with cpusets. Instead of adding more sched_setaffinity() calls,
> can't it just remove some?
I don't know the full picture. From what I understand, there is a master
control process that limits its cpu affinity to just a limited set of
housekeeping CPUs. It then spawns child processes to be run in different
containers. The control process doesn't need to change its cpu affinity.
In the past, putting the child processes in a different container
(cpuset) would reset their affinity to that of the cpuset. That is no
longer true because user_cpus_ptr is inherited by the forked child
process.
I have thought about two ways to address that: either introduce a new
clone flag to disable the inheritance of user_cpus_ptr, or provide a way
to reset the cpu affinity to the default, which is what this patch does.
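Roughly, a hypothetical sketch of that flow (not the actual OpenShift
code; it assumes the empty-mask reset introduced by this patch):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <unistd.h>

  int main(void)
  {
          cpu_set_t set;

          /* Control process restricts itself to housekeeping CPU 0. */
          CPU_ZERO(&set);
          CPU_SET(0, &set);
          sched_setaffinity(0, sizeof(set), &set);

          if (fork() == 0) {
                  /*
                   * The child inherits user_cpus_ptr from the parent.
                   * Clear it with an empty mask so that the cpuset it is
                   * moved into determines its effective affinity.
                   */
                  CPU_ZERO(&set);
                  sched_setaffinity(0, sizeof(set), &set);
                  execl("/bin/true", "true", (char *)NULL);
          }
          return 0;
  }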
Cheers,
Longman
Hello,
kernel test robot noticed "ltp.sched_setaffinity01.fail" on:
commit: 5ae608d0d3901386665fb64090f93843f4135cc0 ("[PATCH] sched/core: Use empty mask to reset cpumasks in sched_setaffinity()")
url: https://github.com/intel-lab-lkp/linux/commits/Waiman-Long/sched-core-Use-empty-mask-to-reset-cpumasks-in-sched_setaffinity/20230629-052600
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git ebb83d84e49b54369b0db67136a5fe1087124dcc
patch link: https://lore.kernel.org/all/[email protected]/
patch subject: [PATCH] sched/core: Use empty mask to reset cpumasks in sched_setaffinity()
in testcase: ltp
version: ltp-x86_64-14c1f76-1_20230708
with following parameters:
disk: 1HDD
fs: f2fs
test: syscalls-04/sched_setaffinity01
compiler: gcc-12
test machine: 4 threads 1 sockets Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz (Ivy Bridge) with 8G memory
(please refer to attached dmesg/kmsg for entire log/backtrace)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-lkp/[email protected]
Running tests.......
<<<test_start>>>
tag=sched_setaffinity01 stime=1689382567
cmdline="sched_setaffinity01"
contacts=""
analysis=exit
<<<test_output>>>
tst_test.c:1558: TINFO: Timeout per run is 0h 02m 30s
sched_setaffinity01.c:83: TPASS: sched_setaffinity() failed: EFAULT (14)
sched_setaffinity01.c:73: TFAIL: sched_setaffinity() succeded unexpectedly
tst_test.c:1612: TINFO: If you are running on slow machine, try exporting LTP_TIMEOUT_MUL > 1
tst_test.c:1614: TBROK: Test killed! (timeout?)
Summary:
passed 1
failed 1
broken 1
skipped 0
warnings 0
incrementing stop
<<<execution_status>>>
initiation_status="ok"
duration=0 termination_type=exited termination_id=3 corefile=no
cutime=0 cstime=1
<<<test_end>>>
INFO: ltp-pan reported some tests FAIL
LTP Version: 20230516-68-g9512c5da4
###############################################################
Done executing testcases.
LTP Version: 20230516-68-g9512c5da4
###############################################################
To reproduce:
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
sudo bin/lkp install job.yaml # job file is attached in this email
bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
sudo bin/lkp run generated-yaml-file
# if come across any failure that blocks the test,
# please remove ~/.lkp and /lkp dir to run from a clean state.
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi!
> Running tests.......
> <<<test_start>>>
> tag=sched_setaffinity01 stime=1689382567
> cmdline="sched_setaffinity01"
> contacts=""
> analysis=exit
> <<<test_output>>>
> tst_test.c:1558: TINFO: Timeout per run is 0h 02m 30s
> sched_setaffinity01.c:83: TPASS: sched_setaffinity() failed: EFAULT (14)
> sched_setaffinity01.c:73: TFAIL: sched_setaffinity() succeded unexpectedly
> tst_test.c:1612: TINFO: If you are running on slow machine, try exporting LTP_TIMEOUT_MUL > 1
> tst_test.c:1614: TBROK: Test killed! (timeout?)
So what the test does is that it sets an empty affinity mask via
sched_setaffinity() and expects EINVAL. Instead it looks like the call
now succeeds, the test stops getting scheduled and is killed by the
timeout.
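In other words, what the test boils down to is roughly this (a
simplified sketch, not the actual LTP source):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <errno.h>
  #include <stdio.h>

  int main(void)
  {
          cpu_set_t mask;

          CPU_ZERO(&mask);        /* empty affinity mask */
          if (sched_setaffinity(0, sizeof(mask), &mask) == -1 && errno == EINVAL)
                  printf("TPASS: sched_setaffinity() failed with EINVAL\n");
          else
                  printf("TFAIL: sched_setaffinity() succeeded unexpectedly\n");
          return 0;
  }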
--
Cyril Hrubis
[email protected]
On 7/17/23 10:41, Cyril Hrubis wrote:
> Hi!
>> Running tests.......
>> <<<test_start>>>
>> tag=sched_setaffinity01 stime=1689382567
>> cmdline="sched_setaffinity01"
>> contacts=""
>> analysis=exit
>> <<<test_output>>>
>> tst_test.c:1558: TINFO: Timeout per run is 0h 02m 30s
>> sched_setaffinity01.c:83: TPASS: sched_setaffinity() failed: EFAULT (14)
>> sched_setaffinity01.c:73: TFAIL: sched_setaffinity() succeded unexpectedly
>> tst_test.c:1612: TINFO: If you are running on slow machine, try exporting LTP_TIMEOUT_MUL > 1
>> tst_test.c:1614: TBROK: Test killed! (timeout?)
> So what the test does is that it sets an empty affinity mask via
> sched_setaffinity() and expects EINVAL. Instead it looks like the call
> now succeeds, the test stops getting scheduled and is killed by the
> timeout.
>
I had sent out a v2 patch that should not fail the LTP
sched_setaffinity() test.
Cheers,
Longman