2020-10-23 15:27:26

by Peter Zijlstra

Subject: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

Concurrent migrate_disable() and set_cpus_allowed_ptr() have
interesting interactions. We rely on set_cpus_allowed_ptr() to not
return until the task runs inside the provided mask. This expectation
is exported to userspace.
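
For illustration, the userspace side of that expectation, via
sched_setaffinity(), looks roughly like this (a minimal standalone
sketch, not part of this patch):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);

	/* Pin this task to CPU0. */
	if (sched_setaffinity(0, sizeof(set), &set))
		return 1;

	/*
	 * Per the expectation above, by the time sched_setaffinity()
	 * returns this task already runs inside the new mask, so this
	 * must report an allowed CPU (here: 0).
	 */
	printf("now on CPU %d\n", sched_getcpu());
	return 0;
}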

This means that any set_cpus_allowed_ptr() caller must wait until
migrate_enable() allows migrations.

At the same time, we don't want migrate_enable() to schedule, due to
patterns like:

preempt_disable();
migrate_disable();
...
migrate_enable();
preempt_enable();

And:

raw_spin_lock(&B);
spin_unlock(&A);

This means that when migrate_enable() must restore the affinity
mask, it cannot wait for that restoration to complete. As luck would
have it, that is exactly the case where there is a pending
set_cpus_allowed_ptr(), so let that provide storage for the async stop
machine.

Many thanks to Valentin, who used TLA+ most effectively and found lots
of 'interesting' cases.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/sched.h | 1
kernel/sched/core.c | 234 +++++++++++++++++++++++++++++++++++++++++++-------
2 files changed, 205 insertions(+), 30 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,7 @@ struct task_struct {
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t cpus_mask;
+ void *migration_pending;
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
int migration_disabled;
#endif
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1732,15 +1732,26 @@ void migrate_enable(void)
{
struct task_struct *p = current;

- if (--p->migration_disabled)
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
return;
+ }

+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ preempt_disable();
+ if (p->cpus_ptr != &p->cpus_mask)
+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
barrier();
-
- if (p->cpus_ptr == &p->cpus_mask)
- return;
-
- __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ p->migration_disabled = 0;
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);

@@ -1805,8 +1816,16 @@ static struct rq *move_queued_task(struc
}

struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
+ struct task_struct *task;
+ int dest_cpu;
+ struct set_affinity_pending *pending;
+};
+
+struct set_affinity_pending {
+ refcount_t refs;
+ struct completion done;
+ struct cpu_stop_work stop_work;
+ struct migration_arg arg;
};

/*
@@ -1838,16 +1857,19 @@ static struct rq *__migrate_task(struct
*/
static int migration_cpu_stop(void *data)
{
+ struct set_affinity_pending *pending;
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
+ int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
+ bool complete = false;
struct rq_flags rf;

/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
- local_irq_disable();
+ local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1857,21 +1879,83 @@ static int migration_cpu_stop(void *data

raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
+
+ pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) {
+ if (is_migration_disabled(p))
+ goto out;
+
+ if (pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+
+ /* migrate_enable() -- we must not race against SCA */
+ if (dest_cpu < 0) {
+ /*
+ * When this was migrate_enable() but we no longer
+ * have a @pending, a concurrent SCA 'fixed' things
+ * and we should be valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+ goto out;
+ }
+
+ dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+ }
+
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+ rq = __migrate_task(rq, &rf, p, dest_cpu);
else
- p->wake_cpu = arg->dest_cpu;
+ p->wake_cpu = dest_cpu;
+
+ } else if (dest_cpu < 0) {
+ /*
+ * This happens when we get migrated between migrate_enable()'s
+ * preempt_enable() and scheduling the stopper task. At that
+ * point we're a regular task again and not current anymore.
+ *
+ * A !PREEMPT kernel has a giant hole here, which makes it far
+ * more likely.
+ */
+
+ /*
+ * When this was migrate_enable() but we no longer have an
+ * @pending, a concurrent SCA 'fixed' things and we should be
+ * valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+ goto out;
+ }
+
+ /*
+ * When migrate_enable() hits a rq mis-match we can't reliably
+ * determine is_migration_disabled() and so have to chase after
+ * it.
+ */
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ return 0;
}
- rq_unlock(rq, &rf);
- raw_spin_unlock(&p->pi_lock);
+out:
+ task_rq_unlock(rq, p, &rf);
+
+ if (complete)
+ complete_all(&pending->done);
+
+ /* For pending->{arg,stop_work} */
+ pending = arg->pending;
+ if (pending && refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);

- local_irq_enable();
return 0;
}

@@ -1941,6 +2025,110 @@ void do_set_cpus_allowed(struct task_str
}

/*
+ * This function is wildly self concurrent, consider at least 3 times.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+ int dest_cpu, unsigned int flags)
+{
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
+ struct migration_arg arg = {
+ .task = p,
+ .dest_cpu = dest_cpu,
+ };
+ bool complete = false;
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ pending = p->migration_pending;
+ if (pending) {
+ refcount_inc(&pending->refs);
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ task_rq_unlock(rq, p, rf);
+
+ if (complete)
+ goto do_complete;
+
+ return 0;
+ }
+
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
+ /* serialized by p->pi_lock */
+ if (!p->migration_pending) {
+ refcount_set(&my_pending.refs, 1);
+ init_completion(&my_pending.done);
+ p->migration_pending = &my_pending;
+ } else {
+ pending = p->migration_pending;
+ refcount_inc(&pending->refs);
+ }
+ }
+ pending = p->migration_pending;
+ /*
+ * - !MIGRATE_ENABLE:
+ * we'll have installed a pending if there wasn't one already.
+ *
+ * - MIGRATE_ENABLE:
+ * we're here because the current CPU isn't matching anymore,
+ * the only way that can happen is because of a concurrent
+ * set_cpus_allowed_ptr() call, which should then still be
+ * pending completion.
+ *
+ * Either way, we really should have a @pending here.
+ */
+ if (WARN_ON_ONCE(!pending))
+ return -EINVAL;
+
+ if (flags & SCA_MIGRATE_ENABLE) {
+
+ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+ task_rq_unlock(rq, p, rf);
+
+ pending->arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = -1,
+ .pending = pending,
+ };
+
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+
+ return 0;
+ }
+
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+
+ task_rq_unlock(rq, p, rf);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+ } else {
+
+ if (!is_migration_disabled(p)) {
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(rq, rf, p, dest_cpu);
+
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ task_rq_unlock(rq, p, rf);
+
+do_complete:
+ if (complete)
+ complete_all(&pending->done);
+ }
+
+ wait_for_completion(&pending->done);
+
+ if (refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);
+
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+ return 0;
+}
+
+/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
* is removed from the allowed bitmask.
@@ -2009,23 +2197,8 @@ static int __set_cpus_allowed_ptr(struct
p->nr_cpus_allowed != 1);
}

- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
+ return affine_move_task(rq, p, &rf, dest_cpu, flags);

- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- return 0;
- } else if (task_on_rq_queued(p)) {
- /*
- * OK, since we're going to drop the lock immediately
- * afterwards anyway.
- */
- rq = move_queued_task(rq, &rf, p, dest_cpu);
- }
out:
task_rq_unlock(rq, p, &rf);

@@ -3205,6 +3378,7 @@ static void __sched_fork(unsigned long c
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
+ p->migration_pending = NULL;
#endif
}




2020-11-11 08:27:42

by tip-bot2 for Peter Zijlstra

Subject: [tip: sched/core] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 6d337eab041d56bb8f0e7794f39906c21054c512
Gitweb: https://git.kernel.org/tip/6d337eab041d56bb8f0e7794f39906c21054c512
Author: Peter Zijlstra <[email protected]>
AuthorDate: Fri, 18 Sep 2020 17:24:31 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 10 Nov 2020 18:39:00 +01:00

sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

Concurrent migrate_disable() and set_cpus_allowed_ptr() have
interesting interactions. We rely on set_cpus_allowed_ptr() to not
return until the task runs inside the provided mask. This expectation
is exported to userspace.

This means that any set_cpus_allowed_ptr() caller must wait until
migrate_enable() allows migrations.

At the same time, we don't want migrate_enable() to schedule, due to
patterns like:

preempt_disable();
migrate_disable();
...
migrate_enable();
preempt_enable();

And:

raw_spin_lock(&B);
spin_unlock(&A);

This means that when migrate_enable() must restore the affinity
mask, it cannot wait for that restoration to complete. As luck would
have it, that is exactly the case where there is a pending
set_cpus_allowed_ptr(), so let that provide storage for the async stop
machine.

Many thanks to Valentin, who used TLA+ most effectively and found lots
of 'interesting' cases.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Reviewed-by: Daniel Bristot de Oliveira <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
include/linux/sched.h | 1 +-
kernel/sched/core.c | 236 +++++++++++++++++++++++++++++++++++------
2 files changed, 207 insertions(+), 30 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0732356..90a0c92 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -714,6 +714,7 @@ struct task_struct {
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t cpus_mask;
+ void *migration_pending;
#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
int migration_disabled;
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6a3f1c2..0efc1e4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1732,15 +1732,26 @@ void migrate_enable(void)
{
struct task_struct *p = current;

- if (--p->migration_disabled)
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
return;
+ }

+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ preempt_disable();
+ if (p->cpus_ptr != &p->cpus_mask)
+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
barrier();
-
- if (p->cpus_ptr == &p->cpus_mask)
- return;
-
- __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ p->migration_disabled = 0;
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);

@@ -1805,8 +1816,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
}

struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
+ struct task_struct *task;
+ int dest_cpu;
+ struct set_affinity_pending *pending;
+};
+
+struct set_affinity_pending {
+ refcount_t refs;
+ struct completion done;
+ struct cpu_stop_work stop_work;
+ struct migration_arg arg;
};

/*
@@ -1838,16 +1857,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
+ struct set_affinity_pending *pending;
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
+ int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
+ bool complete = false;
struct rq_flags rf;

/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
- local_irq_disable();
+ local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1857,21 +1879,83 @@ static int migration_cpu_stop(void *data)

raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
+
+ pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) {
+ if (is_migration_disabled(p))
+ goto out;
+
+ if (pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+
+ /* migrate_enable() -- we must not race against SCA */
+ if (dest_cpu < 0) {
+ /*
+ * When this was migrate_enable() but we no longer
+ * have a @pending, a concurrent SCA 'fixed' things
+ * and we should be valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+ goto out;
+ }
+
+ dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+ }
+
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+ rq = __migrate_task(rq, &rf, p, dest_cpu);
else
- p->wake_cpu = arg->dest_cpu;
+ p->wake_cpu = dest_cpu;
+
+ } else if (dest_cpu < 0) {
+ /*
+ * This happens when we get migrated between migrate_enable()'s
+ * preempt_enable() and scheduling the stopper task. At that
+ * point we're a regular task again and not current anymore.
+ *
+ * A !PREEMPT kernel has a giant hole here, which makes it far
+ * more likely.
+ */
+
+ /*
+ * When this was migrate_enable() but we no longer have an
+ * @pending, a concurrent SCA 'fixed' things and we should be
+ * valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+ goto out;
+ }
+
+ /*
+ * When migrate_enable() hits a rq mis-match we can't reliably
+ * determine is_migration_disabled() and so have to chase after
+ * it.
+ */
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ return 0;
}
- rq_unlock(rq, &rf);
- raw_spin_unlock(&p->pi_lock);
+out:
+ task_rq_unlock(rq, p, &rf);
+
+ if (complete)
+ complete_all(&pending->done);
+
+ /* For pending->{arg,stop_work} */
+ pending = arg->pending;
+ if (pending && refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);

- local_irq_enable();
return 0;
}

@@ -1941,6 +2025,112 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
}

/*
+ * This function is wildly self concurrent, consider at least 3 times.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+ int dest_cpu, unsigned int flags)
+{
+ struct set_affinity_pending my_pending = { }, *pending = NULL;
+ struct migration_arg arg = {
+ .task = p,
+ .dest_cpu = dest_cpu,
+ };
+ bool complete = false;
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ pending = p->migration_pending;
+ if (pending) {
+ refcount_inc(&pending->refs);
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ task_rq_unlock(rq, p, rf);
+
+ if (complete)
+ goto do_complete;
+
+ return 0;
+ }
+
+ if (!(flags & SCA_MIGRATE_ENABLE)) {
+ /* serialized by p->pi_lock */
+ if (!p->migration_pending) {
+ refcount_set(&my_pending.refs, 1);
+ init_completion(&my_pending.done);
+ p->migration_pending = &my_pending;
+ } else {
+ pending = p->migration_pending;
+ refcount_inc(&pending->refs);
+ }
+ }
+ pending = p->migration_pending;
+ /*
+ * - !MIGRATE_ENABLE:
+ * we'll have installed a pending if there wasn't one already.
+ *
+ * - MIGRATE_ENABLE:
+ * we're here because the current CPU isn't matching anymore,
+ * the only way that can happen is because of a concurrent
+ * set_cpus_allowed_ptr() call, which should then still be
+ * pending completion.
+ *
+ * Either way, we really should have a @pending here.
+ */
+ if (WARN_ON_ONCE(!pending)) {
+ task_rq_unlock(rq, p, rf);
+ return -EINVAL;
+ }
+
+ if (flags & SCA_MIGRATE_ENABLE) {
+
+ refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+ task_rq_unlock(rq, p, rf);
+
+ pending->arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = -1,
+ .pending = pending,
+ };
+
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+
+ return 0;
+ }
+
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+
+ task_rq_unlock(rq, p, rf);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+ } else {
+
+ if (!is_migration_disabled(p)) {
+ if (task_on_rq_queued(p))
+ rq = move_queued_task(rq, rf, p, dest_cpu);
+
+ p->migration_pending = NULL;
+ complete = true;
+ }
+ task_rq_unlock(rq, p, rf);
+
+do_complete:
+ if (complete)
+ complete_all(&pending->done);
+ }
+
+ wait_for_completion(&pending->done);
+
+ if (refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);
+
+ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+ return 0;
+}
+
+/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
* is removed from the allowed bitmask.
@@ -2009,23 +2199,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
p->nr_cpus_allowed != 1);
}

- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
+ return affine_move_task(rq, p, &rf, dest_cpu, flags);

- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- return 0;
- } else if (task_on_rq_queued(p)) {
- /*
- * OK, since we're going to drop the lock immediately
- * afterwards anyway.
- */
- rq = move_queued_task(rq, &rf, p, dest_cpu);
- }
out:
task_rq_unlock(rq, p, &rf);

@@ -3205,6 +3380,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
+ p->migration_pending = NULL;
#endif
}

2020-11-12 16:41:37

by Qian Cai

Subject: Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

On Fri, 2020-10-23 at 12:12 +0200, Peter Zijlstra wrote:
> Concurrent migrate_disable() and set_cpus_allowed_ptr() has
> interesting features. We rely on set_cpus_allowed_ptr() to not return
> until the task runs inside the provided mask. This expectation is
> exported to userspace.
>
> This means that any set_cpus_allowed_ptr() caller must wait until
> migrate_enable() allows migrations.
>
> At the same time, we don't want migrate_enable() to schedule, due to
> patterns like:
>
> preempt_disable();
> migrate_disable();
> ...
> migrate_enable();
> preempt_enable();
>
> And:
>
> raw_spin_lock(&B);
> spin_unlock(&A);
>
> this means that when migrate_enable() must restore the affinity
> mask, it cannot wait for completion thereof. Luck will have it that
> that is exactly the case where there is a pending
> set_cpus_allowed_ptr(), so let that provide storage for the async stop
> machine.
>
> Much thanks to Valentin who used TLA+ most effective and found lots of
> 'interesting' cases.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> include/linux/sched.h | 1
> kernel/sched/core.c | 234 +++++++++++++++++++++++++++++++++++++++++++-------
> 2 files changed, 205 insertions(+), 30 deletions(-)

Some syscall fuzzing from an unprivileged user started to trigger the hang
below since this commit first appeared in linux-next today. Does it ring any
bells?

[12065.065837][ T1310] INFO: task trinity-c30:91730 blocked for more than 368 seconds.
[12065.073524][ T1310] Tainted: G L 5.10.0-rc3-next-20201112 #2
[12065.081076][ T1310] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[12065.089648][ T1310] task:trinity-c30 state:D stack:26576 pid:91730 ppid: 82688 flags:0x00000000
[12065.098818][ T1310] Call trace:
[12065.101987][ T1310] __switch_to+0xf0/0x1a8
[12065.106227][ T1310] __schedule+0x6ec/0x1708
[12065.110505][ T1310] schedule+0x1bc/0x3b0
[12065.114562][ T1310] schedule_timeout+0x3c4/0x4c0
[12065.119275][ T1310] wait_for_completion+0x13c/0x248
[12065.124257][ T1310] affine_move_task+0x410/0x688
(inlined by) affine_move_task at kernel/sched/core.c:2261
[12065.129013][ T1310] __set_cpus_allowed_ptr+0x1b4/0x370
[12065.134248][ T1310] sched_setaffinity+0x4f0/0x7e8
[12065.139088][ T1310] __arm64_sys_sched_setaffinity+0x1f4/0x2a0
[12065.144972][ T1310] do_el0_svc+0x124/0x228
[12065.149165][ T1310] el0_sync_handler+0x208/0x384
[12065.153876][ T1310] el0_sync+0x140/0x180
[12065.157971][ T1310]
[12065.157971][ T1310] Showing all locks held in the system:
[12065.166401][ T1310] 1 lock held by khungtaskd/1310:
[12065.171288][ T1310] #0: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire.constprop.56+0x0/0x38
[12065.182210][ T1310] 4 locks held by trinity-main/82688:
[12065.187515][ T1310] 2 locks held by kworker/u513:3/82813:
[12065.192922][ T1310] #0: ffff000000419d38 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x69c/0x18c8
[12065.203890][ T1310] #1: ffff0000122bfd40 ((work_completion)(&buf->work)){+.+.}-{0:0}, at: __update_idle_core+0xa8/0x460
[12065.214916][ T1310] 1 lock held by trinity-c35/137168:
[12065.220061][ T1310] #0: ffff0087ce767898 (&tty->ldisc_sem){++++}-{0:0}, at: ldsem_down_read+0x3c/0x48
[12065.229483][ T1310] 3 locks held by trinity-c61/137611:
[12065.234757][ T1310] 1 lock held by trinity-c7/137630:
[12065.239828][ T1310] 1 lock held by trinity-c57/137714:
[12065.242612][T137611] futex_wake_op: trinity-c61 tries to shift op by 1008; fix this program
[12065.245012][ T1310] 1 lock held by trinity-c52/137771:
[12065.258538][ T1310] 2 locks held by trinity-c42/137835:
[12065.263783][ T1310] 4 locks held by trinity-c22/137868:
[12065.269051][ T1310] #0: ffff000e78503798 (&rq->lock){-.-.}-{2:2}, at: newidle_balance+0x92c/0xd78
[12065.278155][ T1310] #1: ffff0087ce767930 (&tty->atomic_write_lock){+.+.}-{3:3}, at: tty_write_lock+0x30/0x58
[12065.288317][ T1310] #2: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: __mutex_lock+0x24c/0x1310
[12065.297592][ T1310] #3: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: lock_page_memcg+0x98/0x240
[12065.307026][ T1310] 2 locks held by trinity-c34/137896:
[12065.312266][ T1310] #0: ffff000e78463798 (&rq->lock){-.-.}-{2:2}, at: __schedule+0x22c/0x1708
[12065.321023][ T1310] #1: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: __update_idle_core+0xa8/0x460
[12065.330663][ T1310] 2 locks held by trinity-c43/137909:
[12065.335996][ T1310] 1 lock held by trinity-c24/137910:
[12065.341164][ T1310] 1 lock held by trinity-c1/137954:
[12065.346272][ T1310] 1 lock held by trinity-c49/138020:
[12065.351425][ T1310] 1 lock held by trinity-c10/138021:
[12065.356649][ T1310] 1 lock held by trinity-c32/138039:
[12065.361813][ T1310] 4 locks held by trinity-c36/138042:
[12065.367129][ T1310] 2 locks held by trinity-c14/138061:
[12065.372378][ T1310] 2 locks held by trinity-c38/138070:
[12065.377688][ T1310] 1 lock held by trinity-c50/138074:
[12065.382885][ T1310] 1 lock held by trinity-c12/138085:
[12065.388186][ T1310] 1 lock held by trinity-c4/138087:
[12065.393272][ T1310] 3 locks held by trinity-c6/138091:
[12065.398492][ T1310] 2 locks held by trinity-c48/138095:
[12065.403757][ T1310] 2 locks held by trinity-c62/138097:
[12065.409045][ T1310] 2 locks held by trinity-main/138107:
[12065.414441][ T1310] 1 lock held by modprobe/138108:
[12065.419351][ T1310]
[12065.421560][ T1310] =============================================
[12065.421560][ T1310]

2020-11-12 17:30:19

by Valentin Schneider

Subject: Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()


On 12/11/20 16:38, Qian Cai wrote:
> Some syscall fuzzing from an unprivileged user starts to trigger this below
> since this commit first appeared in the linux-next today. Does it ring any
> bells?
>

What's the .config? I'm interested in
CONFIG_PREEMPT
CONFIG_PREEMPT_RT
CONFIG_SMP

From a quick look it seems that tree doesn't have Thomas' "generalization" of
migrate_disable(), so if this doesn't have PREEMPT_RT we could forget about
migrate_disable() for now.

> [12065.065837][ T1310] INFO: task trinity-c30:91730 blocked for more than 368 seconds.
> [12065.073524][ T1310] Tainted: G L 5.10.0-rc3-next-20201112 #2
> [12065.081076][ T1310] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [12065.089648][ T1310] task:trinity-c30 state:D stack:26576 pid:91730 ppid: 82688 flags:0x00000000
> [12065.098818][ T1310] Call trace:
> [12065.101987][ T1310] __switch_to+0xf0/0x1a8
> [12065.106227][ T1310] __schedule+0x6ec/0x1708
> [12065.110505][ T1310] schedule+0x1bc/0x3b0
> [12065.114562][ T1310] schedule_timeout+0x3c4/0x4c0
> [12065.119275][ T1310] wait_for_completion+0x13c/0x248
> [12065.124257][ T1310] affine_move_task+0x410/0x688
> (inlined by) affine_move_task at kernel/sched/core.c:2261
> [12065.129013][ T1310] __set_cpus_allowed_ptr+0x1b4/0x370
> [12065.134248][ T1310] sched_setaffinity+0x4f0/0x7e8
> [12065.139088][ T1310] __arm64_sys_sched_setaffinity+0x1f4/0x2a0
> [12065.144972][ T1310] do_el0_svc+0x124/0x228
> [12065.149165][ T1310] el0_sync_handler+0x208/0x384
> [12065.153876][ T1310] el0_sync+0x140/0x180
> [12065.157971][ T1310]

So that's a task changing the affinity of some task (either itself or
another; I can't say without a decoded stacktrace), and then blocking on a
wait_for_completion() that apparently never happens.

I don't see stop_one_cpu() in the trace, so I assume it's the !task_running
case, for which the completion should be completed before getting to the
wait (unless we *do* have migrate_disable()).

Could you please run scripts/decode_stacktrace.sh on the above?
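
Something along these lines should do (run from the kernel tree that built
this kernel; vmlinux is the debug-info-carrying image from that build, and
the file names here are just examples):

  ./scripts/decode_stacktrace.sh vmlinux $(pwd) < splat.txt > splat-decoded.txt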

> [12065.157971][ T1310] Showing all locks held in the system:
> [12065.166401][ T1310] 1 lock held by khungtaskd/1310:
> [12065.171288][ T1310] #0: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire.constprop.56+0x0/0x38
> [12065.182210][ T1310] 4 locks held by trinity-main/82688:
> [12065.187515][ T1310] 2 locks held by kworker/u513:3/82813:
> [12065.192922][ T1310] #0: ffff000000419d38 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x69c/0x18c8
> [12065.203890][ T1310] #1: ffff0000122bfd40 ((work_completion)(&buf->work)){+.+.}-{0:0}, at: __update_idle_core+0xa8/0x460
> [12065.214916][ T1310] 1 lock held by trinity-c35/137168:
> [12065.220061][ T1310] #0: ffff0087ce767898 (&tty->ldisc_sem){++++}-{0:0}, at: ldsem_down_read+0x3c/0x48
> [12065.229483][ T1310] 3 locks held by trinity-c61/137611:
> [12065.234757][ T1310] 1 lock held by trinity-c7/137630:
> [12065.239828][ T1310] 1 lock held by trinity-c57/137714:
> [12065.242612][T137611] futex_wake_op: trinity-c61 tries to shift op by 1008; fix this program
> [12065.245012][ T1310] 1 lock held by trinity-c52/137771:
> [12065.258538][ T1310] 2 locks held by trinity-c42/137835:
> [12065.263783][ T1310] 4 locks held by trinity-c22/137868:
> [12065.269051][ T1310] #0: ffff000e78503798 (&rq->lock){-.-.}-{2:2}, at: newidle_balance+0x92c/0xd78
> [12065.278155][ T1310] #1: ffff0087ce767930 (&tty->atomic_write_lock){+.+.}-{3:3}, at: tty_write_lock+0x30/0x58
> [12065.288317][ T1310] #2: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: __mutex_lock+0x24c/0x1310
> [12065.297592][ T1310] #3: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: lock_page_memcg+0x98/0x240
> [12065.307026][ T1310] 2 locks held by trinity-c34/137896:
> [12065.312266][ T1310] #0: ffff000e78463798 (&rq->lock){-.-.}-{2:2}, at: __schedule+0x22c/0x1708
> [12065.321023][ T1310] #1: ffff800018d0cb40 (rcu_read_lock){....}-{1:2}, at: __update_idle_core+0xa8/0x460
> [12065.330663][ T1310] 2 locks held by trinity-c43/137909:
> [12065.335996][ T1310] 1 lock held by trinity-c24/137910:
> [12065.341164][ T1310] 1 lock held by trinity-c1/137954:
> [12065.346272][ T1310] 1 lock held by trinity-c49/138020:
> [12065.351425][ T1310] 1 lock held by trinity-c10/138021:
> [12065.356649][ T1310] 1 lock held by trinity-c32/138039:
> [12065.361813][ T1310] 4 locks held by trinity-c36/138042:
> [12065.367129][ T1310] 2 locks held by trinity-c14/138061:
> [12065.372378][ T1310] 2 locks held by trinity-c38/138070:
> [12065.377688][ T1310] 1 lock held by trinity-c50/138074:
> [12065.382885][ T1310] 1 lock held by trinity-c12/138085:
> [12065.388186][ T1310] 1 lock held by trinity-c4/138087:
> [12065.393272][ T1310] 3 locks held by trinity-c6/138091:
> [12065.398492][ T1310] 2 locks held by trinity-c48/138095:
> [12065.403757][ T1310] 2 locks held by trinity-c62/138097:
> [12065.409045][ T1310] 2 locks held by trinity-main/138107:
> [12065.414441][ T1310] 1 lock held by modprobe/138108:
> [12065.419351][ T1310]
> [12065.421560][ T1310] =============================================
> [12065.421560][ T1310]

2020-11-12 18:05:09

by Qian Cai

Subject: Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

On Thu, 2020-11-12 at 17:26 +0000, Valentin Schneider wrote:
> On 12/11/20 16:38, Qian Cai wrote:
> > Some syscall fuzzing from an unprivileged user starts to trigger this below
> > since this commit first appeared in the linux-next today. Does it ring any
> > bells?
> >
>
> What's the .config? I'm interested in
> CONFIG_PREEMPT
> CONFIG_PREEMPT_RT
> CONFIG_SMP

https://cailca.coding.net/public/linux/mm/git/files/master/arm64.config

# CONFIG_PREEMPT is not set
CONFIG_SMP=y

Also, I have been able to reproduce this on powerpc as well just now.

>
> From a quick look it seems that tree doesn't have Thomas' "generalization" of
> migrate_disable(), so if this doesn't have PREEMPT_RT we could forget about
> migrate_disable() for now.
>
> > [12065.065837][ T1310] INFO: task trinity-c30:91730 blocked for more than 368 seconds.
> > [12065.073524][ T1310] Tainted: G L 5.10.0-rc3-next-20201112 #2
> > [12065.081076][ T1310] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > [12065.089648][ T1310] task:trinity-c30 state:D stack:26576 pid:91730 ppid: 82688 flags:0x00000000
> > [12065.098818][ T1310] Call trace:
> > [12065.101987][ T1310] __switch_to+0xf0/0x1a8
> > [12065.106227][ T1310] __schedule+0x6ec/0x1708
> > [12065.110505][ T1310] schedule+0x1bc/0x3b0
> > [12065.114562][ T1310] schedule_timeout+0x3c4/0x4c0
> > [12065.119275][ T1310] wait_for_completion+0x13c/0x248
> > [12065.124257][ T1310] affine_move_task+0x410/0x688
> > (inlined by) affine_move_task at kernel/sched/core.c:2261
> > [12065.129013][ T1310] __set_cpus_allowed_ptr+0x1b4/0x370
> > [12065.134248][ T1310] sched_setaffinity+0x4f0/0x7e8
> > [12065.139088][ T1310] __arm64_sys_sched_setaffinity+0x1f4/0x2a0
> > [12065.144972][ T1310] do_el0_svc+0x124/0x228
> > [12065.149165][ T1310] el0_sync_handler+0x208/0x384
> > [12065.153876][ T1310] el0_sync+0x140/0x180
> > [12065.157971][ T1310]
>
> So that's a task changing the affinity of some task (either itself or
> another; I can't say without a decoded stacktrace), and then blocking on a
> wait_for_completion() that apparently never happens.
>
> I don't see stop_one_cpu() in the trace, so I assume it's the !task_running
> case, for which the completion should be completed before getting to the
> wait (unless we *do* have migrate_disable()).
>
> Could you please run scripts/decode_stacktrace.sh on the above?

[12065.101987][ T1310] __switch_to (arch/arm64/kernel/process.c:580)
[12065.106227][ T1310] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019)
[12065.110505][ T1310] schedule (./arch/arm64/include/asm/current.h:19 (discriminator 1) ./arch/arm64/include/asm/preempt.h:53 (discriminator 1) kernel/sched/core.c:5099 (discriminator 1))
[12065.114562][ T1310] schedule_timeout (kernel/time/timer.c:1848)
[12065.119275][ T1310] wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
[12065.124257][ T1310] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263)
[12065.129013][ T1310] __set_cpus_allowed_ptr (kernel/sched/core.c:2353)
[12065.134248][ T1310] sched_setaffinity (kernel/sched/core.c:6460)
[12065.139088][ T1310] __arm64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500)
[12065.144972][ T1310] do_el0_svc (arch/arm64/kernel/syscall.c:36 arch/arm64/kernel/syscall.c:48 arch/arm64/kernel/syscall.c:159 arch/arm64/kernel/syscall.c:205)
[12065.149165][ T1310] el0_sync_handler (arch/arm64/kernel/entry-common.c:236 arch/arm64/kernel/entry-common.c:254)
[12065.153876][ T1310] el0_sync (arch/arm64/kernel/entry.S:741)

== powerpc ==
[18060.020301][ T676] [c000200014227670] [c000000000a6d1e8] __func__.5350+0x1220e0/0x181338 unreliable
[18060.020333][ T676] [c000200014227850] [c00000000001a278] __switch_to (arch/powerpc/kernel/process.c:1273)
[18060.020351][ T676] [c0002000142278c0] [c0000000008f3e94] __schedule (kernel/sched/core.c:4269 kernel/sched/core.c:5019)
[18060.020377][ T676] [c000200014227990] [c0000000008f4638] schedule (./include/asm-generic/preempt.h:59 (discriminator 1) kernel/sched/core.c:5099 (discriminator 1))
[18060.020394][ T676] [c0002000142279c0] [c0000000008fbd34] schedule_timeout (kernel/time/timer.c:1847)
[18060.020420][ T676] [c000200014227ac0] [c0000000008f6398] wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
[18060.020455][ T676] [c000200014227b30] [c000000000100fd4] affine_move_task (kernel/sched/core.c:2261)
[18060.020481][ T676] [c000200014227c90] [c000000000101444] __set_cpus_allowed_ptr (kernel/sched/core.c:2353)
[18060.020507][ T676] [c000200014227d00] [c000000000106eac] sched_setaffinity (kernel/sched/core.c:6460)
[18060.020533][ T676] [c000200014227d70] [c000000000107134] sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500)
[18060.020559][ T676] [c000200014227dc0] [c00000000002a6d8] system_call_exception (arch/powerpc/kernel/syscall_64.c:111)
[18060.020585][ T676] [c000200014227e20] [c00000000000d0a8] system_call_common (arch/powerpc/kernel/entry_64.S:302)


2020-11-12 18:39:55

by Qian Cai

Subject: Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

On Thu, 2020-11-12 at 17:26 +0000, Valentin Schneider wrote:
> On 12/11/20 16:38, Qian Cai wrote:
> > Some syscall fuzzing from an unprivileged user starts to trigger this below
> > since this commit first appeared in the linux-next today. Does it ring any
> > bells?

X86 in a KVM guest as well.

guest .config:
https://cailca.coding.net/public/linux/mm/git/files/master/x86.config

To reproduce:

# /usr/libexec/qemu-kvm -name kata -cpu host -smp 48 -m 48g -hda rhel-8.3-x86_64-kvm.img.qcow2 -cdrom kata.iso -nic user,hostfwd=tcp::2222-:22 -nographic

== inside the guest ===
# git clone https://e.coding.net/cailca/linux/mm
# cd mm; make
# ./random -x 0-100 -f

[17213.432777][ T348] INFO: task trinity-c7:216885 can't die for more than 122 seconds.
[17213.434895][ T348] task:trinity-c7 state:D stack:27088 pid:216885 ppid:103237 flags:0x00004004
[17213.437297][ T348] Call Trace:
[17213.438142][ T348] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019)
[17213.439256][ T348] ? __sched_text_start (kernel/sched/core.c:4901)
[17213.440477][ T348] schedule (./arch/x86/include/asm/current.h:15 (discriminator 1) ./include/linux/sched.h:1892 (discriminator 1) kernel/sched/core.c:5100 (discriminator 1))
[17213.441501][ T348] schedule_timeout (kernel/time/timer.c:1848)
[17213.442834][ T348] ? usleep_range (kernel/time/timer.c:1833)
[17213.444070][ T348] ? wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
[17213.445457][ T348] ? lock_downgrade (kernel/locking/lockdep.c:5443)
[17213.446695][ T348] ? rcu_read_unlock (./include/linux/rcupdate.h:692 (discriminator 5))
[17213.447911][ T348] ? do_raw_spin_lock (./arch/x86/include/asm/atomic.h:202 ./include/asm-generic/atomic-instrumented.h:707 ./include/asm-generic/qspinlock.h:82 kernel/locking/spinlock_debug.c:113)
[17213.449190][ T348] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4036 kernel/locking/lockdep.c:4096 kernel/locking/lockdep.c:4048)
[17213.450714][ T348] ? _raw_spin_unlock_irq (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 ./include/linux/spinlock_api_smp.h:168 kernel/locking/spinlock.c:199)
[17213.452042][ T348] wait_for_completion (kernel/sched/completion.c:86 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
[17213.453468][ T348] ? wait_for_completion_interruptible (kernel/sched/completion.c:137)
[17213.455152][ T348] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4036 kernel/locking/lockdep.c:4096 kernel/locking/lockdep.c:4048)
[17213.456651][ T348] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191)
[17213.458115][ T348] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263)
[17213.459313][ T348] ? move_queued_task (kernel/sched/core.c:2151)
[17213.460553][ T348] ? update_curr (kernel/sched/sched.h:1176 kernel/sched/fair.c:845)
[17213.461684][ T348] ? enqueue_entity (kernel/sched/fair.c:4247)
[17213.463001][ T348] ? set_next_task_fair (./arch/x86/include/asm/jump_label.h:25 (discriminator 2) ./include/linux/jump_label.h:200 (discriminator 2) kernel/sched/fair.c:4567 (discriminator 2) kernel/sched/fair.c:4683 (discriminator 2) kernel/sched/fair.c:10953 (discriminator 2))
[17213.464294][ T348] __set_cpus_allowed_ptr (kernel/sched/core.c:2353)
[17213.465668][ T348] ? affine_move_task (kernel/sched/core.c:2287)
[17213.466952][ T348] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4036 kernel/locking/lockdep.c:4096 kernel/locking/lockdep.c:4048)
[17213.468452][ T348] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191)
[17213.469908][ T348] sched_setaffinity (kernel/sched/core.c:6460)
[17213.471127][ T348] ? __ia32_sys_sched_getattr (kernel/sched/core.c:6393)
[17213.472644][ T348] ? _copy_from_user (./arch/x86/include/asm/uaccess_64.h:46 ./arch/x86/include/asm/uaccess_64.h:52 lib/usercopy.c:16)
[17213.473850][ T348] __x64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500)
[17213.475307][ T348] ? sched_setaffinity (kernel/sched/core.c:6500)
[17213.476542][ T348] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4036 kernel/locking/lockdep.c:4096 kernel/locking/lockdep.c:4048)
[17213.477991][ T348] ? syscall_enter_from_user_mode (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 kernel/entry/common.c:98)
[17213.479428][ T348] ? trace_hardirqs_on (kernel/trace/trace_preemptirq.c:50 (discriminator 22))
[17213.480642][ T348] do_syscall_64 (arch/x86/entry/common.c:46)
[17213.481706][ T348] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:127)
[17213.483236][ T348] RIP: 0033:0x7f2f00ebe78d

2020-11-12 19:33:33

by Valentin Schneider

Subject: Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()


On 12/11/20 18:01, Qian Cai wrote:
> On Thu, 2020-11-12 at 17:26 +0000, Valentin Schneider wrote:
>> On 12/11/20 16:38, Qian Cai wrote:
>> > Some syscall fuzzing from an unprivileged user starts to trigger this below
>> > since this commit first appeared in the linux-next today. Does it ring any
>> > bells?
>> >
>>
>> What's the .config? I'm interested in
>> CONFIG_PREEMPT
>> CONFIG_PREEMPT_RT
>> CONFIG_SMP
>
> https://cailca.coding.net/public/linux/mm/git/files/master/arm64.config
>
> # CONFIG_PREEMPT is not set
> CONFIG_SMP=y
>

So that's CONFIG_PREEMPT_NONE=y

> Also, I have been able to reproduce this on powerpc as well just now.
>
[...]
>
> [12065.101987][ T1310] __switch_to (arch/arm64/kernel/process.c:580)
> [12065.106227][ T1310] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019)
> [12065.110505][ T1310] schedule (./arch/arm64/include/asm/current.h:19 (discriminator 1) ./arch/arm64/include/asm/preempt.h:53 (discriminator 1) kernel/sched/core.c:5099 (discriminator 1))
> [12065.114562][ T1310] schedule_timeout (kernel/time/timer.c:1848)
> [12065.119275][ T1310] wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
> [12065.124257][ T1310] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263)
> [12065.129013][ T1310] __set_cpus_allowed_ptr (kernel/sched/core.c:2353)
> [12065.134248][ T1310] sched_setaffinity (kernel/sched/core.c:6460)
> [12065.139088][ T1310] __arm64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500)
> [12065.144972][ T1310] do_el0_svc (arch/arm64/kernel/syscall.c:36 arch/arm64/kernel/syscall.c:48 arch/arm64/kernel/syscall.c:159 arch/arm64/kernel/syscall.c:205)
> [12065.149165][ T1310] el0_sync_handler (arch/arm64/kernel/entry-common.c:236 arch/arm64/kernel/entry-common.c:254)
> [12065.153876][ T1310] el0_sync (arch/arm64/kernel/entry.S:741)
>

Thanks!

One thing I don't get: that trace shows refcount_dec_and_test()
(kernel/sched/core.c:2263) happening before the wait_for_completion(). It's
not the case in the below trace.

> == powerpc ==
> [18060.020301][ T676] [c000200014227670] [c000000000a6d1e8] __func__.5350+0x1220e0/0x181338 unreliable
> [18060.020333][ T676] [c000200014227850] [c00000000001a278] __switch_to (arch/powerpc/kernel/process.c:1273)
> [18060.020351][ T676] [c0002000142278c0] [c0000000008f3e94] __schedule (kernel/sched/core.c:4269 kernel/sched/core.c:5019)
> [18060.020377][ T676] [c000200014227990] [c0000000008f4638] schedule (./include/asm-generic/preempt.h:59 (discriminator 1) kernel/sched/core.c:5099 (discriminator 1))
> [18060.020394][ T676] [c0002000142279c0] [c0000000008fbd34] schedule_timeout (kernel/time/timer.c:1847)
> [18060.020420][ T676] [c000200014227ac0] [c0000000008f6398] wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
> [18060.020455][ T676] [c000200014227b30] [c000000000100fd4] affine_move_task (kernel/sched/core.c:2261)
> [18060.020481][ T676] [c000200014227c90] [c000000000101444] __set_cpus_allowed_ptr (kernel/sched/core.c:2353)
> [18060.020507][ T676] [c000200014227d00] [c000000000106eac] sched_setaffinity (kernel/sched/core.c:6460)
> [18060.020533][ T676] [c000200014227d70] [c000000000107134] sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500)
> [18060.020559][ T676] [c000200014227dc0] [c00000000002a6d8] system_call_exception (arch/powerpc/kernel/syscall_64.c:111)
> [18060.020585][ T676] [c000200014227e20] [c00000000000d0a8] system_call_common (arch/powerpc/kernel/entry_64.S:302)

I take back what I said in that previous email; we could have gone through
the task_running() stop_one_cpu() and *then* hit the

wait_for_completion(&pending->done);

and that is actually the only case that makes sense to me here, because the
!task_running() one will do the completion before waiting (that kernel has
no way to make a task Migration Disabled).

I think what is happening here is:

affine_move_task()
// task_running() case
stop_one_cpu()
wait_for_completion(&pending->done);

and this is !PREEMPT, so the stopper can very well hit:

migration_cpu_stop()
// Task moved between unlocks and scheduling the stopper
task_rq(p) != rq &&
// task_running() case
dest_cpu >= 0

=> no complete_all(), ever :(

This is an annoying case because we didn't have to bother about it before;
a rq mismatch meant the task was fine, because we modified
->cpus_allowed_mask prior.

With migrate_disable(), we have to chase after the bloody task because
we have to preempt it to get a stable is_migration_disabled() reading. It
could have been Migration Disabled, got some pending installed, got out of
Migration Disabled, moved around, gone Migration Disabled again and got
some more pending before we get to run the stopper :(

a) Do you also get this on CONFIG_PREEMPT=y?
b) Could you try the below?

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 02076e6d3792..fad0a8e62aca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1923,7 +1923,7 @@ static int migration_cpu_stop(void *data)
else
p->wake_cpu = dest_cpu;

- } else if (dest_cpu < 0) {
+ } else if (dest_cpu < 0 || pending) {
/*
* This happens when we get migrated between migrate_enable()'s
* preempt_enable() and scheduling the stopper task. At that
@@ -1933,6 +1933,17 @@ static int migration_cpu_stop(void *data)
* more likely.
*/

+ /*
+ * The task moved before the stopper got to run. We're holding
+ * ->pi_lock, so the allowed mask is stable - if it got
+ * somewhere allowed, we're done.
+ */
+ if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ p->migration_pending = NULL;
+ complete = true;
+ goto out;
+ }
+
/*
* When this was migrate_enable() but we no longer have an
* @pending, a concurrent SCA 'fixed' things and we should be

2020-11-12 19:43:32

by Qian Cai

Subject: Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

On Thu, 2020-11-12 at 19:31 +0000, Valentin Schneider wrote:
> One thing I don't get: that trace shows refcount_dec_and_test()
> (kernel/sched/core.c:2263) happening before the wait_for_completion(). It's
> not the case in the below trace.

Yes, that is normal. Sometimes the decoding is a bit off; I'm not sure why,
perhaps because some debugging options like KASAN obscure it.

> a) Do you also get this on CONFIG_PREEMPT=y?

I don't know. None of the systems here has that, but I could probably try.

> b) Could you try the below?

Let me run it and report.

>
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 02076e6d3792..fad0a8e62aca 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1923,7 +1923,7 @@ static int migration_cpu_stop(void *data)
> else
> p->wake_cpu = dest_cpu;
>
> - } else if (dest_cpu < 0) {
> + } else if (dest_cpu < 0 || pending) {
> /*
> * This happens when we get migrated between migrate_enable()'s
> * preempt_enable() and scheduling the stopper task. At that
> @@ -1933,6 +1933,17 @@ static int migration_cpu_stop(void *data)
> * more likely.
> */
>
> + /*
> + * The task moved before the stopper got to run. We're holding
> + * ->pi_lock, so the allowed mask is stable - if it got
> + * somewhere allowed, we're done.
> + */
> + if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
> + p->migration_pending = NULL;
> + complete = true;
> + goto out;
> + }
> +
> /*
> * When this was migrate_enable() but we no longer have an
> * @pending, a concurrent SCA 'fixed' things and we should be
>

2020-11-12 21:20:55

by Qian Cai

Subject: Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

On Thu, 2020-11-12 at 19:31 +0000, Valentin Schneider wrote:
> a) Do you also get this on CONFIG_PREEMPT=y?

This also happens with:

CONFIG_PREEMPT=y
CONFIG_PREEMPTION=y
CONFIG_PREEMPT_RCU=y
CONFIG_PREEMPT_NOTIFIERS=y
CONFIG_DEBUG_PREEMPT=y
CONFIG_PREEMPTIRQ_TRACEPOINTS=y

[ 1235.044945][ T330] INFO: task trinity-c4:60050 blocked for more than 245 seconds.
[ 1235.052540][ T330] Not tainted 5.10.0-rc3-next-20201112+ #2
[ 1235.058774][ T330] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1235.067392][ T330] task:trinity-c4 state:D stack:26880 pid:60050 ppid: 1722 flags:0x00004000
[ 1235.076505][ T330] Call Trace:
[ 1235.079680][ T330] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019)
[ 1235.083971][ T330] ? __sched_text_start (kernel/sched/core.c:4901)
[ 1235.088721][ T330] schedule (kernel/sched/core.c:5099 (discriminator 1))
[ 1235.092661][ T330] schedule_timeout (kernel/time/timer.c:1848)
[ 1235.097399][ T330] ? usleep_range (kernel/time/timer.c:1833)
[ 1235.101945][ T330] ? wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
[ 1235.107156][ T330] ? lock_downgrade (kernel/locking/lockdep.c:5443)
[ 1235.111883][ T330] ? rcu_read_unlock (./include/linux/rcupdate.h:692 (discriminator 5))
[ 1235.116561][ T330] ? do_raw_spin_lock (./arch/x86/include/asm/atomic.h:202 ./include/asm-generic/atomic-instrumented.h:707 ./include/asm-generic/qspinlock.h:82 kernel/locking/spinlock_debug.c:113)
[ 1235.121459][ T330] ? _raw_spin_unlock_irq (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 ./include/linux/spinlock_api_smp.h:168 kernel/locking/spinlock.c:199)
[ 1235.126601][ T330] wait_for_completion (kernel/sched/completion.c:86 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
[ 1235.131591][ T330] ? wait_for_completion_interruptible (kernel/sched/completion.c:137)
[ 1235.138013][ T330] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191)
[ 1235.143698][ T330] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263)
[ 1235.148451][ T330] ? move_queued_task (kernel/sched/core.c:2151)
[ 1235.153351][ T330] ? update_curr (kernel/sched/sched.h:1176 kernel/sched/fair.c:845)
[ 1235.157848][ T330] ? enqueue_entity (kernel/sched/fair.c:4247)
[ 1235.162658][ T330] ? set_next_task_fair (./arch/x86/include/asm/jump_label.h:25 (discriminator 2) ./include/linux/jump_label.h:200 (discriminator 2) kernel/sched/fair.c:4567 (discriminator 2) kernel/sched/fair.c:4683 (discriminator 2) kernel/sched/fair.c:10953 (discriminator 2))
[ 1235.167667][ T330] __set_cpus_allowed_ptr (kernel/sched/core.c:2353)
[ 1235.172905][ T330] ? affine_move_task (kernel/sched/core.c:2287)
[ 1235.177826][ T330] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191)
[ 1235.183501][ T330] sched_setaffinity (kernel/sched/core.c:6460)
[ 1235.188345][ T330] ? __ia32_sys_sched_getattr (kernel/sched/core.c:6393)
[ 1235.193937][ T330] ? _copy_from_user (./arch/x86/include/asm/uaccess_64.h:46 ./arch/x86/include/asm/uaccess_64.h:52 lib/usercopy.c:16)
[ 1235.198605][ T330] __x64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500)
[ 1235.204291][ T330] ? sched_setaffinity (kernel/sched/core.c:6500)
[ 1235.209324][ T330] ? syscall_enter_from_user_mode (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 kernel/entry/common.c:98)
[ 1235.215133][ T330] do_syscall_64 (arch/x86/entry/common.c:46)
[ 1235.219431][ T330] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:127)
[ 1235.225251][ T330] RIP: 0033:0x7fb102b1178d

> b) Could you try the below?

It has been running well so far on multiple systems. I'll keep it running and
report back if it happens again.

2020-11-12 21:28:36

by Valentin Schneider

Subject: Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()


On 12/11/20 20:37, Qian Cai wrote:
> On Thu, 2020-11-12 at 19:31 +0000, Valentin Schneider wrote:
>> a) Do you also get this on CONFIG_PREEMPT=y?
>
> This also happens with:
>
> CONFIG_PREEMPT=y
> CONFIG_PREEMPTION=y
> CONFIG_PREEMPT_RCU=y
> CONFIG_PREEMPT_NOTIFIERS=y
> CONFIG_DEBUG_PREEMPT=y
> CONFIG_PREEMPTIRQ_TRACEPOINTS=y
>

Hmph, it should be much less likely to happen with PREEMPT=y, but isn't per
se impossible. Thanks for giving it a shot.

> [ 1235.044945][ T330] INFO: task trinity-c4:60050 blocked for more than 245 seconds.
> [ 1235.052540][ T330] Not tainted 5.10.0-rc3-next-20201112+ #2
> [ 1235.058774][ T330] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 1235.067392][ T330] task:trinity-c4 state:D stack:26880 pid:60050 ppid: 1722 flags:0x00004000
> [ 1235.076505][ T330] Call Trace:
> [ 1235.079680][ T330] __schedule (kernel/sched/core.c:4272 kernel/sched/core.c:5019)
> [ 1235.083971][ T330] ? __sched_text_start (kernel/sched/core.c:4901)
> [ 1235.088721][ T330] schedule (kernel/sched/core.c:5099 (discriminator 1))
> [ 1235.092661][ T330] schedule_timeout (kernel/time/timer.c:1848)
> [ 1235.097399][ T330] ? usleep_range (kernel/time/timer.c:1833)
> [ 1235.101945][ T330] ? wait_for_completion (kernel/sched/completion.c:85 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
> [ 1235.107156][ T330] ? lock_downgrade (kernel/locking/lockdep.c:5443)
> [ 1235.111883][ T330] ? rcu_read_unlock (./include/linux/rcupdate.h:692 (discriminator 5))
> [ 1235.116561][ T330] ? do_raw_spin_lock (./arch/x86/include/asm/atomic.h:202 ./include/asm-generic/atomic-instrumented.h:707 ./include/asm-generic/qspinlock.h:82 kernel/locking/spinlock_debug.c:113)
> [ 1235.121459][ T330] ? _raw_spin_unlock_irq (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 ./include/linux/spinlock_api_smp.h:168 kernel/locking/spinlock.c:199)
> [ 1235.126601][ T330] wait_for_completion (kernel/sched/completion.c:86 kernel/sched/completion.c:106 kernel/sched/completion.c:117 kernel/sched/completion.c:138)
> [ 1235.131591][ T330] ? wait_for_completion_interruptible (kernel/sched/completion.c:137)
> [ 1235.138013][ T330] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191)
> [ 1235.143698][ T330] affine_move_task (./include/linux/instrumented.h:101 ./include/asm-generic/atomic-instrumented.h:220 ./include/linux/refcount.h:272 ./include/linux/refcount.h:315 ./include/linux/refcount.h:333 kernel/sched/core.c:2263)
> [ 1235.148451][ T330] ? move_queued_task (kernel/sched/core.c:2151)
> [ 1235.153351][ T330] ? update_curr (kernel/sched/sched.h:1176 kernel/sched/fair.c:845)
> [ 1235.157848][ T330] ? enqueue_entity (kernel/sched/fair.c:4247)
> [ 1235.162658][ T330] ? set_next_task_fair (./arch/x86/include/asm/jump_label.h:25 (discriminator 2) ./include/linux/jump_label.h:200 (discriminator 2) kernel/sched/fair.c:4567 (discriminator 2) kernel/sched/fair.c:4683 (discriminator 2) kernel/sched/fair.c:10953 (discriminator 2))
> [ 1235.167667][ T330] __set_cpus_allowed_ptr (kernel/sched/core.c:2353)
> [ 1235.172905][ T330] ? affine_move_task (kernel/sched/core.c:2287)
> [ 1235.177826][ T330] ? _raw_spin_unlock_irqrestore (./include/linux/spinlock_api_smp.h:160 kernel/locking/spinlock.c:191)
> [ 1235.183501][ T330] sched_setaffinity (kernel/sched/core.c:6460)
> [ 1235.188345][ T330] ? __ia32_sys_sched_getattr (kernel/sched/core.c:6393)
> [ 1235.193937][ T330] ? _copy_from_user (./arch/x86/include/asm/uaccess_64.h:46 ./arch/x86/include/asm/uaccess_64.h:52 lib/usercopy.c:16)
> [ 1235.198605][ T330] __x64_sys_sched_setaffinity (kernel/sched/core.c:6511 kernel/sched/core.c:6500 kernel/sched/core.c:6500)
> [ 1235.204291][ T330] ? sched_setaffinity (kernel/sched/core.c:6500)
> [ 1235.209324][ T330] ? syscall_enter_from_user_mode (./arch/x86/include/asm/irqflags.h:54 ./arch/x86/include/asm/irqflags.h:94 kernel/entry/common.c:98)
> [ 1235.215133][ T330] do_syscall_64 (arch/x86/entry/common.c:46)
> [ 1235.219431][ T330] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:127)
> [ 1235.225251][ T330] RIP: 0033:0x7fb102b1178d
>
>> b) Could you try the below?
>
> It is running good so far on multiple systems. I'll keep it running and report
> back if it happens again.

Thanks! All of this is somewhat fragile, so I'll want to have another look
with a fresher mind; if the diff makes a difference at least it'll mean
I wasn't completely off.

2020-11-13 10:30:01

by Peter Zijlstra

Subject: Re: [PATCH v4 10/19] sched: Fix migrate_disable() vs set_cpus_allowed_ptr()

On Thu, Nov 12, 2020 at 07:31:12PM +0000, Valentin Schneider wrote:

> I think what is happening here is:
>
> affine_move_task()
> // task_running() case
> stop_one_cpu()
> wait_for_completion(&pending->done);
>
> and this is !PREEMPT, so the stopper can very well hit:
>
> migration_cpu_stop()
> // Task moved between unlocks and scheduling the stopper
> task_rq(p) != rq &&
> // task_running() case
> dest_cpu >= 0
>
> => no complete_all(), ever :(

Damn...

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 02076e6d3792..fad0a8e62aca 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1923,7 +1923,7 @@ static int migration_cpu_stop(void *data)
> else
> p->wake_cpu = dest_cpu;
>
> - } else if (dest_cpu < 0) {
> + } else if (dest_cpu < 0 || pending) {
> /*
> * This happens when we get migrated between migrate_enable()'s
> * preempt_enable() and scheduling the stopper task. At that
> @@ -1933,6 +1933,17 @@ static int migration_cpu_stop(void *data)
> * more likely.
> */
>
> + /*
> + * The task moved before the stopper got to run. We're holding
> + * ->pi_lock, so the allowed mask is stable - if it got
> + * somewhere allowed, we're done.
> + */
> + if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
> + p->migration_pending = NULL;
> + complete = true;
> + goto out;
> + }
> +
> /*
> * When this was migrate_enable() but we no longer have an
> * @pending, a concurrent SCA 'fixed' things and we should be

Agreed, this is very clearly a missing case and the proposed solution
seems straightforward enough; but I'm struggling to convince my
sleep-deprived brain we're actually complete now.

I'll continue staring at it a little more. Could you make it into a
proper patch?

2020-11-20 12:37:09

by tip-bot2 for Valentin Schneider

Subject: [tip: sched/core] sched/core: Add missing completion for affine_move_task() waiters

The following commit has been merged into the sched/core branch of tip:

Commit-ID: d707faa64d03d26b529cc4aea59dab1b016d4d33
Gitweb: https://git.kernel.org/tip/d707faa64d03d26b529cc4aea59dab1b016d4d33
Author: Valentin Schneider <[email protected]>
AuthorDate: Fri, 13 Nov 2020 11:24:14
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 19 Nov 2020 11:25:45 +01:00

sched/core: Add missing completion for affine_move_task() waiters

Qian reported that some fuzzer issuing sched_setaffinity() ends up stuck on
a wait_for_completion(). The problematic pattern seems to be:

affine_move_task()
// task_running() case
stop_one_cpu();
wait_for_completion(&pending->done);

Combined with, on the stopper side:

migration_cpu_stop()
// Task moved between unlocks and scheduling the stopper
task_rq(p) != rq &&
// task_running() case
dest_cpu >= 0

=> no complete_all()

This can happen with both PREEMPT and !PREEMPT, although !PREEMPT should
be more likely to see this given the targeted task has a much bigger window
to block and be woken up elsewhere before the stopper runs.

Make migration_cpu_stop() always look at pending affinity requests; signal
their completion if the stopper hits a rq mismatch but the task is
still within its allowed mask. When Migrate-Disable isn't involved, this
matches the previous set_cpus_allowed_ptr() vs migration_cpu_stop()
behaviour.

Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
Reported-by: Qian Cai <[email protected]>
Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lore.kernel.org/lkml/[email protected]
---
kernel/sched/core.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6aaf9f..4d1fd4b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1923,7 +1923,7 @@ static int migration_cpu_stop(void *data)
else
p->wake_cpu = dest_cpu;

- } else if (dest_cpu < 0) {
+ } else if (dest_cpu < 0 || pending) {
/*
* This happens when we get migrated between migrate_enable()'s
* preempt_enable() and scheduling the stopper task. At that
@@ -1934,6 +1934,17 @@ static int migration_cpu_stop(void *data)
*/

/*
+ * The task moved before the stopper got to run. We're holding
+ * ->pi_lock, so the allowed mask is stable - if it got
+ * somewhere allowed, we're done.
+ */
+ if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ p->migration_pending = NULL;
+ complete = true;
+ goto out;
+ }
+
+ /*
* When this was migrate_enable() but we no longer have an
* @pending, a concurrent SCA 'fixed' things and we should be
* valid again. Nothing to do.