2021-02-24 14:47:20

by Peter Zijlstra

Subject: [PATCH 6/6] sched: Simplify set_affinity_pending refcounts

Now that we have set_affinity_pending::stop_pending to indicate if a
stopper is in progress, and we have the guarantee that if that stopper
exists, it will (eventually) complete our @pending, we can simplify the
refcount scheme by no longer counting the stopper thread.

Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 32 ++++++++++++++++++++------------
1 file changed, 20 insertions(+), 12 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1862,6 +1862,10 @@ struct migration_arg {
struct set_affinity_pending *pending;
};

+/*
+ * @refs: number of wait_for_completion()
+ * @stop_pending: is @stop_work in use
+ */
struct set_affinity_pending {
refcount_t refs;
unsigned int stop_pending;
@@ -1997,10 +2001,6 @@ static int migration_cpu_stop(void *data
if (complete)
complete_all(&pending->done);

- /* For pending->{arg,stop_work} */
- if (pending && refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
-
return 0;
}

@@ -2199,12 +2199,16 @@ static int affine_move_task(struct rq *r
push_task = get_task_struct(p);
}

+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
pending = p->migration_pending;
- if (pending) {
- refcount_inc(&pending->refs);
+ if (pending && !pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
+
task_rq_unlock(rq, p, rf);

if (push_task) {
@@ -2213,7 +2217,7 @@ static int affine_move_task(struct rq *r
}

if (complete)
- goto do_complete;
+ complete_all(&pending->done);

return 0;
}
@@ -2264,9 +2268,9 @@ static int affine_move_task(struct rq *r
if (!stop_pending)
pending->stop_pending = true;

- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
if (flags & SCA_MIGRATE_ENABLE)
p->migration_flags &= ~MDF_PUSH;
+
task_rq_unlock(rq, p, rf);

if (!stop_pending) {
@@ -2282,12 +2286,13 @@ static int affine_move_task(struct rq *r
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);

- p->migration_pending = NULL;
- complete = true;
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
}
task_rq_unlock(rq, p, rf);

-do_complete:
if (complete)
complete_all(&pending->done);
}
@@ -2295,7 +2300,7 @@ static int affine_move_task(struct rq *r
wait_for_completion(&pending->done);

if (refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
+ wake_up_var(&pending->refs); /* No UaF, just an address */

/*
* Block the original owner of &pending until all subsequent callers
@@ -2303,6 +2308,9 @@ static int affine_move_task(struct rq *r
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));

+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
return 0;
}
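
The pattern being removed is worth spelling out: @pending lives on the stack
of the first waiter, its lifetime guarded by a refcount, while
wake_up_var()/wait_var_event() keep that stack frame alive until the last
reference is dropped. A minimal sketch of the idiom as it stands after this
patch (my_pending/wait_on_pending are illustrative names, not the actual
scheduler code):

---
#include <linux/completion.h>
#include <linux/refcount.h>
#include <linux/wait_bit.h>

struct my_pending {
	refcount_t		refs;	/* one ref per wait_for_completion() */
	struct completion	done;
};

static void wait_on_pending(struct my_pending *pending, bool is_owner)
{
	/* Block until someone signals the request as done. */
	wait_for_completion(&pending->done);

	/*
	 * Drop our waiter reference. The wake-up is keyed on an address,
	 * not a dereference, so this is not a use-after-free even if the
	 * owner is already on its way out.
	 */
	if (refcount_dec_and_test(&pending->refs))
		wake_up_var(&pending->refs);

	/*
	 * The stack owner must not return (and thereby free the object)
	 * until every other waiter has dropped its reference.
	 */
	if (is_owner)
		wait_var_event(&pending->refs,
			       !refcount_read(&pending->refs));
}
---

With this patch only wait_for_completion() callers hold such a reference; the
stopper no longer takes one because, per the changelog, an in-flight stopper
is guaranteed to (eventually) complete @pending.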




2021-02-24 15:52:37

by Peter Zijlstra

Subject: Re: [PATCH 6/6] sched: Simplify set_affinity_pending refcounts

On Wed, Feb 24, 2021 at 01:24:45PM +0100, Peter Zijlstra wrote:
> @@ -2199,12 +2199,16 @@ static int affine_move_task(struct rq *r
> push_task = get_task_struct(p);
> }
>
> + /*
> + * If there are pending waiters, but no pending stop_work,
> + * then complete now.
> + */
> pending = p->migration_pending;
> + if (pending && !pending->stop_pending) {
> p->migration_pending = NULL;
> complete = true;
> }

> @@ -2282,12 +2286,13 @@ static int affine_move_task(struct rq *r
> if (task_on_rq_queued(p))
> rq = move_queued_task(rq, rf, p, dest_cpu);
>
> + if (!pending->stop_pending) {
> + p->migration_pending = NULL;
> + complete = true;
> + }
> }
> task_rq_unlock(rq, p, rf);

Elsewhere Valentin argued something like the below ought to be possible.
I've not drawn diagrams yet, but if I understood his argument right, it
should be possible.

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1c56ac4df2c9..3ffbd1b76f3e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2204,9 +2204,10 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
* then complete now.
*/
pending = p->migration_pending;
- if (pending && !pending->stop_pending) {
+ if (pending) {
p->migration_pending = NULL;
- complete = true;
+ if (!pending->stop_pending)
+ complete = true;
}

task_rq_unlock(rq, p, rf);
@@ -2286,10 +2287,9 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);

- if (!pending->stop_pending) {
- p->migration_pending = NULL;
+ p->migration_pending = NULL;
+ if (!pending->stop_pending)
complete = true;
- }
}
task_rq_unlock(rq, p, rf);

2021-02-24 15:56:08

by Valentin Schneider

Subject: Re: [PATCH 6/6] sched: Simplify set_affinity_pending refcounts

On 24/02/21 13:24, Peter Zijlstra wrote:
> Now that we have set_affinity_pending::stop_pending to indicate if a
> stopper is in progress, and we have the guarantee that if that stopper
> exists, it will (eventually) complete our @pending, we can simplify the
> refcount scheme by no longer counting the stopper thread.
>
> Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> @@ -2199,12 +2199,16 @@ static int affine_move_task(struct rq *r
> push_task = get_task_struct(p);
> }
>
> + /*
> + * If there are pending waiters, but no pending stop_work,
> + * then complete now.
> + */
> pending = p->migration_pending;
> - if (pending) {
> - refcount_inc(&pending->refs);
> + if (pending && !pending->stop_pending) {
> p->migration_pending = NULL;
> complete = true;
> }
> +
> task_rq_unlock(rq, p, rf);
>
> if (push_task) {
> @@ -2213,7 +2217,7 @@ static int affine_move_task(struct rq *r
> }
>
> if (complete)
> - goto do_complete;
> + complete_all(&pending->done);

We could've done this in the first place, right? I don't think this path
actually needed to deal with the refcounts (at least not since we started
counting the stoppers).

Musings aside, I believe the above means, for migration_cpu_stop():

(pending != NULL) => (pending == p->migration_pending)

since, when ->stop_pending is set, only the stopper can uninstall
p->migration_pending. This could simplify a few ifs.
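
Concretely, that invariant lets the stopper assert ownership and drop the
equality checks. A sketch of the simplification (effectively what a later
patch in this thread merges):

---
/* In migration_cpu_stop(), with both p->pi_lock and rq->lock held: */

/*
 * If we were passed a pending, then ->stop_pending was set, thus
 * p->migration_pending must have remained stable.
 */
WARN_ON_ONCE(pending && pending != p->migration_pending);

if (pending) {
	/* No need to first check p->migration_pending == pending. */
	p->migration_pending = NULL;
	complete = true;
}
---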

Also, the fatty comment above affine_move_task() probably needs a bit of
gardening:

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9492f8eb242a..6f649aa2795c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2165,16 +2165,21 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
*
* (1) In the cases covered above. There is one more where the completion is
* signaled within affine_move_task() itself: when a subsequent affinity request
- * cancels the need for an active migration. Consider:
+ * occurs after the stopper bailed out due to the targeted task still being
+ * Migrate-Disable. Consider:
*
* Initial conditions: P0->cpus_mask = [0, 1]
*
- * P0@CPU0 P1 P2
- *
- * migrate_disable();
- * <preempted>
+ * CPU0 P1 P2
+ * <P0>
+ * migrate_disable();
+ * <preempted>
* set_cpus_allowed_ptr(P0, [1]);
* <blocks>
+ * <migration/0>
+ * migration_cpu_stop()
+ * is_migration_disabled()
+ * <bails>
* set_cpus_allowed_ptr(P0, [0, 1]);
* <signal completion>
* <awakes>
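
For readers unfamiliar with the bail-out in the timeline above: migration is
inhibited while P0 sits inside its migrate_disable() region, so the stopper
gives up and the affinity change is applied later, once migration is enabled
again. A tiny sketch of P0's side (work_on_this_cpu() is a made-up
placeholder):

---
migrate_disable();	/* P0 may be preempted, but not migrated */
/*
 * A migration_cpu_stop() issued here observes is_migration_disabled()
 * and bails; the pending affinity change is picked up later.
 */
work_on_this_cpu();
migrate_enable();	/* migration can proceed again */
---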

2021-02-24 18:01:07

by Valentin Schneider

Subject: Re: [PATCH 6/6] sched: Simplify set_affinity_pending refcounts

On 24/02/21 16:34, Peter Zijlstra wrote:
> Elsewhere Valentin argued something like the below ought to be possible.
> I've not drawn diagrams yet, but if I understood his argument right it
> should be possible.
>
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 1c56ac4df2c9..3ffbd1b76f3e 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2204,9 +2204,10 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
> * then complete now.
> */
> pending = p->migration_pending;
> - if (pending && !pending->stop_pending) {
> + if (pending) {
> p->migration_pending = NULL;
> - complete = true;
> + if (!pending->stop_pending)
> + complete = true;
> }
>
> task_rq_unlock(rq, p, rf);
> @@ -2286,10 +2287,9 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
> if (task_on_rq_queued(p))
> rq = move_queued_task(rq, rf, p, dest_cpu);
>
> - if (!pending->stop_pending) {
> - p->migration_pending = NULL;
> + p->migration_pending = NULL;
> + if (!pending->stop_pending)
> complete = true;
> - }
> }
> task_rq_unlock(rq, p, rf);
>

I was thinking of the "other way around"; i.e. modify migration_cpu_stop()
into:

---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9492f8eb242a..9546f0263970 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1926,6 +1926,11 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);

+ /*
+ * If we were passed a pending, then ->stop_pending was set, thus
+ * p->migration_pending must have remained stable.
+ */
+ WARN_ON_ONCE(pending && pending != p->migration_pending);
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1936,8 +1941,7 @@ static int migration_cpu_stop(void *data)
goto out;

if (pending) {
- if (p->migration_pending == pending)
- p->migration_pending = NULL;
+ p->migration_pending = NULL;
complete = true;
}

@@ -1976,8 +1980,7 @@ static int migration_cpu_stop(void *data)
* somewhere allowed, we're done.
*/
if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
- if (p->migration_pending == pending)
- p->migration_pending = NULL;
+ p->migration_pending = NULL;
complete = true;
goto out;
}
---

Your change reinstates the "triple SCA" pattern, where a stopper can run
with arg->pending && arg->pending != p->migration_pending, which I was
kinda happy to see go away...
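
For reference, "triple SCA" here means three overlapping
set_cpus_allowed_ptr() calls. With the change above it could look roughly
like this (a sketch, not a verified trace):

---
P1: set_cpus_allowed_ptr(P0, [1])    /* installs pending, kicks stopper */
P2: set_cpus_allowed_ptr(P0, [0,1])  /* uninstalls pending, stopper still
                                        in flight */
P3: set_cpus_allowed_ptr(P0, [1])    /* installs a new pending */
    <stopper from P1 finally runs:
     arg->pending != p->migration_pending>
---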

2021-02-25 10:00:28

by Peter Zijlstra

Subject: Re: [PATCH 6/6] sched: Simplify set_affinity_pending refcounts

On Wed, Feb 24, 2021 at 05:59:01PM +0000, Valentin Schneider wrote:

> Your change reinstates the "triple SCA" pattern, where a stopper can run
> with arg->pending && arg->pending != p->migration_pending, which I was
> kinda happy to see go away...

Right, fair enough. Any workload that can tell the difference is doing
it wrong anyway :-)

OK, I've munged your two patches together into the below.

---
Subject: sched: Simplify migration_cpu_stop()
From: Valentin Schneider <[email protected]>
Date: Thu Feb 25 10:22:30 CET 2021

When ->stop_pending is set, only the stopper can uninstall
p->migration_pending. This simplifies a few ifs, because:

(pending != NULL) => (pending == p->migration_pending)

Also, the fatty comment above affine_move_task() probably needs a bit
of gardening.

Signed-off-by: Valentin Schneider <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 27 ++++++++++++++++++---------
1 file changed, 18 insertions(+), 9 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1927,6 +1927,12 @@ static int migration_cpu_stop(void *data
rq_lock(rq, &rf);

/*
+ * If we were passed a pending, then ->stop_pending was set, thus
+ * p->migration_pending must have remained stable.
+ */
+ WARN_ON_ONCE(pending && pending != p->migration_pending);
+
+ /*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
@@ -1936,8 +1942,7 @@ static int migration_cpu_stop(void *data
goto out;

if (pending) {
- if (p->migration_pending == pending)
- p->migration_pending = NULL;
+ p->migration_pending = NULL;
complete = true;
}

@@ -1976,8 +1981,7 @@ static int migration_cpu_stop(void *data
* somewhere allowed, we're done.
*/
if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
- if (p->migration_pending == pending)
- p->migration_pending = NULL;
+ p->migration_pending = NULL;
complete = true;
goto out;
}
@@ -2165,16 +2169,21 @@ void do_set_cpus_allowed(struct task_str
*
* (1) In the cases covered above. There is one more where the completion is
* signaled within affine_move_task() itself: when a subsequent affinity request
- * cancels the need for an active migration. Consider:
+ * occurs after the stopper bailed out due to the targeted task still being
+ * Migrate-Disable. Consider:
*
* Initial conditions: P0->cpus_mask = [0, 1]
*
- * P0@CPU0 P1 P2
- *
- * migrate_disable();
- * <preempted>
+ * CPU0 P1 P2
+ * <P0>
+ * migrate_disable();
+ * <preempted>
* set_cpus_allowed_ptr(P0, [1]);
* <blocks>
+ * <migration/0>
+ * migration_cpu_stop()
+ * is_migration_disabled()
+ * <bails>
* set_cpus_allowed_ptr(P0, [0, 1]);
* <signal completion>
* <awakes>

2021-02-25 11:46:39

by Valentin Schneider

Subject: Re: [PATCH 6/6] sched: Simplify set_affinity_pending refcounts

On 25/02/21 10:27, Peter Zijlstra wrote:
> On Wed, Feb 24, 2021 at 05:59:01PM +0000, Valentin Schneider wrote:
>
>> Your change reinstates the "triple SCA" pattern, where a stopper can run
>> with arg->pending && arg->pending != p->migration_pending, which I was
>> kinda happy to see go away...
>
> Right, fair enough. Any workload that can tell the difference is doing
> it wrong anyway :-)
>
> OK, I've munged your two patches together into the below.
>

Thanks!

I haven't found much else to say on the series after having slept on it, so
feel free to add:

Reviewed-by: Valentin Schneider <[email protected]>

to the rest. I'll go see about testing it in some way.

2021-03-01 10:19:49

by tip-bot2 for Peter Zijlstra

Subject: [tip: sched/urgent] sched: Simplify set_affinity_pending refcounts

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID: a4c2579076dc6951709a8e425df8369ab6eb2f24
Gitweb: https://git.kernel.org/tip/a4c2579076dc6951709a8e425df8369ab6eb2f24
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 24 Feb 2021 11:42:08 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Mon, 01 Mar 2021 11:02:15 +01:00

sched: Simplify set_affinity_pending refcounts

Now that we have set_affinity_pending::stop_pending to indicate if a
stopper is in progress, and we have the guarantee that if that stopper
exists, it will (eventually) complete our @pending, we can simplify the
refcount scheme by no longer counting the stopper thread.

Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
Cc: [email protected]
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/core.c | 32 ++++++++++++++++++++------------
1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4e4d100..9819121 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1862,6 +1862,10 @@ struct migration_arg {
struct set_affinity_pending *pending;
};

+/*
+ * @refs: number of wait_for_completion()
+ * @stop_pending: is @stop_work in use
+ */
struct set_affinity_pending {
refcount_t refs;
unsigned int stop_pending;
@@ -1997,10 +2001,6 @@ out:
if (complete)
complete_all(&pending->done);

- /* For pending->{arg,stop_work} */
- if (pending && refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
-
return 0;
}

@@ -2199,12 +2199,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
push_task = get_task_struct(p);
}

+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
pending = p->migration_pending;
- if (pending) {
- refcount_inc(&pending->refs);
+ if (pending && !pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
+
task_rq_unlock(rq, p, rf);

if (push_task) {
@@ -2213,7 +2217,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}

if (complete)
- goto do_complete;
+ complete_all(&pending->done);

return 0;
}
@@ -2264,9 +2268,9 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
if (!stop_pending)
pending->stop_pending = true;

- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
if (flags & SCA_MIGRATE_ENABLE)
p->migration_flags &= ~MDF_PUSH;
+
task_rq_unlock(rq, p, rf);

if (!stop_pending) {
@@ -2282,12 +2286,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);

- p->migration_pending = NULL;
- complete = true;
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
}
task_rq_unlock(rq, p, rf);

-do_complete:
if (complete)
complete_all(&pending->done);
}
@@ -2295,7 +2300,7 @@ do_complete:
wait_for_completion(&pending->done);

if (refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
+ wake_up_var(&pending->refs); /* No UaF, just an address */

/*
* Block the original owner of &pending until all subsequent callers
@@ -2303,6 +2308,9 @@ do_complete:
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));

+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
return 0;
}

2021-03-06 11:47:54

by tip-bot2 for Peter Zijlstra

Subject: [tip: sched/core] sched: Simplify set_affinity_pending refcounts

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 50caf9c14b1498c90cf808dbba2ca29bd32ccba4
Gitweb: https://git.kernel.org/tip/50caf9c14b1498c90cf808dbba2ca29bd32ccba4
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 24 Feb 2021 11:42:08 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Sat, 06 Mar 2021 12:40:21 +01:00

sched: Simplify set_affinity_pending refcounts

Now that we have set_affinity_pending::stop_pending to indicate if a
stopper is in progress, and we have the guarantee that if that stopper
exists, it will (eventually) complete our @pending, we can simplify the
refcount scheme by no longer counting the stopper thread.

Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()")
Cc: [email protected]
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/core.c | 32 ++++++++++++++++++++------------
1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4e4d100..9819121 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1862,6 +1862,10 @@ struct migration_arg {
struct set_affinity_pending *pending;
};

+/*
+ * @refs: number of wait_for_completion()
+ * @stop_pending: is @stop_work in use
+ */
struct set_affinity_pending {
refcount_t refs;
unsigned int stop_pending;
@@ -1997,10 +2001,6 @@ out:
if (complete)
complete_all(&pending->done);

- /* For pending->{arg,stop_work} */
- if (pending && refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
-
return 0;
}

@@ -2199,12 +2199,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
push_task = get_task_struct(p);
}

+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
pending = p->migration_pending;
- if (pending) {
- refcount_inc(&pending->refs);
+ if (pending && !pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
+
task_rq_unlock(rq, p, rf);

if (push_task) {
@@ -2213,7 +2217,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}

if (complete)
- goto do_complete;
+ complete_all(&pending->done);

return 0;
}
@@ -2264,9 +2268,9 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
if (!stop_pending)
pending->stop_pending = true;

- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
if (flags & SCA_MIGRATE_ENABLE)
p->migration_flags &= ~MDF_PUSH;
+
task_rq_unlock(rq, p, rf);

if (!stop_pending) {
@@ -2282,12 +2286,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);

- p->migration_pending = NULL;
- complete = true;
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
}
task_rq_unlock(rq, p, rf);

-do_complete:
if (complete)
complete_all(&pending->done);
}
@@ -2295,7 +2300,7 @@ do_complete:
wait_for_completion(&pending->done);

if (refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
+ wake_up_var(&pending->refs); /* No UaF, just an address */

/*
* Block the original owner of &pending until all subsequent callers
@@ -2303,6 +2308,9 @@ do_complete:
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));

+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
return 0;
}