2021-03-10 15:06:37

by Peter Zijlstra

[permalink] [raw]
Subject: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

Use the new cpu_dying() state to simplify and fix the balance_push()
vs CPU hotplug rollback state.

Specifically, we currently rely on notifiers sched_cpu_dying() /
sched_cpu_activate() to terminate balance_push, however if the
cpu_down() fails when we're past sched_cpu_deactivate(), it should
terminate balance_push at that point and not wait until we hit
sched_cpu_activate().

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 33 ++++++++++++++++++++-------------
kernel/sched/sched.h | 1 -
2 files changed, 20 insertions(+), 14 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1811,7 +1811,7 @@ static inline bool is_cpu_allowed(struct
return cpu_online(cpu);

/* Regular kernel threads don't get to stay during offline. */
- if (cpu_rq(cpu)->balance_push)
+ if (cpu_dying(cpu))
return false;

/* But are allowed during online. */
@@ -7647,12 +7647,19 @@ static void balance_push(struct rq *rq)

lockdep_assert_held(&rq->lock);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
/*
* Ensure the thing is persistent until balance_push_set(.on = false);
*/
rq->balance_callback = &balance_push_callback;

/*
+ * Only active while going offline.
+ */
+ if (!cpu_dying(rq->cpu))
+ return;
+
+ /*
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
*
@@ -7705,7 +7712,6 @@ static void balance_push_set(int cpu, bo
struct rq_flags rf;

rq_lock_irqsave(rq, &rf);
- rq->balance_push = on;
if (on) {
WARN_ON_ONCE(rq->balance_callback);
rq->balance_callback = &balance_push_callback;
@@ -7830,8 +7836,8 @@ int sched_cpu_activate(unsigned int cpu)
struct rq_flags rf;

/*
- * Make sure that when the hotplug state machine does a roll-back
- * we clear balance_push. Ideally that would happen earlier...
+ * Clear the balance_push callback and prepare to schedule
+ * regular tasks.
*/
balance_push_set(cpu, false);

@@ -7883,14 +7889,6 @@ int sched_cpu_deactivate(unsigned int cp
set_cpu_active(cpu, false);

/*
- * From this point forward, this CPU will refuse to run any task that
- * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
- * push those tasks away until this gets cleared, see
- * sched_cpu_dying().
- */
- balance_push_set(cpu, true);
-
- /*
* We've cleared cpu_active_mask / set balance_push, wait for all
* preempt-disabled and RCU users of this state to go away such that
* all new such users will observe it.
@@ -7910,6 +7908,14 @@ int sched_cpu_deactivate(unsigned int cp
}
rq_unlock_irqrestore(rq, &rf);

+ /*
+ * From this point forward, this CPU will refuse to run any task that
+ * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
+ * push those tasks away until this gets cleared, see
+ * sched_cpu_dying().
+ */
+ balance_push_set(cpu, true);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
@@ -8206,7 +8212,7 @@ void __init sched_init(void)
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->balance_callback = NULL;
+ rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -8253,6 +8259,7 @@ void __init sched_init(void)

#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+ balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -982,7 +982,6 @@ struct rq {
unsigned long cpu_capacity_orig;

struct callback_head *balance_callback;
- unsigned char balance_push;

unsigned char nohz_idle_balance;
unsigned char idle_balance;



2021-03-11 15:14:58

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

Peter Zijlstra <[email protected]> writes:
> @@ -7883,14 +7889,6 @@ int sched_cpu_deactivate(unsigned int cp
> set_cpu_active(cpu, false);
>
> /*
> - * From this point forward, this CPU will refuse to run any task that
> - * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
> - * push those tasks away until this gets cleared, see
> - * sched_cpu_dying().
> - */
> - balance_push_set(cpu, true);
> -
> - /*
> * We've cleared cpu_active_mask / set balance_push, wait for all
> * preempt-disabled and RCU users of this state to go away such that
> * all new such users will observe it.
> @@ -7910,6 +7908,14 @@ int sched_cpu_deactivate(unsigned int cp
> }
> rq_unlock_irqrestore(rq, &rf);
>
> + /*
> + * From this point forward, this CPU will refuse to run any task that
> + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
> + * push those tasks away until this gets cleared, see
> + * sched_cpu_dying().
> + */
> + balance_push_set(cpu, true);
> +

AIUI with cpu_dying_mask being flipped before even entering
sched_cpu_deactivate(), we don't need this to be before the
synchronize_rcu() anymore; is there more than that to why you're punting it
back this side of it?

> #ifdef CONFIG_SCHED_SMT
> /*
> * When going down, decrement the number of cores with SMT present.

> @@ -8206,7 +8212,7 @@ void __init sched_init(void)
> rq->sd = NULL;
> rq->rd = NULL;
> rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
> - rq->balance_callback = NULL;
> + rq->balance_callback = &balance_push_callback;
> rq->active_balance = 0;
> rq->next_balance = jiffies;
> rq->push_cpu = 0;
> @@ -8253,6 +8259,7 @@ void __init sched_init(void)
>
> #ifdef CONFIG_SMP
> idle_thread_set_boot_cpu();
> + balance_push_set(smp_processor_id(), false);
> #endif
> init_sched_fair_class();
>

I don't get what these two changes do - the end result is the same as
before, no?


Also, AIUI this patch covers the cpu_dying -> !cpu_dying rollback case
since balance_push gets numbed down by !cpu_dying. What about the other way
around (hot-plug failure + rollback)? We may have allowed !pcpu tasks on the
now-dying CPU, and we'd need to re-install the balance_push callback.

I'm starting to think we'd need to have

rq->balance_callback = &balance_push_callback

for any CPU with hotplug state < CPUHP_AP_ACTIVE. Thus we would
need:

balance_push_set(cpu, true) in sched_init() and sched_cpu_deactivate()
balance_push_set(cpu, false) in sched_cpu_activate()

and the rest would be driven by the cpu_dying_mask.

2021-03-11 16:47:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Thu, Mar 11, 2021 at 03:13:04PM +0000, Valentin Schneider wrote:

> > #ifdef CONFIG_SCHED_SMT
> > /*
> > * When going down, decrement the number of cores with SMT present.
>
> > @@ -8206,7 +8212,7 @@ void __init sched_init(void)
> > rq->sd = NULL;
> > rq->rd = NULL;
> > rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
> > - rq->balance_callback = NULL;
> > + rq->balance_callback = &balance_push_callback;
> > rq->active_balance = 0;
> > rq->next_balance = jiffies;
> > rq->push_cpu = 0;
> > @@ -8253,6 +8259,7 @@ void __init sched_init(void)
> >
> > #ifdef CONFIG_SMP
> > idle_thread_set_boot_cpu();
> > + balance_push_set(smp_processor_id(), false);
> > #endif
> > init_sched_fair_class();
> >
>
> I don't get what these two changes do - the end result is the same as
> before, no?

IIRC the idea was to initialize the offline CPUs to the same state as if
they'd been offlined.

2021-04-12 12:05:30

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Thu, Mar 11, 2021 at 03:13:04PM +0000, Valentin Schneider wrote:
> Peter Zijlstra <[email protected]> writes:
> > @@ -7883,14 +7889,6 @@ int sched_cpu_deactivate(unsigned int cp
> > set_cpu_active(cpu, false);
> >
> > /*
> > - * From this point forward, this CPU will refuse to run any task that
> > - * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
> > - * push those tasks away until this gets cleared, see
> > - * sched_cpu_dying().
> > - */
> > - balance_push_set(cpu, true);
> > -
> > - /*
> > * We've cleared cpu_active_mask / set balance_push, wait for all
> > * preempt-disabled and RCU users of this state to go away such that
> > * all new such users will observe it.
> > @@ -7910,6 +7908,14 @@ int sched_cpu_deactivate(unsigned int cp
> > }
> > rq_unlock_irqrestore(rq, &rf);
> >
> > + /*
> > + * From this point forward, this CPU will refuse to run any task that
> > + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
> > + * push those tasks away until this gets cleared, see
> > + * sched_cpu_dying().
> > + */
> > + balance_push_set(cpu, true);
> > +
>
> AIUI with cpu_dying_mask being flipped before even entering
> sched_cpu_deactivate(), we don't need this to be before the
> synchronize_rcu() anymore; is there more than that to why you're punting it
> back this side of it?

I think it does does need to be like this, we need to clearly separate
the active=true and balance_push_set(). If we were to somehow observe
both balance_push_set() and active==false, we'd be in trouble.

> > #ifdef CONFIG_SCHED_SMT
> > /*
> > * When going down, decrement the number of cores with SMT present.
>
> > @@ -8206,7 +8212,7 @@ void __init sched_init(void)
> > rq->sd = NULL;
> > rq->rd = NULL;
> > rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
> > - rq->balance_callback = NULL;
> > + rq->balance_callback = &balance_push_callback;
> > rq->active_balance = 0;
> > rq->next_balance = jiffies;
> > rq->push_cpu = 0;
> > @@ -8253,6 +8259,7 @@ void __init sched_init(void)
> >
> > #ifdef CONFIG_SMP
> > idle_thread_set_boot_cpu();
> > + balance_push_set(smp_processor_id(), false);
> > #endif
> > init_sched_fair_class();
> >
>
> I don't get what these two changes do - the end result is the same as
> before, no?

Not quite; we have to make sure the state of the offline CPUs matches
that of a CPU that's been offlined. For consistency if nothing else, but
it's actually important for a point below.

> Also, AIUI this patch covers the cpu_dying -> !cpu_dying rollback case
> since balance_push gets numbed down by !cpu_dying. What about the other way
> around (hot-plug failure + rollback)? We may have allowed !pcpu tasks on the
> now-dying CPU, and we'd need to re-install the balance_push callback.

This is in fact handled. Note how the previous point initialized the
offline CPU to have the push_callback installed.

Also note how balance_push() re-instates itself unconditionally.

So the thing is, we install the push callback on deactivate() and leave
it in place until activate, it is always there, regardless what way
we're moving.

However, it is only effective whild going down, see the early exit.

2021-04-12 17:24:31

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On 12/04/21 14:03, Peter Zijlstra wrote:
> On Thu, Mar 11, 2021 at 03:13:04PM +0000, Valentin Schneider wrote:
>> Peter Zijlstra <[email protected]> writes:
>> > @@ -7910,6 +7908,14 @@ int sched_cpu_deactivate(unsigned int cp
>> > }
>> > rq_unlock_irqrestore(rq, &rf);
>> >
>> > + /*
>> > + * From this point forward, this CPU will refuse to run any task that
>> > + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
>> > + * push those tasks away until this gets cleared, see
>> > + * sched_cpu_dying().
>> > + */
>> > + balance_push_set(cpu, true);
>> > +
>>
>> AIUI with cpu_dying_mask being flipped before even entering
>> sched_cpu_deactivate(), we don't need this to be before the
>> synchronize_rcu() anymore; is there more than that to why you're punting it
>> back this side of it?
>
> I think it does does need to be like this, we need to clearly separate
> the active=true and balance_push_set(). If we were to somehow observe
> both balance_push_set() and active==false, we'd be in trouble.
>

I'm afraid I don't follow; we're replacing a read of rq->balance_push with
cpu_dying(), and those are still written on the same side of the
synchronize_rcu(). What am I missing?

>> > #ifdef CONFIG_SCHED_SMT
>> > /*
>> > * When going down, decrement the number of cores with SMT present.
>>
>> > @@ -8206,7 +8212,7 @@ void __init sched_init(void)
>> > rq->sd = NULL;
>> > rq->rd = NULL;
>> > rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
>> > - rq->balance_callback = NULL;
>> > + rq->balance_callback = &balance_push_callback;
>> > rq->active_balance = 0;
>> > rq->next_balance = jiffies;
>> > rq->push_cpu = 0;
>> > @@ -8253,6 +8259,7 @@ void __init sched_init(void)
>> >
>> > #ifdef CONFIG_SMP
>> > idle_thread_set_boot_cpu();
>> > + balance_push_set(smp_processor_id(), false);
>> > #endif
>> > init_sched_fair_class();
>> >
>>
>> I don't get what these two changes do - the end result is the same as
>> before, no?
>
> Not quite; we have to make sure the state of the offline CPUs matches
> that of a CPU that's been offlined. For consistency if nothing else, but
> it's actually important for a point below.
>
>> Also, AIUI this patch covers the cpu_dying -> !cpu_dying rollback case
>> since balance_push gets numbed down by !cpu_dying. What about the other way
>> around (hot-plug failure + rollback)? We may have allowed !pcpu tasks on the
>> now-dying CPU, and we'd need to re-install the balance_push callback.
>
> This is in fact handled. Note how the previous point initialized the
> offline CPU to have the push_callback installed.
>
> Also note how balance_push() re-instates itself unconditionally.
>
> So the thing is, we install the push callback on deactivate() and leave
> it in place until activate, it is always there, regardless what way
> we're moving.
>
> However, it is only effective whild going down, see the early exit.


Oooh, I can't read, only the boot CPU gets its callback uninstalled in
sched_init()! So secondaries keep push_callback installed up until
sched_cpu_activate(), but as you said it's not effective unless a rollback
happens.

Now, doesn't that mean we should *not* uninstall the callback in
sched_cpu_dying()? AFAIK it's possible for the initial secondary CPU
boot to go fine, but the next offline+online cycle fails while going up -
that would need to rollback with push_callback installed.

2021-04-13 12:45:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Mon, Apr 12, 2021 at 06:22:42PM +0100, Valentin Schneider wrote:
> On 12/04/21 14:03, Peter Zijlstra wrote:
> > On Thu, Mar 11, 2021 at 03:13:04PM +0000, Valentin Schneider wrote:
> >> Peter Zijlstra <[email protected]> writes:
> >> > @@ -7910,6 +7908,14 @@ int sched_cpu_deactivate(unsigned int cp
> >> > }
> >> > rq_unlock_irqrestore(rq, &rf);
> >> >
> >> > + /*
> >> > + * From this point forward, this CPU will refuse to run any task that
> >> > + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
> >> > + * push those tasks away until this gets cleared, see
> >> > + * sched_cpu_dying().
> >> > + */
> >> > + balance_push_set(cpu, true);
> >> > +
> >>
> >> AIUI with cpu_dying_mask being flipped before even entering
> >> sched_cpu_deactivate(), we don't need this to be before the
> >> synchronize_rcu() anymore; is there more than that to why you're punting it
> >> back this side of it?
> >
> > I think it does does need to be like this, we need to clearly separate
> > the active=true and balance_push_set(). If we were to somehow observe
> > both balance_push_set() and active==false, we'd be in trouble.
> >
>
> I'm afraid I don't follow; we're replacing a read of rq->balance_push with
> cpu_dying(), and those are still written on the same side of the
> synchronize_rcu(). What am I missing?

Yeah, I'm not sure anymnore either; I tried to work out why I'd done
that but upon closer examination everything fell flat.

Let me try again today :-)

> Oooh, I can't read, only the boot CPU gets its callback uninstalled in
> sched_init()! So secondaries keep push_callback installed up until
> sched_cpu_activate(), but as you said it's not effective unless a rollback
> happens.
>
> Now, doesn't that mean we should *not* uninstall the callback in
> sched_cpu_dying()? AFAIK it's possible for the initial secondary CPU
> boot to go fine, but the next offline+online cycle fails while going up -
> that would need to rollback with push_callback installed.

Quite; I removed that shortly after sending this; when I tried to write
a comment and found it.

2021-04-15 09:01:03

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Tue, Apr 13, 2021 at 08:51:23AM +0200, Peter Zijlstra wrote:

> > I'm afraid I don't follow; we're replacing a read of rq->balance_push with
> > cpu_dying(), and those are still written on the same side of the
> > synchronize_rcu(). What am I missing?
>
> Yeah, I'm not sure anymnore either; I tried to work out why I'd done
> that but upon closer examination everything fell flat.
>
> Let me try again today :-)

Can't make sense of what I did.. I've removed that hunk. Patch now looks
like this.

---
Subject: sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
From: Peter Zijlstra <[email protected]>
Date: Thu Jan 21 16:09:32 CET 2021

Use the new cpu_dying() state to simplify and fix the balance_push()
vs CPU hotplug rollback state.

Specifically, we currently rely on notifiers sched_cpu_dying() /
sched_cpu_activate() to terminate balance_push, however if the
cpu_down() fails when we're past sched_cpu_deactivate(), it should
terminate balance_push at that point and not wait until we hit
sched_cpu_activate().

Similarly, when cpu_up() fails and we're going back down, balance_push
should be active, where it currently is not.

So instead, make sure balance_push is enabled between
sched_cpu_deactivate() and sched_cpu_activate() (eg. when
!cpu_active()), and gate it's utility with cpu_dying().

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 26 +++++++++++++++-----------
kernel/sched/sched.h | 1 -
2 files changed, 15 insertions(+), 12 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1811,7 +1811,7 @@ static inline bool is_cpu_allowed(struct
return cpu_online(cpu);

/* Regular kernel threads don't get to stay during offline. */
- if (cpu_rq(cpu)->balance_push)
+ if (cpu_dying(cpu))
return false;

/* But are allowed during online. */
@@ -7639,6 +7639,9 @@ static DEFINE_PER_CPU(struct cpu_stop_wo

/*
* Ensure we only run per-cpu kthreads once the CPU goes !active.
+ *
+ * This is active/set between sched_cpu_deactivate() / sched_cpu_activate().
+ * But only effective when the hotplug motion is down.
*/
static void balance_push(struct rq *rq)
{
@@ -7646,12 +7649,19 @@ static void balance_push(struct rq *rq)

lockdep_assert_held(&rq->lock);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
/*
* Ensure the thing is persistent until balance_push_set(.on = false);
*/
rq->balance_callback = &balance_push_callback;

/*
+ * Only active while going offline.
+ */
+ if (!cpu_dying(rq->cpu))
+ return;
+
+ /*
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
*
@@ -7704,7 +7714,6 @@ static void balance_push_set(int cpu, bo
struct rq_flags rf;

rq_lock_irqsave(rq, &rf);
- rq->balance_push = on;
if (on) {
WARN_ON_ONCE(rq->balance_callback);
rq->balance_callback = &balance_push_callback;
@@ -7829,8 +7838,8 @@ int sched_cpu_activate(unsigned int cpu)
struct rq_flags rf;

/*
- * Make sure that when the hotplug state machine does a roll-back
- * we clear balance_push. Ideally that would happen earlier...
+ * Clear the balance_push callback and prepare to schedule
+ * regular tasks.
*/
balance_push_set(cpu, false);

@@ -8015,12 +8024,6 @@ int sched_cpu_dying(unsigned int cpu)
}
rq_unlock_irqrestore(rq, &rf);

- /*
- * Now that the CPU is offline, make sure we're welcome
- * to new tasks once we come back up.
- */
- balance_push_set(cpu, false);
-
calc_load_migrate(rq);
update_max_interval();
hrtick_clear(rq);
@@ -8205,7 +8208,7 @@ void __init sched_init(void)
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->balance_callback = NULL;
+ rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -8252,6 +8255,7 @@ void __init sched_init(void)

#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+ balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();

--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -983,7 +983,6 @@ struct rq {
unsigned long cpu_capacity_orig;

struct callback_head *balance_callback;
- unsigned char balance_push;

unsigned char nohz_idle_balance;
unsigned char idle_balance;

2021-04-15 14:33:12

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On 15/04/21 10:59, Peter Zijlstra wrote:
> Can't make sense of what I did.. I've removed that hunk. Patch now looks
> like this.
>

Small nit below, but regardless feel free to apply to the whole lot:
Reviewed-by: Valentin Schneider <[email protected]>

@VincentD, ISTR you had tested the initial version of this with your fancy
shmancy hotplug rollback stresser. Feel like doing this

> So instead, make sure balance_push is enabled between
> sched_cpu_deactivate() and sched_cpu_activate() (eg. when
> !cpu_active()), and gate it's utility with cpu_dying().

I'd word that "is enabled below sched_cpu_activate()", since
sched_cpu_deactivate() is now out of the picture.

[...]
> @@ -7639,6 +7639,9 @@ static DEFINE_PER_CPU(struct cpu_stop_wo
>
> /*
> * Ensure we only run per-cpu kthreads once the CPU goes !active.
> + *
> + * This is active/set between sched_cpu_deactivate() / sched_cpu_activate().

Ditto

> + * But only effective when the hotplug motion is down.
> */
> static void balance_push(struct rq *rq)
> {

2021-04-15 15:33:24

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Thu, Apr 15, 2021 at 03:32:11PM +0100, Valentin Schneider wrote:
> > So instead, make sure balance_push is enabled between
> > sched_cpu_deactivate() and sched_cpu_activate() (eg. when
> > !cpu_active()), and gate it's utility with cpu_dying().
>
> I'd word that "is enabled below sched_cpu_activate()", since
> sched_cpu_deactivate() is now out of the picture.

I are confused (again!). Did you mean to say something like: 'is enabled
below SCHED_AP_ACTIVE' ?

2021-04-15 15:35:54

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On 15/04/21 17:29, Peter Zijlstra wrote:
> On Thu, Apr 15, 2021 at 03:32:11PM +0100, Valentin Schneider wrote:
>> I'd word that "is enabled below sched_cpu_activate()", since
>> sched_cpu_deactivate() is now out of the picture.
>
> I are confused (again!). Did you mean to say something like: 'is enabled
> below SCHED_AP_ACTIVE' ?

Something like this yes; wording is hard.

Subject: [tip: sched/core] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

The following commit has been merged into the sched/core branch of tip:

Commit-ID: b5c4477366fb5e6a2f0f38742c33acd666c07698
Gitweb: https://git.kernel.org/tip/b5c4477366fb5e6a2f0f38742c33acd666c07698
Author: Peter Zijlstra <[email protected]>
AuthorDate: Thu, 21 Jan 2021 16:09:32 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Fri, 16 Apr 2021 17:06:32 +02:00

sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

Use the new cpu_dying() state to simplify and fix the balance_push()
vs CPU hotplug rollback state.

Specifically, we currently rely on notifiers sched_cpu_dying() /
sched_cpu_activate() to terminate balance_push, however if the
cpu_down() fails when we're past sched_cpu_deactivate(), it should
terminate balance_push at that point and not wait until we hit
sched_cpu_activate().

Similarly, when cpu_up() fails and we're going back down, balance_push
should be active, where it currently is not.

So instead, make sure balance_push is enabled below SCHED_AP_ACTIVE
(when !cpu_active()), and gate it's utility with cpu_dying().

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/core.c | 26 +++++++++++++++-----------
kernel/sched/sched.h | 1 -
2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 95bd6ab..7d031da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1811,7 +1811,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
return cpu_online(cpu);

/* Regular kernel threads don't get to stay during offline. */
- if (cpu_rq(cpu)->balance_push)
+ if (cpu_dying(cpu))
return false;

/* But are allowed during online. */
@@ -7638,6 +7638,9 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work);

/*
* Ensure we only run per-cpu kthreads once the CPU goes !active.
+ *
+ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
+ * effective when the hotplug motion is down.
*/
static void balance_push(struct rq *rq)
{
@@ -7645,12 +7648,19 @@ static void balance_push(struct rq *rq)

lockdep_assert_held(&rq->lock);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
/*
* Ensure the thing is persistent until balance_push_set(.on = false);
*/
rq->balance_callback = &balance_push_callback;

/*
+ * Only active while going offline.
+ */
+ if (!cpu_dying(rq->cpu))
+ return;
+
+ /*
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
*
@@ -7703,7 +7713,6 @@ static void balance_push_set(int cpu, bool on)
struct rq_flags rf;

rq_lock_irqsave(rq, &rf);
- rq->balance_push = on;
if (on) {
WARN_ON_ONCE(rq->balance_callback);
rq->balance_callback = &balance_push_callback;
@@ -7828,8 +7837,8 @@ int sched_cpu_activate(unsigned int cpu)
struct rq_flags rf;

/*
- * Make sure that when the hotplug state machine does a roll-back
- * we clear balance_push. Ideally that would happen earlier...
+ * Clear the balance_push callback and prepare to schedule
+ * regular tasks.
*/
balance_push_set(cpu, false);

@@ -8014,12 +8023,6 @@ int sched_cpu_dying(unsigned int cpu)
}
rq_unlock_irqrestore(rq, &rf);

- /*
- * Now that the CPU is offline, make sure we're welcome
- * to new tasks once we come back up.
- */
- balance_push_set(cpu, false);
-
calc_load_migrate(rq);
update_max_interval();
hrtick_clear(rq);
@@ -8204,7 +8207,7 @@ void __init sched_init(void)
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->balance_callback = NULL;
+ rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -8251,6 +8254,7 @@ void __init sched_init(void)

#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+ balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cbb0b01..7e7e936 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -983,7 +983,6 @@ struct rq {
unsigned long cpu_capacity_orig;

struct callback_head *balance_callback;
- unsigned char balance_push;

unsigned char nohz_idle_balance;
unsigned char idle_balance;

2021-04-19 13:02:18

by Vincent Donnefort

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Thu, Apr 15, 2021 at 03:32:11PM +0100, Valentin Schneider wrote:
> On 15/04/21 10:59, Peter Zijlstra wrote:
> > Can't make sense of what I did.. I've removed that hunk. Patch now looks
> > like this.
> >
>
> Small nit below, but regardless feel free to apply to the whole lot:
> Reviewed-by: Valentin Schneider <[email protected]>
>
> @VincentD, ISTR you had tested the initial version of this with your fancy
> shmancy hotplug rollback stresser. Feel like doing this

I indeed wrote a test to verify all the rollback cases, up and down.

It seems I encounter an intermitent issue while running several iterations of
that test ... but I need more time to debug and figure-out where it is blocking.

>
> > So instead, make sure balance_push is enabled between
> > sched_cpu_deactivate() and sched_cpu_activate() (eg. when
> > !cpu_active()), and gate it's utility with cpu_dying().
>
> I'd word that "is enabled below sched_cpu_activate()", since
> sched_cpu_deactivate() is now out of the picture.
>
> [...]
> > @@ -7639,6 +7639,9 @@ static DEFINE_PER_CPU(struct cpu_stop_wo
> >
> > /*
> > * Ensure we only run per-cpu kthreads once the CPU goes !active.
> > + *
> > + * This is active/set between sched_cpu_deactivate() / sched_cpu_activate().
>
> Ditto
>
> > + * But only effective when the hotplug motion is down.
> > */
> > static void balance_push(struct rq *rq)
> > {

2021-04-20 09:47:38

by Vincent Donnefort

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Mon, Apr 19, 2021 at 11:56:30AM +0100, Vincent Donnefort wrote:
> On Thu, Apr 15, 2021 at 03:32:11PM +0100, Valentin Schneider wrote:
> > On 15/04/21 10:59, Peter Zijlstra wrote:
> > > Can't make sense of what I did.. I've removed that hunk. Patch now looks
> > > like this.
> > >
> >
> > Small nit below, but regardless feel free to apply to the whole lot:
> > Reviewed-by: Valentin Schneider <[email protected]>
> >
> > @VincentD, ISTR you had tested the initial version of this with your fancy
> > shmancy hotplug rollback stresser. Feel like doing this
>
> I indeed wrote a test to verify all the rollback cases, up and down.
>
> It seems I encounter an intermitent issue while running several iterations of
> that test ... but I need more time to debug and figure-out where it is blocking.

Found the issue:

$ cat hotplug/states:
219: sched:active
220: online

CPU0:

$ echo 219 > hotplug/fail
$ echo 0 > online

=> cpu_active = 1 cpu_dying = 1

which means that later on, for another CPU hotunplug, in
__balance_push_cpu_stop(), the fallback rq for a kthread can select that
CPU0, but __migrate_task() would fail and we end-up in an infinite loop,
trying to migrate that task to CPU0.

The problem is that for a failure in sched:active, as "online" has no callback,
there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit would
not be reset.

Maybe cpuhp_reset_state() and cpuhp_set_state() would then be a better place to
switch the dying bit?

>
> >
> > > So instead, make sure balance_push is enabled between
> > > sched_cpu_deactivate() and sched_cpu_activate() (eg. when
> > > !cpu_active()), and gate it's utility with cpu_dying().
> >
> > I'd word that "is enabled below sched_cpu_activate()", since
> > sched_cpu_deactivate() is now out of the picture.
> >
> > [...]
> > > @@ -7639,6 +7639,9 @@ static DEFINE_PER_CPU(struct cpu_stop_wo
> > >
> > > /*
> > > * Ensure we only run per-cpu kthreads once the CPU goes !active.
> > > + *
> > > + * This is active/set between sched_cpu_deactivate() / sched_cpu_activate().
> >
> > Ditto
> >
> > > + * But only effective when the hotplug motion is down.
> > > */
> > > static void balance_push(struct rq *rq)
> > > {

2021-04-20 14:25:30

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote:

> Found the issue:
>
> $ cat hotplug/states:
> 219: sched:active
> 220: online
>
> CPU0:
>
> $ echo 219 > hotplug/fail
> $ echo 0 > online
>
> => cpu_active = 1 cpu_dying = 1
>
> which means that later on, for another CPU hotunplug, in
> __balance_push_cpu_stop(), the fallback rq for a kthread can select that
> CPU0, but __migrate_task() would fail and we end-up in an infinite loop,
> trying to migrate that task to CPU0.
>
> The problem is that for a failure in sched:active, as "online" has no callback,
> there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit would
> not be reset.

Urgh! Good find.

> Maybe cpuhp_reset_state() and cpuhp_set_state() would then be a better place to
> switch the dying bit?

Yes, except now cpuhp_invoke_ap_callback() makes my head hurt, that runs
the callbacks out of order. I _think_ we can ignore it, but ....

Something like the below, let me see if I can reproduce and test.

---
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 838dcf238f92..05272bae953d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -160,9 +160,6 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;

- if (cpu_dying(cpu) != !bringup)
- set_cpu_dying(cpu, !bringup);
-
if (st->fail == state) {
st->fail = CPUHP_INVALID;
return -EAGAIN;
@@ -467,13 +464,16 @@ static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
+ bool bringup = st->state < target;

st->rollback = false;
st->last = NULL;

st->target = target;
st->single = false;
- st->bringup = st->state < target;
+ st->bringup = bringup;
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);

return prev_state;
}
@@ -481,6 +481,8 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
static inline void
cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
{
+ bool bringup = !st->bringup;
+
st->target = prev_state;

/*
@@ -503,7 +505,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
st->state++;
}

- st->bringup = !st->bringup;
+ st->bringup = bringup;
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
}

/* Regular hotplug invocation of the AP hotplug thread */

2021-04-20 14:40:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Tue, Apr 20, 2021 at 04:20:56PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote:
>
> > Found the issue:
> >
> > $ cat hotplug/states:
> > 219: sched:active
> > 220: online
> >
> > CPU0:
> >
> > $ echo 219 > hotplug/fail
> > $ echo 0 > online
> >
> > => cpu_active = 1 cpu_dying = 1
> >
> > which means that later on, for another CPU hotunplug, in
> > __balance_push_cpu_stop(), the fallback rq for a kthread can select that
> > CPU0, but __migrate_task() would fail and we end-up in an infinite loop,
> > trying to migrate that task to CPU0.
> >
> > The problem is that for a failure in sched:active, as "online" has no callback,
> > there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit would
> > not be reset.
>
> Urgh! Good find.
>
> > Maybe cpuhp_reset_state() and cpuhp_set_state() would then be a better place to
> > switch the dying bit?
>
> Yes, except now cpuhp_invoke_ap_callback() makes my head hurt, that runs
> the callbacks out of order. I _think_ we can ignore it, but ....
>
> Something like the below, let me see if I can reproduce and test.

I seem to have triggered the BUG() in select_fallback_rq() with your recipie.
Have cpu0 fail on sched:active, then offline all other CPUs.

Now lemme add that patch.

2021-04-20 14:59:41

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Tue, Apr 20, 2021 at 04:39:04PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 20, 2021 at 04:20:56PM +0200, Peter Zijlstra wrote:
> > On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote:
> >
> > > Found the issue:
> > >
> > > $ cat hotplug/states:
> > > 219: sched:active
> > > 220: online
> > >
> > > CPU0:
> > >
> > > $ echo 219 > hotplug/fail
> > > $ echo 0 > online
> > >
> > > => cpu_active = 1 cpu_dying = 1
> > >
> > > which means that later on, for another CPU hotunplug, in
> > > __balance_push_cpu_stop(), the fallback rq for a kthread can select that
> > > CPU0, but __migrate_task() would fail and we end-up in an infinite loop,
> > > trying to migrate that task to CPU0.
> > >
> > > The problem is that for a failure in sched:active, as "online" has no callback,
> > > there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit would
> > > not be reset.
> >
> > Urgh! Good find.

> I seem to have triggered the BUG() in select_fallback_rq() with your recipie.
> Have cpu0 fail on sched:active, then offline all other CPUs.
>
> Now lemme add that patch.

(which obviously didn't actually build) seems to fix it.

---
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 838dcf238f92..e538518556f4 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,6 +63,7 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
+ int cpu;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -160,9 +161,6 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;

- if (cpu_dying(cpu) != !bringup)
- set_cpu_dying(cpu, !bringup);
-
if (st->fail == state) {
st->fail = CPUHP_INVALID;
return -EAGAIN;
@@ -467,13 +465,16 @@ static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
+ bool bringup = st->state < target;

st->rollback = false;
st->last = NULL;

st->target = target;
st->single = false;
- st->bringup = st->state < target;
+ st->bringup = bringup;
+ if (cpu_dying(st->cpu) != !bringup)
+ set_cpu_dying(st->cpu, !bringup);

return prev_state;
}
@@ -481,6 +482,8 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
static inline void
cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
{
+ bool bringup = !st->bringup;
+
st->target = prev_state;

/*
@@ -503,7 +506,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
st->state++;
}

- st->bringup = !st->bringup;
+ st->bringup = bringup;
+ if (cpu_dying(st->cpu) != !bringup)
+ set_cpu_dying(st->cpu, !bringup);
}

/* Regular hotplug invocation of the AP hotplug thread */
@@ -693,6 +698,7 @@ static void cpuhp_create(unsigned int cpu)

init_completion(&st->done_up);
init_completion(&st->done_down);
+ st->cpu = cpu;
}

static int cpuhp_should_run(unsigned int cpu)

2021-04-20 16:55:06

by Vincent Donnefort

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Tue, Apr 20, 2021 at 04:58:00PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 20, 2021 at 04:39:04PM +0200, Peter Zijlstra wrote:
> > On Tue, Apr 20, 2021 at 04:20:56PM +0200, Peter Zijlstra wrote:
> > > On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote:
> > >
> > > > Found the issue:
> > > >
> > > > $ cat hotplug/states:
> > > > 219: sched:active
> > > > 220: online
> > > >
> > > > CPU0:
> > > >
> > > > $ echo 219 > hotplug/fail
> > > > $ echo 0 > online
> > > >
> > > > => cpu_active = 1 cpu_dying = 1
> > > >
> > > > which means that later on, for another CPU hotunplug, in
> > > > __balance_push_cpu_stop(), the fallback rq for a kthread can select that
> > > > CPU0, but __migrate_task() would fail and we end-up in an infinite loop,
> > > > trying to migrate that task to CPU0.
> > > >
> > > > The problem is that for a failure in sched:active, as "online" has no callback,
> > > > there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit would
> > > > not be reset.
> > >
> > > Urgh! Good find.
>
> > I seem to have triggered the BUG() in select_fallback_rq() with your recipie.
> > Have cpu0 fail on sched:active, then offline all other CPUs.
> >
> > Now lemme add that patch.
>
> (which obviously didn't actually build) seems to fix it.
>
> ---
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 838dcf238f92..e538518556f4 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -63,6 +63,7 @@ struct cpuhp_cpu_state {
> bool rollback;
> bool single;
> bool bringup;
> + int cpu;
> struct hlist_node *node;
> struct hlist_node *last;
> enum cpuhp_state cb_state;
> @@ -160,9 +161,6 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
> int (*cb)(unsigned int cpu);
> int ret, cnt;
>
> - if (cpu_dying(cpu) != !bringup)
> - set_cpu_dying(cpu, !bringup);
> -
> if (st->fail == state) {
> st->fail = CPUHP_INVALID;
> return -EAGAIN;
> @@ -467,13 +465,16 @@ static inline enum cpuhp_state
> cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
> {
> enum cpuhp_state prev_state = st->state;
> + bool bringup = st->state < target;
>
> st->rollback = false;
> st->last = NULL;
>
> st->target = target;
> st->single = false;
> - st->bringup = st->state < target;
> + st->bringup = bringup;
> + if (cpu_dying(st->cpu) != !bringup)
> + set_cpu_dying(st->cpu, !bringup);
>
> return prev_state;
> }
> @@ -481,6 +482,8 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
> static inline void
> cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
> {
> + bool bringup = !st->bringup;
> +
> st->target = prev_state;
>
> /*
> @@ -503,7 +506,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
> st->state++;
> }
>
> - st->bringup = !st->bringup;
> + st->bringup = bringup;
> + if (cpu_dying(st->cpu) != !bringup)
> + set_cpu_dying(st->cpu, !bringup);
> }
>
> /* Regular hotplug invocation of the AP hotplug thread */
> @@ -693,6 +698,7 @@ static void cpuhp_create(unsigned int cpu)
>
> init_completion(&st->done_up);
> init_completion(&st->done_down);
> + st->cpu = cpu;
> }
>
> static int cpuhp_should_run(unsigned int cpu)

All good with that snippet on my end.

I wonder if balance_push() shouldn't use the cpu_of() accessor
instead of rq->cpu.

Otherwise,

+ Reviewed-by: Vincent Donnefort <[email protected]>

2021-04-20 18:10:20

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On Tue, Apr 20, 2021 at 05:53:40PM +0100, Vincent Donnefort wrote:
> All good with that snippet on my end.
>
> I wonder if balance_push() shouldn't use the cpu_of() accessor
> instead of rq->cpu.

That might be a personal quirk of mine, but for code that is under
CONFIG_SMP (as all balancing code must be) I tend to prefer the more
explicit rq->cpu usage. cpu_of() obviously also works.

> Otherwise,
>
> + Reviewed-by: Vincent Donnefort <[email protected]>

Thanks!, now I get to write a Changelog :-)

2021-04-21 11:46:24

by Valentin Schneider

[permalink] [raw]
Subject: Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

On 20/04/21 16:58, Peter Zijlstra wrote:
> On Tue, Apr 20, 2021 at 04:39:04PM +0200, Peter Zijlstra wrote:
>> On Tue, Apr 20, 2021 at 04:20:56PM +0200, Peter Zijlstra wrote:
>> > On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote:
>> >
>> > > Found the issue:
>> > >
>> > > $ cat hotplug/states:
>> > > 219: sched:active
>> > > 220: online
>> > >
>> > > CPU0:
>> > >
>> > > $ echo 219 > hotplug/fail
>> > > $ echo 0 > online
>> > >
>> > > => cpu_active = 1 cpu_dying = 1
>> > >
>> > > which means that later on, for another CPU hotunplug, in
>> > > __balance_push_cpu_stop(), the fallback rq for a kthread can select that
>> > > CPU0, but __migrate_task() would fail and we end-up in an infinite loop,
>> > > trying to migrate that task to CPU0.
>> > >
>> > > The problem is that for a failure in sched:active, as "online" has no callback,
>> > > there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit would
>> > > not be reset.
>> >
>> > Urgh! Good find.
>
>> I seem to have triggered the BUG() in select_fallback_rq() with your recipie.
>> Have cpu0 fail on sched:active, then offline all other CPUs.
>>
>> Now lemme add that patch.
>
> (which obviously didn't actually build) seems to fix it.
>

Moving the cpu_dying_mask update from cpuhp_invoke_callback() to
cpuhp_{set, reset}_state() means we lose an update in cpuhp_issue_call(),
but AFAICT that wasn't required (this doesn't actually change a CPU's
hotplug state, rather executes some newly installed/removed callbacks whose
state maps below the CPU's current hp state).

Actually staring at it some more, it might have caused bugs: if a
cpuhp_setup_state() fails, we can end up in cpuhp_rollback_install() which
will end up calling cpuhp_invoke_callback(bringup=false) and mess with the
dying mask.

FWIW:

Reviewed-by: Valentin Schneider <[email protected]>

Subject: [tip: sched/core] cpumask/hotplug: Fix cpu_dying() state tracking

The following commit has been merged into the sched/core branch of tip:

Commit-ID: 2ea46c6fc9452ac100ad907b051d797225847e33
Gitweb: https://git.kernel.org/tip/2ea46c6fc9452ac100ad907b051d797225847e33
Author: Peter Zijlstra <[email protected]>
AuthorDate: Tue, 20 Apr 2021 20:04:19 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Wed, 21 Apr 2021 13:55:43 +02:00

cpumask/hotplug: Fix cpu_dying() state tracking

Vincent reported that for states with a NULL startup/teardown function
we do not call cpuhp_invoke_callback() (because there is none) and as
such we'll not update the cpu_dying() state.

The stale cpu_dying() can eventually lead to triggering BUG().

Rectify this by updating cpu_dying() in the exact same places the
hotplug machinery tracks its directional state, namely
cpuhp_set_state() and cpuhp_reset_state().

Reported-by: Vincent Donnefort <[email protected]>
Suggested-by: Vincent Donnefort <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Vincent Donnefort <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/cpu.c | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 838dcf2..e538518 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,6 +63,7 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
+ int cpu;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -160,9 +161,6 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;

- if (cpu_dying(cpu) != !bringup)
- set_cpu_dying(cpu, !bringup);
-
if (st->fail == state) {
st->fail = CPUHP_INVALID;
return -EAGAIN;
@@ -467,13 +465,16 @@ static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
+ bool bringup = st->state < target;

st->rollback = false;
st->last = NULL;

st->target = target;
st->single = false;
- st->bringup = st->state < target;
+ st->bringup = bringup;
+ if (cpu_dying(st->cpu) != !bringup)
+ set_cpu_dying(st->cpu, !bringup);

return prev_state;
}
@@ -481,6 +482,8 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
static inline void
cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
{
+ bool bringup = !st->bringup;
+
st->target = prev_state;

/*
@@ -503,7 +506,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
st->state++;
}

- st->bringup = !st->bringup;
+ st->bringup = bringup;
+ if (cpu_dying(st->cpu) != !bringup)
+ set_cpu_dying(st->cpu, !bringup);
}

/* Regular hotplug invocation of the AP hotplug thread */
@@ -693,6 +698,7 @@ static void cpuhp_create(unsigned int cpu)

init_completion(&st->done_up);
init_completion(&st->done_down);
+ st->cpu = cpu;
}

static int cpuhp_should_run(unsigned int cpu)