2020-09-11 08:44:08

by Peter Zijlstra

Subject: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug

In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.

This is ran (as one of the very first steps) from the cpu-hotplug
task which is a per-cpu kthread and completion of the hotplug
operation only requires such tasks.

This contraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable regions on this CPU at hotplug
time without fear of any new ones starting.

This replaces the unlikely(rq->balance_callbacks) test at the tail of
context_switch with an unlikely(rq->balance_work), the fast path is
not affected.
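
A condensed view of the resulting tail of finish_lock_switch(), pieced together from the hunks below (the field actually tested is rq->balance_flags, one byte whose two bits are BALANCE_WORK and BALANCE_PUSH, so the common case remains a single unlikely() test):

static inline void finish_lock_switch(struct rq *rq)
{
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
	if (unlikely(rq->balance_flags)) {
		/*
		 * Queued balance callbacks run as before, except on a
		 * !active CPU (BALANCE_PUSH set), where the incoming
		 * task is pushed away instead.
		 */
		if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
		    !(rq->balance_flags & BALANCE_PUSH) ||
		    !balance_push(rq))
			__balance_callbacks(rq);
	}
	raw_spin_unlock_irq(&rq->lock);
}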

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 5 ++
2 files changed, 106 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3513,8 +3513,10 @@ static inline struct callback_head *spli
struct callback_head *head = rq->balance_callback;

lockdep_assert_held(&rq->lock);
- if (head)
+ if (head) {
rq->balance_callback = NULL;
+ rq->balance_flags &= ~BALANCE_WORK;
+ }

return head;
}
@@ -3569,6 +3571,8 @@ prepare_lock_switch(struct rq *rq, struc
#endif
}

+static bool balance_push(struct rq *rq);
+
static inline void finish_lock_switch(struct rq *rq)
{
/*
@@ -3577,7 +3581,16 @@ static inline void finish_lock_switch(st
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- __balance_callbacks(rq);
+ if (unlikely(rq->balance_flags)) {
+ /*
+ * Run the balance_callbacks, except on hotplug
+ * when we need to push the current task away.
+ */
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ !(rq->balance_flags & BALANCE_PUSH) ||
+ !balance_push(rq))
+ __balance_callbacks(rq);
+ }
raw_spin_unlock_irq(&rq->lock);
}

@@ -6836,6 +6849,87 @@ static void migrate_tasks(struct rq *dea

rq->stop = stop;
}
+
+static int __balance_push_stop(void *arg)
+{
+ struct task_struct *p = arg;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+ int cpu;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
+
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ cpu = select_fallback_rq(rq->cpu, p);
+ rq = __migrate_task(rq, &rf, p, cpu);
+ }
+
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static bool balance_push(struct rq *rq)
+{
+ struct task_struct *push_task = rq->curr;
+
+ lockdep_assert_held(&rq->lock);
+ SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+ /*
+ * Both the cpu-hotplug and stop task are in this class and are
+ * required to complete the hotplug process.
+ */
+ if (is_per_cpu_kthread(push_task))
+ return false;
+
+ get_task_struct(push_task);
+ /*
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
+ * Both preemption and IRQs are still disabled.
+ */
+ raw_spin_unlock(&rq->lock);
+ stop_one_cpu_nowait(rq->cpu, __balance_push_stop, push_task,
+ this_cpu_ptr(&push_work));
+ /*
+ * At this point need_resched() is true and we'll take the loop in
+ * schedule(). The next pick is obviously going to be the stop task
+ * which is_per_cpu_kthread() and will push this task away.
+ */
+ raw_spin_lock(&rq->lock);
+
+ return true;
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (on)
+ rq->balance_flags |= BALANCE_PUSH;
+ else
+ rq->balance_flags &= ~BALANCE_PUSH;
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline bool balance_push(struct rq *rq)
+{
+ return false;
+}
+
#endif /* CONFIG_HOTPLUG_CPU */

void set_rq_online(struct rq *rq)
@@ -6921,6 +7015,8 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;

+ balance_push_set(cpu, false);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
@@ -6968,6 +7064,8 @@ int sched_cpu_deactivate(unsigned int cp
*/
synchronize_rcu();

+ balance_push_set(cpu, true);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
@@ -6981,6 +7079,7 @@ int sched_cpu_deactivate(unsigned int cp

ret = cpuset_cpu_inactive(cpu);
if (ret) {
+ balance_push_set(cpu, false);
set_cpu_active(cpu, true);
return ret;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
unsigned long cpu_capacity_orig;

struct callback_head *balance_callback;
+ unsigned char balance_flags;

unsigned char nohz_idle_balance;
unsigned char idle_balance;
@@ -1384,6 +1385,9 @@ init_numa_balancing(unsigned long clone_

#ifdef CONFIG_SMP

+#define BALANCE_WORK 0x01
+#define BALANCE_PUSH 0x02
+
static inline void
queue_balance_callback(struct rq *rq,
struct callback_head *head,
@@ -1397,6 +1401,7 @@ queue_balance_callback(struct rq *rq,
head->func = (void (*)(struct callback_head *))func;
head->next = rq->balance_callback;
rq->balance_callback = head;
+ rq->balance_flags |= BALANCE_WORK;
}

#define rcu_dereference_check_sched_domain(p) \



2020-09-11 12:24:01

by Valentin Schneider

Subject: Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug


(+Cc VincentD, who's been looking at some of this)

On 11/09/20 09:17, Peter Zijlstra wrote:
> In preparation for migrate_disable(), make sure only per-cpu kthreads
> are allowed to run on !active CPUs.
>
> This is ran (as one of the very first steps) from the cpu-hotplug
> task which is a per-cpu kthread and completion of the hotplug
> operation only requires such tasks.
>
> This contraint enables the migrate_disable() implementation to wait

s/contraint/constraint/

> for completion of all migrate_disable regions on this CPU at hotplug
> time without fear of any new ones starting.
>
> This replaces the unlikely(rq->balance_callbacks) test at the tail of
> context_switch with an unlikely(rq->balance_work), the fast path is
> not affected.
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/sched/core.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++-
> kernel/sched/sched.h | 5 ++
> 2 files changed, 106 insertions(+), 2 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6836,6 +6849,87 @@ static void migrate_tasks(struct rq *dea
>
> rq->stop = stop;
> }
> +
> +static int __balance_push_stop(void *arg)

__balance_push_cpu_stop()? _cpu_stop() seems to be the usual suffix.

> +{
> + struct task_struct *p = arg;
> + struct rq *rq = this_rq();
> + struct rq_flags rf;
> + int cpu;
> +
> + raw_spin_lock_irq(&p->pi_lock);
> + rq_lock(rq, &rf);
> +
> + if (task_rq(p) == rq && task_on_rq_queued(p)) {
> + cpu = select_fallback_rq(rq->cpu, p);
> + rq = __migrate_task(rq, &rf, p, cpu);
> + }
> +
> + rq_unlock(rq, &rf);
> + raw_spin_unlock_irq(&p->pi_lock);
> +
> + put_task_struct(p);
> +
> + return 0;
> +}
> +
> +static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
> +
> +/*
> + * Ensure we only run per-cpu kthreads once the CPU goes !active.
> + */
> +static bool balance_push(struct rq *rq)
> +{
> + struct task_struct *push_task = rq->curr;
> +
> + lockdep_assert_held(&rq->lock);
> + SCHED_WARN_ON(rq->cpu != smp_processor_id());
> +
> + /*
> + * Both the cpu-hotplug and stop task are in this class and are
> + * required to complete the hotplug process.

Nit: s/class/case/? I can't not read "class" as "sched_class", and
those two are *not* in the same sched_class, and confusion ensues.

> + */
> + if (is_per_cpu_kthread(push_task))
> + return false;
> +
> + get_task_struct(push_task);
> + /*
> + * Temporarily drop rq->lock such that we can wake-up the stop task.
> + * Both preemption and IRQs are still disabled.
> + */
> + raw_spin_unlock(&rq->lock);
> + stop_one_cpu_nowait(rq->cpu, __balance_push_stop, push_task,
> + this_cpu_ptr(&push_work));
> + /*
> + * At this point need_resched() is true and we'll take the loop in
> + * schedule(). The next pick is obviously going to be the stop task
> + * which is_per_cpu_kthread() and will push this task away.
> + */
> + raw_spin_lock(&rq->lock);
> +
> + return true;
> +}
> +
> @@ -6968,6 +7064,8 @@ int sched_cpu_deactivate(unsigned int cp
> */
> synchronize_rcu();
>
> + balance_push_set(cpu, true);
> +

IIUC this is going to make every subsequent finish_lock_switch()
migrate the switched-to task if it isn't a pcpu kthread. So this is going
to lead to a dance of

switch_to(<task0>) -> switch_to(<stopper>) -> switch_to(<task1>) -> switch_to(<stopper>) ...

Wouldn't it be better to batch all those in a migrate_tasks() sibling that
skips pcpu kthreads?

> #ifdef CONFIG_SCHED_SMT
> /*
> * When going down, decrement the number of cores with SMT present.

2020-09-11 16:04:34

by Valentin Schneider

Subject: Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug


On 11/09/20 13:29, [email protected] wrote:
> On Fri, Sep 11, 2020 at 01:17:45PM +0100, Valentin Schneider wrote:
>
>> > @@ -6968,6 +7064,8 @@ int sched_cpu_deactivate(unsigned int cp
>> > */
>> > synchronize_rcu();
>> >
>> > + balance_push_set(cpu, true);
>> > +
>>
>> IIUC this is going to make every subsequent finish_lock_switch()
>> migrate the switched-to task if it isn't a pcpu kthread. So this is going
>> to lead to a dance of
>>
>> switch_to(<task0>) -> switch_to(<stopper>) -> switch_to(<task1>) -> switch_to(<stopper>) ...
>>
>> Wouldn't it be better to batch all those in a migrate_tasks() sibling that
>> skips pcpu kthreads?
>
> That's 'difficult', this is hotplug, performance is not a consideration.
>

Aye aye.

The reason I'm trying to care about this is (don't wince too hard) for that
CPU pause thing [1]. Vincent's been the one doing all the actual work so I
should let him pitch in, but TL;DR if we could "relatively quickly" migrate
all !pcpu tasks after clearing the active bit, we could stop hotplug there
and have our "CPU pause" thing.

[1]: https://lwn.net/Articles/820825/

> Basically we don't have an iterator for the runqueues, so finding these
> tasks is hard.

Mmph right, we'd pretty much have to "enjoy" iterating & skipping pcpu
threads every __pick_migrate_task() call...
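
Roughly, a batched variant would have to keep picking and stop at the first per-cpu kthread, something like the sketch below -- hypothetical, not from the thread, loosely modelled on migrate_tasks(); the name migrate_user_tasks() is made up and the p->pi_lock juggling migrate_tasks() does is omitted. The awkward part shows immediately: the pick cannot step over a per-cpu kthread, so anything queued behind one stays out of reach.

static void migrate_user_tasks(struct rq *dead_rq, struct rq_flags *rf)
{
	struct rq *rq = dead_rq;
	struct task_struct *next;
	int dest_cpu;

	for (;;) {
		/* Only the stop task left: nothing to push. */
		if (rq->nr_running == 1)
			break;

		next = __pick_migrate_task(rq);

		/*
		 * Per-cpu kthreads must stay to complete hotplug, but the
		 * pick has no way to skip over them, so tasks queued
		 * behind this one remain out of reach.
		 */
		if (is_per_cpu_kthread(next))
			break;

		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
		rq = __migrate_task(rq, rf, next, dest_cpu);
		if (rq != dead_rq) {
			rq_unlock(rq, rf);
			rq = dead_rq;
			rq_lock(rq, rf);
		}
	}
}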

2020-09-11 17:47:09

by Peter Zijlstra

Subject: Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug

On Fri, Sep 11, 2020 at 01:17:45PM +0100, Valentin Schneider wrote:

> > @@ -6968,6 +7064,8 @@ int sched_cpu_deactivate(unsigned int cp
> > */
> > synchronize_rcu();
> >
> > + balance_push_set(cpu, true);
> > +
>
> IIUC this is going to make every subsequent finish_lock_switch()
> migrate the switched-to task if it isn't a pcpu kthread. So this is going
> to lead to a dance of
>
> switch_to(<task0>) -> switch_to(<stopper>) -> switch_to(<task1>) -> switch_to(<stopper>) ...
>
> Wouldn't it be better to batch all those in a migrate_tasks() sibling that
> skips pcpu kthreads?

That's 'difficult', this is hotplug, performance is not a consideration.

Basically we don't have an iterator for the runqueues, so finding these
tasks is hard.

by Sebastian Andrzej Siewior

Subject: Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug

On 2020-09-11 10:17:47 [+0200], Peter Zijlstra wrote:
> In preparation for migrate_disable(), make sure only per-cpu kthreads
> are allowed to run on !active CPUs.
>
> This is ran (as one of the very first steps) from the cpu-hotplug
> task which is a per-cpu kthread and completion of the hotplug
> operation only requires such tasks.
>
> This contraint enables the migrate_disable() implementation to wait
> for completion of all migrate_disable regions on this CPU at hotplug
> time without fear of any new ones starting.
>
> This replaces the unlikely(rq->balance_callbacks) test at the tail of
> context_switch with an unlikely(rq->balance_work), the fast path is
> not affected.

With this on top of -rc5 I get:

[ 42.678670] process 1816 (hackbench) no longer affine to cpu2
[ 42.678684] process 1817 (hackbench) no longer affine to cpu2
[ 42.710502] ------------[ cut here ]------------
[ 42.711505] rq->clock_update_flags < RQCF_ACT_SKIP
[ 42.711514] WARNING: CPU: 2 PID: 19 at kernel/sched/sched.h:1141 update_curr+0xe3/0x3f0
[ 42.714005] Modules linked in:
[ 42.714582] CPU: 2 PID: 19 Comm: migration/2 Not tainted 5.9.0-rc5+ #107
[ 42.715836] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
[ 42.717367] RIP: 0010:update_curr+0xe3/0x3f0
[ 42.718212] Code: 09 00 00 01 0f 87 6c ff ff ff 80 3d 52 bd 59 01 00 0f 85 5f ff ff ff 48 c7 c7 f8 7e 2b 82 c6 05 3e bd 59 01 01 e8 bc 9a fb ff <0f> 0b e9 45 ff ff ff 8b 05 d8 d4 59 01 48 8d 6b 80 8b 15 ba 5a 5b
[ 42.721839] RSP: 0018:ffffc900000e3d60 EFLAGS: 00010086
[ 42.722827] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[ 42.724166] RDX: ffff888275fa4540 RSI: ffffffff810f37e6 RDI: ffffffff82463940
[ 42.725547] RBP: ffff888276ca9600 R08: 0000000000000000 R09: 0000000000000026
[ 42.726925] R10: 0000000000000046 R11: 0000000000000046 R12: ffff888276ca9580
[ 42.728268] R13: ffff888276ca9580 R14: ffff888276ca9600 R15: ffff88826aa5c5c0
[ 42.729659] FS: 0000000000000000(0000) GS:ffff888276c80000(0000) knlGS:0000000000000000
[ 42.731186] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 42.732272] CR2: 00007f7952603f90 CR3: 000000026aa56000 CR4: 00000000003506e0
[ 42.733657] Call Trace:
[ 42.734136] dequeue_task_fair+0xfa/0x6f0
[ 42.734899] do_set_cpus_allowed+0xbb/0x1c0
[ 42.735692] cpuset_cpus_allowed_fallback+0x73/0x1a0
[ 42.736635] select_fallback_rq+0x129/0x160
[ 42.737450] __balance_push_stop+0x132/0x230
[ 42.738291] ? migration_cpu_stop+0x1f0/0x1f0
[ 42.739118] cpu_stopper_thread+0x76/0x100
[ 42.739897] ? smpboot_thread_fn+0x21/0x1f0
[ 42.740691] smpboot_thread_fn+0x106/0x1f0
[ 42.741487] ? __smpboot_create_thread.part.0+0x100/0x100
[ 42.742539] kthread+0x135/0x150
[ 42.743153] ? kthread_create_worker_on_cpu+0x60/0x60
[ 42.744106] ret_from_fork+0x22/0x30
[ 42.744792] irq event stamp: 100
[ 42.745433] hardirqs last enabled at (99): [<ffffffff81a8d46f>] _raw_spin_unlock_irq+0x1f/0x30
[ 42.747116] hardirqs last disabled at (100): [<ffffffff81a8d27e>] _raw_spin_lock_irq+0x3e/0x40
[ 42.748748] softirqs last enabled at (0): [<ffffffff81077b75>] copy_process+0x665/0x1b00
[ 42.750353] softirqs last disabled at (0): [<0000000000000000>] 0x0
[ 42.751552] ---[ end trace d98bb30ad2616d58 ]---

That comes due to DEQUEUE_NOCLOCK in do_set_cpus_allowed(). And then
there is this:

[ 42.752454] ======================================================
[ 42.752455] WARNING: possible circular locking dependency detected
[ 42.752455] 5.9.0-rc5+ #107 Not tainted
[ 42.752455] ------------------------------------------------------
[ 42.752456] migration/2/19 is trying to acquire lock:
[ 42.752456] ffffffff824639f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0xa/0x30
[ 42.752458] but task is already holding lock:
[ 42.752458] ffff888276ca9598 (&rq->lock){-.-.}-{2:2}, at: __balance_push_stop+0x50/0x230
[ 42.752460] which lock already depends on the new lock.
[ 42.752460] the existing dependency chain (in reverse order) is:
[ 42.752461] -> #2 (&rq->lock){-.-.}-{2:2}:
[ 42.752462] _raw_spin_lock+0x27/0x40
[ 42.752462] task_fork_fair+0x30/0x1b0
[ 42.752462] sched_fork+0x10b/0x280
[ 42.752463] copy_process+0x702/0x1b00
[ 42.752464] _do_fork+0x5a/0x450
[ 42.752464] kernel_thread+0x50/0x70
[ 42.752465] rest_init+0x19/0x240
[ 42.752465] start_kernel+0x548/0x56a
[ 42.752466] secondary_startup_64+0xa4/0xb0

[ 42.752467] -> #1 (&p->pi_lock){-.-.}-{2:2}:
[ 42.752468] _raw_spin_lock_irqsave+0x36/0x50
[ 42.752469] try_to_wake_up+0x4e/0x720
[ 42.752469] up+0x3b/0x50
[ 42.752470] __up_console_sem+0x29/0x60
[ 42.752470] console_unlock+0x390/0x690
[ 42.752471] con_font_op+0x2ec/0x480
[ 42.752471] vt_ioctl+0x4d1/0x16e0
[ 42.752471] tty_ioctl+0x3e2/0x9a0
[ 42.752472] __x64_sys_ioctl+0x81/0x9a
[ 42.752472] do_syscall_64+0x33/0x40
[ 42.752472] entry_SYSCALL_64_after_hwframe+0x44/0xa9

[ 42.752473] -> #0 ((console_sem).lock){-.-.}-{2:2}:
[ 42.752474] __lock_acquire+0x11af/0x2010
[ 42.752474] lock_acquire+0xb7/0x400
[ 42.752474] _raw_spin_lock_irqsave+0x36/0x50
[ 42.752474] down_trylock+0xa/0x30
[ 42.752475] __down_trylock_console_sem+0x23/0x90
[ 42.752475] vprintk_emit+0xf6/0x380
[ 42.752476] printk+0x53/0x6a
[ 42.752476] __warn_printk+0x42/0x84
[ 42.752476] update_curr+0xe3/0x3f0
[ 42.752477] dequeue_task_fair+0xfa/0x6f0
[ 42.752477] do_set_cpus_allowed+0xbb/0x1c0
[ 42.752478] cpuset_cpus_allowed_fallback+0x73/0x1a0
[ 42.752479] select_fallback_rq+0x129/0x160
[ 42.752479] __balance_push_stop+0x132/0x230
[ 42.752480] cpu_stopper_thread+0x76/0x100
[ 42.752480] smpboot_thread_fn+0x106/0x1f0
[ 42.752480] kthread+0x135/0x150
[ 42.752480] ret_from_fork+0x22/0x30

[ 42.752481] other info that might help us debug this:
[ 42.752482] Chain exists of:
[ 42.752482] (console_sem).lock --> &p->pi_lock --> &rq->lock
[ 42.752483] Possible unsafe locking scenario:
[ 42.752484] CPU0 CPU1
[ 42.752484] ---- ----
[ 42.752484] lock(&rq->lock);
[ 42.752485] lock(&p->pi_lock);
[ 42.752486] lock(&rq->lock);
[ 42.752486] lock((console_sem).lock);
[ 42.752487] *** DEADLOCK ***
[ 42.752488] 3 locks held by migration/2/19:
[ 42.752488] #0: ffff88826aa5d0f8 (&p->pi_lock){-.-.}-{2:2}, at: __balance_push_stop+0x48/0x230
[ 42.752489] #1: ffff888276ca9598 (&rq->lock){-.-.}-{2:2}, at: __balance_push_stop+0x50/0x230
[ 42.752490] #2: ffffffff82478e00 (rcu_read_lock){....}-{1:2}, at: cpuset_cpus_allowed_fallback+0x0/0x1a0
[ 42.752491] stack backtrace:
[ 42.752492] CPU: 2 PID: 19 Comm: migration/2 Not tainted 5.9.0-rc5+ #107
[ 42.752492] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-1 04/01/2014
[ 42.752492] Call Trace:
[ 42.752493] dump_stack+0x77/0xa0
[ 42.752493] check_noncircular+0x156/0x170
[ 42.752493] __lock_acquire+0x11af/0x2010
[ 42.752493] ? __lock_acquire+0x1692/0x2010
[ 42.752493] lock_acquire+0xb7/0x400
[ 42.752494] ? down_trylock+0xa/0x30
[ 42.752494] ? log_store.constprop.0+0x1a4/0x250
[ 42.752494] ? printk+0x53/0x6a
[ 42.752494] _raw_spin_lock_irqsave+0x36/0x50
[ 42.752495] ? down_trylock+0xa/0x30
[ 42.752495] down_trylock+0xa/0x30
[ 42.752495] ? printk+0x53/0x6a
[ 42.752495] __down_trylock_console_sem+0x23/0x90
[ 42.752495] vprintk_emit+0xf6/0x380
[ 42.752496] printk+0x53/0x6a
[ 42.752496] __warn_printk+0x42/0x84
[ 42.752496] update_curr+0xe3/0x3f0
[ 42.752496] dequeue_task_fair+0xfa/0x6f0
[ 42.752497] do_set_cpus_allowed+0xbb/0x1c0
[ 42.752497] cpuset_cpus_allowed_fallback+0x73/0x1a0
[ 42.752497] select_fallback_rq+0x129/0x160
[ 42.752497] __balance_push_stop+0x132/0x230
[ 42.752497] ? migration_cpu_stop+0x1f0/0x1f0
[ 42.752498] cpu_stopper_thread+0x76/0x100
[ 42.752498] ? smpboot_thread_fn+0x21/0x1f0
[ 42.752498] smpboot_thread_fn+0x106/0x1f0
[ 42.752498] ? __smpboot_create_thread.part.0+0x100/0x100
[ 42.752499] kthread+0x135/0x150
[ 42.752499] ? kthread_create_worker_on_cpu+0x60/0x60
[ 42.752499] ret_from_fork+0x22/0x30
[ 42.878095] numa_remove_cpu cpu 2 node 0: mask now 0-1,3-7
[ 42.879977] smpboot: CPU 2 is now offline

Sebastian

2020-09-16 18:57:40

by Peter Zijlstra

Subject: Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug

On Wed, Sep 16, 2020 at 12:18:45PM +0200, Sebastian Andrzej Siewior wrote:
> With this on top of -rc5 I get:
>
> [ 42.678670] process 1816 (hackbench) no longer affine to cpu2
> [ 42.678684] process 1817 (hackbench) no longer affine to cpu2
> [ 42.710502] ------------[ cut here ]------------
> [ 42.711505] rq->clock_update_flags < RQCF_ACT_SKIP
> [ 42.711514] WARNING: CPU: 2 PID: 19 at kernel/sched/sched.h:1141 update_curr+0xe3/0x3f0

> [ 42.736635] select_fallback_rq+0x129/0x160
> [ 42.737450] __balance_push_stop+0x132/0x230

Duh.. I wonder why I didn't see that.. anyway, I think the below version
ought to cure that.

> That comes due to DEQUEUE_NOCLOCK in do_set_cpus_allowed(). And then
> there is this:
>
> [ 42.752454] ======================================================
> [ 42.752455] WARNING: possible circular locking dependency detected
> [ 42.752455] 5.9.0-rc5+ #107 Not tainted
> [ 42.752455] ------------------------------------------------------
> [ 42.752456] migration/2/19 is trying to acquire lock:
> [ 42.752456] ffffffff824639f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0xa/0x30
> [ 42.752458] but task is already holding lock:
> [ 42.752458] ffff888276ca9598 (&rq->lock){-.-.}-{2:2}, at: __balance_push_stop+0x50/0x230
> [ 42.752460] which lock already depends on the new lock.

Yeah, that's the known issue with printk()...


---
Subject: sched/hotplug: Ensure only per-cpu kthreads run during hotplug
From: Peter Zijlstra <[email protected]>
Date: Fri Sep 11 09:54:27 CEST 2020

In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.

This is ran (as one of the very first steps) from the cpu-hotplug
task which is a per-cpu kthread and completion of the hotplug
operation only requires such tasks.

This constraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable regions on this CPU at hotplug
time without fear of any new ones starting.
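
Purely as an illustration of the kind of wait this enables (the migrate_disable() implementation itself is not part of this patch): with a hypothetical per-CPU count of tasks currently inside a migrate_disable region, that count can only drain once the CPU is !active, so a later hotplug step could simply wait for it to hit zero. The names below (nr_pinned, wait_for_pinned_tasks) are illustrative only.

/* Hypothetical sketch; not part of this series. */
static void wait_for_pinned_tasks(struct rq *rq)
{
	/*
	 * No new migrate_disable() region can start on this CPU: it is
	 * !active and only per-cpu kthreads (which complete the hotplug
	 * operation) still run here, so nr_pinned can only go down.
	 */
	while (READ_ONCE(rq->nr_pinned))
		schedule_timeout_uninterruptible(1);
}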

This replaces the unlikely(rq->balance_callbacks) test at the tail of
context_switch with an unlikely(rq->balance_work), the fast path is
not affected.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sched/sched.h | 5 ++
2 files changed, 117 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3513,8 +3513,10 @@ static inline struct callback_head *spli
struct callback_head *head = rq->balance_callback;

lockdep_assert_held(&rq->lock);
- if (head)
+ if (head) {
rq->balance_callback = NULL;
+ rq->balance_flags &= ~BALANCE_WORK;
+ }

return head;
}
@@ -3535,6 +3537,22 @@ static inline void balance_callbacks(str
}
}

+static bool balance_push(struct rq *rq);
+
+static inline void balance_switch(struct rq *rq)
+{
+ if (unlikely(rq->balance_flags)) {
+ /*
+ * Run the balance_callbacks, except on hotplug
+ * when we need to push the current task away.
+ */
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+ !(rq->balance_flags & BALANCE_PUSH) ||
+ !balance_push(rq))
+ __balance_callbacks(rq);
+ }
+}
+
#else

static inline void __balance_callbacks(struct rq *rq)
@@ -3550,6 +3568,10 @@ static inline void balance_callbacks(str
{
}

+static inline void balance_switch(struct rq *rq)
+{
+}
+
#endif

static inline void
@@ -3577,7 +3599,7 @@ static inline void finish_lock_switch(st
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- __balance_callbacks(rq);
+ balance_switch(rq);
raw_spin_unlock_irq(&rq->lock);
}

@@ -6836,6 +6858,89 @@ static void migrate_tasks(struct rq *dea

rq->stop = stop;
}
+
+static int __balance_push_cpu_stop(void *arg)
+{
+ struct task_struct *p = arg;
+ struct rq *rq = this_rq();
+ struct rq_flags rf;
+ int cpu;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
+
+ update_rq_clock();
+
+ if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ cpu = select_fallback_rq(rq->cpu, p);
+ rq = __migrate_task(rq, &rf, p, cpu);
+ }
+
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static bool balance_push(struct rq *rq)
+{
+ struct task_struct *push_task = rq->curr;
+
+ lockdep_assert_held(&rq->lock);
+ SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+ /*
+ * Both the cpu-hotplug and stop task are in this case and are
+ * required to complete the hotplug process.
+ */
+ if (is_per_cpu_kthread(push_task))
+ return false;
+
+ get_task_struct(push_task);
+ /*
+ * Temporarily drop rq->lock such that we can wake-up the stop task.
+ * Both preemption and IRQs are still disabled.
+ */
+ raw_spin_unlock(&rq->lock);
+ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task,
+ this_cpu_ptr(&push_work));
+ /*
+ * At this point need_resched() is true and we'll take the loop in
+ * schedule(). The next pick is obviously going to be the stop task
+ * which is_per_cpu_kthread() and will push this task away.
+ */
+ raw_spin_lock(&rq->lock);
+
+ return true;
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+ if (on)
+ rq->balance_flags |= BALANCE_PUSH;
+ else
+ rq->balance_flags &= ~BALANCE_PUSH;
+ rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline bool balance_push(struct rq *rq)
+{
+ return false;
+}
+
#endif /* CONFIG_HOTPLUG_CPU */

void set_rq_online(struct rq *rq)
@@ -6921,6 +7026,8 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;

+ balance_push_set(cpu, false);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going up, increment the number of cores with SMT present.
@@ -6968,6 +7075,8 @@ int sched_cpu_deactivate(unsigned int cp
*/
synchronize_rcu();

+ balance_push_set(cpu, true);
+
#ifdef CONFIG_SCHED_SMT
/*
* When going down, decrement the number of cores with SMT present.
@@ -6981,6 +7090,7 @@ int sched_cpu_deactivate(unsigned int cp

ret = cpuset_cpu_inactive(cpu);
if (ret) {
+ balance_push_set(cpu, false);
set_cpu_active(cpu, true);
return ret;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
unsigned long cpu_capacity_orig;

struct callback_head *balance_callback;
+ unsigned char balance_flags;

unsigned char nohz_idle_balance;
unsigned char idle_balance;
@@ -1384,6 +1385,9 @@ init_numa_balancing(unsigned long clone_

#ifdef CONFIG_SMP

+#define BALANCE_WORK 0x01
+#define BALANCE_PUSH 0x02
+
static inline void
queue_balance_callback(struct rq *rq,
struct callback_head *head,
@@ -1397,6 +1401,7 @@ queue_balance_callback(struct rq *rq,
head->func = (void (*)(struct callback_head *))func;
head->next = rq->balance_callback;
rq->balance_callback = head;
+ rq->balance_flags |= BALANCE_WORK;
}

#define rcu_dereference_check_sched_domain(p) \

by Sebastian Andrzej Siewior

Subject: Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug

On 2020-09-16 14:10:20 [+0200], [email protected] wrote:

squeeze that in please:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a4fe22b8b8418..bed3cd28af578 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6866,7 +6866,7 @@ static int __balance_push_cpu_stop(void *arg)
raw_spin_lock_irq(&p->pi_lock);
rq_lock(rq, &rf);

- update_rq_clock();
+ update_rq_clock(rq);

if (task_rq(p) == rq && task_on_rq_queued(p)) {
cpu = select_fallback_rq(rq->cpu, p);


and count me in :)

Sebastian

2020-09-16 20:48:31

by Peter Zijlstra

Subject: Re: [PATCH 2/2] sched/hotplug: Ensure only per-cpu kthreads run during hotplug

On Wed, Sep 16, 2020 at 03:58:17PM +0200, Sebastian Andrzej Siewior wrote:
> On 2020-09-16 14:10:20 [+0200], [email protected] wrote:
>
> squeeze that in please:
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a4fe22b8b8418..bed3cd28af578 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6866,7 +6866,7 @@ static int __balance_push_cpu_stop(void *arg)
> raw_spin_lock_irq(&p->pi_lock);
> rq_lock(rq, &rf);
>
> - update_rq_clock();
> + update_rq_clock(rq);
>
> if (task_rq(p) == rq && task_on_rq_queued(p)) {
> cpu = select_fallback_rq(rq->cpu, p);
>
>
> and count me in :)

Duh... /me goes in search of the brown paper bag -- again!
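
For reference, with Sebastian's fixup folded into the v2 patch above, the stopper callback ends up looking like this (consolidated from the diffs in this thread):

static int __balance_push_cpu_stop(void *arg)
{
	struct task_struct *p = arg;
	struct rq *rq = this_rq();
	struct rq_flags rf;
	int cpu;

	raw_spin_lock_irq(&p->pi_lock);
	rq_lock(rq, &rf);

	/*
	 * Refresh the clock while holding rq->lock, so the DEQUEUE_NOCLOCK
	 * dequeue done via select_fallback_rq() -> cpuset_cpus_allowed_fallback()
	 * -> do_set_cpus_allowed() does not trip the RQCF_ACT_SKIP assertion.
	 */
	update_rq_clock(rq);

	if (task_rq(p) == rq && task_on_rq_queued(p)) {
		cpu = select_fallback_rq(rq->cpu, p);
		rq = __migrate_task(rq, &rf, p, cpu);
	}

	rq_unlock(rq, &rf);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}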