From: Thomas Gleixner <[email protected]>
On CPU unplug, tasks which are in a migrate disabled region cannot be pushed
to a different CPU until they return to a migratable state.
Account the number of tasks on a runqueue which are in a migrate disabled
section and make the hotplug wait mechanism respect that.
Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------
kernel/sched/sched.h | 4 ++++
2 files changed, 34 insertions(+), 6 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1725,10 +1725,17 @@ static void migrate_disable_switch(struc
void migrate_disable(void)
{
- if (current->migration_disabled++)
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+ p->migration_disabled++;
return;
+ }
- barrier();
+ preempt_disable();
+ this_rq()->nr_pinned++;
+ p->migration_disabled = 1;
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
@@ -1755,6 +1762,7 @@ void migrate_enable(void)
*/
barrier();
p->migration_disabled = 0;
+ this_rq()->nr_pinned--;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -1764,6 +1772,11 @@ static inline bool is_migration_disabled
return p->migration_disabled;
}
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return rq->nr_pinned;
+}
+
#endif
/*
@@ -2634,6 +2647,11 @@ static inline bool is_migration_disabled
return false;
}
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return false;
+}
+
#endif
static void
@@ -7006,15 +7024,20 @@ static bool balance_push(struct rq *rq)
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
*/
- if (is_per_cpu_kthread(push_task)) {
+ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
/*
* If this is the idle task on the outgoing CPU try to wake
* up the hotplug control thread which might wait for the
* last task to vanish. The rcuwait_active() check is
* accurate here because the waiter is pinned on this CPU
* and can't obviously be running in parallel.
+ *
+ * On RT kernels this also has to check whether there are
+ * pinned and scheduled out tasks on the runqueue. They
+ * need to leave the migrate disabled section first.
*/
- if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+ rcuwait_active(&rq->hotplug_wait)) {
raw_spin_unlock(&rq->lock);
rcuwait_wake_up(&rq->hotplug_wait);
raw_spin_lock(&rq->lock);
@@ -7063,7 +7086,8 @@ static void balance_hotplug_wait(void)
{
struct rq *rq = this_rq();
- rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+ rcuwait_wait_event(&rq->hotplug_wait,
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
TASK_UNINTERRUPTIBLE);
}
@@ -7310,7 +7334,7 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
- BUG_ON(rq->nr_running != 1);
+ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1053,6 +1053,10 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+ unsigned int nr_pinned;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
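Read on its own, the mechanism is compact: migrate_disable()/migrate_enable()
maintain a per-runqueue nr_pinned count, and the hotplug side refuses to treat
the runqueue as empty while that count is non-zero. Below is a minimal
userspace model of just that accounting, for illustration only: the pthread
mutex/condvar, the *_model() names and the sleeps are assumptions of this
sketch, not kernel code, and the kernel's additional rq->nr_running == 1
condition is elided.

/* build: cc -pthread nr_pinned_model.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct rq_model {
	pthread_mutex_t lock;
	pthread_cond_t  unpinned;
	int nr_pinned;		/* tasks currently in a migrate disabled section */
};

static struct rq_model rq = {
	.lock     = PTHREAD_MUTEX_INITIALIZER,
	.unpinned = PTHREAD_COND_INITIALIZER,
};

static void migrate_disable_model(void)
{
	pthread_mutex_lock(&rq.lock);
	rq.nr_pinned++;			/* task may not be pushed off this CPU */
	pthread_mutex_unlock(&rq.lock);
}

static void migrate_enable_model(void)
{
	pthread_mutex_lock(&rq.lock);
	if (--rq.nr_pinned == 0)	/* last pinned task left its section */
		pthread_cond_signal(&rq.unpinned);
	pthread_mutex_unlock(&rq.lock);
}

/* Stand-in for balance_hotplug_wait(): block until nothing is pinned. */
static void balance_hotplug_wait_model(void)
{
	pthread_mutex_lock(&rq.lock);
	while (rq.nr_pinned)
		pthread_cond_wait(&rq.unpinned, &rq.lock);
	pthread_mutex_unlock(&rq.lock);
}

static void *pinned_task(void *unused)
{
	(void)unused;
	migrate_disable_model();
	usleep(100 * 1000);		/* work done while pinned */
	migrate_enable_model();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, pinned_task, NULL);
	usleep(10 * 1000);		/* make it likely the task entered the section */
	balance_hotplug_wait_model();	/* "hotplug" side waits for it to leave */
	puts("no pinned tasks left, the CPU could go down now");
	pthread_join(t, NULL);
	return 0;
}

In the kernel the waiter side is rcuwait based and the wakeup comes from
balance_push() once the last pinned task leaves its section; the condition
variable here only stands in for that handshake.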
On 23/10/20 11:12, Peter Zijlstra wrote:
> @@ -7006,15 +7024,20 @@ static bool balance_push(struct rq *rq)
> * Both the cpu-hotplug and stop task are in this case and are
> * required to complete the hotplug process.
> */
> - if (is_per_cpu_kthread(push_task)) {
> + if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
is_migration_disabled(p) implies rq_has_pinned_tasks(task_rq(p)), right?
So having a "simple"
if (is_migration_disabled(push_task))
return;
would help simpletons like me trying to read through this.
> /*
> * If this is the idle task on the outgoing CPU try to wake
> * up the hotplug control thread which might wait for the
> * last task to vanish. The rcuwait_active() check is
> * accurate here because the waiter is pinned on this CPU
> * and can't obviously be running in parallel.
> + *
> + * On RT kernels this also has to check whether there are
> + * pinned and scheduled out tasks on the runqueue. They
> + * need to leave the migrate disabled section first.
> */
> - if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
> + if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
> + rcuwait_active(&rq->hotplug_wait)) {
> raw_spin_unlock(&rq->lock);
> rcuwait_wake_up(&rq->hotplug_wait);
> raw_spin_lock(&rq->lock);
On Thu, Oct 29, 2020 at 04:27:09PM +0000, Valentin Schneider wrote:
>
> On 23/10/20 11:12, Peter Zijlstra wrote:
> > @@ -7006,15 +7024,20 @@ static bool balance_push(struct rq *rq)
> > * Both the cpu-hotplug and stop task are in this case and are
> > * required to complete the hotplug process.
> > */
> > - if (is_per_cpu_kthread(push_task)) {
> > + if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
>
> is_migration_disabled(p) implies rq_has_pinned_tasks(task_rq(p)), right?
In general, no, in this particular case, yes. Specifically you need
migrate_disable() + schedule() in order to get nr_pinned incremented.
We just happen to run at the tail end of schedule(), so yeah, here it
works.
> So having a "simple"
>
> if (is_migration_disabled(push_task))
> return;
>
> would help simpletons like me trying to read through this.
Can do I suppose, although I'm not sure what, if anything, that helps,
because then we need yet another comment explaining things.
I ended up with the below. Is that an improvement?
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d7d5b7b9c99..c9c69511ece4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7226,11 +7226,19 @@ static void balance_push(struct rq *rq)
lockdep_assert_held(&rq->lock);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
+ /*
+ * When migrate_disable(), we'll also have nr_pinned incremented due to
+ * this being the tail end of schedule(). Therefore we do not need to wake
+ * the hotplug_wait and go straight to jail^Wexit.
+ */
+ if (is_migration_disabled(push_task))
+ return;
+
/*
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
*/
- if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
+ if (is_per_cpu_kthread(push_task)) {
/*
* If this is the idle task on the outgoing CPU try to wake
* up the hotplug control thread which might wait for the
On 29/10/20 17:34, Peter Zijlstra wrote:
> On Thu, Oct 29, 2020 at 04:27:09PM +0000, Valentin Schneider wrote:
[...]
> Can do I suppose, although I'm no sure what, if anything that helps,
> because then we needs yet another comment explaining things.
>
> I ended up with the below. Is that an improvement?
I'm leaning towards "yes", but YMMV.
>
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3d7d5b7b9c99..c9c69511ece4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7226,11 +7226,19 @@ static void balance_push(struct rq *rq)
> lockdep_assert_held(&rq->lock);
> SCHED_WARN_ON(rq->cpu != smp_processor_id());
>
> + /*
> + * When migrate_disable(), we'll also have nr_pinned incremented due to
> + * this being the tail end of schedule(). Therefore we do not need to wake
> + * the hotplug_wait and go straight to jail^Wexit.
> + */
> + if (is_migration_disabled(push_task))
> + return;
> +
> /*
> * Both the cpu-hotplug and stop task are in this case and are
> * required to complete the hotplug process.
> */
> - if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
> + if (is_per_cpu_kthread(push_task)) {
> /*
> * If this is the idle task on the outgoing CPU try to wake
> * up the hotplug control thread which might wait for the
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 3015ef4b98f53fe7eba4f5f82f562c0e074d213c
Gitweb: https://git.kernel.org/tip/3015ef4b98f53fe7eba4f5f82f562c0e074d213c
Author: Thomas Gleixner <[email protected]>
AuthorDate: Wed, 26 Aug 2020 14:08:10 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 10 Nov 2020 18:39:00 +01:00
sched/core: Make migrate disable and CPU hotplug cooperative
On CPU unplug, tasks which are in a migrate disabled region cannot be pushed
to a different CPU until they return to a migratable state.
Account the number of tasks on a runqueue which are in a migrate disabled
section and make the hotplug wait mechanism respect that.
Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Reviewed-by: Daniel Bristot de Oliveira <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------
kernel/sched/sched.h | 4 ++++
2 files changed, 34 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0efc1e4..6ea593c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1721,10 +1721,17 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
void migrate_disable(void)
{
- if (current->migration_disabled++)
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+ p->migration_disabled++;
return;
+ }
- barrier();
+ preempt_disable();
+ this_rq()->nr_pinned++;
+ p->migration_disabled = 1;
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
@@ -1751,6 +1758,7 @@ void migrate_enable(void)
*/
barrier();
p->migration_disabled = 0;
+ this_rq()->nr_pinned--;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
@@ -1760,6 +1768,11 @@ static inline bool is_migration_disabled(struct task_struct *p)
return p->migration_disabled;
}
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return rq->nr_pinned;
+}
+
#endif
/*
@@ -2693,6 +2706,11 @@ static inline bool is_migration_disabled(struct task_struct *p)
return false;
}
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return false;
+}
+
#endif
static void
@@ -7066,15 +7084,20 @@ static void balance_push(struct rq *rq)
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
*/
- if (is_per_cpu_kthread(push_task)) {
+ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
/*
* If this is the idle task on the outgoing CPU try to wake
* up the hotplug control thread which might wait for the
* last task to vanish. The rcuwait_active() check is
* accurate here because the waiter is pinned on this CPU
* and can't obviously be running in parallel.
+ *
+ * On RT kernels this also has to check whether there are
+ * pinned and scheduled out tasks on the runqueue. They
+ * need to leave the migrate disabled section first.
*/
- if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
+ if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
+ rcuwait_active(&rq->hotplug_wait)) {
raw_spin_unlock(&rq->lock);
rcuwait_wake_up(&rq->hotplug_wait);
raw_spin_lock(&rq->lock);
@@ -7121,7 +7144,8 @@ static void balance_hotplug_wait(void)
{
struct rq *rq = this_rq();
- rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
+ rcuwait_wait_event(&rq->hotplug_wait,
+ rq->nr_running == 1 && !rq_has_pinned_tasks(rq),
TASK_UNINTERRUPTIBLE);
}
@@ -7366,7 +7390,7 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
- BUG_ON(rq->nr_running != 1);
+ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72d8e47..42de140 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1053,6 +1053,10 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
#endif
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+ unsigned int nr_pinned;
+#endif
};
#ifdef CONFIG_FAIR_GROUP_SCHED
On Fri, 2020-10-23 at 12:12 +0200, Peter Zijlstra wrote:
> From: Thomas Gleixner <[email protected]>
>
> On CPU unplug, tasks which are in a migrate disabled region cannot be pushed
> to a different CPU until they return to a migratable state.
>
> Account the number of tasks on a runqueue which are in a migrate disabled
> section and make the hotplug wait mechanism respect that.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------
> kernel/sched/sched.h | 4 ++++
> 2 files changed, 34 insertions(+), 6 deletions(-)
>
[]
> @@ -7310,7 +7334,7 @@ int sched_cpu_dying(unsigned int cpu)
> sched_tick_stop(cpu);
>
> rq_lock_irqsave(rq, &rf);
> - BUG_ON(rq->nr_running != 1);
> + BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
CPU hotplug is now triggering this. This is with Valentin's affine_move_task()
fix on top:
https://lore.kernel.org/lkml/[email protected]/
[ 809.412232][ T428] kernel BUG at kernel/sched/core.c:7547!
[ 809.417841][ T428] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 809.423445][ T428] CPU: 72 PID: 428 Comm: migration/72 Tainted: G I 5.10.0-rc3-next-20201113+ #1
[ 809.433678][ T428] Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10, BIOS U34 11/13/2019
[ 809.442951][ T428] Stopper: multi_cpu_stop+0x0/0x350 <- 0x0
[ 809.448643][ T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
[ 809.454071][ T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48 8b 34 24 e9 5d ff ff ff e8 88
[ 809.473650][ T428] RSP: 0018:ffffc9000889fca0 EFLAGS: 00010002
[ 809.479606][ T428] RAX: 0000000000000000 RBX: ffff8887dfcb23c0 RCX: ffffffff8e057e0d
[ 809.487482][ T428] RDX: 1ffff110fbf96480 RSI: 0000000000007c11 RDI: ffff8887dfcb2400
[ 809.495355][ T428] RBP: ffffc9000889fcc0 R08: fffffbfff2cb8e96 R09: fffffbfff2cb8e96
[ 809.503229][ T428] R10: ffffffff965c74af R11: fffffbfff2cb8e95 R12: ffff8887dfcb23d8
[ 809.511103][ T428] R13: 0000000000000086 R14: ffffffff8d5038e0 R15: 0000000000000003
[ 809.518979][ T428] FS: 0000000000000000(0000) GS:ffff8887dfc80000(0000) knlGS:0000000000000000
[ 809.527815][ T428] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 809.534291][ T428] CR2: 00007fea4cdf899c CR3: 00000018c7414002 CR4: 00000000007706e0
[ 809.542165][ T428] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 809.550040][ T428] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 809.557913][ T428] PKRU: 55555554
[ 809.561332][ T428] Call Trace:
[ 809.564489][ T428] ? x86_pmu_starting_cpu+0x20/0x20
[ 809.569570][ T428] ? sched_cpu_wait_empty+0x220/0x220
[ 809.574826][ T428] cpuhp_invoke_callback+0x1d8/0x1520
[ 809.580082][ T428] ? x2apic_send_IPI_mask+0x10/0x10
[ 809.585161][ T428] ? clear_local_APIC+0x788/0xc10
[ 809.590068][ T428] ? cpuhp_invoke_callback+0x1520/0x1520
[ 809.595584][ T428] take_cpu_down+0x10f/0x1a0
[ 809.600053][ T428] multi_cpu_stop+0x149/0x350
[ 809.604607][ T428] ? stop_machine_yield+0x10/0x10
[ 809.609511][ T428] cpu_stopper_thread+0x200/0x400
[ 809.614416][ T428] ? cpu_stop_create+0x70/0x70
[ 809.619059][ T428] smpboot_thread_fn+0x30a/0x770
[ 809.623878][ T428] ? smpboot_register_percpu_thread+0x370/0x370
[ 809.630005][ T428] ? trace_hardirqs_on+0x1c/0x150
[ 809.634910][ T428] ? __kthread_parkme+0xcc/0x1a0
[ 809.639729][ T428] ? smpboot_register_percpu_thread+0x370/0x370
[ 809.645855][ T428] kthread+0x352/0x420
[ 809.649798][ T428] ? kthread_create_on_node+0xc0/0xc0
[ 809.655052][ T428] ret_from_fork+0x22/0x30
[ 809.659345][ T428] Modules linked in: nls_ascii nls_cp437 vfat fat kvm_intel kvm ses enclosure irqbypass efivarfs ip_tables x_tables sd_mod nvme tg3 firmware_class smartpqi nvme_core scsi_transport_sas libphy dm_mirror dm_region_hash dm_log dm_mod
[ 809.681502][ T428] ---[ end trace 416318a3e677bf17 ]---
[ 809.686844][ T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
[ 809.692273][ T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48 8b 34 24 e9 5d ff ff ff e8 88
[ 809.711853][ T428] RSP: 0018:ffffc9000889fca0 EFLAGS: 00010002
[ 809.717807][ T428] RAX: 0000000000000000 RBX: ffff8887dfcb23c0 RCX: ffffffff8e057e0d
[ 809.725681][ T428] RDX: 1ffff110fbf96480 RSI: 0000000000007c11 RDI: ffff8887dfcb2400
[ 809.733556][ T428] RBP: ffffc9000889fcc0 R08: fffffbfff2cb8e96 R09: fffffbfff2cb8e96
[ 809.741432][ T428] R10: ffffffff965c74af R11: fffffbfff2cb8e95 R12: ffff8887dfcb23d8
[ 809.749307][ T428] R13: 0000000000000086 R14: ffffffff8d5038e0 R15: 0000000000000003
[ 809.757182][ T428] FS: 0000000000000000(0000) GS:ffff8887dfc80000(0000) knlGS:0000000000000000
[ 809.766018][ T428] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 809.772495][ T428] CR2: 00007fea4cdf899c CR3: 00000018c7414002 CR4: 00000000007706e0
[ 809.780369][ T428] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 809.788242][ T428] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 809.796118][ T428] PKRU: 55555554
[ 809.799538][ T428] Kernel panic - not syncing: Fatal exception
[ 809.805543][ T428]
[ 809.805544][ T428] unchecked MSR access error: WRMSR to 0x83f (tried to write 0x00000000000000f6) at rIP: 0xffffffff8d4c8a43 (native_apic_msr_write+0x23/0x40)
[ 809.805545][ T428] Call Trace:
[ 809.805546][ T428] arch_irq_work_raise+0x9b/0x110
[ 809.805547][ T428] irq_work_queue+0x24/0x40
[ 809.805548][ T428] printk_safe_log_store+0x185/0x1b0
[ 809.805549][ T428] ? kmsg_dump_rewind_nolock+0x80/0x80
[ 809.805549][ T428] ? ret_from_fork+0x22/0x30
[ 809.805550][ T428] printk+0x9a/0xc0
[ 809.805551][ T428] ? record_print_text.cold.38+0x11/0x11
[ 809.805552][ T428] ? stack_trace_consume_entry+0x160/0x160
[ 809.805553][ T428] ? save_trace+0x3d/0xb40
[ 809.805554][ T428] print_circular_bug_header.cold.69+0x10/0xd7
[ 809.805556][ T428] print_circular_bug.isra.42+0x1ac/0x300
[ 809.805557][ T428] check_noncircular+0x27b/0x320
[ 809.805558][ T428] ? print_circular_bug.isra.42+0x300/0x300
[ 809.805559][ T428] ? data_alloc.isra.8+0x15f/0x470
[ 809.805560][ T428] ? mark_lock.part.47+0x109/0x1a30
[ 809.805560][ T428] ? data_push_tail.part.6+0x310/0x310
[ 809.805561][ T428] ? print_usage_bug+0x1a0/0x1a0
[ 809.805562][ T428] check_prevs_add+0x38e/0x26f0
[ 809.805563][ T428] ? lock_is_held_type+0x19/0xe0
[ 809.805563][ T428] ? check_irq_usage+0xbb0/0xbb0
[ 809.805564][ T428] ? rcu_read_lock_held+0xb0/0xb0
[ 809.805565][ T428] __lock_acquire+0x2b9e/0x3920
[ 809.805566][ T428] ? __add_preferred_console.constprop.25+0x1d0/0x1d0
[ 809.805567][ T428] ? lockdep_hardirqs_on_prepare+0x3d0/0x3d0
[ 809.805567][ T428] lock_acquire+0x1c8/0x820
[ 809.805568][ T428] ? down_trylock+0xe/0x70
[ 809.805569][ T428] ? rcu_read_unlock+0x40/0x40
[ 809.805570][ T428] ? vprintk_emit+0x89/0x2c0
[ 809.805571][ T428] ? lock_downgrade+0x700/0x700
[ 809.805572][ T428] ? rcu_read_unlock+0x40/0x40
[ 809.805573][ T428] ? vprintk_emit+0x107/0x2c0
[ 809.805575][ T428] _raw_spin_lock_irqsave+0x30/0x50
[ 809.805576][ T428] ? down_trylock+0xe/0x70
[ 809.805577][ T428] down_trylock+0xe/0x70
[ 809.805578][ T428] __down_trylock_console_sem+0x23/0x90
[ 809.805578][ T428] console_trylock+0xe/0x60
[ 809.805579][ T428] vprintk_emit+0x107/0x2c0
[ 809.805580][ T428] ? sched_cpu_dying+0x10f/0x130
[ 809.805581][ T428] printk+0x9a/0xc0
[ 809.805582][ T428] ? record_print_text.cold.38+0x11/0x11
[ 809.805583][ T428] report_bug.cold.2+0x30/0x52
[ 809.805584][ T428] handle_bug+0x44/0x80
[ 809.805585][ T428] exc_invalid_op+0x13/0x40
[ 809.805585][ T428] asm_exc_invalid_op+0x12/0x20
[ 809.805586][ T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
[ 809.805588][ T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48 8b 34 24 e9 5d ff ff ff e8 88
[ 809.805589][ T428] RSP: 0018:ffffc9000889fca0 EFLAGS: 00010002
[ 809.805591][ T428] RAX: 0000000000000000 RBX: ffff8887dfcb23c0 RCX: ffffffff8e057e0d
[ 809.805591][ T428] RDX: 1ffff110fbf96480 RSI: 0000000000007c11 RDI: ffff8887dfcb2400
[ 809.805593][ T428] RBP: ffffc9000889fcc0 R08: fffffbfff2cb8e96 R09: fffffbfff2cb8e96
[ 809.805594][ T428] R10: ffffffff965c74af R11: fffffbfff2cb8e95 R12: ffff8887dfcb23d8
[ 809.805594][ T428] R13: 0000000000000086 R14: ffffffff8d5038e0 R15: 0000000000000003
[ 809.805595][ T428] ? cpuhp_invoke_callback+0x1520/0x1520
[ 809.805596][ T428] ? prandom_u32+0x18d/0x390
[ 809.805597][ T428] ? x86_pmu_starting_cpu+0x20/0x20
[ 809.805598][ T428] ? sched_cpu_wait_empty+0x220/0x220
[ 809.805599][ T428] cpuhp_invoke_callback+0x1d8/0x1520
[ 809.805600][ T428] ? x2apic_send_IPI_mask+0x10/0x10
[ 809.805600][ T428] ? clear_local_APIC+0x788/0xc10
[ 809.805601][ T428] ? cpuhp_invoke_callback+0x1520/0x1520
[ 809.805602][ T428] take_cpu_down+0x10f/0x1a0
[ 809.805603][ T428] multi_cpu_stop+0x149/0x350
[ 809.805603][ T428] ? stop_machine_yield+0x10/0x10
[ 809.805604][ T428] cpu_stopper_thread+0x200/0x400
[ 809.805605][ T428] ? cpu_stop_create+0x70/0x70
[ 809.805606][ T428] smpboot_thread_fn+0x30a/0x770
[ 809.805607][ T428] ? smpboot_register_percpu_thread+0x370/0x370
[ 809.805607][ T428] ? trace_hardirqs_on+0x1c/0x150
[ 809.805608][ T428] ? __kthread_parkme+0xcc/0x1a0
[ 809.805609][ T428] ? smpboot_register_percpu_thread+0x370/0x370
[ 809.805610][ T428] kthread+0x352/0x420
[ 809.805611][ T428] ? kthread_create_on_node+0xc0/0xc0
[ 809.805612][ T428] ret_from_fork+0x22/0x30
[ 809.805613][ T428] ======================================================
[ 809.805614][ T428] WARNING: possible circular locking dependency detected
[ 809.805616][ T428] 5.10.0-rc3-next-20201113+ #1 Tainted: G I
[ 809.805617][ T428] ------------------------------------------------------
[ 809.805618][ T428] migration/72/428 is trying to acquire lock:
[ 809.805619][ T428] ffffffff962c7d58 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0xe/0x70
[ 809.805623][ T428]
[ 809.805623][ T428] but task is already holding lock:
[ 809.805624][ T428] ffff8887dfcb23d8 (&rq->lock){-.-.}-{2:2}, at: sched_cpu_dying+0x4e/0x130
[ 809.805628][ T428]
[ 809.805628][ T428] which lock already depends on the new lock.
[ 809.805629][ T428]
[ 809.805630][ T428]
[ 809.805631][ T428] the existing dependency chain (in reverse order) is:
[ 809.805631][ T428]
[ 809.805632][ T428] -> #2 (&rq->lock){-.-.}-{2:2}:
[ 809.805635][ T428] lock_acquire+0x1c8/0x820
[ 809.805636][ T428] _raw_spin_lock+0x27/0x40
[ 809.805637][ T428] task_fork_fair+0x32/0x4c0
[ 809.805637][ T428] sched_fork+0x3aa/0x8c0
[ 809.805638][ T428] copy_process+0x1c87/0x6440
[ 809.805639][ T428] kernel_clone+0xbd/0xc90
[ 809.805640][ T428] kernel_thread+0x95/0xd0
[ 809.805640][ T428] rest_init+0x21/0x28a
[ 809.805641][ T428] start_kernel+0x381/0x39f
[ 809.805642][ T428] secondary_startup_64_no_verify+0xc2/0xcb
[ 809.805642][ T428]
[ 809.805643][ T428] -> #1 (&p->pi_lock){-.-.}-{2:2}:
[ 809.805646][ T428] lock_acquire+0x1c8/0x820
[ 809.805647][ T428] _raw_spin_lock_irqsave+0x30/0x50
[ 809.805648][ T428] try_to_wake_up+0x9b/0xc40
[ 809.805648][ T428] up+0x8d/0xd0
[ 809.805649][ T428] __up_console_sem+0x29/0x60
[ 809.805650][ T428] 0x89/0x2c0
[ 809.805726][ T428] ? lock_downgrade+0x700/0x700
[ 809.805727][ T428] ? rcu_read_unlock+0x40/0x40
[ 809.805727][ T428] ? vprintk_emit+0x107/0x2c0
[ 809.805728][ T428] _raw_spin_lock_irqsave+0x30/0x50
[ 809.805729][ T428] ? down_trylock+0xe/0x70
[ 809.805729][ T428] down_trylock+0xe/0x70
[ 809.805730][ T428] __down_trylock_console_sem+0x23/0x90
[ 809.805731][ T428] console_trylock+0xe/0x60
[ 809.805732][ T428] vprintk_emit+0x107/0x2c0
[ 809.805733][ T428] ? sched_cpu_dying+0x10f/0x130
[ 809.805733][ T428] printk+0x9a/0xc0
[ 809.805734][ T428] ? record_print_text.cold.38+0x11/0x11
[ 809.805735][ T428] report_bug.cold.2+0x30/0x52
[ 809.805735][ T428] handle_bug+0x44/0x80
[ 809.805736][ T428] exc_invalid_op+0x13/0x40
[ 809.805737][ T428] asm_exc_invalid_op+0x12/0x20
[ 809.805738][ T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
[ 809.805739][ T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48
> rq_unlock_irqrestore(rq, &rf);
>
> calc_load_migrate(rq);
On 13/11/20 15:06, Qian Cai wrote:
> On Fri, 2020-10-23 at 12:12 +0200, Peter Zijlstra wrote:
[...]
>> @@ -7310,7 +7334,7 @@ int sched_cpu_dying(unsigned int cpu)
>> sched_tick_stop(cpu);
>>
>> rq_lock_irqsave(rq, &rf);
>> - BUG_ON(rq->nr_running != 1);
>> + BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
>
> CPU hotplug is now triggering this. This is with Valentin's affine_move_task()
> fix on top:
>
> https://lore.kernel.org/lkml/[email protected]/
>
> [ 809.412232][ T428] kernel BUG at kernel/sched/core.c:7547!
> [ 809.417841][ T428] invalid opcode: 0000 [#1] SMP KASAN PTI
> [ 809.423445][ T428] CPU: 72 PID: 428 Comm: migration/72 Tainted: G I 5.10.0-rc3-next-20201113+ #1
> [ 809.433678][ T428] Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10, BIOS U34 11/13/2019
> [ 809.442951][ T428] Stopper: multi_cpu_stop+0x0/0x350 <- 0x0
> [ 809.448643][ T428] RIP: 0010:sched_cpu_dying+0x10f/0x130
> [ 809.454071][ T428] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 3f b5 96 e8 ab 81 4d 00 f0 4c 01 25 73 c4 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 86 7d 4d 00 48 8b 34 24 e9 5d ff ff ff e8 88
> [ 809.473650][ T428] RSP: 0018:ffffc9000889fca0 EFLAGS: 00010002
> [ 809.479606][ T428] RAX: 0000000000000000 RBX: ffff8887dfcb23c0 RCX: ffffffff8e057e0d
> [ 809.487482][ T428] RDX: 1ffff110fbf96480 RSI: 0000000000007c11 RDI: ffff8887dfcb2400
> [ 809.495355][ T428] RBP: ffffc9000889fcc0 R08: fffffbfff2cb8e96 R09: fffffbfff2cb8e96
> [ 809.503229][ T428] R10: ffffffff965c74af R11: fffffbfff2cb8e95 R12: ffff8887dfcb23d8
> [ 809.511103][ T428] R13: 0000000000000086 R14: ffffffff8d5038e0 R15: 0000000000000003
> [ 809.518979][ T428] FS: 0000000000000000(0000) GS:ffff8887dfc80000(0000) knlGS:0000000000000000
> [ 809.527815][ T428] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 809.534291][ T428] CR2: 00007fea4cdf899c CR3: 00000018c7414002 CR4: 00000000007706e0
> [ 809.542165][ T428] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [ 809.550040][ T428] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> [ 809.557913][ T428] PKRU: 55555554
> [ 809.561332][ T428] Call Trace:
> [ 809.564489][ T428] ? x86_pmu_starting_cpu+0x20/0x20
> [ 809.569570][ T428] ? sched_cpu_wait_empty+0x220/0x220
> [ 809.574826][ T428] cpuhp_invoke_callback+0x1d8/0x1520
> [ 809.580082][ T428] ? x2apic_send_IPI_mask+0x10/0x10
> [ 809.585161][ T428] ? clear_local_APIC+0x788/0xc10
> [ 809.590068][ T428] ? cpuhp_invoke_callback+0x1520/0x1520
> [ 809.595584][ T428] take_cpu_down+0x10f/0x1a0
> [ 809.600053][ T428] multi_cpu_stop+0x149/0x350
> [ 809.604607][ T428] ? stop_machine_yield+0x10/0x10
> [ 809.609511][ T428] cpu_stopper_thread+0x200/0x400
> [ 809.614416][ T428] ? cpu_stop_create+0x70/0x70
> [ 809.619059][ T428] smpboot_thread_fn+0x30a/0x770
> [ 809.623878][ T428] ? smpboot_register_percpu_thread+0x370/0x370
> [ 809.630005][ T428] ? trace_hardirqs_on+0x1c/0x150
> [ 809.634910][ T428] ? __kthread_parkme+0xcc/0x1a0
> [ 809.639729][ T428] ? smpboot_register_percpu_thread+0x370/0x370
> [ 809.645855][ T428] kthread+0x352/0x420
> [ 809.649798][ T428] ? kthread_create_on_node+0xc0/0xc0
> [ 809.655052][ T428] ret_from_fork+0x22/0x30
> [ 809.659345][ T428] Modules linked in: nls_ascii nls_cp437 vfat fat kvm_intel kvm ses enclosure irqbypass efivarfs ip_tables x_tables sd_mod nvme tg3 firmware_class smartpqi nvme_core scsi_transport_sas libphy dm_mirror dm_region_hash dm_log dm_mod
We did have some breakage in that area, but all the holes I was aware of
have been plugged. What would help here is to see which tasks are still
queued on that outgoing CPU, and their recent activity.
Something like
- ftrace_dump_on_oops on your kernel cmdline
- trace-cmd start -e 'sched:*'
<start the test here>
ought to do it. Then you can paste the (tail of the) ftrace dump.
I also had this laying around, which may or may not be of some help:
---
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6aaf9fb3400..c4a4cb8b47a2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7534,7 +7534,25 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
- BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
+
+ if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+ struct task_struct *g, *p;
+
+ pr_crit("CPU%d nr_running=%d\n", cpu, rq->nr_running);
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ if (task_cpu(p) != cpu)
+ continue;
+
+ if (!task_on_rq_queued(p))
+ continue;
+
+ pr_crit("\tp=%s\n", p->comm);
+ }
+ rcu_read_unlock();
+ BUG();
+ }
+
rq_unlock_irqrestore(rq, &rf);
calc_load_migrate(rq);
On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> We did have some breakage in that area, but all the holes I was aware of
> have been plugged. What would help here is to see which tasks are still
> queued on that outgoing CPU, and their recent activity.
>
> Something like
> - ftrace_dump_on_oops on your kernel cmdline
> - trace-cmd start -e 'sched:*'
> <start the test here>
>
> ought to do it. Then you can paste the (tail of the) ftrace dump.
>
> I also had this laying around, which may or may not be of some help:
Once I have found a reliable reproducer, I'll report back.
On 2020-11-18 09:44:34 [-0500], Qian Cai wrote:
> On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> > We did have some breakage in that area, but all the holes I was aware of
> > have been plugged. What would help here is to see which tasks are still
> > queued on that outgoing CPU, and their recent activity.
> >
> > Something like
> > - ftrace_dump_on_oops on your kernel cmdline
> > - trace-cmd start -e 'sched:*'
> > <start the test here>
> >
> > ought to do it. Then you can paste the (tail of the) ftrace dump.
> >
> > I also had this laying around, which may or may not be of some help:
>
> Once I have found a reliable reproducer, I'll report back.
any update?
Sebastian
On Mon, 2020-11-23 at 19:13 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-18 09:44:34 [-0500], Qian Cai wrote:
> > On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> > > We did have some breakage in that area, but all the holes I was aware of
> > > have been plugged. What would help here is to see which tasks are still
> > > queued on that outgoing CPU, and their recent activity.
> > >
> > > Something like
> > > - ftrace_dump_on_oops on your kernel cmdline
> > > - trace-cmd start -e 'sched:*'
> > > <start the test here>
> > >
> > > ought to do it. Then you can paste the (tail of the) ftrace dump.
> > >
> > > I also had this laying around, which may or may not be of some help:
> >
> > Once I have found a reliable reproducer, I'll report back.
>
> any update?
Just back from a vacation. I have been running the same workload on today's
linux-next for a few hours and it has been good so far. I'll surely report back
if it happens again in our daily runs.
On Mon, 2020-11-23 at 19:13 +0100, Sebastian Andrzej Siewior wrote:
> On 2020-11-18 09:44:34 [-0500], Qian Cai wrote:
> > On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> > > We did have some breakage in that area, but all the holes I was aware of
> > > have been plugged. What would help here is to see which tasks are still
> > > queued on that outgoing CPU, and their recent activity.
> > >
> > > Something like
> > > - ftrace_dump_on_oops on your kernel cmdline
> > > - trace-cmd start -e 'sched:*'
> > > <start the test here>
> > >
> > > ought to do it. Then you can paste the (tail of the) ftrace dump.
> > >
> > > I also had this laying around, which may or may not be of some help:
> >
> > Once I have found a reliable reproducer, I'll report back.
>
> any update?
Hmm, the bug is still there after running a bit longer. Let me apply Valentin's
patch and set up ftrace to try to catch it again.
[ 6152.085915][ T61] kernel BUG at kernel/sched/core.c:7594!
[ 6152.091523][ T61] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 6152.097126][ T61] CPU: 10 PID: 61 Comm: migration/10 Tainted: G IO 5.10.0-rc6-next-20201201+ #1
[ 6152.107272][ T61] Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10, BIOS U34 11/13/2019
[ 6152.116545][ T61] Stopper: multi_cpu_stop+0x0/0x350 <- 0x0
[ 6152.122237][ T61] RIP: 0010:sched_cpu_dying+0x14f/0x180
[ 6152.127667][ T61] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 5f 15 a1 e8 1b e5 4d 00 f0 4c 01 25 63 c8 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 f6 e0 4d 00 48 8b 34 24 e9 1e ff ff ff 48 89
[ 6152.147248][ T61] RSP: 0018:ffffc90006fbfca0 EFLAGS: 00010002
[ 6152.153202][ T61] RAX: 000000000000723d RBX: ffff8887dfab2400 RCX: 1ffff110fbf56488
[ 6152.161076][ T61] RDX: 0000000000000000 RSI: 000000000000723d RDI: ffff8887dfab2440
[ 6152.168950][ T61] RBP: ffffc90006fbfcc0 R08: fffffbfff417923d R09: fffffbfff417923d
[ 6152.176824][ T61] R10: ffffffffa0bc91e7 R11: fffffbfff417923c R12: ffff8887dfab2418
[ 6152.184698][ T61] R13: 0000000000000086 R14: ffffffff97b03da0 R15: 0000000000000003
[ 6152.192574][ T61] FS: 0000000000000000(0000) GS:ffff8887dfa80000(0000) knlGS:0000000000000000
[ 6152.201409][ T61] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6152.207886][ T61] CR2: 000055fed2192f58 CR3: 0000000cb7e14006 CR4: 00000000007706e0
[ 6152.215761][ T61] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6152.223636][ T61] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 6152.231509][ T61] PKRU: 55555554
[ 6152.234928][ T61] Call Trace:
[ 6152.238086][ T61] ? x86_pmu_starting_cpu+0x20/0x20
[ 6152.243166][ T61] ? sched_cpu_wait_empty+0x290/0x290
[ 6152.248422][ T61] cpuhp_invoke_callback+0x1d8/0x1520
[ 6152.253677][ T61] ? x2apic_send_IPI_mask+0x10/0x10
[ 6152.258758][ T61] ? clear_local_APIC+0x788/0xc10
[ 6152.263663][ T61] ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.269178][ T61] take_cpu_down+0x10f/0x1a0
[ 6152.273646][ T61] multi_cpu_stop+0x149/0x350
[ 6152.278201][ T61] ? stop_machine_yield+0x10/0x10
[ 6152.283106][ T61] cpu_stopper_thread+0x200/0x400
[ 6152.288012][ T61] ? cpu_stop_create+0x70/0x70
[ 6152.292655][ T61] smpboot_thread_fn+0x30a/0x770
[ 6152.297472][ T61] ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.303600][ T61] ? trace_hardirqs_on+0x1c/0x150
[ 6152.308504][ T61] ? __kthread_parkme+0xcc/0x1a0
[ 6152.313321][ T61] ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.319447][ T61] kthread+0x354/0x420
[ 6152.323390][ T61] ? kthread_create_on_node+0xc0/0xc0
[ 6152.328645][ T61] ret_from_fork+0x22/0x30
[ 6152.332938][ T61] Modules linked in: isofs cdrom fuse loop nls_ascii nls_cp437 vfat fat kvm_intel kvm irqbypass ses enclosure efivarfs ip_tables x_tables sd_mod tg3 nvme firmware_class smartpqi nvme_core libphy scsi_transport_sas dm_mirror dm_region_hash dm_log dm_mod [last unloaded: dummy_del_mod]
[ 6152.359732][ T61] ---[ end trace f59b31dec044f746 ]---
[ 6152.365076][ T61] RIP: 0010:sched_cpu_dying+0x14f/0x180
[ 6152.370505][ T61] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 5f 15 a1 e8 1b e5 4d 00 f0 4c 01 25 63 c8 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 f6 e0 4d 00 48 8b 34 24 e9 1e ff ff ff 48 89
[ 6152.390085][ T61] RSP: 0018:ffffc90006fbfca0 EFLAGS: 00010002
[ 6152.396039][ T61] RAX: 000000000000723d RBX: ffff8887dfab2400 RCX: 1ffff110fbf56488
[ 6152.403914][ T61] RDX: 0000000000000000 RSI: 000000000000723d RDI: ffff8887dfab2440
[ 6152.411789][ T61] RBP: ffffc90006fbfcc0 R08: fffffbfff417923d R09: fffffbfff417923d
[ 6152.419662][ T61] R10: ffffffffa0bc91e7 R11: fffffbfff417923c R12: ffff8887dfab2418
[ 6152.427537][ T61] R13: 0000000000000086 R14: ffffffff97b03da0 R15: 0000000000000003
[ 6152.435412][ T61] FS: 0000000000000000(0000) GS:ffff8887dfa80000(0000) knlGS:0000000000000000
[ 6152.444248][ T61] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 6152.450725][ T61] CR2: 000055fed2192f58 CR3: 0000000cb7e14006 CR4: 00000000007706e0
[ 6152.458600][ T61] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6152.466476][ T61] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 6152.474350][ T61] PKRU: 55555554
[ 6152.477771][ T61] Kernel panic - not syncing: Fatal exception
[ 6152.483729][ T61]
[ 6152.483731][ T61] unchecked MSR access error: WRMSR to 0x83f (tried to write 0x00000000000000f6) at rIP: 0xffffffff97ac8b93 (native_apic_msr_write+0x23/0x40)
[ 6152.483732][ T61] Call Trace:
[ 6152.483733][ T61] arch_irq_work_raise+0x9b/0x110
[ 6152.483733][ T61] irq_work_queue+0x24/0x40
[ 6152.483734][ T61] printk_safe_log_store+0x185/0x1b0
[ 6152.483735][ T61] ? kmsg_dump_rewind_nolock+0x80/0x80
[ 6152.483736][ T61] ? ret_from_fork+0x22/0x30
[ 6152.483736][ T61] printk+0x9a/0xc0
[ 6152.483737][ T61] ? record_print_text.cold.37+0x11/0x11
[ 6152.483739][ T61] ? stack_trace_consume_entry+0x160/0x160
[ 6152.483740][ T61] ? save_trace+0x3d/0xc40
[ 6152.483741][ T61] print_circular_bug_header.cold.69+0x10/0xd7
[ 6152.483742][ T61] print_circular_bug.isra.42+0x230/0x430
[ 6152.483743][ T61] check_noncircular+0x27b/0x320
[ 6152.483745][ T61] ? print_circular_bug.isra.42+0x430/0x430
[ 6152.483746][ T61] ? mark_lock.part.47+0x109/0x1f90
[ 6152.483747][ T61] ? print_usage_bug+0x2b0/0x2b0
[ 6152.483748][ T61] ? mark_lock.part.47+0x109/0x1f90
[ 6152.483749][ T61] check_prevs_add+0x38e/0x2800
[ 6152.483749][ T61] ? lock_is_held_type+0x19/0xe0
[ 6152.483750][ T61] ? check_irq_usage+0xcf0/0xcf0
[ 6152.483751][ T61] ? rcu_read_lock_held+0xb0/0xb0
[ 6152.483752][ T61] __lock_acquire+0x2c86/0x3e60
[ 6152.483753][ T61] ? __add_preferred_console.constprop.25+0x220/0x220
[ 6152.483753][ T61] ? lockdep_hardirqs_on_prepare+0x3d0/0x3d0
[ 6152.483754][ T61] lock_acquire+0x1c8/0x820
[ 6152.483755][ T61] ? down_trylock+0xe/0x70
[ 6152.483756][ T61] ? rcu_read_unlock+0x40/0x40
[ 6152.483757][ T61] ? vprintk_emit+0x89/0x2c0
[ 6152.483758][ T61] ? lock_downgrade+0x700/0x700
[ 6152.483759][ T61] ? rcu_read_unlock+0x40/0x40
[ 6152.483760][ T61] ? vprintk_emit+0x107/0x2c0
[ 6152.483761][ T61] _raw_spin_lock_irqsave+0x30/0x50
[ 6152.483761][ T61] ? down_trylock+0xe/0x70
[ 6152.483762][ T61] down_trylock+0xe/0x70
[ 6152.483763][ T61] __down_trylock_console_sem+0x23/0x90
[ 6152.483764][ T61] console_trylock+0xe/0x60
[ 6152.483764][ T61] vprintk_emit+0x107/0x2c0
[ 6152.483765][ T61] ? sched_cpu_dying+0x14f/0x180
[ 6152.483765][ T61] printk+0x9a/0xc0
[ 6152.483766][ T61] ? record_print_text.cold.37+0x11/0x11
[ 6152.483767][ T61] report_bug.cold.2+0x30/0x52
[ 6152.483768][ T61] handle_bug+0x44/0x80
[ 6152.483769][ T61] exc_invalid_op+0x13/0x40
[ 6152.483769][ T61] asm_exc_invalid_op+0x12/0x20
[ 6152.483770][ T61] RIP: 0010:sched_cpu_dying+0x14f/0x180
[ 6152.483772][ T61] Code: 10 00 31 c0 48 83 c4 08 5b 41 5c 41 5d 5d c3 be 08 00 00 00 48 c7 c7 60 5f 15 a1 e8 1b e5 4d 00 f0 4c 01 25 63 c8 5a 09 eb a3 <0f> 0b 48 89 34 24 e8 f6 e0 4d 00 48 8b 34 24 e9 1e ff ff ff 48 89
[ 6152.483774][ T61] RSP: 0018:ffffc90006fbfca0 EFLAGS: 00010002
[ 6152.483775][ T61] RAX: 000000000000723d RBX: ffff8887dfab2400 RCX: 1ffff110fbf56488
[ 6152.483777][ T61] RDX: 0000000000000000 RSI: 000000000000723d RDI: ffff8887dfab2440
[ 6152.483778][ T61] RBP: ffffc90006fbfcc0 R08: fffffbfff417923d R09: fffffbfff417923d
[ 6152.483779][ T61] R10: ffffffffa0bc91e7 R11: fffffbfff417923c R12: ffff8887dfab2418
[ 6152.483780][ T61] R13: 0000000000000086 R14: ffffffff97b03da0 R15: 0000000000000003
[ 6152.483781][ T61] ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.483781][ T61] ? x86_pmu_starting_cpu+0x20/0x20
[ 6152.483782][ T61] ? sched_cpu_wait_empty+0x290/0x290
[ 6152.483783][ T61] cpuhp_invoke_callback+0x1d8/0x1520
[ 6152.483784][ T61] ? x2apic_send_IPI_mask+0x10/0x10
[ 6152.483784][ T61] ? clear_local_APIC+0x788/0xc10
[ 6152.483785][ T61] ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.483786][ T61] take_cpu_down+0x10f/0x1a0
[ 6152.483786][ T61] multi_cpu_stop+0x149/0x350
[ 6152.483787][ T61] ? stop_machine_yield+0x10/0x10
[ 6152.483788][ T61] cpu_stopper_thread+0x200/0x400
[ 6152.483789][ T61] ? cpu_stop_create+0x70/0x70
[ 6152.483789][ T61] smpboot_thread_fn+0x30a/0x770
[ 6152.483790][ T61] ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.483791][ T61] ? trace_hardirqs_on+0x1c/0x150
[ 6152.483792][ T61] ? __kthread_parkme+0xcc/0x1a0
[ 6152.483792][ T61] ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.483793][ T61] kthread+0x354/0x420
[ 6152.483794][ T61] ? kthread_create_on_node+0xc0/0xc0
[ 6152.483794][ T61] ret_from_fork+0x22/0x30
[ 6152.483796][ T61] ======================================================
[ 6152.483797][ T61] WARNING: possible circular locking dependency detected
[ 6152.483798][ T61] 5.10.0-rc6-next-20201201+ #1 Tainted: G IO
[ 6152.483799][ T61] ------------------------------------------------------
[ 6152.483800][ T61] migration/10/61 is trying to acquire lock:
[ 6152.483801][ T61] ffffffffa08c7e98 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0xe/0x70
[ 6152.483805][ T61]
[ 6152.483805][ T61] but task is already holding lock:
[ 6152.483806][ T61] ffff8887dfab2418 (&rq->lock){-.-.}-{2:2}, at: sched_cpu_dying+0x4e/0x180
[ 6152.483809][ T61]
[ 6152.483810][ T61] which lock already depends on the new lock.
[ 6152.483810][ T61]
[ 6152.483811][ T61]
[ 6152.483812][ T61] the existing dependency chain (in reverse order) is:
[ 6152.483812][ T61]
[ 6152.483813][ T61] -> #2 (&rq->lock){-.-.}-{2:2}:
[ 6152.483815][ T61] lock_acquire+0x1c8/0x820
[ 6152.483816][ T61] _raw_spin_lock+0x27/0x40
[ 6152.483817][ T61] task_fork_fair+0x32/0x4c0
[ 6152.483818][ T61] sched_fork+0x3b3/0x8d0
[ 6152.483818][ T61] copy_process+0x1c89/0x6470
[ 6152.483819][ T61] kernel_clone+0xbd/0xc90
[ 6152.483820][ T61] kernel_thread+0x95/0xd0
[ 6152.483820][ T61] rest_init+0x21/0x28a
[ 6152.483821][ T61] start_kernel+0x37c/0x39a
[ 6152.483822][ T61] secondary_startup_64_no_verify+0xc2/0xcb
[ 6152.483822][ T61]
[ 6152.483823][ T61] -> #1 (&p->pi_lock){-.-.}-{2:2}:
[ 6152.483826][ T61] lock_acquire+0x1c8/0x820
[ 6152.483827][ T61] _raw_spin_lock_irqsave+0x30/0x50
[ 6152.483827][ T61] try_to_wake_up+0x9c/0xf90
[ 6152.483828][ T61] up+0x8d/0xd0
[ 6152.483829][ T61] __up_console_sem+0x29/0x60
[ 6152.483830][ T61] console_unlock+0x581/0xa20
[ 6152.483830][ T61] vprintk_emit+0x201/0x2c0
[ 6152.483831][ T61] printk+0x9a/0xc0
[ 6152.483832][ T61] do_exit.cold.38+0x55/0x1e5
[ 6152.483832][ T61] do_group_exit+0xeb/0x2d0
[ 6152.483833][ T61] __x64_sys_exit_group+0x35/0x40
[ 6152.483834][ T61] do_syscall_64+0x33/0x40
[ 6152.483835][ T61] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 6152.483835][ T61]
[ 6152.483836][ T61] -> #0 ((console_sem).lock){-.-.}-{2:2}:
[ 6152.483839][ T61] check_prevs_add+0x38e/0x2800
[ 6152.483839][ T61] __lock_acquire+0x2c86/0x3e60
[ 6152.483840][ T61] lock_acquire+0x1c8/0x820
[ 6152.483841][ T61] _raw_spin_lock_irqsave+0x30/0x50
[ 6152.483842][ T61] down_trylock+0xe/0x70
[ 6152.483843][ T61] __down_trylock_console_sem+0x23/0x90
[ 6152.483844][ T61] console_trylock+0xe/0x60
[ 6152.483845][ T61] vprintk_emit+0x107/0x2c0
[ 6152.483846][ T61] printk+0x9a/0xc0
[ 6152.483847][ T61] report_bug.cold.2+0x30/0x52
[ 6152.483848][ T61] handle_bug+0x44/0x80
[ 6152.483849][ T61] exc_invalid_][ T61] R10: ffffffffa0bc91e7 R11: fffffbfff417923c R12: ffff8887dfab2418
[ 6152.483920][ T61] R13: 0000000000000086 R14: ffffffff97b03da0 R15: 0000000000000003
[ 6152.483920][ T61] ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.483921][ T61] ? x86_pmu_starting_cpu+0x20/0x20
[ 6152.483922][ T61] ? sched_cpu_wait_empty+0x290/0x290
[ 6152.483923][ T61] cpuhp_invoke_callback+0x1d8/0x1520
[ 6152.483923][ T61] ? x2apic_send_IPI_mask+0x10/0x10
[ 6152.483924][ T61] ? clear_local_APIC+0x788/0xc10
[ 6152.483925][ T61] ? cpuhp_invoke_callback+0x1520/0x1520
[ 6152.483925][ T61] take_cpu_down+0x10f/0x1a0
[ 6152.483926][ T61] multi_cpu_stop+0x149/0x350
[ 6152.483927][ T61] ? stop_machine_yield+0x10/0x10
[ 6152.483928][ T61] cpu_stopper_thread+0x200/0x400
[ 6152.483929][ T61] ? cpu_stop_create+0x70/0x70
[ 6152.483930][ T61] smpboot_thread_fn+0x30a/0x770
[ 6152.483931][ T61] ? smpboot_register_percpu_thread+0x370/0x370
[ 6152.483932][ T61] ? trace_hardirqs_on+0x1c/0x150
FYI, it did crash on arm64 (Thunder X2) as well, so I'll re-run to gather more
information too.
.config: https://cailca.coding.net/public/linux/mm/git/files/master/arm64.config
[20370.682747][T77637] psci: CPU123 killed (polled 0 ms)
[20370.823651][ T635] IRQ 43: no longer affine to CPU124
[20370.828862][ T635] IRQ 49: no longer affine to CPU124
[20370.834072][ T635] IRQ 60: no longer affine to CPU124
[20370.839517][ T635] IRQ 94: no longer affine to CPU124
[20370.845778][T77637] CPU124: shutdown
[20370.861891][T77637] psci: CPU124 killed (polled 10 ms)
[20371.425434][T77637] CPU125: shutdown
[20371.441464][T77637] psci: CPU125 killed (polled 10 ms)
[20371.984072][T77637] CPU126: shutdown
[20372.000057][T77637] psci: CPU126 killed (polled 10 ms)
[20372.223858][ T650] ------------[ cut here ]------------
[20372.229599][ T650] kernel BUG at kernel/sched/core.c:7594!
[20372.235165][ T650] Internal error: Oops - BUG: 0 [#1] SMP
[20372.240643][ T650] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_type1
vfio loop processor ip_tables x_tables sd_mod mlx5_core firmware_class ahci
libahci libata dm_mirror dm_region_hash dm_log dm_mod efivarfs
[20372.259814][ T650] CPU: 127 PID: 650 Comm: migration/127 Tainted: G L 5.10.0-rc6-next-20201203+ #5
[20372.270152][ T650] Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.16 07/29/2020
[20372.280579][ T650] Stopper: multi_cpu_stop+0x0/0x390 <- 0x0
[20372.286230][ T650] pstate: 20400089 (nzCv daIf +PAN -UAO -TCO BTYPE=--)
[20372.292923][ T650] pc : sched_cpu_dying+0x198/0x1b8
[20372.297879][ T650] lr : sched_cpu_dying+0x68/0x1b8
[20372.302748][ T650] sp : ffff00001076fba0
[20372.306747][ T650] x29: ffff00001076fba0 x28: 0000000000000000
[20372.312751][ T650] x27: 0000000000000001 x26: ffff800011db3000
[20372.318753][ T650] x25: ffff000e7bdd16a8 x24: 000000000000005a
[20372.324754][ T650] x23: 000000000000007f x22: 0000000000000080
[20372.330756][ T650] x21: 000000000000fab7 x20: ffff000e7be63818
[20372.336757][ T650] x19: ffff000e7be63800 x18: 1fffe001cf7cb3ed
[20372.342758][ T650] x17: 0000000000001308 x16: 0000000000000000
[20372.348759][ T650] x15: 000000000001053f x14: 000000000001053f
[20372.354761][ T650] x13: ffff6000020edf65 x12: 1fffe000020edf64
[20372.360763][ T650] x11: 1fffe000020edf64 x10: ffff6000020edf64
[20372.366764][ T650] x9 : dfff800000000000 x8 : ffff00001076fb23
[20372.372766][ T650] x7 : 0000000000000001 x6 : 0000000000000001
[20372.378767][ T650] x5 : 1fffe000020b9a0a x4 : dfff800000000000
[20372.384769][ T650] x3 : dfff800000000000 x2 : 0000000000000003
[20372.390770][ T650] x1 : ffff000e7be63840 x0 : 0000000000000002
[20372.396771][ T650] Call trace:
[20372.399905][ T650] sched_cpu_dying+0x198/0x1b8
[20372.404514][ T650] cpuhp_invoke_callback+0x208/0x2bf0
[20372.409730][ T650] take_cpu_down+0x11c/0x1f0
[20372.414165][ T650] multi_cpu_stop+0x184/0x390
[20372.418687][ T650] cpu_stopper_thread+0x1f0/0x430
[20372.423557][ T650] smpboot_thread_fn+0x3a8/0x9c8
[20372.428339][ T650] kthread+0x3a0/0x448
[20372.432253][ T650] ret_from_fork+0x10/0x1c
[20372.436517][ T650] Code: d65f03c0 911a82a2 140004fb 17ffffd9 (d4210000)
[20372.443298][ T650] ---[ end trace c51d5b6889ec29a8 ]---
[20372.448602][ T650] Kernel panic - not syncing: Oops - BUG: Fatal exception
On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
> We did have some breakage in that area, but all the holes I was aware of
> have been plugged. What would help here is to see which tasks are still
> queued on that outgoing CPU, and their recent activity.
>
> Something like
> - ftrace_dump_on_oops on your kernel cmdline
> - trace-cmd start -e 'sched:*'
> <start the test here>
>
> ought to do it. Then you can paste the (tail of the) ftrace dump.
>
> I also had this laying around, which may or may not be of some help:
Okay, your patch did not help, since it can still be reproduced using this,
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/hotplug/cpu_hotplug/functional/cpuhotplug04.sh
# while :; do cpuhotplug04.sh -l 1; done
The ftrace dump has too much output on this 256-CPU system, so I have not had
the patience to wait for it to finish after 15 minutes. But here is the log
captured so far (search for "kernel BUG" there).
http://people.redhat.com/qcai/console.log
> ---
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a6aaf9fb3400..c4a4cb8b47a2 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7534,7 +7534,25 @@ int sched_cpu_dying(unsigned int cpu)
> sched_tick_stop(cpu);
>
> rq_lock_irqsave(rq, &rf);
> - BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
> +
> + if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
> + struct task_struct *g, *p;
> +
> + pr_crit("CPU%d nr_running=%d\n", cpu, rq->nr_running);
> + rcu_read_lock();
> + for_each_process_thread(g, p) {
> + if (task_cpu(p) != cpu)
> + continue;
> +
> + if (!task_on_rq_queued(p))
> + continue;
> +
> + pr_crit("\tp=%s\n", p->comm);
> + }
> + rcu_read_unlock();
> + BUG();
> + }
> +
> rq_unlock_irqrestore(rq, &rf);
>
> calc_load_migrate(rq);
>
On 04/12/20 21:19, Qian Cai wrote:
> On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
>> We did have some breakage in that area, but all the holes I was aware of
>> have been plugged. What would help here is to see which tasks are still
>> queued on that outgoing CPU, and their recent activity.
>>
>> Something like
>> - ftrace_dump_on_oops on your kernel cmdline
>> - trace-cmd start -e 'sched:*'
>> <start the test here>
>>
>> ought to do it. Then you can paste the (tail of the) ftrace dump.
>>
>> I also had this laying around, which may or may not be of some help:
>
> Okay, your patch did not help, since it can still be reproduced using this,
>
It wasn't meant to fix this, only add some more debug prints :)
> https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/hotplug/cpu_hotplug/functional/cpuhotplug04.sh
>
> # while :; do cpuhotplug04.sh -l 1; done
>
> The ftrace dump has too much output on this 256-CPU system, so I have not had
> the patience to wait for it to finish after 15 minutes. But here is the log
> captured so far (search for "kernel BUG" there).
>
> http://people.redhat.com/qcai/console.log
>
From there I see:
[20798.166987][ T650] CPU127 nr_running=2
[20798.171185][ T650] p=migration/127
[20798.175161][ T650] p=kworker/127:1
so this might be another workqueue hurdle. This should be prevented by:
06249738a41a ("workqueue: Manually break affinity on hotplug")
In any case, I'll give this a try on a TX2 next week and see where it gets
me.
Note that much earlier in your log, you have a softlockup on CPU127:
[ 74.278367][ C127] watchdog: BUG: soft lockup - CPU#127 stuck for 23s! [swapper/0:1]
On Sat, 2020-12-05 at 18:37 +0000, Valentin Schneider wrote:
> From there I see:
>
> [20798.166987][ T650] CPU127 nr_running=2
> [20798.171185][ T650] p=migration/127
> [20798.175161][ T650] p=kworker/127:1
>
> so this might be another workqueue hurdle. This should be prevented by:
>
> 06249738a41a ("workqueue: Manually break affinity on hotplug")
Well, it was reproduced on the latest linux-next, which already includes that
commit.
> Note that much earlier in your log, you have a softlockup on CPU127:
>
> [ 74.278367][ C127] watchdog: BUG: soft lockup - CPU#127 stuck for 23s!
> [swapper/0:1]
That's something separate. It was there all the time.
https://lore.kernel.org/linux-acpi/[email protected]/
On 04/12/20 21:19, Qian Cai wrote:
> On Tue, 2020-11-17 at 19:28 +0000, Valentin Schneider wrote:
>> We did have some breakage in that area, but all the holes I was aware of
>> have been plugged. What would help here is to see which tasks are still
>> queued on that outgoing CPU, and their recent activity.
>>
>> Something like
>> - ftrace_dump_on_oops on your kernel cmdline
>> - trace-cmd start -e 'sched:*'
>> <start the test here>
>>
>> ought to do it. Then you can paste the (tail of the) ftrace dump.
>>
>> I also had this laying around, which may or may not be of some help:
>
> Okay, your patch did not help, since it can still be reproduced using this,
>
> https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/hotplug/cpu_hotplug/functional/cpuhotplug04.sh
>
> # while :; do cpuhotplug04.sh -l 1; done
>
Ok, can reproduce this on a TX2 on next-20201207. I didn't use your config,
I oldconfig'd my distro config and only modified it to CONFIG_PREEMPT_NONE.
Interestingly the BUG happens on CPU127 here too...
I'll start digging.
On Mon, 2020-12-07 at 19:27 +0000, Valentin Schneider wrote:
> Ok, can reproduce this on a TX2 on next-20201207. I didn't use your config,
> I oldconfig'd my distro config and only modified it to CONFIG_PREEMPT_NONE.
> Interestingly the BUG happens on CPU127 here too...
I think that number is totally random. For example, on this x86, it could happen
for CPU8 or CPU111.
On 08/12/20 13:46, Qian Cai wrote:
> On Mon, 2020-12-07 at 19:27 +0000, Valentin Schneider wrote:
>> Ok, can reproduce this on a TX2 on next-20201207. I didn't use your config,
>> I oldconfig'd my distro config and only modified it to CONFIG_PREEMPT_NONE.
>> Interestingly the BUG happens on CPU127 here too...
>
> I think that number is totally random. For example, on this x86, it could happen
> for CPU8 or CPU111.
Actually on the TX2 it seems to *always* happen on CPU127. Your hotplug
script sequentially offlines CPUs in increasing id values, so by the time
CPU127 gets unplugged it is the last online CPU of NUMA node 0.
I've been staring at traces collected via
echo 2 > /proc/sys/kernel/ftrace_dump_on_oops
trace-cmd start -e 'sched:*' -e 'cpuhp:*' -e 'workqueue:*'
./hotplug.sh
but it's still not entirely clear to me WTH is going on. I do see kworkers
getting their affinity reset in workqueue_offline_cpu(), but for some
reason there's a new one that wakes up on CPU127 sometime later. I haven't
been able to figure out where it comes from - it obviously isn't part of
the percpu worker pools, as it isn't handled during
workqueue_offline_cpu(), but it still ends up affined to a single CPU...
It looks something like this; traces are only from CPU127
cpuhp:sched_cpu_wait_empty() # Resets the affinity of some kworker/127:x<2
sched_switch(idle)
sched_wakeup(kworker/127:2) # picks CPU127
sched_switch(kworker/127:2)
# maybe_create_worker() -> creates kworker/127:3
sched_wakeup(kworker/127:3) # picks CPU127
sched_switch(kworker/127:3)
# maybe_create_worker() -> creates kworker/127:4
sched_wakeup(kworker/127:4) # picks CPU127
sched_switch(kworker/127:4)
# maybe_create_worker() -> creates kworker/127:5
sched_wakeup(kworker/127:5) # picks CPU127
sched_wakeup(migration/127)
sched_switch(migration/127)
cpuhp:take_cpu_down()
BUG
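The loop above boils down to: every time the pool on the outgoing CPU runs
short, it creates a replacement worker that is still affined to that same CPU,
so rq->nr_running never drops back to just the stopper thread and
sched_cpu_dying()'s BUG_ON() fires. A toy model of that suspected pattern,
purely illustrative (the kworker/127 names are taken from the trace; none of
this is workqueue code):

#include <stdbool.h>
#include <stdio.h>

struct rq_model {
	int nr_running;
};

static void wake_worker(struct rq_model *rq, int id)
{
	rq->nr_running++;	/* kworker/127:<id> wakes and picks CPU127 */
	printf("sched_wakeup(kworker/127:%d)  nr_running=%d\n",
	       id, rq->nr_running);
}

static void worker_blocks(struct rq_model *rq, int id)
{
	/* the running worker created its successor before blocking ... */
	wake_worker(rq, id + 1);
	/* ... and is then dequeued, so the count never drops back to 1 */
	rq->nr_running--;
	printf("sched_switch(kworker/127:%d away)  nr_running=%d\n",
	       id, rq->nr_running);
}

static bool cpu_may_die(const struct rq_model *rq)
{
	/* mirrors BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)) */
	return rq->nr_running == 1;
}

int main(void)
{
	struct rq_model rq = { .nr_running = 1 };	/* migration/127 only */
	int id;

	wake_worker(&rq, 2);		/* kworker/127:2 shows up */
	for (id = 2; id <= 4; id++)
		worker_blocks(&rq, id);	/* each one spawns the next */

	printf("cpu_may_die() = %s\n", cpu_may_die(&rq) ? "yes" : "no");
	return 0;
}

Running this ends with nr_running still at 2, which matches the
"CPU127 nr_running=2" print from the debug patch earlier in the thread.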