2014-02-19 18:09:20

by Sasha Levin

Subject: sched: hang in migrate_swap

Hi all,

While fuzzing with trinity inside a KVM tools guest, running the latest -next kernel, I seem to hit
the following hang quite often.

The initial spew is:

[ 293.110057] BUG: soft lockup - CPU#8 stuck for 22s! [migration/8:258]
[ 293.110057] Modules linked in:
[ 293.110057] irq event stamp: 20828
[ 293.110057] hardirqs last enabled at (20827): [<ffffffff84385437>] restore_args+0x0/0x30
[ 293.110057] hardirqs last disabled at (20828): [<ffffffff8438f02d>] apic_timer_interrupt+0x6d/0x80
[ 293.110057] softirqs last enabled at (20826): [<ffffffff811418e7>] __do_softirq+0x447/0x4f0
[ 293.110057] softirqs last disabled at (20821): [<ffffffff81141fa3>] irq_exit+0x83/0x160
[ 293.110057] CPU: 8 PID: 258 Comm: migration/8 Tainted: G W
3.14.0-rc3-next-20140218-sasha-00008-g16140ff-dirty #109
[ 293.110057] task: ffff8805b8283000 ti: ffff8805b82c0000 task.ti: ffff8805b82c0000
[ 293.110057] RIP: 0010:[<ffffffff81203892>] [<ffffffff81203892>] multi_cpu_stop+0xc2/0x1c0
[ 293.110057] RSP: 0000:ffff8805b82c1cd8 EFLAGS: 00000293
[ 293.110057] RAX: 0000000000000001 RBX: ffffffff84385437 RCX: 0000000000000000
[ 293.110057] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff8800c40bb978
[ 293.110057] RBP: ffff8805b82c1d18 R08: 0000000000000000 R09: 0000000000000000
[ 293.110057] R10: 0000000000000001 R11: 0000000000000001 R12: ffff8805b82c1c48
[ 293.110057] R13: ffff8805b8283000 R14: ffff8805b82c0000 R15: ffff8805b8283000
[ 293.110057] FS: 0000000000000000(0000) GS:ffff8805b9000000(0000) knlGS:0000000000000000
[ 293.110057] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 293.110057] CR2: 0000000000942d58 CR3: 0000000005c26000 CR4: 00000000000006e0
[ 293.110057] DR0: 000000000089e000 DR1: 00007f938d51a000 DR2: 0000000000000000
[ 293.110057] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ 293.110057] Stack:
[ 293.110057] ffff8805b91d0be0 00000000c40bb9d8 ffff8805b82c1cf8 ffff8805b91d0be0
[ 293.110057] ffff8800c40bba28 ffffffff812037d0 ffff8800c40bb978 ffff8805b91d0c30
[ 293.110057] ffff8805b82c1de8 ffffffff81202f56 0000000000000001 ffffffff81202e44
[ 293.110057] Call Trace:
[ 293.110057] [<ffffffff812037d0>] ? stop_cpus+0x60/0x60
[ 293.110057] [<ffffffff81202f56>] cpu_stopper_thread+0x96/0x190
[ 293.110057] [<ffffffff81202e44>] ? cpu_stop_should_run+0x44/0x60
[ 293.110057] [<ffffffff811a3d1a>] ? __lock_release+0x1da/0x1f0
[ 293.110057] [<ffffffff84384ec5>] ? _raw_spin_unlock_irqrestore+0x65/0xb0
[ 293.110057] [<ffffffff811a1f0d>] ? trace_hardirqs_on+0xd/0x10
[ 293.110057] [<ffffffff81170826>] smpboot_thread_fn+0x2b6/0x2c0
[ 293.110057] [<ffffffff81170570>] ? smpboot_register_percpu_thread+0x100/0x100
[ 293.110057] [<ffffffff81167755>] kthread+0x105/0x110
[ 293.110057] [<ffffffff8437fca2>] ? wait_for_completion+0xc2/0x110
[ 293.110057] [<ffffffff81167650>] ? set_kthreadd_affinity+0x30/0x30
[ 293.110057] [<ffffffff8438e28c>] ret_from_fork+0x7c/0xb0
[ 293.110057] [<ffffffff81167650>] ? set_kthreadd_affinity+0x30/0x30
[ 293.110057] Code: b5 05 01 66 0f 1f 44 00 00 45 89 ed 4d 0f a3 2e 45 19 ed 45 85 ed 41 0f 95 c7
4c 8d 73 24 31 c0 c7 45 cc 00 00 00 00 66 90 f3 90 <44> 8b 6b 20 41 39 c5 74 66 41 83 fd 02 74 0f 41
83 fd 03 75 41

This is followed by a dump of the rest of the CPU states, and I see the following:

[ 268.450444] NMI backtrace for cpu 34
[ 268.450444] CPU: 34 PID: 9859 Comm: trinity-c129 Tainted: G W
3.14.0-rc3-next-20140218-sasha-00008-g16140ff-dirty #109
[ 268.450444] task: ffff880154aa3000 ti: ffff8800c40ba000 task.ti: ffff8800c40ba000
[ 268.450444] RIP: 0010:[<ffffffff810b801a>] [<ffffffff810b801a>] pvclock_clocksource_read+0x1a/0xc0
[ 268.450444] RSP: 0018:ffff8800c40bb538 EFLAGS: 00000082
[ 268.450444] RAX: 00000000905f62ec RBX: ffff8805b93d8500 RCX: 000000000000000a
[ 268.450444] RDX: 00000000000000a1 RSI: 0000000000000040 RDI: ffff881027f12880
[ 268.450444] RBP: ffff8800c40bb548 R08: 0000000000000000 R09: 0000000000000001
[ 268.450444] R10: 0000000000000002 R11: 0000000000000001 R12: ffff8805b93d8510
[ 268.450444] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001
[ 268.450444] FS: 00007f938e16a700(0000) GS:ffff8805b9200000(0000) knlGS:0000000000000000
[ 268.450444] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 268.450444] CR2: 00007f938a658a38 CR3: 00000000c40b2000 CR4: 00000000000006e0
[ 268.450444] DR0: 000000000089e000 DR1: 00007f938d51a000 DR2: 0000000000000000
[ 268.450444] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000f7060a
[ 268.450444] Stack:
[ 268.450444] ffff8805b93d8500 ffff8805b93d8510 ffff8800c40bb558 ffffffff810b6f04
[ 268.450444] ffff8800c40bb568 ffffffff81076e1d ffff8800c40bb598 ffffffff81180635
[ 268.450444] ffffffff875f7be0 0000000000000022 00000000001d8500 ffff8805b93d8500
[ 268.450444] Call Trace:
[ 268.450444] [<ffffffff810b6f04>] kvm_clock_read+0x24/0x50
[ 268.450444] [<ffffffff81076e1d>] sched_clock+0x1d/0x30
[ 268.450444] [<ffffffff81180635>] sched_clock_local+0x25/0x90
[ 268.450444] [<ffffffff811808a8>] sched_clock_cpu+0xb8/0x110
[ 268.450444] [<ffffffff8118094d>] local_clock+0x2d/0x40
[ 268.450444] [<ffffffff811a3358>] __lock_acquire+0x2a8/0x580
[ 268.450444] [<ffffffff8118c84d>] ? update_blocked_averages+0x61d/0x670
[ 268.450444] [<ffffffff811a37b2>] lock_acquire+0x182/0x1d0
[ 268.450444] [<ffffffff8118ce2c>] ? idle_balance+0x3c/0x2c0
[ 268.450444] [<ffffffff8118ce61>] idle_balance+0x71/0x2c0
[ 268.450444] [<ffffffff8118ce2c>] ? idle_balance+0x3c/0x2c0
[ 268.450444] [<ffffffff8118d2fe>] pick_next_task_fair+0x24e/0x290
[ 268.450444] [<ffffffff81173a2e>] ? dequeue_task+0x6e/0x80
[ 268.450444] [<ffffffff8437e3b5>] __schedule+0x2a5/0x840
[ 268.450444] [<ffffffff8437ec15>] schedule+0x65/0x70
[ 268.450444] [<ffffffff8437d8d8>] schedule_timeout+0x38/0x2a0
[ 268.450444] [<ffffffff8119c8fe>] ? put_lock_stats+0xe/0x30
[ 268.450444] [<ffffffff811a196c>] ? mark_held_locks+0x6c/0x90
[ 268.450444] [<ffffffff84384e0b>] ? _raw_spin_unlock_irq+0x2b/0x80
[ 268.450444] [<ffffffff8437fc97>] wait_for_completion+0xb7/0x110
[ 268.450444] [<ffffffff8117e6b0>] ? try_to_wake_up+0x2a0/0x2a0
[ 268.450444] [<ffffffff812034a1>] stop_two_cpus+0x261/0x2b0
[ 268.450444] [<ffffffff81176c00>] ? __migrate_swap_task+0xa0/0xa0
[ 268.450444] [<ffffffff810b6f04>] ? kvm_clock_read+0x24/0x50
[ 268.450444] [<ffffffff812037d0>] ? stop_cpus+0x60/0x60
[ 268.450444] [<ffffffff812037d0>] ? stop_cpus+0x60/0x60
[ 268.450444] [<ffffffff8119c8fe>] ? put_lock_stats+0xe/0x30
[ 268.450444] [<ffffffff8437fc1f>] ? wait_for_completion+0x3f/0x110
[ 268.450444] [<ffffffff811740d5>] migrate_swap+0x175/0x1a0
[ 268.450444] [<ffffffff8118b1e0>] task_numa_migrate+0x510/0x600
[ 268.450444] [<ffffffff8118acd0>] ? select_task_rq_fair+0x440/0x440
[ 268.450444] [<ffffffff8118b31d>] numa_migrate_preferred+0x4d/0x60
[ 268.450444] [<ffffffff8118b900>] task_numa_fault+0x190/0x210
[ 268.450444] [<ffffffff812c4f62>] do_huge_pmd_numa_page+0x432/0x450
[ 268.450444] [<ffffffff8119c8fe>] ? put_lock_stats+0xe/0x30
[ 268.450444] [<ffffffff81abccda>] ? delay_tsc+0xea/0x110
[ 268.450444] [<ffffffff81288724>] __handle_mm_fault+0x244/0x3a0
[ 268.450444] [<ffffffff812c776d>] ? rcu_read_unlock+0x5d/0x60
[ 268.450444] [<ffffffff8128898b>] handle_mm_fault+0x10b/0x1b0
[ 268.450444] [<ffffffff84389252>] ? __do_page_fault+0x2e2/0x590
[ 268.450444] [<ffffffff843894c1>] __do_page_fault+0x551/0x590
[ 268.450444] [<ffffffff81181561>] ? vtime_account_user+0x91/0xa0
[ 268.450444] [<ffffffff8124f0c8>] ? context_tracking_user_exit+0xa8/0x1c0
[ 268.450444] [<ffffffff84384f40>] ? _raw_spin_unlock+0x30/0x50
[ 268.450444] [<ffffffff81181561>] ? vtime_account_user+0x91/0xa0
[ 268.450444] [<ffffffff8124f0c8>] ? context_tracking_user_exit+0xa8/0x1c0
[ 268.450444] [<ffffffff843895bd>] do_page_fault+0x3d/0x70
[ 268.450444] [<ffffffff843888b5>] do_async_page_fault+0x35/0x100
[ 268.450444] [<ffffffff84385798>] async_page_fault+0x28/0x30
[ 268.450444] Code: 6d e1 10 00 e8 98 8a 15 00 c9 c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec 10 8b
37 eb 06 0f 1f 40 00 89 c6 0f 1f 00 0f ae e8 0f 31 <48> c1 e2 20 89 c0 0f be 4f 1c 48 09 c2 44 8b 47
18 48 2b 57 08

The stack trace always points to migrate_swap() calling stop_two_cpus(). Note that I don't
see another CPU in a stopped state when it occurs.
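
To make the shape of the hang easier to see, here is a rough user-space sketch with
hypothetical names (an illustration, not the kernel code): stop_two_cpus() queues a stopper
work on each of the two CPUs and then waits for both works to run, so if one CPU's stopper
never gets to run, its peer spins forever, which is what the multi_cpu_stop() soft lockup
above looks like.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int acks;			/* how many "stopper threads" arrived */

/* Stand-in for one CPU's stopper running multi_cpu_stop(): announce
 * ourselves, then spin until the peer has arrived as well. */
static void *stopper(void *arg)
{
	(void)arg;
	atomic_fetch_add(&acks, 1);
	while (atomic_load(&acks) < 2)
		;			/* cpu_relax()-style busy wait for the peer */
	return NULL;
}

int main(void)
{
	pthread_t a;

	pthread_create(&a, NULL, stopper, NULL);
	/* The second stopper is never started; its "CPU" is stuck
	 * elsewhere. Thread 'a' now spins forever, and a join here
	 * would hang just like wait_for_completion() in the trace. */
	sleep(1);
	printf("acks=%d, still waiting for the missing peer\n",
	       atomic_load(&acks));
	return 0;
}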


Thanks,
Sasha


2014-02-20 04:32:57

by Michael wang

Subject: Re: sched: hang in migrate_swap

On 02/20/2014 02:08 AM, Sasha Levin wrote:
> Hi all,
>
> While fuzzing with trinity inside a KVM tools guest, running the latest
> -next kernel, I seem to hit the following hang quite often.

A fix for the stuck issue around idle_balance() is now in progress; this
may be caused by the same problem. I suggest we retest after those
patches get merged.

Regards,
Michael Wang

>
> [snip]

2014-02-21 16:44:43

by Sasha Levin

Subject: Re: sched: hang in migrate_swap

On 02/19/2014 11:32 PM, Michael wang wrote:
> On 02/20/2014 02:08 AM, Sasha Levin wrote:
>> Hi all,
>>
>> While fuzzing with trinity inside a KVM tools guest, running the latest
>> -next kernel, I seem to hit the following hang quite often.
> A fix for the stuck issue around idle_balance() is now in progress; this
> may be caused by the same problem. I suggest we retest after those
> patches get merged.

Could I get a link to the patch please? It's a pain testing other things with this issue reproducing
every time.


Thanks,
Sasha

2014-02-22 01:45:16

by Michael wang

Subject: Re: sched: hang in migrate_swap

On 02/22/2014 12:43 AM, Sasha Levin wrote:
> On 02/19/2014 11:32 PM, Michael wang wrote:
>> On 02/20/2014 02:08 AM, Sasha Levin wrote:
>>> Hi all,
>>>
>>> While fuzzing with trinity inside a KVM tools guest, running the latest
>>> -next kernel, I seem to hit the following hang quite often.
>> A fix for the stuck issue around idle_balance() is now in progress; this
>> may be caused by the same problem. I suggest we retest after those
>> patches get merged.
>
> Could I get a link to the patch please? It's a pain testing other things
> with this issue reproducing every time.

Please try the latest tip tree; the patches have been merged as far as I
can see :)

Regards,
Michael Wang

>
>
> Thanks,
> Sasha

2014-02-24 03:24:22

by Sasha Levin

Subject: Re: sched: hang in migrate_swap

On 02/21/2014 08:45 PM, Michael wang wrote:
> On 02/22/2014 12:43 AM, Sasha Levin wrote:
>> On 02/19/2014 11:32 PM, Michael wang wrote:
>>> On 02/20/2014 02:08 AM, Sasha Levin wrote:
>>>> Hi all,
>>>>
>>>> While fuzzing with trinity inside a KVM tools guest, running the latest
>>>> -next kernel, I seem to hit the following hang quite often.
>>> A fix for the stuck issue around idle_balance() is now in progress; this
>>> may be caused by the same problem. I suggest we retest after those
>>> patches get merged.
>>
>> Could I get a link to the patch please? It's a pain testing other things
>> with this issue reproducing every time.
>
> Please try the latest tip tree; the patches have been merged as far as I
> can see :)

Nope, still see it with latest -tip.

I ran tip's master branch, should I have tried a different one?


Thanks,
sasha

2014-02-24 05:19:24

by Michael wang

Subject: Re: sched: hang in migrate_swap

On 02/24/2014 11:23 AM, Sasha Levin wrote:
[snip]
>>>
>>> Could I get a link to the patch please? It's a pain testing other things
>>> with this issue reproducing every time.
>>
>> Please try the latest tip tree; the patches have been merged as far as
>> I can see :)
>
> Nope, still see it with latest -tip.
>
> I ran tip's master branch, should I have tried a different one?

Hmm... I don't see the changes we expected on master either...

Peter, did we accidentally miss this commit?

http://git.kernel.org/tip/477af336ba06ef4c32e97892bb0d2027ce30f466

Regards,
Michael Wang

>
>
> Thanks,
> sasha
>

2014-02-24 05:55:21

by Sasha Levin

Subject: Re: sched: hang in migrate_swap

On 02/24/2014 12:19 AM, Michael wang wrote:
> On 02/24/2014 11:23 AM, Sasha Levin wrote:
> [snip]
>>>> Could I get a link to the patch please? It's a pain testing other things
>>>> with this issue reproducing every time.
>>>
>>> Please try the latest tip tree; the patches have been merged as far as
>>> I can see :)
>>
>> Nope, still see it with latest -tip.
>>
>> I ran tip's master branch, should I have tried a different one?
> Hmm... I don't see the changes we expected on master either...
>
> Peter, did we accidentally miss this commit?
>
> http://git.kernel.org/tip/477af336ba06ef4c32e97892bb0d2027ce30f466

I've tried grabbing this patch, and it does pretty bad things to my test VM: it gets stuck early in
the boot process and doesn't show any progress. The VM never even gets close to starting up all the
CPUs (it usually dies around the lockdep self-tests).

Are there any other dependencies?


Thanks,
Sasha

2014-02-24 07:10:36

by Peter Zijlstra

Subject: Re: sched: hang in migrate_swap

On Mon, Feb 24, 2014 at 01:19:15PM +0800, Michael wang wrote:
> Peter, did we accidentally miss this commit?
>
> http://git.kernel.org/tip/477af336ba06ef4c32e97892bb0d2027ce30f466

Ingo dropped it on Saturday because it makes locking_selftest() unhappy.

That is because we call locking_selftest() way before we're ready to
call schedule() and guess what it does :-/

I'm not entirely sure what to do.. ideally I'd shoot locking_selftest in
the head, but clearly that's not entirely desired either.

2014-02-24 10:14:38

by Michael wang

Subject: Re: sched: hang in migrate_swap

On 02/24/2014 03:10 PM, Peter Zijlstra wrote:
> On Mon, Feb 24, 2014 at 01:19:15PM +0800, Michael wang wrote:
>> Peter, did we accidentally miss this commit?
>>
>> http://git.kernel.org/tip/477af336ba06ef4c32e97892bb0d2027ce30f466
>
> Ingo dropped it on Saturday because it makes locking_selftest() unhappy.
>
> That is because we call locking_selftest() way before we're ready to
> call schedule() and guess what it does :-/
>
> I'm not entirely sure what to do.. ideally I'd shoot locking_selftest in
> the head, but clearly that's not entirely desired either.

...what about moving idle_balance() back to its old position?

The pull_rt_task() logic could run after idle_balance() if there is
still no FAIR or DL task, and then go into the pick loop. That might
make things cleaner and clearer; should we give it a try?

Regards,
Michael Wang


2014-02-24 12:12:27

by Peter Zijlstra

Subject: Re: sched: hang in migrate_swap

On Mon, Feb 24, 2014 at 06:14:24PM +0800, Michael wang wrote:
> On 02/24/2014 03:10 PM, Peter Zijlstra wrote:
> > On Mon, Feb 24, 2014 at 01:19:15PM +0800, Michael wang wrote:
>> Peter, did we accidentally miss this commit?
> >>
> >> http://git.kernel.org/tip/477af336ba06ef4c32e97892bb0d2027ce30f466
> >
> > Ingo dropped it on Saturday because it makes locking_selftest() unhappy.
> >
> > That is because we call locking_selftest() way before we're ready to
> > call schedule() and guess what it does :-/
> >
> > I'm not entirely sure what to do.. ideally I'd shoot locking_selftest in
> > the head, but clearly that's not entirely desired either.
>
> ...what about moving idle_balance() back to its old position?

I've always hated that, idle_balance() is very much a fair policy thing
and shouldn't live in the core code.

> The pull_rt_task() logic could run after idle_balance() if there is
> still no FAIR or DL task, and then go into the pick loop. That might
> make things cleaner and clearer; should we give it a try?

So the reason pull_{rt,dl}_task() is before idle_balance() is that we
don't want to add the execution latency of idle_balance() to the rt/dl
task pulling.

Anyway, the below seems to work; it avoids playing tricks with the idle
thread and instead uses a magic constant.

The comparison should be faster too; seeing how we avoid dereferencing
p->sched_class.

---
Subject: sched: Guarantee task priority in pick_next_task()
From: Peter Zijlstra <[email protected]>
Date: Fri Feb 14 12:25:08 CET 2014

Michael spotted that the idle_balance() push down created a task
priority problem.

Previously, when we called idle_balance() before pick_next_task() it
wasn't a problem when -- because of the rq->lock droppage -- an rt/dl
task slipped in.

Similarly for pre_schedule(), rt pre-schedule could have a dl task
slip in.

But by pulling it into the pick_next_task() loop, we'll not try a
higher task priority again.

Cure this by creating a re-start condition in pick_next_task(); and
triggering this from pick_next_task_{rt,fair}().

Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()")
Cc: Juri Lelli <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Steven Rostedt <[email protected]>
Reported-by: Michael Wang <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
---
kernel/sched/core.c | 12 ++++++++----
kernel/sched/fair.c | 13 ++++++++++++-
kernel/sched/rt.c | 10 +++++++++-
kernel/sched/sched.h | 5 +++++
4 files changed, 34 insertions(+), 6 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2586,24 +2586,28 @@ static inline void schedule_debug(struct
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
- const struct sched_class *class;
+ const struct sched_class *class = &fair_sched_class;
struct task_struct *p;

/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(prev->sched_class == &fair_sched_class &&
+ if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev);
- if (likely(p))
+ if (likely(p && p != RETRY_TASK))
return p;
}

+again:
for_each_class(class) {
p = class->pick_next_task(rq, prev);
- if (p)
+ if (p) {
+ if (unlikely(p == RETRY_TASK))
+ goto again;
return p;
+ }
}

BUG(); /* the idle class will always have a runnable task */
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4687,6 +4687,7 @@ pick_next_task_fair(struct rq *rq, struc
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
+ int new_tasks;

again:
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -4785,7 +4786,17 @@ pick_next_task_fair(struct rq *rq, struc
return p;

idle:
- if (idle_balance(rq)) /* drops rq->lock */
+ /*
+ * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * possible for any higher priority task to appear. In that case we
+ * must re-start the pick_next_entity() loop.
+ */
+ new_tasks = idle_balance(rq);
+
+ if (rq->nr_running != rq->cfs.h_nr_running)
+ return RETRY_TASK;
+
+ if (new_tasks)
goto again;

return NULL;
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1360,8 +1360,16 @@ pick_next_task_rt(struct rq *rq, struct
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;

- if (need_pull_rt_task(rq, prev))
+ if (need_pull_rt_task(rq, prev)) {
pull_rt_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a dl task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (unlikely(rq->dl.dl_nr_running))
+ return RETRY_TASK;
+ }

if (!rt_rq->rt_nr_running)
return NULL;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1090,6 +1090,8 @@ static const u32 prio_to_wmult[40] = {

#define DEQUEUE_SLEEP 1

+#define RETRY_TASK ((void *)-1UL)
+
struct sched_class {
const struct sched_class *next;

@@ -1104,6 +1106,9 @@ struct sched_class {
* It is the responsibility of the pick_next_task() method that will
* return the next task to call put_prev_task() on the @prev task or
* something equivalent.
+ *
+ * May return RETRY_TASK when it finds a higher prio class has runnable
+ * tasks.
*/
struct task_struct * (*pick_next_task) (struct rq *rq,
struct task_struct *prev);

2014-02-24 13:10:31

by Peter Zijlstra

Subject: Re: sched: hang in migrate_swap

On Mon, Feb 24, 2014 at 01:12:18PM +0100, Peter Zijlstra wrote:
> + if (p) {
> + if (unlikely(p == RETRY_TASK))
> + goto again;

We could even make that: unlikely(p & 1), I think most CPUs can encode
that far better than the full pointer immediate.
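
For illustration, a minimal user-space sketch of that encoding trick (hypothetical names;
only the RETRY_TASK value matches the patch above): properly aligned objects never have
bit 0 set, while (void *)-1UL does, so the sentinel check becomes a single-bit test rather
than a compare against a full 64-bit immediate.

#include <stdint.h>
#include <stdio.h>

struct task { int prio; };			/* hypothetical stand-in */

#define RETRY_TASK ((struct task *)-1UL)

/* Compare against the full pointer immediate. */
static int is_retry_cmp(const struct task *p)
{
	return p == RETRY_TASK;
}

/* Test only the low bit: aligned objects have it clear, -1UL has it set. */
static int is_retry_bit(const struct task *p)
{
	return (uintptr_t)p & 1;
}

int main(void)
{
	struct task t = { .prio = 120 };

	printf("cmp: %d %d\n", is_retry_cmp(&t), is_retry_cmp(RETRY_TASK));
	printf("bit: %d %d\n", is_retry_bit(&t), is_retry_bit(RETRY_TASK));
	return 0;	/* both lines print "0 1" */
}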

2014-02-24 18:22:23

by Sasha Levin

Subject: Re: sched: hang in migrate_swap

On 02/24/2014 07:12 AM, Peter Zijlstra wrote:
> Anyway, the below seems to work; it avoids playing tricks with the idle
> thread and instead uses a magic constant.
>
> The comparison should be faster too; seeing how we avoid dereferencing
> p->sched_class.
>
> ---
> Subject: sched: Guarantee task priority in pick_next_task()
> From: Peter Zijlstra<[email protected]>
> Date: Fri Feb 14 12:25:08 CET 2014
>
> Michael spotted that the idle_balance() push down created a task
> priority problem.
>
> Previously, when we called idle_balance() before pick_next_task() it
> wasn't a problem when -- because of the rq->lock droppage -- an rt/dl
> task slipped in.
>
> Similarly for pre_schedule(), rt pre-schedule could have a dl task
> slip in.
>
> But by pulling it into the pick_next_task() loop, we'll not try a
> higher task priority again.
>
> Cure this by creating a re-start condition in pick_next_task(); and
> triggering this from pick_next_task_{rt,fair}().
>
> Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()")
> Cc: Juri Lelli<[email protected]>
> Cc: Ingo Molnar<[email protected]>
> Cc: Steven Rostedt<[email protected]>
> Reported-by: Michael Wang<[email protected]>
> Signed-off-by: Peter Zijlstra<[email protected]>

Sign me up to the fan club of this patch, I love it.

After running it for a bit I see that all the hangs I've seen before are gone, even ones I didn't
assume were sched-related.


Thanks,
Sasha


Attachments:
0.txt.gz (20.88 kB)

2014-02-25 02:48:09

by Michael wang

Subject: Re: sched: hang in migrate_swap

On 02/25/2014 02:21 AM, Sasha Levin wrote:
[snip]
>>
>> Fixes: 38033c37faab ("sched: Push down pre_schedule() and
>> idle_balance()")
>> Cc: Juri Lelli<[email protected]>
>> Cc: Ingo Molnar<[email protected]>
>> Cc: Steven Rostedt<[email protected]>
>> Reported-by: Michael Wang<[email protected]>
>> Signed-off-by: Peter Zijlstra<[email protected]>
>
> Sign me up to the fan club of this patch, I love it.
>
> After running it for a bit I see that all the hangs I've seen before are
> gone, even ones I didn't assume were sched-related.

Good to know the problem got solved :)

Regards,
Michael Wang

>
>
> Thanks,
> Sasha

2014-02-25 03:01:12

by Michael wang

Subject: Re: sched: hang in migrate_swap

On 02/24/2014 08:12 PM, Peter Zijlstra wrote:
[snip]
>>
>> ...what about moving idle_balance() back to its old position?
>
> I've always hated that, idle_balance() is very much a fair policy thing
> and shouldn't live in the core code.
>
>> The pull_rt_task() logic could run after idle_balance() if there is
>> still no FAIR or DL task, and then go into the pick loop. That might
>> make things cleaner and clearer; should we give it a try?
>
> So the reason pull_{rt,dl}_task() is before idle_balance() is that we
> don't want to add the execution latency of idle_balance() to the rt/dl
> task pulling.

Yeah, that makes sense. Just wondering... since RT also has balance
stuff, maybe we could use a new callback for each class in the old position?

The new idle_balance() could look like:

void idle_balance(struct rq *rq)
{
	const struct sched_class *class;

	for_each_class(class) {
		/* let each class try; stop at the first one that pulls work */
		if (class->idle_balance && class->idle_balance(rq))
			break;
	}
}

>
> Anyway, the below seems to work; it avoids playing tricks with the idle
> thread and instead uses a magic constant.
>
> The comparison should be faster too; seeing how we avoid dereferencing
> p->sched_class.

Great, the same idea once crossed my mind, but you achieved it without a
new parameter, so let's ignore my wondering above :)

Regards,
Michael Wang

>
> [snip]

2014-02-25 04:47:11

by Michael wang

Subject: Re: sched: hang in migrate_swap

On 02/24/2014 09:10 PM, Peter Zijlstra wrote:
> On Mon, Feb 24, 2014 at 01:12:18PM +0100, Peter Zijlstra wrote:
>> + if (p) {
>> + if (unlikely(p == RETRY_TASK))
>> + goto again;
>
> We could even make that: unlikely(p & 1), I think most CPUs can encode
> that far better than the full pointer immediate.

Agreed, unless odd-aligned stuff appears...

Regards,
Michael Wang


2014-02-25 10:49:30

by Peter Zijlstra

Subject: Re: sched: hang in migrate_swap

On Tue, Feb 25, 2014 at 12:47:01PM +0800, Michael wang wrote:
> On 02/24/2014 09:10 PM, Peter Zijlstra wrote:
> > On Mon, Feb 24, 2014 at 01:12:18PM +0100, Peter Zijlstra wrote:
> >> + if (p) {
> >> + if (unlikely(p == RETRY_TASK))
> >> + goto again;
> >
> > We could even make that: unlikely(p & 1), I think most CPUs can encode
> > that far better than the full pointer immediate.
>
> Agreed, unless odd-aligned stuff appears...

That shouldn't happen; we rely on u32 alignment all over the place. So
for general objects we should have at least the lower two bits free to
play with.
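
A tiny compile-time sketch of that assumption (the struct here is a hypothetical
stand-in): 4-byte alignment guarantees a real pointer ends in two zero bits, which is
what frees them up for tag values like RETRY_TASK.

#include <stdalign.h>

struct obj { unsigned int word; };	/* any u32-aligned object */

/* With at least 4-byte alignment, a real pointer always has its two
 * low bits clear, so they are free for sentinels such as RETRY_TASK. */
_Static_assert(alignof(struct obj) >= 4, "need two free low pointer bits");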

2014-02-25 11:03:29

by Peter Zijlstra

Subject: Re: sched: hang in migrate_swap

On Mon, Feb 24, 2014 at 01:21:48PM -0500, Sasha Levin wrote:
> Sign me up to the fan club of this patch, I love it.

I've converted that into a Tested-by: tag :-)

2014-02-26 02:32:32

by Michael wang

Subject: Re: sched: hang in migrate_swap

On 02/25/2014 06:49 PM, Peter Zijlstra wrote:
> On Tue, Feb 25, 2014 at 12:47:01PM +0800, Michael wang wrote:
>> On 02/24/2014 09:10 PM, Peter Zijlstra wrote:
>>> On Mon, Feb 24, 2014 at 01:12:18PM +0100, Peter Zijlstra wrote:
>>>> + if (p) {
>>>> + if (unlikely(p == RETRY_TASK))
>>>> + goto again;
>>>
>>> We could even make that: unlikely(p & 1), I think most CPUs can encode
>>> that far better than the full pointer immediate.
>>
>> Agreed, unless odd-aligned stuff appears...
>
> That shouldn't happen; we rely on u32 alignment all over the place. So
> for general objects we should have at least the lower two bits free to
> play with.

Makes sense :)

Regards,
Michael Wang


Subject: [tip:sched/core] sched: Guarantee task priority in pick_next_task()

Commit-ID: 37e117c07b89194aae7062bc63bde1104c03db02
Gitweb: http://git.kernel.org/tip/37e117c07b89194aae7062bc63bde1104c03db02
Author: Peter Zijlstra <[email protected]>
AuthorDate: Fri, 14 Feb 2014 12:25:08 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 27 Feb 2014 12:41:02 +0100

sched: Guarantee task priority in pick_next_task()

Michael spotted that the idle_balance() push down created a task
priority problem.

Previously, when we called idle_balance() before pick_next_task() it
wasn't a problem when -- because of the rq->lock droppage -- an rt/dl
task slipped in.

Similarly for pre_schedule(), rt pre-schedule could have a dl task
slip in.

But by pulling it into the pick_next_task() loop, we'll not try a
higher task priority again.

Cure this by creating a re-start condition in pick_next_task(); and
triggering this from pick_next_task_{rt,fair}().

It also fixes a live-lock where we get stuck in pick_next_task_fair()
due to idle_balance() seeing !0 nr_running but there not actually
being any fair tasks about.

Reported-by: Michael Wang <[email protected]>
Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()")
Tested-by: Sasha Levin <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Steven Rostedt <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/core.c | 12 ++++++++----
kernel/sched/fair.c | 13 ++++++++++++-
kernel/sched/rt.c | 10 +++++++++-
kernel/sched/sched.h | 5 +++++
4 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a8a73b8..cde573d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2586,24 +2586,28 @@ static inline void schedule_debug(struct task_struct *prev)
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
- const struct sched_class *class;
+ const struct sched_class *class = &fair_sched_class;
struct task_struct *p;

/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(prev->sched_class == &fair_sched_class &&
+ if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev);
- if (likely(p))
+ if (likely(p && p != RETRY_TASK))
return p;
}

+again:
for_each_class(class) {
p = class->pick_next_task(rq, prev);
- if (p)
+ if (p) {
+ if (unlikely(p == RETRY_TASK))
+ goto again;
return p;
+ }
}

BUG(); /* the idle class will always have a runnable task */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index be4f7d9..16042b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4686,6 +4686,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
struct task_struct *p;
+ int new_tasks;

again:
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -4784,7 +4785,17 @@ simple:
return p;

idle:
- if (idle_balance(rq)) /* drops rq->lock */
+ /*
+ * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * possible for any higher priority task to appear. In that case we
+ * must re-start the pick_next_entity() loop.
+ */
+ new_tasks = idle_balance(rq);
+
+ if (rq->nr_running != rq->cfs.h_nr_running)
+ return RETRY_TASK;
+
+ if (new_tasks)
goto again;

return NULL;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4d4b386..398b3f9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1360,8 +1360,16 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;

- if (need_pull_rt_task(rq, prev))
+ if (need_pull_rt_task(rq, prev)) {
pull_rt_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a dl task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (unlikely(rq->dl.dl_nr_running))
+ return RETRY_TASK;
+ }

if (!rt_rq->rt_nr_running)
return NULL;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 046084e..1929deb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1091,6 +1091,8 @@ static const u32 prio_to_wmult[40] = {

#define DEQUEUE_SLEEP 1

+#define RETRY_TASK ((void *)-1UL)
+
struct sched_class {
const struct sched_class *next;

@@ -1105,6 +1107,9 @@ struct sched_class {
* It is the responsibility of the pick_next_task() method that will
* return the next task to call put_prev_task() on the @prev task or
* something equivalent.
+ *
+ * May return RETRY_TASK when it finds a higher prio class has runnable
+ * tasks.
*/
struct task_struct * (*pick_next_task) (struct rq *rq,
struct task_struct *prev);

2014-04-10 03:32:29

by Sasha Levin

Subject: Re: sched: hang in migrate_swap

On 02/24/2014 07:12 AM, Peter Zijlstra wrote:
> Subject: sched: Guarantee task priority in pick_next_task()
> From: Peter Zijlstra <[email protected]>
> Date: Fri Feb 14 12:25:08 CET 2014
>
> Michael spotted that the idle_balance() push down created a task
> priority problem.
>
> Previously, when we called idle_balance() before pick_next_task() it
> wasn't a problem when -- because of the rq->lock droppage -- an rt/dl
> task slipped in.
>
> Similarly for pre_schedule(), rt pre-schedule could have a dl task
> slip in.
>
> But by pulling it into the pick_next_task() loop, we'll not try a
> higher task priority again.
>
> Cure this by creating a re-start condition in pick_next_task(); and
> triggering this from pick_next_task_{rt,fair}().
>
> Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()")
> Cc: Juri Lelli <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Steven Rostedt <[email protected]>
> Reported-by: Michael Wang <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>

I'd like to re-open this issue. It seems that something broke, and I'm
now seeing the same issues that had gone away for two months with this
patch.

Stack trace is similar to before:

[ 6004.990292] CPU: 20 PID: 26054 Comm: trinity-c58 Not tainted 3.14.0-next-20140409-sasha-00022-g984f7c5-dirty #385
[ 6004.990292] task: ffff880375bb3000 ti: ffff88036058e000 task.ti: ffff88036058e000
[ 6004.990292] RIP: generic_exec_single (kernel/smp.c:91 kernel/smp.c:175)
[ 6004.990292] RSP: 0000:ffff88036058f978 EFLAGS: 00000202
[ 6004.990292] RAX: ffff8802b71dec00 RBX: ffff88036058f978 RCX: ffff8802b71decd8
[ 6004.990292] RDX: ffff8802b71d85c0 RSI: ffff88036058f978 RDI: ffff88036058f978
[ 6004.990292] RBP: ffff88036058f9c8 R08: 0000000000000001 R09: ffffffffa70bc580
[ 6004.990292] R10: ffff880375bb3000 R11: 0000000000000000 R12: 000000000000000c
[ 6004.990292] R13: 0000000000000001 R14: ffff88036058fa20 R15: ffffffffa121f560
[ 6004.990292] FS: 00007fe993fbd700(0000) GS:ffff880437000000(0000) knlGS:0000000000000000
[ 6004.990292] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 6004.990292] CR2: 00007fffb56b0a18 CR3: 00000003755df000 CR4: 00000000000006a0
[ 6004.990292] DR0: 0000000000695000 DR1: 0000000000695000 DR2: 0000000000000000
[ 6004.990292] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ 6004.990292] Stack:
[ 6004.990292] ffff88040513da18 ffffffffa121f560 ffff88036058fa20 0000000000000002
[ 6004.990292] 000000000000000c 000000000000000c ffffffffa121f560 ffff88036058fa20
[ 6004.990292] 0000000000000001 ffff880189fe3000 ffff88036058fa08 ffffffffa11ff7b2
[ 6004.990292] Call Trace:
[ 6004.990292] ? cpu_stop_queue_work (kernel/stop_machine.c:227)
[ 6004.990292] ? cpu_stop_queue_work (kernel/stop_machine.c:227)
[ 6004.990292] smp_call_function_single (kernel/smp.c:234 (discriminator 7))
[ 6004.990292] ? lg_local_lock (kernel/locking/lglock.c:25)
[ 6004.990292] stop_two_cpus (kernel/stop_machine.c:297)
[ 6004.990292] ? retint_restore_args (arch/x86/kernel/entry_64.S:1040)
[ 6004.990292] ? __stop_cpus (kernel/stop_machine.c:170)
[ 6004.990292] ? __stop_cpus (kernel/stop_machine.c:170)
[ 6004.990292] ? __migrate_swap_task (kernel/sched/core.c:1042)
[ 6004.990292] migrate_swap (kernel/sched/core.c:1110)
[ 6004.990292] task_numa_migrate (kernel/sched/fair.c:1321)
[ 6004.990292] ? task_numa_migrate (kernel/sched/fair.c:1227)
[ 6004.990292] ? sched_clock_cpu (kernel/sched/clock.c:311)
[ 6004.990292] numa_migrate_preferred (kernel/sched/fair.c:1342)
[ 6004.990292] task_numa_fault (kernel/sched/fair.c:1796)
[ 6004.990292] __handle_mm_fault (mm/memory.c:3812 mm/memory.c:3812 mm/memory.c:3925)
[ 6004.990292] ? __const_udelay (arch/x86/lib/delay.c:126)
[ 6004.990292] ? __rcu_read_unlock (kernel/rcu/update.c:97)
[ 6004.990292] handle_mm_fault (include/linux/memcontrol.h:147 mm/memory.c:3951)
[ 6004.990292] __do_page_fault (arch/x86/mm/fault.c:1220)
[ 6004.990292] ? vtime_account_user (kernel/sched/cputime.c:687)
[ 6004.990292] ? get_parent_ip (kernel/sched/core.c:2472)
[ 6004.990292] ? context_tracking_user_exit (include/linux/vtime.h:89 include/linux/jump_label.h:105 include/trace/events/context_tracking.h:47 kernel/context_tracking.c:178)
[ 6004.990292] ? preempt_count_sub (kernel/sched/core.c:2527)
[ 6004.990292] ? context_tracking_user_exit (kernel/context_tracking.c:182)
[ 6004.990292] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
[ 6004.990292] ? trace_hardirqs_off_caller (kernel/locking/lockdep.c:2638 (discriminator 2))
[ 6004.990292] do_page_fault (arch/x86/mm/fault.c:1272 include/linux/jump_label.h:105 include/linux/context_tracking_state.h:27 include/linux/context_tracking.h:45 arch/x86/mm/fault.c:1273)
[ 6004.990292] do_async_page_fault (arch/x86/kernel/kvm.c:263)
[ 6004.990292] async_page_fault (arch/x86/kernel/entry_64.S:1496)
[ 6004.990292] Code: 44 89 e7 ff 15 70 2d c5 04 45 85 ed 75 0b 31 c0 eb 27 0f 1f 80 00 00 00 00 f6 43 18 01 74 ef 66 2e 0f 1f 84 00 00 00 00 00 f3 90 <f6> 43 18 01 75 f8 eb db 66 0f 1f 44 00 00 48 83 c4 28 5b 41 5c


Thanks,
Sasha

2014-04-10 07:00:00

by Michael wang

Subject: Re: sched: hang in migrate_swap

On 04/10/2014 11:31 AM, Sasha Levin wrote:
[snip]
>
> I'd like to re-open this issue. It seems that something broke, and I'm
> now seeing the same issues that had gone away for two months with this
> patch.

A new mechanism has been designed to move the priority checking inside
idle_balance(); CCing Kirill, who is the designer ;-)

Regards,
Michael Wang

>
> [snip]

2014-04-10 07:42:49

by Peter Zijlstra

Subject: Re: sched: hang in migrate_swap

On Wed, Apr 09, 2014 at 11:31:48PM -0400, Sasha Levin wrote:
> I'd like to re-open this issue. It seems that something broke, and I'm
> now seeing the same issues that had gone away for two months with this
> patch.

Weird; we didn't touch anything in the last few weeks :-/

> Stack trace is similar to before:
>
> [snip]

This different stack trace format throws your brain off...

In any case, do any of the other CPUs do anything interesting?

2014-04-10 13:46:19

by Kirill Tkhai

[permalink] [raw]
Subject: Re: sched: hang in migrate_swap

10.04.2014, 11:00, "Michael wang" <[email protected]>:
> On 04/10/2014 11:31 AM, Sasha Levin wrote:
> [snip]
>
>> I'd like to re-open this issue. It seems that something broke and I'm
>> now seeing the same issues that had gone away for 2 months with this patch
>> again.
>
> A new mechanism has been designed to move the priority checking inside
> idle_balance(); CC'ing the folks involved, including Kirill, who is the
> designer ;-)

I'm not sure it's connected with my patch. But it looks like we forgot
exactly the stop class. Maybe this will help?

[PATCH] sched: Checking for stop task appearance when balancing happens

Just do it, like we do for other higher priority classes...

Signed-off-by: Kirill Tkhai <[email protected]>
---
kernel/sched/deadline.c | 11 ++++++++++-
kernel/sched/fair.c | 3 ++-
kernel/sched/rt.c | 7 ++++---
3 files changed, 16 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27ef409..b080957 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1021,8 +1021,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 
 	dl_rq = &rq->dl;
 
-	if (need_pull_dl_task(rq, prev))
+	if (need_pull_dl_task(rq, prev)) {
 		pull_dl_task(rq);
+		/*
+		 * pull_dl_task() can drop (and re-acquire) rq->lock; this
+		 * means a stop task can slip in, in which case we need to
+		 * re-start task selection.
+		 */
+		if (rq->stop && rq->stop->on_rq)
+			return RETRY_TASK;
+	}
+
 	/*
 	 * When prev is DL, we may throttle it in put_prev_task().
 	 * So, we update time before we check for dl_nr_running.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e9bd0b..c50275b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6727,7 +6727,8 @@ static int idle_balance(struct rq *this_rq)
 out:
 	/* Is there a task of a high priority class? */
 	if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
-	    (this_rq->dl.dl_nr_running ||
+	    ((this_rq->stop && this_rq->stop->on_rq) ||
+	     this_rq->dl.dl_nr_running ||
 	     (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
 		pulled_task = -1;
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8cdf16..bd2267a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1362,10 +1362,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 		pull_rt_task(rq);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
-		 * means a dl task can slip in, in which case we need to
-		 * re-start task selection.
+		 * means a dl or stop task can slip in, in which case we need
+		 * to re-start task selection.
 		 */
-		if (unlikely(rq->dl.dl_nr_running))
+		if (unlikely((rq->stop && rq->stop->on_rq) ||
+			     rq->dl.dl_nr_running))
 			return RETRY_TASK;
 	}

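For context on why returning RETRY_TASK is enough: pick_next_task() in
kernel/sched/core.c of that era restarts the sched-class walk whenever a
class's ->pick_next_task() returns RETRY_TASK, so a stop task that slipped
in while rq->lock was dropped gets picked on the retry. A simplified sketch
(illustrative only, not the exact source; the real function also has a
fast path for the all-fair case):

	/* Classes are walked in priority order: stop, dl, rt, fair, idle. */
	static inline struct task_struct *
	pick_next_task(struct rq *rq, struct task_struct *prev)
	{
		const struct sched_class *class;
		struct task_struct *p;

	again:
		for_each_class(class) {
			p = class->pick_next_task(rq, prev);
			if (p) {
				/*
				 * The class dropped rq->lock and a higher
				 * priority task appeared: redo the walk.
				 */
				if (unlikely(p == RETRY_TASK))
					goto again;
				return p;
			}
		}
		BUG(); /* the idle class should always have a runnable task */
	}
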
2014-04-11 14:32:59

by Sasha Levin

[permalink] [raw]
Subject: Re: sched: hang in migrate_swap

On 04/10/2014 09:38 AM, Kirill Tkhai wrote:
> 10.04.2014, 11:00, "Michael wang" <[email protected]>:
>> > On 04/10/2014 11:31 AM, Sasha Levin wrote:
>> > [snip]
>> >
>>> >> I'd like to re-open this issue. It seems that something broke and I'm
>>> >> now seeing the same issues that had gone away for 2 months with this patch
>>> >> again.
>> >
>> > A new mechanism has been designed to move the priority checking inside
>> > idle_balance(); CC'ing the folks involved, including Kirill, who is the
>> > designer ;-)
> I'm not sure it's connected with my patch. But it looks like we forgot
> exactly the stop class. Maybe this will help?
>
> [PATCH] sched: Checking for stop task appearance when balancing happens
>
> Just do it, like we do for other higher priority classes...
>
> Signed-off-by: Kirill Tkhai <[email protected]>

I've been running with this patch for the last two days and the hang
seems to be gone.

I'll leave it running on the weekend and will update again on Monday.


Thanks,
Sasha

2014-04-11 15:23:54

by Kirill Tkhai

[permalink] [raw]
Subject: Re: sched: hang in migrate_swap

11.04.2014, 18:33, "Sasha Levin" <[email protected]>:
> On 04/10/2014 09:38 AM, Kirill Tkhai wrote:
>
>> 10.04.2014, 11:00, "Michael wang" <[email protected]>:
>>>> On 04/10/2014 11:31 AM, Sasha Levin wrote:
>>>> [snip]
>>>>>> I'd like to re-open this issue. It seems that something broke and I'm
>>>>>> now seeing the same issues that had gone away for 2 months with this patch
>>>>>> again.
>>>> A new mechanism has been designed to move the priority checking inside
>>>> idle_balance(); CC'ing the folks involved, including Kirill, who is the
>>>> designer ;-)
>> I'm not sure it's connected with my patch. But it looks like we forgot
>> exactly the stop class. Maybe this will help?
>>
>> [PATCH] sched: Checking for stop task appearance when balancing happens
>>
>> Just do it, like we do for other higher priority classes...
>>
>> Signed-off-by: Kirill Tkhai <[email protected]>
>
> I've been running with this patch for the last two days and the hang
> seems to be gone.
>
> I'll leave it running on the weekend and will update again on Monday.

Thanks for testing.

Sorry I killed your Linuxes with this lost need_resched().

>
> Thanks,
> Sasha

Subject: [tip:sched/urgent] sched: Check for stop task appearance when balancing happens

Commit-ID: a1d9a3231eac4117cadaf4b6bba5b2902c15a33e
Gitweb: http://git.kernel.org/tip/a1d9a3231eac4117cadaf4b6bba5b2902c15a33e
Author: Kirill Tkhai <[email protected]>
AuthorDate: Thu, 10 Apr 2014 17:38:36 +0400
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 17 Apr 2014 13:39:51 +0200

sched: Check for stop task appearance when balancing happens

We need to do it like we do for the other higher priority classes..

Signed-off-by: Kirill Tkhai <[email protected]>
Cc: Michael wang <[email protected]>
Cc: Sasha Levin <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/deadline.c | 11 ++++++++++-
kernel/sched/fair.c | 3 ++-
kernel/sched/rt.c | 7 ++++---
3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 27ef409..b080957 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1021,8 +1021,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 
 	dl_rq = &rq->dl;
 
-	if (need_pull_dl_task(rq, prev))
+	if (need_pull_dl_task(rq, prev)) {
 		pull_dl_task(rq);
+		/*
+		 * pull_dl_task() can drop (and re-acquire) rq->lock; this
+		 * means a stop task can slip in, in which case we need to
+		 * re-start task selection.
+		 */
+		if (rq->stop && rq->stop->on_rq)
+			return RETRY_TASK;
+	}
+
 	/*
 	 * When prev is DL, we may throttle it in put_prev_task().
 	 * So, we update time before we check for dl_nr_running.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4f14a65..7570dd9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6728,7 +6728,8 @@ static int idle_balance(struct rq *this_rq)
 out:
 	/* Is there a task of a high priority class? */
 	if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
-	    (this_rq->dl.dl_nr_running ||
+	    ((this_rq->stop && this_rq->stop->on_rq) ||
+	     this_rq->dl.dl_nr_running ||
 	     (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
 		pulled_task = -1;
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8cdf16..bd2267a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1362,10 +1362,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 		pull_rt_task(rq);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
-		 * means a dl task can slip in, in which case we need to
-		 * re-start task selection.
+		 * means a dl or stop task can slip in, in which case we need
+		 * to re-start task selection.
 		 */
-		if (unlikely(rq->dl.dl_nr_running))
+		if (unlikely((rq->stop && rq->stop->on_rq) ||
+			     rq->dl.dl_nr_running))
 			return RETRY_TASK;
 	}

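For reference, the rq->stop && rq->stop->on_rq test above mirrors the check
the stop class itself makes when picking. Roughly (simplified from
kernel/sched/stop_task.c of that era, not the exact source):

	static struct task_struct *
	pick_next_task_stop(struct rq *rq, struct task_struct *prev)
	{
		struct task_struct *stop = rq->stop;

		/* Only pick the per-cpu stopper if it is actually queued. */
		if (!stop || !stop->on_rq)
			return NULL;

		put_prev_task(rq, prev);
		stop->se.exec_start = rq_clock_task(rq);

		return stop;
	}
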
2015-06-15 19:38:31

by Rafael David Tinoco

[permalink] [raw]
Subject: Re: sched: hang in migrate_swap

Peter, Sasha, coming back to this?

Not that this is happening frequently or that I can easily reproduce it, but...

> On May 14, 2014, at 07:26 AM, Peter Zijlstra <[email protected]> wrote:
>
> On Wed, May 14, 2014 at 02:21:04PM +0400, Kirill Tkhai wrote:
>>
>>
>> 14.05.2014, 14:14, "Peter Zijlstra" <[email protected]>:
>>> On Wed, May 14, 2014 at 01:42:32PM +0400, Kirill Tkhai wrote:
>>>
>>>> Peter, do we have to queue stop works in order?
>>>>
>>>> Isn't there a possibility that two pairs of works get queued in
>>>> different orders on different cpus?
>>>>
>>>> kernel/stop_machine.c | 10 ++++++++--
>>>> 1 file changed, 8 insertions(+), 2 deletions(-)
>>>> diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
>>>> index b6b67ec..29e221b 100644
>>>> --- a/kernel/stop_machine.c
>>>> +++ b/kernel/stop_machine.c
>>>> @@ -250,8 +250,14 @@ struct irq_cpu_stop_queue_work_info {
>>>>  static void irq_cpu_stop_queue_work(void *arg)
>>>>  {
>>>>  	struct irq_cpu_stop_queue_work_info *info = arg;
>>>> -	cpu_stop_queue_work(info->cpu1, info->work1);
>>>> -	cpu_stop_queue_work(info->cpu2, info->work2);
>>>> +
>>>> +	if (info->cpu1 < info->cpu2) {
>>>> +		cpu_stop_queue_work(info->cpu1, info->work1);
>>>> +		cpu_stop_queue_work(info->cpu2, info->work2);
>>>> +	} else {
>>>> +		cpu_stop_queue_work(info->cpu2, info->work2);
>>>> +		cpu_stop_queue_work(info->cpu1, info->work1);
>>>> +	}
>>>>  }
>>>
>>> I'm not sure, we already send the IPI to the first cpu of the pair, so
>>> supposing we have 4 cpus, and get 4 pairs like:
>>>
>>> 0,1 1,2 2,3 3,0
>>>
>>> That would result in IPIs to 0, 1, 2, and 0 again, and since the IPI
>>> function is serialized I don't immediately see a way for this to
>>> deadlock.
>>
>> It's about stop_two_cpus(); I distrust the other users of the stop task:
>>
>> queue_stop_cpus_work() queues works sequentially:
>>
>> 0 1 2 3
>>
>> stop_two_cpus() may queue:
>>
>> 1 0
>>
>> It looks like the stopper threads on cpu 0 and cpu 1 are waiting for the wrong works.
>
> so we serialize stop_cpus_work() vs stop_two_cpus() with an l/g lock.
>
> Ah, but stop_cpus_work() only holds the global lock over queueing; it
> doesn't wait for completion, and that might indeed cause a problem.
>
> Also, since it's two different cpus queueing, the ordered queue doesn't
> really matter; you can still interleave the 'all' and 'two' sets and get
> into this state.

Do you think __stop_cpus->queue_stop_cpus_work() and stop_two_cpus() might be stepping on each other because of this global lock being held over queueing only (and not completion)?

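For reference, queue_stop_cpus_work() at the time looked roughly like this
(a simplified sketch of kernel/stop_machine.c from that era, not the exact
source); the lock really is held over the queueing loop only, and the
function returns before any queued work completes:

	static void queue_stop_cpus_work(const struct cpumask *cpumask,
					 cpu_stop_fn_t fn, void *arg,
					 struct cpu_stop_done *done)
	{
		struct cpu_stop_work *work;
		unsigned int cpu;

		/* Initialize one work item per cpu in the mask. */
		for_each_cpu(cpu, cpumask) {
			work = &per_cpu(stop_cpus_work, cpu);
			work->fn = fn;
			work->arg = arg;
			work->done = done;
		}

		/*
		 * The global lock covers the queueing loop only, so the
		 * works land on each cpu's FIFO queue atomically w.r.t.
		 * other queuers; completion is waited for by the caller
		 * with no lock held.
		 */
		lg_global_lock(&stop_cpus_lock);
		for_each_cpu(cpu, cpumask)
			cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
		lg_global_unlock(&stop_cpus_lock);
	}
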
In the past I described to Sasha the following scenario from one of my 3.13 kernels:

> -> multi_cpu_stop -> do { } while (curstate != MULTI_STOP_EXIT);
>
> In my case, curstate is WAY different from enum containing MULTI_STOP_EXIT (4).
>
> Register totally messed up (probably after cpu_relax(), right where
> you were trapped -> after the pause instruction).
>
> my case:
>
> PID: 118 TASK: ffff883fd28ec7d0 CPU: 9 COMMAND: "migration/9"
> ...
> [exception RIP: multi_cpu_stop+0x64]
> RIP: ffffffff810f5944 RSP: ffff883fd2907d98 RFLAGS: 00000246
> RAX: 0000000000000010 RBX: 0000000000000010 RCX: 0000000000000246
> RDX: ffff883fd2907d98 RSI: 0000000000000000 RDI: 0000000000000001
> RBP: ffffffff810f5944 R8: ffffffff810f5944 R9: 0000000000000000
> R10: ffff883fd2907d98 R11: 0000000000000246 R12: ffffffffffffffff
> R13: ffff883f55d01b48 R14: 0000000000000000 R15: 0000000000000001
> ORIG_RAX: 0000000000000001 CS: 0010 SS: 0000
> --- <NMI exception stack> ---
> #4 [ffff883fd2907d98] multi_cpu_stop+0x64 at ffffffff810f5944
>
> 208 } while (curstate != MULTI_STOP_EXIT);
> ---> RIP
> RIP 0xffffffff810f5944 <+100>: cmp $0x4,%edx
> ---> CHECKING FOR MULTI_STOP_EXIT
>
> RDX: ffff883fd2907d98 -> does not make any sense
>
> ###
>
> If I'm reading this right,
>
> """
> CPU 05 - PID 14990
>
> do_numa_page
> task_numa_fault
> numa_migrate_preferred
> task_numa_migrate
> migrate_swap (curr: 14990, task: 14996)
> stop_two_cpus (cpu1=05(14996), cpu2=00(14990))
> wait_for_completion
>
> 14990 - CPU05
> 14996 - CPU00
>
> stop_two_cpus:
> multi_stop_data (msdata->state = MULTI_STOP_PREPARE)
> smp_call_function_single (min=cpu2=00, irq_cpu_stop_queue_work, wait=1)
> smp_call_function_single (ran on lowest CPU, 00 for this case)
> irq_cpu_stop_queue_work
> cpu_stop_queue_work(cpu1=05(14996)) # add work (multi_cpu_stop) to cpu 05 cpu_stopper queue
> cpu_stop_queue_work(cpu2=00(14990)) # add work (multi_cpu_stop) to cpu 00 cpu_stopper queue
> wait_for_completion() --> HERE
> """
>
> in my case, checking task structs for tasks scheduled when
> "waiting_for_completion()":
>
> PID 14990 CPU 05 -> PID 14996 CPU 00
> PID 14991 CPU 30 -> PID 14998 CPU 01
> PID 14992 CPU 30 -> PID 14998 CPU 01
> PID 14996 CPU 00 -> PID 14992 CPU 30
> PID 14998 CPU 01 -> PID 14990 CPU 05
>
> AND
>
> 102 2 6 ffff881fd2ea97f0 RU 0.0 0 0 [migration/6]
> 118 2 9 ffff883fd28ec7d0 RU 0.0 0 0 [migration/9]
> 143 2 14 ffff883fd29d47d0 RU 0.0 0 0 [migration/14]
> 148 2 15 ffff883fd29fc7d0 RU 0.0 0 0 [migration/15]
> 153 2 16 ffff881fd2f517f0 RU 0.0 0 0 [migration/16]
>
> THEN
>
> I am still waiting for 5 cpu_stopper_thread -> multi_cpu_stop works just
> scheduled (probably in the per-cpu queues of cpus 0,1,5,30), not
> running yet.
>
> AND
>
> I don't have any "wait_for_completion" for those "OLDER" migration
> threads (6, 9, 14, 15 and 16). Probably wait_for_completion signalled
> done.completion before racing.

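For reference, the rendezvous loop analyzed above looks roughly like this
(close to the kernel/stop_machine.c of that era, lightly simplified). Every
participating stopper must run and call ack_state() for the shared state to
advance, so a single work item stuck behind an unrelated work parks all of
its partners in this loop:

	static int multi_cpu_stop(void *data)
	{
		struct multi_stop_data *msdata = data;
		enum multi_stop_state curstate = MULTI_STOP_NONE;
		int cpu = smp_processor_id(), err = 0;
		unsigned long flags;
		bool is_active;

		/* irqs may already be disabled; save and restore on exit */
		local_save_flags(flags);

		if (!msdata->active_cpus)
			is_active = cpu == cpumask_first(cpu_online_mask);
		else
			is_active = cpumask_test_cpu(cpu, msdata->active_cpus);

		/* Simple state machine */
		do {
			/* Chill out and ensure we re-read multi_stop_state. */
			cpu_relax();
			if (msdata->state != curstate) {
				curstate = msdata->state;
				switch (curstate) {
				case MULTI_STOP_DISABLE_IRQ:
					local_irq_disable();
					hard_irq_disable();
					break;
				case MULTI_STOP_RUN:
					if (is_active)
						err = msdata->fn(msdata->data);
					break;
				default:
					break;
				}
				/* the last cpu to ack advances msdata->state */
				ack_state(msdata);
			}
		} while (curstate != MULTI_STOP_EXIT);

		local_irq_restore(flags);
		return err;
	}
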
And following this thread's discussion, and the commits below:

commit a1d9a3231eac4117cadaf4b6bba5b2902c15a33e
Author: Kirill Tkhai <[email protected]>
Date: Thu Apr 10 17:38:36 2014 +0400

sched: Check for stop task appearance when balancing happens

commit 37e117c07b89194aae7062bc63bde1104c03db02
Author: Peter Zijlstra <[email protected]>
Date: Fri Feb 14 12:25:08 2014 +0100

sched: Guarantee task priority in pick_next_task()

commit 38033c37faab850ed5d33bb675c4de6c66be84d8
Author: Peter Zijlstra <[email protected]>
Date: Thu Jan 23 20:32:21 2014 +0100

sched: Push down pre_schedule() and idle_balance()

commit 606dba2e289446600a0b68422ed2019af5355c12
Author: Peter Zijlstra <[email protected]>
Date: Sat Feb 11 06:05:00 2012 +0100

sched: Push put_prev_task() into pick_next_task()

The 3.13 kernel still had the old logic (before 3.15) - no RETRY_TASK, idle_balance() before pick_next_task(), no deadline scheduler yet - so commit 'a1d9a32' does not play a role in this panic. I'm causing ~150 stop_two_cpus() calls/sec for task migration in a 32-node fake NUMA environment, and I am NOT able to reproduce this lockup, but, still, the dump says it is there :\ For the 3.13 series this lockup was seen once; no info on other versions.

Any thoughts?

Thank you

-Rafael Tinoco

2015-06-15 19:47:26

by Peter Zijlstra

[permalink] [raw]
Subject: Re: sched: hang in migrate_swap

On Mon, Jun 15, 2015 at 04:38:21PM -0300, Rafael David Tinoco wrote:
> Any thoughts?

This recently came up again and I proposed the below. Reposting because
the original had a silly compile fail.

---
Subject: stop_machine: Fix deadlock between multiple stop_two_cpus()
From: Peter Zijlstra <[email protected]>
Date: Fri, 5 Jun 2015 17:30:23 +0200

Jiri reported a machine stuck in multi_cpu_stop() with
migrate_swap_stop() as function and with the following src,dst cpu
pairs: {11, 4} {13, 11} { 4, 13}

                        4       11      13

cpuM: queue(4 ,13)
                        *Ma
cpuN: queue(13,11)
                                *N      Na
                        *M              Mb
cpuO: queue(11, 4)
                        *O      Oa
                                *Nb
                        *Ob

Where *X denotes the cpu running the queueing of cpu-X and X[ab] denotes
the first/second queued work.

You'll observe the top of the workqueue for each cpu: 4,11,13 to be work
from cpus: M, O, N resp. IOW. deadlock.

Do away with the queueing trickery and introduce lg_double_lock() to
lock both CPUs and fully serialize the stop_two_cpus() callers instead
of the partial (and buggy) serialization we have now.

Completely untested..

Cc: Ingo Molnar <[email protected]>
Cc: Rik van Riel <[email protected]>
Reported-by: Jiri Olsa <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
---
include/linux/lglock.h | 5 +++++
kernel/locking/lglock.c | 22 ++++++++++++++++++++++
kernel/stop_machine.c | 42 +++++-------------------------------------
3 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 0081f000e34b..c92ebd100d9b 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -52,10 +52,15 @@ struct lglock {
 	static struct lglock name = { .lock = &name ## _lock }
 
 void lg_lock_init(struct lglock *lg, char *name);
+
 void lg_local_lock(struct lglock *lg);
 void lg_local_unlock(struct lglock *lg);
 void lg_local_lock_cpu(struct lglock *lg, int cpu);
 void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2);
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
+
 void lg_global_lock(struct lglock *lg);
 void lg_global_unlock(struct lglock *lg);
 
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index 86ae2aebf004..951cfcd10b4a 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -60,6 +60,28 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
 }
 EXPORT_SYMBOL(lg_local_unlock_cpu);
 
+void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
+{
+	BUG_ON(cpu1 == cpu2);
+
+	/* lock in cpu order, just like lg_global_lock */
+	if (cpu2 < cpu1)
+		swap(cpu1, cpu2);
+
+	preempt_disable();
+	lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
+	arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
+	arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
+}
+
+void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
+{
+	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
+	arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
+	arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
+	preempt_enable();
+}
+
 void lg_global_lock(struct lglock *lg)
 {
 	int i;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 695f0c6cd169..fd643d8c4b42 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -211,25 +211,6 @@ static int multi_cpu_stop(void *data)
 	return err;
 }
 
-struct irq_cpu_stop_queue_work_info {
-	int cpu1;
-	int cpu2;
-	struct cpu_stop_work *work1;
-	struct cpu_stop_work *work2;
-};
-
-/*
- * This function is always run with irqs and preemption disabled.
- * This guarantees that both work1 and work2 get queued, before
- * our local migrate thread gets the chance to preempt us.
- */
-static void irq_cpu_stop_queue_work(void *arg)
-{
-	struct irq_cpu_stop_queue_work_info *info = arg;
-	cpu_stop_queue_work(info->cpu1, info->work1);
-	cpu_stop_queue_work(info->cpu2, info->work2);
-}
-
 /**
  * stop_two_cpus - stops two cpus
  * @cpu1: the cpu to stop
@@ -245,7 +226,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 {
 	struct cpu_stop_done done;
 	struct cpu_stop_work work1, work2;
-	struct irq_cpu_stop_queue_work_info call_args;
 	struct multi_stop_data msdata;
 
 	preempt_disable();
@@ -262,13 +242,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		.done = &done
 	};
 
-	call_args = (struct irq_cpu_stop_queue_work_info){
-		.cpu1 = cpu1,
-		.cpu2 = cpu2,
-		.work1 = &work1,
-		.work2 = &work2,
-	};
-
 	cpu_stop_init_done(&done, 2);
 	set_state(&msdata, MULTI_STOP_PREPARE);
 
@@ -285,16 +258,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 		return -ENOENT;
 	}
 
-	lg_local_lock(&stop_cpus_lock);
-	/*
-	 * Queuing needs to be done by the lowest numbered CPU, to ensure
-	 * that works are always queued in the same order on every CPU.
-	 * This prevents deadlocks.
-	 */
-	smp_call_function_single(min(cpu1, cpu2),
-				 &irq_cpu_stop_queue_work,
-				 &call_args, 1);
-	lg_local_unlock(&stop_cpus_lock);
+	lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+	cpu_stop_queue_work(cpu1, &work1);
+	cpu_stop_queue_work(cpu2, &work2);
+	lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
 	preempt_enable();
 
 	wait_for_completion(&done.completion);
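
A note on the design choice: lg_double_lock() always takes the two per-cpu
locks in ascending cpu order (and lg_global_lock() iterates cpus in that
same order), so every queuer obeys one global lock order and the cyclic
wait in the {11,4} {13,11} {4,13} scenario above cannot form. The idea in
isolation, as a generic two-lock sketch (illustrative only, not kernel
code):

	/*
	 * Without the swap, caller A locking (4, 13) and caller B locking
	 * (13, 4) can each take their first lock and then spin forever on
	 * the other's. With the swap, both try lock 4 first, so whichever
	 * caller wins runs to completion and then releases both locks.
	 */
	static void double_lock(spinlock_t *locks, int a, int b)
	{
		if (b < a)
			swap(a, b);	/* enforce one global order */
		spin_lock(&locks[a]);
		spin_lock(&locks[b]);
	}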