2020-04-07 17:10:31

by Leon Romanovsky

[permalink] [raw]
Subject: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

From: Leon Romanovsky <[email protected]>

clockevents_switch_state() calls printk() from under hrtimer_bases.lock.
That causes lock inversion on scheduler locks because printk() can call
into the scheduler. Lockdep puts it as:

[ 728.464312] ====================================================================================================
[ 735.312580] TSC deadline timer enabled
[ 735.324143]
[ 735.324146] ======================================================
[ 735.324148] WARNING: possible circular locking dependency detected
[ 735.324150] 5.6.0-for-upstream-dbg-2020-04-03_10-44-43-70 #1 Not tainted
[ 735.324152] ------------------------------------------------------
[ 735.324154] swapper/3/0 is trying to acquire lock:
[ 735.324155] ffffffff8442c858 ((console_sem).lock){-...}-{2:2}, at: down_trylock+0x13/0x70
[ 735.324162]
[ 735.324164] but task is already holding lock:
[ 735.324165] ffff88842dfb9958 (hrtimer_bases.lock){-.-.}-{2:2}, at: lock_hrtimer_base+0x71/0x120
[ 735.324171]
[ 735.324173] which lock already depends on the new lock.
[ 735.324174]
[ 735.324175]
[ 735.324177] the existing dependency chain (in reverse order) is:
[ 735.324179]
[ 735.324180] -> #4 (hrtimer_bases.lock){-.-.}-{2:2}:
[ 735.324186] _raw_spin_lock_irqsave+0x3c/0x4b
[ 735.324187] lock_hrtimer_base+0x71/0x120
[ 735.324189] hrtimer_start_range_ns+0xc6/0xaa0
[ 735.324191] __enqueue_rt_entity+0xc44/0xf50
[ 735.324192] enqueue_rt_entity+0x79/0xc0
[ 735.324194] enqueue_task_rt+0x5c/0x2e0
[ 735.324195] activate_task+0x15a/0x2c0
[ 735.324197] ttwu_do_activate+0xcf/0x120
[ 735.324199] sched_ttwu_pending+0x160/0x230
[ 735.324200] scheduler_ipi+0x1c0/0x530
[ 735.324202] reschedule_interrupt+0xf/0x20
[ 735.324204] tick_nohz_idle_enter+0x16c/0x250
[ 735.324205] do_idle+0x90/0x530
[ 735.324207] cpu_startup_entry+0x19/0x20
[ 735.324208] start_secondary+0x2ee/0x3e0
[ 735.324210] secondary_startup_64+0xa4/0xb0
[ 735.324211]
[ 735.324212] -> #3 (&rt_b->rt_runtime_lock){-...}-{2:2}:
[ 735.324218] _raw_spin_lock+0x25/0x30
[ 735.324219] rq_online_rt+0x288/0x550
[ 735.324221] set_rq_online+0x11f/0x190
[ 735.324223] sched_cpu_activate+0x1d4/0x390
[ 735.324225] cpuhp_invoke_callback+0x1c5/0x1560
[ 735.324226] cpuhp_thread_fun+0x3f8/0x6f0
[ 735.324228] smpboot_thread_fn+0x305/0x5f0
[ 735.324229] kthread+0x2f8/0x3b0
[ 735.324231] ret_from_fork+0x24/0x30
[ 735.324232]
[ 735.324233] -> #2 (&rq->lock){-.-.}-{2:2}:
[ 735.324238] _raw_spin_lock+0x25/0x30
[ 735.324240] task_fork_fair+0x34/0x430
[ 735.324241] sched_fork+0x48a/0xa60
[ 735.324243] copy_process+0x15df/0x5970
[ 735.324244] _do_fork+0x106/0xcd0
[ 735.324246] kernel_thread+0x9e/0xe0
[ 735.324247] rest_init+0x28/0x330
[ 735.324249] start_kernel+0x6ac/0x6ed
[ 735.324251] secondary_startup_64+0xa4/0xb0
[ 735.324252]
[ 735.324253] -> #1 (&p->pi_lock){-.-.}-{2:2}:
[ 735.324258] _raw_spin_lock_irqsave+0x3c/0x4b
[ 735.324260] try_to_wake_up+0x9a/0x1700
[ 735.324261] up+0x7a/0xb0
[ 735.324263] __up_console_sem+0x3c/0x70
[ 735.324264] console_unlock+0x4f4/0xab0
[ 735.324266] con_font_op+0x907/0x1010
[ 735.324267] vt_ioctl+0x10a6/0x2890
[ 735.324269] tty_ioctl+0x257/0x1240
[ 735.324270] ksys_ioctl+0x3e9/0x1190
[ 735.324272] __x64_sys_ioctl+0x6f/0xb0
[ 735.324273] do_syscall_64+0xe7/0x12c0
[ 735.324275] entry_SYSCALL_64_after_hwframe+0x49/0xb3
[ 735.324276]
[ 735.324277] -> #0 ((console_sem).lock){-...}-{2:2}:
[ 735.324283] __lock_acquire+0x374a/0x5210
[ 735.324284] lock_acquire+0x1b9/0x920
[ 735.324286] _raw_spin_lock_irqsave+0x3c/0x4b
[ 735.324288] down_trylock+0x13/0x70
[ 735.324289] __down_trylock_console_sem+0x33/0xa0
[ 735.324291] console_trylock+0x13/0x60
[ 735.324292] vprintk_emit+0xec/0x370
[ 735.324294] printk+0x9c/0xc3
[ 735.324296] lapic_timer_set_oneshot+0x4e/0x60
[ 735.324297] clockevents_switch_state+0x1e1/0x360
[ 735.324299] tick_program_event+0xae/0xc0
[ 735.324301] hrtimer_start_range_ns+0x4b6/0xaa0
[ 735.324302] tick_nohz_idle_stop_tick+0x67c/0xa90
[ 735.324304] do_idle+0x326/0x530
[ 735.324305] cpu_startup_entry+0x19/0x20
[ 735.324307] start_secondary+0x2ee/0x3e0
[ 735.324309] secondary_startup_64+0xa4/0xb0
[ 735.324310]
[ 735.324311] other info that might help us debug this:
[ 735.324312]
[ 735.324314] Chain exists of:
[ 735.324315] (console_sem).lock --> &rt_b->rt_runtime_lock --> hrtimer_bases.lock
[ 735.324322]
[ 735.324324] Possible unsafe locking scenario:
[ 735.324325]
[ 735.324327] CPU0 CPU1
[ 735.324328] ---- ----
[ 735.324329] lock(hrtimer_bases.lock);
[ 735.324333] lock(&rt_b->rt_runtime_lock);
[ 735.324337] lock(hrtimer_bases.lock);
[ 735.324341] lock((console_sem).lock);
[ 735.324344]
[ 735.324345] *** DEADLOCK ***
[ 735.324346]
[ 735.324348] 1 lock held by swapper/3/0:
[ 735.324349] #0: ffff88842dfb9958 (hrtimer_bases.lock){-.-.}-{2:2}, at: lock_hrtimer_base+0x71/0x120
[ 735.324356]
[ 735.324357] stack backtrace:
[ 735.324360] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.6.0-for-upstream-dbg-2020-04-03_10-44-43-70 #1
[ 735.324363] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
[ 735.324364] Call Trace:
[ 735.324366] dump_stack+0xb7/0x10b
[ 735.324367] check_noncircular+0x37f/0x460
[ 735.324369] ? arch_stack_walk+0x7c/0xd0
[ 735.324370] ? print_circular_bug+0x4e0/0x4e0
[ 735.324372] ? mark_lock+0x1a4/0xb60
[ 735.324373] ? __lock_acquire+0x374a/0x5210
[ 735.324375] __lock_acquire+0x374a/0x5210
[ 735.324376] ? register_lock_class+0x17e0/0x17e0
[ 735.324378] ? register_lock_class+0x17e0/0x17e0
[ 735.324380] lock_acquire+0x1b9/0x920
[ 735.324381] ? down_trylock+0x13/0x70
[ 735.324383] ? check_flags.part.29+0x450/0x450
[ 735.324384] ? lock_downgrade+0x760/0x760
[ 735.324386] ? vprintk_emit+0xec/0x370
[ 735.324387] _raw_spin_lock_irqsave+0x3c/0x4b
[ 735.324389] ? down_trylock+0x13/0x70
[ 735.324390] down_trylock+0x13/0x70
[ 735.324392] __down_trylock_console_sem+0x33/0xa0
[ 735.324393] console_trylock+0x13/0x60
[ 735.324395] vprintk_emit+0xec/0x370
[ 735.324396] printk+0x9c/0xc3
[ 735.324398] ? kmsg_dump_rewind_nolock+0xd9/0xd9
[ 735.324399] lapic_timer_set_oneshot+0x4e/0x60
[ 735.324401] clockevents_switch_state+0x1e1/0x360
[ 735.324402] ? enqueue_hrtimer+0x116/0x310
[ 735.324404] tick_program_event+0xae/0xc0
[ 735.324406] hrtimer_start_range_ns+0x4b6/0xaa0
[ 735.324407] ? hrtimer_run_softirq+0x210/0x210
[ 735.324409] ? rcu_read_lock_sched_held+0xab/0xe0
[ 735.324410] ? rcu_read_lock_bh_held+0xe0/0xe0
[ 735.324412] tick_nohz_idle_stop_tick+0x67c/0xa90
[ 735.324413] ? tsc_verify_tsc_adjust+0x71/0x290
[ 735.324415] do_idle+0x326/0x530
[ 735.324416] ? arch_cpu_idle_exit+0x40/0x40
[ 735.324418] cpu_startup_entry+0x19/0x20
[ 735.324419] start_secondary+0x2ee/0x3e0
[ 735.324421] ? set_cpu_sibling_map+0x2f70/0x2f70
[ 735.324423] secondary_startup_64+0xa4/0xb0
[ 760.028504] ====================================================================================================

Fix by using deferred variant of printk which doesn't call to the scheduler.

Fixes: 279f1461432c ("x86: apic: Use tsc deadline for oneshot when available")
Signed-off-by: Leon Romanovsky <[email protected]>
---
It is far away from my main expertise and I'm not sure that the solution
is correct, but it definitely fixed our regression.
---
arch/x86/kernel/apic/apic.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d254cebdd3c3..6706b2cd9aec 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -353,7 +353,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
*/
asm volatile("mfence" : : : "memory");

- printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
+ printk_deferred_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}

--
2.25.1


2020-04-14 14:29:54

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

Hi,

Any feedback?
https://lore.kernel.org/lkml/[email protected]/

Thanks

On Tue, Apr 07, 2020 at 08:09:25PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <[email protected]>
>
> clockevents_switch_state() calls printk() from under hrtimer_bases.lock.
> That causes lock inversion on scheduler locks because printk() can call
> into the scheduler. Lockdep puts it as:
>
> [ 728.464312] ====================================================================================================
> [ 735.312580] TSC deadline timer enabled
> [ 735.324143]
> [ 735.324146] ======================================================
> [ 735.324148] WARNING: possible circular locking dependency detected
> [ 735.324150] 5.6.0-for-upstream-dbg-2020-04-03_10-44-43-70 #1 Not tainted
> [ 735.324152] ------------------------------------------------------
> [ 735.324154] swapper/3/0 is trying to acquire lock:
> [ 735.324155] ffffffff8442c858 ((console_sem).lock){-...}-{2:2}, at: down_trylock+0x13/0x70
> [ 735.324162]
> [ 735.324164] but task is already holding lock:
> [ 735.324165] ffff88842dfb9958 (hrtimer_bases.lock){-.-.}-{2:2}, at: lock_hrtimer_base+0x71/0x120
> [ 735.324171]
> [ 735.324173] which lock already depends on the new lock.
> [ 735.324174]
> [ 735.324175]
> [ 735.324177] the existing dependency chain (in reverse order) is:
> [ 735.324179]
> [ 735.324180] -> #4 (hrtimer_bases.lock){-.-.}-{2:2}:
> [ 735.324186] _raw_spin_lock_irqsave+0x3c/0x4b
> [ 735.324187] lock_hrtimer_base+0x71/0x120
> [ 735.324189] hrtimer_start_range_ns+0xc6/0xaa0
> [ 735.324191] __enqueue_rt_entity+0xc44/0xf50
> [ 735.324192] enqueue_rt_entity+0x79/0xc0
> [ 735.324194] enqueue_task_rt+0x5c/0x2e0
> [ 735.324195] activate_task+0x15a/0x2c0
> [ 735.324197] ttwu_do_activate+0xcf/0x120
> [ 735.324199] sched_ttwu_pending+0x160/0x230
> [ 735.324200] scheduler_ipi+0x1c0/0x530
> [ 735.324202] reschedule_interrupt+0xf/0x20
> [ 735.324204] tick_nohz_idle_enter+0x16c/0x250
> [ 735.324205] do_idle+0x90/0x530
> [ 735.324207] cpu_startup_entry+0x19/0x20
> [ 735.324208] start_secondary+0x2ee/0x3e0
> [ 735.324210] secondary_startup_64+0xa4/0xb0
> [ 735.324211]
> [ 735.324212] -> #3 (&rt_b->rt_runtime_lock){-...}-{2:2}:
> [ 735.324218] _raw_spin_lock+0x25/0x30
> [ 735.324219] rq_online_rt+0x288/0x550
> [ 735.324221] set_rq_online+0x11f/0x190
> [ 735.324223] sched_cpu_activate+0x1d4/0x390
> [ 735.324225] cpuhp_invoke_callback+0x1c5/0x1560
> [ 735.324226] cpuhp_thread_fun+0x3f8/0x6f0
> [ 735.324228] smpboot_thread_fn+0x305/0x5f0
> [ 735.324229] kthread+0x2f8/0x3b0
> [ 735.324231] ret_from_fork+0x24/0x30
> [ 735.324232]
> [ 735.324233] -> #2 (&rq->lock){-.-.}-{2:2}:
> [ 735.324238] _raw_spin_lock+0x25/0x30
> [ 735.324240] task_fork_fair+0x34/0x430
> [ 735.324241] sched_fork+0x48a/0xa60
> [ 735.324243] copy_process+0x15df/0x5970
> [ 735.324244] _do_fork+0x106/0xcd0
> [ 735.324246] kernel_thread+0x9e/0xe0
> [ 735.324247] rest_init+0x28/0x330
> [ 735.324249] start_kernel+0x6ac/0x6ed
> [ 735.324251] secondary_startup_64+0xa4/0xb0
> [ 735.324252]
> [ 735.324253] -> #1 (&p->pi_lock){-.-.}-{2:2}:
> [ 735.324258] _raw_spin_lock_irqsave+0x3c/0x4b
> [ 735.324260] try_to_wake_up+0x9a/0x1700
> [ 735.324261] up+0x7a/0xb0
> [ 735.324263] __up_console_sem+0x3c/0x70
> [ 735.324264] console_unlock+0x4f4/0xab0
> [ 735.324266] con_font_op+0x907/0x1010
> [ 735.324267] vt_ioctl+0x10a6/0x2890
> [ 735.324269] tty_ioctl+0x257/0x1240
> [ 735.324270] ksys_ioctl+0x3e9/0x1190
> [ 735.324272] __x64_sys_ioctl+0x6f/0xb0
> [ 735.324273] do_syscall_64+0xe7/0x12c0
> [ 735.324275] entry_SYSCALL_64_after_hwframe+0x49/0xb3
> [ 735.324276]
> [ 735.324277] -> #0 ((console_sem).lock){-...}-{2:2}:
> [ 735.324283] __lock_acquire+0x374a/0x5210
> [ 735.324284] lock_acquire+0x1b9/0x920
> [ 735.324286] _raw_spin_lock_irqsave+0x3c/0x4b
> [ 735.324288] down_trylock+0x13/0x70
> [ 735.324289] __down_trylock_console_sem+0x33/0xa0
> [ 735.324291] console_trylock+0x13/0x60
> [ 735.324292] vprintk_emit+0xec/0x370
> [ 735.324294] printk+0x9c/0xc3
> [ 735.324296] lapic_timer_set_oneshot+0x4e/0x60
> [ 735.324297] clockevents_switch_state+0x1e1/0x360
> [ 735.324299] tick_program_event+0xae/0xc0
> [ 735.324301] hrtimer_start_range_ns+0x4b6/0xaa0
> [ 735.324302] tick_nohz_idle_stop_tick+0x67c/0xa90
> [ 735.324304] do_idle+0x326/0x530
> [ 735.324305] cpu_startup_entry+0x19/0x20
> [ 735.324307] start_secondary+0x2ee/0x3e0
> [ 735.324309] secondary_startup_64+0xa4/0xb0
> [ 735.324310]
> [ 735.324311] other info that might help us debug this:
> [ 735.324312]
> [ 735.324314] Chain exists of:
> [ 735.324315] (console_sem).lock --> &rt_b->rt_runtime_lock --> hrtimer_bases.lock
> [ 735.324322]
> [ 735.324324] Possible unsafe locking scenario:
> [ 735.324325]
> [ 735.324327] CPU0 CPU1
> [ 735.324328] ---- ----
> [ 735.324329] lock(hrtimer_bases.lock);
> [ 735.324333] lock(&rt_b->rt_runtime_lock);
> [ 735.324337] lock(hrtimer_bases.lock);
> [ 735.324341] lock((console_sem).lock);
> [ 735.324344]
> [ 735.324345] *** DEADLOCK ***
> [ 735.324346]
> [ 735.324348] 1 lock held by swapper/3/0:
> [ 735.324349] #0: ffff88842dfb9958 (hrtimer_bases.lock){-.-.}-{2:2}, at: lock_hrtimer_base+0x71/0x120
> [ 735.324356]
> [ 735.324357] stack backtrace:
> [ 735.324360] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.6.0-for-upstream-dbg-2020-04-03_10-44-43-70 #1
> [ 735.324363] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
> [ 735.324364] Call Trace:
> [ 735.324366] dump_stack+0xb7/0x10b
> [ 735.324367] check_noncircular+0x37f/0x460
> [ 735.324369] ? arch_stack_walk+0x7c/0xd0
> [ 735.324370] ? print_circular_bug+0x4e0/0x4e0
> [ 735.324372] ? mark_lock+0x1a4/0xb60
> [ 735.324373] ? __lock_acquire+0x374a/0x5210
> [ 735.324375] __lock_acquire+0x374a/0x5210
> [ 735.324376] ? register_lock_class+0x17e0/0x17e0
> [ 735.324378] ? register_lock_class+0x17e0/0x17e0
> [ 735.324380] lock_acquire+0x1b9/0x920
> [ 735.324381] ? down_trylock+0x13/0x70
> [ 735.324383] ? check_flags.part.29+0x450/0x450
> [ 735.324384] ? lock_downgrade+0x760/0x760
> [ 735.324386] ? vprintk_emit+0xec/0x370
> [ 735.324387] _raw_spin_lock_irqsave+0x3c/0x4b
> [ 735.324389] ? down_trylock+0x13/0x70
> [ 735.324390] down_trylock+0x13/0x70
> [ 735.324392] __down_trylock_console_sem+0x33/0xa0
> [ 735.324393] console_trylock+0x13/0x60
> [ 735.324395] vprintk_emit+0xec/0x370
> [ 735.324396] printk+0x9c/0xc3
> [ 735.324398] ? kmsg_dump_rewind_nolock+0xd9/0xd9
> [ 735.324399] lapic_timer_set_oneshot+0x4e/0x60
> [ 735.324401] clockevents_switch_state+0x1e1/0x360
> [ 735.324402] ? enqueue_hrtimer+0x116/0x310
> [ 735.324404] tick_program_event+0xae/0xc0
> [ 735.324406] hrtimer_start_range_ns+0x4b6/0xaa0
> [ 735.324407] ? hrtimer_run_softirq+0x210/0x210
> [ 735.324409] ? rcu_read_lock_sched_held+0xab/0xe0
> [ 735.324410] ? rcu_read_lock_bh_held+0xe0/0xe0
> [ 735.324412] tick_nohz_idle_stop_tick+0x67c/0xa90
> [ 735.324413] ? tsc_verify_tsc_adjust+0x71/0x290
> [ 735.324415] do_idle+0x326/0x530
> [ 735.324416] ? arch_cpu_idle_exit+0x40/0x40
> [ 735.324418] cpu_startup_entry+0x19/0x20
> [ 735.324419] start_secondary+0x2ee/0x3e0
> [ 735.324421] ? set_cpu_sibling_map+0x2f70/0x2f70
> [ 735.324423] secondary_startup_64+0xa4/0xb0
> [ 760.028504] ====================================================================================================
>
> Fix by using deferred variant of printk which doesn't call to the scheduler.
>
> Fixes: 279f1461432c ("x86: apic: Use tsc deadline for oneshot when available")
> Signed-off-by: Leon Romanovsky <[email protected]>
> ---
> It is far away from my main expertise and I'm not sure that the solution
> is correct, but it definitely fixed our regression.
> ---
> arch/x86/kernel/apic/apic.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index d254cebdd3c3..6706b2cd9aec 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -353,7 +353,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
> */
> asm volatile("mfence" : : : "memory");
>
> - printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
> + printk_deferred_once(KERN_DEBUG "TSC deadline timer enabled\n");
> return;
> }
>
> --
> 2.25.1
>

2020-04-14 14:45:59

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

On Tue, Apr 14, 2020 at 08:24:54AM +0200, Ingo Molnar wrote:
>
> * Leon Romanovsky <[email protected]> wrote:
>
> > Hi,
> >
> > Any feedback?
> > https://lore.kernel.org/lkml/[email protected]/
>
> The fix definitely looks legit, lockdep is right that we shouldn't take
> the console_sem.lock even under trylock.
>
> It's only a printk_once(), yet I'm wondering why in the last ~8 years
> this never triggered. Nobody ever ran lockdep and debug console level
> enabled on such hardware, or did something else change?
>
> One possibility would be that apic_check_deadline_errata() marked almost
> all Intel systems as broken and the TSC-deadline hardware never actually
> got activated. In that case you have triggered rarely tested code and
> might see other weirdnesses. Just saying. :-)

It is hard to say what was changed, but after -rc1, we started to see
those lockdep splats almost in every run.

But if we are talking about other weirdnesses, I have another splat in my
QEMU machine, which is different, but from the same code area.

[ 1.383968] ACPI: Added _OSI(Module Device)
[ 1.385684] ACPI: Added _OSI(Processor Device)
[ 1.389345] ACPI: Added _OSI(3.0 _SCP Extensions)
[ 1.389345] ACPI: Added _OSI(Processor Aggregator Device)
[ 1.393454] ACPI: Added _OSI(Linux-Dell-Video)
[ 1.394920] ACPI: Added _OSI(Linux-Lenovo-NV-HDMI-Audio)
[ 1.396481] ACPI: Added _OSI(Linux-HPI-Hybrid-Graphics)
[ 1.793418] ACPI: 1 ACPI AML tables successfully acquired and loaded
[ 1.845452]
[ 1.846768] =============================
[ 1.849293] [ BUG: Invalid wait context ]
[ 1.849293] 5.7.0-rc1+ #1229 Not tainted
[ 1.849293] -----------------------------
[ 1.849293] swapper/1/0 is trying to lock:
[ 1.849293] ffff88806c8a4418 (&n->list_lock){-...}-{3:3}, at: deactivate_slab.isra.0+0x304/0x6d0
[ 1.849293] other info that might help us debug this:
[ 1.849293] context-{2:2}
[ 1.849293] 1 lock held by swapper/1/0:
[ 1.849293] #0: ffffffff826cb000 (rcu_callback){....}-{0:0}, at: rcu_core+0x317/0x9e0
[ 1.849293] stack backtrace:
[ 1.849293] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.7.0-rc1+ #1229
[ 1.849293] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 0
4/01/2014
[ 1.849293] Call Trace:
[ 1.849293] <IRQ>
[ 1.849293] dump_stack+0x97/0xe0
[ 1.849293] __lock_acquire.cold+0xd6/0x354
[ 1.849293] ? sched_clock_cpu+0x11f/0x140
[ 1.849293] ? mark_held_locks+0x90/0x90
[ 1.849293] ? __lock_acquire+0x8bf/0x3800
[ 1.849293] lock_acquire+0x155/0x4f0
[ 1.849293] ? deactivate_slab.isra.0+0x304/0x6d0
[ 1.849293] ? lock_release+0x660/0x660
[ 1.849293] ? sched_clock_local+0x99/0xc0
[ 1.849293] ? sched_clock_cpu+0x11f/0x140
[ 1.849293] _raw_spin_lock+0x31/0x80
[ 1.849293] ? deactivate_slab.isra.0+0x304/0x6d0
[ 1.849293] deactivate_slab.isra.0+0x304/0x6d0
[ 1.849293] ? pvclock_clocksource_read+0xd9/0x190
[ 1.849293] ? slub_cpu_dead+0xd0/0xd0
[ 1.849293] flush_cpu_slab+0x36/0x50
[ 1.849293] flush_smp_call_function_queue+0xc2/0x1e0
[ 1.849293] smp_call_function_interrupt+0x93/0x2f0
[ 1.849293] call_function_interrupt+0xf/0x20
[ 1.849293] RIP: 0010:quarantine_put+0xba/0x150
[ 1.849293] Code: e8 bb bb ec ff 48 8b 44 24 18 65 48 33 04 25 28 00 00 00 0f 85 a0 00 00 00 48 83 c4 20 5b 5d 41 5
c c3 e8 39 b8 ec ff 41 54 9d <eb> d9 48 89 e6 e8 cc fe ff ff 48 c7 c7 60 e3 76 82 e8 c0 f3 6b 00
[ 1.849293] RSP: 0000:ffff88806d209db0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff03
[ 1.849293] RAX: 0000000000000000 RBX: ffff88806be33658 RCX: dffffc0000000000
[ 1.849293] RDX: 0000000000000007 RSI: 0000000000000002 RDI: ffff88806cb5876c
[ 1.849293] RBP: ffff88806c80de00 R08: ffffffff811656ed R09: 0000000000000000
[ 1.849293] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000246
[ 1.849293] R13: ffff88806c80de00 R14: 0000000000000206 R15: 0000000000000000
[ 1.849293] ? call_function_interrupt+0xa/0x20
[ 1.849293] ? lockdep_hardirqs_on+0x17d/0x270
[ 1.849293] __kasan_slab_free+0x144/0x180
[ 1.849293] ? rcu_core+0x35c/0x9e0
[ 1.849293] kmem_cache_free+0xc6/0x370
[ 1.849293] ? migrate_pages+0xa70/0xa70
[ 1.849293] rcu_core+0x35c/0x9e0
[ 1.849293] ? rcu_core+0x317/0x9e0
[ 1.849293] ? rcu_note_context_switch+0x300/0x300
[ 1.849293] ? rcu_read_lock_sched_held+0xa1/0xd0
[ 1.849293] ? rcu_read_lock_bh_held+0xb0/0xb0
[ 1.849293] ? run_rebalance_domains+0x11d/0x140
[ 1.849293] __do_softirq+0x11c/0x612
[ 1.849293] ? rcu_read_lock_bh_held+0xb0/0xb0
[ 1.849293] irq_exit+0xfb/0x100
[ 1.849293] smp_apic_timer_interrupt+0x12e/0x360
[ 1.849293] apic_timer_interrupt+0xf/0x20
[ 1.849293] </IRQ>
[ 1.849293] RIP: 0010:default_idle+0x6e/0x250
[ 1.849293] Code: c7 c7 40 11 01 83 e8 f1 ec 93 ff 48 c7 c7 40 11 01 83 e8 b5 0c 94 ff 8b 05 6f a8 5f 01 85 c0 7e 0
7 0f 00 2d 84 f4 5f 00 fb f4 <be> 04 00 00 00 48 c7 c7 c8 bc 98 82 65 8b 2d 4f 48 60 7e e8 ba ec
[ 1.849293] RSP: 0000:ffff88806cb67dc8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
[ 1.849293] RAX: 0000000000000000 RBX: ffff88806cb58000 RCX: dffffc0000000000
[ 1.849293] RDX: 0000000000000003 RSI: 0000000000000004 RDI: ffffffff83011140
[ 1.849293] RBP: 0000000000000001 R08: ffffffff81a168cb R09: fffffbfff0602229
[ 1.849293] R10: 0000000000000003 R11: fffffbfff0602228 R12: ffffffff829a13f0
[ 1.849293] R13: 0000000000000000 R14: ffff88806cb58000 R15: 0000000000000000
[ 1.849293] ? default_idle+0x5b/0x250
[ 1.849293] ? default_idle+0x5b/0x250
[ 1.849293] do_idle+0x321/0x380
[ 1.849293] ? arch_cpu_idle_exit+0x40/0x40
[ 1.849293] ? mark_held_locks+0x1d/0x90
[ 1.849293] ? _raw_spin_unlock_irqrestore+0x3e/0x50
[ 1.849293] cpu_startup_entry+0x19/0x20
[ 1.849293] start_secondary+0x214/0x280
[ 1.849293] ? set_cpu_sibling_map+0x9e0/0x9e0
[ 1.849293] secondary_startup_64+0xa4/0xb0
[ 2.093362] ACPI: Interpreter enabled

>
> Or a bootup with "debug" specified is much more rare in production
> systems, hence the 8 years old bug.
>
> > > It is far away from my main expertise and I'm not sure that the solution
> > > is correct, but it definitely fixed our regression.
> > > ---
> > > arch/x86/kernel/apic/apic.c | 2 +-
> > > 1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> > > index d254cebdd3c3..6706b2cd9aec 100644
> > > --- a/arch/x86/kernel/apic/apic.c
> > > +++ b/arch/x86/kernel/apic/apic.c
> > > @@ -353,7 +353,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
> > > */
> > > asm volatile("mfence" : : : "memory");
> > >
> > > - printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
> > > + printk_deferred_once(KERN_DEBUG "TSC deadline timer enabled\n");
>
> I think we should move this essentially initialization-time message much
> earlier during bootup, when we are not holding any hrtimer locks.
>
> One good place would be apic_check_deadline_errata(). This place:
>
> if (boot_cpu_data.microcode >= rev)
> return;
>
> setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
> pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
> "please update microcode to version: 0x%x (or later)\n", rev);
>
> Could be something like:
>
> if (boot_cpu_data.microcode >= rev) {
> pr_debug("x86/apic: TSC deadline timer enabled.\n");
> return;
> }
>
> setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
> pr_err(FW_BUG "TSC_DEADLINE disabled due to CPU errata, please update microcode to version: 0x%x (or later)\n", rev);

I don't know what is x2apic_mode, but won't x2apic_mode == 1 prevent from setting timer?
If yes, we will print "enabled" message too early.

2091 void __init init_apic_mappings(void)
<...>
2095 apic_check_deadline_errata();
2096
2097 if (x2apic_mode) {
2098 boot_cpu_physical_apicid = read_apic_id();
2099 return;
2100 }

>
> (Note the small fixes I did to the errata message - we should do that and
> also move all user-facing messages into a single line while at it.)
>
> Thanks,
>
> Ingo

2020-04-15 10:29:51

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks


* Leon Romanovsky <[email protected]> wrote:

> Hi,
>
> Any feedback?
> https://lore.kernel.org/lkml/[email protected]/

The fix definitely looks legit, lockdep is right that we shouldn't take
the console_sem.lock even under trylock.

It's only a printk_once(), yet I'm wondering why in the last ~8 years
this never triggered. Nobody ever ran lockdep and debug console level
enabled on such hardware, or did something else change?

One possibility would be that apic_check_deadline_errata() marked almost
all Intel systems as broken and the TSC-deadline hardware never actually
got activated. In that case you have triggered rarely tested code and
might see other weirdnesses. Just saying. :-)

Or a bootup with "debug" specified is much more rare in production
systems, hence the 8 years old bug.

> > It is far away from my main expertise and I'm not sure that the solution
> > is correct, but it definitely fixed our regression.
> > ---
> > arch/x86/kernel/apic/apic.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> > index d254cebdd3c3..6706b2cd9aec 100644
> > --- a/arch/x86/kernel/apic/apic.c
> > +++ b/arch/x86/kernel/apic/apic.c
> > @@ -353,7 +353,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
> > */
> > asm volatile("mfence" : : : "memory");
> >
> > - printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
> > + printk_deferred_once(KERN_DEBUG "TSC deadline timer enabled\n");

I think we should move this essentially initialization-time message much
earlier during bootup, when we are not holding any hrtimer locks.

One good place would be apic_check_deadline_errata(). This place:

if (boot_cpu_data.microcode >= rev)
return;

setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
"please update microcode to version: 0x%x (or later)\n", rev);

Could be something like:

if (boot_cpu_data.microcode >= rev) {
pr_debug("x86/apic: TSC deadline timer enabled.\n");
return;
}

setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
pr_err(FW_BUG "TSC_DEADLINE disabled due to CPU errata, please update microcode to version: 0x%x (or later)\n", rev);

(Note the small fixes I did to the errata message - we should do that and
also move all user-facing messages into a single line while at it.)

Thanks,

Ingo

2020-04-23 07:17:45

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

On Tue, Apr 14, 2020 at 10:05:02AM +0300, Leon Romanovsky wrote:
> On Tue, Apr 14, 2020 at 08:24:54AM +0200, Ingo Molnar wrote:
> >
> > * Leon Romanovsky <[email protected]> wrote:
> >
> > > Hi,
> > >
> > > Any feedback?
> > > https://lore.kernel.org/lkml/[email protected]/
> >
> > The fix definitely looks legit, lockdep is right that we shouldn't take
> > the console_sem.lock even under trylock.
> >
> > It's only a printk_once(), yet I'm wondering why in the last ~8 years
> > this never triggered. Nobody ever ran lockdep and debug console level
> > enabled on such hardware, or did something else change?
> >
> > One possibility would be that apic_check_deadline_errata() marked almost
> > all Intel systems as broken and the TSC-deadline hardware never actually
> > got activated. In that case you have triggered rarely tested code and
> > might see other weirdnesses. Just saying. :-)
>
> It is hard to say what was changed, but after -rc1, we started to see
> those lockdep splats almost in every run.

Anything that I can help to speed up this patch?
Are you expecting anything from me that I missed to do?

Our regression is flooded with ACPI splats.

Thanks

>
> But if we are talking about other weirdnesses, I have another splat in my
> QEMU machine, which is different, but from the same code area.
>
> [ 1.383968] ACPI: Added _OSI(Module Device)
> [ 1.385684] ACPI: Added _OSI(Processor Device)
> [ 1.389345] ACPI: Added _OSI(3.0 _SCP Extensions)
> [ 1.389345] ACPI: Added _OSI(Processor Aggregator Device)
> [ 1.393454] ACPI: Added _OSI(Linux-Dell-Video)
> [ 1.394920] ACPI: Added _OSI(Linux-Lenovo-NV-HDMI-Audio)
> [ 1.396481] ACPI: Added _OSI(Linux-HPI-Hybrid-Graphics)
> [ 1.793418] ACPI: 1 ACPI AML tables successfully acquired and loaded
> [ 1.845452]
> [ 1.846768] =============================
> [ 1.849293] [ BUG: Invalid wait context ]
> [ 1.849293] 5.7.0-rc1+ #1229 Not tainted
> [ 1.849293] -----------------------------
> [ 1.849293] swapper/1/0 is trying to lock:
> [ 1.849293] ffff88806c8a4418 (&n->list_lock){-...}-{3:3}, at: deactivate_slab.isra.0+0x304/0x6d0
> [ 1.849293] other info that might help us debug this:
> [ 1.849293] context-{2:2}
> [ 1.849293] 1 lock held by swapper/1/0:
> [ 1.849293] #0: ffffffff826cb000 (rcu_callback){....}-{0:0}, at: rcu_core+0x317/0x9e0
> [ 1.849293] stack backtrace:
> [ 1.849293] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.7.0-rc1+ #1229
> [ 1.849293] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 0
> 4/01/2014
> [ 1.849293] Call Trace:
> [ 1.849293] <IRQ>
> [ 1.849293] dump_stack+0x97/0xe0
> [ 1.849293] __lock_acquire.cold+0xd6/0x354
> [ 1.849293] ? sched_clock_cpu+0x11f/0x140
> [ 1.849293] ? mark_held_locks+0x90/0x90
> [ 1.849293] ? __lock_acquire+0x8bf/0x3800
> [ 1.849293] lock_acquire+0x155/0x4f0
> [ 1.849293] ? deactivate_slab.isra.0+0x304/0x6d0
> [ 1.849293] ? lock_release+0x660/0x660
> [ 1.849293] ? sched_clock_local+0x99/0xc0
> [ 1.849293] ? sched_clock_cpu+0x11f/0x140
> [ 1.849293] _raw_spin_lock+0x31/0x80
> [ 1.849293] ? deactivate_slab.isra.0+0x304/0x6d0
> [ 1.849293] deactivate_slab.isra.0+0x304/0x6d0
> [ 1.849293] ? pvclock_clocksource_read+0xd9/0x190
> [ 1.849293] ? slub_cpu_dead+0xd0/0xd0
> [ 1.849293] flush_cpu_slab+0x36/0x50
> [ 1.849293] flush_smp_call_function_queue+0xc2/0x1e0
> [ 1.849293] smp_call_function_interrupt+0x93/0x2f0
> [ 1.849293] call_function_interrupt+0xf/0x20
> [ 1.849293] RIP: 0010:quarantine_put+0xba/0x150
> [ 1.849293] Code: e8 bb bb ec ff 48 8b 44 24 18 65 48 33 04 25 28 00 00 00 0f 85 a0 00 00 00 48 83 c4 20 5b 5d 41 5
> c c3 e8 39 b8 ec ff 41 54 9d <eb> d9 48 89 e6 e8 cc fe ff ff 48 c7 c7 60 e3 76 82 e8 c0 f3 6b 00
> [ 1.849293] RSP: 0000:ffff88806d209db0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff03
> [ 1.849293] RAX: 0000000000000000 RBX: ffff88806be33658 RCX: dffffc0000000000
> [ 1.849293] RDX: 0000000000000007 RSI: 0000000000000002 RDI: ffff88806cb5876c
> [ 1.849293] RBP: ffff88806c80de00 R08: ffffffff811656ed R09: 0000000000000000
> [ 1.849293] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000246
> [ 1.849293] R13: ffff88806c80de00 R14: 0000000000000206 R15: 0000000000000000
> [ 1.849293] ? call_function_interrupt+0xa/0x20
> [ 1.849293] ? lockdep_hardirqs_on+0x17d/0x270
> [ 1.849293] __kasan_slab_free+0x144/0x180
> [ 1.849293] ? rcu_core+0x35c/0x9e0
> [ 1.849293] kmem_cache_free+0xc6/0x370
> [ 1.849293] ? migrate_pages+0xa70/0xa70
> [ 1.849293] rcu_core+0x35c/0x9e0
> [ 1.849293] ? rcu_core+0x317/0x9e0
> [ 1.849293] ? rcu_note_context_switch+0x300/0x300
> [ 1.849293] ? rcu_read_lock_sched_held+0xa1/0xd0
> [ 1.849293] ? rcu_read_lock_bh_held+0xb0/0xb0
> [ 1.849293] ? run_rebalance_domains+0x11d/0x140
> [ 1.849293] __do_softirq+0x11c/0x612
> [ 1.849293] ? rcu_read_lock_bh_held+0xb0/0xb0
> [ 1.849293] irq_exit+0xfb/0x100
> [ 1.849293] smp_apic_timer_interrupt+0x12e/0x360
> [ 1.849293] apic_timer_interrupt+0xf/0x20
> [ 1.849293] </IRQ>
> [ 1.849293] RIP: 0010:default_idle+0x6e/0x250
> [ 1.849293] Code: c7 c7 40 11 01 83 e8 f1 ec 93 ff 48 c7 c7 40 11 01 83 e8 b5 0c 94 ff 8b 05 6f a8 5f 01 85 c0 7e 0
> 7 0f 00 2d 84 f4 5f 00 fb f4 <be> 04 00 00 00 48 c7 c7 c8 bc 98 82 65 8b 2d 4f 48 60 7e e8 ba ec
> [ 1.849293] RSP: 0000:ffff88806cb67dc8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
> [ 1.849293] RAX: 0000000000000000 RBX: ffff88806cb58000 RCX: dffffc0000000000
> [ 1.849293] RDX: 0000000000000003 RSI: 0000000000000004 RDI: ffffffff83011140
> [ 1.849293] RBP: 0000000000000001 R08: ffffffff81a168cb R09: fffffbfff0602229
> [ 1.849293] R10: 0000000000000003 R11: fffffbfff0602228 R12: ffffffff829a13f0
> [ 1.849293] R13: 0000000000000000 R14: ffff88806cb58000 R15: 0000000000000000
> [ 1.849293] ? default_idle+0x5b/0x250
> [ 1.849293] ? default_idle+0x5b/0x250
> [ 1.849293] do_idle+0x321/0x380
> [ 1.849293] ? arch_cpu_idle_exit+0x40/0x40
> [ 1.849293] ? mark_held_locks+0x1d/0x90
> [ 1.849293] ? _raw_spin_unlock_irqrestore+0x3e/0x50
> [ 1.849293] cpu_startup_entry+0x19/0x20
> [ 1.849293] start_secondary+0x214/0x280
> [ 1.849293] ? set_cpu_sibling_map+0x9e0/0x9e0
> [ 1.849293] secondary_startup_64+0xa4/0xb0
> [ 2.093362] ACPI: Interpreter enabled
>
> >
> > Or a bootup with "debug" specified is much more rare in production
> > systems, hence the 8 years old bug.
> >
> > > > It is far away from my main expertise and I'm not sure that the solution
> > > > is correct, but it definitely fixed our regression.
> > > > ---
> > > > arch/x86/kernel/apic/apic.c | 2 +-
> > > > 1 file changed, 1 insertion(+), 1 deletion(-)
> > > >
> > > > diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> > > > index d254cebdd3c3..6706b2cd9aec 100644
> > > > --- a/arch/x86/kernel/apic/apic.c
> > > > +++ b/arch/x86/kernel/apic/apic.c
> > > > @@ -353,7 +353,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
> > > > */
> > > > asm volatile("mfence" : : : "memory");
> > > >
> > > > - printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
> > > > + printk_deferred_once(KERN_DEBUG "TSC deadline timer enabled\n");
> >
> > I think we should move this essentially initialization-time message much
> > earlier during bootup, when we are not holding any hrtimer locks.
> >
> > One good place would be apic_check_deadline_errata(). This place:
> >
> > if (boot_cpu_data.microcode >= rev)
> > return;
> >
> > setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
> > pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
> > "please update microcode to version: 0x%x (or later)\n", rev);
> >
> > Could be something like:
> >
> > if (boot_cpu_data.microcode >= rev) {
> > pr_debug("x86/apic: TSC deadline timer enabled.\n");
> > return;
> > }
> >
> > setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
> > pr_err(FW_BUG "TSC_DEADLINE disabled due to CPU errata, please update microcode to version: 0x%x (or later)\n", rev);
>
> I don't know what is x2apic_mode, but won't x2apic_mode == 1 prevent from setting timer?
> If yes, we will print "enabled" message too early.
>
> 2091 void __init init_apic_mappings(void)
> <...>
> 2095 apic_check_deadline_errata();
> 2096
> 2097 if (x2apic_mode) {
> 2098 boot_cpu_physical_apicid = read_apic_id();
> 2099 return;
> 2100 }
>
> >
> > (Note the small fixes I did to the errata message - we should do that and
> > also move all user-facing messages into a single line while at it.)
> >
> > Thanks,
> >
> > Ingo

2020-04-27 11:12:25

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

Ingo Molnar <[email protected]> writes:
> * Leon Romanovsky <[email protected]> wrote:
> The fix definitely looks legit, lockdep is right that we shouldn't take
> the console_sem.lock even under trylock.
>
> It's only a printk_once(), yet I'm wondering why in the last ~8 years
> this never triggered. Nobody ever ran lockdep and debug console level
> enabled on such hardware, or did something else change?
>
> One possibility would be that apic_check_deadline_errata() marked almost
> all Intel systems as broken and the TSC-deadline hardware never actually
> got activated. In that case you have triggered rarely tested code and
> might see other weirdnesses. Just saying. :-)
>
> Or a bootup with "debug" specified is much more rare in production
> systems, hence the 8 years old bug.

None of this makes any sense at all.

The local APIC timer (in this case the TSC deadline timer) is set up
during early boot on the boot CPU (before SMP setup) with this call
chain:

smp_prepare_cpus()
native_smp_prepare_cpus()
x86_init.timers.setup_percpu_clockev()
setup_boot_APIC_clock()
setup_APIC_timer()
clockevents_config_and_register()
tick_check_new_device()
tick_setup_device()
tick_setup_oneshot()
clockevents_switch_state()
lapic_timer_set_oneshot()
__setup_APIC_LVTT()
printk_once(...)

Nothing holds hrtimer.base_lock in this call chain.

But the lockdep splat clearly says:

[ 735.324357] stack backtrace:
[ 735.324360] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.6.0-for-upstream-dbg-2020-04-03_10-44-43-70 #1

...

So how can that be the first invocation of that printk_once()?

While the patch looks innocent, it papers over the underlying problem
and wild theories are not really helping here.

Here is a boot log excerpt with lockdep enabled and 'debug' on the
command line:

[ 0.000000] Linux version 5.7.0-rc3 ...
...
[ 3.992125] TSC deadline timer enabled
[ 3.995820] smpboot: CPU0: Intel(R) ....
...
[ 4.050766] smp: Bringing up secondary CPUs ...

No splat nothing. The real question is WHY this triggers on Leons
machine 735 seconds after boot and on CPU3.

Thanks,

tglx

2020-04-27 11:35:16

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

On Mon, Apr 27, 2020 at 01:09:49PM +0200, Thomas Gleixner wrote:
> Ingo Molnar <[email protected]> writes:
> > * Leon Romanovsky <[email protected]> wrote:
> > The fix definitely looks legit, lockdep is right that we shouldn't take
> > the console_sem.lock even under trylock.
> >
> > It's only a printk_once(), yet I'm wondering why in the last ~8 years
> > this never triggered. Nobody ever ran lockdep and debug console level
> > enabled on such hardware, or did something else change?
> >
> > One possibility would be that apic_check_deadline_errata() marked almost
> > all Intel systems as broken and the TSC-deadline hardware never actually
> > got activated. In that case you have triggered rarely tested code and
> > might see other weirdnesses. Just saying. :-)
> >
> > Or a bootup with "debug" specified is much more rare in production
> > systems, hence the 8 years old bug.
>
> None of this makes any sense at all.
>
> The local APIC timer (in this case the TSC deadline timer) is set up
> during early boot on the boot CPU (before SMP setup) with this call
> chain:
>
> smp_prepare_cpus()
> native_smp_prepare_cpus()
> x86_init.timers.setup_percpu_clockev()
> setup_boot_APIC_clock()
> setup_APIC_timer()
> clockevents_config_and_register()
> tick_check_new_device()
> tick_setup_device()
> tick_setup_oneshot()
> clockevents_switch_state()
> lapic_timer_set_oneshot()
> __setup_APIC_LVTT()
> printk_once(...)
>
> Nothing holds hrtimer.base_lock in this call chain.

Can't printk hold that lock through console/netconsole?

>
> But the lockdep splat clearly says:
>
> [ 735.324357] stack backtrace:
> [ 735.324360] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.6.0-for-upstream-dbg-2020-04-03_10-44-43-70 #1
>
> ...
>
> So how can that be the first invocation of that printk_once()?
>
> While the patch looks innocent, it papers over the underlying problem
> and wild theories are not really helping here.
>
> Here is a boot log excerpt with lockdep enabled and 'debug' on the
> command line:
>
> [ 0.000000] Linux version 5.7.0-rc3 ...
> ...
> [ 3.992125] TSC deadline timer enabled
> [ 3.995820] smpboot: CPU0: Intel(R) ....
> ...
> [ 4.050766] smp: Bringing up secondary CPUs ...
>
> No splat nothing. The real question is WHY this triggers on Leons
> machine 735 seconds after boot and on CPU3.

I want to believe that the timestamp are not correct, have no clue if it
is even possible.

But let's talk about facts:
1. It is started after -rc1 (we don't test linux-next).
2. This workaround helped to eliminate the splat.
3. My machine experiences the extra splat all the time
https://lore.kernel.org/lkml/20200414070502.GR334007@unreal/

Unfortunately, I can't bisect because the failure mentioned in the
commit message because it doesn't happen on one machine all the time,
but when we are talking about night run regression, at least one of the
runners hits such lockdep prints.

I can add to our regression any debug patch and get results day after,
if it helps.

Thanks

>
> Thanks,
>
> tglx

2020-04-27 13:01:05

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

Leon Romanovsky <[email protected]> writes:
> On Mon, Apr 27, 2020 at 01:09:49PM +0200, Thomas Gleixner wrote:
>> The local APIC timer (in this case the TSC deadline timer) is set up
>> during early boot on the boot CPU (before SMP setup) with this call
>> chain:
>>
>> smp_prepare_cpus()
>> native_smp_prepare_cpus()
>> x86_init.timers.setup_percpu_clockev()
>> setup_boot_APIC_clock()
>> setup_APIC_timer()
>> clockevents_config_and_register()
>> tick_check_new_device()
>> tick_setup_device()
>> tick_setup_oneshot()
>> clockevents_switch_state()
>> lapic_timer_set_oneshot()
>> __setup_APIC_LVTT()
>> printk_once(...)
>>
>> Nothing holds hrtimer.base_lock in this call chain.
>
> Can't printk hold that lock through console/netconsole?

How so?

The lockdep splat is about this:

[ 735.324154] swapper/3/0 is trying to acquire lock:
[ 735.324155] ffffffff8442c858 ((console_sem).lock){-...}-{2:2}, at: down_trylock+0x13/0x70
[ 735.324162]
[ 735.324164] but task is already holding lock:
[ 735.324165] ffff88842dfb9958 (hrtimer_bases.lock){-.-.}-{2:2}, at: lock_hrtimer_base+0x71/0x120

and the call chain is:

[ 735.324283] __lock_acquire+0x374a/0x5210
[ 735.324284] lock_acquire+0x1b9/0x920
[ 735.324286] _raw_spin_lock_irqsave+0x3c/0x4b
[ 735.324288] down_trylock+0x13/0x70
[ 735.324289] __down_trylock_console_sem+0x33/0xa0
[ 735.324291] console_trylock+0x13/0x60
[ 735.324292] vprintk_emit+0xec/0x370
[ 735.324294] printk+0x9c/0xc3
[ 735.324296] lapic_timer_set_oneshot+0x4e/0x60
[ 735.324297] clockevents_switch_state+0x1e1/0x360
[ 735.324299] tick_program_event+0xae/0xc0
[ 735.324301] hrtimer_start_range_ns+0x4b6/0xaa0
[ 735.324302] tick_nohz_idle_stop_tick+0x67c/0xa90
[ 735.324304] do_idle+0x326/0x530
[ 735.324305] cpu_startup_entry+0x19/0x20
[ 735.324307] start_secondary+0x2ee/0x3e0
[ 735.324309] secondary_startup_64+0xa4/0xb0

hrtimer_start_range_ns() clearly holds htimer_base::lock

>> But the lockdep splat clearly says:
>>
>> [ 735.324357] stack backtrace:
>> [ 735.324360] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.6.0-for-upstream-dbg-2020-04-03_10-44-43-70 #1
>>
>> ...
>>
>> So how can that be the first invocation of that printk_once()?
>>
>> While the patch looks innocent, it papers over the underlying problem
>> and wild theories are not really helping here.
>>
>> Here is a boot log excerpt with lockdep enabled and 'debug' on the
>> command line:
>>
>> [ 0.000000] Linux version 5.7.0-rc3 ...
>> ...
>> [ 3.992125] TSC deadline timer enabled
>> [ 3.995820] smpboot: CPU0: Intel(R) ....
>> ...
>> [ 4.050766] smp: Bringing up secondary CPUs ...
>>
>> No splat nothing. The real question is WHY this triggers on Leons
>> machine 735 seconds after boot and on CPU3.
>
> I want to believe that the timestamp are not correct, have no clue if it
> is even possible.

It does not matter whether the timestamps are correct or not. Even if
they are not, then this does not change the fact that this happens on
CPU3 way after the first invocation of __setup_APIC_LVTT() on CPU0 which
simply cannot trigger this splat.

Can you please provide the full dmesg with the splat?

> But let's talk about facts:
> 1. It is started after -rc1 (we don't test linux-next).

Is that a plain kernel from Linus tree or do you have other patches
applied?

A .config file would be appreciated as well along with information about
the hardware or whatever this runs on.

> 2. This workaround helped to eliminate the splat.

It's eliminating the symptom, but this does not make the root cause go
away nor does it explain anything.

> 3. My machine experiences the extra splat all the time
> https://lore.kernel.org/lkml/20200414070502.GR334007@unreal/

which is completely unrelated.

Thanks,

tglx

2020-04-27 13:44:22

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

On Mon, Apr 27, 2020 at 02:59:00PM +0200, Thomas Gleixner wrote:
> Leon Romanovsky <[email protected]> writes:
> > On Mon, Apr 27, 2020 at 01:09:49PM +0200, Thomas Gleixner wrote:
> >> The local APIC timer (in this case the TSC deadline timer) is set up
> >> during early boot on the boot CPU (before SMP setup) with this call
> >> chain:
> >>
> >> smp_prepare_cpus()
> >> native_smp_prepare_cpus()
> >> x86_init.timers.setup_percpu_clockev()
> >> setup_boot_APIC_clock()
> >> setup_APIC_timer()
> >> clockevents_config_and_register()
> >> tick_check_new_device()
> >> tick_setup_device()
> >> tick_setup_oneshot()
> >> clockevents_switch_state()
> >> lapic_timer_set_oneshot()
> >> __setup_APIC_LVTT()
> >> printk_once(...)
> >>
> >> Nothing holds hrtimer.base_lock in this call chain.
> >
> > Can't printk hold that lock through console/netconsole?
>
> How so?

OK, I consulted with verification people and back then the trigger was:
Reproduce when run "echo 1 > /sys/kernel/debug/clear_warn_once" after reboot

>
> The lockdep splat is about this:
>
> [ 735.324154] swapper/3/0 is trying to acquire lock:
> [ 735.324155] ffffffff8442c858 ((console_sem).lock){-...}-{2:2}, at: down_trylock+0x13/0x70
> [ 735.324162]
> [ 735.324164] but task is already holding lock:
> [ 735.324165] ffff88842dfb9958 (hrtimer_bases.lock){-.-.}-{2:2}, at: lock_hrtimer_base+0x71/0x120
>
> and the call chain is:
>
> [ 735.324283] __lock_acquire+0x374a/0x5210
> [ 735.324284] lock_acquire+0x1b9/0x920
> [ 735.324286] _raw_spin_lock_irqsave+0x3c/0x4b
> [ 735.324288] down_trylock+0x13/0x70
> [ 735.324289] __down_trylock_console_sem+0x33/0xa0
> [ 735.324291] console_trylock+0x13/0x60
> [ 735.324292] vprintk_emit+0xec/0x370
> [ 735.324294] printk+0x9c/0xc3
> [ 735.324296] lapic_timer_set_oneshot+0x4e/0x60
> [ 735.324297] clockevents_switch_state+0x1e1/0x360
> [ 735.324299] tick_program_event+0xae/0xc0
> [ 735.324301] hrtimer_start_range_ns+0x4b6/0xaa0
> [ 735.324302] tick_nohz_idle_stop_tick+0x67c/0xa90
> [ 735.324304] do_idle+0x326/0x530
> [ 735.324305] cpu_startup_entry+0x19/0x20
> [ 735.324307] start_secondary+0x2ee/0x3e0
> [ 735.324309] secondary_startup_64+0xa4/0xb0
>
> hrtimer_start_range_ns() clearly holds htimer_base::lock
>
> >> But the lockdep splat clearly says:
> >>
> >> [ 735.324357] stack backtrace:
> >> [ 735.324360] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.6.0-for-upstream-dbg-2020-04-03_10-44-43-70 #1
> >>
> >> ...
> >>
> >> So how can that be the first invocation of that printk_once()?
> >>
> >> While the patch looks innocent, it papers over the underlying problem
> >> and wild theories are not really helping here.
> >>
> >> Here is a boot log excerpt with lockdep enabled and 'debug' on the
> >> command line:
> >>
> >> [ 0.000000] Linux version 5.7.0-rc3 ...
> >> ...
> >> [ 3.992125] TSC deadline timer enabled
> >> [ 3.995820] smpboot: CPU0: Intel(R) ....
> >> ...
> >> [ 4.050766] smp: Bringing up secondary CPUs ...
> >>
> >> No splat nothing. The real question is WHY this triggers on Leons
> >> machine 735 seconds after boot and on CPU3.
> >
> > I want to believe that the timestamp are not correct, have no clue if it
> > is even possible.
>
> It does not matter whether the timestamps are correct or not. Even if
> they are not, then this does not change the fact that this happens on
> CPU3 way after the first invocation of __setup_APIC_LVTT() on CPU0 which
> simply cannot trigger this splat.
>
> Can you please provide the full dmesg with the splat?

Attached

>
> > But let's talk about facts:
> > 1. It is started after -rc1 (we don't test linux-next).
>
> Is that a plain kernel from Linus tree or do you have other patches
> applied?

It is clean one.

>
> A .config file would be appreciated as well along with information about
> the hardware or whatever this runs on.

This is passthrough VM with Mellanox NICs in it, everything else is
virtual.

>
> > 2. This workaround helped to eliminate the splat.
>
> It's eliminating the symptom, but this does not make the root cause go
> away nor does it explain anything.
>
> > 3. My machine experiences the extra splat all the time
> > https://lore.kernel.org/lkml/20200414070502.GR334007@unreal/
>
> which is completely unrelated.
>
> Thanks,
>
> tglx


Attachments:
(No filename) (4.59 kB)
bug_2131735 (52.66 kB)
config.gz (40.06 kB)
Download all attachments

2020-04-27 15:36:36

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

Leon Romanovsky <[email protected]> writes:
> OK, I consulted with verification people and back then the trigger was:
> Reproduce when run "echo 1 > /sys/kernel/debug/clear_warn_once" after
> reboot

That explains it.

> [ 0.937310] Freeing SMP alternatives memory: 32K
> [ 0.940471] TSC deadline timer enabled

So here is the first one which sets 'once'. Of course if you clear 'once'
afterwards then this triggers because the context is completely
different.

So the right thing to do is to move this out of __setup_APIC_LVTT() and
be done with it.

Thanks

tglx

8<-------------------
Subject: x86/apic: Move TSC deadline timer debug printk
From: Thomas Gleixner <[email protected]>
Date: Mon, 27 Apr 2020 16:55:57 +0200

Leon reported that the printk_once() in __setup_APIC_LVTT() triggers a
lockdep splat due to a lock order violation between hrtimer_base::lock and
console_sem, when the 'once' condition is reset via
/sys/kernel/debug/clear_warn_once after boot.

The initial printk cannot trigger this because that happens during boot
when the local APIC timer is set up on the boot CPU.

Prevent it by moving the printk to a place which is guaranteed to be only
called once during boot.

Mark the deadline timer check related functions and data __init while at
it.

Reported-by: Leon Romanovsky <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/apic/apic.c | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)

--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -352,8 +352,6 @@ static void __setup_APIC_LVTT(unsigned i
* According to Intel, MFENCE can do the serialization here.
*/
asm volatile("mfence" : : : "memory");
-
- printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}

@@ -546,7 +544,7 @@ static struct clock_event_device lapic_c
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);

-static u32 hsx_deadline_rev(void)
+static __init u32 hsx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x3a; /* EP */
@@ -556,7 +554,7 @@ static u32 hsx_deadline_rev(void)
return ~0U;
}

-static u32 bdx_deadline_rev(void)
+static __init u32 bdx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x00000011;
@@ -568,7 +566,7 @@ static u32 bdx_deadline_rev(void)
return ~0U;
}

-static u32 skx_deadline_rev(void)
+static __init u32 skx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x03: return 0x01000136;
@@ -581,7 +579,7 @@ static u32 skx_deadline_rev(void)
return ~0U;
}

-static const struct x86_cpu_id deadline_match[] = {
+static const struct x86_cpu_id deadline_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev),
X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020),
X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev),
@@ -603,18 +601,19 @@ static const struct x86_cpu_id deadline_
{},
};

-static void apic_check_deadline_errata(void)
+static __init bool apic_validate_deadline_timer(void)
{
const struct x86_cpu_id *m;
u32 rev;

- if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
- boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return;
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;

m = x86_match_cpu(deadline_match);
if (!m)
- return;
+ return true;

/*
* Function pointers will have the MSB set due to address layout,
@@ -626,11 +625,12 @@ static void apic_check_deadline_errata(v
rev = (u32)m->driver_data;

if (boot_cpu_data.microcode >= rev)
- return;
+ return true;

setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
"please update microcode to version: 0x%x (or later)\n", rev);
+ return false;
}

/*
@@ -2092,7 +2092,8 @@ void __init init_apic_mappings(void)
{
unsigned int new_apicid;

- apic_check_deadline_errata();
+ if (apic_validate_deadline_timer())
+ pr_debug("TSC deadline timer available\n");

if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();

2020-04-27 15:37:33

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

Leon Romanovsky <[email protected]> writes:
> But if we are talking about other weirdnesses, I have another splat in my
> QEMU machine, which is different, but from the same code area.

It's a completely different code area, really. This triggers in
deactivate_slab().

> [ 1.383968] ACPI: Added _OSI(Module Device)
> [ 1.385684] ACPI: Added _OSI(Processor Device)
> [ 1.389345] ACPI: Added _OSI(3.0 _SCP Extensions)
> [ 1.389345] ACPI: Added _OSI(Processor Aggregator Device)
> [ 1.393454] ACPI: Added _OSI(Linux-Dell-Video)
> [ 1.394920] ACPI: Added _OSI(Linux-Lenovo-NV-HDMI-Audio)
> [ 1.396481] ACPI: Added _OSI(Linux-HPI-Hybrid-Graphics)
> [ 1.793418] ACPI: 1 ACPI AML tables successfully acquired and loaded
> [ 1.845452]
> [ 1.846768] =============================
> [ 1.849293] [ BUG: Invalid wait context ]

Do you have CONFIG_PROVE_RAW_LOCK_NESTING enabled? If yes, please
disable it. The Kconfig option explains that this will trigger
splats.

If not, then this is mysterious but has absolutely nothing to do with
the other thing. It's also completely unrelated to ACPI ...

Thanks,

tglx

2020-04-27 15:56:05

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

On Mon, Apr 27, 2020 at 05:31:21PM +0200, Thomas Gleixner wrote:
> Leon Romanovsky <[email protected]> writes:
> > OK, I consulted with verification people and back then the trigger was:
> > Reproduce when run "echo 1 > /sys/kernel/debug/clear_warn_once" after
> > reboot
>
> That explains it.
>
> > [ 0.937310] Freeing SMP alternatives memory: 32K
> > [ 0.940471] TSC deadline timer enabled
>
> So here is the first one which sets 'once'. Of course if you clear 'once'
> afterwards then this triggers because the context is completely
> different.
>
> So the right thing to do is to move this out of __setup_APIC_LVTT() and
> be done with it.

Thanks a lot.

2020-04-27 16:36:09

by Leon Romanovsky

[permalink] [raw]
Subject: Re: [PATCH] x86/apic: Fix circular locking dependency between console and hrtimer locks

On Mon, Apr 27, 2020 at 05:35:18PM +0200, Thomas Gleixner wrote:
> Leon Romanovsky <[email protected]> writes:
> > But if we are talking about other weirdnesses, I have another splat in my
> > QEMU machine, which is different, but from the same code area.
>
> It's a completely different code area, really. This triggers in
> deactivate_slab().
>
> > [ 1.383968] ACPI: Added _OSI(Module Device)
> > [ 1.385684] ACPI: Added _OSI(Processor Device)
> > [ 1.389345] ACPI: Added _OSI(3.0 _SCP Extensions)
> > [ 1.389345] ACPI: Added _OSI(Processor Aggregator Device)
> > [ 1.393454] ACPI: Added _OSI(Linux-Dell-Video)
> > [ 1.394920] ACPI: Added _OSI(Linux-Lenovo-NV-HDMI-Audio)
> > [ 1.396481] ACPI: Added _OSI(Linux-HPI-Hybrid-Graphics)
> > [ 1.793418] ACPI: 1 ACPI AML tables successfully acquired and loaded
> > [ 1.845452]
> > [ 1.846768] =============================
> > [ 1.849293] [ BUG: Invalid wait context ]
>
> Do you have CONFIG_PROVE_RAW_LOCK_NESTING enabled? If yes, please
> disable it. The Kconfig option explains that this will trigger
> splats.

Yes, I have it enabled.

Thanks for the tip.

>
> If not, then this is mysterious but has absolutely nothing to do with
> the other thing. It's also completely unrelated to ACPI ...
>
> Thanks,
>
> tglx

2020-05-01 18:25:47

by tip-bot2 for Tony Luck

[permalink] [raw]
Subject: [tip: x86/urgent] x86/apic: Move TSC deadline timer debug printk

The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: c84cb3735fd53c91101ccdb191f2e3331a9262cb
Gitweb: https://git.kernel.org/tip/c84cb3735fd53c91101ccdb191f2e3331a9262cb
Author: Thomas Gleixner <[email protected]>
AuthorDate: Mon, 27 Apr 2020 16:55:57 +02:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Fri, 01 May 2020 19:15:41 +02:00

x86/apic: Move TSC deadline timer debug printk

Leon reported that the printk_once() in __setup_APIC_LVTT() triggers a
lockdep splat due to a lock order violation between hrtimer_base::lock and
console_sem, when the 'once' condition is reset via
/sys/kernel/debug/clear_warn_once after boot.

The initial printk cannot trigger this because that happens during boot
when the local APIC timer is set up on the boot CPU.

Prevent it by moving the printk to a place which is guaranteed to be only
called once during boot.

Mark the deadline timer check related functions and data __init while at
it.

Reported-by: Leon Romanovsky <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]

---
arch/x86/kernel/apic/apic.c | 27 ++++++++++++++-------------
1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 81b9c63..e53dda2 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -352,8 +352,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
* According to Intel, MFENCE can do the serialization here.
*/
asm volatile("mfence" : : : "memory");
-
- printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}

@@ -546,7 +544,7 @@ static struct clock_event_device lapic_clockevent = {
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);

-static u32 hsx_deadline_rev(void)
+static __init u32 hsx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x3a; /* EP */
@@ -556,7 +554,7 @@ static u32 hsx_deadline_rev(void)
return ~0U;
}

-static u32 bdx_deadline_rev(void)
+static __init u32 bdx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x00000011;
@@ -568,7 +566,7 @@ static u32 bdx_deadline_rev(void)
return ~0U;
}

-static u32 skx_deadline_rev(void)
+static __init u32 skx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x03: return 0x01000136;
@@ -581,7 +579,7 @@ static u32 skx_deadline_rev(void)
return ~0U;
}

-static const struct x86_cpu_id deadline_match[] = {
+static const struct x86_cpu_id deadline_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev),
X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020),
X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev),
@@ -603,18 +601,19 @@ static const struct x86_cpu_id deadline_match[] = {
{},
};

-static void apic_check_deadline_errata(void)
+static __init bool apic_validate_deadline_timer(void)
{
const struct x86_cpu_id *m;
u32 rev;

- if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
- boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return;
+ if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;

m = x86_match_cpu(deadline_match);
if (!m)
- return;
+ return true;

/*
* Function pointers will have the MSB set due to address layout,
@@ -626,11 +625,12 @@ static void apic_check_deadline_errata(void)
rev = (u32)m->driver_data;

if (boot_cpu_data.microcode >= rev)
- return;
+ return true;

setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
"please update microcode to version: 0x%x (or later)\n", rev);
+ return false;
}

/*
@@ -2092,7 +2092,8 @@ void __init init_apic_mappings(void)
{
unsigned int new_apicid;

- apic_check_deadline_errata();
+ if (apic_validate_deadline_timer())
+ pr_debug("TSC deadline timer available\n");

if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();