Oleg suggested replacing the "watchdog/%u" threads with
cpu_stop_work. That removes one thread per CPU while at the same time
fixing softlockup vs SCHED_DEADLINE.
But more importantly, it does away with the single
smpboot_update_cpumask_percpu_thread() user, which allows
cleanups/shrinkage of the smpboot interface.
Cc: Don Zickus <[email protected]>
Suggested-by: Oleg Nesterov <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
include/linux/cpuhotplug.h | 1
include/linux/nmi.h | 5 +
kernel/cpu.c | 5 +
kernel/watchdog.c | 137 +++++++++++++++++++--------------------------
4 files changed, 71 insertions(+), 77 deletions(-)
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -164,6 +164,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
+ CPUHP_AP_WATCHDOG_ONLINE,
CPUHP_AP_WORKQUEUE_ONLINE,
CPUHP_AP_RCUTREE_ONLINE,
CPUHP_AP_ONLINE_DYN,
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -33,10 +33,15 @@ extern int sysctl_hardlockup_all_cpu_bac
#define sysctl_hardlockup_all_cpu_backtrace 0
#endif /* !CONFIG_SMP */
+extern int lockup_detector_online_cpu(unsigned int cpu);
+extern int lockup_detector_offline_cpu(unsigned int cpu);
+
#else /* CONFIG_LOCKUP_DETECTOR */
static inline void lockup_detector_init(void) { }
static inline void lockup_detector_soft_poweroff(void) { }
static inline void lockup_detector_cleanup(void) { }
+#define lockup_detector_online_cpu NULL
+#define lockup_detector_offline_cpu NULL
#endif /* !CONFIG_LOCKUP_DETECTOR */
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
+ [CPUHP_AP_WATCHDOG_ONLINE] = {
+ .name = "lockup_detector:online",
+ .startup.single = lockup_detector_online_cpu,
+ .teardown.single = lockup_detector_offline_cpu,
+ },
[CPUHP_AP_WORKQUEUE_ONLINE] = {
.name = "workqueue:online",
.startup.single = workqueue_online_cpu,
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -18,18 +18,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sysctl.h>
-#include <linux/smpboot.h>
-#include <linux/sched/rt.h>
-#include <uapi/linux/sched/types.h>
#include <linux/tick.h>
-#include <linux/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
+#include <linux/stop_machine.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
-#include <linux/kthread.h>
static DEFINE_MUTEX(watchdog_mutex);
@@ -169,11 +165,10 @@ static void lockup_detector_update_enabl
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-static bool softlockup_threads_initialized __read_mostly;
+static bool softlockup_initialized __read_mostly;
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
-static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
@@ -335,6 +330,25 @@ static void watchdog_interrupt_count(voi
__this_cpu_inc(hrtimer_interrupts);
}
+/*
+ * The watchdog thread function - touches the timestamp.
+ *
+ * It only runs once every sample_period seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ */
+static int softlockup_fn(void *data)
+{
+ __this_cpu_write(soft_lockup_hrtimer_cnt,
+ __this_cpu_read(hrtimer_interrupts));
+ __touch_watchdog();
+
+ return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
+
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@@ -350,7 +364,9 @@ static enum hrtimer_restart watchdog_tim
watchdog_interrupt_count();
/* kick the softlockup detector */
- wake_up_process(__this_cpu_read(softlockup_watchdog));
+ stop_one_cpu_nowait(smp_processor_id(),
+ softlockup_fn, NULL,
+ this_cpu_ptr(&softlockup_stop_work));
/* .. and repeat */
hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
@@ -448,17 +464,12 @@ static enum hrtimer_restart watchdog_tim
return HRTIMER_RESTART;
}
-static void watchdog_set_prio(unsigned int policy, unsigned int prio)
-{
- struct sched_param param = { .sched_priority = prio };
-
- sched_setscheduler(current, policy, &param);
-}
-
static void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
/*
* Start the timer first to prevent the NMI watchdog triggering
* before the timer has a chance to fire.
@@ -473,15 +484,14 @@ static void watchdog_enable(unsigned int
/* Enable the perf event */
if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
watchdog_nmi_enable(cpu);
-
- watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
}
static void watchdog_disable(unsigned int cpu)
{
struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
- watchdog_set_prio(SCHED_NORMAL, 0);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+
/*
* Disable the perf event first. That prevents that a large delay
* between disabling the timer and disabling the perf event causes
@@ -491,77 +501,63 @@ static void watchdog_disable(unsigned in
hrtimer_cancel(hrtimer);
}
-static void watchdog_cleanup(unsigned int cpu, bool online)
+static int softlockup_stop_fn(void *data)
{
- watchdog_disable(cpu);
+ watchdog_disable(smp_processor_id());
+ return 0;
}
-static int watchdog_should_run(unsigned int cpu)
+static void softlockup_stop_all(void)
{
- return __this_cpu_read(hrtimer_interrupts) !=
- __this_cpu_read(soft_lockup_hrtimer_cnt);
+ int cpu;
+
+ if (!softlockup_initialized)
+ return;
+
+ for_each_cpu(cpu, &watchdog_allowed_mask)
+ smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
+
+ cpumask_clear(&watchdog_allowed_mask);
}
-/*
- * The watchdog thread function - touches the timestamp.
- *
- * It only runs once every sample_period seconds (4 seconds by
- * default) to reset the softlockup timestamp. If this gets delayed
- * for more than 2*watchdog_thresh seconds then the debug-printout
- * triggers in watchdog_timer_fn().
- */
-static void watchdog(unsigned int cpu)
+static int softlockup_start_fn(void *data)
{
- __this_cpu_write(soft_lockup_hrtimer_cnt,
- __this_cpu_read(hrtimer_interrupts));
- __touch_watchdog();
+ watchdog_enable(smp_processor_id());
+ return 0;
}
-static struct smp_hotplug_thread watchdog_threads = {
- .store = &softlockup_watchdog,
- .thread_should_run = watchdog_should_run,
- .thread_fn = watchdog,
- .thread_comm = "watchdog/%u",
- .setup = watchdog_enable,
- .cleanup = watchdog_cleanup,
- .park = watchdog_disable,
- .unpark = watchdog_enable,
-};
-
-static void softlockup_update_smpboot_threads(void)
+static void softlockup_start_all(void)
{
- lockdep_assert_held(&watchdog_mutex);
-
- if (!softlockup_threads_initialized)
- return;
+ int cpu;
- smpboot_update_cpumask_percpu_thread(&watchdog_threads,
- &watchdog_allowed_mask);
+ cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+ for_each_cpu(cpu, &watchdog_allowed_mask)
+ smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
}
-/* Temporarily park all watchdog threads */
-static void softlockup_park_all_threads(void)
+int lockup_detector_online_cpu(unsigned int cpu)
{
- cpumask_clear(&watchdog_allowed_mask);
- softlockup_update_smpboot_threads();
+ watchdog_enable(cpu);
+ return 0;
}
-/* Unpark enabled threads */
-static void softlockup_unpark_threads(void)
+int lockup_detector_offline_cpu(unsigned int cpu)
{
- cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
- softlockup_update_smpboot_threads();
+ watchdog_disable(cpu);
+ return 0;
}
static void lockup_detector_reconfigure(void)
{
cpus_read_lock();
watchdog_nmi_stop();
- softlockup_park_all_threads();
+
+ softlockup_stop_all();
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
- softlockup_unpark_threads();
+ softlockup_start_all();
+
watchdog_nmi_start();
cpus_read_unlock();
/*
@@ -580,8 +576,6 @@ static void lockup_detector_reconfigure(
*/
static __init void lockup_detector_setup(void)
{
- int ret;
-
/*
* If sysctl is off and watchdog got disabled on the command line,
* nothing to do here.
@@ -592,24 +586,13 @@ static __init void lockup_detector_setup
!(watchdog_enabled && watchdog_thresh))
return;
- ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
- &watchdog_allowed_mask);
- if (ret) {
- pr_err("Failed to initialize soft lockup detector threads\n");
- return;
- }
-
mutex_lock(&watchdog_mutex);
- softlockup_threads_initialized = true;
+ softlockup_initialized = true;
lockup_detector_reconfigure();
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static inline int watchdog_park_threads(void) { return 0; }
-static inline void watchdog_unpark_threads(void) { }
-static inline int watchdog_enable_all_cpus(void) { return 0; }
-static inline void watchdog_disable_all_cpus(void) { }
static void lockup_detector_reconfigure(void)
{
cpus_read_lock();
FYI, we noticed the following commit (built with gcc-5):
commit: 4808e7a5dc055fd8776e6b59e02775730ea716f6 ("watchdog/softlockup: Replace "watchdog/%u" threads with cpu_stop_work")
url: https://github.com/0day-ci/linux/commits/Peter-Zijlstra/kthread-smpboot-More-fixes/20180613-003329
in testcase: boot
on test machine: qemu-system-x86_64 -enable-kvm -smp 2 -m 512M
caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace):
+--------------------------------------------------------------------+------------+------------+
| | 1e88b12632 | 4808e7a5dc |
+--------------------------------------------------------------------+------------+------------+
| boot_successes | 0 | 0 |
| boot_failures | 10 | 11 |
| WARNING:at_lib/debugobjects.c:#__debug_object_init | 10 | |
| RIP:__debug_object_init | 10 | |
| WARNING:suspicious_RCU_usage | 10 | |
| lib/test_rhashtable.c:#suspicious_rcu_dereference_protected()usage | 10 | |
| WARNING:possible_circular_locking_dependency_detected | 9 | |
| BUG:workqueue_lockup-pool | 1 | |
| BUG:KASAN:null-ptr-deref_in_h | 0 | 11 |
| BUG:unable_to_handle_kernel | 0 | 11 |
| Oops:#[##] | 0 | 11 |
| RIP:hrtimer_active | 0 | 11 |
| Kernel_panic-not_syncing:Fatal_exception | 0 | 11 |
+--------------------------------------------------------------------+------------+------------+
[ 0.037000] BUG: KASAN: null-ptr-deref in hrtimer_active+0x70/0xa0
[ 0.037000] Read of size 4 at addr 0000000000000010 by task swapper/1
[ 0.037000]
[ 0.037000] CPU: 0 PID: 1 Comm: swapper Tainted: G T 4.17.0-11348-g4808e7a #1
[ 0.037000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 0.037000] Call Trace:
[ 0.037000] ? kasan_report+0xe3/0x360
[ 0.037000] ? hrtimer_active+0x70/0xa0
[ 0.037000] ? hrtimer_try_to_cancel+0x17/0x210
[ 0.037000] ? hrtimer_cancel+0x15/0x20
[ 0.037000] ? softlockup_stop_fn+0x11/0x20
[ 0.037000] ? lockup_detector_reconfigure+0x25/0xa0
[ 0.037000] ? lockup_detector_init+0x51/0x5d
[ 0.037000] ? kernel_init_freeable+0xa9/0x243
[ 0.037000] ? rest_init+0xd0/0xd0
[ 0.037000] ? kernel_init+0xf/0x120
[ 0.037000] ? rest_init+0xd0/0xd0
[ 0.037000] ? ret_from_fork+0x24/0x30
[ 0.037000] ==================================================================
[ 0.037000] Disabling lock debugging due to kernel taint
[ 0.037032] BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
[ 0.038000] PGD 0 P4D 0
[ 0.038000] Oops: 0000 [#1] PREEMPT KASAN PTI
[ 0.038000] CPU: 0 PID: 1 Comm: swapper Tainted: G B T 4.17.0-11348-g4808e7a #1
[ 0.038000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 0.038000] RIP: 0010:hrtimer_active+0x70/0xa0
[ 0.038000] Code: 11 4c 89 f7 e8 a1 05 19 00 48 8b 45 30 48 39 c3 74 36 4c 89 f7 e8 90 05 19 00 48 8b 5d 30 4c 8d 6b 10 4c 89 ef e8 80 04 19 00 <44> 8b 63 10 41 f6 c4 01 74 a2 f3 90 eb ea 5b b8 01 00 00 00 5d 41
[ 0.038000] RSP: 0000:ffff88000015fe68 EFLAGS: 00010282
[ 0.038000] RAX: ffff880000154900 RBX: 0000000000000000 RCX: 0000000000000000
[ 0.038000] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffff8242a236
[ 0.038000] RBP: ffffffff8351ef20 R08: 0000000000000000 R09: 0000000000000000
[ 0.038000] R10: 0000000000000001 R11: fffffbfff09346c7 R12: 0000000000000000
[ 0.038000] R13: 0000000000000010 R14: ffffffff8351ef50 R15: ffffffff8351ef58
[ 0.038000] FS: 0000000000000000(0000) GS:ffffffff83482000(0000) knlGS:0000000000000000
[ 0.038000] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 0.038000] CR2: 0000000000000010 CR3: 0000000003424000 CR4: 00000000000006b0
[ 0.038000] Call Trace:
[ 0.038000] ? hrtimer_try_to_cancel+0x17/0x210
[ 0.038000] ? hrtimer_cancel+0x15/0x20
[ 0.038000] ? softlockup_stop_fn+0x11/0x20
[ 0.038000] ? lockup_detector_reconfigure+0x25/0xa0
[ 0.038000] ? lockup_detector_init+0x51/0x5d
[ 0.038000] ? kernel_init_freeable+0xa9/0x243
[ 0.038000] ? rest_init+0xd0/0xd0
[ 0.038000] ? kernel_init+0xf/0x120
[ 0.038000] ? rest_init+0xd0/0xd0
[ 0.038000] ? ret_from_fork+0x24/0x30
[ 0.038000] CR2: 0000000000000010
[ 0.038000] ---[ end trace 223de5392cf44f69 ]---
To reproduce:
git clone https://github.com/intel/lkp-tests.git
cd lkp-tests
bin/lkp qemu -k <bzImage> job-script # job-script is attached in this email
Thanks,
Xiaolong
On Wed, Jun 13, 2018 at 01:08:38PM +0800, kernel test robot wrote:
> [ 0.037000] BUG: KASAN: null-ptr-deref in hrtimer_active+0x70/0xa0
> [ 0.037000] Read of size 4 at addr 0000000000000010 by task swapper/1
> [ 0.037000]
> [ 0.037000] CPU: 0 PID: 1 Comm: swapper Tainted: G T 4.17.0-11348-g4808e7a #1
> [ 0.037000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> [ 0.037000] Call Trace:
> [ 0.037000] ? kasan_report+0xe3/0x360
> [ 0.037000] ? hrtimer_active+0x70/0xa0
> [ 0.037000] ? hrtimer_try_to_cancel+0x17/0x210
> [ 0.037000] ? hrtimer_cancel+0x15/0x20
> [ 0.037000] ? softlockup_stop_fn+0x11/0x20
> [ 0.037000] ? lockup_detector_reconfigure+0x25/0xa0
> [ 0.037000] ? lockup_detector_init+0x51/0x5d
> [ 0.037000] ? kernel_init_freeable+0xa9/0x243
> [ 0.037000] ? rest_init+0xd0/0xd0
> [ 0.037000] ? kernel_init+0xf/0x120
> [ 0.037000] ? rest_init+0xd0/0xd0
> [ 0.037000] ? ret_from_fork+0x24/0x30
ARGH, I thought I fixed that one..
I think that'll cure things, but let me do a KASAN build myself.
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -587,8 +587,8 @@ static __init void lockup_detector_setup
return;
mutex_lock(&watchdog_mutex);
- softlockup_initialized = true;
lockup_detector_reconfigure();
+ softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}