2017-08-31 07:31:40

by Thomas Gleixner

[permalink] [raw]
Subject: [patch 11/29] lockup_detector: Remove park_in_progress hackery

b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded
system") tries to fix the following issue:

watchdog_stop()
hrtimer_cancel()
perf_nmi_event_stop()

If the task gets preempted after canceling the hrtimer or the VM gets
scheduled out long enough, then there is a chance that the next NMI will
see a stale hrtimer interrupt count and trigger a false positive hard
lockup splat.

That commit added a complete abstrusity with an atomic variable telling the
NMI watchdog function that stop is in progress. This is so stupid, that
it's not funny anymore.

Of course the patch missed that the same issue can happen in start:

watchdog_start()
perf_nmi_event_start()
hrtimer_start()

The same effect can be achieved by reversing the start/stop order:

watchdog_stop()
perf_nmi_event_stop()
hrtimer_cancel()

watchdog_start()
hrtimer_start()
perf_nmi_event_start()

Get rid of the nonsense.

Signed-off-by: Thomas Gleixner <[email protected]>
---
include/linux/nmi.h | 1 -
kernel/watchdog.c | 39 ++++++++++++++++++---------------------
kernel/watchdog_hld.c | 7 ++-----
3 files changed, 20 insertions(+), 27 deletions(-)

--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,7 +27,6 @@ extern void touch_softlockup_watchdog_sy
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
extern int soft_watchdog_enabled;
-extern atomic_t watchdog_park_in_progress;
#else
static inline void touch_softlockup_watchdog_sched(void)
{
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -134,8 +134,6 @@ void __weak watchdog_nmi_reconfigure(voi
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)

-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
-
static u64 __read_mostly sample_period;

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -320,8 +318,7 @@ static enum hrtimer_restart watchdog_tim
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

- if (!watchdog_enabled ||
- atomic_read(&watchdog_park_in_progress) != 0)
+ if (!watchdog_enabled)
return HRTIMER_NORESTART;

/* kick the hardlockup detector */
@@ -435,33 +432,38 @@ static void watchdog_set_prio(unsigned i

static void watchdog_enable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

- /* kick off the timer for the hardlockup detector */
+ /*
+ * Start the timer first to prevent the NMI watchdog triggering
+ * before the timer has a chance to fire.
+ */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL_PINNED);

+ /* Initialize timestamp */
+ __touch_watchdog();
/* Enable the perf event */
watchdog_nmi_enable(cpu);

- /* done here because hrtimer_start can only pin to smp_processor_id() */
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
-
- /* initialize timestamp */
watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
- __touch_watchdog();
}

static void watchdog_disable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

watchdog_set_prio(SCHED_NORMAL, 0);
- hrtimer_cancel(hrtimer);
- /* disable the perf event */
- watchdog_nmi_disable(cpu);
+ /*
+ * Disable the perf event first. That prevents that a large delay
+ * between disabling the timer and disabling the perf event causes
+ * the perf NMI to detect a false positive.
+ */
hardlockup_detector_perf_disable();
+ watchdog_nmi_disable(cpu);
+ hrtimer_cancel(hrtimer);
}

static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -517,16 +519,11 @@ static int watchdog_park_threads(void)
{
int cpu, ret = 0;

- atomic_set(&watchdog_park_in_progress, 1);
-
for_each_watchdog_cpu(cpu) {
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
if (ret)
break;
}
-
- atomic_set(&watchdog_park_in_progress, 0);
-
return ret;
}

--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,15 +106,12 @@ static struct perf_event_attr wd_hw_attr

/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;

- if (atomic_read(&watchdog_park_in_progress) != 0)
- return;
-
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;



2017-09-04 12:11:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [patch 11/29] lockup_detector: Remove park_in_progress hackery

On Mon, Sep 04, 2017 at 01:09:06PM +0200, Ulrich Obergfell wrote:

> - A thread hogs CPU N (soft lockup) so that watchdog/N is unable to run.
> - A user re-configures 'watchdog_thresh' on the fly. The reconfiguration
> requires parking/unparking of all watchdog threads.

This is where you fail, its silly to require parking for
reconfiguration.

2017-09-05 13:58:36

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 11/29] lockup_detector: Remove park_in_progress hackery

On Mon, 4 Sep 2017, Ulrich Obergfell wrote:
> "b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded
> system") tries to fix the following issue:
>
> watchdog_stop()
> hrtimer_cancel()
> perf_nmi_event_stop()
>
> If the task gets preempted after canceling the hrtimer or the VM gets
> scheduled out long enough, then there is a chance that the next NMI will
> see a stale hrtimer interrupt count and trigger a false positive hard
> lockup splat."
>
> This is not exactly the issue that b94f51183b06 is actually supposed to fix.
> For details, please see the accompanying commit log message at:
>
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/kernel/watchdog.c?id=b94f51183b0617e7b9b4fb4137d4cf1cab7547c2
>
> For convenience, here is a slightly different description of the scenario
> that b94f51183b06 originally aims to address:
>
> - A thread hogs CPU N (soft lockup) so that watchdog/N is unable to run.
> - A user re-configures 'watchdog_thresh' on the fly. The reconfiguration
> requires parking/unparking of all watchdog threads.
> - watchdog_timer_fn() on all CPUs can already pick up the new sample period,
> but the perf counter frequency cannot be updated on all CPUs yet because
> watchdog/N is unable to park, so watchdog_overflow_callback() still occurs
> at a frequency that is based on the old sample period (on CPUs >= N which
> have not had a chance to park their watchdog threads yet).
> - If 'watchdog_thresh' was increased by the reconfiguration, the interval
> at which watchdog_timer_fn() is now running could be too large for the
> watchdog_overflow_callback() function to observe a change of the timer
> interrupt counter. This can trigger a false positive.

Oh well. You did not fix that at all with that hack.

The real problem is that the watchdog threshold write in proc immediately
updates sample_period, which is evaluated by the running thread.

proc_write()
set_sample_period()
<----- Broken starts
proc_watchdog_update()
watchdog_enable_all_cpus()
update_watchdog_all_cpus()
watchdog_park_threads()
<----- Broken ends
watchdog_park_in_progress = 1

So up to the point where watchdog_park_in_progress is set to 1, the change
to sample_period is visible to all active watchdog threads and will be
picked up by the watchdog threads to rearm their timer. The NMI watchdog
will run with the old frequency until the thread is parked and the watchdog
NMI disarmed.

The underlying problem is the non synchronized update of sample_period and
this band aid hack is not solving that at all.

> The above scenario should be rare in practice, so it seemed reasonable to
> come up with a simple low-risk fix that addresses merely this particular
> scenario (watchdog_timer_fn() and watchdog_overflow_callback() now return
> immediately while park is in progress / no lockup detection is performed
> during that window of time).

That thing is neither reasonable nor a fix. It's mindless hackery which
papers over the problem instead of fixing the root cause. It neither saw
the problem in the park/unpark in general which is again a valid source for
false positives.

> In relation to:
>
> "That commit added a complete abstrusity with a atomic variable telling the
> NMI watchdog function that stop is in progress. This is so stupid, that
> it's not funny anymore."
>
> I cannot see any 'abstrusity' or 'stupidity' in the pragmatic approach that
> was taken in b94f51183b06 to fix an issue that should be rare in practice
> as described above.
>
> Please change the commit log of [patch 11/29] since it is inconsistent with
> the commit log of b94f51183b06.

Yes, I will change it to include the above reasoning why this is complete
crap and keep the extra findings for reference.

Thanks,

tglx

2017-09-05 15:15:42

by Don Zickus

[permalink] [raw]
Subject: Re: [patch 11/29] lockup_detector: Remove park_in_progress hackery

On Mon, Sep 04, 2017 at 02:10:50PM +0200, Peter Zijlstra wrote:
> On Mon, Sep 04, 2017 at 01:09:06PM +0200, Ulrich Obergfell wrote:
>
> > - A thread hogs CPU N (soft lockup) so that watchdog/N is unable to run.
> > - A user re-configures 'watchdog_thresh' on the fly. The reconfiguration
> > requires parking/unparking of all watchdog threads.
>
> This is where you fail, its silly to require parking for
> reconfiguration.

Hi Peter,

Ok, please elaborate. Unless I am misunderstanding, that is what Thomas
requested us do years ago when he implemented the parking/unparking scheme
and what his current patch set is doing now.

The point of parking I believe was to avoid the overhead of tearing down a
thread and restarting it when the code needed to update various lockup
detector settings.

So if we can't depend on parking for reconfiguration, then are the other
options (besides tearing down threads)?

I am not trying to be argumentative here, just trying to fill in the
disconnect between us.


Hi Uli,

I think the race you detailed is solved with Thomas's patches. In the
original design we set the sample period first, then tried parking the
threads, which created the mess. With this patchset, Thomas properly
parks the threads first, then sets the sample period, thus avoiding the race
I believe. You should be able to see that in patch 16,
softlockup_reconfigure_threads().

Cheers,
Don

2017-09-05 15:42:42

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [patch 11/29] lockup_detector: Remove park_in_progress hackery

On Tue, 5 Sep 2017, Don Zickus wrote:

> On Mon, Sep 04, 2017 at 02:10:50PM +0200, Peter Zijlstra wrote:
> > On Mon, Sep 04, 2017 at 01:09:06PM +0200, Ulrich Obergfell wrote:
> >
> > > - A thread hogs CPU N (soft lockup) so that watchdog/N is unable to run.
> > > - A user re-configures 'watchdog_thresh' on the fly. The reconfiguration
> > > requires parking/unparking of all watchdog threads.
> >
> > This is where you fail, its silly to require parking for
> > reconfiguration.
>
> Hi Peter,
>
> Ok, please elaborate. Unless I am misunderstanding, that is what Thomas
> requested us do years ago when he implemented the parking/unparking scheme
> and what his current patch set is doing now.
>
> The point of parking I believe was to avoid the overhead of tearing down a
> thread and restarting it when the code needed to update various lockup
> detector settings.
>
> So if we can't depend on parking for reconfiguration, then are the other
> options (besides tearing down threads)?

Yes, the park/unpark is what I still use as this was the simplest way to
keep everything in sync.

I pondered to do on the fly reconfiguration as well, but that would add
more code and would not solve the general issue of park/unpark. So I rather
went for a single mechanism which just works, even if it is suboptimal cpu
cycle wise. OTOH that reconfiguration is not something which happens every
5ms, so we can just go for the stupid, but simple mechanism.

Thanks,

tglx

2017-09-05 19:19:17

by Thomas Gleixner

[permalink] [raw]
Subject: [patch V2 11/29] lockup_detector: Remove park_in_progress obfuscation

b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded
system") tries to fix the following issue:

proc_write()
set_sample_period() <--- New sample period becomes visible
<----- Broken starts
proc_watchdog_update()
watchdog_enable_all_cpus() watchdog_hrtimer_fn()
update_watchdog_all_cpus() restart_timer(sample_period)
watchdog_park_threads()

thread->park()
disable_nmi()
<----- Broken ends

The reason why this is broken is that the update of the watchdog threshold
becomes immediately effective and visible for the hrtimer function which
uses that value to rearm the timer. But the NMI/perf side still uses the
old value up to the point where it is disabled. If the rate has been
lowered then the NMI can run fast enough to 'detect' a hard lockup because
the timer has not fired due to the longer period.

The patch 'fixed' this by adding a variable:

proc_write()
set_sample_period()
<----- Broken starts
proc_watchdog_update()
watchdog_enable_all_cpus() watchdog_hrtimer_fn()
update_watchdog_all_cpus() restart_timer(sample_period)
watchdog_park_threads()
park_in_progress = 1
<----- Broken ends
nmi_watchdog()
if (park_in_progress)
return;

The only effect of this variable was to make the window where the breakage
can hit small enough that it was no longer observable in testing. From a
correctness point of view it is a pointless bandaid which merely papers
over the root cause: the unsynchronized update of the variable.

Looking deeper into the related code paths unearthed similar problems in
the watchdog_start()/stop() functions.

watchdog_start()
perf_nmi_event_start()
hrtimer_start()

watchdog_stop()
hrtimer_cancel()
perf_nmi_event_stop()

In both cases the call order is wrong because if the task gets preempted
or the VM gets scheduled out long enough after the first call, then there is
a chance that the next NMI will see a stale hrtimer interrupt count and
trigger a false positive hard lockup splat.

Get rid of park_in_progress so the code can be gradually deobfuscated and
pruned from several layers of duct tape papering over the root cause,
which has been either ignored or not understood at all.

Once this is removed the underlying problem will be fixed by rewriting the
proc interface to do a proper synchronized update.

Address the start/stop() ordering problem as well by reverting the call
order, so this part is at least correct now.

Signed-off-by: Thomas Gleixner <[email protected]>
---

V2: Make the changelog technically and politically correct.

include/linux/nmi.h | 1 -
kernel/watchdog.c | 39 ++++++++++++++++++---------------------
kernel/watchdog_hld.c | 7 ++-----
3 files changed, 20 insertions(+), 27 deletions(-)

--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,7 +27,6 @@ extern void touch_softlockup_watchdog_sy
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
extern int soft_watchdog_enabled;
-extern atomic_t watchdog_park_in_progress;
#else
static inline void touch_softlockup_watchdog_sched(void)
{
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -134,8 +134,6 @@ void __weak watchdog_nmi_reconfigure(voi
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)

-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
-
static u64 __read_mostly sample_period;

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -320,8 +318,7 @@ static enum hrtimer_restart watchdog_tim
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

- if (!watchdog_enabled ||
- atomic_read(&watchdog_park_in_progress) != 0)
+ if (!watchdog_enabled)
return HRTIMER_NORESTART;

/* kick the hardlockup detector */
@@ -435,33 +432,38 @@ static void watchdog_set_prio(unsigned i

static void watchdog_enable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

- /* kick off the timer for the hardlockup detector */
+ /*
+ * Start the timer first to prevent the NMI watchdog triggering
+ * before the timer has a chance to fire.
+ */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL_PINNED);

+ /* Initialize timestamp */
+ __touch_watchdog();
/* Enable the perf event */
watchdog_nmi_enable(cpu);

- /* done here because hrtimer_start can only pin to smp_processor_id() */
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
-
- /* initialize timestamp */
watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
- __touch_watchdog();
}

static void watchdog_disable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

watchdog_set_prio(SCHED_NORMAL, 0);
- hrtimer_cancel(hrtimer);
- /* disable the perf event */
- watchdog_nmi_disable(cpu);
+ /*
+ * Disable the perf event first. That prevents that a large delay
+ * between disabling the timer and disabling the perf event causes
+ * the perf NMI to detect a false positive.
+ */
hardlockup_detector_perf_disable();
+ watchdog_nmi_disable(cpu);
+ hrtimer_cancel(hrtimer);
}

static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -517,16 +519,11 @@ static int watchdog_park_threads(void)
{
int cpu, ret = 0;

- atomic_set(&watchdog_park_in_progress, 1);
-
for_each_watchdog_cpu(cpu) {
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
if (ret)
break;
}
-
- atomic_set(&watchdog_park_in_progress, 0);
-
return ret;
}

--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,15 +106,12 @@ static struct perf_event_attr wd_hw_attr

/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;

- if (atomic_read(&watchdog_park_in_progress) != 0)
- return;
-
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;


Subject: [tip:core/urgent] watchdog/core: Remove the park_in_progress obfuscation

Commit-ID: 01f0a02701cbcf32d22cfc9d1ab9a3f0ff2ba68c
Gitweb: http://git.kernel.org/tip/01f0a02701cbcf32d22cfc9d1ab9a3f0ff2ba68c
Author: Thomas Gleixner <[email protected]>
AuthorDate: Tue, 12 Sep 2017 21:37:05 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Thu, 14 Sep 2017 11:41:05 +0200

watchdog/core: Remove the park_in_progress obfuscation

Commit:

b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded system")

tries to fix the following issue:

proc_write()
set_sample_period() <--- New sample period becomes visible
<----- Broken starts
proc_watchdog_update()
watchdog_enable_all_cpus() watchdog_hrtimer_fn()
update_watchdog_all_cpus() restart_timer(sample_period)
watchdog_park_threads()

thread->park()
disable_nmi()
<----- Broken ends

The reason why this is broken is that the update of the watchdog threshold
becomes immediately effective and visible for the hrtimer function which
uses that value to rearm the timer. But the NMI/perf side still uses the
old value up to the point where it is disabled. If the rate has been
lowered then the NMI can run fast enough to 'detect' a hard lockup because
the timer has not fired due to the longer period.

The patch 'fixed' this by adding a variable:

proc_write()
set_sample_period()
<----- Broken starts
proc_watchdog_update()
watchdog_enable_all_cpus() watchdog_hrtimer_fn()
update_watchdog_all_cpus() restart_timer(sample_period)
watchdog_park_threads()
park_in_progress = 1
<----- Broken ends
nmi_watchdog()
if (park_in_progress)
return;

The only effect of this variable was to make the window where the breakage
can hit small enough that it was no longer observable in testing. From a
correctness point of view it is a pointless bandaid which merely papers
over the root cause: the unsynchronized update of the variable.

Looking deeper into the related code paths unearthed similar problems in
the watchdog_start()/stop() functions.

watchdog_start()
perf_nmi_event_start()
hrtimer_start()

watchdog_stop()
hrtimer_cancel()
perf_nmi_event_stop()

In both cases the call order is wrong because if the task gets preempted
or the VM gets scheduled out long enough after the first call, then there is
a chance that the next NMI will see a stale hrtimer interrupt count and
trigger a false positive hard lockup splat.

Get rid of park_in_progress so the code can be gradually deobfuscated and
pruned from several layers of duct tape papering over the root cause,
which has been either ignored or not understood at all.

Once this is removed the underlying problem will be fixed by rewriting the
proc interface to do a proper synchronized update.

Address the start/stop() ordering problem as well by reverting the call
order, so this part is at least correct now.

Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Don Zickus <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Chris Metcalf <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Nicholas Piggin <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Sebastian Siewior <[email protected]>
Cc: Ulrich Obergfell <[email protected]>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1709052038270.2393@nanos
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/nmi.h | 1 -
kernel/watchdog.c | 37 +++++++++++++++++--------------------
kernel/watchdog_hld.c | 7 ++-----
3 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 80354e6..91a3a4a 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,7 +27,6 @@ extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
extern int soft_watchdog_enabled;
-extern atomic_t watchdog_park_in_progress;
#else
static inline void touch_softlockup_watchdog_sched(void)
{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index dd1fd59..c290135 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -136,8 +136,6 @@ void __weak watchdog_nmi_reconfigure(void)
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)

-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
-
static u64 __read_mostly sample_period;

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -322,8 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

- if (!watchdog_enabled ||
- atomic_read(&watchdog_park_in_progress) != 0)
+ if (!watchdog_enabled)
return HRTIMER_NORESTART;

/* kick the hardlockup detector */
@@ -437,32 +434,37 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)

static void watchdog_enable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

- /* kick off the timer for the hardlockup detector */
+ /*
+ * Start the timer first to prevent the NMI watchdog triggering
+ * before the timer has a chance to fire.
+ */
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
+ hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+ HRTIMER_MODE_REL_PINNED);

+ /* Initialize timestamp */
+ __touch_watchdog();
/* Enable the perf event */
watchdog_nmi_enable(cpu);

- /* done here because hrtimer_start can only pin to smp_processor_id() */
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
-
- /* initialize timestamp */
watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
- __touch_watchdog();
}

static void watchdog_disable(unsigned int cpu)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+ struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);

watchdog_set_prio(SCHED_NORMAL, 0);
- hrtimer_cancel(hrtimer);
- /* disable the perf event */
+ /*
+ * Disable the perf event first. That prevents that a large delay
+ * between disabling the timer and disabling the perf event causes
+ * the perf NMI to detect a false positive.
+ */
watchdog_nmi_disable(cpu);
+ hrtimer_cancel(hrtimer);
}

static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -518,16 +520,11 @@ static int watchdog_park_threads(void)
{
int cpu, ret = 0;

- atomic_set(&watchdog_park_in_progress, 1);
-
for_each_watchdog_cpu(cpu) {
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
if (ret)
break;
}
-
- atomic_set(&watchdog_park_in_progress, 0);
-
return ret;
}

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 94111cc..0aa191e 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,15 +106,12 @@ static struct perf_event_attr wd_hw_attr = {

/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
/* Ensure the watchdog never gets throttled */
event->hw.interrupts = 0;

- if (atomic_read(&watchdog_park_in_progress) != 0)
- return;
-
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;