2011-05-23 05:11:54

by Mandeep Singh Baines

[permalink] [raw]
Subject: [PATCH 1/4] watchdog: fix rounding issues in get_sample_period()

In get_sample_period(), softlockup_thresh is integer divided by 5 before
the multiplication by NSEC_PER_SEC. This results in softlockup_thresh
being rounded down to the nearest integer multiple of 5.

For example, a softlockup_thresh of 4 rounds down to 0.

Signed-off-by: Mandeep Singh Baines <[email protected]>
Cc: Marcin Slusarz <[email protected]>
Cc: Don Zickus <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Andrew Morton <[email protected]>
---
kernel/watchdog.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14733d4..a06972d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -110,7 +110,7 @@ static unsigned long get_sample_period(void)
* increment before the hardlockup detector generates
* a warning
*/
- return softlockup_thresh / 5 * NSEC_PER_SEC;
+ return softlockup_thresh * (NSEC_PER_SEC / 5);
}

/* Commands for resetting the watchdog */
--
1.7.3.1


2011-05-23 05:12:21

by Mandeep Singh Baines

[permalink] [raw]
Subject: [PATCH 2/4] watchdog: only disable/enable watchdog if neccessary

Don't take any action on an unsuccessful write to /proc.

Signed-off-by: Mandeep Singh Baines <[email protected]>
Cc: Marcin Slusarz <[email protected]>
Cc: Don Zickus <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Andrew Morton <[email protected]>
---
kernel/watchdog.c | 20 ++++++++++++--------
1 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a06972d..cf0e09f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -507,15 +507,19 @@ static void watchdog_disable_all_cpus(void)
int proc_dowatchdog_enabled(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
+ int ret;

- if (write) {
- if (watchdog_enabled)
- watchdog_enable_all_cpus();
- else
- watchdog_disable_all_cpus();
- }
- return 0;
+ ret = proc_dointvec(table, write, buffer, length, ppos);
+ if (ret || !write)
+ goto out;
+
+ if (watchdog_enabled)
+ watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+
+out:
+ return ret;
}

int proc_dowatchdog_thresh(struct ctl_table *table, int write,
--
1.7.3.1

2011-05-23 05:12:05

by Mandeep Singh Baines

[permalink] [raw]
Subject: [PATCH 3/4] watchdog: disable watchdog when thresh is zero

This restores the previous behavior of softlock_thresh.

Currently, setting watchdog_thresh to zero causes the watchdog
kthreads to consume a lot of CPU.

In addition, the logic of proc_dowatchdog_thresh and proc_dowatchdog_enabled
has been factored into proc_dowatchdog.

Signed-off-by: Mandeep Singh Baines <[email protected]>
LKML-Reference: <[email protected]>
Cc: Marcin Slusarz <[email protected]>
Cc: Don Zickus <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Andrew Morton <[email protected]>
---
include/linux/nmi.h | 5 +++--
include/linux/sched.h | 1 -
kernel/sysctl.c | 12 ++++++++----
kernel/watchdog.c | 25 +++++++++----------------
4 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index c536f85..5317b8b 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -47,9 +47,10 @@ static inline bool trigger_all_cpu_backtrace(void)
int hw_nmi_is_cpu_stuck(struct pt_regs *);
u64 hw_nmi_get_sample_period(void);
extern int watchdog_enabled;
+extern int watchdog_thresh;
struct ctl_table;
-extern int proc_dowatchdog_enabled(struct ctl_table *, int ,
- void __user *, size_t *, loff_t *);
+extern int proc_dowatchdog(struct ctl_table *, int ,
+ void __user *, size_t *, loff_t *);
#endif

#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7ee2530..aaf71e0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -315,7 +315,6 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos);
extern unsigned int softlockup_panic;
-extern int softlockup_thresh;
void lockup_detector_init(void);
#else
static inline void touch_softlockup_watchdog(void)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c0bb324..3dd0c46 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -730,14 +730,16 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dowatchdog_enabled,
+ .proc_handler = proc_dowatchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
},
{
.procname = "watchdog_thresh",
- .data = &softlockup_thresh,
+ .data = &watchdog_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dowatchdog_thresh,
+ .proc_handler = proc_dowatchdog,
.extra1 = &neg_one,
.extra2 = &sixty,
},
@@ -755,7 +757,9 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dowatchdog_enabled,
+ .proc_handler = proc_dowatchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
},
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index cf0e09f..6030191 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
#include <linux/perf_event.h>

int watchdog_enabled = 1;
-int __read_mostly softlockup_thresh = 60;
+int __read_mostly watchdog_thresh = 60;

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -105,12 +105,12 @@ static unsigned long get_timestamp(int this_cpu)
static unsigned long get_sample_period(void)
{
/*
- * convert softlockup_thresh from seconds to ns
+ * convert watchdog_thresh from seconds to ns
* the divide by 5 is to give hrtimer 5 chances to
* increment before the hardlockup detector generates
* a warning
*/
- return softlockup_thresh * (NSEC_PER_SEC / 5);
+ return watchdog_thresh * (NSEC_PER_SEC / 5);
}

/* Commands for resetting the watchdog */
@@ -182,7 +182,7 @@ static int is_softlockup(unsigned long touch_ts)
unsigned long now = get_timestamp(smp_processor_id());

/* Warn about unreasonable delays: */
- if (time_after(now, touch_ts + softlockup_thresh))
+ if (time_after(now, touch_ts + watchdog_thresh))
return now - touch_ts;

return 0;
@@ -501,19 +501,19 @@ static void watchdog_disable_all_cpus(void)
/* sysctl functions */
#ifdef CONFIG_SYSCTL
/*
- * proc handler for /proc/sys/kernel/nmi_watchdog
+ * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
*/

-int proc_dowatchdog_enabled(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+int proc_dowatchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
int ret;

- ret = proc_dointvec(table, write, buffer, length, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto out;

- if (watchdog_enabled)
+ if (watchdog_enabled && watchdog_thresh)
watchdog_enable_all_cpus();
else
watchdog_disable_all_cpus();
@@ -521,13 +521,6 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
out:
return ret;
}
-
-int proc_dowatchdog_thresh(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
#endif /* CONFIG_SYSCTL */


--
1.7.3.1

2011-05-23 05:11:48

by Mandeep Singh Baines

[permalink] [raw]
Subject: [PATCH 4/4] watchdog: configure nmi watchdog period based on watchdog_thresh

Before the conversion of the NMI watchdog to perf event, the watchdog
timeout was 5 seconds. Now it is 60 seconds. For my particular application,
netbooks, 5 seconds was a better timeout. With a short timeout, we
catch faults earlier and are able to send back a panic. With a 60 second
timeout, the user is unlikely to wait and will instead hit the power
button, causing us to lose the panic info.

This change configures the NMI period to watchdog_thresh and sets
the softlockup_thresh to watchdog_thresh * 2. In addition, watchdog_thresh
was reduced to 10 seconds as suggested by Ingo Molnar.

Signed-off-by: Mandeep Singh Baines <[email protected]>
LKML-Reference: <[email protected]>
Cc: Marcin Slusarz <[email protected]>
Cc: Don Zickus <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Andrew Morton <[email protected]>
---
arch/x86/kernel/apic/hw_nmi.c | 4 ++--
include/linux/nmi.h | 2 +-
kernel/watchdog.c | 19 +++++++++++++++----
3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 5260fe9..d5e57db0 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,9 +19,9 @@
#include <linux/delay.h>

#ifdef CONFIG_HARDLOCKUP_DETECTOR
-u64 hw_nmi_get_sample_period(void)
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
- return (u64)(cpu_khz) * 1000 * 60;
+ return (u64)(cpu_khz) * 1000 * watchdog_thresh;
}
#endif

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 5317b8b..2d304ef 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -45,7 +45,7 @@ static inline bool trigger_all_cpu_backtrace(void)

#ifdef CONFIG_LOCKUP_DETECTOR
int hw_nmi_is_cpu_stuck(struct pt_regs *);
-u64 hw_nmi_get_sample_period(void);
+u64 hw_nmi_get_sample_period(int watchdog_thresh);
extern int watchdog_enabled;
extern int watchdog_thresh;
struct ctl_table;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6030191..6e63097 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
#include <linux/perf_event.h>

int watchdog_enabled = 1;
-int __read_mostly watchdog_thresh = 60;
+int __read_mostly watchdog_thresh = 10;

static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
__setup("nosoftlockup", nosoftlockup_setup);
/* */

+/*
+ * Hard-lockup warnings should be triggered after just a few seconds. Soft-
+ * lockups can have false positives under extreme conditions. So we generally
+ * want a higher threshold for soft lockups than for hard lockups. So we couple
+ * the thresholds with a factor: we make the soft threshold twice the amount of
+ * time the hard threshold is.
+ */
+static int get_softlockup_thresh()
+{
+ return watchdog_thresh * 2;
+}

/*
* Returns seconds, approximately. We don't need nanosecond
@@ -110,7 +121,7 @@ static unsigned long get_sample_period(void)
* increment before the hardlockup detector generates
* a warning
*/
- return watchdog_thresh * (NSEC_PER_SEC / 5);
+ return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
}

/* Commands for resetting the watchdog */
@@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts)
unsigned long now = get_timestamp(smp_processor_id());

/* Warn about unreasonable delays: */
- if (time_after(now, touch_ts + watchdog_thresh))
+ if (time_after(now, touch_ts + get_softlockup_thresh()))
return now - touch_ts;

return 0;
@@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu)

/* Try to register using hardware perf events */
wd_attr = &wd_hw_attr;
- wd_attr->sample_period = hw_nmi_get_sample_period();
+ wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
if (!IS_ERR(event)) {
printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
--
1.7.3.1

2011-05-23 11:12:40

by Mandeep Singh Baines

[permalink] [raw]
Subject: [tip:perf/urgent] watchdog: Fix rounding bug in get_sample_period()

Commit-ID: 824c6b7f6294101f30e141117def224a56c203e6
Gitweb: http://git.kernel.org/tip/824c6b7f6294101f30e141117def224a56c203e6
Author: Mandeep Singh Baines <[email protected]>
AuthorDate: Sun, 22 May 2011 22:10:20 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 23 May 2011 11:58:58 +0200

watchdog: Fix rounding bug in get_sample_period()

In get_sample_period(), softlockup_thresh is integer divided by
5 before the multiplication by NSEC_PER_SEC. This results in
softlockup_thresh being rounded down to the nearest integer
multiple of 5.

For example, a softlockup_thresh of 4 rounds down to 0.

Signed-off-by: Mandeep Singh Baines <[email protected]>
Cc: Marcin Slusarz <[email protected]>
Cc: Don Zickus <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/watchdog.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14733d4..a06972d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -110,7 +110,7 @@ static unsigned long get_sample_period(void)
* increment before the hardlockup detector generates
* a warning
*/
- return softlockup_thresh / 5 * NSEC_PER_SEC;
+ return softlockup_thresh * (NSEC_PER_SEC / 5);
}

/* Commands for resetting the watchdog */

2011-05-23 11:12:56

by Mandeep Singh Baines

[permalink] [raw]
Subject: [tip:perf/urgent] watchdog: Only disable/enable watchdog if neccessary

Commit-ID: e04ab2bc41b35c0cb6cdb07c8443f91aa738cf78
Gitweb: http://git.kernel.org/tip/e04ab2bc41b35c0cb6cdb07c8443f91aa738cf78
Author: Mandeep Singh Baines <[email protected]>
AuthorDate: Sun, 22 May 2011 22:10:21 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 23 May 2011 11:58:58 +0200

watchdog: Only disable/enable watchdog if neccessary

Don't take any action on an unsuccessful write to /proc.

Signed-off-by: Mandeep Singh Baines <[email protected]>
Cc: Marcin Slusarz <[email protected]>
Cc: Don Zickus <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/watchdog.c | 20 ++++++++++++--------
1 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a06972d..cf0e09f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -507,15 +507,19 @@ static void watchdog_disable_all_cpus(void)
int proc_dowatchdog_enabled(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
+ int ret;

- if (write) {
- if (watchdog_enabled)
- watchdog_enable_all_cpus();
- else
- watchdog_disable_all_cpus();
- }
- return 0;
+ ret = proc_dointvec(table, write, buffer, length, ppos);
+ if (ret || !write)
+ goto out;
+
+ if (watchdog_enabled)
+ watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+
+out:
+ return ret;
}

int proc_dowatchdog_thresh(struct ctl_table *table, int write,