2009-01-12 06:52:53

by Mandeep Baines

[permalink] [raw]
Subject: [PATCH v2] softlockup: decouple hung tasks check from softlockup detection

Decoupling allows:

* hung tasks check to happen at very low priority
* hung tasks check and softlockup to be enabled/disabled independently
at compile and/or run-time
* individual panic settings to be enabled disabled independently
at compile and/or run-time
* softlockup threshold to be reduced without increasing hung tasks
poll frequency (hung task check is expensive relative to softlock watchdog)
* hung task check to be zero over-head when disabled at run-time
---
include/linux/sched.h | 14 +++++--
kernel/Makefile | 1 +
kernel/softlockup.c | 102 -------------------------------------------------
kernel/sysctl.c | 15 +++++++-
lib/Kconfig.debug | 38 ++++++++++++++++++
5 files changed, 63 insertions(+), 107 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e30d1..9e6a39b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -296,9 +296,6 @@ extern void softlockup_tick(void);
extern void touch_softlockup_watchdog(void);
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
-extern unsigned long sysctl_hung_task_check_count;
-extern unsigned long sysctl_hung_task_timeout_secs;
-extern unsigned long sysctl_hung_task_warnings;
extern int softlockup_thresh;
#else
static inline void softlockup_tick(void)
@@ -315,6 +312,15 @@ static inline void touch_all_softlockup_watchdogs(void)
}
#endif

+#ifdef CONFIG_DETECT_HUNG_TASK
+extern unsigned int sysctl_hung_task_panic;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_warnings;
+extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos);
+#endif

/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
@@ -1207,7 +1213,7 @@ struct task_struct {
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_timestamp;
unsigned long last_switch_count;
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad00..2809bc6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index dc0b3be..8081b88 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -157,97 +157,11 @@ void softlockup_tick(void)
}

/*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
- * Zero means infinite timeout - no checking done:
- */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
-
-unsigned long __read_mostly sysctl_hung_task_warnings = 10;
-
-/*
- * Only do the hung-tasks check on one CPU:
- */
-static int check_cpu __read_mostly = -1;
-
-static void check_hung_task(struct task_struct *t, unsigned long now)
-{
- unsigned long switch_count = t->nvcsw + t->nivcsw;
-
- if (t->flags & PF_FROZEN)
- return;
-
- if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
- t->last_switch_count = switch_count;
- t->last_switch_timestamp = now;
- return;
- }
- if ((long)(now - t->last_switch_timestamp) <
- sysctl_hung_task_timeout_secs)
- return;
- if (!sysctl_hung_task_warnings)
- return;
- sysctl_hung_task_warnings--;
-
- /*
- * Ok, the task did not get scheduled for more than 2 minutes,
- * complain:
- */
- printk(KERN_ERR "INFO: task %s:%d blocked for more than "
- "%ld seconds.\n", t->comm, t->pid,
- sysctl_hung_task_timeout_secs);
- printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
- __debug_show_held_locks(t);
-
- t->last_switch_timestamp = now;
- touch_nmi_watchdog();
-
- if (softlockup_panic)
- panic("softlockup: blocked tasks");
-}
-
-/*
- * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
- * a really long time (120 seconds). If that happens, print out
- * a warning.
- */
-static void check_hung_uninterruptible_tasks(int this_cpu)
-{
- int max_count = sysctl_hung_task_check_count;
- unsigned long now = get_timestamp(this_cpu);
- struct task_struct *g, *t;
-
- /*
- * If the system crashed already then all bets are off,
- * do not report extra hung tasks:
- */
- if (test_taint(TAINT_DIE) || did_panic)
- return;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, t) {
- if (!--max_count)
- goto unlock;
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (t->state == TASK_UNINTERRUPTIBLE)
- check_hung_task(t, now);
- } while_each_thread(g, t);
- unlock:
- read_unlock(&tasklist_lock);
-}
-
-/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- int this_cpu = (long)__bind_cpu;

sched_setscheduler(current, SCHED_FIFO, &param);

@@ -267,11 +181,6 @@ static int watchdog(void *__bind_cpu)
if (kthread_should_stop())
break;

- if (this_cpu == check_cpu) {
- if (sysctl_hung_task_timeout_secs)
- check_hung_uninterruptible_tasks(this_cpu);
- }
-
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -303,20 +212,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- check_cpu = any_online_cpu(cpu_online_map);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- if (hotcpu == check_cpu) {
- cpumask_t temp_cpu_online_map = cpu_online_map;
-
- cpu_clear(hotcpu, temp_cpu_online_map);
- check_cpu = any_online_cpu(temp_cpu_online_map);
- }
- break;
-
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3d56fe7..51a4748 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -771,6 +771,19 @@ static struct ctl_table kern_table[] = {
.extra1 = &neg_one,
.extra2 = &sixty,
},
+#endif
+#ifdef CONFIG_HUNG_TASK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_check_count",
@@ -786,7 +799,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &proc_dohung_task_timeout_secs,
.strategy = &sysctl_intvec,
},
{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e..a52aa38 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -186,6 +186,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
default 1 if BOOTPARAM_SOFTLOCKUP_PANIC

+config DETECT_HUNG_TASK
+ bool "Detect Hung Tasks"
+ depends on DEBUG_KERNEL
+ default y
+ help
+ Say Y here to enable the kernel to detect "hung tasks",
+ which are bugs that cause the task to be stuck in
+ uninterruptible "D" state indefinitiley.
+
+ When a hung task is detected, the kernel will print the
+ current stack trace (which you should report), but the
+ task will stay in uninterruptible state. If lockdep is
+ enabled then all held locks will also be reported. This
+ feature has negligible overhead.
+
+config BOOTPARAM_HUNG_TASK_PANIC
+ bool "Panic (Reboot) On Hung Tasks"
+ depends on DETECT_HUNG_TASK
+ help
+ Say Y here to enable the kernel to panic on "hung tasks",
+ which are bugs that cause the kernel to leave a task stuck
+ in uninterruptible "D" state.
+
+ The panic can be used in combination with panic_timeout,
+ to cause the system to reboot automatically after a
+ hung task has been detected. This feature is useful for
+ high-availability systems that have uptime guarantees and
+ where a hung tasks must be resolved ASAP.
+
+ Say N if unsure.
+
+config BOOTPARAM_HUNG_TASK_PANIC_VALUE
+ int
+ depends on DETECT_HUNG_TASK
+ range 0 1
+ default 0 if !BOOTPARAM_HUNG_TASK_PANIC
+ default 1 if BOOTPARAM_HUNG_TASK_PANIC
+
config SCHED_DEBUG
bool "Collect scheduler debugging info"
depends on DEBUG_KERNEL && PROC_FS
--
1.5.4.5


2009-01-12 09:03:38

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v2] softlockup: decouple hung tasks check from softlockup detection


* Mandeep Singh Baines <[email protected]> wrote:

> Decoupling allows:
>
> * hung tasks check to happen at very low priority
> * hung tasks check and softlockup to be enabled/disabled independently
> at compile and/or run-time
> * individual panic settings to be enabled disabled independently
> at compile and/or run-time
> * softlockup threshold to be reduced without increasing hung tasks
> poll frequency (hung task check is expensive relative to softlock watchdog)
> * hung task check to be zero over-head when disabled at run-time
> ---
> include/linux/sched.h | 14 +++++--
> kernel/Makefile | 1 +
> kernel/softlockup.c | 102 -------------------------------------------------
> kernel/sysctl.c | 15 +++++++-
> lib/Kconfig.debug | 38 ++++++++++++++++++
> 5 files changed, 63 insertions(+), 107 deletions(-)

Looks good in principle, but i suspect your patch submission should
include the new kernel/hung_task.c file too ;-)

Ingo

2009-01-12 14:32:42

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH v2] softlockup: decouple hung tasks check from softlockup detection

On Sun, Jan 11, 2009 at 10:52:13PM -0800, Mandeep Singh Baines wrote:
> Decoupling allows:
>
> * hung tasks check to happen at very low priority
> * hung tasks check and softlockup to be enabled/disabled independently
> at compile and/or run-time
> * individual panic settings to be enabled disabled independently
> at compile and/or run-time
> * softlockup threshold to be reduced without increasing hung tasks
> poll frequency (hung task check is expensive relative to softlock watchdog)
> * hung task check to be zero over-head when disabled at run-time
> ---
> include/linux/sched.h | 14 +++++--
> kernel/Makefile | 1 +
> kernel/softlockup.c | 102 -------------------------------------------------
> kernel/sysctl.c | 15 +++++++-
> lib/Kconfig.debug | 38 ++++++++++++++++++
> 5 files changed, 63 insertions(+), 107 deletions(-)

kernel/hung_task.c missing?

Hannes

2009-01-12 17:58:30

by Mandeep Baines

[permalink] [raw]
Subject: [PATCH v3] softlockup: decouple hung tasks check from softlockup detection

Ingo Molnar ([email protected]) wrote:
> Looks good in principle, but i suspect your patch submission should
> include the new kernel/hung_task.c file too ;-)
>
> Ingo

Doh. Thanks for reviewing. Here is the same patch with hung_task.c included.

---

Decoupling allows:

* hung tasks check to happen at very low priority
* hung tasks check and softlockup to be enabled/disabled independently
at compile and/or run-time
* individual panic settings to be enabled disabled independently
at compile and/or run-time
* softlockup threshold to be reduced without increasing hung tasks
poll frequency (hung task check is expensive relative to softlock watchdog)
* hung task check to be zero over-head when disabled at run-time

Signed-off-by: Mandeep Singh Baines <[email protected]>
---
include/linux/sched.h | 14 +++-
kernel/Makefile | 1 +
kernel/hung_task.c | 198 +++++++++++++++++++++++++++++++++++++++++++++++++
kernel/softlockup.c | 102 -------------------------
kernel/sysctl.c | 15 ++++-
lib/Kconfig.debug | 38 ++++++++++
6 files changed, 261 insertions(+), 107 deletions(-)
create mode 100644 kernel/hung_task.c

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e30d1..9e6a39b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -296,9 +296,6 @@ extern void softlockup_tick(void);
extern void touch_softlockup_watchdog(void);
extern void touch_all_softlockup_watchdogs(void);
extern unsigned int softlockup_panic;
-extern unsigned long sysctl_hung_task_check_count;
-extern unsigned long sysctl_hung_task_timeout_secs;
-extern unsigned long sysctl_hung_task_warnings;
extern int softlockup_thresh;
#else
static inline void softlockup_tick(void)
@@ -315,6 +312,15 @@ static inline void touch_all_softlockup_watchdogs(void)
}
#endif

+#ifdef CONFIG_DETECT_HUNG_TASK
+extern unsigned int sysctl_hung_task_panic;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_warnings;
+extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos);
+#endif

/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
@@ -1207,7 +1213,7 @@ struct task_struct {
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_timestamp;
unsigned long last_switch_count;
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad00..2809bc6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 0000000..ba5a77c
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,198 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+static unsigned long __read_mostly hung_task_poll_jiffies;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+ sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+
+ return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ did_panic = 1;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+ .notifier_call = hung_task_panic,
+};
+
+/*
+ * Returns seconds, approximately. We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(void)
+{
+ int this_cpu = raw_smp_processor_id();
+
+ return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
+}
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ if (t->flags & PF_FROZEN)
+ return;
+
+ if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+ t->last_switch_count = switch_count;
+ t->last_switch_timestamp = now;
+ return;
+ }
+ if ((long)(now - t->last_switch_timestamp) <
+ sysctl_hung_task_timeout_secs)
+ return;
+ if (!sysctl_hung_task_warnings)
+ return;
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+ "%ld seconds.\n", t->comm, t->pid,
+ sysctl_hung_task_timeout_secs);
+ printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ __debug_show_held_locks(t);
+
+ t->last_switch_timestamp = now;
+ touch_nmi_watchdog();
+
+ if (sysctl_hung_task_panic)
+ panic("hung_task: blocked tasks");
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(void)
+{
+ int max_count = sysctl_hung_task_check_count;
+ unsigned long now = get_timestamp();
+ struct task_struct *g, *t;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if (test_taint(TAINT_DIE) || did_panic)
+ return;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (!--max_count)
+ goto unlock;
+ /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+ if (t->state == TASK_UNINTERRUPTIBLE)
+ check_hung_task(t, now);
+ } while_each_thread(g, t);
+ unlock:
+ read_unlock(&tasklist_lock);
+}
+
+static void update_poll_jiffies(void)
+{
+ /* timeout of 0 will disable the watchdog */
+ if (sysctl_hung_task_timeout_secs == 0)
+ hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT;
+ else
+ hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+ if (ret || !write)
+ goto out;
+
+ update_poll_jiffies();
+
+ wake_up_process(watchdog_task);
+
+ out:
+ return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+ set_user_nice(current, 0);
+ update_poll_jiffies();
+
+ for ( ; ; ) {
+ while (schedule_timeout_interruptible(hung_task_poll_jiffies));
+ check_hung_uninterruptible_tasks();
+ }
+
+ return 0;
+}
+
+static int __init hung_task_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+ watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+ return 0;
+}
+
+module_init(hung_task_init);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index dc0b3be..8081b88 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -157,97 +157,11 @@ void softlockup_tick(void)
}

/*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
- * Zero means infinite timeout - no checking done:
- */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
-
-unsigned long __read_mostly sysctl_hung_task_warnings = 10;
-
-/*
- * Only do the hung-tasks check on one CPU:
- */
-static int check_cpu __read_mostly = -1;
-
-static void check_hung_task(struct task_struct *t, unsigned long now)
-{
- unsigned long switch_count = t->nvcsw + t->nivcsw;
-
- if (t->flags & PF_FROZEN)
- return;
-
- if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
- t->last_switch_count = switch_count;
- t->last_switch_timestamp = now;
- return;
- }
- if ((long)(now - t->last_switch_timestamp) <
- sysctl_hung_task_timeout_secs)
- return;
- if (!sysctl_hung_task_warnings)
- return;
- sysctl_hung_task_warnings--;
-
- /*
- * Ok, the task did not get scheduled for more than 2 minutes,
- * complain:
- */
- printk(KERN_ERR "INFO: task %s:%d blocked for more than "
- "%ld seconds.\n", t->comm, t->pid,
- sysctl_hung_task_timeout_secs);
- printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
- __debug_show_held_locks(t);
-
- t->last_switch_timestamp = now;
- touch_nmi_watchdog();
-
- if (softlockup_panic)
- panic("softlockup: blocked tasks");
-}
-
-/*
- * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
- * a really long time (120 seconds). If that happens, print out
- * a warning.
- */
-static void check_hung_uninterruptible_tasks(int this_cpu)
-{
- int max_count = sysctl_hung_task_check_count;
- unsigned long now = get_timestamp(this_cpu);
- struct task_struct *g, *t;
-
- /*
- * If the system crashed already then all bets are off,
- * do not report extra hung tasks:
- */
- if (test_taint(TAINT_DIE) || did_panic)
- return;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, t) {
- if (!--max_count)
- goto unlock;
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (t->state == TASK_UNINTERRUPTIBLE)
- check_hung_task(t, now);
- } while_each_thread(g, t);
- unlock:
- read_unlock(&tasklist_lock);
-}
-
-/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- int this_cpu = (long)__bind_cpu;

sched_setscheduler(current, SCHED_FIFO, &param);

@@ -267,11 +181,6 @@ static int watchdog(void *__bind_cpu)
if (kthread_should_stop())
break;

- if (this_cpu == check_cpu) {
- if (sysctl_hung_task_timeout_secs)
- check_hung_uninterruptible_tasks(this_cpu);
- }
-
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -303,20 +212,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- check_cpu = any_online_cpu(cpu_online_map);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- if (hotcpu == check_cpu) {
- cpumask_t temp_cpu_online_map = cpu_online_map;
-
- cpu_clear(hotcpu, temp_cpu_online_map);
- check_cpu = any_online_cpu(temp_cpu_online_map);
- }
- break;
-
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3d56fe7..51a4748 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -771,6 +771,19 @@ static struct ctl_table kern_table[] = {
.extra1 = &neg_one,
.extra2 = &sixty,
},
+#endif
+#ifdef CONFIG_HUNG_TASK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_check_count",
@@ -786,7 +799,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &proc_dohung_task_timeout_secs,
.strategy = &sysctl_intvec,
},
{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e..a52aa38 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -186,6 +186,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
default 1 if BOOTPARAM_SOFTLOCKUP_PANIC

+config DETECT_HUNG_TASK
+ bool "Detect Hung Tasks"
+ depends on DEBUG_KERNEL
+ default y
+ help
+ Say Y here to enable the kernel to detect "hung tasks",
+ which are bugs that cause the task to be stuck in
+ uninterruptible "D" state indefinitiley.
+
+ When a hung task is detected, the kernel will print the
+ current stack trace (which you should report), but the
+ task will stay in uninterruptible state. If lockdep is
+ enabled then all held locks will also be reported. This
+ feature has negligible overhead.
+
+config BOOTPARAM_HUNG_TASK_PANIC
+ bool "Panic (Reboot) On Hung Tasks"
+ depends on DETECT_HUNG_TASK
+ help
+ Say Y here to enable the kernel to panic on "hung tasks",
+ which are bugs that cause the kernel to leave a task stuck
+ in uninterruptible "D" state.
+
+ The panic can be used in combination with panic_timeout,
+ to cause the system to reboot automatically after a
+ hung task has been detected. This feature is useful for
+ high-availability systems that have uptime guarantees and
+ where a hung tasks must be resolved ASAP.
+
+ Say N if unsure.
+
+config BOOTPARAM_HUNG_TASK_PANIC_VALUE
+ int
+ depends on DETECT_HUNG_TASK
+ range 0 1
+ default 0 if !BOOTPARAM_HUNG_TASK_PANIC
+ default 1 if BOOTPARAM_HUNG_TASK_PANIC
+
config SCHED_DEBUG
bool "Collect scheduler debugging info"
depends on DEBUG_KERNEL && PROC_FS
--
1.5.4.5

2009-01-14 10:54:43

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v3] softlockup: decouple hung tasks check from softlockup detection


* Mandeep Singh Baines <[email protected]> wrote:

> Ingo Molnar ([email protected]) wrote:
> > Looks good in principle, but i suspect your patch submission should
> > include the new kernel/hung_task.c file too ;-)
> >
> > Ingo
>
> Doh. Thanks for reviewing. Here is the same patch with hung_task.c included.

this now conflicts with the softlockup-panic fix you posted - and that fix
has to go first as it's v2.6.29 material. (while the decoupling patch is
2.6.30 material)

So could you please send a patch against tip/master:

http://people.redhat.com/mingo/tip.git/README

Thanks,

Ingo

2009-01-15 19:19:56

by Mandeep Baines

[permalink] [raw]
Subject: [PATCH v4] softlockup: decouple hung tasks check from softlockup detection

Ingo Molnar ([email protected]) wrote:
>
> this now conflicts with the softlockup-panic fix you posted - and that fix
> has to go first as it's v2.6.29 material. (while the decoupling patch is
> 2.6.30 material)
>
> So could you please send a patch against tip/master:
>
> http://people.redhat.com/mingo/tip.git/README
>
> Thanks,
>
> Ingo

No problem. Rebased to tip/master.

---

Decoupling allows:

* hung tasks check to happen at very low priority
* hung tasks check and softlockup to be enabled/disabled independently
at compile and/or run-time
* individual panic settings to be enabled disabled independently
at compile and/or run-time
* softlockup threshold to be reduced without increasing hung tasks
poll frequency (hung task check is expensive relative to softlock watchdog)
* hung task check to be zero over-head when disabled at run-time

Signed-off-by: Mandeep Singh Baines <[email protected]>
---
include/linux/sched.h | 14 +++-
kernel/Makefile | 1 +
kernel/hung_task.c | 198 +++++++++++++++++++++++++++++++++++++++++++++++++
kernel/softlockup.c | 100 -------------------------
kernel/sysctl.c | 15 ++++-
lib/Kconfig.debug | 38 ++++++++++
6 files changed, 261 insertions(+), 105 deletions(-)
create mode 100644 kernel/hung_task.c

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a0b9b6..bda891e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -298,9 +298,6 @@ extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
size_t *lenp, loff_t *ppos);
extern unsigned int softlockup_panic;
-extern unsigned long sysctl_hung_task_check_count;
-extern unsigned long sysctl_hung_task_timeout_secs;
-extern unsigned long sysctl_hung_task_warnings;
extern int softlockup_thresh;
#else
static inline void softlockup_tick(void)
@@ -317,6 +314,15 @@ static inline void touch_all_softlockup_watchdogs(void)
}
#endif

+#ifdef CONFIG_DETECT_HUNG_TASK
+extern unsigned int sysctl_hung_task_panic;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_warnings;
+extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos);
+#endif

/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
@@ -1240,7 +1246,7 @@ struct task_struct {
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_timestamp;
unsigned long last_switch_count;
diff --git a/kernel/Makefile b/kernel/Makefile
index e411592..46cd2b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 0000000..ba5a77c
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,198 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+static unsigned long __read_mostly hung_task_poll_jiffies;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+ CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+ sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+
+ return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ did_panic = 1;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+ .notifier_call = hung_task_panic,
+};
+
+/*
+ * Returns seconds, approximately. We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(void)
+{
+ int this_cpu = raw_smp_processor_id();
+
+ return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
+}
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ if (t->flags & PF_FROZEN)
+ return;
+
+ if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+ t->last_switch_count = switch_count;
+ t->last_switch_timestamp = now;
+ return;
+ }
+ if ((long)(now - t->last_switch_timestamp) <
+ sysctl_hung_task_timeout_secs)
+ return;
+ if (!sysctl_hung_task_warnings)
+ return;
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+ "%ld seconds.\n", t->comm, t->pid,
+ sysctl_hung_task_timeout_secs);
+ printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ __debug_show_held_locks(t);
+
+ t->last_switch_timestamp = now;
+ touch_nmi_watchdog();
+
+ if (sysctl_hung_task_panic)
+ panic("hung_task: blocked tasks");
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(void)
+{
+ int max_count = sysctl_hung_task_check_count;
+ unsigned long now = get_timestamp();
+ struct task_struct *g, *t;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if (test_taint(TAINT_DIE) || did_panic)
+ return;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (!--max_count)
+ goto unlock;
+ /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+ if (t->state == TASK_UNINTERRUPTIBLE)
+ check_hung_task(t, now);
+ } while_each_thread(g, t);
+ unlock:
+ read_unlock(&tasklist_lock);
+}
+
+static void update_poll_jiffies(void)
+{
+ /* timeout of 0 will disable the watchdog */
+ if (sysctl_hung_task_timeout_secs == 0)
+ hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT;
+ else
+ hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+ if (ret || !write)
+ goto out;
+
+ update_poll_jiffies();
+
+ wake_up_process(watchdog_task);
+
+ out:
+ return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+ set_user_nice(current, 0);
+ update_poll_jiffies();
+
+ for ( ; ; ) {
+ while (schedule_timeout_interruptible(hung_task_poll_jiffies));
+ check_hung_uninterruptible_tasks();
+ }
+
+ return 0;
+}
+
+static int __init hung_task_init(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+ watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+ return 0;
+}
+
+module_init(hung_task_init);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 85d5a24..88796c3 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -166,97 +166,11 @@ void softlockup_tick(void)
}

/*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
- * Zero means infinite timeout - no checking done:
- */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
-
-unsigned long __read_mostly sysctl_hung_task_warnings = 10;
-
-/*
- * Only do the hung-tasks check on one CPU:
- */
-static int check_cpu __read_mostly = -1;
-
-static void check_hung_task(struct task_struct *t, unsigned long now)
-{
- unsigned long switch_count = t->nvcsw + t->nivcsw;
-
- if (t->flags & PF_FROZEN)
- return;
-
- if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
- t->last_switch_count = switch_count;
- t->last_switch_timestamp = now;
- return;
- }
- if ((long)(now - t->last_switch_timestamp) <
- sysctl_hung_task_timeout_secs)
- return;
- if (!sysctl_hung_task_warnings)
- return;
- sysctl_hung_task_warnings--;
-
- /*
- * Ok, the task did not get scheduled for more than 2 minutes,
- * complain:
- */
- printk(KERN_ERR "INFO: task %s:%d blocked for more than "
- "%ld seconds.\n", t->comm, t->pid,
- sysctl_hung_task_timeout_secs);
- printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
- " disables this message.\n");
- sched_show_task(t);
- __debug_show_held_locks(t);
-
- t->last_switch_timestamp = now;
- touch_nmi_watchdog();
-
- if (softlockup_panic)
- panic("softlockup: blocked tasks");
-}
-
-/*
- * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
- * a really long time (120 seconds). If that happens, print out
- * a warning.
- */
-static void check_hung_uninterruptible_tasks(int this_cpu)
-{
- int max_count = sysctl_hung_task_check_count;
- unsigned long now = get_timestamp(this_cpu);
- struct task_struct *g, *t;
-
- /*
- * If the system crashed already then all bets are off,
- * do not report extra hung tasks:
- */
- if (test_taint(TAINT_DIE) || did_panic)
- return;
-
- read_lock(&tasklist_lock);
- do_each_thread(g, t) {
- if (!--max_count)
- goto unlock;
- /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
- if (t->state == TASK_UNINTERRUPTIBLE)
- check_hung_task(t, now);
- } while_each_thread(g, t);
- unlock:
- read_unlock(&tasklist_lock);
-}
-
-/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- int this_cpu = (long)__bind_cpu;

sched_setscheduler(current, SCHED_FIFO, &param);

@@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu)
if (kthread_should_stop())
break;

- if (this_cpu == check_cpu) {
- if (sysctl_hung_task_timeout_secs)
- check_hung_uninterruptible_tasks(this_cpu);
- }
-
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
@@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- check_cpu = cpumask_any(cpu_online_mask);
wake_up_process(per_cpu(watchdog_task, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- if (hotcpu == check_cpu) {
- /* Pick any other online cpu. */
- check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
- }
- break;
-
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 80849e0..d7971f6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -806,6 +806,19 @@ static struct ctl_table kern_table[] = {
.extra1 = &neg_one,
.extra2 = &sixty,
},
+#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{
.ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_check_count",
@@ -821,7 +834,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &proc_dohung_task_timeout_secs,
.strategy = &sysctl_intvec,
},
{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 393eb8c..8ccaf97 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -199,6 +199,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
default 1 if BOOTPARAM_SOFTLOCKUP_PANIC

+config DETECT_HUNG_TASK
+ bool "Detect Hung Tasks"
+ depends on DEBUG_KERNEL
+ default y
+ help
+ Say Y here to enable the kernel to detect "hung tasks",
+ which are bugs that cause the task to be stuck in
+ uninterruptible "D" state indefinitiley.
+
+ When a hung task is detected, the kernel will print the
+ current stack trace (which you should report), but the
+ task will stay in uninterruptible state. If lockdep is
+ enabled then all held locks will also be reported. This
+ feature has negligible overhead.
+
+config BOOTPARAM_HUNG_TASK_PANIC
+ bool "Panic (Reboot) On Hung Tasks"
+ depends on DETECT_HUNG_TASK
+ help
+ Say Y here to enable the kernel to panic on "hung tasks",
+ which are bugs that cause the kernel to leave a task stuck
+ in uninterruptible "D" state.
+
+ The panic can be used in combination with panic_timeout,
+ to cause the system to reboot automatically after a
+ hung task has been detected. This feature is useful for
+ high-availability systems that have uptime guarantees and
+ where a hung tasks must be resolved ASAP.
+
+ Say N if unsure.
+
+config BOOTPARAM_HUNG_TASK_PANIC_VALUE
+ int
+ depends on DETECT_HUNG_TASK
+ range 0 1
+ default 0 if !BOOTPARAM_HUNG_TASK_PANIC
+ default 1 if BOOTPARAM_HUNG_TASK_PANIC
+
config SCHED_DEBUG
bool "Collect scheduler debugging info"
depends on DEBUG_KERNEL && PROC_FS
--
1.5.4.5

2009-01-16 13:06:38

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v4] softlockup: decouple hung tasks check from softlockup detection


* Mandeep Singh Baines <[email protected]> wrote:

> Ingo Molnar ([email protected]) wrote:
> >
> > this now conflicts with the softlockup-panic fix you posted - and that fix
> > has to go first as it's v2.6.29 material. (while the decoupling patch is
> > 2.6.30 material)
> >
> > So could you please send a patch against tip/master:
> >
> > http://people.redhat.com/mingo/tip.git/README
> >
> > Thanks,
> >
> > Ingo
>
> No problem. Rebased to tip/master.

applied to tip/core/softlockup - thanks Mandeep!

Ingo

2009-01-16 13:25:29

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH v4] softlockup: decouple hung tasks check from softlockup detection


* Ingo Molnar <[email protected]> wrote:

> > No problem. Rebased to tip/master.
>
> applied to tip/core/softlockup - thanks Mandeep!

doesnt build with !SOFTLOCKUP:

kernel/fork.c:1049: error: 'struct task_struct' has no member named 'last_switch_count'
kernel/fork.c:1050: error: 'struct task_struct' has no member named 'last_switch_timestamp'

Could you send a delta fix patch relative to -v4 please? Thanks,

Ingo

2009-01-16 16:58:36

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH v2] softlockup: decouple hung tasks check from softlockup detection

Hi Mandeep,


2009/1/12 Mandeep Singh Baines <[email protected]>:
> -/*
> - * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
> - * a really long time (120 seconds). If that happens, print out
> - * a warning.
> - */
> -static void check_hung_uninterruptible_tasks(int this_cpu)
> -{
> - int max_count = sysctl_hung_task_check_count;
> - unsigned long now = get_timestamp(this_cpu);
> - struct task_struct *g, *t;
> -
> - /*
> - * If the system crashed already then all bets are off,
> - * do not report extra hung tasks:
> - */
> - if (test_taint(TAINT_DIE) || did_panic)
> - return;
> -
> - read_lock(&tasklist_lock);
> - do_each_thread(g, t) {
> - if (!--max_count)
> - goto unlock;


Instead of having this arbitrary limit of tasks, why not just
lurk the need_resched() and then schedule if it needs too.

I know that sounds a bit racy, because you will have to release the
tasklist_lock and
a lot of things can happen in the task list until you become resched.
But you can do a get_task_struct() on g and t before your thread is
going to sleep and then put them
when it is awaken.
Perhaps some tasks will disappear or be appended in the list before g
and t, but that doesn't really matter:
if they disappear, they didn't lockup, and if they were appended, they
are not enough cold to be analyzed :-)

This way you can drop the arbitrary limit of task number given by the user....

Frederic.


> - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
> - if (t->state == TASK_UNINTERRUPTIBLE)
> - check_hung_task(t, now);
> - } while_each_thread(g, t);
> - unlock:
> - read_unlock(&tasklist_lock);
> -}
> -
> -/*
> * The watchdog thread - runs every second and touches the timestamp.
> */
> static int watchdog(void *__bind_cpu)
> {
> struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
> - int this_cpu = (long)__bind_cpu;
>
> sched_setscheduler(current, SCHED_FIFO, &param);
>
> @@ -267,11 +181,6 @@ static int watchdog(void *__bind_cpu)
> if (kthread_should_stop())
> break;
>
> - if (this_cpu == check_cpu) {
> - if (sysctl_hung_task_timeout_secs)
> - check_hung_uninterruptible_tasks(this_cpu);
> - }
> -
> set_current_state(TASK_INTERRUPTIBLE);
> }
> __set_current_state(TASK_RUNNING);
> @@ -303,20 +212,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
> break;
> case CPU_ONLINE:
> case CPU_ONLINE_FROZEN:
> - check_cpu = any_online_cpu(cpu_online_map);
> wake_up_process(per_cpu(watchdog_task, hotcpu));
> break;
> #ifdef CONFIG_HOTPLUG_CPU
> - case CPU_DOWN_PREPARE:
> - case CPU_DOWN_PREPARE_FROZEN:
> - if (hotcpu == check_cpu) {
> - cpumask_t temp_cpu_online_map = cpu_online_map;
> -
> - cpu_clear(hotcpu, temp_cpu_online_map);
> - check_cpu = any_online_cpu(temp_cpu_online_map);
> - }
> - break;
> -
> case CPU_UP_CANCELED:
> case CPU_UP_CANCELED_FROZEN:
> if (!per_cpu(watchdog_task, hotcpu))
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 3d56fe7..51a4748 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -771,6 +771,19 @@ static struct ctl_table kern_table[] = {
> .extra1 = &neg_one,
> .extra2 = &sixty,
> },
> +#endif
> +#ifdef CONFIG_HUNG_TASK
> + {
> + .ctl_name = CTL_UNNUMBERED,
> + .procname = "hung_task_panic",
> + .data = &sysctl_hung_task_panic,
> + .maxlen = sizeof(int),
> + .mode = 0644,
> + .proc_handler = &proc_dointvec_minmax,
> + .strategy = &sysctl_intvec,
> + .extra1 = &zero,
> + .extra2 = &one,
> + },
> {
> .ctl_name = CTL_UNNUMBERED,
> .procname = "hung_task_check_count",
> @@ -786,7 +799,7 @@ static struct ctl_table kern_table[] = {
> .data = &sysctl_hung_task_timeout_secs,
> .maxlen = sizeof(unsigned long),
> .mode = 0644,
> - .proc_handler = &proc_doulongvec_minmax,
> + .proc_handler = &proc_dohung_task_timeout_secs,
> .strategy = &sysctl_intvec,
> },
> {
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index b0f239e..a52aa38 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -186,6 +186,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
> default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
> default 1 if BOOTPARAM_SOFTLOCKUP_PANIC
>
> +config DETECT_HUNG_TASK
> + bool "Detect Hung Tasks"
> + depends on DEBUG_KERNEL
> + default y
> + help
> + Say Y here to enable the kernel to detect "hung tasks",
> + which are bugs that cause the task to be stuck in
> + uninterruptible "D" state indefinitiley.
> +
> + When a hung task is detected, the kernel will print the
> + current stack trace (which you should report), but the
> + task will stay in uninterruptible state. If lockdep is
> + enabled then all held locks will also be reported. This
> + feature has negligible overhead.
> +
> +config BOOTPARAM_HUNG_TASK_PANIC
> + bool "Panic (Reboot) On Hung Tasks"
> + depends on DETECT_HUNG_TASK
> + help
> + Say Y here to enable the kernel to panic on "hung tasks",
> + which are bugs that cause the kernel to leave a task stuck
> + in uninterruptible "D" state.
> +
> + The panic can be used in combination with panic_timeout,
> + to cause the system to reboot automatically after a
> + hung task has been detected. This feature is useful for
> + high-availability systems that have uptime guarantees and
> + where a hung tasks must be resolved ASAP.
> +
> + Say N if unsure.
> +
> +config BOOTPARAM_HUNG_TASK_PANIC_VALUE
> + int
> + depends on DETECT_HUNG_TASK
> + range 0 1
> + default 0 if !BOOTPARAM_HUNG_TASK_PANIC
> + default 1 if BOOTPARAM_HUNG_TASK_PANIC
> +
> config SCHED_DEBUG
> bool "Collect scheduler debugging info"
> depends on DEBUG_KERNEL && PROC_FS
> --
> 1.5.4.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2009-01-17 04:13:55

by Mandeep Baines

[permalink] [raw]
Subject: Re: [PATCH v2] softlockup: decouple hung tasks check from softlockup detection

Hi Fr?d?ric,

Fr?d?ric Weisbecker ([email protected]) wrote:
> > - read_lock(&tasklist_lock);
> > - do_each_thread(g, t) {
> > - if (!--max_count)
> > - goto unlock;
>
>
> Instead of having this arbitrary limit of tasks, why not just
> lurk the need_resched() and then schedule if it needs too.
>
> I know that sounds a bit racy, because you will have to release the
> tasklist_lock and
> a lot of things can happen in the task list until you become resched.
> But you can do a get_task_struct() on g and t before your thread is
> going to sleep and then put them
> when it is awaken.
> Perhaps some tasks will disappear or be appended in the list before g
> and t, but that doesn't really matter:
> if they disappear, they didn't lockup, and if they were appended, they
> are not enough cold to be analyzed :-)
>
> This way you can drop the arbitrary limit of task number given by the user....
>
> Frederic.
>

Would be nice to remove the limit. But I don't think get_task_struct()
can be used to prevent a task from being unlinked from the task list. It
only prevents the task_struct from being freed. So hung_task could end up
holding a reference to an unlinked task after it returns from schedule().

That doesn't mean what you are suggesting can't be implemented. Just means
that the case of the held task being unlinked needs to be handled.

Regards,
Mandeep

2009-01-17 14:07:47

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH v2] softlockup: decouple hung tasks check from softlockup detection

On Fri, Jan 16, 2009 at 08:13:30PM -0800, Mandeep Singh Baines wrote:
> Hi Fr?d?ric,
>
> Fr?d?ric Weisbecker ([email protected]) wrote:
> > > - read_lock(&tasklist_lock);
> > > - do_each_thread(g, t) {
> > > - if (!--max_count)
> > > - goto unlock;
> >
> >
> > Instead of having this arbitrary limit of tasks, why not just
> > lurk the need_resched() and then schedule if it needs too.
> >
> > I know that sounds a bit racy, because you will have to release the
> > tasklist_lock and
> > a lot of things can happen in the task list until you become resched.
> > But you can do a get_task_struct() on g and t before your thread is
> > going to sleep and then put them
> > when it is awaken.
> > Perhaps some tasks will disappear or be appended in the list before g
> > and t, but that doesn't really matter:
> > if they disappear, they didn't lockup, and if they were appended, they
> > are not enough cold to be analyzed :-)
> >
> > This way you can drop the arbitrary limit of task number given by the user....
> >
> > Frederic.
> >
>
> Would be nice to remove the limit. But I don't think get_task_struct()
> can be used to prevent a task from being unlinked from the task list. It
> only prevents the task_struct from being freed. So hung_task could end up
> holding a reference to an unlinked task after it returns from schedule().
>
> That doesn't mean what you are suggesting can't be implemented. Just means
> that the case of the held task being unlinked needs to be handled.
>
> Regards,
> Mandeep

Hmm, you're right.
Why not testing 1024 tasks, then check need_resched and if you sleep
and the task becomes unlinked (there are few chances) so... that's not
a big deal actually, you will have better chances on the next check :-)

I think that's a bit important since you are more likely to see a
soft-lockup if you have a lot of tasks.