2007-12-01 09:21:24

by Ingo Molnar

[permalink] [raw]
Subject: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

this patch extends the soft-lockup detector to automatically
detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
printed the following way:

------------------>
BUG: task lockup - task prctl:3042 blocked more than 120 seconds!
prctl D fd5e3793 0 3042 2997
f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286
f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000
f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b
Call Trace:
[<c04883a5>] schedule_timeout+0x6d/0x8b
[<c04883d8>] schedule_timeout_uninterruptible+0x15/0x17
[<c0133a76>] msleep+0x10/0x16
[<c0138974>] sys_prctl+0x30/0x1e2
[<c0104c52>] sysenter_past_esp+0x5f/0xa5
=======================
2 locks held by prctl/3042:
#0: (&sb->s_type->i_mutex_key#5){--..}, at: [<c0197d11>] do_fsync+0x38/0x7a
#1: (jbd_handle){--..}, at: [<c01ca3d2>] journal_start+0xc7/0xe9
<------------------

the current default timeout is 120 seconds. Such messages are printed
up to 10 times per bootup. If the system has crashed already then the
messages are not printed.

if lockdep is enabled then all held locks are printed as well.

this feature is a natural extension to the softlockup-detector (kernel
locked up without scheduling) and to the NMI watchdog (kernel locked up
with IRQs disabled).

Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Arjan van de Ven <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/process_32.c | 15 ++++--
include/linux/debug_locks.h | 5 ++
include/linux/sched.h | 10 ++++
kernel/fork.c | 5 ++
kernel/lockdep.c | 14 ++++-
kernel/sched.c | 4 -
kernel/softlockup.c | 104 ++++++++++++++++++++++++++++++++++++++-----
kernel/sysctl.c | 27 +++++++++++
8 files changed, 167 insertions(+), 17 deletions(-)

Index: linux/arch/x86/kernel/process_32.c
===================================================================
--- linux.orig/arch/x86/kernel/process_32.c
+++ linux/arch/x86/kernel/process_32.c
@@ -114,10 +114,19 @@ void default_idle(void)
smp_mb();

local_irq_disable();
- if (!need_resched())
+ if (!need_resched()) {
+ ktime_t t0, t1;
+ u64 t0n, t1n;
+
+ t0 = ktime_get();
+ t0n = ktime_to_ns(t0);
safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
+ local_irq_disable();
+ t1 = ktime_get();
+ t1n = ktime_to_ns(t1);
+ sched_clock_idle_wakeup_event(t1n - t0n);
+ }
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
} else {
/* loop is done by the caller */
Index: linux/include/linux/debug_locks.h
===================================================================
--- linux.orig/include/linux/debug_locks.h
+++ linux/include/linux/debug_locks.h
@@ -47,6 +47,7 @@ struct task_struct;

#ifdef CONFIG_LOCKDEP
extern void debug_show_all_locks(void);
+extern void __debug_show_held_locks(struct task_struct *task);
extern void debug_show_held_locks(struct task_struct *task);
extern void debug_check_no_locks_freed(const void *from, unsigned long len);
extern void debug_check_no_locks_held(struct task_struct *task);
@@ -55,6 +56,10 @@ static inline void debug_show_all_locks(
{
}

+static inline void __debug_show_held_locks(struct task_struct *task)
+{
+}
+
static inline void debug_show_held_locks(struct task_struct *task)
{
}
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -261,12 +261,17 @@ extern void account_process_tick(struct
extern void update_process_times(int user);
extern void scheduler_tick(void);

+extern void sched_show_task(struct task_struct *p);
+
#ifdef CONFIG_DETECT_SOFTLOCKUP
extern void softlockup_tick(void);
extern void spawn_softlockup_task(void);
extern void touch_softlockup_watchdog(void);
extern void touch_all_softlockup_watchdogs(void);
extern int softlockup_thresh;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern long sysctl_hung_task_warnings;
#else
static inline void softlockup_tick(void)
{
@@ -1052,6 +1057,11 @@ struct task_struct {
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+/* hung task detection */
+ unsigned long last_switch_timestamp;
+ unsigned long last_switch_count;
+#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
Index: linux/kernel/fork.c
===================================================================
--- linux.orig/kernel/fork.c
+++ linux/kernel/fork.c
@@ -1059,6 +1059,11 @@ static struct task_struct *copy_process(
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;

+#ifdef CONFIG_DETECT_SOFTLOCKUP
+ p->last_switch_count = 0;
+ p->last_switch_timestamp = 0;
+#endif
+
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
p->wchar = 0; /* I/O counter: bytes written */
Index: linux/kernel/lockdep.c
===================================================================
--- linux.orig/kernel/lockdep.c
+++ linux/kernel/lockdep.c
@@ -3190,14 +3190,24 @@ retry:
}
EXPORT_SYMBOL_GPL(debug_show_all_locks);

-void debug_show_held_locks(struct task_struct *task)
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void __debug_show_held_locks(struct task_struct *task)
{
if (unlikely(!debug_locks)) {
printk("INFO: lockdep is turned off.\n");
return;
}
+ lockdep_print_held_locks(task);
+}
+EXPORT_SYMBOL_GPL(__debug_show_held_locks);
+
+void debug_show_held_locks(struct task_struct *task)
+{
if (task == current)
- lockdep_print_held_locks(task);
+ __debug_show_held_locks(task);
}
EXPORT_SYMBOL_GPL(debug_show_held_locks);

Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -4833,7 +4833,7 @@ out_unlock:

static const char stat_nam[] = "RSDTtZX";

-static void show_task(struct task_struct *p)
+void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
unsigned state;
@@ -4886,7 +4886,7 @@ void show_state_filter(unsigned long sta
*/
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
- show_task(p);
+ sched_show_task(p);
} while_each_thread(g, p);

touch_all_softlockup_watchdogs();
Index: linux/kernel/softlockup.c
===================================================================
--- linux.orig/kernel/softlockup.c
+++ linux/kernel/softlockup.c
@@ -8,6 +8,7 @@
*/
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
@@ -24,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, pri
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);

static int did_panic;
-int softlockup_thresh = 10;
+int softlockup_thresh = 60;

static int
softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block
*/
static unsigned long get_timestamp(int this_cpu)
{
- return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */
+ return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
}

void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)

now = get_timestamp(this_cpu);

- /* Wake up the high-prio watchdog task every second: */
- if (now > (touch_timestamp + 1))
- wake_up_process(per_cpu(watchdog_task, this_cpu));
-
- /* Warn about unreasonable 10+ seconds delays: */
+ /* Warn about unreasonable delays: */
if (now <= (touch_timestamp + softlockup_thresh))
return;

@@ -122,11 +119,86 @@ void softlockup_tick(void)
}

/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long sysctl_hung_task_timeout_secs = 120;
+
+long sysctl_hung_task_warnings = 10;
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+ t->last_switch_count = switch_count;
+ t->last_switch_timestamp = now;
+ return;
+ }
+ if ((long)(now - t->last_switch_timestamp) <
+ sysctl_hung_task_timeout_secs)
+ return;
+ if (sysctl_hung_task_warnings < 0)
+ return;
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ printk(KERN_ERR "BUG: task lockup - task %s:%d blocked more than "
+ "%ld seconds!\n", t->comm, t->pid,
+ sysctl_hung_task_timeout_secs);
+ sched_show_task(t);
+ __debug_show_held_locks(t);
+
+ t->last_switch_timestamp = now;
+ touch_nmi_watchdog();
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(int this_cpu)
+{
+ int max_count = sysctl_hung_task_check_count;
+ unsigned long now = get_timestamp(this_cpu);
+ struct task_struct *g, *t;
+ int checked = 0;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if ((tainted & TAINT_DIE) || did_panic)
+ return;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (!--max_count)
+ break;
+ if (t->state & TASK_UNINTERRUPTIBLE) {
+ check_hung_task(t, now);
+ checked++;
+ }
+ } while_each_thread(g, t);
+
+ read_unlock(&tasklist_lock);
+}
+
+/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ int this_cpu = (int)__bind_cpu, check_cpu;

sched_setscheduler(current, SCHED_FIFO, &param);

@@ -135,13 +207,25 @@ static int watchdog(void *__bind_cpu)

/*
* Run briefly once per second to reset the softlockup timestamp.
- * If this gets delayed for more than 10 seconds then the
+ * If this gets delayed for more than 60 seconds then the
* debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
touch_softlockup_watchdog();
- schedule();
+ msleep_interruptible(10000);
+
+ /*
+ * Only do the hung-tasks check on one CPU:
+ */
+ get_online_cpus();
+ check_cpu = any_online_cpu(cpu_online_map);
+ put_online_cpus();
+
+ if (this_cpu != check_cpu)
+ continue;
+
+ if (sysctl_hung_task_timeout_secs)
+ check_hung_uninterruptible_tasks(this_cpu);
}

return 0;
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c
+++ linux/kernel/sysctl.c
@@ -753,6 +753,33 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
.extra2 = &sixty,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
#endif
#ifdef CONFIG_COMPAT
{


2007-12-01 18:32:32

by David Rientjes

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

The checked auto variable isn't doing anything in
check_hung_uninterruptible_tasks().

Signed-off-by: David Rientjes <[email protected]>
---
kernel/softlockup.c | 5 +----
1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -170,7 +170,6 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
int max_count = sysctl_hung_task_check_count;
unsigned long now = get_timestamp(this_cpu);
struct task_struct *g, *t;
- int checked = 0;

/*
* If the system crashed already then all bets are off,
@@ -183,10 +182,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
do_each_thread(g, t) {
if (!--max_count)
break;
- if (t->state & TASK_UNINTERRUPTIBLE) {
+ if (t->state & TASK_UNINTERRUPTIBLE)
check_hung_task(t, now);
- checked++;
- }
} while_each_thread(g, t);

read_unlock(&tasklist_lock);

2007-12-01 18:34:11

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* David Rientjes <[email protected]> wrote:

> The checked auto variable isn't doing anything in
> check_hung_uninterruptible_tasks().

yeah, i used to print it out in a printk but removed it in the final
version.

Ingo

2007-12-01 18:43:44

by David Rientjes

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Sat, 1 Dec 2007, Ingo Molnar wrote:

> this patch extends the soft-lockup detector to automatically
> detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
> printed the following way:
>

Wouldn't a natural extension of this feature be to mark these hung
TASK_UNINTERRUPTIBLE tasks with a new thread flag such as TIF_HUNG for the
purposes of the OOM killer?

Right now, the OOM killer will become a no-op when any candidate task that
it scans through is found to have the TIF_MEMDIE flag when selecting a
task to kill. So any hung task in this state could cause the OOM killer
to infinitely loop.

If lockdep could set_tsk_thread_flag(g, TIF_HUNG), this could be detected
in the OOM killer and not only could we prevent the infinite looping but
we could also clear TIF_MEMDIE and reduce the increased timeslice that the
OOM killer gives to the tasks it kills.

David

2007-12-01 19:37:25

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* David Rientjes <[email protected]> wrote:

> > this patch extends the soft-lockup detector to automatically detect
> > hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are printed the
> > following way:
>
> Wouldn't a natural extension of this feature be to mark these hung
> TASK_UNINTERRUPTIBLE tasks with a new thread flag such as TIF_HUNG for
> the purposes of the OOM killer?

maybe, but we'd have to see how often this gets triggered. An OOM is
something that could happen in any overloaded system - while a hung task
is likely due to a kernel bug.

Ingo

2007-12-02 00:53:32

by Ingo Oeser

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Saturday 01 December 2007, Ingo Molnar wrote:
> maybe, but we'd have to see how often this gets triggered. An OOM is
> something that could happen in any overloaded system - while a hung task
> is likely due to a kernel bug.

What about a client using hard mounted NFS shares here? That shouldn't be
killed by the OOM killer in that situation, should it?

Am I missing something?


Best Regards

Ingo Oeser

2007-12-02 08:59:22

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Ingo Oeser <[email protected]> wrote:

> On Saturday 01 December 2007, Ingo Molnar wrote:
> > maybe, but we'd have to see how often this gets triggered. An OOM is
> > something that could happen in any overloaded system - while a hung task
> > is likely due to a kernel bug.
>
> What about a client using hard mounted NFS shares here? That shouldn't
> be killed by the OOM killer in that situation, should it?

NFS is a bit weird in this regard - fundamentally everything should be
interruptible (or at least killable). Won't TASK_KILLABLE solve these
problems?

Ingo

2007-12-02 15:55:47

by David Rientjes

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Sun, 2 Dec 2007, Ingo Oeser wrote:

> > maybe, but we'd have to see how often this gets triggered. An OOM is
> > something that could happen in any overloaded system - while a hung task
> > is likely due to a kernel bug.
>
> What about a client using hard mounted NFS shares here? That shouldn't be
> killed by the OOM killer in that situation, should it?
>

That's orthogonal to the point I was making; the problem with the OOM
killer right now is that it can easily enter an infinite loop in out of
memory conditions if the task that it has selected to be killed fails to
exit. This only happens when the task hangs in TASK_UNINTERRUPTIBLE state
and doesn't respond to the SIGKILL that the OOM killer has sent it.

That behavior is a consequence of trying to avoid needlessly killing tasks
by giving already-killed tasks time to exit in subsequent OOM conditions.
During the tasklist scan of eligible tasks to kill, if any task is found
to have access to memory reserves that only the OOM killer can provide
(signified by the TIF_MEMDIE thread flag) and it has not yet died, the OOM
killer becomes a complete no-op.

This happens on occasion and completely deadlocks the system because the
out of memory condition will never be alleviated. With the hang detection
addition to lockdep, it would be easy to correct this situation. I
understand the primary purpose of the patch is to identify potential
kernel bugs that aren't hardware induced, but I think it has relevance to
the OOM killer problem until such time as tasks hanging in
TASK_UNINTERRUPTIBLE state becomes passe.

David

2007-12-02 18:57:44

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

Ingo Molnar <[email protected]> writes:

> this patch extends the soft-lockup detector to automatically
> detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
> printed the following way:

That will likely trigger anytime a hard nfs/cifs mount loses its
server for 120s. To make this work you would need a new
TASK_UNINTERRUPTIBLE_EXTERNAL_EVENT or similar and mark
all the places which depend on those.

I've also seen kernel modules that use semaphores like wait queues.
While that gives slightly funny results (a high load average), it worked
so far, until this patch.

-Andi

2007-12-02 19:00:36

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> Ingo Molnar <[email protected]> writes:
>
> > this patch extends the soft-lockup detector to automatically
> > detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
> > printed the following way:
>
> That will likely trigger anytime a hard nfs/cifs mount loses its
> server for 120s. To make this work you would need a new
> TASK_UNINTERRUPTIBLE_EXTERNAL_EVENT or similar and mark all the places
> which depend on those.

TASK_KILLABLE should be the right solution i think.

> I've also seen kernel modules that use semaphores like wait queues
> While that gives a little funny results (high load average) it worked
> so far, until this patch.

it still works with this patch of course. With a funny results like high
load average _AND_ a clear debug message that tells us (and the user)
where that high load average comes from.

Ingo

2007-12-02 19:43:37

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Sun, 2 Dec 2007 19:59:45 +0100
Ingo Molnar <[email protected]> wrote:

>
> * Andi Kleen <[email protected]> wrote:
>
> > Ingo Molnar <[email protected]> writes:
> >
> > > this patch extends the soft-lockup detector to automatically
> > > detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
> > > printed the following way:
> >
> > That will likely trigger anytime a hard nfs/cifs mount loses its
> > server for 120s. To make this work you would need a new
> > TASK_UNINTERRUPTIBLE_EXTERNAL_EVENT or similar and mark all the
> > places which depend on those.
>
> TASK_KILLABLE should be the right solution i think.

.. and it's even a tool to show where we missed making something
TASK_KILLABLE... anything that triggers from NFS and the like really
ought to be TASK_KILLABLE after all. This patch will point any
omissions out quite nicely without having to do any kind of destructive
testing.


--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2007-12-02 20:09:29

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Arjan van de Ven <[email protected]> wrote:

> > TASK_KILLABLE should be the right solution i think.
>
> .. and it's even a tool to show where we missed making something
> TASK_KILLABLE... anything that triggers from NFS and the like really
> ought to be TASK_KILLABLE after all. This patch will point any
> omissions out quite nicely without having to do any kind of
> destructive testing.

yeah, exactly. Having something "hard blocked" for a long amount of time
is rarely a good thing.

Ingo

2007-12-02 20:10:06

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

> .. and it's even a tool to show where we missed making something
> TASK_KILLABLE... anything that triggers from NFS and the like really
> ought to be TASK_KILLABLE after all. This patch will point any
> omissions out quite nicely without having to do any kind of destructive
> testing.

It would be better to just audit the source for those. Outlawing
something which was previously legal without auditing the source
is bad.

Anyways, i suspect it would just lead to more people disabling
softlockup. I remember during some older stress testing it also
tended to explode regularly, so e.g. SUSE kernel rpms have it disabled.
That patch would probably make it worse.

-Andi

2007-12-02 20:26:33

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> > .. and it's even a tool to show where we missed making something
> > TASK_KILLABLE... anything that triggers from NFS and the like really
> > ought to be TASK_KILLABLE after all. This patch will point any
> > omissions out quite nicely without having to do any kind of
> > destructive testing.
>
> It would be better to just audit the source for those. [...]

that was wishful thinking 10 years ago already, when Linux was 10 times
smaller.

> [...] Outlawing something which was previously legal without auditing
> the source is bad.

to the contrary, being 120+ seconds uninterruptible without a very good
reason is certainly something that was unreasonable (and harmful) for a
long time already - we just never had the mechanism to warn about this
intelligently without crashing the system.

Out of direct experience, 95% of the "too long delay" cases are plain
old bugs. The rest we can (and must!) convert to TASK_KILLABLE or could
annotate if it _really_ needs to be TASK_UNINTERRUPTIBLE.

> Anyways, i suspect it would just lead to more people disabling
> softlockup. I remember during some older stress testing it also tended
> to explode regularly, so e.g. SUSE kernel rpms have it disabled. That
> patch would probably make it worse.

There are no softlockup false positive bugs open at the moment. If you
know about any, then please do not hesitate and report them, i'll be
eager to fix them. The softlockup detector is turned on by default in
Fedora (alongside lockdep in rawhide), and it helped us find countless
number of bugs. You are the first person to suggest that it's somehow
harmful.

Ingo

2007-12-02 20:47:34

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

> Out of direct experience, 95% of the "too long delay" cases are plain
> old bugs. The rest we can (and must!) convert to TASK_KILLABLE or could

I already pointed out a few cases (nfs, cifs, smbfs, ncpfs, afs).
It would be pretty bad to merge this patch without converting them to
TASK_KILLABLE first.

There's also the additional issue that even block devices are often
network or SAN backed these days. Having 120 second delays in there
is quite possible.

So most likely adding this patch and still keeping a robust kernel
would require converting most of these delays to TASK_KILLABLE first.
That would not be a bad thing -- i would often like to kill a
process stuck on a bad block device -- but is likely a lot of work.

> There are no softlockup false positive bugs open at the moment. If you
> know about any, then please do not hesitate and report them, i'll be
> eager to fix them. The softlockup detector is turned on by default in
> Fedora (alongside lockdep in rawhide), and it helped us find countless

That just means nobody runs stress tests on those. e.g. lockdep
tends to explode even on simple stress tests on larger systems because it
tracks all locks in all dynamic objects in memory and towards 6k-10k entries
the graph walks tend to take multiple seconds on some NUMA systems.

-Andi

2007-12-02 21:10:57

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> > Out of direct experience, 95% of the "too long delay" cases are plain
> > old bugs. The rest we can (and must!) convert to TASK_KILLABLE or could
>
> I already pointed out a few cases (nfs, cifs, smbfs, ncpfs, afs). It
> would be pretty bad to merge this patch without converting them to
> TASK_KILLABLE first

which we want to do in 2.6.25 anyway, so i dont see any big problems
here. Also, it costs nothing to just stick it in and see the results,
worst case we'd have to flip around the default. I think this is much
ado about nothing - so far i dont really see any objective basis for
your negative attitude.

> There's also the additional issue that even block devices are often
> network or SAN backed these days. Having 120 second delays in there is
> quite possible.
>
> So most likely adding this patch and still keeping a robust kernel
> would require converting most of these delays to TASK_KILLABLE first.
> That would not be a bad thing -- i would often like to kill a process
> stuck on a bad block device -- but is likely a lot of work.

what if you considered - just for a minute - the possibility of this
debug tool being the thing that actually animates developers to fix such
long delay bugs that have bothered users for almost a decade meanwhile?

Until now users had little direct recourse to get such problems fixed.
(we had sysrq-t, but that included no real metric of how long a task was
blocked, so there was no direct link in the typical case and users had
no real reliable tool to express their frustration about unreasonable
delays.)

Now this changes: they get a "smoking gun" backtrace reported by the
kernel, and blamed on exactly the place that caused that unreasonable
delay. And it's not like the kernel breaks - at most 10 such messages
are reported per bootup.

We increase the delay timeout to say 300 seconds, and if the system is
under extremely high IO load then 120+ might be a reasonable delay, so
it's all tunable and runtime disable-able anyway. So if you _know_ that
you will see and tolerate such long delays, you can tweak it - but i can
tell you with 100% certainty that 99.9% of the typical Linux users do
not characterize such long delays as "correct behavior".

> > There are no softlockup false positive bugs open at the moment. If
> > you know about any, then please do not hesitate and report them,
> > i'll be eager to fix them. The softlockup detector is turned on by
> > default in Fedora (alongside lockdep in rawhide), and it helped us
> > find countless
>
> That just means nobody runs stress tests on those. [...]

that is an all-encompassing blanket assertion that sadly drips of ill
will (which permeates your mails lately). I for example run tons of
stress tests on "those" and of course many others do too. So i dont
really know what to think of your statement :-(

> [...] e.g. lockdep tends to explode even on simple stress tests on
> larger systems because it tracks all locks in all dynamic objects in
> memory and towards 6k-10k entries the graph walks tend to take
> multiple seconds on some NUMA systems.

a bug was fixed in this area - can you still see this with 2.6.24-rc3?

[ But i'd be the first one to point out that lockdep is certainly not
from the cheap tools department, that's why i said above that lockdep
is enabled in Fedora rawhide (i.e. development) kernels. Softlockup
detector is much cheaper and it's default enabled all the time. ]

Ingo

2007-12-02 21:19:36

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Sun, Dec 02, 2007 at 10:10:27PM +0100, Ingo Molnar wrote:
> what if you considered - just for a minute - the possibility of this
> debug tool being the thing that actually animates developers to fix such
> long delay bugs that have bothered users for almost a decade meanwhile?

Throwing frequent debugging messages for non buggy cases will
just lead to people generally ignore softlockups.

I don't think runtime instrumentation is the way to introduce
TASK_KILLABLE in general. The only way there is people going through
the source and identify places where it makes sense.

>
> Until now users had little direct recourse to get such problems fixed.
> (we had sysrq-t, but that included no real metric of how long a task was

Actually task delay accounting can measure this now. iirc someone
had a latencytop based on it already.

> blocked, so there was no direct link in the typical case and users had
> no real reliable tool to express their frustration about unreasonable
> delays.)
>
> Now this changes: they get a "smoking gun" backtrace reported by the
> kernel, and blamed on exactly the place that caused that unreasonable
> delay. And it's not like the kernel breaks - at most 10 such messages
> are reported per bootup.
>
> We increase the delay timeout to say 300 seconds, and if the system is
> under extremely high IO load then 120+ might be a reasonable delay, so
> it's all tunable and runtime disable-able anyway. So if you _know_ that
> you will see and tolerate such long delays, you can tweak it - but i can

This means the user has to see their kernel log fill by such
messages at least once - do a round trip to some mailing list to
explain that it is expected and not a kernel bug - then tweak
some obscure parameters. Doesn't seem like a particular fruitful
procedure to me.

> tell you with 100% certainty that 99.9% of the typical Linux users do
> not characterize such long delays as "correct behavior".

It's about robustness, not the typical case.
Throwing backtraces when something slightly unusual happens is not a robust system.

-Andi

2007-12-02 21:24:36

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> On Sun, Dec 02, 2007 at 10:10:27PM +0100, Ingo Molnar wrote:
> > what if you considered - just for a minute - the possibility of this
> > debug tool being the thing that actually animates developers to fix such
> > long delay bugs that have bothered users for almost a decade meanwhile?
>
> Throwing frequent debugging messages for non buggy cases will just
> lead to people generally ignore softlockups.

do you realize that more than 120 seconds TASK_UNINTERRUPTIBLE _is_
something that most humans consider as "buggy" in the overwhelming
majority of cases, regardless of the reason? Yes, there are and will be
some exceptions, but not nearly as countless as you try to paint it. A
quick test in the next -mm will give us a good idea about the ratio of
false positives.

Ingo

2007-12-02 21:34:19

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

Ingo Molnar <[email protected]> writes:
>
> do you realize that more than 120 seconds TASK_UNINTERRUPTIBLE _is_
> something that most humans consider as "buggy" in the overwhelming
> majority of cases, regardless of the reason? Yes, there are and will be
> some exceptions, but not nearly as countless as you try to paint it. A
> quick test in the next -mm will give us a good idea about the ratio of
> false positives.

That would assume error paths get regularly exercised in -mm.
Doubtful. Most likely we'll only hear about it after it's
out in the wild on some bigger release.

The problem I have with your patch is that it will mess up Linux (in
particular block/network file system) error handling even more than it
already is. In error handling cases such "unusual" things happen
frequently unfortunately.

I used to fight with this with the NMI watchdog on on x86-64 -- it
tended to trigger regularly on SCSI error handlers for example
disabling interrupts too long while handling the error. They
eventually got all fixed, but with that change they will likely
all start throwing nasty messages again.

And usually it is not simply broken code neither but really
doing something difficult.

-Andi

2007-12-02 22:20:17

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Sun, 2 Dec 2007 22:19:25 +0100
Andi Kleen <[email protected]> wrote:

> >
> > Until now users had little direct recourse to get such problems
> > fixed. (we had sysrq-t, but that included no real metric of how
> > long a task was
>
> Actually task delay accounting can measure this now. iirc someone
> had a latencytop based on it already.


I have written a latencytop tool, but it's not based quite on the task
delay accounting (it doesn't provide the right information to make such
a tool). I've not released the tool mostly because I'm not quite happy
about the kernel side yet... but if there's real interest I'll fix it
up soon and release it.

--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2007-12-02 22:21:18

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> > Until now users had little direct recourse to get such problems
> > fixed. (we had sysrq-t, but that included no real metric of how long
> > a task was
>
> Actually task delay accounting can measure this now. iirc someone had
> a latencytop based on it already.

Delay accounting (or the /proc/<PID>/sched fields that i added recently)
only get updated once a task has finished its unreasonably long delay
and has scheduled. So lockups or extremely long delays _wont be
detected_ this way. This is a debugging facility that clearly belongs
into the kernel. Your arguments just make no objective sense.

Ingo

2007-12-02 22:26:07

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> > do you realize that more than 120 seconds TASK_UNINTERRUPTIBLE _is_
> > something that most humans consider as "buggy" in the overwhelming
> > majority of cases, regardless of the reason? Yes, there are and will
> > be some exceptions, but not nearly as countless as you try to paint
> > it. A quick test in the next -mm will give us a good idea about the
> > ratio of false positives.
>
> That would assume error paths get regularly exercised in -mm.
> Doubtful. Most likely we'll only hear about it after it's out in the
> wild on some bigger release.

by that argument we could never include _anything_ in -mm because ...
only some bigger release would exercise error paths?

Your argument makes no objective sense to me - my patch is a
non-intrusive debugging facility that people clearly find useful and
that would increase the quality of kernel bugreporting.

If, contrary to expectation, it decreases kernel bugreporting quality
then we'll disable it quickly - just like we did it with other debugging
facilities that were causing more trouble than good. (such as the stack
unwinder code)

In fact it can already be disabled easily, from user-space, without any
kernel change, by doing:

echo 0 > /proc/sys/kernel/hung_task_timeout_secs

and there you go, no warnings at all. Or you can add this to
/etc/sysctl.conf to disable it permanently:

kernel.hung_task_timeout_secs = 0

or you can disable it in the .config. So i dont see your problem. It's
just like most other debug facilities. (in fact it's more flexible than
most other debug facilities)

Ingo

2007-12-02 22:45:28

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Sun, 2 Dec 2007 21:47:25 +0100
Andi Kleen <[email protected]> wrote:

> > Out of direct experience, 95% of the "too long delay" cases are
> > plain old bugs. The rest we can (and must!) convert to
> > TASK_KILLABLE or could
>
> I already pointed out a few cases (nfs, cifs, smbfs, ncpfs, afs).
> It would be pretty bad to merge this patch without converting them to
> TASK_KILLABLE first

"pretty bad" as in "a few people see warnings in their dmesg" ?
And TASK_KILLABLE is hopefully about to get merged anyway.


We really need to get better diagnostics for the
bad-kernel-behavior-that-is-seen-as-bug cases. If we ever want to get
to the scenario where we have a more or less robust measure of kernel
quality (and we're not all that far off for several cases), one thing
we need keep doing is have the kernel detect bad cases as much as
possible. This patch is a step in the right direction there, by quite a
lot.

I really don't understand what your objection is to this patch... is it
that an enterprise distro can't ship with it on? (Which is fine btw)

>


--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2007-12-03 00:00:19

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

> Delay accounting (or the /proc/<PID>/sched fields that i added recently)
> only get updated once a task has finished its unreasonably long delay
> and has scheduled.

If it is stuck forever then you can just use sysrq-t

If it recovers delay accounting will catch it.

> detected_ this way. This is a debugging facility that clearly belongs
> into the kernel.

My worry is that it will flag various legitimate cases. So far
you seem to try to just hand-wave them away.

-Andi

2007-12-03 00:07:51

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

> We really need to get better diagnostics for the
> bad-kernel-behavior-that-is-seen-as-bug cases. If we ever want to get
> to the scenario where we have a more or less robust measure of kernel
> quality (and we're not all that far off for several cases), one thing

One measure to kernel quality is to recover well from IO errors
(like network problems or broken block devices)

This patch will likely work against that by breaking error paths.

> This patch is a step in the right direction there, by quite a
> lot.
>
> I really don't understand what your objection is to this patch... is it
> that an enterprise distro can't ship with it on? (Which is fine btw)

Any distribution aimed at end users cannot ship with it on.
Most likely not even a standard Linus kernel should really enable
it without warnings.

Also in general I have my doubts that the false positive:real bug
ratio of this warning is well balanced. Just consider the original
example of dead network servers. Even in my relatively small
home network that is a quite common occurrence. This patch
will break that all by throwing random backtraces when this
happens.

-Andi

2007-12-03 01:01:05

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, 3 Dec 2007 01:07:41 +0100
Andi Kleen <[email protected]> wrote:

> > We really need to get better diagnostics for the
> > bad-kernel-behavior-that-is-seen-as-bug cases. If we ever want to
> > get to the scenario where we have a more or less robust measure of
> > kernel quality (and we're not all that far off for several cases),
> > one thing
>
> One measure to kernel quality is to recover well from IO errors
> (like network problems or broken block devices)

yes. and this patch will flag cases that don't (yet) work well

>
> This patch will likely work against that by breaking error paths.

it won't break error paths, it will at most put a warning in the log.
It doesn't kill or otherwise damage the system or process.

>
> > This patch is a step in the right direction there, by quite a
> > lot.
> >
> > I really don't understand what your objection is to this patch...
> > is it that an enterprise distro can't ship with it on? (Which is
> > fine btw)
>
> Any distribution aimed at end users cannot ship with it on.

That's a pretty bold statement; assuming that the TASK_KILLABLE patch
is in, I don't see the problem.

And even if a distro doesn't turn it on, I still don't see a problem;
it's a diagnostics patch that people can turn on (even at runtime) if
they see problems.

> Also in general I have my doubts that the false positive:real bug
> ratio of this warning is well balanced.

I'll just have to disagree with you then; but both of us are making
wild guesses. Only one way to get the real false positive percentage.

> Just consider the original
> example of dead network servers. Even in my relatively small
> home network that that is a quite common occurrence. This patch
> will break that all by throwing random backtraces when this
> happens.

1) with TASK_KILLABLE that shouldn't happen
2) how does "throwing a backtrace" "break" things?


--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2007-12-03 09:55:15

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Sun, Dec 02, 2007 at 04:59:13PM -0800, Arjan van de Ven wrote:
> On Mon, 3 Dec 2007 01:07:41 +0100
> Andi Kleen <[email protected]> wrote:
>
> > > We really need to get better diagnostics for the
> > > bad-kernel-behavior-that-is-seen-as-bug cases. If we ever want to
> > > get to the scenario where we have a more or less robust measure of
> > > kernel quality (and we're not all that far off for several cases),
> > > one thing
> >
> > One measure to kernel quality is to recover well from IO errors
> > (like network problems or broken block devices)
>
> yes. and this patch will flag cases that don't (yet) work well

If the device/server/... takes more than 2 minutes to recover, how does this
imply the error path "does not work well" ? Or is your goal to handle all
possible errors in less than two minutes? [That might be a worthy goal,
but is probably far from easy and likely impossible in some cases]

> > This patch will likely work against that by breaking error paths.
>
> it won't break error paths, it will at most put a warning in the log.
> It doesn't kill or otherwise damage the system or process.

From the user perspective a kernel randomly throwing backtraces is
a broken kernel.

>
> >
> > > This patch is a step in the right direction there, by quite a
> > > lot.
> > >
> > > I really don't understand what your objection is to this patch...
> > > is it that an enterprise distro can't ship with it on? (Which is
> > > fine btw)
> >
> > Any distribution aimed at end users cannot ship with it on.
>
> That's a pretty bold statement; assuming that the TASK_KILLABLE patch
> is in, I don't see the problem.

iirc TASK_KILLABLE fixed NFS only. While that's a good thing there are
unfortunately a lot more subsystems that would need the same treatment.

> > Also in general I have my doubts that the false positive:real bug
> > ratio of this warning is well balanced.
>
> I'll just have to disagree with you then; but both of us are making
> wild guesses. Only one way to get the real false positive percentage.

Yes let's break things first instead of looking at the implications closely.

-Andi

2007-12-03 10:15:47

by Radoslaw Szkodzinski

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, 3 Dec 2007 10:55:01 +0100
Andi Kleen <[email protected]> wrote:

> On Sun, Dec 02, 2007 at 04:59:13PM -0800, Arjan van de Ven wrote:
> > On Mon, 3 Dec 2007 01:07:41 +0100
> > Andi Kleen <[email protected]> wrote:
> >
> > > This patch will likely work against that by breaking error paths.
> >
> > it won't break error paths, it will at most put a warning in the log.
> > It doesn't kill or otherwise damage the system or process.
>
> From the user perspective a kernel randomly throwing backtraces is
> a broken kernel.

Throwing in my 2c:
Kernel waiting 2 minutes on TASK_UNINTERRUPTIBLE is certainly broken.
I wouldn't wait that long for the system to become responsive, I yanked
the power cord already.

Hm, that's already detected with sleep_uninterruptible logic.

A task that's not killable for more than 2 minutes is broken still, but
less so.

> > > > This patch is a step in the right direction there, by quite a
> > > > lot.
> > > >
> > > > I really don't understand what your objection is to this patch...
> > > > is it that an enterprise distro can't ship with it on? (Which is
> > > > fine btw)
> > >
> > > Any distribution aimed at end users cannot ship with it on.
> >
> > That's a pretty bold statement; assuming that the TASK_KILLABLE patch
> > is in, I don't see the problem.
>
> iirc TASK_KILLABLE fixed NFS only. While that's a good thing there are
> unfortunately a lot more subsystems that would need the same treatment.

Yes, that's exactly why the patch is needed - to find the bugs and fix
them. Otherwise you'll have problems finding some places to convert to
TASK_KILLABLE.

CIFS and similar have to be fixed - it tends to lock the app
using it, in unkillable state.

> > > Also in general I have my doubts that the false positive:real bug
> > > ratio of this warning is well balanced.
> >
> > I'll just have to disagree with you then; but both of us are making
> > wild guesses. Only one way to get the real false positive percentage.
>
> Yes let's break things first instead of looking at the implications closely.

Throwing _rare_ stack traces is not breakage. 120s task_uninterruptible
in the usual case (no errors) is already broken - there are no sane
loads that can invoke that IMO.

A stack trace on x subsystem error is not that bad, especially as these
are limited to 10 per session.

Disclaimer: I am not a kernel developer, just a user.


Attachments:
signature.asc (189.00 B)

2007-12-03 10:24:27

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Radoslaw Szkodzinski <[email protected]> wrote:

> > iirc TASK_KILLABLE fixed NFS only. While that's a good thing there
> > are unfortunately a lot more subsystems that would need the same
> > treatment.
>
> Yes, that's exactly why the patch is needed - to find the bugs and fix
> them. Otherwise you'll have problems finding some places to convert to
> TASK_KILLABLE.
>
> CIFS and similar have to be fixed - it tends to lock the app using it,
> in unkillable state.

Amen. I still have to see a single rational argument against this
debugging feature - and tons of arguments were listed in favor of it. So
let's just try and see what happens.

> > Yes let's break things first instead of looking at the implications
> > closely.
>
> Throwing _rare_ stack traces is not breakage. 120s
> task_uninterruptible in the usual case (no errors) is already broken -
> there are no sane loads that can invoke that IMO.
>
> A stack trace on x subsystem error is not that bad, especially as
> these are limited to 10 per session.

we could lower that limit to 1 per bootup - if they become annoying.
There's lots of flexibility in the code. Really, we should have done
this 10 years ago - it would have literally saved me many days of
debugging time combined, and i really have experience in identifying
such bad tasks. (and it would have sped up debugging in countless number
of instances when users were met with an uninterruptible task.)

Ingo

2007-12-03 10:27:25

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

> Kernel waiting 2 minutes on TASK_UNINTERRUPTIBLE is certainly broken.

What should it do when the NFS server doesn't answer anymore or
when the network to the SAN RAID array located a few hundred KM away develops
some hickup? Or just the SCSI driver decides to do lengthy error
recovery -- you could argue that is broken if it takes longer
than 2 minutes, but in practice these things are hard to test
and to fix.

> Yes, that's exactly why the patch is needed - to find the bugs and fix

The way to find that would be to use source auditing, not break
perfectly fine error handling paths. Especially since this at least
traditionally hasn't been considered a bug, but a fundamental design
parameter of network/block/etc. file systems

> CIFS and similar have to be fixed - it tends to lock the app
> using it, in unkillable state.

Actually that's not true. You can umount -f and then kill for at least
NFS and CIFS. Not sure it is true for the other network file systems
though.

You could in theory do TASK_KILLABLE for all block devices too (not
a bad thing; I would love to have it).

But it would be equivalent in work (has to patch all the same places with
similar code) to Suparna's big old fs AIO retry
patchkit that never went forward because everyone was too worried
about excessive code impact. Maybe that has changed, maybe not ...

And even then you would need to check all possible error handling
paths (scsi_error and low level drivers at least) that they all
finish in less than two minutes.

> > > wild guesses. Only one way to get the real false positive percentage.
> >
> > Yes let's break things first instead of looking at the implications closely.
>
> Throwing _rare_ stack traces is not breakage. 120s task_uninterruptible

Sorry that's total bogus. Throwing a stack trace is the kernel
equivalent of sending S.O.S. and worrying the user significantly,
taxing reporting resources etc. and in the interest of saving
everybody trouble it should only do that when it is really
sure it is truly broken.

> in the usual case (no errors) is already broken - there are no sane
> loads that can invoke that IMO.

You are wrong on that.

-Andi

2007-12-03 10:38:50

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> > Kernel waiting 2 minutes on TASK_UNINTERRUPTIBLE is certainly broken.
>
> What should it do when the NFS server doesn't answer anymore or when
> the network to the SAN RAID array located a few hundred KM away
> develops some hickup? [...]

maybe: if the user does a Ctrl-C (or a kill -9), the kernel should try
to honor it, instead of staying there stuck for a very long time
(possibly forever)?

I think you are somehow confusing two issues: this patch in no way
declares that "long waits are bad" - if the user _choses_ to wait for
the NFS server (after phoning IT quickly or whatever), he can wait an
hour. This patch only declares that "long waits _that the user has no
way to stop_ are quite likely bad".

Do you see the important distinction between the two cases? Please
reconsider your position (or re-state it differently), it just makes no
rational sense to me so far.

Ingo

2007-12-03 11:04:22

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, Dec 03, 2007 at 11:38:15AM +0100, Ingo Molnar wrote:
>
> * Andi Kleen <[email protected]> wrote:
>
> > > Kernel waiting 2 minutes on TASK_UNINTERRUPTIBLE is certainly broken.
> >
> > What should it do when the NFS server doesn't answer anymore or when
> > the network to the SAN RAID array located a few hundred KM away
> > develops some hickup? [...]
>
> maybe: if the user does a Ctrl-C (or a kill -9), the kernel should try

You mean NFS intr should be default? Traditionally that was not done,
although that decision dates back to long before Linux to original SunOS.

I was not there but I suspect it was because it is hard to distinguish
between "abort IO" and "abort program". With aborted IO you tend to end up
with a page in page cache that is marked as IO error and will affect
other programs too.

Perhaps that can be cleanly solved -- personally I'm not sure -- but
it is likely not easy otherwise people would have done that a long
time ago.

> to honor it, instead of staying there stuck for a very long time
> (possibly forever)?

Sure everybody hates that (it is like trying to argue against
free video games @), but fixing it properly is quite hard.
I just think it's a bad idea to outlaw it before even attempting
to fix it.

If you consider any of the arguments in the following
paragraph "not rational" please state your objection precisely.
Thanks.

Consider the block case: First a lot of block
IO runs over networks too these days (iSCSI, drbd, nbd, SANs etc.)
so the same considerations as for other network file systems
apply. Networks can have hickups and might take long to recover.
Now implementing TASK_KILLABLE in all block IO paths
there properly is equivalent to implementing EIOCBRETRY aio because
it has to error out in near the same ways in all the same
places. While I would like to see that (and it would probably make syslets
obsolete too ;-) it has been rejected as too difficult in the past.

> I think you are somehow confusing two issues: this patch in no way
> declares that "long waits are bad" - if the user _choses_ to wait for

Throwing a backtrace is the kernel's way to declare something as bad.
The only more clear ways to that I know of would be BUG or panic().

> way to stop_ are quite likely bad".

The user will just see the backtraces and think the kernel
has crashed.

-Andi

2007-12-03 11:59:38

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> On Mon, Dec 03, 2007 at 11:38:15AM +0100, Ingo Molnar wrote:
> >
> > * Andi Kleen <[email protected]> wrote:
> >
> > > > Kernel waiting 2 minutes on TASK_UNINTERRUPTIBLE is certainly broken.
> > >
> > > What should it do when the NFS server doesn't answer anymore or when
> > > the network to the SAN RAID array located a few hundred KM away
> > > develops some hickup? [...]
> >
> > maybe: if the user does a Ctrl-C (or a kill -9), the kernel should
> > try
>
> You mean NFS intr should be default? [...]

no. (that's why i added the '(or a kill -9)' qualification above - if
NFS is mounted noninterruptible then standard signals (such as Ctrl-C)
should not have an interrupting effect.)

> If you consider any of the arguments in the following paragraph "not
> rational" please state your objection precisely. Thanks.
>
> Consider the block case: First a lot of block IO runs over networks
> too these days (iSCSI, drbd, nbd, SANs etc.) so the same
> considerations as for other network file systems apply. Networks can
> have hickups and might take long to recover. Now implementing
> TASK_KILLABLE in all block IO paths there properly is equivalent to
> implementing EIOCBRETRY aio because it has to error out in near the
> same ways in all the same places. While I would like to see that (and
> it would probably make syslets obsolete too ;-) it has been rejected
> as too difficult in the past.

your syslet snide comment aside (which is quite incomprehensible - a
retry based asynchronous IO model is clearly inferior even if it were
implemented everywhere), i do think that most if not all of these
supposedly "difficult to fix" codepaths are just on the backburner out
of lack of a clear blame vector.

"audit thousands of callsites in 8 million lines of code first" is a
nice euphemism for hiding from the blame forever. We had 10 years for it
and it didnt happen. As we've seen it again and again, getting a
non-fatal reminder in the dmesg about the suckage is quite efficient at
getting people to fix crappy solutions, and gives users an exact blame
point of where to start. That will create pressure to fix these
problems.

> > I think you are somehow confusing two issues: this patch in no way
> > declares that "long waits are bad" - if the user _choses_ to wait
> > for
>
> Throwing a backtrace is the kernel's way to declare something as bad.
> The only more clear ways to that I know of would be BUG or panic().

there are various levels of declaring something bad, and you are quite
wrong to suggest that a BUG() would be the only recourse.

> > way to stop_ are quite likely bad".
>
> The user will just see the backtraces and think the kernel has
> crashed.

i've just changed the message to:

INFO: task keventd/5 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message

Ingo

2007-12-03 12:14:15

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, Dec 03, 2007 at 12:59:00PM +0100, Ingo Molnar wrote:
> no. (that's why i added the '(or a kill -9)' qualification above - if
> NFS is mounted noninterruptible then standard signals (such as Ctrl-C)
> should not have an interrupting effect.)

NFS is already interruptible with umount -f (I use that all the time...),
but softlockup won't know that and throw the warning anyways.

> your syslet snide comment aside (which is quite incomprehensible - a

For the record I have no principle problem with syslets, just I do
consider them roughly equivalent in end result to a explicit retry based
AIO implementation.

> retry based asynchronous IO model is clearly inferior even if it were
> implemented everywhere), i do think that most if not all of these
> supposedly "difficult to fix" codepaths are just on the backburner out
> of lack of a clear blame vector.

Hmm. -ENOPARSE. Can you please clarify?

>
> "audit thousands of callsites in 8 million lines of code first" is a
> nice euphemism for hiding from the blame forever. We had 10 years for it

Ok your approach is then to "let's warn about it and hope
it will go away"

> and it didnt happen. As we've seen it again and again, getting a
> non-fatal reminder in the dmesg about the suckage is quite efficient at

It's not universal suckage I would say, but sometimes unavoidable
conditions. Now it is better of course to have these all TASK_KILLABLE,
but then fixing that all in the kernel will probably a long term
project. I'm not arguing against that, just forcing it through
backtraces before even starting all that is probably not the right
strategy to do that.

> getting people to fix crappy solutions, and gives users and exact blame
> point of where to start. That will create pressure to fix these
> problems.

After impacting the user base -- many of these conditions are infrequent
enough that we will likely only see them during real production. Throwing
warnings for lots of known cases is probably ok for a -mm kernel
(where users expect things like that), but not a "release" (be it
Linus release or any kind of end user distribution) imho.

I don't think there is a real alternative to code audit first
(and someone doing all the work of fixing all these first)


>
> > > I think you are somehow confusing two issues: this patch in no way
> > > declares that "long waits are bad" - if the user _choses_ to wait
> > > for
> >
> > Throwing a backtrace is the kernel's way to declare something as bad.
> > The only more clear ways to that I know of would be BUG or panic().
>
> there are various levels of declarig something bad, and you are quite
> wrong to suggest that a BUG() would be the only recourse.

I didn't write that, please reread my sentence..

But we seem to agree that a backtrace is something "declared bad" anyways,
which was my point.


>
> > > way to stop_ are quite likely bad".
> >
> > The user will just see the backtraces and think the kernel has
> > crashed.
>
> i've just changed the message to:
>
> INFO: task keventd/5 blocked for more than 120 seconds.
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message

That's better, but the backtrace is still there isn't it?

Anyways I think I could live with a one-liner warning (if it's
seriously rate limited etc.) and a sysctl to enable the backtraces;
off by default. Or if you prefer that record
the backtrace always in a buffer and make it available somewhere in /proc
or /sys or /debug. Would that work for you?

-Andi

2007-12-03 12:29:09

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> On Mon, Dec 03, 2007 at 12:59:00PM +0100, Ingo Molnar wrote:
> > no. (that's why i added the '(or a kill -9)' qualification above - if
> > NFS is mounted noninterruptible then standard signals (such as Ctrl-C)
> > should not have an interrupting effect.)
>
> NFS is already interruptible with umount -f (I use that all the
> time...), but softlockup won't know that and throw the warning
> anyways.

umount -f is a spectacularly unintelligent solution (it requires the
user to know precisely which path to umount, etc.), TASK_KILLABLE is a
lot more useful.

> > your syslet snide comment aside (which is quite incomprehensible - a
>
> For the record I have no principle problem with syslets, just I do
> consider them roughly equivalent in end result to a explicit retry
> based AIO implementation.

which suggests you have not really understood syslets. Syslets have no
"retry" component, they just process straight through the workflow.
Retry based AIO has a retry component, which - as its name suggests
already - retries operations instead of processing through the workload
intelligently. Depending on how "deep" the context of an operation the
retries might or might not make a noticeable difference in performance,
but it sure is an inferior approach.

> > retry based asynchronous IO model is clearly inferior even if it were
> > implemented everywhere), i do think that most if not all of these
> > supposedly "difficult to fix" codepaths are just on the backburner
> > out of lack of a clear blame vector.
>
> Hmm. -ENOPARSE. Can you please clarify?

which bit was unclear to you? The retry bit i've explained above, lemme
know if there's any other unclarity.

> > "audit thousands of callsites in 8 million lines of code first" is a
> > nice euphemism for hiding from the blame forever. We had 10 years
> > for it
>
> Ok your approach is then to "let's warn about it and hope it will go
> away"

s/hope//, but yes. Surprisingly, this works quite well :-) [as long as
the warnings are not excessively bogus, of course]

and note that this is just a happy side-effect - the primary motivation
is to get warnings about tasks that are uninterruptible forever. (which
is a quite common kernel bug pattern.)

> Anyways I think I could live with it a one liner warning (if it's
> seriously rate limited etc.) and a sysctl to enable the backtraces;
> off by default. Or if you prefer that record the backtrace always in a
> buffer and make it available somewhere in /proc or /sys or /debug.
> Would that work for you?

you are over-designing it way too much - a backtrace is obviously very
helpful and it must be printed by default. There's enough
configurability in it already so that you can turn it off if you want.
(And you said SLES has softlockup turned off already so it shouldnt
affect you anyway.)

Ingo

2007-12-03 12:41:54

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, Dec 03, 2007 at 01:28:33PM +0100, Ingo Molnar wrote:
>
> > On Mon, Dec 03, 2007 at 12:59:00PM +0100, Ingo Molnar wrote:
> > > no. (that's why i added the '(or a kill -9)' qualification above - if
> > > NFS is mounted noninterruptible then standard signals (such as Ctrl-C)
> > > should not have an interrupting effect.)
> >
> > NFS is already interruptible with umount -f (I use that all the
> > time...), but softlockup won't know that and throw the warning
> > anyways.
>
> umount -f is a spectacularly unintelligent solution (it requires the
> user to know precisely which path to umount, etc.),

lsof | grep programname

> TASK_KILLABLE is a lot more useful.

Not sure it is better on all measures.

One problem is how to distinguish again between program abort
(which only affects the program) and IO abort (which leaves
EIO marked pages in the page cache affecting other processes too)
umount -f does this at least.

I didn't think TASK_KILLABLE has solved that cleanly (although
I admit I haven't read the latest patchkit, perhaps that has changed
over the first iteration)

But it also probably doesn't make things much worse than they were before.

>
> > > your syslet snide comment aside (which is quite incomprehensible - a
> >
> > For the record I have no principle problem with syslets, just I do
> > consider them roughly equivalent in end result to a explicit retry
> > based AIO implementation.
>
> which suggests you have not really understood syslets. Syslets have no

That's possible.

> "retry" component, they just process straight through the workflow.
> Retry based AIO has a retry component, which - as its name suggests
> already - retries operations instead of processing through the workload
> intelligently. Depending on how "deep" the context of an operation the
> retries might or might not make a noticeable difference in performance,
> but it sure is an inferior approach.

Not sure what is that less intelligent in retry (you're
refering to more CPU cycles needed?), but I admit I haven't
thought very deeply about that.

>
> > > retry based asynchonous IO model is clearly inferior even if it were
> > > implemented everywhere), i do think that most if not all of these
> > > supposedly "difficult to fix" codepaths are just on the backburner
> > > out of lack of a clear blame vector.
> >
> > Hmm. -ENOPARSE. Can you please clarify?
>
> which bit was unclear to you? The retry bit i've explained above, lemme
> know if there's any other unclarity.

The clear blame vector bit was unclear.

> > > nice euphemism for hiding from the blame forever. We had 10 years
> > > for it
> >
> > Ok your approach is then to "let's warn about it and hope it will go
> > away"
>
> s/hope//, but yes. Surprisingly, this works quite well :-) [as long as
> the warnings are not excessively bogus, of course]

Well i consider a backtrace excessively bogus.

> > Anyways I think I could live with it a one liner warning (if it's
> > seriously rate limited etc.) and a sysctl to enable the backtraces;
> > off by default. Or if you prefer that record the backtrace always in a
> > buffer and make it available somewhere in /proc or /sys or /debug.
> > Would that work for you?
>
> you are over-designing it way too much - a backtrace is obviously very
> helpful and it must be printed by default. There's enough
> configurability in it already so that you can turn it off if you want.

So it will hit everybody first before they can figure out how
to get rid of it? That was the part I was objecting to.

If it is decided to warn about something which is not 100% clear a bug
(and I think I have established this for now -- at least you didn't
object to many of my examples...) then the likely
false positives shouldn't be too obnoxious. Backtraces are unfortunately
obnoxious and always come at a high cost (worried user, linux reputation
as a buggy OS, mailing list bandwidth, support load etc.) and having that
for too many false positives is a bad thing.

> (And you said SLES has softlockup turned off already so it shouldnt
> affect you anyway.)

My objection was not really for SLES, but for general Linux kernel
quality.

-Andi

2007-12-03 13:00:47

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> > you are over-designing it way too much - a backtrace is obviously
> > very helpful and it must be printed by default. There's enough
> > configurability in it already so that you can turn it off if you
> > want.
>
> So it will hit everybody first before they can figure out how to get
> rid of it? That was the part I was objecting too.

you are apparently arguing in a circle. This is a debug mechanism. It
goes the normal upstream acceptance process. I have booted this patch a
few hundred times on a number of boxes and got not a single false
positive so far. While this is in no way an exhaustive test, only more
testing (in -mm, etc.) will tell us more, one way or another. Your
negative feedback about an impending catastrophe has been duly noted
(which vision of yours has not been shared by anyone else in this thread
so far), and is given its due weight. Can we now please move on to a
more productive stage?

Ingo

2007-12-03 13:14:54

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

> negative

I would consider it positive, but ok. If I was negative I would
probably not care and just make always sure to disable SOFTLOCKUP
in the kernels I use.

> feedback about an impending catastrophy has been duly noted

The point was less about an impending catastrophe, but more of a timebomb
ticking until the next widely used release.

> Can we now please move on to a
> more productive stage?

Sure no more theory.

To what value do you want to set the backtrace sysctl by default?

-Andi

2007-12-03 13:41:49

by Radoslaw Szkodzinski

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, 3 Dec 2007 14:29:56 +0100
> * Andi Kleen <[email protected]> wrote:
>
> > > feedback about an impending catastrophy has been duly noted
> >
> > The point was less about an impending catastrophe, but more of a
> > timebomb ticking until the next widely used release.

I think I know why Andi is so much against backtraces: he has some log
filter listening on netconsole mailing him with suspicious log messages.

Apparently, that log filter is much too stupid to discern an
OOPS or OOM from softlockup and other informative backtraces even when
given a clear hint what it is.

Or maybe he's just underestimating user's ability to read. ;)


Attachments:
signature.asc (189.00 B)

2007-12-03 13:48:19

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


I would still appreciate if you could state what default value
you plan to set the backtrace sysctl to in the submitted patch.

-Andi

2007-12-03 13:50:30

by Pekka Enberg

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

Hi,

On Mon, Dec 03, 2007 at 12:59:00PM +0100, Ingo Molnar wrote:
> > "audit thousands of callsites in 8 million lines of code first" is a
> > nice euphemism for hiding from the blame forever. We had 10 years for it

On Dec 3, 2007 2:13 PM, Andi Kleen <[email protected]> wrote:
> Ok your approach is then to "let's warn about it and hope
> it will go away"

It's more like "lets warn about it and fix the problems when we find
some." Btw, how is this different from how the lockdep patches went
in?

Pekka

2007-12-03 13:56:22

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> I would still appreciate if you could state what default value you
> plan to set the backtrace sysctl to in the submitted patch.

there's no "backtrace sysctl" planned for the moment. This "hung tasks"
debugging feature can be disabled/enabled on a wide scale already:

- in the .config

- runtime, temporarily, via:

echo 0 > /proc/sys/kernel/hung_task_timeout_secs

- runtime, permanently, via adding:

kernel.hung_task_timeout_secs = 0

to /etc/sysctl.conf.

the backtrace, as other posters have stated it in this thread too, is
very useful, so i'll not remove it from the printout.

Ingo

2007-12-03 13:58:17

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Pekka Enberg <[email protected]> wrote:

> Hi,
>
> On Mon, Dec 03, 2007 at 12:59:00PM +0100, Ingo Molnar wrote:
> > > "audit thousands of callsites in 8 million lines of code first" is a
> > > nice euphemism for hiding from the blame forever. We had 10 years for it
>
> On Dec 3, 2007 2:13 PM, Andi Kleen <[email protected]> wrote:
> > Ok your approach is then to "let's warn about it and hope
> > it will go away"
>
> It's more like "lets warn about it and fix the problems when we find
> some." Btw, how is this different from how the lockdep patches went
> in?

yeah, it's quite similar. (in fact this feature is expected to have a
false positive rate lower than that of lockdep) The backtraces are
essential as well, they help kernel developers find the bugs.

Ingo

2007-12-03 13:59:47

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Radoslaw Szkodzinski <[email protected]> wrote:

> On Mon, 3 Dec 2007 14:29:56 +0100
> > * Andi Kleen <[email protected]> wrote:
> >
> > > > feedback about an impending catastrophy has been duly noted
> > >
> > > The point was less about an impending catastrophe, but more of a
> > > timebomb ticking until the next widely used release.
>
> I think I know why Andi is so much against backtraces: he has some log
> filter listening on netconsole mailing him with suspicious log
> messages.

Andi, is that true? If yes, why didn't Andi state this concern outright,
instead of pooh-pooh-ing the patch on various other grounds?

Ingo

2007-12-03 14:14:47

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

> It's more like "lets warn about it and fix the problems when we find
> some."

It is already known there are lots of problems. I won't repeat
them because I already wrote too much about them. Feel free
to read back in the thread.

Now if all the known problems are fixed and only some hard to know
hidden ones remain it might make sense to enable (but even then
a little dubious considering how important error handling robustness
is), but not at the current state (at least with the full default
backtraces)

> Btw, how is this different from how the lockdep patches went
> in?

lockdep is clearly a "only enable if you're developing the kernel"
feature (just alone because of its overhead and other problems).
So side effects etc. are expected.

Softlockup previously (before this patch) was a "can be always safely
enabled" feature.

Now Ingo's latest unreleased version with single
line messages might be actually ok if he turns off the backtraces
by default. Unfortunately I wasn't able to find out so far if he
has done that or not, he always cuts away these parts of the emails.

-Andi

2007-12-03 14:15:45

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, Dec 03, 2007 at 02:59:16PM +0100, Ingo Molnar wrote:
> Andi, is that true? If yes, why didnt Andi state this concern outright,
> instead of pooh-pooh-ing the patch on various other grounds?

No of course not. Radoslaw is talking nonsense.

-Andi

2007-12-03 14:17:48

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, Dec 03, 2007 at 02:55:47PM +0100, Ingo Molnar wrote:
>
> * Andi Kleen <[email protected]> wrote:
>
> > I would still appreciate if you could state what default value you
> > plan to set the backtrace sysctl to in the submitted patch.
>
> there's no "backtrace sysctl" planned for the moment. This "hung tasks"

I hope you'll reconsider that at least before submitting
this patch to mainline. For -mm it's probably ok.

> debugging feature can be disabled/enabled on a wide scale already:
>
> - in the .config
>
> - runtime, temporarily, via:
>
> echo 0 > /proc/sys/kernel/hung_task_timeout_secs

That won't address my concerns about already "breaking" (as in
frightening the user etc.) common error handling scenarios by default.

-Andi

2007-12-03 14:20:06

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> Now Ingo's latest unreleased version with single line messages might
> be actually ok if he turns off the backtraces by default.
> Unfortunately I wasn't able to find out so far if he has done that or
> not, he always cuts away these parts of the emails.

Andi, what's this crap supposed to mean?? I of course answered your
question:

http://lkml.org/lkml/2007/12/3/90

below is the latest patch.

Ingo

---------------------->
Subject: softlockup: automatically detect hung TASK_UNINTERRUPTIBLE tasks
From: Ingo Molnar <[email protected]>

this patch extends the soft-lockup detector to automatically
detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
printed the following way:

------------------>
INFO: task prctl:3042 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message
prctl D fd5e3793 0 3042 2997
f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286
f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000
f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b
Call Trace:
[<c04883a5>] schedule_timeout+0x6d/0x8b
[<c04883d8>] schedule_timeout_uninterruptible+0x15/0x17
[<c0133a76>] msleep+0x10/0x16
[<c0138974>] sys_prctl+0x30/0x1e2
[<c0104c52>] sysenter_past_esp+0x5f/0xa5
=======================
2 locks held by prctl/3042:
#0: (&sb->s_type->i_mutex_key#5){--..}, at: [<c0197d11>] do_fsync+0x38/0x7a
#1: (jbd_handle){--..}, at: [<c01ca3d2>] journal_start+0xc7/0xe9
<------------------

the current default timeout is 120 seconds. Such messages are printed
up to 10 times per bootup. If the system has crashed already then the
messages are not printed.

if lockdep is enabled then all held locks are printed as well.

this feature is a natural extension to the softlockup-detector (kernel
locked up without scheduling) and to the NMI watchdog (kernel locked up
with IRQs disabled).

Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Arjan van de Ven <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/process_32.c | 15 +++++-
include/linux/debug_locks.h | 5 ++
include/linux/sched.h | 10 ++++
kernel/fork.c | 5 ++
kernel/lockdep.c | 14 +++++
kernel/sched.c | 4 -
kernel/softlockup.c | 103 ++++++++++++++++++++++++++++++++++++++-----
kernel/sysctl.c | 27 +++++++++++
8 files changed, 166 insertions(+), 17 deletions(-)

Index: linux/arch/x86/kernel/process_32.c
===================================================================
--- linux.orig/arch/x86/kernel/process_32.c
+++ linux/arch/x86/kernel/process_32.c
@@ -114,10 +114,19 @@ void default_idle(void)
smp_mb();

local_irq_disable();
- if (!need_resched())
+ if (!need_resched()) {
+ ktime_t t0, t1;
+ u64 t0n, t1n;
+
+ t0 = ktime_get();
+ t0n = ktime_to_ns(t0);
safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
+ local_irq_disable();
+ t1 = ktime_get();
+ t1n = ktime_to_ns(t1);
+ sched_clock_idle_wakeup_event(t1n - t0n);
+ }
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
} else {
/* loop is done by the caller */
Index: linux/include/linux/debug_locks.h
===================================================================
--- linux.orig/include/linux/debug_locks.h
+++ linux/include/linux/debug_locks.h
@@ -47,6 +47,7 @@ struct task_struct;

#ifdef CONFIG_LOCKDEP
extern void debug_show_all_locks(void);
+extern void __debug_show_held_locks(struct task_struct *task);
extern void debug_show_held_locks(struct task_struct *task);
extern void debug_check_no_locks_freed(const void *from, unsigned long len);
extern void debug_check_no_locks_held(struct task_struct *task);
@@ -55,6 +56,10 @@ static inline void debug_show_all_locks(
{
}

+static inline void __debug_show_held_locks(struct task_struct *task)
+{
+}
+
static inline void debug_show_held_locks(struct task_struct *task)
{
}
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -261,12 +261,17 @@ extern void account_process_tick(struct
extern void update_process_times(int user);
extern void scheduler_tick(void);

+extern void sched_show_task(struct task_struct *p);
+
#ifdef CONFIG_DETECT_SOFTLOCKUP
extern void softlockup_tick(void);
extern void spawn_softlockup_task(void);
extern void touch_softlockup_watchdog(void);
extern void touch_all_softlockup_watchdogs(void);
extern int softlockup_thresh;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern long sysctl_hung_task_warnings;
#else
static inline void softlockup_tick(void)
{
@@ -1052,6 +1057,11 @@ struct task_struct {
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+/* hung task detection */
+ unsigned long last_switch_timestamp;
+ unsigned long last_switch_count;
+#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
Index: linux/kernel/fork.c
===================================================================
--- linux.orig/kernel/fork.c
+++ linux/kernel/fork.c
@@ -1059,6 +1059,11 @@ static struct task_struct *copy_process(
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;

+#ifdef CONFIG_DETECT_SOFTLOCKUP
+ p->last_switch_count = 0;
+ p->last_switch_timestamp = 0;
+#endif
+
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
p->wchar = 0; /* I/O counter: bytes written */
Index: linux/kernel/lockdep.c
===================================================================
--- linux.orig/kernel/lockdep.c
+++ linux/kernel/lockdep.c
@@ -3190,14 +3190,24 @@ retry:
}
EXPORT_SYMBOL_GPL(debug_show_all_locks);

-void debug_show_held_locks(struct task_struct *task)
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void __debug_show_held_locks(struct task_struct *task)
{
if (unlikely(!debug_locks)) {
printk("INFO: lockdep is turned off.\n");
return;
}
+ lockdep_print_held_locks(task);
+}
+EXPORT_SYMBOL_GPL(__debug_show_held_locks);
+
+void debug_show_held_locks(struct task_struct *task)
+{
if (task == current)
- lockdep_print_held_locks(task);
+ __debug_show_held_locks(task);
}
EXPORT_SYMBOL_GPL(debug_show_held_locks);

Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -4833,7 +4833,7 @@ out_unlock:

static const char stat_nam[] = "RSDTtZX";

-static void show_task(struct task_struct *p)
+void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
unsigned state;
@@ -4886,7 +4886,7 @@ void show_state_filter(unsigned long sta
*/
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
- show_task(p);
+ sched_show_task(p);
} while_each_thread(g, p);

touch_all_softlockup_watchdogs();
Index: linux/kernel/softlockup.c
===================================================================
--- linux.orig/kernel/softlockup.c
+++ linux/kernel/softlockup.c
@@ -8,6 +8,7 @@
*/
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
@@ -24,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, pri
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);

static int did_panic;
-int softlockup_thresh = 10;
+int softlockup_thresh = 60;

static int
softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block
*/
static unsigned long get_timestamp(int this_cpu)
{
- return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */
+ return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
}

void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)

now = get_timestamp(this_cpu);

- /* Wake up the high-prio watchdog task every second: */
- if (now > (touch_timestamp + 1))
- wake_up_process(per_cpu(watchdog_task, this_cpu));
-
- /* Warn about unreasonable 10+ seconds delays: */
+ /* Warn about unreasonable delays: */
if (now <= (touch_timestamp + softlockup_thresh))
return;

@@ -122,11 +119,85 @@ void softlockup_tick(void)
}

/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long sysctl_hung_task_timeout_secs = 120;
+
+long sysctl_hung_task_warnings = 10;
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+ t->last_switch_count = switch_count;
+ t->last_switch_timestamp = now;
+ return;
+ }
+ if ((long)(now - t->last_switch_timestamp) <
+ sysctl_hung_task_timeout_secs)
+ return;
+ if (sysctl_hung_task_warnings < 0)
+ return;
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+ "%ld seconds.\n", t->comm, t->pid,
+ sysctl_hung_task_timeout_secs);
+ printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ __debug_show_held_locks(t);
+
+ t->last_switch_timestamp = now;
+ touch_nmi_watchdog();
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(int this_cpu)
+{
+ int max_count = sysctl_hung_task_check_count;
+ unsigned long now = get_timestamp(this_cpu);
+ struct task_struct *g, *t;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if ((tainted & TAINT_DIE) || did_panic)
+ return;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (!--max_count)
+ break;
+ if (t->state & TASK_UNINTERRUPTIBLE)
+ check_hung_task(t, now);
+ } while_each_thread(g, t);
+
+ read_unlock(&tasklist_lock);
+}
+
+/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ int this_cpu = (int)__bind_cpu, check_cpu;

sched_setscheduler(current, SCHED_FIFO, &param);

@@ -135,13 +206,25 @@ static int watchdog(void *__bind_cpu)

/*
* Run briefly once per second to reset the softlockup timestamp.
- * If this gets delayed for more than 10 seconds then the
+ * If this gets delayed for more than 60 seconds then the
* debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
touch_softlockup_watchdog();
- schedule();
+ msleep_interruptible(10000);
+
+ /*
+ * Only do the hung-tasks check on one CPU:
+ */
+ get_online_cpus();
+ check_cpu = any_online_cpu(cpu_online_map);
+ put_online_cpus();
+
+ if (this_cpu != check_cpu)
+ continue;
+
+ if (sysctl_hung_task_timeout_secs)
+ check_hung_uninterruptible_tasks(this_cpu);
}

return 0;
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c
+++ linux/kernel/sysctl.c
@@ -753,6 +753,33 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
.extra2 = &sixty,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
#endif
#ifdef CONFIG_COMPAT
{

2007-12-03 14:33:53

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Andi Kleen <[email protected]> wrote:

> > debugging feature can be disabled/enabled on a wide scale already:
> >
> > - in the .config
> >
> > - runtime, temporarily, via:
> >
> > echo 0 > /proc/sys/kernel/hung_task_timeout_secs
>
> That won't address my concerns about already "breaking" (as in
> frightening the user etc.) common error handling scenarios by default.

wow, now you again use big scary words like "breaking" and
"frightening", which is a nice addition to the ticking timebomb
metaphor.

Andi, sadly you are not listening AT ALL, you are repeating the same
argument again and again that has been disputed and countered. Either
come up with a new argument or please stop wasting everyone's time.

It is undisputable that for most users an unexplained hang is far more
"frightening" than a hang followed by an explanation in the syslog. That
is a _hard fact_, and if you cannot accept that it's really your
cognitive problem not ours...

Ingo

2007-12-03 15:25:38

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, 3 Dec 2007 11:27:15 +0100
Andi Kleen <[email protected]> wrote:

> > Kernel waiting 2 minutes on TASK_UNINTERRUPTIBLE is certainly
> > broken.
>
> What should it do when the NFS server doesn't answer anymore or
> when the network to the SAN RAID array located a few hundred KM away
develops some hiccup? Or just the SCSI driver decides to do lengthy
> error recovery -- you could argue that is broken if it takes longer
> than 2 minutes, but in practice these things are hard to test
> and to fix.
>

the scsi layer will have the IO totally aborted within that time anyway;
the retry timeout for disks is 30 seconds after all.




--
If you want to reach me at my work email, use [email protected]
For development, discussion and tips for power savings,
visit http://www.lesswatts.org

2007-12-03 16:36:36

by Andi Kleen

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

> the scsi layer will have the IO totally aborted within that time anyway;
> the retry timeout for disks is 30 seconds after all.

There are blocking waits who wait for multiple IOs.

Also i think the SCSI driver can tune this anyways and I suspect
iSCSI and friends increase it (?)

-Andi

2007-12-03 17:02:18

by Ray Lee

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Dec 3, 2007 6:17 AM, Andi Kleen <[email protected]> wrote:
> That won't address my concerns about already "breaking" (as in
> frightening the user etc.) common error handling scenarios by default.

Andi, may I respectfully submit that you're not understanding real users here?

Real users either:
- Never look at their syslog
- Or look, and aren't terribly scared by what's in there.

Perhaps I'm wrong, and you can point to a message on lkml where
someone has posted a backtrace with fear in their voice.

2007-12-03 17:58:56

by Andrew Morton

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Mon, 3 Dec 2007 15:19:25 +0100
Ingo Molnar <[email protected]> wrote:

> this patch extends the soft-lockup detector to automatically
> detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
> printed the following way:
>
> ------------------>
> INFO: task prctl:3042 blocked for more than 120 seconds.
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message
> prctl D fd5e3793 0 3042 2997
> f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286
> f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000
> f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b
> Call Trace:
> [<c04883a5>] schedule_timeout+0x6d/0x8b
> [<c04883d8>] schedule_timeout_uninterruptible+0x15/0x17
> [<c0133a76>] msleep+0x10/0x16
> [<c0138974>] sys_prctl+0x30/0x1e2
> [<c0104c52>] sysenter_past_esp+0x5f/0xa5
> =======================
> 2 locks held by prctl/3042:
> #0: (&sb->s_type->i_mutex_key#5){--..}, at: [<c0197d11>] do_fsync+0x38/0x7a
> #1: (jbd_handle){--..}, at: [<c01ca3d2>] journal_start+0xc7/0xe9
> <------------------
>
> the current default timeout is 120 seconds. Such messages are printed
> up to 10 times per bootup. If the system has crashed already then the
> messages are not printed.
>
> if lockdep is enabled then all held locks are printed as well.
>
> this feature is a natural extension to the softlockup-detector (kernel
> locked up without scheduling) and to the NMI watchdog (kernel locked up
> with IRQs disabled).

This feature will save one full reporter-developer round-trip during
investigation of a significant number of bug reports.

It might be more practical if it were to dump the traces for _all_
D-state processes when it fires - basically an auto-triggered sysrq-W.

2007-12-03 18:10:43

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Monday, 3 of December 2007, Andrew Morton wrote:
> On Mon, 3 Dec 2007 15:19:25 +0100
> Ingo Molnar <[email protected]> wrote:
>
> > this patch extends the soft-lockup detector to automatically
> > detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
> > printed the following way:
> >
> > ------------------>
> > INFO: task prctl:3042 blocked for more than 120 seconds.
> > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message
> > prctl D fd5e3793 0 3042 2997
> > f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286
> > f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000
> > f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b
> > Call Trace:
> > [<c04883a5>] schedule_timeout+0x6d/0x8b
> > [<c04883d8>] schedule_timeout_uninterruptible+0x15/0x17
> > [<c0133a76>] msleep+0x10/0x16
> > [<c0138974>] sys_prctl+0x30/0x1e2
> > [<c0104c52>] sysenter_past_esp+0x5f/0xa5
> > =======================
> > 2 locks held by prctl/3042:
> > #0: (&sb->s_type->i_mutex_key#5){--..}, at: [<c0197d11>] do_fsync+0x38/0x7a
> > #1: (jbd_handle){--..}, at: [<c01ca3d2>] journal_start+0xc7/0xe9
> > <------------------
> >
> > the current default timeout is 120 seconds. Such messages are printed
> > up to 10 times per bootup. If the system has crashed already then the
> > messages are not printed.
> >
> > if lockdep is enabled then all held locks are printed as well.
> >
> > this feature is a natural extension to the softlockup-detector (kernel
> > locked up without scheduling) and to the NMI watchdog (kernel locked up
> > with IRQs disabled).
>
> This feature will save one full reporter-developer round-trip during
> investigation of a significant number of bug reports.
>
> It might be more practical if it were to dump the traces for _all_
> D-state processes when it fires - basically an auto-triggered sysrq-W.

Er, it won't play well if that happens when tasks are frozen for suspend.

2007-12-03 19:26:16

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Rafael J. Wysocki <[email protected]> wrote:

> > This feature will save one full reporter-developer round-trip during
> > investigation of a significant number of bug reports.
> >
> > It might be more practical if it were to dump the traces for _all_
> > D-state processes when it fires - basically an auto-triggered
> > sysrq-W.
>
> Er, it won't play well if that happens when tasks are frozen for
> suspend.

right now any suspend attempt times out after 20 seconds:

$ grep TIMEOUT kernel/power/process.c
#define TIMEOUT (20 * HZ)
end_time = jiffies + TIMEOUT;

which should be well before the 120 seconds timeout that the detector
uses. But indeed you are right in that the refrigerator() works via
TASK_UNINTERRUPTIBLE too. I've updated the patch to exclude PF_FROZEN -
attached below. That should solve this particular issue, even if the
timeout is increased to above 20 secs, right?

Ingo

-------------->
Subject: softlockup: automatically detect hung TASK_UNINTERRUPTIBLE tasks
From: Ingo Molnar <[email protected]>

this patch extends the soft-lockup detector to automatically
detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are
printed the following way:

------------------>
INFO: task prctl:3042 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message
prctl D fd5e3793 0 3042 2997
f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286
f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000
f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b
Call Trace:
[<c04883a5>] schedule_timeout+0x6d/0x8b
[<c04883d8>] schedule_timeout_uninterruptible+0x15/0x17
[<c0133a76>] msleep+0x10/0x16
[<c0138974>] sys_prctl+0x30/0x1e2
[<c0104c52>] sysenter_past_esp+0x5f/0xa5
=======================
2 locks held by prctl/3042:
#0: (&sb->s_type->i_mutex_key#5){--..}, at: [<c0197d11>] do_fsync+0x38/0x7a
#1: (jbd_handle){--..}, at: [<c01ca3d2>] journal_start+0xc7/0xe9
<------------------

the current default timeout is 120 seconds. Such messages are printed
up to 10 times per bootup. If the system has crashed already then the
messages are not printed.

if lockdep is enabled then all held locks are printed as well.

this feature is a natural extension to the softlockup-detector (kernel
locked up without scheduling) and to the NMI watchdog (kernel locked up
with IRQs disabled).

Signed-off-by: Ingo Molnar <[email protected]>
Signed-off-by: Arjan van de Ven <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/process_32.c | 15 ++++--
include/linux/debug_locks.h | 5 ++
include/linux/sched.h | 10 ++++
kernel/fork.c | 5 ++
kernel/lockdep.c | 14 ++++-
kernel/sched.c | 4 -
kernel/softlockup.c | 106 ++++++++++++++++++++++++++++++++++++++-----
kernel/sysctl.c | 27 ++++++++++
8 files changed, 169 insertions(+), 17 deletions(-)

Index: linux/arch/x86/kernel/process_32.c
===================================================================
--- linux.orig/arch/x86/kernel/process_32.c
+++ linux/arch/x86/kernel/process_32.c
@@ -114,10 +114,19 @@ void default_idle(void)
smp_mb();

local_irq_disable();
- if (!need_resched())
+ if (!need_resched()) {
+ ktime_t t0, t1;
+ u64 t0n, t1n;
+
+ t0 = ktime_get();
+ t0n = ktime_to_ns(t0);
safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
+ local_irq_disable();
+ t1 = ktime_get();
+ t1n = ktime_to_ns(t1);
+ sched_clock_idle_wakeup_event(t1n - t0n);
+ }
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
} else {
/* loop is done by the caller */
Index: linux/include/linux/debug_locks.h
===================================================================
--- linux.orig/include/linux/debug_locks.h
+++ linux/include/linux/debug_locks.h
@@ -47,6 +47,7 @@ struct task_struct;

#ifdef CONFIG_LOCKDEP
extern void debug_show_all_locks(void);
+extern void __debug_show_held_locks(struct task_struct *task);
extern void debug_show_held_locks(struct task_struct *task);
extern void debug_check_no_locks_freed(const void *from, unsigned long len);
extern void debug_check_no_locks_held(struct task_struct *task);
@@ -55,6 +56,10 @@ static inline void debug_show_all_locks(
{
}

+static inline void __debug_show_held_locks(struct task_struct *task)
+{
+}
+
static inline void debug_show_held_locks(struct task_struct *task)
{
}
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -261,12 +261,17 @@ extern void account_process_tick(struct
extern void update_process_times(int user);
extern void scheduler_tick(void);

+extern void sched_show_task(struct task_struct *p);
+
#ifdef CONFIG_DETECT_SOFTLOCKUP
extern void softlockup_tick(void);
extern void spawn_softlockup_task(void);
extern void touch_softlockup_watchdog(void);
extern void touch_all_softlockup_watchdogs(void);
extern int softlockup_thresh;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern long sysctl_hung_task_warnings;
#else
static inline void softlockup_tick(void)
{
@@ -1052,6 +1057,11 @@ struct task_struct {
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+/* hung task detection */
+ unsigned long last_switch_timestamp;
+ unsigned long last_switch_count;
+#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
Index: linux/kernel/fork.c
===================================================================
--- linux.orig/kernel/fork.c
+++ linux/kernel/fork.c
@@ -1059,6 +1059,11 @@ static struct task_struct *copy_process(
p->prev_utime = cputime_zero;
p->prev_stime = cputime_zero;

+#ifdef CONFIG_DETECT_SOFTLOCKUP
+ p->last_switch_count = 0;
+ p->last_switch_timestamp = 0;
+#endif
+
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
p->wchar = 0; /* I/O counter: bytes written */
Index: linux/kernel/lockdep.c
===================================================================
--- linux.orig/kernel/lockdep.c
+++ linux/kernel/lockdep.c
@@ -3190,14 +3190,24 @@ retry:
}
EXPORT_SYMBOL_GPL(debug_show_all_locks);

-void debug_show_held_locks(struct task_struct *task)
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void __debug_show_held_locks(struct task_struct *task)
{
if (unlikely(!debug_locks)) {
printk("INFO: lockdep is turned off.\n");
return;
}
+ lockdep_print_held_locks(task);
+}
+EXPORT_SYMBOL_GPL(__debug_show_held_locks);
+
+void debug_show_held_locks(struct task_struct *task)
+{
if (task == current)
- lockdep_print_held_locks(task);
+ __debug_show_held_locks(task);
}
EXPORT_SYMBOL_GPL(debug_show_held_locks);

Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -4833,7 +4833,7 @@ out_unlock:

static const char stat_nam[] = "RSDTtZX";

-static void show_task(struct task_struct *p)
+void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
unsigned state;
@@ -4886,7 +4886,7 @@ void show_state_filter(unsigned long sta
*/
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
- show_task(p);
+ sched_show_task(p);
} while_each_thread(g, p);

touch_all_softlockup_watchdogs();
Index: linux/kernel/softlockup.c
===================================================================
--- linux.orig/kernel/softlockup.c
+++ linux/kernel/softlockup.c
@@ -8,6 +8,7 @@
*/
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
@@ -24,7 +25,7 @@ static DEFINE_PER_CPU(unsigned long, pri
static DEFINE_PER_CPU(struct task_struct *, watchdog_task);

static int did_panic;
-int softlockup_thresh = 10;
+int softlockup_thresh = 60;

static int
softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block
*/
static unsigned long get_timestamp(int this_cpu)
{
- return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */
+ return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
}

void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)

now = get_timestamp(this_cpu);

- /* Wake up the high-prio watchdog task every second: */
- if (now > (touch_timestamp + 1))
- wake_up_process(per_cpu(watchdog_task, this_cpu));
-
- /* Warn about unreasonable 10+ seconds delays: */
+ /* Warn about unreasonable delays: */
if (now <= (touch_timestamp + softlockup_thresh))
return;

@@ -122,11 +119,88 @@ void softlockup_tick(void)
}

/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long sysctl_hung_task_timeout_secs = 120;
+
+long sysctl_hung_task_warnings = 10;
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+ unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+ if (t->flags & PF_FROZEN)
+ return;
+
+ if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+ t->last_switch_count = switch_count;
+ t->last_switch_timestamp = now;
+ return;
+ }
+ if ((long)(now - t->last_switch_timestamp) <
+ sysctl_hung_task_timeout_secs)
+ return;
+ if (sysctl_hung_task_warnings < 0)
+ return;
+ sysctl_hung_task_warnings--;
+
+ /*
+ * Ok, the task did not get scheduled for more than 2 minutes,
+ * complain:
+ */
+ printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+ "%ld seconds.\n", t->comm, t->pid,
+ sysctl_hung_task_timeout_secs);
+ printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+ " disables this message.\n");
+ sched_show_task(t);
+ __debug_show_held_locks(t);
+
+ t->last_switch_timestamp = now;
+ touch_nmi_watchdog();
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(int this_cpu)
+{
+ int max_count = sysctl_hung_task_check_count;
+ unsigned long now = get_timestamp(this_cpu);
+ struct task_struct *g, *t;
+
+ /*
+ * If the system crashed already then all bets are off,
+ * do not report extra hung tasks:
+ */
+ if ((tainted & TAINT_DIE) || did_panic)
+ return;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, t) {
+ if (!--max_count)
+ break;
+ if (t->state & TASK_UNINTERRUPTIBLE)
+ check_hung_task(t, now);
+ } while_each_thread(g, t);
+
+ read_unlock(&tasklist_lock);
+}
+
+/*
* The watchdog thread - runs every second and touches the timestamp.
*/
static int watchdog(void *__bind_cpu)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ int this_cpu = (int)__bind_cpu, check_cpu;

sched_setscheduler(current, SCHED_FIFO, &param);

@@ -135,13 +209,25 @@ static int watchdog(void *__bind_cpu)

/*
* Run briefly once per second to reset the softlockup timestamp.
- * If this gets delayed for more than 10 seconds then the
+ * If this gets delayed for more than 60 seconds then the
* debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
touch_softlockup_watchdog();
- schedule();
+ msleep_interruptible(10000);
+
+ /*
+ * Only do the hung-tasks check on one CPU:
+ */
+ get_online_cpus();
+ check_cpu = any_online_cpu(cpu_online_map);
+ put_online_cpus();
+
+ if (this_cpu != check_cpu)
+ continue;
+
+ if (sysctl_hung_task_timeout_secs)
+ check_hung_uninterruptible_tasks(this_cpu);
}

return 0;
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c
+++ linux/kernel/sysctl.c
@@ -753,6 +753,33 @@ static struct ctl_table kern_table[] = {
.extra1 = &one,
.extra2 = &sixty,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ },
#endif
#ifdef CONFIG_COMPAT
{

2007-12-03 22:28:58

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

On Monday, 3 of December 2007, Ingo Molnar wrote:
>
> * Rafael J. Wysocki <[email protected]> wrote:
>
> > > This feature will save one full reporter-developer round-trip during
> > > investigation of a significant number of bug reports.
> > >
> > > It might be more practical if it were to dump the traces for _all_
> > > D-state processes when it fires - basically an auto-triggered
> > > sysrq-W.
> >
> > Er, it won't play well if that happens when tasks are frozen for
> > suspend.
>
> right now any suspend attempt times out after 20 seconds:
>
> $ grep TIMEOUT kernel/power/process.c
> #define TIMEOUT (20 * HZ)
> end_time = jiffies + TIMEOUT;

This is the timeout for freezing tasks, but if the freezing succeeds, they
can stay in TASK_UNINTERRUPTIBLE for quite some more time, especially during
a hibernation (the tasks stay frozen until we power off the system after saving
the image).

> which should be well before the 120 seconds timeout that the detector
> uses. But indeed you are right in that the refrigerator() works via
> TASK_UNINTERRUPTIBLE too. I've updated the patch to exclude PF_FROZEN -
> attached below. That should solve this particular issue, even if the
> timeout is increased to above 20 secs, right?

Sure.

Thanks,
Rafael

2007-12-04 00:05:53

by Ingo Molnar

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks


* Rafael J. Wysocki <[email protected]> wrote:

> > > Er, it won't play well if that happens when tasks are frozen for
> > > suspend.
> >
> > right now any suspend attempt times out after 20 seconds:
> >
> > $ grep TIMEOUT kernel/power/process.c
> > #define TIMEOUT (20 * HZ)
> > end_time = jiffies + TIMEOUT;
>
> This is the timeout for freezing tasks, but if the freezing succeeds,
> they can stay in TASK_UNINTERRUPTIBLE for quite some more time,
> especially during a hibernation (the tasks stay frozen until we power
> off the system after saving the image).

ah, ok. So this was a live bug - thanks for the clarification.

Ingo

2007-12-05 22:31:39

by Mark Lord

[permalink] [raw]
Subject: Re: [feature] automatically detect hung TASK_UNINTERRUPTIBLE tasks

Arjan van de Ven wrote:
> On Mon, 3 Dec 2007 11:27:15 +0100
> Andi Kleen <[email protected]> wrote:
>
>>> Kernel waiting 2 minutes on TASK_UNINTERRUPTIBLE is certainly
>>> broken.
>> What should it do when the NFS server doesn't answer anymore or
>> when the network to the SAN RAID array located a few hundred KM away
>> develops some hickup? Or just the SCSI driver decides to do lengthy
>> error recovery -- you could argue that is broken if it takes longer
>> than 2 minutes, but in practice these things are hard to test
>> and to fix.
>>
>
> the scsi layer will have the IO totally aborted within that time anyway;
> the retry timeout for disks is 30 seconds after all.
..

Mmm.. but the SCSI layer may do many retries, each with 30sec timeouts..