2009-01-25 20:50:41

by Frederic Weisbecker

Subject: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock

This patch adds a counter for writers that enter a rwlock's slow path. For example, it can be
useful for slow background tasks which perform some work on the tasklist, such as the
hung_task detector (kernel/hung_task.c).

It adds an inc/dec pair on the slow path and 4 bytes to each rwlock, so the overhead
is not zero.

Only x86 is supported for now; writers_spinning_lock() will return 0 on other archs (which
is perhaps not a good idea).
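
For illustration, a tasklist reader like the hung_task detector could then back off as
soon as a writer shows up. A rough sketch only (writers_spinning_lock() is the helper
added below; the loop body and check_one_task() are hypothetical):

        struct task_struct *g, *t;

        read_lock(&tasklist_lock);
        do_each_thread(g, t) {
                /* A writer is spinning on tasklist_lock: stop early and let it in. */
                if (writers_spinning_lock(&tasklist_lock))
                        goto unlock;
                check_one_task(t);      /* whatever work the reader does */
        } while_each_thread(g, t);
unlock:
        read_unlock(&tasklist_lock);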

Comments?

Signed-off-by: Frederic Weisbecker <[email protected]>
---
arch/Kconfig | 4 ++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/spinlock.h | 5 +++++
arch/x86/include/asm/spinlock_types.h | 1 +
arch/x86/lib/rwlock_64.S | 10 +++++++---
include/linux/spinlock.h | 7 +++++++
6 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 550dab2..86c22e0 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -106,3 +106,7 @@ config HAVE_CLK
The <linux/clk.h> calls support software clock gating and
thus are a key power management tool on many systems.

+config HAVE_NB_WRITERS_SPINNING_LOCK
+ bool
+
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 967e114..0aeae17 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -40,6 +40,7 @@ config X86
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select USER_STACKTRACE_SUPPORT
+ select HAVE_NB_WRITERS_SPINNING_LOCK

config ARCH_DEFCONFIG
string
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 139b424..d9ee21b 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -283,6 +283,11 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock)
return 0;
}

+static inline int __raw_writers_spinning_lock(raw_rwlock_t *lock)
+{
+ return lock->spinning_writers;
+}
+
static inline void __raw_read_unlock(raw_rwlock_t *rw)
{
asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 845f81c..163e6de 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -13,6 +13,7 @@ typedef struct raw_spinlock {

typedef struct {
unsigned int lock;
+ unsigned int spinning_writers;
} raw_rwlock_t;

#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
index 05ea55f..9589b74 100644
--- a/arch/x86/lib/rwlock_64.S
+++ b/arch/x86/lib/rwlock_64.S
@@ -9,14 +9,18 @@
ENTRY(__write_lock_failed)
CFI_STARTPROC
LOCK_PREFIX
+ incl 4(%rdi) /* add ourselves as a spinning writer */
+1: LOCK_PREFIX
addl $RW_LOCK_BIAS,(%rdi)
-1: rep
+2: rep
nop
cmpl $RW_LOCK_BIAS,(%rdi)
- jne 1b
+ jne 2b
LOCK_PREFIX
subl $RW_LOCK_BIAS,(%rdi)
- jnz __write_lock_failed
+ jnz 1b
+ LOCK_PREFIX
+ decl 4(%rdi) /* got the write lock: drop ourselves from the spinning count */
ret
CFI_ENDPROC
END(__write_lock_failed)
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index e0c0fcc..8ce22af 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -121,6 +121,13 @@ do { \

#define spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock)

+#ifdef CONFIG_HAVE_NB_WRITERS_SPINNING_LOCK
+#define writers_spinning_lock(rwlock) \
+ __raw_writers_spinning_lock(&(rwlock)->raw_lock)
+#else
+#define writers_spinning_lock(rwlock) 0
+#endif
+
#ifdef CONFIG_GENERIC_LOCKBREAK
#define spin_is_contended(lock) ((lock)->break_lock)
#else
--
1.6.1


2009-01-26 13:32:32

by Ingo Molnar

Subject: Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock


* Frederic Weisbecker <[email protected]> wrote:

> This patch adds a counter for writers that enter a rwlock slow path. For
> example it can be useful for slow background tasks which perform some
> jobs on the tasklist, such as the hung_task detector
> (kernel/hung_task.c).
>
> It adds a inc/dec pair on the slow path and 4 bytes for each rwlocks, so
> the overhead is not null.
>
> Only x86 is supported for now, writers_spinning_lock() will return 0 on
> other archs (which is perhaps not a good idea).
>
> Comments?

hm, it increases the rwlock data type:

> diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
> index 845f81c..163e6de 100644
> --- a/arch/x86/include/asm/spinlock_types.h
> +++ b/arch/x86/include/asm/spinlock_types.h
> @@ -13,6 +13,7 @@ typedef struct raw_spinlock {
>
> typedef struct {
> unsigned int lock;
> + unsigned int spinning_writers;
> } raw_rwlock_t;

that's generally not done lightly. Performance figures for a relevant
workload are obligatory in this case - proving that it's worth the size
bloat.

Ingo

2009-01-26 13:48:57

by Peter Zijlstra

Subject: Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock

On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
> This patch adds a counter for writers that enter a rwlock slow path.
> For example it can be useful for slow background tasks which perform some jobs
> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
>
> It adds a inc/dec pair on the slow path and 4 bytes for each rwlocks, so the overhead
> is not null.
>
> Only x86 is supported for now, writers_spinning_lock() will return 0 on other archs (which
> is perhaps not a good idea).
>
> Comments?

_why_ ?


2009-01-26 15:25:58

by Frederic Weisbecker

Subject: Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock

2009/1/26 Peter Zijlstra <[email protected]>:
> On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
>> This patch adds a counter for writers that enter a rwlock slow path.
>> For example it can be useful for slow background tasks which perform some jobs
>> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
>>
>> It adds a inc/dec pair on the slow path and 4 bytes for each rwlocks, so the overhead
>> is not null.
>>
>> Only x86 is supported for now, writers_spinning_lock() will return 0 on other archs (which
>> is perhaps not a good idea).
>>
>> Comments?
>
> _why_ ?

The hung task detector runs a periodic loop through the task list, and
currently it doesn't check more than an arbitrary threshold of tasks, so as
not to hold the tasklist lock for too long.

So we thought about a way to detect whether some writers are waiting for the
lock, and if so, release the lock, schedule and retry.

I'm not sure this is something easy to measure, since this is more about
small latency gains (if any).
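
In code, the idea is roughly the following (a hypothetical sketch using the
writers_spinning_lock() helper from the RFC patch; now and timeout are as in
check_hung_uninterruptible_tasks(), and as discussed further down the thread,
g and t may still have been unhashed by the time the lock is re-taken, so a
real implementation has to handle that):

        read_lock(&tasklist_lock);
        do_each_thread(g, t) {
                if (writers_spinning_lock(&tasklist_lock)) {
                        /* Pin our position in the list before dropping the lock. */
                        get_task_struct(g);
                        get_task_struct(t);
                        read_unlock(&tasklist_lock);
                        cond_resched();         /* give the writer (and others) a chance */
                        read_lock(&tasklist_lock);
                        put_task_struct(t);
                        put_task_struct(g);
                }
                if (t->state == TASK_UNINTERRUPTIBLE)
                        check_hung_task(t, now, timeout);
        } while_each_thread(g, t);
        read_unlock(&tasklist_lock);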

2009-01-26 15:37:50

by Peter Zijlstra

Subject: Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock

On Mon, 2009-01-26 at 16:25 +0100, Frédéric Weisbecker wrote:
> 2009/1/26 Peter Zijlstra <[email protected]>:
> > On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
> >> This patch adds a counter for writers that enter a rwlock slow path.
> >> For example it can be useful for slow background tasks which perform some jobs
> >> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
> >>
> >> It adds a inc/dec pair on the slow path and 4 bytes for each rwlocks, so the overhead
> >> is not null.
> >>
> >> Only x86 is supported for now, writers_spinning_lock() will return 0 on other archs (which
> >> is perhaps not a good idea).
> >>
> >> Comments?
> >
> > _why_ ?
>
> The hung task detector runs a periodic loop through the task_list, and
> currently it doesn't run
> over an arbitrary threshold of tasks to not hold the task_list lock
> for too long.
>
> So we thought about a way to detect if there are some writers waiting
> for the lock, and if so, release
> the lock, schedule and retry.

Ah, if it can do that, then it can also use RCU, no? Only users who
really have to hold off new tasks need the read-task_lock. The rest can
use RCU.

2009-01-26 16:04:45

by Frederic Weisbecker

Subject: Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock

2009/1/26 Peter Zijlstra <[email protected]>:
> On Mon, 2009-01-26 at 16:25 +0100, Frédéric Weisbecker wrote:
>> 2009/1/26 Peter Zijlstra <[email protected]>:
>> > On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
>> >> This patch adds a counter for writers that enter a rwlock slow path.
>> >> For example it can be useful for slow background tasks which perform some jobs
>> >> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
>> >>
>> >> It adds a inc/dec pair on the slow path and 4 bytes for each rwlocks, so the overhead
>> >> is not null.
>> >>
>> >> Only x86 is supported for now, writers_spinning_lock() will return 0 on other archs (which
>> >> is perhaps not a good idea).
>> >>
>> >> Comments?
>> >
>> > _why_ ?
>>
>> The hung task detector runs a periodic loop through the task_list, and
>> currently it doesn't run
>> over an arbitrary threshold of tasks to not hold the task_list lock
>> for too long.
>>
>> So we thought about a way to detect if there are some writers waiting
>> for the lock, and if so, release
>> the lock, schedule and retry.
>
> Ah, if it can do that, then it can also use RCU, no? Only users who
> really have to hold off new tasks need the read-task_lock. The rest can
> use RCU.


Really?
That sounds like good news. Mandeep, what do you think?

2009-01-26 17:36:22

by Mandeep Baines

Subject: Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock

Hi Frédéric,

Unfortunately, this can't be done for hung_task. It writes to the
task_struct here:

static void check_hung_task(struct task_struct *t, unsigned long now,
                            unsigned long timeout)
{
        unsigned long switch_count = t->nvcsw + t->nivcsw;

        if (t->flags & PF_FROZEN)
                return;

        if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
                t->last_switch_count = switch_count;
                t->last_switch_timestamp = now;
                return;
        }

It is able to get away with using only a read_lock because no one else
reads or writes to these fields.

Regards,
Mandeep

On Mon, Jan 26, 2009 at 8:04 AM, Frédéric Weisbecker <[email protected]> wrote:
> 2009/1/26 Peter Zijlstra <[email protected]>:
>> On Mon, 2009-01-26 at 16:25 +0100, Frédéric Weisbecker wrote:
>>> 2009/1/26 Peter Zijlstra <[email protected]>:
>>> > On Sun, 2009-01-25 at 12:50 -0800, Frederic Weisbecker wrote:
>>> >> This patch adds a counter for writers that enter a rwlock slow path.
>>> >> For example it can be useful for slow background tasks which perform some jobs
>>> >> on the tasklist, such as the hung_task detector (kernel/hung_task.c).
>>> >>
>>> >> It adds a inc/dec pair on the slow path and 4 bytes for each rwlocks, so the overhead
>>> >> is not null.
>>> >>
>>> >> Only x86 is supported for now, writers_spinning_lock() will return 0 on other archs (which
>>> >> is perhaps not a good idea).
>>> >>
>>> >> Comments?
>>> >
>>> > _why_ ?
>>>
>>> The hung task detector runs a periodic loop through the task_list, and
>>> currently it doesn't run
>>> over an arbitrary threshold of tasks to not hold the task_list lock
>>> for too long.
>>>
>>> So we thought about a way to detect if there are some writers waiting
>>> for the lock, and if so, release
>>> the lock, schedule and retry.
>>
>> Ah, if it can do that, then it can also use RCU, no? Only users who
>> really have to hold off new tasks need the read-task_lock. The rest can
>> use RCU.
>
>
> Really?
> That sounds like good news. Mandeep, what do you think?
>

2009-01-26 17:42:27

by Peter Zijlstra

Subject: Re: [RFC][PATCH 2/2] add a counter for writers spinning on a rwlock

On Mon, 2009-01-26 at 09:36 -0800, Mandeep Baines wrote:

> Unfortunately, this can't be done for hung_task. It writes to the
> task_struct here:

Don't top post!

> static void check_hung_task(struct task_struct *t, unsigned long now,
> unsigned long timeout)
> {
> unsigned long switch_count = t->nvcsw + t->nivcsw;
>
> if (t->flags & PF_FROZEN)
> return;
>
> if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
> t->last_switch_count = switch_count;
> t->last_switch_timestamp = now;
> return;
> }
>
> It is able to get away with using only a read_lock because no one else
> reads or writes to these fields.

How would RCU be different here?

2009-01-27 00:31:23

by Mandeep Baines

Subject: [PATCH v4] softlockup: remove hung_task_check_count

Peter Zijlstra ([email protected]) wrote:
> On Mon, 2009-01-26 at 09:36 -0800, Mandeep Baines wrote:
>
> > Unfortunately, this can't be done for hung_task. It writes to the
> > task_struct here:
>
> Don't top post!
>
> > static void check_hung_task(struct task_struct *t, unsigned long now,
> > unsigned long timeout)
> > {
> > unsigned long switch_count = t->nvcsw + t->nivcsw;
> >
> > if (t->flags & PF_FROZEN)
> > return;
> >
> > if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
> > t->last_switch_count = switch_count;
> > t->last_switch_timestamp = now;
> > return;
> > }
> >
> > It is able to get away with using only a read_lock because no one else
> > reads or writes to these fields.
>
> How would RCU be different here?
>

My bad, RCU wouldn't be any different. I misunderstood how RCU works. Just
spent the morning reading the LWN 3-part series on RCU and I think I'm able to
grok it now;)

Below is a patch to hung_task which removes the hung_task_check_count and
converts the read_locks to RCU.

Thanks Frédéric and Peter!

---
To avoid holding the tasklist lock too long, hung_task_check_count was used
as an upper bound on the number of tasks that are checked by hung_task.
This patch removes the hung_task_check_count sysctl.

Instead of checking a limited number of tasks, all tasks are checked. To
avoid holding the CPU for too long, need_resched() is checked often. To
avoid blocking out writers, the read_lock has been converted to an
rcu_read_lock().

It is safe to convert to an rcu_read_lock() because the tasks and thread_group
lists are both protected by list_*_rcu() operations. The worst that can
happen is that hung_task will update the last_switch_timestamp field of a DEAD
task.

The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested
the use of RCU.

Signed-off-by: Mandeep Singh Baines <[email protected]>
---
include/linux/sched.h | 1 -
kernel/hung_task.c | 12 +++---------
kernel/sysctl.c | 9 ---------
3 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f2f94d5..278121c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -315,7 +315,6 @@ static inline void touch_all_softlockup_watchdogs(void)

#ifdef CONFIG_DETECT_HUNG_TASK
extern unsigned int sysctl_hung_task_panic;
-extern unsigned long sysctl_hung_task_check_count;
extern unsigned long sysctl_hung_task_timeout_secs;
extern unsigned long sysctl_hung_task_warnings;
extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba8ccd4..7d67350 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -17,11 +17,6 @@
#include <linux/sysctl.h>

/*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
@@ -116,7 +111,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
*/
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
- int max_count = sysctl_hung_task_check_count;
unsigned long now = get_timestamp();
struct task_struct *g, *t;

@@ -127,16 +121,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (test_taint(TAINT_DIE) || did_panic)
return;

- read_lock(&tasklist_lock);
+ rcu_read_lock();
do_each_thread(g, t) {
- if (!--max_count)
+ if (need_resched())
goto unlock;
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, now, timeout);
} while_each_thread(g, t);
unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
}

static void update_poll_jiffies(void)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2481ed3..16526a2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -820,15 +820,6 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "hung_task_check_count",
- .data = &sysctl_hung_task_check_count,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
- .strategy = &sysctl_intvec,
- },
- {
- .ctl_name = CTL_UNNUMBERED,
.procname = "hung_task_timeout_secs",
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
--
1.5.4.5

2009-01-27 09:27:26

by Frederic Weisbecker

Subject: Re: [PATCH v4] softlockup: remove hung_task_check_count

On Mon, Jan 26, 2009 at 04:30:55PM -0800, Mandeep Singh Baines wrote:
> Peter Zijlstra ([email protected]) wrote:
> > On Mon, 2009-01-26 at 09:36 -0800, Mandeep Baines wrote:
> >
> > > Unfortunately, this can't be done for hung_task. It writes to the
> > > task_struct here:
> >
> > Don't top post!
> >
> > > static void check_hung_task(struct task_struct *t, unsigned long now,
> > > unsigned long timeout)
> > > {
> > > unsigned long switch_count = t->nvcsw + t->nivcsw;
> > >
> > > if (t->flags & PF_FROZEN)
> > > return;
> > >
> > > if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
> > > t->last_switch_count = switch_count;
> > > t->last_switch_timestamp = now;
> > > return;
> > > }
> > >
> > > It is able to get away with using only a read_lock because no one else
> > > reads or writes to these fields.
> >
> > How would RCU be different here?
> >
>
> My bad, RCU wouldn't be any different. I misunderstood how RCU works. Just
> spent the morning reading the LWN 3-part series on RCU and I think I'm able to
> grok it now;)
>
> Below is a patch to hung_task which removes the hung_task_check_count and
> converts the read_locks to RCU.
>
> Thanks Frédéric and Peter!
>
> ---
> To avoid holding the tasklist lock too long, hung_task_check_count was used
> as an upper bound on the number of tasks that are checked by hung_task.
> This patch removes the hung_task_check_count sysctl.
>
> Instead of checking a limited number of tasks, all tasks are checked. To
> avoid holding the CPU for too long, need_resched() is checked often. To
> avoid blocking out writers, the read_lock has been converted to an
> rcu_read_lock().
>
> It is safe convert to an rcu_read_lock() because the tasks and thread_group
> lists are both protected by list_*_rcu() operations. The worst that can
> happen is that hung_task will update last_switch_timestamp field of a DEAD
> task.
>
> The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested
> the use of RCU.
>
> Signed-off-by: Mandeep Singh Baines <[email protected]>
> ---
> include/linux/sched.h | 1 -
> kernel/hung_task.c | 12 +++---------
> kernel/sysctl.c | 9 ---------
> 3 files changed, 3 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f2f94d5..278121c 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -315,7 +315,6 @@ static inline void touch_all_softlockup_watchdogs(void)
>
> #ifdef CONFIG_DETECT_HUNG_TASK
> extern unsigned int sysctl_hung_task_panic;
> -extern unsigned long sysctl_hung_task_check_count;
> extern unsigned long sysctl_hung_task_timeout_secs;
> extern unsigned long sysctl_hung_task_warnings;
> extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index ba8ccd4..7d67350 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -17,11 +17,6 @@
> #include <linux/sysctl.h>
>
> /*
> - * Have a reasonable limit on the number of tasks checked:
> - */
> -unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
> -
> -/*
> * Zero means infinite timeout - no checking done:
> */
> unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
> @@ -116,7 +111,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
> */
> static void check_hung_uninterruptible_tasks(unsigned long timeout)
> {
> - int max_count = sysctl_hung_task_check_count;
> unsigned long now = get_timestamp();
> struct task_struct *g, *t;
>
> @@ -127,16 +121,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
> if (test_taint(TAINT_DIE) || did_panic)
> return;
>
> - read_lock(&tasklist_lock);
> + rcu_read_lock();
> do_each_thread(g, t) {
> - if (!--max_count)
> + if (need_resched())
> goto unlock;
> /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
> if (t->state == TASK_UNINTERRUPTIBLE)
> check_hung_task(t, now, timeout);
> } while_each_thread(g, t);
> unlock:
> - read_unlock(&tasklist_lock);
> + rcu_read_unlock();
> }
>
> static void update_poll_jiffies(void)
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 2481ed3..16526a2 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -820,15 +820,6 @@ static struct ctl_table kern_table[] = {
> },
> {
> .ctl_name = CTL_UNNUMBERED,
> - .procname = "hung_task_check_count",
> - .data = &sysctl_hung_task_check_count,
> - .maxlen = sizeof(unsigned long),
> - .mode = 0644,
> - .proc_handler = &proc_doulongvec_minmax,
> - .strategy = &sysctl_intvec,
> - },
> - {
> - .ctl_name = CTL_UNNUMBERED,
> .procname = "hung_task_timeout_secs",
> .data = &sysctl_hung_task_timeout_secs,
> .maxlen = sizeof(unsigned long),
> --
> 1.5.4.5
>


That looks good :-)

2009-01-27 13:26:51

by Ingo Molnar

Subject: Re: [PATCH v4] softlockup: remove hung_task_check_count


* Mandeep Singh Baines <[email protected]> wrote:

> The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested
> the use of RCU.

ok, this looks _much_ cleaner.

One question:

> - read_lock(&tasklist_lock);
> + rcu_read_lock();
> do_each_thread(g, t) {
> - if (!--max_count)
> + if (need_resched())
> goto unlock;

Isn't it dangerous to skip a check just because we got marked for
reschedule? Since it runs so rarely it could by accident be preempted and
we'd not get any checking done for a long time.

Ingo

2009-01-27 18:49:19

by Mandeep Baines

Subject: Re: [PATCH v4] softlockup: remove hung_task_check_count

Ingo Molnar ([email protected]) wrote:
>
> * Mandeep Singh Baines <[email protected]> wrote:
>
> > The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested
> > the use of RCU.
>
> ok, this looks _much_ cleaner.
>
> One question:
>
> > - read_lock(&tasklist_lock);
> > + rcu_read_lock();
> > do_each_thread(g, t) {
> > - if (!--max_count)
> > + if (need_resched())
> > goto unlock;
>
> Isnt it dangerous to skip a check just because we got marked for
> reschedule? Since it runs so rarely it could by accident be preempted and
> we'd not get any checking done for a long time.
>

Yeah, the checking could be deferred indefinitely. So you could have a system
where tasks are hung but it takes a really long time to detect this and
finally panic the system. Not so good for high-availability.

What if a check_count sysctl were added? Instead of being a max_check_count,
it would be a min_check_count. This would guarantee that a minimum amount of
checking is done (see the sketch below).

Alternatively, the code for trying to continue the iteration after a
reschedule could be re-inserted. That code is a little tricky and
potentially fragile, since continuation of a tasklist iteration is not
really supported by the sched.h APIs. But it does have the nice property
of getting through the entire list almost all the time while still playing
nice with the scheduler.

Regards,
Mandeep

2009-01-28 08:25:23

by Peter Zijlstra

Subject: Re: [PATCH v4] softlockup: remove hung_task_check_count

On Tue, 2009-01-27 at 10:48 -0800, Mandeep Singh Baines wrote:
> Ingo Molnar ([email protected]) wrote:
> >
> > * Mandeep Singh Baines <[email protected]> wrote:
> >
> > > The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested
> > > the use of RCU.
> >
> > ok, this looks _much_ cleaner.
> >
> > One question:
> >
> > > - read_lock(&tasklist_lock);
> > > + rcu_read_lock();
> > > do_each_thread(g, t) {
> > > - if (!--max_count)
> > > + if (need_resched())
> > > goto unlock;
> >
> > Isnt it dangerous to skip a check just because we got marked for
> > reschedule? Since it runs so rarely it could by accident be preempted and
> > we'd not get any checking done for a long time.
> >
>
> Yeah, the checking could be deferred indefinitely. So you could have a system
> where tasks are hung but it takes a really long time to detect this and
> finally panic the system. Not so good for high-availability.

Why break out at all? Are you that worried about khungtaskd introducing
latencies? Is using preemptible RCU an option for you?

2009-01-29 01:43:18

by Mandeep Baines

Subject: Re: [PATCH v4] softlockup: remove hung_task_check_count

Peter Zijlstra ([email protected]) wrote:
> On Tue, 2009-01-27 at 10:48 -0800, Mandeep Singh Baines wrote:
> > Ingo Molnar ([email protected]) wrote:
> > >
> > > * Mandeep Singh Baines <[email protected]> wrote:
> > >
> > > > The design was proposed by Frédéric Weisbecker. Peter Zijlstra suggested
> > > > the use of RCU.
> > >
> > > ok, this looks _much_ cleaner.
> > >
> > > One question:
> > >
> > > > - read_lock(&tasklist_lock);
> > > > + rcu_read_lock();
> > > > do_each_thread(g, t) {
> > > > - if (!--max_count)
> > > > + if (need_resched())
> > > > goto unlock;
> > >
> > > Isnt it dangerous to skip a check just because we got marked for
> > > reschedule? Since it runs so rarely it could by accident be preempted and
> > > we'd not get any checking done for a long time.
> > >
> >
> > Yeah, the checking could be deferred indefinitely. So you could have a system
> > where tasks are hung but it takes a really long time to detect this and
> > finally panic the system. Not so good for high-availability.
>
> Why break out at all? Are you that worried about khungtaskd introducing
> latencies?

Yes, I was worried about disabling preemption for an unbounded amount of
time.

> Is using preemptible RCU an option for you?
>

I had not even considered that. To be honest, I had not even heard of it
till now. So I spent another morning at LWN grokking preemptible RCU;)

I think it can work. I'm a little worried about the OOM risk. It could take
a really long time to iterate over the task list. A lot of pending kfree()s
could build up in that time.

I'm still unclear as to whether khungtaskd would get priority boosted
or not. Are only tasks that explicitly block (by sleeping on a lock)
boostable or would a task which got pre-empted inside the critical section
also be boostable? My worry here is that khungtaskd would run at high priority
for an indefinite amount of time, adding overhead. Is there an OOMing-soon
booster?

I think this can all be mitigated by adding back the max_count sysctl, though.
Preemptible RCU would allow increasing the default to a much higher value, or
we may even be able to make the default "check all tasks".

Running out of time today but I'll send out a patch tomorrow.

Thanks Peter!

Regards,
Mandeep

2009-01-30 20:50:49

by Mandeep Baines

Subject: Re: [PATCH v4] softlockup: remove hung_task_check_count

Mandeep Singh Baines ([email protected]) wrote:
> Peter Zijlstra ([email protected]) wrote:
> >
> > Why break out at all? Are you that worried about khungtaskd introducing
> > latencies?
>
> Yes, I was worried about disabling preemption for an unbounded amount of
> time.
>
> > Is using preemptible RCU an option for you?
> >
>
> I had not even considered that. To be honest, I had not even heard of it
> till now. So I spent another morning at LWN grokking preemptible RCU;)
>
> I think it can work. I'm a little worried about the OOM risk. It could take
> a really long time to iterate over the task list. A lot of pending kfree()s
> could build up in that time.
>

I misunderstood preemptible RCU. I assumed it was a new API but it's not. So
I don't think preemptible RCU is an option, since it would force a dependency
on CONFIG_PREEMPT_RCU.

I'm going to break this patch up in two: one patch for converting to RCU, and
a second patch to support checking all tasks. To support checking all tasks,
I reverted to a design similar to Frédéric's original proposal.

I'll send the patches out right after this email.

[PATCH 1/2] softlockup: convert read_lock in hung_task to rcu_read_lock
[PATCH 2/2] softlockup: check all tasks in hung_task

Regards,
Mandeep

2009-01-30 20:51:06

by Mandeep Baines

Subject: [PATCH 1/2] softlockup: convert read_lock in hung_task to rcu_read_lock

Peter Zijlstra suggested the use of RCU.

Signed-off-by: Mandeep Singh Baines <[email protected]>
---
kernel/hung_task.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba8ccd4..a841db3 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -127,7 +127,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
if (test_taint(TAINT_DIE) || did_panic)
return;

- read_lock(&tasklist_lock);
+ rcu_read_lock();
do_each_thread(g, t) {
if (!--max_count)
goto unlock;
@@ -136,7 +136,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
check_hung_task(t, now, timeout);
} while_each_thread(g, t);
unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
}

static void update_poll_jiffies(void)
--
1.5.4.5

2009-01-30 20:51:31

by Mandeep Baines

Subject: [PATCH 2/2] softlockup: check all tasks in hung_task

Instead of checking only hung_task_check_count tasks, all tasks are checked.
hung_task_check_count is still used to put an upper bound on the critical
section: every hung_task_check_count checks, the critical section is
refreshed. Keeping the critical section small minimizes the time preemption is
disabled and keeps RCU grace periods short.

To prevent following a stale pointer, get_task_struct is called on g and t.
To verify that g and t have not been unhashed while outside the critical
section, the task states are checked.

The design was proposed by Frédéric Weisbecker.

Frédéric Weisbecker ([email protected]) wrote:
>
> Instead of having this arbitrary limit of tasks, why not just
> lurk the need_resched() and then schedule if it needs too.
>
> I know that sounds a bit racy, because you will have to release the
> tasklist_lock and
> a lot of things can happen in the task list until you become resched.
> But you can do a get_task_struct() on g and t before your thread is
> going to sleep and then put them
> when it is awaken.
> Perhaps some tasks will disappear or be appended in the list before g
> and t, but that doesn't really matter:
> if they disappear, they didn't lockup, and if they were appended, they
> are not enough cold to be analyzed :-)
>
> This way you can drop the arbitrary limit of task number given by the user....
>
> Frederic.
>

Signed-off-by: Mandeep Singh Baines <[email protected]>
---
kernel/hung_task.c | 28 ++++++++++++++++++++++++++--
1 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index a841db3..1c8c9f9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -109,6 +109,25 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
panic("hung_task: blocked tasks");
}

+ /*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order to
+ * exit the grace period. For classic RCU, a reschedule is required.
+ */
+static void check_hung_rcu_refresh(struct task_struct *g, struct task_struct *t)
+{
+ get_task_struct(g);
+ get_task_struct(t);
+ rcu_read_unlock();
+ if (need_resched())
+ schedule();
+ rcu_read_lock();
+ put_task_struct(t);
+ put_task_struct(g);
+}
+
/*
* Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
* a really long time (120 seconds). If that happens, print out
@@ -129,8 +148,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)

rcu_read_lock();
do_each_thread(g, t) {
- if (!--max_count)
- goto unlock;
+ if (sysctl_hung_task_check_count && !(max_count--)) {
+ max_count = sysctl_hung_task_check_count;
+ check_hung_rcu_refresh(g, t);
+ /* Exit if t or g was unhashed during refresh. */
+ if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+ goto unlock;
+ }
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, now, timeout);
--
1.5.4.5

2009-01-31 19:23:01

by Peter Zijlstra

Subject: Re: [PATCH 2/2] softlockup: check all tasks in hung_task

On Fri, 2009-01-30 at 12:49 -0800, Mandeep Singh Baines wrote:
> Instead of checking only hung_task_check_count tasks, all tasks are checked.
> hung_task_check_count is still used to put an upper bound on the critical
> section. Every hung_task_check_count checks, the critical section is
> refreshed. Keeping the critical section small minimizes time preemption is
> disabled and keeps rcu grace periods small.
>
> To prevent following a stale pointer, get_task_struct is called on g and t.
> To verify that g and t have not been unhashed while outside the critical
> section, the task states are checked.
>
> The design was proposed by Frédéric Weisbecker.
>
> Frédéric Weisbecker ([email protected]) wrote:
> >
> > Instead of having this arbitrary limit of tasks, why not just
> > lurk the need_resched() and then schedule if it needs too.
> >
> > I know that sounds a bit racy, because you will have to release the
> > tasklist_lock and
> > a lot of things can happen in the task list until you become resched.
> > But you can do a get_task_struct() on g and t before your thread is
> > going to sleep and then put them
> > when it is awaken.
> > Perhaps some tasks will disappear or be appended in the list before g
> > and t, but that doesn't really matter:
> > if they disappear, they didn't lockup, and if they were appended, they
> > are not enough cold to be analyzed :-)
> >
> > This way you can drop the arbitrary limit of task number given by the user....
> >
> > Frederic.
> >
>
> Signed-off-by: Mandeep Singh Baines <[email protected]>
> ---
> kernel/hung_task.c | 28 ++++++++++++++++++++++++++--
> 1 files changed, 26 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/hung_task.c b/kernel/hung_task.c
> index a841db3..1c8c9f9 100644
> --- a/kernel/hung_task.c
> +++ b/kernel/hung_task.c
> @@ -109,6 +109,25 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
> panic("hung_task: blocked tasks");
> }
>
> + /*
> + * To avoid extending the RCU grace period for an unbounded amount of time,
> + * periodically exit the critical section and enter a new one.
> + *
> + * For preemptible RCU it is sufficient to call rcu_read_unlock in order to
> + * exit the grace period. For classic RCU, a reschedule is required.
> + */
> +static void check_hung_rcu_refresh(struct task_struct *g, struct task_struct *t)
> +{
> + get_task_struct(g);
> + get_task_struct(t);
> + rcu_read_unlock();
> + if (need_resched())
> + schedule();

won't a simple cond_resched() do?

> + rcu_read_lock();
> + put_task_struct(t);
> + put_task_struct(g);
> +}
> +
> /*
> * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
> * a really long time (120 seconds). If that happens, print out
> @@ -129,8 +148,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
>
> rcu_read_lock();
> do_each_thread(g, t) {
> - if (!--max_count)
> - goto unlock;
> + if (sysctl_hung_task_check_count && !(max_count--)) {
> + max_count = sysctl_hung_task_check_count;
> + check_hung_rcu_refresh(g, t);
> + /* Exit if t or g was unhashed during refresh. */
> + if (t->state == TASK_DEAD || g->state == TASK_DEAD)
> + goto unlock;
> + }

It's all a bit ugly, but I suppose there's no way around that.

> /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
> if (t->state == TASK_UNINTERRUPTIBLE)
> check_hung_task(t, now, timeout);