2010-08-13 10:22:10

by Sergey Senozhatsky

[permalink] [raw]
Subject: fix BUG: using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

Hello,

I got these traces today:

[ 67.703556] BUG: using smp_processor_id() in preemptible [00000000] code: s2disk/5139
[ 67.703563] caller is touch_nmi_watchdog+0x15/0x2c
[ 67.703566] Pid: 5139, comm: s2disk Not tainted 2.6.36-rc0-git12-07921-g60bf26a-dirty #116
[ 67.703568] Call Trace:
[ 67.703575] [<ffffffff811f6bf1>] debug_smp_processor_id+0xc9/0xe4
[ 67.703578] [<ffffffff81092766>] touch_nmi_watchdog+0x15/0x2c
[ 67.703584] [<ffffffff81222950>] acpi_os_stall+0x34/0x40
[ 67.703589] [<ffffffff812398d2>] acpi_ex_system_do_stall+0x34/0x38
[ 67.703591] [<ffffffff81238396>] acpi_ex_opcode_1A_0T_0R+0x6d/0xa1
[ 67.703595] [<ffffffff8122e280>] acpi_ds_exec_end_op+0xf8/0x578
[ 67.703598] [<ffffffff812457f9>] acpi_ps_parse_loop+0x88a/0xa55
[ 67.703604] [<ffffffff81244a00>] acpi_ps_parse_aml+0x104/0x3c4
[ 67.703607] [<ffffffff81246198>] acpi_ps_execute_method+0x20f/0x2f3
[ 67.703610] [<ffffffff8124021f>] acpi_ns_evaluate+0x18b/0x2d2
[ 67.703614] [<ffffffff8123fad0>] acpi_evaluate_object+0x1b8/0x2fc
[ 67.703617] [<ffffffff8123e020>] ? acpi_get_sleep_type_data+0x21c/0x236
[ 67.703620] [<ffffffff8123d9fb>] acpi_enter_sleep_state_prep+0x61/0xd9
[ 67.703623] [<ffffffff81224205>] acpi_sleep_prepare+0x4f/0x56
[ 67.703626] [<ffffffff81224268>] __acpi_pm_prepare+0x13/0x2e
[ 67.703629] [<ffffffff81224448>] acpi_pm_prepare+0xe/0x1f
[ 67.703632] [<ffffffff81224466>] acpi_hibernation_pre_snapshot+0xd/0x1e
[ 67.703637] [<ffffffff81071b80>] hibernation_snapshot+0xaf/0x258
[ 67.703641] [<ffffffff81074dca>] snapshot_ioctl+0x25c/0x547
[ 67.703645] [<ffffffff81056efc>] ? __srcu_read_unlock+0x3b/0x57
[ 67.703649] [<ffffffff810e7f7d>] vfs_ioctl+0x31/0xa2
[ 67.703652] [<ffffffff810e88dc>] do_vfs_ioctl+0x47c/0x4af
[ 67.703655] [<ffffffff8125ee3c>] ? n_tty_write+0x0/0x35e
[ 67.703659] [<ffffffff8100203a>] ? sysret_check+0x2e/0x69
[ 67.703662] [<ffffffff810e8960>] sys_ioctl+0x51/0x75
[ 67.703665] [<ffffffff81002002>] system_call_fastpath+0x16/0x1b


[ 67.703668] BUG: using smp_processor_id() in preemptible [00000000] code: s2disk/5139
[ 67.703670] caller is touch_softlockup_watchdog+0x15/0x2b
[ 67.703672] Pid: 5139, comm: s2disk Not tainted 2.6.36-rc0-git12-07921-g60bf26a-dirty #116
[ 67.703674] Call Trace:
[ 67.703677] [<ffffffff811f6bf1>] debug_smp_processor_id+0xc9/0xe4
[ 67.703680] [<ffffffff8109273b>] touch_softlockup_watchdog+0x15/0x2b
[ 67.703682] [<ffffffff81092779>] touch_nmi_watchdog+0x28/0x2c
[ 67.703685] [<ffffffff81222950>] acpi_os_stall+0x34/0x40
[ 67.703688] [<ffffffff812398d2>] acpi_ex_system_do_stall+0x34/0x38
[ 67.703690] [<ffffffff81238396>] acpi_ex_opcode_1A_0T_0R+0x6d/0xa1
[ 67.703693] [<ffffffff8122e280>] acpi_ds_exec_end_op+0xf8/0x578
[ 67.703696] [<ffffffff812457f9>] acpi_ps_parse_loop+0x88a/0xa55
[ 67.703699] [<ffffffff81244a00>] acpi_ps_parse_aml+0x104/0x3c4
[ 67.703702] [<ffffffff81246198>] acpi_ps_execute_method+0x20f/0x2f3
[ 67.703705] [<ffffffff8124021f>] acpi_ns_evaluate+0x18b/0x2d2
[ 67.703708] [<ffffffff8123fad0>] acpi_evaluate_object+0x1b8/0x2fc
[ 67.703710] [<ffffffff8123e020>] ? acpi_get_sleep_type_data+0x21c/0x236
[ 67.703714] [<ffffffff8123d9fb>] acpi_enter_sleep_state_prep+0x61/0xd9
[ 67.703717] [<ffffffff81224205>] acpi_sleep_prepare+0x4f/0x56
[ 67.703719] [<ffffffff81224268>] __acpi_pm_prepare+0x13/0x2e
[ 67.703722] [<ffffffff81224448>] acpi_pm_prepare+0xe/0x1f
[ 67.703725] [<ffffffff81224466>] acpi_hibernation_pre_snapshot+0xd/0x1e
[ 67.703728] [<ffffffff81071b80>] hibernation_snapshot+0xaf/0x258
[ 67.703731] [<ffffffff81074dca>] snapshot_ioctl+0x25c/0x547
[ 67.703733] [<ffffffff81056efc>] ? __srcu_read_unlock+0x3b/0x57
[ 67.703736] [<ffffffff810e7f7d>] vfs_ioctl+0x31/0xa2
[ 67.703739] [<ffffffff810e88dc>] do_vfs_ioctl+0x47c/0x4af
[ 67.703741] [<ffffffff8125ee3c>] ? n_tty_write+0x0/0x35e
[ 67.703744] [<ffffffff8100203a>] ? sysret_check+0x2e/0x69
[ 67.703747] [<ffffffff810e8960>] sys_ioctl+0x51/0x75
[ 67.703750] [<ffffffff81002002>] system_call_fastpath+0x16/0x1b



Please kindly review the patch:

Avoid using smp_processor_id in touch_softlockup_watchdog and touch_nmi_watchdog.
Patch also "removes" second call to smp_processor_id in __touch_watchdog
(smp_processor_id itself and smp_processor_id in __get_cpu_var).

---

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 613bc1f..8822f1e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -116,13 +116,14 @@ static unsigned long get_sample_period(void)
static void __touch_watchdog(void)
{
int this_cpu = smp_processor_id();
-
- __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
+ per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
}

void touch_softlockup_watchdog(void)
{
- __get_cpu_var(watchdog_touch_ts) = 0;
+ int this_cpu = get_cpu();
+ per_cpu(watchdog_touch_ts, this_cpu) = 0;
+ put_cpu();
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

@@ -142,7 +143,9 @@ void touch_all_softlockup_watchdogs(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
- __get_cpu_var(watchdog_nmi_touch) = true;
+ int this_cpu = get_cpu();
+ per_cpu(watchdog_nmi_touch, this_cpu) = true;
+ put_cpu();
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);


2010-08-16 08:23:04

by Peter Zijlstra

[permalink] [raw]
Subject: Re: fix BUG: using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Fri, 2010-08-13 at 13:21 +0300, Sergey Senozhatsky wrote:

> [ 67.703556] BUG: using smp_processor_id() in preemptible [00000000] code: s2disk/5139
> [ 67.703563] caller is touch_nmi_watchdog+0x15/0x2c
> [ 67.703566] Pid: 5139, comm: s2disk Not tainted 2.6.36-rc0-git12-07921-g60bf26a-dirty #116
> [ 67.703568] Call Trace:
> [ 67.703575] [<ffffffff811f6bf1>] debug_smp_processor_id+0xc9/0xe4
> [ 67.703578] [<ffffffff81092766>] touch_nmi_watchdog+0x15/0x2c
> [ 67.703584] [<ffffffff81222950>] acpi_os_stall+0x34/0x40
> [ 67.703589] [<ffffffff812398d2>] acpi_ex_system_do_stall+0x34/0x38

Which could mean two things, either ACPI got funny on us, or Don's new
watchdog stuff has a hole in it.


> ---
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 613bc1f..8822f1e 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -116,13 +116,14 @@ static unsigned long get_sample_period(void)
> static void __touch_watchdog(void)
> {
> int this_cpu = smp_processor_id();
> -
> - __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
> + per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
> }

That change seems sensible enough..

> void touch_softlockup_watchdog(void)
> {
> - __get_cpu_var(watchdog_touch_ts) = 0;
> + int this_cpu = get_cpu();
> + per_cpu(watchdog_touch_ts, this_cpu) = 0;
> + put_cpu();
> }
> EXPORT_SYMBOL(touch_softlockup_watchdog);
>
> @@ -142,7 +143,9 @@ void touch_all_softlockup_watchdogs(void)
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> void touch_nmi_watchdog(void)
> {
> - __get_cpu_var(watchdog_nmi_touch) = true;
> + int this_cpu = get_cpu();
> + per_cpu(watchdog_nmi_touch, this_cpu) = true;
> + put_cpu();
> touch_softlockup_watchdog();
> }
> EXPORT_SYMBOL(touch_nmi_watchdog);

These other two really are about assumptions we make on the call sites,
which at the very least are violated by ACPI.

Don/Ingo, remember if we require touch_*_watchdog callers to have
preemption disabled? Or is the proposed patch sensible?

2010-08-16 13:35:19

by Don Zickus

[permalink] [raw]
Subject: Re: fix BUG: using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Mon, Aug 16, 2010 at 10:22:50AM +0200, Peter Zijlstra wrote:
> On Fri, 2010-08-13 at 13:21 +0300, Sergey Senozhatsky wrote:
>
> > [ 67.703556] BUG: using smp_processor_id() in preemptible [00000000] code: s2disk/5139
> > [ 67.703563] caller is touch_nmi_watchdog+0x15/0x2c
> > [ 67.703566] Pid: 5139, comm: s2disk Not tainted 2.6.36-rc0-git12-07921-g60bf26a-dirty #116
> > [ 67.703568] Call Trace:
> > [ 67.703575] [<ffffffff811f6bf1>] debug_smp_processor_id+0xc9/0xe4
> > [ 67.703578] [<ffffffff81092766>] touch_nmi_watchdog+0x15/0x2c
> > [ 67.703584] [<ffffffff81222950>] acpi_os_stall+0x34/0x40
> > [ 67.703589] [<ffffffff812398d2>] acpi_ex_system_do_stall+0x34/0x38
>
> Which could mean two things, either ACPI got funny on us, or Don's new
> watchdog stuff has a hole in it.

it could. :-)

>
>
> > ---
> >
> > diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> > index 613bc1f..8822f1e 100644
> > --- a/kernel/watchdog.c
> > +++ b/kernel/watchdog.c
> > @@ -116,13 +116,14 @@ static unsigned long get_sample_period(void)
> > static void __touch_watchdog(void)
> > {
> > int this_cpu = smp_processor_id();
> > -
> > - __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
> > + per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
> > }
>
> That change seems sensible enough..

ok.

>
> > void touch_softlockup_watchdog(void)
> > {
> > - __get_cpu_var(watchdog_touch_ts) = 0;
> > + int this_cpu = get_cpu();
> > + per_cpu(watchdog_touch_ts, this_cpu) = 0;
> > + put_cpu();
> > }
> > EXPORT_SYMBOL(touch_softlockup_watchdog);
> >
> > @@ -142,7 +143,9 @@ void touch_all_softlockup_watchdogs(void)
> > #ifdef CONFIG_HARDLOCKUP_DETECTOR
> > void touch_nmi_watchdog(void)
> > {
> > - __get_cpu_var(watchdog_nmi_touch) = true;
> > + int this_cpu = get_cpu();
> > + per_cpu(watchdog_nmi_touch, this_cpu) = true;
> > + put_cpu();
> > touch_softlockup_watchdog();
> > }
> > EXPORT_SYMBOL(touch_nmi_watchdog);
>
> These other two really are about assumptions we make on the call sites,
> which at the very least are violated by ACPI.
>
> Don/Ingo, remember if we require touch_*_watchdog callers to have
> preemption disabled? Or is the proposed patch sensible?

I don't recall any requirement to have preemption disabled when using
those functions. It seems sensible to put it in the
touch_{softlockup|nmi}_watchdog code.

I assume the reason for having preemption disabled when using
smp_processor_id() is that the code could migrate to another cpu when
rescheduled?

I don't see a problem with the patch, but my low level understanding of
the __get_cpu_var vs. per_cpu isn't very strong.

Cheers,
Don

>

2010-08-16 13:47:15

by Peter Zijlstra

[permalink] [raw]
Subject: Re: fix BUG: using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Mon, 2010-08-16 at 09:34 -0400, Don Zickus wrote:
> > Don/Ingo, remember if we require touch_*_watchdog callers to have
> > preemption disabled? Or is the proposed patch sensible?
>
> I don't recall any requirement to have preemption disabled when using
> those functions. It seems sensible to put it in the
> touch_{softlockup|nmi}_watchdog code.

OK, in that case the patch looks sensible.

> I assume the reason for having preemption disabled when using
> smp_processor_id() is that the code could migrate to another cpu when
> rescheduled?

Right, if you can freely schedule, you can get migrated, which means you
can get migrated between having determined the return value and using
it, at which point the computed value is meaningless.

> I don't see a problem with the patch, but my low level understanding of
> the __get_cpu_var vs. per_cpu isn't very strong.

__get_cpu_var() gets you the value on the current cpu, per_cpu() takes a
cpu argument.

2010-08-16 14:07:05

by Yong Zhang

[permalink] [raw]
Subject: Re: fix BUG: using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Mon, Aug 16, 2010 at 09:34:52AM -0400, Don Zickus wrote:
> I don't recall any requirement to have preemption disabled when using
> those functions.

Isn't that implicit? I mean, the caller of touch_{softlockup|nmi}_watchdog
will stick to that CPU until it finishes running.

> It seems sensible to put it in the
> touch_{softlockup|nmi}_watchdog code.

I don't think so. Such as:

...
preempt_disable() <===A
touch_{softlockup|nmi}_watchdog <===B
preempt_enable() <===C
...

You just fold A and C into B, but what happens if preemption occurs
before A?

>
> I assume the reason for having preemption disabled when using
> smp_processor_id() is that the code could migrate to another cpu when
> rescheduled?

If the migration could happen, then we could touch the wrong cpu-data,
and the detection on the original cpu will trigger anyway.

>
> I don't see a problem with the patch, but my low level understanding of
> the __get_cpu_var vs. per_cpu isn't very strong.

Maybe we should use __raw_get_cpu_var() instead.

Thanks,
Yong

2010-08-16 14:08:49

by Sergey Senozhatsky

[permalink] [raw]
Subject: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

Fix: acpi_os_stall calls touch_nmi_watchdog and touch_softlockup_watchdog
with preemption enabled causing 'BUG: using smp_processor_id() in preemptible
code'.

Patch also removes double smp_processor_id call (smp_processor_id itself and in
__get_cpu_var) in __touch_watchdog.

Signed-off-by: Sergey Senozhatsky <[email protected]>

---

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 613bc1f..8822f1e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -116,13 +116,14 @@ static unsigned long get_sample_period(void)
static void __touch_watchdog(void)
{
int this_cpu = smp_processor_id();
-
- __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
+ per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
}

void touch_softlockup_watchdog(void)
{
- __get_cpu_var(watchdog_touch_ts) = 0;
+ int this_cpu = get_cpu();
+ per_cpu(watchdog_touch_ts, this_cpu) = 0;
+ put_cpu();
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

@@ -142,7 +143,9 @@ void touch_all_softlockup_watchdogs(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
- __get_cpu_var(watchdog_nmi_touch) = true;
+ int this_cpu = get_cpu();
+ per_cpu(watchdog_nmi_touch, this_cpu) = true;
+ put_cpu();
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

2010-08-16 14:13:13

by Don Zickus

[permalink] [raw]
Subject: Re: fix BUG: using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Mon, Aug 16, 2010 at 03:46:58PM +0200, Peter Zijlstra wrote:
> > I don't see a problem with the patch, but my low level understanding of
> > the __get_cpu_var vs. per_cpu isn't very strong.
>
> __get_cpu_var() gets you the value on the current cpu, per_cpu() takes a
> cpu argument.

Well I know that much. :-) It seems that __get_cpu_var depends on
preemption being disabled whereas per_cpu does not? Though for some
reason I thought __get_cpu_var would be more atomic when it grabbed the
current cpu such that you wouldn't need to disable preemption. Guess not.

Cheers,
Don

2010-08-16 14:29:30

by Peter Zijlstra

[permalink] [raw]
Subject: Re: fix BUG: using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Mon, 2010-08-16 at 10:12 -0400, Don Zickus wrote:
> On Mon, Aug 16, 2010 at 03:46:58PM +0200, Peter Zijlstra wrote:
> > > I don't see a problem with the patch, but my low level understanding of
> > > the __get_cpu_var vs. per_cpu isn't very strong.
> >
> > __get_cpu_var() gets you the value on the current cpu, per_cpu() takes a
> > cpu argument.
>
> Well I know that much. :-) It seems that __get_cpu_var depends on
> preemption being disabled whereas per_cpu does not? Though for some
> reason I thought __get_cpu_var would be more atomic when it grabbed the
> current cpu such that you wouldn't need to disable preemption. Guess not.

Indeed, it can't be implemented atomically on all SMP systems, hence it's
really nothing other than a 'convenient' shorthand for per_cpu(foo,
smp_processor_id()).

2010-08-16 14:31:46

by Don Zickus

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

cc'ing Frederic

On Mon, Aug 16, 2010 at 05:08:29PM +0300, Sergey Senozhatsky wrote:
> Fix: acpi_os_stall calls touch_nmi_watchdog and touch_softlockup_watchdog
> with preemption enabled causing 'BUG: using smp_processor_id() in preemptible
> code'.
>
> Patch also removes double smp_processor_id call (smp_processor_id itself and in
> __get_cpu_var) in __touch_watchdog.
>
> Signed-off-by: Sergey Senozhatsky <[email protected]>

Acked-by: Don Zickus <[email protected]>

>
> ---
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 613bc1f..8822f1e 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -116,13 +116,14 @@ static unsigned long get_sample_period(void)
> static void __touch_watchdog(void)
> {
> int this_cpu = smp_processor_id();
> -
> - __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
> + per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
> }
>
> void touch_softlockup_watchdog(void)
> {
> - __get_cpu_var(watchdog_touch_ts) = 0;
> + int this_cpu = get_cpu();
> + per_cpu(watchdog_touch_ts, this_cpu) = 0;
> + put_cpu();
> }
> EXPORT_SYMBOL(touch_softlockup_watchdog);
>
> @@ -142,7 +143,9 @@ void touch_all_softlockup_watchdogs(void)
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> void touch_nmi_watchdog(void)
> {
> - __get_cpu_var(watchdog_nmi_touch) = true;
> + int this_cpu = get_cpu();
> + per_cpu(watchdog_nmi_touch, this_cpu) = true;
> + put_cpu();
> touch_softlockup_watchdog();
> }
> EXPORT_SYMBOL(touch_nmi_watchdog);
>

2010-08-17 03:00:06

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Mon, Aug 16, 2010 at 05:08:29PM +0300, Sergey Senozhatsky wrote:
> void touch_softlockup_watchdog(void)
> {
> - __get_cpu_var(watchdog_touch_ts) = 0;
> + int this_cpu = get_cpu();
> + per_cpu(watchdog_touch_ts, this_cpu) = 0;
> + put_cpu();
> }



If preemption is disabled and you deal with the current cpu,
then please use __get_cpu_var, it makes the code more
readable:


void touch_softlockup_watchdog(void)
{
preempt_disable();
__get_cpu_var(watchdog_touch_ts) = 0;
preempt_enable();
}


Same below.

Thanks.


> EXPORT_SYMBOL(touch_softlockup_watchdog);
>
> @@ -142,7 +143,9 @@ void touch_all_softlockup_watchdogs(void)
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> void touch_nmi_watchdog(void)
> {
> - __get_cpu_var(watchdog_nmi_touch) = true;
> + int this_cpu = get_cpu();
> + per_cpu(watchdog_nmi_touch, this_cpu) = true;
> + put_cpu();
> touch_softlockup_watchdog();
> }
> EXPORT_SYMBOL(touch_nmi_watchdog);
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

2010-08-17 03:17:08

by Yong Zhang

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Tue, Aug 17, 2010 at 10:59 AM, Frederic Weisbecker
<[email protected]> wrote:
> If preemption is disabled and you deal with the current cpu,
> then please use __get_cpu_var, it makes the code more
> readable:
>
>
> void touch_softlockup_watchdog(void)
> {
>        preempt_disable();
>        __get_cpu_var(watchdog_touch_ts) = 0;
>        preempt_enable();
> }

Why not use __raw_get_cpu_var() instead?
You know adding preempt protection in touch_softlockup_watchdog()
just suppress the warning. Am I missing something?

Thanks,
Yong

2010-08-17 04:27:08

by Yong Zhang

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Mon, Aug 16, 2010 at 10:30 PM, Don Zickus <[email protected]> wrote:
>> Patch also removes double smp_processor_id call (smp_processor_id itself and in
>> __get_cpu_var) in __touch_watchdog.

After checking touch_softlockup_watchdog() and touch_nmi_watchdog() in
the previous version, I see that __raw_get_cpu_var() was used there.

Thanks,
Yong

>>
>> Signed-off-by: Sergey Senozhatsky <[email protected]>
>
> Acked-by: Don Zickus <[email protected]>
>
>>
>> ---
>>
>> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
>> index 613bc1f..8822f1e 100644
>> --- a/kernel/watchdog.c
>> +++ b/kernel/watchdog.c
>> @@ -116,13 +116,14 @@ static unsigned long get_sample_period(void)
>>  static void __touch_watchdog(void)
>>  {
>>       int this_cpu = smp_processor_id();
>> -
>> -     __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
>> +     per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
>>  }
>>
>>  void touch_softlockup_watchdog(void)
>>  {
>> -     __get_cpu_var(watchdog_touch_ts) = 0;
>> +     int this_cpu = get_cpu();
>> +     per_cpu(watchdog_touch_ts, this_cpu) = 0;
>> +     put_cpu();
>>  }
>>  EXPORT_SYMBOL(touch_softlockup_watchdog);
>>
>> @@ -142,7 +143,9 @@ void touch_all_softlockup_watchdogs(void)
>>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
>>  void touch_nmi_watchdog(void)
>>  {
>> -     __get_cpu_var(watchdog_nmi_touch) = true;
>> +     int this_cpu = get_cpu();
>> +     per_cpu(watchdog_nmi_touch, this_cpu) = true;
>> +     put_cpu();
>>       touch_softlockup_watchdog();
>>  }
>>  EXPORT_SYMBOL(touch_nmi_watchdog);
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>

2010-08-17 07:56:34

by Sergey Senozhatsky

[permalink] [raw]
Subject: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog (v2)

Fix: acpi_os_stall calls touch_nmi_watchdog and touch_softlockup_watchdog
with preemption enabled causing 'BUG: using smp_processor_id() in preemptible
code'.

Patch also removes double smp_processor_id call (smp_processor_id itself and
in __get_cpu_var) in __touch_watchdog.

Signed-off-by: Sergey Senozhatsky <[email protected]>

---

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 613bc1f..cb4f4d4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -116,13 +116,14 @@ static unsigned long get_sample_period(void)
static void __touch_watchdog(void)
{
int this_cpu = smp_processor_id();
-
- __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
+ per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
}

void touch_softlockup_watchdog(void)
{
+ preempt_disable();
__get_cpu_var(watchdog_touch_ts) = 0;
+ preempt_enable();
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

@@ -142,7 +143,10 @@ void touch_all_softlockup_watchdogs(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
+ preempt_disable();
__get_cpu_var(watchdog_nmi_touch) = true;
+ preempt_enable();
+
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

2010-08-17 08:40:08

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

Hello,

On (08/17/10 11:16), Yong Zhang wrote:
> On Tue, Aug 17, 2010 at 10:59 AM, Frederic Weisbecker
> <[email protected]> wrote:
> > If preemption is disabled and you deal with the current cpu,
> > then please use __get_cpu_var, it makes the code more
> > readable:
> >
> >
> > void touch_softlockup_watchdog(void)
> > {
> >        preempt_disable();
> >        __get_cpu_var(watchdog_touch_ts) = 0;
> >        preempt_enable();
> > }
>
> Why not use __raw_get_cpu_var() instead?
> You know adding preempt protection in touch_softlockup_watchdog()
> just suppress the warning. Am I missing something?
>

Sorry, my low level understanding of the __raw_get_cpu_var isn't very strong.
I assume it uses current_thread_info()->cpu in some cases (right?) or
percpu_from_op.


Should it be
acpi_os_stall
        preempt_disable
        touch_nmi_watchdog
                touch_softlockup_watchdog
        preempt_enable

?

Sergey


Attachments:
(No filename) (886.00 B)
(No filename) (316.00 B)
Download all attachments

2010-08-17 09:05:40

by Yong Zhang

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Tue, Aug 17, 2010 at 4:39 PM, Sergey Senozhatsky
<[email protected]> wrote:
>> Why not use __raw_get_cpu_var() instead?
>> You know adding preempt protection in touch_softlockup_watchdog()
>> just suppress the warning. Am I missing something?
>>
>
> Sorry, my low level understanding of the __raw_get_cpu_var isn't very strong.
> I assume it uses current_thread_info()->cpu in some cases (right?) or
> percpu_from_op.

The difference is __raw_get_cpu_var() is using raw_smp_processor_id().

>
>
> Should it be
> acpi_os_stall
>        preepmt_disable
>        touch_nmi_watchdog
>                touch_softlockup_watchdog
>        preempt_enable

Actually, I don't think this helps for the function as a whole. If
acpi_os_stall() migrates (I don't know if it can) to another CPU just
before preempt_disable(), we'll be on the wrong CPU anyway. Adding
preempt protection just silences the warning.

So I prefer using __raw_get_cpu_var() as what we have been done before.

Thanks,
Yong

2010-08-17 09:24:29

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On (08/17/10 17:05), Yong Zhang wrote:
> >> Why not use __raw_get_cpu_var() instead?
> >> You know adding preempt protection in touch_softlockup_watchdog()
> >> just suppress the warning. Am I missing something?
> >>
> >
> > Sorry, my low level understanding of the __raw_get_cpu_var isn't very strong.
> > I assume it uses current_thread_info()->cpu in some cases (right?) or
> > percpu_from_op.
>
> The difference is __raw_get_cpu_var() is using raw_smp_processor_id().
>
> >
> >
> > Should it be
> > acpi_os_stall
> >        preempt_disable
> >        touch_nmi_watchdog
> >                touch_softlockup_watchdog
> >        preempt_enable
>
> Actually I don't think this is helpful for the whole function. Because
> if acpi_os_stall()
> migrate(I don't know if it could) to another CPU just before
> preepmt_disable(), we'll
> be on the wrong way. Adding preempt protection is just smoothing the warning.
>

OK. Suppose (I don't know if it can) that migration has happened:

acpi_os_stall
__migration__
touch_nmi_watchdog

How would calling raw_smp_processor_id() (which is current_thread_info()->cpu)
vs. preempt_disable() followed by smp_processor_id() give us different CPUs?

> So I prefer using __raw_get_cpu_var() as what we have been done before.
>

Hm...

26e09c6eee14f4827b55137ba0eedc4e77cd50ab

static void __touch_watchdog(void)
{
- int this_cpu = raw_smp_processor_id();
+ int this_cpu = smp_processor_id();


Sergey


Attachments:
(No filename) (1.41 kB)
(No filename) (316.00 B)
Download all attachments

2010-08-17 09:37:36

by Yong Zhang

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Tue, Aug 17, 2010 at 5:24 PM, Sergey Senozhatsky
<[email protected]> wrote:
> OK. Suppose (I don't know if it could) migration has happen
>
> acpi_os_stall
>        __migration__
>        touch_nmi_watchdog
>
> How calling raw_smp_processor_id() (which is current_thread_info()->cpu)
> vs. preepmt_disable - smp_processor_id() will give us different CPUs?

I don't mean you will get different CPUs (sorry for my poor English).
I mean that if migration can happen, you want to call touch_nmi_watchdog()
on CPU A (otherwise the watchdog will shout at us), but we eventually call
touch_nmi_watchdog() on CPU B (because of the migration),
and this is not what we want.

So preempt_disable() is redundant here.

>
>> So I prefer using __raw_get_cpu_var() as what we have been done before.
>>
>
> Hm...
>
> 26e09c6eee14f4827b55137ba0eedc4e77cd50ab

f69bcf60c3f17aa367e16eef7bc6ab001ea6d58a
2508ce1845a3b256798532b2c6b7997c2dc6533b

you can get the previous touch_*_watchdog there.

Thanks,
Yong

2010-08-17 10:28:42

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On (08/17/10 17:37), Yong Zhang wrote:
> On Tue, Aug 17, 2010 at 5:24 PM, Sergey Senozhatsky
> <[email protected]> wrote:
> > OK. Suppose (I don't know if it could) migration has happen
> >
> > acpi_os_stall
> >        __migration__
> >        touch_nmi_watchdog
> >
> > How calling raw_smp_processor_id() (which is current_thread_info()->cpu)
> > vs. preepmt_disable - smp_processor_id() will give us different CPUs?
>
> I don't mean you will get different CPUS(sorry for my poor english).
> I mean if the migration could happen, you want to touch_nmi_watchdog()
> on CPU A(otherwise the watchdog will shout on us), but eventually we
> touch_nmi_watchdog() on CPU B(because of migration),
> and this is not what we want.
>
> So preempt_disable() is redundant here.
>

Shouldn't we be sure that we are not preemptible when calling __raw_get_cpu_var?

Is preempt_disable redundant here because current_thread_info()->cpu is
atomic and we just don't want the preempt_(enable|disable) overhead?

Sergey

> >
> >> So I prefer using __raw_get_cpu_var() as what we have been done before.
> >>
> >
> > Hm...
> >
> > 26e09c6eee14f4827b55137ba0eedc4e77cd50ab
>
> f69bcf60c3f17aa367e16eef7bc6ab001ea6d58a
> 2508ce1845a3b256798532b2c6b7997c2dc6533b
>
> you can get the previous touch_*_watchdog there.
>
> Thanks,
> Yong
>


Attachments:
(No filename) (1.29 kB)
(No filename) (316.00 B)
Download all attachments

2010-08-17 10:40:11

by Sergey Senozhatsky

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

Please kindly review.

---

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 613bc1f..22dd388 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -116,13 +116,12 @@ static unsigned long get_sample_period(void)
static void __touch_watchdog(void)
{
int this_cpu = smp_processor_id();
-
- __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
+ per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
}

void touch_softlockup_watchdog(void)
{
- __get_cpu_var(watchdog_touch_ts) = 0;
+ __raw_get_cpu_var(watchdog_touch_ts) = 0;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

@@ -142,7 +141,7 @@ void touch_all_softlockup_watchdogs(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
- __get_cpu_var(watchdog_nmi_touch) = true;
+ __raw_get_cpu_var(watchdog_nmi_touch) = true;
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

2010-08-17 12:48:48

by Yong Zhang

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Tue, Aug 17, 2010 at 01:28:19PM +0300, Sergey Senozhatsky wrote:
> > So preempt_disable() is redundant here.
> >
>
> Shouldn't we be for sure not preepmtible when calling __raw_get_cpu_var?

IMHO, it's the caller's responsibility.

>
> preempt_disable is reduntant here because current_thread_info()->cpu is
> atomic and we just don't want preempt_(enable|disable) overhead?

Yep.

Thanks,
Yong

2010-08-17 12:57:07

by Yong Zhang

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Tue, Aug 17, 2010 at 01:39:48PM +0300, Sergey Senozhatsky wrote:
> Please kindly review.
>
> ---
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 613bc1f..22dd388 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -116,13 +116,12 @@ static unsigned long get_sample_period(void)
> static void __touch_watchdog(void)
> {
> int this_cpu = smp_processor_id();
> -
> - __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
> + per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
> }

The two callers of __touch_watchdog() are:
1) watchdog_timer_fn(): preemption is disabled when it is called.
2) watchdog(): it's bound to one cpu, which means smp_processor_id() can be
used safely.

So I think this change is needless, but anyway it's harmless.

Below looks fine to me.
But you still need comments from others.

Thanks,
Yong

>
> void touch_softlockup_watchdog(void)
> {
> - __get_cpu_var(watchdog_touch_ts) = 0;
> + __raw_get_cpu_var(watchdog_touch_ts) = 0;
> }
> EXPORT_SYMBOL(touch_softlockup_watchdog);
>
> @@ -142,7 +141,7 @@ void touch_all_softlockup_watchdogs(void)
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> void touch_nmi_watchdog(void)
> {
> - __get_cpu_var(watchdog_nmi_touch) = true;
> + __raw_get_cpu_var(watchdog_nmi_touch) = true;
> touch_softlockup_watchdog();
> }
> EXPORT_SYMBOL(touch_nmi_watchdog);

2010-08-17 13:14:12

by Don Zickus

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Tue, Aug 17, 2010 at 01:39:48PM +0300, Sergey Senozhatsky wrote:
> Please kindly review.

I don't have a deep enough understanding of the subtleties between
per_cpu, __get_cpu_var, and __raw_get_cpu_var to really say which is
correct. To me, all three versions of your patch look like they do the same
thing.

Technically, it seems like preempt_disable/enable would be the correct
thing to do. But as someone pointed out earlier, if the code is preempted
and switches cpu, then the touch_*_watchdog effectively becomes a no-op
(which I guess it can do even with the preempt_disable/enable surrounding
it). So I have no idea. I am going to wait for smarter people than me to
provide an opinion. :-)

Cheers,
Don

>
> ---
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 613bc1f..22dd388 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -116,13 +116,12 @@ static unsigned long get_sample_period(void)
> static void __touch_watchdog(void)
> {
> int this_cpu = smp_processor_id();
> -
> - __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
> + per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
> }
>
> void touch_softlockup_watchdog(void)
> {
> - __get_cpu_var(watchdog_touch_ts) = 0;
> + __raw_get_cpu_var(watchdog_touch_ts) = 0;
> }
> EXPORT_SYMBOL(touch_softlockup_watchdog);
>
> @@ -142,7 +141,7 @@ void touch_all_softlockup_watchdogs(void)
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> void touch_nmi_watchdog(void)
> {
> - __get_cpu_var(watchdog_nmi_touch) = true;
> + __raw_get_cpu_var(watchdog_nmi_touch) = true;
> touch_softlockup_watchdog();
> }
> EXPORT_SYMBOL(touch_nmi_watchdog);
>

2010-08-18 02:48:15

by Frederic Weisbecker

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Tue, Aug 17, 2010 at 09:13:20AM -0400, Don Zickus wrote:
> On Tue, Aug 17, 2010 at 01:39:48PM +0300, Sergey Senozhatsky wrote:
> > Please kindly review.
>
> I don't have a deep enough understanding of the subtleties between
> per_cpu, __get_cpu_var, and __raw_get_cpu_var to really say which is
> correct. To me, all three versions of your patch look they do the same
> thing.
>
> Technically, it seems like preempt_disable/enable would be the correct
> thing to do. But as someone pointed out earlier, if the code is preempted
> and switches cpu, then the touch_*_watchdog effectively becomes a no-op
> (which I guess it can do even with the preempt_disable/enable surrounding
> it). So I have no idea. I am going to wait for smarter people than me to
> provide an opinion. :-)
>
> Cheers,
> Don


(Adding Len Brown in Cc.

Len, this is about acpi_os_stall() that touches the watchdog while
running in a preemptable section, this triggers warnings because of
the use of local cpu accessors. We are debating about the appropriate
way to solve this).

The more I think about it, the more I think that it doesn't make sense
to have touch_nmi_watchdog() callable from preemptable code.

It is buggy by nature.

If you run in a preemptable section, then interrupts can fire, and if
they can, the nmi watchdog is fine and doesn't need to be touched.

Here the problem is more in the softlockup watchdog, because even if you
run in a preemptable section, if you run a !CONFIG_PREEMPT kernel, then
you can't be preempted and the watchdog won't be scheduled until the
udelay loop finishes. But to solve that you would need cond_resched()
calls, not touching the watchdog.

Because touching the softlockup watchdog doesn't make sense either
if you can migrate: you can run the udelay on CPU 0, then migrate on
CPU 1 and call touch_softlockup_watchdog() from there. Which makes
definitely no sense. This is buggy.

And because we want to avoid such buggy uses of the touch_whatever_watchdog()
APIs, these functions must continue to check they are called from non-preemptable
code. Randomly touching the watchdog could hide real lockups to the user.

The problem is on the caller. Considering such udelays loop:

* if it's in a irq disabled section, call touch_nmi_watchdog(), because this
could prevent the nmi watchdog irq from firing
* if it's in a non-preemptable section, call touch_softlockup_watchdog(), because
this could prevent the softlockup watchdog task from being scheduled
* if it's from a preemptable task context, this should call cond_resched() to
avoid huge latencies on !CONFIG_PREEMPT


But acpi_os_stall() seem to be called from 4 different places, and these places
may run in different context like the above described.

The ACPI code should probably use more specific busy-loop APIs, depending on the
context it runs.

2010-08-18 19:34:19

by Andrew Morton

[permalink] [raw]
Subject: Re: fix BUG: using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Fri, 13 Aug 2010 13:21:58 +0300
Sergey Senozhatsky <[email protected]> wrote:

> Hello,
>
> Got this traces today:
>
> ...
>
> Avoid using smp_processor_id in touch_softlockup_watchdog and touch_nmi_watchdog.
> Patch also "removes" second call to smp_processor_id in __touch_watchdog
> (smp_processor_id itself and smp_processor_id in __get_cpu_var).
>
> ---
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 613bc1f..8822f1e 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -116,13 +116,14 @@ static unsigned long get_sample_period(void)
> static void __touch_watchdog(void)
> {
> int this_cpu = smp_processor_id();
> -
> - __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu);
> + per_cpu(watchdog_touch_ts, this_cpu) = get_timestamp(this_cpu);
> }

Fair enough, although strictly speaking this should be done in a
separate and later patch.

> void touch_softlockup_watchdog(void)
> {
> - __get_cpu_var(watchdog_touch_ts) = 0;
> + int this_cpu = get_cpu();
> + per_cpu(watchdog_touch_ts, this_cpu) = 0;
> + put_cpu();
> }
> EXPORT_SYMBOL(touch_softlockup_watchdog);
>
> @@ -142,7 +143,9 @@ void touch_all_softlockup_watchdogs(void)
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> void touch_nmi_watchdog(void)
> {
> - __get_cpu_var(watchdog_nmi_touch) = true;
> + int this_cpu = get_cpu();
> + per_cpu(watchdog_nmi_touch, this_cpu) = true;
> + put_cpu();
> touch_softlockup_watchdog();
> }
> EXPORT_SYMBOL(touch_nmi_watchdog);

Why did this start happening? Surely we've called
touch_softlockup_watchdog() from within preemptible code before now.
Methinks that

: commit 26e09c6eee14f4827b55137ba0eedc4e77cd50ab
: Author: Don Zickus <[email protected]>
: AuthorDate: Mon May 17 18:06:04 2010 -0400
: Commit: Frederic Weisbecker <[email protected]>
: CommitDate: Wed May 19 11:32:14 2010 +0200
:
: lockup_detector: Convert per_cpu to __get_cpu_var for readability

was simply broken? That would be strange, given that it's been sitting
around since May 17.

If we don't want to revert 26e09c6eee14f4827b55137ba0eedc4e77cd50ab
then I'd suggest that we simply switch to raw_smp_processor_id(): those
newly-added get_cpu/put_cpu calls don't do anything useful.

2010-08-18 20:02:28

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Wed, 18 Aug 2010 04:48:05 +0200
Frederic Weisbecker <[email protected]> wrote:

> On Tue, Aug 17, 2010 at 09:13:20AM -0400, Don Zickus wrote:
> > On Tue, Aug 17, 2010 at 01:39:48PM +0300, Sergey Senozhatsky wrote:
> > > Please kindly review.
> >
> > I don't have a deep enough understanding of the subtleties between
> > per_cpu, __get_cpu_var, and __raw_get_cpu_var to really say which is
> > correct. To me, all three versions of your patch look they do the same
> > thing.
> >
> > Technically, it seems like preempt_disable/enable would be the correct
> > thing to do. But as someone pointed out earlier, if the code is preempted
> > and switches cpu, then the touch_*_watchdog effectively becomes a no-op
> > (which I guess it can do even with the preempt_disable/enable surrounding
> > it). So I have no idea. I am going to wait for smarter people than me to
> > provide an opinion. :-)
> >
> > Cheers,
> > Don
>
>
> (Adding Len Brown in Cc.

I'm not sure who looks after osl.c. I added linux-acpi to cc.

> Len, this is about acpi_os_stall() that touches the watchdog while
> running in a preemptable section, this triggers warnings because of
> the use of local cpu accessors. We are debating about the appropriate
> way to solve this).
>
> The more I think about it, the more I think that doesn't make sense
> to have touch_nmi_watchdog() callable from preemptable code.
>
> It is buggy by nature.
>
> If you run in a preemptable section, then interrupts can fire, and if
> they can, the nmi watchdog is fine and doesn't need to be touched.
>
> Here the problem is more in the softlockup watchdog, because even if you
> run in a preemptable section, if you run a !CONFIG_PREEMPT kernel, then
> you can't be preempted and the watchdog won't be scheduled until the
> udelay loop finishes. But to solve that you would need cond_resched()
> calls, not touching the watchdog.
>
> Because touching the softlockup watchdog doesn't make sense either
> if you can migrate: you can run the udelay on CPU 0, then migrate on
> CPU 1 and call touch_softlockup_watchdog() from there. Which makes
> definetely no sense. This is buggy.
>
> And because we want to avoid such buggy uses of the touch_whatever_watchdog()
> APIs, these function must continue to check they are called from non-preemptable
> code. Randomly touching the watchdog could hide real lockups to the user.
>
> The problem is on the caller. Considering such udelays loop:
>
> * if it's in a irq disabled section, call touch_nmi_watchdog(), because this
> could prevent the nmi watchdog irq from firing
> * if it's in a non-preemptable section, call touch_softlockup_watchdog(), because
> this could prevent the softlockup watchdog task from beeing scheduled
> * if it's from a preemptable task context, this should call cond_resched() to
> avoid huge latencies on !CONFIG_PREEMPT
>
>
> But acpi_os_stall() seem to be called from 4 different places, and these places
> may run in different context like the above described.
>
> The ACPI code should probably use more specific busy-loop APIs, depending on the
> context it runs.

The touch_nmi_watchdog() was added to acpi_os_stall() by little old me
in 2003. It was committed by Andy with the patch title "ACPI:
Correctly handle NMI watchdog during long stalls (Andrew Morton)". My
title was "ACPI poweroff trigers the NMI watchdog". My changelog was

ACPI poweroff trigers the NMI watchdog. Fix.

(My spelling has improved with age).

So. If we remove that touch, will poweroff still trigger the NMI?
Dunno.


The surprise new requirement that touch_nmi_watchdog() be called from
non-preemptible code does seem to make sense IMO. It's hard to see why
anyone would be touching the watchdog unless he's spinning in irqs-off
code. Except, of course, when we have a utility function which can be
called from either irqs-on or irqs-off: acpi_os_stall().

That being said, it's not good to introduce new API requirements by
accident! An audit of all callers should first be performed, at least.


The surprise new requirement that touch_softlockup_watchdog() be called
from non-preemptible code doesn't make sense IMO. If I have a piece of
code in the kernel which I expect to sit in TASK_UNINTERRUPTIBLE state
for three minutes waiting for my egg to boil, I should be able to do
that and I should be able to touch the softlockup detector without
needing to go non-preemptible.

2010-08-18 21:44:53

by Cyrill Gorcunov

[permalink] [raw]
Subject: Re: fix BUG: using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Wed, Aug 18, 2010 at 12:33:46PM -0700, Andrew Morton wrote:
> On Fri, 13 Aug 2010 13:21:58 +0300
> Sergey Senozhatsky <[email protected]> wrote:
>
> > Hello,
> >
> > Got this traces today:
> >
> > ...
> >

...

> > void touch_softlockup_watchdog(void)
> > {
> > - __get_cpu_var(watchdog_touch_ts) = 0;
> > + int this_cpu = get_cpu();
> > + per_cpu(watchdog_touch_ts, this_cpu) = 0;
> > + put_cpu();
> > }
> > EXPORT_SYMBOL(touch_softlockup_watchdog);
> >
> > @@ -142,7 +143,9 @@ void touch_all_softlockup_watchdogs(void)
> > #ifdef CONFIG_HARDLOCKUP_DETECTOR
> > void touch_nmi_watchdog(void)
> > {
> > - __get_cpu_var(watchdog_nmi_touch) = true;
> > + int this_cpu = get_cpu();
> > + per_cpu(watchdog_nmi_touch, this_cpu) = true;
> > + put_cpu();
> > touch_softlockup_watchdog();
> > }
> > EXPORT_SYMBOL(touch_nmi_watchdog);
>
> Why did this start happening? Surely we've called
> touch_softlockup_watchdog() from within preemptible code before now.

indeed, and we've been using __raw interface before (2.6.18)

void touch_softlockup_watchdog(void)
{
__raw_get_cpu_var(touch_timestamp) = jiffies;
}

> Methinks that
>
> : commit 26e09c6eee14f4827b55137ba0eedc4e77cd50ab
> : Author: Don Zickus <[email protected]>
> : AuthorDate: Mon May 17 18:06:04 2010 -0400
> : Commit: Frederic Weisbecker <[email protected]>
> : CommitDate: Wed May 19 11:32:14 2010 +0200
> :
> : lockup_detector: Convert per_cpu to __get_cpu_var for readability
>
> was simply broken? That would be strange, given that it's been sitting
> around since May 17.
>
> If we don't want to revert 26e09c6eee14f4827b55137ba0eedc4e77cd50ab
> then I'd suggest that we simply switch to raw_smp_processor_id(): those
> newly-added get_cpu/put_cpu calls don't do anything useful.
>

I think it's fine to use __get_cpu_var in touch_nmi_watchdog (which should not
be called with irq enabled since it ticks anyway then, at least on x86) for hardware
nmi watchdog, can't conclude anything about softlockup (except that we had __raw
interface before) since I'm not familiar with softlockup watchdog at the moment.

-- Cyrill

2010-08-19 02:28:15

by Don Zickus

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Wed, Aug 18, 2010 at 01:01:56PM -0700, Andrew Morton wrote:
> The surprise new requirement that touch_nmi_watchdog() be called from
> non-preemptible code does seem to make sense IMO. It's hard to see why
> anyone would be touching the watchdog unless he's spinning in irqs-off
> code. Except, of course, when we have a utility function which can be
> called from wither irqs-on or irqs-off: acpi_os_stall().
>
> That being said, it's not good to introduce new API requirements by
> accident! An audit of all callers should first be performed, at least.
>
>
> The surprise new requirement that touch_softlockup_watchdog() be called
> from non-preemptible code doesn't make sense IMO. If I have a piece of
> code in the kernel which I expect to sit in TASK_UNINTERRUPTIBLE state
> for three minutes waiting for my egg to boil, I should be able to do
> that and I should be able to touch the softlockup detector without
> needing to go non-preemptible.

Wow. So after re-reading what the original touch_*_watchdog code did and what I
copied to kernel/watchdog.c, I'm a little embarrassed on how I managed to
mangle the internals of both those functions.

While the idea is the same, the semantics are clearly different.

touch_nmi_watchdog had a for_each_cpu_present loop, which means it didn't
have to deal with the preempt issue.

touch_softlockup_watchdog used __raw_get_cpu_var to excuse itself from
dealing with the preempt issue.

I'll put together a patch that brings those functions back in line with
what they used to be. Sorry for the trouble.

Cheers,
Don

2010-08-20 02:58:46

by Don Zickus

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Wed, Aug 18, 2010 at 01:01:56PM -0700, Andrew Morton wrote:
> The surprise new requirement that touch_nmi_watchdog() be called from
> non-preemptible code does seem to make sense IMO. It's hard to see why
> anyone would be touching the watchdog unless he's spinning in irqs-off
> code. Except, of course, when we have a utility function which can be
> called from wither irqs-on or irqs-off: acpi_os_stall().
>
> That being said, it's not good to introduce new API requirements by
> accident! An audit of all callers should first be performed, at least.
>
>
> The surprise new requirement that touch_softlockup_watchdog() be called
> from non-preemptible code doesn't make sense IMO. If I have a piece of
> code in the kernel which I expect to sit in TASK_UNINTERRUPTIBLE state
> for three minutes waiting for my egg to boil, I should be able to do
> that and I should be able to touch the softlockup detector without
> needing to go non-preemptible.

Ok, so here is my patch that syncs the touch_*_watchdog back in line with
the old semantics. Hopefully this will undo any harm I caused.

------------cut -->---------------------------

>From b372e821c804982438db090db6b4a2f753c78091 Mon Sep 17 00:00:00 2001
From: Don Zickus <[email protected]>
Date: Thu, 19 Aug 2010 22:48:26 -0400
Subject: [PATCH] [lockup detector] sync touch_*_watchdog back to old semantics

During my rewrite, the semantics of touch_nmi_watchdog and
touch_softlockup_watchdog changed enough to break some drivers
(mostly over preemptable regions).

This change brings those touch_*_watchdog functions back in line
to how they used to work.

Signed-off-by: Don Zickus <[email protected]>
---
kernel/watchdog.c | 17 ++++++++++++-----
1 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 613bc1f..99e35a2 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -122,7 +122,7 @@ static void __touch_watchdog(void)

void touch_softlockup_watchdog(void)
{
- __get_cpu_var(watchdog_touch_ts) = 0;
+ __raw_get_cpu_var(watchdog_touch_ts) = 0;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

@@ -142,7 +142,14 @@ void touch_all_softlockup_watchdogs(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
- __get_cpu_var(watchdog_nmi_touch) = true;
+ if (watchdog_enabled) {
+ unsigned cpu;
+
+ for_each_present_cpu(cpu) {
+ if (per_cpu(watchdog_nmi_touch, cpu) != true)
+ per_cpu(watchdog_nmi_touch, cpu) = true;
+ }
+ }
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -430,6 +437,9 @@ static int watchdog_enable(int cpu)
wake_up_process(p);
}

+ /* if any cpu succeeds, watchdog is considered enabled for the system */
+ watchdog_enabled = 1;
+
return 0;
}

@@ -452,9 +462,6 @@ static void watchdog_disable(int cpu)
per_cpu(softlockup_watchdog, cpu) = NULL;
kthread_stop(p);
}
-
- /* if any cpu succeeds, watchdog is considered enabled for the system */
- watchdog_enabled = 1;
}

static void watchdog_enable_all_cpus(void)
--
1.7.2.1

2010-08-20 03:40:40

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Thu, 19 Aug 2010 22:57:49 -0400 Don Zickus <[email protected]> wrote:

> On Wed, Aug 18, 2010 at 01:01:56PM -0700, Andrew Morton wrote:
> > The surprise new requirement that touch_nmi_watchdog() be called from
> > non-preemptible code does seem to make sense IMO. It's hard to see why
> > anyone would be touching the watchdog unless he's spinning in irqs-off
> > code. Except, of course, when we have a utility function which can be
> > called from wither irqs-on or irqs-off: acpi_os_stall().
> >
> > That being said, it's not good to introduce new API requirements by
> > accident! An audit of all callers should first be performed, at least.
> >
> >
> > The surprise new requirement that touch_softlockup_watchdog() be called
> > from non-preemptible code doesn't make sense IMO. If I have a piece of
> > code in the kernel which I expect to sit in TASK_UNINTERRUPTIBLE state
> > for three minutes waiting for my egg to boil, I should be able to do
> > that and I should be able to touch the softlockup detector without
> > needing to go non-preemptible.
>
> Ok, so here is my patch that syncs the touch_*_watchdog back in line with
> the old semantics. Hopefully this will undo any harm I caused.
>
> ------------cut -->---------------------------
>
> >From b372e821c804982438db090db6b4a2f753c78091 Mon Sep 17 00:00:00 2001
> From: Don Zickus <[email protected]>
> Date: Thu, 19 Aug 2010 22:48:26 -0400
> Subject: [PATCH] [lockup detector] sync touch_*_watchdog back to old semantics
>
> During my rewrite, the semantics of touch_nmi_watchdog and
> touch_softlockup_watchdog changed enough to break some drivers
> (mostly over preemptable regions).
>
> This change brings those touch_*_watchdog functions back in line
> to how they used to work.
>
> Signed-off-by: Don Zickus <[email protected]>
> ---
> kernel/watchdog.c | 17 ++++++++++++-----
> 1 files changed, 12 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 613bc1f..99e35a2 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -122,7 +122,7 @@ static void __touch_watchdog(void)
>
> void touch_softlockup_watchdog(void)
> {
> - __get_cpu_var(watchdog_touch_ts) = 0;
> + __raw_get_cpu_var(watchdog_touch_ts) = 0;
> }
> EXPORT_SYMBOL(touch_softlockup_watchdog);
>
> @@ -142,7 +142,14 @@ void touch_all_softlockup_watchdogs(void)
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> void touch_nmi_watchdog(void)
> {
> - __get_cpu_var(watchdog_nmi_touch) = true;
> + if (watchdog_enabled) {
> + unsigned cpu;
> +
> + for_each_present_cpu(cpu) {
> + if (per_cpu(watchdog_nmi_touch, cpu) != true)
> + per_cpu(watchdog_nmi_touch, cpu) = true;
> + }
> + }
> touch_softlockup_watchdog();
> }
> EXPORT_SYMBOL(touch_nmi_watchdog);
> @@ -430,6 +437,9 @@ static int watchdog_enable(int cpu)
> wake_up_process(p);
> }
>
> + /* if any cpu succeeds, watchdog is considered enabled for the system */
> + watchdog_enabled = 1;
> +
> return 0;
> }
>
> @@ -452,9 +462,6 @@ static void watchdog_disable(int cpu)
> per_cpu(softlockup_watchdog, cpu) = NULL;
> kthread_stop(p);
> }
> -
> - /* if any cpu succeeds, watchdog is considered enabled for the system */
> - watchdog_enabled = 1;
> }
>
> static void watchdog_enable_all_cpus(void)

hm, the code seems a bit screwy. Maybe it was always thus.

watchdog_enabled gets set in the per-cpu function but it gets cleared
in the all-cpus function. Asymmetric.

Also afacit the action of cpu-hotunplug+cpu-hotplug will reenable the
watchdog on a CPU which was supposed to have it disabled. Perhaps you
could recheck that and make sure it all makes sense - perhaps we need a
separate state variable which is purely "current setting of
/proc/sys/kernel/nmi_watchdog" and doesn't get altered internally.

Anyway, I'll be disappearing for a few days so perhaps Frederic or hpa
can help get this all fixed/merged up?

2010-08-20 12:35:55

by Don Zickus

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Thu, Aug 19, 2010 at 08:42:56PM -0700, Andrew Morton wrote:
> On Thu, 19 Aug 2010 22:57:49 -0400 Don Zickus <[email protected]> wrote:
>
> > On Wed, Aug 18, 2010 at 01:01:56PM -0700, Andrew Morton wrote:
> > @@ -430,6 +437,9 @@ static int watchdog_enable(int cpu)
> > wake_up_process(p);
> > }
> >
> > + /* if any cpu succeeds, watchdog is considered enabled for the system */
> > + watchdog_enabled = 1;
> > +
> > return 0;
> > }
> >
> > @@ -452,9 +462,6 @@ static void watchdog_disable(int cpu)
> > per_cpu(softlockup_watchdog, cpu) = NULL;
> > kthread_stop(p);
> > }
> > -
> > - /* if any cpu succeeds, watchdog is considered enabled for the system */
> > - watchdog_enabled = 1;
> > }
> >
> > static void watchdog_enable_all_cpus(void)
>
> hm, the code seems a bit screwy. Maybe it was always thus.

No, watchdog_enabled was something newly created for the lockup detector.

>
> watchdog_enabled gets set in the per-cpu function but it gets cleared
> in the all-cpus function. Asymmetric.

Yes it is by design. I was using watchdog_enabled as a global state
variable. As soon as one cpu was enabled, I would set the bit. But only
if all the cpus disabled the watchdog would I clear the bit.

>
> Also afacit the action of cpu-hotunplug+cpu-hotplug will reenable the
> watchdog on a CPU which was supposed to have it disabled. Perhaps you
> could recheck that and make sure it all makes sense - perhaps we need a
> separate state variable which is purely "current setting of
> /proc/sys/kernel/nmi_watchdog" and doesn't get altered internally.

I wasn't tracking it on a per cpu basis. I didn't see a need to. The
watchdog should globally be on/off across the system. If a system comes
up and one of the cpus could not bring the watchdog online for some
reason, then that is a problem. If a cpu-hotunplug+cpu-hotplug fixes it,
all the better. :-)

Also, if I wanted to track it per cpu, there is a bunch of status bits in
per-cpu variables that could let the code know whether a particular cpu
watchdog is on/off for either hardlockup or softlockup.

Cheers,
Don

2010-08-20 15:02:45

by Yong Zhang

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Thu, Aug 19, 2010 at 10:57:49PM -0400, Don Zickus wrote:
> On Wed, Aug 18, 2010 at 01:01:56PM -0700, Andrew Morton wrote:
> > The surprise new requirement that touch_nmi_watchdog() be called from
> > non-preemptible code does seem to make sense IMO. It's hard to see why
> > anyone would be touching the watchdog unless he's spinning in irqs-off
> > code. Except, of course, when we have a utility function which can be
> > called from wither irqs-on or irqs-off: acpi_os_stall().
> >
> > That being said, it's not good to introduce new API requirements by
> > accident! An audit of all callers should first be performed, at least.
> >
> >
> > The surprise new requirement that touch_softlockup_watchdog() be called
> > from non-preemptible code doesn't make sense IMO. If I have a piece of
> > code in the kernel which I expect to sit in TASK_UNINTERRUPTIBLE state
> > for three minutes waiting for my egg to boil, I should be able to do
> > that and I should be able to touch the softlockup detector without
> > needing to go non-preemptible.
>
> Ok, so here is my patch that syncs the touch_*_watchdog back in line with
> the old semantics. Hopefully this will undo any harm I caused.
>
> ------------cut -->---------------------------
>
> >From b372e821c804982438db090db6b4a2f753c78091 Mon Sep 17 00:00:00 2001
> From: Don Zickus <[email protected]>
> Date: Thu, 19 Aug 2010 22:48:26 -0400
> Subject: [PATCH] [lockup detector] sync touch_*_watchdog back to old semantics
>
> During my rewrite, the semantics of touch_nmi_watchdog and
> touch_softlockup_watchdog changed enough to break some drivers
> (mostly over preemptable regions).
>
> This change brings those touch_*_watchdog functions back in line
> to how they used to work.

This one looks good to me.
Thank you Don.

-Yong

>
> Signed-off-by: Don Zickus <[email protected]>
> ---
> kernel/watchdog.c | 17 ++++++++++++-----
> 1 files changed, 12 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 613bc1f..99e35a2 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -122,7 +122,7 @@ static void __touch_watchdog(void)
>
> void touch_softlockup_watchdog(void)
> {
> - __get_cpu_var(watchdog_touch_ts) = 0;
> + __raw_get_cpu_var(watchdog_touch_ts) = 0;
> }
> EXPORT_SYMBOL(touch_softlockup_watchdog);
>
> @@ -142,7 +142,14 @@ void touch_all_softlockup_watchdogs(void)
> #ifdef CONFIG_HARDLOCKUP_DETECTOR
> void touch_nmi_watchdog(void)
> {
> - __get_cpu_var(watchdog_nmi_touch) = true;
> + if (watchdog_enabled) {
> + unsigned cpu;
> +
> + for_each_present_cpu(cpu) {
> + if (per_cpu(watchdog_nmi_touch, cpu) != true)
> + per_cpu(watchdog_nmi_touch, cpu) = true;
> + }
> + }
> touch_softlockup_watchdog();
> }
> EXPORT_SYMBOL(touch_nmi_watchdog);
> @@ -430,6 +437,9 @@ static int watchdog_enable(int cpu)
> wake_up_process(p);
> }
>
> + /* if any cpu succeeds, watchdog is considered enabled for the system */
> + watchdog_enabled = 1;
> +
> return 0;
> }
>
> @@ -452,9 +462,6 @@ static void watchdog_disable(int cpu)
> per_cpu(softlockup_watchdog, cpu) = NULL;
> kthread_stop(p);
> }
> -
> - /* if any cpu succeeds, watchdog is considered enabled for the system */
> - watchdog_enabled = 1;
> }
>
> static void watchdog_enable_all_cpus(void)
> --
> 1.7.2.1

2010-08-26 10:14:42

by Maxim Levitsky

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Thu, 2010-08-19 at 22:57 -0400, Don Zickus wrote:
> On Wed, Aug 18, 2010 at 01:01:56PM -0700, Andrew Morton wrote:
> > The surprise new requirement that touch_nmi_watchdog() be called from
> > non-preemptible code does seem to make sense IMO. It's hard to see why
> > anyone would be touching the watchdog unless he's spinning in irqs-off
> > code. Except, of course, when we have a utility function which can be
> > called from wither irqs-on or irqs-off: acpi_os_stall().
> >
> > That being said, it's not good to introduce new API requirements by
> > accident! An audit of all callers should first be performed, at least.
> >
> >
> > The surprise new requirement that touch_softlockup_watchdog() be called
> > from non-preemptible code doesn't make sense IMO. If I have a piece of
> > code in the kernel which I expect to sit in TASK_UNINTERRUPTIBLE state
> > for three minutes waiting for my egg to boil, I should be able to do
> > that and I should be able to touch the softlockup detector without
> > needing to go non-preemptible.
>
> Ok, so here is my patch that syncs the touch_*_watchdog back in line with
> the old semantics. Hopefully this will undo any harm I caused.

Was this patch forgotten?

Best regards,
Maxim Levitsky

2010-08-26 14:41:34

by Don Zickus

[permalink] [raw]
Subject: Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog

On Thu, Aug 26, 2010 at 01:14:31PM +0300, Maxim Levitsky wrote:
> On Thu, 2010-08-19 at 22:57 -0400, Don Zickus wrote:
> > On Wed, Aug 18, 2010 at 01:01:56PM -0700, Andrew Morton wrote:
> > > The surprise new requirement that touch_nmi_watchdog() be called from
> > > non-preemptible code does seem to make sense IMO. It's hard to see why
> > > anyone would be touching the watchdog unless he's spinning in irqs-off
> > > code. Except, of course, when we have a utility function which can be
> > > called from wither irqs-on or irqs-off: acpi_os_stall().
> > >
> > > That being said, it's not good to introduce new API requirements by
> > > accident! An audit of all callers should first be performed, at least.
> > >
> > >
> > > The surprise new requirement that touch_softlockup_watchdog() be called
> > > from non-preemptible code doesn't make sense IMO. If I have a piece of
> > > code in the kernel which I expect to sit in TASK_UNINTERRUPTIBLE state
> > > for three minutes waiting for my egg to boil, I should be able to do
> > > that and I should be able to touch the softlockup detector without
> > > needing to go non-preemptible.
> >
> > Ok, so here is my patch that syncs the touch_*_watchdog back in line with
> > the old semantics. Hopefully this will undo any harm I caused.
>
> Was this patch forgotten?

Hm, apparently it was separated out by the mail server. Here it is again
with some of the headers removed I guess.

Cheers,
Don


From: Don Zickus <[email protected]>
Date: Thu, 19 Aug 2010 22:48:26 -0400
Subject: [PATCH] [lockup detector] sync touch_*_watchdog back to old semantics

During my rewrite, the semantics of touch_nmi_watchdog and
touch_softlockup_watchdog changed enough to break some drivers
(mostly over preemptable regions).

This change brings those touch_*_watchdog functions back in line
to how they used to work.

Signed-off-by: Don Zickus <[email protected]>
---
kernel/watchdog.c | 17 ++++++++++++-----
1 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 613bc1f..99e35a2 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -122,7 +122,7 @@ static void __touch_watchdog(void)

void touch_softlockup_watchdog(void)
{
- __get_cpu_var(watchdog_touch_ts) = 0;
+ __raw_get_cpu_var(watchdog_touch_ts) = 0;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);

@@ -142,7 +142,14 @@ void touch_all_softlockup_watchdogs(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
void touch_nmi_watchdog(void)
{
- __get_cpu_var(watchdog_nmi_touch) = true;
+ if (watchdog_enabled) {
+ unsigned cpu;
+
+ for_each_present_cpu(cpu) {
+ if (per_cpu(watchdog_nmi_touch, cpu) != true)
+ per_cpu(watchdog_nmi_touch, cpu) = true;
+ }
+ }
touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -430,6 +437,9 @@ static int watchdog_enable(int cpu)
wake_up_process(p);
}

+ /* if any cpu succeeds, watchdog is considered enabled for the system */
+ watchdog_enabled = 1;
+
return 0;
}

@@ -452,9 +462,6 @@ static void watchdog_disable(int cpu)
per_cpu(softlockup_watchdog, cpu) = NULL;
kthread_stop(p);
}
-
- /* if any cpu succeeds, watchdog is considered enabled for the system */
- watchdog_enabled = 1;
}

static void watchdog_enable_all_cpus(void)
--
1.7.2.1

2010-08-26 17:18:13

by Len Brown

[permalink] [raw]
Subject: acpi_os_stall() and touch_nmi_watchdog() (was Re: [PATCH] fix BUG using smp_processor_id() in touch_nmi_watchdog and touch_softlockup_watchdog)

acpi_os_stall() is used in two ways.

The typical way is what triggered this e-mail thread.
It implements the AML "Stall()" operator, and is called
with interrupts enabled with durations <= 100 usec.
So one would expect it to be identical to udelay().

The exception case is when ACPICA calls it with interrupts off
and huge durations when we wrote the poweroff or sleep
register, yet we find ourselves still running...

Apparently akpm added touch_nmi_watchdog() to keep the
watchdog from firing in this exception case.

Is it useful to have the watchdog running when
we are waiting for firmware to poweroff the machine?
If no, maybe we should turn it off as part of the shutdown
process rather than using yet another invocation
of touch_nmi_watchdog()?

Is calling delay() with IRQs disabled the best thing
we can do after we ask the firmware to cut power
and it takes a long time?

thanks,
Len Brown, Intel Open Source Technology Center