2005-04-12 05:38:58

by Shaohua Li

[permalink] [raw]
Subject: [PATCH 6/6]suspend/resume SMP support

Using CPU hotplug to support suspend/resume SMP. Both S3 and S4 use
disable/enable_nonboot_cpus API. The S4 part is based on Pavel's
original S4 SMP patch.

Signed-off-by: Li Shaohua<[email protected]>
---

linux-2.6.11-root/drivers/acpi/Kconfig | 2
linux-2.6.11-root/include/linux/suspend.h | 2
linux-2.6.11-root/kernel/power/Kconfig | 2
linux-2.6.11-root/kernel/power/disk.c | 36 ++++++-----
linux-2.6.11-root/kernel/power/main.c | 16 +++--
linux-2.6.11-root/kernel/power/smp.c | 91 +++++++++++-------------------
linux-2.6.11-root/kernel/power/swsusp.c | 2
7 files changed, 69 insertions(+), 82 deletions(-)

diff -puN drivers/acpi/Kconfig~smp_sleep drivers/acpi/Kconfig
--- linux-2.6.11/drivers/acpi/Kconfig~smp_sleep 2005-04-12 11:11:14.884685080 +0800
+++ linux-2.6.11-root/drivers/acpi/Kconfig 2005-04-12 11:11:14.898682952 +0800
@@ -57,7 +57,7 @@ if ACPI_INTERPRETER

config ACPI_SLEEP
bool "Sleep States (EXPERIMENTAL)"
- depends on X86
+ depends on X86 && (!SMP || HOTPLUG_CPU)
depends on EXPERIMENTAL
default y
---help---
diff -puN include/linux/suspend.h~smp_sleep include/linux/suspend.h
--- linux-2.6.11/include/linux/suspend.h~smp_sleep 2005-04-12 11:11:14.885684928 +0800
+++ linux-2.6.11-root/include/linux/suspend.h 2005-04-12 11:11:14.898682952 +0800
@@ -58,7 +58,7 @@ static inline int software_suspend(void)
}
#endif

-#ifdef CONFIG_SMP
+#ifdef CONFIG_HOTPLUG_CPU
extern void disable_nonboot_cpus(void);
extern void enable_nonboot_cpus(void);
#else
diff -puN kernel/power/disk.c~smp_sleep kernel/power/disk.c
--- linux-2.6.11/kernel/power/disk.c~smp_sleep 2005-04-12 11:11:14.887684624 +0800
+++ linux-2.6.11-root/kernel/power/disk.c 2005-04-12 11:11:14.899682800 +0800
@@ -117,8 +117,8 @@ static void finish(void)
{
device_resume();
platform_finish();
- enable_nonboot_cpus();
thaw_processes();
+ enable_nonboot_cpus();
pm_restore_console();
}

@@ -131,28 +131,36 @@ static int prepare_processes(void)

sys_sync();

+ disable_nonboot_cpus();
+
if (freeze_processes()) {
error = -EBUSY;
- return error;
+ goto enable_cpu;
}

if (pm_disk_mode == PM_DISK_PLATFORM) {
if (pm_ops && pm_ops->prepare) {
if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
- return error;
+ goto thaw;
}
}

/* Free memory before shutting down devices. */
free_some_memory();
-
return 0;
+thaw:
+ thaw_processes();
+enable_cpu:
+ enable_nonboot_cpus();
+ pm_restore_console();
+ return error;
}

static void unprepare_processes(void)
{
- enable_nonboot_cpus();
+ platform_finish();
thaw_processes();
+ enable_nonboot_cpus();
pm_restore_console();
}

@@ -160,15 +168,9 @@ static int prepare_devices(void)
{
int error;

- disable_nonboot_cpus();
- if ((error = device_suspend(PMSG_FREEZE))) {
+ if ((error = device_suspend(PMSG_FREEZE)))
printk("Some devices failed to suspend\n");
- platform_finish();
- enable_nonboot_cpus();
- return error;
- }
-
- return 0;
+ return error;
}

/**
@@ -185,9 +187,9 @@ int pm_suspend_disk(void)
int error;

error = prepare_processes();
- if (!error) {
- error = prepare_devices();
- }
+ if (error)
+ return error;
+ error = prepare_devices();

if (error) {
unprepare_processes();
@@ -250,7 +252,7 @@ static int software_resume(void)

if ((error = prepare_processes())) {
swsusp_close();
- goto Cleanup;
+ goto Done;
}

pr_debug("PM: Reading swsusp image.\n");
diff -puN kernel/power/Kconfig~smp_sleep kernel/power/Kconfig
--- linux-2.6.11/kernel/power/Kconfig~smp_sleep 2005-04-12 11:11:14.888684472 +0800
+++ linux-2.6.11-root/kernel/power/Kconfig 2005-04-12 11:11:14.899682800 +0800
@@ -28,7 +28,7 @@ config PM_DEBUG

config SOFTWARE_SUSPEND
bool "Software Suspend (EXPERIMENTAL)"
- depends on EXPERIMENTAL && PM && SWAP
+ depends on EXPERIMENTAL && PM && SWAP && (HOTPLUG_CPU || !SMP)
---help---
Enable the possibility of suspending the machine.
It doesn't need APM.
diff -puN kernel/power/main.c~smp_sleep kernel/power/main.c
--- linux-2.6.11/kernel/power/main.c~smp_sleep 2005-04-12 11:11:14.890684168 +0800
+++ linux-2.6.11-root/kernel/power/main.c 2005-04-12 11:11:14.899682800 +0800
@@ -59,6 +59,13 @@ static int suspend_prepare(suspend_state

pm_prepare_console();

+ disable_nonboot_cpus();
+
+ if (num_online_cpus() != 1) {
+ error = -EPERM;
+ goto Enable_cpu;
+ }
+
if (freeze_processes()) {
error = -EAGAIN;
goto Thaw;
@@ -89,6 +96,8 @@ static int suspend_prepare(suspend_state
pm_ops->finish(state);
Thaw:
thaw_processes();
+ Enable_cpu:
+ enable_nonboot_cpus();
pm_restore_console();
return error;
}
@@ -127,6 +136,7 @@ static void suspend_finish(suspend_state
if (pm_ops && pm_ops->finish)
pm_ops->finish(state);
thaw_processes();
+ enable_nonboot_cpus();
pm_restore_console();
}

@@ -164,12 +174,6 @@ static int enter_state(suspend_state_t s
goto Unlock;
}

- /* Suspend is hard to get right on SMP. */
- if (num_online_cpus() != 1) {
- error = -EPERM;
- goto Unlock;
- }
-
pr_debug("PM: Preparing system for suspend\n");
if ((error = suspend_prepare(state)))
goto Unlock;
diff -puN kernel/power/smp.c~smp_sleep kernel/power/smp.c
--- linux-2.6.11/kernel/power/smp.c~smp_sleep 2005-04-12 11:11:14.891684016 +0800
+++ linux-2.6.11-root/kernel/power/smp.c 2005-04-12 11:11:14.899682800 +0800
@@ -13,73 +13,52 @@
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/module.h>
+#include <linux/cpu.h>
#include <asm/atomic.h>
#include <asm/tlbflush.h>

-static atomic_t cpu_counter, freeze;
-
-
-static void smp_pause(void * data)
-{
- struct saved_context ctxt;
- __save_processor_state(&ctxt);
- printk("Sleeping in:\n");
- dump_stack();
- atomic_inc(&cpu_counter);
- while (atomic_read(&freeze)) {
- /* FIXME: restore takes place at random piece inside this.
- This should probably be written in assembly, and
- preserve general-purpose registers, too
-
- What about stack? We may need to move to new stack here.
-
- This should better be ran with interrupts disabled.
- */
- cpu_relax();
- barrier();
- }
- atomic_dec(&cpu_counter);
- __restore_processor_state(&ctxt);
-}
-
-static cpumask_t oldmask;
+/* This is protected by pm_sem semaphore */
+static cpumask_t frozen_cpus;

void disable_nonboot_cpus(void)
{
- oldmask = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(0));
- printk("Freezing CPUs (at %d)", _smp_processor_id());
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(HZ);
- printk("...");
- BUG_ON(_smp_processor_id() != 0);
-
- /* FIXME: for this to work, all the CPUs must be running
- * "idle" thread (or we deadlock). Is that guaranteed? */
-
- atomic_set(&cpu_counter, 0);
- atomic_set(&freeze, 1);
- smp_call_function(smp_pause, NULL, 0, 0);
- while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
- cpu_relax();
- barrier();
+ int cpu, error;
+
+ error = 0;
+ cpus_clear(frozen_cpus);
+ printk("Freezing cpus ...\n");
+ for_each_online_cpu(cpu) {
+ if (cpu == 0)
+ continue;
+ error = cpu_down(cpu);
+ if (!error) {
+ cpu_set(cpu, frozen_cpus);
+ printk("CPU%d is down\n", cpu);
+ continue;
+ }
+ printk("Error taking cpu %d down: %d\n", cpu, error);
}
- printk("ok\n");
+ BUG_ON(smp_processor_id() != 0);
+ if (error)
+ panic("cpus not sleeping");
}

void enable_nonboot_cpus(void)
{
- printk("Restarting CPUs");
- atomic_set(&freeze, 0);
- while (atomic_read(&cpu_counter)) {
- cpu_relax();
- barrier();
- }
- printk("...");
- set_cpus_allowed(current, oldmask);
- schedule();
- printk("ok\n");
+ int cpu, error;

+ printk("Thawing cpus ...\n");
+ for_each_cpu_mask(cpu, frozen_cpus) {
+ error = smp_prepare_cpu(cpu);
+ if (!error)
+ error = cpu_up(cpu);
+ if (!error) {
+ printk("CPU%d is up\n", cpu);
+ continue;
+ }
+ printk("Error taking cpu %d up: %d\n", cpu, error);
+ panic("Not enough cpus");
+ }
+ cpus_clear(frozen_cpus);
}

-
diff -puN kernel/power/swsusp.c~smp_sleep kernel/power/swsusp.c
--- linux-2.6.11/kernel/power/swsusp.c~smp_sleep 2005-04-12 11:11:14.892683864 +0800
+++ linux-2.6.11-root/kernel/power/swsusp.c 2005-04-12 11:11:14.900682648 +0800
@@ -1194,8 +1194,10 @@ static const char * sanity_check(void)
return "version";
if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
return "machine";
+#if 0
if(swsusp_info.cpus != num_online_cpus())
return "number of cpus";
+#endif
return NULL;
}

_



2005-04-12 09:44:48

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

Hi!

> Using CPU hotplug to support suspend/resume SMP. Both S3 and S4 use
> disable/enable_nonboot_cpus API. The S4 part is based on Pavel's
> original S4 SMP patch.

The series looks good to me, but I was not yet able to actually try
it. Will try to do that in few hours.
Pavel

--
Boycott Kodak -- for their patent abuse against Java.

2005-04-12 11:24:12

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

Hi!

> Using CPU hotplug to support suspend/resume SMP. Both S3 and S4 use
> disable/enable_nonboot_cpus API. The S4 part is based on Pavel's
> original S4 SMP patch.

I tested it on 2x PII(?) 550MHz system. Suspend went ok, resume loaded
image from disk, but then I got

Thawing cpus ....
Booting processor 1/0 eip 3000

...and very funny effect on keyboard leds. They started to blink
(panic-like), but with very wrong frequency. It looked like 2 cpus
doing panic blinks at once...

Pavel

--
Boycott Kodak -- for their patent abuse against Java.

2005-04-12 12:44:48

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

On Tue, 2005-04-12 at 18:51, Pavel Machek wrote:
> > Using CPU hotplug to support suspend/resume SMP. Both S3 and S4 use
> > disable/enable_nonboot_cpus API. The S4 part is based on Pavel's
> > original S4 SMP patch.
>
> I tested it on 2x PII(?) 550MHz system. Suspend went ok, resume loaded
> image from disk, but then I got
>
> Thawing cpus ....
> Booting processor 1/0 eip 3000
>
> ...and very funny effect on keyboard leds. They started to blink
> (panic-like), but with very wrong frequency. It looked like 2 cpus
> doing panic blinks at once...
Check if /sys/device/system/cpu/cpu1/online attribute works. If it
works, then it's other issue. I only tested the patches in two HT based
systems.

Thanks,
Shaohua

2005-04-13 08:32:57

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

Hi!

> > > Using CPU hotplug to support suspend/resume SMP. Both S3 and S4 use
> > > disable/enable_nonboot_cpus API. The S4 part is based on Pavel's
> > > original S4 SMP patch.
> >
> > I tested it on 2x PII(?) 550MHz system. Suspend went ok, resume loaded
> > image from disk, but then I got
> >
> > Thawing cpus ....
> > Booting processor 1/0 eip 3000
> >
> > ...and very funny effect on keyboard leds. They started to blink
> > (panic-like), but with very wrong frequency. It looked like 2 cpus
> > doing panic blinks at once...
> Check if /sys/device/system/cpu/cpu1/online attribute works. If it
> works, then it's other issue. I only tested the patches in two HT based
> systems.

Ok, this is PIII system 550MHz system:

root@hobit:/home/pavel# cat /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 7
model name : Pentium III (Katmai)
stepping : 3
cpu MHz : 551.309
cache size : 512 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 2
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
mca cmov pat pse36 mmx fxsr sse
bogomips : 1085.44

core id : 255
cpu cores : 1
processor : 1
vendor_id : GenuineIntel
cpu family : 6
model : 7
model name : Pentium III (Katmai)
stepping : 3
cpu MHz : 551.309
cache size : 512 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 2
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
mca cmov pat pse36 mmx fxsr sse
bogomips : 1097.72

core id : 255
cpu cores : 1
root@hobit:/home/pavel#

Offlining CPU seems to work ok:

root@hobit:/sys/devices/system/cpu/cpu1# cat online
1
root@hobit:/sys/devices/system/cpu/cpu1# echo 0 > online
root@hobit:/sys/devices/system/cpu/cpu1# sync
root@hobit:/sys/devices/system/cpu/cpu1# cat /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 7
model name : Pentium III (Katmai)
stepping : 3
cpu MHz : 551.309
cache size : 512 KB
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 2
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge
mca cmov pat pse36 mmx fxsr sse
bogomips : 1085.44

core id : 255
cpu cores : 1
root@hobit:/sys/devices/system/cpu/cpu1#

Putting cpu back online, I get:

Booting processor 1/0 eip 3000

on console, and mess in syslog. Seems like my cpu #1 panicked but cpu
#0 just keeps going?!

root@hobit:/sys/devices/system/cpu/cpu1# dmesg | tail -20
[<c02c758f>] vt_console_print+0x24f/0x260
[<c02c7340>] vt_console_print+0x0/0x260
[<c01247e7>] __call_console_drivers+0x57/0x60
[<c01248e0>] call_console_drivers+0x80/0x110
[<c0124d8e>] release_console_sem+0x4e/0xc0
[<c0124c12>] vprintk+0x192/0x240
[<c0528891>] preempt_schedule_irq+0x51/0x80
[<c02adeca>] acpi_processor_idle+0x0/0x265
[<c010325e>] need_resched+0x1f/0x21
[<c02adeca>] acpi_processor_idle+0x0/0x265
[<c0124a77>] printk+0x17/0x20
[<c010b583>] cpu_init+0x73/0x360
[<c0117bd6>] start_secondary+0x6/0x170
Code: d2 74 bd fc 8b 44 24 28 b9 0e 00 00 00 8b 74 24 14 01 c6 b8 0e
00 00 00 89 74 24 1c 8b 74 24 30 89 44 24 10 8b 7c 24 1c 83 c6 10 <f3>
a5 8b 74 24 24 8b 44 24 1c 89 4c 24 10 01 ee f7 d5 21 ee 89
<0>Kernel panic - not syncing: Attempted to kill the idle task!
Stuck ??
Inquiring remote APIC #0...
... APIC #0 ID: 00000000
... APIC #0 VERSION: 00040011
... APIC #0 SPIV: 000000ff
root@hobit:/sys/devices/system/cpu/cpu1# dmesg | tail -25
[<c011d001>] activate_task+0x1/0xa0
[<c011d128>] resched_task+0x68/0x90
[<c011d8ba>] try_to_wake_up+0x2aa/0x2f0
[<c025ed7a>] fbcon_cursor+0x19a/0x270
[<c02c4958>] hide_cursor+0x18/0x30
[<c02c758f>] vt_console_print+0x24f/0x260
[<c02c7340>] vt_console_print+0x0/0x260
[<c01247e7>] __call_console_drivers+0x57/0x60
[<c01248e0>] call_console_drivers+0x80/0x110
[<c0124d8e>] release_console_sem+0x4e/0xc0
[<c0124c12>] vprintk+0x192/0x240
[<c0528891>] preempt_schedule_irq+0x51/0x80
[<c02adeca>] acpi_processor_idle+0x0/0x265
[<c010325e>] need_resched+0x1f/0x21
[<c02adeca>] acpi_processor_idle+0x0/0x265
[<c0124a77>] printk+0x17/0x20
[<c010b583>] cpu_init+0x73/0x360
[<c0117bd6>] start_secondary+0x6/0x170
Code: d2 74 bd fc 8b 44 24 28 b9 0e 00 00 00 8b 74 24 14 01 c6 b8 0e
00 00 00 89 74 24 1c 8b 74 24 30 89 44 24 10 8b 7c 24 1c 83 c6 10 <f3>
a5 8b 74 24 24 8b 44 24 1c 89 4c 24 10 01 ee f7 d5 21 ee 89
<0>Kernel panic - not syncing: Attempted to kill the idle task!
Stuck ??
Inquiring remote APIC #0...
... APIC #0 ID: 00000000
... APIC #0 VERSION: 00040011
... APIC #0 SPIV: 000000ff
root@hobit:/sys/devices/system/cpu/cpu1#

Pavel
--
Boycott Kodak -- for their patent abuse against Java.

2005-04-14 08:30:40

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

On Wed, 2005-04-13 at 16:32, Pavel Machek wrote:
> root@hobit:/sys/devices/system/cpu/cpu1# dmesg | tail -25
> [<c011d001>] activate_task+0x1/0xa0
> [<c011d128>] resched_task+0x68/0x90
> [<c011d8ba>] try_to_wake_up+0x2aa/0x2f0
> [<c025ed7a>] fbcon_cursor+0x19a/0x270
> [<c02c4958>] hide_cursor+0x18/0x30
> [<c02c758f>] vt_console_print+0x24f/0x260
> [<c02c7340>] vt_console_print+0x0/0x260
> [<c01247e7>] __call_console_drivers+0x57/0x60
> [<c01248e0>] call_console_drivers+0x80/0x110
> [<c0124d8e>] release_console_sem+0x4e/0xc0
> [<c0124c12>] vprintk+0x192/0x240
> [<c0528891>] preempt_schedule_irq+0x51/0x80
> [<c02adeca>] acpi_processor_idle+0x0/0x265
> [<c010325e>] need_resched+0x1f/0x21
> [<c02adeca>] acpi_processor_idle+0x0/0x265
> [<c0124a77>] printk+0x17/0x20
> [<c010b583>] cpu_init+0x73/0x360
> [<c0117bd6>] start_secondary+0x6/0x170
> Code: d2 74 bd fc 8b 44 24 28 b9 0e 00 00 00 8b 74 24 14 01 c6 b8 0e
> 00 00 00 89 74 24 1c 8b 74 24 30 89 44 24 10 8b 7c 24 1c 83 c6 10 <f3>
> a5 8b 74 24 24 8b 44 24 1c 89 4c 24 10 01 ee f7 d5 21 ee 89
> <0>Kernel panic - not syncing: Attempted to kill the idle task!
> Stuck ??
> Inquiring remote APIC #0...
> ... APIC #0 ID: 00000000
> ... APIC #0 VERSION: 00040011
> ... APIC #0 SPIV: 000000ff
> root@hobit:/sys/devices/system/cpu/cpu1#
Andrew,
Below patch fixed Pavel's oops. But strange is the 'system_state' check
is added for CPU hotplug by Rusty. This really makes me confused. Could
you please look at it.
This can be reproduced 100% with radeonfb driver load. Attached is the
dmesg of an oops. It seems the 'objp' parameter for
'cache_alloc_debugcheck_after' is invalid.

Thanks,
Shaohua

--- a/kernel/printk.c 2005-04-12 10:12:19.000000000 +0800
+++ b/kernel/printk.c 2005-04-13 17:22:40.912897328 +0800
@@ -624,8 +624,7 @@ asmlinkage int vprintk(const char *fmt,
log_level_unknown = 1;
}

- if (!cpu_online(smp_processor_id()) &&
- system_state != SYSTEM_RUNNING) {
+ if (!cpu_online(smp_processor_id())) {
/*
* Some console drivers may assume that per-cpu resources have
* been allocated. So don't allow them to be called by this


Attachments:
ht (2.06 kB)

2005-04-14 08:43:09

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

On Thu, 2005-04-14 at 16:27, Li Shaohua wrote:
> On Wed, 2005-04-13 at 16:32, Pavel Machek wrote:
> > root@hobit:/sys/devices/system/cpu/cpu1# dmesg | tail -25
> > [<c011d001>] activate_task+0x1/0xa0
> > [<c011d128>] resched_task+0x68/0x90
> > [<c011d8ba>] try_to_wake_up+0x2aa/0x2f0
> > [<c025ed7a>] fbcon_cursor+0x19a/0x270
> > [<c02c4958>] hide_cursor+0x18/0x30
> > [<c02c758f>] vt_console_print+0x24f/0x260
> > [<c02c7340>] vt_console_print+0x0/0x260
> > [<c01247e7>] __call_console_drivers+0x57/0x60
> > [<c01248e0>] call_console_drivers+0x80/0x110
> > [<c0124d8e>] release_console_sem+0x4e/0xc0
> > [<c0124c12>] vprintk+0x192/0x240
> > [<c0528891>] preempt_schedule_irq+0x51/0x80
> > [<c02adeca>] acpi_processor_idle+0x0/0x265
> > [<c010325e>] need_resched+0x1f/0x21
> > [<c02adeca>] acpi_processor_idle+0x0/0x265
> > [<c0124a77>] printk+0x17/0x20
> > [<c010b583>] cpu_init+0x73/0x360
> > [<c0117bd6>] start_secondary+0x6/0x170
> > Code: d2 74 bd fc 8b 44 24 28 b9 0e 00 00 00 8b 74 24 14 01 c6 b8 0e
> > 00 00 00 89 74 24 1c 8b 74 24 30 89 44 24 10 8b 7c 24 1c 83 c6 10 <f3>
> > a5 8b 74 24 24 8b 44 24 1c 89 4c 24 10 01 ee f7 d5 21 ee 89
> > <0>Kernel panic - not syncing: Attempted to kill the idle task!
> > Stuck ??
> > Inquiring remote APIC #0...
> > ... APIC #0 ID: 00000000
> > ... APIC #0 VERSION: 00040011
> > ... APIC #0 SPIV: 000000ff
> > root@hobit:/sys/devices/system/cpu/cpu1#
> Andrew,
> Below patch fixed Pavel's oops. But strange is the 'system_state' check
> is added for CPU hotplug by Rusty. This really makes me confused. Could
> you please look at it.
> This can be reproduced 100% with radeonfb driver load. Attached is the
> dmesg of an oops. It seems the 'objp' parameter for
> 'cache_alloc_debugcheck_after' is invalid.
Looks the per-cpu array_cache isn't initialized. It's initialized in a
cpuhotplug callback. So before the CPU call cpu_up, all kmalloc will
failed. Isn't it?

Thanks,
Shaohua

2005-04-23 22:37:13

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

Li Shaohua <[email protected]> wrote:
>
> CPU0 attaching NULL sched-domain.
> CPU1 attaching NULL sched-domain.
> CPU0 attaching NULL sched-domain.
> Booting processor 1/1 eip 3000
> Initializing CPU#1
> masked ExtINT on CPU#1
> Unable to handle kernel paging request at virtual address f000acb2
> printing eip:
> c014e4cc
> *pde = 00000000
> Oops: 0000 [#1]
> PREEMPT SMP
> Modules linked in:
> CPU: 1
> EIP: 0060:[<c014e4cc>] Not tainted VLI
> EFLAGS: 00010097 (2.6.12-rc2-mm3)
> EIP is at check_poison_obj+0x4c/0x1e0
> eax: 0000006b ebx: 0000005a ecx: dff6e080 edx: dff6e480
> esi: 00000000 edi: f000acb2 ebp: 00000080 esp: c14fdcd4
> ds: 007b es: 007b ss: 0068
> Process swapper (pid: 0, threadinfo=c14fc000 task=dff42560)
> Stack: dff6e480 5a5a5a5a 5a5a5a5a 0000007f 5a5a5a5a 00000000 0000005a f000acae
> dff6e480 c021192e c0150031 dff6e480 f000acae 5a5a5a5a 5a5a5a5a dff6e480
> 00000046 00000020 00000010 c015044b dff6e480 00000020 f000acae c021192e
> Call Trace:
> [<c021192e>] soft_cursor+0x5e/0x260
> [<c0150031>] cache_alloc_debugcheck_after+0x181/0x1a0
> [<c015044b>] __kmalloc+0x9b/0xd0
> [<c021192e>] soft_cursor+0x5e/0x260
> [<c021192e>] soft_cursor+0x5e/0x260
> [<c020a459>] bit_cursor+0x339/0x540
> [<c0118998>] recalc_task_prio+0x88/0x150
> [<c0205c32>] fbcon_cursor+0x1a2/0x270
> [<c0265475>] hide_cursor+0x25/0x40
> [<c026843a>] vt_console_print+0x2aa/0x2b0
> [<c0120d32>] __call_console_drivers+0x62/0x70
> [<c0120e66>] call_console_drivers+0x96/0x130
> [<c0121361>] release_console_sem+0x51/0xc0
> [<c01211df>] vprintk+0x19f/0x250
> [<c01264b6>] __do_softirq+0xd6/0xf0
> [<c0438f5b>] preempt_schedule_irq+0x4b/0x80
> [<c0121037>] printk+0x17/0x20
> [<c0114132>] setup_local_APIC+0xe2/0x1d0
> [<c01130ba>] smp_callin+0x7a/0x120
> [<c011316e>] start_secondary+0xe/0x190
> Code: 24 30 89 14 24 01 c7 e8 13 f8 ff ff 39 44 24 14 89 c5 0f 8d b7 00 00 00 8d 40 ff 89 44 24 0c 3b 74 24 0c b0 6b 0f 84 8c 01 00 00 <38> 04 3e 74 46 8b 44 24 14 85 c0 0f 84 48 01 00 00 89 3c 24 83
> <0>Kernel panic - not syncing: Attempted to kill the idle task!
>

fbcon is trying to call kmalloc before the slab system is initialised.
It would be best to fix that within fbcon.


> > Code: d2 74 bd fc 8b 44 24 28 b9 0e 00 00 00 8b 74 24 14 01 c6 b8 0e
> > 00 00 00 89 74 24 1c 8b 74 24 30 89 44 24 10 8b 7c 24 1c 83 c6 10 <f3>
> > a5 8b 74 24 24 8b 44 24 1c 89 4c 24 10 01 ee f7 d5 21 ee 89
> > <0>Kernel panic - not syncing: Attempted to kill the idle task!
> > Stuck ??
> > Inquiring remote APIC #0...
> > ... APIC #0 ID: 00000000
> > ... APIC #0 VERSION: 00040011
> > ... APIC #0 SPIV: 000000ff
> > root@hobit:/sys/devices/system/cpu/cpu1#
> Andrew,
> Below patch fixed Pavel's oops. But strange is the 'system_state' check
> is added for CPU hotplug by Rusty. This really makes me confused. Could
> you please look at it.

Well as the comment says, if this CPU isn't online yet, and if the system
has not yet reached SYSTEM_RUNNING state then we bale out of the printk
because this cpu's per-cpu resources may not yet be fully set up.


> This can be reproduced 100% with radeonfb driver load. Attached is the
> dmesg of an oops. It seems the 'objp' parameter for
> 'cache_alloc_debugcheck_after' is invalid.
>
> --- a/kernel/printk.c 2005-04-12 10:12:19.000000000 +0800
> +++ b/kernel/printk.c 2005-04-13 17:22:40.912897328 +0800
> @@ -624,8 +624,7 @@ asmlinkage int vprintk(const char *fmt,
> log_level_unknown = 1;
> }
>
> - if (!cpu_online(smp_processor_id()) &&
> - system_state != SYSTEM_RUNNING) {
> + if (!cpu_online(smp_processor_id())) {
> /*
> * Some console drivers may assume that per-cpu resources have
> * been allocated. So don't allow them to be called by this
>

That being said, I don't see why your patch should fix the oops. The test
(system_state != SYSTEM_RUNNING) should be returning true at this time.


2005-04-23 22:42:45

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

Hi!

> > > Code: d2 74 bd fc 8b 44 24 28 b9 0e 00 00 00 8b 74 24 14 01 c6 b8 0e
> > > 00 00 00 89 74 24 1c 8b 74 24 30 89 44 24 10 8b 7c 24 1c 83 c6 10 <f3>
> > > a5 8b 74 24 24 8b 44 24 1c 89 4c 24 10 01 ee f7 d5 21 ee 89
> > > <0>Kernel panic - not syncing: Attempted to kill the idle task!
> > > Stuck ??
> > > Inquiring remote APIC #0...
> > > ... APIC #0 ID: 00000000
> > > ... APIC #0 VERSION: 00040011
> > > ... APIC #0 SPIV: 000000ff
> > > root@hobit:/sys/devices/system/cpu/cpu1#
> > Andrew,
> > Below patch fixed Pavel's oops. But strange is the 'system_state' check
> > is added for CPU hotplug by Rusty. This really makes me confused. Could
> > you please look at it.
>
> Well as the comment says, if this CPU isn't online yet, and if the system
> has not yet reached SYSTEM_RUNNING state then we bale out of the printk
> because this cpu's per-cpu resources may not yet be fully set up.
>
>
> > This can be reproduced 100% with radeonfb driver load. Attached is the
> > dmesg of an oops. It seems the 'objp' parameter for
> > 'cache_alloc_debugcheck_after' is invalid.
> >
> > --- a/kernel/printk.c 2005-04-12 10:12:19.000000000 +0800
> > +++ b/kernel/printk.c 2005-04-13 17:22:40.912897328 +0800
> > @@ -624,8 +624,7 @@ asmlinkage int vprintk(const char *fmt,
> > log_level_unknown = 1;
> > }
> >
> > - if (!cpu_online(smp_processor_id()) &&
> > - system_state != SYSTEM_RUNNING) {
> > + if (!cpu_online(smp_processor_id())) {
> > /*
> > * Some console drivers may assume that per-cpu resources have
> > * been allocated. So don't allow them to be called by this
> >
>
> That being said, I don't see why your patch should fix the oops. The test
> (system_state != SYSTEM_RUNNING) should be returning true at this time.

We were booting CPU on running system. system_state was definitely
SYSTEM_RUNNING, but we were trying to bring CPU#1 online...

Check is wrong. If cpu is not yet online, we may not printk on
it... It does not really depend on system_state...

Pavel
--
Boycott Kodak -- for their patent abuse against Java.

2005-04-25 00:52:32

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

On Sun, 2005-04-24 at 06:35, Andrew Morton wrote:
> fbcon is trying to call kmalloc before the slab system is initialised.
> It would be best to fix that within fbcon.
This solves the problem too.

> Well as the comment says, if this CPU isn't online yet, and if the system
> has not yet reached SYSTEM_RUNNING state then we bale out of the printk
> because this cpu's per-cpu resources may not yet be fully set up.
After the CPU is offline (we are doing CPU hotplug). Its per-cpu
resources like kmalloc also aren't initialized. This is in
SYSTEM_RUNNING state too, so the system state doesn't matter. Removing
the system state check seems better than fixing the fbcon driver to me.

Thanks,
Shaohua

2005-04-28 07:23:44

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

Li Shaohua <[email protected]> wrote:
>
> Using CPU hotplug to support suspend/resume SMP. Both S3 and S4 use
> disable/enable_nonboot_cpus API. The S4 part is based on Pavel's
> original S4 SMP patch.



On ia64, with tiger_defconfig:

kernel/built-in.o(.text+0x59e12): In function `suspend_prepare':
: undefined reference to `disable_nonboot_cpus'
kernel/built-in.o(.text+0x59e62): In function `suspend_prepare':
: undefined reference to `enable_nonboot_cpus'
kernel/built-in.o(.text+0x5a222): In function `suspend_finish':
: undefined reference to `enable_nonboot_cpus'


Signed-off-by: Andrew Morton <[email protected]>
---

include/linux/suspend.h | 2 +-
1 files changed, 1 insertion(+), 1 deletion(-)

diff -puN include/linux/suspend.h~suspend-resume-smp-support-fix include/linux/suspend.h
--- 25/include/linux/suspend.h~suspend-resume-smp-support-fix Thu Apr 28 15:12:21 2005
+++ 25-akpm/include/linux/suspend.h Thu Apr 28 15:13:13 2005
@@ -58,7 +58,7 @@ static inline int software_suspend(void)
}
#endif

-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_SOFTWARE_SUSPEND)
extern void disable_nonboot_cpus(void);
extern void enable_nonboot_cpus(void);
#else
_

2005-04-28 07:31:22

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

On Thu, 2005-04-28 at 15:22, Andrew Morton wrote:
> -#ifdef CONFIG_HOTPLUG_CPU
> +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_SOFTWARE_SUSPEND)
ACPI_SLEEP also requires it. So it will be
#if defined(CONFIG_HOTPLUG_CPU) && (defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP))

Thanks,
Shaohua

2005-04-28 07:38:46

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

On Thu, 2005-04-28 at 15:22, Andrew Morton wrote:
>
> On ia64, with tiger_defconfig:
>
> kernel/built-in.o(.text+0x59e12): In function `suspend_prepare':
> : undefined reference to `disable_nonboot_cpus'
> kernel/built-in.o(.text+0x59e62): In function `suspend_prepare':
> : undefined reference to `enable_nonboot_cpus'
> kernel/built-in.o(.text+0x5a222): In function `suspend_finish':
> : undefined reference to `enable_nonboot_cpus'
Pavel,
Could IA64 do software suspend? There possibly are other troubles here.

Thanks,
Shaohua

2005-04-28 07:44:54

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

Li Shaohua <[email protected]> wrote:
>
> On Thu, 2005-04-28 at 15:22, Andrew Morton wrote:
> > -#ifdef CONFIG_HOTPLUG_CPU
> > +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_SOFTWARE_SUSPEND)
> ACPI_SLEEP also requires it. So it will be
> #if defined(CONFIG_HOTPLUG_CPU) && (defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP))
>

But does setting CONFIG_ACPI_SLEEP cause kernel/power/smp.o to be actually
compiled and linked? I don't think so?

Anyway, please send a tested fix.

2005-04-28 07:44:48

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

Hi!

> > On ia64, with tiger_defconfig:
> >
> > kernel/built-in.o(.text+0x59e12): In function `suspend_prepare':
> > : undefined reference to `disable_nonboot_cpus'
> > kernel/built-in.o(.text+0x59e62): In function `suspend_prepare':
> > : undefined reference to `enable_nonboot_cpus'
> > kernel/built-in.o(.text+0x5a222): In function `suspend_finish':
> > : undefined reference to `enable_nonboot_cpus'
> Pavel,
> Could IA64 do software suspend? There possibly are other troubles here.

Someone would have to write low-level support. Bring me ia64 notebook
and I'll do it ;-)))))))))))))))))))).
Pavel
--
Boycott Kodak -- for their patent abuse against Java.

2005-04-28 08:12:43

by Luming Yu

[permalink] [raw]
Subject: Re: [ACPI] Re: [PATCH 6/6]suspend/resume SMP support

On Thursday 28 April 2005 15:42, Pavel Machek wrote:
> Hi!
>
> > > On ia64, with tiger_defconfig:
> > >
> > > kernel/built-in.o(.text+0x59e12): In function `suspend_prepare':
> > > : undefined reference to `disable_nonboot_cpus'
> > >
> > > kernel/built-in.o(.text+0x59e62): In function `suspend_prepare':
> > > : undefined reference to `enable_nonboot_cpus'
> > >
> > > kernel/built-in.o(.text+0x5a222): In function `suspend_finish':
> > > : undefined reference to `enable_nonboot_cpus'
> >
> > Pavel,
> > Could IA64 do software suspend? There possibly are other troubles here.
>
> Someone would have to write low-level support. Bring me ia64 notebook
> and I'll do it ;-)))))))))))))))))))).
> ??????? ??????? ??????? ??????? ??????? ??????? ??????? ??????? Pavel

I just checked DSDT on one ia64 box, it has S0, S4, S5 object.
So, the platform should have sufficient support for sleep-to-disk.

Thanks,
Luming

2005-04-28 08:40:03

by Pavel Machek

[permalink] [raw]
Subject: Re: [ACPI] Re: [PATCH 6/6]suspend/resume SMP support

Hi!

> > > > On ia64, with tiger_defconfig:
> > > >
> > > > kernel/built-in.o(.text+0x59e12): In function `suspend_prepare':
> > > > : undefined reference to `disable_nonboot_cpus'
> > > >
> > > > kernel/built-in.o(.text+0x59e62): In function `suspend_prepare':
> > > > : undefined reference to `enable_nonboot_cpus'
> > > >
> > > > kernel/built-in.o(.text+0x5a222): In function `suspend_finish':
> > > > : undefined reference to `enable_nonboot_cpus'
> > >
> > > Pavel,
> > > Could IA64 do software suspend? There possibly are other troubles here.
> >
> > Someone would have to write low-level support. Bring me ia64 notebook
> > and I'll do it ;-)))))))))))))))))))).
>
> I just checked DSDT on one ia64 box, it has S0, S4, S5 object.
> So, the platform should have sufficient support for sleep-to-disk.

No arguments about that one... Something like
arch/i386/power/{swsusp.S,cpu.c} needs to be written for ia64,
then. I'm not goot at ia64 assembly (never done that), and do not have
ia64 machine nearby, so I'm probably not doing that work.

Pavel
--
Boycott Kodak -- for their patent abuse against Java.

2005-04-28 08:40:03

by Pavel Machek

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

Hi!

> > But does setting CONFIG_ACPI_SLEEP cause kernel/power/smp.o to be actually
> > compiled and linked? I don't think so?
> >
> > Anyway, please send a tested fix.
> Ha, this one should be ok. Only IA32 support SMP suspend now.

Seems okay... Rafael, what is status of x86-64 smp swsusp?
Pavel
--
Boycott Kodak -- for their patent abuse against Java.

2005-04-28 08:40:02

by Luming Yu

[permalink] [raw]
Subject: Re: [ACPI] Re: [PATCH 6/6]suspend/resume SMP support

On Thursday 28 April 2005 16:20, Pavel Machek wrote:
> Hi!
>
> > > > > On ia64, with tiger_defconfig:
> > > > >
> > > > > kernel/built-in.o(.text+0x59e12): In function `suspend_prepare':
> > > > > : undefined reference to `disable_nonboot_cpus'
> > > > >
> > > > > kernel/built-in.o(.text+0x59e62): In function `suspend_prepare':
> > > > > : undefined reference to `enable_nonboot_cpus'
> > > > >
> > > > > kernel/built-in.o(.text+0x5a222): In function `suspend_finish':
> > > > > : undefined reference to `enable_nonboot_cpus'
> > > >
> > > > Pavel,
> > > > Could IA64 do software suspend? There possibly are other troubles
> > > > here.
> > >
> > > Someone would have to write low-level support. Bring me ia64 notebook
> > > and I'll do it ;-)))))))))))))))))))).
> >
> > I just checked DSDT on one ia64 box, it has S0, S4, S5 object.
> > So, the platform should have sufficient support for sleep-to-disk.
>
> No arguments about that one... Something like
> arch/i386/power/{swsusp.S,cpu.c} needs to be written for ia64,
> then. I'm not goot at ia64 assembly (never done that), and do not have
> ia64 machine nearby, so I'm probably not doing that work.
>
> Pavel
It sounds not a big deal just to rewrite this two files. Let me try. :-)

Thanks,
Luming

2005-04-28 09:08:24

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [ACPI] Re: [PATCH 6/6]suspend/resume SMP support

Hi,

On Thursday, 28 of April 2005 10:35, Pavel Machek wrote:
> Hi!
>
> > > But does setting CONFIG_ACPI_SLEEP cause kernel/power/smp.o to be actually
> > > compiled and linked? I don't think so?
> > >
> > > Anyway, please send a tested fix.
> > Ha, this one should be ok. Only IA32 support SMP suspend now.
>
> Seems okay... Rafael, what is status of x86-64 smp swsusp?

Preliminary, semi-working. ;-) Actually, I'm waiting for the i386 CPU hotplug
to settle down a bit to see how much of it I can use on x86-64.

Greets,
Rafael


--
- Would you tell me, please, which way I ought to go from here?
- That depends a good deal on where you want to get to.
-- Lewis Carroll "Alice's Adventures in Wonderland"

2005-04-28 08:39:58

by Shaohua Li

[permalink] [raw]
Subject: Re: [PATCH 6/6]suspend/resume SMP support

On Thu, 2005-04-28 at 15:41, Andrew Morton wrote:
>
> But does setting CONFIG_ACPI_SLEEP cause kernel/power/smp.o to be actually
> compiled and linked? I don't think so?
>
> Anyway, please send a tested fix.
Ha, this one should be ok. Only IA32 support SMP suspend now.

Thanks,
Shaohua

---

linux-2.6.11-root/drivers/acpi/Kconfig | 2 +-
linux-2.6.11-root/include/linux/suspend.h | 2 +-
linux-2.6.11-root/kernel/power/Kconfig | 6 +++++-
linux-2.6.11-root/kernel/power/Makefile | 6 +++---
4 files changed, 10 insertions(+), 6 deletions(-)

diff -puN kernel/power/Makefile~kconfig kernel/power/Makefile
--- linux-2.6.11/kernel/power/Makefile~kconfig 2005-04-28 15:47:44.862281448 +0800
+++ linux-2.6.11-root/kernel/power/Makefile 2005-04-28 16:12:04.285415408 +0800
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
endif

-swsusp-smp-$(CONFIG_SMP) += smp.o
-
obj-y := main.o process.o console.o pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o
+obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o
+
+obj-$(CONFIG_SUSPEND_SMP) += smp.o

obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff -puN kernel/power/Kconfig~kconfig kernel/power/Kconfig
--- linux-2.6.11/kernel/power/Kconfig~kconfig 2005-04-28 15:48:45.195109464 +0800
+++ linux-2.6.11-root/kernel/power/Kconfig 2005-04-28 16:24:22.821140912 +0800
@@ -28,7 +28,7 @@ config PM_DEBUG

config SOFTWARE_SUSPEND
bool "Software Suspend (EXPERIMENTAL)"
- depends on EXPERIMENTAL && PM && SWAP && (HOTPLUG_CPU || !SMP)
+ depends on EXPERIMENTAL && PM && SWAP && (SUSPEND_SMP || !SMP)
---help---
Enable the possibility of suspending the machine.
It doesn't need APM.
@@ -72,3 +72,7 @@ config PM_STD_PARTITION
suspended image to. It will simply pick the first available swap
device.

+config SUSPEND_SMP
+ bool
+ depends on HOTPLUG_CPU && X86 && PM
+ default y
diff -puN drivers/acpi/Kconfig~kconfig drivers/acpi/Kconfig
--- linux-2.6.11/drivers/acpi/Kconfig~kconfig 2005-04-28 15:52:38.148695136 +0800
+++ linux-2.6.11-root/drivers/acpi/Kconfig 2005-04-28 15:53:01.707113712 +0800
@@ -57,7 +57,7 @@ if ACPI_INTERPRETER

config ACPI_SLEEP
bool "Sleep States (EXPERIMENTAL)"
- depends on X86 && (!SMP || HOTPLUG_CPU)
+ depends on X86 && (!SMP || SUSPEND_SMP)
depends on EXPERIMENTAL
default y
---help---
diff -puN include/linux/suspend.h~kconfig include/linux/suspend.h
--- linux-2.6.11/include/linux/suspend.h~kconfig 2005-04-28 15:53:50.261732288 +0800
+++ linux-2.6.11-root/include/linux/suspend.h 2005-04-28 15:54:12.928286448 +0800
@@ -58,7 +58,7 @@ static inline int software_suspend(void)
}
#endif

-#ifdef CONFIG_HOTPLUG_CPU
+#ifdef CONFIG_SUSPEND_SMP
extern void disable_nonboot_cpus(void);
extern void enable_nonboot_cpus(void);
#else
_


2005-05-26 02:52:33

by Shaohua Li

[permalink] [raw]
Subject: Hotplug CPU printk issue

On Thu, 2005-04-14 at 16:27 +0800, Li Shaohua wrote:
> On Wed, 2005-04-13 at 16:32, Pavel Machek wrote:
> > root@hobit:/sys/devices/system/cpu/cpu1# dmesg | tail -25
> > [<c011d001>] activate_task+0x1/0xa0
> > [<c011d128>] resched_task+0x68/0x90
> > [<c011d8ba>] try_to_wake_up+0x2aa/0x2f0
> > [<c025ed7a>] fbcon_cursor+0x19a/0x270
> > [<c02c4958>] hide_cursor+0x18/0x30
> > [<c02c758f>] vt_console_print+0x24f/0x260
> > [<c02c7340>] vt_console_print+0x0/0x260
> > [<c01247e7>] __call_console_drivers+0x57/0x60
> > [<c01248e0>] call_console_drivers+0x80/0x110
> > [<c0124d8e>] release_console_sem+0x4e/0xc0
> > [<c0124c12>] vprintk+0x192/0x240
> > [<c0528891>] preempt_schedule_irq+0x51/0x80
> > [<c02adeca>] acpi_processor_idle+0x0/0x265
> > [<c010325e>] need_resched+0x1f/0x21
> > [<c02adeca>] acpi_processor_idle+0x0/0x265
> > [<c0124a77>] printk+0x17/0x20
> > [<c010b583>] cpu_init+0x73/0x360
> > [<c0117bd6>] start_secondary+0x6/0x170
> > Code: d2 74 bd fc 8b 44 24 28 b9 0e 00 00 00 8b 74 24 14 01 c6 b8
> 0e
> > 00 00 00 89 74 24 1c 8b 74 24 30 89 44 24 10 8b 7c 24 1c 83 c6 10
> <f3>
> > a5 8b 74 24 24 8b 44 24 1c 89 4c 24 10 01 ee f7 d5 21 ee 89
> > <0>Kernel panic - not syncing: Attempted to kill the idle task!
> > Stuck ??
> > Inquiring remote APIC #0...
> > ... APIC #0 ID: 00000000
> > ... APIC #0 VERSION: 00040011
> > ... APIC #0 SPIV: 000000ff
> > root@hobit:/sys/devices/system/cpu/cpu1#
> Andrew,
> Below patch fixed Pavel's oops. But strange is the 'system_state'
> check
> is added for CPU hotplug by Rusty. This really makes me confused.
> Could
> you please look at it.
> This can be reproduced 100% with radeonfb driver load. Attached is
> the
> dmesg of an oops. It seems the 'objp' parameter for
> 'cache_alloc_debugcheck_after' is invalid.
>
> Thanks,
> Shaohua
>
> --- a/kernel/printk.c 2005-04-12 10:12:19.000000000 +0800
> +++ b/kernel/printk.c 2005-04-13 17:22:40.912897328 +0800
> @@ -624,8 +624,7 @@ asmlinkage int vprintk(const char *fmt,
> log_level_unknown = 1;
> }
>
> - if (!cpu_online(smp_processor_id()) &&
> - system_state != SYSTEM_RUNNING) {
> + if (!cpu_online(smp_processor_id())) {
> /*
> * Some console drivers may assume that per-cpu resources have
> * been allocated. So don't allow them to be called by this
Andrew,
Could above patch be put into mm tree? It fixes the oops of CPU hotplug
with radeon fb enabled.
The reason is the per-cpu data (radeon fb calls kmalloc) isn't
initialized when CPU hotplug is processing. system_state is
SYSTEM_RUNNING for cpu hotplug.

Thanks,
Shaohua

2005-05-26 03:52:33

by Andrew Morton

[permalink] [raw]
Subject: Re: Hotplug CPU printk issue

Shaohua Li <[email protected]> wrote:
>
> > --- a/kernel/printk.c 2005-04-12 10:12:19.000000000 +0800
> > +++ b/kernel/printk.c 2005-04-13 17:22:40.912897328 +0800
> > @@ -624,8 +624,7 @@ asmlinkage int vprintk(const char *fmt,
> > log_level_unknown = 1;
> > }
> >
> > - if (!cpu_online(smp_processor_id()) &&
> > - system_state != SYSTEM_RUNNING) {
> > + if (!cpu_online(smp_processor_id())) {
> > /*
> > * Some console drivers may assume that per-cpu resources have
> > * been allocated. So don't allow them to be called by this
> Andrew,
> Could above patch be put into mm tree?

Well not in that form. I'd appreciate being sent patches which are
applyable rather than mangled messes, please.

> It fixes the oops of CPU hotplug
> with radeon fb enabled.
> The reason is the per-cpu data (radeon fb calls kmalloc) isn't
> initialized when CPU hotplug is processing. system_state is
> SYSTEM_RUNNING for cpu hotplug.

That system_state test was explicitly added by davidm a year ago:

"- Allow printk on down cpus once system is running."

Please confirm that we in fact do not want to allow downed CPUs to
print things, then send a patch.

2005-05-26 05:37:13

by Shaohua Li

[permalink] [raw]
Subject: Re: Hotplug CPU printk issue

On Wed, 2005-05-25 at 20:48 -0700, Andrew Morton wrote:
> > > --- a/kernel/printk.c 2005-04-12 10:12:19.000000000 +0800
> > > +++ b/kernel/printk.c 2005-04-13 17:22:40.912897328 +0800
> > > @@ -624,8 +624,7 @@ asmlinkage int vprintk(const char *fmt,
> > > log_level_unknown = 1;
> > > }
> > >
> > > - if (!cpu_online(smp_processor_id()) &&
> > > - system_state != SYSTEM_RUNNING) {
> > > + if (!cpu_online(smp_processor_id())) {
> > > /*
> > > * Some console drivers may assume that per-cpu resources have
> > > * been allocated. So don't allow them to be called by this
> > Andrew,
> > Could above patch be put into mm tree?
>
> Well not in that form. I'd appreciate being sent patches which are
> applyable rather than mangled messes, please.
>
> > It fixes the oops of CPU hotplug
> > with radeon fb enabled.
> > The reason is the per-cpu data (radeon fb calls kmalloc) isn't
> > initialized when CPU hotplug is processing. system_state is
> > SYSTEM_RUNNING for cpu hotplug.
>
> That system_state test was explicitly added by davidm a year ago:
>
> "- Allow printk on down cpus once system is running."
>
> Please confirm that we in fact do not want to allow downed CPUs to
> print things, then send a patch.
Yep. In the cpu hotplug case, per-cpu data possibly isn't initialized
even the system state is 'running'. As the comments say in the original
code, some console drivers assume per-cpu resources have been allocated.
radeon fb is one such driver, which uses kmalloc. After a CPU is down,
the per-cpu data of slab is freed, so the system crashed when printing
some info.

Thanks,
Shaohua

---

linux-2.6.11-rc5-mm1-root/kernel/printk.c | 3 +--
1 files changed, 1 insertion(+), 2 deletions(-)

diff -puN kernel/printk.c~printk kernel/printk.c
--- linux-2.6.11-rc5-mm1/kernel/printk.c~printk 2005-05-26 13:11:47.333540352 +0800
+++ linux-2.6.11-rc5-mm1-root/kernel/printk.c 2005-05-26 13:11:47.336539896 +0800
@@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt,
log_level_unknown = 1;
}

- if (!cpu_online(smp_processor_id()) &&
- system_state != SYSTEM_RUNNING) {
+ if (!cpu_online(smp_processor_id())) {
/*
* Some console drivers may assume that per-cpu resources have
* been allocated. So don't allow them to be called by this
_


2005-05-26 05:56:14

by Andrew Morton

[permalink] [raw]
Subject: Re: Hotplug CPU printk issue

Shaohua Li <[email protected]> wrote:
>
> > Please confirm that we in fact do not want to allow downed CPUs to
> > print things, then send a patch.
> Yep. In the cpu hotplug case, per-cpu data possibly isn't initialized
> even the system state is 'running'. As the comments say in the original
> code, some console drivers assume per-cpu resources have been allocated.
> radeon fb is one such driver, which uses kmalloc. After a CPU is down,
> the per-cpu data of slab is freed, so the system crashed when printing
> some info.

hm, that certainly sounds sane, but I do recall there were
reasonable-sounding reasons why the ia64 guys wanted printk-on-a-down-CPU
to work. Hopefully David can remember what the problem was so we can find
a more thorough fix.

2005-05-26 19:35:02

by David Mosberger

[permalink] [raw]
Subject: Re: Hotplug CPU printk issue

>>>>> On Wed, 25 May 2005 22:52:04 -0700, Andrew Morton <[email protected]> said:

Andrew> Shaohua Li <[email protected]> wrote:

>> > Please confirm that we in fact do not want to allow downed CPUs to
>> > print things, then send a patch.
>> Yep. In the cpu hotplug case, per-cpu data possibly isn't initialized
>> even the system state is 'running'. As the comments say in the original
>> code, some console drivers assume per-cpu resources have been allocated.
>> radeon fb is one such driver, which uses kmalloc. After a CPU is down,
>> the per-cpu data of slab is freed, so the system crashed when printing
>> some info.

Andrew> hm, that certainly sounds sane, but I do recall there were
Andrew> reasonable-sounding reasons why the ia64 guys wanted
Andrew> printk-on-a-down-CPU to work. Hopefully David can remember
Andrew> what the problem was so we can find a more thorough fix.

I don't recall having submitted such a patch. According to the bk
log, it was Rusty who added the !system_running check (which was later
changed to system_state != SYSTEM_RUNNING).

The changelog only says:

"- Allow printk on down cpus once system is running"

Rusty?

--david

2005-05-27 02:09:09

by Rusty Russell

[permalink] [raw]
Subject: Re: Hotplug CPU printk issue

On Thu, 2005-05-26 at 12:32 -0700, David Mosberger wrote:
> >>>>> On Wed, 25 May 2005 22:52:04 -0700, Andrew Morton <[email protected]> said:
>
> Andrew> Shaohua Li <[email protected]> wrote:
>
> >> > Please confirm that we in fact do not want to allow downed CPUs to
> >> > print things, then send a patch.
> >> Yep. In the cpu hotplug case, per-cpu data possibly isn't initialized
> >> even the system state is 'running'. As the comments say in the original
> >> code, some console drivers assume per-cpu resources have been allocated.
> >> radeon fb is one such driver, which uses kmalloc. After a CPU is down,
> >> the per-cpu data of slab is freed, so the system crashed when printing
> >> some info.
>
> Andrew> hm, that certainly sounds sane, but I do recall there were
> Andrew> reasonable-sounding reasons why the ia64 guys wanted
> Andrew> printk-on-a-down-CPU to work. Hopefully David can remember
> Andrew> what the problem was so we can find a more thorough fix.
>
> I don't recall having submitted such a patch. According to the bk
> log, it was Rusty who added the !system_running check (which was later
> changed to system_state != SYSTEM_RUNNING).
>
> The changelog only says:
>
> "- Allow printk on down cpus once system is running"

IIRC it was a great aid to debugging hotplug CPU, and there seemed no
reason to ban it. I mean, you have to be running code on a down CPU,
which implies you're in arch code or you've done something wrong... I
don't have religious attachment to it, however!

Cheers,
Rusty.
--
A bad analogy is like a leaky screwdriver -- Richard Braakman