Some multiprocessor 64-bit AMD systems don't allow the user to disable
the C1E C-state. The kernel detects C1E and marks the LAPIC as
broken, thereby disabling dynticks. This patch adds an option to
disable C1E when detected. It also allows the user to enable this
processor feature even if that means disabling dynticks, which is
useful in case C1E might provide better power savings (e.g.: C-states
beyond C1 don't work). Tested on a Turion X2 TL-56 laptop. Thanks to
Mikhail Kshevetskiy and FreeBSD for pointing out the relevant AMD docs.
Signed-off-by: Eduard-Gabriel Munteanu <[email protected]>
---
Documentation/kernel-parameters.txt | 6 ++++++
arch/x86/Kconfig | 16 ++++++++++++++++
arch/x86/kernel/setup_64.c | 27 +++++++++++++++++++++++++--
3 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 33121d6..0deee7a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -643,6 +643,12 @@ and is between 256 and 4096 characters. It is defined in the file
floppy= [HW]
See Documentation/floppy.txt.
+ force_amd_c1e [KNL,SMP,HW,BUGS=X86-64]
+ Don't disable C1E on AMD systems even if this means
+ disabling nohz. This is _not_ automatically implied by
+ any other parameters, such as "nohz=off".
+ Depends on CONFIG_X86_AMD_C1E_WORKAROUND.
+
gamecon.map[2|3]=
[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
support via parallel port (up to 5 devices per port)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 368864d..8b9bb49 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -198,6 +198,22 @@ config SMP
If you don't know what to do here, say N.
+config X86_AMD_C1E_WORKAROUND
+ bool "Disable C1E on AMD systems to make dynticks work"
+ default y
+ depends on X86_64 && SMP && NO_HZ
+ ---help---
+ On some systems, the C1E C-state is enabled by default and cannot be
+ disabled from the CMOS setup. Local APICs don't behave as they should
+ in this case. If you say Y here, C1E will be disabled to allow
+ dynamic ticks to work. It's safe to enable this option even if
+ your system doesn't have an AMD CPU (there are no side-effects if
+ such a CPU isn't detected).
+
+ You can pass the "force_amd_c1e" boot parameter to the kernel to
+ disable this workaround without recompiling.
+ See Documentation/kernel-parameters.txt for more details.
+
choice
prompt "Subarchitecture Type"
default X86_PC
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 30d94d1..15556a0 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -583,6 +583,17 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
#endif
}
+#ifdef CONFIG_X86_AMD_C1E_WORKAROUND
+static int __cpuinit disable_amd_c1e = 1;
+
+static int __cpuinit force_amd_c1e(char *str) {
+ disable_amd_c1e = 0;
+ return 1;
+}
+
+__setup("force_amd_c1e", force_amd_c1e);
+#endif /* CONFIG_X86_AMD_C1E_WORKAROUND */
+
#define ENABLE_C1E_MASK 0x18000000
#define CPUID_PROCESSOR_SIGNATURE 1
#define CPUID_XFAM 0x0ff00000
@@ -597,6 +608,7 @@ static __cpuinit int amd_apic_timer_broken(void)
{
u32 lo, hi;
u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+
switch (eax & CPUID_XFAM) {
case CPUID_XFAM_K8:
if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
@@ -604,8 +616,19 @@ static __cpuinit int amd_apic_timer_broken(void)
case CPUID_XFAM_10H:
case CPUID_XFAM_11H:
rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
- if (lo & ENABLE_C1E_MASK)
- return 1;
+#ifdef CONFIG_X86_AMD_C1E_WORKAROUND
+ if ((lo & ENABLE_C1E_MASK) && disable_amd_c1e) {
+ printk(KERN_INFO "Disabling AMD C1E on CPU %d\n",
+ smp_processor_id());
+ /*
+ * See AMD's "BIOS and Kernel Developer's Guide for AMD
+ * NPT Family 0Fh Processors", publication #32559,
+ * for details.
+ */
+ wrmsr(MSR_K8_ENABLE_C1E, lo & ~ENABLE_C1E_MASK, hi);
+ } else
+#endif /* CONFIG_X86_AMD_C1E_WORKAROUND */
+ if (lo & ENABLE_C1E_MASK) return 1;
break;
default:
/* err on the side of caution */
Hi,
Eduard-Gabriel Munteanu <[email protected]> writes:
> diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
> index 30d94d1..15556a0 100644
> --- a/arch/x86/kernel/setup_64.c
> +++ b/arch/x86/kernel/setup_64.c
> @@ -583,6 +583,17 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
> #endif
> }
>
> +#ifdef CONFIG_X86_AMD_C1E_WORKAROUND
> +static int __cpuinit disable_amd_c1e = 1;
> +
> +static int __cpuinit force_amd_c1e(char *str) {
> + disable_amd_c1e = 0;
> + return 1;
> +}
> +
> +__setup("force_amd_c1e", force_amd_c1e);
> +#endif /* CONFIG_X86_AMD_C1E_WORKAROUND */
> +
> #define ENABLE_C1E_MASK 0x18000000
> #define CPUID_PROCESSOR_SIGNATURE 1
> #define CPUID_XFAM 0x0ff00000
> @@ -597,6 +608,7 @@ static __cpuinit int amd_apic_timer_broken(void)
> {
> u32 lo, hi;
> u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
> +
> switch (eax & CPUID_XFAM) {
> case CPUID_XFAM_K8:
> if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
> @@ -604,8 +616,19 @@ static __cpuinit int amd_apic_timer_broken(void)
> case CPUID_XFAM_10H:
> case CPUID_XFAM_11H:
> rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
> - if (lo & ENABLE_C1E_MASK)
> - return 1;
> +#ifdef CONFIG_X86_AMD_C1E_WORKAROUND
> + if ((lo & ENABLE_C1E_MASK) && disable_amd_c1e) {
> + printk(KERN_INFO "Disabling AMD C1E on CPU %d\n",
> + smp_processor_id());
> + /*
> + * See AMD's "BIOS and Kernel Developer's Guide for AMD
> + * NPT Family 0Fh Processors", publication #32559,
> + * for details.
> + */
> + wrmsr(MSR_K8_ENABLE_C1E, lo & ~ENABLE_C1E_MASK, hi);
> + } else
> +#endif /* CONFIG_X86_AMD_C1E_WORKAROUND */
> + if (lo & ENABLE_C1E_MASK) return 1;
I would find this way more readable:
if (lo & ENABLE_C1E_MASK) {
#ifdef CONFIG_X86_AMD_C1E_WORKAROUND
if (disable_amd_c1e) {
...
#else
return 1;
#endif
}
Why does it require to be enabled by compile-time AND run-time? Is that
something you might want to decide on every boot? Could we make it
settable at boot-time xor at compile-time?
Hannes
Hello Eduard-Gabriel,
You write:
> + dynamic ticks to work. It's safe to enable this option even if
> + your system doesn't have an AMD CPU (there are no side-effects if
> + such a CPU isn't detected).
It's definitely not safe. There are the computers with totally broken
BIOSes, they setup C1e flags for one core only. If you try to disable
C1e on such machine, you most likely kill the kernel.
>From my point of view, you patch should work in both directions, i.e.
1) user should be allowed to force disable C1e for all cores.
2) user should be allowed to force enable C1e for both cores.
and if the user don't use this flag, the C1e state should be untouched.
Mikhail Kshevetskiy
On Thu, 13 Dec 2007 23:58:50 +0100
Johannes Weiner <[email protected]> wrote:
> I would find this way more readable:
>
> if (lo & ENABLE_C1E_MASK) {
> #ifdef CONFIG_X86_AMD_C1E_WORKAROUND
> if (disable_amd_c1e) {
> ...
> #else
> return 1;
> #endif
> }
I wanted to avoid using too many tabs, as that would require the lines
inside the if block to be split too many times. I'll resubmit the patch
if you think it's necessary.
> Why does it require to be enabled by compile-time AND run-time? Is
> that something you might want to decide on every boot? Could we make
> it settable at boot-time xor at compile-time?
Making it settable only at boot-time means compiling in unnecessary
code. Also, making it settable only at compile-time would be bad for
distros that distribute binary kernels, as some users may experience
worse power savings with dynticks than with C1E (especially when C2, C3
and higher C-states are not available, due to brain-dead firmware).
This allows them to opt for leaving C1E enabled.
It is not required to enable it at both compile-time and boot-time. You
enable it at compile-time and then you _can disable_ it at boot-time.
On Fri 2007-12-14 00:47:10, Eduard-Gabriel Munteanu wrote:
> Some multiprocessor 64-bit AMD systems don't allow the user to disable
> the C1E C-state. The kernel detects C1E and marks the LAPIC as
> broken, thereby disabling dynticks. This patch adds an option to
> disable C1E when detected. It also allows the user to enable this
> processor feature even if that means disabling dynticks, which is
> useful in case C1E might provide better power savings (e.g.: C-states
> beyond C1 don't work). Tested on a Turion X2 TL-56 laptop. Thanks to
> Mikhail Kshevetskiy and FreeBSD for pointing out the relevant AMD docs.
...
> +static int __cpuinit force_amd_c1e(char *str) {
{ on new line, please.
> + disable_amd_c1e = 0;
> + return 1;
> +}
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Tue, 18 Dec 2007 15:43:52 +0000
Pavel Machek <[email protected]> wrote:
> On Fri 2007-12-14 00:47:10, Eduard-Gabriel Munteanu wrote:
>
> > +static int __cpuinit force_amd_c1e(char *str) {
>
> { on new line, please.
>
> > + disable_amd_c1e = 0;
> > + return 1;
> > +}
Sorry, I know it's a bad habit. I'll be more careful next time. Thanks
for pointing it out, though.
Anyway, there's no point in resubmitting a corrected patch, as we
already discussed that this approach isn't helpful.
On Fri, 28 Dec 2007 13:57:57 -0500
Richard Harman <[email protected]> wrote:
> I just saw this thread online from someone else who was having
> problems with an HP laptop -- I believe my laptop falls into this
> category.
>
> The laptop is currently running Fedora Core 8, but I couldn't figure
> out what kernel that diff was against. I'm not experienced with
> git. Can you give me a few pointers on how to get that patch applied
> to a kernel tarball? I'll be more than willing to be your test
> subject to see how well this patch and whatever future revision of it
> may work.
>
> Richard
Hi,
I'll assume you didn't read the thread careful enough, so I have to
emphasize the fact that this patch won't improve power savings (even
more, it may actually consume an additional 1-2 Watts). Try it only if
you need dynticks for another purpose than just saving power (in which
case, you should wait until a better solution comes up, namely
LAPIC-less dynticks).
IIRC, the diff was made against 2.6.24-rc4, but it applies okay on the
latest -rc (2.6.24-rc6 as of now) as far as I can see. Copy starting
with the line below "---" all the way to the end and save it to a file
(make sure tabs don't get messed up), then apply with "patch -p1" as
you would apply any other patch. Run "make xconfig" (or whatever
interface to Kconfig you use), make sure "Processor type and
features -> Disable C1E on AMD systems to make dynticks work" is set,
along with all other relevant options, then build and the kernel as
usual.
Eduard-Gabriel Munteanu wrote:
> On Fri, 28 Dec 2007 13:57:57 -0500
> Richard Harman <[email protected]> wrote:
>
>> I just saw this thread online from someone else who was having
>> problems with an HP laptop -- I believe my laptop falls into this
>> category.
>>
>> The laptop is currently running Fedora Core 8, but I couldn't figure
>> out what kernel that diff was against. I'm not experienced with
>> git. Can you give me a few pointers on how to get that patch applied
>> to a kernel tarball? I'll be more than willing to be your test
>> subject to see how well this patch and whatever future revision of it
>> may work.
>>
>> Richard
>
> Hi,
>
> I'll assume you didn't read the thread careful enough, so I have to
> emphasize the fact that this patch won't improve power savings (even
> more, it may actually consume an additional 1-2 Watts). Try it only if
> you need dynticks for another purpose than just saving power (in which
> case, you should wait until a better solution comes up, namely
> LAPIC-less dynticks).
Oh, at the moment I couldn't care less about power savings, I'm more
interested in simply getting the kernel properly -booting- on this hardware.
Right now, with your patch and the 'noirqdebug' option or disabling nohz
the system appears to be stable. This laptop otherwise locks up trying
to configure apic/lapic, or locks up solid later with NO oops/bug and
nothing will bring it out of it (eg, sysrq has no effect). NMI watchdog
was also inoperable, which I believe was due to the apic/lapic being
disabled.
With your patch, the lapic is used:
ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
ACPI: LAPIC (acpi_id[0x01] lapic_id[0x01] enabled)
These kernel dmesg logs are hand-transcribed, and are from 2.6.22.5, but
this shows where the kernel would lock up good and solid:
http://www.richardharman.com/pavilion/2.6.22.5/dmesg.txt
> IIRC, the diff was made against 2.6.24-rc4, but it applies okay on the
> latest -rc (2.6.24-rc6 as of now) as far as I can see. Copy starting
> with the line below "---" all the way to the end and save it to a file
> (make sure tabs don't get messed up), then apply with "patch -p1" as
> you would apply any other patch. Run "make xconfig" (or whatever
> interface to Kconfig you use), make sure "Processor type and
> features -> Disable C1E on AMD systems to make dynticks work" is set,
> along with all other relevant options, then build and the kernel as
> usual.
Yeah, I actually found a how-to on how to pull down Linus's git tree,
and managed to patch -p1 against that. Kudos to the folks who set up
faqs on using git.
Anyway, I'm extremely open to getting to the bottom of working around
the quirks on this hardware. If I havn't mentioned it previously, this
laptop is an HP dv6408nr, with an amd turion tl-56 cpu and nVidia MCP51
chipset.
What can I do to help? It has been pulling teeth trying to get to the
bottom of this.
On Sat, 29 Dec 2007 04:09:41 -0500
Richard Harman <[email protected]> wrote:
> Right now, with your patch and the 'noirqdebug' option or disabling
> nohz the system appears to be stable. This laptop otherwise locks up
> trying to configure apic/lapic, or locks up solid later with NO
> oops/bug and nothing will bring it out of it (eg, sysrq has no
> effect). NMI watchdog was also inoperable, which I believe was due
> to the apic/lapic being disabled.
>
> With your patch, the lapic is used:
>
> ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
> ACPI: LAPIC (acpi_id[0x01] lapic_id[0x01] enabled)
The kernel should display these messages even without my patch.
> These kernel dmesg logs are hand-transcribed, and are from 2.6.22.5,
> but this shows where the kernel would lock up good and solid:
>
> http://www.richardharman.com/pavilion/2.6.22.5/dmesg.txt
I doubt it is my patch that makes your laptop work. C1E detection was
added by the following commit:
$ git-describe fb79d22e1d4b06385796cc0db0084a2e07beccee
v2.6.23-2294-gfb79d22
Try building a 2.6.24-rc6 without my patch and see if it still works
(apply it again in reverse mode, for example). I guess it will work,
because the kernel shouldn't use the LAPIC as a timer at all (except as
a dummy one), with or without dynticks support.
When I started investigating the "nohz is not working problem", I
stumbled across a few threads on LKML where a few people had
similar problems with their laptops and they where instructed to pass
"nolapic" or something similar as a boot parameter to the kernel. I
guess the problem disappeared after the commit I mentioned previously,
since such reports weren't that recent. IIRC, that was on kernels
previous to 2.6.24-*, where x86 and x86-64 weren't unified and there
were no dynticks on the latter.
To be more clear, my patch disables C1E instead of disabling the
LAPIC, as these two don't agree with each other.
Hello.
I was interested in getting dynticks to work on my compaq presario v6000
to help with the 1 hour thirty minutes battery time, but after this
discussion I lost interest.
I too had the early boot time hang, and found it was udev triggering the
bug.
Changing the /etc/init.d/udev script so that the line containing
/sbin/udevtrigger
to
/sbin/udevtrigger --subsystem-nomatch="*misc*"
seemed to fix things.
the hang is triggered specifically by
echo add > /sys/class/misc/rtc/uevent
after inserting rtc.ko
Also using hwclock to set the rtc , will cause a hard hang, if you are
using 64bit linux. Disable the init scripts that set the time, or use
the 32bit binary, as suggested here :
http://www.mail-archive.com/[email protected]/msg41964.html
I hope this helps. But your hardware is slightly different though.
On Sat, 2007-12-29 at 04:09 -0500, Richard Harman wrote:
> Eduard-Gabriel Munteanu wrote:
>
> > On Fri, 28 Dec 2007 13:57:57 -0500
> > Richard Harman <[email protected]> wrote:
> >
> >> I just saw this thread online from someone else who was having
> >> problems with an HP laptop -- I believe my laptop falls into this
> >> category.
> >>
> >> The laptop is currently running Fedora Core 8, but I couldn't figure
> >> out what kernel that diff was against. I'm not experienced with
> >> git. Can you give me a few pointers on how to get that patch applied
> >> to a kernel tarball? I'll be more than willing to be your test
> >> subject to see how well this patch and whatever future revision of it
> >> may work.
> >>
> >> Richard
> >
> > Hi,
> >
> > I'll assume you didn't read the thread careful enough, so I have to
> > emphasize the fact that this patch won't improve power savings (even
> > more, it may actually consume an additional 1-2 Watts). Try it only if
> > you need dynticks for another purpose than just saving power (in which
> > case, you should wait until a better solution comes up, namely
> > LAPIC-less dynticks).
>
> Oh, at the moment I couldn't care less about power savings, I'm more
> interested in simply getting the kernel properly -booting- on this hardware.
>
> Right now, with your patch and the 'noirqdebug' option or disabling nohz
> the system appears to be stable. This laptop otherwise locks up trying
> to configure apic/lapic, or locks up solid later with NO oops/bug and
> nothing will bring it out of it (eg, sysrq has no effect). NMI watchdog
> was also inoperable, which I believe was due to the apic/lapic being
> disabled.
>
> With your patch, the lapic is used:
>
> ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
> ACPI: LAPIC (acpi_id[0x01] lapic_id[0x01] enabled)
>
> These kernel dmesg logs are hand-transcribed, and are from 2.6.22.5, but
> this shows where the kernel would lock up good and solid:
>
> http://www.richardharman.com/pavilion/2.6.22.5/dmesg.txt
>
> > IIRC, the diff was made against 2.6.24-rc4, but it applies okay on the
> > latest -rc (2.6.24-rc6 as of now) as far as I can see. Copy starting
> > with the line below "---" all the way to the end and save it to a file
> > (make sure tabs don't get messed up), then apply with "patch -p1" as
> > you would apply any other patch. Run "make xconfig" (or whatever
> > interface to Kconfig you use), make sure "Processor type and
> > features -> Disable C1E on AMD systems to make dynticks work" is set,
> > along with all other relevant options, then build and the kernel as
> > usual.
>
> Yeah, I actually found a how-to on how to pull down Linus's git tree,
> and managed to patch -p1 against that. Kudos to the folks who set up
> faqs on using git.
>
> Anyway, I'm extremely open to getting to the bottom of working around
> the quirks on this hardware. If I havn't mentioned it previously, this
> laptop is an HP dv6408nr, with an amd turion tl-56 cpu and nVidia MCP51
> chipset.
>
> What can I do to help? It has been pulling teeth trying to get to the
> bottom of this.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
Islam Amer wrote:
> Hello.
> I was interested in getting dynticks to work on my compaq presario v6000
> to help with the 1 hour thirty minutes battery time, but after this
> discussion I lost interest.
>
> I too had the early boot time hang, and found it was udev triggering the
> bug.
>
This early boot time hang is *almost certainly* due to the in/out port
80 bug, which I discovered a few weeks ago, which affects hwclock and
other I/O device drivers on a number of HP/Compaq machines in exactly
this way. The proper fix for this bug is in dispute, and will probably
not occur in the 2.6.24 release because it touches code in many, many
drivers. The simplest way to test if you have a problem of this sort is
to try this shell line as root, after you boot successfully. If your
machine hangs hard, you have a problem that really looks like the port
80 problem.
for ((i = 0; i < 1000; i = i + 1)); do cat /dev/nvram > /dev/null; done
I have also attached a c program that only touches port 80. Compile it
for 32-bit mode (see comment), run it as root, and after two or three
runs, it will hang a system that has the port 80 bug.
If you then run:
dmidecode -s baseboard-manufacturer
dmidecode -s baseboard-product-name
are the values you should plug into the .matches field in the
dmi_system_id struct in the attached patch. It would be great if you
could do that, test, and post back with those values so they can be
accumulated. HP/Compaq machines with quanta m/b's are very popular, and
very common - so at least a quirk patch for all the broken models would
be worth doing in 2.6.25 or downstream in the distros. The right
patches will probably take a long time - there is a dispute as to what
the semantics of port 80 writes even mean among the core kernel
developers, because the hack is lost in the dim dark days of history,
and safe resolution will take time
There is also a C1E issue with the BIOS in my machine (an HP Pavilion
dv9000z). I don't know if it is a bug, yet, but that's a different
problem - associated with dynticks, perhaps. I have to say that
researching the AMD Kernel/BIOS docs on C1E (a very new feature in the
last year on AMD) leaves me puzzled as to whether the dynticks problem
exists on my machine at all, but the patch for it turns off dynticks!
> Changing the /etc/init.d/udev script so that the line containing
>
> /sbin/udevtrigger
>
> to
>
> /sbin/udevtrigger --subsystem-nomatch="*misc*"
>
> seemed to fix things.
>
> the hang is triggered specifically by
>
> echo add > /sys/class/misc/rtc/uevent
> after inserting rtc.ko
>
> Also using hwclock to set the rtc , will cause a hard hang, if you are
> using 64bit linux. Disable the init scripts that set the time, or use
> the 32bit binary, as suggested here :
>
> http://www.mail-archive.com/[email protected]/msg41964.html
>
> I hope this helps. But your hardware is slightly different though.
>
Thanks for the detailed response.
I thought I had gotten to the bottom of my problems when I found that
udev workaround, I guess I was naive.
I did the two tests you described and they predictably caused the hard
hangs. I needed to run the port80 program only once to get the hard
hang.
The output of the dmidecode commands were :
Quanta
30B7
I applied the patch you provided ( luckily I am using 2.6.24-rc6-git3
kernel because I need the b43 driver ), added these values and compiled.
{
.callback = dmi_io_delay_port_alt,
.ident = "Compaq Presario v6000",
.matches = {
DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
DMI_MATCH(DMI_BOARD_NAME, "30B7")
}
},
I was able to boot without the udev workaround and can now use hwclock
without hanging the system. In dmesg I can see this new line :
On Sat, 2007-12-29 at 09:43 -0500, David P. Reed wrote:
> Islam Amer wrote:
> > Hello.
> > I was interested in getting dynticks to work on my compaq presario v6000
> > to help with the 1 hour thirty minutes battery time, but after this
> > discussion I lost interest.
> >
> > I too had the early boot time hang, and found it was udev triggering the
> > bug.
> >
> This early boot time hang is *almost certainly* due to the in/out port
> 80 bug, which I discovered a few weeks ago, which affects hwclock and
> other I/O device drivers on a number of HP/Compaq machines in exactly
> this way. The proper fix for this bug is in dispute, and will probably
> not occur in the 2.6.24 release because it touches code in many, many
> drivers. The simplest way to test if you have a problem of this sort is
> to try this shell line as root, after you boot successfully. If your
> machine hangs hard, you have a problem that really looks like the port
> 80 problem.
>
> for ((i = 0; i < 1000; i = i + 1)); do cat /dev/nvram > /dev/null; done
>
> I have also attached a c program that only touches port 80. Compile it
> for 32-bit mode (see comment), run it as root, and after two or three
> runs, it will hang a system that has the port 80 bug.
>
> If you then run:
>
> dmidecode -s baseboard-manufacturer
> dmidecode -s baseboard-product-name
>
> are the values you should plug into the .matches field in the
> dmi_system_id struct in the attached patch. It would be great if you
> could do that, test, and post back with those values so they can be
> accumulated. HP/Compaq machines with quanta m/b's are very popular, and
> very common - so at least a quirk patch for all the broken models would
> be worth doing in 2.6.25 or downstream in the distros. The right
> patches will probably take a long time - there is a dispute as to what
> the semantics of port 80 writes even mean among the core kernel
> developers, because the hack is lost in the dim dark days of history,
> and safe resolution will take time
>
> There is also a C1E issue with the BIOS in my machine (an HP Pavilion
> dv9000z). I don't know if it is a bug, yet, but that's a different
> problem - associated with dynticks, perhaps. I have to say that
> researching the AMD Kernel/BIOS docs on C1E (a very new feature in the
> last year on AMD) leaves me puzzled as to whether the dynticks problem
> exists on my machine at all, but the patch for it turns off dynticks!
>
>
>
> > Changing the /etc/init.d/udev script so that the line containing
> >
> > /sbin/udevtrigger
> >
> > to
> >
> > /sbin/udevtrigger --subsystem-nomatch="*misc*"
> >
> > seemed to fix things.
> >
> > the hang is triggered specifically by
> >
> > echo add > /sys/class/misc/rtc/uevent
> > after inserting rtc.ko
> >
> > Also using hwclock to set the rtc , will cause a hard hang, if you are
> > using 64bit linux. Disable the init scripts that set the time, or use
> > the 32bit binary, as suggested here :
> >
> > http://www.mail-archive.com/[email protected]/msg41964.html
> >
> > I hope this helps. But your hardware is slightly different though.
> >
>
Sorry, pressed send too soon. here is the line in dmesg:
Compaq Presario v6000: using alternate I/O delay port
Thanks, please tell me if you need anymore info.
commit 5f27525d3e796ae12e3186afe8ef0ec41af9e160
Author: Rene Herman <[email protected]>
Date: Mon Dec 17 21:23:55 2007 +0100
x86: provide a DMI based port 0x80 I/O delay override.
Certain (HP/Compaq) laptops experience trouble from our port 0x80
I/O delay writes. This patch provides for a DMI based switch to the
"alternate diagnostic port" 0xed (as used by some BIOSes as well)
for these.
David P. Reed confirmed that using port 0xed works and provides a
proper delay on his HP Pavilion dv9000z, Islam Amer comfirmed that
it does so on a Compaq Presario V6000. Both are Quanta boards, type
30B9 and 30B7 respectively and are the (only) machines for which
the DMI based switch triggers. HP Pavilion dv6000z is expected to
also need this but its DMI info hasn't been verified yet.
The symptoms of _not_ working are a hanging machine, with "hwclock"
use being a direct trigger and therefore the bootup often hanging
already on these machines.
Earlier versions of this attempted to simply use udelay(2), with the
2 being a value tested to be a nicely conservative upper-bound with
help from many on the linux-kernel mailinglist, but that approach has
two problems.
First, pre-loops_per_jiffy calibration (which is post PIT init while
some implementations of the PIT are actually one of the historically
problematic devices that need the delay) udelay() isn't particularly
well-defined. We could initialise loops_per_jiffy conservatively (and
based on CPU family so as to not unduly delay old machines) which
would sort of work, but still leaves:
Second, delaying isn't the only effect that a write to port 0x80 has.
It's also a PCI posting barrier which some devices may be explicitly
or implicitly relying on. Alan Cox did a survey and found evidence
that additionally various drivers are racy on SMP without the bus
locking outb.
Switching to an inb() makes the timing too unpredictable and as such,
this DMI based switch should be the safest approach for now. Any more
invasive changes should get more rigid testing first. It's moreover
only very few machines with the problem and a DMI based hack seems
to fit that situation.
An early boot parameter to make the choice manually (and override any
possible DMI based decision) is also provided:
io_delay=standard|alternate
This does not change the io_delay() in the boot code which is using
the same port 0x80 I/O delay but those do not appear to be a problem
as tested by David P. Reed. He moreover reported that booting with
"acpi=off" also fixed things and seeing as how ACPI isn't touched
until after this DMI based I/O port switch leaving the ones in the
boot code be is safe.
This patch is partly based on earlier patches from Pavel Machek and
David P. Reed.
Signed-off-by: Rene Herman <[email protected]>
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 33121d6..6948e25 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -785,6 +785,12 @@ and is between 256 and 4096 characters. It is defined in the file
for translation below 32 bit and if not available
then look in the higher range.
+ io_delay= [X86-32,X86-64] I/O delay port
+ standard
+ Use the 0x80 standard I/O delay port (default)
+ alternate
+ Use the 0xed alternate I/O delay port
+
io7= [HW] IO7 for Marvel based alpha systems
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.
diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc_32.c
index b74d60d..288e162 100644
--- a/arch/x86/boot/compressed/misc_32.c
+++ b/arch/x86/boot/compressed/misc_32.c
@@ -276,10 +276,10 @@ static void putstr(const char *s)
RM_SCREEN_INFO.orig_y = y;
pos = (x + cols * y) * 2; /* Update cursor position */
- outb_p(14, vidport);
- outb_p(0xff & (pos >> 9), vidport+1);
- outb_p(15, vidport);
- outb_p(0xff & (pos >> 1), vidport+1);
+ outb(14, vidport);
+ outb(0xff & (pos >> 9), vidport+1);
+ outb(15, vidport);
+ outb(0xff & (pos >> 1), vidport+1);
}
static void* memset(void* s, int c, unsigned n)
diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c
index 6ea015a..43e5fcc 100644
--- a/arch/x86/boot/compressed/misc_64.c
+++ b/arch/x86/boot/compressed/misc_64.c
@@ -269,10 +269,10 @@ static void putstr(const char *s)
RM_SCREEN_INFO.orig_y = y;
pos = (x + cols * y) * 2; /* Update cursor position */
- outb_p(14, vidport);
- outb_p(0xff & (pos >> 9), vidport+1);
- outb_p(15, vidport);
- outb_p(0xff & (pos >> 1), vidport+1);
+ outb(14, vidport);
+ outb(0xff & (pos >> 9), vidport+1);
+ outb(15, vidport);
+ outb(0xff & (pos >> 1), vidport+1);
}
static void* memset(void* s, int c, unsigned n)
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index a7bc93c..0cc1981 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -8,7 +8,7 @@ CPPFLAGS_vmlinux.lds += -Ui386
obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
- quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o
+ quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o io_delay.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
index 5a88890..08a68f0 100644
--- a/arch/x86/kernel/Makefile_64
+++ b/arch/x86/kernel/Makefile_64
@@ -11,7 +11,7 @@ obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \
- i8253.o
+ i8253.o io_delay.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
new file mode 100644
index 0000000..77a8bcd
--- /dev/null
+++ b/arch/x86/kernel/io_delay.c
@@ -0,0 +1,77 @@
+/*
+ * I/O delay strategies for inb_p/outb_p
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <asm/io.h>
+
+/*
+ * Allow for a DMI based override of port 0x80
+ */
+#define IO_DELAY_PORT_STD 0x80
+#define IO_DELAY_PORT_ALT 0xed
+
+static unsigned short io_delay_port __read_mostly = IO_DELAY_PORT_STD;
+
+void native_io_delay(void)
+{
+ asm volatile ("outb %%al, %w0" : : "d" (io_delay_port));
+}
+EXPORT_SYMBOL(native_io_delay);
+
+static int __init dmi_io_delay_port_alt(const struct dmi_system_id *id)
+{
+ printk(KERN_NOTICE "%s: using alternate I/O delay port\n", id->ident);
+ io_delay_port = IO_DELAY_PORT_ALT;
+ return 0;
+}
+
+static struct dmi_system_id __initdata dmi_io_delay_port_alt_table[] = {
+ {
+ .callback = dmi_io_delay_port_alt,
+ .ident = "Compaq Presario V6000",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B7")
+ }
+ },
+ {
+ .callback = dmi_io_delay_port_alt,
+ .ident = "HP Pavilion dv9000z",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+ DMI_MATCH(DMI_BOARD_NAME, "30B9")
+ }
+ },
+ {
+ }
+};
+
+static int __initdata io_delay_override;
+
+static int __init io_delay_param(char *s)
+{
+ if (!s)
+ return -EINVAL;
+
+ if (!strcmp(s, "standard"))
+ io_delay_port = IO_DELAY_PORT_STD;
+ else if (!strcmp(s, "alternate"))
+ io_delay_port = IO_DELAY_PORT_ALT;
+ else
+ return -EINVAL;
+
+ io_delay_override = 1;
+ return 0;
+}
+
+early_param("io_delay", io_delay_param);
+
+void __init io_delay_init(void)
+{
+ if (!io_delay_override)
+ dmi_check_system(dmi_io_delay_port_alt_table);
+}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index e1e18c3..6c3a3b4 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -648,6 +648,8 @@ void __init setup_arch(char **cmdline_p)
dmi_scan_machine();
+ io_delay_init();;
+
#ifdef CONFIG_X86_GENERICARCH
generic_apic_probe();
#endif
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 30d94d1..ec976ed 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -311,6 +311,8 @@ void __init setup_arch(char **cmdline_p)
dmi_scan_machine();
+ io_delay_init();
+
#ifdef CONFIG_SMP
/* setup to use the static apicid table during kernel startup */
x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h
index fe881cd..690b8f4 100644
--- a/include/asm-x86/io_32.h
+++ b/include/asm-x86/io_32.h
@@ -250,10 +250,8 @@ static inline void flush_write_buffers(void)
#endif /* __KERNEL__ */
-static inline void native_io_delay(void)
-{
- asm volatile("outb %%al,$0x80" : : : "memory");
-}
+extern void io_delay_init(void);
+extern void native_io_delay(void);
#if defined(CONFIG_PARAVIRT)
#include <asm/paravirt.h>
diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h
index a037b07..b2d4994 100644
--- a/include/asm-x86/io_64.h
+++ b/include/asm-x86/io_64.h
@@ -35,13 +35,18 @@
* - Arnaldo Carvalho de Melo <[email protected]>
*/
-#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
+extern void io_delay_init(void);
+extern void native_io_delay(void);
+static inline void slow_down_io(void)
+{
+ native_io_delay();
#ifdef REALLY_SLOW_IO
-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
-#else
-#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
+ native_io_delay();
+ native_io_delay();
+ native_io_delay();
#endif
+}
/*
* Talk about misusing macros..
@@ -50,21 +55,21 @@
static inline void out##s(unsigned x value, unsigned short port) {
#define __OUT2(s,s1,s2) \
-__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
+__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" : : "a" (value), "Nd" (port))
#define __OUT(s,s1,x) \
-__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
-__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
+__OUT1(s,x) __OUT2(s,s1,"w"); } \
+__OUT1(s##_p,x) __OUT2(s,s1,"w"); slow_down_io(); }
#define __IN1(s) \
static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
#define __IN2(s,s1,s2) \
-__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
+__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" : "=a" (_v) : "Nd" (port))
-#define __IN(s,s1,i...) \
-__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
-__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
+#define __IN(s,s1) \
+__IN1(s) __IN2(s,s1,"w"); return _v; } \
+__IN1(s##_p) __IN2(s,s1,"w"); slow_down_io(); return _v; }
#define __INS(s) \
static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
Glad I could be of help.
Sure please go ahead, I will keep testing this patch with upcoming git
kernels, and report any problems.
So what I understand now is that AMD C1E state saves battery like
dynticks, so we don't need dynticks ?
On Sun, 2007-12-30 at 02:38 +0100, Rene Herman wrote:
> On 29-12-07 23:28, Islam Amer wrote:
>
> > Thanks for the detailed response.
> >
> > I thought I had gotten to the bottom of my problems when I found that
> > udev workaround, I guess I was naive.
> >
> > I did the two tests you described and they predictably caused the hard
> > hangs. I needed to run the port80 program only once to get the hard
> > hang.
> >
> > The output of the dmidecode commands were :
> >
> > Quanta
> > 30B7
> >
> > I applied the patch you provided ( luckily I am using 2.6.24-rc6-git3
> > kernel because I need the b43 driver ), added these values and compiled.
> >
> >
> > {
> > .callback = dmi_io_delay_port_alt,
> > .ident = "Compaq Presario v6000",
> > .matches = {
> > DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
> > DMI_MATCH(DMI_BOARD_NAME, "30B7")
> > }
> > },
> >
> > I was able to boot without the udev workaround and can now use hwclock
> > without hanging the system. In dmesg I can see this new line :
>
> Thanks much for testing (and David -- thanks for asking). Updated patch
> attached with this information. Compaq itself seems to spell the type with a
> capital V so that's the only difference...
>
> Can I add a "Tested-by: Islam Amer <[email protected]>" to this? (and David,
> same for you with this address you're using in this thread?)
>
> Rene.
> plain text document attachment (dmi-port80-minimal-bootparam.diff)
> commit 5f27525d3e796ae12e3186afe8ef0ec41af9e160
> Author: Rene Herman <[email protected]>
> Date: Mon Dec 17 21:23:55 2007 +0100
>
> x86: provide a DMI based port 0x80 I/O delay override.
>
> Certain (HP/Compaq) laptops experience trouble from our port 0x80
> I/O delay writes. This patch provides for a DMI based switch to the
> "alternate diagnostic port" 0xed (as used by some BIOSes as well)
> for these.
>
> David P. Reed confirmed that using port 0xed works and provides a
> proper delay on his HP Pavilion dv9000z, Islam Amer comfirmed that
> it does so on a Compaq Presario V6000. Both are Quanta boards, type
> 30B9 and 30B7 respectively and are the (only) machines for which
> the DMI based switch triggers. HP Pavilion dv6000z is expected to
> also need this but its DMI info hasn't been verified yet.
>
> The symptoms of _not_ working are a hanging machine, with "hwclock"
> use being a direct trigger and therefore the bootup often hanging
> already on these machines.
>
> Earlier versions of this attempted to simply use udelay(2), with the
> 2 being a value tested to be a nicely conservative upper-bound with
> help from many on the linux-kernel mailinglist, but that approach has
> two problems.
>
> First, pre-loops_per_jiffy calibration (which is post PIT init while
> some implementations of the PIT are actually one of the historically
> problematic devices that need the delay) udelay() isn't particularly
> well-defined. We could initialise loops_per_jiffy conservatively (and
> based on CPU family so as to not unduly delay old machines) which
> would sort of work, but still leaves:
>
> Second, delaying isn't the only effect that a write to port 0x80 has.
> It's also a PCI posting barrier which some devices may be explicitly
> or implicitly relying on. Alan Cox did a survey and found evidence
> that additionally various drivers are racy on SMP without the bus
> locking outb.
>
> Switching to an inb() makes the timing too unpredictable and as such,
> this DMI based switch should be the safest approach for now. Any more
> invasive changes should get more rigid testing first. It's moreover
> only very few machines with the problem and a DMI based hack seems
> to fit that situation.
>
> An early boot parameter to make the choice manually (and override any
> possible DMI based decision) is also provided:
>
> io_delay=standard|alternate
>
> This does not change the io_delay() in the boot code which is using
> the same port 0x80 I/O delay but those do not appear to be a problem
> as tested by David P. Reed. He moreover reported that booting with
> "acpi=off" also fixed things and seeing as how ACPI isn't touched
> until after this DMI based I/O port switch leaving the ones in the
> boot code be is safe.
>
> This patch is partly based on earlier patches from Pavel Machek and
> David P. Reed.
>
> Signed-off-by: Rene Herman <[email protected]>
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index 33121d6..6948e25 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -785,6 +785,12 @@ and is between 256 and 4096 characters. It is defined in the file
> for translation below 32 bit and if not available
> then look in the higher range.
>
> + io_delay= [X86-32,X86-64] I/O delay port
> + standard
> + Use the 0x80 standard I/O delay port (default)
> + alternate
> + Use the 0xed alternate I/O delay port
> +
> io7= [HW] IO7 for Marvel based alpha systems
> See comment before marvel_specify_io7 in
> arch/alpha/kernel/core_marvel.c.
> diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc_32.c
> index b74d60d..288e162 100644
> --- a/arch/x86/boot/compressed/misc_32.c
> +++ b/arch/x86/boot/compressed/misc_32.c
> @@ -276,10 +276,10 @@ static void putstr(const char *s)
> RM_SCREEN_INFO.orig_y = y;
>
> pos = (x + cols * y) * 2; /* Update cursor position */
> - outb_p(14, vidport);
> - outb_p(0xff & (pos >> 9), vidport+1);
> - outb_p(15, vidport);
> - outb_p(0xff & (pos >> 1), vidport+1);
> + outb(14, vidport);
> + outb(0xff & (pos >> 9), vidport+1);
> + outb(15, vidport);
> + outb(0xff & (pos >> 1), vidport+1);
> }
>
> static void* memset(void* s, int c, unsigned n)
> diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c
> index 6ea015a..43e5fcc 100644
> --- a/arch/x86/boot/compressed/misc_64.c
> +++ b/arch/x86/boot/compressed/misc_64.c
> @@ -269,10 +269,10 @@ static void putstr(const char *s)
> RM_SCREEN_INFO.orig_y = y;
>
> pos = (x + cols * y) * 2; /* Update cursor position */
> - outb_p(14, vidport);
> - outb_p(0xff & (pos >> 9), vidport+1);
> - outb_p(15, vidport);
> - outb_p(0xff & (pos >> 1), vidport+1);
> + outb(14, vidport);
> + outb(0xff & (pos >> 9), vidport+1);
> + outb(15, vidport);
> + outb(0xff & (pos >> 1), vidport+1);
> }
>
> static void* memset(void* s, int c, unsigned n)
> diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
> index a7bc93c..0cc1981 100644
> --- a/arch/x86/kernel/Makefile_32
> +++ b/arch/x86/kernel/Makefile_32
> @@ -8,7 +8,7 @@ CPPFLAGS_vmlinux.lds += -Ui386
> obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
> ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
> pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
> - quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o
> + quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o io_delay.o
>
> obj-$(CONFIG_STACKTRACE) += stacktrace.o
> obj-y += cpu/
> diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
> index 5a88890..08a68f0 100644
> --- a/arch/x86/kernel/Makefile_64
> +++ b/arch/x86/kernel/Makefile_64
> @@ -11,7 +11,7 @@ obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
> x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
> setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
> pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \
> - i8253.o
> + i8253.o io_delay.o
>
> obj-$(CONFIG_STACKTRACE) += stacktrace.o
> obj-y += cpu/
> diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
> new file mode 100644
> index 0000000..77a8bcd
> --- /dev/null
> +++ b/arch/x86/kernel/io_delay.c
> @@ -0,0 +1,77 @@
> +/*
> + * I/O delay strategies for inb_p/outb_p
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/dmi.h>
> +#include <asm/io.h>
> +
> +/*
> + * Allow for a DMI based override of port 0x80
> + */
> +#define IO_DELAY_PORT_STD 0x80
> +#define IO_DELAY_PORT_ALT 0xed
> +
> +static unsigned short io_delay_port __read_mostly = IO_DELAY_PORT_STD;
> +
> +void native_io_delay(void)
> +{
> + asm volatile ("outb %%al, %w0" : : "d" (io_delay_port));
> +}
> +EXPORT_SYMBOL(native_io_delay);
> +
> +static int __init dmi_io_delay_port_alt(const struct dmi_system_id *id)
> +{
> + printk(KERN_NOTICE "%s: using alternate I/O delay port\n", id->ident);
> + io_delay_port = IO_DELAY_PORT_ALT;
> + return 0;
> +}
> +
> +static struct dmi_system_id __initdata dmi_io_delay_port_alt_table[] = {
> + {
> + .callback = dmi_io_delay_port_alt,
> + .ident = "Compaq Presario V6000",
> + .matches = {
> + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
> + DMI_MATCH(DMI_BOARD_NAME, "30B7")
> + }
> + },
> + {
> + .callback = dmi_io_delay_port_alt,
> + .ident = "HP Pavilion dv9000z",
> + .matches = {
> + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
> + DMI_MATCH(DMI_BOARD_NAME, "30B9")
> + }
> + },
> + {
> + }
> +};
> +
> +static int __initdata io_delay_override;
> +
> +static int __init io_delay_param(char *s)
> +{
> + if (!s)
> + return -EINVAL;
> +
> + if (!strcmp(s, "standard"))
> + io_delay_port = IO_DELAY_PORT_STD;
> + else if (!strcmp(s, "alternate"))
> + io_delay_port = IO_DELAY_PORT_ALT;
> + else
> + return -EINVAL;
> +
> + io_delay_override = 1;
> + return 0;
> +}
> +
> +early_param("io_delay", io_delay_param);
> +
> +void __init io_delay_init(void)
> +{
> + if (!io_delay_override)
> + dmi_check_system(dmi_io_delay_port_alt_table);
> +}
> diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
> index e1e18c3..6c3a3b4 100644
> --- a/arch/x86/kernel/setup_32.c
> +++ b/arch/x86/kernel/setup_32.c
> @@ -648,6 +648,8 @@ void __init setup_arch(char **cmdline_p)
>
> dmi_scan_machine();
>
> + io_delay_init();;
> +
> #ifdef CONFIG_X86_GENERICARCH
> generic_apic_probe();
> #endif
> diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
> index 30d94d1..ec976ed 100644
> --- a/arch/x86/kernel/setup_64.c
> +++ b/arch/x86/kernel/setup_64.c
> @@ -311,6 +311,8 @@ void __init setup_arch(char **cmdline_p)
>
> dmi_scan_machine();
>
> + io_delay_init();
> +
> #ifdef CONFIG_SMP
> /* setup to use the static apicid table during kernel startup */
> x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
> diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h
> index fe881cd..690b8f4 100644
> --- a/include/asm-x86/io_32.h
> +++ b/include/asm-x86/io_32.h
> @@ -250,10 +250,8 @@ static inline void flush_write_buffers(void)
>
> #endif /* __KERNEL__ */
>
> -static inline void native_io_delay(void)
> -{
> - asm volatile("outb %%al,$0x80" : : : "memory");
> -}
> +extern void io_delay_init(void);
> +extern void native_io_delay(void);
>
> #if defined(CONFIG_PARAVIRT)
> #include <asm/paravirt.h>
> diff --git a/include/asm-x86/io_64.h b/include/asm-x86/io_64.h
> index a037b07..b2d4994 100644
> --- a/include/asm-x86/io_64.h
> +++ b/include/asm-x86/io_64.h
> @@ -35,13 +35,18 @@
> * - Arnaldo Carvalho de Melo <[email protected]>
> */
>
> -#define __SLOW_DOWN_IO "\noutb %%al,$0x80"
> +extern void io_delay_init(void);
> +extern void native_io_delay(void);
>
> +static inline void slow_down_io(void)
> +{
> + native_io_delay();
> #ifdef REALLY_SLOW_IO
> -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
> -#else
> -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO
> + native_io_delay();
> + native_io_delay();
> + native_io_delay();
> #endif
> +}
>
> /*
> * Talk about misusing macros..
> @@ -50,21 +55,21 @@
> static inline void out##s(unsigned x value, unsigned short port) {
>
> #define __OUT2(s,s1,s2) \
> -__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1"
> +__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" : : "a" (value), "Nd" (port))
>
> #define __OUT(s,s1,x) \
> -__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \
> -__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \
> +__OUT1(s,x) __OUT2(s,s1,"w"); } \
> +__OUT1(s##_p,x) __OUT2(s,s1,"w"); slow_down_io(); }
>
> #define __IN1(s) \
> static inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v;
>
> #define __IN2(s,s1,s2) \
> -__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0"
> +__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" : "=a" (_v) : "Nd" (port))
>
> -#define __IN(s,s1,i...) \
> -__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
> -__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \
> +#define __IN(s,s1) \
> +__IN1(s) __IN2(s,s1,"w"); return _v; } \
> +__IN1(s##_p) __IN2(s,s1,"w"); slow_down_io(); return _v; }
>
> #define __INS(s) \
> static inline void ins##s(unsigned short port, void * addr, unsigned long count) \
On 30-12-07 02:49, Islam Amer wrote:
> Glad I could be of help.
>
> Sure please go ahead, I will keep testing this patch with upcoming git
> kernels, and report any problems.
Thanks. I'll see if Linus wants it for 2.6.24 still. Could be minimal enough.
> So what I understand now is that AMD C1E state saves battery like
> dynticks, so we don't need dynticks ?
That bit I haven't a clue about I'm afraid...
Rene.
On Sun, 30 Dec 2007 03:49:21 +0200
Islam Amer <[email protected]> wrote:
> So what I understand now is that AMD C1E state saves battery like
> dynticks, so we don't need dynticks ?
No, it's not true.
C1E currently saves more power than dynticks just
because these platforms do not support higher C-states (like C2 and
C3). So, if we want to enable the current dynticks implementation, we
have to disable C1E, so we're left only with C1 (the old plain 'hlt'),
which isn't that great.
Other kernel developers, as discussed previously in this thread, are
working on a HPET-driven dynticks (as opposed to the current
LAPIC-driven one), but the change isn't that easy to make. This way,
dynticks and C1E could be both enabled and thus save more power.
Moreover, as I previously stated, dynticks isn't just about power
saving.
Note that this problem is only related to AMD64 multi-core laptops. As
far as I can see, devs might not invest much coding effort into this
and instead say "Go buy an Intel laptop!", as this really is a hardware
quirk. And if Andi Kleen is correct, this is not just a laptop vendors'
problem, but AMD itself doesn't offer support for higher C-states.
* Islam Amer <[email protected]> wrote:
> Sorry, pressed send too soon. here is the line in dmesg:
>
> Compaq Presario v6000: using alternate I/O delay port
>
> Thanks, please tell me if you need anymore info.
thanks Islam for testing this. I've added your DMI strings to x86.git,
so v2.6.25 should work out of box on your system.
Ingo
Eduard-Gabriel Munteanu <[email protected]> writes:
>
> Other kernel developers, as discussed previously in this thread, are
> working on a HPET-driven dynticks (as opposed to the current
> LAPIC-driven one), but the change isn't that easy to make.
No, actually HPET based dyntick has been implemented for a long time
(as long as APIC based dyntick). APIC based dyntick is preferred
because it is a little cheaper, but both are there.
The problem is just that many BIOSes don't tell the kernel about the
HPET location (even when the chipset supports it) because that wasn't
needed for older Windows versions. And without the BIOS telling the
kernel HPET it doesn't know where it is and can't use it when
the APIC timer is not usable.
The ongoing work is to implement hpet=force code that knows
about various chipsets and reads their hardware registers directly
and then uses the HPET timer without BIOS support.
The reason it is still an option is that there used to be at least one
old chipset where forcing the HPET could trigger hardware bugs.
On the other hand it is expected that BIOS versions support Vista
also supply HPET, so that problem will hopefully get better.
> This way,
> dynticks and C1E could be both enabled and thus save more power.
I would not expect too much over a HZ=100 kernel on current AMD.
C1e doesn't have too much latency on its own. iirc at least on current
AMD platforms sleeping for longer didn't make too much
difference. With HZ=250 it might be borderline, but I would not expect
miracles. It's probably only clearly a good idea if you're on a
HZ=1000 kernel or have applications that need very precise hrtimers (but that
might cost more power again because it'll cause more wakeups)
Of course this can always change in future systems -- these are
just rules of thumb currently.
-Andi
On 29-12-07 10:09, Richard Harman wrote:
> Anyway, I'm extremely open to getting to the bottom of working around
> the quirks on this hardware. If I havn't mentioned it previously, this
> laptop is an HP dv6408nr, with an amd turion tl-56 cpu and nVidia MCP51
> chipset.
>
> What can I do to help? It has been pulling teeth trying to get to the
> bottom of this.
This thread-branch appears to be broken on lkml.org so I couldn't look back
through it but was this what you tried?
http://lkml.org/lkml/2007/12/29/65
Updated patch with Islam's values added at:
http://lkml.org/lkml/2007/12/29/141
You'd plug in your values similarly and test...
Rene.
Eduard-Gabriel Munteanu wrote:
> On Sun, 30 Dec 2007 03:49:21 +0200
>
> Note that this problem is only related to AMD64 multi-core laptops. As
> far as I can see, devs might not invest much coding effort into this
> and instead say "Go buy an Intel laptop!", as this really is a hardware
> quirk. And if Andi Kleen is correct, this is not just a laptop vendors'
> problem, but AMD itself doesn't offer support for higher C-states.
>
I think I may have a monkey wrench to throw into this, I finally got
around to testing the C1E patch, and the port80 patch. End result:
port80 patch has zero effect on this laptop, and the C1E patch makes it
stable.
AMD C1E patch 'force_amd_c1e' (disabling the workaround):
hpet0: at MMIO 0xfed00000, IRQs 2, 8, 31
hpet0: 3 32-bit timers, 25000000 Hz
Time: hpet clocksource has been installed.
Clockevents: could not switch to one-shot mode: lapic is not functional.
Could not switch to high resolution mode on CPU 0
Clockevents: could not switch to one-shot mode: lapic is not functional.
Could not switch to high resolution mode on CPU 1
And with the C1E workaround enabled, the kernel doesn't whine that the
lapic isn't functional.
Those were the kernel messages I was trying to paste in my earlier
message, saying that the lapic works. Do I have a DV6000 laptop that is
a strange beast all in itself?
On Sun, 30 Dec 2007 15:42:17 +0100
Andi Kleen <[email protected]> wrote:
> Eduard-Gabriel Munteanu <[email protected]> writes:
> >
> > Other kernel developers, as discussed previously in this thread, are
> > working on a HPET-driven dynticks (as opposed to the current
> > LAPIC-driven one), but the change isn't that easy to make.
>
> No, actually HPET based dyntick has been implemented for a long time
> (as long as APIC based dyntick). APIC based dyntick is preferred
> because it is a little cheaper, but both are there.
>
> The problem is just that many BIOSes don't tell the kernel about the
> HPET location (even when the chipset supports it) because that wasn't
> needed for older Windows versions. And without the BIOS telling the
> kernel HPET it doesn't know where it is and can't use it when
> the APIC timer is not usable.
>
> The ongoing work is to implement hpet=force code that knows
> about various chipsets and reads their hardware registers directly
> and then uses the HPET timer without BIOS support.
As far as I can see, the kernel doesn't behave as it would be, IMO,
normal. I do have HPETs and Linux detects them without any
need for hpet=force (HPET is registered for clockevents), but keeps
LAPIC as the only option for dynticks. It looks like timing devices are
rated and then only one of them is selected for dynticks. But when that
one (here LAPIC) fails to provide the required functionality for nohz,
Linux does not fallback to the next best timer, as dynticks is provided
with only one such device, hence the message "lapic is not functional"
is shown. In fact, this selection process should be fixed.
Before coding my patch, I tried fiddling with that rating algorithm,
but Linux kept locking up during boot. I'll try again, maybe my
approach wasn't correct, but please tell me: is HPET-driven dynticks
code working? (I'm quite confused, as HPET should wake the CPUs even
when in C1E.)
> > This way,
> > dynticks and C1E could be both enabled and thus save more power.
>
> I would not expect too much over a HZ=100 kernel on current AMD.
With my patch (the one that disables C1E), powertop shows at most 10-11
wakeups per second when idle (no X server running). Isn't it
reasonable to expect a significant improvement over HZ=100?
On Sun, 30 Dec 2007 15:57:59 -0500
Richard Harman <[email protected]> wrote:
> I think I may have a monkey wrench to throw into this, I finally got
> around to testing the C1E patch, and the port80 patch. End result:
> port80 patch has zero effect on this laptop, and the C1E patch makes
> it stable.
>
> AMD C1E patch 'force_amd_c1e' (disabling the workaround):
>
> hpet0: at MMIO 0xfed00000, IRQs 2, 8, 31
> hpet0: 3 32-bit timers, 25000000 Hz
> Time: hpet clocksource has been installed.
> Clockevents: could not switch to one-shot mode: lapic is not
> functional. Could not switch to high resolution mode on CPU 0
> Clockevents: could not switch to one-shot mode: lapic is not
> functional. Could not switch to high resolution mode on CPU 1
>
> And with the C1E workaround enabled, the kernel doesn't whine that
> the lapic isn't functional.
>
> Those were the kernel messages I was trying to paste in my earlier
> message, saying that the lapic works. Do I have a DV6000 laptop that
> is a strange beast all in itself?
These two patches are not related in any way, they address two
different problems. My patch enables dynticks (and those messages don't
show up anymore), but it doesn't prevent your computer from locking up
as you said before.
Eduard-Gabriel Munteanu wrote:
> On Sun, 30 Dec 2007 15:57:59 -0500
> Richard Harman <[email protected]> wrote:
>
>> I think I may have a monkey wrench to throw into this, I finally got
>> around to testing the C1E patch, and the port80 patch. End result:
>> port80 patch has zero effect on this laptop, and the C1E patch makes
>> it stable.
>>
>> AMD C1E patch 'force_amd_c1e' (disabling the workaround):
>>
>> hpet0: at MMIO 0xfed00000, IRQs 2, 8, 31
>> hpet0: 3 32-bit timers, 25000000 Hz
>> Time: hpet clocksource has been installed.
>> Clockevents: could not switch to one-shot mode: lapic is not
>> functional. Could not switch to high resolution mode on CPU 0
>> Clockevents: could not switch to one-shot mode: lapic is not
>> functional. Could not switch to high resolution mode on CPU 1
>>
>> And with the C1E workaround enabled, the kernel doesn't whine that
>> the lapic isn't functional.
>>
>> Those were the kernel messages I was trying to paste in my earlier
>> message, saying that the lapic works. Do I have a DV6000 laptop that
>> is a strange beast all in itself?
>
> These two patches are not related in any way, they address two
> different problems. My patch enables dynticks (and those messages don't
> show up anymore), but it doesn't prevent your computer from locking up
> as you said before.
>
The C1E patch, which permits the lapic to function *does* make my system
stable. My previous method of testing (using USB peripherals) and
checking /proc/interrupts for ERRor interrupts so far hasn't caused the
system to lock up.
On Sun, 30 Dec 2007 08:36:26 -0500
Richard Harman <[email protected]> wrote:
> The C1E patch, which permits the lapic to function *does* make my
> system stable. My previous method of testing (using USB peripherals)
> and checking /proc/interrupts for ERRor interrupts so far hasn't
> caused the system to lock up.
Let me put it another way: have you actually experienced any lockups
with a vanilla (that is, with no patches applied) 2.6.24-rc6 kernel?
Try first with "force_amd_c1e". So far, all I see is that you're
getting nohz/dynticks working with my patch, and that's okay, it's the
expected behavior. But I seriously doubt that it's my patch that keeps
your system from locking up. I'm not bugging you just to clarify this
matter, but if your system locked up iff (as in both necessary and
sufficient) the patch was _not_ applied, that would be a really serious
issue.
Richard Harman wrote:
>
>>> I think I may have a monkey wrench to throw into this, I finally got
>>> around to testing the C1E patch, and the port80 patch. End result:
>>> port80 patch has zero effect on this laptop, and the C1E patch makes
>>> it stable.
>>>
>
Stating that your system is "stable" is not very definitive. Does
hwclock work when full Fedora 8 is running without the port80 patch, or
have you disabled the uses of hwclock in your init and shutdown code?
Have you set the hwclock setting to use the extremely dangerous
"-directisa" option - which hides the problem because it avoids the port
80 i/o?
Try compiling and running the test program port80.c a few times. If
your machine doesn't hang, it would be interesting to see the results it
gives.
The C1E patch alone does not fix the port 80 problem several of us have
observed. what does dmidecode say for your motherboard vendor and model?
> As far as I can see, the kernel doesn't behave as it would be, IMO,
> normal. I do have HPETs and Linux detects them without any
> need for hpet=force (HPET is registered for clockevents), but keeps
> LAPIC as the only option for dynticks. It looks like timing devices are
> rated and then only one of them is selected for dynticks. But when that
LAPIC is preferred because it is cheaper to reprogram, but if LAPIC
doesn't work (e.g. due to AMD C1E) it should fall back to HPET.
> Linux does not fallback to the next best timer, as dynticks is provided
> with only one such device, hence the message "lapic is not functional"
> is shown. In fact, this selection process should be fixed.
Yes it should. Something must be indeed wrong.
> approach wasn't correct, but please tell me: is HPET-driven dynticks
> code working? (I'm quite confused, as HPET should wake the CPUs even
AFAIK it should work, but I haven't tested it myself. Thomas G.
should know details.
> With my patch (the one that disables C1E), powertop shows at most 10-11
> wakeups per second when idle (no X server running). Isn't it
> reasonable to expect a significant improvement over HZ=100?
Did you measure the actual power? Minimal wakeups is not the ultimate goal,
the ultimate goal is to save power. Minimal wakeups is just a measure to that
-Andi
>