On some machines, buggy BIOSes don't properly setup WB MTRRs to
cover all available RAM, meaning the last few megs (or even gigs)
of memory will be marked uncached. Since Linux tends to allocate
from high memory addresses first, this causes the machine to be
unusably slow as soon as the kernel starts really using memory
(i.e. right around init time).
This patch works around the problem by scanning the MTRRs at
boot and figuring out whether the current end_pfn value (setup
by early e820 code) goes beyond the highest WB MTRR range, and
if so, trimming it to match. A fairly obnoxious KERN_WARNING
is printed too, letting the user know that not all of their
memory is available due to a likely BIOS bug.
Something similar could be done on i386 if needed, but the boot
ordering would be slightly different, since the MTRR code on i386
depends on the boot_cpu_data structure being setup.
Justin, can you please test and make sure this patch works for
you too? It'll only work around the problem, but it's better
than having to do mem= by hand or waiting for a fix from your
BIOS vendor.
Thanks,
Jesse
Signed-off-by: Jesse Barnes <[email protected]>
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index c4ebb51..71fc768 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -13,7 +13,7 @@
#include "mtrr.h"
struct mtrr_state {
- struct mtrr_var_range *var_ranges;
+ struct mtrr_var_range var_ranges[NUM_VAR_RANGES];
mtrr_type fixed_ranges[NUM_FIXED_RANGES];
unsigned char enabled;
unsigned char have_fixed;
@@ -84,12 +84,6 @@ void get_mtrr_state(void)
struct mtrr_var_range *vrs;
unsigned lo, dummy;
- if (!mtrr_state.var_ranges) {
- mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
- GFP_KERNEL);
- if (!mtrr_state.var_ranges)
- return;
- }
vrs = mtrr_state.var_ranges;
rdmsr(MTRRcap_MSR, lo, dummy);
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index c7d8f17..d7922ce 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -12,7 +12,7 @@
#include "mtrr.h"
/* RED-PEN: this is accessed without any locking */
-extern unsigned int *usage_table;
+extern unsigned int usage_table[];
#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 1cf466d..c133856 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -38,8 +38,8 @@
#include <linux/cpu.h>
#include <linux/mutex.h>
+#include <asm/e820.h>
#include <asm/mtrr.h>
-
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -47,7 +47,7 @@
u32 num_var_ranges = 0;
-unsigned int *usage_table;
+unsigned int usage_table[NUM_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
@@ -121,11 +121,6 @@ static void __init init_table(void)
int i, max;
max = num_var_ranges;
- if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
- == NULL) {
- printk(KERN_ERR "mtrr: could not allocate\n");
- return;
- }
for (i = 0; i < max; i++)
usage_table[i] = 1;
}
@@ -589,16 +584,11 @@ struct mtrr_value {
unsigned long lsize;
};
-static struct mtrr_value * mtrr_state;
+static struct mtrr_value mtrr_state[NUM_VAR_RANGES];
static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
{
int i;
- int size = num_var_ranges * sizeof(struct mtrr_value);
-
- mtrr_state = kzalloc(size,GFP_ATOMIC);
- if (!mtrr_state)
- return -ENOMEM;
for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i,
@@ -620,7 +610,6 @@ static int mtrr_restore(struct sys_device * sysdev)
mtrr_state[i].lsize,
mtrr_state[i].ltype);
}
- kfree(mtrr_state);
return 0;
}
@@ -631,6 +620,42 @@ static struct sysdev_driver mtrr_sysdev_driver = {
.resume = mtrr_restore,
};
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ *
+ * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
+ * memory configurations. This routine checks to make sure the MTRRs having
+ * a write back type cover all of the memory the kernel is intending to use.
+ * If not, it'll trim any memory off the end by adjusting end_pfn, removing
+ * it from the kernel's allocation pools, warning the user with an obnoxious
+ * message.
+ */
+void __init mtrr_trim_uncached_memory(void)
+{
+ unsigned long i, base, size, highest_addr = 0;
+ mtrr_type type;
+
+ /* Find highest cached pfn */
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base <<= PAGE_SHIFT;
+ size <<= PAGE_SHIFT;
+ if (highest_addr < base + size)
+ highest_addr = base + size;
+ }
+
+ if ((highest_addr >> PAGE_SHIFT) != end_pfn) {
+ printk(KERN_WARNING "***************\n");
+ printk(KERN_WARNING "**** WARNING: likely BIOS bug\n");
+ printk(KERN_WARNING "**** MTRRs don't cover all of "
+ "memory, trimmed %ld pages\n", end_pfn -
+ (highest_addr >> PAGE_SHIFT));
+ printk(KERN_WARNING "***************\n");
+ end_pfn = highest_addr >> PAGE_SHIFT;
+ }
+}
/**
* mtrr_bp_init - initialize mtrrs on the boot CPU
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index 289dfe6..a29dcba 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -14,6 +14,7 @@
#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
#define NUM_FIXED_RANGES 88
+#define NUM_VAR_RANGES 256
#define MTRRfix64K_00000_MSR 0x250
#define MTRRfix16K_80000_MSR 0x258
#define MTRRfix16K_A0000_MSR 0x259
diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c
index c3c6b91..c138eac 100644
--- a/arch/x86_64/kernel/bugs.c
+++ b/arch/x86_64/kernel/bugs.c
@@ -14,7 +14,6 @@
void __init check_bugs(void)
{
identify_cpu(&boot_cpu_data);
- mtrr_bp_init();
#if !defined(CONFIG_SMP)
printk("CPU: ");
print_cpu_info(&boot_cpu_data);
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index eb6524f..409b63c 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -266,6 +266,10 @@ void __init setup_arch(char **cmdline_p)
* we are rounding upwards:
*/
end_pfn = e820_end_of_ram();
+ /* Trim memory not covered by WB MTRRs */
+ mtrr_bp_init();
+ mtrr_trim_uncached_memory();
+
num_physpages = end_pfn;
check_efer();
diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h
index b557c48..cc62bd8 100644
--- a/include/asm-x86_64/mtrr.h
+++ b/include/asm-x86_64/mtrr.h
@@ -78,6 +78,7 @@ extern int mtrr_add_page (unsigned long base, unsigned long size,
unsigned int type, char increment);
extern int mtrr_del (int reg, unsigned long base, unsigned long size);
extern int mtrr_del_page (int reg, unsigned long base, unsigned long size);
+extern void mtrr_trim_uncached_memory(void);
# else
static __inline__ int mtrr_add (unsigned long base, unsigned long size,
unsigned int type, char increment)
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On some machines, buggy BIOSes don't properly setup WB MTRRs to
> cover all available RAM, meaning the last few megs (or even gigs)
> of memory will be marked uncached. Since Linux tends to allocate
> from high memory addresses first, this causes the machine to be
> unusably slow as soon as the kernel starts really using memory
> (i.e. right around init time).
>
> This patch works around the problem by scanning the MTRRs at
> boot and figuring out whether the current end_pfn value (setup
> by early e820 code) goes beyond the highest WB MTRR range, and
> if so, trimming it to match. A fairly obnoxious KERN_WARNING
> is printed too, letting the user know that not all of their
> memory is available due to a likely BIOS bug.
>
> Something similar could be done on i386 if needed, but the boot
> ordering would be slightly different, since the MTRR code on i386
> depends on the boot_cpu_data structure being setup.
>
> Justin, can you please test and make sure this patch works for
> you too? It'll only work around the problem, but it's better
> than having to do mem= by hand or waiting for a fix from your
> BIOS vendor.
>
> Thanks,
> Jesse
Against what kernel version does this patch apply?
linux-2.6.21# patch -p1 < ../mtrr.patch
patching file arch/i386/kernel/cpu/mtrr/generic.c
Hunk #2 succeeded at 66 (offset -18 lines).
patching file arch/i386/kernel/cpu/mtrr/if.c
patching file arch/i386/kernel/cpu/mtrr/main.c
patching file arch/i386/kernel/cpu/mtrr/mtrr.h
can't find file to patch at input line 160
Perhaps you used the wrong -p or --strip option?
The text leading up to this was:
--------------------------
|diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c
|index c3c6b91..c138eac 100644
|--- a/arch/x86_64/kernel/bugs.c
|+++ b/arch/x86_64/kernel/bugs.c
--------------------------
File to patch:
On Wednesday, June 6, 2007 1:26 pm Justin Piszcz wrote:
> On Wed, 6 Jun 2007, Jesse Barnes wrote:
> > On some machines, buggy BIOSes don't properly setup WB MTRRs to
> > cover all available RAM, meaning the last few megs (or even gigs)
> > of memory will be marked uncached. Since Linux tends to allocate
> > from high memory addresses first, this causes the machine to be
> > unusably slow as soon as the kernel starts really using memory
> > (i.e. right around init time).
> >
> > This patch works around the problem by scanning the MTRRs at
> > boot and figuring out whether the current end_pfn value (setup
> > by early e820 code) goes beyond the highest WB MTRR range, and
> > if so, trimming it to match. A fairly obnoxious KERN_WARNING
> > is printed too, letting the user know that not all of their
> > memory is available due to a likely BIOS bug.
> >
> > Something similar could be done on i386 if needed, but the boot
> > ordering would be slightly different, since the MTRR code on i386
> > depends on the boot_cpu_data structure being setup.
> >
> > Justin, can you please test and make sure this patch works for
> > you too? It'll only work around the problem, but it's better
> > than having to do mem= by hand or waiting for a fix from your
> > BIOS vendor.
> >
> > Thanks,
> > Jesse
>
> Against what kernel version does this patch apply?
Um... git as of b4946ffb1860597b187d78d61ac6504177eb0ff8. Sorry I
should have updated before spinning the patch (will do now).
Jesse
On Wednesday, June 6, 2007 1:28 pm Jesse Barnes wrote:
> > Against what kernel version does this patch apply?
>
> Um... git as of b4946ffb1860597b187d78d61ac6504177eb0ff8. Sorry I
> should have updated before spinning the patch (will do now).
Appears to apply cleanly to git head as of a minute ago too.
Jesse
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 1:28 pm Jesse Barnes wrote:
>>> Against what kernel version does this patch apply?
>>
>> Um... git as of b4946ffb1860597b187d78d61ac6504177eb0ff8. Sorry I
>> should have updated before spinning the patch (will do now).
>
> Appears to apply cleanly to git head as of a minute ago too.
>
> Jesse
>
Can you produce a patch against 2.6.22-rc4 or 2.6.21 so I and other people
can easily try it? I do not have git installed on this machine, thanks.
Justin.
On Wednesday, June 6, 2007 1:37 pm Justin Piszcz wrote:
> On Wed, 6 Jun 2007, Jesse Barnes wrote:
> > On Wednesday, June 6, 2007 1:28 pm Jesse Barnes wrote:
> >>> Against what kernel version does this patch apply?
> >>
> >> Um... git as of b4946ffb1860597b187d78d61ac6504177eb0ff8. Sorry I
> >> should have updated before spinning the patch (will do now).
> >
> > Appears to apply cleanly to git head as of a minute ago too.
> >
> > Jesse
>
> Can you produce a patch against 2.6.22-rc4 or 2.6.21 so I and other
> people can easily try it? I do not have git installed on this
> machine, thanks.
Seems to apply cleanly to 2.6.22-rc4 too. Haven't tested that though.
Jesse
Will give it a shot.
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 1:37 pm Justin Piszcz wrote:
>> On Wed, 6 Jun 2007, Jesse Barnes wrote:
>>> On Wednesday, June 6, 2007 1:28 pm Jesse Barnes wrote:
>>>>> Against what kernel version does this patch apply?
>>>>
>>>> Um... git as of b4946ffb1860597b187d78d61ac6504177eb0ff8. Sorry I
>>>> should have updated before spinning the patch (will do now).
>>>
>>> Appears to apply cleanly to git head as of a minute ago too.
>>>
>>> Jesse
>>
>> Can you produce a patch against 2.6.22-rc4 or 2.6.21 so I and other
>> people can easily try it? I do not have git installed on this
>> machine, thanks.
>
> Seems to apply cleanly to 2.6.22-rc4 too. Haven't tested that though.
>
> Jesse
>
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On some machines, buggy BIOSes don't properly setup WB MTRRs to
> cover all available RAM, meaning the last few megs (or even gigs)
> of memory will be marked uncached. Since Linux tends to allocate
> from high memory addresses first, this causes the machine to be
> unusably slow as soon as the kernel starts really using memory
> (i.e. right around init time).
>
> This patch works around the problem by scanning the MTRRs at
> boot and figuring out whether the current end_pfn value (setup
> by early e820 code) goes beyond the highest WB MTRR range, and
> if so, trimming it to match. A fairly obnoxious KERN_WARNING
> is printed too, letting the user know that not all of their
> memory is available due to a likely BIOS bug.
>
> Something similar could be done on i386 if needed, but the boot
> ordering would be slightly different, since the MTRR code on i386
> depends on the boot_cpu_data structure being setup.
>
> Justin, can you please test and make sure this patch works for
> you too? It'll only work around the problem, but it's better
> than having to do mem= by hand or waiting for a fix from your
> BIOS vendor.
>
> Thanks,
> Jesse
Jesse, it worked.
With mem=8832M (without your patch): 2.6.22-rc4:
top - 17:39:02 up 1 day, 8:07, 25 users, load average: 2.33, 0.76, 0.30
Tasks: 325 total, 11 running, 314 sleeping, 0 stopped, 0 zombie
Cpu(s): 80.0%us, 20.0%sy, 0.0%ni, 0.0%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st
Mem: 8039620k total, 7936472k used, 103148k free, 708k buffers
Swap: 16787768k total, 128k used, 16787640k free, 6646248k cached
With no mem= in append line (with your patch): 2.6.22-rc4:
top - 17:44:01 up 1 min, 1 user, load average: 0.97, 0.25, 0.08
Tasks: 145 total, 1 running, 144 sleeping, 0 stopped, 0 zombie
Cpu(s): 5.2%us, 3.0%sy, 1.2%ni, 86.8%id, 3.8%wa, 0.0%hi, 0.0%si, 0.0%st
Mem: 8039608k total, 969380k used, 7070228k free, 1232k buffers
Swap: 16787768k total, 0k used, 16787768k free, 109448k cached
Odd, remote netconsole did not capture the dmesg the E820 memory map.
Jun 6 17:43:03 p34 [ 53.598611] ahci 0000:00:1f.2: AHCI 0001.0100 32 slots 6 ports 3 Gbps 0x3f impl SATA mode
Jun 6 17:43:03 p34 [ 53.598683] ahci 0000:00:1f.2: flags: 64bit ncq led clo pio slum part
Jun 6 17:43:03 p34 [ 53.598986] scsi0 : ahci
Jun 6 17:43:03 p34 [ 53.599131] scsi1 : ahci
Jun 6 17:43:03 p34 [ 53.599239] scsi2 : ahci
Jun 6 17:43:03 p34 [ 53.599340] scsi3 : ahci
Jun 6 17:43:03 p34 [ 53.599438] scsi4 : ahci
I will run with this patch for a while but so far, no issues, everything
looks great.
Will it make it into 2.6.22-rc5? :)
Justin.
Mem: 8039620k total, 7936472k used, 103148k free, 708k buffers
Mem: 8039608k total, 969380k used, 7070228k free, 1232k buffers
I am curious, why does the patch != the mem=8832M?
Justin.
On Wed, 6 Jun 2007, Justin Piszcz wrote:
>
>
> On Wed, 6 Jun 2007, Jesse Barnes wrote:
>
>> On some machines, buggy BIOSes don't properly setup WB MTRRs to
>> cover all available RAM, meaning the last few megs (or even gigs)
>> of memory will be marked uncached. Since Linux tends to allocate
>> from high memory addresses first, this causes the machine to be
>> unusably slow as soon as the kernel starts really using memory
>> (i.e. right around init time).
>>
>> This patch works around the problem by scanning the MTRRs at
>> boot and figuring out whether the current end_pfn value (setup
>> by early e820 code) goes beyond the highest WB MTRR range, and
>> if so, trimming it to match. A fairly obnoxious KERN_WARNING
>> is printed too, letting the user know that not all of their
>> memory is available due to a likely BIOS bug.
>>
>> Something similar could be done on i386 if needed, but the boot
>> ordering would be slightly different, since the MTRR code on i386
>> depends on the boot_cpu_data structure being setup.
>>
>> Justin, can you please test and make sure this patch works for
>> you too? It'll only work around the problem, but it's better
>> than having to do mem= by hand or waiting for a fix from your
>> BIOS vendor.
>>
>> Thanks,
>> Jesse
>
> Jesse, it worked.
>
> With mem=8832M (without your patch): 2.6.22-rc4:
>
> top - 17:39:02 up 1 day, 8:07, 25 users, load average: 2.33, 0.76, 0.30
> Tasks: 325 total, 11 running, 314 sleeping, 0 stopped, 0 zombie
> Cpu(s): 80.0%us, 20.0%sy, 0.0%ni, 0.0%id, 0.0%wa, 0.0%hi, 0.0%si,
> 0.0%st
> Mem: 8039620k total, 7936472k used, 103148k free, 708k buffers
> Swap: 16787768k total, 128k used, 16787640k free, 6646248k cached
>
> With no mem= in append line (with your patch): 2.6.22-rc4:
>
> top - 17:44:01 up 1 min, 1 user, load average: 0.97, 0.25, 0.08
> Tasks: 145 total, 1 running, 144 sleeping, 0 stopped, 0 zombie
> Cpu(s): 5.2%us, 3.0%sy, 1.2%ni, 86.8%id, 3.8%wa, 0.0%hi, 0.0%si,
> 0.0%st
> Mem: 8039608k total, 969380k used, 7070228k free, 1232k buffers
> Swap: 16787768k total, 0k used, 16787768k free, 109448k cached
>
> Odd, remote netconsole did not capture the dmesg the E820 memory map.
>
> Jun 6 17:43:03 p34 [ 53.598611] ahci 0000:00:1f.2: AHCI 0001.0100 32 slots
> 6 ports 3 Gbps 0x3f impl SATA mode Jun 6 17:43:03 p34 [ 53.598683] ahci
> 0000:00:1f.2: flags: 64bit ncq led clo pio slum part Jun 6 17:43:03 p34 [
> 53.598986] scsi0 : ahci Jun 6 17:43:03 p34 [ 53.599131] scsi1 : ahci Jun
> 6 17:43:03 p34 [ 53.599239] scsi2 : ahci Jun 6 17:43:03 p34 [ 53.599340]
> scsi3 : ahci Jun 6 17:43:03 p34 [ 53.599438] scsi4 : ahci
>
> I will run with this patch for a while but so far, no issues, everything
> looks great.
>
> Will it make it into 2.6.22-rc5? :)
>
> Justin.
>
On Wednesday, June 6, 2007 3:03 pm Justin Piszcz wrote:
> Mem: 8039620k total, 7936472k used, 103148k free, 708k
> buffers Mem: 8039608k total, 969380k used, 7070228k free,
> 1232k buffers
>
> I am curious, why does the patch != the mem=8832M?
I'm not sure... can you post your e820 map from boot and the contents
of /proc/mtrr? Maybe my patch is trimming off a few too many pages, or
maybe 8832M isn't quite right and actually ends up leaving you with a
few uncached pages.
Jesse
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 3:03 pm Justin Piszcz wrote:
>> Mem: 8039620k total, 7936472k used, 103148k free, 708k
>> buffers Mem: 8039608k total, 969380k used, 7070228k free,
>> 1232k buffers
>>
>> I am curious, why does the patch != the mem=8832M?
>
> I'm not sure... can you post your e820 map from boot and the contents
> of /proc/mtrr? Maybe my patch is trimming off a few too many pages, or
> maybe 8832M isn't quite right and actually ends up leaving you with a
> few uncached pages.
>
> Jesse
>
I cannot post the E820 memory map, I have no way to capture it, I cannot
get anything from netconsole and dmesg does not show it either.
BEFORE:
reg00: base=0x00000000 ( 0MB), size=2048MB: write-back, count=1
reg01: base=0x80000000 (2048MB), size=1024MB: write-back, count=1
reg02: base=0xc0000000 (3072MB), size= 256MB: write-back, count=1
reg03: base=0xcf800000 (3320MB), size= 8MB: uncachable, count=1
reg04: base=0xcf700000 (3319MB), size= 1MB: uncachable, count=1
reg05: base=0x100000000 (4096MB), size=4096MB: write-back, count=1
reg06: base=0x200000000 (8192MB), size= 512MB: write-back, count=1
reg07: base=0x220000000 (8704MB), size= 128MB: write-back, count=1
AFTER:
$ cat /proc/mtrr
reg00: base=0x00000000 ( 0MB), size=2048MB: write-back, count=1
reg01: base=0x80000000 (2048MB), size=1024MB: write-back, count=1
reg02: base=0xc0000000 (3072MB), size= 256MB: write-back, count=1
reg03: base=0xcf800000 (3320MB), size= 8MB: uncachable, count=1
reg04: base=0xcf700000 (3319MB), size= 1MB: uncachable, count=1
reg05: base=0x100000000 (4096MB), size=4096MB: write-back, count=1
reg06: base=0x200000000 (8192MB), size= 512MB: write-back, count=1
reg07: base=0x220000000 (8704MB), size= 128MB: write-back, count=1
But that still works.
Justin.
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 3:03 pm Justin Piszcz wrote:
>> Mem: 8039620k total, 7936472k used, 103148k free, 708k
>> buffers Mem: 8039608k total, 969380k used, 7070228k free,
>> 1232k buffers
>>
>> I am curious, why does the patch != the mem=8832M?
>
> I'm not sure... can you post your e820 map from boot and the contents
> of /proc/mtrr? Maybe my patch is trimming off a few too many pages, or
> maybe 8832M isn't quite right and actually ends up leaving you with a
> few uncached pages.
>
> Jesse
>
Unless you know of some other way I can capture the output, it only starts
showing the dmesg from [50..] onward.
Justin.
On Wednesday, June 6, 2007 3:13 pm Justin Piszcz wrote:
> On Wed, 6 Jun 2007, Jesse Barnes wrote:
> > On Wednesday, June 6, 2007 3:03 pm Justin Piszcz wrote:
> >> Mem: 8039620k total, 7936472k used, 103148k free, 708k
> >> buffers Mem: 8039608k total, 969380k used, 7070228k free,
> >> 1232k buffers
> >>
> >> I am curious, why does the patch != the mem=8832M?
> >
> > I'm not sure... can you post your e820 map from boot and the
> > contents of /proc/mtrr? Maybe my patch is trimming off a few too
> > many pages, or maybe 8832M isn't quite right and actually ends up
> > leaving you with a few uncached pages.
> >
> > Jesse
>
> Unless you know of some other way I can capture the output, it only
> starts showing the dmesg from [50..] onward.
Did you boot the kernel with the 'debug' option? Maybe your dmesg
buffer is too small (there's a config option for that iirc).
Jesse
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 3:13 pm Justin Piszcz wrote:
>> On Wed, 6 Jun 2007, Jesse Barnes wrote:
>>> On Wednesday, June 6, 2007 3:03 pm Justin Piszcz wrote:
>>>> Mem: 8039620k total, 7936472k used, 103148k free, 708k
>>>> buffers Mem: 8039608k total, 969380k used, 7070228k free,
>>>> 1232k buffers
>>>>
>>>> I am curious, why does the patch != the mem=8832M?
>>>
>>> I'm not sure... can you post your e820 map from boot and the
>>> contents of /proc/mtrr? Maybe my patch is trimming off a few too
>>> many pages, or maybe 8832M isn't quite right and actually ends up
>>> leaving you with a few uncached pages.
>>>
>>> Jesse
>>
>> Unless you know of some other way I can capture the output, it only
>> starts showing the dmesg from [50..] onward.
>
> Did you boot the kernel with the 'debug' option? Maybe your dmesg
> buffer is too small (there's a config option for that iirc).
>
> Jesse
>
Nope, I booted with only netconsole= options. I have a lot of HW in the
box and I guess the buffer is too small. Not sure where to change it in
the kernel. Looking..
On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
> Nope, I booted with only netconsole= options. I have a lot of HW in
> the box and I guess the buffer is too small. Not sure where to
> change it in the kernel. Looking..
It's called "kernel log buffer size" and it's in "General setup".
Jesse
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
>> Nope, I booted with only netconsole= options. I have a lot of HW in
>> the box and I guess the buffer is too small. Not sure where to
>> change it in the kernel. Looking..
>
> It's called "kernel log buffer size" and it's in "General setup".
>
> Jesse
>
I was reviewing some OLD mailing list archives then!
--------------
Per: http://www.edlug.ed.ac.uk/archive/Aug2003/msg00270.html
The kernel has a cyclic buffer of length LOG_BUF_LEN (4096, since 1.3.54:
8192, since 2.1.113: 16384; in
recent kernels the size can be set at compile time) in which
messages
given as argument to the kernel
function printk() are stored (regardless of their loglevel).
So it doesn't look like you can without recompiling...
Bruce
--------------
Getting you the E820 memory map in a few moments.
Changed from 15 -> 16.
| | (16) Kernel log buffer size (16 => 64KB, 17 => 128KB)
Justin.
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
>> Nope, I booted with only netconsole= options. I have a lot of HW in
>> the box and I guess the buffer is too small. Not sure where to
>> change it in the kernel. Looking..
>
> It's called "kernel log buffer size" and it's in "General setup".
>
> Jesse
>
Here ya go (and full dmesg attached)
[ 0.000000] Linux version 2.6.22-rc4 ([email protected]) (gcc
version 4.1.2 20061115 (prerelease) (Debian 4.1.1-21)) #3 SMP Wed Jun 6
18:30:43 EDT 2007
[ 0.000000] Command line: auto BOOT_IMAGE=2.6.22-rc4-3 ro root=902
[email protected]/eth0,[email protected]/00:50:8D:ED:3C:E7
[ 0.000000] BIOS-provided physical RAM map:
[ 0.000000] BIOS-e820: 0000000000000000 - 000000000008f000 (usable)
[ 0.000000] BIOS-e820: 000000000008f000 - 00000000000a0000
(reserved)
[ 0.000000] BIOS-e820: 00000000000e0000 - 0000000000100000
(reserved)
[ 0.000000] BIOS-e820: 0000000000100000 - 00000000cf58f000 (usable)
[ 0.000000] BIOS-e820: 00000000cf58f000 - 00000000cf59c000
(reserved)
[ 0.000000] BIOS-e820: 00000000cf59c000 - 00000000cf653000 (usable)
[ 0.000000] BIOS-e820: 00000000cf653000 - 00000000cf6a5000 (ACPI
NVS)
[ 0.000000] BIOS-e820: 00000000cf6a5000 - 00000000cf6a8000 (ACPI
data)
[ 0.000000] BIOS-e820: 00000000cf6a8000 - 00000000cf6ef000 (ACPI
NVS)
[ 0.000000] BIOS-e820: 00000000cf6ef000 - 00000000cf6f1000 (ACPI
data)
[ 0.000000] BIOS-e820: 00000000cf6f1000 - 00000000cf6f2000 (usable)
[ 0.000000] BIOS-e820: 00000000cf6f2000 - 00000000cf6ff000 (ACPI
data)
[ 0.000000] BIOS-e820: 00000000cf6ff000 - 00000000cf700000 (usable)
[ 0.000000] BIOS-e820: 00000000cf700000 - 00000000d0000000
(reserved)
[ 0.000000] BIOS-e820: 00000000fff00000 - 0000000100000000
(reserved)
[ 0.000000] BIOS-e820: 0000000100000000 - 000000022c000000 (usable)
[ 0.000000] Entering add_active_range(0, 0, 143) 0 entries of 256
used
[ 0.000000] Entering add_active_range(0, 256, 849295) 1 entries of
256 used
[ 0.000000] Entering add_active_range(0, 849308, 849491) 2 entries of
256 used
[ 0.000000] Entering add_active_range(0, 849649, 849650) 3 entries of
256 used
[ 0.000000] Entering add_active_range(0, 849663, 849664) 4 entries of
256 used
[ 0.000000] Entering add_active_range(0, 1048576, 2277376) 5 entries
of 256 used
[ 0.000000] end_pfn_map = 2277376
[ 0.000000] ***************
[ 0.000000] **** WARNING: likely BIOS bug
[ 0.000000] **** MTRRs don't cover all of memory, trimmed 16384 pages
[ 0.000000] ***************
[ 0.000000] DMI 2.4 present.
[ 0.000000] ACPI: RSDP 000FE020, 0014 (r0 INTEL )
[ 0.000000] ACPI: RSDT CF6FD038, 0050 (r1 INTEL DG965WH 64C
1000013)
[ 0.000000] ACPI: FACP CF6FC000, 0074 (r1 INTEL DG965WH 64C
MSFT 1000013)
[ 0.000000] ACPI: DSDT CF6F7000, 40E9 (r1 INTEL DG965WH 64C
MSFT 1000013)
[ 0.000000] ACPI: FACS CF6A8000, 0040
[ 0.000000] ACPI: APIC CF6F6000, 0078 (r1 INTEL DG965WH 64C
MSFT 1000013)
[ 0.000000] ACPI: WDDT CF6F5000, 0040 (r1 INTEL DG965WH 64C
MSFT 1000013)
[ 0.000000] ACPI: MCFG CF6F4000, 003C (r1 INTEL DG965WH 64C
MSFT 1000013)
[ 0.000000] ACPI: ASF! CF6F3000, 00A6 (r32 INTEL DG965WH 64C
MSFT 1000013)
[ 0.000000] ACPI: HPET CF6F2000, 0038 (r1 INTEL DG965WH 64C
MSFT 1000013)
[ 0.000000] ACPI: SSDT CF6F0000, 01BC (r1 INTEL CpuPm 64C
MSFT 1000013)
[ 0.000000] ACPI: SSDT CF6EF000, 0175 (r1 INTEL Cpu0Ist 64C
MSFT 1000013)
[ 0.000000] ACPI: SSDT CF6A7000, 0175 (r1 INTEL Cpu1Ist 64C
MSFT 1000013)
[ 0.000000] ACPI: SSDT CF6A6000, 0175 (r1 INTEL Cpu2Ist 64C
MSFT 1000013)
[ 0.000000] ACPI: SSDT CF6A5000, 0175 (r1 INTEL Cpu3Ist 64C
MSFT 1000013)
[ 0.000000] Entering add_active_range(0, 0, 143) 0 entries of 256
used
[ 0.000000] Entering add_active_range(0, 256, 849295) 1 entries of
256 used
[ 0.000000] Entering add_active_range(0, 849308, 849491) 2 entries of
256 used
[ 0.000000] Entering add_active_range(0, 849649, 849650) 3 entries of
256 used
[ 0.000000] Entering add_active_range(0, 849663, 849664) 4 entries of
256 used
[ 0.000000] Entering add_active_range(0, 1048576, 2260992) 5 entries
of 256 used
[ 0.000000] Zone PFN ranges:
[ 0.000000] DMA 0 -> 4096
[ 0.000000] DMA32 4096 -> 1048576
[ 0.000000] Normal 1048576 -> 2260992
[ 0.000000] early_node_map[6] active PFN ranges
[ 0.000000] 0: 0 -> 143
[ 0.000000] 0: 256 -> 849295
[ 0.000000] 0: 849308 -> 849491
[ 0.000000] 0: 849649 -> 849650
[ 0.000000] 0: 849663 -> 849664
[ 0.000000] 0: 1048576 -> 2260992
[ 0.000000] On node 0 totalpages: 2061783
[ 0.000000] DMA zone: 56 pages used for memmap
[ 0.000000] DMA zone: 1395 pages reserved
[ 0.000000] DMA zone: 2532 pages, LIFO batch:0
[ 0.000000] DMA32 zone: 14280 pages used for memmap
[ 0.000000] DMA32 zone: 831104 pages, LIFO batch:31
[ 0.000000] Normal zone: 16576 pages used for memmap
[ 0.000000] Normal zone: 1195840 pages, LIFO batch:31
[ 0.000000] ACPI: PM-Timer IO Port: 0x408
[ 0.000000] ACPI: Local APIC address 0xfee00000
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
>> Nope, I booted with only netconsole= options. I have a lot of HW in
>> the box and I guess the buffer is too small. Not sure where to
>> change it in the kernel. Looking..
>
> It's called "kernel log buffer size" and it's in "General setup".
>
> Jesse
>
Seems stable & fast so far, except when it hit swap :)
top - 18:38:13 up 5 min, 6 users, load average: 38.56, 12.65, 4.52
Tasks: 227 total, 64 running, 163 sleeping, 0 stopped, 0 zombie
Cpu0 : 0.0%us, 23.9%sy, 0.0%ni, 0.0%id, 72.7%wa, 0.0%hi, 3.4%si, 0.0%st
Cpu1 : 0.0%us, 23.5%sy, 0.0%ni, 1.0%id, 75.5%wa, 0.0%hi, 0.0%si, 0.0%st
Cpu2 : 0.0%us, 24.1%sy, 0.0%ni, 0.0%id, 75.9%wa, 0.0%hi, 0.0%si, 0.0%st
Cpu3 : 0.0%us, 23.6%sy, 0.0%ni, 0.0%id, 76.4%wa, 0.0%hi, 0.0%si, 0.0%st
Mem: 8039576k total, 7998988k used, 40588k free, 12k buffers
Swap: 16787768k total, 879324k used, 15908444k free, 16316k cached
Tested with
$ stress --vm 32 --vm-bytes 250M
and
$ stress --vm 64 --vm-bytes 250M
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
248 root 10 -5 0 0 0 D 18 0.0 0:07.81 kswapd0
3788 war 18 0 256m 115m 164 D 3 1.5 0:01.32 stress
3800 war 18 0 256m 7300 160 R 3 0.1 0:00.58 stress
3801 war 18 0 256m 34m 160 R 3 0.4 0:02.48 stress
3808 war 18 0 256m 217m 164 R 3 2.8 0:01.16 stress
559 root 10 -5 0 0 0 D 2 0.0 0:00.15 md0_raid1
3771 war 18 0 256m 174m 164 R 2 2.2 0:01.30 stress
3785 war 18 0 256m 30m 164 R 2 0.4 0:01.42 stress
3813 war 18 0 256m 34m 164 R 2 0.4 0:01.06 stress
3787 war 18 0 256m 24m 160 R 2 0.3 0:00.25 stress
3794 war 18 0 256m 40m 160 R 2 0.5 0:00.14 stress
3778 war 18 0 256m 109m 164 R 2 1.4 0:01.23 stress
3783 war 18 0 256m 185m 164 R 2 2.4 0:01.29 stress
3799 war 18 0 256m 172m 164 R 2 2.2 0:01.07 stress
3816 war 18 0 256m 203m 160 R 2 2.6 0:00.39 stress
3819 war 18 0 256m 144m 160 R 2 1.8 0:00.40 stress
3820 war 18 0 256m 147m 160 R 2 1.9 0:00.39 stress
3825 war 18 0 256m 91m 164 R 2 1.2 0:01.16 stress
3777 war 18 0 256m 29m 164 R 1 0.4 0:01.05 stress
3779 war 18 0 256m 197m 164 R 1 2.5 0:01.29 stress
3805 war 18 0 256m 216m 160 R 1 2.8 0:00.36 stress
3812 war 18 0 256m 30m 164 R 1 0.4 0:01.14 stress
3830 war 18 0 256m 176m 164 R 1 2.3 0:01.17 stress
3831 war 18 0 256m 64m 164 R 1 0.8 0:01.40 stress
3769 war 18 0 256m 43m 164 R 1 0.6 0:01.23 stress
3770 war 18 0 256m 169m 164 R 1 2.2 0:01.18 stress
3773 war 18 0 256m 44m 160 R 1 0.6 0:00.29 stress
3774 war 18 0 256m 21m 160 R 1 0.3 0:00.21 stress
3780 war 18 0 256m 167m 164 R 1 2.1 0:01.33 stress
3781 war 18 0 256m 143m 164 R 1 1.8 0:01.35 stress
3798 war 18 0 256m 153m 164 R 1 1.9 0:01.16 stress
3807 war 18 0 256m 31m 164 R 1 0.4 0:01.17 stress
3817 war 18 0 256m 185m 160 R 1 2.4 0:00.40 stress
3822 war 18 0 256m 200m 160 R 1 2.6 0:00.41 stress
3824 war 18 0 256m 8684 160 R 1 0.1 0:00.16 stress
3827 war 18 0 256m 103m 164 R 1 1.3 0:00.95 stress
3828 war 18 0 256m 1976 160 R 1 0.0 0:00.19 stress
2026 daemon 30 15 30580 436 372 D 1 0.0 0:02.10 bindgraph.pl
3768 war 18 0 256m 81m 164 R 1 1.0 0:01.84 stress
3772 war 18 0 256m 158m 164 R 1 2.0 0:01.11 stress
3782 war 18 0 256m 180m 164 R 1 2.3 0:01.25 stress
3786 war 18 0 256m 173m 164 R 1 2.2 0:01.13 stress
3790 war 18 0 256m 20m 164 R 1 0.3 0:01.10 stress
3791 war 18 0 256m 36m 164 R 1 0.5 0:01.80 stress
3796 war 18 0 256m 111m 164 R 1 1.4 0:01.04 stress
3797 war 18 0 256m 120m 164 R 1 1.5 0:01.12 stress
3803 war 18 0 256m 129m 160 R 1 1.6 0:01.98 stress
3804 war 18 0 256m 234m 160 R 1 3.0 0:00.40 stress
3806 war 18 0 256m 49m 164 R 1 0.6 0:00.99 stress
3809 war 18 0 256m 54m 164 R 1 0.7 0:01.16 stress
3814 war 18 0 256m 38m 164 R 1 0.5 0:00.97 stress
3815 war 18 0 256m 69m 164 R 1 0.9 0:01.06 stress
246 root 15 0 0 0 0 D 0 0.0 0:00.01 pdflush
3687 war 15 0 18012 896 560 R 0 0.0 0:00.88 top
3775 war 18 0 256m 153m 164 R 0 2.0 0:01.14 stress
3776 war 18 0 256m 55m 164 R 0 0.7 0:01.34 stress
3784 war 18 0 256m 13m 164 R 0 0.2 0:00.96 stress
3789 war 18 0 256m 46m 164 R 0 0.6 0:01.54 stress
3792 war 18 0 256m 12m 164 R 0 0.2 0:01.00 stress
3793 war 18 0 256m 44m 164 R 0 0.6 0:00.96 stress
3795 war 18 0 256m 196m 164 R 0 2.5 0:01.06 stress
3802 war 18 0 256m 22m 160 R 0 0.3 0:02.70 stress
3810 war 18 0 256m 40m 164 R 0 0.5 0:01.06 stress
3818 war 18 0 256m 146m 160 R 0 1.9 0:00.34 stress
3829 war 18 0 256m 130m 164 R 0 1.7 0:01.11 stress
247 root 15 0 0 0 0 D 0 0.0 0:00.01 pdflush
554 root 10 -5 0 0 0 D 0 0.0 0:00.01 md2_raid1
3811 war 18 0 256m 61m 160 R 0 0.8 0:05.85 stress
3821 war 18 0 256m 136m 160 R 0 1.7 0:00.37 stress
3823 war 18 0 256m 204m 160 R 0 2.6 0:00.34 stress
3826 war 18 0 256m 95m 164 R 0 1.2 0:01.03 stress
On Wed, 6 Jun 2007 15:28:43 -0700 Jesse Barnes wrote:
> On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
> > Nope, I booted with only netconsole= options. I have a lot of HW in
> > the box and I guess the buffer is too small. Not sure where to
> > change it in the kernel. Looking..
>
> It's called "kernel log buffer size" and it's in "General setup".
or you can just boot with "log_buf_len=256k" on the kernel boot line (e.g.)
---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
On Wed, 6 Jun 2007, Randy Dunlap wrote:
> On Wed, 6 Jun 2007 15:28:43 -0700 Jesse Barnes wrote:
>
>> On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
>>> Nope, I booted with only netconsole= options. I have a lot of HW in
>>> the box and I guess the buffer is too small. Not sure where to
>>> change it in the kernel. Looking..
>>
>> It's called "kernel log buffer size" and it's in "General setup".
>
> or you can just boot with "log_buf_len=256k" on the kernel boot line (e.g.)
>
> ---
> ~Randy
> *** Remember to use Documentation/SubmitChecklist when testing your code ***
>
Thanks, very useful note!
$ stress --vm 32 --vm-bytes 250M
stress: info: [4087] dispatching hogs: 0 cpu, 0 io, 32 vm, 0 hdd
Mem: 8039576k total, 5288036k used, 2751540k free, 12k buffers
Swap: 0k total, 0k used, 0k free, 37944k cached
$ stress --vm 40 --vm-bytes 250M
stress: info: [4120] dispatching hogs: 0 cpu, 0 io, 40 vm, 0 hdd
Mem: 8039576k total, 6661484k used, 1378092k free, 12k buffers
Swap: 0k total, 0k used, 0k free, 38000k cached
$ stress --vm 42 --vm-bytes 250M
stress: info: [4398] dispatching hogs: 0 cpu, 0 io, 42 vm, 0 hdd
Mem: 8039576k total, 6447048k used, 1592528k free, 8k buffers
Swap: 0k total, 0k used, 0k free, 11148k cached
$ stress --vm 45 --vm-bytes 250M
stress: info: [4352] dispatching hogs: 0 cpu, 0 io, 45 vm, 0 hdd
stress: FAIL: [4372] (494) hogvm malloc failed: Cannot allocate memory
stress: FAIL: [4352] (395) <-- worker 4372 returned error 1
stress: WARN: [4352] (397) now reaping child worker processes
stress: FAIL: [4352] (452) failed run completed in 4s
$ stress --vm 48 --vm-bytes 250M
stress: info: [4303] dispatching hogs: 0 cpu, 0 io, 48 vm, 0 hdd
stress: FAIL: [4323] (494) hogvm malloc failed: Cannot allocate memory
stress: FAIL: [4322] (494) hogvm malloc failed: Cannot allocate memory
stress: FAIL: [4303] (395) <-- worker 4323 returned error 1
stress: WARN: [4303] (397) now reaping child worker processes
stress: FAIL: [4303] (395) <-- worker 4322 returned error 1
stress: WARN: [4303] (397) now reaping child worker processes
stress: FAIL: [4303] (452) failed run completed in 4s
Is there a better way to verify I can use all the available memory?
Justin.
On Wed, 6 Jun 2007, Randy Dunlap wrote:
> On Wed, 6 Jun 2007 15:28:43 -0700 Jesse Barnes wrote:
>
>> On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
>>> Nope, I booted with only netconsole= options. I have a lot of HW in
>>> the box and I guess the buffer is too small. Not sure where to
>>> change it in the kernel. Looking..
>>
>> It's called "kernel log buffer size" and it's in "General setup".
>
> or you can just boot with "log_buf_len=256k" on the kernel boot line (e.g.)
>
> ---
> ~Randy
> *** Remember to use Documentation/SubmitChecklist when testing your code ***
>
Hm, not sure if it was from the patch or what but I ran this:
1. swapoff -a
2. ./eatmem
The machine responded to ping and alt-sysrq-b but the box remain
unresponsive, I guess the kernel did not kill the process? :(
The moments before it 'froze'
top - 18:48:01 up 15 min, 7 users, load average: 6.61, 18.50, 13.31
Tasks: 200 total, 18 running, 182 sleeping, 0 stopped, 0 zombie
Cpu(s): 0.0%us, 90.7%sy, 0.0%ni, 5.9%id, 3.3%wa, 0.0%hi, 0.0%si,
0.0%st
Mem: 8039576k total, 7998860k used, 40716k free, 8k buffers
Swap: 0k total, 0k used, 0k free, 1664k cached
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
248 root 11 -5 0 0 0 R 85 0.0 0:16.05 kswapd0
2265 nut 18 0 13320 244 4 R 40 0.0 0:03.13 newhidups
2267 nut 18 0 12216 168 4 R 40 0.0 0:02.04 upsd
2474 ntp 18 0 22192 400 8 R 39 0.0 0:02.00 ntpd
3563 jpiszcz 18 0 41964 1264 4 R 38 0.0 0:02.20 pine
3530 root 18 0 96240 3132 36 R 37 0.0 0:02.09 kdm_greet
2052 root 18 0 6080 112 4 R 37 0.0 0:02.00 hald-addon-stor
4479 war 17 0 18012 700 252 R 33 0.0 0:01.81 top
4480 war 19 0 6948m 6.8g 4 R 22 88.4 0:05.81 eatmem
2095 root 18 0 13128 216 8 R 10 0.0 0:00.50 dirmngr
2545 root 18 0 95788 2488 4 R 5 0.0 0:00.25 apache2
3564 war 18 0 41620 832 4 R 5 0.0 0:00.34 pine
2270 nut 15 0 12212 144 4 S 1 0.0 0:00.05 upsmon
561 root 10 -5 0 0 0 S 0 0.0 0:00.02 xfsbufd
Very simply program:
#include <iostream>
using namespace std;
int main()
{
long int interations = 10000000;
int counter = 1;
for(counter;counter<interations;counter++)
{
double *d = new double[100];
}
return 0;
}
Any idea why the OOM killer can or does not kill it?
Justin.
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
>> Nope, I booted with only netconsole= options. I have a lot of HW in
>> the box and I guess the buffer is too small. Not sure where to
>> change it in the kernel. Looking..
>
> It's called "kernel log buffer size" and it's in "General setup".
>
> Jesse
>
Did the dmesg output get you what you needed? Why the few KB difference?
:)
Justin.
On Wed, 6 Jun 2007 18:54:37 -0400 (EDT) Justin Piszcz wrote:
> Hm, not sure if it was from the patch or what but I ran this:
>
> 1. swapoff -a
> 2. ./eatmem
>
> The machine responded to ping and alt-sysrq-b but the box remain
> unresponsive, I guess the kernel did not kill the process? :(
>
> The moments before it 'froze'
>
> top - 18:48:01 up 15 min, 7 users, load average: 6.61, 18.50, 13.31
> Tasks: 200 total, 18 running, 182 sleeping, 0 stopped, 0 zombie
> Cpu(s): 0.0%us, 90.7%sy, 0.0%ni, 5.9%id, 3.3%wa, 0.0%hi, 0.0%si,
> 0.0%st
> Mem: 8039576k total, 7998860k used, 40716k free, 8k buffers
> Swap: 0k total, 0k used, 0k free, 1664k cached
>
> PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
> 248 root 11 -5 0 0 0 R 85 0.0 0:16.05 kswapd0
> 2265 nut 18 0 13320 244 4 R 40 0.0 0:03.13 newhidups
> 2267 nut 18 0 12216 168 4 R 40 0.0 0:02.04 upsd
> 2474 ntp 18 0 22192 400 8 R 39 0.0 0:02.00 ntpd
> 3563 jpiszcz 18 0 41964 1264 4 R 38 0.0 0:02.20 pine
> 3530 root 18 0 96240 3132 36 R 37 0.0 0:02.09 kdm_greet
> 2052 root 18 0 6080 112 4 R 37 0.0 0:02.00 hald-addon-stor
> 4479 war 17 0 18012 700 252 R 33 0.0 0:01.81 top
> 4480 war 19 0 6948m 6.8g 4 R 22 88.4 0:05.81 eatmem
> 2095 root 18 0 13128 216 8 R 10 0.0 0:00.50 dirmngr
> 2545 root 18 0 95788 2488 4 R 5 0.0 0:00.25 apache2
> 3564 war 18 0 41620 832 4 R 5 0.0 0:00.34 pine
> 2270 nut 15 0 12212 144 4 S 1 0.0 0:00.05 upsmon
> 561 root 10 -5 0 0 0 S 0 0.0 0:00.02 xfsbufd
>
> Very simply program:
>
> #include <iostream>
> using namespace std;
>
> int main()
> {
> long int interations = 10000000;
> int counter = 1;
>
> for(counter;counter<interations;counter++)
> {
> double *d = new double[100];
You usually have to access the allocated memory, like:
*d = 1.0;
for it to actually be allocated (AFAIK).
> }
>
> return 0;
> }
>
> Any idea why the OOM killer can or does not kill it?
What are the values of /proc/sys/vm/overcommit* ?
See Documentation/vm/overcommit-accounting .
---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
On Wed, 6 Jun 2007, Randy Dunlap wrote:
> On Wed, 6 Jun 2007 18:54:37 -0400 (EDT) Justin Piszcz wrote:
>
>> Hm, not sure if it was from the patch or what but I ran this:
>>
>> 1. swapoff -a
>> 2. ./eatmem
>>
>
> You usually have to access the allocated memory, like:
>
> *d = 1.0;
>
> for it to actually be allocated (AFAIK).
>
>> }
>>
>> return 0;
>> }
>>
>> Any idea why the OOM killer can or does not kill it?
>
> What are the values of /proc/sys/vm/overcommit* ?
>
> See Documentation/vm/overcommit-accounting .
They should be the defaults as I do not change them:
p34:~# find /proc/|grep -i overcommit
/proc/sys/vm/overcommit_memory
/proc/sys/vm/overcommit_ratio
find: /proc/5128: No such file or directory
p34:~# cat /proc/sys/vm/overcommit_memory
0
p34:~# cat /proc/sys/vm/overcommit_ratio
50
p34:~#
Comments?
On Wednesday, June 6, 2007 3:57 pm Justin Piszcz wrote:
> On Wed, 6 Jun 2007, Jesse Barnes wrote:
> > On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
> >> Nope, I booted with only netconsole= options. I have a lot of HW
> >> in the box and I guess the buffer is too small. Not sure where to
> >> change it in the kernel. Looking..
> >
> > It's called "kernel log buffer size" and it's in "General setup".
> >
> > Jesse
>
> Did the dmesg output get you what you needed? Why the few KB
> difference?
>
> :)
Yeah, looked at your e820 and your MTRR settings and I think my patch is
doing the right thing (i.e. trimming just the right amount of memory,
leaving you with as much as possible).
The mem= approach though looks slightly off, but I haven't looked at
x86_64's mem= handling to see why. From a high level though, adjusting
end_pfn is the right thing to do, since theoretically mem= could choose
to make holes in your low memory and keep your high memory in the
allocation pools (though it's not generally implemented this way).
Jesse
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 3:57 pm Justin Piszcz wrote:
>> On Wed, 6 Jun 2007, Jesse Barnes wrote:
>>> On Wednesday, June 6, 2007 3:26 pm Justin Piszcz wrote:
>>>> Nope, I booted with only netconsole= options. I have a lot of HW
>>>> in the box and I guess the buffer is too small. Not sure where to
>>>> change it in the kernel. Looking..
>>>
>>> It's called "kernel log buffer size" and it's in "General setup".
>>>
>>> Jesse
>>
>> Did the dmesg output get you what you needed? Why the few KB
>> difference?
>>
>> :)
>
> Yeah, looked at your e820 and your MTRR settings and I think my patch is
> doing the right thing (i.e. trimming just the right amount of memory,
> leaving you with as much as possible).
>
> The mem= approach though looks slightly off, but I haven't looked at
> x86_64's mem= handling to see why. From a high level though, adjusting
> end_pfn is the right thing to do, since theoretically mem= could choose
> to make holes in your low memory and keep your high memory in the
> allocation pools (though it's not generally implemented this way).
>
> Jesse
>
Ahh, ok! Sounds great, I will keep running the kernel with your patch
without mem= and let you know if I see any issues.
Chances of getting this into 2.6.22-rc5?
Justin.
On Wednesday, June 6, 2007 4:24 pm Justin Piszcz wrote:
> > The mem= approach though looks slightly off, but I haven't looked
> > at x86_64's mem= handling to see why. From a high level though,
> > adjusting end_pfn is the right thing to do, since theoretically
> > mem= could choose to make holes in your low memory and keep your
> > high memory in the allocation pools (though it's not generally
> > implemented this way).
> >
> > Jesse
>
> Ahh, ok! Sounds great, I will keep running the kernel with your
> patch without mem= and let you know if I see any issues.
>
> Chances of getting this into 2.6.22-rc5?
I'm not sure it's appropriate for -rc5 since it mucks around with some
early boot ordering, but I'll leave that to Andi, since it does address
some real bugs people have been seeing.
Can we add your "Tested-by: Justin Piszcz <[email protected]>" to
the patch? :)
Thanks,
Jesse
On Wednesday, June 6, 2007 4:15 pm Justin Piszcz wrote:
> On Wed, 6 Jun 2007, Randy Dunlap wrote:
> > On Wed, 6 Jun 2007 18:54:37 -0400 (EDT) Justin Piszcz wrote:
> >> Hm, not sure if it was from the patch or what but I ran this:
> >>
> >> 1. swapoff -a
> >> 2. ./eatmem
> >
> > You usually have to access the allocated memory, like:
> >
> > *d = 1.0;
> >
> > for it to actually be allocated (AFAIK).
> >
> >> }
> >>
> >> return 0;
> >> }
> >>
> >> Any idea why the OOM killer can or does not kill it?
> >
> > What are the values of /proc/sys/vm/overcommit* ?
> >
> > See Documentation/vm/overcommit-accounting .
>
> They should be the defaults as I do not change them:
>
> p34:~# find /proc/|grep -i overcommit
> /proc/sys/vm/overcommit_memory
> /proc/sys/vm/overcommit_ratio
> find: /proc/5128: No such file or directory
> p34:~# cat /proc/sys/vm/overcommit_memory
> 0
> p34:~# cat /proc/sys/vm/overcommit_ratio
> 50
> p34:~#
>
>
> Comments?
You can be sure your memory is available if reported in /proc/meminfo or
at boot, since those represent the actual kernel data structures used
for memory allocation:
[ 0.000000] On node 0 totalpages: 2061783
That corresponds to 2061783*4k = 8445063168 bytes or ~8053M. Is that
fairly close to what's actually installed in the machine?
Note that your boot also mentions this:
[ 106.449661] mtrr: no more MTRRs available
which indicates that things like X may not be able to map the
framebuffer with the 'write-combine' attribute, which will hurt
performance. I've heard reports that turning of 'Intel QST fan
control' in your BIOS settings will prevent all your MTRRs from being
used (improperly, probably another BIOS bug) so that X will perform
well. But if you don't use X on this machine, you don't have to worry
about it. The other option would be to remap your MTRRs by hand to
free one up for X, you can do that by combining the last one or two
entries into a single MTRR using the API described in
Documentation/mtrr.txt before you start X.
Jesse
Jesse Barnes <[email protected]> writes:
> On some machines, buggy BIOSes don't properly setup WB MTRRs to
> cover all available RAM, meaning the last few megs (or even gigs)
> of memory will be marked uncached. Since Linux tends to allocate
> from high memory addresses first, this causes the machine to be
> unusably slow as soon as the kernel starts really using memory
> (i.e. right around init time).
>
> This patch works around the problem by scanning the MTRRs at
> boot and figuring out whether the current end_pfn value (setup
> by early e820 code) goes beyond the highest WB MTRR range, and
> if so, trimming it to match. A fairly obnoxious KERN_WARNING
> is printed too, letting the user know that not all of their
> memory is available due to a likely BIOS bug.
>
> Something similar could be done on i386 if needed, but the boot
> ordering would be slightly different, since the MTRR code on i386
> depends on the boot_cpu_data structure being setup.
>
> Justin, can you please test and make sure this patch works for
> you too? It'll only work around the problem, but it's better
> than having to do mem= by hand or waiting for a fix from your
> BIOS vendor.
Ok. Overall this feels good but a few nits below.
Would it make sense to split this into two patches.
The first to just do the cleanup that removes the allocations
for holding the mttr ranges?
> Thanks,
> Jesse
>
> Signed-off-by: Jesse Barnes <[email protected]>
>
> diff --git a/arch/i386/kernel/cpu/mtrr/generic.c
> b/arch/i386/kernel/cpu/mtrr/generic.c
> index c4ebb51..71fc768 100644
> --- a/arch/i386/kernel/cpu/mtrr/generic.c
> +++ b/arch/i386/kernel/cpu/mtrr/generic.c
> @@ -13,7 +13,7 @@
> #include "mtrr.h"
>
>
> struct mtrr_state {
> - struct mtrr_var_range *var_ranges;
> + struct mtrr_var_range var_ranges[NUM_VAR_RANGES];
Could we name it MAX_VAR_RANGES and not NUM_VAR_RANGES.
In practices this is going to be 8 for every cpu I know of,
so calling this NUM_VAR_RANGES may be a little confusing.
> mtrr_type fixed_ranges[NUM_FIXED_RANGES];
> unsigned char enabled;
> unsigned char have_fixed;
> @@ -84,12 +84,6 @@ void get_mtrr_state(void)
> struct mtrr_var_range *vrs;
> unsigned lo, dummy;
>
> - if (!mtrr_state.var_ranges) {
> - mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct
> mtrr_var_range),
> - GFP_KERNEL);
> - if (!mtrr_state.var_ranges)
> - return;
> - }
> vrs = mtrr_state.var_ranges;
>
> rdmsr(MTRRcap_MSR, lo, dummy);
> diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
> index c7d8f17..d7922ce 100644
> --- a/arch/i386/kernel/cpu/mtrr/if.c
> +++ b/arch/i386/kernel/cpu/mtrr/if.c
> @@ -12,7 +12,7 @@
> #include "mtrr.h"
>
> /* RED-PEN: this is accessed without any locking */
> -extern unsigned int *usage_table;
> +extern unsigned int usage_table[];
I think that should be:
> +extern unsigned int usage_table[NUM_VAR_RANGES];
Or even better yet the declaration moved to a header file.
>
> +/**
> + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
> + *
> + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
> + * memory configurations. This routine checks to make sure the MTRRs having
> + * a write back type cover all of the memory the kernel is intending to use.
> + * If not, it'll trim any memory off the end by adjusting end_pfn, removing
> + * it from the kernel's allocation pools, warning the user with an obnoxious
> + * message.
> + */
> +void __init mtrr_trim_uncached_memory(void)
> +{
> + unsigned long i, base, size, highest_addr = 0;
> + mtrr_type type;
> +
> + /* Find highest cached pfn */
> + for (i = 0; i < num_var_ranges; i++) {
> + mtrr_if->get(i, &base, &size, &type);
> + if (type != MTRR_TYPE_WRBACK)
> + continue;
> + base <<= PAGE_SHIFT;
> + size <<= PAGE_SHIFT;
> + if (highest_addr < base + size)
> + highest_addr = base + size;
> + }
This looks like it will handle the common case, so I have no major objections
to this code.
At least in theory and possibly in practice there are a couple of corner
cases we have missed her.
- Overlapping MTRRs.
- What happens if we have uncached memory lower down?
Except for performance problems I guess that case is relatively harmless.
- Is it possible and worth it to amend the e820 map, so it shows the
problem area as Reserved or otherwise not usable RAM?
> +
> + if ((highest_addr >> PAGE_SHIFT) != end_pfn) {
> + printk(KERN_WARNING "***************\n");
> + printk(KERN_WARNING "**** WARNING: likely BIOS bug\n");
> + printk(KERN_WARNING "**** MTRRs don't cover all of "
> + "memory, trimmed %ld pages\n", end_pfn -
> + (highest_addr >> PAGE_SHIFT));
> + printk(KERN_WARNING "***************\n");
> + end_pfn = highest_addr >> PAGE_SHIFT;
> + }
> +}
>
> /**
> * mtrr_bp_init - initialize mtrrs on the boot CPU
> diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
> index 289dfe6..a29dcba 100644
> --- a/arch/i386/kernel/cpu/mtrr/mtrr.h
> +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
> @@ -14,6 +14,7 @@
> #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
>
> #define NUM_FIXED_RANGES 88
> +#define NUM_VAR_RANGES 256
MAX_VAR_RANGES?
> #define MTRRfix64K_00000_MSR 0x250
> #define MTRRfix16K_80000_MSR 0x258
> #define MTRRfix16K_A0000_MSR 0x259
Eric
On Wed, 6 Jun 2007, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 4:15 pm Justin Piszcz wrote:
>> On Wed, 6 Jun 2007, Randy Dunlap wrote:
>>> On Wed, 6 Jun 2007 18:54:37 -0400 (EDT) Justin Piszcz wrote:
>>>> Hm, not sure if it was from the patch or what but I ran this:
>>>>
>>>> 1. swapoff -a
>>>> 2. ./eatmem
>>>
>>> You usually have to access the allocated memory, like:
>>>
>>> *d = 1.0;
>>>
>>> for it to actually be allocated (AFAIK).
>>>
>>>> }
>>>>
>>>> return 0;
>>>> }
>>>>
>>>> Any idea why the OOM killer can or does not kill it?
>>>
>>> What are the values of /proc/sys/vm/overcommit* ?
>>>
>>> See Documentation/vm/overcommit-accounting .
>>
>> They should be the defaults as I do not change them:
>>
>> p34:~# find /proc/|grep -i overcommit
>> /proc/sys/vm/overcommit_memory
>> /proc/sys/vm/overcommit_ratio
>> find: /proc/5128: No such file or directory
>> p34:~# cat /proc/sys/vm/overcommit_memory
>> 0
>> p34:~# cat /proc/sys/vm/overcommit_ratio
>> 50
>> p34:~#
>>
>>
>> Comments?
>
> You can be sure your memory is available if reported in /proc/meminfo or
> at boot, since those represent the actual kernel data structures used
> for memory allocation:
>
> [ 0.000000] On node 0 totalpages: 2061783
>
> That corresponds to 2061783*4k = 8445063168 bytes or ~8053M. Is that
> fairly close to what's actually installed in the machine?
>
> Note that your boot also mentions this:
>
> [ 106.449661] mtrr: no more MTRRs available
>
> which indicates that things like X may not be able to map the
> framebuffer with the 'write-combine' attribute, which will hurt
> performance. I've heard reports that turning of 'Intel QST fan
> control' in your BIOS settings will prevent all your MTRRs from being
> used (improperly, probably another BIOS bug) so that X will perform
> well. But if you don't use X on this machine, you don't have to worry
> about it. The other option would be to remap your MTRRs by hand to
> free one up for X, you can do that by combining the last one or two
> entries into a single MTRR using the API described in
> Documentation/mtrr.txt before you start X.
>
> Jesse
>
FYI--
[ 106.449661] mtrr: no more MTRRs available
This has always occurred, even with mem=8832M setting.
Justin.
On Wed, Jun 06, 2007 at 12:29:23PM -0700, Jesse Barnes wrote:
> On some machines, buggy BIOSes don't properly setup WB MTRRs to
> cover all available RAM, meaning the last few megs (or even gigs)
> of memory will be marked uncached. Since Linux tends to allocate
> from high memory addresses first, this causes the machine to be
> unusably slow as soon as the kernel starts really using memory
> (i.e. right around init time).
In theory -- while not recommended -- a BIOS could also
use a default fallback MTRR for cached and use explicit MTRRs to
map the non existing ranges uncached. Would it make sense to handle this case?
Right now if someone used a default WC MTRR to make the memory
cached you would clip all memory.
Perhaps a fail safe would be good -- always leave some
memory left over even if it looks wrong.
Should also probably have some command line option
to disable the check in case something bad happens with it.
Another thing that might be sense to investigate in relationship
to this patch is large page mappings with MTRRs. iirc P4 and also K8
splits pages internally with MTRR boundaries and might have some other
bad side effects. Should we use this as hints to use 4K pages
for the boundary areas?
-Andi
On Wed, Jun 06, 2007 at 04:27:46PM -0700, Jesse Barnes wrote:
> On Wednesday, June 6, 2007 4:24 pm Justin Piszcz wrote:
> > > The mem= approach though looks slightly off, but I haven't looked
> > > at x86_64's mem= handling to see why. From a high level though,
> > > adjusting end_pfn is the right thing to do, since theoretically
> > > mem= could choose to make holes in your low memory and keep your
> > > high memory in the allocation pools (though it's not generally
> > > implemented this way).
> > >
> > > Jesse
> >
> > Ahh, ok! Sounds great, I will keep running the kernel with your
> > patch without mem= and let you know if I see any issues.
> >
> > Chances of getting this into 2.6.22-rc5?
>
> I'm not sure it's appropriate for -rc5 since it mucks around with some
> early boot ordering, but I'll leave that to Andi, since it does address
> some real bugs people have been seeing.
I don't think the patch is suitable for merging at this time. Perhaps
if it survives some time in -mm* / 2.6.23* it could be backported
in a later 2.6.22 stable release. But right now it definitely
needs more testing and addressing of my review comments.
> Can we add your "Tested-by: Justin Piszcz <[email protected]>" to
> the patch? :)
All such headers are only for the trail of blame and do you want to blame
Justin if anything goes wrong? Perhaps it should rather have a
Blame-to: <whoever wrote Justin/Jesse's BIOS> but that also wouldn't
help without concrete contact points.
-ANdi
On Thu, 7 Jun 2007, Andi Kleen wrote:
> On Wed, Jun 06, 2007 at 04:27:46PM -0700, Jesse Barnes wrote:
>> On Wednesday, June 6, 2007 4:24 pm Justin Piszcz wrote:
>>>> The mem= approach though looks slightly off, but I haven't looked
>>>> at x86_64's mem= handling to see why. From a high level though,
>>>> adjusting end_pfn is the right thing to do, since theoretically
>>>> mem= could choose to make holes in your low memory and keep your
>>>> high memory in the allocation pools (though it's not generally
>>>> implemented this way).
>>>>
>>>> Jesse
>>>
>>> Ahh, ok! Sounds great, I will keep running the kernel with your
>>> patch without mem= and let you know if I see any issues.
>>>
>>> Chances of getting this into 2.6.22-rc5?
>>
>> I'm not sure it's appropriate for -rc5 since it mucks around with some
>> early boot ordering, but I'll leave that to Andi, since it does address
>> some real bugs people have been seeing.
>
> I don't think the patch is suitable for merging at this time. Perhaps
> if it survives some time in -mm* / 2.6.23* it could be backported
> in a later 2.6.22 stable release. But right now it definitely
> needs more testing and addressing of my review comments.
>
>> Can we add your "Tested-by: Justin Piszcz <[email protected]>" to
>> the patch? :)
>
> All such headers are only for the trail of blame and do you want to blame
> Justin if anything goes wrong? Perhaps it should rather have a
> Blame-to: <whoever wrote Justin/Jesse's BIOS> but that also wouldn't
> help without concrete contact points.
>
> -ANdi
>
Hah! Again, I'll keep runnihg with Jesse's patch and as long as I can
keep patching newer kernels I can continue to run with it. So far,
overnight with backups and the like, I have not noticed any problems.
Also tested logging in to X/KDE, no issues. [yet]
Justin.
On 6/7/07, Andi Kleen <[email protected]> wrote:
> On Wed, Jun 06, 2007 at 04:27:46PM -0700, Jesse Barnes wrote:
> > [...]
> > I'm not sure it's appropriate for -rc5 since it mucks around with some
> > early boot ordering, but I'll leave that to Andi, since it does address
> > some real bugs people have been seeing.
>
> I don't think the patch is suitable for merging at this time. Perhaps
> if it survives some time in -mm* / 2.6.23* it could be backported
> in a later 2.6.22 stable release. But right now it definitely
> needs more testing and addressing of my review comments.
BTW an unrelated/happy side-effect of the patch is that it removes
the zero-size-guilty kmalloc()'s from the mtrr code. -mm does have
some fixes that prevent those zero-size allocations, which could
likely be retired if this gets merged in? ...
[ On another unrelated note, could someone tell me who maintains
i386? I've looked around, but surprisingly can't seem to find
anybody listed anywhere. ]
Satyam
On Thursday, June 7, 2007 12:45 am Eric W. Biederman wrote:
> Ok. Overall this feels good but a few nits below.
> Would it make sense to split this into two patches.
> The first to just do the cleanup that removes the allocations
> for holding the mttr ranges?
I suppose we could split it, but it's small, and the only reason for
removing the allocations was so that we could init it earlier.
> > struct mtrr_state {
> > - struct mtrr_var_range *var_ranges;
> > + struct mtrr_var_range var_ranges[NUM_VAR_RANGES];
>
> Could we name it MAX_VAR_RANGES and not NUM_VAR_RANGES.
> In practices this is going to be 8 for every cpu I know of,
> so calling this NUM_VAR_RANGES may be a little confusing.
You're right, I should have kept the old name with MAX_ in it. I'll fix
it up.
> > /* RED-PEN: this is accessed without any locking */
> > -extern unsigned int *usage_table;
> > +extern unsigned int usage_table[];
>
> I think that should be:
> > +extern unsigned int usage_table[NUM_VAR_RANGES];
>
> Or even better yet the declaration moved to a header file.
Oops, yeah, this should just be in mtrr.h.
> This looks like it will handle the common case, so I have no major
> objections to this code.
>
> At least in theory and possibly in practice there are a couple of
> corner cases we have missed her.
>
> - Overlapping MTRRs.
Overlapping should be ok, since that's usually intentional (e.g. one big
wb range with a portion of uc space due to another mtrr).
> - What happens if we have uncached memory lower down?
Holes definitely aren't dealt with, but then we haven't seen any yet...
> Except for performance problems I guess that case is relatively
> harmless. - Is it possible and worth it to amend the e820 map, so it
> shows the problem area as Reserved or otherwise not usable RAM?
That would be useful, but only if we moved the check to a little
earlier, prior to the addition of the active ranges from the e820.
Might be a little nicer than adjusting end_pfn, but will ultimately
achieve the same thing...
Jesse
On Thursday, June 7, 2007 1:51 am Andi Kleen wrote:
> On Wed, Jun 06, 2007 at 04:27:46PM -0700, Jesse Barnes wrote:
> > On Wednesday, June 6, 2007 4:24 pm Justin Piszcz wrote:
> > > > The mem= approach though looks slightly off, but I haven't
> > > > looked at x86_64's mem= handling to see why. From a high level
> > > > though, adjusting end_pfn is the right thing to do, since
> > > > theoretically mem= could choose to make holes in your low
> > > > memory and keep your high memory in the allocation pools
> > > > (though it's not generally implemented this way).
> > > >
> > > > Jesse
> > >
> > > Ahh, ok! Sounds great, I will keep running the kernel with your
> > > patch without mem= and let you know if I see any issues.
> > >
> > > Chances of getting this into 2.6.22-rc5?
> >
> > I'm not sure it's appropriate for -rc5 since it mucks around with
> > some early boot ordering, but I'll leave that to Andi, since it
> > does address some real bugs people have been seeing.
>
> I don't think the patch is suitable for merging at this time. Perhaps
> if it survives some time in -mm* / 2.6.23* it could be backported
> in a later 2.6.22 stable release. But right now it definitely
> needs more testing and addressing of my review comments.
>
> > Can we add your "Tested-by: Justin Piszcz
> > <[email protected]>" to the patch? :)
>
> All such headers are only for the trail of blame and do you want to
> blame Justin if anything goes wrong? Perhaps it should rather have a
> Blame-to: <whoever wrote Justin/Jesse's BIOS> but that also wouldn't
> help without concrete contact points.
I think that header would be Lame-workaround-needed-because-of:
<[email protected]>. :) The idea of tested-by is to give people
a clue about who would be able to test any changes in the area
affected. So far from blaming Justin, it would give him credit for all
his testing, and let people know that he might be able to test similar
patches in the future. I think it's worthwhile to track that...
Jesse
On Thursday, June 7, 2007 1:16 am Andi Kleen wrote:
> On Wed, Jun 06, 2007 at 12:29:23PM -0700, Jesse Barnes wrote:
> > On some machines, buggy BIOSes don't properly setup WB MTRRs to
> > cover all available RAM, meaning the last few megs (or even gigs)
> > of memory will be marked uncached. Since Linux tends to allocate
> > from high memory addresses first, this causes the machine to be
> > unusably slow as soon as the kernel starts really using memory
> > (i.e. right around init time).
>
> In theory -- while not recommended -- a BIOS could also
> use a default fallback MTRR for cached and use explicit MTRRs to
> map the non existing ranges uncached. Would it make sense to handle
> this case?
Probably. I could just check the default memory type and bail out if
it's cacheable.
> Should also probably have some command line option
> to disable the check in case something bad happens with it.
Sure.
> Another thing that might be sense to investigate in relationship
> to this patch is large page mappings with MTRRs. iirc P4 and also K8
> splits pages internally with MTRR boundaries and might have some
> other bad side effects. Should we use this as hints to use 4K pages
> for the boundary areas?
Or I could trim to the nearest large page boundary... We'd lose a
little more memory but it would keep things simple.
Jesse
On Thu, 7 Jun 2007, Jesse Barnes wrote:
> On Thursday, June 7, 2007 1:16 am Andi Kleen wrote:
>> On Wed, Jun 06, 2007 at 12:29:23PM -0700, Jesse Barnes wrote:
>>> On some machines, buggy BIOSes don't properly setup WB MTRRs to
>>> cover all available RAM, meaning the last few megs (or even gigs)
>>> of memory will be marked uncached. Since Linux tends to allocate
>>> from high memory addresses first, this causes the machine to be
>>> unusably slow as soon as the kernel starts really using memory
>>> (i.e. right around init time).
>>
>> In theory -- while not recommended -- a BIOS could also
>> use a default fallback MTRR for cached and use explicit MTRRs to
>> map the non existing ranges uncached. Would it make sense to handle
>> this case?
>
> Probably. I could just check the default memory type and bail out if
> it's cacheable.
>
>> Should also probably have some command line option
>> to disable the check in case something bad happens with it.
>
> Sure.
>
>> Another thing that might be sense to investigate in relationship
>> to this patch is large page mappings with MTRRs. iirc P4 and also K8
>> splits pages internally with MTRR boundaries and might have some
>> other bad side effects. Should we use this as hints to use 4K pages
>> for the boundary areas?
>
> Or I could trim to the nearest large page boundary... We'd lose a
> little more memory but it would keep things simple.
>
> Jesse
>
How much more memory are we going to lose? Is mem= a better option if its
going to keep decreasing?
On Wed, 6 Jun 2007 12:29:23 -0700
Jesse Barnes <[email protected]> wrote:
> --- a/arch/i386/kernel/cpu/mtrr/if.c
> +++ b/arch/i386/kernel/cpu/mtrr/if.c
> @@ -12,7 +12,7 @@
> #include "mtrr.h"
>
> /* RED-PEN: this is accessed without any locking */
> -extern unsigned int *usage_table;
> +extern unsigned int usage_table[];
>
>
> --- a/arch/i386/kernel/cpu/mtrr/main.c
> +++ b/arch/i386/kernel/cpu/mtrr/main.c
> @@ -47,7 +47,7 @@
>
> u32 num_var_ranges = 0;
>
> -unsigned int *usage_table;
> +unsigned int usage_table[NUM_VAR_RANGES];
> static DEFINE_MUTEX(mtrr_mutex);
didn't it feel all dirty when you had to do that?
From: Andrew Morton <[email protected]>
- Move the declaration into a header file
- "usage_table" is a dumb name for an mtrr-specific kernel-wide identifier.
There appear to beseveral other poorly-chosen identifiers in mtrr.
Cc: Andi Kleen <[email protected]>
Cc: Jesse Barnes <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---
arch/i386/kernel/cpu/mtrr/if.c | 8 ++------
arch/i386/kernel/cpu/mtrr/main.c | 17 +++++++++--------
arch/i386/kernel/cpu/mtrr/mtrr.h | 2 ++
3 files changed, 13 insertions(+), 14 deletions(-)
diff -puN arch/i386/kernel/cpu/mtrr/if.c~i386-x86_64-trim-memory-not-covered-by-wb-mtrrs-fix arch/i386/kernel/cpu/mtrr/if.c
--- a/arch/i386/kernel/cpu/mtrr/if.c~i386-x86_64-trim-memory-not-covered-by-wb-mtrrs-fix
+++ a/arch/i386/kernel/cpu/mtrr/if.c
@@ -11,10 +11,6 @@
#include <asm/mtrr.h>
#include "mtrr.h"
-/* RED-PEN: this is accessed without any locking */
-extern unsigned int usage_table[];
-
-
#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
static const char *const mtrr_strings[MTRR_NUM_TYPES] =
@@ -396,7 +392,7 @@ static int mtrr_seq_show(struct seq_file
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
if (size == 0)
- usage_table[i] = 0;
+ mtrr_usage_table[i] = 0;
else {
if (size < (0x100000 >> PAGE_SHIFT)) {
/* less than 1MB */
@@ -410,7 +406,7 @@ static int mtrr_seq_show(struct seq_file
len += seq_printf(seq,
"reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
i, base, base >> (20 - PAGE_SHIFT), size, factor,
- mtrr_attrib_to_str(type), usage_table[i]);
+ mtrr_attrib_to_str(type), mtrr_usage_table[i]);
}
}
return 0;
diff -puN arch/i386/kernel/cpu/mtrr/main.c~i386-x86_64-trim-memory-not-covered-by-wb-mtrrs-fix arch/i386/kernel/cpu/mtrr/main.c
--- a/arch/i386/kernel/cpu/mtrr/main.c~i386-x86_64-trim-memory-not-covered-by-wb-mtrrs-fix
+++ a/arch/i386/kernel/cpu/mtrr/main.c
@@ -47,7 +47,7 @@
u32 num_var_ranges = 0;
-unsigned int usage_table[NUM_VAR_RANGES];
+unsigned int mtrr_usage_table[NUM_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
@@ -127,7 +127,7 @@ static void __init init_table(void)
max = num_var_ranges;
for (i = 0; i < max; i++)
- usage_table[i] = 1;
+ mtrr_usage_table[i] = 1;
}
struct set_mtrr_data {
@@ -381,7 +381,7 @@ int mtrr_add_page(unsigned long base, un
goto out;
}
if (increment)
- ++usage_table[i];
+ ++mtrr_usage_table[i];
error = i;
goto out;
}
@@ -390,12 +390,13 @@ int mtrr_add_page(unsigned long base, un
if (i >= 0) {
set_mtrr(i, base, size, type);
if (likely(replace < 0))
- usage_table[i] = 1;
+ mtrr_usage_table[i] = 1;
else {
- usage_table[i] = usage_table[replace] + !!increment;
+ mtrr_usage_table[i] = mtrr_usage_table[replace] +
+ !!increment;
if (unlikely(replace != i)) {
set_mtrr(replace, 0, 0, 0);
- usage_table[replace] = 0;
+ mtrr_usage_table[replace] = 0;
}
}
} else
@@ -525,11 +526,11 @@ int mtrr_del_page(int reg, unsigned long
printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
goto out;
}
- if (usage_table[reg] < 1) {
+ if (mtrr_usage_table[reg] < 1) {
printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
goto out;
}
- if (--usage_table[reg] < 1)
+ if (--mtrr_usage_table[reg] < 1)
set_mtrr(reg, 0, 0, 0);
error = reg;
out:
diff -puN arch/i386/kernel/cpu/mtrr/mtrr.h~i386-x86_64-trim-memory-not-covered-by-wb-mtrrs-fix arch/i386/kernel/cpu/mtrr/mtrr.h
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h~i386-x86_64-trim-memory-not-covered-by-wb-mtrrs-fix
+++ a/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -97,3 +97,5 @@ void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
+/* RED-PEN: this is accessed without any locking */
+extern unsigned int mtrr_usage_table[];
_
On Thursday, June 7, 2007 5:20:50 Andrew Morton wrote:
> > -unsigned int *usage_table;
> > +unsigned int usage_table[NUM_VAR_RANGES];
> > static DEFINE_MUTEX(mtrr_mutex);
>
> didn't it feel all dirty when you had to do that?
Hey, this was already there... I didn't want to rewrite the whole thing at
once. :) Patch looks fine though.
Thanks,
Jesse
Hi!
> This patch works around the problem by scanning the MTRRs at
> boot and figuring out whether the current end_pfn value (setup
> by early e820 code) goes beyond the highest WB MTRR range, and
> if so, trimming it to match. A fairly obnoxious KERN_WARNING
> is printed too, letting the user know that not all of their
> memory is available due to a likely BIOS bug.
Way too obnoxious warning, I'd say. Just drop the *s.
> + if ((highest_addr >> PAGE_SHIFT) != end_pfn) {
> + printk(KERN_WARNING "***************\n");
> + printk(KERN_WARNING "**** WARNING: likely BIOS bug\n");
> + printk(KERN_WARNING "**** MTRRs don't cover all of "
> + "memory, trimmed %ld pages\n", end_pfn -
> + (highest_addr >> PAGE_SHIFT));
> + printk(KERN_WARNING "***************\n");
> + end_pfn = highest_addr >> PAGE_SHIFT;
...and I'd print lost memory in kilobytes, to be more luser-friendly.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Wed, 6 Jun 2007 12:29:23 -0700
Jesse Barnes <[email protected]> wrote:
> On some machines, buggy BIOSes don't properly setup WB MTRRs to
> cover all available RAM, meaning the last few megs (or even gigs)
> of memory will be marked uncached. Since Linux tends to allocate
> from high memory addresses first, this causes the machine to be
> unusably slow as soon as the kernel starts really using memory
> (i.e. right around init time).
>
> This patch works around the problem by scanning the MTRRs at
> boot and figuring out whether the current end_pfn value (setup
> by early e820 code) goes beyond the highest WB MTRR range, and
> if so, trimming it to match. A fairly obnoxious KERN_WARNING
> is printed too, letting the user know that not all of their
> memory is available due to a likely BIOS bug.
>
> Something similar could be done on i386 if needed, but the boot
> ordering would be slightly different, since the MTRR code on i386
> depends on the boot_cpu_data structure being setup.
i386 allmodconfig:
arch/i386/kernel/cpu/mtrr/main.c: In function 'mtrr_trim_uncached_memory':
arch/i386/kernel/cpu/mtrr/main.c:655: error: 'end_pfn' undeclared (first use in this function)
arch/i386/kernel/cpu/mtrr/main.c:655: error: (Each undeclared identifier is reported only once
arch/i386/kernel/cpu/mtrr/main.c:655: error: for each function it appears in.)
I'll poke some ifdefs in there.
On Friday, June 8, 2007 2:15:00 Andrew Morton wrote:
> > Something similar could be done on i386 if needed, but the boot
> > ordering would be slightly different, since the MTRR code on i386
> > depends on the boot_cpu_data structure being setup.
>
> i386 allmodconfig:
>
> arch/i386/kernel/cpu/mtrr/main.c: In function
> 'mtrr_trim_uncached_memory': arch/i386/kernel/cpu/mtrr/main.c:655:
> error: 'end_pfn' undeclared (first use in this function)
> arch/i386/kernel/cpu/mtrr/main.c:655: error: (Each undeclared
> identifier is reported only once
> arch/i386/kernel/cpu/mtrr/main.c:655: error: for each function it
> appears in.)
>
> I'll poke some ifdefs in there.
Oops, sorry about that, the code is only called on x86_64 but compiled
even for i386...
Jesse
Jesse Barnes <[email protected]> writes:
>> - Overlapping MTRRs.
>
> Overlapping should be ok, since that's usually intentional (e.g. one big
> wb range with a portion of uc space due to another mtrr).
I'm not say overlapping was a bug. I was saying that you don't handle
overlapping mtrrs in figuring the last cached addresses. Therefore
when a UC range overlaps a WB range we might thing the last page
in the WB range is cached when it is not.
>> - What happens if we have uncached memory lower down?
>
> Holes definitely aren't dealt with, but then we haven't seen any yet...
>
>> Except for performance problems I guess that case is relatively
>> harmless. - Is it possible and worth it to amend the e820 map, so it
>> shows the problem area as Reserved or otherwise not usable RAM?
>
> That would be useful, but only if we moved the check to a little
> earlier, prior to the addition of the active ranges from the e820.
> Might be a little nicer than adjusting end_pfn, but will ultimately
> achieve the same thing...
Yes, with perhaps a little more consistency.
Eric
On Friday, June 8, 2007 4:13:22 Eric W. Biederman wrote:
> Jesse Barnes <[email protected]> writes:
> >> - Overlapping MTRRs.
> >
> > Overlapping should be ok, since that's usually intentional (e.g.
> > one big wb range with a portion of uc space due to another mtrr).
>
> I'm not say overlapping was a bug. I was saying that you don't
> handle overlapping mtrrs in figuring the last cached addresses.
> Therefore when a UC range overlaps a WB range we might thing the last
> page in the WB range is cached when it is not.
Oh right, that might be an issue, but we haven't seen it yet...
Jesse
Jesse Barnes <[email protected]> writes:
> On some machines, buggy BIOSes don't properly setup WB MTRRs to
> cover all available RAM, meaning the last few megs (or even gigs)
> of memory will be marked uncached. Since Linux tends to allocate
> from high memory addresses first, this causes the machine to be
> unusably slow as soon as the kernel starts really using memory
> (i.e. right around init time).
>
> This patch works around the problem by scanning the MTRRs at
> boot and figuring out whether the current end_pfn value (setup
> by early e820 code) goes beyond the highest WB MTRR range, and
> if so, trimming it to match. A fairly obnoxious KERN_WARNING
> is printed too, letting the user know that not all of their
> memory is available due to a likely BIOS bug.
A quick update. This patch is horribly incorrect on a socket F
opteron/Athlon 64 with memory above 4GB.
In particular those cpus are capable of mapping all of memory
above 4GB as write back without using a single MTRR.
So examining MTRRs is insufficient.
Eric
On Tuesday, June 12, 2007 6:11:21 Eric W. Biederman wrote:
> Jesse Barnes <[email protected]> writes:
> > On some machines, buggy BIOSes don't properly setup WB MTRRs to
> > cover all available RAM, meaning the last few megs (or even gigs)
> > of memory will be marked uncached. Since Linux tends to allocate
> > from high memory addresses first, this causes the machine to be
> > unusably slow as soon as the kernel starts really using memory
> > (i.e. right around init time).
> >
> > This patch works around the problem by scanning the MTRRs at
> > boot and figuring out whether the current end_pfn value (setup
> > by early e820 code) goes beyond the highest WB MTRR range, and
> > if so, trimming it to match. A fairly obnoxious KERN_WARNING
> > is printed too, letting the user know that not all of their
> > memory is available due to a likely BIOS bug.
>
> A quick update. This patch is horribly incorrect on a socket F
> opteron/Athlon 64 with memory above 4GB.
>
> In particular those cpus are capable of mapping all of memory
> above 4GB as write back without using a single MTRR.
>
> So examining MTRRs is insufficient.
Hm, yuck. What do you suggest? Should we only run this check when Intel
chips are present? Checking only the bottom 4G isn't sufficient since we've
seen platforms that have issues above that range...
Thanks,
Jesse
Jesse Barnes <[email protected]> writes:
> On Tuesday, June 12, 2007 6:11:21 Eric W. Biederman wrote:
>> Jesse Barnes <[email protected]> writes:
>> > On some machines, buggy BIOSes don't properly setup WB MTRRs to
>> > cover all available RAM, meaning the last few megs (or even gigs)
>> > of memory will be marked uncached. Since Linux tends to allocate
>> > from high memory addresses first, this causes the machine to be
>> > unusably slow as soon as the kernel starts really using memory
>> > (i.e. right around init time).
>> >
>> > This patch works around the problem by scanning the MTRRs at
>> > boot and figuring out whether the current end_pfn value (setup
>> > by early e820 code) goes beyond the highest WB MTRR range, and
>> > if so, trimming it to match. A fairly obnoxious KERN_WARNING
>> > is printed too, letting the user know that not all of their
>> > memory is available due to a likely BIOS bug.
>>
>> A quick update. This patch is horribly incorrect on a socket F
>> opteron/Athlon 64 with memory above 4GB.
>>
>> In particular those cpus are capable of mapping all of memory
>> above 4GB as write back without using a single MTRR.
>>
>> So examining MTRRs is insufficient.
>
> Hm, yuck. What do you suggest? Should we only run this check when Intel
> chips are present? Checking only the bottom 4G isn't sufficient since we've
> seen platforms that have issues above that range...
My gut feel says that we need to call a function that is potentially cpu specific,
older AMD cpus and Intel cpus can just use the generic mtrr code.
I would also suggest we build a list of ranges of write-back memory. Which
until we handle overlapping MTRRs in the generic MTRR case is just the write-back
MTRRs.
Then we get the data in a linux specific form we can check the linux specific
data structure against the e820 map.
I don't think that is going to much harder and it allows for creative cpu
designers.
Although this suggests that we want to worry about all memory holes as
well. Because I have seen at least one system which failed to cover
the lower 4G with MTRRs. While everything above 4G was fine.
Eric
Jesse Barnes wrote:
> On some machines, buggy BIOSes don't properly setup WB MTRRs to
> cover all available RAM, meaning the last few megs (or even gigs)
> of memory will be marked uncached. Since Linux tends to allocate
> from high memory addresses first, this causes the machine to be
> unusably slow as soon as the kernel starts really using memory
> (i.e. right around init time).
>
> This patch works around the problem by scanning the MTRRs at
> boot and figuring out whether the current end_pfn value (setup
> by early e820 code) goes beyond the highest WB MTRR range, and
> if so, trimming it to match. A fairly obnoxious KERN_WARNING
> is printed too, letting the user know that not all of their
> memory is available due to a likely BIOS bug.
>
I assume this cannot be fixed by the simple approach
of echoing some useful numbers into /proc/mtrr like
we used to do for video memory? (Before X did this
automatically?)
An extra bootscript seems better than loosing memory.
Helge Hafting
> I assume this cannot be fixed by the simple approach
> of echoing some useful numbers into /proc/mtrr like
> we used to do for video memory? (Before X did this
> automatically?)
>
> An extra bootscript seems better than loosing memory.
In some cases it probably can, in other cases not because the memory
controller is misconfigured or not the right bits are set
in the PCI bridges to enable DAC IO or ... or ...
There are also definite limits on how much quirks can do
to fix this -- Linux is a generic kernel, not a BIOS replacement,
and can never be as intimate with the current setup
as the BIOS is.
It's definitely far safer to not use the memory. You're
running in a situation never tested or considered by the
motherboard vendor and everything is possible.
-Andi