2008-01-10 18:50:28

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

x86_64: Map only usable memory in identity map. All reserved memory maps to a
zero page. This is done later during the boot process, by pruning the
page table setup earlier to remove mappings for the reserved region. Prune
done after mem_init, so we can allocate pages as needed and before APs start.

Signed-off-by: Venkatesh Pallipadi <[email protected]>
Signed-off-by: Suresh Siddha <[email protected]>

Index: linux-2.6.git/arch/x86/kernel/e820_64.c
===================================================================
--- linux-2.6.git.orig/arch/x86/kernel/e820_64.c 2008-01-08 03:41:30.000000000 -0800
+++ linux-2.6.git/arch/x86/kernel/e820_64.c 2008-01-08 04:00:59.000000000 -0800
@@ -121,6 +121,35 @@
}
EXPORT_SYMBOL_GPL(e820_any_mapped);

+int e820_any_non_reserved(unsigned long start, unsigned long end)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (ei->type == E820_RESERVED)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_non_reserved);
+
+int is_memory_any_valid(unsigned long start, unsigned long end)
+{
+ /*
+ * Keep low PCI/ISA area always mapped.
+ * Note: end address is exclusive and start is inclusive here
+ */
+ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS)
+ return 1;
+
+ /* Switch to efi or e820 in future here */
+ return e820_any_non_reserved(start, end);
+}
+EXPORT_SYMBOL_GPL(is_memory_any_valid);
+
/*
* This function checks if the entire range <start,end> is mapped with type.
*
@@ -156,6 +185,47 @@
return 0;
}

+int e820_all_non_reserved(unsigned long start, unsigned long end)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (ei->type == E820_RESERVED)
+ continue;
+
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+
+ /*
+ * if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+
+ /* if start is at or beyond end, we're done, full coverage */
+ if (start >= end)
+ return 1; /* we're done */
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(e820_all_non_reserved);
+
+int is_memory_all_valid(unsigned long start, unsigned long end)
+{
+ /*
+ * Keep low PCI/ISA area always mapped.
+ * Note: end address is exclusive and start is inclusive here
+ */
+ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS)
+ return 1;
+
+ /* Switch to efi or e820 in future here */
+ return e820_all_non_reserved(start, end);
+}
+EXPORT_SYMBOL_GPL(is_memory_all_valid);
+
/*
* Find a free area in a specific range.
*/
Index: linux-2.6.git/arch/x86/mm/init_64.c
===================================================================
--- linux-2.6.git.orig/arch/x86/mm/init_64.c 2008-01-08 03:43:46.000000000 -0800
+++ linux-2.6.git/arch/x86/mm/init_64.c 2008-01-08 03:59:28.000000000 -0800
@@ -215,8 +215,9 @@
int i, pmds;

pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
- vaddr = __START_KERNEL_map;
- pmd = level2_kernel_pgt;
+ /* Skip PMDs meant for kernel text */
+ vaddr = __START_KERNEL_map + KERNEL_TEXT_SIZE;
+ pmd = level2_kernel_pgt + (KERNEL_TEXT_SIZE / PMD_SIZE);
last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
for (i = 0; i < pmds; i++) {
@@ -299,11 +300,6 @@
if (addr >= end)
break;

- if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
- set_pud(pud, __pud(0));
- continue;
- }
-
if (pud_val(*pud)) {
phys_pmd_update(pud, addr, end);
continue;
@@ -344,6 +340,8 @@
(table_start << PAGE_SHIFT) + tables);
}

+static unsigned long max_addr;
+
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
This runs before bootmem is initialized and gets pages directly from the
physical memory. To access them they are temporarily mapped. */
@@ -370,10 +368,13 @@
pgd_t *pgd = pgd_offset_k(start);
pud_t *pud;

- if (after_bootmem)
+ if (after_bootmem) {
pud = pud_offset(pgd, start & PGDIR_MASK);
- else
+ } else {
pud = alloc_low_page(&pud_phys);
+ if (end > max_addr)
+ max_addr = end;
+ }

next = start + PGDIR_SIZE;
if (next > end)
@@ -489,6 +490,187 @@
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
kcore_vsyscall;

+
+static unsigned long __init get_res_page(void)
+{
+ static unsigned long res_phys_page;
+ if (!res_phys_page) {
+ pte_t *pte;
+ pte = alloc_low_page(&res_phys_page);
+ unmap_low_page(pte);
+ }
+ return res_phys_page;
+}
+
+static unsigned long __init get_res_ptepage(void)
+{
+ static unsigned long res_phys_ptepage;
+ if (!res_phys_ptepage) {
+ pte_t *pte_page;
+ unsigned long page_phys;
+ unsigned long entry;
+ int i;
+
+ pte_page = alloc_low_page(&res_phys_ptepage);
+
+ page_phys = get_res_page();
+ entry = _PAGE_NX | _KERNPG_TABLE | _PAGE_GLOBAL | page_phys;
+ entry &= __supported_pte_mask;
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte_t *pte = pte_page + i;
+ set_pte(pte, __pte(entry));
+ }
+
+ unmap_low_page(pte_page);
+ }
+ return res_phys_ptepage;
+}
+
+static void __init phys_pte_prune(pte_t *pte_page, unsigned long address,
+ unsigned long end, unsigned long vaddr, unsigned int exec)
+{
+ int i = pte_index(vaddr);
+
+ for (; i < PTRS_PER_PTE; i++, address = (address & PAGE_MASK) + PAGE_SIZE, vaddr = (vaddr + PAGE_MASK) + PAGE_SIZE) {
+ unsigned long entry;
+ pte_t *pte = pte_page + i;
+
+ if (address >= end)
+ break;
+
+ if (pte_val(*pte))
+ continue;
+
+ /* Nothing to map. Map the null page */
+ if (!(address & (~PAGE_MASK)) &&
+ (address + PAGE_SIZE <= end) &&
+ !is_memory_any_valid(address, address + PAGE_SIZE)) {
+ unsigned long phys_page;
+
+ phys_page = get_res_page();
+ entry = _PAGE_NX | _KERNPG_TABLE | _PAGE_GLOBAL |
+ phys_page;
+
+ entry &= __supported_pte_mask;
+ set_pte(pte, __pte(entry));
+
+ continue;
+ }
+
+ if (exec)
+ entry = _PAGE_NX|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+ else
+ entry = _KERNPG_TABLE|_PAGE_GLOBAL|address;
+ entry &= __supported_pte_mask;
+ set_pte(pte, __pte(entry));
+ }
+}
+
+static void __init phys_pmd_prune(pmd_t *pmd_page, unsigned long address,
+ unsigned long end, unsigned long vaddr, unsigned int exec)
+{
+ int i = pmd_index(vaddr);
+
+ for (; i < PTRS_PER_PMD; i++, address = (address & PMD_MASK) + PMD_SIZE,
+ vaddr = (vaddr & PMD_MASK) + PMD_SIZE) {
+ pmd_t *pmd = pmd_page + i;
+ pte_t *pte;
+ unsigned long pte_phys;
+
+ if (address >= end)
+ break;
+
+ if (!pmd_val(*pmd))
+ continue;
+
+ /* Nothing to map. Map the null page */
+ if (!(address & (~PMD_MASK)) &&
+ (address + PMD_SIZE <= end) &&
+ !is_memory_any_valid(address, address + PMD_SIZE)) {
+
+ pte_phys = get_res_ptepage();
+ set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
+
+ continue;
+ }
+
+ /* Map with 2M pages */
+ if (is_memory_all_valid(address, address + PUD_SIZE)) {
+ /* Init already done */
+ continue;
+ }
+
+ /* Map with 4k pages */
+ pte = alloc_low_page(&pte_phys);
+ phys_pte_prune(pte, address, address + PMD_SIZE, vaddr, exec);
+ set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
+ unmap_low_page(pte);
+
+ }
+}
+
+static void __init phys_pud_prune(pud_t *pud_page, unsigned long addr,
+ unsigned long end, unsigned long vaddr, unsigned int exec)
+{
+ int i = pud_index(vaddr);
+
+ for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE,
+ vaddr = (vaddr & PUD_MASK) + PUD_SIZE) {
+ pud_t *pud = pud_page + i;
+
+ if (addr >= end)
+ break;
+
+ if (pud_val(*pud)) {
+ pmd_t *pmd = pmd_offset(pud,0);
+ phys_pmd_prune(pmd, addr, end, vaddr, exec);
+ }
+ }
+}
+
+void __init prune_reserved_region_maps(void)
+{
+ unsigned long start, end, next;
+
+ /* Prune physical memory identity map */
+ start = (unsigned long)__va(0);
+ end = max_addr;
+ for (; start < end; start = next) {
+ pgd_t *pgd = pgd_offset_k(start);
+ pud_t *pud;
+
+ pud = pud_offset(pgd, start & PGDIR_MASK);
+
+ next = start + PGDIR_SIZE;
+ if (next > end)
+ next = end;
+
+ phys_pud_prune(pud, __pa(start), __pa(next), start, 0);
+ }
+
+ /* Prune kernel text region */
+ start = (unsigned long)KERNEL_TEXT_START;
+ end = start + (unsigned long)KERNEL_TEXT_SIZE;
+ for (; start < end; start = next) {
+ pgd_t *pgd = pgd_offset_k(start);
+ pud_t *pud;
+
+ pud = pud_offset(pgd, start & PGDIR_MASK);
+
+ next = (start & PGDIR_MASK) + (unsigned long)PGDIR_SIZE;
+ if (!next || next > end)
+ next = end;
+
+ phys_pud_prune(pud,
+ start - (unsigned long)KERNEL_TEXT_START,
+ next - (unsigned long)KERNEL_TEXT_START,
+ start,
+ 1);
+ }
+
+ __flush_tlb();
+}
+
void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
@@ -538,6 +720,8 @@
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
initsize >> 10);
+
+ prune_reserved_region_maps();
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
Index: linux-2.6.git/arch/x86/mm/ioremap_64.c
===================================================================
--- linux-2.6.git.orig/arch/x86/mm/ioremap_64.c 2008-01-08 03:41:30.000000000 -0800
+++ linux-2.6.git/arch/x86/mm/ioremap_64.c 2008-01-08 03:59:28.000000000 -0800
@@ -19,6 +19,7 @@
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
#include <asm/proto.h>
+#include <asm/e820.h>

unsigned long __phys_addr(unsigned long x)
{
@@ -28,9 +29,6 @@
}
EXPORT_SYMBOL(__phys_addr);

-#define ISA_START_ADDRESS 0xa0000
-#define ISA_END_ADDRESS 0x100000
-
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
Index: linux-2.6.git/arch/x86/mm/pageattr_64.c
===================================================================
--- linux-2.6.git.orig/arch/x86/mm/pageattr_64.c 2008-01-08 03:41:30.000000000 -0800
+++ linux-2.6.git/arch/x86/mm/pageattr_64.c 2008-01-08 04:03:33.000000000 -0800
@@ -53,9 +53,11 @@
/*
* page_private is used to track the number of entries in
* the page table page have non standard attributes.
+ * Count of 1 indicates page split by split_large_page(),
+ * additional count indicates the number of pages with non-std attr.
*/
SetPagePrivate(base);
- page_private(base) = 0;
+ page_private(base) = 1;

address = __pa(address);
addr = address & LARGE_PAGE_MASK;
@@ -176,11 +178,8 @@
BUG();
}

- /* on x86-64 the direct mapping set at boot is not using 4k pages */
- BUG_ON(PageReserved(kpte_page));
-
save_page(kpte_page);
- if (page_private(kpte_page) == 0)
+ if (page_private(kpte_page) == 1)
revert_page(address, ref_prot);
return 0;
}
Index: linux-2.6.git/include/asm-x86/e820_64.h
===================================================================
--- linux-2.6.git.orig/include/asm-x86/e820_64.h 2008-01-08 03:41:30.000000000 -0800
+++ linux-2.6.git/include/asm-x86/e820_64.h 2008-01-08 03:59:28.000000000 -0800
@@ -26,6 +26,10 @@
extern void e820_mark_nosave_regions(void);
extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
+extern int e820_any_non_reserved(unsigned long start, unsigned long end);
+extern int is_memory_any_valid(unsigned long start, unsigned long end);
+extern int e820_all_non_reserved(unsigned long start, unsigned long end);
+extern int is_memory_all_valid(unsigned long start, unsigned long end);
extern unsigned long e820_hole_size(unsigned long start, unsigned long end);

extern void e820_setup_gap(void);
@@ -38,6 +42,10 @@

extern unsigned ebda_addr, ebda_size;
extern unsigned long nodemap_addr, nodemap_size;
+
+#define ISA_START_ADDRESS 0xa0000
+#define ISA_END_ADDRESS 0x100000
+
#endif/*!__ASSEMBLY__*/

#endif/*__E820_HEADER*/

--


2008-01-10 19:07:14

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

[email protected] writes:

> x86_64: Map only usable memory in identity map.

I don't think that is needed or makes sense for reserved/ACPI * etc.
Only e820 holes should be truly unmapped because only those should
contain mmio.

> All reserved memory maps to a
> zero page.

Why zero page? Why not unmap.

Anyways you could make that a zillion times more simple by just rounding
the e820 areas to 2MB -- for the holes only that should be ok I think;
i would expect them to be near always already suitably aligned.

In short this can be all done much simpler.

-Andi

2008-01-10 19:17:36

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: RE: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

>-----Original Message-----
>From: [email protected]
>[mailto:[email protected]] On Behalf Of Andi Kleen
>Sent: Thursday, January 10, 2008 11:07 AM
>To: Pallipadi, Venkatesh
>Cc: [email protected]; [email protected];
>[email protected]; [email protected];
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected];
>[email protected]; Siddha, Suresh B
>Subject: Re: [patch 02/11] PAT x86: Map only usable memory in
>x86_64 identity map and kernel text
>
>[email protected] writes:
>
>> x86_64: Map only usable memory in identity map.
>
>I don't think that is needed or makes sense for reserved/ACPI * etc.
>Only e820 holes should be truly unmapped because only those should
>contain mmio.

Do you mean just the regions that are not listed in e820 at all? We
should also not map anything marked "RESERVED" in e820. Right?

>> All reserved memory maps to a
>> zero page.
>
>Why zero page? Why not unmap.

I had it unmapped first. Then thought of zero mapping for dd of devmem
to continue working. May be there are apps that depend on that?
Also, dd of devmem seems to be already broken with big memory without
any of these changes.

>Anyways you could make that a zillion times more simple by
>just rounding
>the e820 areas to 2MB -- for the holes only that should be ok I think;
>i would expect them to be near always already suitably aligned.
>
>In short this can be all done much simpler.

On systems I tested, ACPI regions are typically not 2MB aligned. And on
some systems there are few 4k pages of reserved holes just before
0xa0000. PCI reserved regions are 2MB aligned however. I agree that
making this 2MB aligned will make this patch a lot simpler. But, not all
reserved regions seems to be aligned that way.

Thanks,
Venki

2008-01-10 19:25:47

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

On Thu, Jan 10, 2008 at 11:17:07AM -0800, Pallipadi, Venkatesh wrote:
> >I don't think that is needed or makes sense for reserved/ACPI * etc.
> >Only e820 holes should be truly unmapped because only those should
> >contain mmio.
>
> Do you mean just the regions that are not listed in e820 at all? We
> should also not map anything marked "RESERVED" in e820. Right?

RESERVED is usually memory used by the BIOS. Properly MMIO areas
should be in holes.

Of course there might be buggy BIOS who violate that but the
only way to find out is to check for the case in ioremap and warn. I would
be still optimistic of it being correct.

Another way would be to double check against the MTRRs - if it's UC then
it should be unmapped. Maybe that would be a good idea. That should
catch all true mmio holes unless a BIOS maps them cached but if it does
that it's already beyond help.

>
> >> All reserved memory maps to a
> >> zero page.
> >
> >Why zero page? Why not unmap.
>
> I had it unmapped first. Then thought of zero mapping for dd of devmem
> to continue working. May be there are apps that depend on that?
> Also, dd of devmem seems to be already broken with big memory without
> any of these changes.

Exactly it's already broken.

Anyways if someone accesses mmio through /dev/mem I think they definitely
want the real mappings, not a zero page. And dev/mem should provide.
The trick is just to do it without caching attribute violations,
but with mattr it is possible.

>
> >Anyways you could make that a zillion times more simple by
> >just rounding
> >the e820 areas to 2MB -- for the holes only that should be ok I think;
> >i would expect them to be near always already suitably aligned.
> >
> >In short this can be all done much simpler.
>
> On systems I tested, ACPI regions are typically not 2MB aligned. And on

ACPI regions don't need to be unmapped.

> some systems there are few 4k pages of reserved holes just before

reserved shouldn't be unmapped, just holes. Do they have holes
there or reserved areas?

I still hope 2MB alignment will work out.

-Andi

2008-01-10 20:51:44

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: RE: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text



>-----Original Message-----
>From: Andi Kleen [mailto:[email protected]]
>Sent: Thursday, January 10, 2008 11:28 AM
>To: Pallipadi, Venkatesh
>Cc: Andi Kleen; [email protected]; [email protected];
>[email protected]; [email protected];
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected];
>[email protected]; Siddha, Suresh B
>Subject: Re: [patch 02/11] PAT x86: Map only usable memory in
>x86_64 identity map and kernel text
>
>On Thu, Jan 10, 2008 at 11:17:07AM -0800, Pallipadi, Venkatesh wrote:
>> >I don't think that is needed or makes sense for
>reserved/ACPI * etc.
>> >Only e820 holes should be truly unmapped because only those should
>> >contain mmio.
>>
>> Do you mean just the regions that are not listed in e820 at all? We
>> should also not map anything marked "RESERVED" in e820. Right?
>
>RESERVED is usually memory used by the BIOS. Properly MMIO areas
>should be in holes.
>
>Of course there might be buggy BIOS who violate that but the
>only way to find out is to check for the case in ioremap and
>warn. I would
>be still optimistic of it being correct.
>
>Another way would be to double check against the MTRRs - if
>it's UC then
>it should be unmapped. Maybe that would be a good idea. That should
>catch all true mmio holes unless a BIOS maps them cached but if it does
>that it's already beyond help.

One of the test systems I have has following E820
BIOS-e820: 0000000000000000 - 000000000009cc00 (usable)
BIOS-e820: 000000000009cc00 - 00000000000a0000 (reserved)
BIOS-e820: 00000000000cc000 - 00000000000d0000 (reserved)
BIOS-e820: 00000000000e4000 - 0000000000100000 (reserved)
BIOS-e820: 0000000000100000 - 00000000cff60000 (usable)
BIOS-e820: 00000000cff60000 - 00000000cff69000 (ACPI data)
BIOS-e820: 00000000cff69000 - 00000000cff80000 (ACPI NVS)
BIOS-e820: 00000000cff80000 - 00000000d0000000 (reserved)
BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved)
BIOS-e820: 00000000fec00000 - 00000000fec10000 (reserved)
BIOS-e820: 00000000fee00000 - 00000000fee01000 (reserved)
BIOS-e820: 00000000ff000000 - 0000000100000000 (reserved)
BIOS-e820: 0000000100000000 - 0000000130000000 (usable)

I think it is unsafe to access any reserved areas through "WB" not just
mmio regions. In the above case 0xe0000000-0xf0000000 is one such
region.

Also, relying on MTRR, is like giving more importance to BIOS writer
than required :-). I think the best way to deal with MTRR is just to not
touch it. Leave it as it is and do not try to assume that they are
correct, as frequently they will not be.

>> >> All reserved memory maps to a
>> >> zero page.
>> >
>> >Why zero page? Why not unmap.
>>
>> I had it unmapped first. Then thought of zero mapping for dd
>of devmem
>> to continue working. May be there are apps that depend on that?
>> Also, dd of devmem seems to be already broken with big memory without
>> any of these changes.
>
>Exactly it's already broken.
>
>Anyways if someone accesses mmio through /dev/mem I think they
>definitely
>want the real mappings, not a zero page. And dev/mem should provide.
>The trick is just to do it without caching attribute violations,
>but with mattr it is possible.

I don't like /dev/mem supporting access to mmio. We do not know what
attributes to use for these regions. We can potentially map all these
pages uncacheable. But there may be cases where reading an address can
block too possibly?

>> >Anyways you could make that a zillion times more simple by
>> >just rounding
>> >the e820 areas to 2MB -- for the holes only that should be
>ok I think;
>> >i would expect them to be near always already suitably aligned.
>> >
>> >In short this can be all done much simpler.
>>
>> On systems I tested, ACPI regions are typically not 2MB
>aligned. And on
>
>ACPI regions don't need to be unmapped.
>
>> some systems there are few 4k pages of reserved holes just before
>
>reserved shouldn't be unmapped, just holes. Do they have holes
>there or reserved areas?
>
>I still hope 2MB alignment will work out.

E820 above has a combination of reserved and holes.
The problem is that we end up depending on specific e820s and paltform
specific problems/workarounds. This is not a real problem for i386 at
all, as we map only < 1G memory there. So, it is limited to x86_64
systems which should be less in number.

Thanks,
Venki

2008-01-10 21:11:40

by Linus Torvalds

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text



On Thu, 10 Jan 2008, [email protected] wrote:
>
> x86_64: Map only usable memory in identity map. All reserved memory maps to a
> zero page.

I don't mind this horribly per se, but why a zero page?

Accessing that page without mapping it explicitly would be a bug with
your change - if only because you'd get the wrong value!

So why map it at all? The only thing mapping it can do is to hide bugs.

Linus

2008-01-10 21:14:29

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

> I think it is unsafe to access any reserved areas through "WB" not just
> mmio regions. In the above case 0xe0000000-0xf0000000 is one such
> region.

That is 2MB aligned.

>
> Also, relying on MTRR, is like giving more importance to BIOS writer

Let's call it double checking.

Besides MTRRs will not go away anyways. The goal is just to not
require _more_ MTRRs in Linux like currently. But using the existing
ones is no problem.

> than required :-). I think the best way to deal with MTRR is just to not
> touch it. Leave it as it is and do not try to assume that they are
> correct, as frequently they will not be.

This means you have to trust the e820 map then. It's really the
same thing.

Anyways if you don't like checking the MTRRs that's fine too, but
then the e820 map has to be trusted. If that works it is great.

If not some double checking will be needed and the MTRRs would
be more convenient for that. The code would be somewhat ugly though.


>
> >> >> All reserved memory maps to a
> >> >> zero page.
> >> >
> >> >Why zero page? Why not unmap.
> >>
> >> I had it unmapped first. Then thought of zero mapping for dd
> >of devmem
> >> to continue working. May be there are apps that depend on that?
> >> Also, dd of devmem seems to be already broken with big memory without
> >> any of these changes.
> >
> >Exactly it's already broken.
> >
> >Anyways if someone accesses mmio through /dev/mem I think they
> >definitely
> >want the real mappings, not a zero page. And dev/mem should provide.
> >The trick is just to do it without caching attribute violations,
> >but with mattr it is possible.
>
> I don't like /dev/mem supporting access to mmio. We do not know what

But it always did that. I'm sure you'll break stuff if you forbid
it suddenly.

> attributes to use for these regions. We can potentially map all these
> pages uncacheable.

That is what current /dev/mem does.

> But there may be cases where reading an address can
> block too possibly?

Yes sure, machine may hang, but that was always the case and I don't
think it can be changed.

>
> >> >Anyways you could make that a zillion times more simple by
> >> >just rounding
> >> >the e820 areas to 2MB -- for the holes only that should be
> >ok I think;
> >> >i would expect them to be near always already suitably aligned.
> >> >
> >> >In short this can be all done much simpler.
> >>
> >> On systems I tested, ACPI regions are typically not 2MB
> >aligned. And on
> >
> >ACPI regions don't need to be unmapped.
> >
> >> some systems there are few 4k pages of reserved holes just before
> >
> >reserved shouldn't be unmapped, just holes. Do they have holes
> >there or reserved areas?
> >
> >I still hope 2MB alignment will work out.
>
> E820 above has a combination of reserved and holes.
> The problem is that we end up depending on specific e820s and paltform
> specific problems/workarounds. This is not a real problem for i386 at

> all, as we map only < 1G memory there.

First there is the 2GB and in theory 1/3 GB split too which are supported.
And then in theory someone could put mmio in the first 1GB anyways (e.g.
in the 1MB hole)

I don't think you can ignore i386 here.

-Andi

2008-01-10 21:57:53

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: RE: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text



>-----Original Message-----
>From: Linus Torvalds [mailto:[email protected]]
>Sent: Thursday, January 10, 2008 1:05 PM
>To: Pallipadi, Venkatesh
>Cc: [email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; Barnes, Jesse;
>[email protected]; [email protected]; Siddha, Suresh B
>Subject: Re: [patch 02/11] PAT x86: Map only usable memory in
>x86_64 identity map and kernel text
>
>
>
>On Thu, 10 Jan 2008, [email protected] wrote:
>>
>> x86_64: Map only usable memory in identity map. All reserved
>memory maps to a
>> zero page.
>
>I don't mind this horribly per se, but why a zero page?
>
>Accessing that page without mapping it explicitly would be a bug with
>your change - if only because you'd get the wrong value!
>
>So why map it at all? The only thing mapping it can do is to hide bugs.
>

Yes. I had those pages not mapped at all earlier. The reason I switched
to zero page is to continue support cases like:
BIOS-e820: 0000000000000000 - 000000000009cc00 (usable)
BIOS-e820: 000000000009cc00 - 00000000000a0000 (reserved)
BIOS-e820: 00000000000cc000 - 00000000000d0000 (reserved)
BIOS-e820: 00000000000e4000 - 0000000000100000 (reserved)
BIOS-e820: 0000000000100000 - 00000000cff60000 (usable)

In this case if some one does a dd of /dev/mem before they can read the
contents of usable memory in 0x100000-0xcff60000 range. But, if I not
map reserved regions, dd will stop after fist such hole. Even though
this may not be a good usage model, I thought there may be apps
depending on such things. Having said that, I do not like having dummy
zero page there very much. So, if we do not see any regressions due to
usages like above, I will be happy to remove mapping reserved regions
altogether.

Thanks,
Venki

2008-01-10 22:19:15

by Linus Torvalds

[permalink] [raw]
Subject: RE: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text



On Thu, 10 Jan 2008, Pallipadi, Venkatesh wrote:
>
> Yes. I had those pages not mapped at all earlier. The reason I switched
> to zero page is to continue support cases like:
> BIOS-e820: 0000000000000000 - 000000000009cc00 (usable)
> BIOS-e820: 000000000009cc00 - 00000000000a0000 (reserved)
> BIOS-e820: 00000000000cc000 - 00000000000d0000 (reserved)
> BIOS-e820: 00000000000e4000 - 0000000000100000 (reserved)
> BIOS-e820: 0000000000100000 - 00000000cff60000 (usable)
>
> In this case if some one does a dd of /dev/mem before they can read the
> contents of usable memory in 0x100000-0xcff60000 range.

Well, I think that /dev/mem should simply give them the right info. That's
what people use /dev/mem for - doing things like reading BIOS images etc.

So returning *either* a zero page *or* stopping at the first hole is both
equally wrong.

Linus

2008-01-10 22:24:49

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: RE: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text



>-----Original Message-----
>From: [email protected]
>[mailto:[email protected]] On Behalf Of Andi Kleen
>Sent: Thursday, January 10, 2008 1:17 PM
>To: Pallipadi, Venkatesh
>Cc: Andi Kleen; [email protected]; [email protected];
>[email protected]; [email protected];
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; Siddha, Suresh B
>Subject: Re: [patch 02/11] PAT x86: Map only usable memory in
>x86_64 identity map and kernel text
>
>> I think it is unsafe to access any reserved areas through
>"WB" not just
>> mmio regions. In the above case 0xe0000000-0xf0000000 is one such
>> region.
>
>That is 2MB aligned.

That e820 also has a reserved here at 0x9d000.

BIOS-e820: 0000000000000000 - 000000000009cc00 (usable)
BIOS-e820: 000000000009cc00 - 00000000000a0000 (reserved)
BIOS-e820: 00000000000cc000 - 00000000000d0000 (reserved)

If we keep mapping for such pages, it will be problematic as if a driver
later does a ioremap, then we have to go through split-pages and cpa.
With not mapping any reserved regions at all, we can avoid cpa for all
maps of reserved regions. Reducing the complications at setup will make
code more complicated at ioremap, etc.

Most of the holes/reserved areas will be 2M aligned, other than initial
2M and possible 2M around ACPI region. So, we may end up mapping some of
those pages with small pages. Even though it was not enforced until now,
I feel that is required for correctness.

>> >
>> >Exactly it's already broken.
>> >
>> >Anyways if someone accesses mmio through /dev/mem I think they
>> >definitely
>> >want the real mappings, not a zero page. And dev/mem
>should provide.
>> >The trick is just to do it without caching attribute violations,
>> >but with mattr it is possible.
>>
>> I don't like /dev/mem supporting access to mmio. We do not know what
>
>But it always did that. I'm sure you'll break stuff if you forbid
>it suddenly.
>
>> attributes to use for these regions. We can potentially map
>all these
>> pages uncacheable.
>
>That is what current /dev/mem does.

May be I am missing something. But, I don't think I saw /dev/mem
checking whether some region is reserved and mapping those pages as
uncacheable. As I though, its mostly done as MTRR has such setting. If I
do dd of devmem which ends up reading all reserved regions today, I see
one of my systems dying horribly with NMI dazed and confused and the
other gets SCSI errors etc. I am not sure how can some apps depend on
reading mmio regions through /dev/mem. Any particular app you are
thinking about?

>> But there may be cases where reading an address can
>> block too possibly?
>
>Yes sure, machine may hang, but that was always the case and I don't
>think it can be changed.
>
>>
>> >> >Anyways you could make that a zillion times more simple by
>> >> >just rounding
>> >> >the e820 areas to 2MB -- for the holes only that should be
>> >ok I think;
>> >> >i would expect them to be near always already suitably aligned.
>> >> >
>> >> >In short this can be all done much simpler.
>> >>
>> >> On systems I tested, ACPI regions are typically not 2MB
>> >aligned. And on
>> >
>> >ACPI regions don't need to be unmapped.
>> >
>> >> some systems there are few 4k pages of reserved holes just before
>> >
>> >reserved shouldn't be unmapped, just holes. Do they have holes
>> >there or reserved areas?
>> >
>> >I still hope 2MB alignment will work out.
>>
>> E820 above has a combination of reserved and holes.
>> The problem is that we end up depending on specific e820s
>and paltform
>> specific problems/workarounds. This is not a real problem for i386 at
>
>> all, as we map only < 1G memory there.
>
>First there is the 2GB and in theory 1/3 GB split too which
>are supported.
>And then in theory someone could put mmio in the first 1GB
>anyways (e.g.
>in the 1MB hole)
>
>I don't think you can ignore i386 here.
>

OK. I was thinking that we will have smaller subset of systems to worry
about with x86_64. With above, yes. We need to worry about i386 as well.

Other than the complicated code, do you see any issues of identity
mapping only "usable" and "ACPI" regions as per e820? We can possible
try to simplify the code, if that is the only concern.

Thanks,
Venki

2008-01-10 22:28:21

by Pallipadi, Venkatesh

[permalink] [raw]
Subject: RE: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text



>-----Original Message-----
>From: Linus Torvalds [mailto:[email protected]]
>Sent: Thursday, January 10, 2008 2:15 PM
>To: Pallipadi, Venkatesh
>Cc: [email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; [email protected];
>[email protected]; Barnes, Jesse; [email protected];
>[email protected]; Siddha, Suresh B
>Subject: RE: [patch 02/11] PAT x86: Map only usable memory in
>x86_64 identity map and kernel text
>
>
>
>On Thu, 10 Jan 2008, Pallipadi, Venkatesh wrote:
>>
>> Yes. I had those pages not mapped at all earlier. The reason
>I switched
>> to zero page is to continue support cases like:
>> BIOS-e820: 0000000000000000 - 000000000009cc00 (usable)
>> BIOS-e820: 000000000009cc00 - 00000000000a0000 (reserved)
>> BIOS-e820: 00000000000cc000 - 00000000000d0000 (reserved)
>> BIOS-e820: 00000000000e4000 - 0000000000100000 (reserved)
>> BIOS-e820: 0000000000100000 - 00000000cff60000 (usable)
>>
>> In this case if some one does a dd of /dev/mem before they
>can read the
>> contents of usable memory in 0x100000-0xcff60000 range.
>
>Well, I think that /dev/mem should simply give them the right
>info. That's
>what people use /dev/mem for - doing things like reading BIOS
>images etc.
>
>So returning *either* a zero page *or* stopping at the first
>hole is both
>equally wrong.
>

I was not fully clear in my earlier email. Mapping /dev/mem would still
work with our changes. As they go through proper map interface. It is
the dd of dev mem which does the read that has the problem. I was
wondering of apps using dd.

Thanks,
Venki

2008-01-10 22:32:58

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

On Thu, Jan 10, 2008 at 02:25:29PM -0800, Pallipadi, Venkatesh wrote:
>
>
> >-----Original Message-----
> >From: [email protected]
> >[mailto:[email protected]] On Behalf Of Andi Kleen
> >Sent: Thursday, January 10, 2008 1:17 PM
> >To: Pallipadi, Venkatesh
> >Cc: Andi Kleen; [email protected]; [email protected];
> >[email protected]; [email protected];
> >[email protected]; [email protected]; [email protected];
> >[email protected]; [email protected]; Siddha, Suresh B
> >Subject: Re: [patch 02/11] PAT x86: Map only usable memory in
> >x86_64 identity map and kernel text
> >
> >> I think it is unsafe to access any reserved areas through
> >"WB" not just
> >> mmio regions. In the above case 0xe0000000-0xf0000000 is one such
> >> region.
> >
> >That is 2MB aligned.
>
> That e820 also has a reserved here at 0x9d000.

That's not a hole

>
> BIOS-e820: 0000000000000000 - 000000000009cc00 (usable)
> BIOS-e820: 000000000009cc00 - 00000000000a0000 (reserved)
> BIOS-e820: 00000000000cc000 - 00000000000d0000 (reserved)
>
> If we keep mapping for such pages, it will be problematic as if a driver
> later does a ioremap, then we have to go through split-pages and cpa.

It should not do ioremap uncacheable from reserved because there
shouldn't be a MMIO hole in there. It can do ioremap_cachable()
but that is ok.

>
> Most of the holes/reserved areas will be 2M aligned, other than initial
> 2M and possible 2M around ACPI region. So, we may end up mapping some of
> those pages with small pages. Even though it was not enforced until now,
> I feel that is required for correctness.

If it's rare enough mapping in 2MB chunks around the holes is ok.
>
> >> >
> >> >Exactly it's already broken.
> >> >
> >> >Anyways if someone accesses mmio through /dev/mem I think they
> >> >definitely
> >> >want the real mappings, not a zero page. And dev/mem
> >should provide.
> >> >The trick is just to do it without caching attribute violations,
> >> >but with mattr it is possible.
> >>
> >> I don't like /dev/mem supporting access to mmio. We do not know what
> >
> >But it always did that. I'm sure you'll break stuff if you forbid
> >it suddenly.
> >
> >> attributes to use for these regions. We can potentially map
> >all these
> >> pages uncacheable.
> >
> >That is what current /dev/mem does.
>
> May be I am missing something. But, I don't think I saw /dev/mem
> checking whether some region is reserved and mapping those pages as
> uncacheable.

It relies partly on the MTRRs and partly checks for >= end_pfn.
Yes it's a gross hack, but it works.

> As I though, its mostly done as MTRR has such setting. If I
> do dd of devmem which ends up reading all reserved regions today, I see
> one of my systems dying horribly with NMI dazed and confused and the
> other gets SCSI errors etc. I am not sure how can some apps depend on
> reading mmio regions through /dev/mem. Any particular app you are
> thinking about?

The older X servers for once or x86emu in user space and likely various
others. There are all kind of scary utilities using /dev/mem around
(like BIOS flash updaters etc.)

I know some people who don't trust the VM for large memory ares
like to boot with small mem=... and then grab memory through /dev/mem.
I suspect if that didn't work anymore there would be eventually
complaints too although there might be a case be made for not
supporting that anymore.

BUt really /dev/mem is widely used and full compatibility is fairly
important.


> Other than the complicated code, do you see any issues of identity
> mapping only "usable" and "ACPI" regions as per e820? We can possible
> try to simplify the code, if that is the only concern.

The basic idea is fine.

-Andi

2008-01-10 22:53:18

by Valdis Klētnieks

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

On Thu, 10 Jan 2008 14:15:25 PST, Linus Torvalds said:

> Well, I think that /dev/mem should simply give them the right info. That's
> what people use /dev/mem for - doing things like reading BIOS images etc.
>
> So returning *either* a zero page *or* stopping at the first hole is both
> equally wrong.

A case could be made that the /dev/mem driver should at *least* prohibit access
to those memory ranges that the kernel already knows have (or might have)
memory-mapped control registers with Bad Juju side-effects attached to them.

Of course, a case could also be made that it should be permitted, because
anybody who tries to read such memory addresses either (a) knows what they're
doing or (b) is about to become an example of evolution in action... ;)

(Personally, I keep a copy of Arjan's "restrict devmem" patch from Fedora
around, so I guess that says which camp I belong in, and the fact it's a Fedora
patch and not mainstream says something too...)


Attachments:
(No filename) (226.00 B)

2008-01-14 16:45:06

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text


* Pallipadi, Venkatesh <[email protected]> wrote:

> Also, relying on MTRR, is like giving more importance to BIOS writer
> than required :-). I think the best way to deal with MTRR is just to
> not touch it. Leave it as it is and do not try to assume that they are
> correct, as frequently they will not be.

i'd suggest the following strategy on PAT-capable CPUs:

- do not try to write MTRRs. Ever.

- _read_ the current MTRR settings (including the default MTRR) and
check them against the e820 map. I can see two basic types of
mismatches:

- RAM area marked fine in e820 but marked UC by MTRR: this
currently results in a slow system. (NOTE: UC- would be fine and
overridable by PAT, hence it's not a conflict we should detect.)

- mmio area marked cacheable in the MTRR (results in broken system)

then emit a warning and exclude all such areas from any further Linux
use. I.e. if it's RAM then clip it from our memory map. If it's mmio
area then try to exclude it from BAR sizing/positioning.

this way we'll only have two sorts of physical pages put into
pagetables by Linux:

1) RAM page, marked cacheable by MTRR
2) RAM page, marked as UC- by MTRR
2) mmio page, marked as UC- by MTRR

- then we'd use PAT for all these patches to differentiate their
caching properties. We mark RAM pages as cacheable, and we mark mmio
pages as WC or UC.

I.e. try to be as conservative and always have a deterministic exit
strategy towards a 100% working system, even if BIOS writers messed up
the MTRR defaults. _Worst-case_ we boot up with somewhat less RAM or
with a somewhat smaller mmio area. (but there will be warnings in the
dmesg about that so users can complain about the BIOS.) We should never
ever allow the wrong BIOS MTRR defaults to impact Linux's correctness.

hm?

Ingo

2008-01-14 21:21:49

by Suresh Siddha

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

On Mon, Jan 14, 2008 at 05:43:24PM +0100, Ingo Molnar wrote:
>
> * Pallipadi, Venkatesh <[email protected]> wrote:
>
> > Also, relying on MTRR, is like giving more importance to BIOS writer
> > than required :-). I think the best way to deal with MTRR is just to
> > not touch it. Leave it as it is and do not try to assume that they are
> > correct, as frequently they will not be.
>
> i'd suggest the following strategy on PAT-capable CPUs:
>
> - do not try to write MTRRs. Ever.
>
> - _read_ the current MTRR settings (including the default MTRR) and
> check them against the e820 map. I can see two basic types of
> mismatches:
>
> - RAM area marked fine in e820 but marked UC by MTRR: this
> currently results in a slow system.

Time to resurrect Jesse's old patches
i386-trim-memory-not-covered-by-wb-mtrrs.patch(which was in -mm sometime back)

> (NOTE: UC- would be fine and
> overridable by PAT, hence it's not a conflict we should detect.)

UC- can't be specified by MTRR's.

> - mmio area marked cacheable in the MTRR (results in broken system)

PAT can help specify the UC/WC attribute here.

thanks,
suresh

2008-01-14 21:25:25

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

> Time to resurrect Jesse's old patches
> i386-trim-memory-not-covered-by-wb-mtrrs.patch(which was in -mm sometime back)

They broke booting on my AMD QuadCore system here. Never quite figured
out what the problem was unfortunately.

-Andi

2008-01-15 22:18:54

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text


* Siddha, Suresh B <[email protected]> wrote:

> On Mon, Jan 14, 2008 at 05:43:24PM +0100, Ingo Molnar wrote:
> >
> > * Pallipadi, Venkatesh <[email protected]> wrote:
> >
> > > Also, relying on MTRR, is like giving more importance to BIOS writer
> > > than required :-). I think the best way to deal with MTRR is just to
> > > not touch it. Leave it as it is and do not try to assume that they are
> > > correct, as frequently they will not be.
> >
> > i'd suggest the following strategy on PAT-capable CPUs:
> >
> > - do not try to write MTRRs. Ever.
> >
> > - _read_ the current MTRR settings (including the default MTRR) and
> > check them against the e820 map. I can see two basic types of
> > mismatches:
> >
> > - RAM area marked fine in e820 but marked UC by MTRR: this
> > currently results in a slow system.
>
> Time to resurrect Jesse's old patches
> i386-trim-memory-not-covered-by-wb-mtrrs.patch(which was in -mm
> sometime back)

just to make sure i understood the attribute priorities right: we cannot
just mark it WB in the PAT and expect it to be write-back - the UC of
the MTRR will control?

> > (NOTE: UC- would be fine and
> > overridable by PAT, hence it's not a conflict we should detect.)
>
> UC- can't be specified by MTRR's.

hm, only by PATs? Not even by the default MTRR?

> > - mmio area marked cacheable in the MTRR (results in broken
> > system)
>
> PAT can help specify the UC/WC attribute here.

ok. So it seems we dont even need all that many special cases, a "dont
write MTRRs" and "use PATs everywhere" rule would just do the right
thing all across?

Ingo

2008-01-15 23:08:44

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

> just to make sure i understood the attribute priorities right: we cannot
> just mark it WB in the PAT and expect it to be write-back - the UC of
> the MTRR will control?

There are different kinds of UC: UC+ and UC-. One controls the other doesn't.

-Andi

2008-01-15 23:21:25

by Suresh Siddha

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

On Tue, Jan 15, 2008 at 11:17:58PM +0100, Ingo Molnar wrote:
>
> * Siddha, Suresh B <[email protected]> wrote:
> > Time to resurrect Jesse's old patches
> > i386-trim-memory-not-covered-by-wb-mtrrs.patch(which was in -mm
> > sometime back)
>
> just to make sure i understood the attribute priorities right: we cannot
> just mark it WB in the PAT and expect it to be write-back - the UC of
> the MTRR will control?

unfortuantely PAT is not the over-riding winner always. It all depends
on the individual attributes. For WB in PAT, mtrr always takes
the precedence.

>
> > > (NOTE: UC- would be fine and
> > > overridable by PAT, hence it's not a conflict we should detect.)
> >
> > UC- can't be specified by MTRR's.
>
> hm, only by PATs? Not even by the default MTRR?

No.

> > > - mmio area marked cacheable in the MTRR (results in broken
> > > system)
> >
> > PAT can help specify the UC/WC attribute here.
>
> ok. So it seems we dont even need all that many special cases, a "dont
> write MTRRs" and "use PATs everywhere" rule would just do the right
> thing all across?

Yes. The main thing required is on the lines of Jesse's patch.
If the MTRR's def type is not WB, then we need to check if any of the RAM
is not covered by MTRR range registers and trim the RAM accordingly.

thanks,
suresh

2008-01-18 12:02:15

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text


* Siddha, Suresh B <[email protected]> wrote:

> > ok. So it seems we dont even need all that many special cases, a
> > "dont write MTRRs" and "use PATs everywhere" rule would just do the
> > right thing all across?
>
> Yes. The main thing required is on the lines of Jesse's patch. If the
> MTRR's def type is not WB, then we need to check if any of the RAM is
> not covered by MTRR range registers and trim the RAM accordingly.

ok. I've dusted off Jesse's patch (and Andrew's fix to it) and merged it
to x86.git - see below.

one immediate problem is:

+#ifdef CONFIG_X86_64

we should do this on 32-bit too.

Ingo

----------->
Subject: x86, 32-bit: trim memory not covered by wb mtrrs
From: Jesse Barnes <[email protected]>

On some machines, buggy BIOSes don't properly setup WB MTRRs to cover all
available RAM, meaning the last few megs (or even gigs) of memory will be
marked uncached. Since Linux tends to allocate from high memory addresses
first, this causes the machine to be unusably slow as soon as the kernel
starts really using memory (i.e. right around init time).

This patch works around the problem by scanning the MTRRs at boot and
figuring out whether the current end_pfn value (setup by early e820 code)
goes beyond the highest WB MTRR range, and if so, trimming it to match. A
fairly obnoxious KERN_WARNING is printed too, letting the user know that
not all of their memory is available due to a likely BIOS bug.

Something similar could be done on i386 if needed, but the boot ordering
would be slightly different, since the MTRR code on i386 depends on the
boot_cpu_data structure being setup.

This patch fixes a bug in the last patch that caused the code to run on
non-Intel machines (AMD machines apparently don't need it and it's untested
on other non-Intel machines, so best keep it off).

Signed-off-by: Jesse Barnes <[email protected]>
Tested-by: Justin Piszcz <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Cc: Yinghai Lu <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
Documentation/kernel-parameters.txt | 6 ++
arch/x86/kernel/bugs_64.c | 1
arch/x86/kernel/cpu/mtrr/generic.c | 8 ---
arch/x86/kernel/cpu/mtrr/if.c | 8 ---
arch/x86/kernel/cpu/mtrr/main.c | 90 ++++++++++++++++++++++++++----------
arch/x86/kernel/cpu/mtrr/mtrr.h | 3 +
arch/x86/kernel/setup_64.c | 4 +
include/asm-x86/mtrr.h | 5 +-
8 files changed, 86 insertions(+), 39 deletions(-)

Index: linux-x86.q/Documentation/kernel-parameters.txt
===================================================================
--- linux-x86.q.orig/Documentation/kernel-parameters.txt
+++ linux-x86.q/Documentation/kernel-parameters.txt
@@ -562,6 +562,12 @@ and is between 256 and 4096 characters.
See drivers/char/README.epca and
Documentation/digiepca.txt.

+ disable_mtrr_trim [X86-64, Intel only]
+ By default the kernel will trim any uncacheable
+ memory out of your available memory pool based on
+ MTRR settings. This parameter disables that behavior,
+ possibly causing your machine to run very slowly.
+
dmasound= [HW,OSS] Sound subsystem buffers

dscc4.setup= [NET]
Index: linux-x86.q/arch/x86/kernel/bugs_64.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/bugs_64.c
+++ linux-x86.q/arch/x86/kernel/bugs_64.c
@@ -13,7 +13,6 @@
void __init check_bugs(void)
{
identify_cpu(&boot_cpu_data);
- mtrr_bp_init();
#if !defined(CONFIG_SMP)
printk("CPU: ");
print_cpu_info(&boot_cpu_data);
Index: linux-x86.q/arch/x86/kernel/cpu/mtrr/generic.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/cpu/mtrr/generic.c
+++ linux-x86.q/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,7 +14,7 @@
#include "mtrr.h"

struct mtrr_state {
- struct mtrr_var_range *var_ranges;
+ struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
mtrr_type fixed_ranges[NUM_FIXED_RANGES];
unsigned char enabled;
unsigned char have_fixed;
@@ -90,12 +90,6 @@ void __init get_mtrr_state(void)
unsigned lo, dummy;
unsigned long flags;

- if (!mtrr_state.var_ranges) {
- mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
- GFP_KERNEL);
- if (!mtrr_state.var_ranges)
- return;
- }
vrs = mtrr_state.var_ranges;

rdmsr(MTRRcap_MSR, lo, dummy);
Index: linux-x86.q/arch/x86/kernel/cpu/mtrr/if.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/cpu/mtrr/if.c
+++ linux-x86.q/arch/x86/kernel/cpu/mtrr/if.c
@@ -11,10 +11,6 @@
#include <asm/mtrr.h>
#include "mtrr.h"

-/* RED-PEN: this is accessed without any locking */
-extern unsigned int *usage_table;
-
-
#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)

static const char *const mtrr_strings[MTRR_NUM_TYPES] =
@@ -397,7 +393,7 @@ static int mtrr_seq_show(struct seq_file
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
if (size == 0)
- usage_table[i] = 0;
+ mtrr_usage_table[i] = 0;
else {
if (size < (0x100000 >> PAGE_SHIFT)) {
/* less than 1MB */
@@ -411,7 +407,7 @@ static int mtrr_seq_show(struct seq_file
len += seq_printf(seq,
"reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
i, base, base >> (20 - PAGE_SHIFT), size, factor,
- mtrr_attrib_to_str(type), usage_table[i]);
+ mtrr_attrib_to_str(type), mtrr_usage_table[i]);
}
}
return 0;
Index: linux-x86.q/arch/x86/kernel/cpu/mtrr/main.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/cpu/mtrr/main.c
+++ linux-x86.q/arch/x86/kernel/cpu/mtrr/main.c
@@ -38,8 +38,8 @@
#include <linux/cpu.h>
#include <linux/mutex.h>

+#include <asm/e820.h>
#include <asm/mtrr.h>
-
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -47,7 +47,7 @@

u32 num_var_ranges = 0;

-unsigned int *usage_table;
+unsigned int mtrr_usage_table[MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);

u64 size_or_mask, size_and_mask;
@@ -121,13 +121,8 @@ static void __init init_table(void)
int i, max;

max = num_var_ranges;
- if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
- == NULL) {
- printk(KERN_ERR "mtrr: could not allocate\n");
- return;
- }
for (i = 0; i < max; i++)
- usage_table[i] = 1;
+ mtrr_usage_table[i] = 1;
}

struct set_mtrr_data {
@@ -383,7 +378,7 @@ int mtrr_add_page(unsigned long base, un
goto out;
}
if (increment)
- ++usage_table[i];
+ ++mtrr_usage_table[i];
error = i;
goto out;
}
@@ -391,15 +386,15 @@ int mtrr_add_page(unsigned long base, un
i = mtrr_if->get_free_region(base, size, replace);
if (i >= 0) {
set_mtrr(i, base, size, type);
- if (likely(replace < 0))
- usage_table[i] = 1;
- else {
- usage_table[i] = usage_table[replace];
+ if (likely(replace < 0)) {
+ mtrr_usage_table[i] = 1;
+ } else {
+ mtrr_usage_table[i] = mtrr_usage_table[replace];
if (increment)
- usage_table[i]++;
+ mtrr_usage_table[i]++;
if (unlikely(replace != i)) {
set_mtrr(replace, 0, 0, 0);
- usage_table[replace] = 0;
+ mtrr_usage_table[replace] = 0;
}
}
} else
@@ -529,11 +524,11 @@ int mtrr_del_page(int reg, unsigned long
printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
goto out;
}
- if (usage_table[reg] < 1) {
+ if (mtrr_usage_table[reg] < 1) {
printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
goto out;
}
- if (--usage_table[reg] < 1)
+ if (--mtrr_usage_table[reg] < 1)
set_mtrr(reg, 0, 0, 0);
error = reg;
out:
@@ -593,16 +588,11 @@ struct mtrr_value {
unsigned long lsize;
};

-static struct mtrr_value * mtrr_state;
+static struct mtrr_value mtrr_state[MAX_VAR_RANGES];

static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
{
int i;
- int size = num_var_ranges * sizeof(struct mtrr_value);
-
- mtrr_state = kzalloc(size,GFP_ATOMIC);
- if (!mtrr_state)
- return -ENOMEM;

for (i = 0; i < num_var_ranges; i++) {
mtrr_if->get(i,
@@ -624,7 +614,6 @@ static int mtrr_restore(struct sys_devic
mtrr_state[i].lsize,
mtrr_state[i].ltype);
}
- kfree(mtrr_state);
return 0;
}

@@ -635,6 +624,59 @@ static struct sysdev_driver mtrr_sysdev_
.resume = mtrr_restore,
};

+static int disable_mtrr_trim;
+
+static int __init disable_mtrr_trim_setup(char *str)
+{
+ disable_mtrr_trim = 1;
+ return 0;
+}
+early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
+
+#ifdef CONFIG_X86_64
+/**
+ * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
+ *
+ * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
+ * memory configurations. This routine checks to make sure the MTRRs having
+ * a write back type cover all of the memory the kernel is intending to use.
+ * If not, it'll trim any memory off the end by adjusting end_pfn, removing
+ * it from the kernel's allocation pools, warning the user with an obnoxious
+ * message.
+ */
+void __init mtrr_trim_uncached_memory(void)
+{
+ unsigned long i, base, size, highest_addr = 0, def, dummy;
+ mtrr_type type;
+
+ /* Make sure we only trim uncachable memory on Intel machines */
+ rdmsr(MTRRdefType_MSR, def, dummy);
+ def &= 0xff;
+ if (!is_cpu(INTEL) || disable_mtrr_trim || def != MTRR_TYPE_UNCACHABLE)
+ return;
+
+ /* Find highest cached pfn */
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base <<= PAGE_SHIFT;
+ size <<= PAGE_SHIFT;
+ if (highest_addr < base + size)
+ highest_addr = base + size;
+ }
+
+ if ((highest_addr >> PAGE_SHIFT) != end_pfn) {
+ printk(KERN_WARNING "***************\n");
+ printk(KERN_WARNING "**** WARNING: likely BIOS bug\n");
+ printk(KERN_WARNING "**** MTRRs don't cover all of "
+ "memory, trimmed %ld pages\n", end_pfn -
+ (highest_addr >> PAGE_SHIFT));
+ printk(KERN_WARNING "***************\n");
+ end_pfn = highest_addr >> PAGE_SHIFT;
+ }
+}
+#endif

/**
* mtrr_bp_init - initialize mtrrs on the boot CPU
Index: linux-x86.q/arch/x86/kernel/cpu/mtrr/mtrr.h
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ linux-x86.q/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -12,6 +12,7 @@
#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)

#define NUM_FIXED_RANGES 88
+#define MAX_VAR_RANGES 256
#define MTRRfix64K_00000_MSR 0x250
#define MTRRfix16K_80000_MSR 0x258
#define MTRRfix16K_A0000_MSR 0x259
@@ -32,6 +33,8 @@
an 8 bit field: */
typedef u8 mtrr_type;

+extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+
struct mtrr_ops {
u32 vendor;
u32 use_intel_if;
Index: linux-x86.q/arch/x86/kernel/setup_64.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/setup_64.c
+++ linux-x86.q/arch/x86/kernel/setup_64.c
@@ -322,6 +322,10 @@ void __init setup_arch(char **cmdline_p)
* we are rounding upwards:
*/
end_pfn = e820_end_of_ram();
+ /* Trim memory not covered by WB MTRRs */
+ mtrr_bp_init();
+ mtrr_trim_uncached_memory();
+
num_physpages = end_pfn;

check_efer();
Index: linux-x86.q/include/asm-x86/mtrr.h
===================================================================
--- linux-x86.q.orig/include/asm-x86/mtrr.h
+++ linux-x86.q/include/asm-x86/mtrr.h
@@ -97,6 +97,7 @@ extern int mtrr_del_page (int reg, unsig
extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
extern void mtrr_ap_init(void);
extern void mtrr_bp_init(void);
+extern void mtrr_trim_uncached_memory(void);
# else
#define mtrr_save_fixed_ranges(arg) do {} while (0)
#define mtrr_save_state() do {} while (0)
@@ -120,7 +121,9 @@ static __inline__ int mtrr_del_page (int
{
return -ENODEV;
}
-
+static __inline__ void mtrr_trim_uncached_memory(void)
+{
+}
static __inline__ void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) {;}

#define mtrr_ap_init() do {} while (0)

Subject: x86, 32-bit: trim memory not covered by wb mtrrs, fix
From: Andrew Morton <[email protected]>

Cc: "Eric W. Biederman" <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Jesse Barnes <[email protected]>
Cc: Yinghai Lu <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---

arch/x86/kernel/cpu/mtrr/main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-x86.q/arch/x86/kernel/cpu/mtrr/main.c
===================================================================
--- linux-x86.q.orig/arch/x86/kernel/cpu/mtrr/main.c
+++ linux-x86.q/arch/x86/kernel/cpu/mtrr/main.c
@@ -666,7 +666,7 @@ void __init mtrr_trim_uncached_memory(vo
highest_addr = base + size;
}

- if ((highest_addr >> PAGE_SHIFT) != end_pfn) {
+ if ((highest_addr >> PAGE_SHIFT) < end_pfn) {
printk(KERN_WARNING "***************\n");
printk(KERN_WARNING "**** WARNING: likely BIOS bug\n");
printk(KERN_WARNING "**** MTRRs don't cover all of "

2008-01-18 13:09:01

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

> (AMD machines apparently don't need it

That's not true -- we had AMD systems in the past with broken MTRRs for
large memory configurations too, Mostly it was pre revE though.

-Andi

2008-01-18 16:46:23

by Jesse Barnes

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

On Friday, January 18, 2008 5:12 am Andi Kleen wrote:
> > (AMD machines apparently don't need it
>
> That's not true -- we had AMD systems in the past with broken MTRRs for
> large memory configurations too, Mostly it was pre revE though.

It should be easy enough to enable it for AMD as well, and it would also be
good to track down the one failure you found... I don't *think* the
re-ordering of MTRR initialization should affect AMDs anymore than it does
Intel, but someone familiar with the boot code would have to do a quick audit
to be sure.

Jesse

2008-01-18 18:08:52

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

On Fri, Jan 18, 2008 at 08:46:02AM -0800, Jesse Barnes wrote:
> On Friday, January 18, 2008 5:12 am Andi Kleen wrote:
> > > (AMD machines apparently don't need it
> >
> > That's not true -- we had AMD systems in the past with broken MTRRs for
> > large memory configurations too, Mostly it was pre revE though.
>
> It should be easy enough to enable it for AMD as well, and it would also be
> good to track down the one failure you found... I don't *think* the
> re-ordering of MTRR initialization should affect AMDs anymore than it does
> Intel, but someone familiar with the boot code would have to do a quick audit
> to be sure.

I looked back then when I had bisected it down and I admit I didn't spot the
problem from source review. I think it came from the reordering so blacklisting
AMD alone wouldn't have helped. Might have been some
subtle race (e.g. long ago we had such races in the MTRR code
triggered by the first HT CPUs)

Anyways I just test booted latest git-x86 with your patches included on
the QC system and it booted now. However it has both more RAM and newer CPUs
(the original ones were pre-production, that is why I also didn't send you logs[1] ..)
then when I tested originally. So this means either the problem was somewhere
else or the different configuration hides it.

I guess you will hear about it if it's still broken on other machines.

Currently it looks good.

I think it should be enabled on AMD too though. If the reordering breaks
it then blacklisting won't help anyways.

-Andi

[1] but I checked the known errata and there was nothing related to MTRR.

2008-01-18 18:35:00

by Dave Jones

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

On Thu, Jan 10, 2008 at 05:50:41PM -0500, [email protected] wrote:

> (Personally, I keep a copy of Arjan's "restrict devmem" patch from Fedora
> around, so I guess that says which camp I belong in, and the fact it's a Fedora
> patch and not mainstream says something too...)

The way that patch is right now (and has been for some time) is kind of nasty,
and could use cleaning up. I made a half-assed attempt at improving it
a little over xmas, but it could rewriting from scratch.
That's the only reason I never bothered pushing it (and I assume, the reason
that Arjan didn't push it when he wrote the original).

Dave

--
http://www.codemonkey.org.uk

2008-01-18 19:03:27

by Jesse Barnes

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

On Friday, January 18, 2008 10:12 am Andi Kleen wrote:
> I looked back then when I had bisected it down and I admit I didn't spot
> the problem from source review. I think it came from the reordering so
> blacklisting AMD alone wouldn't have helped. Might have been some
> subtle race (e.g. long ago we had such races in the MTRR code
> triggered by the first HT CPUs)
>
> Anyways I just test booted latest git-x86 with your patches included on
> the QC system and it booted now. However it has both more RAM and newer
> CPUs (the original ones were pre-production, that is why I also didn't send
> you logs[1] ..) then when I tested originally. So this means either the
> problem was somewhere else or the different configuration hides it.
>
> I guess you will hear about it if it's still broken on other machines.
>
> Currently it looks good.
>
> I think it should be enabled on AMD too though. If the reordering breaks
> it then blacklisting won't help anyways.
>
> -Andi
>
> [1] but I checked the known errata and there was nothing related to MTRR.

Ah, ok, that explains your reticence earlier. Thanks for testing again, I
guess the patch is good to go.

Jesse

2008-01-18 20:56:22

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text


* Dave Jones <[email protected]> wrote:

> On Thu, Jan 10, 2008 at 05:50:41PM -0500, [email protected] wrote:
>
> > (Personally, I keep a copy of Arjan's "restrict devmem" patch from Fedora
> > around, so I guess that says which camp I belong in, and the fact it's a Fedora
> > patch and not mainstream says something too...)
>
> The way that patch is right now (and has been for some time) is kind
> of nasty, and could use cleaning up. I made a half-assed attempt at
> improving it a little over xmas, but it could rewriting from scratch.
> That's the only reason I never bothered pushing it (and I assume, the
> reason that Arjan didn't push it when he wrote the original).

could someone please send it into this thread so that we can have a go
at integrating it into x86.git?

Ingo

2008-01-19 02:38:49

by Andi Kleen

[permalink] [raw]
Subject: Re: [patch 02/11] PAT x86: Map only usable memory in x86_64 identity map and kernel text

> > I think it should be enabled on AMD too though. If the reordering breaks
> > it then blacklisting won't help anyways.

Actually it is already enabled on AMD. You check for is_cpu(INTEL)
but that just checks the generic MTRR architecture and all AMD CPUs
since K7 use that one too.

That is ok imho.

Perhaps it would be good to fix the incorrect comment though.


> >
> > -Andi
> >
> > [1] but I checked the known errata and there was nothing related to MTRR.
>
> Ah, ok, that explains your reticence earlier. Thanks for testing again, I
> guess the patch is good to go.

I see a failure here now on a (AMD) system where it trims a lot of memory, but
should probably not (or at least i haven't noticed any malfunction
before without it). Investigating.

-Andi