This first patch fixes a silent memleak in the pageattr code that I found
while searching for the bug Andi fixed in the second patch below
(basically, the reference counting in the split-page path was done on the
pmd instead of the pte).
Signed-off-by: Andrea Arcangeli <[email protected]>
Index: linux-2.5/arch/i386/mm/pageattr.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/i386/mm/pageattr.c,v
retrieving revision 1.13
diff -u -p -r1.13 pageattr.c
--- linux-2.5/arch/i386/mm/pageattr.c 27 Aug 2004 17:35:39 -0000 1.13
+++ linux-2.5/arch/i386/mm/pageattr.c 28 Oct 2004 19:11:20 -0000
@@ -117,22 +117,23 @@ __change_page_attr(struct page *page, pg
kpte_page = virt_to_page(kpte);
if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
- pte_t old = *kpte;
- pte_t standard = mk_pte(page, PAGE_KERNEL);
set_pte_atomic(kpte, mk_pte(page, prot));
- if (pte_same(old,standard))
- get_page(kpte_page);
} else {
struct page *split = split_large_page(address, prot);
if (!split)
return -ENOMEM;
- get_page(kpte_page);
set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL));
+ kpte_page = split;
}
+ get_page(kpte_page);
} else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
__put_page(kpte_page);
- }
+ } else
+ BUG();
+
+ /* memleak and potential failed 2M page regeneration */
+ BUG_ON(!page_count(kpte_page));
if (cpu_has_pse && (page_count(kpte_page) == 1)) {
list_add(&kpte_page->lru, &df_list);
Index: linux-2.5/arch/x86_64/mm/pageattr.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/mm/pageattr.c,v
retrieving revision 1.12
diff -u -p -r1.12 pageattr.c
--- linux-2.5/arch/x86_64/mm/pageattr.c 27 Jun 2004 17:54:00 -0000 1.12
+++ linux-2.5/arch/x86_64/mm/pageattr.c 28 Oct 2004 19:11:20 -0000
@@ -124,28 +124,33 @@ __change_page_attr(unsigned long address
kpte_flags = pte_val(*kpte);
if (pgprot_val(prot) != pgprot_val(ref_prot)) {
if ((kpte_flags & _PAGE_PSE) == 0) {
- pte_t old = *kpte;
- pte_t standard = mk_pte(page, ref_prot);
-
set_pte(kpte, mk_pte(page, prot));
- if (pte_same(old,standard))
- get_page(kpte_page);
} else {
+ /*
+ * split_large_page will take the reference for this change_page_attr
+ * on the split page.
+ */
struct page *split = split_large_page(address, prot, ref_prot);
if (!split)
return -ENOMEM;
- get_page(kpte_page);
set_pte(kpte,mk_pte(split, ref_prot));
+ kpte_page = split;
}
+ get_page(kpte_page);
} else if ((kpte_flags & _PAGE_PSE) == 0) {
set_pte(kpte, mk_pte(page, ref_prot));
__put_page(kpte_page);
- }
+ } else
+ BUG();
- if (page_count(kpte_page) == 1) {
+ switch (page_count(kpte_page)) {
+ case 1:
save_page(address, kpte_page);
revert_page(address, ref_prot);
- }
+ break;
+ case 0:
+ BUG(); /* memleak and failed 2M page regeneration */
+ }
return 0;
}
The below patch from Andi is also needed to make the above work on x86
(otherwise one of my new BUG()s above will trigger, signalling that a bug
was there). The below patch creates a subtle dependency that
(_PAGE_PCD << 24) must not be zero. It's not the cleanest thing ever,
but since it's a hardware bitflag I doubt it's going to break.
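A tiny stand-alone check of that dependency (sketch only; the _PAGE_PCD and
VM_IOREMAP values below are assumed 2.6-era x86 constants, not part of the
patch):

#include <assert.h>

#define _PAGE_PCD	0x010UL		/* x86 cache-disable bit (assumed value) */
#define VM_IOREMAP	0x00000001UL	/* vmalloc.h flag (assumed value) */

int main(void)
{
	/* __ioremap() stashes the pgprot flags in the top byte of the
	 * vm_struct flags; iounmap() then tests (p->flags >> 24) to decide
	 * whether the direct mapping must be reverted to PAGE_KERNEL. */
	unsigned long vm_flags = VM_IOREMAP | (_PAGE_PCD << 24);

	assert(_PAGE_PCD << 24);	/* the subtle dependency: must not be zero */
	assert(vm_flags >> 24);		/* so iounmap() notices the uncached mapping */
	return 0;
}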
I'm not sure if I'm allowed to add the Signed-off-by for Andi, but I think I
should since he wrote the x86-64 version; if not, please let me know (I
only backported it to x86 to test my above changes, which would otherwise
trigger one of my added BUGs, and I did the other worthless cosmetic
fixes).
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Andrea Arcangeli <[email protected]>
Index: linux-2.5/arch/i386/mm/ioremap.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/i386/mm/ioremap.c,v
retrieving revision 1.23
diff -u -p -r1.23 ioremap.c
--- linux-2.5/arch/i386/mm/ioremap.c 11 Sep 2004 02:01:36 -0000 1.23
+++ linux-2.5/arch/i386/mm/ioremap.c 28 Oct 2004 19:11:20 -0000
@@ -152,7 +152,7 @@ void __iomem * __ioremap(unsigned long p
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP);
+ area = get_vm_area(size, VM_IOREMAP | (flags << 24));
if (!area)
return NULL;
area->phys_addr = phys_addr;
@@ -232,7 +232,7 @@ void iounmap(volatile void __iomem *addr
return;
}
- if (p->flags && p->phys_addr < virt_to_phys(high_memory)) {
+ if ((p->flags >> 24) && p->phys_addr < virt_to_phys(high_memory)) {
change_page_attr(virt_to_page(__va(p->phys_addr)),
p->size >> PAGE_SHIFT,
PAGE_KERNEL);
Index: linux-2.5/arch/x86_64/mm/ioremap.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/mm/ioremap.c,v
retrieving revision 1.16
diff -u -p -r1.16 ioremap.c
--- linux-2.5/arch/x86_64/mm/ioremap.c 22 Oct 2004 15:01:03 -0000 1.16
+++ linux-2.5/arch/x86_64/mm/ioremap.c 28 Oct 2004 19:11:20 -0000
@@ -128,11 +128,11 @@ void __iomem * __ioremap(unsigned long p
if (phys_addr >= 0xA0000 && last_addr < 0x100000)
return (__force void __iomem *)phys_to_virt(phys_addr);
+#ifndef CONFIG_DISCONTIGMEM
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
if (phys_addr < virt_to_phys(high_memory)) {
-#ifndef CONFIG_DISCONTIGMEM
char *t_addr, *t_end;
struct page *page;
@@ -142,8 +142,8 @@ void __iomem * __ioremap(unsigned long p
for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
if(!PageReserved(page))
return NULL;
-#endif
}
+#endif
/*
* Mappings have to be page-aligned
@@ -155,7 +155,7 @@ void __iomem * __ioremap(unsigned long p
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP);
+ area = get_vm_area(size, VM_IOREMAP | (flags << 24));
if (!area)
return NULL;
area->phys_addr = phys_addr;
@@ -195,12 +195,12 @@ void __iomem *ioremap_nocache (unsigned
if (!p)
return p;
- if (phys_addr + size < virt_to_phys(high_memory)) {
+ if (phys_addr + size - 1 < virt_to_phys(high_memory)) {
struct page *ppage = virt_to_page(__va(phys_addr));
unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- BUG_ON(phys_addr+size > (unsigned long)high_memory);
- BUG_ON(phys_addr + size < phys_addr);
+ BUG_ON(phys_addr+size >= (unsigned long)high_memory);
+ BUG_ON(phys_addr + size <= phys_addr);
if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
iounmap(p);
@@ -223,7 +223,7 @@ void iounmap(volatile void __iomem *addr
return;
}
- if (p->flags && p->phys_addr < virt_to_phys(high_memory)) {
+ if ((p->flags >> 24) && p->phys_addr + p->size < virt_to_phys(high_memory)) {
change_page_attr(virt_to_page(__va(p->phys_addr)),
p->size >> PAGE_SHIFT,
PAGE_KERNEL);
If you give them some beating in -mm, let me know if you have any problems.
(I'm running with this stuff on my machines right now; so far so good.)
This patch:
> From: Andrea Arcangeli <[email protected]>
>
> - fix silent memleak in the pageattr code that I found while searching
> for the bug Andi fixed in the second patch below (basically reference
> counting in split page was done on the pmd instead of the pte).
>
> - Part of this patch is also needed to make the above work on x86 (otherwise
> one of my new above BUGS() will trigger signalling the fact a bug was
> there). The below patch creates a subtle dependency that (_PAGE_PCD << 24)
> must not be zero. It's not the cleanest thing ever, but since it's an
> hardware bitflag I doubt it's going to break.
>
> Signed-off-by: Andi Kleen <[email protected]>
> Signed-off-by: Andrea Arcangeli <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> ---
>
> 25-akpm/arch/i386/mm/ioremap.c | 4 ++--
> 25-akpm/arch/i386/mm/pageattr.c | 13 +++++++------
> 25-akpm/arch/x86_64/mm/ioremap.c | 14 +++++++-------
> 25-akpm/arch/x86_64/mm/pageattr.c | 23 ++++++++++++++---------
> 4 files changed, 30 insertions(+), 24 deletions(-)
is hitting this BUG() during bootup:
/* memleak and potential failed 2M page regeneration */
BUG_ON(!page_count(kpte_page));
in 2.6.10-rc1-mm2.
Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
Memory: 511144k/524288k available (1856k kernel code, 12608k reserved,
1186k data, 164k init, 0k highmem)
Checking if this processor honours the WP bit even in supervisor mode... Ok.
Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
------------[ cut here ]------------
kernel BUG at arch/i386/mm/pageattr.c:136!
invalid operand: 0000 [#1]
SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 0
EIP: 0060:[<c0113f48>] Not tainted VLI
EFLAGS: 00010046 (2.6.10-rc1-mm2)
EIP is at __change_page_attr+0x28c/0x358
eax: ffffffff ebx: 017ff163 ecx: 00000000 edx: c10001e0
esi: 00000000 edi: c000fff8 ebp: c10001e0 esp: c03f9d98
ds: 007b es: 007b ss: 0068
Process swapper (pid: 0, threadinfo=c03f9000 task=c0345b40)
Stack: c102ffe0 00000000 00000000 00000046 c0113ceb c17f9000 c102ff20
c102ff20 017ff163 00000000 017ff163 00000000 c0113ceb c17fe000
c102ffc0 c102ffc0 c1000000 c17ff000 c000fff8 017ff163 00000000
017ff163 00000000 00000000
Call Trace:
[<c0113ceb>] __change_page_attr+0x2f/0x358
[<c0113ceb>] __change_page_attr+0x2f/0x358
[<c011404a>] change_page_attr+0x36/0x54
[<c0114148>] kernel_map_pages+0x30/0x5f
[<c0137d80>] __alloc_pages+0x340/0x350
[<c0137dad>] __get_free_pages+0x1d/0x30
[<c013adfa>] kmem_getpages+0x26/0xd4
[<c013c221>] cache_grow+0xb1/0x150
[<c013c84a>] cache_alloc_refill+0x232/0x280
[<c013ccbe>] kmem_cache_alloc+0x5a/0x78
[<c01d4970>] idr_pre_get+0x1c/0x44
[<c0181b60>] sysfs_fill_super+0x0/0xa4
[<c01572cc>] set_anon_super+0x10/0xb8
[<c0156cff>] sget+0xb3/0x148
[<c0181b60>] sysfs_fill_super+0x0/0xa4
[<c0157636>] get_sb_single+0x26/0x8c
[<c0157608>] compare_single+0x0/0x8
[<c01572bc>] set_anon_super+0x0/0xb8
[<c0181c1d>] sysfs_get_sb+0x19/0x1d
[<c0181b60>] sysfs_fill_super+0x0/0xa4
[<c01576ea>] do_kern_mount+0x4e/0xd0
[<c015777d>] kern_mount+0x11/0x15
[<c0409962>] sysfs_init+0x1e/0x50
[<c0409430>] mnt_init+0xb4/0xc0
[<c040917a>] vfs_caches_init+0x7e/0x94
[<c03fa831>] start_kernel+0x12d/0x150
Code: c7 0f 75 f5 f0 ff 4d 04 eb 08 0f 0b 85 00 88 c1 2d c0 8b 45 00 89
ea f6 c4 80 74 07 8b 55 0c 8d 74 26 00 8b 42 04 83 f8 ff 75 08 <0f> 0b
88 00 88 c1 2d c0 a1 ac 6c 34 c0 a8 08 0f 84 aa 00 00 00
I'm tracking it down now to see exactly what's going on. This is just a
regular, plain 4-way x86 box with 4GB of RAM. Removing that BUG_ON()
lets me boot just fine.
kpte: c000fff8
address: c17ff000
kpte_page: c10001e0
pgprot_val(prot): 00000163
pgprot_val(PAGE_KERNEL): 00000163
(pte_val(*kpte) & _PAGE_PSE): 00000000
Dave Hansen <[email protected]> wrote:
>
> Andrea Arcangeli wrote:
> > Still I recommend investigating _why_ debug_pagealloc is violating the
> > API. It might not be necessary to wait for the pageattr universal
> > feature to make DEBUG_PAGEALLOC work safe.
>
> OK, good to know. But, for now, can we pull this out of -mm? Or, at
> least that BUG_ON()? DEBUG_PAGEALLOC is an awfully powerful debugging
> tool to just be removed like this.
If we make it a WARN_ON, will that cause a complete storm of output?
Andrew Morton wrote:
> Dave Hansen <[email protected]> wrote:
>
>>Andrea Arcangeli wrote:
>>
>>>Still I recommend investigating _why_ debug_pagealloc is violating the
>>>API. It might not be necessary to wait for the pageattr universal
>>>feature to make DEBUG_PAGEALLOC work safe.
>>
>>OK, good to know. But, for now, can we pull this out of -mm? Or, at
>>least that BUG_ON()? DEBUG_PAGEALLOC is an awfully powerful debugging
>>tool to just be removed like this.
>
> If we make it a WARN_ON, will that cause a complete storm of output?
Yeah, just tried it. I hit a couple hundred of them before I got to init.
On Tue, 2 Nov 2004, Dave Hansen wrote:
> This patch:
>
> > From: Andrea Arcangeli <[email protected]>
> >
> > - fix silent memleak in the pageattr code that I found while searching
> > for the bug Andi fixed in the second patch below (basically reference
> > counting in split page was done on the pmd instead of the pte).
> >
> > - Part of this patch is also needed to make the above work on x86 (otherwise
> > one of my new above BUGS() will trigger signalling the fact a bug was
> > there). The below patch creates a subtle dependency that (_PAGE_PCD << 24)
> > must not be zero. It's not the cleanest thing ever, but since it's an
> > hardware bitflag I doubt it's going to break.
> >
> > Signed-off-by: Andi Kleen <[email protected]>
> > Signed-off-by: Andrea Arcangeli <[email protected]>
> > Signed-off-by: Andrew Morton <[email protected]>
> > ---
> >
> > 25-akpm/arch/i386/mm/ioremap.c | 4 ++--
> > 25-akpm/arch/i386/mm/pageattr.c | 13 +++++++------
> > 25-akpm/arch/x86_64/mm/ioremap.c | 14 +++++++-------
> > 25-akpm/arch/x86_64/mm/pageattr.c | 23 ++++++++++++++---------
> > 4 files changed, 30 insertions(+), 24 deletions(-)
>
> is hitting this BUG() during bootup:
>
> /* memleak and potential failed 2M page regeneration */
> BUG_ON(!page_count(kpte_page));
>
> in 2.6.10-rc1-mm2.
>
I've seen the page_count being -1 (not sure why) for a number of pages in
the identity-mapped region... So the BUG() on 0 doesn't seem valid to me.
Also, in order to tell if the pages should be merged back to create a huge
page, I don't see how the patch differentiates between pages that were split
and those that weren't, simply based on the page_count...
-Jason
---
memhotplug1-dave/arch/i386/mm/pageattr.c | 7 +++++--
memhotplug1-dave/include/linux/mm.h | 3 +++
memhotplug1-dave/mm/page_alloc.c | 5 ++++-
3 files changed, 12 insertions(+), 3 deletions(-)
diff -puN include/linux/mm.h~Z3-page_debugging include/linux/mm.h
--- memhotplug1/include/linux/mm.h~Z3-page_debugging 2004-11-02 14:29:51.000000000 -0800
+++ memhotplug1-dave/include/linux/mm.h 2004-11-02 14:37:08.000000000 -0800
@@ -245,6 +245,9 @@ struct page {
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ int mapped;
+#endif
};
#ifdef CONFIG_MEMORY_HOTPLUG
diff -puN arch/i386/mm/pageattr.c~Z3-page_debugging arch/i386/mm/pageattr.c
--- memhotplug1/arch/i386/mm/pageattr.c~Z3-page_debugging 2004-11-02 14:31:07.000000000 -0800
+++ memhotplug1-dave/arch/i386/mm/pageattr.c 2004-11-02 14:41:00.000000000 -0800
@@ -153,7 +153,7 @@ __change_page_attr(struct page *page, pg
printk("pgprot_val(PAGE_KERNEL): %08lx\n", pgprot_val(PAGE_KERNEL));
printk("(pte_val(*kpte) & _PAGE_PSE): %08lx\n", (pte_val(*kpte) & _PAGE_PSE));
printk("path: %d\n", path);
- BUG();
+ WARN_ON(1);
}
if (cpu_has_pse && (page_count(kpte_page) == 1)) {
@@ -224,7 +224,10 @@ void kernel_map_pages(struct page *page,
/* the return value is ignored - the calls cannot fail,
* large pages are disabled at boot time.
*/
- change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
+ if (enable && !page->mapped)
+ change_page_attr(page, numpages, PAGE_KERNEL);
+ else if (!enable && page->mapped)
+ change_page_attr(page, numpages, __pgprot(0));
/* we should perform an IPI and flush all tlbs,
* but that can deadlock->flush only current cpu.
*/
diff -puN mm/page_alloc.c~Z3-page_debugging mm/page_alloc.c
--- memhotplug1/mm/page_alloc.c~Z3-page_debugging 2004-11-02 14:37:53.000000000 -0800
+++ memhotplug1-dave/mm/page_alloc.c 2004-11-02 14:42:56.000000000 -0800
@@ -1840,8 +1840,11 @@ void __devinit memmap_init_zone(unsigned
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
- if (!is_highmem_idx(zone))
+ if (!is_highmem_idx(zone)) {
set_page_address(page, __va(start_pfn << PAGE_SHIFT));
+ page->mapped = 1;
+ } else
+ page->mapped = 0;
#endif
start_pfn++;
}
_
Dave Hansen <[email protected]> wrote:
>
> Andrea Arcangeli wrote:
> > Still I recommend investigating _why_ debug_pagealloc is violating the
> > API. It might not be necessary to wait for the pageattr universal
> > feature to make DEBUG_PAGEALLOC work safe.
>
> This makes the DEBUG_PAGEALLOC stuff symmetric enough to boot for me,
> and it's pretty damn simple. Any ideas for doing this without bloating
> 'struct page', even in the debugging case?
You could use a bit from page->flags. But CONFIG_DEBUG_PAGEALLOC uses so
much additional memory nobody would notice anyway.
hm. Or maybe page->mapcount is available on these pages.
BTW, please don't anyone even try to apply that piece of crap I
just sent out; I just wanted to demonstrate what solves my immediate
problem.
On Tue, Nov 02, 2004 at 05:34:08PM -0500, Jason Baron wrote:
> I've seen the page_count being -1 (not sure why), for a number of pages in
> the identity mapped region...So the BUG() on 0 doesn't seem valid to me.
The bugcheck on 0 is valid there; it signals a memleak. page_count == -1
signals another bug that might or might not be fixed by this as far as I
can tell (this furthermore is a pte, so it sure can't have page_count ==
0 or -1). So please try to track down which pages had page_count == -1.
Note, we must not confuse page->count with page_count: for the former, the
value -1 means "in the freelist"; for the latter it signals a bug.
> Also, in order to tell if the pages should be merged back to create a huge
> page, i don't see how the patch differentiates b/w pages that were split
> and those that weren't simply based on the page_count....
That's the page_count of the _pte_ page, not of the pages being remapped,
so we're guaranteed it's always 1 unless we deal with this very pageattr
code, which is the only one that ever boosts a pte page_count to > 1.
If you mean how we can provide a universal API while only keeping track
of things with the page_count of the pte: we can't, and that's why it's
not a universal API and why some symmetry is required by the API.
Andrea Arcangeli wrote:
> Still I recommend investigating _why_ debug_pagealloc is violating the
> API. It might not be necessary to wait for the pageattr universal
> feature to make DEBUG_PAGEALLOC work safe.
OK, good to know. But, for now, can we pull this out of -mm? Or, at
least that BUG_ON()? DEBUG_PAGEALLOC is an awfully powerful debugging
tool to just be removed like this.
On Tue, Nov 02, 2004 at 01:21:49PM -0800, Dave Hansen wrote:
> This patch:
>
> >From: Andrea Arcangeli <[email protected]>
> >
> >- fix silent memleak in the pageattr code that I found while searching
> > for the bug Andi fixed in the second patch below (basically reference
> > counting in split page was done on the pmd instead of the pte).
> >
> >- Part of this patch is also needed to make the above work on x86
> >(otherwise
> > one of my new above BUGS() will trigger signalling the fact a bug was
> > there). The below patch creates a subtle dependency that (_PAGE_PCD <<
> > 24)
> > must not be zero. It's not the cleanest thing ever, but since it's an
> > hardware bitflag I doubt it's going to break.
> >
> >Signed-off-by: Andi Kleen <[email protected]>
> >Signed-off-by: Andrea Arcangeli <[email protected]>
> >Signed-off-by: Andrew Morton <[email protected]>
> >---
> >
> > 25-akpm/arch/i386/mm/ioremap.c | 4 ++--
> > 25-akpm/arch/i386/mm/pageattr.c | 13 +++++++------
> > 25-akpm/arch/x86_64/mm/ioremap.c | 14 +++++++-------
> > 25-akpm/arch/x86_64/mm/pageattr.c | 23 ++++++++++++++---------
> > 4 files changed, 30 insertions(+), 24 deletions(-)
>
> is hitting this BUG() during bootup:
>
> /* memleak and potential failed 2M page regeneration */
> BUG_ON(!page_count(kpte_page));
>
> in 2.6.10-rc1-mm2.
>
> Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
> Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
> Memory: 511144k/524288k available (1856k kernel code, 12608k reserved,
> 1186k data, 164k init, 0k highmem)
> Checking if this processor honours the WP bit even in supervisor mode... Ok.
> Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
> ------------[ cut here ]------------
> kernel BUG at arch/i386/mm/pageattr.c:136!
> invalid operand: 0000 [#1]
> SMP DEBUG_PAGEALLOC
> Modules linked in:
> CPU: 0
> EIP: 0060:[<c0113f48>] Not tainted VLI
> EFLAGS: 00010046 (2.6.10-rc1-mm2)
> EIP is at __change_page_attr+0x28c/0x358
> eax: ffffffff ebx: 017ff163 ecx: 00000000 edx: c10001e0
> esi: 00000000 edi: c000fff8 ebp: c10001e0 esp: c03f9d98
> ds: 007b es: 007b ss: 0068
> Process swapper (pid: 0, threadinfo=c03f9000 task=c0345b40)
> Stack: c102ffe0 00000000 00000000 00000046 c0113ceb c17f9000 c102ff20
> c102ff20 017ff163 00000000 017ff163 00000000 c0113ceb c17fe000
> c102ffc0 c102ffc0 c1000000 c17ff000 c000fff8 017ff163 00000000
> 017ff163 00000000 00000000
> Call Trace:
> [<c0113ceb>] __change_page_attr+0x2f/0x358
> [<c0113ceb>] __change_page_attr+0x2f/0x358
> [<c011404a>] change_page_attr+0x36/0x54
> [<c0114148>] kernel_map_pages+0x30/0x5f
> [<c0137d80>] __alloc_pages+0x340/0x350
> [<c0137dad>] __get_free_pages+0x1d/0x30
> [<c013adfa>] kmem_getpages+0x26/0xd4
> [<c013c221>] cache_grow+0xb1/0x150
> [<c013c84a>] cache_alloc_refill+0x232/0x280
> [<c013ccbe>] kmem_cache_alloc+0x5a/0x78
> [<c01d4970>] idr_pre_get+0x1c/0x44
> [<c0181b60>] sysfs_fill_super+0x0/0xa4
> [<c01572cc>] set_anon_super+0x10/0xb8
> [<c0156cff>] sget+0xb3/0x148
> [<c0181b60>] sysfs_fill_super+0x0/0xa4
> [<c0157636>] get_sb_single+0x26/0x8c
> [<c0157608>] compare_single+0x0/0x8
> [<c01572bc>] set_anon_super+0x0/0xb8
> [<c0181c1d>] sysfs_get_sb+0x19/0x1d
> [<c0181b60>] sysfs_fill_super+0x0/0xa4
> [<c01576ea>] do_kern_mount+0x4e/0xd0
> [<c015777d>] kern_mount+0x11/0x15
> [<c0409962>] sysfs_init+0x1e/0x50
> [<c0409430>] mnt_init+0xb4/0xc0
> [<c040917a>] vfs_caches_init+0x7e/0x94
> [<c03fa831>] start_kernel+0x12d/0x150
> Code: c7 0f 75 f5 f0 ff 4d 04 eb 08 0f 0b 85 00 88 c1 2d c0 8b 45 00 89
> ea f6 c4 80 74 07 8b 55 0c 8d 74 26 00 8b 42 04 83 f8 ff 75 08 <0f> 0b
> 88 00 88 c1 2d c0 a1 ac 6c 34 c0 a8 08 0f 84 aa 00 00 00
>
> I'm tracking down now to see exactly what's going on. This just a
> regular, plain 4-way x86 box with 4GB of RAM. Removing that BUG_ON()
> lets me boot just fine.
You have a debugging option enabled.
I'm afraid somebody wrote common code in the hope that
change_page_attr had a natural universal API.
The change_page_attr API has always required a certain level of symmetry
to work. Now I've made it completely symmetric just to make it simpler to
use, and I added BUG() where the previous code would screw up.
If you do something like this you'll trigger a BUG_ON too:
change_page_attr(page, PAGE_KERNEL_NOCACHE)
change_page_attr(page, PAGE_KERNEL)
change_page_attr(page, PAGE_KERNEL)
the last one will BUG().
At least the new API with these changes will not leak memory anymore, as
long as you retain symmetry (even if you run change_page_attr on the
same page from different contexts).
I suspect that this debugging option has never worked right.
pageattr will need fixing to provide a natural universal API
that keeps track of each page. Previously this would have failed:
A change_page_attr(page1, UNCACHED) -> pte page_count = 2
B change_page_attr(page1, WHATEVER) -> pte page_count still 2
C change_page_attr(page2, UNCACHED) -> pte page count = 3
A change_page_attr(page1, PAGE_KERNEL) -> pte page count = 2
B change_page_attr(page1, PAGE_KERNEL) -> pte page count = 1 ->screwup
Now the above can work fine. But still examples like the above one will
trigger bugs since they're simply violating the current restricted
change_page_attr API.
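A toy user-space model of that bookkeeping (sketch only, not the kernel
code): every change away from PAGE_KERNEL takes one reference on the pte
page and every restore drops one, so symmetric callers bring the count back
to 1 (and the 2M page can be regenerated), while an extra restore trips the
new bugcheck:

#include <assert.h>
#include <stdio.h>

static int pte_page_count = 1;	/* the allocator's own reference on the pte page */

/* to_default != 0 models change_page_attr(page, 1, PAGE_KERNEL),
 * to_default == 0 models any non-default prot (e.g. PAGE_KERNEL_NOCACHE). */
static void change_page_attr_model(int to_default)
{
	if (to_default)
		pte_page_count--;	/* __put_page(kpte_page) */
	else
		pte_page_count++;	/* get_page(kpte_page) */

	assert(pte_page_count > 0);	/* the new BUG_ON(!page_count(kpte_page)) */
	if (pte_page_count == 1)
		printf("count back to 1: the 2M page can be regenerated\n");
}

int main(void)
{
	change_page_attr_model(0);	/* driver A: its page -> NOCACHE, count = 2 */
	change_page_attr_model(0);	/* driver B: its page -> NOCACHE, count = 3 */
	change_page_attr_model(1);	/* driver A: restore PAGE_KERNEL, count = 2 */
	change_page_attr_model(1);	/* driver B: restore PAGE_KERNEL, count = 1 */
	/* a third, asymmetric restore here would trip the assert */
	return 0;
}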
I believe if you disable DEBUG_PAGEALLOC it should work fine again.
Still I recommend investigating _why_ debug_pagealloc is violating the
API. It might not be necessary to wait for the pageattr universal
feature to make DEBUG_PAGEALLOC work safe.
On Tue, Nov 02, 2004 at 02:21:35PM -0800, Dave Hansen wrote:
> OK, good to know. But, for now, can we pull this out of -mm? Or, at
I doubt it's a good idea unless 1) you want the silent memleak back or
2) you found something wrong in the fix itself.
On Tue, Nov 02, 2004 at 03:00:26PM -0800, Dave Hansen wrote:
> just sent out, I just wanted to demonstrate what solves my immediate
> problem.
sure ;)
that's like disabling the config option; the only point of
change_page_attr is to split the direct mapping, it does nothing on
highmem, it actually BUG()s (and it wasn't one of my new BUGs ;):
#ifdef CONFIG_HIGHMEM
if (page >= highmem_start_page)
BUG();
#endif
On Tue, Nov 02, 2004 at 03:04:26PM -0800, Andrew Morton wrote:
> Dave Hansen <[email protected]> wrote:
> >
> > Andrea Arcangeli wrote:
> > > Still I recommend investigating _why_ debug_pagealloc is violating the
> > > API. It might not be necessary to wait for the pageattr universal
> > > feature to make DEBUG_PAGEALLOC work safe.
> >
> > This makes the DEBUG_PAGEALLOC stuff symmetric enough to boot for me,
> > and it's pretty damn simple. Any ideas for doing this without bloating
> > 'struct page', even in the debugging case?
>
> You could use a bit from page->flags. But CONFIG_DEBUG_PAGEALLOC uses so
> much additional memory nobody would notice anyway.
>
> hm. Or maybe page->mapcount is available on these pages.
"page < highmem_start_page" would be equivalent to page->mapped == 1.
I'll try to reproduce the problem. Frankly, I wondered how
DEBUG_PAGEALLOC could ever work with the current pageattr code, which is
far from being usable outside a single device driver that owns the physical
region (then I thought DEBUG_PAGEALLOC was perhaps in an unusable state,
like dead code). The current change_page_attr is basically only for
symmetric use via ioremap_nocache or explicit AGP code that _owns_ the
region by registering it in resource.c. Attempting to use the current
restrictive API on random page resources not registered and owned by the
caller may not even be fixable without creating a universal API.
Andrea Arcangeli wrote:
> On Tue, Nov 02, 2004 at 03:00:26PM -0800, Dave Hansen wrote:
>
>>just sent out, I just wanted to demonstrate what solves my immediate
>>problem.
>
> sure ;)
>
> that's like disabling the config option, the only point of
> change_page_attr is to split the direct mapping, it does nothing on
> highmem, it actually BUGS() (and it wasn't one of my new bugs ;):
>
> #ifdef CONFIG_HIGHMEM
> if (page >= highmem_start_page)
> BUG();
> #endif
Oh, crap. I meant to clear ->mapped when change_attr(__pgprot(0)) was
done on it, and set it when it was changed back. Doing that correctly
preserves the symmetry, right?
On Tue, Nov 02, 2004 at 05:43:45PM -0800, Dave Hansen wrote:
> Oh, crap. I meant to clear ->mapped when change_attr(__pgprot(0)) was
> done on it, and set it when it was changed back. Doing that correctly
> preserves the symmetry, right?
Yes, it should. I agree with Andrew that a bitflag would be enough. I'd call
it PG_prot_none.
I realized that if the page is in the freelist it's as if it's reserved,
since you're guaranteed there's no other pageattr working on it
(assuming every other pageattr user is symmetric too, which we always depend
on). So I wonder why it's not symmetric already, despite the lack of
page->mapped. It would be nice to dump_stack() when the __pgprot(0) &&
page->mapped case triggers.
Andrea Arcangeli wrote:
> On Tue, Nov 02, 2004 at 05:43:45PM -0800, Dave Hansen wrote:
>
>>Oh, crap. I meant to clear ->mapped when change_attr(__pgprot(0)) was
>>done on it, and set it when it was changed back. Doing that correctly
>>preserves the symmetry, right?
>
> yes it should. I agree with Andrew a bitflag would be enough. I'd call
> it PG_prot_none.
It should be enough, but I don't think we want to waste a bitflag for
something that's only needed for debugging anyway. They're getting
precious these days. Might as well just bloat the kernel some more when
the alloc debugging is on.
I'll see what I can do to get some backtraces of the __pg_prot(0) &&
page->mapped cases.
On Tue, Nov 02, 2004 at 06:48:09PM -0800, Dave Hansen wrote:
> It should be enough, but I don't think we want to waste a bitflag for
> something that's only needed for debugging anyway. They're getting
> precious these days. Might as well just bloat the kernel some more when
> the alloc debugging is on.
You can leave the bitflag at the end (number 31) under the #ifdef. Using
the bitflag is less likely to create a heisenbug (due to a different layout
of the RAM ;).
> I'll see what I can do to get some backtraces of the __pg_prot(0) &&
> page->mapped cases.
thanks!
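A stand-alone sketch of that bit-31 idea (the names and the choice of bit
are hypothetical, and this is not a kernel patch): the debug hook only calls
change_page_attr() when the recorded state actually changes, which keeps the
calls symmetric without growing struct page:

#include <assert.h>

#define PG_prot_none	31UL	/* assumed-free last page-flag bit, debug-only */

struct page { unsigned long flags; };	/* stand-in for the real struct page */

static int page_prot_none(const struct page *p)
{
	return (p->flags >> PG_prot_none) & 1;
}

/* Models the per-page behaviour of the DEBUG_PAGEALLOC kernel_map_pages()
 * hook: unmap on free, remap on allocation. */
static void kernel_map_page_model(struct page *p, int enable)
{
	if (enable && page_prot_none(p)) {
		p->flags &= ~(1UL << PG_prot_none);
		/* change_page_attr(p, 1, PAGE_KERNEL); */
	} else if (!enable && !page_prot_none(p)) {
		p->flags |= 1UL << PG_prot_none;
		/* change_page_attr(p, 1, __pgprot(0)); */
	}
}

int main(void)
{
	struct page p = { 0 };

	kernel_map_page_model(&p, 0);	/* unmap */
	kernel_map_page_model(&p, 0);	/* duplicate unmap: now a no-op */
	kernel_map_page_model(&p, 1);	/* remap: symmetry preserved */
	assert(!page_prot_none(&p));
	return 0;
}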
Andrea Arcangeli wrote:
>>I'll see what I can do to get some backtraces of the __pg_prot(0) &&
>>page->mapped cases.
These appear to be the 2 most common paths: cache_init_objs() and
kfree(). But, the path that I'm hitting now appears to be that
something got page_count(kpte_page) to -1 (*NOT* page->_count), and then
the BUG_ON() trips after a get_page() is done on it. I'm tracking this
down now.
kpte: c0011000
address: c1a00000
kpte_page: c1000264
split: ffffffff
pgprot_val(prot): 00000000
pgprot_val(PAGE_KERNEL): 00000163
(pte_val(*kpte) & _PAGE_PSE): 00000000
path: 1
Badness in __change_page_attr at arch/i386/mm/pageattr.c:156
[<c0114e43>] __change_page_attr+0x443/0x5a4
[<c01150d9>] kernel_map_pages+0x31/0x80
[<c0114fda>] change_page_attr+0x36/0x54
[<c01150f3>] kernel_map_pages+0x4b/0x80
[<c013de18>] cache_init_objs+0x194/0x1d8
[<c013e014>] cache_grow+0xe8/0x154
[<c013e74a>] cache_alloc_refill+0x226/0x274
[<c013edb3>] __kmalloc+0x8b/0xbc
[<c0181302>] proc_create+0x76/0xcc
[<c01814c3>] create_proc_entry+0x6b/0xb4
[<c0121c3b>] register_proc_table+0xcb/0x110
[<c0121c67>] register_proc_table+0xf7/0x110
[<c0420994>] sysctl_init+0x10/0x1c
[<c0413934>] do_basic_setup+0x14/0x20
[<c01004ec>] init+0xa8/0x15c
[<c0100444>] init+0x0/0x15c
[<c0102305>] kernel_thread_helper+0x5/0xc
kpte: c0006978
address: c052f000
kpte_page: c10000d8
split: ffffffff
pgprot_val(prot): 00000000
pgprot_val(PAGE_KERNEL): 00000163
(pte_val(*kpte) & _PAGE_PSE): 00000000
path: 1
Badness in __change_page_attr at arch/i386/mm/pageattr.c:156
[<c0114e43>] __change_page_attr+0x443/0x5a4
[<c02367db>] scsi_run_queue+0xaf/0xb8
[<c0114fda>] change_page_attr+0x36/0x54
[<c01150f3>] kernel_map_pages+0x4b/0x80
[<c013e422>] cache_free_debugcheck+0x2a6/0x2c0
[<c02385a5>] scsi_probe_and_add_lun+0x135/0x188
[<c013f00b>] kfree+0x9f/0xdc
[<c02385a5>] scsi_probe_and_add_lun+0x135/0x188
[<c02385a5>] scsi_probe_and_add_lun+0x135/0x188
[<c0238ac7>] scsi_scan_target+0x63/0xbc
[<c0238b60>] scsi_scan_channel+0x40/0x68
[<c0238bfe>] scsi_scan_host_selected+0x76/0xb0
[<c0238c4a>] scsi_scan_host+0x12/0x18
[<c024b7ff>] ahc_linux_register_host+0x283/0x290
[<c01857e9>] sysfs_create_file+0x41/0x48
[<c01e4392>] pci_create_newid_file+0x1a/0x24
[<c01e470a>] pci_populate_driver_dir+0xa/0x10
[<c01e4786>] pci_register_driver+0x76/0x88
[<c024a867>] ahc_linux_detect+0x3b/0x60
[<c042750a>] ahc_linux_init+0xa/0x30
[<c04138ca>] do_initcalls+0x66/0xbc
[<c041393e>] do_basic_setup+0x1e/0x20
[<c01004ec>] init+0xa8/0x15c
[<c0100444>] init+0x0/0x15c
[<c0102305>] kernel_thread_helper+0x5/0xc
Hi Andrea,
Are you sure that BUG_ON() should be for !page_count(kpte_page)? I
noticed that I was getting some BUGs when the count went back to 0, but
the pte page was completely full of valid ptes. I *think* it's
correct to make it:
BUG_ON(page_count(kpte_page) < 0);
Or, I guess we could keep the old BUG_ON(), and put it inside the else
block with the __put_page().
-- Dave
I attached the wrong patch.
Here's what I meant to send.
-- Dave
On Thu, Nov 04, 2004 at 04:40:48PM -0800, Dave Hansen wrote:
> I attached the wrong patch.
>
> Here's what I meant to send.
>
> -- Dave
>
>
> ---
>
> memhotplug1-dave/arch/i386/mm/pageattr.c | 2 +-
> 1 files changed, 1 insertion(+), 1 deletion(-)
>
> diff -puN arch/i386/mm/pageattr.c~Z0-leaks_only_on_negative arch/i386/mm/pageattr.c
> --- memhotplug1/arch/i386/mm/pageattr.c~Z0-leaks_only_on_negative 2004-11-04 15:57:28.000000000 -0800
> +++ memhotplug1-dave/arch/i386/mm/pageattr.c 2004-11-04 15:58:50.000000000 -0800
> @@ -135,7 +135,7 @@ __change_page_attr(struct page *page, pg
> BUG();
>
> /* memleak and potential failed 2M page regeneration */
> - BUG_ON(!page_count(kpte_page));
> + BUG_ON(page_count(kpte_page) < 0);
>
> if (cpu_has_pse && (page_count(kpte_page) == 1)) {
> list_add(&kpte_page->lru, &df_list);
> _
That will hide the memleak again. Furthermore, page_count cannot be < 0
unless we get a _double_ memleak.
The only chance for kpte_page to be freed is for its count to be == 1 in
that place. If kpte_page's count is == 1, it will be freed via the list_add
and the master page will be regenerated, giving it a chance to get the
performance back. If it's 0, it means we leaked memory as far as I can tell.
It's impossible for a pte to have had a 0 page_count() and not be in the
freelist already. There is no put_page at all in that whole path, there's
only a __put_page, so getting == 0 in there is a memleak on any pte or pmd
or whatever else that cannot already have been put in the freelist.
Something else is still wrong, but knowing the above fixes it at least
tells us it's not a double leak.
On Thu, 2004-11-04 at 16:53, Andrea Arcangeli wrote:
> The only chance for kpte_page to be freed, is to be == 1 in that place.
> If kpte_page is == 1, it will be freed via list_add and the master page
> will be regenerated giving it a chance to get performance back. If it's
> 0, it means we leaked memory as far as I can tell.
>
> It's impossible a pte had a 0 page_count() and not to be in the freelist
> already. There is no put_page at all in that whole path, there's only a
> __put_page, so it's a memleak to get == 0 in there on any pte or pmd or
> whatever else we cannot have put in the freelist already.
Ahhh. I forgot about the allocator's reference on the page. However,
there still seems to be something fishy here.
The pfn of the page that's causing trouble is 0x0000000f. It's also set as
PageReserved(). We may have some imbalance with page counts when the
page is PageReserved() that this is catching. I can't find any
asymmetric use of kernel_map_pages(). Both the slab and the allocator
appear to be behaving themselves.
I don't even see a case where that particular page has a get_page() done
on it before the first __change_page_attr() call on it. So, it probably
still has its page_count()==0 from the original set in
memmap_init_zone().
What happens when a pte page is bootmem-allocated? I *think* that's the
situation that I'm hitting. In that case, we can either try to hunt
down the real 'struct pages' after everything is brought up, or we can
just skip the BUG_ON() if the page is reserved. Any thoughts?
-- Dave
On Thu, Nov 04, 2004 at 05:55:40PM -0800, Dave Hansen wrote:
> What happens when a pte page is bootmem-allocated? I *think* that's the
> situation that I'm hitting. In that case, we can either try to hunt
> down the real 'struct pages' after everything is brought up, or we can
> just skip the BUG_ON() if the page is reserved. Any thoughts?
Skipping BUG_ON if the page is reserved is something you can certainly
try.
However, if all usages are symmetric, the only pte that should ever get
freed is the pte that change_page_attr itself has allocated via
split_large_page.
I tried the debug option just now; without the fixes I get a crash in
X (but not in pageattr.c, it's an invalid page fault in some direct
mapping), which might be a real bug or another false positive.
With the fixes applied I get this, so I can reproduce it at least ;)
------------[ cut here ]------------
kernel BUG at arch/i386/mm/pageattr.c:133!
invalid operand: 0000 [#1]
SMP DEBUG_PAGEALLOC
CPU: 0
EIP: 0060:[<c011979f>] Not tainted
EFLAGS: 00010046 (2.6.5-0-andrea )
EIP is at change_page_attr+0x26f/0x2d0
eax: ffffffff ebx: c1037de0 ecx: c1000100 edx: c1000100
esi: 00000001 edi: 00000000 ebp: 00000163 esp: c1bf3ee0
ds: 007b es: 007b ss: 0068
Process swapper (pid: 1, threadinfo=c1bf2000 task=c1bf1780)
Stack: 0000001b c0008fbc c1bef000 00000246 00000000 00000001 c1037de0
c1037de0
00000001 00000000 00000001 c0119823 c0419780 c1037de0 c013fba5
c1bb049c
00000000 00000078 00000000 00000001 00000000 c1bf1780 00000010
c041a380
Call Trace:
[<c0119823>] kernel_map_pages+0x23/0x4f
[<c013fba5>] __alloc_pages+0x2f8/0x33b
[<c013fc44>] __get_free_pages+0x18/0x25
[<c0142600>] cache_alloc_refill+0x28c/0x530
[<c013d74b>] mempool_alloc_slab+0x0/0xb
[<c0142907>] __kmalloc+0x63/0x65
[<c013d995>] mempool_create+0x3f/0xbf
[<c013d740>] mempool_free_slab+0x0/0xb
[<c04c87fc>] init_bio+0xec/0x1a8
[<c0103199>] init+0x131/0x2ca
[<c0103068>] init+0x0/0x2ca
[<c0106005>] kernel_thread_helper+0x5/0xb
Code: 0f 0b 85 00 23 e7 3a c0 e9 0f fe ff ff 8d 41 10 8b 15 38 6e
<0>Kernel panic: Attempted to kill init!
On Thu, 2004-11-04 at 18:08, Andrea Arcangeli wrote:
> On Thu, Nov 04, 2004 at 05:55:40PM -0800, Dave Hansen wrote:
> > What happens when a pte page is bootmem-allocated? I *think* that's the
> > situation that I'm hitting. In that case, we can either try to hunt
> > down the real 'struct pages' after everything is brought up, or we can
> > just skip the BUG_ON() if the page is reserved. Any thoughts?
>
> Skipping BUG_ON if the page is reserved is something you can certainly
> try.
>
> However if all usages are symmetric, the only pte that should ever get
> freed, is the pte that change_page_attr itself has allocated via
> split_large_page.
>
> I tried the debug option right now, without the fixes I get a crash in
> X (but not in pageattr.c, it's an invalid page fault in some direct
> mapping), that might be a real bug or another false positive.
>
> with the fixes applied I get this, so I can reproduce at least ;)
Here we go again :)
I think we're being naughty about page_count()s for pages that never hit
the page allocator (ones that never hit free_all_bootmem()). They keep
an initial page_count() of 0, which is a no-no if they're used as pte
pages and noticed by __change_page_attr(). This discrepancy isn't
noticed until the page is get'd 512 times, then completely __put'd as
things get allocated into space mapped by the page. The final __put
hits the BUG_ON(). To find this earlier, we could also have a
BUG_ON(!page_count(kpte_page)) in __change_page_attr() right after we
find the kpte_page, in addition to the check after the count is
modified.
This patch defaults the page counts to 1 instead of 0 for all pages in
the zone initialization. Any pages that go through free_all_bootmem()
are set back to a state where they cleanly go into the allocator.
I'm not quite sure if this has any other weird effects, so I'll hold on
to it for a week or so and see if anything turns up.
-- Dave
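A toy user-space rerun of the failure mode Dave describes (sketch only; the
512-entry figure is taken from the report, and the count bookkeeping follows
the fixed __change_page_attr()): a bootmem pte page that starts with a count
of 0 hits the bugcheck on the last __put_page, while a page that starts at
1, as the described patch arranges, ends back at 1:

#include <stdio.h>

/* +1 for every 4k entry changed away from PAGE_KERNEL under
 * DEBUG_PAGEALLOC, -1 for every entry changed back.  A page handed out
 * by the page allocator starts at 1; a bootmem pte page (never seen by
 * free_all_bootmem()) starts at 0. */
static void run(int initial_count, int entries)
{
	int count = initial_count;
	int i;

	for (i = 0; i < entries; i++)
		count++;			/* get_page() on unmap */
	for (i = 0; i < entries; i++) {
		count--;			/* __put_page() on the map-back */
		if (count == 0) {
			printf("start=%d: BUG_ON(!page_count) trips\n",
			       initial_count);
			return;
		}
	}
	printf("start=%d: final count %d, no bugcheck\n", initial_count, count);
}

int main(void)
{
	run(0, 512);	/* bootmem pte page: the final __put_page hits the BUG */
	run(1, 512);	/* normal pte page: the count returns to 1 */
	return 0;
}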
On Thu, Nov 04, 2004 at 06:23:11PM -0800, Dave Hansen wrote:
> I'm not quite sure if this has any other weird effects, so I'll hold on
> to it for a week or so and see if anything turns up.
This fixed the problem for me too.
However, I'm not convinced this is correct; nothing in the kernel should
ever free a piece of bootmem memory after the machine has booted.
If this helps, it also means we found an existing pte (not pmd) with
page_count 0 during the first unmap event (bootmem-allocated). The
transition from mapped to unmapped works fine, but the transition from
unmapped to mapped will throw the pte away and we'll regenerate a 2M pmd
where there was a pte instead. I wonder why there are 4k pages there in
the first place.
Anyway, I understand what's going on now thanks to your debugging, and I
believe the only real fix is to use PageReserved to catch whether we're
working on a newly allocated page or not; I don't like depending on the
page_count being 0 for the bootmem pages like the previous code was
doing. I believe my code would now fall apart even if you were using it
with PSE disabled (nopentium or something). So I have to fix that bit at
least, and I will use PageReserved for that.
The page_count of bootmem pages really doesn't matter since they must
never be freed. It really should remain 0 so we catch if anybody
executes a put_page on it.
I'll fix it up...
On Fri, Nov 05, 2004 at 05:03:08AM +0100, Andrea Arcangeli wrote:
> doing. I believe my code would now fall apart even if you were using it
> with PSE disabled (nopentium or something). So I've to fix that bit at
Ah no, PSE disabled was working fine thanks to the cpu_has_pse check (I had
even already considered that case while doing the patch); only 4k pages set
up at boot time would screw it up, and frankly there's no good reason at all
that such 4k pages should exist at all. However, adding the reserved
check won't make the code more robust. This way I don't need to drop the
memleak detector from the code (there was a true gigantic memleak in
that code that was triggering in the common case, and it got hidden just
to let those 4k pages pass through).
The only reason to set the page_count to 0 on bootmem is to crash on
put_page; nobody should ever care about the page_count of bootmem
memory. Bootmem memory will be PageReserved, and that is the only thing I'd
like to rely on.
So I believe this is the right fix to make change_page_attr aware
of all kinds of pagetables that might be mapping the direct mapping.
This is really what the code should be doing, since we have to reconstruct
a largepage and drop the pte only if the pte itself was not reserved. We
shouldn't mess with 4k pages instantiated at boot time.
--- sles/arch/i386/mm/pageattr.c.~1~ 2004-11-05 02:36:42.000000000 +0100
+++ sles/arch/i386/mm/pageattr.c 2004-11-05 05:18:27.216553680 +0100
@@ -129,13 +129,15 @@ __change_page_attr(struct page *page, pg
} else
BUG();
- /* memleak and potential failed 2M page regeneration */
- BUG_ON(!page_count(kpte_page));
-
- if (cpu_has_pse && (page_count(kpte_page) == 1)) {
- list_add(&kpte_page->lru, &df_list);
- revert_page(kpte_page, address);
- }
+ if (!PageReserved(kpte_page)) {
+ /* memleak and potential failed 2M page regeneration */
+ BUG_ON(!page_count(kpte_page));
+
+ if (cpu_has_pse && (page_count(kpte_page) == 1)) {
+ list_add(&kpte_page->lru, &df_list);
+ revert_page(kpte_page, address);
+ }
+ }
return 0;
}
I'll prepare a new complete patch against mainline for Andrew with some
more comments (only the pageattr.c part; the ioremap.c part was already fine).
Next things to do for the longer term:
1) fix this remaining oops with the debug knob enabled ;) (this is
unrelated to change_page_attr)
Unable to handle kernel paging request at virtual address c04c1a6e
printing eip:
c04c1a6e
*pde = 0053b027
*pte = 004c1000
Oops: 0000 [#1]
SMP DEBUG_PAGEALLOC
CPU: 2
EIP: 0060:[<c04c1a6e>] Not tainted
EFLAGS: 00013287 (2.6.5-0-andrea )
EIP is at mp_find_ioapic+0x0/0x53
eax: 00000016 ebx: 00000016 ecx: 00000001 edx: 00000001
esi: 00000001 edi: 00000016 ebp: 00000016 esp: f7fc1e60
ds: 007b es: 007b ss: 0068
Process X (pid: 2153, threadinfo=f7fc0000 task=f7fc21f0)
Stack: c0115a6f c03da597 00000000 f7e29000 c025cfc6 00000001 00000001
00000016
00000001 00000001 00000016 c01130d0 00000016 00000016 f7e29000
f7fc1eb8
00000016 c0265ae3 00000002 00000001 00000001 00e29000 00400000
c03da600
Call Trace:
[<c0115a6f>] mp_register_gsi+0x23/0x114
[<c025cfc6>] acpi_ut_value_exit+0x30/0x3b
[<c01130d0>] acpi_register_gsi+0x71/0x85
[<c0265ae3>] acpi_pci_irq_enable+0x2fd/0x36e
[<c032a29d>] pcibios_enable_device+0x14/0x18
[<c0235ee1>] pci_enable_device_bars+0x16/0x22
[<c028e477>] radeon_irq_busid+0x74/0x16c
[<c028e403>] radeon_irq_busid+0x0/0x16c
[<c0106403>] sys_get_thread_area+0xfa/0x139
[<c0287e8a>] radeon_ioctl+0xfd/0x125
[<c015a645>] vfs_write+0xa5/0xfc
[<c0106403>] sys_get_thread_area+0xfa/0x139
[<c016be6c>] sys_ioctl+0x2cc/0x492
[<c015a868>] sys_write+0x85/0xc6
[<c0106403>] sys_get_thread_area+0xfa/0x139
[<c0107f9f>] syscall_call+0x7/0xb
[<c0106403>] sys_get_thread_area+0xfa/0x139
Code: Bad EIP value.
2) drop those 4k pages from the first few megs of RAM; they just hurt
performance and waste RAM (I guess the main problem is that this interacts
closely with head.S, and the only communication between head.S and
the C code so far was to check the contents of each pgd and not to touch
what was non-null, IIRC)
On Thu, Oct 28, 2004 at 09:21:04PM +0200, Andrea Arcangeli wrote:
> This first patch fixes silent memleak in the pageattr code that I found
> while searching for the bug Andi fixed in the second patch below
> (basically reference counting in split page was done on the pmd instead
> of the pte).
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
>
> Index: linux-2.5/arch/i386/mm/pageattr.c
> ===================================================================
> RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/i386/mm/pageattr.c,v
> retrieving revision 1.13
> diff -u -p -r1.13 pageattr.c
> --- linux-2.5/arch/i386/mm/pageattr.c 27 Aug 2004 17:35:39 -0000 1.13
> +++ linux-2.5/arch/i386/mm/pageattr.c 28 Oct 2004 19:11:20 -0000
> @@ -117,22 +117,23 @@ __change_page_attr(struct page *page, pg
> kpte_page = virt_to_page(kpte);
> if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
> if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
> - pte_t old = *kpte;
> - pte_t standard = mk_pte(page, PAGE_KERNEL);
> set_pte_atomic(kpte, mk_pte(page, prot));
> - if (pte_same(old,standard))
> - get_page(kpte_page);
> } else {
> struct page *split = split_large_page(address, prot);
> if (!split)
> return -ENOMEM;
> - get_page(kpte_page);
> set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL));
> + kpte_page = split;
> }
> + get_page(kpte_page);
> } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
> set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
> __put_page(kpte_page);
> - }
> + } else
> + BUG();
> +
> + /* memleak and potential failed 2M page regeneration */
> + BUG_ON(!page_count(kpte_page));
>
> if (cpu_has_pse && (page_count(kpte_page) == 1)) {
> list_add(&kpte_page->lru, &df_list);
> Index: linux-2.5/arch/x86_64/mm/pageattr.c
> ===================================================================
> RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/mm/pageattr.c,v
> retrieving revision 1.12
> diff -u -p -r1.12 pageattr.c
> --- linux-2.5/arch/x86_64/mm/pageattr.c 27 Jun 2004 17:54:00 -0000 1.12
> +++ linux-2.5/arch/x86_64/mm/pageattr.c 28 Oct 2004 19:11:20 -0000
> @@ -124,28 +124,33 @@ __change_page_attr(unsigned long address
> kpte_flags = pte_val(*kpte);
> if (pgprot_val(prot) != pgprot_val(ref_prot)) {
> if ((kpte_flags & _PAGE_PSE) == 0) {
> - pte_t old = *kpte;
> - pte_t standard = mk_pte(page, ref_prot);
> -
> set_pte(kpte, mk_pte(page, prot));
> - if (pte_same(old,standard))
> - get_page(kpte_page);
> } else {
> + /*
> + * split_large_page will take the reference for this change_page_attr
> + * on the split page.
> + */
> struct page *split = split_large_page(address, prot, ref_prot);
> if (!split)
> return -ENOMEM;
> - get_page(kpte_page);
> set_pte(kpte,mk_pte(split, ref_prot));
> + kpte_page = split;
> }
> + get_page(kpte_page);
> } else if ((kpte_flags & _PAGE_PSE) == 0) {
> set_pte(kpte, mk_pte(page, ref_prot));
> __put_page(kpte_page);
> - }
> + } else
> + BUG();
>
> - if (page_count(kpte_page) == 1) {
> + switch (page_count(kpte_page)) {
> + case 1:
> save_page(address, kpte_page);
> revert_page(address, ref_prot);
> - }
> + break;
> + case 0:
> + BUG(); /* memleak and failed 2M page regeneration */
> + }
> return 0;
> }
>
This new patch obsoletes the above one and fixes one issue with the
direct mapping near physical address 0, which is apparently still backed
by 4k ptes (i.e. reserved ptes with page_count == 0). That generated a
bugcheck false positive. But the bugcheck must be retained, since it
signaled a true memleak during normal usage (DEBUG_PAGEALLOC is the only
special usage that ends up calling change_page_attr near phys addr 0;
maybe AGP could do it too in theory, but it's generally very unlikely to
trigger without the debugging knob enabled, especially given we start
allocating ram from high zones first). Anyway, this new patch fixes
DEBUG_PAGEALLOC. Thanks to Dave Hansen for spotting the problem with
bootmem memory and for providing all the useful debugging info.
I added a further bugcheck on x86-64, so that if the page_count == 0
bugcheck triggers, we'll also know whether it was a reserved page or not
(i.e. the reserved bugcheck will trigger first). I wrote the paging part of
head.S on x86-64 and I recall I avoided any usage of 4k pages for the
initial direct mapping before firing up the paging engine. So I'm
confident x86-64 didn't require any modification and that it would never
run into bootmem-allocated 4k pages like x86 can. The
additional bugcheck is just in case (mostly to document the
differences between the x86 and x86-64 implementations of
pageattr).
I was overoptimistic that these fixes (both this and Andi's ioremap
symmetry fix combined) would be enough to fix the real-life crash.
Something is still not right, but I doubt there are any more bugs in
pageattr right now. More likely the ioremap stuff is still buggy (in fact an
experimental patch from Andi that doesn't touch pageattr.c at all is
reported to fix the problem in practice; the problem is I don't see why that
patch fixes anything, I suspect some off-by-one bit). Anyway, I'm
quite optimistic the below won't require another modification to the
file (I mean, unless we want to create a true universal API that doesn't
require perfect symmetry of usage).
DEBUG_PAGEALLOC oopses the X server on a 2.6.5-based kernel, but not on a
2.6.9-rc-based kernel, so it could be a true 2.6.5 bug that I've seen
with debug pagealloc (and not a false positive), but that's unrelated to
this topic.
This works for me on x86 (with debug pagealloc enabled too) and x86-64.
Signed-off-by: Andrea Arcangeli <[email protected]>
Index: linux-2.5/arch/i386/mm/pageattr.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/i386/mm/pageattr.c,v
retrieving revision 1.13
diff -u -p -r1.13 pageattr.c
--- linux-2.5/arch/i386/mm/pageattr.c 27 Aug 2004 17:35:39 -0000 1.13
+++ linux-2.5/arch/i386/mm/pageattr.c 5 Nov 2004 07:54:25 -0000
@@ -117,27 +117,35 @@ __change_page_attr(struct page *page, pg
kpte_page = virt_to_page(kpte);
if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
- pte_t old = *kpte;
- pte_t standard = mk_pte(page, PAGE_KERNEL);
set_pte_atomic(kpte, mk_pte(page, prot));
- if (pte_same(old,standard))
- get_page(kpte_page);
} else {
struct page *split = split_large_page(address, prot);
if (!split)
return -ENOMEM;
- get_page(kpte_page);
set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL));
+ kpte_page = split;
}
+ get_page(kpte_page);
} else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
__put_page(kpte_page);
- }
+ } else
+ BUG();
- if (cpu_has_pse && (page_count(kpte_page) == 1)) {
- list_add(&kpte_page->lru, &df_list);
- revert_page(kpte_page, address);
- }
+ /*
+ * If the pte was reserved, it means it was created at boot
+ * time (not via split_large_page) and in turn we must not
+ * replace it with a largepage.
+ */
+ if (!PageReserved(kpte_page)) {
+ /* memleak and potential failed 2M page regeneration */
+ BUG_ON(!page_count(kpte_page));
+
+ if (cpu_has_pse && (page_count(kpte_page) == 1)) {
+ list_add(&kpte_page->lru, &df_list);
+ revert_page(kpte_page, address);
+ }
+ }
return 0;
}
Index: linux-2.5/arch/x86_64/mm/pageattr.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/mm/pageattr.c,v
retrieving revision 1.12
diff -u -p -r1.12 pageattr.c
--- linux-2.5/arch/x86_64/mm/pageattr.c 27 Jun 2004 17:54:00 -0000 1.12
+++ linux-2.5/arch/x86_64/mm/pageattr.c 5 Nov 2004 07:54:25 -0000
@@ -124,28 +124,36 @@ __change_page_attr(unsigned long address
kpte_flags = pte_val(*kpte);
if (pgprot_val(prot) != pgprot_val(ref_prot)) {
if ((kpte_flags & _PAGE_PSE) == 0) {
- pte_t old = *kpte;
- pte_t standard = mk_pte(page, ref_prot);
-
set_pte(kpte, mk_pte(page, prot));
- if (pte_same(old,standard))
- get_page(kpte_page);
} else {
+ /*
+ * split_large_page will take the reference for this change_page_attr
+ * on the split page.
+ */
struct page *split = split_large_page(address, prot, ref_prot);
if (!split)
return -ENOMEM;
- get_page(kpte_page);
set_pte(kpte,mk_pte(split, ref_prot));
+ kpte_page = split;
}
+ get_page(kpte_page);
} else if ((kpte_flags & _PAGE_PSE) == 0) {
set_pte(kpte, mk_pte(page, ref_prot));
__put_page(kpte_page);
- }
+ } else
+ BUG();
+
+ /* on x86-64 the direct mapping set at boot is not using 4k pages */
+ BUG_ON(PageReserved(kpte_page));
- if (page_count(kpte_page) == 1) {
+ switch (page_count(kpte_page)) {
+ case 1:
save_page(address, kpte_page);
revert_page(address, ref_prot);
- }
+ break;
+ case 0:
+ BUG(); /* memleak and failed 2M page regeneration */
+ }
return 0;
}
> I was overoptimistic that these fixes (both this and Andi's ioremap
> symmetry fix combined) would be enough to fix the real life crash.
> Something is still not right, but I doubt there's any more bug in
> pageattr right now. More likely ioremap stuff is still buggy (infact an
At least the NX handling is still broken on i386 (when reverting
back it doesn't clear the NX bit for kernel text)
Also currently I can reproduce another problem on x86-64 with my
latest patch (appended for your reference). The change_page_attr_addr()
addition is not needed in the stock kernel yet, only together with another
patch.
I still don't like how you remove the reversal handling completely.
> > - if (pte_same(old,standard))
> > - get_page(kpte_page);
-Andi
Index: linux/arch/x86_64/mm/ioremap.c
===================================================================
--- linux.orig/arch/x86_64/mm/ioremap.c 2004-10-25 04:47:15.%N +0200
+++ linux/arch/x86_64/mm/ioremap.c 2004-11-02 17:55:45.%N +0100
@@ -16,7 +16,7 @@
#include <asm/fixmap.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-
+#include <asm/proto.h>
static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
unsigned long phys_addr, unsigned long flags)
@@ -98,8 +98,32 @@
return error;
}
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+static int
+ioremap_change_attr(unsigned long phys_addr, unsigned long size,
+ unsigned long flags)
+{
+ int err = 0;
+ if (flags && phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) {
+ unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ unsigned long vaddr = (unsigned long) __va(phys_addr);
+
+ /*
+ * Must use a address here and not struct page because the phys addr
+ * can be a in hole between nodes and not have an memmap entry.
+ */
+ err = change_page_attr_addr(vaddr, npages, __pgprot(flags));
+ if (!err)
+ global_flush_tlb();
+ }
+ return err;
+}
+
/*
- * Generic mapping function (not visible outside):
+ * Generic mapping function
*/
/*
@@ -155,12 +179,17 @@
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP);
+ area = get_vm_area(size, VM_IOREMAP | (flags << 24));
if (!area)
return NULL;
area->phys_addr = phys_addr;
addr = area->addr;
if (remap_area_pages((unsigned long) addr, phys_addr, size, flags)) {
+ remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+ return NULL;
+ }
+ if (ioremap_change_attr(phys_addr, size, flags) < 0) {
+ area->flags &= 0xffffff;
vunmap(addr);
return NULL;
}
@@ -191,43 +220,34 @@
void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
{
- void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
- if (!p)
- return p;
-
- if (phys_addr + size < virt_to_phys(high_memory)) {
- struct page *ppage = virt_to_page(__va(phys_addr));
- unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
- BUG_ON(phys_addr+size > (unsigned long)high_memory);
- BUG_ON(phys_addr + size < phys_addr);
-
- if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
- iounmap(p);
- p = NULL;
- }
- global_flush_tlb();
- }
-
- return p;
+ return __ioremap(phys_addr, size, _PAGE_PCD);
}
void iounmap(volatile void __iomem *addr)
{
- struct vm_struct *p;
+ struct vm_struct *p, **pprev;
+
if (addr <= high_memory)
return;
- p = remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr));
+
+ write_lock(&vmlist_lock);
+ for (p = vmlist, pprev = &vmlist; p != NULL; pprev = &p->next, p = *pprev)
+ if (p->addr == (void *)(PAGE_MASK & (unsigned long)addr))
+ break;
if (!p) {
printk("__iounmap: bad address %p\n", addr);
- return;
- }
-
- if (p->flags && p->phys_addr < virt_to_phys(high_memory)) {
+ goto out_unlock;
+ }
+ *pprev = p->next;
+ unmap_vm_area(p);
+ if ((p->flags >> 24) &&
+ p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) {
change_page_attr(virt_to_page(__va(p->phys_addr)),
p->size >> PAGE_SHIFT,
PAGE_KERNEL);
global_flush_tlb();
}
+out_unlock:
+ write_unlock(&vmlist_lock);
kfree(p);
}
Index: linux/include/asm-x86_64/cacheflush.h
===================================================================
--- linux.orig/include/asm-x86_64/cacheflush.h 2004-06-16 12:23:40.%N +0200
+++ linux/include/asm-x86_64/cacheflush.h 2004-11-02 17:55:45.%N +0100
@@ -25,5 +25,6 @@
void global_flush_tlb(void);
int change_page_attr(struct page *page, int numpages, pgprot_t prot);
+int change_page_attr_addr(unsigned long addr, int numpages, pgprot_t prot);
#endif /* _X8664_CACHEFLUSH_H */
Index: linux/arch/x86_64/mm/pageattr.c
===================================================================
--- linux.orig/arch/x86_64/mm/pageattr.c 2004-08-15 19:45:14.%N +0200
+++ linux/arch/x86_64/mm/pageattr.c 2004-11-02 17:55:45.%N +0100
@@ -111,13 +111,12 @@
}
static int
-__change_page_attr(unsigned long address, struct page *page, pgprot_t prot,
- pgprot_t ref_prot)
+__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
+ pgprot_t ref_prot)
{
pte_t *kpte;
struct page *kpte_page;
unsigned kpte_flags;
-
kpte = lookup_address(address);
if (!kpte) return 0;
kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
@@ -125,20 +124,20 @@
if (pgprot_val(prot) != pgprot_val(ref_prot)) {
if ((kpte_flags & _PAGE_PSE) == 0) {
pte_t old = *kpte;
- pte_t standard = mk_pte(page, ref_prot);
+ pte_t standard = pfn_pte(pfn, ref_prot);
- set_pte(kpte, mk_pte(page, prot));
+ set_pte(kpte, pfn_pte(pfn, prot));
if (pte_same(old,standard))
get_page(kpte_page);
} else {
struct page *split = split_large_page(address, prot, ref_prot);
if (!split)
return -ENOMEM;
- get_page(kpte_page);
+ get_page(split);
set_pte(kpte,mk_pte(split, ref_prot));
}
} else if ((kpte_flags & _PAGE_PSE) == 0) {
- set_pte(kpte, mk_pte(page, ref_prot));
+ set_pte(kpte, pfn_pte(pfn, ref_prot));
__put_page(kpte_page);
}
@@ -162,31 +161,38 @@
*
* Caller must call global_flush_tlb() after this.
*/
-int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
{
int err = 0;
int i;
down_write(&init_mm.mmap_sem);
- for (i = 0; i < numpages; !err && i++, page++) {
- unsigned long address = (unsigned long)page_address(page);
- err = __change_page_attr(address, page, prot, PAGE_KERNEL);
+ for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
+ unsigned long pfn = __pa(address) >> PAGE_SHIFT;
+
+ err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
if (err)
break;
/* Handle kernel mapping too which aliases part of the
* lowmem */
/* Disabled right now. Fixme */
- if (0 && page_to_phys(page) < KERNEL_TEXT_SIZE) {
+ if (0 && __pa(address) < KERNEL_TEXT_SIZE) {
unsigned long addr2;
- addr2 = __START_KERNEL_map + page_to_phys(page);
- err = __change_page_attr(addr2, page, prot,
- PAGE_KERNEL_EXEC);
+ addr2 = __START_KERNEL_map + __pa(address);
+ err = __change_page_attr(addr2, pfn, prot, PAGE_KERNEL_EXEC);
}
}
up_write(&init_mm.mmap_sem);
return err;
}
+/* Don't call this for MMIO areas that may not have a mem_map entry */
+int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+{
+ unsigned long addr = (unsigned long)page_address(page);
+ return change_page_attr_addr(addr, numpages, prot);
+}
+
void global_flush_tlb(void)
{
struct deferred_page *df, *next_df;
On Fri, Nov 05, 2004 at 09:31:02AM +0100, Andi Kleen wrote:
> At least the NX handling is still broken on i386 (when reverting
> back it doesn't clear the NX bit for kernel text)
I'd never use a 32-bit kernel on an x86-64 box, so that's a sort of
low-interest bug to me, but acked.
> I still don't like how you remove the reversal handling completely.
That's to make it fully symmetric. If you're attached to the old
API, I won't mind if you want it back, but if there are three
symmetric users and the third works on a different page, my
code will work while the previous code will corrupt the mapping due to
the refcount going down too fast. If you could mention a single case where
it would make sense not to be symmetric, I would change my mind. I find
it much simpler to remember that as long as I'm always symmetric the
refcounting will go right no matter what the other tasks are doing
around me. The below special case just complicates the API for no good
reason. My problem is that it requires somebody who understands pageattr.c
and fixes bugs in it daily to remember and be able to use your below
API IMHO (I had no idea myself of this undocumented subtle detail below
until I read the code; in fact it was the first thing I removed after
I noticed the asymmetry it generated). But again, no problem, I'll try
hard to remember it if we're going to keep it ;).
> > > - if (pte_same(old,standard))
> > > - get_page(kpte_page);
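To make the symmetry argument concrete, here's a minimal sketch of the
invariant I have in mind (simplified, no PSE/split_large_page handling, so
not the real __change_page_attr): every change away from the reference
protection takes one reference on the kpte page, every change back drops
one, and nobody compares against the "standard" pte first.

#include <linux/mm.h>
#include <asm/pgtable.h>

/*
 * Sketch only: symmetric refcounting on the page holding the ptes.
 * The count does not depend on what other callers did to other pages
 * mapped by the same kpte_page.
 */
static void sketch_change_attr(pte_t *kpte, struct page *kpte_page,
			       struct page *page, pgprot_t prot,
			       pgprot_t ref_prot)
{
	if (pgprot_val(prot) != pgprot_val(ref_prot)) {
		set_pte(kpte, mk_pte(page, prot));
		get_page(kpte_page);		/* always take the reference */
	} else {
		set_pte(kpte, mk_pte(page, ref_prot));
		__put_page(kpte_page);		/* always drop it on revert */
	}
}

With this rule, three users changing three different pages backed by the
same kpte_page leave the count exactly where it started once they all
revert; an asymmetric get (conditional on pte_same) combined with an
unconditional put is what can drive the count down too fast.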
Hello,
the fix for this bug in 2.6.11-rc3 is wrong; I thought I
had posted the right one (which I already applied to all SUSE branches
except the HEAD branch, which is probably in sync with the inferior fix in
mainline). The right fix is the one below. And then of course drop those
useless -PAGE_SIZE adjustments on the p->size parameter passed to
change_page_attr in the arch code. Exposing vmalloc.c's internal knowledge
of the guard-page size to the arch code is an unnecessary breakage of the
vmalloc abstraction; I've been very careful to avoid that and I thought I
had posted it too. It was even quicker for me to fix it right in a single
place than to hand-edit the (not single) change_page_attr callers. I'd like
the right fix to obsolete the hand editing of arch code in multiple places
that breaks the layering. Thanks.
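Concretely, the hand editing I mean looks roughly like this (a sketch
based on the x86-64 iounmap earlier in this thread, not a literal quote of
the mainline change):

	/* with p->size still including the guard page, every arch caller
	 * has to subtract it by hand: */
	change_page_attr(virt_to_page(__va(p->phys_addr)),
			 (p->size - PAGE_SIZE) >> PAGE_SHIFT,
			 PAGE_KERNEL);

	/* with the vmalloc.c fix below, remove_vm_area() hides the guard
	 * page again and the caller can simply pass p->size: */
	change_page_attr(virt_to_page(__va(p->phys_addr)),
			 p->size >> PAGE_SHIFT,
			 PAGE_KERNEL);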
From: Andrea Arcangeli <[email protected]>
Subject: reject zero page vm-area request, align size properly
and hide the guard page from callers like ioremap - this avoids
a kernel crash due to one more page being passed to change_page_attr
Signed-off-by: Andrea Arcangeli <[email protected]>
--- sl9.2/mm/vmalloc.c.~1~ 2004-12-04 01:44:23.352416128 +0100
+++ sl9.2/mm/vmalloc.c 2004-12-04 03:02:37.299827656 +0100
@@ -199,20 +199,22 @@ struct vm_struct *__get_vm_area(unsigned
align = 1ul << bit;
}
addr = ALIGN(start, align);
+ size = PAGE_ALIGN(size);
area = kmalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
return NULL;
- /*
- * We always allocate a guard page.
- */
- size += PAGE_SIZE;
if (unlikely(!size)) {
kfree (area);
return NULL;
}
+ /*
+ * We always allocate a guard page.
+ */
+ size += PAGE_SIZE;
+
write_lock(&vmlist_lock);
for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
if ((unsigned long)tmp->addr < addr) {
@@ -290,6 +292,11 @@ found:
unmap_vm_area(tmp);
*p = tmp->next;
write_unlock(&vmlist_lock);
+
+ /*
+ * Remove the guard page.
+ */
+ tmp->size -= PAGE_SIZE;
return tmp;
}
On Tue, Feb 15, 2005 at 12:15:54AM +0100, Andrea Arcangeli wrote:
> Hello,
>
> the fix for this bug in 2.6.11-rc3 is wrong; I thought I
Can you describe what exactly is wrong?
-Andi
On Tue, Feb 15, 2005 at 11:39:15AM +0100, Andi Kleen wrote:
> On Tue, Feb 15, 2005 at 12:15:54AM +0100, Andrea Arcangeli wrote:
> > Hello,
> >
> > the fix for this bug in 2.6.11-rc3 is wrong; I thought I
>
> Can you describe what exactly is wrong?
The wrong thing is that if I change the size of the guard page in
vmalloc.c, the arch code will break. The guard page is a debugging knob;
people may want to remove it someday to save virtual address space, and
they shouldn't be required to look after N architectures.
Plus people will keep forgetting that p->size has the guard page
included if you don't fix it the right way; x86-64 and x86 have been
corrupting the pageattr of the next physical pages because of that for a
long time.
Plus if you applied the best fix in vmalloc.c, you wouldn't even need to
touch the x86 and x86-64 arch-dependent code; you could call
change_page_attr on p->size without having to do -PAGE_SIZE.
At least you should define a VMALLOC_GUARD_PAGE_SIZE or something; the
hardcoding of p->size - PAGE_SIZE looks wrong to me.
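Something along these lines, where VMALLOC_GUARD_PAGE_SIZE is a name I'm
making up for the example (it doesn't exist in any tree):

/* mm/vmalloc.c (or a private header) would be the only place that
 * knows how big the guard area is: */
#define VMALLOC_GUARD_PAGE_SIZE	PAGE_SIZE

	/* __get_vm_area(): reserve the guard after the aligned size */
	size += VMALLOC_GUARD_PAGE_SIZE;

	/* remove_vm_area(): hide it again before handing the area back */
	tmp->size -= VMALLOC_GUARD_PAGE_SIZE;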
On Tue, Feb 15, 2005 at 11:48:27AM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 15, 2005 at 11:39:15AM +0100, Andi Kleen wrote:
> > On Tue, Feb 15, 2005 at 12:15:54AM +0100, Andrea Arcangeli wrote:
> > > Hello,
> > >
> > > the fix for this bug in 2.6.11-rc3 is wrong; I thought I
> >
> > Can you describe what exactly is wrong?
>
> The wrong thing is that if I change the size of the guard page in
> vmalloc.c, the arch code will break. The guard page is a debugging knob;
> people may want to remove it someday to save virtual address space, and
> they shouldn't be required to look after N architectures.
Ok, so it is merely a cosmetic issue.
I was worried about an actual bug :)
No problem with changing it, but hopefully after 2.6.11.
-Andi
On Tue, Feb 15, 2005 at 11:51:52AM +0100, Andi Kleen wrote:
> Ok, so it is merely a cosmetic issue.
In practice it's solved; I didn't mean we still had a bug, I'm just
suggesting to fix it in a different way.
It's not just for cosmetic reasons that I suggest changing this. My
point is that the _real_ reason we had the bug in the first place is
that people forget that p->size includes the guard page (because it
shouldn't include the guard page). The fundamental problem of vmalloc
exposing the guard page to the callers (which makes it prone to
mistakes, and prone to breakage if somebody needs all the virtual space
and removes the guard page) isn't solved yet.
> I was worried about an actual bug :)
No bugs anymore in practice; the -PAGE_SIZE in arch/x86* is clearly
equivalent to the -PAGE_SIZE in mm/vmalloc.c, in practice it's the same
either way.
> No problem with changing it, but hopefully after 2.6.11.
Ok, fine with me, take your time; it's clearly not urgent because in
practice no bug can be triggered anymore ;). Thanks!
On Tue, 15 Feb 2005, Andrea Arcangeli wrote:
> On Tue, Feb 15, 2005 at 11:51:52AM +0100, Andi Kleen wrote:
>
> It's not just for cosmetic reasons that I suggest changing this. My
> point is that the _real_ reason we had the bug in the first place is
> that people forget that p->size includes the guard page (because it
> shouldn't include the guard page). The fundamental problem of vmalloc
> exposing the guard page to the callers (which makes it prone to
> mistakes, and prone to breakage if somebody needs all the virtual space
> and removes the guard page) isn't solved yet.
Strongly agree with you. I remember this nuisance from large-page-patch
days: the guard page (if any) really should be an implementation detail
private to mm/vmalloc.c, never exposed outside.
> > No problem with changing it, but hopefully after 2.6.11.
>
> Ok, fine with me, take your time; it's clearly not urgent because in
> practice no bug can be triggered anymore ;). Thanks!
Yes, not urgent. The instance of exposure I remember is one of the
PAGE_SIZEs in fs/proc/kcore.c.
Hugh