LinuxLists.cc - fix iounmap and a pageattr memleak (x86 and x86-64)

2004-10-28 19:24:17

Subject: fix iounmap and a pageattr memleak (x86 and x86-64)

This first patch fixes silent memleak in the pageattr code that I found
while searching for the bug Andi fixed in the second patch below
(basically reference counting in split page was done on the pmd instead
of the pte).

Signed-off-by: Andrea Arcangeli <[email protected]>

Index: linux-2.5/arch/i386/mm/pageattr.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/i386/mm/pageattr.c,v
retrieving revision 1.13
diff -u -p -r1.13 pageattr.c
--- linux-2.5/arch/i386/mm/pageattr.c 27 Aug 2004 17:35:39 -0000 1.13
+++ linux-2.5/arch/i386/mm/pageattr.c 28 Oct 2004 19:11:20 -0000
@@ -117,22 +117,23 @@ __change_page_attr(struct page *page, pg
kpte_page = virt_to_page(kpte);
if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
- pte_t old = *kpte;
- pte_t standard = mk_pte(page, PAGE_KERNEL);
set_pte_atomic(kpte, mk_pte(page, prot));
- if (pte_same(old,standard))
- get_page(kpte_page);
} else {
struct page *split = split_large_page(address, prot);
if (!split)
return -ENOMEM;
- get_page(kpte_page);
set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL));
+ kpte_page = split;
}
+ get_page(kpte_page);
} else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
__put_page(kpte_page);
- }
+ } else
+ BUG();
+
+ /* memleak and potential failed 2M page regeneration */
+ BUG_ON(!page_count(kpte_page));

if (cpu_has_pse && (page_count(kpte_page) == 1)) {
list_add(&kpte_page->lru, &df_list);
Index: linux-2.5/arch/x86_64/mm/pageattr.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/mm/pageattr.c,v
retrieving revision 1.12
diff -u -p -r1.12 pageattr.c
--- linux-2.5/arch/x86_64/mm/pageattr.c 27 Jun 2004 17:54:00 -0000 1.12
+++ linux-2.5/arch/x86_64/mm/pageattr.c 28 Oct 2004 19:11:20 -0000
@@ -124,28 +124,33 @@ __change_page_attr(unsigned long address
kpte_flags = pte_val(*kpte);
if (pgprot_val(prot) != pgprot_val(ref_prot)) {
if ((kpte_flags & _PAGE_PSE) == 0) {
- pte_t old = *kpte;
- pte_t standard = mk_pte(page, ref_prot);
-
set_pte(kpte, mk_pte(page, prot));
- if (pte_same(old,standard))
- get_page(kpte_page);
} else {
+ /*
+ * split_large_page will take the reference for this change_page_attr
+ * on the split page.
+ */
struct page *split = split_large_page(address, prot, ref_prot);
if (!split)
return -ENOMEM;
- get_page(kpte_page);
set_pte(kpte,mk_pte(split, ref_prot));
+ kpte_page = split;
}
+ get_page(kpte_page);
} else if ((kpte_flags & _PAGE_PSE) == 0) {
set_pte(kpte, mk_pte(page, ref_prot));
__put_page(kpte_page);
- }
+ } else
+ BUG();

- if (page_count(kpte_page) == 1) {
+ switch (page_count(kpte_page)) {
+ case 1:
save_page(address, kpte_page);
revert_page(address, ref_prot);
- }
+ break;
+ case 0:
+ BUG(); /* memleak and failed 2M page regeneration */
+ }
return 0;
}

This below patch from Andi is also needed to make the above work on x86
(otherwise one of my new above BUGS() will trigger signalling the fact
a bug was there). The below patch creates a subtle dependency that
(_PAGE_PCD << 24) must not be zero. It's not the cleanest thing ever,
but since it's an hardware bitflag I doubt it's going to break.

I'm not sure if I'm allowed to add the signedoff for Andi but I think I
should since he wrote the x86-64 version, if not please let me know (I
only backported it to x86 to test my above changes that otherwise would
trigger one of my added BUGs, and I did the other worthless cosmetical
fixes).

Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Andrea Arcangeli <[email protected]>

Index: linux-2.5/arch/i386/mm/ioremap.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/i386/mm/ioremap.c,v
retrieving revision 1.23
diff -u -p -r1.23 ioremap.c
--- linux-2.5/arch/i386/mm/ioremap.c 11 Sep 2004 02:01:36 -0000 1.23
+++ linux-2.5/arch/i386/mm/ioremap.c 28 Oct 2004 19:11:20 -0000
@@ -152,7 +152,7 @@ void __iomem * __ioremap(unsigned long p
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP);
+ area = get_vm_area(size, VM_IOREMAP | (flags << 24));
if (!area)
return NULL;
area->phys_addr = phys_addr;
@@ -232,7 +232,7 @@ void iounmap(volatile void __iomem *addr
return;
}

- if (p->flags && p->phys_addr < virt_to_phys(high_memory)) {
+ if ((p->flags >> 24) && p->phys_addr < virt_to_phys(high_memory)) {
change_page_attr(virt_to_page(__va(p->phys_addr)),
p->size >> PAGE_SHIFT,
PAGE_KERNEL);
Index: linux-2.5/arch/x86_64/mm/ioremap.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/arch/x86_64/mm/ioremap.c,v
retrieving revision 1.16
diff -u -p -r1.16 ioremap.c
--- linux-2.5/arch/x86_64/mm/ioremap.c 22 Oct 2004 15:01:03 -0000 1.16
+++ linux-2.5/arch/x86_64/mm/ioremap.c 28 Oct 2004 19:11:20 -0000
@@ -128,11 +128,11 @@ void __iomem * __ioremap(unsigned long p
if (phys_addr >= 0xA0000 && last_addr < 0x100000)
return (__force void __iomem *)phys_to_virt(phys_addr);

+#ifndef CONFIG_DISCONTIGMEM
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
if (phys_addr < virt_to_phys(high_memory)) {
-#ifndef CONFIG_DISCONTIGMEM
char *t_addr, *t_end;
struct page *page;

@@ -142,8 +142,8 @@ void __iomem * __ioremap(unsigned long p
for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
if(!PageReserved(page))
return NULL;
-#endif
}
+#endif

/*
* Mappings have to be page-aligned
@@ -155,7 +155,7 @@ void __iomem * __ioremap(unsigned long p
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP);
+ area = get_vm_area(size, VM_IOREMAP | (flags << 24));
if (!area)
return NULL;
area->phys_addr = phys_addr;
@@ -195,12 +195,12 @@ void __iomem *ioremap_nocache (unsigned
if (!p)
return p;

- if (phys_addr + size < virt_to_phys(high_memory)) {
+ if (phys_addr + size - 1 < virt_to_phys(high_memory)) {
struct page *ppage = virt_to_page(__va(phys_addr));
unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

- BUG_ON(phys_addr+size > (unsigned long)high_memory);
- BUG_ON(phys_addr + size < phys_addr);
+ BUG_ON(phys_addr+size >= (unsigned long)high_memory);
+ BUG_ON(phys_addr + size <= phys_addr);

if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
iounmap(p);
@@ -223,7 +223,7 @@ void iounmap(volatile void __iomem *addr
return;
}

- if (p->flags && p->phys_addr < virt_to_phys(high_memory)) {
+ if ((p->flags >> 24) && p->phys_addr + p->size < virt_to_phys(high_memory)) {
change_page_attr(virt_to_page(__va(p->phys_addr)),
p->size >> PAGE_SHIFT,
PAGE_KERNEL);

If you give them some beating on -mm let me know if you've any problem.
(running with this stuff on my machines right now, so far so good)

2004-11-02 21:23:12

by Dave Hansen

[permalink] [raw]

Subject: Re: fix iounmap and a pageattr memleak (x86 and x86-64)

This patch:

> From: Andrea Arcangeli <[email protected]>
>
> - fix silent memleak in the pageattr code that I found while searching
> for the bug Andi fixed in the second patch below (basically reference
> counting in split page was done on the pmd instead of the pte).
>
> - Part of this patch is also needed to make the above work on x86 (otherwise
> one of my new above BUGS() will trigger signalling the fact a bug was
> there). The below patch creates a subtle dependency that (_PAGE_PCD << 24)
> must not be zero. It's not the cleanest thing ever, but since it's an
> hardware bitflag I doubt it's going to break.
>
> Signed-off-by: Andi Kleen <[email protected]>
> Signed-off-by: Andrea Arcangeli <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> ---
>
> 25-akpm/arch/i386/mm/ioremap.c | 4 ++--
> 25-akpm/arch/i386/mm/pageattr.c | 13 +++++++------
> 25-akpm/arch/x86_64/mm/ioremap.c | 14 +++++++-------
> 25-akpm/arch/x86_64/mm/pageattr.c | 23 ++++++++++++++---------
> 4 files changed, 30 insertions(+), 24 deletions(-)

is hitting this BUG() during bootup:

/* memleak and potential failed 2M page regeneration */
BUG_ON(!page_count(kpte_page));

in 2.6.10-rc1-mm2.

Dentry cache hash table entries: 131072 (order: 7, 524288 bytes)
Inode-cache hash table entries: 65536 (order: 6, 262144 bytes)
Memory: 511144k/524288k available (1856k kernel code, 12608k reserved,
1186k data, 164k init, 0k highmem)
Checking if this processor honours the WP bit even in supervisor mode... Ok.
Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
------------[ cut here ]------------
kernel BUG at arch/i386/mm/pageattr.c:136!
invalid operand: 0000 [#1]
SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 0
EIP: 0060:[<c0113f48>] Not tainted VLI
EFLAGS: 00010046 (2.6.10-rc1-mm2)
EIP is at __change_page_attr+0x28c/0x358
eax: ffffffff ebx: 017ff163 ecx: 00000000 edx: c10001e0
esi: 00000000 edi: c000fff8 ebp: c10001e0 esp: c03f9d98
ds: 007b es: 007b ss: 0068
Process swapper (pid: 0, threadinfo=c03f9000 task=c0345b40)
Stack: c102ffe0 00000000 00000000 00000046 c0113ceb c17f9000 c102ff20
c102ff20 017ff163 00000000 017ff163 00000000 c0113ceb c17fe000
c102ffc0 c102ffc0 c1000000 c17ff000 c000fff8 017ff163 00000000
017ff163 00000000 00000000
Call Trace:
[<c0113ceb>] __change_page_attr+0x2f/0x358
[<c0113ceb>] __change_page_attr+0x2f/0x358
[<c011404a>] change_page_attr+0x36/0x54
[<c0114148>] kernel_map_pages+0x30/0x5f
[<c0137d80>] __alloc_pages+0x340/0x350
[<c0137dad>] __get_free_pages+0x1d/0x30
[<c013adfa>] kmem_getpages+0x26/0xd4
[<c013c221>] cache_grow+0xb1/0x150
[<c013c84a>] cache_alloc_refill+0x232/0x280
[<c013ccbe>] kmem_cache_alloc+0x5a/0x78
[<c01d4970>] idr_pre_get+0x1c/0x44
[<c0181b60>] sysfs_fill_super+0x0/0xa4
[<c01572cc>] set_anon_super+0x10/0xb8
[<c0156cff>] sget+0xb3/0x148
[<c0181b60>] sysfs_fill_super+0x0/0xa4
[<c0157636>] get_sb_single+0x26/0x8c
[<c0157608>] compare_single+0x0/0x8
[<c01572bc>] set_anon_super+0x0/0xb8
[<c0181c1d>] sysfs_get_sb+0x19/0x1d
[<c0181b60>] sysfs_fill_super+0x0/0xa4
[<c01576ea>] do_kern_mount+0x4e/0xd0
[<c015777d>] kern_mount+0x11/0x15
[<c0409962>] sysfs_init+0x1e/0x50
[<c0409430>] mnt_init+0xb4/0xc0
[<c040917a>] vfs_caches_init+0x7e/0x94
[<c03fa831>] start_kernel+0x12d/0x150
Code: c7 0f 75 f5 f0 ff 4d 04 eb 08 0f 0b 85 00 88 c1 2d c0 8b 45 00 89
ea f6 c4 80 74 07 8b 55 0c 8d 74 26 00 8b 42 04 83 f8 ff 75 08 <0f> 0b
88 00 88 c1 2d c0 a1 ac 6c 34 c0 a8 08 0f 84 aa 00 00 00

I'm tracking down now to see exactly what's going on. This just a
regular, plain 4-way x86 box with 4GB of RAM. Removing that BUG_ON()
lets me boot just fine.

kpte: c000fff8
address: c17ff000
kpte_page: c10001e0
pgprot_val(prot): 00000163
pgprot_val(PAGE_KERNEL): 00000163
(pte_val(*kpte) & _PAGE_PSE): 00000000

2004-11-02 22:26:49

by Andrew Morton

[permalink] [raw]

Subject: Re: fix iounmap and a pageattr memleak (x86 and x86-64)

Dave Hansen <[email protected]> wrote:
>
> Andrea Arcangeli wrote:
> > Still I recommend investigating _why_ debug_pagealloc is violating the
> > API. It might not be necessary to wait for the pageattr universal
> > feature to make DEBUG_PAGEALLOC work safe.
>
> OK, good to know. But, for now, can we pull this out of -mm? Or, at
> least that BUG_ON()? DEBUG_PAGEALLOC is an awfully powerful debugging
> tool to just be removed like this.

If we make it a WARN_ON, will that cause a complete storm of output?

2004-11-02 22:36:10

by Dave Hansen

[permalink] [raw]

Subject: Re: fix iounmap and a pageattr memleak (x86 and x86-64)

Andrew Morton wrote:
> Dave Hansen <[email protected]> wrote:
>
>>Andrea Arcangeli wrote:
>>
>>>Still I recommend investigating _why_ debug_pagealloc is violating the
>>>API. It might not be necessary to wait for the pageattr universal
>>>feature to make DEBUG_PAGEALLOC work safe.
>>
>>OK, good to know. But, for now, can we pull this out of -mm? Or, at
>>least that BUG_ON()? DEBUG_PAGEALLOC is an awfully powerful debugging
>>tool to just be removed like this.
>
> If we make it a WARN_ON, will that cause a complete storm of output?

Yeah, just tried it. I hit a couple hundred of them before I got to init.

2004-11-02 22:42:01

by Jason Baron

[permalink] [raw]

Subject: Re: fix iounmap and a pageattr memleak (x86 and x86-64)

On Tue, 2 Nov 2004, Dave Hansen wrote:

> This patch:
>
> > From: Andrea Arcangeli <[email protected]>
> >
> > - fix silent memleak in the pageattr code that I found while searching
> > for the bug Andi fixed in the second patch below (basically reference
> > counting in split page was done on the pmd instead of the pte).
> >
> > - Part of this patch is also needed to make the above work on x86 (otherwise
> > one of my new above BUGS() will trigger signalling the fact a bug was
> > there). The below patch creates a subtle dependency that (_PAGE_PCD << 24)
> > must not be zero. It's not the cleanest thing ever, but since it's an
> > hardware bitflag I doubt it's going to break.
> >
> > Signed-off-by: Andi Kleen <[email protected]>
> > Signed-off-by: Andrea Arcangeli <[email protected]>
> > Signed-off-by: Andrew Morton <[email protected]>
> > ---
> >
> > 25-akpm/arch/i386/mm/ioremap.c | 4 ++--
> > 25-akpm/arch/i386/mm/pageattr.c | 13 +++++++------
> > 25-akpm/arch/x86_64/mm/ioremap.c | 14 +++++++-------
> > 25-akpm/arch/x86_64/mm/pageattr.c | 23 ++++++++++++++---------
> > 4 files changed, 30 insertions(+), 24 deletions(-)
>
> is hitting this BUG() during bootup:
>
> /* memleak and potential failed 2M page regeneration */
> BUG_ON(!page_count(kpte_page));
>
> in 2.6.10-rc1-mm2.
>

I've seen the page_count being -1 (not sure why), for a number of pages in
the identity mapped region...So the BUG() on 0 doesn't seem valid to me.

Also, in order to tell if the pages should be merged back to create a huge
page, i don't see how the patch differentiates b/w pages that were split
and those that weren't simply based on the page_count....

-Jason

2004-11-02 22:51:28

by Dave Hansen

[permalink] [raw]

Subject: Re: fix iounmap and a pageattr memleak (x86 and x86-64)

---

memhotplug1-dave/arch/i386/mm/pageattr.c | 7 +++++--
memhotplug1-dave/include/linux/mm.h | 3 +++
memhotplug1-dave/mm/page_alloc.c | 5 ++++-
3 files changed, 12 insertions(+), 3 deletions(-)

diff -puN include/linux/mm.h~Z3-page_debugging include/linux/mm.h
--- memhotplug1/include/linux/mm.h~Z3-page_debugging 2004-11-02 14:29:51.000000000 -0800
+++ memhotplug1-dave/include/linux/mm.h 2004-11-02 14:37:08.000000000 -0800
@@ -245,6 +245,9 @@ struct page {
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ int mapped;
+#endif
};

#ifdef CONFIG_MEMORY_HOTPLUG
diff -puN arch/i386/mm/pageattr.c~Z3-page_debugging arch/i386/mm/pageattr.c
--- memhotplug1/arch/i386/mm/pageattr.c~Z3-page_debugging 2004-11-02 14:31:07.000000000 -0800
+++ memhotplug1-dave/arch/i386/mm/pageattr.c 2004-11-02 14:41:00.000000000 -0800
@@ -153,7 +153,7 @@ __change_page_attr(struct page *page, pg
printk("pgprot_val(PAGE_KERNEL): %08lx\n", pgprot_val(PAGE_KERNEL));
printk("(pte_val(*kpte) & _PAGE_PSE): %08lx\n", (pte_val(*kpte) & _PAGE_PSE));
printk("path: %d\n", path);
- BUG();
+ WARN_ON(1);
}

if (cpu_has_pse && (page_count(kpte_page) == 1)) {
@@ -224,7 +224,10 @@ void kernel_map_pages(struct page *page,
/* the return value is ignored - the calls cannot fail,
* large pages are disabled at boot time.
*/
- change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
+ if (enable && !page->mapped)
+ change_page_attr(page, numpages, PAGE_KERNEL);
+ else if (!enable && page->mapped)
+ change_page_attr(page, numpages, __pgprot(0));
/* we should perform an IPI and flush all tlbs,
* but that can deadlock->flush only current cpu.
*/
diff -puN mm/page_alloc.c~Z3-page_debugging mm/page_alloc.c
--- memhotplug1/mm/page_alloc.c~Z3-page_debugging 2004-11-02 14:37:53.000000000 -0800
+++ memhotplug1-dave/mm/page_alloc.c 2004-11-02 14:42:56.000000000 -0800
@@ -1840,8 +1840,11 @@ void __devinit memmap_init_zone(unsigned
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
- if (!is_highmem_idx(zone))
+ if (!is_highmem_idx(zone)) {
set_page_address(page, __va(start_pfn << PAGE_SHIFT));
+ page->mapped = 1;
+ } else
+ page->mapped = 0;
#endif
start_pfn++;
}
_

Attachments:

Z3-page_debugging.patch (2.36 kB)

2004-11-02 23:01:18

by Andrew Morton

[permalink] [raw]

Subject: Re: fix iounmap and a pageattr memleak (x86 and x86-64)

Dave Hansen <[email protected]> wrote:
>
> Andrea Arcangeli wrote:
> > Still I recommend investigating _why_ debug_pagealloc is violating the
> > API. It might not be necessary to wait for the pageattr universal
> > feature to make DEBUG_PAGEALLOC work safe.
>
> This makes the DEBUG_PAGEALLOC stuff symmetric enough to boot for me,
> and it's pretty damn simple. Any ideas for doing this without bloating
> 'struct page', even in the debugging case?

You could use a bit from page->flags. But CONFIG_DEBUG_PAGEALLOC uses so
much additional memory nobody would notice anyway.

hm. Or maybe page->mapcount is available on these pages.

2004-11-02 23:11:53

by Dave Hansen

[permalink] [raw]

Subject: Re: fix iounmap and a pageattr memleak (x86 and x86-64)

BTW, please don't anyone going even trying to apply that piece of crap I
just sent out, I just wanted to demonstrate what solves my immediate
problem.

2004-11-02 23:32:42