Hi, Adam, Bill, Hugh,
Does this look like a reasonable patch to send to akpm for -mm?
This patch makes some slight tweaks / cleanups to the fault handling
path for huge pages in -mm. My main motivation is to make it simpler
to fit COW in, but along the way it addresses a few minor problems
with the existing code:
- The check against i_size was duplicated: once in
find_lock_huge_page() and again in hugetlb_fault() after taking the
page_table_lock. We only really need the locked one, so remove the
other.
- find_lock_huge_page() didn't, in fact, lock the page if it newly
allocated one, rather than finding it in the page cache already. As
far as I can tell this is a bug, so the patch corrects it.
- find_lock_huge_page() isn't a great name, since it does extra things
not analogous to find_lock_page(). Rename it
find_or_alloc_huge_page() which is closer to the mark.
Signed-off-by: David Gibson <[email protected]>
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c 2005-10-26 11:18:39.000000000 +1000
+++ working-2.6/mm/hugetlb.c 2005-10-26 11:34:32.000000000 +1000
@@ -336,8 +336,8 @@
flush_tlb_range(vma, start, end);
}
-static struct page *find_lock_huge_page(struct address_space *mapping,
- unsigned long idx)
+static struct page *find_or_alloc_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
struct page *page;
int err;
@@ -347,21 +347,19 @@
retry:
page = find_lock_page(mapping, idx);
if (page)
- goto out;
-
- /* Check to make sure the mapping hasn't been truncated */
- size = i_size_read(inode) >> HPAGE_SHIFT;
- if (idx >= size)
- goto out;
+ return page;
if (hugetlb_get_quota(mapping))
- goto out;
+ return NULL;
+
page = alloc_huge_page();
if (!page) {
hugetlb_put_quota(mapping);
- goto out;
+ return NULL;
}
+ lock_page(page);
+
err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
if (err) {
put_page(page);
@@ -370,50 +368,49 @@
goto retry;
page = NULL;
}
-out:
+
return page;
}
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access, pte_t *ptep)
{
- int ret = VM_FAULT_SIGBUS;
+ int ret;
unsigned long idx;
unsigned long size;
- pte_t *pte;
struct page *page;
struct address_space *mapping;
- pte = huge_pte_alloc(mm, address);
- if (!pte)
- goto out;
-
mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- /*
- * Use page lock to guard against racing truncation
- * before we get page_table_lock.
- */
- page = find_lock_huge_page(mapping, idx);
+ /* This returns a locked page, which keeps us safe in the
+ * event of a race with truncate() */
+ page = find_or_alloc_huge_page(mapping, idx);
if (!page)
- goto out;
+ return VM_FAULT_SIGBUS;
spin_lock(&mm->page_table_lock);
+
+ ret = VM_FAULT_SIGBUS;
+
size = i_size_read(mapping->host) >> HPAGE_SHIFT;
if (idx >= size)
goto backout;
ret = VM_FAULT_MINOR;
+
if (!pte_none(*pte))
+ /* oops, someone instantiated this PTE before us */
goto backout;
add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+
spin_unlock(&mm->page_table_lock);
unlock_page(page);
-out:
+
return ret;
backout:
@@ -421,7 +418,29 @@
hugetlb_put_quota(mapping);
unlock_page(page);
put_page(page);
- goto out;
+
+ return ret;
+}
+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ pte_t entry;
+
+ ptep = huge_pte_alloc(mm, address);
+ if (! ptep)
+ goto out;
+
+ entry = *ptep;
+
+ if (pte_none(entry))
+ return hugetlb_no_page(mm, vma, address, ptep);
+
+ /* we could get here if another thread instantiated the pte
+ * before the test above */
+
+ return VM_FAULT_SIGBUS;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson
On Wed, Oct 26, 2005 at 12:00:55PM +1000, David Gibson wrote:
> Hi, Adam, Bill, Hugh,
>
> Does this look like a reasonable patch to send to akpm for -mm.
Ahem. Or rather this version, which actually compiles.
This patch makes some slight tweaks / cleanups to the fault handling
path for huge pages in -mm. My main motivation is to make it simpler
to fit COW in, but along the way it addresses a few minor problems
with the existing code:
- The check against i_size was duplicated: once in
find_lock_huge_page() and again in hugetlb_fault() after taking the
page_table_lock. We only really need the locked one, so remove the
other.
- find_lock_huge_page() didn't, in fact, lock the page if it newly
allocated one, rather than finding it in the page cache already. As
far as I can tell this is a bug, so the patch corrects it.
- find_lock_huge_page() isn't a great name, since it does extra things
not analogous to find_lock_page(). Rename it
find_or_alloc_huge_page() which is closer to the mark.
Signed-off-by: David Gibson <[email protected]>
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c 2005-10-26 11:18:39.000000000 +1000
+++ working-2.6/mm/hugetlb.c 2005-10-26 12:46:29.000000000 +1000
@@ -336,32 +336,28 @@
flush_tlb_range(vma, start, end);
}
-static struct page *find_lock_huge_page(struct address_space *mapping,
- unsigned long idx)
+static struct page *find_or_alloc_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
struct page *page;
int err;
- struct inode *inode = mapping->host;
- unsigned long size;
retry:
page = find_lock_page(mapping, idx);
if (page)
- goto out;
-
- /* Check to make sure the mapping hasn't been truncated */
- size = i_size_read(inode) >> HPAGE_SHIFT;
- if (idx >= size)
- goto out;
+ return page;
if (hugetlb_get_quota(mapping))
- goto out;
+ return NULL;
+
page = alloc_huge_page();
if (!page) {
hugetlb_put_quota(mapping);
- goto out;
+ return NULL;
}
+ lock_page(page);
+
err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
if (err) {
put_page(page);
@@ -370,50 +366,49 @@
goto retry;
page = NULL;
}
-out:
+
return page;
}
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
{
- int ret = VM_FAULT_SIGBUS;
+ int ret;
unsigned long idx;
unsigned long size;
- pte_t *pte;
struct page *page;
struct address_space *mapping;
- pte = huge_pte_alloc(mm, address);
- if (!pte)
- goto out;
-
mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- /*
- * Use page lock to guard against racing truncation
- * before we get page_table_lock.
- */
- page = find_lock_huge_page(mapping, idx);
+ /* This returns a locked page, which keeps us safe in the
+ * event of a race with truncate() */
+ page = find_or_alloc_huge_page(mapping, idx);
if (!page)
- goto out;
+ return VM_FAULT_SIGBUS;
spin_lock(&mm->page_table_lock);
+
+ ret = VM_FAULT_SIGBUS;
+
size = i_size_read(mapping->host) >> HPAGE_SHIFT;
if (idx >= size)
goto backout;
ret = VM_FAULT_MINOR;
- if (!pte_none(*pte))
+
+ if (!pte_none(*ptep))
+ /* oops, someone instantiated this PTE before us */
goto backout;
add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+ set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+
spin_unlock(&mm->page_table_lock);
unlock_page(page);
-out:
+
return ret;
backout:
@@ -421,7 +416,30 @@
hugetlb_put_quota(mapping);
unlock_page(page);
put_page(page);
- goto out;
+
+ return ret;
+}
+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ pte_t entry;
+
+ ptep = huge_pte_alloc(mm, address);
+ if (! ptep)
+ /* OOM */
+ return VM_FAULT_SIGBUS;
+
+ entry = *ptep;
+
+ if (pte_none(entry))
+ return hugetlb_no_page(mm, vma, address, ptep);
+
+ /* we could get here if another thread instantiated the pte
+ * before the test above */
+
+ return VM_FAULT_SIGBUS;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson
Hi,
I am getting this VM error on 2.4.28kernel(RAM-768MB,
No Swap and the root file system, whose size is around
300MB is loaded as initrd).
After the error the processes are getting killed and
system is rebooted. I do not understand why the MM
is trying to allocate pages from HIGH Memory
(gfp=0x1d2/0) when I dont have High memory and the
kernel is also not enabled to support High memory. I
have put the printk's to see how kswapd is woken up to
free unused pages because I dont want to run out of
memory.
Here is log:
try_to_free_pages_zone
try_to_free_pages_zone
try_to_free_pages_zone
__alloc_pages: 0-order allocation failed (gfp=0x1d2/0)
VM: killing process cp_test
Received SIGCHLDtry_to_free_pages_zone
cp_test exited (PID = 213).Invalid TFTP URL for
exporting crash-dumps...
__alloc_pages: 0-order allocation failed (gfp=0x1f0/0)
ry_to_free_pages_zone
try_to_free_pages_zone
try_to_free_pages_zone
Thanks,
Vandana
__________________________________
Yahoo! FareChase: Search multiple travel sites in one click.
http://farechase.yahoo.com
Here are some more inputs on what I tried to do.
My system setup is:
2.4.28 kernel with 768MB RAM (final configuration will
be 1GB), No Swap Space and the
Root File system is mounted as initrd, whose size is
around 300MB.
After few hours,the system goes out of memory and
starts killing processes
As the whole system is an in-memory file system, the
allocated pages are not freed but are put into the active
or inactive page list.
I increased the watermark values of min, low and high
pages to a higher value than the default value to invoke
the kernel thread "kswapd", which frees some of the
pages. This is the observation I have.
Also the kernel is not configured with HIGH_MEM
support but still I can see some page allocation done
from high memory... this is my guess from the
following message (gfp = 0x1d2)
"__alloc_pages: 0-order allocation failed
(gfp=0x1d2/0"
After 3-4 hours some of the processes get killed.
Given below :
try_to_free_pages_zone
try_to_free_pages_zone
try_to_free_pages_zone
__alloc_pages: 0-order allocation failed (gfp=0x1d2/0)
VM: killing process cp_test
Received SIGCHLDtry_to_free_pages_zone
cp_test exited (PID = 213).Invalid TFTP URL for
exporting crash-dumps...
__alloc_pages: 0-order allocation failed (gfp=0x1f0/0)
ry_to_free_pages_zone
try_to_free_pages_zone
try_to_free_pages_zone
How can I avoid this "Out of memory" issue? Does swap
space help in this case, or what is the significance
of having swap space for a ramdisk file system?
Your inputs will be of great help. Need to fix this
issue as soon as possible.
Thanks,
Vandana
--- salve vandana <[email protected]> wrote:
> Hi,
>
> I am getting this VM error on
> 2.4.28kernel(RAM-768MB,
> No Swap and the root file system,whose size is
> around
> 300MB is loaded as initrd).
> After the error the processes are getting killed and
> system is rebooted. I am not understanding why the
> MM
> is trying is allocate pages from HIGH Memory
> (gfp=0x1d2/0) when I dont have High memory and the
> kernel is also not enabled to support High memory. I
> have put the printk's to see how kswapd is woken up
> to
> free unused pages because I dont want to run out of
> memory.
>
> Here is log:
>
> try_to_free_pages_zone
> try_to_free_pages_zone
> try_to_free_pages_zone
> __alloc_pages: 0-order allocation failed
> (gfp=0x1d2/0)
> VM: killing process cp_test
> Received SIGCHLDtry_to_free_pages_zone
> cp_test exited (PID = 213).Invalid TFTP URL for
> exporting crash-dumps...
> __alloc_pages: 0-order allocation failed
> (gfp=0x1f0/0)
> ry_to_free_pages_zone
> try_to_free_pages_zone
> try_to_free_pages_zone
>
> Thanks,
> Vandana
>
>
>
>
>
> __________________________________
> Yahoo! FareChase: Search multiple travel sites in
> one click.
> http://farechase.yahoo.com
>
> --
> To unsubscribe, send a message with 'unsubscribe
> linux-mm' in
> the body to [email protected]. For more info on
> Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]">
> [email protected] </a>
>
__________________________________
Yahoo! Mail - PC Magazine Editors' Choice 2005
http://mail.yahoo.com
David Gibson wrote on Tuesday, October 25, 2005 7:49 PM
> +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long address, int write_access)
> +{
> + pte_t *ptep;
> + pte_t entry;
> +
> + ptep = huge_pte_alloc(mm, address);
> + if (! ptep)
> + /* OOM */
> + return VM_FAULT_SIGBUS;
> +
> + entry = *ptep;
> +
> + if (pte_none(entry))
> + return hugetlb_no_page(mm, vma, address, ptep);
> +
> + /* we could get here if another thread instantiated the pte
> + * before the test above */
> +
> + return VM_FAULT_SIGBUS;
> }
Are you sure about the last return? Looks like a typo to me, if *ptep
is present, it should return VM_FAULT_MINOR.
But the bigger question is: don't you need some lock when checking *ptep?
- Ken
David Gibson wrote on Tuesday, October 25, 2005 7:49 PM
> - find_lock_huge_page() didn't, in fact, lock the page if it newly
> allocated one, rather than finding it in the page cache already. As
> far as I can tell this is a bug, so the patch corrects it.
add_to_page_cache will lock the page if it was successfully added to the
address space radix tree. I don't see a bug that you are seeing.
On Wed, 2005-10-26 at 12:48 +1000, David Gibson wrote:
> On Wed, Oct 26, 2005 at 12:00:55PM +1000, David Gibson wrote:
> > Hi, Adam, Bill, Hugh,
> >
> > Does this look like a reasonable patch to send to akpm for -mm.
>
> Ahem. Or rather this version, which actually compiles.
>
> This patch makes some slight tweaks / cleanups to the fault handling
> path for huge pages in -mm. My main motivation is to make it simpler
> to fit COW in, but along the way it addresses a few minor problems
> with the existing code:
>
> - The check against i_size was duplicated: once in
> find_lock_huge_page() and again in hugetlb_fault() after taking the
> page_table_lock. We only really need the locked one, so remove the
> other.
Fair enough.
> - find_lock_huge_page() didn't, in fact, lock the page if it newly
> allocated one, rather than finding it in the page cache already. As
> far as I can tell this is a bug, so the patch corrects it.
Thanks. I was about to post a fix for this too. It is reproducible in
the case where two threads race in the fault handler and both do
alloc_huge_page(). In that case, the loser will fail to insert his page
into the page cache and will call put_page() which has a
BUG_ON(page_count(page) == 0).
> - find_lock_huge_page() isn't a great name, since it does extra things
> not analagous to find_lock_page(). Rename it
> find_or_alloc_huge_page() which is closer to the mark.
I'll agree with the above. I am not all that committed to the current
layout and what you have here is a little closer to the thinking in my
original patch ;)
<snip>
> +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long address, int write_access)
> +{
> + pte_t *ptep;
> + pte_t entry;
> +
> + ptep = huge_pte_alloc(mm, address);
> + if (! ptep)
> + /* OOM */
> + return VM_FAULT_SIGBUS;
> +
> + entry = *ptep;
> +
> + if (pte_none(entry))
> + return hugetlb_no_page(mm, vma, address, ptep);
> +
> + /* we could get here if another thread instantiated the pte
> + * before the test above */
> +
> + return VM_FAULT_SIGBUS;
> }
I'll agree with Ken that the last return should probably still be
VM_FAULT_MINOR.
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
On Wed, Oct 26, 2005 at 11:44:52AM -0700, Chen, Kenneth W wrote:
> David Gibson wrote on Tuesday, October 25, 2005 7:49 PM
> > +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> > + unsigned long address, int write_access)
> > +{
> > + pte_t *ptep;
> > + pte_t entry;
> > +
> > + ptep = huge_pte_alloc(mm, address);
> > + if (! ptep)
> > + /* OOM */
> > + return VM_FAULT_SIGBUS;
> > +
> > + entry = *ptep;
> > +
> > + if (pte_none(entry))
> > + return hugetlb_no_page(mm, vma, address, ptep);
> > +
> > + /* we could get here if another thread instantiated the pte
> > + * before the test above */
> > +
> > + return VM_FAULT_SIGBUS;
> > }
>
> Are you sure about the last return? Looks like a typo to me, if *ptep
> is present, it should return VM_FAULT_MINOR.
Oops, yes, thinko. Corrected patch shortly.
> But the bigger question is: don't you need some lock when checking *ptep?
No, I'm pretty sure that's ok. In a sense, the test here is only an
optimization: we recheck the pte with lock held in hugetlb_no_page()
before attempting the set_pte_at().
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson
David Gibson wrote on Wednesday, October 26, 2005 5:05 PM
> On Wed, Oct 26, 2005 at 11:44:52AM -0700, Chen, Kenneth W wrote:
> > David Gibson wrote on Tuesday, October 25, 2005 7:49 PM
> > > +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> > > + unsigned long address, int write_access)
> > > +{
> > > + pte_t *ptep;
> > > + pte_t entry;
> > > +
> > > + ptep = huge_pte_alloc(mm, address);
> > > + if (! ptep)
> > > + /* OOM */
> > > + return VM_FAULT_SIGBUS;
> > > +
> > > + entry = *ptep;
> > > +
> > > + if (pte_none(entry))
> > > + return hugetlb_no_page(mm, vma, address, ptep);
> > > +
> > > + /* we could get here if another thread instantiated the pte
> > > + * before the test above */
> > > +
> > > + return VM_FAULT_SIGBUS;
> > > }
> >
> > Are you sure about the last return? Looks like a typo to me, if *ptep
> > is present, it should return VM_FAULT_MINOR.
>
> Oops, yes, thinko. Corrected patch shortly.
While you're at it, I think it would be preferable that the first return be
VM_FAULT_OOM, your thoughts?
- Ken
On Wed, Oct 26, 2005 at 04:46:16PM -0500, Adam Litke wrote:
> On Wed, 2005-10-26 at 12:48 +1000, David Gibson wrote:
> > On Wed, Oct 26, 2005 at 12:00:55PM +1000, David Gibson wrote:
> > > Hi, Adam, Bill, Hugh,
> > >
> > > Does this look like a reasonable patch to send to akpm for -mm.
> >
> > Ahem. Or rather this version, which actually compiles.
> >
> > This patch makes some slight tweaks / cleanups to the fault handling
> > path for huge pages in -mm. My main motivation is to make it simpler
> > to fit COW in, but along the way it addresses a few minor problems
> > with the existing code:
> >
> > - The check against i_size was duplicated: once in
> > find_lock_huge_page() and again in hugetlb_fault() after taking the
> > page_table_lock. We only really need the locked one, so remove the
> > other.
>
> Fair enough.
>
> > - find_lock_huge_page() didn't, in fact, lock the page if it newly
> > allocated one, rather than finding it in the page cache already. As
> > far as I can tell this is a bug, so the patch corrects it.
>
> Thanks. I was about to post a fix for this too. It is reproducible in
> the case where two threads race in the fault handler and both do
> alloc_huge_page(). In that case, the loser will fail to insert his page
> into the page cache and will call put_page() which has a
> BUG_ON(page_count(page) == 0).
Hrm.. I don't think this is due to that. As Ken points out,
add_to_page_cache() does indeed lock the page if it succeeds in
inserting. And alloc_huge_page() sets the count to 1, not 0, so the
put_page() should not cause a problem on the failure path.
Hrm.. wonder what's going on here.
> > - find_lock_huge_page() isn't a great name, since it does extra things
> > not analagous to find_lock_page(). Rename it
> > find_or_alloc_huge_page() which is closer to the mark.
>
> I'll agree with the above. I am not all that committed to the current
> layout and what you have here is a little closer to the thinking in my
> original patch ;)
>
> <snip>
>
> > +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> > + unsigned long address, int write_access)
> > +{
> > + pte_t *ptep;
> > + pte_t entry;
> > +
> > + ptep = huge_pte_alloc(mm, address);
> > + if (! ptep)
> > + /* OOM */
> > + return VM_FAULT_SIGBUS;
> > +
> > + entry = *ptep;
> > +
> > + if (pte_none(entry))
> > + return hugetlb_no_page(mm, vma, address, ptep);
> > +
> > + /* we could get here if another thread instantiated the pte
> > + * before the test above */
> > +
> > + return VM_FAULT_SIGBUS;
> > }
>
> I'll agree with Ken that the last return should probably still be
> VM_FAULT_MINOR.
>
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson
On Wed, Oct 26, 2005 at 01:12:30PM -0700, Chen, Kenneth W wrote:
> David Gibson wrote on Tuesday, October 25, 2005 7:49 PM
> > - find_lock_huge_page() didn't, in fact, lock the page if it newly
> > allocated one, rather than finding it in the page cache already. As
> > far as I can tell this is a bug, so the patch corrects it.
>
> add_to_page_cache will lock the page if it was successfully added to the
> address space radix tree. I don't see a bug that you are seeing.
Ah, so it does. Thought I might be missing something.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson
On Wed, Oct 26, 2005 at 05:16:39PM -0700, Chen, Kenneth W wrote:
> David Gibson wrote on Wednesday, October 26, 2005 5:05 PM
> > On Wed, Oct 26, 2005 at 11:44:52AM -0700, Chen, Kenneth W wrote:
> > > David Gibson wrote on Tuesday, October 25, 2005 7:49 PM
> > > > +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> > > > + unsigned long address, int write_access)
> > > > +{
> > > > + pte_t *ptep;
> > > > + pte_t entry;
> > > > +
> > > > + ptep = huge_pte_alloc(mm, address);
> > > > + if (! ptep)
> > > > + /* OOM */
> > > > + return VM_FAULT_SIGBUS;
> > > > +
> > > > + entry = *ptep;
> > > > +
> > > > + if (pte_none(entry))
> > > > + return hugetlb_no_page(mm, vma, address, ptep);
> > > > +
> > > > + /* we could get here if another thread instantiated the pte
> > > > + * before the test above */
> > > > +
> > > > + return VM_FAULT_SIGBUS;
> > > > }
> > >
> > > Are you sure about the last return? Looks like a typo to me, if *ptep
> > > is present, it should return VM_FAULT_MINOR.
> >
> > Oops, yes, thinko. Corrected patch shortly.
>
> While you at it, I think it would be preferable that the first return be
> VM_FAULT_OOM, your thoughts?
I wondered about that. Logically it is an OOM, but I was just a bit
worried about a hugepage event triggering off an OOM and killing
unrelated processes. I guess it is actually a shortage of normal
pages, not hugepages here, so it should be ok, or at least as ok as an
OOM can ever be.
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson
On Wed, Oct 26, 2005 at 05:16:39PM -0700, Chen, Kenneth W wrote:
> David Gibson wrote on Wednesday, October 26, 2005 5:05 PM
> > On Wed, Oct 26, 2005 at 11:44:52AM -0700, Chen, Kenneth W wrote:
> > > David Gibson wrote on Tuesday, October 25, 2005 7:49 PM
> > > > +int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> > > > + unsigned long address, int write_access)
> > > > +{
> > > > + pte_t *ptep;
> > > > + pte_t entry;
> > > > +
> > > > + ptep = huge_pte_alloc(mm, address);
> > > > + if (! ptep)
> > > > + /* OOM */
> > > > + return VM_FAULT_SIGBUS;
> > > > +
> > > > + entry = *ptep;
> > > > +
> > > > + if (pte_none(entry))
> > > > + return hugetlb_no_page(mm, vma, address, ptep);
> > > > +
> > > > + /* we could get here if another thread instantiated the pte
> > > > + * before the test above */
> > > > +
> > > > + return VM_FAULT_SIGBUS;
> > > > }
> > >
> > > Are you sure about the last return? Looks like a typo to me, if *ptep
> > > is present, it should return VM_FAULT_MINOR.
> >
> > Oops, yes, thinko. Corrected patch shortly.
>
> While you at it, I think it would be preferable that the first return be
> VM_FAULT_OOM, your thoughts?
Ok, here's the revised patch with this change, and the others
mentioned elsewhere.
RFC: Cleanup / small fixes to hugetlb fault handling
This patch makes some slight tweaks / cleanups to the fault handling
path for huge pages in -mm. My main motivation is to make it simpler
to fit COW in, but along the way it addresses a few minor problems
with the existing code:
- The check against i_size was duplicated: once in
find_lock_huge_page() and again in hugetlb_fault() after taking the
page_table_lock. We only really need the locked one, so remove the
other.
- find_lock_huge_page() isn't a great name, since it does extra things
not analogous to find_lock_page(). Rename it
find_or_alloc_huge_page() which is closer to the mark.
Signed-off-by: David Gibson <[email protected]>
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c 2005-10-27 16:34:20.000000000 +1000
+++ working-2.6/mm/hugetlb.c 2005-10-27 16:34:55.000000000 +1000
@@ -336,30 +336,24 @@
flush_tlb_range(vma, start, end);
}
-static struct page *find_lock_huge_page(struct address_space *mapping,
- unsigned long idx)
+static struct page *find_or_alloc_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
struct page *page;
int err;
- struct inode *inode = mapping->host;
- unsigned long size;
retry:
page = find_lock_page(mapping, idx);
if (page)
- goto out;
-
- /* Check to make sure the mapping hasn't been truncated */
- size = i_size_read(inode) >> HPAGE_SHIFT;
- if (idx >= size)
- goto out;
+ return page;
if (hugetlb_get_quota(mapping))
- goto out;
+ return NULL;
+
page = alloc_huge_page();
if (!page) {
hugetlb_put_quota(mapping);
- goto out;
+ return NULL;
}
err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
@@ -370,50 +364,49 @@
goto retry;
page = NULL;
}
-out:
+
return page;
}
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
{
- int ret = VM_FAULT_SIGBUS;
+ int ret;
unsigned long idx;
unsigned long size;
- pte_t *pte;
struct page *page;
struct address_space *mapping;
- pte = huge_pte_alloc(mm, address);
- if (!pte)
- goto out;
-
mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- /*
- * Use page lock to guard against racing truncation
- * before we get page_table_lock.
- */
- page = find_lock_huge_page(mapping, idx);
+ /* This returns a locked page, which keeps us safe in the
+ * event of a race with truncate() */
+ page = find_or_alloc_huge_page(mapping, idx);
if (!page)
- goto out;
+ return VM_FAULT_SIGBUS;
spin_lock(&mm->page_table_lock);
+
+ ret = VM_FAULT_SIGBUS;
+
size = i_size_read(mapping->host) >> HPAGE_SHIFT;
if (idx >= size)
goto backout;
ret = VM_FAULT_MINOR;
- if (!pte_none(*pte))
+
+ if (!pte_none(*ptep))
+ /* oops, someone instantiated this PTE before us */
goto backout;
add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+ set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+
spin_unlock(&mm->page_table_lock);
unlock_page(page);
-out:
+
return ret;
backout:
@@ -421,7 +414,29 @@
hugetlb_put_quota(mapping);
unlock_page(page);
put_page(page);
- goto out;
+
+ return ret;
+}
+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ pte_t entry;
+
+ ptep = huge_pte_alloc(mm, address);
+ if (! ptep)
+ return VM_FAULT_OOM;
+
+ entry = *ptep;
+
+ if (pte_none(entry))
+ return hugetlb_no_page(mm, vma, address, ptep);
+
+ /* we could get here if another thread instantiated the pte
+ * before the test above */
+
+ return VM_FAULT_MINOR;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson