Ok, here's the next iteration of these patches. I think I've handled
the truncate() case by comparing the hugetlbfs inode's i_size with the
mapping offset of the requested page to make sure it hasn't been
truncated. Can anyone confirm or deny that I have the locking correct
for this? The other patches are still unchanged. Andrew: Did Andi
Kleen's explanation of huge_pages_needed() satisfy?
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
Version 2 (Mon, 3 Oct 2005)
* Removed follow_hugetlb_page since get_user_pages() was the only caller.
Initial Post (Thu, 18 Aug 2005)
In preparation for hugetlb demand faulting, remove this get_user_pages()
optimization. Since huge pages will no longer be prefaulted, we can't assume
that the huge ptes are established, and hence calling follow_hugetlb_page() is
no longer valid.
With the follow_hugetlb_page() call removed, the normal code path is taken
instead. follow_page() will use either follow_huge_addr() or
follow_huge_pmd() to check for a previously faulted "page" to return. When
this fails (i.e. with demand faults), __handle_mm_fault() gets called, which
invokes the hugetlb_fault() handler to instantiate the huge page.
This patch doesn't make a lot of sense by itself, but I've broken it out to
facilitate discussion on this specific element of the demand fault changes.
While coding this up, I referenced previous discussion on this topic starting
at http://lkml.org/lkml/2004/4/13/176, which contains more opinions about the
correctness of this approach.
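For readers following along, here is a rough sketch of the generic path that
takes over once the shortcut is gone. This is illustrative pseudocode, not the
actual mm/memory.c loop (the real change is in the diff below); the function
names (follow_page, __handle_mm_fault, get_page) are the mainline ones
discussed above.

/*
 * Illustrative pseudocode: the get_user_pages() inner loop after the
 * follow_hugetlb_page() shortcut is removed.  For a hugetlb VMA,
 * follow_page() ends up in follow_huge_addr()/follow_huge_pmd(); if no
 * page has been faulted in yet, __handle_mm_fault() now reaches
 * hugetlb_fault() instead of returning VM_FAULT_SIGBUS.
 */
while (len) {
	struct page *page = follow_page(mm, start, write);

	if (!page) {
		switch (__handle_mm_fault(mm, vma, start, write)) {
		case VM_FAULT_SIGBUS:
			return i ? i : -EFAULT;
		case VM_FAULT_OOM:
			return i ? i : -ENOMEM;
		default:
			break;
		}
		continue;	/* retry the lookup now that the pte exists */
	}
	if (pages) {
		get_page(page);
		pages[i] = page;
	}
	if (vmas)
		vmas[i] = vma;
	i++;
	start += PAGE_SIZE;
	len--;
}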
Signed-off-by: Adam Litke <[email protected]>
---
hugetlb.c | 54 ------------------------------------------------------
memory.c | 5 -----
2 files changed, 59 deletions(-)
diff -upN reference/mm/hugetlb.c current/mm/hugetlb.c
--- reference/mm/hugetlb.c
+++ current/mm/hugetlb.c
@@ -392,57 +392,3 @@ out:
spin_unlock(&mm->page_table_lock);
return ret;
}
-
-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
-{
- unsigned long vpfn, vaddr = *position;
- int remainder = *length;
-
- BUG_ON(!is_vm_hugetlb_page(vma));
-
- vpfn = vaddr/PAGE_SIZE;
- spin_lock(&mm->page_table_lock);
- while (vaddr < vma->vm_end && remainder) {
-
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- /* Some archs (sparc64, sh*) have multiple
- * pte_ts to each hugepage. We have to make
- * sure we get the first, for the page
- * indexing below to work. */
- pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
- /* the hugetlb file might have been truncated */
- if (!pte || pte_none(*pte)) {
- remainder = 0;
- if (!i)
- i = -EFAULT;
- break;
- }
-
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
- WARN_ON(!PageCompound(page));
-
- get_page(page);
- pages[i] = page;
- }
-
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- ++vpfn;
- --remainder;
- ++i;
- }
- spin_unlock(&mm->page_table_lock);
- *length = remainder;
- *position = vaddr;
-
- return i;
-}
diff -upN reference/mm/memory.c current/mm/memory.c
--- reference/mm/memory.c
+++ current/mm/memory.c
@@ -971,11 +971,6 @@ int get_user_pages(struct task_struct *t
|| !(flags & vma->vm_flags))
return i ? : -EFAULT;
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &len, i);
- continue;
- }
spin_lock(&mm->page_table_lock);
do {
int write_access = write;
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
Version 5 (Tue, 11 Oct 2005)
Deal with hugetlbfs file truncation in find_get_huge_page()
Version 4 (Mon, 03 Oct 2005)
Make find_get_huge_page bail out properly when add_to_page_cache fails
due to OOM conditions
Version 3 (Thu, 08 Sep 2005)
Organized logic in hugetlb_pte_fault() by breaking out
find_get_page/alloc_huge_page logic into separate function
Removed a few more paranoid checks ( Thanks )
Fixed tlb flushing in a race case ( Yanmin Zhang )
Version 2 (Wed, 17 Aug 2005)
Removed spurious WARN_ON()
Patches added earlier in the series (now in mainline):
Check for p?d_none() in arch/i386/mm/hugetlbpage.c:huge_pte_offset()
Move i386 stale pte check into huge_pte_alloc()
Initial Post (Fri, 05 Aug 2005)
Below is a patch to implement demand faulting for huge pages. The main
motivation for changing from prefaulting to demand faulting is so that
huge page memory areas can be allocated according to NUMA policy.
Thanks to the consolidated hugetlb code, switching the behavior requires changing
only one fault handler. The bulk of the patch just moves the logic from
hugetlb_prefault() to hugetlb_pte_fault() and find_get_huge_page().
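To help navigate the diff, the resulting call structure looks roughly like
this (an illustrative sketch only; the names are the ones used in the diff
below):

__handle_mm_fault()			/* mm/memory.c */
    hugetlb_fault()			/* takes page_table_lock, huge_pte_alloc() */
	hugetlb_pte_fault()		/* only when pte_none(*pte) */
	    find_get_huge_page()	/* page cache lookup, else alloc_huge_page()
					   + add_to_page_cache() */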
Signed-off-by: Adam Litke <[email protected]>
---
fs/hugetlbfs/inode.c | 6 --
include/linux/hugetlb.h | 2
mm/hugetlb.c | 139 ++++++++++++++++++++++++++++++++----------------
mm/memory.c | 2
4 files changed, 98 insertions(+), 51 deletions(-)
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c
+++ current/fs/hugetlbfs/inode.c
@@ -48,7 +48,6 @@ int sysctl_hugetlb_shm_group;
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
loff_t len, vma_len;
int ret;
@@ -79,10 +78,7 @@ static int hugetlbfs_file_mmap(struct fi
if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
goto out;
- ret = hugetlb_prefault(mapping, vma);
- if (ret)
- goto out;
-
+ ret = 0;
if (inode->i_size < len)
inode->i_size = len;
out:
diff -upN reference/include/linux/hugetlb.h current/include/linux/hugetlb.h
--- reference/include/linux/hugetlb.h
+++ current/include/linux/hugetlb.h
@@ -25,6 +25,8 @@ int is_hugepage_mem_enough(size_t);
unsigned long hugetlb_total_pages(void);
struct page *alloc_huge_page(void);
void free_huge_page(struct page *);
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+ unsigned long address, int write_access);
extern unsigned long max_huge_pages;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
diff -upN reference/mm/hugetlb.c current/mm/hugetlb.c
--- reference/mm/hugetlb.c
+++ current/mm/hugetlb.c
@@ -312,9 +312,8 @@ void unmap_hugepage_range(struct vm_area
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (! ptep)
- /* This can happen on truncate, or if an
- * mmap() is aborted due to an error before
- * the prefault */
+ /* This can happen on truncate, or for pages
+ * not yet faulted in */
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -338,57 +337,107 @@ void zap_hugepage_range(struct vm_area_s
spin_unlock(&mm->page_table_lock);
}
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_get_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
+ struct page *page = NULL;
+ int rc;
+ struct inode *inode = mapping->host;
+ unsigned long size;
+
+retry:
+ page = find_get_page(mapping, idx);
+ if (page)
+ goto out;
+
+ /* Check to make sure the mapping hasn't been truncated */
+ size = i_size_read(inode) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto out;
+
+ if (hugetlb_get_quota(mapping))
+ goto out;
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ goto out;
+ }
+
+ /*
+ * It would be better to use GFP_KERNEL here but then we'd need to
+ * drop the page_table_lock and handle several race conditions.
+ */
+ rc = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ if (rc) {
+ put_page(page);
+ page = NULL;
+ hugetlb_put_quota(mapping);
+ if (rc == -ENOMEM)
+ goto out;
+ else
+ goto retry;
+ }
+ unlock_page(page);
+out:
+ return page;
+}
+
+static int hugetlb_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ int ret = VM_FAULT_MINOR;
+ unsigned long idx;
+ pte_t *pte;
+ struct page *page;
+ struct address_space *mapping;
- WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(vma->vm_start & ~HPAGE_MASK);
BUG_ON(vma->vm_end & ~HPAGE_MASK);
+ BUG_ON(!vma->vm_file);
- hugetlb_prefault_arch_hook(mm);
+ pte = huge_pte_offset(mm, address);
+ if (!pte) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if (!pte_none(*pte))
+ goto out;
- spin_lock(&mm->page_table_lock);
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
- unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
+ mapping = vma->vm_file->f_mapping;
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+ page = find_get_huge_page(mapping, idx);
+ if (!page) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
+ add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+out:
+ return ret;
+}
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- page = find_get_page(mapping, idx);
- if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
- goto out;
- }
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = -ENOMEM;
- goto out;
- }
- ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
- if (! ret) {
- unlock_page(page);
- } else {
- hugetlb_put_quota(mapping);
- free_huge_page(page);
- goto out;
- }
- }
- add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ int rc = VM_FAULT_MINOR;
+
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_alloc(mm, address);
+ if (!ptep) {
+ rc = VM_FAULT_SIGBUS;
+ goto out;
}
+ if (pte_none(*ptep))
+ rc = hugetlb_pte_fault(mm, vma, address, write_access);
+
+ if (rc == VM_FAULT_MINOR)
+ flush_tlb_page(vma, address);
out:
spin_unlock(&mm->page_table_lock);
- return ret;
+ return rc;
}
diff -upN reference/mm/memory.c current/mm/memory.c
--- reference/mm/memory.c
+++ current/mm/memory.c
@@ -2040,7 +2040,7 @@ int __handle_mm_fault(struct mm_struct *
inc_page_state(pgfault);
if (is_vm_hugetlb_page(vma))
- return VM_FAULT_SIGBUS; /* mapping truncation does this. */
+ return hugetlb_fault(mm, vma, address, write_access);
/*
* We need the page table lock to synchronize with kswapd
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
Adam Litke <[email protected]> wrote:
>
> Andrew: Did Andi
> Kleen's explanation of huge_pages_needed() satisfy?
Spose so. I trust that it's adequately commented in this version..
Initial Post (Thu, 18 Aug 2005)
Basic overcommit checking for hugetlbfs_file_mmap() based on an implementation
used with demand faulting in SLES9.
Since we're not prefaulting the pages at mmap time, some extra accounting is
needed. This patch implements a basic sanity check to ensure that enough huge
pages are currently available to satisfy the mmap. Of course this method
doesn't guarantee that the pages will be available at fault time, but I think
it is a good start on doing proper accounting and solves 90% of the
overcommit problems I see in practice.
Huge page shared memory segments are simpler and still maintain their
commit-on-shmget semantics.
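As a concrete example (assuming 2MB huge pages): a 10MB mapping over a file
that already has two of its huge pages in the page cache needs
(10MB >> HPAGE_SHIFT) - 2 = 3 more huge pages, so huge_pages_needed() returns
3 << HPAGE_SHIFT = 6MB, which hugetlbfs_file_mmap() then hands to
is_hugepage_mem_enough().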
Signed-off-by: Adam Litke <[email protected]>
---
inode.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
1 files changed, 55 insertions(+), 10 deletions(-)
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c
+++ current/fs/hugetlbfs/inode.c
@@ -45,9 +45,59 @@ static struct backing_dev_info hugetlbfs
int sysctl_hugetlb_shm_group;
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+ int i;
+
+ for (i = 0; i < pagevec_count(pvec); ++i)
+ put_page(pvec->pages[i]);
+
+ pagevec_reinit(pvec);
+}
+
+unsigned long
+huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ int i;
+ struct pagevec pvec;
+ unsigned long start = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
+ pgoff_t next = vma->vm_pgoff;
+ pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+
+ /*
+	 * Shared memory segments are accounted for at shmget time,
+ * not at shmat (when the mapping is actually created) so
+ * check here if the memory has already been accounted for.
+ */
+ if (inode->i_blocks != 0)
+ return 0;
+
+ pagevec_init(&pvec, 0);
+ while (next < endpg) {
+ if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+ break;
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ if (page->index > next)
+ next = page->index;
+ if (page->index >= endpg)
+ break;
+ next++;
+ hugepages--;
+ }
+ huge_pagevec_release(&pvec);
+ }
+ return hugepages << HPAGE_SHIFT;
+}
+
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long bytes;
loff_t len, vma_len;
int ret;
@@ -66,6 +116,10 @@ static int hugetlbfs_file_mmap(struct fi
if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
return -EINVAL;
+ bytes = huge_pages_needed(mapping, vma);
+ if (!is_hugepage_mem_enough(bytes))
+ return -ENOMEM;
+
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
down(&inode->i_sem);
@@ -167,16 +221,6 @@ static int hugetlbfs_commit_write(struct
return -EINVAL;
}
-static void huge_pagevec_release(struct pagevec *pvec)
-{
- int i;
-
- for (i = 0; i < pagevec_count(pvec); ++i)
- put_page(pvec->pages[i]);
-
- pagevec_reinit(pvec);
-}
-
static void truncate_huge_page(struct page *page)
{
clear_page_dirty(page);
@@ -792,6 +836,7 @@ struct file *hugetlb_zero_setup(size_t s
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0;
+ inode->i_blocks = 1;
file->f_vfsmnt = mntget(hugetlbfs_vfsmount);
file->f_dentry = dentry;
file->f_mapping = inode->i_mapping;
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
On Tue, 2005-10-11 at 11:32 -0700, Andrew Morton wrote:
> Adam Litke <[email protected]> wrote:
> >
> > Andrew: Did Andi
> > Kleen's explanation of huge_pages_needed() satisfy?
>
> Spose so. I trust that it's adequately commented in this version..
Just to be sure, added a comment block at the top of
huge_pages_needed().
Initial Post (Thu, 18 Aug 2005)
Basic overcommit checking for hugetlbfs_file_mmap() based on an implementation
used with demand faulting in SLES9.
Since we're not prefaulting the pages at mmap time, some extra accounting is
needed. This patch implements a basic sanity check to ensure that enough huge
pages are currently available to satisfy the mmap. Of course this method
doesn't guarantee that the pages will be available at fault time, but I think
it is a good start on doing proper accounting and solves 90% of the
overcommit problems I see in practice.
Huge page shared memory segments are simpler and still maintain their
commit-on-shmget semantics.
Signed-off-by: Adam Litke <[email protected]>
---
inode.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
1 files changed, 63 insertions(+), 10 deletions(-)
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c
+++ current/fs/hugetlbfs/inode.c
@@ -45,9 +45,67 @@ static struct backing_dev_info hugetlbfs
int sysctl_hugetlb_shm_group;
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+ int i;
+
+ for (i = 0; i < pagevec_count(pvec); ++i)
+ put_page(pvec->pages[i]);
+
+ pagevec_reinit(pvec);
+}
+
+/*
+ * huge_pages_needed tries to determine the number of new huge pages that
+ * will be required to fully populate this VMA. This will be equal to
+ * the size of the VMA in huge pages minus the number of huge pages
+ * (covered by this VMA) that are found in the page cache.
+ *
+ * Result is in bytes to be compatible with is_hugepage_mem_enough()
+ */
+unsigned long
+huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ int i;
+ struct pagevec pvec;
+ unsigned long start = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
+ pgoff_t next = vma->vm_pgoff;
+ pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+
+ /*
+	 * Shared memory segments are accounted for at shmget time,
+ * not at shmat (when the mapping is actually created) so
+ * check here if the memory has already been accounted for.
+ */
+ if (inode->i_blocks != 0)
+ return 0;
+
+ pagevec_init(&pvec, 0);
+ while (next < endpg) {
+ if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+ break;
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ if (page->index > next)
+ next = page->index;
+ if (page->index >= endpg)
+ break;
+ next++;
+ hugepages--;
+ }
+ huge_pagevec_release(&pvec);
+ }
+ return hugepages << HPAGE_SHIFT;
+}
+
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long bytes;
loff_t len, vma_len;
int ret;
@@ -66,6 +124,10 @@ static int hugetlbfs_file_mmap(struct fi
if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
return -EINVAL;
+ bytes = huge_pages_needed(mapping, vma);
+ if (!is_hugepage_mem_enough(bytes))
+ return -ENOMEM;
+
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
down(&inode->i_sem);
@@ -167,16 +229,6 @@ static int hugetlbfs_commit_write(struct
return -EINVAL;
}
-static void huge_pagevec_release(struct pagevec *pvec)
-{
- int i;
-
- for (i = 0; i < pagevec_count(pvec); ++i)
- put_page(pvec->pages[i]);
-
- pagevec_reinit(pvec);
-}
-
static void truncate_huge_page(struct page *page)
{
clear_page_dirty(page);
@@ -792,6 +844,7 @@ struct file *hugetlb_zero_setup(size_t s
d_instantiate(dentry, inode);
inode->i_size = size;
inode->i_nlink = 0;
+ inode->i_blocks = 1;
file->f_vfsmnt = mntget(hugetlbfs_vfsmount);
file->f_dentry = dentry;
file->f_mapping = inode->i_mapping;
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
On Tue, Oct 11, 2005 at 01:32:38PM -0500, Adam Litke wrote:
> Version 5 (Tue, 11 Oct 2005)
> Deal with hugetlbfs file truncation in find_get_huge_page()
> Version 4 (Mon, 03 Oct 2005)
> Make find_get_huge_page bale properly when add_to_page_cache fails
> due to OOM conditions
> Version 3 (Thu, 08 Sep 2005)
> Organized logic in hugetlb_pte_fault() by breaking out
> find_get_page/alloc_huge_page logic into separate function
> Removed a few more paranoid checks ( Thanks )
> Fixed tlb flushing in a race case ( Yanmin Zhang )
>
> Version 2 (Wed, 17 Aug 2005)
> Removed spurious WARN_ON()
> Patches added earlier in the series (now in mainline):
> Check for p?d_none() in arch/i386/mm/hugetlbpage.c:huge_pte_offset()
> Move i386 stale pte check into huge_pte_alloc()
I'm not sure this does fully deal with truncation, I'm afraid - it
will deal with a truncation well before the fault, but not a
concurrent truncate(). We'll need the truncate_count/retry logic from
do_no_page, I think. Andi/Hugh, can you confirm that's correct?
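(For reference, the idiom being referred to looks roughly like this: a
simplified sketch of the do_no_page() truncate_count/retry pattern, not the
exact mainline code. Here look_up_or_allocate_page() is just a stand-in for
the ->nopage-style lookup; the patch below applies the same pattern to the
hugetlb path.)

/*
 * Sketch of the truncate_count/retry idiom: sample truncate_count before
 * doing the page cache lookup outside page_table_lock.  If a truncate ran
 * in the meantime, the count will have changed, so release the page and
 * retry instead of installing a pte for a page that is no longer part of
 * the file.
 */
sequence = mapping->truncate_count;
smp_rmb();			/* serializes against the i_size check */
retry:
new_page = look_up_or_allocate_page(vma, address);
if (!new_page)
	return VM_FAULT_SIGBUS;	/* truncated, or out of huge pages */
spin_lock(&mm->page_table_lock);
if (unlikely(sequence != mapping->truncate_count)) {
	/* a truncate raced with us: back out and try again */
	spin_unlock(&mm->page_table_lock);
	page_cache_release(new_page);
	cond_resched();
	sequence = mapping->truncate_count;
	smp_rmb();
	goto retry;
}
/* still consistent: safe to install the pte under page_table_lock */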
> Initial Post (Fri, 05 Aug 2005)
>
> Below is a patch to implement demand faulting for huge pages. The main
> motivation for changing from prefaulting to demand faulting is so that
> huge page memory areas can be allocated according to NUMA policy.
>
> Thanks to consolidated hugetlb code, switching the behavior requires changing
> only one fault handler. The bulk of the patch just moves the logic from
> hugelb_prefault() to hugetlb_pte_fault() and find_get_huge_page().
While we're at it - it's a minor nit, but I find the distinction
between hugetlb_pte_fault() and hugetlb_fault() confusing. A better
name for the former would be hugetlb_no_page(), in which case we
should probably also move the border between it and
find_get_huge_page() to match the boundary between do_no_page() and
mapping->nopage.
How about this, for example:
Signed-off-by: David Gibson <[email protected]>
Index: working-2.6/fs/hugetlbfs/inode.c
===================================================================
--- working-2.6.orig/fs/hugetlbfs/inode.c 2005-10-12 14:28:08.000000000 +1000
+++ working-2.6/fs/hugetlbfs/inode.c 2005-10-12 14:40:54.000000000 +1000
@@ -48,7 +48,6 @@
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
loff_t len, vma_len;
int ret;
@@ -79,10 +78,7 @@
if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
goto out;
- ret = hugetlb_prefault(mapping, vma);
- if (ret)
- goto out;
-
+ ret = 0;
if (inode->i_size < len)
inode->i_size = len;
out:
Index: working-2.6/include/linux/hugetlb.h
===================================================================
--- working-2.6.orig/include/linux/hugetlb.h 2005-10-12 10:25:24.000000000 +1000
+++ working-2.6/include/linux/hugetlb.h 2005-10-12 14:28:18.000000000 +1000
@@ -25,6 +25,8 @@
unsigned long hugetlb_total_pages(void);
struct page *alloc_huge_page(void);
void free_huge_page(struct page *);
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+ unsigned long address, int write_access);
extern unsigned long max_huge_pages;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c 2005-10-12 14:28:16.000000000 +1000
+++ working-2.6/mm/hugetlb.c 2005-10-12 16:00:12.000000000 +1000
@@ -312,9 +312,8 @@
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (! ptep)
- /* This can happen on truncate, or if an
- * mmap() is aborted due to an error before
- * the prefault */
+ /* This can happen on truncate, or for pages
+ * not yet faulted in */
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -338,57 +337,128 @@
spin_unlock(&mm->page_table_lock);
}
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *hugetlbfs_nopage(struct vm_area_struct *vma,
+ unsigned long address)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
+ int err;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned long pgoff, size;
+ struct page *page = NULL;
+
+ pgoff = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+ /* Check to make sure the mapping hasn't been truncated */
+ size = i_size_read(inode) >> HPAGE_SHIFT;
+ if (pgoff >= size)
+ return NULL;
+
+ retry:
+ page = find_get_page(mapping, pgoff);
+ if (page)
+ /* Another thread won the race */
+ return page;
+
+ if (hugetlb_get_quota(mapping) != 0)
+ goto out;
+
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ goto out;
+ }
+
+ /*
+ * It would be better to use GFP_KERNEL here but then we'd need to
+ * drop the page_table_lock and handle several race conditions.
+ */
+ err = add_to_page_cache(page, mapping, pgoff, GFP_ATOMIC);
+ if (err) {
+ put_page(page);
+ page = NULL;
+ hugetlb_put_quota(mapping);
+ if (err == -ENOMEM)
+ goto out;
+ else
+ goto retry;
+ }
+ unlock_page(page);
+out:
+ return page;
+
+}
+
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *pte,
+ int write_access)
+{
+ struct page *new_page;
+ struct address_space *mapping;
+ unsigned int sequence;
- WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(vma->vm_start & ~HPAGE_MASK);
BUG_ON(vma->vm_end & ~HPAGE_MASK);
+ BUG_ON(!vma->vm_file);
+ BUG_ON(!pte);
+ BUG_ON(!pte_none(*pte));
+
+ spin_unlock(&mm->page_table_lock);
- hugetlb_prefault_arch_hook(mm);
+ mapping = vma->vm_file->f_mapping;
+ sequence = mapping->truncate_count;
+ smp_rmb(); /* serializes i_size against truncate_count */
+
+ retry:
+ new_page = hugetlbfs_nopage(vma, address & HPAGE_MASK);
spin_lock(&mm->page_table_lock);
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
- unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!new_page)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Someone could have truncated this page.
+ */
+ if (unlikely(sequence != mapping->truncate_count)) {
+ spin_unlock(&mm->page_table_lock);
+ page_cache_release(new_page);
+ cond_resched();
+ sequence = mapping->truncate_count;
+ smp_rmb();
+ goto retry;
+ }
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- page = find_get_page(mapping, idx);
- if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
- goto out;
- }
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = -ENOMEM;
- goto out;
- }
- ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
- if (! ret) {
- unlock_page(page);
- } else {
- hugetlb_put_quota(mapping);
- free_huge_page(page);
- goto out;
- }
- }
+ if (pte_none(*pte)) {
+ set_huge_pte_at(mm, address, pte,
+ make_huge_pte(vma, new_page));
add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+ } else {
+ /* One of our sibling threads was faster, back out. */
+ page_cache_release(new_page);
+ }
+ return VM_FAULT_MINOR;
+}
+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ int rc = VM_FAULT_MINOR;
+
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_alloc(mm, address);
+ if (!ptep) {
+ rc = VM_FAULT_SIGBUS;
+ goto out;
}
+ if (pte_none(*ptep))
+ rc = hugetlb_no_page(mm, vma, address, ptep, write_access);
+
+ if (rc == VM_FAULT_MINOR)
+ flush_tlb_page(vma, address);
out:
spin_unlock(&mm->page_table_lock);
- return ret;
+ return rc;
}
Index: working-2.6/mm/memory.c
===================================================================
--- working-2.6.orig/mm/memory.c 2005-10-12 14:28:16.000000000 +1000
+++ working-2.6/mm/memory.c 2005-10-12 14:28:18.000000000 +1000
@@ -2040,7 +2040,7 @@
inc_page_state(pgfault);
if (is_vm_hugetlb_page(vma))
- return VM_FAULT_SIGBUS; /* mapping truncation does this. */
+ return hugetlb_fault(mm, vma, address, write_access);
/*
* We need the page table lock to synchronize with kswapd
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson
On Wed, 12 Oct 2005, David Gibson wrote:
>
> I'm not sure this does fully deal with truncation, I'm afraid - it
> will deal with a truncation well before the fault, but not a
> concurrent truncate(). We'll need the truncate_count/retry logic from
> do_no_page, I think. Andi/Hugh, can you confirm that's correct?
Very likely you're right, but I already explained to Adam privately
that I won't get to look at all this before tomorrow - sorry.
Hugh
Thanks for the review and comments...
On Wed, 2005-10-12 at 16:09 +1000, David Gibson wrote:
> On Tue, Oct 11, 2005 at 01:32:38PM -0500, Adam Litke wrote:
> > Version 5 (Tue, 11 Oct 2005)
> > Deal with hugetlbfs file truncation in find_get_huge_page()
> > Version 4 (Mon, 03 Oct 2005)
> > Make find_get_huge_page bale properly when add_to_page_cache fails
> > due to OOM conditions
> > Version 3 (Thu, 08 Sep 2005)
> > Organized logic in hugetlb_pte_fault() by breaking out
> > find_get_page/alloc_huge_page logic into separate function
> > Removed a few more paranoid checks ( Thanks )
> > Fixed tlb flushing in a race case ( Yanmin Zhang )
> >
> > Version 2 (Wed, 17 Aug 2005)
> > Removed spurious WARN_ON()
> > Patches added earlier in the series (now in mainline):
> > Check for p?d_none() in arch/i386/mm/hugetlbpage.c:huge_pte_offset()
> > Move i386 stale pte check into huge_pte_alloc()
>
> I'm not sure this does fully deal with truncation, I'm afraid - it
> will deal with a truncation well before the fault, but not a
> concurrent truncate(). We'll need the truncate_count/retry logic from
> do_no_page, I think. Andi/Hugh, can you confirm that's correct?
Ok. I can see why we need that.
> > Initial Post (Fri, 05 Aug 2005)
> >
> > Below is a patch to implement demand faulting for huge pages. The main
> > motivation for changing from prefaulting to demand faulting is so that
> > huge page memory areas can be allocated according to NUMA policy.
> >
> > Thanks to consolidated hugetlb code, switching the behavior requires changing
> > only one fault handler. The bulk of the patch just moves the logic from
> > hugelb_prefault() to hugetlb_pte_fault() and find_get_huge_page().
>
> While we're at it - it's a minor nit, but I find the distinction
> between hugetlb_pte_fault() and hugetlb_fault() confusing. A better
> name for the former would be hugetlb_no_page(), in which case we
> should probably also move the border between it and
> hugetlb_find_get_page() to match the boundary between do_no_page() and
> mapping->nopage.
>
> How about this, for example:
Yeah, I suppose that division makes more sense when comparing to the
normal fault handler code.
> @@ -338,57 +337,128 @@
> spin_unlock(&mm->page_table_lock);
> }
>
> -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
> +static struct page *hugetlbfs_nopage(struct vm_area_struct *vma,
<snip>
> + /* Check to make sure the mapping hasn't been truncated */
> + size = i_size_read(inode) >> HPAGE_SHIFT;
> + if (pgoff >= size)
> + return NULL;
> +
> + retry:
> + page = find_get_page(mapping, pgoff);
> + if (page)
> + /* Another thread won the race */
> + return page;
Both of those returns could be changed to goto out so that the function
has only one exit path. Isn't that what we want?
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
On Thu, Oct 13, 2005 at 10:49:28AM -0500, Adam Litke wrote:
> Thanks for the review and comments...
>
> On Wed, 2005-10-12 at 16:09 +1000, David Gibson wrote:
> > On Tue, Oct 11, 2005 at 01:32:38PM -0500, Adam Litke wrote:
> > > Version 5 (Tue, 11 Oct 2005)
> > > Deal with hugetlbfs file truncation in find_get_huge_page()
> > > Version 4 (Mon, 03 Oct 2005)
> > > Make find_get_huge_page bale properly when add_to_page_cache fails
> > > due to OOM conditions
> > > Version 3 (Thu, 08 Sep 2005)
> > > Organized logic in hugetlb_pte_fault() by breaking out
> > > find_get_page/alloc_huge_page logic into separate function
> > > Removed a few more paranoid checks ( Thanks )
> > > Fixed tlb flushing in a race case ( Yanmin Zhang )
> > >
> > > Version 2 (Wed, 17 Aug 2005)
> > > Removed spurious WARN_ON()
> > > Patches added earlier in the series (now in mainline):
> > > Check for p?d_none() in arch/i386/mm/hugetlbpage.c:huge_pte_offset()
> > > Move i386 stale pte check into huge_pte_alloc()
> >
> > I'm not sure this does fully deal with truncation, I'm afraid - it
> > will deal with a truncation well before the fault, but not a
> > concurrent truncate(). We'll need the truncate_count/retry logic from
> > do_no_page, I think. Andi/Hugh, can you confirm that's correct?
>
> Ok. I can see why we need that.
>
> > > Initial Post (Fri, 05 Aug 2005)
> > >
> > > Below is a patch to implement demand faulting for huge pages. The main
> > > motivation for changing from prefaulting to demand faulting is so that
> > > huge page memory areas can be allocated according to NUMA policy.
> > >
> > > Thanks to consolidated hugetlb code, switching the behavior requires changing
> > > only one fault handler. The bulk of the patch just moves the logic from
> > > hugelb_prefault() to hugetlb_pte_fault() and find_get_huge_page().
> >
> > While we're at it - it's a minor nit, but I find the distinction
> > between hugetlb_pte_fault() and hugetlb_fault() confusing. A better
> > name for the former would be hugetlb_no_page(), in which case we
> > should probably also move the border between it and
> > hugetlb_find_get_page() to match the boundary between do_no_page() and
> > mapping->nopage.
> >
> > How about this, for example:
>
> Yeah, I suppose that division makes more sense when comparing to the
> normal fault handler code.
>
> > @@ -338,57 +337,128 @@
> > spin_unlock(&mm->page_table_lock);
> > }
> >
> > -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
> > +static struct page *hugetlbfs_nopage(struct vm_area_struct *vma,
>
> <snip>
>
> > + /* Check to make sure the mapping hasn't been truncated */
> > + size = i_size_read(inode) >> HPAGE_SHIFT;
> > + if (pgoff >= size)
> > + return NULL;
> > +
> > + retry:
> > + page = find_get_page(mapping, pgoff);
> > + if (page)
> > + /* Another thread won the race */
> > + return page;
>
> Both of those returns could be changed to goto out so that the function
> has only one exit path. Isn't that what we want?
Eh, I've never been of the "gotos are better than multiple exit
points" school of thought. However, I've not been consistent here,
sometimes using return and sometimes goto out. Here's a new version:
Subject: [PATCH 2/3] hugetlb: Demand fault handler
From: Adam Litke <[email protected]>
To: [email protected]
Cc: [email protected], [email protected],
David Gibson <[email protected]>, [email protected],
[email protected]
Organization: IBM
Date: Tue, 11 Oct 2005 13:32:38 -0500
Version 5 (Tue, 11 Oct 2005)
Deal with hugetlbfs file truncation in find_get_huge_page()
Version 4 (Mon, 03 Oct 2005)
Make find_get_huge_page bail out properly when add_to_page_cache fails
due to OOM conditions
Version 3 (Thu, 08 Sep 2005)
Organized logic in hugetlb_pte_fault() by breaking out
find_get_page/alloc_huge_page logic into separate function
Removed a few more paranoid checks ( Thanks )
Fixed tlb flushing in a race case ( Yanmin Zhang )
Version 2 (Wed, 17 Aug 2005)
Removed spurious WARN_ON()
Patches added earlier in the series (now in mainline):
Check for p?d_none() in arch/i386/mm/hugetlbpage.c:huge_pte_offset()
Move i386 stale pte check into huge_pte_alloc()
Initial Post (Fri, 05 Aug 2005)
Below is a patch to implement demand faulting for huge pages. The main
motivation for changing from prefaulting to demand faulting is so that
huge page memory areas can be allocated according to NUMA policy.
Thanks to the consolidated hugetlb code, switching the behavior requires changing
only one fault handler. The bulk of the patch just moves the logic from
hugetlb_prefault() to hugetlb_pte_fault() and find_get_huge_page().
Signed-off-by: Adam Litke <[email protected]>
Index: working-2.6/fs/hugetlbfs/inode.c
===================================================================
--- working-2.6.orig/fs/hugetlbfs/inode.c 2005-10-14 11:05:42.000000000 +1000
+++ working-2.6/fs/hugetlbfs/inode.c 2005-10-14 11:08:39.000000000 +1000
@@ -48,7 +48,6 @@
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
loff_t len, vma_len;
int ret;
@@ -79,10 +78,7 @@
if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
goto out;
- ret = hugetlb_prefault(mapping, vma);
- if (ret)
- goto out;
-
+ ret = 0;
if (inode->i_size < len)
inode->i_size = len;
out:
Index: working-2.6/include/linux/hugetlb.h
===================================================================
--- working-2.6.orig/include/linux/hugetlb.h 2005-10-12 17:10:16.000000000 +1000
+++ working-2.6/include/linux/hugetlb.h 2005-10-14 11:05:44.000000000 +1000
@@ -25,6 +25,8 @@
unsigned long hugetlb_total_pages(void);
struct page *alloc_huge_page(void);
void free_huge_page(struct page *);
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+ unsigned long address, int write_access);
extern unsigned long max_huge_pages;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c 2005-10-14 11:05:44.000000000 +1000
+++ working-2.6/mm/hugetlb.c 2005-10-14 11:09:20.000000000 +1000
@@ -312,9 +312,8 @@
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (! ptep)
- /* This can happen on truncate, or if an
- * mmap() is aborted due to an error before
- * the prefault */
+ /* This can happen on truncate, or for pages
+ * not yet faulted in */
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -338,57 +337,127 @@
spin_unlock(&mm->page_table_lock);
}
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *hugetlbfs_nopage(struct vm_area_struct *vma,
+ unsigned long address)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
+ int err;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned long pgoff, size;
+ struct page *page = NULL;
+
+ pgoff = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+ /* Check to make sure the mapping hasn't been truncated */
+ size = i_size_read(inode) >> HPAGE_SHIFT;
+ if (pgoff >= size)
+ return NULL;
+
+ retry:
+ page = find_get_page(mapping, pgoff);
+ if (page)
+ /* Another thread won the race */
+ return page;
+
+ if (hugetlb_get_quota(mapping) != 0)
+ return NULL;
+
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ return NULL;
+ }
+
+ /*
+ * It would be better to use GFP_KERNEL here but then we'd need to
+ * drop the page_table_lock and handle several race conditions.
+ */
+ err = add_to_page_cache(page, mapping, pgoff, GFP_ATOMIC);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -ENOMEM)
+ return NULL;
+ else
+ goto retry;
+ }
+ unlock_page(page);
+
+ return page;
+
+}
+
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *pte,
+ int write_access)
+{
+ struct page *new_page;
+ struct address_space *mapping;
+ unsigned int sequence;
- WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(vma->vm_start & ~HPAGE_MASK);
BUG_ON(vma->vm_end & ~HPAGE_MASK);
+ BUG_ON(!vma->vm_file);
+ BUG_ON(!pte);
+ BUG_ON(!pte_none(*pte));
+
+ spin_unlock(&mm->page_table_lock);
+
+ mapping = vma->vm_file->f_mapping;
+ sequence = mapping->truncate_count;
+ smp_rmb(); /* serializes i_size against truncate_count */
- hugetlb_prefault_arch_hook(mm);
+ retry:
+ new_page = hugetlbfs_nopage(vma, address & HPAGE_MASK);
spin_lock(&mm->page_table_lock);
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
- unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
-
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
-
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- page = find_get_page(mapping, idx);
- if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
- goto out;
- }
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = -ENOMEM;
- goto out;
- }
- ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
- if (! ret) {
- unlock_page(page);
- } else {
- hugetlb_put_quota(mapping);
- free_huge_page(page);
- goto out;
- }
- }
+
+ if (!new_page)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Someone could have truncated this page.
+ */
+ if (unlikely(sequence != mapping->truncate_count)) {
+ spin_unlock(&mm->page_table_lock);
+ page_cache_release(new_page);
+ cond_resched();
+ sequence = mapping->truncate_count;
+ smp_rmb();
+ goto retry;
+ }
+
+ if (pte_none(*pte)) {
+ set_huge_pte_at(mm, address, pte,
+ make_huge_pte(vma, new_page));
add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+ } else {
+ /* One of our sibling threads was faster, back out. */
+ page_cache_release(new_page);
+ }
+ return VM_FAULT_MINOR;
+}
+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ int rc = VM_FAULT_MINOR;
+
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_alloc(mm, address);
+ if (!ptep) {
+ rc = VM_FAULT_SIGBUS;
+ goto out;
}
+ if (pte_none(*ptep))
+ rc = hugetlb_no_page(mm, vma, address, ptep, write_access);
+
+ if (rc == VM_FAULT_MINOR)
+ flush_tlb_page(vma, address);
out:
spin_unlock(&mm->page_table_lock);
- return ret;
+ return rc;
}
Index: working-2.6/mm/memory.c
===================================================================
--- working-2.6.orig/mm/memory.c 2005-10-14 11:05:44.000000000 +1000
+++ working-2.6/mm/memory.c 2005-10-14 11:05:44.000000000 +1000
@@ -2040,7 +2040,7 @@
inc_page_state(pgfault);
if (is_vm_hugetlb_page(vma))
- return VM_FAULT_SIGBUS; /* mapping truncation does this. */
+ return hugetlb_fault(mm, vma, address, write_access);
/*
* We need the page table lock to synchronize with kswapd
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/people/dgibson