Ok. Hugh and I have rolled these around the last few days and what we
have now should integrate properly and solve the outstanding truncation
issues. We should be ready to re-insert these patches into -mm.
The old patch [1/3] remove get_user_pages_optimization is no longer
necessary. We now have [1/2] "the fault handler" and [2/2] "overcommit
accounting check"
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
Version 5 (Tue, 18 Oct 2005)
Large update from Hugh Dickins to merge with his page_table_lock
scalability patches
Use lock_page() to guard against truncation races
Fault handler hierarchy changed to more closely resemble do_no_page()
Version 4 (Mon, 03 Oct 2005)
Make find_get_huge_page bale properly when add_to_page_cache fails
due to OOM conditions
Version 3 (Thu, 08 Sep 2005)
Organized logic in hugetlb_pte_fault() by breaking out
find_get_page/alloc_huge_page logic into separate function
Removed a few more paranoid checks ( Thanks )
Fixed tlb flushing in a race case ( Yanmin Zhang )
Version 2 (Wed, 17 Aug 2005)
Removed spurious WARN_ON()
Patches added earlier in the series (now in mainline):
Check for p?d_none() in arch/i386/mm/hugetlbpage.c:huge_pte_offset()
Move i386 stale pte check into huge_pte_alloc()
Initial Post (Fri, 05 Aug 2005)
Below is a patch to implement demand faulting for huge pages. The main
motivation for changing from prefaulting to demand faulting is so that
huge page memory areas can be allocated according to NUMA policy.
Thanks to consolidated hugetlb code, switching the behavior requires changing
only one fault handler. The bulk of the patch just moves the logic from
hugelb_prefault() to hugetlb_pte_fault() and find_get_huge_page().
Signed-off-by: Adam Litke <[email protected]>
---
fs/hugetlbfs/inode.c | 7 -
include/linux/hugetlb.h | 2
mm/hugetlb.c | 180 ++++++++++++++++++++++++++++--------------------
mm/memory.c | 2
4 files changed, 111 insertions(+), 80 deletions(-)
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c
+++ current/fs/hugetlbfs/inode.c
@@ -48,7 +48,6 @@ int sysctl_hugetlb_shm_group;
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
loff_t len, vma_len;
int ret;
@@ -79,10 +78,8 @@ static int hugetlbfs_file_mmap(struct fi
if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
goto out;
- ret = hugetlb_prefault(mapping, vma);
- if (ret)
- goto out;
-
+ ret = 0;
+ hugetlb_prefault_arch_hook(vma->vm_mm);
if (inode->i_size < len)
inode->i_size = len;
out:
diff -upN reference/include/linux/hugetlb.h current/include/linux/hugetlb.h
--- reference/include/linux/hugetlb.h
+++ current/include/linux/hugetlb.h
@@ -24,6 +24,8 @@ int is_hugepage_mem_enough(size_t);
unsigned long hugetlb_total_pages(void);
struct page *alloc_huge_page(void);
void free_huge_page(struct page *);
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access);
extern unsigned long max_huge_pages;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
diff -upN reference/mm/hugetlb.c current/mm/hugetlb.c
--- reference/mm/hugetlb.c
+++ current/mm/hugetlb.c
@@ -320,10 +320,7 @@ void unmap_hugepage_range(struct vm_area
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
- if (! ptep)
- /* This can happen on truncate, or if an
- * mmap() is aborted due to an error before
- * the prefault */
+ if (!ptep)
continue;
pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -339,59 +336,92 @@ void unmap_hugepage_range(struct vm_area
flush_tlb_range(vma, start, end);
}
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
- struct mm_struct *mm = current->mm;
- unsigned long addr;
- int ret = 0;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
- BUG_ON(vma->vm_start & ~HPAGE_MASK);
- BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
- hugetlb_prefault_arch_hook(mm);
-
- for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
- unsigned long idx;
- pte_t *pte = huge_pte_alloc(mm, addr);
- struct page *page;
-
- if (!pte) {
- ret = -ENOMEM;
- goto out;
- }
+ struct page *page;
+ int err;
+ struct inode *inode = mapping->host;
+ unsigned long size;
+
+retry:
+ page = find_lock_page(mapping, idx);
+ if (page)
+ goto out;
+
+ /* Check to make sure the mapping hasn't been truncated */
+ size = i_size_read(inode) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto out;
+
+ if (hugetlb_get_quota(mapping))
+ goto out;
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ goto out;
+ }
- idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
- + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
- page = find_get_page(mapping, idx);
- if (!page) {
- /* charge the fs quota first */
- if (hugetlb_get_quota(mapping)) {
- ret = -ENOMEM;
- goto out;
- }
- page = alloc_huge_page();
- if (!page) {
- hugetlb_put_quota(mapping);
- ret = -ENOMEM;
- goto out;
- }
- ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
- if (! ret) {
- unlock_page(page);
- } else {
- hugetlb_put_quota(mapping);
- free_huge_page(page);
- goto out;
- }
- }
- spin_lock(&mm->page_table_lock);
- add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
- spin_unlock(&mm->page_table_lock);
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ page = NULL;
}
out:
+ return page;
+}
+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ int ret = VM_FAULT_SIGBUS;
+ unsigned long idx;
+ unsigned long size;
+ pte_t *pte;
+ struct page *page;
+ struct address_space *mapping;
+
+ pte = huge_pte_alloc(mm, address);
+ if (!pte)
+ goto out;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+ /*
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
+ */
+ page = find_lock_huge_page(mapping, idx);
+ if (!page)
+ goto out;
+
+ spin_lock(&mm->page_table_lock);
+ size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+ if (idx >= size)
+ goto backout;
+
+ ret = VM_FAULT_MINOR;
+ if (!pte_none(*pte))
+ goto backout;
+
+ add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+ spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+out:
return ret;
+
+backout:
+ spin_unlock(&mm->page_table_lock);
+ hugetlb_put_quota(mapping);
+ unlock_page(page);
+ put_page(page);
+ goto out;
}
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -401,34 +431,36 @@ int follow_hugetlb_page(struct mm_struct
unsigned long vpfn, vaddr = *position;
int remainder = *length;
- BUG_ON(!is_vm_hugetlb_page(vma));
-
vpfn = vaddr/PAGE_SIZE;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
+ pte_t *pte;
+ struct page *page;
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- /* Some archs (sparc64, sh*) have multiple
- * pte_ts to each hugepage. We have to make
- * sure we get the first, for the page
- * indexing below to work. */
- pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
- /* the hugetlb file might have been truncated */
- if (!pte || pte_none(*pte)) {
- remainder = 0;
- if (!i)
- i = -EFAULT;
- break;
- }
+ /*
+ * Some archs (sparc64, sh*) have multiple pte_ts to
+ * each hugepage. We have to make * sure we get the
+ * first, for the page indexing below to work.
+ */
+ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+
+ if (!pte || pte_none(*pte)) {
+ int ret;
+
+ spin_unlock(&mm->page_table_lock);
+ ret = hugetlb_fault(mm, vma, vaddr, 0);
+ spin_lock(&mm->page_table_lock);
+ if (ret == VM_FAULT_MINOR)
+ continue;
+
+ remainder = 0;
+ if (!i)
+ i = -EFAULT;
+ break;
+ }
+ if (pages) {
page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
- WARN_ON(!PageCompound(page));
-
get_page(page);
pages[i] = page;
}
diff -upN reference/mm/memory.c current/mm/memory.c
--- reference/mm/memory.c
+++ current/mm/memory.c
@@ -2027,7 +2027,7 @@ int __handle_mm_fault(struct mm_struct *
inc_page_state(pgfault);
if (is_vm_hugetlb_page(vma))
- return VM_FAULT_SIGBUS; /* mapping truncation does this. */
+ return hugetlb_fault(mm, vma, address, write_access);
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center
Version 2 (Tue, 18 Oct 2005)
Added comment to huge_pages_needed
Removed erroneous shared memory exemption logic
Initial Post (Thu, 18 Aug 2005)
Basic overcommit checking for hugetlb_file_map() based on an implementation
used with demand faulting in SLES9.
Since demand faulting can't guarantee the availability of pages at mmap time,
this patch implements a basic sanity check to ensure that the number of huge
pages required to satisfy the mmap are currently available. Despite the
obvious race, I think it is a good start on doing proper accounting. I'd like
to work towards an accounting system that mimics the semantics of normal pages
(especially for the MAP_PRIVATE/COW case). That work is underway and builds on
what this patch starts.
Huge page shared memory segments are simpler and still maintain their commit on
shmget semantics.
Signed-off-by: Adam Litke <[email protected]>
---
inode.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
1 files changed, 54 insertions(+), 10 deletions(-)
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c
+++ current/fs/hugetlbfs/inode.c
@@ -45,9 +45,59 @@ static struct backing_dev_info hugetlbfs
int sysctl_hugetlb_shm_group;
+static void huge_pagevec_release(struct pagevec *pvec)
+{
+ int i;
+
+ for (i = 0; i < pagevec_count(pvec); ++i)
+ put_page(pvec->pages[i]);
+
+ pagevec_reinit(pvec);
+}
+
+/*
+ * huge_pages_needed tries to determine the number of new huge pages that
+ * will be required to fully populate this VMA. This will be equal to
+ * the size of the VMA in huge pages minus the number of huge pages
+ * (covered by this VMA) that are found in the page cache.
+ *
+ * Result is in bytes to be compatible with is_hugepage_mem_enough()
+ */
+unsigned long
+huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ int i;
+ struct pagevec pvec;
+ unsigned long start = vma->vm_start;
+ unsigned long end = vma->vm_end;
+ unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
+ pgoff_t next = vma->vm_pgoff;
+ pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT);
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+
+ pagevec_init(&pvec, 0);
+ while (next < endpg) {
+ if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+ break;
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ if (page->index > next)
+ next = page->index;
+ if (page->index >= endpg)
+ break;
+ next++;
+ hugepages--;
+ }
+ huge_pagevec_release(&pvec);
+ }
+ return hugepages << HPAGE_SHIFT;
+}
+
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long bytes;
loff_t len, vma_len;
int ret;
@@ -66,6 +116,10 @@ static int hugetlbfs_file_mmap(struct fi
if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
return -EINVAL;
+ bytes = huge_pages_needed(mapping, vma);
+ if (!is_hugepage_mem_enough(bytes))
+ return -ENOMEM;
+
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
down(&inode->i_sem);
@@ -168,16 +222,6 @@ static int hugetlbfs_commit_write(struct
return -EINVAL;
}
-static void huge_pagevec_release(struct pagevec *pvec)
-{
- int i;
-
- for (i = 0; i < pagevec_count(pvec); ++i)
- put_page(pvec->pages[i]);
-
- pagevec_reinit(pvec);
-}
-
static void truncate_huge_page(struct page *page)
{
clear_page_dirty(page);
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center