2004-04-13 23:35:51

by Chen, Kenneth W

Subject: hugetlb demand paging patch part [2/3]

Part 2 of 3

2. hugetlb_demand_generic.patch - this handles the bulk of hugetlb demand
paging for the generic portion of the kernel. I've put the hugetlb fault
handler in mm/hugetlb.c since the fault handler is *exactly* the
same for all arches, but that requires opening up the huge_pte_alloc() and
set_huge_pte() functions in each arch. If people object to where it
lives, it takes me less than a minute to delete the common code and
replicate it in each of the five arches that support hugetlb. Just let
me know if that's the case.
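
For context, the two per-arch hooks the generic handler relies on are small;
roughly, the i386 versions look like the sketch below (reconstructed from
memory of the 2.6-era arch/i386/mm/hugetlbpage.c, for illustration only;
not part of this patch):

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pmd_t *pmd = pmd_alloc(mm, pgd, addr);

	/* on i386 a huge pte is simply the pmd slot itself */
	return (pte_t *) pmd;
}

void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
		  struct page *page, pte_t *page_table, int write_access)
{
	pte_t entry;

	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
	if (write_access)
		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	else
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	entry = pte_mkyoung(entry);
	mk_pte_huge(entry);
	set_pte(page_table, entry);
}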


fs/hugetlbfs/inode.c | 18 +++------------
include/linux/hugetlb.h | 9 ++++---
mm/hugetlb.c | 56 ++++++++++++++++++++++++++++++++++++++++++++----
mm/memory.c | 7 ------
4 files changed, 62 insertions(+), 28 deletions(-)

diff -Nurp linux-2.6.5/fs/hugetlbfs/inode.c linux-2.6.5.htlb/fs/hugetlbfs/inode.c
--- linux-2.6.5/fs/hugetlbfs/inode.c 2004-04-13 12:56:29.000000000 -0700
+++ linux-2.6.5.htlb/fs/hugetlbfs/inode.c 2004-04-13 12:42:35.000000000 -0700
@@ -218,11 +218,6 @@ static int hugetlb_acct_commit(struct in
{
return region_add(&inode->i_mapping->private_list, from, to);
}
-static void hugetlb_acct_undo(struct inode *inode, int chg)
-{
- hugetlb_put_quota(inode->i_mapping, chg);
- hugetlb_acct_memory(-chg);
-}
static void hugetlb_acct_release(struct inode *inode, int to)
{
int chg;
@@ -252,9 +247,8 @@ static struct backing_dev_info hugetlbfs
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
loff_t len, vma_len;
- int ret;
+ int ret = 0;
int chg;

if (vma->vm_start & ~HPAGE_MASK)
@@ -281,15 +275,11 @@ static int hugetlbfs_file_mmap(struct fi
file_accessed(file);
vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
vma->vm_ops = &hugetlb_vm_ops;
- ret = hugetlb_prefault(mapping, vma);
len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- if (ret == 0) {
- if (inode->i_size < len)
- inode->i_size = len;
- hugetlb_acct_commit(inode, VMACCTPG(vma->vm_pgoff),
+ if (inode->i_size < len)
+ inode->i_size = len;
+ hugetlb_acct_commit(inode, VMACCTPG(vma->vm_pgoff),
VMACCTPG(vma->vm_pgoff + (vma_len >> PAGE_SHIFT)));
- } else
- hugetlb_acct_undo(inode, chg);

unlock_out:
up(&inode->i_sem);
diff -Nurp linux-2.6.5/include/linux/hugetlb.h linux-2.6.5.htlb/include/linux/hugetlb.h
--- linux-2.6.5/include/linux/hugetlb.h 2004-04-13 12:56:29.000000000 -0700
+++ linux-2.6.5.htlb/include/linux/hugetlb.h 2004-04-13 12:02:31.000000000 -0700
@@ -14,10 +14,12 @@ static inline int is_vm_hugetlb_page(str

int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void *, size_t *);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
-int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
-int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
+pte_t *huge_pte_alloc(struct mm_struct *, unsigned long);
+void set_huge_pte(struct mm_struct *, struct vm_area_struct *, struct page *, pte_t *, int);
+int handle_hugetlb_mm_fault(struct mm_struct *, struct vm_area_struct *,
+ unsigned long, int);
void huge_page_release(struct page *);
int hugetlb_report_meminfo(char *);
int is_hugepage_mem_enough(size_t);
@@ -66,10 +68,9 @@ static inline unsigned long hugetlb_tota
return 0;
}

-#define follow_hugetlb_page(m,v,p,vs,a,b,i) ({ BUG(); 0; })
#define follow_huge_addr(mm, vma, addr, write) 0
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
-#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
+#define handle_hugetlb_mm_fault(mm, vma, addr, write) BUG()
#define zap_hugepage_range(vma, start, len) BUG()
#define unmap_hugepage_range(vma, start, end) BUG()
#define huge_page_release(page) BUG()
diff -Nurp linux-2.6.5/mm/hugetlb.c linux-2.6.5.htlb/mm/hugetlb.c
--- linux-2.6.5/mm/hugetlb.c 2004-04-13 12:56:13.000000000 -0700
+++ linux-2.6.5.htlb/mm/hugetlb.c 2004-04-13 12:18:29.000000000 -0700
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
#include <linux/sysctl.h>

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -217,11 +218,58 @@ unsigned long hugetlb_total_pages(void)
}
EXPORT_SYMBOL(hugetlb_total_pages);

+int handle_hugetlb_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+ unsigned long addr, int write_access)
+{
+ pte_t *pte;
+ struct page *page;
+ struct address_space *mapping;
+ int idx, ret = VM_FAULT_MINOR;
+
+ spin_lock(&mm->page_table_lock);
+ pte = huge_pte_alloc(mm, addr & HPAGE_MASK);
+ if (!pte) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+ if (!pte_none(*pte))
+ goto out;
+ spin_unlock(&mm->page_table_lock);
+
+ mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+retry:
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ page = alloc_huge_page();
+ if (!page)
+ goto retry;
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ if (!ret) {
+ unlock_page(page);
+ } else {
+ free_huge_page(page);
+ if (ret == -EEXIST)
+ goto retry;
+ else
+ return VM_FAULT_OOM;
+ }
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (pte_none(*pte))
+ set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+ else
+ page_cache_release(page);
+out:
+ spin_unlock(&mm->page_table_lock);
+ return VM_FAULT_MINOR;
+}
+
/*
- * We cannot handle pagefaults against hugetlb pages at all. They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
+ * We should not get here because handle_mm_fault() is supposed to trap
+ * hugetlb page fault. BUG if we get here.
*/
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
unsigned long address, int *unused)
diff -Nurp linux-2.6.5/mm/memory.c linux-2.6.5.htlb/mm/memory.c
--- linux-2.6.5/mm/memory.c 2004-04-13 12:56:13.000000000 -0700
+++ linux-2.6.5.htlb/mm/memory.c 2004-04-13 12:02:31.000000000 -0700
@@ -769,11 +769,6 @@ int get_user_pages(struct task_struct *t
if ((pages && vm_io) || !(flags & vma->vm_flags))
return i ? : -EFAULT;

- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &len, i);
- continue;
- }
spin_lock(&mm->page_table_lock);
do {
struct page *map = NULL;
@@ -1697,7 +1692,7 @@ int handle_mm_fault(struct mm_struct *mm
inc_page_state(pgfault);

if (is_vm_hugetlb_page(vma))
- return VM_FAULT_SIGBUS; /* mapping truncation does this. */
+ return handle_hugetlb_mm_fault(mm, vma, address, write_access);

/*
* We need the page table lock to synchronize with kswapd



2004-04-15 07:19:47

by David Gibson

Subject: Re: hugetlb demand paging patch part [2/3]

[snip]
> -int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int

[snip]

> diff -Nurp linux-2.6.5/mm/memory.c linux-2.6.5.htlb/mm/memory.c
> +++ linux-2.6.5.htlb/mm/memory.c 2004-04-13 12:02:31.000000000 -0700
> @@ -769,11 +769,6 @@ int get_user_pages(struct task_struct *t
> if ((pages && vm_io) || !(flags & vma->vm_flags))
> return i ? : -EFAULT;
>
> - if (is_vm_hugetlb_page(vma)) {
> - i = follow_hugetlb_page(mm, vma, pages, vmas,
> - &start, &len, i);
> - continue;
> - }
> spin_lock(&mm->page_table_lock);
> do {
> struct page *map = NULL;

Ok, I notice that you've removed the follow_hugetlb_page() function
(and from the arch-specific stuff as well). As far as I can tell,
this isn't actually related to demand paging; in fact, this function
seems unnecessary right now, because follow_page() should already
work for huge pages. In particular the path in get_user_pages()
which can call handle_mm_fault() (which won't work on hugepages
without your patch) should never get triggered, since hugepages are
all prefaulted.
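
For reference, the hugepage handling in question sits right at the top of
follow_page(); paraphrased from memory of the 2.6-era mm/memory.c, so treat
it as a sketch (the normal 4K lookup is elided):

static struct page *follow_page(struct mm_struct *mm, unsigned long address,
				int write)
{
	pgd_t *pgd;
	pmd_t *pmd;
	struct vm_area_struct *vma;

	/* arch hook: non-NULL only if 'address' lies in a hugepage vma/region */
	vma = hugepage_vma(mm, address);
	if (vma)
		return follow_huge_addr(mm, vma, address, write);

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return NULL;

	pmd = pmd_offset(pgd, address);
	if (pmd_none(*pmd))
		return NULL;
	if (pmd_huge(*pmd))
		return follow_huge_pmd(mm, address, pmd, write);

	/* ... normal pte lookup elided ... */
	return NULL;
}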

Does that sound right? In other words, do you think the patch below,
which just kills off follow_hugetlb_page(), is safe, or have I missed
something?

Index: working-2.6/mm/memory.c
===================================================================
--- working-2.6.orig/mm/memory.c 2004-04-13 11:42:42.000000000 +1000
+++ working-2.6/mm/memory.c 2004-04-15 17:03:01.421905400 +1000
@@ -766,16 +766,13 @@
|| !(flags & vma->vm_flags))
return i ? : -EFAULT;

- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &len, i);
- continue;
- }
spin_lock(&mm->page_table_lock);
do {
struct page *map;
int lookup_write = write;
while (!(map = follow_page(mm, start, lookup_write))) {
+ /* hugepages should always be prefaulted */
+ BUG_ON(is_vm_hugetlb_page(vma));
/*
* Shortcut for anonymous pages. We don't want
* to force the creation of pages tables for
Index: working-2.6/include/linux/hugetlb.h
===================================================================
--- working-2.6.orig/include/linux/hugetlb.h 2004-04-13 11:42:41.000000000 +1000
+++ working-2.6/include/linux/hugetlb.h 2004-04-15 17:03:22.847834536 +1000
@@ -12,7 +12,6 @@

int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void *, size_t *);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
-int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
@@ -64,7 +63,6 @@
return 0;
}

-#define follow_hugetlb_page(m,v,p,vs,a,b,i) ({ BUG(); 0; })
#define follow_huge_addr(mm, vma, addr, write) 0
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
Index: working-2.6/arch/ppc64/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/hugetlbpage.c 2004-04-13 11:42:35.000000000 +1000
+++ working-2.6/arch/ppc64/mm/hugetlbpage.c 2004-04-15 17:03:43.052825264 +1000
@@ -288,52 +288,6 @@
return 0;
}

-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
-{
- unsigned long vpfn, vaddr = *position;
- int remainder = *length;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
-
- vpfn = vaddr/PAGE_SIZE;
- while (vaddr < vma->vm_end && remainder) {
- BUG_ON(!in_hugepage_area(mm->context, vaddr));
-
- if (pages) {
- hugepte_t *pte;
- struct page *page;
-
- pte = hugepte_offset(mm, vaddr);
-
- /* hugetlb should be locked, and hence, prefaulted */
- WARN_ON(!pte || hugepte_none(*pte));
-
- page = &hugepte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
- WARN_ON(!PageCompound(page));
-
- get_page(page);
- pages[i] = page;
- }
-
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- ++vpfn;
- --remainder;
- ++i;
- }
-
- *length = remainder;
- *position = vaddr;
-
- return i;
-}
-
struct page *
follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address, int write)
Index: working-2.6/arch/i386/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/i386/mm/hugetlbpage.c 2004-04-13 11:42:35.000000000 +1000
+++ working-2.6/arch/i386/mm/hugetlbpage.c 2004-04-15 17:07:42.813857792 +1000
@@ -93,65 +93,26 @@
return -ENOMEM;
}

-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
-{
- unsigned long vpfn, vaddr = *position;
- int remainder = *length;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
-
- vpfn = vaddr/PAGE_SIZE;
- while (vaddr < vma->vm_end && remainder) {
-
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- pte = huge_pte_offset(mm, vaddr);
-
- /* hugetlb should be locked, and hence, prefaulted */
- WARN_ON(!pte || pte_none(*pte));
-
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
- WARN_ON(!PageCompound(page));
-
- get_page(page);
- pages[i] = page;
- }
-
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- ++vpfn;
- --remainder;
- ++i;
- }
-
- *length = remainder;
- *position = vaddr;
-
- return i;
-}
-
#if 0 /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address, int write)
{
- unsigned long start = address;
- int length = 1;
- int nr;
+ int vpfn = vaddr / PAGE_SIZE;
struct page *page;
+ pte_t *pte;

- nr = follow_hugetlb_page(mm, vma, &page, NULL, &start, &length, 0);
- if (nr == 1)
- return page;
- return NULL;
+ pte = huge_pte_offset(mm, address);
+
+ /* hugetlb should be locked, and hence, prefaulted */
+ WARN_ON(!pte || pte_none(*pte));
+
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageCompound(page));
+
+ get_page(page);
+ return page;
}

/*
Index: working-2.6/arch/sparc64/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/sparc64/mm/hugetlbpage.c 2004-04-13 11:42:35.000000000 +1000
+++ working-2.6/arch/sparc64/mm/hugetlbpage.c 2004-04-15 17:08:08.815900136 +1000
@@ -121,47 +121,6 @@
return -ENOMEM;
}

-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
-{
- unsigned long vaddr = *position;
- int remainder = *length;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
-
- while (vaddr < vma->vm_end && remainder) {
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- pte = huge_pte_offset(mm, vaddr);
-
- /* hugetlb should be locked, and hence, prefaulted */
- BUG_ON(!pte || pte_none(*pte));
-
- page = pte_page(*pte);
-
- WARN_ON(!PageCompound(page));
-
- get_page(page);
- pages[i] = page;
- }
-
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- --remainder;
- ++i;
- }
-
- *length = remainder;
- *position = vaddr;
-
- return i;
-}
-
struct page *follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, int write)
Index: working-2.6/arch/ia64/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/ia64/mm/hugetlbpage.c 2004-04-14 12:22:48.000000000 +1000
+++ working-2.6/arch/ia64/mm/hugetlbpage.c 2004-04-15 17:08:30.667905672 +1000
@@ -113,43 +113,6 @@
return -ENOMEM;
}

-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *st, int *length, int i)
-{
- pte_t *ptep, pte;
- unsigned long start = *st;
- unsigned long pstart;
- int len = *length;
- struct page *page;
-
- do {
- pstart = start & HPAGE_MASK;
- ptep = huge_pte_offset(mm, start);
- pte = *ptep;
-
-back1:
- page = pte_page(pte);
- if (pages) {
- page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
- get_page(page);
- pages[i] = page;
- }
- if (vmas)
- vmas[i] = vma;
- i++;
- len--;
- start += PAGE_SIZE;
- if (((start & HPAGE_MASK) == pstart) && len &&
- (start < vma->vm_end))
- goto back1;
- } while (len && start < vma->vm_end);
- *length = len;
- *st = start;
- return i;
-}
-
struct vm_area_struct *hugepage_vma(struct mm_struct *mm, unsigned long addr)
{
if (mm->used_hugetlb) {
Index: working-2.6/arch/sh/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/sh/mm/hugetlbpage.c 2004-04-13 11:42:35.000000000 +1000
+++ working-2.6/arch/sh/mm/hugetlbpage.c 2004-04-15 17:08:49.216881808 +1000
@@ -124,47 +124,6 @@
return -ENOMEM;
}

-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
-{
- unsigned long vaddr = *position;
- int remainder = *length;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
-
- while (vaddr < vma->vm_end && remainder) {
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- pte = huge_pte_offset(mm, vaddr);
-
- /* hugetlb should be locked, and hence, prefaulted */
- BUG_ON(!pte || pte_none(*pte));
-
- page = pte_page(*pte);
-
- WARN_ON(!PageCompound(page));
-
- get_page(page);
- pages[i] = page;
- }
-
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- --remainder;
- ++i;
- }
-
- *length = remainder;
- *position = vaddr;
-
- return i;
-}
-
struct page *follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, int write)


--
David Gibson | For every complex problem there is a
david AT gibson.dropbear.id.au | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson

2004-04-15 17:35:12

by Chen, Kenneth W

Subject: RE: hugetlb demand paging patch part [2/3]

>>>> David Gibson wrote on Thursday, April 15, 2004 12:17 AM
> > diff -Nurp linux-2.6.5/mm/memory.c linux-2.6.5.htlb/mm/memory.c
> > +++ linux-2.6.5.htlb/mm/memory.c 2004-04-13 12:02:31.000000000 -0700
> > @@ -769,11 +769,6 @@ int get_user_pages(struct task_struct *t
> > if ((pages && vm_io) || !(flags & vma->vm_flags))
> > return i ? : -EFAULT;
> >
> > - if (is_vm_hugetlb_page(vma)) {
> > - i = follow_hugetlb_page(mm, vma, pages, vmas,
> > - &start, &len, i);
> > - continue;
> > - }
> > spin_lock(&mm->page_table_lock);
> > do {
> > struct page *map = NULL;
>
> Ok, I notice that you've removed the follow_hugtlb_page() function
> (and from the arch specific stuff, as well). As far as I can tell,
> this isn't actually related to demand-paging, in fact as far as I can
> tell this function is unnecessary

That was the reason I removed the function: it is no longer used
with demand paging.


> should already work for huge pages. In particular the path in
> get_user_pages() which can call handle_mm_fault() (which won't work on
> hugepages without your patch) should never get triggered, since
> hugepages are all prefaulted.

> Does that sound right? In other words, do you think the patch below,
> which just kills off follow_hugetlb_page() is safe, or have I missed
> something?
>
> Index: working-2.6/mm/memory.c
> ===================================================================
> --- working-2.6.orig/mm/memory.c 2004-04-13 11:42:42.000000000 +1000
> +++ working-2.6/mm/memory.c 2004-04-15 17:03:01.421905400 +1000
> @@ -766,16 +766,13 @@
> [snip]
> spin_lock(&mm->page_table_lock);
> do {
> struct page *map;
> int lookup_write = write;
> while (!(map = follow_page(mm, start, lookup_write))) {
> + /* hugepages should always be prefaulted */
> + BUG_ON(is_vm_hugetlb_page(vma));
> /*
> * Shortcut for anonymous pages. We don't want
> * to force the creation of pages tables for

This portion is incorrect, because it will trigger the BUG_ON all the time
when faulting hugetlb pages.

Yes, killing follow_hugetlb_page() is safe because follow_page() takes
care of hugetlb pages. See the 2nd patch posted earlier,
hugetlb_demanding_generic.patch.


2004-04-16 02:39:00

by David Gibson

Subject: Re: hugetlb demand paging patch part [2/3]

On Thu, Apr 15, 2004 at 10:27:58AM -0700, Chen, Kenneth W wrote:
> >>>> David Gibson wrote on Thursday, April 15, 2004 12:17 AM
> > > diff -Nurp linux-2.6.5/mm/memory.c linux-2.6.5.htlb/mm/memory.c
> > > +++ linux-2.6.5.htlb/mm/memory.c 2004-04-13 12:02:31.000000000 -0700
> > > @@ -769,11 +769,6 @@ int get_user_pages(struct task_struct *t
> > > if ((pages && vm_io) || !(flags & vma->vm_flags))
> > > return i ? : -EFAULT;
> > >
> > > - if (is_vm_hugetlb_page(vma)) {
> > > - i = follow_hugetlb_page(mm, vma, pages, vmas,
> > > - &start, &len, i);
> > > - continue;
> > > - }
> > > spin_lock(&mm->page_table_lock);
> > > do {
> > > struct page *map = NULL;
> >
> > Ok, I notice that you've removed the follow_hugtlb_page() function
> > (and from the arch specific stuff, as well). As far as I can tell,
> > this isn't actually related to demand-paging, in fact as far as I can
> > tell this function is unnecessary
>
> That was the reason I removed the function because it is no longer used
> with demand paging.

Yes, but what I'm saying is as far as I can tell it shouldn't be
necessary even *without* demand paging. Again I'm trying to separate
cleanups from new features in your patches.

> > should already work for huge pages. In particular the path in
> > get_user_pages() which can call handle_mm_fault() (which won't work on
> > hugepages without your patch) should never get triggered, since
> > hugepages are all prefaulted.
>
> > Does that sound right? In other words, do you think the patch below,
> > which just kills off follow_hugetlb_page() is safe, or have I missed
> > something?
> >
> > Index: working-2.6/mm/memory.c
> > ===================================================================
> > +++ working-2.6/mm/memory.c 2004-04-15 17:03:01.421905400 +1000
> > @@ -766,16 +766,13 @@
> > [snip]
> > spin_lock(&mm->page_table_lock);
> > do {
> > struct page *map;
> > int lookup_write = write;
> > while (!(map = follow_page(mm, start, lookup_write))) {
> > + /* hugepages should always be prefaulted */
> > + BUG_ON(is_vm_hugetlb_page(vma));
> > /*
> > * Shortcut for anonymous pages. We don't want
> > * to force the creation of pages tables for
>
> This portion is incorrect, because it will trigger BUG_ON all the time
> for faulting hugetlb page.

Ah, yes, I did that because I was (and am) thinking of the case
without demand paging. But I just realised that of course we can get
a fault in that case, if there's a mapping truncation at the wrong
time. Removing the BUG_ON does mean that it's significant that the
never-called hugetlb nopage function is non-NULL, so that
untouched_anonymous_page() returns 0 before looking at the page
tables, which is a bit more subtle than I'd like. Nonetheless, revised
patch below.
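
The shortcut referred to here is the untouched_anonymous_page() check in
get_user_pages(); roughly, from the 2.6-era mm/memory.c (reconstructed from
memory, so a sketch rather than the exact source), the hugetlb-relevant part
is the vm_ops->nopage test:

static inline int
untouched_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
			 unsigned long address)
{
	pgd_t *pgd;
	pmd_t *pmd;

	/* hugetlbfs vmas have a non-NULL nopage, so they never take the
	 * anonymous-page shortcut and follow_page() is always consulted */
	if (vma->vm_ops && vma->vm_ops->nopage)
		return 0;

	/* check whether a pte slot for 'address' could exist at all */
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return 1;

	pmd = pmd_offset(pgd, address);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return 1;

	return 0;
}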

> Yes, killing follow_hugetlb_page() is safe because follow_page() takes
> care of hugetlb page. See 2nd patch posted earlier in
> hugetlb_demanding_generic.patch

Yes, I looked at it already. But what I'm asking about is applying
this patch *without* (or before) going to demand paging.

Index: working-2.6/mm/memory.c
===================================================================
--- working-2.6.orig/mm/memory.c 2004-04-13 11:42:42.000000000 +1000
+++ working-2.6/mm/memory.c 2004-04-16 11:46:31.935870496 +1000
@@ -766,16 +766,13 @@
|| !(flags & vma->vm_flags))
return i ? : -EFAULT;

- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &len, i);
- continue;
- }
spin_lock(&mm->page_table_lock);
do {
struct page *map;
int lookup_write = write;
while (!(map = follow_page(mm, start, lookup_write))) {
+ /* hugepages should always be prefaulted */
+ BUG_ON(is_vm_hugetlb_page(vma));
/*
* Shortcut for anonymous pages. We don't want
* to force the creation of pages tables for
Index: working-2.6/include/linux/hugetlb.h
===================================================================
--- working-2.6.orig/include/linux/hugetlb.h 2004-04-13 11:42:41.000000000 +1000
+++ working-2.6/include/linux/hugetlb.h 2004-04-16 11:46:31.947868672 +1000
@@ -12,7 +12,6 @@

int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void *, size_t *);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
-int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long);
int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
@@ -64,7 +63,6 @@
return 0;
}

-#define follow_hugetlb_page(m,v,p,vs,a,b,i) ({ BUG(); 0; })
#define follow_huge_addr(mm, vma, addr, write) 0
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
#define hugetlb_prefault(mapping, vma) ({ BUG(); 0; })
Index: working-2.6/arch/ppc64/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/hugetlbpage.c 2004-04-13 11:42:35.000000000 +1000
+++ working-2.6/arch/ppc64/mm/hugetlbpage.c 2004-04-16 11:46:31.950868216 +1000
@@ -288,52 +288,6 @@
return 0;
}

-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
-{
- unsigned long vpfn, vaddr = *position;
- int remainder = *length;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
-
- vpfn = vaddr/PAGE_SIZE;
- while (vaddr < vma->vm_end && remainder) {
- BUG_ON(!in_hugepage_area(mm->context, vaddr));
-
- if (pages) {
- hugepte_t *pte;
- struct page *page;
-
- pte = hugepte_offset(mm, vaddr);
-
- /* hugetlb should be locked, and hence, prefaulted */
- WARN_ON(!pte || hugepte_none(*pte));
-
- page = &hugepte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
- WARN_ON(!PageCompound(page));
-
- get_page(page);
- pages[i] = page;
- }
-
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- ++vpfn;
- --remainder;
- ++i;
- }
-
- *length = remainder;
- *position = vaddr;
-
- return i;
-}
-
struct page *
follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address, int write)
Index: working-2.6/arch/i386/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/i386/mm/hugetlbpage.c 2004-04-13 11:42:35.000000000 +1000
+++ working-2.6/arch/i386/mm/hugetlbpage.c 2004-04-16 11:49:11.914912248 +1000
@@ -93,65 +93,27 @@
return -ENOMEM;
}

-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
-{
- unsigned long vpfn, vaddr = *position;
- int remainder = *length;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
-
- vpfn = vaddr/PAGE_SIZE;
- while (vaddr < vma->vm_end && remainder) {
-
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- pte = huge_pte_offset(mm, vaddr);
-
- /* hugetlb should be locked, and hence, prefaulted */
- WARN_ON(!pte || pte_none(*pte));
-
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
- WARN_ON(!PageCompound(page));
-
- get_page(page);
- pages[i] = page;
- }
-
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- ++vpfn;
- --remainder;
- ++i;
- }
-
- *length = remainder;
- *position = vaddr;
-
- return i;
-}
-
#if 0 /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address, int write)
{
- unsigned long start = address;
- int length = 1;
- int nr;
+ int vpfn = vaddr / PAGE_SIZE;
struct page *page;
+ pte_t *pte;

- nr = follow_hugetlb_page(mm, vma, &page, NULL, &start, &length, 0);
- if (nr == 1)
- return page;
- return NULL;
+ pte = huge_pte_offset(mm, address);
+
+ /* PTE could be absent if there's been a mapping truncation */
+ if (! pte || pte_none(*pte))
+ return NULL;
+
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageCompound(page));
+
+ get_page(page);
+ return page;
}

/*
Index: working-2.6/arch/sparc64/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/sparc64/mm/hugetlbpage.c 2004-04-13 11:42:35.000000000 +1000
+++ working-2.6/arch/sparc64/mm/hugetlbpage.c 2004-04-16 11:46:31.961866544 +1000
@@ -121,47 +121,6 @@
return -ENOMEM;
}

-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
-{
- unsigned long vaddr = *position;
- int remainder = *length;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
-
- while (vaddr < vma->vm_end && remainder) {
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- pte = huge_pte_offset(mm, vaddr);
-
- /* hugetlb should be locked, and hence, prefaulted */
- BUG_ON(!pte || pte_none(*pte));
-
- page = pte_page(*pte);
-
- WARN_ON(!PageCompound(page));
-
- get_page(page);
- pages[i] = page;
- }
-
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- --remainder;
- ++i;
- }
-
- *length = remainder;
- *position = vaddr;
-
- return i;
-}
-
struct page *follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, int write)
Index: working-2.6/arch/ia64/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/ia64/mm/hugetlbpage.c 2004-04-14 12:22:48.000000000 +1000
+++ working-2.6/arch/ia64/mm/hugetlbpage.c 2004-04-16 11:46:31.963866240 +1000
@@ -113,43 +113,6 @@
return -ENOMEM;
}

-int
-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *st, int *length, int i)
-{
- pte_t *ptep, pte;
- unsigned long start = *st;
- unsigned long pstart;
- int len = *length;
- struct page *page;
-
- do {
- pstart = start & HPAGE_MASK;
- ptep = huge_pte_offset(mm, start);
- pte = *ptep;
-
-back1:
- page = pte_page(pte);
- if (pages) {
- page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
- get_page(page);
- pages[i] = page;
- }
- if (vmas)
- vmas[i] = vma;
- i++;
- len--;
- start += PAGE_SIZE;
- if (((start & HPAGE_MASK) == pstart) && len &&
- (start < vma->vm_end))
- goto back1;
- } while (len && start < vma->vm_end);
- *length = len;
- *st = start;
- return i;
-}
-
struct vm_area_struct *hugepage_vma(struct mm_struct *mm, unsigned long addr)
{
if (mm->used_hugetlb) {
Index: working-2.6/arch/sh/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/sh/mm/hugetlbpage.c 2004-04-13 11:42:35.000000000 +1000
+++ working-2.6/arch/sh/mm/hugetlbpage.c 2004-04-16 11:46:31.971865024 +1000
@@ -124,47 +124,6 @@
return -ENOMEM;
}

-int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct page **pages, struct vm_area_struct **vmas,
- unsigned long *position, int *length, int i)
-{
- unsigned long vaddr = *position;
- int remainder = *length;
-
- WARN_ON(!is_vm_hugetlb_page(vma));
-
- while (vaddr < vma->vm_end && remainder) {
- if (pages) {
- pte_t *pte;
- struct page *page;
-
- pte = huge_pte_offset(mm, vaddr);
-
- /* hugetlb should be locked, and hence, prefaulted */
- BUG_ON(!pte || pte_none(*pte));
-
- page = pte_page(*pte);
-
- WARN_ON(!PageCompound(page));
-
- get_page(page);
- pages[i] = page;
- }
-
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- --remainder;
- ++i;
- }
-
- *length = remainder;
- *position = vaddr;
-
- return i;
-}
-
struct page *follow_huge_addr(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, int write)

--
David Gibson | For every complex problem there is a
david AT gibson.dropbear.id.au | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson

2004-04-16 02:58:56

by Chen, Kenneth W

Subject: RE: hugetlb demand paging patch part [2/3]

>>>> David Gibson wrote on Thursday, April 15, 2004 7:35 PM
> > Yes, killing follow_hugetlb_page() is safe because follow_page() takes
> > care of hugetlb page. See 2nd patch posted earlier in
> > hugetlb_demanding_generic.patch
>
> Yes, I looked at it already. But what I'm asking about is applying
> this patch *without* (or before) going to demand paging.
>
> Index: working-2.6/mm/memory.c
> ===================================================================
> --- working-2.6.orig/mm/memory.c 2004-04-13 11:42:42.000000000 +1000
> +++ working-2.6/mm/memory.c 2004-04-16 11:46:31.935870496 +1000
> @@ -766,16 +766,13 @@
> || !(flags & vma->vm_flags))
> return i ? : -EFAULT;
>
> - if (is_vm_hugetlb_page(vma)) {
> - i = follow_hugetlb_page(mm, vma, pages, vmas,
> - &start, &len, i);
> - continue;
> - }
> spin_lock(&mm->page_table_lock);
> do {
> struct page *map;
> int lookup_write = write;
> while (!(map = follow_page(mm, start, lookup_write))) {
> + /* hugepages should always be prefaulted */
> + BUG_ON(is_vm_hugetlb_page(vma));
> /*
> * Shortcut for anonymous pages. We don't want
> * to force the creation of pages tables for
>
> Yes, I looked at it already. But what I'm asking about is applying
> this patch *without* (or before) going to demand paging.

In that case, yes, it is not absolutely required. But we do a special
optimization for follow_hugetlb_pages() in the prefaulting implementation,
at least for the ia64 arch. It gives a sizable gain on the db benchmark.

- Ken


2004-04-16 03:35:57

by David Gibson

Subject: Re: hugetlb demand paging patch part [2/3]

On Thu, Apr 15, 2004 at 07:58:07PM -0700, Chen, Kenneth W wrote:
> >>>> David Gibson wrote on Thursday, April 15, 2004 7:35 PM
> > > Yes, killing follow_hugetlb_page() is safe because follow_page() takes
> > > care of hugetlb page. See 2nd patch posted earlier in
> > > hugetlb_demanding_generic.patch
> >
> > Yes, I looked at it already. But what I'm asking about is applying
> > this patch *without* (or before) going to demand paging.
> >
> > Index: working-2.6/mm/memory.c
> > ===================================================================
> > +++ working-2.6/mm/memory.c 2004-04-16 11:46:31.935870496 +1000
> > @@ -766,16 +766,13 @@
> > || !(flags & vma->vm_flags))
> > return i ? : -EFAULT;
> >
> > - if (is_vm_hugetlb_page(vma)) {
> > - i = follow_hugetlb_page(mm, vma, pages, vmas,
> > - &start, &len, i);
> > - continue;
> > - }
> > spin_lock(&mm->page_table_lock);
> > do {
> > struct page *map;
> > int lookup_write = write;
> > while (!(map = follow_page(mm, start, lookup_write))) {
> > + /* hugepages should always be prefaulted */
> > + BUG_ON(is_vm_hugetlb_page(vma));
> > /*
> > * Shortcut for anonymous pages. We don't want
> > * to force the creation of pages tables for
> >
> > Yes, I looked at it already. But what I'm asking about is applying
> > this patch *without* (or before) going to demand paging.

Oh, drat. Looks like I included the old version of the patch again,
not the fixed version without the BUG_ON(). Corrected version
below for real this time.

> In that case, yes, it is not absolutely required. But we do special
> optimization for follow_hugetlb_pages() in the prefaulting implementation,
> at least for ia64 arch. It give a sizable gain on db benchmark.

Ah! So it's just an optimization - it makes a bit more sense to me
now. I had assumed that this case (hugepage get_user_pages()) would
be sufficiently rare that it would not require optimization.
Apparently not.

Do you know where the cycles are going without this optimization? In
particular, could it be just the find_vma() in hugepage_vma() called
before follow_huge_addr()? I note that IA64 is the only arch to have
a non-trivial hugepage_vma()/follow_huge_addr() and that its
follow_huge_addr() doesn't actually use the vma passed in.

So the only significance of the find_vma() is to determine that we're
in an allocated region and that therefore (with prefault) the hugepage
PTEs must be present. If you actually check for the presence of the
page table entries and the return code of huge_pte_offset() in
follow_huge_addr() (as your 1/3 patch does), then the only thing that
hugepage_vma() need do on ia64 is to check that the address lies in
region 1 (and of course change names and remove the vma parameters,
since they're no longer used).
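
A minimal sketch of that simplified check (the macro names follow the ia64
conventions of the time and are assumptions here, not code from any posted
patch):

/*
 * Hypothetical replacement for the find_vma()-based hugepage_vma() test:
 * just ask whether the address falls in the ia64 hugepage region, and let
 * follow_huge_addr() verify via huge_pte_offset() that a pte is present.
 */
static inline int addr_in_hugepage_region(unsigned long addr)
{
	return REGION_NUMBER(addr) == REGION_HPAGE;
}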

If we could get rid of follow_hugetlb_pages() it would remove an ugly
function from every arch, which would be nice.

--
David Gibson | For every complex problem there is a
david AT gibson.dropbear.id.au | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson

2004-04-16 04:14:25

by Chen, Kenneth W

Subject: RE: hugetlb demand paging patch part [2/3]

David Gibson wrote on Thursday, April 15, 2004 8:27 PM
> Ah! So it's just an optimiziation - it makes a bit more sense to me
> now. I had assumed that this case (hugepage get_user_pages()) would
> be sufficiently rare that it would not require optimization.
> Apparently not.

It's a huge deal because for *every* I/O, the kernel has to do get_user_pages()
to lock the page; it really gets in the way, with the spin_lock as well.

spin_lock(&mm->page_table_lock);
do {
struct page *map;
int lookup_write = write;
while (!(map = follow_page(mm, start, lookup_write))) {

With current state-of-the-art platforms, I/O requirements push into 200K
per second, so this becomes quite significant.


> Do you know where the cycles are going without this optimization? In
> particular, could it be just the find_vma() in hugepage_vma() called
> before follow_huge_addr()? I note that IA64 is the only arch to have
> a non-trivial hugepage_vma()/follow_huge_addr() and that its
> follow_huge_addr() doesn't actually use the vma passed in.

That's one, plus the spin lock mentioned above.


> If we could get rid of follow_hugetlb_pages() it would remove an ugly
> function from every arch, which would be nice.

I hope the goal here is not to trim code for the existing prefaulting scheme.
That function has to go for demand paging, and demand paging comes with
a performance price most people don't realize. If the goal here is to
make the code prettier, I vote against that.


2004-04-16 04:51:21

by David Gibson

Subject: Re: hugetlb demand paging patch part [2/3]

On Thu, Apr 15, 2004 at 09:13:38PM -0700, Chen, Kenneth W wrote:
> David Gibson wrote on Thursday, April 15, 2004 8:27 PM
> > Ah! So it's just an optimiziation - it makes a bit more sense to me
> > now. I had assumed that this case (hugepage get_user_pages()) would
> > be sufficiently rare that it would not require optimization.
> > Apparently not.
>
> It's a huge deal because for *every* I/O, kernel has to do get_user_pages()
> to lock the page, it's really gets in the way with the spin_lock as well.
>
> spin_lock(&mm->page_table_lock);
> do {
> struct page *map;
> int lookup_write = write;
> while (!(map = follow_page(mm, start, lookup_write))) {
>
> With current state of art platform, I/O requirement pushes into 200K
> per second, this become quite significant.

Ok. This makes sense now that you explain it.

> > Do you know where the cycles are going without this optimization? In
> > particular, could it be just the find_vma() in hugepage_vma() called
> > before follow_huge_addr()? I note that IA64 is the only arch to have
> > a non-trivial hugepage_vma()/follow_huge_addr() and that its
> > follow_huge_addr() doesn't actually use the vma passed in.
>
> That's one, plus the spin lock mentioned above.

And akpm has just explained why it can be avoided in the hugepage
case.

> > If we could get rid of follow_hugetlb_pages() it would remove an ugly
> > function from every arch, which would be nice.
>
> I hope the goal here is not to trim code for existing prefaulting scheme.
> That function has to go for demand paging, and demand paging comes with
> a performance price most people don't realize. If the goal here is to
> make the code prettier, I vote against that.

Well, I'm attempting to understand the hugepage code across all the
archs, so that I can try to implement copy-on-write with a minimum of
arch specific gunk. Simplifying and consolidating the existing code
across archs would be a helpful first step, if possible.

--
David Gibson | For every complex problem there is a
david AT gibson.dropbear.id.au | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson

2004-04-16 05:57:31

by Chen, Kenneth W

Subject: RE: hugetlb demand paging patch part [2/3]

David Gibson wrote on Thursday, April 15, 2004 9:49 PM
> > > If we could get rid of follow_hugetlb_pages() it would remove an ugly
> > > function from every arch, which would be nice.
> >
> > I hope the goal here is not to trim code for existing prefaulting scheme.
> > That function has to go for demand paging, and demand paging comes with
> > a performance price most people don't realize. If the goal here is to
> > make the code prettier, I vote against that.
>
> Well, I'm attempting to understand the hugepage code across all the
> archs, so that I can try to implement copy-on-write with a minimum of
> arch specific gunk. Simplifying and consolidating the existing code
> across archs would be a helpful first step, if possible.

Looks like everyone has their own agenda. COW is related to demand paging,
and has its own set of characteristics to deal with. I would hope we do one
thing at a time.

- Ken


2004-04-16 06:17:42

by David Gibson

Subject: Re: hugetlb demand paging patch part [2/3]

On Thu, Apr 15, 2004 at 10:56:14PM -0700, Chen, Kenneth W wrote:
> David Gibson wrote on Thursday, April 15, 2004 9:49 PM
> > > > If we could get rid of follow_hugetlb_pages() it would remove an ugly
> > > > function from every arch, which would be nice.
> > >
> > > I hope the goal here is not to trim code for existing prefaulting scheme.
> > > That function has to go for demand paging, and demand paging comes with
> > > a performance price most people don't realize. If the goal here is to
> > > make the code prettier, I vote against that.
> >
> > Well, I'm attempting to understand the hugepage code across all the
> > archs, so that I can try to implement copy-on-write with a minimum of
> > arch specific gunk. Simplifying and consolidating the existing code
> > across archs would be a helpful first step, if possible.
>
> Looks like everyone has their own agenda, COW is related to demand paging,
> and has it's own set of characteristics to deal with. I would hope do one
> thing at a time.

Which is why I've attempted to factor things out of your patches which
don't appear to be inherent to demand paging. Consolidating the
existing hugepage code will make both demand-paging and COW patches
much more palatable.

--
David Gibson | For every complex problem there is a
david AT gibson.dropbear.id.au | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson

2004-04-16 19:04:04

by Ray Bryant

Subject: Re: hugetlb demand paging patch part [2/3]

David,

Is there a big user demand for copy-on-write support for hugetlb pages?
I can understand the rationale for making hugetlb pages behave more like user
pages, and fixing the problem that hugetlb pages are shared across fork via
MAP_SHARE semantics regardless of whether the user requests MAP_PRIVATE or
not, but it just doesn't strike me as something that anyone who uses hugetlb
pages would actually want.

Of course, YRMV (your requirements may vary). :-)

'David Gibson' wrote:
>
> Well, I'm attempting to understand the hugepage code across all the
> archs, so that I can try to implement copy-on-write with a minimum of
> arch specific gunk. Simplifying and consolidating the existing code
> across archs would be a helpful first step, if possible.
>

--
Best Regards,
Ray
-----------------------------------------------
Ray Bryant
512-453-9679 (work) 512-507-7807 (cell)
[email protected] [email protected]
The box said: "Requires Windows 98 or better",
so I installed Linux.
-----------------------------------------------

2004-04-17 12:07:52

by David Gibson

Subject: Re: hugetlb demand paging patch part [2/3]

On Fri, Apr 16, 2004 at 02:05:13PM -0500, Ray Bryant wrote:
> David,
>
> Is there a big user demand for copy-on-write support for hugetlb pages?
> I can understand the rationale for making hugetlb pages behave more like
> user pages, and fixing the problem that hugetlb pages are shared across
> fork via MAP_SHARE semantics regardless of whether the user requests
> MAP_PRIVATE or not, but it just doesn't strike me as something that anyone
> who uses hugetlb pages would actually want.

My main interest in it is as a prerequisite for various methods of
"automatically" using hugepages for programs where it is difficult to
manually code them to use hugetlbfs. In particular, think HPC
monsters written in FORTRAN: e.g. automatically putting suitably
aligned anonymous mmap()s in hugepages under some circumstances (I
can't say I like that idea much), using an LD_PRELOAD to put
malloc()ated memory into hugepages, or using a hacked ELF loader to
put the BSS section (again, think FORTRAN) into hugepages (actually
easier and less ugly than it sounds).
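
To illustrate the LD_PRELOAD idea, a minimal shim might look like the sketch
below. Everything in it is an assumption made for illustration (the /mnt/huge
mount point, the pool file name, the fixed 256MB pool, a bump allocator with
no real free(), and no calloc()/realloc() interposition); it is not code from
this thread.

#include <fcntl.h>
#include <stddef.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#define POOL_SIZE (256UL << 20)	/* 256MB worth of huge pages */

static char *pool;
static size_t used;

static void init_pool(void)
{
	int fd = open("/mnt/huge/pool", O_CREAT | O_RDWR, 0600);

	if (fd < 0)
		abort();
	/* hugetlbfs memory behaves as shared here, hence the MAP_SHARED */
	pool = mmap(NULL, POOL_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	if (pool == MAP_FAILED)
		abort();
}

void *malloc(size_t size)
{
	void *p;

	if (!pool)
		init_pool();
	size = (size + 15) & ~15UL;	/* keep 16-byte alignment */
	if (used + size > POOL_SIZE)
		return NULL;
	p = pool + used;
	used += size;
	return p;
}

void free(void *ptr)
{
	/* bump allocator: nothing is reclaimed until the process exits */
}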

In any of these cases having the memory have different semantics
(MAP_SHARED) to normal anonymous memory would clearly be a Bad Thing.

> Of course, YRMV (your requirements may vary). :-)
>
> 'David Gibson' wrote:
> >
> >Well, I'm attempting to understand the hugepage code across all the
> >archs, so that I can try to implement copy-on-write with a minimum of
> >arch specific gunk. Simplifying and consolidating the existing code
> >across archs would be a helpful first step, if possible.

--
David Gibson | For every complex problem there is a
david AT gibson.dropbear.id.au | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson

2004-04-18 17:35:43

by Ray Bryant

Subject: Re: [Lse-tech] Re: hugetlb demand paging patch part [2/3]



'David Gibson' wrote:

>
>
> My main interest in it is as a prerequisite for various methods of
> "automatically" using hugepages for programs where it is difficult to
> manually code them to use hugetlbfs. In particular, think HPC
> monsters written in FORTRAN. e.g. automatically putting suitable
> aligned anonymous mmap()s in hugepages under some circumstances (I
> can't say I like that idea much), using an LD_PRELOAD to put
> malloc()ated memory into hugepages, or using a hacked ELF loader to
> put the BSS section (again, think FORTRAN) into hugepages (actually
> easier and less ugly than it sounds).
>

Well, that certainly is a laudable goal. At the moment, one usually has to
resort to such things as POINTER variables and the like to get access to
hugetlbpage segments. Unfortunately, some of our experiments with the Intel
compiler for ia64 have indicated that the generated code can be significantly
slower when arrays are referenced off of POINTER variables than when the same
arrays are referenced out of COMMON, thus eliminating the performance gain of
HUGETLB pages.

My question was really intended to address applying development effort to
things that the users of hugetlbpages will likely actually use. For example,
it seems pointless to worry too much about demand paging of hugetlbpages out
to disk. Anyone who uses hugetlbpages for the performance boost they give
will also likely have rightsized their problem or machine configuration to
eliminate any swapping.

> In any of these cases having the memory have different semantics
> (MAP_SHARED) to normal anonymous memory would clearly be a Bad Thing.
>
>
>
>

--
Best Regards,
Ray
-----------------------------------------------
Ray Bryant
512-453-9679 (work) 512-507-7807 (cell)
[email protected] [email protected]
The box said: "Requires Windows 98 or better",
so I installed Linux.
-----------------------------------------------

2004-04-19 01:01:05

by David Gibson

Subject: Re: [Lse-tech] Re: hugetlb demand paging patch part [2/3]

On Sun, Apr 18, 2004 at 12:36:05PM -0500, Ray Bryant wrote:
>
>
> 'David Gibson' wrote:
>
> >
> >
> >My main interest in it is as a prerequisite for various methods of
> >"automatically" using hugepages for programs where it is difficult to
> >manually code them to use hugetlbfs. In particular, think HPC
> >monsters written in FORTRAN. e.g. automatically putting suitable
> >aligned anonymous mmap()s in hugepages under some circumstances (I
> >can't say I like that idea much), using an LD_PRELOAD to put
> >malloc()ated memory into hugepages, or using a hacked ELF loader to
> >put the BSS section (again, think FORTRAN) into hugepages (actually
> >easier and less ugly than it sounds).
> >
>
> Well, that certainly is a laudable goal. At the moment, one usually has to
> resort to such things as POINTER variables and the like to get access to
> hugetlbpage segments. Unfortunately, some of our experiments with the
> Intel compiler for ia64 have indicated that the generated code can be
> significantly slower when arrays are referenced off of POINTER variables
> than when the same arrays are referenced out of COMMON, thus eliminating
> the performance gain of HUGETLB pages.

Well, that's one problem with using POINTERs, but I think perhaps the
more serious one is that a lot of HPC code is written by scientists
who aren't programmers, and who still think in FORTRAN77.

> My question was really intended to address applying development effort to
> things that the users of hugetlbpages will likely actually use. For
> example, it seems pointless to worry too much about demand paging of
> hugetlbpages out to disk. Anyone who uses hugetlbpages for the performance
> boost they give will also likely have rightsized their problem or machine
> configuration to eliminate any swapping.

Well, indeed. Note that this "demand paging" set of patches doesn't
actually do paging out to disk - it just does on-demand allocation of
physical hugepages, rather than prefaulting them all. My only real
interest in it is that part of the mechanism is identical to that
needed for COW (handle_hugetlb_mm_fault(), in particular).

> >In any of these cases having the memory have different semantics
> >(MAP_SHARED) to normal anonymous memory would clearly be a Bad Thing.

--
David Gibson | For every complex problem there is a
david AT gibson.dropbear.id.au | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson