2005-11-07 21:11:43

by Adam Litke

Subject: [RFC 0/2] Copy on write for hugetlbfs

The following two patches implement copy-on-write for hugetlbfs (thus
enabling MAP_PRIVATE mappings). Patch 1/2 (previously posted by David
Gibson) contains a couple of small fixes to the demand fault handler and
makes COW fit in nicely. Patch 2/2 contains the COW changes. Comments?
--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center


2005-11-07 21:39:42

by Adam Litke

Subject: [RFC 2/2] Hugetlb COW

[RFC] COW for hugepages
(Patch originally from David Gibson <[email protected]>)

This patch implements copy-on-write for hugepages, hence allowing
MAP_PRIVATE mappings of hugetlbfs.

This is chiefly useful for cases where we want to use hugepages
"automatically" - that is to map hugepages without the knowledge of
the code in the final application (either via kernel hooks, or with
LD_PRELOAD). We can use various heuristics to determine when
hugepages might be a good idea, but changing the semantics of
anonymous memory from MAP_PRIVATE to MAP_SHARED without the app's
knowledge is clearly wrong.
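
As a side note (not part of the patch itself), below is a minimal sketch of
the sort of LD_PRELOAD interposition described above: intercepting large
private anonymous mmap()s and backing them with MAP_PRIVATE hugetlbfs
mappings instead. The mount point, huge page size, and temp-file template
are illustrative assumptions, and real code would also need to guard
against recursion through dlsym()/malloc().

/* Illustrative LD_PRELOAD shim (not part of this patch): redirect large
 * private anonymous mmap()s to MAP_PRIVATE hugetlbfs mappings.  The mount
 * point, huge page size and temp-file template below are assumptions.
 * Build with: gcc -shared -fPIC -o preload.so preload.c -ldl */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

#define HPAGE_SIZE	(16UL * 1024 * 1024)	/* example size; arch-dependent */

void *mmap(void *addr, size_t len, int prot, int flags, int fd, off_t off)
{
	static void *(*real_mmap)(void *, size_t, int, int, int, off_t);

	if (!real_mmap)
		real_mmap = (void *(*)(void *, size_t, int, int, int, off_t))
			dlsym(RTLD_NEXT, "mmap");

	/* Only intercept large MAP_PRIVATE|MAP_ANONYMOUS requests */
	if ((flags & MAP_ANONYMOUS) && (flags & MAP_PRIVATE) &&
	    len >= HPAGE_SIZE) {
		char path[] = "/mnt/huge/preload.XXXXXX";	/* assumed mount */
		int hfd = mkstemp(path);

		if (hfd >= 0) {
			/* hugetlbfs wants a huge-page-multiple length */
			size_t hlen = (len + HPAGE_SIZE - 1) & ~(HPAGE_SIZE - 1);
			void *p;

			unlink(path);
			/* MAP_PRIVATE here only works with the COW patch applied */
			p = real_mmap(addr, hlen, prot, MAP_PRIVATE, hfd, 0);
			close(hfd);
			if (p != MAP_FAILED)
				return p;
		}
	}
	/* Fall back to the real mapping on any failure */
	return real_mmap(addr, len, prot, flags, fd, off);
}
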
---
fs/hugetlbfs/inode.c | 3 -
include/linux/hugetlb.h | 11 ++++
mm/hugetlb.c | 113 ++++++++++++++++++++++++++++++++++++++++--------
mm/mmap.c | 2
4 files changed, 107 insertions(+), 22 deletions(-)
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c
+++ current/fs/hugetlbfs/inode.c
@@ -100,9 +100,6 @@ static int hugetlbfs_file_mmap(struct fi
loff_t len, vma_len;
int ret;

- if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
- return -EINVAL;
-
if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
return -EINVAL;

diff -upN reference/include/linux/hugetlb.h current/include/linux/hugetlb.h
--- reference/include/linux/hugetlb.h
+++ current/include/linux/hugetlb.h
@@ -65,6 +65,17 @@ pte_t huge_ptep_get_and_clear(struct mm_
pte_t *ptep);
#endif

+#define huge_ptep_set_wrprotect(mm, addr, ptep) ptep_set_wrprotect(mm, addr, ptep)
+static inline void set_huge_ptep_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t entry;
+
+ entry = pte_mkwrite(pte_mkdirty(*ptep));
+ ptep_set_access_flags(vma, address, ptep, entry, 1);
+ update_mmu_cache(vma, address, entry);
+}
+
#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK
#define hugetlb_prefault_arch_hook(mm) do { } while (0)
#else
diff -upN reference/mm/hugetlb.c current/mm/hugetlb.c
--- reference/mm/hugetlb.c
+++ current/mm/hugetlb.c
@@ -255,11 +255,12 @@ struct vm_operations_struct hugetlb_vm_o
.nopage = hugetlb_nopage,
};

-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable)
{
pte_t entry;

- if (vma->vm_flags & VM_WRITE) {
+ if (writable) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
} else {
@@ -277,6 +278,9 @@ int copy_hugetlb_page_range(struct mm_st
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
+ int cow;
+
+ cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
src_pte = huge_pte_offset(src, addr);
@@ -288,6 +292,8 @@ int copy_hugetlb_page_range(struct mm_st
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
if (!pte_none(*src_pte)) {
+ if (cow)
+ huge_ptep_set_wrprotect(src, addr, src_pte);
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
@@ -340,7 +346,7 @@ void unmap_hugepage_range(struct vm_area
}

static struct page *find_or_alloc_huge_page(struct address_space *mapping,
- unsigned long idx)
+ unsigned long idx, int shared)
{
struct page *page;
int err;
@@ -359,26 +365,78 @@ retry:
return NULL;
}

- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
- if (err) {
- put_page(page);
- hugetlb_put_quota(mapping);
- if (err == -EEXIST)
- goto retry;
- page = NULL;
+ if (shared) {
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ page = NULL;
+ }
}

return page;
}

-int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, pte_t pte)
+{
+ struct page *old_page, *new_page;
+ int i, avoidcopy;
+
+ old_page = pte_page(pte);
+
+ /* If no-one else is actually using this page, avoid the copy
+ * and just make the page writable */
+ avoidcopy = (page_count(old_page) == 1);
+ if (avoidcopy) {
+ set_huge_ptep_writable(vma, address, ptep);
+ return VM_FAULT_MINOR;
+ }
+
+ page_cache_get(old_page);
+ new_page = alloc_huge_page();
+
+ if (! new_page) {
+ page_cache_release(old_page);
+
+ /* Logically this is OOM, not a SIGBUS, but an OOM
+ * could cause the kernel to go killing other
+ * processes which won't help the hugepage situation
+ * at all (?) */
+ return VM_FAULT_SIGBUS;
+ }
+
+ spin_unlock(&mm->page_table_lock);
+ for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+ copy_user_highpage(new_page + i, old_page + i,
+ address + i*PAGE_SIZE);
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+ if (likely(pte_same(*ptep, pte))) {
+ /* Break COW */
+ set_huge_pte_at(mm, address, ptep,
+ make_huge_pte(vma, new_page, 1));
+ /* Make the old page be freed below */
+ new_page = old_page;
+ }
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ return VM_FAULT_MINOR;
+}
+
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ int write_access)
{
int ret;
unsigned long idx;
unsigned long size;
struct page *page;
struct address_space *mapping;
+ pte_t new_pte;

mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -386,7 +444,8 @@ int hugetlb_no_page(struct mm_struct *mm

/* This returns a locked page, which keeps us safe in the
* event of a race with truncate() */
- page = find_or_alloc_huge_page(mapping, idx);
+ page = find_or_alloc_huge_page(mapping, idx,
+ vma->vm_flags & VM_SHARED);
if (!page)
return VM_FAULT_SIGBUS;

@@ -405,7 +464,16 @@ int hugetlb_no_page(struct mm_struct *mm
goto backout;

add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+
+ new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ && (vma->vm_flags & VM_SHARED)));
+
+ set_huge_pte_at(mm, address, ptep, new_pte);
+
+ if (write_access && !(vma->vm_flags & VM_SHARED)) {
+ /* Optimization, do the COW without a second fault */
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+ }

spin_unlock(&mm->page_table_lock);
unlock_page(page);
@@ -426,6 +494,7 @@ int hugetlb_fault(struct mm_struct *mm,
{
pte_t *ptep;
pte_t entry;
+ int ret;

ptep = huge_pte_alloc(mm, address);
if (! ptep)
@@ -434,12 +503,20 @@ int hugetlb_fault(struct mm_struct *mm,
entry = *ptep;

if (pte_none(entry))
- return hugetlb_no_page(mm, vma, address, ptep);
+ return hugetlb_no_page(mm, vma, address, ptep, write_access);

- /* we could get here if another thread instantiated the pte
- * before the test above */
+ ret = VM_FAULT_MINOR;

- return VM_FAULT_MINOR;
+ spin_lock(&mm->page_table_lock);
+
+ if (likely(pte_same(entry, *ptep)))
+ /* pte could have changed before we grabbed the lock */
+ if (write_access && !pte_write(entry))
+ ret = hugetlb_cow(mm, vma, address, ptep, entry);
+
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
diff -upN reference/mm/mmap.c current/mm/mmap.c
--- reference/mm/mmap.c
+++ current/mm/mmap.c
@@ -1077,7 +1077,7 @@ munmap_back:
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+ if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED | VM_HUGETLB))
== (VM_WRITE | VM_RESERVED)) {
printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
"PROT_WRITE mmap of VM_RESERVED memory, which "

--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

2005-11-07 21:39:06

by Adam Litke

Subject: [RFC 1/2] Hugetlb fault fixes and reorg

[RFC] Cleanup / small fixes to hugetlb fault handling
(Patch originally from David Gibson <[email protected]>)
Initial Post: Tue. 25 Oct 2005

On Thu, 2005-10-27 at 16:37 +1000, 'David Gibson' wrote:
> This patch makes some slight tweaks / cleanups to the fault handling
> path for huge pages in -mm. My main motivation is to make it simpler
> to fit COW in, but along the way it addresses a few minor problems
> with the existing code:
>
> - The check against i_size was duplicated: once in
> find_lock_huge_page() and again in hugetlb_fault() after taking the
> page_table_lock. We only really need the locked one, so remove the
> other.
>
> - find_lock_huge_page() isn't a great name, since it does extra things
> not analogous to find_lock_page(). Rename it
> find_or_alloc_huge_page() which is closer to the mark.
>
> Signed-off-by: David Gibson <[email protected]>
Acked-by: Adam Litke <[email protected]>

---
hugetlb.c | 77 +++++++++++++++++++++++++++++++++++++-------------------------
1 files changed, 46 insertions(+), 31 deletions(-)
diff -upN reference/mm/hugetlb.c current/mm/hugetlb.c
--- reference/mm/hugetlb.c
+++ current/mm/hugetlb.c
@@ -339,30 +339,24 @@ void unmap_hugepage_range(struct vm_area
flush_tlb_range(vma, start, end);
}

-static struct page *find_lock_huge_page(struct address_space *mapping,
- unsigned long idx)
+static struct page *find_or_alloc_huge_page(struct address_space *mapping,
+ unsigned long idx)
{
struct page *page;
int err;
- struct inode *inode = mapping->host;
- unsigned long size;

retry:
page = find_lock_page(mapping, idx);
if (page)
- goto out;
-
- /* Check to make sure the mapping hasn't been truncated */
- size = i_size_read(inode) >> HPAGE_SHIFT;
- if (idx >= size)
- goto out;
+ return page;

if (hugetlb_get_quota(mapping))
- goto out;
+ return NULL;
+
page = alloc_huge_page();
if (!page) {
hugetlb_put_quota(mapping);
- goto out;
+ return NULL;
}

err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
@@ -373,50 +367,49 @@ retry:
goto retry;
page = NULL;
}
-out:
+
return page;
}

-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
{
- int ret = VM_FAULT_SIGBUS;
+ int ret;
unsigned long idx;
unsigned long size;
- pte_t *pte;
struct page *page;
struct address_space *mapping;

- pte = huge_pte_alloc(mm, address);
- if (!pte)
- goto out;
-
mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

- /*
- * Use page lock to guard against racing truncation
- * before we get page_table_lock.
- */
- page = find_lock_huge_page(mapping, idx);
+ /* This returns a locked page, which keeps us safe in the
+ * event of a race with truncate() */
+ page = find_or_alloc_huge_page(mapping, idx);
if (!page)
- goto out;
+ return VM_FAULT_SIGBUS;

spin_lock(&mm->page_table_lock);
+
+ ret = VM_FAULT_SIGBUS;
+
size = i_size_read(mapping->host) >> HPAGE_SHIFT;
if (idx >= size)
goto backout;

ret = VM_FAULT_MINOR;
- if (!pte_none(*pte))
+
+ if (!pte_none(*ptep))
+ /* oops, someone instantiated this PTE before us */
goto backout;

add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+ set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+
spin_unlock(&mm->page_table_lock);
unlock_page(page);
-out:
+
return ret;

backout:
@@ -424,7 +417,29 @@ backout:
hugetlb_put_quota(mapping);
unlock_page(page);
put_page(page);
- goto out;
+
+ return ret;
+}
+
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, int write_access)
+{
+ pte_t *ptep;
+ pte_t entry;
+
+ ptep = huge_pte_alloc(mm, address);
+ if (! ptep)
+ return VM_FAULT_OOM;
+
+ entry = *ptep;
+
+ if (pte_none(entry))
+ return hugetlb_no_page(mm, vma, address, ptep);
+
+ /* we could get here if another thread instantiated the pte
+ * before the test above */
+
+ return VM_FAULT_MINOR;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,

--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

2005-11-07 21:48:44

by Adam Litke

Subject: Re: [RFC 2/2] Hugetlb COW

On Mon, 2005-11-07 at 15:38 -0600, Adam Litke wrote:
> [RFC] COW for hugepages
> (Patch originally from David Gibson <[email protected]>)
>
> This patch implements copy-on-write for hugepages, hence allowing
> MAP_PRIVATE mappings of hugetlbfs.
>
> This is chiefly useful for cases where we want to use hugepages
> "automatically" - that is to map hugepages without the knowledge of
> the code in the final application (either via kernel hooks, or with
> LD_PRELOAD). We can use various heuristics to determine when
> hugepages might be a good idea, but changing the semantics of
> anonymous memory from MAP_PRIVATE to MAP_SHARED without the app's
> knowledge is clearly wrong.

I forgot to mention in the original post that this patch is currently
broken on ppc64 due to a problem with update_mmu_cache(). The proper
fix is understood but backed up behind the powerpc merge activity.

--
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

2005-11-07 23:31:36

by William Lee Irwin III

Subject: Re: [RFC 1/2] Hugetlb fault fixes and reorg

On Mon, Nov 07, 2005 at 03:38:16PM -0600, Adam Litke wrote:
> (Patch originally from David Gibson <[email protected]>)
> Initial Post: Tue. 25 Oct 2005
> -static struct page *find_lock_huge_page(struct address_space *mapping,
> - unsigned long idx)
> +static struct page *find_or_alloc_huge_page(struct address_space *mapping,
> + unsigned long idx)
> {
> struct page *page;
> int err;
> - struct inode *inode = mapping->host;
> - unsigned long size;

This patch is a combination of function renaming, variable
initialization/assignment and return path/etc. oddities, plus some
functional changes (did I catch them all?) which apparently took a bit
of effort to get to after sifting through the rest of that.

Dump the parallel cleanups or split them into pure cleanup and pure
functional patches. I don't mind the cleanups, I mind the mixing.


-- wli

2005-11-07 23:36:14

by William Lee Irwin III

Subject: Re: [RFC 2/2] Hugetlb COW

On Mon, Nov 07, 2005 at 03:38:53PM -0600, Adam Litke wrote:
> [RFC] COW for hugepages
> (Patch originally from David Gibson <[email protected]>)
> This patch implements copy-on-write for hugepages, hence allowing
> MAP_PRIVATE mappings of hugetlbfs.
> This is chiefly useful for cases where we want to use hugepages
> "automatically" - that is to map hugepages without the knowledge of
> the code in the final application (either via kernel hooks, or with
> LD_PRELOAD). We can use various heuristics to determine when
> hugepages might be a good idea, but changing the semantics of
> anonymous memory from MAP_PRIVATE to MAP_SHARED without the app's
> knowledge is clearly wrong.

I'll go check for architectures where page protections may be encoded
differently depending on the size of the translation, or whose code is
otherwise unprepared to cope with changing protection bits on hugepage PTEs.

If you've done such checking already, I'd be much obliged to hear of it
(in fact, I'd much prefer you to have done so).


-- wli

2005-11-08 01:21:51

by David Gibson

Subject: Re: [RFC 1/2] Hugetlb fault fixes and reorg

On Mon, Nov 07, 2005 at 03:30:53PM -0800, William Lee Irwin wrote:
> On Mon, Nov 07, 2005 at 03:38:16PM -0600, Adam Litke wrote:
> > (Patch originally from David Gibson <[email protected]>)
> > Initial Post: Tue. 25 Oct 2005
> > -static struct page *find_lock_huge_page(struct address_space *mapping,
> > - unsigned long idx)
> > +static struct page *find_or_alloc_huge_page(struct address_space *mapping,
> > + unsigned long idx)
> > {
> > struct page *page;
> > int err;
> > - struct inode *inode = mapping->host;
> > - unsigned long size;
>
> This patch is a combination of function renaming, variable
> initialization/assignment and return path/etc. oddities, plus some
> functional changes (did I catch them all?) which apparently took a bit
> of effort to get to after sifting through the rest of that.

Functional changes? There shouldn't be...

> Dump the parallel cleanups or split them into pure cleanup and pure
> functional patches. I don't mind the cleanups, I mind the mixing.

--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson

2005-11-08 02:53:38

by David Gibson

Subject: Re: [RFC 2/2] Hugetlb COW

On Mon, Nov 07, 2005 at 03:35:38PM -0800, William Lee Irwin wrote:
> On Mon, Nov 07, 2005 at 03:38:53PM -0600, Adam Litke wrote:
> > [RFC] COW for hugepages
> > (Patch originally from David Gibson <[email protected]>)
> > This patch implements copy-on-write for hugepages, hence allowing
> > MAP_PRIVATE mappings of hugetlbfs.
> > This is chiefly useful for cases where we want to use hugepages
> > "automatically" - that is to map hugepages without the knowledge of
> > the code in the final application (either via kernel hooks, or with
> > LD_PRELOAD). We can use various heuristics to determine when
> > hugepages might be a good idea, but changing the semantics of
> > anonymous memory from MAP_PRIVATE to MAP_SHARED without the app's
> > knowledge is clearly wrong.
>
> I'll go check for architectures where page protections may be encoded
> differently depending on the size of the translation, or whose code is
> otherwise unprepared to cope with protection bits.
>
> If you've done such checking already, I'd be much obliged to hear of it
> (in fact, I'd much prefer you to have done so).

I can't see how the COW patch could be any more broken in this regard
than we are already: make_huge_pte() in mm/hugetlb.c already assumes
that pte_mkwrite() and pte_wrprotect() will work properly on hugepage
PTEs. COW doesn't use anything more.

--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson

2005-11-08 03:46:28

by David Gibson

Subject: Re: [RFC 2/2] Hugetlb COW

On Mon, Nov 07, 2005 at 03:38:53PM -0600, Adam Litke wrote:
> [RFC] COW for hugepages
> (Patch originally from David Gibson <[email protected]>)

Just realised there's at least one serious bug in the patch -
find_or_alloc_huge_page() can return a non-locked page when demand
faulting a PRIVATE mapping. That will cause the unlock_page() further
down to BUG(). Here's a corrected version.
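
For illustration, a minimal userspace sketch that would trigger the demand
fault path in question (the hugetlbfs mount point and huge page size are
assumptions): with the previous version of the patch, the first write into a
MAP_PRIVATE hugetlbfs mapping reaches hugetlb_no_page() with a page that was
never locked, so the unlock_page() at the end of the fault handler hits the
BUG.

/* Sketch of a reproducer for the unlocked-page bug described above.
 * Assumes hugetlbfs is mounted at /mnt/huge and a 16MB huge page size;
 * adjust both for the system under test. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE (16UL * 1024 * 1024)   /* assumed huge page size */

int main(void)
{
	int fd = open("/mnt/huge/cow_test", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	unlink("/mnt/huge/cow_test");

	/* MAP_PRIVATE of hugetlbfs: only legal with the COW patch applied */
	char *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* First touch demand-faults the huge page; with the unfixed patch
	 * the fault path unlock_page()s a never-locked page and BUG()s. */
	memset(p, 0xaa, HPAGE_SIZE);

	munmap(p, HPAGE_SIZE);
	close(fd);
	return 0;
}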

This patch implements copy-on-write for hugepages, hence allowing
MAP_PRIVATE mappings of hugetlbfs.

This is chiefly useful for cases where we want to use hugepages
"automatically" - that is to map hugepages without the knowledge of
the code in the final application (either via kernel hooks, or with
LD_PRELOAD). We can use various heuristics to determine when
hugepages might be a good idea, but changing the semantics of
anonymous memory from MAP_PRIVATE to MAP_SHARED without the app's
knowledge is clearly wrong.

Index: working-2.6/fs/hugetlbfs/inode.c
===================================================================
--- working-2.6.orig/fs/hugetlbfs/inode.c 2005-11-08 14:04:59.000000000 +1100
+++ working-2.6/fs/hugetlbfs/inode.c 2005-11-08 14:05:04.000000000 +1100
@@ -100,9 +100,6 @@
loff_t len, vma_len;
int ret;

- if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
- return -EINVAL;
-
if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
return -EINVAL;

Index: working-2.6/include/linux/hugetlb.h
===================================================================
--- working-2.6.orig/include/linux/hugetlb.h 2005-11-08 14:04:59.000000000 +1100
+++ working-2.6/include/linux/hugetlb.h 2005-11-08 14:05:04.000000000 +1100
@@ -65,6 +65,17 @@
pte_t *ptep);
#endif

+#define huge_ptep_set_wrprotect(mm, addr, ptep) ptep_set_wrprotect(mm, addr, ptep)
+static inline void set_huge_ptep_writable(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ pte_t entry;
+
+ entry = pte_mkwrite(pte_mkdirty(*ptep));
+ ptep_set_access_flags(vma, address, ptep, entry, 1);
+ update_mmu_cache(vma, address, entry);
+}
+
#ifndef ARCH_HAS_HUGETLB_PREFAULT_HOOK
#define hugetlb_prefault_arch_hook(mm) do { } while (0)
#else
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c 2005-11-08 14:05:04.000000000 +1100
+++ working-2.6/mm/hugetlb.c 2005-11-08 14:21:13.000000000 +1100
@@ -255,11 +255,12 @@
.nopage = hugetlb_nopage,
};

-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+ int writable)
{
pte_t entry;

- if (vma->vm_flags & VM_WRITE) {
+ if (writable) {
entry =
pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
} else {
@@ -277,6 +278,9 @@
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
+ int cow;
+
+ cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
src_pte = huge_pte_offset(src, addr);
@@ -288,6 +292,8 @@
spin_lock(&dst->page_table_lock);
spin_lock(&src->page_table_lock);
if (!pte_none(*src_pte)) {
+ if (cow)
+ huge_ptep_set_wrprotect(src, addr, src_pte);
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
@@ -340,7 +346,7 @@
}

static struct page *find_or_alloc_huge_page(struct address_space *mapping,
- unsigned long idx)
+ unsigned long idx, int shared)
{
struct page *page;
int err;
@@ -359,26 +365,81 @@
return NULL;
}

- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
- if (err) {
- put_page(page);
- hugetlb_put_quota(mapping);
- if (err == -EEXIST)
- goto retry;
- page = NULL;
+ if (shared) {
+ err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ if (err) {
+ put_page(page);
+ hugetlb_put_quota(mapping);
+ if (err == -EEXIST)
+ goto retry;
+ page = NULL;
+ }
+ } else {
+ /* Caller expects a locked page */
+ lock_page(page);
}

return page;
}

-int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, pte_t pte)
+{
+ struct page *old_page, *new_page;
+ int i, avoidcopy;
+
+ old_page = pte_page(pte);
+
+ /* If no-one else is actually using this page, avoid the copy
+ * and just make the page writable */
+ avoidcopy = (page_count(old_page) == 1);
+ if (avoidcopy) {
+ set_huge_ptep_writable(vma, address, ptep);
+ return VM_FAULT_MINOR;
+ }
+
+ page_cache_get(old_page);
+ new_page = alloc_huge_page();
+
+ if (! new_page) {
+ page_cache_release(old_page);
+
+ /* Logically this is OOM, not a SIGBUS, but an OOM
+ * could cause the kernel to go killing other
+ * processes which won't help the hugepage situation
+ * at all (?) */
+ return VM_FAULT_SIGBUS;
+ }
+
+ spin_unlock(&mm->page_table_lock);
+ for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+ copy_user_highpage(new_page + i, old_page + i,
+ address + i*PAGE_SIZE);
+ spin_lock(&mm->page_table_lock);
+
+ ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+ if (likely(pte_same(*ptep, pte))) {
+ /* Break COW */
+ set_huge_pte_at(mm, address, ptep,
+ make_huge_pte(vma, new_page, 1));
+ /* Make the old page be freed below */
+ new_page = old_page;
+ }
+ page_cache_release(new_page);
+ page_cache_release(old_page);
+ return VM_FAULT_MINOR;
+}
+
+static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ int write_access)
{
int ret;
unsigned long idx;
unsigned long size;
struct page *page;
struct address_space *mapping;
+ pte_t new_pte;

mapping = vma->vm_file->f_mapping;
idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -386,10 +447,13 @@

/* This returns a locked page, which keeps us safe in the
* event of a race with truncate() */
- page = find_or_alloc_huge_page(mapping, idx);
+ page = find_or_alloc_huge_page(mapping, idx,
+ vma->vm_flags & VM_SHARED);
if (!page)
return VM_FAULT_SIGBUS;

+ BUG_ON(!PageLocked(page));
+
spin_lock(&mm->page_table_lock);

ret = VM_FAULT_SIGBUS;
@@ -405,7 +469,16 @@
goto backout;

add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
- set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+
+ new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ && (vma->vm_flags & VM_SHARED)));
+
+ set_huge_pte_at(mm, address, ptep, new_pte);
+
+ if (write_access && !(vma->vm_flags & VM_SHARED)) {
+ /* Optimization, do the COW without a second fault */
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+ }

spin_unlock(&mm->page_table_lock);
unlock_page(page);
@@ -426,6 +499,7 @@
{
pte_t *ptep;
pte_t entry;
+ int ret;

ptep = huge_pte_alloc(mm, address);
if (! ptep)
@@ -434,12 +508,20 @@
entry = *ptep;

if (pte_none(entry))
- return hugetlb_no_page(mm, vma, address, ptep);
+ return hugetlb_no_page(mm, vma, address, ptep, write_access);

- /* we could get here if another thread instantiated the pte
- * before the test above */
+ ret = VM_FAULT_MINOR;

- return VM_FAULT_MINOR;
+ spin_lock(&mm->page_table_lock);
+
+ if (likely(pte_same(entry, *ptep)))
+ /* pte could have changed before we grabbed the lock */
+ if (write_access && !pte_write(entry))
+ ret = hugetlb_cow(mm, vma, address, ptep, entry);
+
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
Index: working-2.6/mm/mmap.c
===================================================================
--- working-2.6.orig/mm/mmap.c 2005-11-08 14:04:59.000000000 +1100
+++ working-2.6/mm/mmap.c 2005-11-08 14:05:04.000000000 +1100
@@ -1076,7 +1076,7 @@
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+ if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED | VM_HUGETLB))
== (VM_WRITE | VM_RESERVED)) {
printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
"PROT_WRITE mmap of VM_RESERVED memory, which "


--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson

2005-11-08 03:46:27

by David Gibson

Subject: Re: [RFC 2/2] Hugetlb COW

On Mon, Nov 07, 2005 at 03:47:55PM -0600, Adam Litke wrote:
> On Mon, 2005-11-07 at 15:38 -0600, Adam Litke wrote:
> > [RFC] COW for hugepages
> > (Patch originally from David Gibson <[email protected]>)
> >
> > This patch implements copy-on-write for hugepages, hence allowing
> > MAP_PRIVATE mappings of hugetlbfs.
> >
> > This is chiefly useful for cases where we want to use hugepages
> > "automatically" - that is to map hugepages without the knowledge of
> > the code in the final application (either via kernel hooks, or with
> > LD_PRELOAD). We can use various heuristics to determine when
> > hugepages might be a good idea, but changing the semantics of
> > anonymous memory from MAP_PRIVATE to MAP_SHARED without the app's
> > knowledge is clearly wrong.
>
> I forgot to mention in the original post that this patch is currently
> broken on ppc64 due to a problem with update_mmu_cache(). The proper
> fix is understood but backed up behind the powerpc merge activity.

Now that the merge tree and 64k pages have been pulled into mainline, I
updated this patch and sent it off to paulus. You'll find it under
the subject "ppc64: Make hash_preload() and update_mmu_cache() cope
with hugepages".

--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson

2005-11-08 04:21:43

by David Gibson

Subject: Re: [RFC 2/2] Hugetlb COW

On Mon, Nov 07, 2005 at 03:47:55PM -0600, Adam Litke wrote:
> On Mon, 2005-11-07 at 15:38 -0600, Adam Litke wrote:
> > [RFC] COW for hugepages
> > (Patch originally from David Gibson <[email protected]>)
> >
> > This patch implements copy-on-write for hugepages, hence allowing
> > MAP_PRIVATE mappings of hugetlbfs.
> >
> > This is chiefly useful for cases where we want to use hugepages
> > "automatically" - that is to map hugepages without the knowledge of
> > the code in the final application (either via kernel hooks, or with
> > LD_PRELOAD). We can use various heuristics to determine when
> > hugepages might be a good idea, but changing the semantics of
> > anonymous memory from MAP_PRIVATE to MAP_SHARED without the app's
> > knowledge is clearly wrong.
>
> I forgot to mention in the original post that this patch is currently
> broken on ppc64 due to a problem with update_mmu_cache(). The proper
> fix is understood but backed up behind the powerpc merge activity.

Actually, BenH just pointed out that the version of update_mmu_cache()
we now have from the 64k patch already half-handled this situation -
it wouldn't actually do anything on a hugepage fault, but it wouldn't
blow up either. So the COW patch should work as is on ppc64, although
the update_mmu_cache change is probably an important optimization
(otherwise we'll take two faults on every COW - one to actually do the
COW, and one to twiddle the bits in the hash page table).

--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson