The invalidation of address ranges in a mm_struct needs to be
performed when pages are removed or their permissions change.
invalidate_range_begin/end() is frequently called with only mmap_sem
held. If invalidate_range_begin() is called with additional locks held
then we pass a flag into the callouts to indicate that no sleeping is
possible.
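For orientation, here is a minimal sketch of what a notifier user's range
callbacks might look like under this convention. It is not part of the
patch; the callback signatures are an assumption, and only the
(mm, start, end, atomic) argument order mirrors the call sites in the
diffs below.

	#include <linux/mmu_notifier.h>

	/*
	 * Hypothetical consumer callbacks, shown only to illustrate the
	 * calling convention: when 'atomic' is nonzero the caller holds
	 * spinlocks (e.g. i_mmap_lock or page_table_lock) and the
	 * callback must not sleep.
	 */
	static void example_invalidate_range_begin(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long start, unsigned long end,
				int atomic)
	{
		/* Stop handing out new external references to [start, end). */
	}

	static void example_invalidate_range_end(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long start, unsigned long end,
				int atomic)
	{
		/* Flush remote TLBs/ptes for [start, end) and allow new
		 * references again. */
	}
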
In two cases we use invalidate_range_begin/end to invalidate
single pages because the pair allows holding off new references
(idea by Robin Holt).
do_wp_page(): We hold off new references while updating the pte.
xip_unmap: We are not taking the page lock so we cannot
use the invalidate_page mmu_rmap_notifier; invalidate_range_begin/end
stands in for it.
Comments state that mmap_sem must be held for
remap_pfn_range() but various drivers do not seem to do this.
Signed-off-by: Andrea Arcangeli <[email protected]>
Signed-off-by: Robin Holt <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>
---
mm/filemap_xip.c | 5 +++++
mm/fremap.c | 3 +++
mm/hugetlb.c | 3 +++
mm/memory.c | 24 ++++++++++++++++++++++--
mm/mmap.c | 2 ++
mm/mremap.c | 7 ++++++-
6 files changed, 41 insertions(+), 3 deletions(-)
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c 2008-01-31 20:56:03.000000000 -0800
+++ linux-2.6/mm/fremap.c 2008-01-31 20:59:14.000000000 -0800
@@ -15,6 +15,7 @@
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
@@ -211,7 +212,9 @@ asmlinkage long sys_remap_file_pages(uns
spin_unlock(&mapping->i_mmap_lock);
}
+ mmu_notifier(invalidate_range_begin, mm, start, start + size, 0);
err = populate_range(mm, vma, start, size, pgoff);
+ mmu_notifier(invalidate_range_end, mm, start, start + size, 0);
if (!err && !(flags & MAP_NONBLOCK)) {
if (unlikely(has_write_lock)) {
downgrade_write(&mm->mmap_sem);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2008-01-31 20:56:03.000000000 -0800
+++ linux-2.6/mm/memory.c 2008-01-31 20:59:14.000000000 -0800
@@ -50,6 +50,7 @@
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
+#include <linux/mmu_notifier.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -601,6 +602,9 @@ int copy_page_range(struct mm_struct *ds
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier(invalidate_range_begin, src_mm, addr, end, 0);
+
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
do {
@@ -611,6 +615,11 @@ int copy_page_range(struct mm_struct *ds
vma, addr, next))
return -ENOMEM;
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier(invalidate_range_end, src_mm,
+ vma->vm_start, end, 0);
+
return 0;
}
@@ -883,13 +892,16 @@ unsigned long zap_page_range(struct vm_a
struct mmu_gather *tlb;
unsigned long end = address + size;
unsigned long nr_accounted = 0;
+ int atomic = details ? (details->i_mmap_lock != 0) : 0;
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
+ mmu_notifier(invalidate_range_begin, mm, address, end, atomic);
end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
if (tlb)
tlb_finish_mmu(tlb, address, end);
+ mmu_notifier(invalidate_range_end, mm, address, end, atomic);
return end;
}
@@ -1318,7 +1330,7 @@ int remap_pfn_range(struct vm_area_struc
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + PAGE_ALIGN(size);
+ unsigned long start = addr, end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
int err;
@@ -1352,6 +1364,7 @@ int remap_pfn_range(struct vm_area_struc
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
do {
next = pgd_addr_end(addr, end);
err = remap_pud_range(mm, pgd, addr, next,
@@ -1359,6 +1372,7 @@ int remap_pfn_range(struct vm_area_struc
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier(invalidate_range_end, mm, start, end, 0);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1442,10 +1456,11 @@ int apply_to_page_range(struct mm_struct
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + size;
+ unsigned long start = addr, end = addr + size;
int err;
BUG_ON(addr >= end);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1453,6 +1468,7 @@ int apply_to_page_range(struct mm_struct
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier(invalidate_range_end, mm, start, end, 0);
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1630,6 +1646,8 @@ gotten:
goto oom;
cow_user_page(new_page, old_page, address, vma);
+ mmu_notifier(invalidate_range_begin, mm, address,
+ address + PAGE_SIZE, 0);
/*
* Re-check the pte - we dropped the lock
*/
@@ -1668,6 +1686,8 @@ gotten:
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
+ mmu_notifier(invalidate_range_end, mm,
+ address, address + PAGE_SIZE, 0);
if (dirty_page) {
if (vma->vm_file)
file_update_time(vma->vm_file);
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c 2008-01-31 20:58:05.000000000 -0800
+++ linux-2.6/mm/mmap.c 2008-01-31 20:59:14.000000000 -0800
@@ -1744,11 +1744,13 @@ static void unmap_region(struct mm_struc
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
+ mmu_notifier(invalidate_range_end, mm, start, end, 0);
}
/*
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c 2008-01-31 20:56:03.000000000 -0800
+++ linux-2.6/mm/hugetlb.c 2008-01-31 20:59:14.000000000 -0800
@@ -14,6 +14,7 @@
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
+#include <linux/mmu_notifier.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -743,6 +744,7 @@ void __unmap_hugepage_range(struct vm_ar
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 1);
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
@@ -763,6 +765,7 @@ void __unmap_hugepage_range(struct vm_ar
}
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
+ mmu_notifier(invalidate_range_end, mm, start, end, 1);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
list_del(&page->lru);
put_page(page);
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c 2008-01-31 20:56:03.000000000 -0800
+++ linux-2.6/mm/filemap_xip.c 2008-01-31 20:59:14.000000000 -0800
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/uio.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <asm/tlbflush.h>
@@ -189,6 +190,8 @@ __xip_unmap (struct address_space * mapp
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ mmu_notifier(invalidate_range_begin, mm, address,
+ address + PAGE_SIZE, 1);
pte = page_check_address(page, mm, address, &ptl);
if (pte) {
/* Nuke the page table entry. */
@@ -200,6 +203,8 @@ __xip_unmap (struct address_space * mapp
pte_unmap_unlock(pte, ptl);
page_cache_release(page);
}
+ mmu_notifier(invalidate_range_end, mm,
+ address, address + PAGE_SIZE, 1);
}
spin_unlock(&mapping->i_mmap_lock);
}
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c 2008-01-31 20:56:03.000000000 -0800
+++ linux-2.6/mm/mremap.c 2008-01-31 20:59:14.000000000 -0800
@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -124,12 +125,15 @@ unsigned long move_page_tables(struct vm
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len)
{
- unsigned long extent, next, old_end;
+ unsigned long extent, next, old_start, old_end;
pmd_t *old_pmd, *new_pmd;
+ old_start = old_addr;
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
+ mmu_notifier(invalidate_range_begin, vma->vm_mm,
+ old_addr, old_end, 0);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
next = (old_addr + PMD_SIZE) & PMD_MASK;
@@ -150,6 +154,7 @@ unsigned long move_page_tables(struct vm
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
new_vma, new_pmd, new_addr);
}
+ mmu_notifier(invalidate_range_end, vma->vm_mm, old_start, old_end, 0);
return len + old_addr - old_end; /* how much done */
}
--
do_wp_page can reach the _end callout without passing the _begin
callout. This prevents making the _end callout unless the _begin has
also been made.
Index: mmu_notifiers-cl-v5/mm/memory.c
===================================================================
--- mmu_notifiers-cl-v5.orig/mm/memory.c 2008-02-01 04:44:03.000000000 -0600
+++ mmu_notifiers-cl-v5/mm/memory.c 2008-02-01 04:46:18.000000000 -0600
@@ -1564,7 +1564,7 @@ static int do_wp_page(struct mm_struct *
{
struct page *old_page, *new_page;
pte_t entry;
- int reuse = 0, ret = 0;
+ int reuse = 0, ret = 0, invalidate_started = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
@@ -1649,6 +1649,8 @@ gotten:
mmu_notifier(invalidate_range_begin, mm, address,
address + PAGE_SIZE, 0);
+ invalidate_started = 1;
+
/*
* Re-check the pte - we dropped the lock
*/
@@ -1687,7 +1689,8 @@ gotten:
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
- mmu_notifier(invalidate_range_end, mm,
+ if (invalidate_started)
+ mmu_notifier(invalidate_range_end, mm,
address, address + PAGE_SIZE, 0);
if (dirty_page) {
if (vma->vm_file)
Argh. Did not see this soon enough. Maybe this one is better since it
avoids the additional unlocks?
On Fri, 1 Feb 2008, Robin Holt wrote:
> do_wp_page can reach the _end callout without passing the _begin
> callout. This prevents making the _end callout unless the _begin has
> also been made.
>
> Index: mmu_notifiers-cl-v5/mm/memory.c
> ===================================================================
> --- mmu_notifiers-cl-v5.orig/mm/memory.c 2008-02-01 04:44:03.000000000 -0600
> +++ mmu_notifiers-cl-v5/mm/memory.c 2008-02-01 04:46:18.000000000 -0600
> @@ -1564,7 +1564,7 @@ static int do_wp_page(struct mm_struct *
> {
> struct page *old_page, *new_page;
> pte_t entry;
> - int reuse = 0, ret = 0;
> + int reuse = 0, ret = 0, invalidate_started = 0;
> int page_mkwrite = 0;
> struct page *dirty_page = NULL;
>
> @@ -1649,6 +1649,8 @@ gotten:
>
> mmu_notifier(invalidate_range_begin, mm, address,
> address + PAGE_SIZE, 0);
> + invalidate_started = 1;
> +
> /*
> * Re-check the pte - we dropped the lock
> */
> @@ -1687,7 +1689,8 @@ gotten:
> page_cache_release(old_page);
> unlock:
> pte_unmap_unlock(page_table, ptl);
> - mmu_notifier(invalidate_range_end, mm,
> + if (invalidate_started)
> + mmu_notifier(invalidate_range_end, mm,
> address, address + PAGE_SIZE, 0);
> if (dirty_page) {
> if (vma->vm_file)
>
Christoph,
The following code in do_wp_page is a problem.
We are getting this callout when we transition the pte from a read-only
to read-write. Jack and I can not see a reason we would need that
callout. It is causing problems for xpmem in that a write fault goes
to get_user_pages which gets back to do_wp_page that does the callout.
XPMEM only allows either faulting or invalidating to occur for an mm.
As you can see, the case above needs it to be in both states.
Thanks,
Robin
> @@ -1630,6 +1646,8 @@ gotten:
> goto oom;
> cow_user_page(new_page, old_page, address, vma);
>
> + mmu_notifier(invalidate_range_begin, mm, address,
> + address + PAGE_SIZE, 0);
> /*
> * Re-check the pte - we dropped the lock
> */
> @@ -1668,6 +1686,8 @@ gotten:
> page_cache_release(old_page);
> unlock:
> pte_unmap_unlock(page_table, ptl);
> + mmu_notifier(invalidate_range_end, mm,
> + address, address + PAGE_SIZE, 0);
> if (dirty_page) {
> if (vma->vm_file)
> file_update_time(vma->vm_file);
On Fri, 1 Feb 2008, Robin Holt wrote:
> We are getting this callout when we transition the pte from a read-only
> to read-write. Jack and I can not see a reason we would need that
> callout. It is causing problems for xpmem in that a write fault goes
> to get_user_pages which gets back to do_wp_page that does the callout.
Right. You placed it there in the first place. So we can drop the code
from do_wp_page?
On Fri, Feb 01, 2008 at 03:19:32PM -0800, Christoph Lameter wrote:
> On Fri, 1 Feb 2008, Robin Holt wrote:
>
> > We are getting this callout when we transition the pte from a read-only
> > to read-write. Jack and I can not see a reason we would need that
> > callout. It is causing problems for xpmem in that a write fault goes
> > to get_user_pages which gets back to do_wp_page that does the callout.
>
> Right. You placed it there in the first place. So we can drop the code
> from do_wp_page?
No, we need a callout when we are becoming more restrictive, but not
when becoming more permissive. I would have to guess that is the case
for any of these callouts. It is for both GRU and XPMEM. I would
expect the same is true for KVM, but would like a ruling from Andrea on
that.
Thanks,
Robin
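To make the rule concrete, here is a hedged sketch (not in any patch; the
helper name is made up) of which pte transitions would need a callout
under Robin's "more restrictive" criterion:

	/*
	 * Hypothetical helper, for illustration only: a callout is
	 * needed when a transition takes access away, not when it
	 * grants more.
	 */
	static inline int transition_needs_callout(pte_t old, pte_t new)
	{
		/* Unmapping the page, or dropping write permission, is
		 * more restrictive: remote copies of the pte must be
		 * invalidated. */
		if (pte_present(old) && !pte_present(new))
			return 1;
		if (pte_write(old) && !pte_write(new))
			return 1;
		/* Read-only -> read-write on the same page only grants
		 * more access; remote read-only references remain valid. */
		return 0;
	}

(The case where the pte is repointed to a different page is separate;
Christoph picks that up below.)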
On Fri, 1 Feb 2008, Robin Holt wrote:
> On Fri, Feb 01, 2008 at 03:19:32PM -0800, Christoph Lameter wrote:
> > On Fri, 1 Feb 2008, Robin Holt wrote:
> >
> > > We are getting this callout when we transition the pte from a read-only
> > > to read-write. Jack and I can not see a reason we would need that
> > > callout. It is causing problems for xpmem in that a write fault goes
> > > to get_user_pages which gets back to do_wp_page that does the callout.
> >
> > Right. You placed it there in the first place. So we can drop the code
> > from do_wp_page?
>
> No, we need a callout when we are becoming more restrictive, but not
> when becoming more permissive. I would have to guess that is the case
> for any of these callouts. It is for both GRU and XPMEM. I would
> expect the same is true for KVM, but would like a ruling from Andrea on
> that.
do_wp_page is entered when the pte shows that the page is not writeable,
and in some situations it simply makes the page writable. In that case
we do not invalidate the remote reference.
However, when we do COW then a *new* page is put in place of the existing
readonly page. At that point we need to remove the remote pte that is
readonly. Then we install a new pte pointing to a *different* page that is
writable.
Are you saying that you get the callback when transitioning from a read
only to a read write pte on the *same* page?
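In rough pseudo-kernel-C, the two paths being distinguished look like
this. It is a simplified sketch, not the actual do_wp_page code; the
callout placement just mirrors the hunk quoted earlier.

	if (reuse) {
		/*
		 * Fast path: the existing page is made writable in
		 * place.  Remote read-only references still point at
		 * the right page, so no callout is needed.
		 */
		entry = maybe_mkwrite(pte_mkdirty(orig_pte), vma);
		set_pte_at(mm, address, page_table, entry);
	} else {
		/*
		 * COW path ("gotten:" above): a *different* page
		 * replaces the read-only one, so remote read-only ptes
		 * must be shot down.  As in the patch, _begin is issued
		 * before the pte lock is retaken and _end after it is
		 * dropped, so the callout may sleep (atomic == 0).
		 */
		mmu_notifier(invalidate_range_begin, mm, address,
				address + PAGE_SIZE, 0);
		/* ... retake ptl, install the pte for new_page, unlock ... */
		mmu_notifier(invalidate_range_end, mm, address,
				address + PAGE_SIZE, 0);
	}
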
On Fri, Feb 01, 2008 at 04:05:08PM -0800, Christoph Lameter wrote:
> On Fri, 1 Feb 2008, Robin Holt wrote:
>
> > On Fri, Feb 01, 2008 at 03:19:32PM -0800, Christoph Lameter wrote:
> > > On Fri, 1 Feb 2008, Robin Holt wrote:
> > >
> > > > We are getting this callout when we transition the pte from a read-only
> > > > to read-write. Jack and I can not see a reason we would need that
> > > > callout. It is causing problems for xpmem in that a write fault goes
> > > > to get_user_pages which gets back to do_wp_page that does the callout.
> > >
> > > Right. You placed it there in the first place. So we can drop the code
> > > from do_wp_page?
> >
> > No, we need a callout when we are becoming more restrictive, but not
> > when becoming more permissive. I would have to guess that is the case
> > for any of these callouts. It is for both GRU and XPMEM. I would
> > expect the same is true for KVM, but would like a ruling from Andrea on
> > that.
>
> do_wp_page is entered when the pte shows that the page is not writeable
> and it makes the page writable in some situations. Then we do not
> invalidate the remote reference.
>
> However, when we do COW then a *new* page is put in place of the existing
> readonly page. At that point we need to remove the remote pte that is
> readonly. Then we install a new pte pointing to a *different* page that is
> writable.
>
> Are you saying that you get the callback when transitioning from a read
> only to a read write pte on the *same* page?
I believe that is what we saw. We have not put in any more debug
information yet. I will try to squeeze it in this weekend. Otherwise,
I will probably have to wait until early Monday.
Thanks
Robin
On Fri, Feb 01, 2008 at 06:21:45PM -0600, Robin Holt wrote:
> On Fri, Feb 01, 2008 at 04:05:08PM -0800, Christoph Lameter wrote:
> > Are you saying that you get the callback when transitioning from a read
> > only to a read write pte on the *same* page?
>
> I believe that is what we saw. We have not put in any more debug
> information yet. I will try to squeeze it in this weekend. Otherwise,
> I will probably have to wait until early Monday.
I hate it when I am confused. I misunderstood what Dean had been saying.
After looking at his test case and remembering what was on his screen at
the time we were discussing it, I am nearly positive that both the parent
and child were still running (no exec, no exit). We would therefore have
two refs on the page and, yes, be changing the pte, which would warrant
the callout.
Now I really need to think this through more. Sounds like a good thing
for Monday.
Thanks,
Robin
On Fri, Feb 01, 2008 at 05:35:28PM -0600, Robin Holt wrote:
> No, we need a callout when we are becoming more restrictive, but not
> when becoming more permissive. I would have to guess that is the case
> for any of these callouts. It is for both GRU and XPMEM. I would
> expect the same is true for KVM, but would like a ruling from Andrea on
> that.
I still hope I don't need to take any lock in _range_start and that
losing coherency (without risking global memory corruption, only
temporary userland data corruption, thanks to the page pin) is ok for
KVM.
If I had to take a lock in _range_start like XPMEM is forced to do (GRU
is certainly not forced to, if it switched to my #v5), then it would be
a problem.