2008-02-15 06:52:27

by Christoph Lameter

[permalink] [raw]
Subject: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

The invalidation of address ranges in a mm_struct needs to be
performed when pages are removed or permissions etc change.

If invalidate_range_begin() is called with locks held then we
pass a flag into invalidate_range() to indicate that no sleeping is
possible. Locks are only held for truncate and huge pages.

In two cases we use invalidate_range_begin/end to invalidate
single pages because the pair allows holding off new references
(idea by Robin Holt).

do_wp_page(): We hold off new references while we update the pte.

xip_unmap: We are not taking the PageLock so we cannot
use the invalidate_page mmu_rmap_notifier. invalidate_range_begin/end
stands in.

Signed-off-by: Andrea Arcangeli <[email protected]>
Signed-off-by: Robin Holt <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>

---
mm/filemap_xip.c | 5 +++++
mm/fremap.c | 3 +++
mm/hugetlb.c | 3 +++
mm/memory.c | 35 +++++++++++++++++++++++++++++------
mm/mmap.c | 2 ++
mm/mprotect.c | 3 +++
mm/mremap.c | 7 ++++++-
7 files changed, 51 insertions(+), 7 deletions(-)

Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c 2008-02-14 18:43:31.000000000 -0800
+++ linux-2.6/mm/fremap.c 2008-02-14 18:45:07.000000000 -0800
@@ -15,6 +15,7 @@
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>

#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(uns
spin_unlock(&mapping->i_mmap_lock);
}

+ mmu_notifier(invalidate_range_begin, mm, start, start + size, 0);
err = populate_range(mm, vma, start, size, pgoff);
+ mmu_notifier(invalidate_range_end, mm, start, start + size, 0);
if (!err && !(flags & MAP_NONBLOCK)) {
if (unlikely(has_write_lock)) {
downgrade_write(&mm->mmap_sem);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2008-02-14 18:43:31.000000000 -0800
+++ linux-2.6/mm/memory.c 2008-02-14 18:45:07.000000000 -0800
@@ -51,6 +51,7 @@
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -611,6 +612,9 @@ int copy_page_range(struct mm_struct *ds
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);

+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier(invalidate_range_begin, src_mm, addr, end, 0);
+
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
do {
@@ -621,6 +625,11 @@ int copy_page_range(struct mm_struct *ds
vma, addr, next))
return -ENOMEM;
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier(invalidate_range_end, src_mm,
+ vma->vm_start, end, 0);
+
return 0;
}

@@ -893,13 +902,16 @@ unsigned long zap_page_range(struct vm_a
struct mmu_gather *tlb;
unsigned long end = address + size;
unsigned long nr_accounted = 0;
+ int atomic = details ? (details->i_mmap_lock != 0) : 0;

lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
+ mmu_notifier(invalidate_range_begin, mm, address, end, atomic);
end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
if (tlb)
tlb_finish_mmu(tlb, address, end);
+ mmu_notifier(invalidate_range_end, mm, address, end, atomic);
return end;
}

@@ -1339,7 +1351,7 @@ int remap_pfn_range(struct vm_area_struc
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + PAGE_ALIGN(size);
+ unsigned long start = addr, end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
int err;

@@ -1373,6 +1385,7 @@ int remap_pfn_range(struct vm_area_struc
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
do {
next = pgd_addr_end(addr, end);
err = remap_pud_range(mm, pgd, addr, next,
@@ -1380,6 +1393,7 @@ int remap_pfn_range(struct vm_area_struc
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier(invalidate_range_end, mm, start, end, 0);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1463,10 +1477,11 @@ int apply_to_page_range(struct mm_struct
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + size;
+ unsigned long start = addr, end = addr + size;
int err;

BUG_ON(addr >= end);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1474,6 +1489,7 @@ int apply_to_page_range(struct mm_struct
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier(invalidate_range_end, mm, start, end, 0);
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1614,8 +1630,10 @@ static int do_wp_page(struct mm_struct *
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
page_cache_release(old_page);
- if (!pte_same(*page_table, orig_pte))
- goto unlock;
+ if (!pte_same(*page_table, orig_pte)) {
+ pte_unmap_unlock(page_table, ptl);
+ goto check_dirty;
+ }

page_mkwrite = 1;
}
@@ -1631,7 +1649,8 @@ static int do_wp_page(struct mm_struct *
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, entry);
ret |= VM_FAULT_WRITE;
- goto unlock;
+ pte_unmap_unlock(page_table, ptl);
+ goto check_dirty;
}

/*
@@ -1653,6 +1672,8 @@ gotten:
if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;

+ mmu_notifier(invalidate_range_begin, mm, address,
+ address + PAGE_SIZE, 0);
/*
* Re-check the pte - we dropped the lock
*/
@@ -1691,8 +1712,10 @@ gotten:
page_cache_release(new_page);
if (old_page)
page_cache_release(old_page);
-unlock:
pte_unmap_unlock(page_table, ptl);
+ mmu_notifier(invalidate_range_end, mm,
+ address, address + PAGE_SIZE, 0);
+check_dirty:
if (dirty_page) {
if (vma->vm_file)
file_update_time(vma->vm_file);
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c 2008-02-14 18:44:56.000000000 -0800
+++ linux-2.6/mm/mmap.c 2008-02-14 18:45:07.000000000 -0800
@@ -1748,11 +1748,13 @@ static void unmap_region(struct mm_struc
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
+ mmu_notifier(invalidate_range_end, mm, start, end, 0);
}

/*
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c 2008-02-14 18:43:31.000000000 -0800
+++ linux-2.6/mm/hugetlb.c 2008-02-14 18:45:07.000000000 -0800
@@ -14,6 +14,7 @@
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
+#include <linux/mmu_notifier.h>

#include <asm/page.h>
#include <asm/pgtable.h>
@@ -755,6 +756,7 @@ void __unmap_hugepage_range(struct vm_ar
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);

+ mmu_notifier(invalidate_range_begin, mm, start, end, 1);
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
@@ -775,6 +777,7 @@ void __unmap_hugepage_range(struct vm_ar
}
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
+ mmu_notifier(invalidate_range_end, mm, start, end, 1);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
list_del(&page->lru);
put_page(page);
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c 2008-02-14 18:43:31.000000000 -0800
+++ linux-2.6/mm/filemap_xip.c 2008-02-14 18:45:07.000000000 -0800
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/uio.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <asm/tlbflush.h>

@@ -190,6 +191,8 @@ __xip_unmap (struct address_space * mapp
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ mmu_notifier(invalidate_range_begin, mm, address,
+ address + PAGE_SIZE, 1);
pte = page_check_address(page, mm, address, &ptl);
if (pte) {
/* Nuke the page table entry. */
@@ -201,6 +204,8 @@ __xip_unmap (struct address_space * mapp
pte_unmap_unlock(pte, ptl);
page_cache_release(page);
}
+ mmu_notifier(invalidate_range_end, mm,
+ address, address + PAGE_SIZE, 1);
}
spin_unlock(&mapping->i_mmap_lock);
}
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c 2008-02-14 18:43:31.000000000 -0800
+++ linux-2.6/mm/mremap.c 2008-02-14 18:45:07.000000000 -0800
@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -124,12 +125,15 @@ unsigned long move_page_tables(struct vm
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len)
{
- unsigned long extent, next, old_end;
+ unsigned long extent, next, old_start, old_end;
pmd_t *old_pmd, *new_pmd;

+ old_start = old_addr;
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);

+ mmu_notifier(invalidate_range_begin, vma->vm_mm,
+ old_addr, old_end, 0);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
next = (old_addr + PMD_SIZE) & PMD_MASK;
@@ -150,6 +154,7 @@ unsigned long move_page_tables(struct vm
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
new_vma, new_pmd, new_addr);
}
+ mmu_notifier(invalidate_range_end, vma->vm_mm, old_start, old_end, 0);

return len + old_addr - old_end; /* how much done */
}
Index: linux-2.6/mm/mprotect.c
===================================================================
--- linux-2.6.orig/mm/mprotect.c 2008-02-14 18:43:31.000000000 -0800
+++ linux-2.6/mm/mprotect.c 2008-02-14 18:45:07.000000000 -0800
@@ -21,6 +21,7 @@
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -198,10 +199,12 @@ success:
dirty_accountable = 1;
}

+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
if (is_vm_hugetlb_page(vma))
hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
else
change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+ mmu_notifier(invalidate_range_end, mm, start, end, 0);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
return 0;

--


2008-02-16 03:40:56

by Andrew Morton

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Thu, 14 Feb 2008 22:49:01 -0800 Christoph Lameter <[email protected]> wrote:

> The invalidation of address ranges in a mm_struct needs to be
> performed when pages are removed or permissions etc change.

hm. Do they? Why? If I'm in the process of zero-copy writing a hunk of
memory out to hardware then do I care if someone write-protects the ptes?

Spose so, but some fleshing-out of the various scenarios here would clarify
things.

> If invalidate_range_begin() is called with locks held then we
> pass a flag into invalidate_range() to indicate that no sleeping is
> possible. Locks are only held for truncate and huge pages.

This is so bad.

I supposed in the restricted couple of cases which you're focussed on it
works OK. But is it generally suitable? What if IO is in progress? What
if other cluster nodes need to be talked to? Does it suit RDMA?

> In two cases we use invalidate_range_begin/end to invalidate
> single pages because the pair allows holding off new references
> (idea by Robin Holt).

Assuming that there is a missing "within the range" in this description, I
assume that all clients will just throw up their hands in horror and will
disallow all references to all parts of the mm.

Of course, to do that they will need to take a sleeping lock to prevent
other threads from establishing new references. whoops.

> do_wp_page(): We hold off new references while we update the pte.
>
> xip_unmap: We are not taking the PageLock so we cannot
> use the invalidate_page mmu_rmap_notifier. invalidate_range_begin/end
> stands in.

What does "stands in" mean?

> Signed-off-by: Andrea Arcangeli <[email protected]>
> Signed-off-by: Robin Holt <[email protected]>
> Signed-off-by: Christoph Lameter <[email protected]>
>
> ---
> mm/filemap_xip.c | 5 +++++
> mm/fremap.c | 3 +++
> mm/hugetlb.c | 3 +++
> mm/memory.c | 35 +++++++++++++++++++++++++++++------
> mm/mmap.c | 2 ++
> mm/mprotect.c | 3 +++
> mm/mremap.c | 7 ++++++-
> 7 files changed, 51 insertions(+), 7 deletions(-)
>
> Index: linux-2.6/mm/fremap.c
> ===================================================================
> --- linux-2.6.orig/mm/fremap.c 2008-02-14 18:43:31.000000000 -0800
> +++ linux-2.6/mm/fremap.c 2008-02-14 18:45:07.000000000 -0800
> @@ -15,6 +15,7 @@
> #include <linux/rmap.h>
> #include <linux/module.h>
> #include <linux/syscalls.h>
> +#include <linux/mmu_notifier.h>
>
> #include <asm/mmu_context.h>
> #include <asm/cacheflush.h>
> @@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(uns
> spin_unlock(&mapping->i_mmap_lock);
> }
>
> + mmu_notifier(invalidate_range_begin, mm, start, start + size, 0);
> err = populate_range(mm, vma, start, size, pgoff);
> + mmu_notifier(invalidate_range_end, mm, start, start + size, 0);

To avoid off-by-one confusion the changelogs, documentation and comments
should be very careful to tell the reader whether the range includes the
byte at start+size. I don't think that was done?

2008-02-16 19:27:00

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, 15 Feb 2008, Andrew Morton wrote:

> On Thu, 14 Feb 2008 22:49:01 -0800 Christoph Lameter <[email protected]> wrote:
>
> > The invalidation of address ranges in a mm_struct needs to be
> > performed when pages are removed or permissions etc change.
>
> hm. Do they? Why? If I'm in the process of zero-copy writing a hunk of
> memory out to hardware then do I care if someone write-protects the ptes?
>
> Spose so, but some fleshing-out of the various scenarios here would clarify
> things.

You care, for example, if the VM needs to write-protect a memory range and a
write occurs. In that case the VM needs to do proper write processing, and a
write through an external pte would cause memory corruption.

> > If invalidate_range_begin() is called with locks held then we
> > pass a flag into invalidate_range() to indicate that no sleeping is
> > possible. Locks are only held for truncate and huge pages.
>
> This is so bad.

Ok so I can twiddle around with the inode_mmap_lock to drop it while this
is called?

> > In two cases we use invalidate_range_begin/end to invalidate
> > single pages because the pair allows holding off new references
> > (idea by Robin Holt).
>
> Assuming that there is a missing "within the range" in this description, I
> assume that all clients will just throw up theior hands in horror and will
> disallow all references to all parts of the mm.

Right. Missing within the range. We only need to disallow creating new
ptes right? Why disallow references?


> > xip_unmap: We are not taking the PageLock so we cannot
> > use the invalidate_page mmu_rmap_notifier. invalidate_range_begin/end
> > stands in.
>
> What does "stands in" mean?

Use a range begin / end to invalidate a page.

> > + mmu_notifier(invalidate_range_begin, mm, start, start + size, 0);
> > err = populate_range(mm, vma, start, size, pgoff);
> > + mmu_notifier(invalidate_range_end, mm, start, start + size, 0);
>
> To avoid off-by-one confusion the changelogs, documentation and comments
> should be very careful to tell the reader whether the range includes the
> byte at start+size. I don't thik that was done?

No it was not. I assumed that the convention is always start - (end - 1)
and the byte at end is not affected by the operation.

2008-02-19 09:10:18

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Friday 15 February 2008 17:49, Christoph Lameter wrote:
> The invalidation of address ranges in a mm_struct needs to be
> performed when pages are removed or permissions etc change.
>
> If invalidate_range_begin() is called with locks held then we
> pass a flag into invalidate_range() to indicate that no sleeping is
> possible. Locks are only held for truncate and huge pages.
>
> In two cases we use invalidate_range_begin/end to invalidate
> single pages because the pair allows holding off new references
> (idea by Robin Holt).
>
> do_wp_page(): We hold off new references while we update the pte.
>
> xip_unmap: We are not taking the PageLock so we cannot
> use the invalidate_page mmu_rmap_notifier. invalidate_range_begin/end
> stands in.

This whole thing would be much better if you didn't rely on the page
lock at all, but either a) used the same locking as Linux does for its
ptes/tlbs, or b) have some locking that is private to the mmu notifier
code. Then there is not all this new stuff that has to be understood in
the core VM.

Also, why do you have to "invalidate" ranges when switching to a
_more_ permissive state? This stuff should basically be the same as
(a subset of) the TLB flushing API AFAIKS. Anything more is a pretty
big burden to put in the core VM.

See my alternative patch I posted -- I can't see why it won't work
just like a TLB.

As far as sleeping inside callbacks goes... I think there are big
problems with the patch (the sleeping patch and the external rmap
patch). I don't think it is workable in its current state. Either
we have to make some big changes to the core VM, or we have to turn
some locks into sleeping locks to do it properly AFAIKS. Neither
one is good.

But anyway, I don't really think the two approaches (Andrea's
notifiers vs sleeping/xrmap) should be tangled up too much. I
think Andrea's can possibly be quite unintrusive and useful very
soon.

2008-02-19 13:34:22

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Tue, Feb 19, 2008 at 07:54:14PM +1100, Nick Piggin wrote:
> As far as sleeping inside callbacks goes... I think there are big
> problems with the patch (the sleeping patch and the external rmap
> patch). I don't think it is workable in its current state. Either
> we have to make some big changes to the core VM, or we have to turn
> some locks into sleeping locks to do it properly AFAIKS. Neither
> one is good.

Agreed.

The thing is quite simple: the moment we support xpmem, the complexity
in the mmu notifier patch starts, and there are hacks, duplicated
functionality through the same xpmem callbacks etc... GRU can already
be 100% supported (in fact simpler and safer) with my patch.

> But anyway, I don't really think the two approaches (Andrea's
> notifiers vs sleeping/xrmap) should be tangled up too much. I
> think Andrea's can possibly be quite unintrusive and useful very
> soon.

Yes, that's why I kept maintaining my patch and I posted the last
revision to Andrew. I use pte/tlb locking of the core VM, it's
unintrusive and obviously safe. Furthermore it can be extended with
Christoph's stuff in a 100% backwards compatible fashion later if needed.

2008-02-19 23:09:14

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Friday 15 February 2008 17:49, Christoph Lameter wrote:
> The invalidation of address ranges in a mm_struct needs to be
> performed when pages are removed or permissions etc change.
>
> If invalidate_range_begin() is called with locks held then we
> pass a flag into invalidate_range() to indicate that no sleeping is
> possible. Locks are only held for truncate and huge pages.

You can't sleep inside rcu_read_lock()!

I must say that for a patch that is up to v8 or whatever and is
posted twice a week to such a big cc list, it is kind of slack to
not even test it and expect other people to review it.

Also, what we are going to need here are not skeleton drivers
that just do all the *easy* bits (of registering their callbacks),
but actual fully working examples that do everything that any
real driver will need to do. If not for the sanity of the driver
writer, then for the sanity of the VM developers (I don't want
to have to understand xpmem or infiniband in order to understand
how the VM works).



> In two cases we use invalidate_range_begin/end to invalidate
> single pages because the pair allows holding off new references
> (idea by Robin Holt).
>
> do_wp_page(): We hold off new references while we update the pte.
>
> xip_unmap: We are not taking the PageLock so we cannot
> use the invalidate_page mmu_rmap_notifier. invalidate_range_begin/end
> stands in.
>
> Signed-off-by: Andrea Arcangeli <[email protected]>
> Signed-off-by: Robin Holt <[email protected]>
> Signed-off-by: Christoph Lameter <[email protected]>
>
> ---
> mm/filemap_xip.c | 5 +++++
> mm/fremap.c | 3 +++
> mm/hugetlb.c | 3 +++
> mm/memory.c | 35 +++++++++++++++++++++++++++++------
> mm/mmap.c | 2 ++
> mm/mprotect.c | 3 +++
> mm/mremap.c | 7 ++++++-
> 7 files changed, 51 insertions(+), 7 deletions(-)
>
> Index: linux-2.6/mm/fremap.c
> ===================================================================
> --- linux-2.6.orig/mm/fremap.c 2008-02-14 18:43:31.000000000 -0800
> +++ linux-2.6/mm/fremap.c 2008-02-14 18:45:07.000000000 -0800
> @@ -15,6 +15,7 @@
> #include <linux/rmap.h>
> #include <linux/module.h>
> #include <linux/syscalls.h>
> +#include <linux/mmu_notifier.h>
>
> #include <asm/mmu_context.h>
> #include <asm/cacheflush.h>
> @@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(uns
> spin_unlock(&mapping->i_mmap_lock);
> }
>
> + mmu_notifier(invalidate_range_begin, mm, start, start + size, 0);
> err = populate_range(mm, vma, start, size, pgoff);
> + mmu_notifier(invalidate_range_end, mm, start, start + size, 0);
> if (!err && !(flags & MAP_NONBLOCK)) {
> if (unlikely(has_write_lock)) {
> downgrade_write(&mm->mmap_sem);
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c 2008-02-14 18:43:31.000000000 -0800
> +++ linux-2.6/mm/memory.c 2008-02-14 18:45:07.000000000 -0800
> @@ -51,6 +51,7 @@
> #include <linux/init.h>
> #include <linux/writeback.h>
> #include <linux/memcontrol.h>
> +#include <linux/mmu_notifier.h>
>
> #include <asm/pgalloc.h>
> #include <asm/uaccess.h>
> @@ -611,6 +612,9 @@ int copy_page_range(struct mm_struct *ds
> if (is_vm_hugetlb_page(vma))
> return copy_hugetlb_page_range(dst_mm, src_mm, vma);
>
> + if (is_cow_mapping(vma->vm_flags))
> + mmu_notifier(invalidate_range_begin, src_mm, addr, end, 0);
> +
> dst_pgd = pgd_offset(dst_mm, addr);
> src_pgd = pgd_offset(src_mm, addr);
> do {
> @@ -621,6 +625,11 @@ int copy_page_range(struct mm_struct *ds
> vma, addr, next))
> return -ENOMEM;
> } while (dst_pgd++, src_pgd++, addr = next, addr != end);
> +
> + if (is_cow_mapping(vma->vm_flags))
> + mmu_notifier(invalidate_range_end, src_mm,
> + vma->vm_start, end, 0);
> +
> return 0;
> }
>
> @@ -893,13 +902,16 @@ unsigned long zap_page_range(struct vm_a
> struct mmu_gather *tlb;
> unsigned long end = address + size;
> unsigned long nr_accounted = 0;
> + int atomic = details ? (details->i_mmap_lock != 0) : 0;
>
> lru_add_drain();
> tlb = tlb_gather_mmu(mm, 0);
> update_hiwater_rss(mm);
> + mmu_notifier(invalidate_range_begin, mm, address, end, atomic);
> end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
> if (tlb)
> tlb_finish_mmu(tlb, address, end);
> + mmu_notifier(invalidate_range_end, mm, address, end, atomic);
> return end;
> }
>

Where do you invalidate for munmap()?

Also, how do you resolve the case where you are not allowed to sleep?
I would have thought either you have to handle it, in which case nobody
needs to sleep; or you can't handle it, in which case the code is
broken.

2008-02-20 01:01:28

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, Feb 20, 2008 at 10:08:49AM +1100, Nick Piggin wrote:
> You can't sleep inside rcu_read_lock()!
>
> I must say that for a patch that is up to v8 or whatever and is
> posted twice a week to such a big cc list, it is kind of slack to
> not even test it and expect other people to review it.

Well, xpmem requirements are complex. As a side effect of the
simplicity of my approach, my patch is 100% safe since #v1. Now it
also works for GRU and it cluster invalidates.

> Also, what we are going to need here are not skeleton drivers
> that just do all the *easy* bits (of registering their callbacks),
> but actual fully working examples that do everything that any
> real driver will need to do. If not for the sanity of the driver

I've a fully working scenario for my patch; in fact I didn't post the
mmu notifier patch until I got KVM to swap 100% reliably, to be sure I
would post something that works well. mmu notifiers are already used
in KVM for:

1) 100% reliable and efficient swapping of guest physical memory
2) copy-on-writes of writeprotect faults after ksm page sharing of guest
physical memory
3) ballooning using madvise to give the guest memory back to the host

My implementation is the most handy because it requires zero changes
to the ksm code too (no explicit mmu notifier calls after
ptep_clear_flush) and it's also 100% safe (no mess with schedules over
rcu_read_lock), no "atomic" parameters, and it doesn't open a window
where sptes have a view on older pages and linux pte has view on newer
pages (this can happen with remap_file_pages with my KVM swapping
patch to use V8 Christoph's patch).

> Also, how to you resolve the case where you are not allowed to sleep?
> I would have thought either you have to handle it, in which case nobody
> needs to sleep; or you can't handle it, in which case the code is
> broken.

I also asked exactly this, glad you reasked this too.

2008-02-20 03:00:44

by Robin Holt

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, Feb 20, 2008 at 02:00:38AM +0100, Andrea Arcangeli wrote:
> On Wed, Feb 20, 2008 at 10:08:49AM +1100, Nick Piggin wrote:
> > You can't sleep inside rcu_read_lock()!
> >
> > I must say that for a patch that is up to v8 or whatever and is
> > posted twice a week to such a big cc list, it is kind of slack to
> > not even test it and expect other people to review it.
>
> Well, xpmem requirements are complex. As as side effect of the
> simplicity of my approach, my patch is 100% safe since #v1. Now it
> also works for GRU and it cluster invalidates.
>
> > Also, what we are going to need here are not skeleton drivers
> > that just do all the *easy* bits (of registering their callbacks),
> > but actual fully working examples that do everything that any
> > real driver will need to do. If not for the sanity of the driver
>
> I've a fully working scenario for my patch, infact I didn't post the
> mmu notifier patch until I got KVM to swap 100% reliably to be sure I
> would post something that works well. mmu notifiers are already used
> in KVM for:
>
> 1) 100% reliable and efficient swapping of guest physical memory
> 2) copy-on-writes of writeprotect faults after ksm page sharing of guest
> physical memory
> 3) ballooning using madvise to give the guest memory back to the host
>
> My implementation is the most handy because it requires zero changes
> to the ksm code too (no explicit mmu notifier calls after
> ptep_clear_flush) and it's also 100% safe (no mess with schedules over
> rcu_read_lock), no "atomic" parameters, and it doesn't open a window
> where sptes have a view on older pages and linux pte has view on newer
> pages (this can happen with remap_file_pages with my KVM swapping
> patch to use V8 Christoph's patch).
>
> > Also, how to you resolve the case where you are not allowed to sleep?
> > I would have thought either you have to handle it, in which case nobody
> > needs to sleep; or you can't handle it, in which case the code is
> > broken.
>
> I also asked exactly this, glad you reasked this too.

Currently, we BUG_ON having a PFN in our tables and not being able
to sleep. These are mappings which MPT has never supported in the past
and XPMEM was already not allowing page faults for VMAs which are not
anonymous so it should never happen. If the file-backed operations can
ever get changed to allow for sleeping and a customer has a need for it,
we would need to change XPMEM to allow those types of faults to succeed.

Thanks,
Robin

2008-02-20 03:14:23

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wednesday 20 February 2008 14:00, Robin Holt wrote:
> On Wed, Feb 20, 2008 at 02:00:38AM +0100, Andrea Arcangeli wrote:
> > On Wed, Feb 20, 2008 at 10:08:49AM +1100, Nick Piggin wrote:

> > > Also, how to you resolve the case where you are not allowed to sleep?
> > > I would have thought either you have to handle it, in which case nobody
> > > needs to sleep; or you can't handle it, in which case the code is
> > > broken.
> >
> > I also asked exactly this, glad you reasked this too.
>
> Currently, we BUG_ON having a PFN in our tables and not being able
> to sleep. These are mappings which MPT has never supported in the past
> and XPMEM was already not allowing page faults for VMAs which are not
> anonymous so it should never happen. If the file-backed operations can
> ever get changed to allow for sleeping and a customer has a need for it,
> we would need to change XPMEM to allow those types of faults to succeed.

Do you really want to be able to swap, or are you just interested
in keeping track of unmaps / prot changes?

2008-02-20 03:19:51

by Robin Holt

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, Feb 20, 2008 at 02:11:41PM +1100, Nick Piggin wrote:
> On Wednesday 20 February 2008 14:00, Robin Holt wrote:
> > On Wed, Feb 20, 2008 at 02:00:38AM +0100, Andrea Arcangeli wrote:
> > > On Wed, Feb 20, 2008 at 10:08:49AM +1100, Nick Piggin wrote:
>
> > > > Also, how to you resolve the case where you are not allowed to sleep?
> > > > I would have thought either you have to handle it, in which case nobody
> > > > needs to sleep; or you can't handle it, in which case the code is
> > > > broken.
> > >
> > > I also asked exactly this, glad you reasked this too.
> >
> > Currently, we BUG_ON having a PFN in our tables and not being able
> > to sleep. These are mappings which MPT has never supported in the past
> > and XPMEM was already not allowing page faults for VMAs which are not
> > anonymous so it should never happen. If the file-backed operations can
> > ever get changed to allow for sleeping and a customer has a need for it,
> > we would need to change XPMEM to allow those types of faults to succeed.
>
> Do you really want to be able to swap, or are you just interested
> in keeping track of unmaps / prot changes?

I would rather not swap, but we do have one customer that would like
swapout to work for certain circumstances. Additionally, we have
many customers that would rather that their system not die under I/O
termination.

Thanks,
Robin

2008-02-27 22:23:39

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Tue, 19 Feb 2008, Andrea Arcangeli wrote:

> Yes, that's why I kept maintaining my patch and I posted the last
> revision to Andrew. I use pte/tlb locking of the core VM, it's
> unintrusive and obviously safe. Furthermore it can be extended with
> Christoph's stuff in a 100% backwards compatible fashion later if needed.

How would that work? You rely on the pte locking. Thus calls are all in an
atomic context. I think we need a general scheme that allows sleeping when
references are invalidated. Even the GRU has performance issues when using
the KVM patch.


2008-02-27 22:36:14

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, 20 Feb 2008, Nick Piggin wrote:

> On Friday 15 February 2008 17:49, Christoph Lameter wrote:
> > The invalidation of address ranges in a mm_struct needs to be
> > performed when pages are removed or permissions etc change.
> >
> > If invalidate_range_begin() is called with locks held then we
> > pass a flag into invalidate_range() to indicate that no sleeping is
> > possible. Locks are only held for truncate and huge pages.
>
> You can't sleep inside rcu_read_lock()!

Could you be specific? This refers to page migration? Hmmm... Guess we
would need to inc the refcount there instead?

> I must say that for a patch that is up to v8 or whatever and is
> posted twice a week to such a big cc list, it is kind of slack to
> not even test it and expect other people to review it.

It was tested with the GRU and XPmem. Andrea also reported success.

> Also, what we are going to need here are not skeleton drivers
> that just do all the *easy* bits (of registering their callbacks),
> but actual fully working examples that do everything that any
> real driver will need to do. If not for the sanity of the driver
> writer, then for the sanity of the VM developers (I don't want
> to have to understand xpmem or infiniband in order to understand
> how the VM works).

There are 3 different drivers that can already use it but the code is
complex and not easy to review. Skeletons are easy to allow people to get
started with it.

> > lru_add_drain();
> > tlb = tlb_gather_mmu(mm, 0);
> > update_hiwater_rss(mm);
> > + mmu_notifier(invalidate_range_begin, mm, address, end, atomic);
> > end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
> > if (tlb)
> > tlb_finish_mmu(tlb, address, end);
> > + mmu_notifier(invalidate_range_end, mm, address, end, atomic);
> > return end;
> > }
> >
>
> Where do you invalidate for munmap()?

zap_page_range() called from unmap_vmas().

> Also, how to you resolve the case where you are not allowed to sleep?
> I would have thought either you have to handle it, in which case nobody
> needs to sleep; or you can't handle it, in which case the code is
> broken.

That can be done in a variety of ways:

1. Change VM locking

2. Not handle file backed mappings (XPmem could work mostly in such a
config)

3. Keep the refcount elevated until pages are freed in another execution
context.

2008-02-27 22:39:56

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, 20 Feb 2008, Andrea Arcangeli wrote:

> Well, xpmem requirements are complex. As a side effect of the
> simplicity of my approach, my patch is 100% safe since #v1. Now it
> also works for GRU and it cluster invalidates.

The patch has to satisfy RDMA, XPMEM, GRU and KVM. I keep hearing that we
have a KVM only solution that works 100% (which makes me just
ignore the rest of the argument because 100% solutions usually do not
exist).


> rcu_read_lock), no "atomic" parameters, and it doesn't open a window
> where sptes have a view on older pages and linux pte has view on newer
> pages (this can happen with remap_file_pages with my KVM swapping
> patch to use V8 Christoph's patch).

Ok so you are now getting away from keeping the refcount elevated? That
was your design decision....


> > Also, how to you resolve the case where you are not allowed to sleep?
> > I would have thought either you have to handle it, in which case nobody
> > needs to sleep; or you can't handle it, in which case the code is
> > broken.
>
> I also asked exactly this, glad you reasked this too.

It would have helped if you would have repeated my answers that you had
already gotten before. You knew I was on vacation....

2008-02-27 22:42:52

by tip-bot for Jack Steiner

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

>
> > Also, what we are going to need here are not skeleton drivers
> > that just do all the *easy* bits (of registering their callbacks),
> > but actual fully working examples that do everything that any
> > real driver will need to do. If not for the sanity of the driver
> > writer, then for the sanity of the VM developers (I don't want
> > to have to understand xpmem or infiniband in order to understand
> > how the VM works).
>
> There are 3 different drivers that can already use it but the code is
> complex and not easy to review. Skeletons are easy to allow people to get
> started with it.


I posted the full GRU driver late last week. It is a lot of
code & somewhat difficult to understand w/o access to full chip
specs (sorry). The code is fairly well commented & the
parts related to TLB management should be understandable.

2008-02-27 23:57:48

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, Feb 27, 2008 at 02:23:29PM -0800, Christoph Lameter wrote:
> How would that work? You rely on the pte locking. Thus calls are all in an

I don't rely on the pte locking in #v7, exactly to satisfy GRU
(so far purely theoretical) performance complaints.

> atomic context. I think we need a general scheme that allows sleeping when

Calls are still in atomic context until we change the i_mmap_lock to a
mutex under a CONFIG_XPMEM, or unless we boost mm_users, drop the lock
and restart the loop at every different mm. In any case those changes
should be under CONFIG_XPMEM IMHO given desktop users definitely don't
need this (regular non-blocking mmu notifiers in my patch are all that
a desktop user needs as far as I can tell).

> references are invalidates. Even the GRU has performance issues when using
> the KVM patch.

GRU will perform the same with #v7 or V8.

2008-02-28 00:10:23

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, 27 Feb 2008, Christoph Lameter wrote:

> Could you be specific? This refers to page migration? Hmmm... Guess we
> would need to inc the refcount there instead?

Argh. No its the callback list scanning. Yuck. No one noticed.

2008-02-28 00:11:41

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, Feb 27, 2008 at 02:35:59PM -0800, Christoph Lameter wrote:
> Could you be specific? This refers to page migration? Hmmm... Guess we

If the reader schedules, the synchronize_rcu will return in the other
cpu and the objects in the list will be freed and overwritten, and
when the task is scheduled back in, it'll follow dangling pointers...
You can't use RCU if you want any of your invalidate methods to
schedule. Otherwise it's like having zero locking.

> 2. Not handle file backed mappings (XPmem could work mostly in such a
> config)

IMHO that fits under your definition of "hacking something in now and
then having to modify it later".

> 3. Keep the refcount elevated until pages are freed in another execution
> context.

Page refcount is not enough (the mmu_notifier_release will run in
another cpu the moment after i_mmap_lock is unlocked) but mm_users may
prevent us from changing the i_mmap_lock to a mutex, but it'll slow down
truncate as it'll have to drop the lock and restart the radix tree
walk every time so a change like this better fits as a separate
CONFIG_XPMEM IMHO.

2008-02-28 00:14:21

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Thu, 28 Feb 2008, Andrea Arcangeli wrote:

> > 3. Keep the refcount elevated until pages are freed in another execution
> > context.
>
> Page refcount is not enough (the mmu_notifier_release will run in
> another cpu the moment after i_mmap_lock is unlocked) but mm_users may
> prevent us from changing the i_mmap_lock to a mutex, but it'll slow down
> truncate as it'll have to drop the lock and restart the radix tree
> walk every time so a change like this better fits as a separate
> CONFIG_XPMEM IMHO.

Erm. This would also be needed by RDMA etc.

2008-02-28 00:38:44

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, Feb 27, 2008 at 02:39:46PM -0800, Christoph Lameter wrote:
> On Wed, 20 Feb 2008, Andrea Arcangeli wrote:
>
> > Well, xpmem requirements are complex. As a side effect of the
> > simplicity of my approach, my patch is 100% safe since #v1. Now it
> > also works for GRU and it cluster invalidates.
>
> The patch has to satisfy RDMA, XPMEM, GRU and KVM. I keep hearing that we
> have a KVM only solution that works 100% (which makes me just
> ignore the rest of the argument because 100% solutions usually do not
> exist).

I only said 100% safe, I didn't imply anything other than it won't
crash the kernel ;).

#v6 and #v7 only leave XPMEM out AFAIK, and that can be supported
later with a CONFIG_XPMEM that purely changes some VM locking. #v7
also provides maximum performance to GRU.

> > rcu_read_lock), no "atomic" parameters, and it doesn't open a window
> > where sptes have a view on older pages and linux pte has view on newer
> > pages (this can happen with remap_file_pages with my KVM swapping
> > patch to use V8 Christoph's patch).
>
> Ok so you are now getting away from keeping the refcount elevated? That
> was your design decision....

No, I'm not getting away from it. If I would get away from it, I would
be forced to implement invalidate_range_begin. However even if I don't
get away from it, the fact I only implement invalidate_range_end, and
that's called after the PT lock is dropped, opens a little window with
lost coherency (which may not be detectable by userland anyway). But this
little window is fine for KVM and it doesn't impose any security
risk. But clearly proving the locking safe becomes a bit more complex
in #v7 than in #v6.

> It would have helped if you would have repeated my answers that you had
> already gotten before. You knew I was on vacation....

I didn't remember the BUG_ON crystal clear sorry, but not sure why you
think it was your call, this was a lowlevel XPMEM question and Robin
promptly answered/reminded about it in fact.

2008-02-28 00:53:17

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, Feb 27, 2008 at 04:14:08PM -0800, Christoph Lameter wrote:
> Erm. This would also be needed by RDMA etc.

The only RDMA I know is Quadrics, and Quadrics apparently doesn't need
to schedule inside the invalidate methods AFAIK, so I doubt the above
is true. It'd be interesting to know if IB is like Quadrics and it
also doesn't require blocking to invalidate certain remote mappings.

2008-02-28 01:03:41

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Thu, 28 Feb 2008, Andrea Arcangeli wrote:

> On Wed, Feb 27, 2008 at 04:14:08PM -0800, Christoph Lameter wrote:
> > Erm. This would also be needed by RDMA etc.
>
> The only RDMA I know is Quadrics, and Quadrics apparently doesn't need
> to schedule inside the invalidate methods AFAIK, so I doubt the above
> is true. It'd be interesting to know if IB is like Quadrics and it
> also doesn't require blocking to invalidate certain remote mappings.

RDMA works across a network and I would assume that it needs confirmation
that a connection has been torn down before pages can be unmapped.

2008-02-28 01:10:37

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wed, Feb 27, 2008 at 05:03:21PM -0800, Christoph Lameter wrote:
> RDMA works across a network and I would assume that it needs confirmation
> that a connection has been torn down before pages can be unmapped.

Depends on the latency of the network, for example with page pinning
it can even try to reduce the wait time, by tearing down the mapping
in range_begin and spin waiting the ack only later in range_end.

2008-02-28 10:53:34

by Robin Holt

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Thu, Feb 28, 2008 at 01:52:50AM +0100, Andrea Arcangeli wrote:
> On Wed, Feb 27, 2008 at 04:14:08PM -0800, Christoph Lameter wrote:
> > Erm. This would also be needed by RDMA etc.
>
> The only RDMA I know is Quadrics, and Quadrics apparently doesn't need
> to schedule inside the invalidate methods AFAIK, so I doubt the above
> is true. It'd be interesting to know if IB is like Quadrics and it
> also doesn't require blocking to invalidate certain remote mappings.

We got an answer from the IB guys already. They do not track which of
their handles are being used by remote processes so neither approach
will work for their purposes with the exception of straight unmaps. In
that case, they could use the callout to remove TLB information and rely
on the lack of page table information to kill the users process.
Without changes to their library spec, I don't believe anything further
is possible. If they did change their library spec, I believe they
could get things to work the same way that XPMEM has gotten things to
work, where a message is sent to the remote side for TLB clearing and
that will require sleeping.

Thanks,
Robin

2008-02-28 18:44:15

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Thu, 28 Feb 2008, Andrea Arcangeli wrote:

> On Wed, Feb 27, 2008 at 05:03:21PM -0800, Christoph Lameter wrote:
> > RDMA works across a network and I would assume that it needs confirmation
> > that a connection has been torn down before pages can be unmapped.
>
> Depends on the latency of the network, for example with page pinning
> it can even try to reduce the wait time, by tearing down the mapping
> in range_begin and spin waiting the ack only later in range_end.

What about invalidate_page()?

2008-02-29 01:06:42

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Thu, Feb 28, 2008 at 10:43:54AM -0800, Christoph Lameter wrote:
> What about invalidate_page()?

That would just spin waiting an ack (just like the smp-tlb-flushing
invalidates in numa already does).

Thinking more about this, we could also parallelize it with an
invalidate_page_before/end. If it takes 1usec to flush remotely,
scheduling would be overkill, but spending 1usec in a while loop isn't
nice if we can parallelize that 1usec with the ipi-tlb-flush. Not sure
if it makes sense... it certainly would be quick to add it (especially
thanks to _notify ;).

2008-02-29 01:08:59

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, 29 Feb 2008, Andrea Arcangeli wrote:

> On Thu, Feb 28, 2008 at 10:43:54AM -0800, Christoph Lameter wrote:
> > What about invalidate_page()?
>
> That would just spin waiting an ack (just like the smp-tlb-flushing
> invalidates in numa already does).

And thus the device driver may stop receiving data on a UP system? It will
never get the ack.

> Thinking more about this, we could also parallelize it with an
> invalidate_page_before/end. If it takes 1usec to flush remotely,
> scheduling would be overkill, but spending 1usec in a while loop isn't
> nice if we can parallelize that 1usec with the ipi-tlb-flush. Not sure
> if it makes sense... it certainly would be quick to add it (especially
> thanks to _notify ;).

invalidate_page_before/end could be realized as an
invalidate_range_begin/end on a page sized range?

2008-02-29 13:13:28

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Thu, Feb 28, 2008 at 04:59:59PM -0800, Christoph Lameter wrote:
> And thus the device driver may stop receiving data on a UP system? It will
> never get the ack.

Not sure to follow, sorry.

My idea was:

post the invalidate in the mmio region of the device
smp_call_function()
while (mmio device wait-bitflag is on);

Instead of the current:

smp_call_function()
post the invalidate in the mmio region of the device
while (mmio device wait-bitflag is on);

To decrease the wait loop time.

> invalidate_page_before/end could be realized as an
> invalidate_range_begin/end on a page sized range?

If we go this route, once you add support to xpmem, you'll have to
make the anon_vma lock a mutex too, that would be fine with me
though. The main reason invalidate_page exists, is to allow you to
leave it as non-sleep-capable even after you make invalidate_range
sleep capable, and to implement the mmu_rmap_notifiers sleep capable
in all the paths that invalidate_page would be called. That was the
strategy you had in your patch. I'll try to drop invalidate_page. I
wonder if then you won't need the mmu_rmap_notifiers anymore.

2008-02-29 19:55:45

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, 29 Feb 2008, Andrea Arcangeli wrote:

> On Thu, Feb 28, 2008 at 04:59:59PM -0800, Christoph Lameter wrote:
> > And thus the device driver may stop receiving data on a UP system? It will
> > never get the ack.
>
> Not sure to follow, sorry.
>
> My idea was:
>
> post the invalidate in the mmio region of the device
> smp_call_function()
> while (mmio device wait-bitflag is on);

So the device driver on UP can only operate through interrupts? If you are
hogging the only cpu then driver operations may not be possible.

> > invalidate_page_before/end could be realized as an
> > invalidate_range_begin/end on a page sized range?
>
> If we go this route, once you add support to xpmem, you'll have to
> make the anon_vma lock a mutex too, that would be fine with me
> though. The main reason invalidate_page exists, is to allow you to
> leave it as non-sleep-capable even after you make invalidate_range
> sleep capable, and to implement the mmu_rmap_notifiers sleep capable
> in all the paths that invalidate_page would be called. That was the
> strategy you had in your patch. I'll try to drop invalidate_page. I
> wonder if then you won't need the mmu_rmap_notifiers anymore.

I am mainly concerned with making the mmu notifier a generally useful
feature for multiple users. Xpmem is one example of a different user. It
should be considered as one example of a different type of callback user.
It is not the gold standard that you make it to be. RDMA is another and
there are likely scores of others (DMA engines etc) once it becomes clear
that such a feature is available. In general the mmu notifier will allow
us to fix the problems caused by memory pinning and mlock by various
devices and other mechanisms that need to directly access memory.

And yes I would like to get rid of the mmu_rmap_notifiers altogether. It
would be much cleaner with just one mmu_notifier that can sleep in all
functions.

2008-02-29 20:17:55

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, Feb 29, 2008 at 11:55:17AM -0800, Christoph Lameter wrote:
> > post the invalidate in the mmio region of the device
> > smp_call_function()
> > while (mmio device wait-bitflag is on);
>
> So the device driver on UP can only operate through interrupts? If you are
> hogging the only cpu then driver operations may not be possible.

There was no irq involved in the above pseudocode, the irq if
something would run in the remote system. Still irqs can run fine
during the while loop like they run fine on top of
smp_call_function. The send-irq and the following spin-on-a-bitflag
works exactly as smp_call_function except this isn't a numa-CPU to
invalidate.

> And yes I would like to get rid of the mmu_rmap_notifiers altogether. It
> would be much cleaner with just one mmu_notifier that can sleep in all
> functions.

Agreed. I just thought xpmem needed an invalidate-by-page, but
I'm glad if xpmem can go in sync with the KVM/GRU/DRI model in this
regard.

2008-02-29 21:03:29

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, 29 Feb 2008, Andrea Arcangeli wrote:

> Agreed. I just thought xpmem needed an invalidate-by-page, but
> I'm glad if xpmem can go in sync with the KVM/GRU/DRI model in this
> regard.

That means we need both the anon_vma locks and the i_mmap_lock to become
semaphores. I think semaphores are better than mutexes. Rik and Lee saw
some performance improvements because list can be traversed in parallel
when the anon_vma lock is switched to be a rw lock.

Sounds like we get to a conceptually clean version here?

2008-02-29 21:23:47

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, Feb 29, 2008 at 01:03:16PM -0800, Christoph Lameter wrote:
> That means we need both the anon_vma locks and the i_mmap_lock to become
> semaphores. I think semaphores are better than mutexes. Rik and Lee saw
> some performance improvements because list can be traversed in parallel
> when the anon_vma lock is switched to be a rw lock.

The improvement was with a rw spinlock IIRC, so I don't see how it's
related to this.

Perhaps the rwlock spinlock can be changed to a rw semaphore without
measurable overscheduling in the fast path. However theoretically
speaking the rw_lock spinlock is more efficient than a rw semaphore in
case of a little contention during the page fault fast path because
the critical section is just a list_add so it'd be overkill to
schedule while waiting. That's why currently it's a spinlock (or rw
spinlock).

> Sounds like we get to a conceptually clean version here?

I don't have a strong opinion if it should become a semaphore
unconditionally or only with a CONFIG_XPMEM=y. But keep in mind
preempt-rt runs quite a bit slower, or we could rip spinlocks out of
the kernel in the first place ;)

2008-02-29 21:29:36

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, 29 Feb 2008, Andrea Arcangeli wrote:

> I don't have a strong opinion if it should become a semaphore
> unconditionally or only with a CONFIG_XPMEM=y. But keep in mind
> preempt-rt runs quite a bit slower, or we could rip spinlocks out of
> the kernel in the first place ;)

Do you just skip comments of people on the mmu_notifier? It took me to
remind you about Andrew's comments to note those. And I just responded on
the XPmem issue in the morning.

Again for the gazillionth time: There will be no CONFIG_XPMEM because the
functionality needs to be generic and not XPMEM specific.

2008-02-29 21:34:45

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, 29 Feb 2008, Andrea Arcangeli wrote:

> On Fri, Feb 29, 2008 at 01:03:16PM -0800, Christoph Lameter wrote:
> > That means we need both the anon_vma locks and the i_mmap_lock to become
> > semaphores. I think semaphores are better than mutexes. Rik and Lee saw
> > some performance improvements because list can be traversed in parallel
> > when the anon_vma lock is switched to be a rw lock.
>
> The improvement was with a rw spinlock IIRC, so I don't see how it's
> related to this.

AFAICT The rw semaphore fastpath is similar in performance to a rw
spinlock.

> Perhaps the rwlock spinlock can be changed to a rw semaphore without
> measurable overscheduling in the fast path. However theoretically

Overscheduling? You mean overhead?

> speaking the rw_lock spinlock is more efficient than a rw semaphore in
> case of a little contention during the page fault fast path because
> the critical section is just a list_add so it'd be overkill to
> schedule while waiting. That's why currently it's a spinlock (or rw
> spinlock).

On the other hand a semaphore puts the process to sleep and may actually
improve performance because there is less time spent in a busy loop.
Other processes may do something useful and we stay off the contended
cacheline reducing traffic on the interconnect.

> preempt-rt runs quite a bit slower, or we could rip spinlocks out of
> the kernel in the first place ;)

The question is why that is the case and it seems that there are issues
with interrupt on/off that are important here and particularly significant
with the SLAB allocator (significant hacks there to deal with that issue).
The fastpath that we have in the works for SLUB may address a large
part of that issue because it no longer relies on disabling interrupts.

2008-02-29 21:48:19

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, Feb 29, 2008 at 01:34:34PM -0800, Christoph Lameter wrote:
> On Fri, 29 Feb 2008, Andrea Arcangeli wrote:
>
> > On Fri, Feb 29, 2008 at 01:03:16PM -0800, Christoph Lameter wrote:
> > > That means we need both the anon_vma locks and the i_mmap_lock to become
> > > semaphores. I think semaphores are better than mutexes. Rik and Lee saw
> > > some performance improvements because list can be traversed in parallel
> > > when the anon_vma lock is switched to be a rw lock.
> >
> > The improvement was with a rw spinlock IIRC, so I don't see how it's
> > related to this.
>
> AFAICT The rw semaphore fastpath is similar in performance to a rw
> spinlock.

read side is taken in the slow path.

write side is taken in the fast path.

pagefault is fast path, VM during swapping is slow path.

> > Perhaps the rwlock spinlock can be changed to a rw semaphore without
> > measurable overscheduling in the fast path. However theoretically
>
> Overscheduling? You mean overhead?

The only possible overhead that a rw semaphore could ever generate vs
a rw lock is overscheduling.

> > speaking the rw_lock spinlock is more efficient than a rw semaphore in
> > case of a little contention during the page fault fast path because
> > the critical section is just a list_add so it'd be overkill to
> > schedule while waiting. That's why currently it's a spinlock (or rw
> > spinlock).
>
> On the other hand a semaphore puts the process to sleep and may actually
> improve performance because there is less time spent in a busy loop.
> Other processes may do something useful and we stay off the contended
> cacheline reducing traffic on the interconnect.

Yes, that's the positive side, the negative side is that you'll put
the task in uninterruptible sleep and call schedule() and require a
wakeup, because a list_add taking <1usec is running in the
other cpu. No other downside. But that's the only reason it's a
spinlock right now, in fact there can't be any other reason.

2008-02-29 22:13:20

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, 29 Feb 2008, Andrea Arcangeli wrote:

> > AFAICT The rw semaphore fastpath is similar in performance to a rw
> > spinlock.
>
> read side is taken in the slow path.

Slowpath meaning VM slowpath or lock slow path? It seems that the rwsem
read side path is pretty efficient:

static inline void __down_read(struct rw_semaphore *sem)
{
__asm__ __volatile__(
"# beginning down_read\n\t"
LOCK_PREFIX " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */
" jns 1f\n"
" call call_rwsem_down_read_failed\n"
"1:\n\t"
"# ending down_read\n\t"
: "+m" (sem->count)
: "a" (sem)
: "memory", "cc");
}



>
> write side is taken in the fast path.
>
> pagefault is fast path, VM during swapping is slow path.

Not sure what you are saying here. A pagefault should be considered as a
fast path and swapping is not performance critical?

> > > Perhaps the rwlock spinlock can be changed to a rw semaphore without
> > > measurable overscheduling in the fast path. However theoretically
> >
> > Overscheduling? You mean overhead?
>
> The only possible overhead that a rw semaphore could ever generate vs
> a rw lock is overscheduling.

Ok too many calls to schedule() because the slow path (of the semaphore)
is taken?

> > On the other hand a semaphore puts the process to sleep and may actually
> > improve performance because there is less time spent in a busy loop.
> > Other processes may do something useful and we stay off the contended
> > cacheline reducing traffic on the interconnect.
>
> Yes, that's the positive side, the negative side is that you'll put
> the task in uninterruptible sleep and call schedule() and require a
> wakeup, because a list_add taking <1usec is running in the
> other cpu. No other downside. But that's the only reason it's a
> spinlock right now, in fact there can't be any other reason.

But that is only happening for the contended case. Certainly a spinlock is
better for a 2p system, but the more processors contend for the lock (and
the longer the hold off is, typical for the processors with 4p or 8p or
more) the better a semaphore will work.

2008-02-29 22:41:57

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Fri, Feb 29, 2008 at 02:12:57PM -0800, Christoph Lameter wrote:
> On Fri, 29 Feb 2008, Andrea Arcangeli wrote:
>
> > > AFAICT The rw semaphore fastpath is similar in performance to a rw
> > > spinlock.
> >
> > read side is taken in the slow path.
>
> Slowpath meaning VM slowpath or lock slow path? Its seems that the rwsem

With slow path I meant the VM. Sorry if that was confusing given locks
also have fast paths (no contention) and slow paths (contention).

> read side path is pretty efficient:

Yes. The assembly doesn't worry me at all.

> > pagefault is fast path, VM during swapping is slow path.
>
> Not sure what you are saying here. A pagefault should be considered as a
> fast path and swapping is not performance critical?

Yes, swapping is I/O bound and it rarely becomes CPU hog in the common
case.

There are corner case workloads (including OOM) where swapping can
become cpu bound (that's also where rwlock helps). But certainly the
speed of fork() and a page fault, is critical for _everyone_, not just
a few workloads and setups.

> Ok too many calls to schedule() because the slow path (of the semaphore)
> is taken?

Yes, that's the only possible worry when converting a spinlock to
mutex.

> But that is only happening for the contended case. Certainly a spinlock is
> better for a 2p system, but the more processors contend for the lock (and
> the longer the hold off is, typical for the processors with 4p or 8p or
> more) the better a semaphore will work.

Sure. That's also why the PT lock switches for >4way compiles. Config
option helps to keep the VM optimal for everyone. Here it is possible
it won't be necessary but I can't be sure given both i_mmap_lock and
anon-vma lock are used in so many places. Some TPC comparison would
be nice before making a default switch IMHO.

2008-03-03 05:11:43

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Thursday 28 February 2008 09:35, Christoph Lameter wrote:
> On Wed, 20 Feb 2008, Nick Piggin wrote:
> > On Friday 15 February 2008 17:49, Christoph Lameter wrote:

> > Also, what we are going to need here are not skeleton drivers
> > that just do all the *easy* bits (of registering their callbacks),
> > but actual fully working examples that do everything that any
> > real driver will need to do. If not for the sanity of the driver
> > writer, then for the sanity of the VM developers (I don't want
> > to have to understand xpmem or infiniband in order to understand
> > how the VM works).
>
> There are 3 different drivers that can already use it but the code is
> complex and not easy to review. Skeletons are easy to allow people to get
> started with it.

Your skeleton is just registering notifiers and saying

/* you fill the hard part in */

If somebody needs a skeleton in order just to register the notifiers,
then almost by definition they are unqualified to write the hard
part ;)


> > > lru_add_drain();
> > > tlb = tlb_gather_mmu(mm, 0);
> > > update_hiwater_rss(mm);
> > > + mmu_notifier(invalidate_range_begin, mm, address, end, atomic);
> > > end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
> > > if (tlb)
> > > tlb_finish_mmu(tlb, address, end);
> > > + mmu_notifier(invalidate_range_end, mm, address, end, atomic);
> > > return end;
> > > }
> >
> > Where do you invalidate for munmap()?
>
> zap_page_range() called from unmap_vmas().

But it is not allowed to sleep. Where do you call the sleepable one
from?


> > Also, how to you resolve the case where you are not allowed to sleep?
> > I would have thought either you have to handle it, in which case nobody
> > needs to sleep; or you can't handle it, in which case the code is
> > broken.
>
> That can be done in a variety of ways:
>
> 1. Change VM locking
>
> 2. Not handle file backed mappings (XPmem could work mostly in such a
> config)
>
> 3. Keep the refcount elevated until pages are freed in another execution
> context.

OK, there are ways to solve it or hack around it. But this is exactly
why I think the implementations should be kept separate. Andrea's
notifiers are coherent, work on all types of mappings, and will
hopefully match closely the regular TLB invalidation sequence in the
Linux VM (at the moment it is quite close, but I hope to make it a
bit closer) so that it requires almost no changes to the mm.

All the other things to try to make it sleep are either hacking holes
in it (eg by removing coherency). So I don't think it is reasonable to
require that any patch handle all cases. I actually think Andrea's
patch is quite nice and simple itself, wheras I am against the patches
that you posted.

What about a completely different approach... XPmem runs over NUMAlink,
right? Why not provide some non-sleeping way to basically IPI remote
nodes over the NUMAlink where they can process the invalidation? If your
intra-node cache coherency has to run over this link anyway, then
presumably it is capable.

Or another idea, why don't you LD_PRELOAD in the MPT library to also
intercept munmap, mprotect, mremap etc as well as just fork()? That
would give you similarly "good enough" coherency as the mmu notifier
patches except that you can't swap (which Robin said was not a big
problem).

2008-03-03 19:28:37

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Mon, 3 Mar 2008, Nick Piggin wrote:

> Your skeleton is just registering notifiers and saying
>
> /* you fill the hard part in */
>
> If somebody needs a skeleton in order just to register the notifiers,
> then almost by definition they are unqualified to write the hard
> part ;)

It's also providing a locking scheme.

> OK, there are ways to solve it or hack around it. But this is exactly
> why I think the implementations should be kept separate. Andrea's
> notifiers are coherent, work on all types of mappings, and will
> hopefully match closely the regular TLB invalidation sequence in the
> Linux VM (at the moment it is quite close, but I hope to make it a
> bit closer) so that it requires almost no changes to the mm.

Then put it into the arch code for TLB invalidation. Paravirt ops gives
good examples on how to do that.

> What about a completely different approach... XPmem runs over NUMAlink,
> right? Why not provide some non-sleeping way to basically IPI remote
> nodes over the NUMAlink where they can process the invalidation? If you
> intra-node cache coherency has to run over this link anyway, then
> presumably it is capable.

There is another Linux instance at the remote end that first has to
remove its own ptes. Also would not work for Infiniband and other
solutions. All the approaches that require evictions in an atomic context
are limiting the approach and do not allow the generic functionality that
we want in order to not add alternate APIs for this.

> Or another idea, why don't you LD_PRELOAD in the MPT library to also
> intercept munmap, mprotect, mremap etc as well as just fork()? That
> would give you similarly "good enough" coherency as the mmu notifier
> patches except that you can't swap (which Robin said was not a big
> problem).

The good enough solution right now is to pin pages by elevating
refcounts.

2008-03-04 13:00:52

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Tuesday 04 March 2008 06:28, Christoph Lameter wrote:
> On Mon, 3 Mar 2008, Nick Piggin wrote:
> > Your skeleton is just registering notifiers and saying
> >
> > /* you fill the hard part in */
> >
> > If somebody needs a skeleton in order just to register the notifiers,
> > then almost by definition they are unqualified to write the hard
> > part ;)
>
> It's also providing a locking scheme.

Not the full locking scheme. If you have a look at the real code
required to do it, it is non trivial.


> > OK, there are ways to solve it or hack around it. But this is exactly
> > why I think the implementations should be kept separate. Andrea's
> > notifiers are coherent, work on all types of mappings, and will
> > hopefully match closely the regular TLB invalidation sequence in the
> > Linux VM (at the moment it is quite close, but I hope to make it a
> > bit closer) so that it requires almost no changes to the mm.
>
> Then put it into the arch code for TLB invalidation. Paravirt ops gives
> good examples on how to do that.

Put what into arch code?


> > What about a completely different approach... XPmem runs over NUMAlink,
> > right? Why not provide some non-sleeping way to basically IPI remote
> > nodes over the NUMAlink where they can process the invalidation? If you
> > intra-node cache coherency has to run over this link anyway, then
> > presumably it is capable.
>
> There is another Linux instance at the remote end that first has to
> remove its own ptes.

Yeah, what's the problem?


> Also would not work for Inifiniband and other
> solutions.

infiniband doesn't want it. Other solutions are just handwaving,
because if we don't know what the other solutions are, then we can't
make any sort of informed choices.


> All the approaches that require evictions in an atomic context
> are limiting the approach and do not allow the generic functionality that
> we want in order to not add alternate APIs for this.

The only generic way to do this that I have seen (and the only proposed
way that doesn't add alternate APIs for that matter) is turning VM locks
into sleeping locks. In which case, Andrea's notifiers will work just
fine (except for relatively minor details like rcu list scanning).

So I don't see what you're arguing for. There is no requirement that we
support sleeping notifiers in the same patch as non-sleeping ones.
Considering the simplicity of the non-sleeping notifiers and the
problems with sleeping ones, I think it is pretty clear that they are
different beasts (unless VM locking is changed).


> > Or another idea, why don't you LD_PRELOAD in the MPT library to also
> > intercept munmap, mprotect, mremap etc as well as just fork()? That
> > would give you similarly "good enough" coherency as the mmu notifier
> > patches except that you can't swap (which Robin said was not a big
> > problem).
>
> The good enough solution right now is to pin pages by elevating
> refcounts.

Which kind of leads to the question of why do you need any further
kernel patches if that is good enough?

2008-03-04 18:59:17

by Christoph Lameter

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Tue, 4 Mar 2008, Nick Piggin wrote:

> > Then put it into the arch code for TLB invalidation. Paravirt ops gives
> > good examples on how to do that.
>
> Put what into arch code?

The mmu notifier code.

> > > What about a completely different approach... XPmem runs over NUMAlink,
> > > right? Why not provide some non-sleeping way to basically IPI remote
> > > nodes over the NUMAlink where they can process the invalidation? If you
> > > intra-node cache coherency has to run over this link anyway, then
> > > presumably it is capable.
> >
> > There is another Linux instance at the remote end that first has to
> > remove its own ptes.
>
> Yeah, what's the problem?

The remote end has to invalidate the page which involves locking etc.

> > Also would not work for Infiniband and other
> > solutions.
>
> infiniband doesn't want it. Other solutions are just handwaving,
> because if we don't know what the other solutions are, then we can't
> make any sort of informed choices.

We need a solution in general to avoid the pinning problems. Infiniband
has those too.

> > All the approaches that require evictions in an atomic context
> > are limiting the approach and do not allow the generic functionality that
> > we want in order to not add alternate APIs for this.
>
> The only generic way to do this that I have seen (and the only proposed
> way that doesn't add alternate APIs for that matter) is turning VM locks
> into sleeping locks. In which case, Andrea's notifiers will work just
> fine (except for relatively minor details like rcu list scanning).

No they won't. As you pointed out, the callbacks need RCU locking.

> > The good enough solution right now is to pin pages by elevating
> > refcounts.
>
> Which kind of leads to the question of why do you need any further
> kernel patches if that is good enough?

Well it's good enough with severe problems during reclaim, livelocks etc.
One could improve on that scheme through Rik's work trying to add a new
page flag that marks pinned pages and then keep them off the LRUs and
limiting their number. Having pinned pages would limit the ability to
reclaim by the VM and make page migration, memory unplug etc impossible.
It is better to have a notifier scheme that allows telling a device driver
to free up the memory it has mapped.

2008-03-05 00:53:28

by Nick Piggin

[permalink] [raw]
Subject: Re: [patch 2/6] mmu_notifier: Callbacks to invalidate address ranges

On Wednesday 05 March 2008 05:58, Christoph Lameter wrote:
> On Tue, 4 Mar 2008, Nick Piggin wrote:
> > > Then put it into the arch code for TLB invalidation. Paravirt ops gives
> > > good examples on how to do that.
> >
> > Put what into arch code?
>
> The mmu notifier code.

It isn't arch specific.


> > > > What about a completely different approach... XPmem runs over
> > > > NUMAlink, right? Why not provide some non-sleeping way to basically
> > > > IPI remote nodes over the NUMAlink where they can process the
> > > > invalidation? If your intra-node cache coherency has to run over this
> > > > link anyway, then presumably it is capable.
> > >
> > > There is another Linux instance at the remote end that first has to
> > > remove its own ptes.
> >
> > Yeah, what's the problem?
>
> The remote end has to invalidate the page which involves locking etc.

I don't see what the problem is.


> > > Also would not work for Infiniband and other
> > > solutions.
> >
> > infiniband doesn't want it. Other solutions are just handwaving,
> > because if we don't know what the other solutions are, then we can't
> > make any sort of informed choices.
>
> We need a solution in general to avoid the pinning problems. Infiniband
> has those too.
>
> > > All the approaches that require evictions in an atomic context
> > > are limiting the approach and do not allow the generic functionality
> > > that we want in order to not add alternate APIs for this.
> >
> > The only generic way to do this that I have seen (and the only proposed
> > way that doesn't add alternate APIs for that matter) is turning VM locks
> > into sleeping locks. In which case, Andrea's notifiers will work just
> > fine (except for relatively minor details like rcu list scanning).
>
> No they wont. As you pointed out the callback need RCU locking.

That can be fixed easily.


> > > The good enough solution right now is to pin pages by elevating
> > > refcounts.
> >
> > Which kind of leads to the question of why do you need any further
> > kernel patches if that is good enough?
>
> Well it's good enough with severe problems during reclaim, livelocks etc.
> One could improve on that scheme through Rik's work trying to add a new
> page flag that marks pinned pages and then keep them off the LRUs and
> limiting their number. Having pinned pages would limit the ability to
> reclaim by the VM and make page migration, memory unplug etc impossible.

Well not impossible. You could have a callback to invalidate the remote
TLB and drop the pin on a given page.


> It is better to have a notifier scheme that allows telling a device driver
> to free up the memory it has mapped.

Yeah, it would be nice for those people with clusters of Altixes. Doesn't
mean it has to go upstream, though.