2008-01-31 04:58:52

by Christoph Lameter

Subject: [patch 2/3] mmu_notifier: Callbacks to invalidate address ranges

The invalidation of address ranges in a mm_struct needs to be
performed when pages are removed or permissions etc change.

invalidate_range_begin/end() are frequently called with only mmap_sem
held. If invalidate_range_begin() is called with additional locks held
then we pass a flag to the callbacks to indicate that no sleeping is
possible.

In two cases we use invalidate_range_begin/end to invalidate
single pages because the pair allows holding off new references
(idea by Robin Holt).

do_wp_page(): We hold off new references while we update the pte.

xip_unmap: We are not taking the PageLock so we cannot
use the invalidate_page mmu_rmap_notifier. invalidate_range_begin/end
stands in.

Comments state that mmap_sem must be held for
remap_pfn_range() but various drivers do not seem to do this.
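
For reference, all of the call sites below follow the same bracketing
pattern, roughly sketched here; the mmu_notifier() dispatch macro and the
callback signatures are assumed from patch 1/3 of this series and are not
part of this patch:

	/*
	 * Sketch only: begin/end bracket a range operation. The atomic
	 * flag tells the notifier whether sleeping is allowed.
	 */
	static void example_zap_range(struct mm_struct *mm,
				      unsigned long start, unsigned long end,
				      int atomic)
	{
		/* Subsystems drop their external ptes/TLB entries here. */
		mmu_notifier(invalidate_range_begin, mm, start, end, atomic);

		/* ... remove or change the ptes in [start, end) ... */

		/*
		 * Only after this returns may new external references to
		 * the range be established again.
		 */
		mmu_notifier(invalidate_range_end, mm, atomic);
	}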

Signed-off-by: Andrea Arcangeli <[email protected]>
Signed-off-by: Robin Holt <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>

---
mm/filemap_xip.c | 4 ++++
mm/fremap.c | 3 +++
mm/hugetlb.c | 3 +++
mm/memory.c | 15 +++++++++++++--
mm/mmap.c | 2 ++
5 files changed, 25 insertions(+), 2 deletions(-)

Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c 2008-01-30 20:03:05.000000000 -0800
+++ linux-2.6/mm/fremap.c 2008-01-30 20:05:39.000000000 -0800
@@ -15,6 +15,7 @@
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>

#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
@@ -211,7 +212,9 @@ asmlinkage long sys_remap_file_pages(uns
spin_unlock(&mapping->i_mmap_lock);
}

+ mmu_notifier(invalidate_range_begin, mm, start, start + size, 0);
err = populate_range(mm, vma, start, size, pgoff);
+ mmu_notifier(invalidate_range_end, mm, 0);
if (!err && !(flags & MAP_NONBLOCK)) {
if (unlikely(has_write_lock)) {
downgrade_write(&mm->mmap_sem);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2008-01-30 20:03:05.000000000 -0800
+++ linux-2.6/mm/memory.c 2008-01-30 20:07:27.000000000 -0800
@@ -50,6 +50,7 @@
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
+#include <linux/mmu_notifier.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -883,13 +884,16 @@ unsigned long zap_page_range(struct vm_a
struct mmu_gather *tlb;
unsigned long end = address + size;
unsigned long nr_accounted = 0;
+ int atomic = details ? (details->i_mmap_lock != 0) : 0;

lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
+ mmu_notifier(invalidate_range_begin, mm, address, end, atomic);
end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
if (tlb)
tlb_finish_mmu(tlb, address, end);
+ mmu_notifier(invalidate_range_end, mm, atomic);
return end;
}

@@ -1318,7 +1322,7 @@ int remap_pfn_range(struct vm_area_struc
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + PAGE_ALIGN(size);
+ unsigned long start = addr, end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
int err;

@@ -1352,6 +1356,7 @@ int remap_pfn_range(struct vm_area_struc
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
do {
next = pgd_addr_end(addr, end);
err = remap_pud_range(mm, pgd, addr, next,
@@ -1359,6 +1364,7 @@ int remap_pfn_range(struct vm_area_struc
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier(invalidate_range_end, mm, 0);
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1442,10 +1448,11 @@ int apply_to_page_range(struct mm_struct
{
pgd_t *pgd;
unsigned long next;
- unsigned long end = addr + size;
+ unsigned long start = addr, end = addr + size;
int err;

BUG_ON(addr >= end);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1453,6 +1460,7 @@ int apply_to_page_range(struct mm_struct
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ mmu_notifier(invalidate_range_end, mm, 0);
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1630,6 +1638,8 @@ gotten:
goto oom;
cow_user_page(new_page, old_page, address, vma);

+ mmu_notifier(invalidate_range_begin, mm, address,
+ address + PAGE_SIZE - 1, 0);
/*
* Re-check the pte - we dropped the lock
*/
@@ -1668,6 +1678,7 @@ gotten:
page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
+ mmu_notifier(invalidate_range_end, mm, 0);
if (dirty_page) {
if (vma->vm_file)
file_update_time(vma->vm_file);
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c 2008-01-30 20:03:05.000000000 -0800
+++ linux-2.6/mm/mmap.c 2008-01-30 20:05:39.000000000 -0800
@@ -1744,11 +1744,13 @@ static void unmap_region(struct mm_struc
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
+ mmu_notifier(invalidate_range_begin, mm, start, end, 0);
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
+ mmu_notifier(invalidate_range_end, mm, 0);
}

/*
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c 2008-01-30 20:03:05.000000000 -0800
+++ linux-2.6/mm/hugetlb.c 2008-01-30 20:05:39.000000000 -0800
@@ -14,6 +14,7 @@
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
+#include <linux/mmu_notifier.h>

#include <asm/page.h>
#include <asm/pgtable.h>
@@ -743,6 +744,7 @@ void __unmap_hugepage_range(struct vm_ar
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);

+ mmu_notifier(invalidate_range_begin, mm, start, end, 1);
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
@@ -763,6 +765,7 @@ void __unmap_hugepage_range(struct vm_ar
}
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
+ mmu_notifier(invalidate_range_end, mm, 1);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
list_del(&page->lru);
put_page(page);
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c 2008-01-30 20:03:05.000000000 -0800
+++ linux-2.6/mm/filemap_xip.c 2008-01-30 20:05:39.000000000 -0800
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/uio.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <asm/tlbflush.h>

@@ -189,6 +190,8 @@ __xip_unmap (struct address_space * mapp
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ mmu_notifier(invalidate_range_begin, mm, address,
+ address + PAGE_SIZE - 1, 1);
pte = page_check_address(page, mm, address, &ptl);
if (pte) {
/* Nuke the page table entry. */
@@ -200,6 +203,7 @@ __xip_unmap (struct address_space * mapp
pte_unmap_unlock(pte, ptl);
page_cache_release(page);
}
+ mmu_notifier(invalidate_range_end, mm, 1);
}
spin_unlock(&mapping->i_mmap_lock);
}

--


2008-01-31 12:31:32

by Andrea Arcangeli

Subject: Re: [patch 2/3] mmu_notifier: Callbacks to invalidate address ranges

On Wed, Jan 30, 2008 at 08:57:52PM -0800, Christoph Lameter wrote:
> @@ -211,7 +212,9 @@ asmlinkage long sys_remap_file_pages(uns
> spin_unlock(&mapping->i_mmap_lock);
> }
>
> + mmu_notifier(invalidate_range_begin, mm, start, start + size, 0);
> err = populate_range(mm, vma, start, size, pgoff);
> + mmu_notifier(invalidate_range_end, mm, 0);
> if (!err && !(flags & MAP_NONBLOCK)) {
> if (unlikely(has_write_lock)) {
> downgrade_write(&mm->mmap_sem);

This can't be enough for GRU, in fact it can't work for KVM either. You
got 1) to have some invalidate_page for GRU before freeing the page,
and 2) to pass start, end to range_end (if you want kvm to use it
instead of invalidate_page).

mremap still missing as a whole.

2008-01-31 20:07:18

by Christoph Lameter

Subject: Re: [patch 2/3] mmu_notifier: Callbacks to invalidate address ranges

On Thu, 31 Jan 2008, Andrea Arcangeli wrote:

> On Wed, Jan 30, 2008 at 08:57:52PM -0800, Christoph Lameter wrote:
> > @@ -211,7 +212,9 @@ asmlinkage long sys_remap_file_pages(uns
> > spin_unlock(&mapping->i_mmap_lock);
> > }
> >
> > + mmu_notifier(invalidate_range_begin, mm, start, start + size, 0);
> > err = populate_range(mm, vma, start, size, pgoff);
> > + mmu_notifier(invalidate_range_end, mm, 0);
> > if (!err && !(flags & MAP_NONBLOCK)) {
> > if (unlikely(has_write_lock)) {
> > downgrade_write(&mm->mmap_sem);
>
> This can't be enough for GRU, in fact it can't work for KVM either. You
> got 1) to have some invalidate_page for GRU before freeing the page,
> and 2) to pass start, end to range_end (if you want kvm to use it
> instead of invalidate_page).

The external references are dropped when calling invalidate_range_begin.
This would work for both KVM and the GRU. Why would KVM not be able to
invalidate the range before? The locking convention is that no additional
external reference can be added between invalidate_range_begin and
invalidate_range_end. So KVM is fine too.
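
To illustrate that convention, a driver-side sketch could look like the
following; the struct and function names are invented, and the callback
signatures are assumed to match the call sites in this patch rather than
taken from the posted mmu_notifier.h:

struct example_notifier {
	struct mmu_notifier mn;
	atomic_t range_active;	/* > 0 while a begin..end section is open */
};

static void example_range_begin(struct mmu_notifier *mn, struct mm_struct *mm,
				unsigned long start, unsigned long end,
				int atomic)
{
	struct example_notifier *e = container_of(mn, struct example_notifier, mn);

	atomic_inc(&e->range_active);
	/* drop every external pte/TLB entry covering [start, end) */
}

static void example_range_end(struct mmu_notifier *mn, struct mm_struct *mm,
			      int atomic)
{
	struct example_notifier *e = container_of(mn, struct example_notifier, mn);

	atomic_dec(&e->range_active);
}

/*
 * The driver's fault path only establishes a new external reference
 * while range_active is zero; otherwise it backs off and retries after
 * example_range_end has run.
 */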

> mremap still missing as a whole.

mremap uses do_munmap which calls into unmap_region() that already has
callbacks. So what is wrong there?

2008-01-31 22:01:54

by Christoph Lameter

Subject: mmu_notifier: close hole in fork

Talking to Robin and Jack we found that we still have a hole during fork.
Fork may set a pte to write-protected. At that point the remote ptes are
not marked readonly(!). Remote writes may occur to pages that are marked
readonly locally without this patch.

mmu_notifier: Provide invalidate_range on fork

On fork we change ptes in cow mappings to readonly. This means we must
invalidate the ptes so that they are reestablished later with proper
permission.

Signed-off-by: Christoph Lameter <[email protected]>

---
mm/memory.c | 6 ++++++
1 file changed, 6 insertions(+)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2008-01-31 13:42:35.000000000 -0800
+++ linux-2.6/mm/memory.c 2008-01-31 13:47:31.000000000 -0800
@@ -602,6 +602,9 @@ int copy_page_range(struct mm_struct *ds
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);

+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier(invalidate_range_begin, src_mm, addr, end, 0);
+
dst_pgd = pgd_offset(dst_mm, addr);
src_pgd = pgd_offset(src_mm, addr);
do {
@@ -612,6 +615,9 @@ int copy_page_range(struct mm_struct *ds
vma, addr, next))
return -ENOMEM;
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier(invalidate_range_end, src_mm, 0);
return 0;
}

2008-01-31 22:16:56

by Christoph Lameter

Subject: Re: mmu_notifier: reduce size of mm_struct if !CONFIG_MMU_NOTIFIER

mmu_notifier: Reduce size of mm_struct if !CONFIG_MMU_NOTIFIER

Andrea and Peter had a concern about this.

Use an #ifdef to make the mmu_notifier_head structure empty if we have
no notifier. That allows the use of the structure in inline functions
(which allows parameter verification even if !CONFIG_MMU_NOTIFIER).

Signed-off-by: Christoph Lameter <[email protected]>

---
include/linux/mm_types.h | 2 ++
1 file changed, 2 insertions(+)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h 2008-01-31 14:03:23.000000000 -0800
+++ linux-2.6/include/linux/mm_types.h 2008-01-31 14:03:38.000000000 -0800
@@ -154,7 +154,9 @@ struct vm_area_struct {
};

struct mmu_notifier_head {
+#ifdef CONFIG_MMU_NOTIFIER
struct hlist_head head;
+#endif
};

struct mm_struct {

2008-01-31 22:22:17

by Christoph Lameter

Subject: mmu_notifier: Move mmu_notifier_release up to get rid of the invalidate_all() callback

mmu_notifier: Move mmu_notifier_release up to get rid of invalidate_all()

It seems that it is safe to call mmu_notifier_release() before we tear down
the pages and the vmas since we are the only executing thread. mmu_notifier_release
can then also tear down all its external ptes and thus we can get rid
of the invalidate_all() callback.

During the final teardown no mmu_notifiers are registered anymore, which
will speed up exit processing.

Is this okay for KVM too?

Signed-off-by: Christoph Lameter <[email protected]>

---
include/linux/mmu_notifier.h | 4 ----
mm/mmap.c | 3 +--
2 files changed, 1 insertion(+), 6 deletions(-)

Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- linux-2.6.orig/include/linux/mmu_notifier.h 2008-01-31 14:17:17.000000000 -0800
+++ linux-2.6/include/linux/mmu_notifier.h 2008-01-31 14:17:28.000000000 -0800
@@ -59,10 +59,6 @@ struct mmu_notifier_ops {
void (*release)(struct mmu_notifier *mn,
struct mm_struct *mm);

- /* Dummy needed because the mmu_notifier() macro requires it */
- void (*invalidate_all)(struct mmu_notifier *mn, struct mm_struct *mm,
- int dummy);
-
/*
* age_page is called from contexts where the pte_lock is held
*/
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c 2008-01-31 14:16:51.000000000 -0800
+++ linux-2.6/mm/mmap.c 2008-01-31 14:17:10.000000000 -0800
@@ -2036,7 +2036,7 @@ void exit_mmap(struct mm_struct *mm)
unsigned long end;

/* mm's last user has gone, and its about to be pulled down */
- mmu_notifier(invalidate_all, mm, 0);
+ mmu_notifier_release(mm);
arch_exit_mmap(mm);

lru_add_drain();
@@ -2048,7 +2048,6 @@ void exit_mmap(struct mm_struct *mm)
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
- mmu_notifier_release(mm);

/*
* Walk the list again, actually closing and freeing it,

2008-02-01 00:02:33

by Andrea Arcangeli

Subject: Re: mmu_notifier: close hole in fork

On Thu, Jan 31, 2008 at 02:01:43PM -0800, Christoph Lameter wrote:
> Talking to Robin and Jack we found that we still have a hole during fork.
> Fork may set a pte to write-protected. At that point the remote ptes are
> not marked readonly(!). Remote writes may occur to pages that are marked
> readonly locally without this patch.

Good catch! This was missing also in my #v5 (KVM doesn't need that
because the only possible cows on sptes can be generated by ksm, but
it would have been a problem for GRU). The more I think about it, the
more I think GRU will run so nicely with follow_page taking zero
additional locking and with the nicely scalable PT lock that doesn't
require 1024 threads to thrash on the same lock on 1024 different cpus
if they access addresses that aren't in the same naturally aligned 2M
chunk of virtual address space. I understood follow_page was
performance critical for GRU, if yes, then I hope my _dual_ approach
is by far the best for at least GRU (and KVM of course for the very
same reason), and of course it'll fit XPMEM too the moment you add
invalidate_range_start/end too.

Signed-off-by: Andrea Arcangeli <[email protected]>

diff --git a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -494,6 +494,7 @@ static int copy_pte_range(struct mm_stru
spinlock_t *src_ptl, *dst_ptl;
int progress = 0;
int rss[2];
+ unsigned long start;

again:
rss[1] = rss[0] = 0;
@@ -505,6 +506,7 @@ again:
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
arch_enter_lazy_mmu_mode();

+ start = addr;
do {
/*
* We are holding two locks at this point - either of them
@@ -525,6 +527,8 @@ again:
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

arch_leave_lazy_mmu_mode();
+ if (is_cow_mapping(vma->vm_flags))
+ mmu_notifier(invalidate_pages, vma->vm_mm, start, addr);
spin_unlock(src_ptl);
pte_unmap_nested(src_pte - 1);
add_mm_rss(dst_mm, rss[0], rss[1]);

2008-02-01 00:14:22

by Andrea Arcangeli

Subject: Re: mmu_notifier: Move mmu_notifier_release up to get rid of the invalidate_all() callback

On Thu, Jan 31, 2008 at 02:21:58PM -0800, Christoph Lameter wrote:
> Is this okay for KVM too?

->release isn't implemented at all in KVM, only the list_del generates
complications.

I think current code could be already safe through the mm_count pin,
because KVM relies on the fact that anybody pinning through mm_count like
KVM does, is forbidden to call unregister and it's forced to wait the
auto-disarming when mm_users hits zero, but I feel like something's
still wrong if I think that I'm not using call_rcu to free the
notifier (OTOH we agreed the list had to be frozen and w/o readers
(modulo _release) before _release is called, so if this initial
assumption is ok it seems I may be safe w/o call_rcu?).

But it's really tricky path. Anyway this is the last of my worries
right now, it works perfectly fine with a single user obviously, and
the moment KVM threads run remotely through GRU/XPMEM isn't happening
too soon ;) so let's concentrate on the rest first. I can say
hlist_del_init doesn't seem to provide any benefit given nobody could
possibly decide to call register or unregister after _release run.

2008-02-01 01:48:29

by Christoph Lameter

Subject: Re: mmu_notifier: close hole in fork

On Fri, 1 Feb 2008, Andrea Arcangeli wrote:

> Good catch! This was missing also in my #v5 (KVM doesn't need that
> because the only possible cows on sptes can be generated by ksm, but
> it would have been a problem for GRU). The more I think about it, the

How do you think the GRU should know when to drop the refcount? There is
no page table and thus no way of tracking that a refcount was taken.
Without the refcount you cannot defer the freeing of the page. So
shootdown on invalidate_range_begin and lock out until
invalidate_range_end seems to be the only workable solution.

BTW what do you think about adding a flag parameter to the invalidate
calls that allows shooting down writable ptes only? That could be useful
for COW and page_mkclean.

So

#define MMU_ATOMIC 1
#define MMU_WRITABLE 2

instead of the atomic parameter?
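
Roughly, a caller that only needs to revoke write access (fork/COW,
page_mkclean) could then do something like the sketch below; this is only
an illustration of the proposal using the flags suggested above, nothing
here exists in the posted patches:

static void example_wrprotect_range(struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	/* only writable external ptes need to be shot down */
	mmu_notifier(invalidate_range_begin, mm, start, end, MMU_WRITABLE);
	/* ... clear the write bit in the local ptes ... */
	mmu_notifier(invalidate_range_end, mm, MMU_WRITABLE);
}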


2008-02-01 01:52:21

by Christoph Lameter

Subject: Re: mmu_notifier: Move mmu_notifier_release up to get rid of the invalidate_all() callback

On Fri, 1 Feb 2008, Andrea Arcangeli wrote:

> On Thu, Jan 31, 2008 at 02:21:58PM -0800, Christoph Lameter wrote:
> > Is this okay for KVM too?
>
> ->release isn't implemented at all in KVM, only the list_del generates
> complications.

Why would the list_del generate problems?

> I think current code could be already safe through the mm_count pin,
> becasue KVM relies on the fact anybody pinning through mm_count like
> KVM does, is forbidden to call unregister and it's forced to wait the
> auto-disarming when mm_users hits zero, but I feel like something's
> still wrong if I think that I'm not using call_rcu to free the
> notifier (OTOH we agreed the list had to be frozen and w/o readers
> (modulo _release) before _release is called, so if this initial
> assumption is ok it seems I may be safe w/o call_rcu?).

You could pin via mm_users? Then it would be entirely safe and no need for
rcu tricks?

OTOH if there are mm_count users like in KVM: Could we guarantee that
they do not perform any operations with the mmu notifier list? Then we
would be safe as well.

> too soon ;) so let's concentrate on the rest first. I can say
> hlist_del_init doesn't seem to provide any benefit given nobody could
> possibly decide to call register or unregister after _release run.

It is useful if a device driver has a list of data segments that contain
struct mmu_notifiers. The device driver can inspect the mmu_notifier and
reliably conclude that the beast is inactive.
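
For illustration (the segment structure is hypothetical, and the hlist
member name is assumed from the notifier list handling in this series):

struct example_segment {
	struct mmu_notifier mn;
	/* ... driver-private data ... */
};

static int example_segment_is_active(struct example_segment *seg)
{
	/*
	 * hlist_del_init() in _release leaves the node unhashed, so an
	 * already torn down notifier can be detected reliably.
	 */
	return !hlist_unhashed(&seg->mn.hlist);
}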

2008-02-01 01:57:40

by Christoph Lameter

Subject: mmu_notifier: invalidate_range for move_page_tables

mmu_notifier: Provide invalidate_range for move_page_tables

Move page tables also needs to invalidate the external references
and hold new references off while moving page table entries.

This is already guaranteed by holding a writelock
on mmap_sem for get_user_pages() but follow_page() is not subject to
the mmap_sem locking.

Signed-off-by: Christoph Lameter <[email protected]>

---
mm/mremap.c | 4 ++++
1 file changed, 4 insertions(+)

Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c 2008-01-25 14:25:31.000000000 -0800
+++ linux-2.6/mm/mremap.c 2008-01-31 17:54:19.000000000 -0800
@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/mmu_notifier.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -130,6 +131,8 @@ unsigned long move_page_tables(struct vm
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);

+ mmu_notifier(invalidate_range_begin, vma->vm_mm,
+ old_addr, old_addr + len, 0);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
next = (old_addr + PMD_SIZE) & PMD_MASK;
@@ -150,6 +153,7 @@ unsigned long move_page_tables(struct vm
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
new_vma, new_pmd, new_addr);
}
+ mmu_notifier(invalidate_range_end, vma->vm_mm, 0);

return len + old_addr - old_end; /* how much done */
}

2008-02-01 02:38:25

by Robin Holt

Subject: Re: mmu_notifier: invalidate_range for move_page_tables

On Thu, Jan 31, 2008 at 05:57:25PM -0800, Christoph Lameter wrote:
> Move page tables also needs to invalidate the external references
> and hold new references off while moving page table entries.

I must admit to not having spent any time thinking about this, but aren't
we moving the entries from one set of page tables to the other, leaving
the pte_t entries unchanged. I guess I should go look, but could you
provide a quick pointer in the proper direction as to why we need to
recall externals when the before and after look of these page tables
will have the same information for the TLBs.

Thanks,
Robin

2008-02-01 02:42:11

by Christoph Lameter

Subject: Re: mmu_notifier: invalidate_range for move_page_tables

On Thu, 31 Jan 2008, Robin Holt wrote:

> On Thu, Jan 31, 2008 at 05:57:25PM -0800, Christoph Lameter wrote:
> > Move page tables also needs to invalidate the external references
> > and hold new references off while moving page table entries.
>
> I must admit to not having spent any time thinking about this, but aren't
> we moving the entries from one set of page tables to the other, leaving
> the pte_t entries unchanged. I guess I should go look, but could you
> provide a quick pointer in the proper direction as to why we need to
> recall externals when the before and after look of these page tables
> will have the same information for the TLBs.

mremap changes the address of pages in a process. The pages appear at
another address. Thus the external ptes will have the wrong information if
not invalidated.

Do a

man mremap

2008-02-01 04:24:22

by Robin Holt

Subject: Re: [patch 2/3] mmu_notifier: Callbacks to invalidate address ranges

> Index: linux-2.6/mm/memory.c
...
> @@ -1668,6 +1678,7 @@ gotten:
> page_cache_release(old_page);
> unlock:
> pte_unmap_unlock(page_table, ptl);
> + mmu_notifier(invalidate_range_end, mm, 0);

I think we can get an _end call without the _begin call before it.

Thanks,
Robin

2008-02-01 05:01:11

by Christoph Lameter

Subject: Re: [patch 2/3] mmu_notifier: Callbacks to invalidate address ranges

On Thu, 31 Jan 2008, Robin Holt wrote:

> > Index: linux-2.6/mm/memory.c
> ...
> > @@ -1668,6 +1678,7 @@ gotten:
> > page_cache_release(old_page);
> > unlock:
> > pte_unmap_unlock(page_table, ptl);
> > + mmu_notifier(invalidate_range_end, mm, 0);
>
> I think we can get an _end call without the _begin call before it.

If that would be true then also the pte would have been left locked.

We always hit unlock. Maybe I just do not see it?

2008-02-01 10:32:35

by Robin Holt

Subject: Re: [patch 2/3] mmu_notifier: Callbacks to invalidate address ranges

On Thu, Jan 31, 2008 at 08:43:58PM -0800, Christoph Lameter wrote:
> On Thu, 31 Jan 2008, Robin Holt wrote:
>
> > > Index: linux-2.6/mm/memory.c
> > ...
> > > @@ -1668,6 +1678,7 @@ gotten:
> > > page_cache_release(old_page);
> > > unlock:
> > > pte_unmap_unlock(page_table, ptl);
> > > + mmu_notifier(invalidate_range_end, mm, 0);
> >
> > I think we can get an _end call without the _begin call before it.
>
> If that would be true then also the pte would have been left locked.
>
> We always hit unlock. Maybe I just do not see it?

Maybe I haven't looked closely enough, but let's start with some common
assumptions. Looking at do_wp_page from 2.6.24 (I believe that is what
my work area is based upon). On line 1559, the function begins being
declared.

On lines 1614 and 1630, we do "goto unlock" where the _end callout is
soon made. The _begin callout does not come until after those branches
have been taken (occurs on line 1648).

Thanks,
Robin

2008-02-01 10:37:47

by Robin Holt

Subject: Re: [patch 2/3] mmu_notifier: Callbacks to invalidate address ranges

On Fri, Feb 01, 2008 at 04:32:21AM -0600, Robin Holt wrote:
> On Thu, Jan 31, 2008 at 08:43:58PM -0800, Christoph Lameter wrote:
> > On Thu, 31 Jan 2008, Robin Holt wrote:
> >
> > > > Index: linux-2.6/mm/memory.c
> > > ...
> > > > @@ -1668,6 +1678,7 @@ gotten:
> > > > page_cache_release(old_page);
> > > > unlock:
> > > > pte_unmap_unlock(page_table, ptl);
> > > > + mmu_notifier(invalidate_range_end, mm, 0);
> > >
> > > I think we can get an _end call without the _begin call before it.
> >
> > If that would be true then also the pte would have been left locked.
> >
> > We always hit unlock. Maybe I just do not see it?
>
> Maybe I haven't looked closely enough, but let's start with some common
> assumptions. Looking at do_wp_page from 2.6.24 (I believe that is what
> my work area is based upon). On line 1559, the function begins being
> declared.
>
> On lines 1614 and 1630, we do "goto unlock" where the _end callout is
> soon made. The _begin callout does not come until after those branches
> have been taken (occurs on line 1648).
>
> Thanks,
> Robin

Ignore this thread, I am going to throw a patch against the new version.

Thanks,
Robin

2008-02-01 19:13:20

by Christoph Lameter

Subject: Re: [patch 2/3] mmu_notifier: Callbacks to invalidate address ranges

On Fri, 1 Feb 2008, Robin Holt wrote:

> Maybe I haven't looked closely enough, but let's start with some common
> assumptions. Looking at do_wp_page from 2.6.24 (I believe that is what
> my work area is based upon). On line 1559, the function begins being
> declared.

Aah I looked at the wrong file.

> On lines 1614 and 1630, we do "goto unlock" where the _end callout is
> soon made. The _begin callout does not come until after those branches
> have been taken (occurs on line 1648).

There are actually two cases...

---
mm/memory.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c 2008-02-01 11:04:21.000000000 -0800
+++ linux-2.6/mm/memory.c 2008-02-01 11:12:12.000000000 -0800
@@ -1611,8 +1611,10 @@ static int do_wp_page(struct mm_struct *
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
page_cache_release(old_page);
- if (!pte_same(*page_table, orig_pte))
- goto unlock;
+ if (!pte_same(*page_table, orig_pte)) {
+ pte_unmap_unlock(page_table, ptl);
+ goto check_dirty;
+ }

page_mkwrite = 1;
}
@@ -1628,7 +1630,8 @@ static int do_wp_page(struct mm_struct *
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, entry);
ret |= VM_FAULT_WRITE;
- goto unlock;
+ pte_unmap_unlock(page_table, ptl);
+ goto check_dirty;
}

/*
@@ -1684,10 +1687,10 @@ gotten:
page_cache_release(new_page);
if (old_page)
page_cache_release(old_page);
-unlock:
pte_unmap_unlock(page_table, ptl);
mmu_notifier(invalidate_range_end, mm,
address, address + PAGE_SIZE, 0);
+check_dirty:
if (dirty_page) {
if (vma->vm_file)
file_update_time(vma->vm_file);