Core code for mmu notifiers.
Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Andrea Arcangeli <[email protected]>
---
include/linux/mm_types.h | 8 ++
include/linux/mmu_notifier.h | 152 +++++++++++++++++++++++++++++++++++++++++++
include/linux/page-flags.h | 9 ++
kernel/fork.c | 2
mm/Kconfig | 4 +
mm/Makefile | 1
mm/mmap.c | 2
mm/mmu_notifier.c | 91 +++++++++++++++++++++++++
8 files changed, 269 insertions(+)
Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h 2008-01-24 20:59:17.000000000 -0800
+++ linux-2.6/include/linux/mm_types.h 2008-01-24 20:59:19.000000000 -0800
@@ -153,6 +153,10 @@ struct vm_area_struct {
#endif
};
+struct mmu_notifier_head {
+ struct hlist_head head;
+};
+
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
@@ -219,6 +223,10 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+#ifdef CONFIG_MMU_NOTIFIER
+ struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
+#endif
};
#endif /* _LINUX_MM_TYPES_H */
Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/mmu_notifier.h 2008-01-24 20:59:19.000000000 -0800
@@ -0,0 +1,152 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+/*
+ * MMU notifier
+ *
+ * Notifier functions for hardware and software that establish external
+ * references to pages of a Linux system. The notifier calls ensure that
+ * the external mappings are removed when the Linux VM removes memory ranges
+ * or individual pages from a process.
+ *
+ * These fall into two classes
+ *
+ * 1. mmu_notifier
+ *
+ * These are callbacks registered with an mm_struct. If mappings are
+ * removed from an address space then callbacks are performed.
+ * Spinlocks must be held in order to walk the reverse maps and the
+ * notifications are performed while the spinlock is held.
+ *
+ *
+ * 2. mmu_rmap_notifier
+ *
+ * Callbacks for subsystems that provide their own rmaps. These
+ * need to walk their own rmaps for a page. The invalidate_page
+ * callback is outside of locks so that we are not in a strictly
+ * atomic context (but we may be in a PF_MEMALLOC context if the
+ * notifier is called from reclaim code) and are able to sleep.
+ * Rmap notifiers need an extra page bit and are only available
+ * on 64 bit platforms. It is up to the subsystem to mark pages
+ * as PageExternalRmap as needed to trigger the callbacks. Pages
+ * must be marked dirty if dirty bits are set in the external
+ * pte.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/mm_types.h>
+
+struct mmu_notifier_ops;
+
+struct mmu_notifier {
+ struct hlist_node hlist;
+ const struct mmu_notifier_ops *ops;
+};
+
+struct mmu_notifier_ops {
+ void (*release)(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+ int (*age_page)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+ void (*invalidate_page)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+ void (*invalidate_range)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end);
+};
+
+struct mmu_rmap_notifier_ops;
+
+struct mmu_rmap_notifier {
+ struct hlist_node hlist;
+ const struct mmu_rmap_notifier_ops *ops;
+};
+
+struct mmu_rmap_notifier_ops {
+ /*
+ * Called with the page lock held after ptes are modified or removed
+ * so that a subsystem with its own rmaps can remove remote ptes
+ * mapping a page.
+ */
+ void (*invalidate_page)(struct mmu_rmap_notifier *mrn, struct page *page);
+};
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+extern void mmu_notifier_register(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+extern void mmu_notifier_unregister(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+extern void mmu_notifier_release(struct mm_struct *mm);
+extern int mmu_notifier_age_page(struct mm_struct *mm,
+ unsigned long address);
+
+static inline void mmu_notifier_head_init(struct mmu_notifier_head *mnh)
+{
+ INIT_HLIST_HEAD(&mnh->head);
+}
+
+#define mmu_notifier(function, mm, args...) \
+ do { \
+ struct mmu_notifier *__mn; \
+ struct hlist_node *__n; \
+ \
+ if (unlikely(!hlist_empty(&(mm)->mmu_notifier.head))) { \
+ rcu_read_lock(); \
+ hlist_for_each_entry_rcu(__mn, __n, \
+ &(mm)->mmu_notifier.head, \
+ hlist) \
+ if (__mn->ops->function) \
+ __mn->ops->function(__mn, \
+ mm, \
+ args); \
+ rcu_read_unlock(); \
+ } \
+ } while (0)
+
+extern void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn);
+extern void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn);
+
+extern struct hlist_head mmu_rmap_notifier_list;
+
+#define mmu_rmap_notifier(function, args...) \
+ do { \
+ struct mmu_rmap_notifier *__mrn; \
+ struct hlist_node *__n; \
+ \
+ rcu_read_lock(); \
+ hlist_for_each_entry_rcu(__mrn, __n, &mmu_rmap_notifier_list, \
+ hlist) \
+ if (__mrn->ops->function) \
+ __mrn->ops->function(__mrn, args); \
+ rcu_read_unlock(); \
+ } while (0);
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+#define mmu_notifier(function, mm, args...) do { } while (0)
+#define mmu_rmap_notifier(function, args...) do { } while (0)
+
+static inline void mmu_notifier_register(struct mmu_notifier *mn,
+ struct mm_struct *mm) {}
+static inline void mmu_notifier_unregister(struct mmu_notifier *mn,
+ struct mm_struct *mm) {}
+static inline void mmu_notifier_release(struct mm_struct *mm) {}
+static inline void mmu_notifier_age(struct mm_struct *mm,
+ unsigned long address)
+{
+ return 0;
+}
+
+static inline void mmu_notifier_head_init(struct mmu_notifier_head *mmh) {}
+
+static inline void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn) {}
+static inline void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn) {}
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h 2008-01-24 20:59:17.000000000 -0800
+++ linux-2.6/include/linux/page-flags.h 2008-01-24 20:59:19.000000000 -0800
@@ -105,6 +105,7 @@
* 64 bit | FIELDS | ?????? FLAGS |
* 63 32 0
*/
+#define PG_external_rmap 30 /* Page has external rmap */
#define PG_uncached 31 /* Page has been mapped as uncached */
#endif
@@ -260,6 +261,14 @@ static inline void __ClearPageTail(struc
#define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
#define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
+#if defined(CONFIG_MMU_NOTIFIER) && defined(CONFIG_64BIT)
+#define PageExternalRmap(page) test_bit(PG_external_rmap, &(page)->flags)
+#define SetPageExternalRmap(page) set_bit(PG_external_rmap, &(page)->flags)
+#define ClearPageExternalRmap(page) clear_bit(PG_external_rmap, &(page)->flags)
+#else
+#define PageExternalRmap(page) 0
+#endif
+
struct page; /* forward declaration */
extern void cancel_dirty_page(struct page *page, unsigned int account_size);
Index: linux-2.6/mm/Kconfig
===================================================================
--- linux-2.6.orig/mm/Kconfig 2008-01-24 20:59:17.000000000 -0800
+++ linux-2.6/mm/Kconfig 2008-01-24 20:59:19.000000000 -0800
@@ -193,3 +193,7 @@ config NR_QUICK
config VIRT_TO_BUS
def_bool y
depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+ def_bool y
+ bool "MMU notifier, for paging KVM/RDMA"
Index: linux-2.6/mm/Makefile
===================================================================
--- linux-2.6.orig/mm/Makefile 2008-01-24 20:59:17.000000000 -0800
+++ linux-2.6/mm/Makefile 2008-01-24 20:59:19.000000000 -0800
@@ -30,4 +30,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
Index: linux-2.6/mm/mmu_notifier.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/mm/mmu_notifier.c 2008-01-24 20:59:19.000000000 -0800
@@ -0,0 +1,91 @@
+/*
+ * linux/mm/mmu_notifier.c
+ *
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright (C) 2008 SGI
+ * Christoph Lameter <[email protected]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+
+void mmu_notifier_release(struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n,
+ &mm->mmu_notifier.head, hlist) {
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ hlist_del(&mn->hlist);
+ }
+ rcu_read_unlock();
+ }
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->age_page can
+ * unmap the address and return 1 or 0 depending if the mapping previously
+ * existed or not.
+ */
+int mmu_notifier_age_page(struct mm_struct *mm, unsigned long address)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+ int young = 0;
+
+ if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n,
+ &mm->mmu_notifier.head, hlist) {
+ if (mn->ops->age_page)
+ young |= mn->ops->age_page(mn, mm, address);
+ }
+ rcu_read_unlock();
+ }
+
+ return young;
+}
+
+static DEFINE_SPINLOCK(mmu_notifier_list_lock);
+
+void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ spin_lock(&mmu_notifier_list_lock);
+ hlist_add_head(&mn->hlist, &mm->mmu_notifier.head);
+ spin_unlock(&mmu_notifier_list_lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ spin_lock(&mmu_notifier_list_lock);
+ hlist_del(&mn->hlist);
+ spin_unlock(&mmu_notifier_list_lock);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+HLIST_HEAD(mmu_rmap_notifier_list);
+
+void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn)
+{
+ spin_lock(&mmu_notifier_list_lock);
+ hlist_add_head_rcu(&mrn->hlist, &mmu_rmap_notifier_list);
+ spin_unlock(&mmu_notifier_list_lock);
+}
+EXPORT_SYMBOL(mmu_rmap_notifier_register);
+
+void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn)
+{
+ spin_lock(&mmu_notifier_list_lock);
+ hlist_del_rcu(&mrn->hlist);
+ spin_unlock(&mmu_notifier_list_lock);
+}
+EXPORT_SYMBOL(mmu_rmap_notifier_unregister);
+
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c 2008-01-24 20:59:17.000000000 -0800
+++ linux-2.6/kernel/fork.c 2008-01-24 20:59:19.000000000 -0800
@@ -51,6 +51,7 @@
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/proc_fs.h>
+#include <linux/mmu_notifier.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -359,6 +360,7 @@ static struct mm_struct * mm_init(struct
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
+ mmu_notifier_head_init(&mm->mmu_notifier);
return mm;
}
free_mm(mm);
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c 2008-01-24 20:59:17.000000000 -0800
+++ linux-2.6/mm/mmap.c 2008-01-24 20:59:19.000000000 -0800
@@ -26,6 +26,7 @@
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -2043,6 +2044,7 @@ void exit_mmap(struct mm_struct *mm)
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
tlb_finish_mmu(tlb, 0, end);
+ mmu_notifier_release(mm);
/*
* Walk the list again, actually closing and freeing it,
--
> +#define mmu_notifier(function, mm, args...) \
...
> + if (__mn->ops->function) \
> + __mn->ops->function(__mn, \
> + mm, \
> + args); \
__mn->ops->function(__mn, mm, args); \
I realize it is a minor nit, but since we put the continuation in column
81 in the next define, can we do the same here and make this more
readable?
> + rcu_read_unlock(); \
...
> +#define mmu_rmap_notifier(function, args...) \
> + do { \
> + struct mmu_rmap_notifier *__mrn; \
> + struct hlist_node *__n; \
> + \
> +void mmu_notifier_release(struct mm_struct *mm)
> +{
> + struct mmu_notifier *mn;
> + struct hlist_node *n;
> +
> + if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
> + rcu_read_lock();
> + hlist_for_each_entry_rcu(mn, n,
> + &mm->mmu_notifier.head, hlist) {
> + if (mn->ops->release)
> + mn->ops->release(mn, mm);
> + hlist_del(&mn->hlist);
I think the hlist_del needs to be before the function callout so we can free
the structure without a use-after-free issue.
hlist_for_each_entry_rcu(mn, n,
&mm->mmu_notifier.head, hlist) {
hlist_del_rcu(&mn->hlist);
if (mn->ops->release)
mn->ops->release(mn, mm);
> +static DEFINE_SPINLOCK(mmu_notifier_list_lock);
Remove
> +
> +void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> + spin_lock(&mmu_notifier_list_lock);
Shouldn't this really be protected by the down_write(mmap_sem)? Maybe:
BUG_ON(!rwsem_is_write_locked(&mm->mmap_sem));
> + hlist_add_head(&mn->hlist, &mm->mmu_notifier.head);
hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier.head);
> + spin_unlock(&mmu_notifier_list_lock);
> +}
> +EXPORT_SYMBOL_GPL(mmu_notifier_register);
> +
> +void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> + spin_lock(&mmu_notifier_list_lock);
> + hlist_del(&mn->hlist);
hlist_del_rcu? Ditto on the lock.
> + spin_unlock(&mmu_notifier_list_lock);
> +}
> +EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
> +
> +HLIST_HEAD(mmu_rmap_notifier_list);
static DEFINE_SPINLOCK(mmu_rmap_notifier_list_lock);
> +
> +void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn)
> +{
> + spin_lock(&mmu_notifier_list_lock);
> + hlist_add_head_rcu(&mrn->hlist, &mmu_rmap_notifier_list);
> + spin_unlock(&mmu_notifier_list_lock);
spin_lock(&mmu_rmap_notifier_list_lock);
hlist_add_head_rcu(&mrn->hlist, &mmu_rmap_notifier_list);
spin_unlock(&mmu_rmap_notifier_list_lock);
> +}
> +EXPORT_SYMBOL(mmu_rmap_notifier_register);
> +
> +void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn)
> +{
> + spin_lock(&mmu_notifier_list_lock);
> + hlist_del_rcu(&mrn->hlist);
> + spin_unlock(&mmu_notifier_list_lock);
spin_lock(&mmu_rmap_notifier_list_lock);
hlist_del_rcu(&mrn->hlist);
spin_unlock(&mmu_rmap_notifier_list_lock);
> @@ -2043,6 +2044,7 @@ void exit_mmap(struct mm_struct *mm)
> vm_unacct_memory(nr_accounted);
> free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
> tlb_finish_mmu(tlb, 0, end);
> + mmu_notifier_release(mm);
Can we consider moving this notifier, introducing an additional notifier
in the release path, or adding a flag to this one indicating early/late?
The GRU that Jack is concerned with would benefit from the early in
that it could just invalidate the GRU context and immediately all GRU
TLB entries are invalid. I believe Jack would like to also be able to
remove his entry from the mmu_notifier list in an effort to avoid the
page and range callouts.
XPMEM would also benefit from an early call. We could mark all the
segments as being torn down and start the recalls. We already have
this code in and working (have since it was first written 6 years ago).
In this case, all segments are torn down with a single message to each
of the importing partitions. In contrast, the teardown code which would
happen now would be one set of messages for each vma.
Thanks,
Robin
On Fri, 25 Jan 2008, Robin Holt wrote:
> I realize it is a minor nit, but since we put the continuation in column
> 81 in the next define, can we do the same here and make this more
> readable?
We need to fix the next define to not use column 81.
Found a couple of more 80 column infractions. Will be fixed in next
release.
> > +void mmu_notifier_release(struct mm_struct *mm)
> > +{
> > + struct mmu_notifier *mn;
> > + struct hlist_node *n;
> > +
> > + if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
> > + rcu_read_lock();
> > + hlist_for_each_entry_rcu(mn, n,
> > + &mm->mmu_notifier.head, hlist) {
> > + if (mn->ops->release)
> > + mn->ops->release(mn, mm);
> > + hlist_del(&mn->hlist);
>
> I think the hlist_del needs to be before the function callout so we can free
> the structure without a use-after-free issue.
The list head is in the mm_struct. This will be freed later.
> > +void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
> > +{
> > + spin_lock(&mmu_notifier_list_lock);
>
> Shouldn't this really be protected by the down_write(mmap_sem)? Maybe:
Ok. We could switch this to mmap_sem protection for the mm_struct but the
rmap notifier is not associated with an mm_struct. So we would need to
keep it there. Since we already have a spinlock: Just use it for both to
avoid further complications.
> > + spin_lock(&mmu_notifier_list_lock);
> > + hlist_del(&mn->hlist);
>
> hlist_del_rcu? Ditto on the lock.
Peter already mentioned that and I have posted patches that address this
issue.
> > @@ -2043,6 +2044,7 @@ void exit_mmap(struct mm_struct *mm)
> > vm_unacct_memory(nr_accounted);
> > free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
> > tlb_finish_mmu(tlb, 0, end);
> > + mmu_notifier_release(mm);
>
> Can we consider moving this notifier or introducing an additional notifier
> in the release or a flag to this one indicating early/late.
There is only one call right now?
> The GRU that Jack is concerned with would benefit from the early in
> that it could just invalidate the GRU context and immediately all GRU
> TLB entries are invalid. I believe Jack would like to also be able to
> remove his entry from the mmu_notifier list in an effort to avoid the
> page and range callouts.
The TLB entries are removed by earlier invalidate_range calls. I would
think that no TLBs are left at this point. It's simply a matter of
releasing any still allocated resources through this callback.
> XPMEM, would also benefit from a call early. We could make all the
> segments as being torn down and start the recalls. We already have
> this code in and working (have since it was first written 6 years ago).
> In this case, all segments are torn down with a single message to each
> of the importing partitions. In contrast, the teardown code which would
> happen now would be one set of messages for each vma.
So we need an additional global teardown call? Then we'd need to switch
off the vma based invalidate_range()?
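To make the "releasing any still allocated resources" point concrete: since other CPUs may still be walking the notifier list under rcu_read_lock(), a driver that embeds its mmu_notifier in a larger structure cannot simply kfree() it from release(). A hedged sketch of one way to do it (hypothetical names; the call_rcu requirement is what a header comment later in this thread spells out):

#include <linux/mmu_notifier.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_ctx {
	struct mmu_notifier mn;
	struct rcu_head rcu;
};

static void example_free_ctx(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct example_ctx, rcu));
}

static void example_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct example_ctx *ctx = container_of(mn, struct example_ctx, mn);

	/* Tear down whatever external state is left for this mm, then
	 * free the embedding structure only after an RCU grace period. */
	call_rcu(&ctx->rcu, example_free_ctx);
}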
On Fri, Jan 25, 2008 at 10:47:04AM -0800, Christoph Lameter wrote:
> On Fri, 25 Jan 2008, Robin Holt wrote:
>
> > I realize it is a minor nit, but since we put the continuation in column
> > 81 in the next define, can we do the same here and make this more
> > readable?
>
> We need to fix the next define to not use column 81.
> Found a couple of more 80 column infractions. Will be fixed in next
> release.
>
> > > +void mmu_notifier_release(struct mm_struct *mm)
> > > +{
> > > + struct mmu_notifier *mn;
> > > + struct hlist_node *n;
> > > +
> > > + if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
> > > + rcu_read_lock();
> > > + hlist_for_each_entry_rcu(mn, n,
> > > + &mm->mmu_notifier.head, hlist) {
> > > + if (mn->ops->release)
> > > + mn->ops->release(mn, mm);
> > > + hlist_del(&mn->hlist);
> >
> > I think the hlist_del needs to be before the function callout so we can free
> > the structure without a use-after-free issue.
>
> The list head is in the mm_struct. This will be freed later.
>
I meant the structure pointed to by &mn. I assume it is intended that
structure be kmalloc'd as part of a larger structure. The driver is the
entity which created that structure and should be the one to free it.
> > > +void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
> > > +{
> > > + spin_lock(&mmu_notifier_list_lock);
> >
> > Shouldn't this really be protected by the down_write(mmap_sem)? Maybe:
>
> Ok. We could switch this to mmap_sem protection for the mm_struct but the
> rmap notifier is not associated with an mm_struct. So we would need to
> keep it there. Since we already have a spinlock: Just use it for both to
> avoid further complications.
But now you are putting a global lock in where it is inappropriate.
>
> > > + spin_lock(&mmu_notifier_list_lock);
> > > + hlist_del(&mn->hlist);
> >
> > hlist_del_rcu? Ditto on the lock.
>
> Peter already mentioned that and I have posted patches that address this
> issue.
>
> > > @@ -2043,6 +2044,7 @@ void exit_mmap(struct mm_struct *mm)
> > > vm_unacct_memory(nr_accounted);
> > > free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
> > > tlb_finish_mmu(tlb, 0, end);
> > > + mmu_notifier_release(mm);
> >
> > Can we consider moving this notifier or introducing an additional notifier
> > in the release or a flag to this one indicating early/late.
>
> There is only one call right now?
>
> > The GRU that Jack is concerned with would benefit from the early in
> > that it could just invalidate the GRU context and immediately all GRU
> > TLB entries are invalid. I believe Jack would like to also be able to
> > remove his entry from the mmu_notifier list in an effort to avoid the
> > page and range callouts.
>
> The TLB entries are removed by earlier invalidate_range calls. I would
> think that no TLBs are left at this point. Its simply a matter of
> releasing any still allocated resources through this callback.
What I was asking for is a way to avoid those numerous callouts for
drivers that can do early cleanup.
>
> > XPMEM, would also benefit from a call early. We could make all the
> > segments as being torn down and start the recalls. We already have
> > this code in and working (have since it was first written 6 years ago).
> > In this case, all segments are torn down with a single message to each
> > of the importing partitions. In contrast, the teardown code which would
> > happen now would be one set of messages for each vma.
>
> So we need an additional global teardown call? Then we'd need to switch
> off the vma based invalidate_range()?
No, EXACTLY what I originally was asking for, either move this call site
up, introduce an additional mmu_notifier op, or place this one in two
locations with a flag indicating which call is being made.
Thanks,
Robin
On Fri, 25 Jan 2008, Robin Holt wrote:
> > > > +void mmu_notifier_release(struct mm_struct *mm)
> > > > +{
> > > > + struct mmu_notifier *mn;
> > > > + struct hlist_node *n;
> > > > +
> > > > + if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
> > > > + rcu_read_lock();
> > > > + hlist_for_each_entry_rcu(mn, n,
> > > > + &mm->mmu_notifier.head, hlist) {
> > > > + if (mn->ops->release)
> > > > + mn->ops->release(mn, mm);
> > > > + hlist_del(&mn->hlist);
> > >
> > > I think the hlist_del needs to be before the function callout so we can free
> > > the structure without a use-after-free issue.
> >
> > The list head is in the mm_struct. This will be freed later.
> >
>
> I meant the structure pointed to by &mn. I assume it is intended that
> structure be kmalloc'd as part of a larger structure. The driver is the
> entity which created that structure and should be the one to free it.
mn will be pointing to the list head in the mm_struct, one after the other.
You mean the ops structure?
> > > > +void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
> > > > +{
> > > > + spin_lock(&mmu_notifier_list_lock);
> > >
> > > Shouldn't this really be protected by the down_write(mmap_sem)? Maybe:
> >
> > Ok. We could switch this to mmap_sem protection for the mm_struct but the
> > rmap notifier is not associated with an mm_struct. So we would need to
> > keep it there. Since we already have a spinlock: Just use it for both to
> > avoid further complications.
>
> But now you are putting a global lock in where it is inappropriate.
The lock is only used during register and unregister. Very low level
usage.
> > > XPMEM, would also benefit from a call early. We could make all the
> > > segments as being torn down and start the recalls. We already have
> > > this code in and working (have since it was first written 6 years ago).
> > > In this case, all segments are torn down with a single message to each
> > > of the importing partitions. In contrast, the teardown code which would
> > > happen now would be one set of messages for each vma.
> >
> > So we need an additional global teardown call? Then we'd need to switch
> > off the vma based invalidate_range()?
>
> No, EXACTLY what I originally was asking for, either move this call site
> up, introduce an additional mmu_notifier op, or place this one in two
> locations with a flag indicating which call is being made.
Add a new invalidate_all() call? Then on exit we do
1. invalidate_all()
2. invalidate_range() for each vma
3. release()
We cannot simply move the call up because there will be future range
callbacks on vma invalidation.
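To sketch what Robin's drivers might do with such a hypothetical invalidate_all() hook (the op does not exist in the posted patch; all names are illustrative): do the bulk teardown once, and let the per-vma range callbacks that still follow fall through.

#include <linux/mmu_notifier.h>

struct example_ctx {
	struct mmu_notifier mn;
	int dead;		/* whole address space already torn down */
};

static void example_invalidate_all(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct example_ctx *ctx = container_of(mn, struct example_ctx, mn);

	/* One bulk teardown message to the importing side instead of one
	 * message per vma. */
	ctx->dead = 1;
}

static void example_invalidate_range(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start, unsigned long end)
{
	struct example_ctx *ctx = container_of(mn, struct example_ctx, mn);

	if (ctx->dead)
		return;		/* already handled by invalidate_all() */

	/* ... per-range invalidation ... */
}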
On Fri, Jan 25, 2008 at 11:03:07AM -0800, Christoph Lameter wrote:
> > > > Shouldn't this really be protected by the down_write(mmap_sem)? Maybe:
> > >
> > > Ok. We could switch this to mmap_sem protection for the mm_struct but the
> > > rmap notifier is not associated with an mm_struct. So we would need to
> > > keep it there. Since we already have a spinlock: Just use it for both to
> > > avoid further complications.
> >
> > But now you are putting a global lock in where it is inappropriate.
>
> The lock is only used during register and unregister. Very low level
> usage.
Seems to me that is the same argument used for lock_kernel. I am saying
we have a perfectly reasonable way to separate the protections down to
their smallest. For the things hanging off the mm, mmap_sem, for the
other list, a list specific lock.
Keep in mind that on a 2048p SSI MPI job starting up, we have 2048 ranks
doing this at the same time 6 times within their address range. That
seems like a lock which could get hot fairly quickly. It may be for a
short period during startup and shutdown, but it is there.
>
> > > > XPMEM, would also benefit from a call early. We could make all the
> > > > segments as being torn down and start the recalls. We already have
> > > > this code in and working (have since it was first written 6 years ago).
> > > > In this case, all segments are torn down with a single message to each
> > > > of the importing partitions. In contrast, the teardown code which would
> > > > happen now would be one set of messages for each vma.
> > >
> > > So we need an additional global teardown call? Then we'd need to switch
> > > off the vma based invalidate_range()?
> >
> > No, EXACTLY what I originally was asking for, either move this call site
> > up, introduce an additional mmu_notifier op, or place this one in two
> > locations with a flag indicating which call is being made.
>
> Add a new invalidate_all() call? Then on exit we do
>
> 1. invalidate_all()
That will be fine as long as we can unregister the ops notifier and free
the structure. Otherwise, we end up being called needlessly.
>
> 2. invalidate_range() for each vma
>
> 3. release()
>
> We cannot simply move the call up because there will be future range
> callbacks on vma invalidation.
I am not sure what this means. Right now, if you were to notify XPMEM
the process is exiting, we would take care of all the recalling of pages
exported by this process, clearing those pages cache lines from cache,
and raising memory protections. I would assume that moving the callout
earlier would expect the same of every driver.
Thanks,
Robin
On Fri, 25 Jan 2008, Robin Holt wrote:
> Keep in mind that on a 2048p SSI MPI job starting up, we have 2048 ranks
> doing this at the same time 6 times withing their address range. That
> seems like a lock which could get hot fairly quickly. It may be for a
> short period during startup and shutdown, but it is there.
Ok. I guess we need to have a __register_mmu_notifier that expects the
mmap_sem to be held then?
> > 1. invalidate_all()
>
> That will be fine as long as we can unregister the ops notifier and free
> the structure. Otherwise, we end up being called needlessly.
No you cannot do that because there are still callbacks that come later.
The invalidate_all may lead to invalidate_range() doing nothing for this
mm. The ops notifier and the freeing of the structure has to wait until
release().
> > 2. invalidate_range() for each vma
> >
> > 3. release()
> >
> > We cannot simply move the call up because there will be future range
> > callbacks on vma invalidation.
>
> I am not sure what this means. Right now, if you were to notify XPMEM
> the process is exiting, we would take care of all the recalling of pages
> exported by this process, clearing those pages cache lines from cache,
> and raising memory protections. I would assume that moving the callout
> earlier would expect the same of every driver.
That does not sync with the current scheme of the invalidate_range()
hooks. We would have to do a global invalidate early and then place the
other invalidate_range hooks in such a way that none is called later in
process exit handling.
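As an aside on the __register variant discussed above: the split would let a caller that already holds mmap_sem for writing avoid taking it twice. A purely hypothetical caller (sketch only; the locked helper itself appears in the diff that follows):

#include <linux/mmu_notifier.h>

struct example_ctx {
	struct mmu_notifier mn;
};

static int example_attach(struct example_ctx *ctx, struct mm_struct *mm)
{
	down_write(&mm->mmap_sem);
	/* ... other per-mm driver setup that also needs mmap_sem ... */
	__mmu_notifier_register(&ctx->mn, mm);
	up_write(&mm->mmap_sem);
	return 0;
}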
Diff so far against V1
- Improve RCU support. (There is now a synchronize_rcu in mmu_notifier_release which
is bad.)
- Clean compile for !MMU_NOTIFIER
- Use mmap_sem for serializing additions to the mmu_notifier list in the
  mm_struct (but still a global spinlock for mmu_rmap_notifier; the
  registration function is called only a couple of times).
---
include/linux/list.h | 14 ++++++++++++++
include/linux/mm_types.h | 2 --
include/linux/mmu_notifier.h | 39 ++++++++++++++++++++++++++++++++++++---
mm/mmu_notifier.c | 28 +++++++++++++++++++---------
4 files changed, 69 insertions(+), 14 deletions(-)
Index: linux-2.6/mm/mmu_notifier.c
===================================================================
--- linux-2.6.orig/mm/mmu_notifier.c 2008-01-25 12:14:49.000000000 -0800
+++ linux-2.6/mm/mmu_notifier.c 2008-01-25 12:14:49.000000000 -0800
@@ -15,17 +15,18 @@
void mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
- struct hlist_node *n;
+ struct hlist_node *n, *t;
if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
rcu_read_lock();
- hlist_for_each_entry_rcu(mn, n,
+ hlist_for_each_entry_safe_rcu(mn, n, t,
&mm->mmu_notifier.head, hlist) {
if (mn->ops->release)
mn->ops->release(mn, mm);
hlist_del(&mn->hlist);
}
rcu_read_unlock();
+ synchronize_rcu();
}
}
@@ -53,24 +54,33 @@ int mmu_notifier_age_page(struct mm_stru
return young;
}
-static DEFINE_SPINLOCK(mmu_notifier_list_lock);
+/*
+ * Note that all notifiers use RCU. The updates are only guaranteed to be
+ * visible to other processes after a RCU quiescent period!
+ */
+void __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier.head);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);
void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
- spin_lock(&mmu_notifier_list_lock);
- hlist_add_head(&mn->hlist, &mm->mmu_notifier.head);
- spin_unlock(&mmu_notifier_list_lock);
+ down_write(&mm->mmap_sem);
+ __mmu_notifier_register(mn, mm);
+ up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);
void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
- spin_lock(&mmu_notifier_list_lock);
- hlist_del(&mn->hlist);
- spin_unlock(&mmu_notifier_list_lock);
+ down_write(&mm->mmap_sem);
+ hlist_del_rcu(&mn->hlist);
+ up_write(&mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+static DEFINE_SPINLOCK(mmu_notifier_list_lock);
HLIST_HEAD(mmu_rmap_notifier_list);
void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn)
Index: linux-2.6/include/linux/list.h
===================================================================
--- linux-2.6.orig/include/linux/list.h 2008-01-25 12:14:47.000000000 -0800
+++ linux-2.6/include/linux/list.h 2008-01-25 12:14:49.000000000 -0800
@@ -991,6 +991,20 @@ static inline void hlist_add_after_rcu(s
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
pos = pos->next)
+/**
+ * hlist_for_each_entry_safe_rcu - iterate over list of given type
+ * @tpos: the type * to use as a loop cursor.
+ * @pos: the &struct hlist_node to use as a loop cursor.
+ * @n: temporary pointer
+ * @head: the head for your list.
+ * @member: the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe_rcu(tpos, pos, n, head, member) \
+ for (pos = (head)->first; \
+ rcu_dereference(pos) && ({ n = pos->next; 1;}) && \
+ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+ pos = n)
+
#else
#warning "don't include kernel headers in userspace"
#endif /* __KERNEL__ */
Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h 2008-01-25 12:14:49.000000000 -0800
+++ linux-2.6/include/linux/mm_types.h 2008-01-25 12:14:49.000000000 -0800
@@ -224,9 +224,7 @@ struct mm_struct {
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
-#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_head mmu_notifier; /* MMU notifier list */
-#endif
};
#endif /* _LINUX_MM_TYPES_H */
Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- linux-2.6.orig/include/linux/mmu_notifier.h 2008-01-25 12:14:49.000000000 -0800
+++ linux-2.6/include/linux/mmu_notifier.h 2008-01-25 13:07:54.000000000 -0800
@@ -46,6 +46,10 @@ struct mmu_notifier {
};
struct mmu_notifier_ops {
+ /*
+ * Note the mmu_notifier structure must be released with
+ * call_rcu
+ */
void (*release)(struct mmu_notifier *mn,
struct mm_struct *mm);
int (*age_page)(struct mmu_notifier *mn,
@@ -78,10 +82,16 @@ struct mmu_rmap_notifier_ops {
#ifdef CONFIG_MMU_NOTIFIER
+/* Must hold the mmap_sem */
+extern void __mmu_notifier_register(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+/* Will acquire mmap_sem */
extern void mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm);
+/* Will acquire mmap_sem */
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
struct mm_struct *mm);
+
extern void mmu_notifier_release(struct mm_struct *mm);
extern int mmu_notifier_age_page(struct mm_struct *mm,
unsigned long address);
@@ -130,15 +140,38 @@ extern struct hlist_head mmu_rmap_notifi
#else /* CONFIG_MMU_NOTIFIER */
-#define mmu_notifier(function, mm, args...) do { } while (0)
-#define mmu_rmap_notifier(function, args...) do { } while (0)
+/*
+ * Notifiers that use the parameters that they were passed so that the
+ * compiler does not complain about unused variables but does proper
+ * parameter checks even if !CONFIG_MMU_NOTIFIER.
+ * Macros generate no code.
+ */
+#define mmu_notifier(function, mm, args...) \
+ do { \
+ if (0) { \
+ struct mmu_notifier *__mn; \
+ \
+ __mn = (struct mmu_notifier *)(0x00ff); \
+ __mn->ops->function(__mn, mm, args); \
+ }; \
+ } while (0)
+
+#define mmu_rmap_notifier(function, args...) \
+ do { \
+ if (0) { \
+ struct mmu_rmap_notifier *__mrn; \
+ \
+ __mrn = (struct mmu_rmap_notifier *)(0x00ff); \
+ __mrn->ops->function(__mrn, args); \
+ } \
+ } while (0);
static inline void mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm) {}
static inline void mmu_notifier_unregister(struct mmu_notifier *mn,
struct mm_struct *mm) {}
static inline void mmu_notifier_release(struct mm_struct *mm) {}
-static inline void mmu_notifier_age(struct mm_struct *mm,
+static inline int mmu_notifier_age_page(struct mm_struct *mm,
unsigned long address)
{
return 0;
> > > 1. invalidate_all()
> >
> > That will be fine as long as we can unregister the ops notifier and free
> > the structure. Otherwise, we end up being called needlessly.
>
> No you cannot do that because there are still callbacks that come later.
> The invalidate_all may lead to invalidate_range() doing nothing for this
> mm. The ops notifier and the freeing of the structure has to wait until
> release().
Could you be a little more clear here? If you are saying that the other
callbacks will need to do work, I can assure you we will clean up those
pages and raise memory protections. It will also be done in a much more
efficient fashion than the individual callouts.
If, on the other hand, you are saying we cannot because of the way
we traverse the list, can we return a result indicating to the caller
that we would like to be unregistered, and then have the mmu_notifier
code do the removal followed by a call to the release notifier?
>
> > > 2. invalidate_range() for each vma
> > >
> > > 3. release()
> > >
> > > We cannot simply move the call up because there will be future range
> > > callbacks on vma invalidation.
> >
> > I am not sure what this means. Right now, if you were to notify XPMEM
> > the process is exiting, we would take care of all the recalling of pages
> > exported by this process, clearing those pages cache lines from cache,
> > and raising memory protections. I would assume that moving the callout
> > earlier would expect the same of every driver.
>
> That does not sync with the current scheme of the invalidate_range()
> hooks. We would have to do a global invalidate early and then place the
> other invalidate_range hooks in such a way that none is called in later in
> process exit handling.
But if the notifier is removed from the list following the invalidate_all
callout, there would be no additional callouts.
Thanks,
Robin
> void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
> {
> - spin_lock(&mmu_notifier_list_lock);
> - hlist_add_head(&mn->hlist, &mm->mmu_notifier.head);
> - spin_unlock(&mmu_notifier_list_lock);
> + down_write(&mm->mmap_sem);
> + __mmu_notifier_register(mn, mm);
> + up_write(&mm->mmap_sem);
> }
> EXPORT_SYMBOL_GPL(mmu_notifier_register);
But what if the caller is already holding the mmap_sem? Why force the
acquire into this function? Since we are dealing with a semaphore/mutex,
it is reasonable that other structures are protected by it and more work
will be done under it, so put the weight of acquiring the sema under the
control of the caller, who can decide if more needs to be completed.
That was why I originally suggested creating a new rwsem_is_write_locked()
function and basing a BUG_ON upon that.
Thanks,
Robin
On Sat, 26 Jan 2008, Robin Holt wrote:
> But what if the caller is already holding the mmap_sem? Why force the
> acquire into this function? Since we are dealing with a semaphore/mutex,
Then you need to call __register_mmu_notifier.
On Sat, 26 Jan 2008, Robin Holt wrote:
> > No you cannot do that because there are still callbacks that come later.
> > The invalidate_all may lead to invalidate_range() doing nothing for this
> > mm. The ops notifier and the freeing of the structure has to wait until
> > release().
>
> Could you be a little more clear here? If you are saying that the other
> callbacks will need to do work? I can assure you we will clean up those
> pages and raise memory protections. It will also be done in a much more
> efficient fashion than the individual callouts.
No the other callbacks need to work in the sense that they can be called.
You could have them do nothing after an invalidate_all().
But you cannot release the allocated structs needed for list traversal
etc.
> If, on the other hand, you are saying we can not because of the way
> we traverse the list, can we return a result indicating to the caller
> we would like to be unregistered and then the mmu_notifier code do the
> remove followed by a call to the release notifier?
You would need to release the resources when the release notifier is
called.
> > That does not sync with the current scheme of the invalidate_range()
> > hooks. We would have to do a global invalidate early and then place the
> > other invalidate_range hooks in such a way that none is called in later in
> > process exit handling.
>
> But if the notifier is removed from the list following the invalidate_all
> callout, there would be no additional callouts.
Hmmm.... Okay did not think about that. Then you would need to do a
synchronize_rcu() in invalidate_all()?
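Purely to make that open question concrete, one shape an invalidate_all() walk could take in mm/mmu_notifier.c, mirroring the V2 mmu_notifier_release() and its hlist_for_each_entry_safe_rcu helper above (hypothetical: neither this function nor an ops->invalidate_all member exists in the posted patches):

void mmu_notifier_invalidate_all(struct mm_struct *mm)
{
	struct mmu_notifier *mn;
	struct hlist_node *n, *t;

	if (unlikely(!hlist_empty(&mm->mmu_notifier.head))) {
		rcu_read_lock();
		hlist_for_each_entry_safe_rcu(mn, n, t,
				&mm->mmu_notifier.head, hlist) {
			if (mn->ops->invalidate_all)
				mn->ops->invalidate_all(mn, mm);
			hlist_del_rcu(&mn->hlist);
		}
		rcu_read_unlock();
		/* Let concurrent list walkers finish before drivers free
		 * their notifier structures in response to the callback. */
		synchronize_rcu();
	}
}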