These special additional callbacks are required because XPmem (and likely
other mechanisms) do use their own rmap (multiple processes on a series
of remote Linux instances may be accessing the memory of a process).
F.e. XPmem may have to send out notifications to remote Linux instances
and receive confirmation before a page can be freed.
So we handle this like an additional Linux reverse map that is walked after
the existing rmaps have been walked. We leave the walking to the driver that
is then able to use something other than a spinlock to walk its reverse
maps. So we can actually call the driver without holding spinlocks while
we hold the Pagelock.
However, we cannot determine the mm_struct that a page belongs to at
that point. The mm_struct can only be determined from the rmaps by the
device driver.
We add another pageflag (PageExternalRmap) that is set if a page has
been remotely mapped (f.e. by a process from another Linux instance).
We can then only perform the callbacks for pages that are actually in
remote use.
Rmap notifiers need an extra page bit and are only available
on 64 bit platforms. This functionality is not available on 32 bit!
A notifier that uses the reverse maps callbacks does not need to provide
the invalidate_page() method that is called when locks are held.
Signed-off-by: Christoph Lameter <[email protected]>
---
include/linux/mmu_notifier.h | 65 +++++++++++++++++++++++++++++++++++++++++++
include/linux/page-flags.h | 11 +++++++
mm/mmu_notifier.c | 34 ++++++++++++++++++++++
mm/rmap.c | 9 +++++
4 files changed, 119 insertions(+)
Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h 2008-02-14 20:58:17.000000000 -0800
+++ linux-2.6/include/linux/page-flags.h 2008-02-14 21:21:04.000000000 -0800
@@ -105,6 +105,7 @@
* 64 bit | FIELDS | ?????? FLAGS |
* 63 32 0
*/
+#define PG_external_rmap 30 /* Page has external rmap */
#define PG_uncached 31 /* Page has been mapped as uncached */
#endif
@@ -296,6 +297,16 @@ static inline void __ClearPageTail(struc
#define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags)
#define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags)
+#if defined(CONFIG_MMU_NOTIFIER) && defined(CONFIG_64BIT)
+#define PageExternalRmap(page) test_bit(PG_external_rmap, &(page)->flags)
+#define SetPageExternalRmap(page) set_bit(PG_external_rmap, &(page)->flags)
+#define ClearPageExternalRmap(page) clear_bit(PG_external_rmap, \
+ &(page)->flags)
+#else
+#define ClearPageExternalRmap(page) do {} while (0)
+#define PageExternalRmap(page) 0
+#endif
+
struct page; /* forward declaration */
extern void cancel_dirty_page(struct page *page, unsigned int account_size);
Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- linux-2.6.orig/include/linux/mmu_notifier.h 2008-02-14 21:20:55.000000000 -0800
+++ linux-2.6/include/linux/mmu_notifier.h 2008-02-14 21:21:04.000000000 -0800
@@ -23,6 +23,18 @@
* where sleeping is allowed or in atomic contexts. A flag is passed
* to indicate an atomic context.
*
+ *
+ * 2. mmu_rmap_notifier
+ *
+ * Callbacks for subsystems that provide their own rmaps. These
+ * need to walk their own rmaps for a page. The invalidate_page
+ * callback is outside of locks so that we are not in a strictly
+ * atomic context (but we may be in a PF_MEMALLOC context if the
+ * notifier is called from reclaim code) and are able to sleep.
+ *
+ * Rmap notifiers need an extra page bit and are only available
+ * on 64 bit platforms.
+ *
* Pages must be marked dirty if dirty bits are found to be set in
* the external ptes.
*/
@@ -96,6 +108,23 @@ struct mmu_notifier_ops {
int atomic);
};
+struct mmu_rmap_notifier_ops;
+
+struct mmu_rmap_notifier {
+ struct hlist_node hlist;
+ const struct mmu_rmap_notifier_ops *ops;
+};
+
+struct mmu_rmap_notifier_ops {
+ /*
+ * Called with the page lock held after ptes are modified or removed
+ * so that a subsystem with its own rmap's can remove remote ptes
+ * mapping a page.
+ */
+ void (*invalidate_page)(struct mmu_rmap_notifier *mrn,
+ struct page *page);
+};
+
#ifdef CONFIG_MMU_NOTIFIER
/*
@@ -146,6 +175,27 @@ static inline void mmu_notifier_head_ini
} \
} while (0)
+extern void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn);
+extern void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn);
+
+/* Must hold PageLock */
+extern void mmu_rmap_export_page(struct page *page);
+
+extern struct hlist_head mmu_rmap_notifier_list;
+
+#define mmu_rmap_notifier(function, args...) \
+ do { \
+ struct mmu_rmap_notifier *__mrn; \
+ struct hlist_node *__n; \
+ \
+ rcu_read_lock(); \
+ hlist_for_each_entry_rcu(__mrn, __n, \
+ &mmu_rmap_notifier_list, hlist) \
+ if (__mrn->ops->function) \
+ __mrn->ops->function(__mrn, args); \
+ rcu_read_unlock(); \
+ } while (0);
+
#else /* CONFIG_MMU_NOTIFIER */
/*
@@ -164,6 +214,16 @@ static inline void mmu_notifier_head_ini
}; \
} while (0)
+#define mmu_rmap_notifier(function, args...) \
+ do { \
+ if (0) { \
+ struct mmu_rmap_notifier *__mrn; \
+ \
+ __mrn = (struct mmu_rmap_notifier *)(0x00ff); \
+ __mrn->ops->function(__mrn, args); \
+ } \
+ } while (0);
+
static inline void mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm) {}
static inline void mmu_notifier_unregister(struct mmu_notifier *mn,
@@ -177,6 +237,11 @@ static inline int mmu_notifier_age_page(
static inline void mmu_notifier_head_init(struct mmu_notifier_head *mmh) {}
+static inline void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn)
+ {}
+static inline void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn)
+ {}
+
#endif /* CONFIG_MMU_NOTIFIER */
#endif /* _LINUX_MMU_NOTIFIER_H */
Index: linux-2.6/mm/mmu_notifier.c
===================================================================
--- linux-2.6.orig/mm/mmu_notifier.c 2008-02-14 21:17:51.000000000 -0800
+++ linux-2.6/mm/mmu_notifier.c 2008-02-14 21:21:04.000000000 -0800
@@ -74,3 +74,37 @@ void mmu_notifier_unregister(struct mmu_
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+#ifdef CONFIG_64BIT
+static DEFINE_SPINLOCK(mmu_notifier_list_lock);
+HLIST_HEAD(mmu_rmap_notifier_list);
+
+void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn)
+{
+ spin_lock(&mmu_notifier_list_lock);
+ hlist_add_head_rcu(&mrn->hlist, &mmu_rmap_notifier_list);
+ spin_unlock(&mmu_notifier_list_lock);
+}
+EXPORT_SYMBOL(mmu_rmap_notifier_register);
+
+void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn)
+{
+ spin_lock(&mmu_notifier_list_lock);
+ hlist_del_rcu(&mrn->hlist);
+ spin_unlock(&mmu_notifier_list_lock);
+}
+EXPORT_SYMBOL(mmu_rmap_notifier_unregister);
+
+/*
+ * Export a page.
+ *
+ * Pagelock must be held.
+ * Must be called before a page is put on an external rmap.
+ */
+void mmu_rmap_export_page(struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+ SetPageExternalRmap(page);
+}
+EXPORT_SYMBOL(mmu_rmap_export_page);
+
+#endif
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c 2008-02-14 21:21:00.000000000 -0800
+++ linux-2.6/mm/rmap.c 2008-02-14 21:21:04.000000000 -0800
@@ -497,6 +497,10 @@ int page_mkclean(struct page *page)
struct address_space *mapping = page_mapping(page);
if (mapping) {
ret = page_mkclean_file(mapping, page);
+ if (unlikely(PageExternalRmap(page))) {
+ mmu_rmap_notifier(invalidate_page, page);
+ ClearPageExternalRmap(page);
+ }
if (page_test_dirty(page)) {
page_clear_dirty(page);
ret = 1;
@@ -1013,6 +1017,11 @@ int try_to_unmap(struct page *page, int
else
ret = try_to_unmap_file(page, migration);
+ if (unlikely(PageExternalRmap(page))) {
+ mmu_rmap_notifier(invalidate_page, page);
+ ClearPageExternalRmap(page);
+ }
+
if (!page_mapped(page))
ret = SWAP_SUCCESS;
return ret;
--
On Thu, 14 Feb 2008 22:49:04 -0800 Christoph Lameter <[email protected]> wrote:
> These special additional callbacks are required because XPmem (and likely
> other mechanisms) do use their own rmap (multiple processes on a series
> of remote Linux instances may be accessing the memory of a process).
> F.e. XPmem may have to send out notifications to remote Linux instances
> and receive confirmation before a page can be freed.
>
> So we handle this like an additional Linux reverse map that is walked after
> the existing rmaps have been walked. We leave the walking to the driver that
> is then able to use something else than a spinlock to walk its reverse
> maps. So we can actually call the driver without holding spinlocks while
> we hold the Pagelock.
>
> However, we cannot determine the mm_struct that a page belongs to at
> that point. The mm_struct can only be determined from the rmaps by the
> device driver.
>
> We add another pageflag (PageExternalRmap) that is set if a page has
> been remotely mapped (f.e. by a process from another Linux instance).
> We can then only perform the callbacks for pages that are actually in
> remote use.
>
> Rmap notifiers need an extra page bit and are only available
> on 64 bit platforms. This functionality is not available on 32 bit!
>
> A notifier that uses the reverse maps callbacks does not need to provide
> the invalidate_page() method that is called when locks are held.
>
hrm.
> +#define mmu_rmap_notifier(function, args...) \
> + do { \
> + struct mmu_rmap_notifier *__mrn; \
> + struct hlist_node *__n; \
> + \
> + rcu_read_lock(); \
> + hlist_for_each_entry_rcu(__mrn, __n, \
> + &mmu_rmap_notifier_list, hlist) \
> + if (__mrn->ops->function) \
> + __mrn->ops->function(__mrn, args); \
> + rcu_read_unlock(); \
> + } while (0);
> +
buggy macro: use locals.
> +#define mmu_rmap_notifier(function, args...) \
> + do { \
> + if (0) { \
> + struct mmu_rmap_notifier *__mrn; \
> + \
> + __mrn = (struct mmu_rmap_notifier *)(0x00ff); \
> + __mrn->ops->function(__mrn, args); \
> + } \
> + } while (0);
> +
Same observation as in the other patch.
> ===================================================================
> --- linux-2.6.orig/mm/mmu_notifier.c 2008-02-14 21:17:51.000000000 -0800
> +++ linux-2.6/mm/mmu_notifier.c 2008-02-14 21:21:04.000000000 -0800
> @@ -74,3 +74,37 @@ void mmu_notifier_unregister(struct mmu_
> }
> EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
>
> +#ifdef CONFIG_64BIT
> +static DEFINE_SPINLOCK(mmu_notifier_list_lock);
> +HLIST_HEAD(mmu_rmap_notifier_list);
> +
> +void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn)
> +{
> + spin_lock(&mmu_notifier_list_lock);
> + hlist_add_head_rcu(&mrn->hlist, &mmu_rmap_notifier_list);
> + spin_unlock(&mmu_notifier_list_lock);
> +}
> +EXPORT_SYMBOL(mmu_rmap_notifier_register);
> +
> +void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn)
> +{
> + spin_lock(&mmu_notifier_list_lock);
> + hlist_del_rcu(&mrn->hlist);
> + spin_unlock(&mmu_notifier_list_lock);
> +}
> +EXPORT_SYMBOL(mmu_rmap_notifier_unregister);
>
> +/*
> + * Export a page.
> + *
> + * Pagelock must be held.
> + * Must be called before a page is put on an external rmap.
> + */
> +void mmu_rmap_export_page(struct page *page)
> +{
> + BUG_ON(!PageLocked(page));
> + SetPageExternalRmap(page);
> +}
> +EXPORT_SYMBOL(mmu_rmap_export_page);
The other patch used EXPORT_SYMBOL_GPL.
On Fri, 15 Feb 2008, Andrew Morton wrote:
> > +#define mmu_rmap_notifier(function, args...) \
> > + do { \
> > + struct mmu_rmap_notifier *__mrn; \
> > + struct hlist_node *__n; \
> > + \
> > + rcu_read_lock(); \
> > + hlist_for_each_entry_rcu(__mrn, __n, \
> > + &mmu_rmap_notifier_list, hlist) \
> > + if (__mrn->ops->function) \
> > + __mrn->ops->function(__mrn, args); \
> > + rcu_read_unlock(); \
> > + } while (0);
> > +
>
> buggy macro: use locals.
Ok. Same as the non rmap version.
> > +EXPORT_SYMBOL(mmu_rmap_export_page);
>
> The other patch used EXPORT_SYMBOL_GPL.
Ok will make that consistent.
On Friday 15 February 2008 17:49, Christoph Lameter wrote:
> These special additional callbacks are required because XPmem (and likely
> other mechanisms) do use their own rmap (multiple processes on a series
> of remote Linux instances may be accessing the memory of a process).
> F.e. XPmem may have to send out notifications to remote Linux instances
> and receive confirmation before a page can be freed.
>
> So we handle this like an additional Linux reverse map that is walked after
> the existing rmaps have been walked. We leave the walking to the driver
> that is then able to use something else than a spinlock to walk its reverse
> maps. So we can actually call the driver without holding spinlocks while we
> hold the Pagelock.
I don't know how this is supposed to solve anything. The sleeping
problem happens I guess mostly in truncate. And all you are doing
is putting these rmap callbacks in page_mkclean and try_to_unmap.
> However, we cannot determine the mm_struct that a page belongs to at
> that point. The mm_struct can only be determined from the rmaps by the
> device driver.
>
> We add another pageflag (PageExternalRmap) that is set if a page has
> been remotely mapped (f.e. by a process from another Linux instance).
> We can then only perform the callbacks for pages that are actually in
> remote use.
>
> Rmap notifiers need an extra page bit and are only available
> on 64 bit platforms. This functionality is not available on 32 bit!
>
> A notifier that uses the reverse maps callbacks does not need to provide
> the invalidate_page() method that is called when locks are held.
That doesn't seem right. To start with, the new callbacks aren't
even called in the places where invalidate_page isn't allowed to
sleep.
The problem is unmap_mapping_range, right? And unmap_mapping_range
must walk the rmaps with the mmap lock held, which is why it can't
sleep. And it can't hold any mmap_sem so it cannot prevent address
space modifications of the processes in question between the time
you unmap them from the linux ptes with unmap_mapping_range, and the
time that you unmap them from your driver.
So in the meantime, you could have eg. a fault come in and set up a
new page for one of the processes, and that page might even get
exported via the same external driver. And now you have a totally
inconsistent view.
Preventing new mappings from being set up until the old mapping is
completely flushed is basically what we need to ensure for any sane
TLB as far as I can tell. To do that, you'll need to make the mmap
lock sleep, and either take mmap_sem inside it (which is a
deadlock condition at the moment), or make ptl sleep as well. These
are simply the locks we use to prevent that from happening, so I
can't see how you can possibly hope to have a coherent TLB without
invalidating inside those locks.
On Wed, Feb 20, 2008 at 10:55:20AM +1100, Nick Piggin wrote:
> On Friday 15 February 2008 17:49, Christoph Lameter wrote:
> > These special additional callbacks are required because XPmem (and likely
> > other mechanisms) do use their own rmap (multiple processes on a series
> > of remote Linux instances may be accessing the memory of a process).
> > F.e. XPmem may have to send out notifications to remote Linux instances
> > and receive confirmation before a page can be freed.
> >
> > So we handle this like an additional Linux reverse map that is walked after
> > the existing rmaps have been walked. We leave the walking to the driver
> > that is then able to use something else than a spinlock to walk its reverse
> > maps. So we can actually call the driver without holding spinlocks while we
> > hold the Pagelock.
>
> I don't know how this is supposed to solve anything. The sleeping
> problem happens I guess mostly in truncate. And all you are doing
> is putting these rmap callbacks in page_mkclean and try_to_unmap.
>
>
> > However, we cannot determine the mm_struct that a page belongs to at
> > that point. The mm_struct can only be determined from the rmaps by the
> > device driver.
> >
> > We add another pageflag (PageExternalRmap) that is set if a page has
> > been remotely mapped (f.e. by a process from another Linux instance).
> > We can then only perform the callbacks for pages that are actually in
> > remote use.
> >
> > Rmap notifiers need an extra page bit and are only available
> > on 64 bit platforms. This functionality is not available on 32 bit!
> >
> > A notifier that uses the reverse maps callbacks does not need to provide
> > the invalidate_page() method that is called when locks are held.
>
> That doesn't seem right. To start with, the new callbacks aren't
> even called in the places where invalidate_page isn't allowed to
> sleep.
>
> The problem is unmap_mapping_range, right? And unmap_mapping_range
> must walk the rmaps with the mmap lock held, which is why it can't
> sleep. And it can't hold any mmap_sem so it cannot prevent address
> space modifications of the processes in question between the time
> you unmap them from the linux ptes with unmap_mapping_range, and the
> time that you unmap them from your driver.
>
> So in the meantime, you could have eg. a fault come in and set up a
> new page for one of the processes, and that page might even get
> exported via the same external driver. And now you have a totally
> inconsistent view.
>
> Preventing new mappings from being set up until the old mapping is
> completely flushed is basically what we need to ensure for any sane
> TLB as far as I can tell. To do that, you'll need to make the mmap
> lock sleep, and either take mmap_sem inside it (which is a
> deadlock condition at the moment), or make ptl sleep as well. These
> are simply the locks we use to prevent that from happening, so I
> can't see how you can possibly hope to have a coherent TLB without
> invalidating inside those locks.
All of that is correct. For XPMEM, we do not currently allow file backed
mapping pages to be exported, so we should never reach this condition.
It has been an issue since day 1. We have operated with that assumption
for 6 years and have not had issues with that assumption. The user of
xpmem is MPT and it controls the communication buffers so it is reasonable
to expect this type of behavior.
Thanks,
Robin
On Wednesday 20 February 2008 14:12, Robin Holt wrote:
> For XPMEM, we do not currently allow file backed
> mapping pages from being exported so we should never reach this condition.
> It has been an issue since day 1. We have operated with that assumption
> for 6 years and have not had issues with that assumption. The user of
> xpmem is MPT and it controls the communication buffers so it is reasonable
> to expect this type of behavior.
OK, that makes things simpler.
So why can't you export a device from your xpmem driver, which
can be mmap()ed to give out "anonymous" memory pages to be used
for these communication buffers?
I guess you may also want an "munmap/mprotect" callback, which
we don't have in the kernel right now... but at least you could
prototype it easily by having an ioctl to be called before
munmapping or mprotecting (eg. the ioctl could prevent new TLB
setup for the region, and shoot down existing ones).
This is actually going to be much faster for you if you use any
threaded applications, because you will be able to do all the
shootdown round trips outside mmap_sem, and so you will be able
to have other threads faulting and even mmap()ing / munmaping
at the same time as the shootdown is happening.
I guess there is some catch...
On Wed, Feb 20, 2008 at 02:51:45PM +1100, Nick Piggin wrote:
> On Wednesday 20 February 2008 14:12, Robin Holt wrote:
> > For XPMEM, we do not currently allow file backed
> > mapping pages from being exported so we should never reach this condition.
> > It has been an issue since day 1. We have operated with that assumption
> > for 6 years and have not had issues with that assumption. The user of
> > xpmem is MPT and it controls the communication buffers so it is reasonable
> > to expect this type of behavior.
>
> OK, that makes things simpler.
>
> So why can't you export a device from your xpmem driver, which
> can be mmap()ed to give out "anonymous" memory pages to be used
> for these communication buffers?
Because we need to have heap and stack available as well. MPT does
not control all the communication buffer areas. I haven't checked, but
this is the same problem that IB will have. I believe they are actually
allowing any memory region to be accessible, but I am not sure of that.
Thanks,
Robin
On Wed, Feb 20, 2008 at 03:00:36AM -0600, Robin Holt wrote:
> On Wed, Feb 20, 2008 at 02:51:45PM +1100, Nick Piggin wrote:
> > On Wednesday 20 February 2008 14:12, Robin Holt wrote:
> > > For XPMEM, we do not currently allow file backed
> > > mapping pages from being exported so we should never reach this condition.
> > > It has been an issue since day 1. We have operated with that assumption
> > > for 6 years and have not had issues with that assumption. The user of
> > > xpmem is MPT and it controls the communication buffers so it is reasonable
> > > to expect this type of behavior.
> >
> > OK, that makes things simpler.
> >
> > So why can't you export a device from your xpmem driver, which
> > can be mmap()ed to give out "anonymous" memory pages to be used
> > for these communication buffers?
>
> Because we need to have heap and stack available as well. MPT does
> not control all the communication buffer areas. I haven't checked, but
> this is the same problem that IB will have. I believe they are actually
> allowing any memory region be accessible, but I am not sure of that.
I should have read my work email first. I had gotten an email from
one of our MPT developers saying they would love it if they could share
file backed memory areas as well as it would help them with their MPI-IO
functions which currently need to do multiple copy steps. Not sure how
high of a priority I am going to be able to make that.
Thanks,
Robin
On Wednesday 20 February 2008 20:00, Robin Holt wrote:
> On Wed, Feb 20, 2008 at 02:51:45PM +1100, Nick Piggin wrote:
> > On Wednesday 20 February 2008 14:12, Robin Holt wrote:
> > > For XPMEM, we do not currently allow file backed
> > > mapping pages from being exported so we should never reach this
> > > condition. It has been an issue since day 1. We have operated with
> > > that assumption for 6 years and have not had issues with that
> > > assumption. The user of xpmem is MPT and it controls the communication
> > > buffers so it is reasonable to expect this type of behavior.
> >
> > OK, that makes things simpler.
> >
> > So why can't you export a device from your xpmem driver, which
> > can be mmap()ed to give out "anonymous" memory pages to be used
> > for these communication buffers?
>
> Because we need to have heap and stack available as well. MPT does
> not control all the communication buffer areas. I haven't checked, but
> this is the same problem that IB will have. I believe they are actually
> allowing any memory region be accessible, but I am not sure of that.
Then you should create a driver that the user program can register
and unregister regions of their memory with. The driver can do a
get_user_pages to get the pages, and then you'd just need to set up
some kind of mapping so that userspace can unmap pages / won't leak
memory (and an exit_mm notifier I guess).
Because you don't need to swap, you don't need coherency, and you
are in control of the areas, then this seems like the best choice.
It would allow you to use heap, stack, file-backed, anything.
On Thu, Feb 21, 2008 at 03:20:02PM +1100, Nick Piggin wrote:
> > > So why can't you export a device from your xpmem driver, which
> > > can be mmap()ed to give out "anonymous" memory pages to be used
> > > for these communication buffers?
> >
> > Because we need to have heap and stack available as well. MPT does
> > not control all the communication buffer areas. I haven't checked, but
> > this is the same problem that IB will have. I believe they are actually
> > allowing any memory region be accessible, but I am not sure of that.
>
> Then you should create a driver that the user program can register
> and unregister regions of their memory with. The driver can do a
> get_user_pages to get the pages, and then you'd just need to set up
> some kind of mapping so that userspace can unmap pages / won't leak
> memory (and an exit_mm notifier I guess).
OK. You need to explain this better to me. How would this driver
supposedly work? What we have is an MPI library. It gets invoked at
process load time to establish its rank-to-rank communication regions.
It then turns control over to the processes main(). That is allowed to
run until it hits the
MPI_Init(&argc, &argv);
The process is then totally under the users control until:
MPI_Send(intmessage, m_size, MPI_INT, my_rank+half, tag, MPI_COMM_WORLD);
MPI_Recv(intmessage, m_size, MPI_INT, my_rank+half,tag, MPI_COMM_WORLD, &status);
That is it. That is all our allowed interaction with the users process.
Are you saying at the time of the MPI_Send, we should:
down_write(&current->mm->mmap_sem);
Find all the VMAs that describe this region and record their
vm_ops structure.
Find all currently inserted page table information.
Create new VMAs that describe the same regions as before.
Insert our special fault handler which merely calls their old
fault handler and then exports the page then returns the page to the
kernel.
Take an extra reference count on the page for each possible
remote rank we are exporting this to.
That doesn't seem too unreasonable, except when you compare it to how the
driver currently works. Remember, this is done from a library which has
no insight into what the user has done to its own virtual address space.
As a result, each MPI_Send() would result in a system call (or we would
need to have a set of callouts for changes to a processes VMAs) which
would be a significant increase in communication overhead.
Maybe I am missing what you intend to do, but what we need is a means of
tracking one processes virtual address space changes so other processes
can do direct memory accesses without the need for a system call on each
communication event.
> Because you don't need to swap, you don't need coherency, and you
> are in control of the areas, then this seems like the best choice.
> It would allow you to use heap, stack, file-backed, anything.
You are missing one point here. The MPI specifications that have
been out there for decades do not require the process use a library
for allocating the buffer. I realize that is a horrible shortcoming,
but that is the world we live in. Even if we could change that spec,
we would still need to support the existing specs. As a result, the
user can change their virtual address space as they need and still expect
communications to be cheap.
Thanks,
Robin
On Thursday 21 February 2008 21:58, Robin Holt wrote:
> On Thu, Feb 21, 2008 at 03:20:02PM +1100, Nick Piggin wrote:
> > > > So why can't you export a device from your xpmem driver, which
> > > > can be mmap()ed to give out "anonymous" memory pages to be used
> > > > for these communication buffers?
> > >
> > > Because we need to have heap and stack available as well. MPT does
> > > not control all the communication buffer areas. I haven't checked, but
> > > this is the same problem that IB will have. I believe they are
> > > actually allowing any memory region be accessible, but I am not sure of
> > > that.
> >
> > Then you should create a driver that the user program can register
> > and unregister regions of their memory with. The driver can do a
> > get_user_pages to get the pages, and then you'd just need to set up
> > some kind of mapping so that userspace can unmap pages / won't leak
> > memory (and an exit_mm notifier I guess).
>
> OK. You need to explain this better to me. How would this driver
> supposedly work? What we have is an MPI library. It gets invoked at
> process load time to establish its rank-to-rank communication regions.
> It then turns control over to the processes main(). That is allowed to
> run until it hits the
> MPI_Init(&argc, &argv);
>
> The process is then totally under the users control until:
> MPI_Send(intmessage, m_size, MPI_INT, my_rank+half, tag, MPI_COMM_WORLD);
> MPI_Recv(intmessage, m_size, MPI_INT, my_rank+half,tag, MPI_COMM_WORLD,
> &status);
>
> That is it. That is all our allowed interaction with the users process.
OK, when you said something along the lines of "the MPT library has
control of the comm buffer", then I assumed it was an area of virtual
memory which is set up as part of initialization, rather than during
runtime. I guess I jumped to conclusions.
> That doesn't seem too unreasonable, except when you compare it to how the
> driver currently works. Remember, this is done from a library which has
> no insight into what the user has done to its own virtual address space.
> As a result, each MPI_Send() would result in a system call (or we would
> need to have a set of callouts for changes to a processes VMAs) which
> would be a significant increase in communication overhead.
>
> Maybe I am missing what you intend to do, but what we need is a means of
> tracking one processes virtual address space changes so other processes
> can do direct memory accesses without the need for a system call on each
> communication event.
Yeah it's tricky. BTW. what is the performance difference between
having a system call or no?
> > Because you don't need to swap, you don't need coherency, and you
> > are in control of the areas, then this seems like the best choice.
> > It would allow you to use heap, stack, file-backed, anything.
>
> You are missing one point here. The MPI specifications that have
> been out there for decades do not require the process use a library
> for allocating the buffer. I realize that is a horrible shortcoming,
> but that is the world we live in. Even if we could change that spec,
Can you change the spec? Are you working on it?
> we would still need to support the existing specs. As a result, the
> user can change their virtual address space as they need and still expect
> communications be cheap.
That's true. How has it been supported up to now? Are you using
these kind of notifiers in patched kernels?
On Tue, Feb 26, 2008 at 05:11:32PM +1100, Nick Piggin wrote:
> > You are missing one point here. The MPI specifications that have
> > been out there for decades do not require the process use a library
> > for allocating the buffer. I realize that is a horrible shortcoming,
> > but that is the world we live in. Even if we could change that spec,
>
> Can you change the spec?
Not really. It will break all existing codes. MPI-2 provides a call for
memory allocation (and it's beneficial to use this call for some interconnects),
but many (most?) applications are still written for MPI-1 and those that
are written for MPI-2 mostly uses the old habit of allocating memory by malloc(),
or even use stack or BSS memory for communication buffer purposes.
--
Gleb.
On Tuesday 26 February 2008 18:21, Gleb Natapov wrote:
> On Tue, Feb 26, 2008 at 05:11:32PM +1100, Nick Piggin wrote:
> > > You are missing one point here. The MPI specifications that have
> > > been out there for decades do not require the process use a library
> > > for allocating the buffer. I realize that is a horrible shortcoming,
> > > but that is the world we live in. Even if we could change that spec,
> >
> > Can you change the spec?
>
> Not really. It will break all existing codes.
I meant as in eg. submit changes to MPI-3
> MPI-2 provides a call for
> memory allocation (and it's beneficial to use this call for some
> interconnects), but many (most?) applications are still written for MPI-1
> and those that are written for MPI-2 mostly uses the old habit of
> allocating memory by malloc(), or even use stack or BSS memory for
> communication buffer purposes.
OK, so MPI-2 already has some way to do that... I'm not saying that we
can now completely dismiss the idea of using notifiers for this, but it
is just a good data point to know.
Thanks,
Nick
On Tue, Feb 26, 2008 at 07:52:41PM +1100, Nick Piggin wrote:
> On Tuesday 26 February 2008 18:21, Gleb Natapov wrote:
> > On Tue, Feb 26, 2008 at 05:11:32PM +1100, Nick Piggin wrote:
> > > > You are missing one point here. The MPI specifications that have
> > > > been out there for decades do not require the process use a library
> > > > for allocating the buffer. I realize that is a horrible shortcoming,
> > > > but that is the world we live in. Even if we could change that spec,
> > >
> > > Can you change the spec?
> >
> > Not really. It will break all existing codes.
>
> I meant as in eg. submit changes to MPI-3
MPI spec tries to be backward compatible. And MPI-2 spec is 10 years
old, but MPI-1 is still in wider use. HPC is moving fast in terms of HW
technology, but slow in terms of SW. Fortran is still hot there :)
--
Gleb.
> > > > Can you change the spec?
> > >
> > > Not really. It will break all existing codes.
> >
> > I meant as in eg. submit changes to MPI-3
>
> MPI spec tries to be backward compatible. And MPI-2 spec is 10 years
> old, but MPI-1 is still in a wider use. HPC is moving fast in terms of HW
> technology, but slow in terms of SW. Fortran is still hot there :)
Agreed.
many many people dislike incompatible specification change.
We should accept real world spec.
- kosaki
On Tue, Feb 26, 2008 at 07:52:41PM +1100, Nick Piggin wrote:
> On Tuesday 26 February 2008 18:21, Gleb Natapov wrote:
> > On Tue, Feb 26, 2008 at 05:11:32PM +1100, Nick Piggin wrote:
> > > > You are missing one point here. The MPI specifications that have
> > > > been out there for decades do not require the process use a library
> > > > for allocating the buffer. I realize that is a horrible shortcoming,
> > > > but that is the world we live in. Even if we could change that spec,
> > >
> > > Can you change the spec?
> >
> > Not really. It will break all existing codes.
>
> I meant as in eg. submit changes to MPI-3
>
>
> > MPI-2 provides a call for
> > memory allocation (and it's beneficial to use this call for some
> > interconnects), but many (most?) applications are still written for MPI-1
> > and those that are written for MPI-2 mostly uses the old habit of
> > allocating memory by malloc(), or even use stack or BSS memory for
> > communication buffer purposes.
>
> OK, so MPI-2 already has some way to do that... I'm not saying that we
> can now completely dismiss the idea of using notifiers for this, but it
> is just a good data point to know.
It is in MPI-2, but MPI-2 does not prohibit communication from regions
not allocated by the MPI call.
Thanks,
Robin
> > That is it. That is all our allowed interaction with the users process.
>
> OK, when you said something along the lines of "the MPT library has
> control of the comm buffer", then I assumed it was an area of virtual
> memory which is set up as part of initialization, rather than during
> runtime. I guess I jumped to conclusions.
There are six regions the MPT library typically makes. The most basic
one is a fixed size. It describes the MPT internal buffers, the stack,
the heap, the application text, and finally the entire address space.
That last region is seldom used. MPT only has control over the first
two.
> > That doesn't seem too unreasonable, except when you compare it to how the
> > driver currently works. Remember, this is done from a library which has
> > no insight into what the user has done to its own virtual address space.
> > As a result, each MPI_Send() would result in a system call (or we would
> > need to have a set of callouts for changes to a processes VMAs) which
> > would be a significant increase in communication overhead.
> >
> > Maybe I am missing what you intend to do, but what we need is a means of
> > tracking one processes virtual address space changes so other processes
> > can do direct memory accesses without the need for a system call on each
> > communication event.
>
> Yeah it's tricky. BTW. what is the performance difference between
> having a system call or no?
The system call takes many microseconds and still requires the same
latency of the communication. Without it, our latency is
usually below two microseconds.
> > > Because you don't need to swap, you don't need coherency, and you
> > > are in control of the areas, then this seems like the best choice.
> > > It would allow you to use heap, stack, file-backed, anything.
> >
> > You are missing one point here. The MPI specifications that have
> > been out there for decades do not require the process use a library
> > for allocating the buffer. I realize that is a horrible shortcoming,
> > but that is the world we live in. Even if we could change that spec,
>
> Can you change the spec? Are you working on it?
Even if we changed the spec, the old specs will continue to be
supported. I personally am not involved. Not sure if anybody else is
working this issue.
> > we would still need to support the existing specs. As a result, the
> > user can change their virtual address space as they need and still expect
> > communications be cheap.
>
> That's true. How has it been supported up to now? Are you using
> these kind of notifiers in patched kernels?
At fault time, we check to see if it is an anon or mspec vma. We pin
the pages and insert them. The remote OS then loses synchronicity with
the owning processes page tables. If an unmap, madvise, etc occurs the
page tables are updated without regard to our references. Fork or exit
(fork is caught using an LD_PRELOAD library) cause the user pages to be
recalled from the remote side and put_page returns them to the kernel.
We have documented that this loss of synchronicity is due to their
action and not supported. Essentially, we rely upon the application
being well behaved. To this point, that has remained true.
Thanks,
Robin
On Wed, 20 Feb 2008, Nick Piggin wrote:
> I don't know how this is supposed to solve anything. The sleeping
> problem happens I guess mostly in truncate. And all you are doing
> is putting these rmap callbacks in page_mkclean and try_to_unmap.
truncate is handled by the range invalidates. This is special code to deal
with the unmap/clean of an individual page.
> That doesn't seem right. To start with, the new callbacks aren't
> even called in the places where invalidate_page isn't allowed to
> sleep.
>
> The problem is unmap_mapping_range, right? And unmap_mapping_range
> must walk the rmaps with the mmap lock held, which is why it can't
> sleep. And it can't hold any mmap_sem so it cannot prevent address
Nope. unmap_mapping_range is already handled by the range callbacks.
> So in the meantime, you could have eg. a fault come in and set up a
> new page for one of the processes, and that page might even get
> exported via the same external driver. And now you have a totally
> inconsistent view.
The situation that you are imagining has already been dealt with by the
earlier patches. This is only to allow sleeping while unmapping individual
pages.
On Wed, Feb 27, 2008 at 02:43:41PM -0800, Christoph Lameter wrote:
> Nope. unmap_mapping_range is already handled by the range callbacks.
But they're called with atomic=1 on anything but anonymous memory. I
understood Andrew asked to remove the atomic param and to allow
sleeping for all kind of vmas. I also understood certain XPMEM
customers asked to use XPMEM on something more than anonymous memory.
> The situation that you are imagining has already been dealt with [..]
I guess there's some misunderstanding, I think Nick was referring to
the above problem.
On Thu, 28 Feb 2008, Andrea Arcangeli wrote:
> On Wed, Feb 27, 2008 at 02:43:41PM -0800, Christoph Lameter wrote:
> > Nope. unmap_mapping_range is already handled by the range callbacks.
>
> But they're called with atomic=1 on anything but anonymous memory. I
> understood Andrew asked to remove the atomic param and to allow
> sleeping for all kind of vmas. I also understood certain XPMEM
> customers asked to use XPMEM on something more than anonymous memory.
Yes but the patch that is discussed here does not handle that situation.