Hi all,
this patch series introduces the gntdev device that allows usermode
to map granted pages; gntdev is used by qemu to implement Xen backends
in userspace.
Considering that granted pages still have the same entry in the m2p
as before being granted, the following patches also add a mechanism to
override portions of the m2p so that we can find out the pfn
corresponding to the mfn of a granted page in the "receiver" domain.
This is needed so that pte_page() works correctly on these pages. Once
this requirement is satisfied, __get_user_pages_fast() can work on
granted pages, and userspace applications (the Xen block backend
implemented in qemu) can use AIO on them.
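As a rough illustration of the resulting userspace flow (not part of this
series; the device node path and the header install location below are
assumptions based on the driver's "xen/gntdev" misc device name and
include/xen/gntdev.h), a backend registers a grant with
IOCTL_GNTDEV_MAP_GRANT_REF and then mmap()s the returned index:

#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>		/* assumed install path of the ioctl ABI */

/* Map one page granted by 'domid' under grant reference 'ref'. */
static void *map_one_grant(uint32_t domid, uint32_t ref)
{
	int fd = open("/dev/xen/gntdev", O_RDWR);
	struct ioctl_gntdev_map_grant_ref op = {
		.count = 1,
		.refs[0].domid = domid,
		.refs[0].ref   = ref,
	};

	if (fd < 0 || ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op) < 0)
		return NULL;

	/* op.index is the offset to pass to mmap(); the grant is only
	 * actually mapped when the vma is set up.  MAP_SHARED is
	 * required for writable mappings.  Error handling trimmed. */
	return mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, op.index);
}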
The list of patches and the diffstat follow:
Gerd Hoffmann (1):
xen/gntdev: allow usermode to map granted pages
Ian Campbell (2):
xen: define gnttab_set_map_op/unmap_op
xen: gntdev: move use of GNTMAP_contains_pte next to the map_op
Jeremy Fitzhardinge (5):
xen/gntdev: add VM_PFNMAP to vma
xen: move p2m handling to separate file
xen: add m2p override mechanism
xen/gntdev: stop using "token" argument
xen p2m: clear the old pte when adding a page to m2p_override
Stefano Stabellini (3):
xen p2m: transparently change the p2m mappings in the m2p override
xen: introduce gnttab_map_refs and gnttab_unmap_refs
xen gntdev: use gnttab_map_refs and gnttab_unmap_refs
arch/x86/include/asm/xen/page.h | 16 +-
arch/x86/xen/Makefile | 3 +-
arch/x86/xen/mmu.c | 365 ---------------------
arch/x86/xen/p2m.c | 493 ++++++++++++++++++++++++++++
drivers/xen/Kconfig | 7 +
drivers/xen/Makefile | 2 +
drivers/xen/gntdev.c | 672 +++++++++++++++++++++++++++++++++++++++
drivers/xen/grant-table.c | 36 ++
include/xen/gntdev.h | 119 +++++++
include/xen/grant_table.h | 44 +++-
10 files changed, 1387 insertions(+), 370 deletions(-)
Cheers,
Stefano
P.S.
I'll be AFK for a couple of weeks, so don't expect an immediate reply.
From: Ian Campbell <[email protected]>
Impact: hypercall definitions
These functions populate the gnttab data structures used by the
grant-table map and unmap ops; they are used in the backend drivers.
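A hedged sketch of a typical consumer (illustration only, not part of this
patch; the function names are made up): fill in the op with the helper,
issue the grant-table hypercall, and check the per-op status:

static int map_foreign_frame(unsigned long vaddr, grant_ref_t ref,
			     domid_t otherend, grant_handle_t *handle)
{
	struct gnttab_map_grant_ref op;

	/* vaddr is a kernel virtual address reserved for the mapping */
	gnttab_set_map_op(&op, vaddr, GNTMAP_host_map, ref, otherend);

	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
		BUG();			/* the hypercall itself should not fail */
	if (op.status != GNTST_okay)
		return -EINVAL;		/* the grant was not mapped */

	*handle = op.handle;
	return 0;
}

static void unmap_foreign_frame(unsigned long vaddr, grant_handle_t handle)
{
	struct gnttab_unmap_grant_ref op;

	gnttab_set_unmap_op(&op, vaddr, GNTMAP_host_map, handle);
	if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
		BUG();
}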
Originally xen-unstable.hg 9625:c3bb51c443a7
[ Include Stefano's fix for phys_addr_t ]
Signed-off-by: Ian Campbell <[email protected]>
Signed-off-by: Stefano Stabellini <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/xen/grant_table.h | 39 ++++++++++++++++++++++++++++++++++++++-
1 files changed, 38 insertions(+), 1 deletions(-)
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 9a73170..1821aa1 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -37,10 +37,16 @@
#ifndef __ASM_GNTTAB_H__
#define __ASM_GNTTAB_H__
-#include <asm/xen/hypervisor.h>
+#include <asm/page.h>
+
+#include <xen/interface/xen.h>
#include <xen/interface/grant_table.h>
+
+#include <asm/xen/hypervisor.h>
#include <asm/xen/grant_table.h>
+#include <xen/features.h>
+
/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
#define NR_GRANT_FRAMES 4
@@ -107,6 +113,37 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
unsigned long pfn);
+static inline void
+gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
+ uint32_t flags, grant_ref_t ref, domid_t domid)
+{
+ if (flags & GNTMAP_contains_pte)
+ map->host_addr = addr;
+ else if (xen_feature(XENFEAT_auto_translated_physmap))
+ map->host_addr = __pa(addr);
+ else
+ map->host_addr = addr;
+
+ map->flags = flags;
+ map->ref = ref;
+ map->dom = domid;
+}
+
+static inline void
+gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
+ uint32_t flags, grant_handle_t handle)
+{
+ if (flags & GNTMAP_contains_pte)
+ unmap->host_addr = addr;
+ else if (xen_feature(XENFEAT_auto_translated_physmap))
+ unmap->host_addr = __pa(addr);
+ else
+ unmap->host_addr = addr;
+
+ unmap->handle = handle;
+ unmap->dev_bus_addr = 0;
+}
+
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
struct grant_entry **__shared);
--
1.5.6.5
From: Jeremy Fitzhardinge <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/Makefile | 3 +-
arch/x86/xen/mmu.c | 365 -----------------------------------------------
arch/x86/xen/p2m.c | 376 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 378 insertions(+), 366 deletions(-)
create mode 100644 arch/x86/xen/p2m.c
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 7793851..17c565d 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
- grant-table.o suspend.o platform-pci-unplug.o
+ grant-table.o suspend.o platform-pci-unplug.o \
+ p2m.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 44924e5..7575e55 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
-/*
- * Xen leaves the responsibility for maintaining p2m mappings to the
- * guests themselves, but it must also access and update the p2m array
- * during suspend/resume when all the pages are reallocated.
- *
- * The p2m table is logically a flat array, but we implement it as a
- * three-level tree to allow the address space to be sparse.
- *
- * Xen
- * |
- * p2m_top p2m_top_mfn
- * / \ / \
- * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
- * / \ / \ / /
- * p2m p2m p2m p2m p2m p2m p2m ...
- *
- * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
- *
- * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
- * maximum representable pseudo-physical address space is:
- * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
- *
- * P2M_PER_PAGE depends on the architecture, as a mfn is always
- * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
- * 512 and 1024 entries respectively.
- */
-
-unsigned long xen_max_p2m_pfn __read_mostly;
-
-#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-/* Placeholders for holes in the address space */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
-
-static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
-
-RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-
-static inline unsigned p2m_top_index(unsigned long pfn)
-{
- BUG_ON(pfn >= MAX_P2M_PFN);
- return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
-}
-
-static inline unsigned p2m_mid_index(unsigned long pfn)
-{
- return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
-}
-
-static inline unsigned p2m_index(unsigned long pfn)
-{
- return pfn % P2M_PER_PAGE;
-}
-
-static void p2m_top_init(unsigned long ***top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing;
-}
-
-static void p2m_top_mfn_init(unsigned long *top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = virt_to_mfn(p2m_mid_missing_mfn);
-}
-
-static void p2m_top_mfn_p_init(unsigned long **top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing_mfn;
-}
-
-static void p2m_mid_init(unsigned long **mid)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = p2m_missing;
-}
-
-static void p2m_mid_mfn_init(unsigned long *mid)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = virt_to_mfn(p2m_missing);
-}
-
-static void p2m_init(unsigned long *p2m)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- p2m[i] = INVALID_P2M_ENTRY;
-}
-
-/*
- * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
- *
- * This is called both at boot time, and after resuming from suspend:
- * - At boot time we're called very early, and must use extend_brk()
- * to allocate memory.
- *
- * - After resume we're called from within stop_machine, but the mfn
- * tree should alreay be completely allocated.
- */
-void xen_build_mfn_list_list(void)
-{
- unsigned long pfn;
-
- /* Pre-initialize p2m_top_mfn to be completely missing */
- if (p2m_top_mfn == NULL) {
- p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
-
- p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_mfn_p_init(p2m_top_mfn_p);
-
- p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_mfn_init(p2m_top_mfn);
- } else {
- /* Reinitialise, mfn's all change after migration */
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
- }
-
- for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
- unsigned long **mid;
- unsigned long *mid_mfn_p;
-
- mid = p2m_top[topidx];
- mid_mfn_p = p2m_top_mfn_p[topidx];
-
- /* Don't bother allocating any mfn mid levels if
- * they're just missing, just update the stored mfn,
- * since all could have changed over a migrate.
- */
- if (mid == p2m_mid_missing) {
- BUG_ON(mididx);
- BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
- p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
- pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
- continue;
- }
-
- if (mid_mfn_p == p2m_mid_missing_mfn) {
- /*
- * XXX boot-time only! We should never find
- * missing parts of the mfn tree after
- * runtime. extend_brk() will BUG if we call
- * it too late.
- */
- mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
-
- p2m_top_mfn_p[topidx] = mid_mfn_p;
- }
-
- p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
- mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
- }
-}
-
-void xen_setup_mfn_list_list(void)
-{
- BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
-
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn);
- HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
-}
-
-/* Set up p2m_top to point to the domain-builder provided p2m pages */
-void __init xen_build_dynamic_phys_to_machine(void)
-{
- unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
- unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- unsigned long pfn;
-
- xen_max_p2m_pfn = max_pfn;
-
- p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_missing);
-
- p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_missing);
-
- p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_init(p2m_top);
-
- /*
- * The domain builder gives us a pre-constructed p2m array in
- * mfn_list for all the pages initially given to us, so we just
- * need to graft that into our tree structure.
- */
- for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
-
- if (p2m_top[topidx] == p2m_mid_missing) {
- unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
-
- p2m_top[topidx] = mid;
- }
-
- p2m_top[topidx][mididx] = &mfn_list[pfn];
- }
-}
-
-unsigned long get_phys_to_machine(unsigned long pfn)
-{
- unsigned topidx, mididx, idx;
-
- if (unlikely(pfn >= MAX_P2M_PFN))
- return INVALID_P2M_ENTRY;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- return p2m_top[topidx][mididx][idx];
-}
-EXPORT_SYMBOL_GPL(get_phys_to_machine);
-
-static void *alloc_p2m_page(void)
-{
- return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
-}
-
-static void free_p2m_page(void *p)
-{
- free_page((unsigned long)p);
-}
-
-/*
- * Fully allocate the p2m structure for a given pfn. We need to check
- * that both the top and mid levels are allocated, and make sure the
- * parallel mfn tree is kept in sync. We may race with other cpus, so
- * the new pages are installed with cmpxchg; if we lose the race then
- * simply free the page we allocated and use the one that's there.
- */
-static bool alloc_p2m(unsigned long pfn)
-{
- unsigned topidx, mididx;
- unsigned long ***top_p, **mid;
- unsigned long *top_mfn_p, *mid_mfn;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
-
- top_p = &p2m_top[topidx];
- mid = *top_p;
-
- if (mid == p2m_mid_missing) {
- /* Mid level is missing, allocate a new one */
- mid = alloc_p2m_page();
- if (!mid)
- return false;
-
- p2m_mid_init(mid);
-
- if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
- free_p2m_page(mid);
- }
-
- top_mfn_p = &p2m_top_mfn[topidx];
- mid_mfn = p2m_top_mfn_p[topidx];
-
- BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
-
- if (mid_mfn == p2m_mid_missing_mfn) {
- /* Separately check the mid mfn level */
- unsigned long missing_mfn;
- unsigned long mid_mfn_mfn;
-
- mid_mfn = alloc_p2m_page();
- if (!mid_mfn)
- return false;
-
- p2m_mid_mfn_init(mid_mfn);
-
- missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
- mid_mfn_mfn = virt_to_mfn(mid_mfn);
- if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
- free_p2m_page(mid_mfn);
- else
- p2m_top_mfn_p[topidx] = mid_mfn;
- }
-
- if (p2m_top[topidx][mididx] == p2m_missing) {
- /* p2m leaf page is missing */
- unsigned long *p2m;
-
- p2m = alloc_p2m_page();
- if (!p2m)
- return false;
-
- p2m_init(p2m);
-
- if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
- free_p2m_page(p2m);
- else
- mid_mfn[mididx] = virt_to_mfn(p2m);
- }
-
- return true;
-}
-
-/* Try to install p2m mapping; fail if intermediate bits missing */
-bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- unsigned topidx, mididx, idx;
-
- if (unlikely(pfn >= MAX_P2M_PFN)) {
- BUG_ON(mfn != INVALID_P2M_ENTRY);
- return true;
- }
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- if (p2m_top[topidx][mididx] == p2m_missing)
- return mfn == INVALID_P2M_ENTRY;
-
- p2m_top[topidx][mididx][idx] = mfn;
-
- return true;
-}
-
-bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
- BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return true;
- }
-
- if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!alloc_p2m(pfn))
- return false;
-
- if (!__set_phys_to_machine(pfn, mfn))
- return false;
- }
-
- return true;
-}
-
unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 0000000..259ec3b
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,376 @@
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ * Xen
+ * |
+ * p2m_top p2m_top_mfn
+ * / \ / \
+ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
+ * / \ / \ / /
+ * p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <asm/cache.h>
+#include <asm/setup.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+unsigned long xen_max_p2m_pfn __read_mostly;
+
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+
+static inline unsigned p2m_top_index(unsigned long pfn)
+{
+ BUG_ON(pfn >= MAX_P2M_PFN);
+ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
+}
+
+static inline unsigned p2m_index(unsigned long pfn)
+{
+ return pfn % P2M_PER_PAGE;
+}
+
+static void p2m_top_init(unsigned long ***top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ * to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ * tree should already be completely allocated.
+ */
+void xen_build_mfn_list_list(void)
+{
+ unsigned long pfn;
+
+ /* Pre-initialize p2m_top_mfn to be completely missing */
+ if (p2m_top_mfn == NULL) {
+ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+
+ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+ /* Reinitialise, mfn's all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ }
+
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned long **mid;
+ unsigned long *mid_mfn_p;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+
+ /* Don't bother allocating any mfn mid levels if
+ * they're just missing, just update the stored mfn,
+ * since all could have changed over a migrate.
+ */
+ if (mid == p2m_mid_missing) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+ continue;
+ }
+
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ /*
+ * XXX boot-time only! We should never find
+ * missing parts of the mfn tree after
+ * runtime. extend_brk() will BUG if we call
+ * it too late.
+ */
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ }
+
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+ }
+}
+
+void xen_setup_mfn_list_list(void)
+{
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+}
+
+/* Set up p2m_top to point to the domain-builder provided p2m pages */
+void __init xen_build_dynamic_phys_to_machine(void)
+{
+ unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
+ unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+ unsigned long pfn;
+
+ xen_max_p2m_pfn = max_pfn;
+
+ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_missing);
+
+ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_missing);
+
+ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_init(p2m_top);
+
+ /*
+ * The domain builder gives us a pre-constructed p2m array in
+ * mfn_list for all the pages initially given to us, so we just
+ * need to graft that into our tree structure.
+ */
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(mid);
+
+ p2m_top[topidx] = mid;
+ }
+
+ p2m_top[topidx][mididx] = &mfn_list[pfn];
+ }
+}
+
+unsigned long get_phys_to_machine(unsigned long pfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN))
+ return INVALID_P2M_ENTRY;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ return p2m_top[topidx][mididx][idx];
+}
+EXPORT_SYMBOL_GPL(get_phys_to_machine);
+
+static void *alloc_p2m_page(void)
+{
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+static void free_p2m_page(void *p)
+{
+ free_page((unsigned long)p);
+}
+
+/*
+ * Fully allocate the p2m structure for a given pfn. We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync. We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+ unsigned topidx, mididx;
+ unsigned long ***top_p, **mid;
+ unsigned long *top_mfn_p, *mid_mfn;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+
+ top_p = &p2m_top[topidx];
+ mid = *top_p;
+
+ if (mid == p2m_mid_missing) {
+ /* Mid level is missing, allocate a new one */
+ mid = alloc_p2m_page();
+ if (!mid)
+ return false;
+
+ p2m_mid_init(mid);
+
+ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+ free_p2m_page(mid);
+ }
+
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = p2m_top_mfn_p[topidx];
+
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
+
+ p2m_mid_mfn_init(mid_mfn);
+
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+ free_p2m_page(mid_mfn);
+ else
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_missing) {
+ /* p2m leaf page is missing */
+ unsigned long *p2m;
+
+ p2m = alloc_p2m_page();
+ if (!p2m)
+ return false;
+
+ p2m_init(p2m);
+
+ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+ free_p2m_page(p2m);
+ else
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ }
+
+ return true;
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN)) {
+ BUG_ON(mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ if (p2m_top[topidx][mididx] == p2m_missing)
+ return mfn == INVALID_P2M_ENTRY;
+
+ p2m_top[topidx][mididx][idx] = mfn;
+
+ return true;
+}
+
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (!alloc_p2m(pfn))
+ return false;
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ return false;
+ }
+
+ return true;
+}
--
1.5.6.5
From: Jeremy Fitzhardinge <[email protected]>
When adding a page to m2p_override we change the p2m of the page, so we
also need to clear the old pte of the kernel linear mapping, which no
longer corresponds.
When we remove the page from m2p_override we restore the original p2m of
the page and also restore the old pte of the kernel linear mapping.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Signed-off-by: Stefano Stabellini <[email protected]>
---
arch/x86/xen/p2m.c | 35 ++++++++++++++++++++++++++++++++++-
1 files changed, 34 insertions(+), 1 deletions(-)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 7dde8e4..ca6552d 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/list.h>
#include <linux/hash.h>
+#include <linux/sched.h>
#include <asm/cache.h>
#include <asm/setup.h>
@@ -412,6 +413,22 @@ void m2p_add_override(unsigned long mfn, struct page *page)
page->index = pfn_to_mfn(pfn);
__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+ if (!PageHighMem(page)) {
+ unsigned long address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ unsigned level;
+ pte_t *ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_add_override: pfn %lx not mapped", pfn)) {
+ __free_page(page);
+
+ return;
+ }
+
+ /* Just zap old mapping for now */
+ pte_clear(&init_mm, address, ptep);
+ }
+
spin_lock_irqsave(&m2p_override_lock, flags);
list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
spin_unlock_irqrestore(&m2p_override_lock, flags);
@@ -420,10 +437,26 @@ void m2p_add_override(unsigned long mfn, struct page *page)
void m2p_remove_override(struct page *page)
{
unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
spin_lock_irqsave(&m2p_override_lock, flags);
list_del(&page->lru);
spin_unlock_irqrestore(&m2p_override_lock, flags);
- __set_phys_to_machine(page_to_pfn(page), page->index);
+ __set_phys_to_machine(pfn, page->index);
+
+ if (!PageHighMem(page)) {
+ unsigned long address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ unsigned level;
+ pte_t *ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_remove_override: pfn %lx not mapped", pfn))
+ return;
+
+ set_pte_at(&init_mm, address, ptep,
+ pfn_pte(pfn, PAGE_KERNEL));
+ /* No tlb flush necessary because the caller already
+ * left the pte unmapped. */
+ }
}
struct page *m2p_find_override(unsigned long mfn)
--
1.5.6.5
From: Ian Campbell <[email protected]>
This flag controls the meaning of gnttab_map_grant_ref.host_addr and
specifies that the field contains a reference to the pte entry to be
used to perform the mapping. Therefore move the use of this flag to
the point at which we actually use a reference to the pte instead of
something else; splitting up the usage of the flag in this way is
confusing and potentially error-prone.
The other flags are all properties of the mapping itself, as opposed to
properties of the hypercall arguments, and therefore it makes sense to
continue to pass them around in map->flags.
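For illustration only (not part of this patch), the two meanings of
host_addr after this change look like this, using the helper introduced
earlier in the series:

static void fill_example_ops(struct gnttab_map_grant_ref *ops,
			     phys_addr_t pte_maddr, unsigned long vaddr,
			     grant_ref_t ref, domid_t domid)
{
	/* pte-based mapping: host_addr is the machine address of the pte
	 * slot, and GNTMAP_contains_pte is supplied right where that
	 * address is computed (as find_grant_ptes() now does). */
	gnttab_set_map_op(&ops[0], pte_maddr,
			  GNTMAP_contains_pte | GNTMAP_host_map, ref, domid);

	/* ordinary host mapping: host_addr is the virtual address at
	 * which the foreign frame should be mapped. */
	gnttab_set_map_op(&ops[1], vaddr, GNTMAP_host_map, ref, domid);
}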
Signed-off-by: Ian Campbell <[email protected]>
Cc: Jeremy Fitzhardinge <[email protected]>
Cc: Stefano Stabellini <[email protected]>
Cc: Derek G. Murray <[email protected]>
Cc: Gerd Hoffmann <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
drivers/xen/gntdev.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index cf61c7d..b916d6b 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -205,10 +205,12 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void
BUG_ON(pgnr >= map->count);
pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
pte_maddr += (unsigned long)pte & ~PAGE_MASK;
- gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags,
+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
+ GNTMAP_contains_pte | map->flags,
map->grants[pgnr].ref,
map->grants[pgnr].domid);
- gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags,
+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr,
+ GNTMAP_contains_pte | map->flags,
0 /* handle */);
return 0;
}
@@ -579,7 +581,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
vma->vm_private_data = map;
map->vma = vma;
- map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte;
+ map->flags = GNTMAP_host_map | GNTMAP_application_map;
if (!(vma->vm_flags & VM_WRITE))
map->flags |= GNTMAP_readonly;
--
1.5.6.5
From: Stefano Stabellini <[email protected]>
Use gnttab_map_refs and gnttab_unmap_refs to map and unmap the grant
ref, so that we can have a corresponding struct page.
Signed-off-by: Stefano Stabellini <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
drivers/xen/gntdev.c | 37 ++++++++++++++++++++++++++++++-------
1 files changed, 30 insertions(+), 7 deletions(-)
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index fc5e420..35de6bb 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -68,6 +68,7 @@ struct grant_map {
struct ioctl_gntdev_grant_ref *grants;
struct gnttab_map_grant_ref *map_ops;
struct gnttab_unmap_grant_ref *unmap_ops;
+ struct page **pages;
};
/* ------------------------------------------------------------------ */
@@ -88,6 +89,7 @@ static void gntdev_print_maps(struct gntdev_priv *priv,
static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
{
struct grant_map *add;
+ int i;
add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
if (NULL == add)
@@ -96,11 +98,19 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
- if (NULL == add->grants ||
- NULL == add->map_ops ||
- NULL == add->unmap_ops)
+ add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL);
+ if (NULL == add->grants ||
+ NULL == add->map_ops ||
+ NULL == add->unmap_ops ||
+ NULL == add->pages)
goto err;
+ for (i = 0; i < count; i++) {
+ add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (add->pages[i] == NULL)
+ goto err;
+ }
+
add->index = 0;
add->count = count;
add->priv = priv;
@@ -111,6 +121,12 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
return add;
err:
+ if (add->pages)
+ for (i = 0; i < count; i++) {
+ if (add->pages[i])
+ __free_page(add->pages[i]);
+ }
+ kfree(add->pages);
kfree(add->grants);
kfree(add->map_ops);
kfree(add->unmap_ops);
@@ -186,8 +202,17 @@ static int gntdev_del_map(struct grant_map *map)
static void gntdev_free_map(struct grant_map *map)
{
+ unsigned i;
+
if (!map)
return;
+
+ if (map->pages)
+ for (i = 0; i < map->count; i++) {
+ if (map->pages[i])
+ __free_page(map->pages[i]);
+ }
+ kfree(map->pages);
kfree(map->grants);
kfree(map->map_ops);
kfree(map->unmap_ops);
@@ -221,8 +246,7 @@ static int map_grant_pages(struct grant_map *map)
if (debug)
printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count);
- err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
- map->map_ops, map->count);
+ err = gnttab_map_refs(map->map_ops, map->pages, map->count);
if (WARN_ON(err))
return err;
@@ -241,8 +265,7 @@ static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
if (debug)
printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__,
map->index, map->count, offset, pages);
- err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
- map->unmap_ops + offset, pages);
+ err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages);
if (WARN_ON(err))
return err;
--
1.5.6.5
From: Jeremy Fitzhardinge <[email protected]>
The "token" argument is the struct page of the L1 pte page. But we can
get its mfn simply by doing an arbitrary_virt_to_machine() on the pte
anyway (which is the safe, conservative choice; since we no longer allow
HIGHPTE pages, we would never expect to be operating on a mapped pte
page).
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
drivers/xen/gntdev.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index b916d6b..fc5e420 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -203,8 +203,8 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void
u64 pte_maddr;
BUG_ON(pgnr >= map->count);
- pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
- pte_maddr += (unsigned long)pte & ~PAGE_MASK;
+ pte_maddr = arbitrary_virt_to_machine(pte).maddr;
+
gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr,
GNTMAP_contains_pte | map->flags,
map->grants[pgnr].ref,
--
1.5.6.5
From: Stefano Stabellini <[email protected]>
gnttab_map_refs maps some grant refs and uses the new m2p override to
set a proper m2p mapping for the granted pages.
gnttab_unmap_refs unmaps the granted refs and removes the mappings from
the m2p override.
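For illustration (not part of this patch), a caller that has filled
map_ops[] with gnttab_set_map_op() and allocated the backing pages[] would
drive the new helpers roughly as follows; this mirrors what the gntdev
conversion elsewhere in this series does in map_grant_pages():

static int map_refs_example(struct gnttab_map_grant_ref *map_ops,
			    struct gnttab_unmap_grant_ref *unmap_ops,
			    struct page **pages, int count)
{
	int i, err;

	err = gnttab_map_refs(map_ops, pages, count);
	if (err)
		return err;

	for (i = 0; i < count; i++) {
		if (map_ops[i].status)		/* per-ref failure from Xen */
			err = -EINVAL;
		/* remember the handle so gnttab_unmap_refs() can undo it */
		unmap_ops[i].handle = map_ops[i].handle;
	}
	return err;
}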
Signed-off-by: Stefano Stabellini <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
drivers/xen/grant-table.c | 36 ++++++++++++++++++++++++++++++++++++
include/xen/grant_table.h | 5 +++++
2 files changed, 41 insertions(+), 0 deletions(-)
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 6c45318..a5cf820 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -447,6 +447,42 @@ unsigned int gnttab_max_grant_frames(void)
}
EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret;
+ pte_t val;
+ pte_t *pte;
+ unsigned long pfn, mfn;
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count);
+
+ for (i = 0; i < count; i++) {
+ pfn = mfn_to_pfn(map_ops[i].host_addr >> PAGE_SHIFT);
+ pte = (pte_t *) __va((pfn << PAGE_SHIFT) +
+ (map_ops[i].host_addr & ~PAGE_MASK));
+ val = *pte;
+ mfn = (native_pte_val(val) & PTE_PFN_MASK) >> PAGE_SHIFT;
+ m2p_add_override(mfn, pages[i]);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_map_refs);
+
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct page **pages, unsigned int count)
+{
+ int i, ret;
+
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count);
+ for (i = 0; i < count; i++)
+ m2p_remove_override(pages[i]);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gnttab_unmap_refs);
+
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
struct gnttab_setup_table setup;
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 1821aa1..b1fab6b 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -155,4 +155,9 @@ unsigned int gnttab_max_grant_frames(void);
#define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
+int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
+ struct page **pages, unsigned int count);
+int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct page **pages, unsigned int count);
+
#endif /* __ASM_GNTTAB_H__ */
--
1.5.6.5
From: Stefano Stabellini <[email protected]>
In m2p_add_override, store the page's original mfn in page->index and
then change the p2m mapping, setting the new entry to FOREIGN_FRAME(mfn).
In m2p_remove_override, restore the original mapping.
Signed-off-by: Stefano Stabellini <[email protected]>
---
arch/x86/xen/p2m.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 8db19d5..7dde8e4 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -407,8 +407,11 @@ static unsigned long mfn_hash(unsigned long mfn)
void m2p_add_override(unsigned long mfn, struct page *page)
{
unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
page->private = mfn;
+ page->index = pfn_to_mfn(pfn);
+ __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
spin_lock_irqsave(&m2p_override_lock, flags);
list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
spin_unlock_irqrestore(&m2p_override_lock, flags);
@@ -420,6 +423,7 @@ void m2p_remove_override(struct page *page)
spin_lock_irqsave(&m2p_override_lock, flags);
list_del(&page->lru);
spin_unlock_irqrestore(&m2p_override_lock, flags);
+ __set_phys_to_machine(page_to_pfn(page), page->index);
}
struct page *m2p_find_override(unsigned long mfn)
--
1.5.6.5
From: Jeremy Fitzhardinge <[email protected]>
These pages are from other domains, so they don't have any local PFN.
VM_PFNMAP is the closest concept Linux has to this.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
drivers/xen/gntdev.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 45898d4..cf61c7d 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -574,6 +574,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
vma->vm_flags |= VM_RESERVED;
vma->vm_flags |= VM_DONTCOPY;
vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_flags |= VM_PFNMAP;
vma->vm_private_data = map;
map->vma = vma;
--
1.5.6.5
From: Gerd Hoffmann <[email protected]>
The gntdev driver allows usermode to map granted pages from other
domains. This is typically used to implement a Xen backend driver
in user mode.
Signed-off-by: Gerd Hoffmann <[email protected]>
Signed-off-by: Stefano Stabellini <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
drivers/xen/Kconfig | 7 +
drivers/xen/Makefile | 2 +
drivers/xen/gntdev.c | 646 ++++++++++++++++++++++++++++++++++++++++++++++++++
include/xen/gntdev.h | 119 +++++++++
4 files changed, 774 insertions(+), 0 deletions(-)
create mode 100644 drivers/xen/gntdev.c
create mode 100644 include/xen/gntdev.h
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 6e6180c..0c6d2a1 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -62,6 +62,13 @@ config XEN_SYS_HYPERVISOR
virtual environment, /sys/hypervisor will still be present,
but will have no xen contents.
+config XEN_GNTDEV
+ tristate "userspace grant access device driver"
+ depends on XEN
+ select MMU_NOTIFIER
+ help
+ Allows userspace processes to use grants.
+
config XEN_PLATFORM_PCI
tristate "xen platform pci device driver"
depends on XEN_PVHVM
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 533a199..674fdb5 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
obj-$(CONFIG_XEN_BALLOON) += balloon.o
obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
+obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
obj-$(CONFIG_XENFS) += xenfs/
obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o
@@ -16,4 +17,5 @@ obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
obj-$(CONFIG_XEN_DOM0) += pci.o
xen-evtchn-y := evtchn.o
+xen-gntdev-y := gntdev.o
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
new file mode 100644
index 0000000..45898d4
--- /dev/null
+++ b/drivers/xen/gntdev.c
@@ -0,0 +1,646 @@
+/******************************************************************************
+ * gntdev.c
+ *
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ * (c) 2009 Gerd Hoffmann <[email protected]>
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mmu_notifier.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+
+#include <xen/xen.h>
+#include <xen/grant_table.h>
+#include <xen/gntdev.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Derek G. Murray <[email protected]>, "
+ "Gerd Hoffmann <[email protected]>");
+MODULE_DESCRIPTION("User-space granted page access driver");
+
+static int debug = 0;
+module_param(debug, int, 0644);
+static int limit = 1024;
+module_param(limit, int, 0644);
+
+struct gntdev_priv {
+ struct list_head maps;
+ uint32_t used;
+ uint32_t limit;
+ spinlock_t lock;
+ struct mm_struct *mm;
+ struct mmu_notifier mn;
+};
+
+struct grant_map {
+ struct list_head next;
+ struct gntdev_priv *priv;
+ struct vm_area_struct *vma;
+ int index;
+ int count;
+ int flags;
+ int is_mapped;
+ struct ioctl_gntdev_grant_ref *grants;
+ struct gnttab_map_grant_ref *map_ops;
+ struct gnttab_unmap_grant_ref *unmap_ops;
+};
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_print_maps(struct gntdev_priv *priv,
+ char *text, int text_index)
+{
+ struct grant_map *map;
+
+ printk("%s: maps list (priv %p, usage %d/%d)\n",
+ __FUNCTION__, priv, priv->used, priv->limit);
+ list_for_each_entry(map, &priv->maps, next)
+ printk(" index %2d, count %2d %s\n",
+ map->index, map->count,
+ map->index == text_index && text ? text : "");
+}
+
+static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
+{
+ struct grant_map *add;
+
+ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
+ if (NULL == add)
+ return NULL;
+
+ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
+ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
+ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
+ if (NULL == add->grants ||
+ NULL == add->map_ops ||
+ NULL == add->unmap_ops)
+ goto err;
+
+ add->index = 0;
+ add->count = count;
+ add->priv = priv;
+
+ if (add->count + priv->used > priv->limit)
+ goto err;
+
+ return add;
+
+err:
+ kfree(add->grants);
+ kfree(add->map_ops);
+ kfree(add->unmap_ops);
+ kfree(add);
+ return NULL;
+}
+
+static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (add->index + add->count < map->index) {
+ list_add_tail(&add->next, &map->next);
+ goto done;
+ }
+ add->index = map->index + map->count;
+ }
+ list_add_tail(&add->next, &priv->maps);
+
+done:
+ priv->used += add->count;
+ if (debug)
+ gntdev_print_maps(priv, "[new]", add->index);
+}
+
+static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index,
+ int count)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (map->index != index)
+ continue;
+ if (map->count != count)
+ continue;
+ return map;
+ }
+ return NULL;
+}
+
+static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
+ unsigned long vaddr)
+{
+ struct grant_map *map;
+
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (vaddr < map->vma->vm_start)
+ continue;
+ if (vaddr >= map->vma->vm_end)
+ continue;
+ return map;
+ }
+ return NULL;
+}
+
+static int gntdev_del_map(struct grant_map *map)
+{
+ int i;
+
+ if (map->vma)
+ return -EBUSY;
+ for (i = 0; i < map->count; i++)
+ if (map->unmap_ops[i].handle)
+ return -EBUSY;
+
+ map->priv->used -= map->count;
+ list_del(&map->next);
+ return 0;
+}
+
+static void gntdev_free_map(struct grant_map *map)
+{
+ if (!map)
+ return;
+ kfree(map->grants);
+ kfree(map->map_ops);
+ kfree(map->unmap_ops);
+ kfree(map);
+}
+
+/* ------------------------------------------------------------------ */
+
+static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data)
+{
+ struct grant_map *map = data;
+ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
+ u64 pte_maddr;
+
+ BUG_ON(pgnr >= map->count);
+ pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
+ pte_maddr += (unsigned long)pte & ~PAGE_MASK;
+ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags,
+ map->grants[pgnr].ref,
+ map->grants[pgnr].domid);
+ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags,
+ 0 /* handle */);
+ return 0;
+}
+
+static int map_grant_pages(struct grant_map *map)
+{
+ int i, err = 0;
+
+ if (debug)
+ printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count);
+ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ map->map_ops, map->count);
+ if (WARN_ON(err))
+ return err;
+
+ for (i = 0; i < map->count; i++) {
+ if (map->map_ops[i].status)
+ err = -EINVAL;
+ map->unmap_ops[i].handle = map->map_ops[i].handle;
+ }
+ return err;
+}
+
+static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
+{
+ int i, err = 0;
+
+ if (debug)
+ printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__,
+ map->index, map->count, offset, pages);
+ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ map->unmap_ops + offset, pages);
+ if (WARN_ON(err))
+ return err;
+
+ for (i = 0; i < pages; i++) {
+ if (map->unmap_ops[offset+i].status)
+ err = -EINVAL;
+ map->unmap_ops[offset+i].handle = 0;
+ }
+ return err;
+}
+
+/* ------------------------------------------------------------------ */
+
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+ struct grant_map *map = vma->vm_private_data;
+
+ if (debug)
+ printk("%s\n", __FUNCTION__);
+ map->is_mapped = 0;
+ map->vma = NULL;
+ vma->vm_private_data = NULL;
+}
+
+static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ if (debug)
+ printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n",
+ __FUNCTION__, vmf->virtual_address, vmf->pgoff);
+ vmf->flags = VM_FAULT_ERROR;
+ return 0;
+}
+
+static struct vm_operations_struct gntdev_vmops = {
+ .close = gntdev_vma_close,
+ .fault = gntdev_vma_fault,
+};
+
+/* ------------------------------------------------------------------ */
+
+static void mn_invl_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+ unsigned long mstart, mend;
+ int err;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (!map->is_mapped)
+ continue;
+ if (map->vma->vm_start >= end)
+ continue;
+ if (map->vma->vm_end <= start)
+ continue;
+ mstart = max(start, map->vma->vm_start);
+ mend = min(end, map->vma->vm_end);
+ if (debug)
+ printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
+ __FUNCTION__, map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end,
+ start, end, mstart, mend);
+ err = unmap_grant_pages(map,
+ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
+ (mend - mstart) >> PAGE_SHIFT);
+ WARN_ON(err);
+ }
+ spin_unlock(&priv->lock);
+}
+
+static void mn_invl_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
+}
+
+static void mn_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
+ struct grant_map *map;
+ int err;
+
+ spin_lock(&priv->lock);
+ list_for_each_entry(map, &priv->maps, next) {
+ if (!map->vma)
+ continue;
+ if (debug)
+ printk("%s: map %d+%d (%lx %lx)\n",
+ __FUNCTION__, map->index, map->count,
+ map->vma->vm_start, map->vma->vm_end);
+ err = unmap_grant_pages(map, 0, map->count);
+ WARN_ON(err);
+ }
+ spin_unlock(&priv->lock);
+}
+
+struct mmu_notifier_ops gntdev_mmu_ops = {
+ .release = mn_release,
+ .invalidate_page = mn_invl_page,
+ .invalidate_range_start = mn_invl_range_start,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&priv->maps);
+ spin_lock_init(&priv->lock);
+ priv->limit = limit;
+
+ priv->mm = get_task_mm(current);
+ if (!priv->mm) {
+ kfree(priv);
+ return -ENOMEM;
+ }
+ priv->mn.ops = &gntdev_mmu_ops;
+ mmu_notifier_register(&priv->mn, priv->mm);
+ mmput(priv->mm);
+
+ flip->private_data = priv;
+ if (debug)
+ printk("%s: priv %p\n", __FUNCTION__, priv);
+
+ return 0;
+}
+
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ struct grant_map *map;
+ int err;
+
+ if (debug)
+ printk("%s: priv %p\n", __FUNCTION__, priv);
+
+ spin_lock(&priv->lock);
+ while (!list_empty(&priv->maps)) {
+ map = list_entry(priv->maps.next, struct grant_map, next);
+ err = gntdev_del_map(map);
+ if (WARN_ON(err))
+ gntdev_free_map(map);
+
+ }
+ spin_unlock(&priv->lock);
+
+ mmu_notifier_unregister(&priv->mn, priv->mm);
+ kfree(priv);
+ return 0;
+}
+
+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_map_grant_ref __user *u)
+{
+ struct ioctl_gntdev_map_grant_ref op;
+ struct grant_map *map;
+ int err;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ if (debug)
+ printk("%s: priv %p, add %d\n", __FUNCTION__, priv,
+ op.count);
+ if (unlikely(op.count <= 0))
+ return -EINVAL;
+ if (unlikely(op.count > priv->limit))
+ return -EINVAL;
+
+ err = -ENOMEM;
+ map = gntdev_alloc_map(priv, op.count);
+ if (!map)
+ return err;
+ if (copy_from_user(map->grants, &u->refs,
+ sizeof(map->grants[0]) * op.count) != 0) {
+ gntdev_free_map(map);
+ return err;
+ }
+
+ spin_lock(&priv->lock);
+ gntdev_add_map(priv, map);
+ op.index = map->index << PAGE_SHIFT;
+ spin_unlock(&priv->lock);
+
+ if (copy_to_user(u, &op, sizeof(op)) != 0) {
+ spin_lock(&priv->lock);
+ gntdev_del_map(map);
+ spin_unlock(&priv->lock);
+ gntdev_free_map(map);
+ return err;
+ }
+ return 0;
+}
+
+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
+ struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
+ struct ioctl_gntdev_unmap_grant_ref op;
+ struct grant_map *map;
+ int err = -EINVAL;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ if (debug)
+ printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv,
+ (int)op.index, (int)op.count);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
+ if (map)
+ err = gntdev_del_map(map);
+ spin_unlock(&priv->lock);
+ if (!err)
+ gntdev_free_map(map);
+ return err;
+}
+
+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
+ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
+ struct ioctl_gntdev_get_offset_for_vaddr op;
+ struct grant_map *map;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ if (debug)
+ printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv,
+ (unsigned long)op.vaddr);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_vaddr(priv, op.vaddr);
+ if (map == NULL ||
+ map->vma->vm_start != op.vaddr) {
+ spin_unlock(&priv->lock);
+ return -EINVAL;
+ }
+ op.offset = map->index << PAGE_SHIFT;
+ op.count = map->count;
+ spin_unlock(&priv->lock);
+
+ if (copy_to_user(u, &op, sizeof(op)) != 0)
+ return -EFAULT;
+ return 0;
+}
+
+static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
+ struct ioctl_gntdev_set_max_grants __user *u)
+{
+ struct ioctl_gntdev_set_max_grants op;
+
+ if (copy_from_user(&op, u, sizeof(op)) != 0)
+ return -EFAULT;
+ if (debug)
+ printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, op.count);
+ if (op.count > limit)
+ return -EINVAL;
+
+ spin_lock(&priv->lock);
+ priv->limit = op.count;
+ spin_unlock(&priv->lock);
+ return 0;
+}
+
+static long gntdev_ioctl(struct file *flip,
+ unsigned int cmd, unsigned long arg)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ void __user *ptr = (void __user *)arg;
+
+ switch (cmd) {
+ case IOCTL_GNTDEV_MAP_GRANT_REF:
+ return gntdev_ioctl_map_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
+
+ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
+
+ case IOCTL_GNTDEV_SET_MAX_GRANTS:
+ return gntdev_ioctl_set_max_grants(priv, ptr);
+
+ default:
+ if (debug)
+ printk("%s: priv %p, unknown cmd %x\n",
+ __FUNCTION__, priv, cmd);
+ return -ENOIOCTLCMD;
+ }
+
+ return 0;
+}
+
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
+{
+ struct gntdev_priv *priv = flip->private_data;
+ int index = vma->vm_pgoff;
+ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+ struct grant_map *map;
+ int err = -EINVAL;
+
+ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ if (debug)
+ printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__,
+ index, count, vma->vm_start, vma->vm_pgoff);
+
+ spin_lock(&priv->lock);
+ map = gntdev_find_map_index(priv, index, count);
+ if (!map)
+ goto unlock_out;
+ if (map->vma)
+ goto unlock_out;
+ if (priv->mm != vma->vm_mm) {
+ printk("%s: Huh? Other mm?\n", __FUNCTION__);
+ goto unlock_out;
+ }
+
+ vma->vm_ops = &gntdev_vmops;
+
+ vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_DONTCOPY;
+ vma->vm_flags |= VM_DONTEXPAND;
+
+ vma->vm_private_data = map;
+ map->vma = vma;
+
+ map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte;
+ if (!(vma->vm_flags & VM_WRITE))
+ map->flags |= GNTMAP_readonly;
+
+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+ vma->vm_end - vma->vm_start,
+ find_grant_ptes, map);
+ if (err) {
+ if (debug)
+ printk("%s: find_grant_ptes() failure.\n", __FUNCTION__);
+ goto unlock_out;
+ }
+
+ err = map_grant_pages(map);
+ if (err) {
+ if (debug)
+ printk("%s: map_grant_pages() failure.\n", __FUNCTION__);
+ goto unlock_out;
+ }
+ map->is_mapped = 1;
+
+unlock_out:
+ spin_unlock(&priv->lock);
+ return err;
+}
+
+static const struct file_operations gntdev_fops = {
+ .owner = THIS_MODULE,
+ .open = gntdev_open,
+ .release = gntdev_release,
+ .mmap = gntdev_mmap,
+ .unlocked_ioctl = gntdev_ioctl
+};
+
+static struct miscdevice gntdev_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "xen/gntdev",
+ .fops = &gntdev_fops,
+};
+
+/* ------------------------------------------------------------------ */
+
+static int __init gntdev_init(void)
+{
+ int err;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ err = misc_register(&gntdev_miscdev);
+ if (err != 0) {
+ printk(KERN_ERR "Could not register gntdev device\n");
+ return err;
+ }
+ return 0;
+}
+
+static void __exit gntdev_exit(void)
+{
+ misc_deregister(&gntdev_miscdev);
+}
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* ------------------------------------------------------------------ */
diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
new file mode 100644
index 0000000..8bd1467
--- /dev/null
+++ b/include/xen/gntdev.h
@@ -0,0 +1,119 @@
+/******************************************************************************
+ * gntdev.h
+ *
+ * Interface to /dev/xen/gntdev.
+ *
+ * Copyright (c) 2007, D G Murray
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_GNTDEV_H__
+#define __LINUX_PUBLIC_GNTDEV_H__
+
+struct ioctl_gntdev_grant_ref {
+ /* The domain ID of the grant to be mapped. */
+ uint32_t domid;
+ /* The grant reference of the grant to be mapped. */
+ uint32_t ref;
+};
+
+/*
+ * Inserts the grant references into the mapping table of an instance
+ * of gntdev. N.B. This does not perform the mapping, which is deferred
+ * until mmap() is called with @index as the offset.
+ */
+#define IOCTL_GNTDEV_MAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
+struct ioctl_gntdev_map_grant_ref {
+ /* IN parameters */
+ /* The number of grants to be mapped. */
+ uint32_t count;
+ uint32_t pad;
+ /* OUT parameters */
+ /* The offset to be used on a subsequent call to mmap(). */
+ uint64_t index;
+ /* Variable IN parameter. */
+ /* Array of grant references, of size @count. */
+ struct ioctl_gntdev_grant_ref refs[1];
+};
+
+/*
+ * Removes the grant references from the mapping table of an instance of
+ * gntdev. N.B. munmap() must be called on the relevant virtual address(es)
+ * before this ioctl is called, or an error will result.
+ */
+#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
+struct ioctl_gntdev_unmap_grant_ref {
+ /* IN parameters */
+	/* The offset returned by the corresponding map operation. */
+ uint64_t index;
+ /* The number of pages to be unmapped. */
+ uint32_t count;
+ uint32_t pad;
+};
+
+/*
+ * Returns the offset in the driver's address space that corresponds
+ * to @vaddr. This can be used to perform a munmap(), followed by an
+ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
+ * the caller. The number of pages that were allocated at the same time as
+ * @vaddr is returned in @count.
+ *
+ * N.B. Where more than one page has been mapped into a contiguous range, the
+ * supplied @vaddr must correspond to the start of the range; otherwise
+ * an error will result. It is only possible to munmap() the entire
+ * contiguously-allocated range at once, and not any subrange thereof.
+ */
+#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
+_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
+struct ioctl_gntdev_get_offset_for_vaddr {
+ /* IN parameters */
+ /* The virtual address of the first mapped page in a range. */
+ uint64_t vaddr;
+ /* OUT parameters */
+ /* The offset that was used in the initial mmap() operation. */
+ uint64_t offset;
+ /* The number of pages mapped in the VM area that begins at @vaddr. */
+ uint32_t count;
+ uint32_t pad;
+};
+
+/*
+ * Sets the maximum number of grants that may be mapped at once by this gntdev
+ * instance.
+ *
+ * N.B. This must be called before any other ioctl is performed on the device.
+ */
+#define IOCTL_GNTDEV_SET_MAX_GRANTS \
+_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
+struct ioctl_gntdev_set_max_grants {
+ /* IN parameter */
+ /* The maximum number of grants that may be mapped at once. */
+ uint32_t count;
+};
+
+#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
--
1.5.6.5
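Not part of the patch, but for reviewers: below is a minimal, untested
userspace sketch of the calling sequence the gntdev.h comments describe:
map a grant ref, mmap() the returned index, then munmap() and unmap the
ref. The domid/ref values are placeholders, and the <xen/gntdev.h>
include assumes the header has been installed somewhere the compiler can
find it.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <xen/gntdev.h>

int main(void)
{
	struct ioctl_gntdev_map_grant_ref map = { .count = 1 };
	struct ioctl_gntdev_unmap_grant_ref unmap;
	long psz = sysconf(_SC_PAGESIZE);
	void *addr;
	int fd;

	fd = open("/dev/xen/gntdev", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	map.refs[0].domid = 1;		/* placeholder remote domain   */
	map.refs[0].ref   = 8;		/* placeholder grant reference */
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) < 0) {
		perror("IOCTL_GNTDEV_MAP_GRANT_REF");
		return 1;
	}

	/* The grant is actually mapped here, using the returned index
	 * as the mmap() offset. */
	addr = mmap(NULL, psz, PROT_READ | PROT_WRITE, MAP_SHARED,
		    fd, map.index);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* ... use the shared page ... */

	munmap(addr, psz);		/* must precede the unmap ioctl */
	unmap.index = map.index;
	unmap.count = 1;
	ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap);
	close(fd);
	return 0;
}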
From: Jeremy Fitzhardinge <[email protected]>
Add a simple hashtable-based mechanism to override portions of the
m2p, so that we can find out the pfn corresponding to the mfn of a
granted page. This is needed because the m2p entry for a granted page
still holds the original pfn of the page in the source domain that
granted it.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Signed-off-by: Stefano Stabellini <[email protected]>
---
arch/x86/include/asm/xen/page.h | 16 ++++++-
arch/x86/xen/p2m.c | 80 +++++++++++++++++++++++++++++++++++++++
2 files changed, 93 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 8760cc6..50f0a0f 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -42,6 +42,11 @@ extern unsigned int machine_to_phys_order;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern void m2p_add_override(unsigned long mfn, struct page *page);
+extern void m2p_remove_override(struct page *page);
+extern struct page *m2p_find_override(unsigned long mfn);
+extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
+
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
unsigned long mfn;
@@ -72,9 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
- if (unlikely((mfn >> machine_to_phys_order) != 0))
- return ~0;
-
pfn = 0;
/*
* The array access can fail (e.g., device space beyond end of RAM).
@@ -83,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
*/
__get_user(pfn, &machine_to_phys_mapping[mfn]);
+ /*
+ * If this appears to be a foreign mfn (because the pfn
+ * doesn't map back to the mfn), then check the local override
+ * table to see if there's a better pfn to use.
+ */
+ if (get_phys_to_machine(pfn) != mfn)
+ pfn = m2p_find_override_pfn(mfn, pfn);
+
return pfn;
}
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 259ec3b..8db19d5 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -27,6 +27,8 @@
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/hash.h>
#include <asm/cache.h>
#include <asm/setup.h>
@@ -37,6 +39,8 @@
#include "xen-ops.h"
+static void __init m2p_override_init(void);
+
unsigned long xen_max_p2m_pfn __read_mostly;
#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
@@ -234,6 +238,8 @@ void __init xen_build_dynamic_phys_to_machine(void)
p2m_top[topidx][mididx] = &mfn_list[pfn];
}
+
+ m2p_override_init();
}
unsigned long get_phys_to_machine(unsigned long pfn)
@@ -374,3 +380,77 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true;
}
+
+#define M2P_OVERRIDE_HASH_SHIFT 10
+#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
+
+static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
+static DEFINE_SPINLOCK(m2p_override_lock);
+
+static void __init m2p_override_init(void)
+{
+ unsigned i;
+
+ m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
+ sizeof(unsigned long));
+
+ for (i = 0; i < M2P_OVERRIDE_HASH; i++)
+ INIT_LIST_HEAD(&m2p_overrides[i]);
+}
+
+static unsigned long mfn_hash(unsigned long mfn)
+{
+ return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
+}
+
+/* Add an MFN override for a particular page */
+void m2p_add_override(unsigned long mfn, struct page *page)
+{
+ unsigned long flags;
+ page->private = mfn;
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+}
+
+void m2p_remove_override(struct page *page)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_del(&page->lru);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+}
+
+struct page *m2p_find_override(unsigned long mfn)
+{
+ unsigned long flags;
+ struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
+ struct page *p, *ret;
+
+ ret = NULL;
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+
+ list_for_each_entry(p, bucket, lru) {
+ if (p->private == mfn) {
+ ret = p;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return ret;
+}
+
+unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
+{
+ struct page *p = m2p_find_override(mfn);
+ unsigned long ret = pfn;
+
+ if (p)
+ ret = page_to_pfn(p);
+
+ return ret;
+}
--
1.5.6.5
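As a reviewer's note (not part of the patch), here is a minimal,
hypothetical sketch of the calling pattern this interface is meant for:
a driver that maps a foreign grant into a local page brackets the
mapping with the override calls, so that mfn_to_pfn() on the foreign mfn
resolves to the local pfn. The example_* helpers and the elided
grant-table hypercalls are illustrative only; the real plumbing
presumably comes with the gnttab_map_refs/gnttab_unmap_refs patches
later in the series.

#include <linux/mm.h>
#include <asm/xen/page.h>

static void example_map_foreign(struct page *page, unsigned long foreign_mfn)
{
	/* ... issue GNTTABOP_map_grant_ref so that 'page' now maps
	 * 'foreign_mfn' ... */

	/* Record the mfn->page relation so that mfn_to_pfn() on the
	 * foreign mfn resolves to this local page (and hence pte_page()
	 * and get_user_pages_fast() work on the mapping). */
	m2p_add_override(foreign_mfn, page);
}

static void example_unmap_foreign(struct page *page)
{
	/* ... issue GNTTABOP_unmap_grant_ref ... */

	/* Drop the override once the grant mapping is gone. */
	m2p_remove_override(page);
}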
On Wed, 15 Dec 2010, Stefano Stabellini wrote:
> Hi all,
> this patch series introduces the gntdev device that allows usermode
> to map granted pages; gntdev is used by qemu to implement Xen backends
> in userspace.
>
> Considering that granted pages still have the same entry in the m2p
> as before being granted, the following patches also add a mechanism to
> override portions of the m2p so that we can find out the pfn
> corresponding to the mfn of a granted page in the "receiver" domain.
> This is needed so that we can have pte_page work correctly for these
> pages. Once this requirement is satisfied, __get_user_pages_fast can
> work on granted pages and userspace applications (the xen block backend
> implemented in qemu) can use AIO on them.
>
>
I forgot to add that a branch with this series based on 2.6.37-rc5 is
available here:
git://xenbits.xen.org/people/sstabellini/linux-pvhvm.git 2.6.37-rc5-gntdev-2
On 12/15/2010 05:40 AM, [email protected] wrote:
> From: Stefano Stabellini <[email protected]>
>
> In m2p_add_override store the original mfn into page->index and then
> change the p2m mapping, setting mfns as FOREIGN_FRAME.
>
> In m2p_remove_override restore the original mapping.
>
> Signed-off-by: Stefano Stabellini <[email protected]>
> ---
> arch/x86/xen/p2m.c | 4 ++++
> 1 files changed, 4 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
> index 8db19d5..7dde8e4 100644
> --- a/arch/x86/xen/p2m.c
> +++ b/arch/x86/xen/p2m.c
> @@ -407,8 +407,11 @@ static unsigned long mfn_hash(unsigned long mfn)
> void m2p_add_override(unsigned long mfn, struct page *page)
> {
> unsigned long flags;
> + unsigned long pfn = page_to_pfn(page);
> page->private = mfn;
> + page->index = pfn_to_mfn(pfn);
Is there any risk that a page being used for a granted mfn will end up
getting xen_create_contiguous_region() applied to it via the DMA API?
That would be messy...
J
>
> + __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
> spin_lock_irqsave(&m2p_override_lock, flags);
> list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
> spin_unlock_irqrestore(&m2p_override_lock, flags);
> @@ -420,6 +423,7 @@ void m2p_remove_override(struct page *page)
> spin_lock_irqsave(&m2p_override_lock, flags);
> list_del(&page->lru);
> spin_unlock_irqrestore(&m2p_override_lock, flags);
> + __set_phys_to_machine(page_to_pfn(page), page->index);
> }
>
> struct page *m2p_find_override(unsigned long mfn)
On Wed, 15 Dec 2010, Jeremy Fitzhardinge wrote:
> On 12/15/2010 05:40 AM, [email protected] wrote:
> > From: Stefano Stabellini <[email protected]>
> >
> > In m2p_add_override store the original mfn into page->index and then
> > change the p2m mapping, setting mfns as FOREIGN_FRAME.
> >
> > In m2p_remove_override restore the original mapping.
> >
> > Signed-off-by: Stefano Stabellini <[email protected]>
> > ---
> > arch/x86/xen/p2m.c | 4 ++++
> > 1 files changed, 4 insertions(+), 0 deletions(-)
> >
> > diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
> > index 8db19d5..7dde8e4 100644
> > --- a/arch/x86/xen/p2m.c
> > +++ b/arch/x86/xen/p2m.c
> > @@ -407,8 +407,11 @@ static unsigned long mfn_hash(unsigned long mfn)
> > void m2p_add_override(unsigned long mfn, struct page *page)
> > {
> > unsigned long flags;
> > + unsigned long pfn = page_to_pfn(page);
> > page->private = mfn;
> > + page->index = pfn_to_mfn(pfn);
>
> Is there any risk that a page being used for a granted mfn will end up
> getting xen_create_contiguous_region() applied to it via the DMA API?
> That would be messy...
I don't think so because AFAIK xen_create_contiguous_region is only
called:
- by xen_swiotlb_fixup on xen_io_tlb_start (+ offset) that has been
allocated using alloc_bootmem;
- by xen_swiotlb_alloc_coherent on memory allocated using
__get_free_pages.
If in the future xen_create_contiguous_region is called on other
memory ranges then maybe, but at the moment there are no problems.