2013-05-24 17:24:32

by Alex Williamson

Subject: [PATCH 0/2] vfio: type1 iommu hugepage support

This series lets the vfio type1 iommu backend take advantage of iommu
large page support. See patch 2/2 for the details. This has been
tested on both amd_iommu and intel_iommu, but only my AMD system has
large page support. I'd appreciate any testing and feedback on other
systems, particularly vt-d systems supporting large pages. Mapping
efficiency should be improved a bit without iommu hugepages, but I
hope that it's much more noticeable with huge pages, especially for
very large QEMU guests.

This change includes a clarification to the mapping expectations for
users of the type1 iommu, but is compatible with known users and works
with existing QEMU userspace supporting vfio. Thanks,

Alex

---

Alex Williamson (2):
vfio: Convert type1 iommu to use rbtree
vfio: hugepage support for vfio_iommu_type1


drivers/vfio/vfio_iommu_type1.c | 607 ++++++++++++++++++++++++---------------
include/uapi/linux/vfio.h | 8 -
2 files changed, 387 insertions(+), 228 deletions(-)


2013-05-24 17:24:38

by Alex Williamson

Subject: [PATCH 1/2] vfio: Convert type1 iommu to use rbtree

We need to keep track of all the DMA mappings of an iommu container so
that they can be automatically unmapped when the user releases the file
descriptor. We currently do this using a simple list, where we merge
entries with contiguous iovas and virtual addresses. Using a tree for
this is a bit more efficient and allows us to use common code instead
of inventing our own.

Signed-off-by: Alex Williamson <[email protected]>
---
drivers/vfio/vfio_iommu_type1.c | 190 ++++++++++++++++++++-------------------
1 file changed, 96 insertions(+), 94 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6f3fbc4..0e863b3 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -31,6 +31,7 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/pci.h> /* pci_bus_type */
+#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
@@ -50,13 +51,13 @@ MODULE_PARM_DESC(allow_unsafe_interrupts,
struct vfio_iommu {
struct iommu_domain *domain;
struct mutex lock;
- struct list_head dma_list;
+ struct rb_root dma_list;
struct list_head group_list;
bool cache;
};

struct vfio_dma {
- struct list_head next;
+ struct rb_node node;
dma_addr_t iova; /* Device address */
unsigned long vaddr; /* Process virtual addr */
long npage; /* Number of pages */
@@ -75,6 +76,49 @@ struct vfio_group {

#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)

+static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
+ dma_addr_t start, size_t size)
+{
+ struct rb_node *node = iommu->dma_list.rb_node;
+
+ while (node) {
+ struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+ if (start + size <= dma->iova)
+ node = node->rb_left;
+ else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage))
+ node = node->rb_right;
+ else
+ return dma;
+ }
+
+ return NULL;
+}
+
+static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
+{
+ struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
+ struct vfio_dma *dma;
+
+ while (*link) {
+ parent = *link;
+ dma = rb_entry(parent, struct vfio_dma, node);
+
+ if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova)
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, &iommu->dma_list);
+}
+
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
+{
+ rb_erase(&old->node, &iommu->dma_list);
+}
+
struct vwork {
struct mm_struct *mm;
long npage;
@@ -289,31 +333,8 @@ static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
return 0;
}

-static inline bool ranges_overlap(dma_addr_t start1, size_t size1,
- dma_addr_t start2, size_t size2)
-{
- if (start1 < start2)
- return (start2 - start1 < size1);
- else if (start2 < start1)
- return (start1 - start2 < size2);
- return (size1 > 0 && size2 > 0);
-}
-
-static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
- dma_addr_t start, size_t size)
-{
- struct vfio_dma *dma;
-
- list_for_each_entry(dma, &iommu->dma_list, next) {
- if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
- start, size))
- return dma;
- }
- return NULL;
-}
-
-static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
- size_t size, struct vfio_dma *dma)
+static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
+ size_t size, struct vfio_dma *dma)
{
struct vfio_dma *split;
long npage_lo, npage_hi;
@@ -322,10 +343,9 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
if (start <= dma->iova &&
start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
- list_del(&dma->next);
- npage_lo = dma->npage;
+ vfio_remove_dma(iommu, dma);
kfree(dma);
- return npage_lo;
+ return 0;
}

/* Overlap low address of existing range */
@@ -339,7 +359,7 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
dma->iova += overlap;
dma->vaddr += overlap;
dma->npage -= npage_lo;
- return npage_lo;
+ return 0;
}

/* Overlap high address of existing range */
@@ -351,7 +371,7 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,

vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
dma->npage -= npage_hi;
- return npage_hi;
+ return 0;
}

/* Split existing */
@@ -370,16 +390,16 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
split->iova = start + size;
split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
split->prot = dma->prot;
- list_add(&split->next, &iommu->dma_list);
- return size >> PAGE_SHIFT;
+ vfio_insert_dma(iommu, split);
+ return 0;
}

static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_unmap *unmap)
{
- long ret = 0, npage = unmap->size >> PAGE_SHIFT;
- struct vfio_dma *dma, *tmp;
uint64_t mask;
+ struct vfio_dma *dma;
+ int ret = 0;

mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;

@@ -393,25 +413,19 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,

mutex_lock(&iommu->lock);

- list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) {
- if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage),
- unmap->iova, unmap->size)) {
- ret = vfio_remove_dma_overlap(iommu, unmap->iova,
- unmap->size, dma);
- if (ret > 0)
- npage -= ret;
- if (ret < 0 || npage == 0)
- break;
- }
- }
+ while (!ret && (dma = vfio_find_dma(iommu,
+ unmap->iova, unmap->size)))
+ ret = vfio_remove_dma_overlap(iommu, unmap->iova,
+ unmap->size, dma);
+
mutex_unlock(&iommu->lock);
- return ret > 0 ? 0 : (int)ret;
+ return ret;
}

static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
- struct vfio_dma *dma, *pdma = NULL;
+ struct vfio_dma *dma;
dma_addr_t iova = map->iova;
unsigned long locked, lock_limit, vaddr = map->vaddr;
size_t size = map->size;
@@ -452,6 +466,10 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (!npage)
return -EINVAL;

+ dma = kzalloc(sizeof *dma, GFP_KERNEL);
+ if (!dma)
+ return -ENOMEM;
+
mutex_lock(&iommu->lock);

if (vfio_find_dma(iommu, iova, size)) {
@@ -473,62 +491,45 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (ret)
goto out_lock;

+ dma->npage = npage;
+ dma->iova = iova;
+ dma->vaddr = vaddr;
+ dma->prot = prot;
+
/* Check if we abut a region below - nothing below 0 */
if (iova) {
- dma = vfio_find_dma(iommu, iova - 1, 1);
- if (dma && dma->prot == prot &&
- dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) {
-
- dma->npage += npage;
- iova = dma->iova;
- vaddr = dma->vaddr;
+ struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1);
+ if (tmp && tmp->prot == prot &&
+ tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) {
+ vfio_remove_dma(iommu, tmp);
+ dma->npage += tmp->npage;
+ dma->iova = iova = tmp->iova;
+ dma->vaddr = vaddr = tmp->vaddr;
+ kfree(tmp);
npage = dma->npage;
size = NPAGE_TO_SIZE(npage);
-
- pdma = dma;
}
}

/* Check if we abut a region above - nothing above ~0 + 1 */
if (iova + size) {
- dma = vfio_find_dma(iommu, iova + size, 1);
- if (dma && dma->prot == prot &&
- dma->vaddr == vaddr + size) {
-
- dma->npage += npage;
- dma->iova = iova;
- dma->vaddr = vaddr;
-
- /*
- * If merged above and below, remove previously
- * merged entry. New entry covers it.
- */
- if (pdma) {
- list_del(&pdma->next);
- kfree(pdma);
- }
- pdma = dma;
+ struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1);
+ if (tmp && tmp->prot == prot &&
+ tmp->vaddr == vaddr + size) {
+ vfio_remove_dma(iommu, tmp);
+ dma->npage += tmp->npage;
+ kfree(tmp);
+ npage = dma->npage;
+ size = NPAGE_TO_SIZE(npage);
}
}

- /* Isolated, new region */
- if (!pdma) {
- dma = kzalloc(sizeof *dma, GFP_KERNEL);
- if (!dma) {
- ret = -ENOMEM;
- vfio_dma_unmap(iommu, iova, npage, prot);
- goto out_lock;
- }
-
- dma->npage = npage;
- dma->iova = iova;
- dma->vaddr = vaddr;
- dma->prot = prot;
- list_add(&dma->next, &iommu->dma_list);
- }
+ vfio_insert_dma(iommu, dma);

out_lock:
mutex_unlock(&iommu->lock);
+ if (ret)
+ kfree(dma);
return ret;
}

@@ -606,7 +607,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
return ERR_PTR(-ENOMEM);

INIT_LIST_HEAD(&iommu->group_list);
- INIT_LIST_HEAD(&iommu->dma_list);
+ iommu->dma_list = RB_ROOT;
mutex_init(&iommu->lock);

/*
@@ -640,7 +641,7 @@ static void vfio_iommu_type1_release(void *iommu_data)
{
struct vfio_iommu *iommu = iommu_data;
struct vfio_group *group, *group_tmp;
- struct vfio_dma *dma, *dma_tmp;
+ struct rb_node *node;

list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
iommu_detach_group(iommu->domain, group->iommu_group);
@@ -648,9 +649,10 @@ static void vfio_iommu_type1_release(void *iommu_data)
kfree(group);
}

- list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) {
+ while ((node = rb_first(&iommu->dma_list))) {
+ struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
- list_del(&dma->next);
+ vfio_remove_dma(iommu, dma);
kfree(dma);
}

2013-05-24 17:24:47

by Alex Williamson

Subject: [PATCH 2/2] vfio: hugepage support for vfio_iommu_type1

We currently send all mappings to the iommu in PAGE_SIZE chunks,
which prevents the iommu from enabling support for larger page sizes.
We still need to pin pages, which means we step through them in
PAGE_SIZE chunks, but we can batch up contiguous physical memory
chunks to allow the iommu the opportunity to use larger pages. The
approach here is a bit different from the one currently used for
legacy KVM device assignment. Rather than looking at the vma page
size and using that as the maximum size to pass to the iommu, we
instead simply look at whether the next page is physically
contiguous. This means we might ask the iommu to map a 4MB region,
while legacy KVM might limit itself to a maximum of 2MB.
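
For illustration only, a minimal userspace sketch (not code from this
series) of the map side, which is unchanged: the caller just maps a
large, virtually contiguous buffer and the type1 backend batches any
physically contiguous pfns internally. The "container" fd and the iova
are assumed to come from an existing VFIO setup; hugetlbfs or THP
backing simply makes long physically contiguous runs more likely.

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* "container" is an assumed, already-configured VFIO container fd */
static int map_guest_ram(int container, __u64 iova, size_t len)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .iova  = iova,
        .size  = len,
    };
    void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (buf == MAP_FAILED)
        return -1;

    map.vaddr = (__u64)(unsigned long)buf;

    /* The iommu backend pins pages and maps contiguous chunks */
    return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}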

Splitting our mapping path also allows us to be smarter about locked
memory because we can more easily unwind if the user attempts to
exceed the limit. Therefore, rather than assuming that a mapping
will result in locked memory, we test each page as it is pinned to
determine whether it locks RAM vs an mmap'd MMIO region. This should
result in better locking granularity and less locked page fudge
factors in userspace.
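
As an aside, a hedged sketch (not code from this series) of what this
buys userspace: RLIMIT_MEMLOCK can be sized to the RAM that is actually
mapped, since pages backing mmap'd MMIO no longer count toward the
limit check. Something like the following, assuming the process has
privilege to raise its limit:

#include <sys/resource.h>

/* Hypothetical helper: size the memlock limit to the RAM being mapped,
 * with no extra padding for device (MMIO) mappings. */
static int set_memlock_limit(unsigned long long ram_bytes)
{
    struct rlimit rl = { .rlim_cur = ram_bytes, .rlim_max = ram_bytes };

    return setrlimit(RLIMIT_MEMLOCK, &rl);
}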

The unmap path uses the same algorithm as legacy KVM. We don't want
to track the pfn for each mapping ourselves, but we need the pfn in
order to unpin pages. We therefore ask the iommu for the iova to
physical address translation, request an unmap of a single page, and
see how much was actually unmapped. iommus supporting large pages will
often return something bigger than a page here, which we know will be
physically contiguous, so we can unpin a whole batch of pfns. iommus not
supporting large mappings won't see an improvement in batching here as
they only unmap a page at a time.

With this change, we also make a clarification to the API for mapping
and unmapping DMA. We can only guarantee unmaps at the same
granularity as used for the original mapping. In other words,
unmapping a subregion of a previous mapping is not guaranteed and may
result in a larger or smaller unmapping than requested. The size
field in the unmapping structure is updated to reflect this.
Previously this field was returned unmodified, always reflecting the
requested unmap size. It is now updated to return the actual unmap
size on success, allowing userspace to appropriately track mappings.
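
For illustration, a minimal userspace sketch (again, an assumption of
typical usage rather than part of this series) of the clarified unmap
semantics: unmap with the same iova/size used for the original mapping
and read back the size the kernel actually unmapped. "container" is an
assumed, already-configured VFIO container fd.

#include <sys/ioctl.h>
#include <linux/vfio.h>

static int unmap_guest_ram(int container, __u64 iova, __u64 size)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova  = iova,
        .size  = size,
    };

    if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap))
        return -1;

    /* The kernel now writes back how much was actually unmapped */
    return unmap.size == size ? 0 : -1;
}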

Signed-off-by: Alex Williamson <[email protected]>
---
drivers/vfio/vfio_iommu_type1.c | 523 +++++++++++++++++++++++++--------------
include/uapi/linux/vfio.h | 8 -
2 files changed, 344 insertions(+), 187 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0e863b3..6654a7e 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -60,7 +60,7 @@ struct vfio_dma {
struct rb_node node;
dma_addr_t iova; /* Device address */
unsigned long vaddr; /* Process virtual addr */
- long npage; /* Number of pages */
+ size_t size; /* Map size (bytes) */
int prot; /* IOMMU_READ/WRITE */
};

@@ -74,8 +74,6 @@ struct vfio_group {
* into DMA'ble space using the IOMMU
*/

-#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
-
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
dma_addr_t start, size_t size)
{
@@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,

if (start + size <= dma->iova)
node = node->rb_left;
- else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage))
+ else if (start >= dma->iova + dma->size)
node = node->rb_right;
else
return dma;
@@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
parent = *link;
dma = rb_entry(parent, struct vfio_dma, node);

- if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova)
+ if (new->iova + new->size <= dma->iova)
link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
@@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage)
struct vwork *vwork;
struct mm_struct *mm;

- if (!current->mm)
- return; /* process exited */
+ if (!current->mm || !npage)
+ return; /* process exited or nothing to do */

if (down_write_trylock(&current->mm->mmap_sem)) {
current->mm->locked_vm += npage;
@@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot)
return 0;
}

-/* Unmap DMA region */
-static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
- long npage, int prot)
-{
- long i, unlocked = 0;
-
- for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
- unsigned long pfn;
-
- pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
- if (pfn) {
- iommu_unmap(iommu->domain, iova, PAGE_SIZE);
- unlocked += put_pfn(pfn, prot);
- }
- }
- return unlocked;
-}
-
-static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
- long npage, int prot)
-{
- long unlocked;
-
- unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
- vfio_lock_acct(-unlocked);
-}
-
static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
{
struct page *page[1];
@@ -270,79 +241,142 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
return ret;
}

-/* Map DMA region */
-static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
- unsigned long vaddr, long npage, int prot)
+/*
+ * Attempt to pin pages. We really don't want to track all the pfns and
+ * the iommu can only map chunks of consecutive pfns anyway, so get the
+ * first page and all consecutive pages with the same locking.
+ */
+static long vfio_pin_pages(unsigned long vaddr, long npage,
+ int prot, unsigned long *pfn_base)
{
- dma_addr_t start = iova;
- long i, locked = 0;
- int ret;
+ unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ bool lock_cap = capable(CAP_IPC_LOCK);
+ long ret, i;

- /* Verify that pages are not already mapped */
- for (i = 0; i < npage; i++, iova += PAGE_SIZE)
- if (iommu_iova_to_phys(iommu->domain, iova))
- return -EBUSY;
+ if (!current->mm)
+ return -ENODEV;

- iova = start;
+ ret = vaddr_get_pfn(vaddr, prot, pfn_base);
+ if (ret)
+ return ret;

- if (iommu->cache)
- prot |= IOMMU_CACHE;
+ if (is_invalid_reserved_pfn(*pfn_base))
+ return 1;

- /*
- * XXX We break mappings into pages and use get_user_pages_fast to
- * pin the pages in memory. It's been suggested that mlock might
- * provide a more efficient mechanism, but nothing prevents the
- * user from munlocking the pages, which could then allow the user
- * access to random host memory. We also have no guarantee from the
- * IOMMU API that the iommu driver can unmap sub-pages of previous
- * mappings. This means we might lose an entire range if a single
- * page within it is unmapped. Single page mappings are inefficient,
- * but provide the most flexibility for now.
- */
- for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
+ if (!lock_cap && current->mm->locked_vm + 1 > limit) {
+ put_pfn(*pfn_base, prot);
+ pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
+ limit << PAGE_SHIFT);
+ return -ENOMEM;
+ }
+
+ /* Lock all the consecutive pages from pfn_base */
+ for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
unsigned long pfn = 0;

ret = vaddr_get_pfn(vaddr, prot, &pfn);
- if (ret) {
- __vfio_dma_do_unmap(iommu, start, i, prot);
- return ret;
+ if (ret)
+ break;
+
+ if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
+ put_pfn(pfn, prot);
+ break;
}

+ if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
+ put_pfn(pfn, prot);
+ pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+ __func__, limit << PAGE_SHIFT);
+ break;
+ }
+ }
+
+ vfio_lock_acct(i);
+
+ return i;
+}
+
+static long vfio_unpin_pages(unsigned long pfn, long npage,
+ int prot, bool do_accounting)
+{
+ unsigned long unlocked = 0;
+ long i;
+
+ for (i = 0; i < npage; i++)
+ unlocked += put_pfn(pfn++, prot);
+
+ if (do_accounting)
+ vfio_lock_acct(-unlocked);
+
+ return unlocked;
+}
+
+static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+ dma_addr_t iova, size_t *size)
+{
+ dma_addr_t start = iova, end = iova + *size;
+ long unlocked = 0;
+
+ while (iova < end) {
+ size_t unmapped;
+ phys_addr_t phys;
+
/*
- * Only add actual locked pages to accounting
- * XXX We're effectively marking a page locked for every
- * IOVA page even though it's possible the user could be
- * backing multiple IOVAs with the same vaddr. This over-
- * penalizes the user process, but we currently have no
- * easy way to do this properly.
+ * We use the IOMMU to track the physical address. This
+ * saves us from having a lot more entries in our mapping
+ * tree. The downside is that we don't track the size
+ * used to do the mapping. We request unmap of a single
+ * page, but expect IOMMUs that support large pages to
+ * unmap a larger chunk.
*/
- if (!is_invalid_reserved_pfn(pfn))
- locked++;
-
- ret = iommu_map(iommu->domain, iova,
- (phys_addr_t)pfn << PAGE_SHIFT,
- PAGE_SIZE, prot);
- if (ret) {
- /* Back out mappings on error */
- put_pfn(pfn, prot);
- __vfio_dma_do_unmap(iommu, start, i, prot);
- return ret;
+ phys = iommu_iova_to_phys(iommu->domain, iova);
+ if (WARN_ON(!phys)) {
+ iova += PAGE_SIZE;
+ continue;
}
+
+ unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+ if (!unmapped)
+ break;
+
+ unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
+ unmapped >> PAGE_SHIFT,
+ dma->prot, false);
+ iova += unmapped;
}
- vfio_lock_acct(locked);
+
+ vfio_lock_acct(-unlocked);
+
+ *size = iova - start;
+
return 0;
}

static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
- size_t size, struct vfio_dma *dma)
+ size_t *size, struct vfio_dma *dma)
{
+ size_t offset, overlap, tmp;
struct vfio_dma *split;
- long npage_lo, npage_hi;
+ int ret;
+
+ /*
+ * Existing dma region is completely covered, unmap all. This is
+ * the likely case since userspace tends to map and unmap buffers
+ * in one shot rather than multiple mappings within a buffer.
+ */
+ if (likely(start <= dma->iova &&
+ start + *size >= dma->iova + dma->size)) {
+ *size = dma->size;
+ ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
+ if (ret)
+ return ret;
+
+ /*
+ * Did we remove more than we have? Should never happen
+ * since a vfio_dma is contiguous in iova and vaddr.
+ */
+ WARN_ON(*size != dma->size);

- /* Existing dma region is completely covered, unmap all */
- if (start <= dma->iova &&
- start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
- vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
vfio_remove_dma(iommu, dma);
kfree(dma);
return 0;
@@ -350,47 +384,79 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,

/* Overlap low address of existing range */
if (start <= dma->iova) {
- size_t overlap;
+ overlap = start + *size - dma->iova;
+ ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
+ if (ret)
+ return ret;

- overlap = start + size - dma->iova;
- npage_lo = overlap >> PAGE_SHIFT;
+ vfio_remove_dma(iommu, dma);

- vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot);
- dma->iova += overlap;
- dma->vaddr += overlap;
- dma->npage -= npage_lo;
+ /*
+ * Check, we may have removed to whole vfio_dma. If not
+ * fixup and re-insert.
+ */
+ if (overlap < dma->size) {
+ dma->iova += overlap;
+ dma->vaddr += overlap;
+ dma->size -= overlap;
+ vfio_insert_dma(iommu, dma);
+ }
+ *size = overlap;
return 0;
}

/* Overlap high address of existing range */
- if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
- size_t overlap;
+ if (start + *size >= dma->iova + dma->size) {
+ offset = start - dma->iova;
+ overlap = dma->size - offset;

- overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start;
- npage_hi = overlap >> PAGE_SHIFT;
+ ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
+ if (ret)
+ return ret;
+
+ /*
+ * We may have unmapped the entire vfio_dma if the user is
+ * trying to unmap a sub-region of what was originally
+ * mapped. If anything left, we can resize in place since
+ * iova is unchanged.
+ */
+ if (overlap < dma->size)
+ dma->size -= overlap;
+ else
+ vfio_remove_dma(iommu, dma);

- vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
- dma->npage -= npage_hi;
+ *size = overlap;
return 0;
}

/* Split existing */
- npage_lo = (start - dma->iova) >> PAGE_SHIFT;
- npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;
+ offset = start - dma->iova;

- split = kzalloc(sizeof *split, GFP_KERNEL);
- if (!split)
- return -ENOMEM;
+ ret = vfio_unmap_unpin(iommu, dma, start, size);
+ if (ret)
+ return ret;

- vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot);
+ WARN_ON(!*size);
+ tmp = dma->size;

- dma->npage = npage_lo;
+ /*
+ * Resize the lower vfio_dma in place, insert new for remaining
+ * upper segment.
+ */
+ dma->size = offset;
+
+ if (offset + *size < tmp) {
+ split = kzalloc(sizeof(*split), GFP_KERNEL);
+ if (!split)
+ return -ENOMEM;
+
+ split->size = tmp - offset - *size;
+ split->iova = dma->iova + offset + *size;
+ split->vaddr = dma->vaddr + offset + *size;
+ split->prot = dma->prot;
+ vfio_insert_dma(iommu, split);
+ }

- split->npage = npage_hi;
- split->iova = start + size;
- split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
- split->prot = dma->prot;
- vfio_insert_dma(iommu, split);
return 0;
}

@@ -399,6 +465,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
{
uint64_t mask;
struct vfio_dma *dma;
+ size_t unmapped = 0, size;
int ret = 0;

mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
@@ -408,30 +475,66 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (unmap->size & mask)
return -EINVAL;

- /* XXX We still break these down into PAGE_SIZE */
WARN_ON(mask & PAGE_MASK);

mutex_lock(&iommu->lock);

- while (!ret && (dma = vfio_find_dma(iommu,
- unmap->iova, unmap->size)))
- ret = vfio_remove_dma_overlap(iommu, unmap->iova,
- unmap->size, dma);
+ while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
+ size = unmap->size;
+ ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
+ if (ret)
+ break;
+ unmapped += size;
+ }

mutex_unlock(&iommu->lock);
+
+ /*
+ * We may unmap more than requested, update the unmap struct so
+ * userspace can know.
+ */
+ unmap->size = unmapped;
+
+ return ret;
+}
+
+/*
+ * Turns out AMD IOMMU has a page table bug where it won't map large pages
+ * to a region that previously mapped smaller pages. This should be fixed
+ * soon, so this is just a temporary workaround to break mappings down into
+ * PAGE_SIZE. Better to map smaller pages than nothing.
+ */
+static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
+ unsigned long pfn, long npage, int prot)
+{
+ long i;
+ int ret;
+
+ for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
+ ret = iommu_map(iommu->domain, iova,
+ (phys_addr_t)pfn << PAGE_SHIFT,
+ PAGE_SIZE, prot);
+ if (ret)
+ break;
+ }
+
+ for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
+ iommu_unmap(iommu->domain, iova, PAGE_SIZE);
+
return ret;
}

static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
- struct vfio_dma *dma;
- dma_addr_t iova = map->iova;
- unsigned long locked, lock_limit, vaddr = map->vaddr;
+ dma_addr_t end, iova;
+ unsigned long vaddr = map->vaddr;
size_t size = map->size;
+ long npage;
int ret = 0, prot = 0;
uint64_t mask;
- long npage;
+
+ end = map->iova + map->size;

mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;

@@ -444,92 +547,138 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (!prot)
return -EINVAL; /* No READ/WRITE? */

+ if (iommu->cache)
+ prot |= IOMMU_CACHE;
+
if (vaddr & mask)
return -EINVAL;
- if (iova & mask)
+ if (map->iova & mask)
return -EINVAL;
- if (size & mask)
+ if (!map->size || map->size & mask)
return -EINVAL;

- /* XXX We still break these down into PAGE_SIZE */
WARN_ON(mask & PAGE_MASK);

/* Don't allow IOVA wrap */
- if (iova + size && iova + size < iova)
+ if (end && end < map->iova)
return -EINVAL;

/* Don't allow virtual address wrap */
- if (vaddr + size && vaddr + size < vaddr)
- return -EINVAL;
-
- npage = size >> PAGE_SHIFT;
- if (!npage)
+ if (vaddr + map->size && vaddr + map->size < vaddr)
return -EINVAL;

- dma = kzalloc(sizeof *dma, GFP_KERNEL);
- if (!dma)
- return -ENOMEM;
-
mutex_lock(&iommu->lock);

- if (vfio_find_dma(iommu, iova, size)) {
- ret = -EBUSY;
- goto out_lock;
+ if (vfio_find_dma(iommu, map->iova, map->size)) {
+ mutex_unlock(&iommu->lock);
+ return -EEXIST;
}

- /* account for locked pages */
- locked = current->mm->locked_vm + npage;
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
- pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
- __func__, rlimit(RLIMIT_MEMLOCK));
- ret = -ENOMEM;
- goto out_lock;
- }
+ for (iova = map->iova; iova < end; iova += size, vaddr += size) {
+ struct vfio_dma *dma = NULL;
+ unsigned long pfn;
+ long i;
+
+ /* Pin a contiguous chunk of memory */
+ npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
+ prot, &pfn);
+ if (npage <= 0) {
+ WARN_ON(!npage);
+ ret = (int)npage;
+ break;
+ }

- ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot);
- if (ret)
- goto out_lock;
-
- dma->npage = npage;
- dma->iova = iova;
- dma->vaddr = vaddr;
- dma->prot = prot;
-
- /* Check if we abut a region below - nothing below 0 */
- if (iova) {
- struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1);
- if (tmp && tmp->prot == prot &&
- tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) {
- vfio_remove_dma(iommu, tmp);
- dma->npage += tmp->npage;
- dma->iova = iova = tmp->iova;
- dma->vaddr = vaddr = tmp->vaddr;
- kfree(tmp);
- npage = dma->npage;
- size = NPAGE_TO_SIZE(npage);
+ /* Verify pages are not already mapped */
+ for (i = 0; i < npage; i++) {
+ if (iommu_iova_to_phys(iommu->domain,
+ iova + (i << PAGE_SHIFT))) {
+ vfio_unpin_pages(pfn, npage, prot, true);
+ ret = -EBUSY;
+ break;
+ }
+ }
+
+ ret = iommu_map(iommu->domain, iova,
+ (phys_addr_t)pfn << PAGE_SHIFT,
+ npage << PAGE_SHIFT, prot);
+ if (ret) {
+ if (ret != -EBUSY ||
+ map_try_harder(iommu, iova, pfn, npage, prot)) {
+ vfio_unpin_pages(pfn, npage, prot, true);
+ break;
+ }
+ }
+
+ size = npage << PAGE_SHIFT;
+
+ /*
+ * Check if we abut a region below - nothing below 0.
+ * This is the most likely case when mapping chunks of
+ * physically contiguous regions within a virtual address
+ * range. Update the abutting entry in place since iova
+ * doesn't change.
+ */
+ if (likely(iova)) {
+ struct vfio_dma *tmp;
+ tmp = vfio_find_dma(iommu, iova - 1, 1);
+ if (tmp && tmp->prot == prot &&
+ tmp->vaddr + tmp->size == vaddr) {
+ tmp->size += size;
+
+ iova = tmp->iova;
+ size = tmp->size;
+ vaddr = tmp->vaddr;
+ dma = tmp;
+ }
+ }
+
+ /* Check if we abut a region above - nothing above ~0 + 1 */
+ if (likely(iova + size)) {
+ struct vfio_dma *tmp;
+
+ tmp = vfio_find_dma(iommu, iova + size, 1);
+ if (tmp && tmp->prot == prot &&
+ tmp->vaddr == vaddr + size) {
+ vfio_remove_dma(iommu, tmp);
+ if (dma)
+ dma->size += tmp->size;
+ else
+ size += tmp->size;
+ kfree(tmp);
+ }
}
- }

- /* Check if we abut a region above - nothing above ~0 + 1 */
- if (iova + size) {
- struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1);
- if (tmp && tmp->prot == prot &&
- tmp->vaddr == vaddr + size) {
- vfio_remove_dma(iommu, tmp);
- dma->npage += tmp->npage;
- kfree(tmp);
- npage = dma->npage;
- size = NPAGE_TO_SIZE(npage);
+ if (!dma) {
+ dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+ if (!dma) {
+ iommu_unmap(iommu->domain, iova, size);
+ vfio_unpin_pages(pfn, npage, prot, true);
+ ret = -ENOMEM;
+ break;
+ }
+
+ dma->size = size;
+ dma->iova = iova;
+ dma->vaddr = vaddr;
+ dma->prot = prot;
+ vfio_insert_dma(iommu, dma);
}
}

- vfio_insert_dma(iommu, dma);
+ if (ret) {
+ struct vfio_dma *tmp;
+ iova = map->iova;
+ size = map->size;
+ while ((tmp = vfio_find_dma(iommu, iova, size))) {
+ if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) {
+ pr_warn("%s: Error rolling back failed map\n",
+ __func__);
+ break;
+ }
+ }
+ }

-out_lock:
mutex_unlock(&iommu->lock);
- if (ret)
- kfree(dma);
return ret;
}

@@ -651,9 +800,8 @@ static void vfio_iommu_type1_release(void *iommu_data)

while ((node = rb_first(&iommu->dma_list))) {
struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
- vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
- vfio_remove_dma(iommu, dma);
- kfree(dma);
+ size_t size = dma->size;
+ vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
}

iommu_domain_free(iommu->domain);
@@ -708,6 +856,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,

} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
struct vfio_iommu_type1_dma_unmap unmap;
+ long ret;

minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

@@ -717,7 +866,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
if (unmap.argsz < minsz || unmap.flags)
return -EINVAL;

- return vfio_dma_do_unmap(iommu, &unmap);
+ ret = vfio_dma_do_unmap(iommu, &unmap);
+ if (ret)
+ return ret;
+
+ return copy_to_user((void __user *)arg, &unmap, minsz);
}

return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 4f41f30..5c24535 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -360,10 +360,14 @@ struct vfio_iommu_type1_dma_map {
#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)

/**
- * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
+ * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
+ * struct vfio_dma_unmap)
*
* Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
- * Caller sets argsz.
+ * Caller sets argsz. The actual unmapped size is returned in the size
+ * field. No guarantee is made to the user that arbitrary unmaps of iova
+ * or size different from those used in the original mapping call will
+ * succeed.
*/
struct vfio_iommu_type1_dma_unmap {
__u32 argsz;

2013-05-25 11:20:23

by Konrad Rzeszutek Wilk

Subject: Re: [PATCH 2/2] vfio: hugepage support for vfio_iommu_type1

> + * Turns out AMD IOMMU has a page table bug where it won't map large pages
> + * to a region that previously mapped smaller pages. This should be fixed
> + * soon, so this is just a temporary workaround to break mappings down into
> + * PAGE_SIZE. Better to map smaller pages than nothing.
> + */
> +static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
> + unsigned long pfn, long npage, int prot)
> +{
> + long i;
> + int ret;
> +
> + for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
> + ret = iommu_map(iommu->domain, iova,
> + (phys_addr_t)pfn << PAGE_SHIFT,
> + PAGE_SIZE, prot);
> + if (ret)
> + break;
> + }
> +
> + for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
> + iommu_unmap(iommu->domain, iova, PAGE_SIZE);
> +
> return ret;
> }

This looks like it belongs in a vfio-quirk file (or something else) that
deals with various IOMMU quirks.

2013-05-25 11:22:07

by Konrad Rzeszutek Wilk

Subject: Re: [PATCH 0/2] vfio: type1 iommu hugepage support

On Fri, May 24, 2013 at 11:24:26AM -0600, Alex Williamson wrote:
> This series let's the vfio type1 iommu backend take advantage of iommu
> large page support. See patch 2/2 for the details. This has been
> tested on both amd_iommu and intel_iommu, but only my AMD system has
> large page support. I'd appreciate any testing and feedback on other
> systems, particularly vt-d systems supporting large pages. Mapping
> efficiency should be improved a bit without iommu hugepages, but I
> hope that it's much more noticeable with huge pages, especially for
> very large QEMU guests.

I took a very very quick look - and I am wondering if there should also
be a flag to turn it on/off in the kernel in such a case? Especially in the
field, if a user finds out that their particular IOMMU chipset might
be doing something funky with large pages?

>
> This change includes a clarification to the mapping expectations for
> users of the type1 iommu, but is compatible with known users and works
> with existing QEMU userspace supporting vfio. Thanks,
>
> Alex
>
> ---
>
> Alex Williamson (2):
> vfio: Convert type1 iommu to use rbtree
> vfio: hugepage support for vfio_iommu_type1
>
>
> drivers/vfio/vfio_iommu_type1.c | 607 ++++++++++++++++++++++++---------------
> include/uapi/linux/vfio.h | 8 -
> 2 files changed, 387 insertions(+), 228 deletions(-)

2013-05-25 14:23:57

by Alex Williamson

Subject: Re: [PATCH 2/2] vfio: hugepage support for vfio_iommu_type1

On Sat, 2013-05-25 at 07:20 -0400, Konrad Rzeszutek Wilk wrote:
> > + * Turns out AMD IOMMU has a page table bug where it won't map large pages
> > + * to a region that previously mapped smaller pages. This should be fixed
> > + * soon, so this is just a temporary workaround to break mappings down into
> > + * PAGE_SIZE. Better to map smaller pages than nothing.
> > + */
> > +static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
> > + unsigned long pfn, long npage, int prot)
> > +{
> > + long i;
> > + int ret;
> > +
> > + for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
> > + ret = iommu_map(iommu->domain, iova,
> > + (phys_addr_t)pfn << PAGE_SHIFT,
> > + PAGE_SIZE, prot);
> > + if (ret)
> > + break;
> > + }
> > +
> > + for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
> > + iommu_unmap(iommu->domain, iova, PAGE_SIZE);
> > +
> > return ret;
> > }
>
> This looks like it belongs in a vfio-quirk file (or something else) that
> deals with various IOMMU quirks.

This is a software bug in amd_iommu, which Joerg knows about, so I'm
hoping this is a short lived quirk. I dropped it here for convenience,
but I expect to be able to remove it before long. If we find some
hardware quirks, I expect we'd want the quirks in the drivers themselves
so the consumers, like vfio, don't need to worry about them. Thanks,

Alex

2013-05-25 14:40:12

by Alex Williamson

Subject: Re: [PATCH 0/2] vfio: type1 iommu hugepage support

On Sat, 2013-05-25 at 07:21 -0400, Konrad Rzeszutek Wilk wrote:
> On Fri, May 24, 2013 at 11:24:26AM -0600, Alex Williamson wrote:
> > This series let's the vfio type1 iommu backend take advantage of iommu
> > large page support. See patch 2/2 for the details. This has been
> > tested on both amd_iommu and intel_iommu, but only my AMD system has
> > large page support. I'd appreciate any testing and feedback on other
> > systems, particularly vt-d systems supporting large pages. Mapping
> > efficiency should be improved a bit without iommu hugepages, but I
> > hope that it's much more noticeable with huge pages, especially for
> > very large QEMU guests.
>
> I took a very very quick look - and I am wondering if there should also
> be a flag to turn it on/off in ther kernel in such case? Especially in the
> field if a user finds out that their particular IOMMU chipset might
> be doing something funky with large-pages ?

Yeah, that's probably a reasonable feature. intel-iommu has a boot
option to disable superpages, but we could make it more consistent at
the vfio level if we have a switch there. It could provide interesting
performance and debugging info. I'll add a module option to do this.
Thanks,

Alex

> > This change includes a clarification to the mapping expectations for
> > users of the type1 iommu, but is compatible with known users and works
> > with existing QEMU userspace supporting vfio. Thanks,
> >
> > Alex
> >
> > ---
> >
> > Alex Williamson (2):
> > vfio: Convert type1 iommu to use rbtree
> > vfio: hugepage support for vfio_iommu_type1
> >
> >
> > drivers/vfio/vfio_iommu_type1.c | 607 ++++++++++++++++++++++++---------------
> > include/uapi/linux/vfio.h | 8 -
> > 2 files changed, 387 insertions(+), 228 deletions(-)


2013-05-27 08:41:45

by Sethi Varun-B16395

Subject: RE: [PATCH 2/2] vfio: hugepage support for vfio_iommu_type1



> -----Original Message-----
> From: [email protected] [mailto:iommu-
> [email protected]] On Behalf Of Alex Williamson
> Sent: Friday, May 24, 2013 10:55 PM
> To: [email protected]
> Cc: [email protected]; [email protected]; qemu-
> [email protected]; [email protected]; [email protected]
> Subject: [PATCH 2/2] vfio: hugepage support for vfio_iommu_type1
>
> We currently send all mappings to the iommu in PAGE_SIZE chunks, which
> prevents the iommu from enabling support for larger page sizes.
> We still need to pin pages, which means we step through them in PAGE_SIZE
> chunks, but we can batch up contiguous physical memory chunks to allow
> the iommu the opportunity to use larger pages. The approach here is a
> bit different that the one currently used for legacy KVM device
> assignment. Rather than looking at the vma page size and using that as
> the maximum size to pass to the iommu, we instead simply look at whether
> the next page is physically contiguous. This means we might ask the
> iommu to map a 4MB region, while legacy KVM might limit itself to a

[Sethi Varun-B16395] Wouldn't this depend on the IOMMU page alignment constraints?

> maximum of 2MB.
>
> Splitting our mapping path also allows us to be smarter about locked
> memory because we can more easily unwind if the user attempts to exceed
> the limit. Therefore, rather than assuming that a mapping will result in
> locked memory, we test each page as it is pinned to determine whether it
> locks RAM vs an mmap'd MMIO region. This should result in better locking
> granularity and less locked page fudge factors in userspace.
>
> The unmap path uses the same algorithm as legacy KVM. We don't want to
> track the pfn for each mapping ourselves, but we need the pfn in order to
> unpin pages. We therefore ask the iommu for the iova to physical address
> translation, ask it to unpin a page, and see how many pages were actually
> unpinned. iommus supporting large pages will often return something
> bigger than a page here, which we know will be physically contiguous and
> we can unpin a batch of pfns. iommus not supporting large mappings won't
> see an improvement in batching here as they only unmap a page at a time.
>
> With this change, we also make a clarification to the API for mapping and
> unmapping DMA. We can only guarantee unmaps at the same granularity as
> used for the original mapping. In other words, unmapping a subregion of
> a previous mapping is not guaranteed and may result in a larger or
> smaller unmapping than requested. The size field in the unmapping
> structure is updated to reflect this.
> Previously this was unmodified on mapping, always returning the the
> requested unmap size. This is now updated to return the actual unmap
> size on success, allowing userspace to appropriately track mappings.
>
[Sethi Varun-B16395] The main problem here is that the user space application is oblivious of the physical memory contiguity, right?


> Signed-off-by: Alex Williamson <[email protected]>
> ---
> drivers/vfio/vfio_iommu_type1.c | 523 +++++++++++++++++++++++++--------
> ------
> +static long vfio_unpin_pages(unsigned long pfn, long npage,
> + int prot, bool do_accounting)
> +{
> + unsigned long unlocked = 0;
> + long i;
> +
> + for (i = 0; i < npage; i++)
> + unlocked += put_pfn(pfn++, prot);
> +
> + if (do_accounting)
> + vfio_lock_acct(-unlocked);
> +
> + return unlocked;
> +}
> +
> +static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma
> *dma,
> + dma_addr_t iova, size_t *size)
> +{
> + dma_addr_t start = iova, end = iova + *size;
> + long unlocked = 0;
> +
> + while (iova < end) {
> + size_t unmapped;
> + phys_addr_t phys;
> +
> /*
> - * Only add actual locked pages to accounting
> - * XXX We're effectively marking a page locked for every
> - * IOVA page even though it's possible the user could be
> - * backing multiple IOVAs with the same vaddr. This over-
> - * penalizes the user process, but we currently have no
> - * easy way to do this properly.
> + * We use the IOMMU to track the physical address. This
> + * saves us from having a lot more entries in our mapping
> + * tree. The downside is that we don't track the size
> + * used to do the mapping. We request unmap of a single
> + * page, but expect IOMMUs that support large pages to
> + * unmap a larger chunk.
> */
> - if (!is_invalid_reserved_pfn(pfn))
> - locked++;
> -
> - ret = iommu_map(iommu->domain, iova,
> - (phys_addr_t)pfn << PAGE_SHIFT,
> - PAGE_SIZE, prot);
> - if (ret) {
> - /* Back out mappings on error */
> - put_pfn(pfn, prot);
> - __vfio_dma_do_unmap(iommu, start, i, prot);
> - return ret;
> + phys = iommu_iova_to_phys(iommu->domain, iova);
> + if (WARN_ON(!phys)) {
[Sethi Varun-B16395] When can this happen? Why won't this be treated as an error?

> + iova += PAGE_SIZE;
> + continue;
> }
> +
> + unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
> + if (!unmapped)
> + break;
> +
> + unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> + unmapped >> PAGE_SHIFT,
> + dma->prot, false);
> + iova += unmapped;
> }
> - vfio_lock_acct(locked);
> +
> + vfio_lock_acct(-unlocked);
> +
> + *size = iova - start;
> +
> return 0;
> }
>
> static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t
> start,
> - size_t size, struct vfio_dma *dma)
> + size_t *size, struct vfio_dma *dma)
> {
> + size_t offset, overlap, tmp;
> struct vfio_dma *split;
> - long npage_lo, npage_hi;
> + int ret;
> +
> + /*
> + * Existing dma region is completely covered, unmap all. This is
> + * the likely case since userspace tends to map and unmap buffers
> + * in one shot rather than multiple mappings within a buffer.
> + */
> + if (likely(start <= dma->iova &&
> + start + *size >= dma->iova + dma->size)) {
> + *size = dma->size;
> + ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
> + if (ret)
> + return ret;
> +
> + /*
> + * Did we remove more than we have? Should never happen
> + * since a vfio_dma is contiguous in iova and vaddr.
> + */
> + WARN_ON(*size != dma->size);
[Sethi Varun-B16395] Doesn't this indicate something wrong with the IOMMU mappings?

-Varun


2013-05-27 13:37:18

by Alex Williamson

Subject: Re: [PATCH 2/2] vfio: hugepage support for vfio_iommu_type1

On Mon, 2013-05-27 at 08:41 +0000, Sethi Varun-B16395 wrote:
>
> > -----Original Message-----
> > From: [email protected] [mailto:iommu-
> > [email protected]] On Behalf Of Alex Williamson
> > Sent: Friday, May 24, 2013 10:55 PM
> > To: [email protected]
> > Cc: [email protected]; [email protected]; qemu-
> > [email protected]; [email protected]; [email protected]
> > Subject: [PATCH 2/2] vfio: hugepage support for vfio_iommu_type1
> >
> > We currently send all mappings to the iommu in PAGE_SIZE chunks, which
> > prevents the iommu from enabling support for larger page sizes.
> > We still need to pin pages, which means we step through them in PAGE_SIZE
> > chunks, but we can batch up contiguous physical memory chunks to allow
> > the iommu the opportunity to use larger pages. The approach here is a
> > bit different that the one currently used for legacy KVM device
> > assignment. Rather than looking at the vma page size and using that as
> > the maximum size to pass to the iommu, we instead simply look at whether
> > the next page is physically contiguous. This means we might ask the
> > iommu to map a 4MB region, while legacy KVM might limit itself to a
>
> [Sethi Varun-B16395] Wouldn't this depend on the IOMMU page alignment
> constraints?

The iommu_map() function handles this.

> > maximum of 2MB.
> >
> > Splitting our mapping path also allows us to be smarter about locked
> > memory because we can more easily unwind if the user attempts to exceed
> > the limit. Therefore, rather than assuming that a mapping will result in
> > locked memory, we test each page as it is pinned to determine whether it
> > locks RAM vs an mmap'd MMIO region. This should result in better locking
> > granularity and less locked page fudge factors in userspace.
> >
> > The unmap path uses the same algorithm as legacy KVM. We don't want to
> > track the pfn for each mapping ourselves, but we need the pfn in order to
> > unpin pages. We therefore ask the iommu for the iova to physical address
> > translation, ask it to unpin a page, and see how many pages were actually
> > unpinned. iommus supporting large pages will often return something
> > bigger than a page here, which we know will be physically contiguous and
> > we can unpin a batch of pfns. iommus not supporting large mappings won't
> > see an improvement in batching here as they only unmap a page at a time.
> >
> > With this change, we also make a clarification to the API for mapping and
> > unmapping DMA. We can only guarantee unmaps at the same granularity as
> > used for the original mapping. In other words, unmapping a subregion of
> > a previous mapping is not guaranteed and may result in a larger or
> > smaller unmapping than requested. The size field in the unmapping
> > structure is updated to reflect this.
> > Previously this was unmodified on mapping, always returning the the
> > requested unmap size. This is now updated to return the actual unmap
> > size on success, allowing userspace to appropriately track mappings.
> >
> [Sethi Varun-B16395] The main problem here is that the user space
> application is oblivious of the physical memory contiguity, right?

Yes.

> > Signed-off-by: Alex Williamson <[email protected]>
> > ---
> > drivers/vfio/vfio_iommu_type1.c | 523 +++++++++++++++++++++++++--------
> > ------
> > +static long vfio_unpin_pages(unsigned long pfn, long npage,
> > + int prot, bool do_accounting)
> > +{
> > + unsigned long unlocked = 0;
> > + long i;
> > +
> > + for (i = 0; i < npage; i++)
> > + unlocked += put_pfn(pfn++, prot);
> > +
> > + if (do_accounting)
> > + vfio_lock_acct(-unlocked);
> > +
> > + return unlocked;
> > +}
> > +
> > +static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma
> > *dma,
> > + dma_addr_t iova, size_t *size)
> > +{
> > + dma_addr_t start = iova, end = iova + *size;
> > + long unlocked = 0;
> > +
> > + while (iova < end) {
> > + size_t unmapped;
> > + phys_addr_t phys;
> > +
> > /*
> > - * Only add actual locked pages to accounting
> > - * XXX We're effectively marking a page locked for every
> > - * IOVA page even though it's possible the user could be
> > - * backing multiple IOVAs with the same vaddr. This over-
> > - * penalizes the user process, but we currently have no
> > - * easy way to do this properly.
> > + * We use the IOMMU to track the physical address. This
> > + * saves us from having a lot more entries in our mapping
> > + * tree. The downside is that we don't track the size
> > + * used to do the mapping. We request unmap of a single
> > + * page, but expect IOMMUs that support large pages to
> > + * unmap a larger chunk.
> > */
> > - if (!is_invalid_reserved_pfn(pfn))
> > - locked++;
> > -
> > - ret = iommu_map(iommu->domain, iova,
> > - (phys_addr_t)pfn << PAGE_SHIFT,
> > - PAGE_SIZE, prot);
> > - if (ret) {
> > - /* Back out mappings on error */
> > - put_pfn(pfn, prot);
> > - __vfio_dma_do_unmap(iommu, start, i, prot);
> > - return ret;
> > + phys = iommu_iova_to_phys(iommu->domain, iova);
> > + if (WARN_ON(!phys)) {
> [Sethi Varun-B16395] When can this happen? Why won't this be treated
> as an error?

I think this should never happen, which is why I just have a WARN and
continue path rather than returning an error to the user.

> > + iova += PAGE_SIZE;
> > + continue;
> > }
> > +
> > + unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
> > + if (!unmapped)
> > + break;
> > +
> > + unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
> > + unmapped >> PAGE_SHIFT,
> > + dma->prot, false);
> > + iova += unmapped;
> > }
> > - vfio_lock_acct(locked);
> > +
> > + vfio_lock_acct(-unlocked);
> > +
> > + *size = iova - start;
> > +
> > return 0;
> > }
> >
> > static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t
> > start,
> > - size_t size, struct vfio_dma *dma)
> > + size_t *size, struct vfio_dma *dma)
> > {
> > + size_t offset, overlap, tmp;
> > struct vfio_dma *split;
> > - long npage_lo, npage_hi;
> > + int ret;
> > +
> > + /*
> > + * Existing dma region is completely covered, unmap all. This is
> > + * the likely case since userspace tends to map and unmap buffers
> > + * in one shot rather than multiple mappings within a buffer.
> > + */
> > + if (likely(start <= dma->iova &&
> > + start + *size >= dma->iova + dma->size)) {
> > + *size = dma->size;
> > + ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
> > + if (ret)
> > + return ret;
> > +
> > + /*
> > + * Did we remove more than we have? Should never happen
> > + * since a vfio_dma is contiguous in iova and vaddr.
> > + */
> > + WARN_ON(*size != dma->size);
> [Sethi Varun-B16395] Doesn't this indicate something wrong with the IOMMU mappings?

Yes, we should always be able to remove one of our struct vfio_dma
tracking structures entirely because it will be a superset of previous
mappings that are both contiguous in virtual address and iova. This is
a sanity check WARN to make sure that's true. Thanks,

Alex

2013-05-28 16:27:58

by Alex Williamson

Subject: [PATCH 3/2] vfio: Provide module option to disable vfio_iommu_type1 hugepage support

Add a module option to vfio_iommu_type1 to disable IOMMU hugepage
support. This causes iommu_map to only be called with single page
mappings, disabling the IOMMU driver's ability to use hugepages.
This option can be enabled by loading vfio_iommu_type1 with
disable_hugepages=1 or dynamically through sysfs. If enabled
dynamically, only new mappings are restricted.

Signed-off-by: Alex Williamson <[email protected]>
---

As suggested by Konrad. This is cleaner to add as a follow-on

drivers/vfio/vfio_iommu_type1.c | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6654a7e..8a2be4e 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -48,6 +48,12 @@ module_param_named(allow_unsafe_interrupts,
MODULE_PARM_DESC(allow_unsafe_interrupts,
"Enable VFIO IOMMU support for on platforms without interrupt remapping support.");

+static bool disable_hugepages;
+module_param_named(disable_hugepages,
+ disable_hugepages, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable_hugepages,
+ "Disable VFIO IOMMU support for IOMMU hugepages.");
+
struct vfio_iommu {
struct iommu_domain *domain;
struct mutex lock;
@@ -270,6 +276,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
return -ENOMEM;
}

+ if (unlikely(disable_hugepages)) {
+ vfio_lock_acct(1);
+ return 1;
+ }
+
/* Lock all the consecutive pages from pfn_base */
for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
unsigned long pfn = 0;

2013-05-28 16:42:26

by Konrad Rzeszutek Wilk

Subject: Re: [PATCH 3/2] vfio: Provide module option to disable vfio_iommu_type1 hugepage support

On Tue, May 28, 2013 at 10:27:52AM -0600, Alex Williamson wrote:
> Add a module option to vfio_iommu_type1 to disable IOMMU hugepage
> support. This causes iommu_map to only be called with single page
> mappings, disabling the IOMMU driver's ability to use hugepages.
> This option can be enabled by loading vfio_iommu_type1 with
> disable_hugepages=1 or dynamically through sysfs. If enabled
> dynamically, only new mappings are restricted.
>
> Signed-off-by: Alex Williamson <[email protected]>

Reviewed-by: Konrad Rzeszutek Wilk <[email protected]>
> ---

>
> As suggested by Konrad. This is cleaner to add as a follow-on
>
> drivers/vfio/vfio_iommu_type1.c | 11 +++++++++++
> 1 file changed, 11 insertions(+)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 6654a7e..8a2be4e 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -48,6 +48,12 @@ module_param_named(allow_unsafe_interrupts,
> MODULE_PARM_DESC(allow_unsafe_interrupts,
> "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
>
> +static bool disable_hugepages;
> +module_param_named(disable_hugepages,
> + disable_hugepages, bool, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(disable_hugepages,
> + "Disable VFIO IOMMU support for IOMMU hugepages.");
> +
> struct vfio_iommu {
> struct iommu_domain *domain;
> struct mutex lock;
> @@ -270,6 +276,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> return -ENOMEM;
> }
>
> + if (unlikely(disable_hugepages)) {
> + vfio_lock_acct(1);
> + return 1;
> + }
> +
> /* Lock all the consecutive pages from pfn_base */
> for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
> unsigned long pfn = 0;
>

2013-05-31 02:33:32

by Vinod, Chegu

Subject: Re: [PATCH 3/2] vfio: Provide module option to disable vfio_iommu_type1 hugepage support

On 5/28/2013 9:27 AM, Alex Williamson wrote:
> Add a module option to vfio_iommu_type1 to disable IOMMU hugepage
> support. This causes iommu_map to only be called with single page
> mappings, disabling the IOMMU driver's ability to use hugepages.
> This option can be enabled by loading vfio_iommu_type1 with
> disable_hugepages=1 or dynamically through sysfs. If enabled
> dynamically, only new mappings are restricted.
>
> Signed-off-by: Alex Williamson <[email protected]>
> ---
>
> As suggested by Konrad. This is cleaner to add as a follow-on
>
> drivers/vfio/vfio_iommu_type1.c | 11 +++++++++++
> 1 file changed, 11 insertions(+)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 6654a7e..8a2be4e 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -48,6 +48,12 @@ module_param_named(allow_unsafe_interrupts,
> MODULE_PARM_DESC(allow_unsafe_interrupts,
> "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
>
> +static bool disable_hugepages;
> +module_param_named(disable_hugepages,
> + disable_hugepages, bool, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(disable_hugepages,
> + "Disable VFIO IOMMU support for IOMMU hugepages.");
> +
> struct vfio_iommu {
> struct iommu_domain *domain;
> struct mutex lock;
> @@ -270,6 +276,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage,
> return -ENOMEM;
> }
>
> + if (unlikely(disable_hugepages)) {
> + vfio_lock_acct(1);
> + return 1;
> + }
> +
> /* Lock all the consecutive pages from pfn_base */
> for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
> unsigned long pfn = 0;
>
> .
>

Tested-by: Chegu Vinod <[email protected]>

I was able to verify your changes on a 2-socket Sandybridge-EP platform
and observed a ~7-8% improvement in netperf TCP_RR performance. The
guest size was small (16 vcpus/32GB).

Hopefully these changes also have an indirect benefit of avoiding soft
lockups on the host side when larger guests (> 256GB) are rebooted.
Someone who has ready access to a larger Sandybridge-EP/EX platform can
verify this.

FYI
Vinod