As part of the mdev support, type1 now gets a task reference per
vfio_dma and uses that to get an mm reference for the task while
working on accounting. That's the correct thing to do for paths
where we can't rely on using current, but there are still hot paths
where we can optimize because we know we're invoked by the user.
Specifically, vfio_pin_pages_remote() is only called when the user
does DMA mapping (vfio_dma_do_map) or if an IOMMU group is added to
a container with existing mappings (vfio_iommu_replay). We can
therefore use current->mm as well as rlimit() and capable() directly
rather than going through the high overhead path via the stored
task_struct. We also know that vfio_dma_do_unmap() is only called
via user ioctl, so we can also tune that path to be more lightweight.
In a synthetic guest mapping test emulating a 1TB VM backed by a
single 4GB range remapped multiple times across the address space,
the mdev changes to the type1 backend introduced a roughly 25% hit
in runtime of this test. These changes restore it to nearly the
previous performance for the interfaces exercised here,
VFIO_IOMMU_MAP_DMA and release on close.
Signed-off-by: Alex Williamson <[email protected]>
---
drivers/vfio/vfio_iommu_type1.c | 145 +++++++++++++++++++++------------------
1 file changed, 79 insertions(+), 66 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 9815e45..8dfeafb 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -103,6 +103,10 @@ struct vfio_pfn {
#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
(!list_empty(&iommu->domain_list))
+/* Make function bool options readable */
+#define IS_CURRENT (true)
+#define DO_ACCOUNTING (true)
+
static int put_pfn(unsigned long pfn, int prot);
/*
@@ -264,7 +268,8 @@ static void vfio_lock_acct_bg(struct work_struct *work)
kfree(vwork);
}
-static void vfio_lock_acct(struct task_struct *task, long npage)
+static void vfio_lock_acct(struct task_struct *task,
+ long npage, bool is_current)
{
struct vwork *vwork;
struct mm_struct *mm;
@@ -272,24 +277,31 @@ static void vfio_lock_acct(struct task_struct *task, long npage)
if (!npage)
return;
- mm = get_task_mm(task);
+ mm = is_current ? task->mm : get_task_mm(task);
if (!mm)
- return; /* process exited or nothing to do */
+ return; /* process exited */
if (down_write_trylock(&mm->mmap_sem)) {
mm->locked_vm += npage;
up_write(&mm->mmap_sem);
- mmput(mm);
+ if (!is_current)
+ mmput(mm);
return;
}
+ if (is_current) {
+ mm = get_task_mm(task);
+ if (!mm)
+ return;
+ }
+
/*
* Couldn't get mmap_sem lock, so must setup to update
* mm->locked_vm later. If locked_vm were atomic, we
* wouldn't need this silliness
*/
vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
- if (!vwork) {
+ if (WARN_ON(!vwork)) {
mmput(mm);
return;
}
@@ -345,13 +357,13 @@ static int put_pfn(unsigned long pfn, int prot)
}
static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
- int prot, unsigned long *pfn)
+ int prot, unsigned long *pfn, bool is_current)
{
struct page *page[1];
struct vm_area_struct *vma;
int ret;
- if (mm == current->mm) {
+ if (is_current) {
ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE),
page);
} else {
@@ -393,96 +405,92 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
long npage, unsigned long *pfn_base)
{
- unsigned long limit;
- bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
- CAP_IPC_LOCK);
- struct mm_struct *mm;
- long ret, i = 0, lock_acct = 0;
+ unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ bool lock_cap = capable(CAP_IPC_LOCK);
+ long ret, pinned = 0, lock_acct = 0;
bool rsvd;
dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
- mm = get_task_mm(dma->task);
- if (!mm)
+ /* This code path is only user initiated */
+ if (!current->mm)
return -ENODEV;
- ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
+ ret = vaddr_get_pfn(current->mm, vaddr,
+ dma->prot, pfn_base, IS_CURRENT);
if (ret)
- goto pin_pg_remote_exit;
+ return ret;
+ pinned++;
rsvd = is_invalid_reserved_pfn(*pfn_base);
- limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
/*
* Reserved pages aren't counted against the user, externally pinned
* pages are already counted against the user.
*/
if (!rsvd && !vfio_find_vpfn(dma, iova)) {
- if (!lock_cap && mm->locked_vm + 1 > limit) {
+ if (!lock_cap && current->mm->locked_vm + 1 > limit) {
put_pfn(*pfn_base, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
limit << PAGE_SHIFT);
- ret = -ENOMEM;
- goto pin_pg_remote_exit;
+ return -ENOMEM;
}
lock_acct++;
}
- i++;
- if (likely(!disable_hugepages)) {
- /* Lock all the consecutive pages from pfn_base */
- for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage;
- i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
- unsigned long pfn = 0;
+ if (unlikely(disable_hugepages))
+ goto out;
- ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn);
- if (ret)
- break;
+ /* Lock all the consecutive pages from pfn_base */
+ for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
+ pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
+ unsigned long pfn = 0;
- if (pfn != *pfn_base + i ||
- rsvd != is_invalid_reserved_pfn(pfn)) {
+ ret = vaddr_get_pfn(current->mm, vaddr,
+ dma->prot, &pfn, IS_CURRENT);
+ if (ret)
+ break;
+
+ if (pfn != *pfn_base + pinned ||
+ rsvd != is_invalid_reserved_pfn(pfn)) {
+ put_pfn(pfn, dma->prot);
+ break;
+ }
+
+ if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+ if (!lock_cap &&
+ current->mm->locked_vm + lock_acct + 1 > limit) {
put_pfn(pfn, dma->prot);
+ pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+ __func__, limit << PAGE_SHIFT);
break;
}
-
- if (!rsvd && !vfio_find_vpfn(dma, iova)) {
- if (!lock_cap &&
- mm->locked_vm + lock_acct + 1 > limit) {
- put_pfn(pfn, dma->prot);
- pr_warn("%s: RLIMIT_MEMLOCK (%ld) "
- "exceeded\n", __func__,
- limit << PAGE_SHIFT);
- break;
- }
- lock_acct++;
- }
+ lock_acct++;
}
}
- vfio_lock_acct(dma->task, lock_acct);
- ret = i;
+out:
+ vfio_lock_acct(current, lock_acct, IS_CURRENT);
-pin_pg_remote_exit:
- mmput(mm);
- return ret;
+ return pinned;
}
static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
unsigned long pfn, long npage,
- bool do_accounting)
+ bool do_accounting, bool is_current)
{
long unlocked = 0, locked = 0;
long i;
- for (i = 0; i < npage; i++) {
+ for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
if (put_pfn(pfn++, dma->prot)) {
unlocked++;
- if (vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT)))
+ if (vfio_find_vpfn(dma, iova))
locked++;
}
}
if (do_accounting)
- vfio_lock_acct(dma->task, locked - unlocked);
+ vfio_lock_acct(dma->task, locked - unlocked, is_current);
return unlocked;
}
@@ -501,7 +509,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
if (!mm)
return -ENODEV;
- ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
+ ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base, !IS_CURRENT);
if (ret)
goto pin_page_exit;
@@ -518,7 +526,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
}
if (!rsvd && do_accounting)
- vfio_lock_acct(dma->task, 1);
+ vfio_lock_acct(dma->task, 1, !IS_CURRENT);
ret = 1;
pin_page_exit:
@@ -538,7 +546,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
if (do_accounting)
- vfio_lock_acct(dma->task, -unlocked);
+ vfio_lock_acct(dma->task, -unlocked, !IS_CURRENT);
return unlocked;
}
@@ -671,7 +679,7 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
}
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
- bool do_accounting)
+ bool do_accounting, bool is_current)
{
dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
struct vfio_domain *domain, *d;
@@ -727,7 +735,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
unlocked += vfio_unpin_pages_remote(dma, iova,
phys >> PAGE_SHIFT,
unmapped >> PAGE_SHIFT,
- false);
+ !DO_ACCOUNTING, is_current);
iova += unmapped;
cond_resched();
@@ -735,15 +743,16 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
dma->iommu_mapped = false;
if (do_accounting) {
- vfio_lock_acct(dma->task, -unlocked);
+ vfio_lock_acct(dma->task, -unlocked, is_current);
return 0;
}
return unlocked;
}
-static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
+static void vfio_remove_dma(struct vfio_iommu *iommu,
+ struct vfio_dma *dma, bool is_current)
{
- vfio_unmap_unpin(iommu, dma, true);
+ vfio_unmap_unpin(iommu, dma, DO_ACCOUNTING, is_current);
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
kfree(dma);
@@ -874,7 +883,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
goto again;
}
unmapped += dma->size;
- vfio_remove_dma(iommu, dma);
+ vfio_remove_dma(iommu, dma, IS_CURRENT);
}
unlock:
@@ -964,7 +973,8 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
dma->prot);
if (ret) {
vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
- npage, true);
+ npage, DO_ACCOUNTING,
+ IS_CURRENT);
break;
}
@@ -975,7 +985,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
dma->iommu_mapped = true;
if (ret)
- vfio_remove_dma(iommu, dma);
+ vfio_remove_dma(iommu, dma, IS_CURRENT);
return ret;
}
@@ -1322,7 +1332,9 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
struct rb_node *node;
while ((node = rb_first(&iommu->dma_list)))
- vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
+ vfio_remove_dma(iommu,
+ rb_entry(node, struct vfio_dma, node),
+ !IS_CURRENT);
}
static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
@@ -1335,7 +1347,8 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
long locked = 0, unlocked = 0;
dma = rb_entry(n, struct vfio_dma, node);
- unlocked += vfio_unmap_unpin(iommu, dma, false);
+ unlocked += vfio_unmap_unpin(iommu, dma,
+ !DO_ACCOUNTING, !IS_CURRENT);
p = rb_first(&dma->pfn_list);
for (; p; p = rb_next(p)) {
struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
@@ -1344,7 +1357,7 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
if (!is_invalid_reserved_pfn(vpfn->pfn))
locked++;
}
- vfio_lock_acct(dma->task, locked - unlocked);
+ vfio_lock_acct(dma->task, locked - unlocked, !IS_CURRENT);
}
}
On 12/14/2016 2:28 AM, Alex Williamson wrote:
> As part of the mdev support, type1 now gets a task reference per
> vfio_dma and uses that to get an mm reference for the task while
> working on accounting. That's the correct thing to do for paths
> where we can't rely on using current, but there are still hot paths
> where we can optimize because we know we're invoked by the user.
>
> Specifically, vfio_pin_pages_remote() is only called when the user
> does DMA mapping (vfio_dma_do_map) or if an IOMMU group is added to
> a container with existing mappings (vfio_iommu_replay). We can
> therefore use current->mm as well as rlimit() and capable() directly
> rather than going through the high overhead path via the stored
> task_struct. We also know that vfio_dma_do_unmap() is only called
> via user ioctl, so we can also tune that path to be more lightweight.
>
> In a synthetic guest mapping test emulating a 1TB VM backed by a
> single 4GB range remapped multiple times across the address space,
> the mdev changes to the type1 backend introduced a roughly 25% hit
> in runtime of this test. These changes restore it to nearly the
> previous performance for the interfaces exercised here,
> VFIO_IOMMU_MAP_DMA and release on close.
>
> Signed-off-by: Alex Williamson <[email protected]>
> ---
> drivers/vfio/vfio_iommu_type1.c | 145 +++++++++++++++++++++------------------
> 1 file changed, 79 insertions(+), 66 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 9815e45..8dfeafb 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -103,6 +103,10 @@ struct vfio_pfn {
> #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
> (!list_empty(&iommu->domain_list))
>
> +/* Make function bool options readable */
> +#define IS_CURRENT (true)
> +#define DO_ACCOUNTING (true)
> +
> static int put_pfn(unsigned long pfn, int prot);
>
> /*
> @@ -264,7 +268,8 @@ static void vfio_lock_acct_bg(struct work_struct *work)
> kfree(vwork);
> }
>
> -static void vfio_lock_acct(struct task_struct *task, long npage)
> +static void vfio_lock_acct(struct task_struct *task,
> + long npage, bool is_current)
> {
> struct vwork *vwork;
> struct mm_struct *mm;
> @@ -272,24 +277,31 @@ static void vfio_lock_acct(struct task_struct *task, long npage)
> if (!npage)
> return;
>
> - mm = get_task_mm(task);
> + mm = is_current ? task->mm : get_task_mm(task);
> if (!mm)
> - return; /* process exited or nothing to do */
> + return; /* process exited */
>
> if (down_write_trylock(&mm->mmap_sem)) {
> mm->locked_vm += npage;
> up_write(&mm->mmap_sem);
> - mmput(mm);
> + if (!is_current)
> + mmput(mm);
> return;
> }
>
> + if (is_current) {
> + mm = get_task_mm(task);
> + if (!mm)
> + return;
> + }
> +
> /*
> * Couldn't get mmap_sem lock, so must setup to update
> * mm->locked_vm later. If locked_vm were atomic, we
> * wouldn't need this silliness
> */
> vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> - if (!vwork) {
> + if (WARN_ON(!vwork)) {
> mmput(mm);
> return;
> }
> @@ -345,13 +357,13 @@ static int put_pfn(unsigned long pfn, int prot)
> }
>
> static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> - int prot, unsigned long *pfn)
> + int prot, unsigned long *pfn, bool is_current)
> {
> struct page *page[1];
> struct vm_area_struct *vma;
> int ret;
>
> - if (mm == current->mm) {
> + if (is_current) {
With this change, if vfio_pin_page_external() gets called from QEMU
process context, for example in response to some BAR0 register access,
it will still fallback to slow path, get_user_pages_remote(). We don't
have to change this function. This path already takes care of taking
best possible path.
That also makes me think, vfio_pin_page_external() uses task structure
to get mlock limit and capability. Expectation is mdev vendor driver
shouldn't pin all system memory, but if any mdev driver does that, then
that driver might see such performance impact. Should we optimize this
path if (dma->task == current)?
Thanks,
Kirti
> ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE),
> page);
> } else {
> @@ -393,96 +405,92 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> long npage, unsigned long *pfn_base)
> {
> - unsigned long limit;
> - bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
> - CAP_IPC_LOCK);
> - struct mm_struct *mm;
> - long ret, i = 0, lock_acct = 0;
> + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> + bool lock_cap = capable(CAP_IPC_LOCK);
> + long ret, pinned = 0, lock_acct = 0;
> bool rsvd;
> dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
>
> - mm = get_task_mm(dma->task);
> - if (!mm)
> + /* This code path is only user initiated */
> + if (!current->mm)
> return -ENODEV;
>
> - ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
> + ret = vaddr_get_pfn(current->mm, vaddr,
> + dma->prot, pfn_base, IS_CURRENT);
> if (ret)
> - goto pin_pg_remote_exit;
> + return ret;
>
> + pinned++;
> rsvd = is_invalid_reserved_pfn(*pfn_base);
> - limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>
> /*
> * Reserved pages aren't counted against the user, externally pinned
> * pages are already counted against the user.
> */
> if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> - if (!lock_cap && mm->locked_vm + 1 > limit) {
> + if (!lock_cap && current->mm->locked_vm + 1 > limit) {
> put_pfn(*pfn_base, dma->prot);
> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> limit << PAGE_SHIFT);
> - ret = -ENOMEM;
> - goto pin_pg_remote_exit;
> + return -ENOMEM;
> }
> lock_acct++;
> }
>
> - i++;
> - if (likely(!disable_hugepages)) {
> - /* Lock all the consecutive pages from pfn_base */
> - for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage;
> - i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> - unsigned long pfn = 0;
> + if (unlikely(disable_hugepages))
> + goto out;
>
> - ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn);
> - if (ret)
> - break;
> + /* Lock all the consecutive pages from pfn_base */
> + for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> + pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> + unsigned long pfn = 0;
>
> - if (pfn != *pfn_base + i ||
> - rsvd != is_invalid_reserved_pfn(pfn)) {
> + ret = vaddr_get_pfn(current->mm, vaddr,
> + dma->prot, &pfn, IS_CURRENT);
> + if (ret)
> + break;
> +
> + if (pfn != *pfn_base + pinned ||
> + rsvd != is_invalid_reserved_pfn(pfn)) {
> + put_pfn(pfn, dma->prot);
> + break;
> + }
> +
> + if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> + if (!lock_cap &&
> + current->mm->locked_vm + lock_acct + 1 > limit) {
> put_pfn(pfn, dma->prot);
> + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> + __func__, limit << PAGE_SHIFT);
> break;
> }
> -
> - if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> - if (!lock_cap &&
> - mm->locked_vm + lock_acct + 1 > limit) {
> - put_pfn(pfn, dma->prot);
> - pr_warn("%s: RLIMIT_MEMLOCK (%ld) "
> - "exceeded\n", __func__,
> - limit << PAGE_SHIFT);
> - break;
> - }
> - lock_acct++;
> - }
> + lock_acct++;
> }
> }
>
> - vfio_lock_acct(dma->task, lock_acct);
> - ret = i;
> +out:
> + vfio_lock_acct(current, lock_acct, IS_CURRENT);
>
> -pin_pg_remote_exit:
> - mmput(mm);
> - return ret;
> + return pinned;
> }
>
> static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> unsigned long pfn, long npage,
> - bool do_accounting)
> + bool do_accounting, bool is_current)
> {
> long unlocked = 0, locked = 0;
> long i;
>
> - for (i = 0; i < npage; i++) {
> + for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> if (put_pfn(pfn++, dma->prot)) {
> unlocked++;
> - if (vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT)))
> + if (vfio_find_vpfn(dma, iova))
> locked++;
> }
> }
>
> if (do_accounting)
> - vfio_lock_acct(dma->task, locked - unlocked);
> + vfio_lock_acct(dma->task, locked - unlocked, is_current);
>
> return unlocked;
> }
> @@ -501,7 +509,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> if (!mm)
> return -ENODEV;
>
> - ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
> + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base, !IS_CURRENT);
> if (ret)
> goto pin_page_exit;
>
> @@ -518,7 +526,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> }
>
> if (!rsvd && do_accounting)
> - vfio_lock_acct(dma->task, 1);
> + vfio_lock_acct(dma->task, 1, !IS_CURRENT);
> ret = 1;
>
> pin_page_exit:
> @@ -538,7 +546,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
>
> if (do_accounting)
> - vfio_lock_acct(dma->task, -unlocked);
> + vfio_lock_acct(dma->task, -unlocked, !IS_CURRENT);
>
> return unlocked;
> }
> @@ -671,7 +679,7 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> }
>
> static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> - bool do_accounting)
> + bool do_accounting, bool is_current)
> {
> dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> struct vfio_domain *domain, *d;
> @@ -727,7 +735,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> unlocked += vfio_unpin_pages_remote(dma, iova,
> phys >> PAGE_SHIFT,
> unmapped >> PAGE_SHIFT,
> - false);
> + !DO_ACCOUNTING, is_current);
> iova += unmapped;
>
> cond_resched();
> @@ -735,15 +743,16 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
>
> dma->iommu_mapped = false;
> if (do_accounting) {
> - vfio_lock_acct(dma->task, -unlocked);
> + vfio_lock_acct(dma->task, -unlocked, is_current);
> return 0;
> }
> return unlocked;
> }
>
> -static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> +static void vfio_remove_dma(struct vfio_iommu *iommu,
> + struct vfio_dma *dma, bool is_current)
> {
> - vfio_unmap_unpin(iommu, dma, true);
> + vfio_unmap_unpin(iommu, dma, DO_ACCOUNTING, is_current);
> vfio_unlink_dma(iommu, dma);
> put_task_struct(dma->task);
> kfree(dma);
> @@ -874,7 +883,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> goto again;
> }
> unmapped += dma->size;
> - vfio_remove_dma(iommu, dma);
> + vfio_remove_dma(iommu, dma, IS_CURRENT);
> }
>
> unlock:
> @@ -964,7 +973,8 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> dma->prot);
> if (ret) {
> vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
> - npage, true);
> + npage, DO_ACCOUNTING,
> + IS_CURRENT);
> break;
> }
>
> @@ -975,7 +985,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> dma->iommu_mapped = true;
>
> if (ret)
> - vfio_remove_dma(iommu, dma);
> + vfio_remove_dma(iommu, dma, IS_CURRENT);
>
> return ret;
> }
> @@ -1322,7 +1332,9 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
> struct rb_node *node;
>
> while ((node = rb_first(&iommu->dma_list)))
> - vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
> + vfio_remove_dma(iommu,
> + rb_entry(node, struct vfio_dma, node),
> + !IS_CURRENT);
> }
>
> static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> @@ -1335,7 +1347,8 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> long locked = 0, unlocked = 0;
>
> dma = rb_entry(n, struct vfio_dma, node);
> - unlocked += vfio_unmap_unpin(iommu, dma, false);
> + unlocked += vfio_unmap_unpin(iommu, dma,
> + !DO_ACCOUNTING, !IS_CURRENT);
> p = rb_first(&dma->pfn_list);
> for (; p; p = rb_next(p)) {
> struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> @@ -1344,7 +1357,7 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> if (!is_invalid_reserved_pfn(vpfn->pfn))
> locked++;
> }
> - vfio_lock_acct(dma->task, locked - unlocked);
> + vfio_lock_acct(dma->task, locked - unlocked, !IS_CURRENT);
> }
> }
>
>
On Thu, 15 Dec 2016 12:05:35 +0530
Kirti Wankhede <[email protected]> wrote:
> On 12/14/2016 2:28 AM, Alex Williamson wrote:
> > As part of the mdev support, type1 now gets a task reference per
> > vfio_dma and uses that to get an mm reference for the task while
> > working on accounting. That's the correct thing to do for paths
> > where we can't rely on using current, but there are still hot paths
> > where we can optimize because we know we're invoked by the user.
> >
> > Specifically, vfio_pin_pages_remote() is only called when the user
> > does DMA mapping (vfio_dma_do_map) or if an IOMMU group is added to
> > a container with existing mappings (vfio_iommu_replay). We can
> > therefore use current->mm as well as rlimit() and capable() directly
> > rather than going through the high overhead path via the stored
> > task_struct. We also know that vfio_dma_do_unmap() is only called
> > via user ioctl, so we can also tune that path to be more lightweight.
> >
> > In a synthetic guest mapping test emulating a 1TB VM backed by a
> > single 4GB range remapped multiple times across the address space,
> > the mdev changes to the type1 backend introduced a roughly 25% hit
> > in runtime of this test. These changes restore it to nearly the
> > previous performance for the interfaces exercised here,
> > VFIO_IOMMU_MAP_DMA and release on close.
> >
> > Signed-off-by: Alex Williamson <[email protected]>
> > ---
> > drivers/vfio/vfio_iommu_type1.c | 145 +++++++++++++++++++++------------------
> > 1 file changed, 79 insertions(+), 66 deletions(-)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 9815e45..8dfeafb 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -103,6 +103,10 @@ struct vfio_pfn {
> > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
> > (!list_empty(&iommu->domain_list))
> >
> > +/* Make function bool options readable */
> > +#define IS_CURRENT (true)
> > +#define DO_ACCOUNTING (true)
> > +
> > static int put_pfn(unsigned long pfn, int prot);
> >
> > /*
> > @@ -264,7 +268,8 @@ static void vfio_lock_acct_bg(struct work_struct *work)
> > kfree(vwork);
> > }
> >
> > -static void vfio_lock_acct(struct task_struct *task, long npage)
> > +static void vfio_lock_acct(struct task_struct *task,
> > + long npage, bool is_current)
> > {
> > struct vwork *vwork;
> > struct mm_struct *mm;
> > @@ -272,24 +277,31 @@ static void vfio_lock_acct(struct task_struct *task, long npage)
> > if (!npage)
> > return;
> >
> > - mm = get_task_mm(task);
> > + mm = is_current ? task->mm : get_task_mm(task);
> > if (!mm)
> > - return; /* process exited or nothing to do */
> > + return; /* process exited */
> >
> > if (down_write_trylock(&mm->mmap_sem)) {
> > mm->locked_vm += npage;
> > up_write(&mm->mmap_sem);
> > - mmput(mm);
> > + if (!is_current)
> > + mmput(mm);
> > return;
> > }
> >
> > + if (is_current) {
> > + mm = get_task_mm(task);
> > + if (!mm)
> > + return;
> > + }
> > +
> > /*
> > * Couldn't get mmap_sem lock, so must setup to update
> > * mm->locked_vm later. If locked_vm were atomic, we
> > * wouldn't need this silliness
> > */
> > vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> > - if (!vwork) {
> > + if (WARN_ON(!vwork)) {
> > mmput(mm);
> > return;
> > }
> > @@ -345,13 +357,13 @@ static int put_pfn(unsigned long pfn, int prot)
> > }
> >
> > static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> > - int prot, unsigned long *pfn)
> > + int prot, unsigned long *pfn, bool is_current)
> > {
> > struct page *page[1];
> > struct vm_area_struct *vma;
> > int ret;
> >
> > - if (mm == current->mm) {
> > + if (is_current) {
>
> With this change, if vfio_pin_page_external() gets called from QEMU
> process context, for example in response to some BAR0 register access,
> it will still fallback to slow path, get_user_pages_remote(). We don't
> have to change this function. This path already takes care of taking
> best possible path.
>
> That also makes me think, vfio_pin_page_external() uses task structure
> to get mlock limit and capability. Expectation is mdev vendor driver
> shouldn't pin all system memory, but if any mdev driver does that, then
> that driver might see such performance impact. Should we optimize this
> path if (dma->task == current)?
Hi Kirti,
I was actually trying to avoid the (task == current) test with this
change because I wasn't sure how reliable it is. Is there a
possibility that this test generates a false positive if current
coincidentally matches our task and does that allow us the same
opportunities for making use of current that we have when we know in a
process context execution path? The above change makes this a more
direct association. Can you show that inferring the process context is
correct? Thanks,
Alex
> > ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE),
> > page);
> > } else {
> > @@ -393,96 +405,92 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> > static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> > long npage, unsigned long *pfn_base)
> > {
> > - unsigned long limit;
> > - bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
> > - CAP_IPC_LOCK);
> > - struct mm_struct *mm;
> > - long ret, i = 0, lock_acct = 0;
> > + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > + bool lock_cap = capable(CAP_IPC_LOCK);
> > + long ret, pinned = 0, lock_acct = 0;
> > bool rsvd;
> > dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
> >
> > - mm = get_task_mm(dma->task);
> > - if (!mm)
> > + /* This code path is only user initiated */
> > + if (!current->mm)
> > return -ENODEV;
> >
> > - ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
> > + ret = vaddr_get_pfn(current->mm, vaddr,
> > + dma->prot, pfn_base, IS_CURRENT);
> > if (ret)
> > - goto pin_pg_remote_exit;
> > + return ret;
> >
> > + pinned++;
> > rsvd = is_invalid_reserved_pfn(*pfn_base);
> > - limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >
> > /*
> > * Reserved pages aren't counted against the user, externally pinned
> > * pages are already counted against the user.
> > */
> > if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> > - if (!lock_cap && mm->locked_vm + 1 > limit) {
> > + if (!lock_cap && current->mm->locked_vm + 1 > limit) {
> > put_pfn(*pfn_base, dma->prot);
> > pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> > limit << PAGE_SHIFT);
> > - ret = -ENOMEM;
> > - goto pin_pg_remote_exit;
> > + return -ENOMEM;
> > }
> > lock_acct++;
> > }
> >
> > - i++;
> > - if (likely(!disable_hugepages)) {
> > - /* Lock all the consecutive pages from pfn_base */
> > - for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage;
> > - i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> > - unsigned long pfn = 0;
> > + if (unlikely(disable_hugepages))
> > + goto out;
> >
> > - ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn);
> > - if (ret)
> > - break;
> > + /* Lock all the consecutive pages from pfn_base */
> > + for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> > + pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> > + unsigned long pfn = 0;
> >
> > - if (pfn != *pfn_base + i ||
> > - rsvd != is_invalid_reserved_pfn(pfn)) {
> > + ret = vaddr_get_pfn(current->mm, vaddr,
> > + dma->prot, &pfn, IS_CURRENT);
> > + if (ret)
> > + break;
> > +
> > + if (pfn != *pfn_base + pinned ||
> > + rsvd != is_invalid_reserved_pfn(pfn)) {
> > + put_pfn(pfn, dma->prot);
> > + break;
> > + }
> > +
> > + if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> > + if (!lock_cap &&
> > + current->mm->locked_vm + lock_acct + 1 > limit) {
> > put_pfn(pfn, dma->prot);
> > + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > + __func__, limit << PAGE_SHIFT);
> > break;
> > }
> > -
> > - if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> > - if (!lock_cap &&
> > - mm->locked_vm + lock_acct + 1 > limit) {
> > - put_pfn(pfn, dma->prot);
> > - pr_warn("%s: RLIMIT_MEMLOCK (%ld) "
> > - "exceeded\n", __func__,
> > - limit << PAGE_SHIFT);
> > - break;
> > - }
> > - lock_acct++;
> > - }
> > + lock_acct++;
> > }
> > }
> >
> > - vfio_lock_acct(dma->task, lock_acct);
> > - ret = i;
> > +out:
> > + vfio_lock_acct(current, lock_acct, IS_CURRENT);
> >
> > -pin_pg_remote_exit:
> > - mmput(mm);
> > - return ret;
> > + return pinned;
> > }
> >
> > static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> > unsigned long pfn, long npage,
> > - bool do_accounting)
> > + bool do_accounting, bool is_current)
> > {
> > long unlocked = 0, locked = 0;
> > long i;
> >
> > - for (i = 0; i < npage; i++) {
> > + for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> > if (put_pfn(pfn++, dma->prot)) {
> > unlocked++;
> > - if (vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT)))
> > + if (vfio_find_vpfn(dma, iova))
> > locked++;
> > }
> > }
> >
> > if (do_accounting)
> > - vfio_lock_acct(dma->task, locked - unlocked);
> > + vfio_lock_acct(dma->task, locked - unlocked, is_current);
> >
> > return unlocked;
> > }
> > @@ -501,7 +509,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> > if (!mm)
> > return -ENODEV;
> >
> > - ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
> > + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base, !IS_CURRENT);
> > if (ret)
> > goto pin_page_exit;
> >
> > @@ -518,7 +526,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> > }
> >
> > if (!rsvd && do_accounting)
> > - vfio_lock_acct(dma->task, 1);
> > + vfio_lock_acct(dma->task, 1, !IS_CURRENT);
> > ret = 1;
> >
> > pin_page_exit:
> > @@ -538,7 +546,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> > unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
> >
> > if (do_accounting)
> > - vfio_lock_acct(dma->task, -unlocked);
> > + vfio_lock_acct(dma->task, -unlocked, !IS_CURRENT);
> >
> > return unlocked;
> > }
> > @@ -671,7 +679,7 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> > }
> >
> > static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > - bool do_accounting)
> > + bool do_accounting, bool is_current)
> > {
> > dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> > struct vfio_domain *domain, *d;
> > @@ -727,7 +735,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > unlocked += vfio_unpin_pages_remote(dma, iova,
> > phys >> PAGE_SHIFT,
> > unmapped >> PAGE_SHIFT,
> > - false);
> > + !DO_ACCOUNTING, is_current);
> > iova += unmapped;
> >
> > cond_resched();
> > @@ -735,15 +743,16 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> >
> > dma->iommu_mapped = false;
> > if (do_accounting) {
> > - vfio_lock_acct(dma->task, -unlocked);
> > + vfio_lock_acct(dma->task, -unlocked, is_current);
> > return 0;
> > }
> > return unlocked;
> > }
> >
> > -static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > +static void vfio_remove_dma(struct vfio_iommu *iommu,
> > + struct vfio_dma *dma, bool is_current)
> > {
> > - vfio_unmap_unpin(iommu, dma, true);
> > + vfio_unmap_unpin(iommu, dma, DO_ACCOUNTING, is_current);
> > vfio_unlink_dma(iommu, dma);
> > put_task_struct(dma->task);
> > kfree(dma);
> > @@ -874,7 +883,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > goto again;
> > }
> > unmapped += dma->size;
> > - vfio_remove_dma(iommu, dma);
> > + vfio_remove_dma(iommu, dma, IS_CURRENT);
> > }
> >
> > unlock:
> > @@ -964,7 +973,8 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > dma->prot);
> > if (ret) {
> > vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
> > - npage, true);
> > + npage, DO_ACCOUNTING,
> > + IS_CURRENT);
> > break;
> > }
> >
> > @@ -975,7 +985,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > dma->iommu_mapped = true;
> >
> > if (ret)
> > - vfio_remove_dma(iommu, dma);
> > + vfio_remove_dma(iommu, dma, IS_CURRENT);
> >
> > return ret;
> > }
> > @@ -1322,7 +1332,9 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
> > struct rb_node *node;
> >
> > while ((node = rb_first(&iommu->dma_list)))
> > - vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
> > + vfio_remove_dma(iommu,
> > + rb_entry(node, struct vfio_dma, node),
> > + !IS_CURRENT);
> > }
> >
> > static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> > @@ -1335,7 +1347,8 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> > long locked = 0, unlocked = 0;
> >
> > dma = rb_entry(n, struct vfio_dma, node);
> > - unlocked += vfio_unmap_unpin(iommu, dma, false);
> > + unlocked += vfio_unmap_unpin(iommu, dma,
> > + !DO_ACCOUNTING, !IS_CURRENT);
> > p = rb_first(&dma->pfn_list);
> > for (; p; p = rb_next(p)) {
> > struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> > @@ -1344,7 +1357,7 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> > if (!is_invalid_reserved_pfn(vpfn->pfn))
> > locked++;
> > }
> > - vfio_lock_acct(dma->task, locked - unlocked);
> > + vfio_lock_acct(dma->task, locked - unlocked, !IS_CURRENT);
> > }
> > }
> >
> >
On 12/15/2016 1:33 PM, Alex Williamson wrote:
> On Thu, 15 Dec 2016 12:05:35 +0530
> Kirti Wankhede <[email protected]> wrote:
>
>> On 12/14/2016 2:28 AM, Alex Williamson wrote:
>>> As part of the mdev support, type1 now gets a task reference per
>>> vfio_dma and uses that to get an mm reference for the task while
>>> working on accounting. That's the correct thing to do for paths
>>> where we can't rely on using current, but there are still hot paths
>>> where we can optimize because we know we're invoked by the user.
>>>
>>> Specifically, vfio_pin_pages_remote() is only called when the user
>>> does DMA mapping (vfio_dma_do_map) or if an IOMMU group is added to
>>> a container with existing mappings (vfio_iommu_replay). We can
>>> therefore use current->mm as well as rlimit() and capable() directly
>>> rather than going through the high overhead path via the stored
>>> task_struct. We also know that vfio_dma_do_unmap() is only called
>>> via user ioctl, so we can also tune that path to be more lightweight.
>>>
>>> In a synthetic guest mapping test emulating a 1TB VM backed by a
>>> single 4GB range remapped multiple times across the address space,
>>> the mdev changes to the type1 backend introduced a roughly 25% hit
>>> in runtime of this test. These changes restore it to nearly the
>>> previous performance for the interfaces exercised here,
>>> VFIO_IOMMU_MAP_DMA and release on close.
>>>
>>> Signed-off-by: Alex Williamson <[email protected]>
>>> ---
>>> drivers/vfio/vfio_iommu_type1.c | 145 +++++++++++++++++++++------------------
>>> 1 file changed, 79 insertions(+), 66 deletions(-)
>>>
>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>> index 9815e45..8dfeafb 100644
>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>> @@ -103,6 +103,10 @@ struct vfio_pfn {
>>> #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
>>> (!list_empty(&iommu->domain_list))
>>>
>>> +/* Make function bool options readable */
>>> +#define IS_CURRENT (true)
>>> +#define DO_ACCOUNTING (true)
>>> +
>>> static int put_pfn(unsigned long pfn, int prot);
>>>
>>> /*
>>> @@ -264,7 +268,8 @@ static void vfio_lock_acct_bg(struct work_struct *work)
>>> kfree(vwork);
>>> }
>>>
>>> -static void vfio_lock_acct(struct task_struct *task, long npage)
>>> +static void vfio_lock_acct(struct task_struct *task,
>>> + long npage, bool is_current)
>>> {
>>> struct vwork *vwork;
>>> struct mm_struct *mm;
>>> @@ -272,24 +277,31 @@ static void vfio_lock_acct(struct task_struct *task, long npage)
>>> if (!npage)
>>> return;
>>>
>>> - mm = get_task_mm(task);
>>> + mm = is_current ? task->mm : get_task_mm(task);
>>> if (!mm)
>>> - return; /* process exited or nothing to do */
>>> + return; /* process exited */
>>>
>>> if (down_write_trylock(&mm->mmap_sem)) {
>>> mm->locked_vm += npage;
>>> up_write(&mm->mmap_sem);
>>> - mmput(mm);
>>> + if (!is_current)
>>> + mmput(mm);
>>> return;
>>> }
>>>
>>> + if (is_current) {
>>> + mm = get_task_mm(task);
>>> + if (!mm)
>>> + return;
>>> + }
>>> +
>>> /*
>>> * Couldn't get mmap_sem lock, so must setup to update
>>> * mm->locked_vm later. If locked_vm were atomic, we
>>> * wouldn't need this silliness
>>> */
>>> vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>>> - if (!vwork) {
>>> + if (WARN_ON(!vwork)) {
>>> mmput(mm);
>>> return;
>>> }
>>> @@ -345,13 +357,13 @@ static int put_pfn(unsigned long pfn, int prot)
>>> }
>>>
>>> static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>>> - int prot, unsigned long *pfn)
>>> + int prot, unsigned long *pfn, bool is_current)
>>> {
>>> struct page *page[1];
>>> struct vm_area_struct *vma;
>>> int ret;
>>>
>>> - if (mm == current->mm) {
>>> + if (is_current) {
>>
>> With this change, if vfio_pin_page_external() gets called from QEMU
>> process context, for example in response to some BAR0 register access,
>> it will still fallback to slow path, get_user_pages_remote(). We don't
>> have to change this function. This path already takes care of taking
>> best possible path.
>>
>> That also makes me think, vfio_pin_page_external() uses task structure
>> to get mlock limit and capability. Expectation is mdev vendor driver
>> shouldn't pin all system memory, but if any mdev driver does that, then
>> that driver might see such performance impact. Should we optimize this
>> path if (dma->task == current)?
>
> Hi Kirti,
>
> I was actually trying to avoid the (task == current) test with this
> change because I wasn't sure how reliable it is. Is there a
> possibility that this test generates a false positive if current
> coincidentally matches our task and does that allow us the same
> opportunities for making use of current that we have when we know in a
> process context execution path? The above change makes this a more
> direct association. Can you show that inferring the process context is
> correct? Thanks,
We do hold the usage count of task structure, get_task_struct(current),
before saving its reference in dma->task which is released,
put_task_struct(), from vfio_remove_dma(). That makes sure that we have
a valid reference to task structure till we remove/free that dma
structure. Why would the check (dma->task == current) be false positive?
Vendor driver can call vfio_pin_pages() on access to some emulated
register from the same task who have mapped dma range, in that case this
check would be true.
Thanks,
Kirti
On Thu, 15 Dec 2016 23:27:54 +0530
Kirti Wankhede <[email protected]> wrote:
> On 12/15/2016 1:33 PM, Alex Williamson wrote:
> > On Thu, 15 Dec 2016 12:05:35 +0530
> > Kirti Wankhede <[email protected]> wrote:
> >
> >> On 12/14/2016 2:28 AM, Alex Williamson wrote:
> >>> As part of the mdev support, type1 now gets a task reference per
> >>> vfio_dma and uses that to get an mm reference for the task while
> >>> working on accounting. That's the correct thing to do for paths
> >>> where we can't rely on using current, but there are still hot paths
> >>> where we can optimize because we know we're invoked by the user.
> >>>
> >>> Specifically, vfio_pin_pages_remote() is only called when the user
> >>> does DMA mapping (vfio_dma_do_map) or if an IOMMU group is added to
> >>> a container with existing mappings (vfio_iommu_replay). We can
> >>> therefore use current->mm as well as rlimit() and capable() directly
> >>> rather than going through the high overhead path via the stored
> >>> task_struct. We also know that vfio_dma_do_unmap() is only called
> >>> via user ioctl, so we can also tune that path to be more lightweight.
> >>>
> >>> In a synthetic guest mapping test emulating a 1TB VM backed by a
> >>> single 4GB range remapped multiple times across the address space,
> >>> the mdev changes to the type1 backend introduced a roughly 25% hit
> >>> in runtime of this test. These changes restore it to nearly the
> >>> previous performance for the interfaces exercised here,
> >>> VFIO_IOMMU_MAP_DMA and release on close.
> >>>
> >>> Signed-off-by: Alex Williamson <[email protected]>
> >>> ---
> >>> drivers/vfio/vfio_iommu_type1.c | 145 +++++++++++++++++++++------------------
> >>> 1 file changed, 79 insertions(+), 66 deletions(-)
> >>>
> >>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >>> index 9815e45..8dfeafb 100644
> >>> --- a/drivers/vfio/vfio_iommu_type1.c
> >>> +++ b/drivers/vfio/vfio_iommu_type1.c
> >>> @@ -103,6 +103,10 @@ struct vfio_pfn {
> >>> #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
> >>> (!list_empty(&iommu->domain_list))
> >>>
> >>> +/* Make function bool options readable */
> >>> +#define IS_CURRENT (true)
> >>> +#define DO_ACCOUNTING (true)
> >>> +
> >>> static int put_pfn(unsigned long pfn, int prot);
> >>>
> >>> /*
> >>> @@ -264,7 +268,8 @@ static void vfio_lock_acct_bg(struct work_struct *work)
> >>> kfree(vwork);
> >>> }
> >>>
> >>> -static void vfio_lock_acct(struct task_struct *task, long npage)
> >>> +static void vfio_lock_acct(struct task_struct *task,
> >>> + long npage, bool is_current)
> >>> {
> >>> struct vwork *vwork;
> >>> struct mm_struct *mm;
> >>> @@ -272,24 +277,31 @@ static void vfio_lock_acct(struct task_struct *task, long npage)
> >>> if (!npage)
> >>> return;
> >>>
> >>> - mm = get_task_mm(task);
> >>> + mm = is_current ? task->mm : get_task_mm(task);
> >>> if (!mm)
> >>> - return; /* process exited or nothing to do */
> >>> + return; /* process exited */
> >>>
> >>> if (down_write_trylock(&mm->mmap_sem)) {
> >>> mm->locked_vm += npage;
> >>> up_write(&mm->mmap_sem);
> >>> - mmput(mm);
> >>> + if (!is_current)
> >>> + mmput(mm);
> >>> return;
> >>> }
> >>>
> >>> + if (is_current) {
> >>> + mm = get_task_mm(task);
> >>> + if (!mm)
> >>> + return;
> >>> + }
> >>> +
> >>> /*
> >>> * Couldn't get mmap_sem lock, so must setup to update
> >>> * mm->locked_vm later. If locked_vm were atomic, we
> >>> * wouldn't need this silliness
> >>> */
> >>> vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> >>> - if (!vwork) {
> >>> + if (WARN_ON(!vwork)) {
> >>> mmput(mm);
> >>> return;
> >>> }
> >>> @@ -345,13 +357,13 @@ static int put_pfn(unsigned long pfn, int prot)
> >>> }
> >>>
> >>> static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> >>> - int prot, unsigned long *pfn)
> >>> + int prot, unsigned long *pfn, bool is_current)
> >>> {
> >>> struct page *page[1];
> >>> struct vm_area_struct *vma;
> >>> int ret;
> >>>
> >>> - if (mm == current->mm) {
> >>> + if (is_current) {
> >>
> >> With this change, if vfio_pin_page_external() gets called from QEMU
> >> process context, for example in response to some BAR0 register access,
> >> it will still fallback to slow path, get_user_pages_remote(). We don't
> >> have to change this function. This path already takes care of taking
> >> best possible path.
> >>
> >> That also makes me think, vfio_pin_page_external() uses task structure
> >> to get mlock limit and capability. Expectation is mdev vendor driver
> >> shouldn't pin all system memory, but if any mdev driver does that, then
> >> that driver might see such performance impact. Should we optimize this
> >> path if (dma->task == current)?
> >
> > Hi Kirti,
> >
> > I was actually trying to avoid the (task == current) test with this
> > change because I wasn't sure how reliable it is. Is there a
> > possibility that this test generates a false positive if current
> > coincidentally matches our task and does that allow us the same
> > opportunities for making use of current that we have when we know in a
> > process context execution path? The above change makes this a more
> > direct association. Can you show that inferring the process context is
> > correct? Thanks,
>
> We do hold the usage count of task structure, get_task_struct(current),
> before saving its reference in dma->task which is released,
> put_task_struct(), from vfio_remove_dma(). That makes sure that we have
> a valid reference to task structure till we remove/free that dma
> structure. Why would the check (dma->task == current) be false positive?
> Vendor driver can call vfio_pin_pages() on access to some emulated
> register from the same task who have mapped dma range, in that case this
> check would be true.
I agree, the task matching dma->task cannot go away, the question is
whether it's valid to use (current == dma->task) to detect if we're in
the thread of execution of that task, or could we be detecting
coincidental matches where we may get a different result if we're
preempted by an interrupt. TBH, I don't know how current works
at that level of detail to know for sure, which is part of why I made
the above change. It seemed like the safer option to know for certain
which path we're on rather than attempt to detect it. Does current
work as this requires, to detect that we're within the task's thread of
execution and not simply temporarily aligned? Thanks,
Alex
On 12/15/2016 11:57 PM, Alex Williamson wrote:
> On Thu, 15 Dec 2016 23:27:54 +0530
> Kirti Wankhede <[email protected]> wrote:
>
>> On 12/15/2016 1:33 PM, Alex Williamson wrote:
>>> On Thu, 15 Dec 2016 12:05:35 +0530
>>> Kirti Wankhede <[email protected]> wrote:
>>>
>>>> On 12/14/2016 2:28 AM, Alex Williamson wrote:
>>>>> As part of the mdev support, type1 now gets a task reference per
>>>>> vfio_dma and uses that to get an mm reference for the task while
>>>>> working on accounting. That's the correct thing to do for paths
>>>>> where we can't rely on using current, but there are still hot paths
>>>>> where we can optimize because we know we're invoked by the user.
>>>>>
>>>>> Specifically, vfio_pin_pages_remote() is only called when the user
>>>>> does DMA mapping (vfio_dma_do_map) or if an IOMMU group is added to
>>>>> a container with existing mappings (vfio_iommu_replay). We can
>>>>> therefore use current->mm as well as rlimit() and capable() directly
>>>>> rather than going through the high overhead path via the stored
>>>>> task_struct. We also know that vfio_dma_do_unmap() is only called
>>>>> via user ioctl, so we can also tune that path to be more lightweight.
>>>>>
>>>>> In a synthetic guest mapping test emulating a 1TB VM backed by a
>>>>> single 4GB range remapped multiple times across the address space,
>>>>> the mdev changes to the type1 backend introduced a roughly 25% hit
>>>>> in runtime of this test. These changes restore it to nearly the
>>>>> previous performance for the interfaces exercised here,
>>>>> VFIO_IOMMU_MAP_DMA and release on close.
>>>>>
>>>>> Signed-off-by: Alex Williamson <[email protected]>
>>>>> ---
>>>>> drivers/vfio/vfio_iommu_type1.c | 145 +++++++++++++++++++++------------------
>>>>> 1 file changed, 79 insertions(+), 66 deletions(-)
>>>>>
>>>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>>>> index 9815e45..8dfeafb 100644
>>>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>>>> @@ -103,6 +103,10 @@ struct vfio_pfn {
>>>>> #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
>>>>> (!list_empty(&iommu->domain_list))
>>>>>
>>>>> +/* Make function bool options readable */
>>>>> +#define IS_CURRENT (true)
>>>>> +#define DO_ACCOUNTING (true)
>>>>> +
>>>>> static int put_pfn(unsigned long pfn, int prot);
>>>>>
>>>>> /*
>>>>> @@ -264,7 +268,8 @@ static void vfio_lock_acct_bg(struct work_struct *work)
>>>>> kfree(vwork);
>>>>> }
>>>>>
>>>>> -static void vfio_lock_acct(struct task_struct *task, long npage)
>>>>> +static void vfio_lock_acct(struct task_struct *task,
>>>>> + long npage, bool is_current)
>>>>> {
>>>>> struct vwork *vwork;
>>>>> struct mm_struct *mm;
>>>>> @@ -272,24 +277,31 @@ static void vfio_lock_acct(struct task_struct *task, long npage)
>>>>> if (!npage)
>>>>> return;
>>>>>
>>>>> - mm = get_task_mm(task);
>>>>> + mm = is_current ? task->mm : get_task_mm(task);
>>>>> if (!mm)
>>>>> - return; /* process exited or nothing to do */
>>>>> + return; /* process exited */
>>>>>
>>>>> if (down_write_trylock(&mm->mmap_sem)) {
>>>>> mm->locked_vm += npage;
>>>>> up_write(&mm->mmap_sem);
>>>>> - mmput(mm);
>>>>> + if (!is_current)
>>>>> + mmput(mm);
>>>>> return;
>>>>> }
>>>>>
>>>>> + if (is_current) {
>>>>> + mm = get_task_mm(task);
>>>>> + if (!mm)
>>>>> + return;
>>>>> + }
>>>>> +
>>>>> /*
>>>>> * Couldn't get mmap_sem lock, so must setup to update
>>>>> * mm->locked_vm later. If locked_vm were atomic, we
>>>>> * wouldn't need this silliness
>>>>> */
>>>>> vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
>>>>> - if (!vwork) {
>>>>> + if (WARN_ON(!vwork)) {
>>>>> mmput(mm);
>>>>> return;
>>>>> }
>>>>> @@ -345,13 +357,13 @@ static int put_pfn(unsigned long pfn, int prot)
>>>>> }
>>>>>
>>>>> static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
>>>>> - int prot, unsigned long *pfn)
>>>>> + int prot, unsigned long *pfn, bool is_current)
>>>>> {
>>>>> struct page *page[1];
>>>>> struct vm_area_struct *vma;
>>>>> int ret;
>>>>>
>>>>> - if (mm == current->mm) {
>>>>> + if (is_current) {
>>>>
>>>> With this change, if vfio_pin_page_external() gets called from QEMU
>>>> process context, for example in response to some BAR0 register access,
>>>> it will still fallback to slow path, get_user_pages_remote(). We don't
>>>> have to change this function. This path already takes care of taking
>>>> best possible path.
>>>>
>>>> That also makes me think, vfio_pin_page_external() uses task structure
>>>> to get mlock limit and capability. Expectation is mdev vendor driver
>>>> shouldn't pin all system memory, but if any mdev driver does that, then
>>>> that driver might see such performance impact. Should we optimize this
>>>> path if (dma->task == current)?
>>>
>>> Hi Kirti,
>>>
>>> I was actually trying to avoid the (task == current) test with this
>>> change because I wasn't sure how reliable it is. Is there a
>>> possibility that this test generates a false positive if current
>>> coincidentally matches our task and does that allow us the same
>>> opportunities for making use of current that we have when we know in a
>>> process context execution path? The above change makes this a more
>>> direct association. Can you show that inferring the process context is
>>> correct? Thanks,
>>
>> We do hold the usage count of task structure, get_task_struct(current),
>> before saving its reference in dma->task which is released,
>> put_task_struct(), from vfio_remove_dma(). That makes sure that we have
>> a valid reference to task structure till we remove/free that dma
>> structure. Why would the check (dma->task == current) be false positive?
>> Vendor driver can call vfio_pin_pages() on access to some emulated
>> register from the same task who have mapped dma range, in that case this
>> check would be true.
>
> I agree, the task matching dma->task cannot go away, the question is
> whether it's valid to use (current == dma->task) to detect if we're in
> the thread of execution of that task, or could we be detecting
> coincidental matches where we may get a different result if we're
> preempted by an interrupt. TBH, I don't know how current works
> at that level of detail to know for sure, which is part of why I made
> the above change. It seemed like the safer option to know for certain
> which path we're on rather than attempt to detect it. Does current
> work as this requires, to detect that we're within the task's thread of
> execution and not simply temporarily aligned? Thanks,
>
If preempted by interrupt, then on restoring the task to run again, its
context should resume the current.
If your are still not comfortable, I'm OK to go ahead with your change
in vfio_pin_page_external(). But lets not change vaddr_get_pfn().
Thanks,
Kirti