v5:
- patch 1/ Use bool* to cleanup vfio_lock_acct() callers; sorry
we cannot re-test CAP_IPC_LOCK for all callers
- patch 2/ Re-add pr_warn, add Kirti's R-b
- patch 3/ NEW, analyzing impact of vfio_lock_acct() testing
CAP_IPC_LOCK for all callers revealed a low-hanging
optimization
Thanks for the reviews, keep 'em coming,
Alex
---
Alex Williamson (3):
vfio/type1: Remove locked page accounting workqueue
vfio/type1: Prune vfio_pin_page_external()
vfio/type1: Reduce repetitive calls in vfio_pin_pages_remote()
drivers/vfio/vfio_iommu_type1.c | 150 +++++++++++++++++----------------------
1 file changed, 64 insertions(+), 86 deletions(-)
If the mmap_sem is contended then the vfio type1 IOMMU backend will
defer locked page accounting updates to a workqueue task. This has a
few problems and depending on which side the user tries to play, they
might be over-penalized for unmaps that haven't yet been accounted or
race the workqueue to enter more mappings than they're allowed. The
original intent of this workqueue mechanism seems to be focused on
reducing latency through the ioctl, but we cannot do so at the cost
of correctness. Remove this workqueue mechanism and update the
callers to allow for failure. We can also now recheck the limit under
write lock to make sure we don't exceed it.
vfio_pin_pages_remote() also now necessarily includes an unwind path
which we can jump to directly if the consecutive page pinning finds
that we're exceeding the user's memory limits. This avoids the
current lazy approach which does accounting and mapping up to the
fault, only to return an error on the next iteration to unwind the
entire vfio_dma.
Cc: [email protected]
Signed-off-by: Alex Williamson <[email protected]>
---
drivers/vfio/vfio_iommu_type1.c | 110 ++++++++++++++++++---------------------
1 file changed, 51 insertions(+), 59 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 32d2633092a3..a8a079ba9477 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -246,69 +246,46 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
return ret;
}
-struct vwork {
- struct mm_struct *mm;
- long npage;
- struct work_struct work;
-};
-
-/* delayed decrement/increment for locked_vm */
-static void vfio_lock_acct_bg(struct work_struct *work)
-{
- struct vwork *vwork = container_of(work, struct vwork, work);
- struct mm_struct *mm;
-
- mm = vwork->mm;
- down_write(&mm->mmap_sem);
- mm->locked_vm += vwork->npage;
- up_write(&mm->mmap_sem);
- mmput(mm);
- kfree(vwork);
-}
-
-static void vfio_lock_acct(struct task_struct *task, long npage)
+static int vfio_lock_acct(struct task_struct *task, long npage, bool *lock_cap)
{
- struct vwork *vwork;
struct mm_struct *mm;
bool is_current;
+ int ret;
if (!npage)
- return;
+ return 0;
is_current = (task->mm == current->mm);
mm = is_current ? task->mm : get_task_mm(task);
if (!mm)
- return; /* process exited */
+ return -ESRCH; /* process exited */
- if (down_write_trylock(&mm->mmap_sem)) {
- mm->locked_vm += npage;
- up_write(&mm->mmap_sem);
- if (!is_current)
- mmput(mm);
- return;
- }
+ ret = down_write_killable(&mm->mmap_sem);
+ if (!ret) {
+ if (npage > 0) {
+ if (lock_cap ? !*lock_cap :
+ !has_capability(task, CAP_IPC_LOCK)) {
+ unsigned long limit;
+
+ limit = task_rlimit(task,
+ RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if (mm->locked_vm + npage > limit)
+ ret = -ENOMEM;
+ }
+ }
+
+ if (!ret)
+ mm->locked_vm += npage;
- if (is_current) {
- mm = get_task_mm(task);
- if (!mm)
- return;
+ up_write(&mm->mmap_sem);
}
- /*
- * Couldn't get mmap_sem lock, so must setup to update
- * mm->locked_vm later. If locked_vm were atomic, we
- * wouldn't need this silliness
- */
- vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
- if (WARN_ON(!vwork)) {
+ if (!is_current)
mmput(mm);
- return;
- }
- INIT_WORK(&vwork->work, vfio_lock_acct_bg);
- vwork->mm = mm;
- vwork->npage = npage;
- schedule_work(&vwork->work);
+
+ return ret;
}
/*
@@ -405,7 +382,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
long npage, unsigned long *pfn_base)
{
- unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
bool lock_cap = capable(CAP_IPC_LOCK);
long ret, pinned = 0, lock_acct = 0;
bool rsvd;
@@ -442,8 +419,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
/* Lock all the consecutive pages from pfn_base */
for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
- unsigned long pfn = 0;
-
ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
if (ret)
break;
@@ -460,14 +435,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
put_pfn(pfn, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, limit << PAGE_SHIFT);
- break;
+ ret = -ENOMEM;
+ goto unpin_out;
}
lock_acct++;
}
}
out:
- vfio_lock_acct(current, lock_acct);
+ ret = vfio_lock_acct(current, lock_acct, &lock_cap);
+
+unpin_out:
+ if (ret) {
+ if (!rsvd) {
+ for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
+ put_pfn(pfn, dma->prot);
+ }
+
+ return ret;
+ }
return pinned;
}
@@ -488,7 +474,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
}
if (do_accounting)
- vfio_lock_acct(dma->task, locked - unlocked);
+ vfio_lock_acct(dma->task, locked - unlocked, NULL);
return unlocked;
}
@@ -522,8 +508,14 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
goto pin_page_exit;
}
- if (!rsvd && do_accounting)
- vfio_lock_acct(dma->task, 1);
+ if (!rsvd && do_accounting) {
+ ret = vfio_lock_acct(dma->task, 1, &lock_cap);
+ if (ret) {
+ put_pfn(*pfn_base, dma->prot);
+ goto pin_page_exit;
+ }
+ }
+
ret = 1;
pin_page_exit:
@@ -543,7 +535,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
if (do_accounting)
- vfio_lock_acct(dma->task, -unlocked);
+ vfio_lock_acct(dma->task, -unlocked, NULL);
return unlocked;
}
@@ -740,7 +732,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
dma->iommu_mapped = false;
if (do_accounting) {
- vfio_lock_acct(dma->task, -unlocked);
+ vfio_lock_acct(dma->task, -unlocked, NULL);
return 0;
}
return unlocked;
@@ -1382,7 +1374,7 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
if (!is_invalid_reserved_pfn(vpfn->pfn))
locked++;
}
- vfio_lock_acct(dma->task, locked - unlocked);
+ vfio_lock_acct(dma->task, locked - unlocked, NULL);
}
}
With vfio_lock_acct() testing the locked memory limit under mmap_sem,
it's redundant to do it here for a single page. We can also reorder
our tests such that we can avoid testing for reserved pages if we're
not doing accounting and let vfio_lock_acct() test the process
CAP_IPC_LOCK. Finally, this function oddly returns 1 on success.
Update to return zero on success, -errno on error. Since the function
only pins a single page, there's no need to return the number of pages
pinned.
N.B. vfio_pin_pages_remote() can pin a large contiguous range of pages
before calling vfio_lock_acct(). If we were to similarly remove the
extra test there, a user could temporarily pin far more pages than
they're allowed.
Suggested-by: Kirti Wankhede <[email protected]>
Suggested-by: Eric Auger <[email protected]>
Reviewed-by: Kirti Wankhede <[email protected]>
Signed-off-by: Alex Williamson <[email protected]>
---
drivers/vfio/vfio_iommu_type1.c | 35 ++++++++---------------------------
1 file changed, 8 insertions(+), 27 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a8a079ba9477..372e4f626138 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -482,43 +482,26 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
unsigned long *pfn_base, bool do_accounting)
{
- unsigned long limit;
- bool lock_cap = has_capability(dma->task, CAP_IPC_LOCK);
struct mm_struct *mm;
int ret;
- bool rsvd;
mm = get_task_mm(dma->task);
if (!mm)
return -ENODEV;
ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
- if (ret)
- goto pin_page_exit;
-
- rsvd = is_invalid_reserved_pfn(*pfn_base);
- limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
- put_pfn(*pfn_base, dma->prot);
- pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
- __func__, dma->task->comm, task_pid_nr(dma->task),
- limit << PAGE_SHIFT);
- ret = -ENOMEM;
- goto pin_page_exit;
- }
-
- if (!rsvd && do_accounting) {
- ret = vfio_lock_acct(dma->task, 1, &lock_cap);
+ if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
+ ret = vfio_lock_acct(dma->task, 1, NULL);
if (ret) {
put_pfn(*pfn_base, dma->prot);
- goto pin_page_exit;
+ if (ret == -ENOMEM)
+ pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
+ "(%ld) exceeded\n", __func__,
+ dma->task->comm, task_pid_nr(dma->task),
+ task_rlimit(dma->task, RLIMIT_MEMLOCK));
}
}
- ret = 1;
-
-pin_page_exit:
mmput(mm);
return ret;
}
@@ -598,10 +581,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
remote_vaddr = dma->vaddr + iova - dma->iova;
ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
do_accounting);
- if (ret <= 0) {
- WARN_ON(!ret);
+ if (ret)
goto pin_unwind;
- }
ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
if (ret) {
vfio_pin_pages_remote() is typically called to iterate over a range
of memory. Testing CAP_IPC_LOCK is relatively expensive, so it makes
sense to push it up to the caller, which can then repeatedly call
vfio_pin_pages_remote() using that value. This can show nearly a 20%
improvement on the worst case path through VFIO_IOMMU_MAP_DMA with
contiguous page mapping disabled. Testing RLIMIT_MEMLOCK is much more
lightweight, but we bring it along on the same principle and it does
seem to show a marginal improvement.
Signed-off-by: Alex Williamson <[email protected]>
---
drivers/vfio/vfio_iommu_type1.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 372e4f626138..8549cb111627 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -380,10 +380,10 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
* first page and all consecutive pages with the same locking.
*/
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
- long npage, unsigned long *pfn_base)
+ long npage, unsigned long *pfn_base,
+ bool lock_cap, unsigned long limit)
{
- unsigned long pfn = 0, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- bool lock_cap = capable(CAP_IPC_LOCK);
+ unsigned long pfn = 0;
long ret, pinned = 0, lock_acct = 0;
bool rsvd;
dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
@@ -924,13 +924,15 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
unsigned long vaddr = dma->vaddr;
size_t size = map_size;
long npage;
- unsigned long pfn;
+ unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ bool lock_cap = capable(CAP_IPC_LOCK);
int ret = 0;
while (size) {
/* Pin a contiguous chunk of memory */
npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
- size >> PAGE_SHIFT, &pfn);
+ size >> PAGE_SHIFT, &pfn,
+ lock_cap, limit);
if (npage <= 0) {
WARN_ON(!npage);
ret = (int)npage;
@@ -1040,6 +1042,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
{
struct vfio_domain *d;
struct rb_node *n;
+ unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ bool lock_cap = capable(CAP_IPC_LOCK);
int ret;
/* Arbitrarily pick the first domain in the list for lookups */
@@ -1086,7 +1090,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
npage = vfio_pin_pages_remote(dma, vaddr,
n >> PAGE_SHIFT,
- &pfn);
+ &pfn, lock_cap,
+ limit);
if (npage <= 0) {
WARN_ON(!npage);
ret = (int)npage;
On Mon, Apr 17, 2017 at 04:37:38PM -0600, Alex Williamson wrote:
> v5:
> - patch 1/ Use bool* to cleanup vfio_lock_acct() callers; sorry
> we cannot re-test CAP_IPC_LOCK for all callers
> - patch 2/ Re-add pr_warn, add Kirti's R-b
> - patch 3/ NEW, analyzing impact of vfio_lock_acct() testing
> CAP_IPC_LOCK for all callers revealed a long hanging
> optimization
>
> Thanks for the reviews, keep 'em coming,
All patches look good to me.
Reviewed-by: Peter Xu <[email protected]>
Thanks!
--
Peter Xu
On 4/18/2017 9:53 AM, Peter Xu wrote:
> On Mon, Apr 17, 2017 at 04:37:38PM -0600, Alex Williamson wrote:
>> v5:
>> - patch 1/ Use bool* to cleanup vfio_lock_acct() callers; sorry
>> we cannot re-test CAP_IPC_LOCK for all callers
>> - patch 2/ Re-add pr_warn, add Kirti's R-b
>> - patch 3/ NEW, analyzing impact of vfio_lock_acct() testing
>> CAP_IPC_LOCK for all callers revealed a long hanging
>> optimization
>>
>> Thanks for the reviews, keep 'em coming,
>
> All patches looks good to me.
>
> Reviewed-by: Peter Xu <[email protected]>
>
> Thanks!
>
All three look good to me too.
Reviewed-by: Kirti Wankhede <[email protected]>
Thanks,
Kirti