2021-02-19 16:15:36

by Daniel Jordan

Subject: [PATCH v2 3/3] vfio/type1: Batch page pinning

Pinning one 4K page at a time is inefficient, so do it in batches of 512
instead. This is just an optimization with no functional change
intended, and in particular the driver still calls iommu_map() with the
largest physically contiguous range possible.

Add two fields to vfio_batch to remember where to start between calls to
vfio_pin_pages_remote(), and use vfio_batch_unpin() to release any pages
remaining in the batch in case of error.

qemu pins pages for guests around 8% faster on my test system, a
two-node Broadwell server with 128G memory per node. The qemu process
was bound to one node with its allocations constrained there as well.

                          base                  test
   guest               ----------------      ----------------
 mem (GB)   speedup     avg sec  (std)        avg sec  (std)
        1      7.4%        0.61  (0.00)          0.56  (0.00)
        2      8.3%        0.93  (0.00)          0.85  (0.00)
        4      8.4%        1.46  (0.00)          1.34  (0.00)
        8      8.6%        2.54  (0.01)          2.32  (0.00)
       16      8.3%        4.66  (0.00)          4.27  (0.01)
       32      8.3%        8.94  (0.01)          8.20  (0.01)
       64      8.2%       17.47  (0.01)         16.04  (0.03)
      120      8.5%       32.45  (0.13)         29.69  (0.01)

perf diff confirms less time spent in pup (pin_user_pages). Here are the
top ten functions:

  Baseline  Delta Abs  Symbol

    78.63%     +6.64%  clear_page_erms
     1.50%     -1.50%  __gup_longterm_locked
     1.27%     -0.78%  __get_user_pages
               +0.76%  kvm_zap_rmapp.constprop.0
     0.54%     -0.53%  vmacache_find
     0.55%     -0.51%  get_pfnblock_flags_mask
     0.48%     -0.48%  __get_user_pages_remote
               +0.39%  slot_rmap_walk_next
               +0.32%  vfio_pin_map_dma
               +0.26%  kvm_handle_hva_range
  ...

Suggested-by: Matthew Wilcox (Oracle) <[email protected]>
Signed-off-by: Daniel Jordan <[email protected]>
---
drivers/vfio/vfio_iommu_type1.c | 135 +++++++++++++++++++++-----------
1 file changed, 89 insertions(+), 46 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b7247a2fc87e..cec2083dd556 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -107,6 +107,8 @@ struct vfio_batch {
         struct page             **pages;        /* for pin_user_pages_remote */
         struct page             *fallback_page; /* if pages alloc fails */
         int                     capacity;       /* length of pages array */
+        int                     size;           /* of batch currently */
+        int                     offset;         /* of next entry in pages */
 };

struct vfio_group {
@@ -469,6 +471,9 @@ static int put_pfn(unsigned long pfn, int prot)

 static void vfio_batch_init(struct vfio_batch *batch)
 {
+        batch->size = 0;
+        batch->offset = 0;
+
         if (unlikely(disable_hugepages))
                 goto fallback;

@@ -484,6 +489,17 @@ static void vfio_batch_init(struct vfio_batch *batch)
         batch->capacity = 1;
 }

+static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
+{
+        while (batch->size) {
+                unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
+
+                put_pfn(pfn, dma->prot);
+                batch->offset++;
+                batch->size--;
+        }
+}
+
 static void vfio_batch_fini(struct vfio_batch *batch)
 {
         if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
@@ -625,65 +641,88 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
                                   long npage, unsigned long *pfn_base,
                                   unsigned long limit, struct vfio_batch *batch)
 {
-        unsigned long pfn = 0;
+        unsigned long pfn;
+        struct mm_struct *mm = current->mm;
         long ret, pinned = 0, lock_acct = 0;
         bool rsvd;
         dma_addr_t iova = vaddr - dma->vaddr + dma->iova;

         /* This code path is only user initiated */
-        if (!current->mm)
+        if (!mm)
                 return -ENODEV;

-        ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, pfn_base,
-                             batch->pages);
-        if (ret < 0)
-                return ret;
-
-        pinned++;
-        rsvd = is_invalid_reserved_pfn(*pfn_base);
-
-        /*
-         * Reserved pages aren't counted against the user, externally pinned
-         * pages are already counted against the user.
-         */
-        if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-                if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
-                        put_pfn(*pfn_base, dma->prot);
-                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
-                                        limit << PAGE_SHIFT);
-                        return -ENOMEM;
-                }
-                lock_acct++;
+        if (batch->size) {
+                /* Leftover pages in batch from an earlier call. */
+                *pfn_base = page_to_pfn(batch->pages[batch->offset]);
+                pfn = *pfn_base;
+                rsvd = is_invalid_reserved_pfn(*pfn_base);
+        } else {
+                *pfn_base = 0;
         }

-        if (unlikely(disable_hugepages))
-                goto out;
+        while (npage) {
+                if (!batch->size) {
+                        /* Empty batch, so refill it. */
+                        long req_pages = min_t(long, npage, batch->capacity);

-        /* Lock all the consecutive pages from pfn_base */
-        for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
-             pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
-                ret = vaddr_get_pfns(current->mm, vaddr, 1, dma->prot, &pfn,
-                                     batch->pages);
-                if (ret < 0)
-                        break;
+                        ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
+                                             &pfn, batch->pages);
+                        if (ret < 0)
+                                goto unpin_out;

-                if (pfn != *pfn_base + pinned ||
-                    rsvd != is_invalid_reserved_pfn(pfn)) {
-                        put_pfn(pfn, dma->prot);
-                        break;
+                        batch->size = ret;
+                        batch->offset = 0;
+
+                        if (!*pfn_base) {
+                                *pfn_base = pfn;
+                                rsvd = is_invalid_reserved_pfn(*pfn_base);
+                        }
                 }

-                if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-                        if (!dma->lock_cap &&
-                            current->mm->locked_vm + lock_acct + 1 > limit) {
-                                put_pfn(pfn, dma->prot);
-                                pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
-                                        __func__, limit << PAGE_SHIFT);
-                                ret = -ENOMEM;
-                                goto unpin_out;
+                /*
+                 * pfn is preset for the first iteration of this inner loop and
+                 * updated at the end to handle a VM_PFNMAP pfn. In that case,
+                 * batch->pages isn't valid (there's no struct page), so allow
+                 * batch->pages to be touched only when there's more than one
+                 * pfn to check, which guarantees the pfns are from a
+                 * !VM_PFNMAP vma.
+                 */
+                while (true) {
+                        if (pfn != *pfn_base + pinned ||
+                            rsvd != is_invalid_reserved_pfn(pfn))
+                                goto out;
+
+                        /*
+                         * Reserved pages aren't counted against the user,
+                         * externally pinned pages are already counted against
+                         * the user.
+                         */
+                        if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+                                if (!dma->lock_cap &&
+                                    mm->locked_vm + lock_acct + 1 > limit) {
+                                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
+                                                __func__, limit << PAGE_SHIFT);
+                                        ret = -ENOMEM;
+                                        goto unpin_out;
+                                }
+                                lock_acct++;
                         }
-                        lock_acct++;
+
+                        pinned++;
+                        npage--;
+                        vaddr += PAGE_SIZE;
+                        iova += PAGE_SIZE;
+                        batch->offset++;
+                        batch->size--;
+
+                        if (!batch->size)
+                                break;
+
+                        pfn = page_to_pfn(batch->pages[batch->offset]);
                 }
+
+                if (unlikely(disable_hugepages))
+                        break;
         }

out:
@@ -691,10 +730,11 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,

 unpin_out:
         if (ret < 0) {
-                if (!rsvd) {
+                if (pinned && !rsvd) {
                         for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
                                 put_pfn(pfn, dma->prot);
                 }
+                vfio_batch_unpin(batch, dma);

                 return ret;
         }
@@ -1453,6 +1493,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
                 if (ret) {
                         vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
                                                 npage, true);
+                        vfio_batch_unpin(&batch, dma);
                         break;
                 }

@@ -1716,11 +1757,13 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
                         ret = iommu_map(domain->domain, iova, phys,
                                         size, dma->prot | domain->prot);
                         if (ret) {
-                                if (!dma->iommu_mapped)
+                                if (!dma->iommu_mapped) {
                                         vfio_unpin_pages_remote(dma, iova,
                                                         phys >> PAGE_SHIFT,
                                                         size >> PAGE_SHIFT,
                                                         true);
+                                        vfio_batch_unpin(&batch, dma);
+                                }
                                 goto unwind;
                         }

--
2.30.1


2021-03-23 19:35:51

by Alex Williamson

Subject: Re: [PATCH v2 3/3] vfio/type1: Batch page pinning

Hi Daniel,

I've found a bug in this patch that we need to fix. The diff is a
little difficult to follow, so I'll discuss it in the resulting
function below...

(1) Imagine the user has passed a vaddr range that alternates pfnmaps
and pinned memory per page.


static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
                                  long npage, unsigned long *pfn_base,
                                  unsigned long limit, struct vfio_batch *batch)
{
        unsigned long pfn;
        struct mm_struct *mm = current->mm;
        long ret, pinned = 0, lock_acct = 0;
        bool rsvd;
        dma_addr_t iova = vaddr - dma->vaddr + dma->iova;

        /* This code path is only user initiated */
        if (!mm)
                return -ENODEV;

        if (batch->size) {
                /* Leftover pages in batch from an earlier call. */
                *pfn_base = page_to_pfn(batch->pages[batch->offset]);
                pfn = *pfn_base;
                rsvd = is_invalid_reserved_pfn(*pfn_base);

(4) We're called again and fill our local variables from the batch. The
batch only has one page, so we'll complete the inner loop below and refill.

(6) We're called again, batch->size is 1, but it was for a pfnmap, the pages
array still contains the last pinned page, so we end up incorrectly using
this pfn again for the next entry.

        } else {
                *pfn_base = 0;
        }

        while (npage) {
                if (!batch->size) {
                        /* Empty batch, so refill it. */
                        long req_pages = min_t(long, npage, batch->capacity);

                        ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
                                             &pfn, batch->pages);
                        if (ret < 0)
                                goto unpin_out;

(2) Assume the 1st page is pfnmap, the 2nd is pinned memory

                        batch->size = ret;
                        batch->offset = 0;

                        if (!*pfn_base) {
                                *pfn_base = pfn;
                                rsvd = is_invalid_reserved_pfn(*pfn_base);
                        }
                }

                /*
                 * pfn is preset for the first iteration of this inner loop and
                 * updated at the end to handle a VM_PFNMAP pfn. In that case,
                 * batch->pages isn't valid (there's no struct page), so allow
                 * batch->pages to be touched only when there's more than one
                 * pfn to check, which guarantees the pfns are from a
                 * !VM_PFNMAP vma.
                 */
                while (true) {
                        if (pfn != *pfn_base + pinned ||
                            rsvd != is_invalid_reserved_pfn(pfn))
                                goto out;

(3) On the 2nd page, both tests are probably true here, so we take this goto,
leaving the batch with the next page.

(5) Now we've refilled batch, but the next page is pfnmap, so likely both of the
above tests are true... but this is a pfnmap'ing!

(7) Do we add something like if (batch->size == 1 && !batch->offset) {
put_pfn(pfn, dma->prot); batch->size = 0; }?

                        /*
                         * Reserved pages aren't counted against the user,
                         * externally pinned pages are already counted against
                         * the user.
                         */
                        if (!rsvd && !vfio_find_vpfn(dma, iova)) {
                                if (!dma->lock_cap &&
                                    mm->locked_vm + lock_acct + 1 > limit) {
                                        pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
                                                __func__, limit << PAGE_SHIFT);
                                        ret = -ENOMEM;
                                        goto unpin_out;
                                }
                                lock_acct++;
                        }

                        pinned++;
                        npage--;
                        vaddr += PAGE_SIZE;
                        iova += PAGE_SIZE;
                        batch->offset++;
                        batch->size--;

                        if (!batch->size)
                                break;

                        pfn = page_to_pfn(batch->pages[batch->offset]);
                }

                if (unlikely(disable_hugepages))
                        break;
        }

out:
        ret = vfio_lock_acct(dma, lock_acct, false);

unpin_out:
        if (ret < 0) {
                if (pinned && !rsvd) {
                        for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
                                put_pfn(pfn, dma->prot);
                }
                vfio_batch_unpin(batch, dma);

(8) These calls to batch_unpin are rather precarious as well, any time batch->size is
non-zero, we risk using the pages array for a pfnmap. We might actually want the
above change in (7) moved into this exit path so that we can never return a potential
pfnmap batch.

                return ret;
        }

        return pinned;
}

This is a regression that not only causes incorrect mapping for the
user, but also allows the user to trigger bad page counts, so we need
a fix for v5.12. Thanks,

Alex

2021-03-23 22:32:40

by Daniel Jordan

Subject: Re: [PATCH v2 3/3] vfio/type1: Batch page pinning

Hi Alex,

Alex Williamson <[email protected]> writes:
> I've found a bug in this patch that we need to fix. The diff is a
> little difficult to follow,

It was an awful diff, I remember...

> so I'll discuss it in the resulting function below...
>
> (1) Imagine the user has passed a vaddr range that alternates pfnmaps
> and pinned memory per page.
>
>
> static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> long npage, unsigned long *pfn_base,
> unsigned long limit, struct vfio_batch *batch)
> {
> unsigned long pfn;
> struct mm_struct *mm = current->mm;
> long ret, pinned = 0, lock_acct = 0;
> bool rsvd;
> dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
>
> /* This code path is only user initiated */
> if (!mm)
> return -ENODEV;
>
> if (batch->size) {
> /* Leftover pages in batch from an earlier call. */
> *pfn_base = page_to_pfn(batch->pages[batch->offset]);
> pfn = *pfn_base;
> rsvd = is_invalid_reserved_pfn(*pfn_base);
>
> (4) We're called again and fill our local variables from the batch. The
> batch only has one page, so we'll complete the inner loop below and refill.
>
> (6) We're called again, batch->size is 1, but it was for a pfnmap, the pages
> array still contains the last pinned page, so we end up incorrectly using
> this pfn again for the next entry.
>
> } else {
> *pfn_base = 0;
> }
>
> while (npage) {
> if (!batch->size) {
> /* Empty batch, so refill it. */
> long req_pages = min_t(long, npage, batch->capacity);
>
> ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
> &pfn, batch->pages);
> if (ret < 0)
> goto unpin_out;
>
> (2) Assume the 1st page is pfnmap, the 2nd is pinned memory

Just to check we're on the same wavelength, I think you can hit this bug
with one less call to vfio_pin_pages_remote() if the 1st page in the
vaddr range is pinned memory and the 2nd is pfnmap. Then you'd have the
following sequence:

vfio_pin_pages_remote() call #1:

- In the first batch refill, you'd get a size=1 batch with pinned
memory and complete the inner loop, breaking at "if (!batch->size)".

- In the second batch refill, you'd get another size=1 batch with a
pfnmap page, take the "goto unpin_out" in the inner loop, and return
from the function with the batch still containing a single pfnmap
page.

vfio_pin_pages_remote() call #2:

- *pfn_base is set from the first element of the pages array, which
unfortunately has the non-pfnmap pfn.

Did I follow you?

>
> batch->size = ret;
> batch->offset = 0;
>
> if (!*pfn_base) {
> *pfn_base = pfn;
> rsvd = is_invalid_reserved_pfn(*pfn_base);
> }
> }
>
> /*
> * pfn is preset for the first iteration of this inner loop and
> * updated at the end to handle a VM_PFNMAP pfn. In that case,
> * batch->pages isn't valid (there's no struct page), so allow
> * batch->pages to be touched only when there's more than one
> * pfn to check, which guarantees the pfns are from a
> * !VM_PFNMAP vma.
> */
> while (true) {
> if (pfn != *pfn_base + pinned ||
> rsvd != is_invalid_reserved_pfn(pfn))
> goto out;
>
> (3) On the 2nd page, both tests are probably true here, so we take this goto,
> leaving the batch with the next page.
>
> (5) Now we've refilled batch, but the next page is pfnmap, so likely both of the
> above tests are true... but this is a pfnmap'ing!
>
> (7) Do we add something like if (batch->size == 1 && !batch->offset) {
> put_pfn(pfn, dma->prot); batch->size = 0; }?

Yes, that could work, maybe with a check for a pfnmap mapping (rsvd)
instead of those two conditions.

I'd rejected two approaches where the batch stores pfns instead of
pages. Allocating two pages (one for pages, one for pfns) seems
overkill, though the allocation is transient. Using a union for "struct
page **pages" and "unsigned long *pfns" seems fragile due to the sizes
of each type needing to match, and possibly slow from having to loop
over the array twice (once to convert them all after pin_user_pages and
again for the inner loop). Neither seem much better, at least to me,
even with this bug as additional motivation.

It'd be better if pup returned pfns in some form, but that's another
issue entirely.

>
> /*
> * Reserved pages aren't counted against the user,
> * externally pinned pages are already counted against
> * the user.
> */
> if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> if (!dma->lock_cap &&
> mm->locked_vm + lock_acct + 1 > limit) {
> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> __func__, limit << PAGE_SHIFT);
> ret = -ENOMEM;
> goto unpin_out;
> }
> lock_acct++;
> }
>
> pinned++;
> npage--;
> vaddr += PAGE_SIZE;
> iova += PAGE_SIZE;
> batch->offset++;
> batch->size--;
>
> if (!batch->size)
> break;
>
> pfn = page_to_pfn(batch->pages[batch->offset]);
> }
>
> if (unlikely(disable_hugepages))
> break;
> }
>
> out:
> ret = vfio_lock_acct(dma, lock_acct, false);
>
> unpin_out:
> if (ret < 0) {
> if (pinned && !rsvd) {
> for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> put_pfn(pfn, dma->prot);
> }
> vfio_batch_unpin(batch, dma);
>
> (8) These calls to batch_unpin are rather precarious as well, any time batch->size is
> non-zero, we risk using the pages array for a pfnmap. We might actually want the
> above change in (7) moved into this exit path so that we can never return a potential
> pfnmap batch.

Yes, the exit path seems like the right place for the fix.
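
Roughly, I'm thinking of something like this at the top of the unpin_out
path, before the existing ret < 0 handling (just a sketch for discussion,
not yet tested, reusing the size == 1 / offset == 0 condition from your
(7)):

	if (batch->size == 1 && !batch->offset) {
		/* May be a VM_PFNMAP pfn that batch->pages can't represent. */
		put_pfn(pfn, dma->prot);
		batch->size = 0;
	}

A single never-consumed entry is the only way a pfnmap pfn can still be
sitting in the batch, so emptying it there should also keep
vfio_batch_unpin() from ever seeing one.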

>
> return ret;
> }
>
> return pinned;
> }
>
> This is a regression that not only causes incorrect mapping for the
> user, but also allows the user to trigger bad page counts, so we need
> a fix for v5.12.

Sure, I can test a fix after I get your thoughts on the above.

Daniel

2021-03-24 21:34:43

by Alex Williamson

Subject: Re: [PATCH v2 3/3] vfio/type1: Batch page pinning

On Tue, 23 Mar 2021 18:25:45 -0400
Daniel Jordan <[email protected]> wrote:

> Hi Alex,
>
> Alex Williamson <[email protected]> writes:
> > I've found a bug in this patch that we need to fix. The diff is a
> > little difficult to follow,
>
> It was an awful diff, I remember...
>
> > so I'll discuss it in the resulting function below...
> >
> > (1) Imagine the user has passed a vaddr range that alternates pfnmaps
> > and pinned memory per page.
> >
> >
> > static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> > long npage, unsigned long *pfn_base,
> > unsigned long limit, struct vfio_batch *batch)
> > {
> > unsigned long pfn;
> > struct mm_struct *mm = current->mm;
> > long ret, pinned = 0, lock_acct = 0;
> > bool rsvd;
> > dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
> >
> > /* This code path is only user initiated */
> > if (!mm)
> > return -ENODEV;
> >
> > if (batch->size) {
> > /* Leftover pages in batch from an earlier call. */
> > *pfn_base = page_to_pfn(batch->pages[batch->offset]);
> > pfn = *pfn_base;
> > rsvd = is_invalid_reserved_pfn(*pfn_base);
> >
> > (4) We're called again and fill our local variables from the batch. The
> > batch only has one page, so we'll complete the inner loop below and refill.
> >
> > (6) We're called again, batch->size is 1, but it was for a pfnmap, the pages
> > array still contains the last pinned page, so we end up incorrectly using
> > this pfn again for the next entry.
> >
> > } else {
> > *pfn_base = 0;
> > }
> >
> > while (npage) {
> > if (!batch->size) {
> > /* Empty batch, so refill it. */
> > long req_pages = min_t(long, npage, batch->capacity);
> >
> > ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
> > &pfn, batch->pages);
> > if (ret < 0)
> > goto unpin_out;
> >
> > (2) Assume the 1st page is pfnmap, the 2nd is pinned memory
>
> Just to check we're on the same wavelength, I think you can hit this bug
> with one less call to vfio_pin_pages_remote() if the 1st page in the
> vaddr range is pinned memory and the 2nd is pfnmap. Then you'd have the
> following sequence:
>
> vfio_pin_pages_remote() call #1:
>
> - In the first batch refill, you'd get a size=1 batch with pinned
> memory and complete the inner loop, breaking at "if (!batch->size)".
>
> - In the second batch refill, you'd get another size=1 batch with a
> pfnmap page, take the "goto unpin_out" in the inner loop, and return
> from the function with the batch still containing a single pfnmap
> page.
>
> vfio_pin_pages_remote() call #2:
>
> - *pfn_base is set from the first element of the pages array, which
> unfortunately has the non-pfnmap pfn.
>
> Did I follow you?

Yep, I should have simplified to skip the first mapping, but I was also
trying to make sure I made sense of the test case I was playing with
that triggered it. The important transition is pinned memory to pfnmap
since that lets us return with non-zero batch size and stale data in
the pages array.

> >
> > batch->size = ret;
> > batch->offset = 0;
> >
> > if (!*pfn_base) {
> > *pfn_base = pfn;
> > rsvd = is_invalid_reserved_pfn(*pfn_base);
> > }
> > }
> >
> > /*
> > * pfn is preset for the first iteration of this inner loop and
> > * updated at the end to handle a VM_PFNMAP pfn. In that case,
> > * batch->pages isn't valid (there's no struct page), so allow
> > * batch->pages to be touched only when there's more than one
> > * pfn to check, which guarantees the pfns are from a
> > * !VM_PFNMAP vma.
> > */
> > while (true) {
> > if (pfn != *pfn_base + pinned ||
> > rsvd != is_invalid_reserved_pfn(pfn))
> > goto out;
> >
> > (3) On the 2nd page, both tests are probably true here, so we take this goto,
> > leaving the batch with the next page.
> >
> > (5) Now we've refilled batch, but the next page is pfnmap, so likely both of the
> > above tests are true... but this is a pfnmap'ing!
> >
> > (7) Do we add something like if (batch->size == 1 && !batch->offset) {
> > put_pfn(pfn, dma->prot); batch->size = 0; }?
>
> Yes, that could work, maybe with a check for a pfnmap mapping (rsvd)
> instead of those two conditions.

@rsvd could work as well, but unfortunately means we'll call
is_invalid... even if we just have a discontinuity.

> I'd rejected two approaches where the batch stores pfns instead of
> pages. Allocating two pages (one for pages, one for pfns) seems
> overkill, though the allocation is transient. Using a union for "struct
> page **pages" and "unsigned long *pfns" seems fragile due to the sizes
> of each type needing to match, and possibly slow from having to loop
> over the array twice (once to convert them all after pin_user_pages and
> again for the inner loop). Neither seem much better, at least to me,
> even with this bug as additional motivation.
>
> It'd be better if pup returned pfns in some form, but that's another
> issue entirely.

It is a little curious that vfio_batch_unpin() converts pages to pfn,
only to call a function that converts them back to pages for unpinning,
when those pages should never trigger the pfnmap bypass. I think
that's only an error path too, so it seems unnecessary to mark them as
dirty regardless of the DMA mapping protections. At some point we'll
want to include pfnmap batches too, but only after we get rid of
follow_pte() lookups for them.
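
Not something for the immediate fix, but to illustrate, a version that
skips the pfn round trip might look roughly like this (untested sketch;
unpin_user_pages() doesn't do the dirty marking that put_pfn() does,
which seems fine for an error path):

	static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
	{
		/* Batched entries always have struct pages behind them. */
		unpin_user_pages(&batch->pages[batch->offset], batch->size);
		batch->offset = 0;
		batch->size = 0;
	}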

> >
> > /*
> > * Reserved pages aren't counted against the user,
> > * externally pinned pages are already counted against
> > * the user.
> > */
> > if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> > if (!dma->lock_cap &&
> > mm->locked_vm + lock_acct + 1 > limit) {
> > pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > __func__, limit << PAGE_SHIFT);
> > ret = -ENOMEM;
> > goto unpin_out;
> > }
> > lock_acct++;
> > }
> >
> > pinned++;
> > npage--;
> > vaddr += PAGE_SIZE;
> > iova += PAGE_SIZE;
> > batch->offset++;
> > batch->size--;
> >
> > if (!batch->size)
> > break;
> >
> > pfn = page_to_pfn(batch->pages[batch->offset]);
> > }
> >
> > if (unlikely(disable_hugepages))
> > break;
> > }
> >
> > out:
> > ret = vfio_lock_acct(dma, lock_acct, false);
> >
> > unpin_out:
> > if (ret < 0) {
> > if (pinned && !rsvd) {
> > for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
> > put_pfn(pfn, dma->prot);
> > }
> > vfio_batch_unpin(batch, dma);
> >
> > (8) These calls to batch_unpin are rather precarious as well, any time batch->size is
> > non-zero, we risk using the pages array for a pfnmap. We might actually want the
> > above change in (7) moved into this exit path so that we can never return a potential
> > pfnmap batch.
>
> Yes, the exit path seems like the right place for the fix.
>
> >
> > return ret;
> > }
> >
> > return pinned;
> > }
> >
> > This is a regression that not only causes incorrect mapping for the
> > user, but also allows the user to trigger bad page counts, so we need
> > a fix for v5.12.
>
> Sure, I can test a fix after I get your thoughts on the above.

I think you've got the right idea about my concern above. Thanks,

Alex