2018-02-01 06:28:58

by Suthikulpanit, Suravee

Subject: [PATCH v5] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs

VFIO IOMMU type1 currently unmaps IOVA pages synchronously, which requires
an IOTLB flush for every unmapping. This results in large IOTLB flushing
overhead when a pass-through device has a large number of mapped IOVAs.
This can be avoided by using the new IOTLB flushing interface.
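The flow with the fast interface is roughly the following (a sketch of the
IOMMU API usage pattern, not the exact patch code):

	/* before: every unmap flushes the IOTLB synchronously */
	unmapped = iommu_unmap(domain, iova, len);

	/* after: unmap without flushing, queue each range, then issue a
	 * single IOTLB flush for the whole batch */
	unmapped = iommu_unmap_fast(domain, iova, len);
	iommu_tlb_range_add(domain, iova, unmapped);
	...
	iommu_tlb_sync(domain);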

Cc: Alex Williamson <[email protected]>
Cc: Joerg Roedel <[email protected]>
Signed-off-by: Suravee Suthikulpanit <[email protected]>
---

Changes from v4 (https://lkml.org/lkml/2018/1/31/153)
* Change return type from ssize_t back to size_t since we are no longer
changing the IOMMU API. Also update the error handling logic accordingly.
* In unmap_unpin_fast(), also sync when failing to allocate an entry.
* Some code restructuring and variable renaming.

drivers/vfio/vfio_iommu_type1.c | 128 ++++++++++++++++++++++++++++++++++++----
1 file changed, 117 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e30e29a..6041530 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -102,6 +102,13 @@ struct vfio_pfn {
atomic_t ref_count;
};

+struct vfio_regions {
+ struct list_head list;
+ dma_addr_t iova;
+ phys_addr_t phys;
+ size_t len;
+};
+
#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
(!list_empty(&iommu->domain_list))

@@ -648,11 +655,102 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
return i > npage ? npage : (i > 0 ? i : -EINVAL);
}

+static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
+ struct list_head *regions)
+{
+ long unlocked = 0;
+ struct vfio_regions *entry, *next;
+
+ iommu_tlb_sync(domain->domain);
+
+ list_for_each_entry_safe(entry, next, regions, list) {
+ unlocked += vfio_unpin_pages_remote(dma,
+ entry->iova,
+ entry->phys >> PAGE_SHIFT,
+ entry->len >> PAGE_SHIFT,
+ false);
+ list_del(&entry->list);
+ kfree(entry);
+ }
+
+ cond_resched();
+
+ return unlocked;
+}
+
+/*
+ * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
+ * Therefore, when using the IOTLB flush sync interface, VFIO needs to
+ * keep track of these regions (currently using a list).
+ *
+ * This value specifies the maximum number of regions for each IOTLB
+ * flush sync.
+ */
+#define VFIO_IOMMU_TLB_SYNC_MAX 512
+
+static size_t unmap_unpin_fast(struct vfio_domain *domain,
+ struct vfio_dma *dma, dma_addr_t *iova,
+ size_t len, phys_addr_t phys, long *unlocked,
+ struct list_head *unmapped_list,
+ int *unmapped_cnt)
+{
+ size_t unmapped = 0;
+ struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+
+ if (entry) {
+ unmapped = iommu_unmap_fast(domain->domain, *iova, len);
+
+ if (!unmapped) {
+ kfree(entry);
+ } else {
+ iommu_tlb_range_add(domain->domain, *iova, unmapped);
+ entry->iova = *iova;
+ entry->phys = phys;
+ entry->len = unmapped;
+ list_add_tail(&entry->list, unmapped_list);
+
+ *iova += unmapped;
+ (*unmapped_cnt)++;
+ }
+ }
+
+ /*
+ * Sync if the number of fast-unmap regions hits the limit
+ * or in case of errors.
+ */
+ if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
+ *unlocked += vfio_sync_unpin(dma, domain,
+ unmapped_list);
+ *unmapped_cnt = 0;
+ }
+
+ return unmapped;
+}
+
+static size_t unmap_unpin_slow(struct vfio_domain *domain,
+ struct vfio_dma *dma, dma_addr_t *iova,
+ size_t len, phys_addr_t phys,
+ long *unlocked)
+{
+ size_t unmapped = iommu_unmap(domain->domain, *iova, len);
+
+ if (unmapped) {
+ *unlocked += vfio_unpin_pages_remote(dma, *iova,
+ phys >> PAGE_SHIFT,
+ unmapped >> PAGE_SHIFT,
+ false);
+ *iova += unmapped;
+ cond_resched();
+ }
+ return unmapped;
+}
+
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
bool do_accounting)
{
dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
struct vfio_domain *domain, *d;
+ struct list_head unmapped_region_list;
+ int unmapped_region_cnt = 0;
long unlocked = 0;

if (!dma->size)
@@ -661,6 +759,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
return 0;

+ INIT_LIST_HEAD(&unmapped_region_list);
+
/*
* We use the IOMMU to track the physical addresses, otherwise we'd
* need a much more complicated tracking system. Unfortunately that
@@ -698,20 +798,26 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
break;
}

- unmapped = iommu_unmap(domain->domain, iova, len);
- if (WARN_ON(!unmapped))
- break;
-
- unlocked += vfio_unpin_pages_remote(dma, iova,
- phys >> PAGE_SHIFT,
- unmapped >> PAGE_SHIFT,
- false);
- iova += unmapped;
-
- cond_resched();
+ /*
+ * First, try to use fast unmap/unpin. In case of failure,
+ * switch to slow unmap/unpin path.
+ */
+ unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
+ &unlocked, &unmapped_region_list,
+ &unmapped_region_cnt);
+ if (!unmapped) {
+ unmapped = unmap_unpin_slow(domain, dma, &iova, len,
+ phys, &unlocked);
+ if (WARN_ON(!unmapped))
+ break;
+ }
}

dma->iommu_mapped = false;
+
+ if (unmapped_region_cnt)
+ unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list);
+
if (do_accounting) {
vfio_lock_acct(dma->task, -unlocked, NULL);
return 0;
--
1.8.3.1



2018-02-22 23:00:40

by Alex Williamson

Subject: Re: [PATCH v5] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs

On Thu, 1 Feb 2018 01:27:38 -0500
Suravee Suthikulpanit <[email protected]> wrote:

> VFIO IOMMU type1 currently unmaps IOVA pages synchronously, which requires
> an IOTLB flush for every unmapping. This results in large IOTLB flushing
> overhead when a pass-through device has a large number of mapped IOVAs.
> This can be avoided by using the new IOTLB flushing interface.
>
> [...]
> @@ -661,6 +759,8 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
> return 0;
>
> + INIT_LIST_HEAD(&unmapped_region_list);

Since I harassed Shameer about using LIST_HEAD() for the iova list
extension, I feel obligated to note that it can also be used here. If
you approve I'll just remove the above INIT_LIST_HEAD() and declare
unmapped_region_list with LIST_HEAD(unmapped_region_list); no need to
re-send.
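That is, roughly (a sketch of the two equivalent forms from <linux/list.h>):

	/* current: declare, then initialize at runtime */
	struct list_head unmapped_region_list;
	...
	INIT_LIST_HEAD(&unmapped_region_list);

	/* suggested: LIST_HEAD() declares and initializes in one step */
	LIST_HEAD(unmapped_region_list);

Otherwise looks fine to me. Thanks,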

Alex

2018-02-23 08:22:03

by Tian, Kevin

Subject: RE: [PATCH v5] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs

> From: Alex Williamson
> Sent: Friday, February 23, 2018 6:59 AM
>
> On Thu, 1 Feb 2018 01:27:38 -0500
> Suravee Suthikulpanit <[email protected]> wrote:
>
> > [...]
> > + /*
> > + * Sync if the number of fast-unmap regions hits the limit
> > + * or in case of errors.
> > + */
> > + if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
> > + *unlocked += vfio_sync_unpin(dma, domain,
> > + unmapped_list);
> > + *unmapped_cnt = 0;
> > + }

I'm not sure why returning ZERO is treated as the only unmap error
here; looking at __iommu_unmap, there are clearly other error codes
returned as well. I know this isn't introduced by this patch, but
Alex, was it deliberately implemented this way under some assumption,
or is it a typo?
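For reference, this is the sort of thing I mean (sketched from memory
from the current __iommu_unmap(), which has a size_t return type):

	/* sketch: __iommu_unmap() returns size_t, yet bails out with -errno */
	if (unlikely(ops->unmap == NULL || domain->pgsize_bitmap == 0UL))
		return -ENODEV;	/* negative errno through an unsigned return */

	if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING)))
		return -EINVAL;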

Thanks
Kevin

2018-02-23 15:17:36

by Alex Williamson

Subject: Re: [PATCH v5] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs

On Fri, 23 Feb 2018 08:20:51 +0000
"Tian, Kevin" <[email protected]> wrote:

> > From: Alex Williamson
> > Sent: Friday, February 23, 2018 6:59 AM
> >
> > On Thu, 1 Feb 2018 01:27:38 -0500
> > Suravee Suthikulpanit <[email protected]> wrote:
> >
> > > [...]
> > > + /*
> > > + * Sync if the number of fast-unmap regions hits the limit
> > > + * or in case of errors.
> > > + */
> > > + if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
> > > + *unlocked += vfio_sync_unpin(dma, domain,
> > > + unmapped_list);
> > > + *unmapped_cnt = 0;
> > > + }
>
> I'm not sure why returning ZERO is treated as the only unmap error
> here; looking at __iommu_unmap, there are clearly other error codes
> returned as well. I know this isn't introduced by this patch, but
> Alex, was it deliberately implemented this way under some assumption,
> or is it a typo?

iommu_unmap() returns a size_t, an unsigned type, so zero is the only
failure indication it can give. Suravee has another patch on the IOMMU
side to stop that function from trying to return -errno.
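To illustrate (a standalone sketch, not code from the patch): a negative
errno forced through size_t wraps to a huge positive value, so a "< 0"
check could never catch it anyway:

	#include <errno.h>
	#include <stdio.h>

	int main(void)
	{
		size_t ret = (size_t)-EINVAL;	/* -22 wraps to SIZE_MAX - 21 */

		printf("ret = %zu\n", ret);	/* a huge positive "byte count" */

		/* an unsigned value is never < 0, so zero is the only failure
		 * indication a size_t return can carry */
		if (ret == 0)
			printf("unmap failed\n");
		return 0;
	}

Thanks,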

Alex

2018-03-22 21:49:09

by Alex Williamson

Subject: Re: [PATCH v5] vfio/type1: Adopt fast IOTLB flush interface when unmap IOVAs

On Thu, 22 Feb 2018 15:59:15 -0700
Alex Williamson <[email protected]> wrote:

> On Thu, 1 Feb 2018 01:27:38 -0500
> Suravee Suthikulpanit <[email protected]> wrote:
>
> > [...]
> > + INIT_LIST_HEAD(&unmapped_region_list);
>
> Since I harassed Shameer about using LIST_HEAD() for the iova list
> extension, I feel obligated to note that it can also be used here. If
> you approve I'll just remove the above INIT_LIST_HEAD() and declare
> unmapped_region_list with LIST_HEAD(unmapped_region_list); no need to
> re-send. Otherwise looks fine to me. Thanks,

I went ahead with this option and applied it to the vfio next branch for v4.17.
Thanks,

Alex