From: Liu Yi L <[email protected]>
For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest "owns" the
first-level/stage-1 translation structures, the host IOMMU driver has no
knowledge of first-level/stage-1 structure cache updates unless the guest
invalidation requests are trapped and propagated to the host.
This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to propagate guest
first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU cache
correctness.
With this patch, vSVA (Virtual Shared Virtual Addressing) can be used safely
as the host IOMMU iotlb correctness is ensured.
Cc: Kevin Tian <[email protected]>
CC: Jacob Pan <[email protected]>
Cc: Alex Williamson <[email protected]>
Cc: Eric Auger <[email protected]>
Cc: Jean-Philippe Brucker <[email protected]>
Signed-off-by: Liu Yi L <[email protected]>
Signed-off-by: Eric Auger <[email protected]>
Signed-off-by: Jacob Pan <[email protected]>
---
drivers/vfio/vfio_iommu_type1.c | 49 +++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/vfio.h | 22 ++++++++++++++++++
2 files changed, 71 insertions(+)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a877747..937ec3f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2423,6 +2423,15 @@ static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
return ret;
}
+static int vfio_cache_inv_fn(struct device *dev, void *data)
+{
+ struct domain_capsule *dc = (struct domain_capsule *)data;
+ struct iommu_cache_invalidate_info *cache_inv_info =
+ (struct iommu_cache_invalidate_info *) dc->data;
+
+ return iommu_cache_invalidate(dc->domain, dev, cache_inv_info);
+}
+
static long vfio_iommu_type1_ioctl(void *iommu_data,
unsigned int cmd, unsigned long arg)
{
@@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
}
kfree(gbind_data);
return ret;
+ } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
+ struct vfio_iommu_type1_cache_invalidate cache_inv;
+ u32 version;
+ int info_size;
+ void *cache_info;
+ int ret;
+
+ minsz = offsetofend(struct vfio_iommu_type1_cache_invalidate,
+ flags);
+
+ if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (cache_inv.argsz < minsz || cache_inv.flags)
+ return -EINVAL;
+
+ /* Get the version of struct iommu_cache_invalidate_info */
+ if (copy_from_user(&version,
+ (void __user *) (arg + minsz), sizeof(version)))
+ return -EFAULT;
+
+ info_size = iommu_uapi_get_data_size(
+ IOMMU_UAPI_CACHE_INVAL, version);
+
+ cache_info = kzalloc(info_size, GFP_KERNEL);
+ if (!cache_info)
+ return -ENOMEM;
+
+ if (copy_from_user(cache_info,
+ (void __user *) (arg + minsz), info_size)) {
+ kfree(cache_info);
+ return -EFAULT;
+ }
+
+ mutex_lock(&iommu->lock);
+ ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn,
+ cache_info);
+ mutex_unlock(&iommu->lock);
+ kfree(cache_info);
+ return ret;
}
return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2235bc6..62ca791 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind {
*/
#define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23)
+/**
+ * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24,
+ * struct vfio_iommu_type1_cache_invalidate)
+ *
+ * Propagate guest IOMMU cache invalidation to the host. The cache
+ * invalidation information is conveyed by @cache_info, the content
+ * format would be structures defined in uapi/linux/iommu.h. User
+ * should be aware of that the struct iommu_cache_invalidate_info
+ * has a @version field, vfio needs to parse this field before getting
+ * data from userspace.
+ *
+ * Availability of this IOCTL is after VFIO_SET_IOMMU.
+ *
+ * returns: 0 on success, -errno on failure.
+ */
+struct vfio_iommu_type1_cache_invalidate {
+ __u32 argsz;
+ __u32 flags;
+ struct iommu_cache_invalidate_info cache_info;
+};
+#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + 24)
+
/* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
/*
--
2.7.4
> From: Liu, Yi L <[email protected]>
> Sent: Sunday, March 22, 2020 8:32 PM
>
> From: Liu Yi L <[email protected]>
>
> For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest
> "owns" the
> first-level/stage-1 translation structures, the host IOMMU driver has no
> knowledge of first-level/stage-1 structure cache updates unless the guest
> invalidation requests are trapped and propagated to the host.
>
> This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to
> propagate guest
> first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU
> cache
> correctness.
>
> With this patch, vSVA (Virtual Shared Virtual Addressing) can be used safely
> as the host IOMMU iotlb correctness are ensured.
>
> Cc: Kevin Tian <[email protected]>
> CC: Jacob Pan <[email protected]>
> Cc: Alex Williamson <[email protected]>
> Cc: Eric Auger <[email protected]>
> Cc: Jean-Philippe Brucker <[email protected]>
> Signed-off-by: Liu Yi L <[email protected]>
> Signed-off-by: Eric Auger <[email protected]>
> Signed-off-by: Jacob Pan <[email protected]>
> ---
> drivers/vfio/vfio_iommu_type1.c | 49
> +++++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h | 22 ++++++++++++++++++
> 2 files changed, 71 insertions(+)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> index a877747..937ec3f 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2423,6 +2423,15 @@ static long
> vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> return ret;
> }
>
> +static int vfio_cache_inv_fn(struct device *dev, void *data)
vfio_iommu_cache_inv_fn
> +{
> + struct domain_capsule *dc = (struct domain_capsule *)data;
> + struct iommu_cache_invalidate_info *cache_inv_info =
> + (struct iommu_cache_invalidate_info *) dc->data;
> +
> + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info);
> +}
> +
> static long vfio_iommu_type1_ioctl(void *iommu_data,
> unsigned int cmd, unsigned long arg)
> {
> @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> }
> kfree(gbind_data);
> return ret;
> + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
> + struct vfio_iommu_type1_cache_invalidate cache_inv;
> + u32 version;
> + int info_size;
> + void *cache_info;
> + int ret;
> +
> + minsz = offsetofend(struct
> vfio_iommu_type1_cache_invalidate,
> + flags);
> +
> + if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (cache_inv.argsz < minsz || cache_inv.flags)
> + return -EINVAL;
> +
> + /* Get the version of struct iommu_cache_invalidate_info */
> + if (copy_from_user(&version,
> + (void __user *) (arg + minsz), sizeof(version)))
> + return -EFAULT;
> +
> + info_size = iommu_uapi_get_data_size(
> + IOMMU_UAPI_CACHE_INVAL,
> version);
> +
> + cache_info = kzalloc(info_size, GFP_KERNEL);
> + if (!cache_info)
> + return -ENOMEM;
> +
> + if (copy_from_user(cache_info,
> + (void __user *) (arg + minsz), info_size)) {
> + kfree(cache_info);
> + return -EFAULT;
> + }
> +
> + mutex_lock(&iommu->lock);
> + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn,
> + cache_info);
> + mutex_unlock(&iommu->lock);
> + kfree(cache_info);
> + return ret;
> }
>
> return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 2235bc6..62ca791 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind {
> */
> #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23)
>
> +/**
> + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24,
> + * struct vfio_iommu_type1_cache_invalidate)
> + *
> + * Propagate guest IOMMU cache invalidation to the host. The cache
> + * invalidation information is conveyed by @cache_info, the content
> + * format would be structures defined in uapi/linux/iommu.h. User
> + * should be aware of that the struct iommu_cache_invalidate_info
> + * has a @version field, vfio needs to parse this field before getting
> + * data from userspace.
> + *
> + * Availability of this IOCTL is after VFIO_SET_IOMMU.
> + *
> + * returns: 0 on success, -errno on failure.
> + */
> +struct vfio_iommu_type1_cache_invalidate {
> + __u32 argsz;
> + __u32 flags;
> + struct iommu_cache_invalidate_info cache_info;
> +};
> +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE +
> 24)
> +
> /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>
> /*
> --
> 2.7.4
This patch looks good to me in general. But since there is still
a major open about version compatibility, I'll hold my r-b until
that open is closed. :-)
Thanks
Kevin
> @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> }
> kfree(gbind_data);
> return ret;
> + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
Please refactor the spaghetti in this ioctl handler to use a switch
statement and a helper function per command before growing it even more.
> + /* Get the version of struct iommu_cache_invalidate_info */
> + if (copy_from_user(&version,
> + (void __user *) (arg + minsz), sizeof(version)))
> + return -EFAULT;
> +
> + info_size = iommu_uapi_get_data_size(
> + IOMMU_UAPI_CACHE_INVAL, version);
> +
> + cache_info = kzalloc(info_size, GFP_KERNEL);
> + if (!cache_info)
> + return -ENOMEM;
> +
> + if (copy_from_user(cache_info,
> + (void __user *) (arg + minsz), info_size)) {
The user might have changed the version while you were allocating and
freeing the memory, introducing potentially exploitable racing
conditions.
Hi Hellwig,
Thanks for your review, Hellwig. :-) inline reply.
> From: Christoph Hellwig <[email protected]>
> Sent: Tuesday, March 31, 2020 3:56 PM
> To: Liu, Yi L <[email protected]>
> Subject: Re: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE
>
> > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> > }
> > kfree(gbind_data);
> > return ret;
> > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
>
> Please refactor the spaghetti in this ioctl handler to use a switch statement and a
> helper function per command before growing it even more.
got it. I may get a separate refactor patch before adding my changes.
>
> > + /* Get the version of struct iommu_cache_invalidate_info */
> > + if (copy_from_user(&version,
> > + (void __user *) (arg + minsz), sizeof(version)))
> > + return -EFAULT;
> > +
> > + info_size = iommu_uapi_get_data_size(
> > + IOMMU_UAPI_CACHE_INVAL, version);
> > +
> > + cache_info = kzalloc(info_size, GFP_KERNEL);
> > + if (!cache_info)
> > + return -ENOMEM;
> > +
> > + if (copy_from_user(cache_info,
> > + (void __user *) (arg + minsz), info_size)) {
>
> The user might have changed the version while you were allocating and
> freeing the
> memory, introducing potentially exploitable racing conditions.
yeah, I know the @version is not welcomed in the thread Jacob is driving.
I'll adjust the code here once the open in that thread has been solved.
But regardless of the version, I'm not sure I 100% got your point.
Could you elaborate a bit? BTW, the code somehow referenced the code
below. The basic flow is copying partial data from __arg and then copying
the rest of the data after figuring out how much is left. The difference between
the code below and my code is just a different way to figure out the remaining
data size. Since I'm not sure I got your point: if the racing is real in
such a flow, I guess there are quite a few places that need to be enhanced.
vfio_pci_ioctl(){
{
...
} else if (cmd == VFIO_DEVICE_SET_IRQS) {
struct vfio_irq_set hdr;
u8 *data = NULL;
int max, ret = 0;
size_t data_size = 0;
minsz = offsetofend(struct vfio_irq_set, count);
if (copy_from_user(&hdr, (void __user *)arg, minsz))
return -EFAULT;
max = vfio_pci_get_irq_count(vdev, hdr.index);
ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
VFIO_PCI_NUM_IRQS, &data_size);
if (ret)
return ret;
if (data_size) {
data = memdup_user((void __user *)(arg + minsz),
data_size);
if (IS_ERR(data))
return PTR_ERR(data);
}
mutex_lock(&vdev->igate);
ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
hdr.start, hdr.count, data);
mutex_unlock(&vdev->igate);
kfree(data);
return ret;
} else if (cmd == VFIO_DEVICE_RESET) {
...
}
Regards,
Yi Liu
> From: Tian, Kevin <[email protected]>
> Sent: Monday, March 30, 2020 8:58 PM
> To: Liu, Yi L <[email protected]>; [email protected];
> Subject: RE: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE
>
> > From: Liu, Yi L <[email protected]>
> > Sent: Sunday, March 22, 2020 8:32 PM
> >
> > From: Liu Yi L <[email protected]>
> >
> > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest "owns"
> > the
> > first-level/stage-1 translation structures, the host IOMMU driver has
> > no knowledge of first-level/stage-1 structure cache updates unless the
> > guest invalidation requests are trapped and propagated to the host.
> >
> > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to propagate
> > guest
> > first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU
> > cache correctness.
> >
> > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used
> > safely as the host IOMMU iotlb correctness are ensured.
> >
> > Cc: Kevin Tian <[email protected]>
> > CC: Jacob Pan <[email protected]>
> > Cc: Alex Williamson <[email protected]>
> > Cc: Eric Auger <[email protected]>
> > Cc: Jean-Philippe Brucker <[email protected]>
> > Signed-off-by: Liu Yi L <[email protected]>
> > Signed-off-by: Eric Auger <[email protected]>
> > Signed-off-by: Jacob Pan <[email protected]>
> > ---
> > drivers/vfio/vfio_iommu_type1.c | 49
> > +++++++++++++++++++++++++++++++++++++++++
> > include/uapi/linux/vfio.h | 22 ++++++++++++++++++
> > 2 files changed, 71 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c index a877747..937ec3f 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2423,6 +2423,15 @@ static long
> > vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> > return ret;
> > }
> >
> > +static int vfio_cache_inv_fn(struct device *dev, void *data)
>
> vfio_iommu_cache_inv_fn
got it.
> > +{
> > + struct domain_capsule *dc = (struct domain_capsule *)data;
> > + struct iommu_cache_invalidate_info *cache_inv_info =
> > + (struct iommu_cache_invalidate_info *) dc->data;
> > +
> > + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info); }
> > +
> > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > unsigned int cmd, unsigned long arg) { @@ -
> 2629,6 +2638,46 @@
> > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > }
> > kfree(gbind_data);
> > return ret;
> > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
> > + struct vfio_iommu_type1_cache_invalidate cache_inv;
> > + u32 version;
> > + int info_size;
> > + void *cache_info;
> > + int ret;
> > +
> > + minsz = offsetofend(struct
> > vfio_iommu_type1_cache_invalidate,
> > + flags);
> > +
> > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
> > + return -EFAULT;
> > +
> > + if (cache_inv.argsz < minsz || cache_inv.flags)
> > + return -EINVAL;
> > +
> > + /* Get the version of struct iommu_cache_invalidate_info */
> > + if (copy_from_user(&version,
> > + (void __user *) (arg + minsz), sizeof(version)))
> > + return -EFAULT;
> > +
> > + info_size = iommu_uapi_get_data_size(
> > + IOMMU_UAPI_CACHE_INVAL,
> > version);
> > +
> > + cache_info = kzalloc(info_size, GFP_KERNEL);
> > + if (!cache_info)
> > + return -ENOMEM;
> > +
> > + if (copy_from_user(cache_info,
> > + (void __user *) (arg + minsz), info_size)) {
> > + kfree(cache_info);
> > + return -EFAULT;
> > + }
> > +
> > + mutex_lock(&iommu->lock);
> > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn,
> > + cache_info);
> > + mutex_unlock(&iommu->lock);
> > + kfree(cache_info);
> > + return ret;
> > }
> >
> > return -ENOTTY;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 2235bc6..62ca791 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind {
> > */
> > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23)
> >
> > +/**
> > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24,
> > + * struct vfio_iommu_type1_cache_invalidate)
> > + *
> > + * Propagate guest IOMMU cache invalidation to the host. The cache
> > + * invalidation information is conveyed by @cache_info, the content
> > + * format would be structures defined in uapi/linux/iommu.h. User
> > + * should be aware of that the struct iommu_cache_invalidate_info
> > + * has a @version field, vfio needs to parse this field before
> > +getting
> > + * data from userspace.
> > + *
> > + * Availability of this IOCTL is after VFIO_SET_IOMMU.
> > + *
> > + * returns: 0 on success, -errno on failure.
> > + */
> > +struct vfio_iommu_type1_cache_invalidate {
> > + __u32 argsz;
> > + __u32 flags;
> > + struct iommu_cache_invalidate_info cache_info;
> > +};
> > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE +
> > 24)
> > +
> > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU
> > -------- */
> >
> > /*
> > --
> > 2.7.4
>
> This patch looks good to me in general. But since there is still a major open about
> version compatibility, I'll hold my r-b until that open is closed. :-)
>
thanks,
Regards,
Yi Liu
On Sun, 22 Mar 2020 05:32:04 -0700
"Liu, Yi L" <[email protected]> wrote:
> From: Liu Yi L <[email protected]>
>
> For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest "owns" the
> first-level/stage-1 translation structures, the host IOMMU driver has no
> knowledge of first-level/stage-1 structure cache updates unless the guest
> invalidation requests are trapped and propagated to the host.
>
> This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to propagate guest
> first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU cache
> correctness.
>
> With this patch, vSVA (Virtual Shared Virtual Addressing) can be used safely
> as the host IOMMU iotlb correctness are ensured.
>
> Cc: Kevin Tian <[email protected]>
> CC: Jacob Pan <[email protected]>
> Cc: Alex Williamson <[email protected]>
> Cc: Eric Auger <[email protected]>
> Cc: Jean-Philippe Brucker <[email protected]>
> Signed-off-by: Liu Yi L <[email protected]>
> Signed-off-by: Eric Auger <[email protected]>
> Signed-off-by: Jacob Pan <[email protected]>
> ---
> drivers/vfio/vfio_iommu_type1.c | 49 +++++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h | 22 ++++++++++++++++++
> 2 files changed, 71 insertions(+)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index a877747..937ec3f 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2423,6 +2423,15 @@ static long vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> return ret;
> }
>
> +static int vfio_cache_inv_fn(struct device *dev, void *data)
> +{
> + struct domain_capsule *dc = (struct domain_capsule *)data;
> + struct iommu_cache_invalidate_info *cache_inv_info =
> + (struct iommu_cache_invalidate_info *) dc->data;
> +
> + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info);
> +}
> +
> static long vfio_iommu_type1_ioctl(void *iommu_data,
> unsigned int cmd, unsigned long arg)
> {
> @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
> }
> kfree(gbind_data);
> return ret;
> + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
> + struct vfio_iommu_type1_cache_invalidate cache_inv;
> + u32 version;
> + int info_size;
> + void *cache_info;
> + int ret;
> +
> + minsz = offsetofend(struct vfio_iommu_type1_cache_invalidate,
> + flags);
This breaks backward compatibility as soon as struct
iommu_cache_invalidate_info changes size by its defined versioning
scheme. ie. a field gets added, the version is bumped, all existing
userspace breaks. Our minsz is offsetofend to the version field,
interpret the version to size, then reevaluate argsz.
> +
> + if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (cache_inv.argsz < minsz || cache_inv.flags)
> + return -EINVAL;
> +
> + /* Get the version of struct iommu_cache_invalidate_info */
> + if (copy_from_user(&version,
> + (void __user *) (arg + minsz), sizeof(version)))
> + return -EFAULT;
> +
> + info_size = iommu_uapi_get_data_size(
> + IOMMU_UAPI_CACHE_INVAL, version);
> +
> + cache_info = kzalloc(info_size, GFP_KERNEL);
> + if (!cache_info)
> + return -ENOMEM;
> +
> + if (copy_from_user(cache_info,
> + (void __user *) (arg + minsz), info_size)) {
> + kfree(cache_info);
> + return -EFAULT;
> + }
> +
> + mutex_lock(&iommu->lock);
> + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn,
> + cache_info);
How does a user respond when their cache invalidate fails? Isn't this
also another case where our for_each_dev can fail at an arbitrary point
leaving us with no idea whether each device even had the opportunity to
perform the invalidation request. I don't see how we have any chance
to maintain coherency after this faults.
> + mutex_unlock(&iommu->lock);
> + kfree(cache_info);
> + return ret;
> }
>
> return -ENOTTY;
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 2235bc6..62ca791 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind {
> */
> #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23)
>
> +/**
> + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24,
> + * struct vfio_iommu_type1_cache_invalidate)
> + *
> + * Propagate guest IOMMU cache invalidation to the host. The cache
> + * invalidation information is conveyed by @cache_info, the content
> + * format would be structures defined in uapi/linux/iommu.h. User
> + * should be aware of that the struct iommu_cache_invalidate_info
> + * has a @version field, vfio needs to parse this field before getting
> + * data from userspace.
> + *
> + * Availability of this IOCTL is after VFIO_SET_IOMMU.
Is this a necessary qualifier? A user can try to call this ioctl at
any point, it only makes sense in certain configurations, but it should
always "do the right thing" relative to the container iommu config.
Also, I don't see anything in these last few patches testing the
operating IOMMU model, what happens when a user calls them when not
using the nesting IOMMU?
Is this ioctl and the previous BIND ioctl only valid when configured
for the nesting IOMMU type?
> + *
> + * returns: 0 on success, -errno on failure.
> + */
> +struct vfio_iommu_type1_cache_invalidate {
> + __u32 argsz;
> + __u32 flags;
> + struct iommu_cache_invalidate_info cache_info;
> +};
> +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE + 24)
The future extension capabilities of this ioctl worry me, I wonder if
we should do another data[] with flag defining that data as CACHE_INFO.
> +
> /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
>
> /*
> From: Alex Williamson <[email protected]>
> Sent: Friday, April 3, 2020 4:24 AM
>
> On Sun, 22 Mar 2020 05:32:04 -0700
> "Liu, Yi L" <[email protected]> wrote:
>
> > From: Liu Yi L <[email protected]>
> >
> > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest
> "owns" the
> > first-level/stage-1 translation structures, the host IOMMU driver has no
> > knowledge of first-level/stage-1 structure cache updates unless the guest
> > invalidation requests are trapped and propagated to the host.
> >
> > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to
> propagate guest
> > first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU
> cache
> > correctness.
> >
> > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used safely
> > as the host IOMMU iotlb correctness are ensured.
> >
> > Cc: Kevin Tian <[email protected]>
> > CC: Jacob Pan <[email protected]>
> > Cc: Alex Williamson <[email protected]>
> > Cc: Eric Auger <[email protected]>
> > Cc: Jean-Philippe Brucker <[email protected]>
> > Signed-off-by: Liu Yi L <[email protected]>
> > Signed-off-by: Eric Auger <[email protected]>
> > Signed-off-by: Jacob Pan <[email protected]>
> > ---
> > drivers/vfio/vfio_iommu_type1.c | 49
> +++++++++++++++++++++++++++++++++++++++++
> > include/uapi/linux/vfio.h | 22 ++++++++++++++++++
> > 2 files changed, 71 insertions(+)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> > index a877747..937ec3f 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -2423,6 +2423,15 @@ static long
> vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> > return ret;
> > }
> >
> > +static int vfio_cache_inv_fn(struct device *dev, void *data)
> > +{
> > + struct domain_capsule *dc = (struct domain_capsule *)data;
> > + struct iommu_cache_invalidate_info *cache_inv_info =
> > + (struct iommu_cache_invalidate_info *) dc->data;
> > +
> > + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info);
> > +}
> > +
> > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > unsigned int cmd, unsigned long arg)
> > {
> > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> > }
> > kfree(gbind_data);
> > return ret;
> > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
> > + struct vfio_iommu_type1_cache_invalidate cache_inv;
> > + u32 version;
> > + int info_size;
> > + void *cache_info;
> > + int ret;
> > +
> > + minsz = offsetofend(struct
> vfio_iommu_type1_cache_invalidate,
> > + flags);
>
> This breaks backward compatibility as soon as struct
> iommu_cache_invalidate_info changes size by its defined versioning
> scheme. ie. a field gets added, the version is bumped, all existing
> userspace breaks. Our minsz is offsetofend to the version field,
> interpret the version to size, then reevaluate argsz.
btw the version scheme is challenged by Christoph Hellwig. After
some discussions, we need your guidance how to move forward.
Jacob summarized available options below:
https://lkml.org/lkml/2020/4/2/876
>
> > +
> > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
> > + return -EFAULT;
> > +
> > + if (cache_inv.argsz < minsz || cache_inv.flags)
> > + return -EINVAL;
> > +
> > + /* Get the version of struct iommu_cache_invalidate_info */
> > + if (copy_from_user(&version,
> > + (void __user *) (arg + minsz), sizeof(version)))
> > + return -EFAULT;
> > +
> > + info_size = iommu_uapi_get_data_size(
> > + IOMMU_UAPI_CACHE_INVAL,
> version);
> > +
> > + cache_info = kzalloc(info_size, GFP_KERNEL);
> > + if (!cache_info)
> > + return -ENOMEM;
> > +
> > + if (copy_from_user(cache_info,
> > + (void __user *) (arg + minsz), info_size)) {
> > + kfree(cache_info);
> > + return -EFAULT;
> > + }
> > +
> > + mutex_lock(&iommu->lock);
> > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn,
> > + cache_info);
>
> How does a user respond when their cache invalidate fails? Isn't this
> also another case where our for_each_dev can fail at an arbitrary point
> leaving us with no idea whether each device even had the opportunity to
> perform the invalidation request. I don't see how we have any chance
> to maintain coherency after this faults.
Then can we make it simple by supporting singleton groups only?
>
> > + mutex_unlock(&iommu->lock);
> > + kfree(cache_info);
> > + return ret;
> > }
> >
> > return -ENOTTY;
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 2235bc6..62ca791 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind {
> > */
> > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23)
> >
> > +/**
> > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24,
> > + * struct vfio_iommu_type1_cache_invalidate)
> > + *
> > + * Propagate guest IOMMU cache invalidation to the host. The cache
> > + * invalidation information is conveyed by @cache_info, the content
> > + * format would be structures defined in uapi/linux/iommu.h. User
> > + * should be aware of that the struct iommu_cache_invalidate_info
> > + * has a @version field, vfio needs to parse this field before getting
> > + * data from userspace.
> > + *
> > + * Availability of this IOCTL is after VFIO_SET_IOMMU.
>
> Is this a necessary qualifier? A user can try to call this ioctl at
> any point, it only makes sense in certain configurations, but it should
> always "do the right thing" relative to the container iommu config.
>
> Also, I don't see anything in these last few patches testing the
> operating IOMMU model, what happens when a user calls them when not
> using the nesting IOMMU?
>
> Is this ioctl and the previous BIND ioctl only valid when configured
> for the nesting IOMMU type?
I think so. We should add the nesting check in those new ioctls.
>
> > + *
> > + * returns: 0 on success, -errno on failure.
> > + */
> > +struct vfio_iommu_type1_cache_invalidate {
> > + __u32 argsz;
> > + __u32 flags;
> > + struct iommu_cache_invalidate_info cache_info;
> > +};
> > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE
> + 24)
>
> The future extension capabilities of this ioctl worry me, I wonder if
> we should do another data[] with flag defining that data as CACHE_INFO.
Can you elaborate? Does it mean that, this way, we don't rely on the iommu
driver to provide the version-to-size conversion, and instead we just pass
data[] to the iommu driver for further auditing?
>
> > +
> > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU --------
> */
> >
> > /*
Thanks
Kevin
On Fri, 3 Apr 2020 06:39:22 +0000
"Tian, Kevin" <[email protected]> wrote:
> > From: Alex Williamson <[email protected]>
> > Sent: Friday, April 3, 2020 4:24 AM
> >
> > On Sun, 22 Mar 2020 05:32:04 -0700
> > "Liu, Yi L" <[email protected]> wrote:
> >
> > > From: Liu Yi L <[email protected]>
> > >
> > > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest
> > "owns" the
> > > first-level/stage-1 translation structures, the host IOMMU driver has no
> > > knowledge of first-level/stage-1 structure cache updates unless the guest
> > > invalidation requests are trapped and propagated to the host.
> > >
> > > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to
> > propagate guest
> > > first-level/stage-1 IOMMU cache invalidations to host to ensure IOMMU
> > cache
> > > correctness.
> > >
> > > With this patch, vSVA (Virtual Shared Virtual Addressing) can be used safely
> > > as the host IOMMU iotlb correctness are ensured.
> > >
> > > Cc: Kevin Tian <[email protected]>
> > > CC: Jacob Pan <[email protected]>
> > > Cc: Alex Williamson <[email protected]>
> > > Cc: Eric Auger <[email protected]>
> > > Cc: Jean-Philippe Brucker <[email protected]>
> > > Signed-off-by: Liu Yi L <[email protected]>
> > > Signed-off-by: Eric Auger <[email protected]>
> > > Signed-off-by: Jacob Pan <[email protected]>
> > > ---
> > > drivers/vfio/vfio_iommu_type1.c | 49
> > +++++++++++++++++++++++++++++++++++++++++
> > > include/uapi/linux/vfio.h | 22 ++++++++++++++++++
> > > 2 files changed, 71 insertions(+)
> > >
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c
> > > index a877747..937ec3f 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -2423,6 +2423,15 @@ static long
> > vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> > > return ret;
> > > }
> > >
> > > +static int vfio_cache_inv_fn(struct device *dev, void *data)
> > > +{
> > > + struct domain_capsule *dc = (struct domain_capsule *)data;
> > > + struct iommu_cache_invalidate_info *cache_inv_info =
> > > + (struct iommu_cache_invalidate_info *) dc->data;
> > > +
> > > + return iommu_cache_invalidate(dc->domain, dev, cache_inv_info);
> > > +}
> > > +
> > > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > unsigned int cmd, unsigned long arg)
> > > {
> > > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void
> > *iommu_data,
> > > }
> > > kfree(gbind_data);
> > > return ret;
> > > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
> > > + struct vfio_iommu_type1_cache_invalidate cache_inv;
> > > + u32 version;
> > > + int info_size;
> > > + void *cache_info;
> > > + int ret;
> > > +
> > > + minsz = offsetofend(struct
> > vfio_iommu_type1_cache_invalidate,
> > > + flags);
> >
> > This breaks backward compatibility as soon as struct
> > iommu_cache_invalidate_info changes size by its defined versioning
> > scheme. ie. a field gets added, the version is bumped, all existing
> > userspace breaks. Our minsz is offsetofend to the version field,
> > interpret the version to size, then reevaluate argsz.
>
> btw the version scheme is challenged by Christoph Hellwig. After
> some discussions, we need your guidance how to move forward.
> Jacob summarized available options below:
> https://lkml.org/lkml/2020/4/2/876
Ok
> >
> > > +
> > > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
> > > + return -EFAULT;
> > > +
> > > + if (cache_inv.argsz < minsz || cache_inv.flags)
> > > + return -EINVAL;
> > > +
> > > + /* Get the version of struct iommu_cache_invalidate_info */
> > > + if (copy_from_user(&version,
> > > + (void __user *) (arg + minsz), sizeof(version)))
> > > + return -EFAULT;
> > > +
> > > + info_size = iommu_uapi_get_data_size(
> > > + IOMMU_UAPI_CACHE_INVAL,
> > version);
> > > +
> > > + cache_info = kzalloc(info_size, GFP_KERNEL);
> > > + if (!cache_info)
> > > + return -ENOMEM;
> > > +
> > > + if (copy_from_user(cache_info,
> > > + (void __user *) (arg + minsz), info_size)) {
> > > + kfree(cache_info);
> > > + return -EFAULT;
> > > + }
> > > +
> > > + mutex_lock(&iommu->lock);
> > > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn,
> > > + cache_info);
> >
> > How does a user respond when their cache invalidate fails? Isn't this
> > also another case where our for_each_dev can fail at an arbitrary point
> > leaving us with no idea whether each device even had the opportunity to
> > perform the invalidation request. I don't see how we have any chance
> > to maintain coherency after this faults.
>
> Then can we make it simple to support singleton group only?
Are you suggesting a single group per container or a single device per
group? Unless we have both, aren't we always going to have this issue.
OTOH, why should a cache invalidate fail?
> > > + mutex_unlock(&iommu->lock);
> > > + kfree(cache_info);
> > > + return ret;
> > > }
> > >
> > > return -ENOTTY;
> > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > index 2235bc6..62ca791 100644
> > > --- a/include/uapi/linux/vfio.h
> > > +++ b/include/uapi/linux/vfio.h
> > > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind {
> > > */
> > > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23)
> > >
> > > +/**
> > > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24,
> > > + * struct vfio_iommu_type1_cache_invalidate)
> > > + *
> > > + * Propagate guest IOMMU cache invalidation to the host. The cache
> > > + * invalidation information is conveyed by @cache_info, the content
> > > + * format would be structures defined in uapi/linux/iommu.h. User
> > > + * should be aware of that the struct iommu_cache_invalidate_info
> > > + * has a @version field, vfio needs to parse this field before getting
> > > + * data from userspace.
> > > + *
> > > + * Availability of this IOCTL is after VFIO_SET_IOMMU.
> >
> > Is this a necessary qualifier? A user can try to call this ioctl at
> > any point, it only makes sense in certain configurations, but it should
> > always "do the right thing" relative to the container iommu config.
> >
> > Also, I don't see anything in these last few patches testing the
> > operating IOMMU model, what happens when a user calls them when not
> > using the nesting IOMMU?
> >
> > Is this ioctl and the previous BIND ioctl only valid when configured
> > for the nesting IOMMU type?
>
> I think so. We should add the nesting check in those new ioctls.
>
> >
> > > + *
> > > + * returns: 0 on success, -errno on failure.
> > > + */
> > > +struct vfio_iommu_type1_cache_invalidate {
> > > + __u32 argsz;
> > > + __u32 flags;
> > > + struct iommu_cache_invalidate_info cache_info;
> > > +};
> > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE
> > + 24)
> >
> > The future extension capabilities of this ioctl worry me, I wonder if
> > we should do another data[] with flag defining that data as CACHE_INFO.
>
> Can you elaborate? Does it mean with this way we don't rely on iommu
> driver to provide version_to_size conversion and instead we just pass
> data[] to iommu driver for further audit?
No, my concern is that this ioctl has a single function, strictly tied
to the iommu uapi. If we replace cache_info with data[] then we can
define a flag to specify that data[] is struct
iommu_cache_invalidate_info, and if we need to, a different flag to
identify data[] as something else. For example if we get stuck
expanding cache_info to meet new demands and develop a new uapi to
solve that, how would we expand this ioctl to support it rather than
also create a new ioctl? There's also a trade-off in making the ioctl
usage more difficult for the user. I'd still expect the vfio layer to
check the flag and interpret data[] as indicated by the flag rather
than just passing a blob of opaque data to the iommu layer though.
Thanks,
Alex
On Fri, 3 Apr 2020 06:39:22 +0000
"Tian, Kevin" <[email protected]> wrote:
> > From: Alex Williamson <[email protected]>
> > Sent: Friday, April 3, 2020 4:24 AM
> >
> > On Sun, 22 Mar 2020 05:32:04 -0700
> > "Liu, Yi L" <[email protected]> wrote:
> >
> > > From: Liu Yi L <[email protected]>
> > >
> > > For VFIO IOMMUs with the type VFIO_TYPE1_NESTING_IOMMU, guest
> > "owns" the
> > > first-level/stage-1 translation structures, the host IOMMU driver
> > > has no knowledge of first-level/stage-1 structure cache updates
> > > unless the guest invalidation requests are trapped and propagated
> > > to the host.
> > >
> > > This patch adds a new IOCTL VFIO_IOMMU_CACHE_INVALIDATE to
> > propagate guest
> > > first-level/stage-1 IOMMU cache invalidations to host to ensure
> > > IOMMU
> > cache
> > > correctness.
> > >
> > > With this patch, vSVA (Virtual Shared Virtual Addressing) can be
> > > used safely as the host IOMMU iotlb correctness are ensured.
> > >
> > > Cc: Kevin Tian <[email protected]>
> > > CC: Jacob Pan <[email protected]>
> > > Cc: Alex Williamson <[email protected]>
> > > Cc: Eric Auger <[email protected]>
> > > Cc: Jean-Philippe Brucker <[email protected]>
> > > Signed-off-by: Liu Yi L <[email protected]>
> > > Signed-off-by: Eric Auger <[email protected]>
> > > Signed-off-by: Jacob Pan <[email protected]>
> > > ---
> > > drivers/vfio/vfio_iommu_type1.c | 49
> > +++++++++++++++++++++++++++++++++++++++++
> > > include/uapi/linux/vfio.h | 22 ++++++++++++++++++
> > > 2 files changed, 71 insertions(+)
> > >
> > > diff --git a/drivers/vfio/vfio_iommu_type1.c
> > b/drivers/vfio/vfio_iommu_type1.c
> > > index a877747..937ec3f 100644
> > > --- a/drivers/vfio/vfio_iommu_type1.c
> > > +++ b/drivers/vfio/vfio_iommu_type1.c
> > > @@ -2423,6 +2423,15 @@ static long
> > vfio_iommu_type1_unbind_gpasid(struct vfio_iommu *iommu,
> > > return ret;
> > > }
> > >
> > > +static int vfio_cache_inv_fn(struct device *dev, void *data)
> > > +{
> > > + struct domain_capsule *dc = (struct domain_capsule
> > > *)data;
> > > + struct iommu_cache_invalidate_info *cache_inv_info =
> > > + (struct iommu_cache_invalidate_info *) dc->data;
> > > +
> > > + return iommu_cache_invalidate(dc->domain, dev,
> > > cache_inv_info); +}
> > > +
> > > static long vfio_iommu_type1_ioctl(void *iommu_data,
> > > unsigned int cmd, unsigned
> > > long arg) {
> > > @@ -2629,6 +2638,46 @@ static long vfio_iommu_type1_ioctl(void
> > *iommu_data,
> > > }
> > > kfree(gbind_data);
> > > return ret;
> > > + } else if (cmd == VFIO_IOMMU_CACHE_INVALIDATE) {
> > > + struct vfio_iommu_type1_cache_invalidate
> > > cache_inv;
> > > + u32 version;
> > > + int info_size;
> > > + void *cache_info;
> > > + int ret;
> > > +
> > > + minsz = offsetofend(struct
> > vfio_iommu_type1_cache_invalidate,
> > > + flags);
> >
> > This breaks backward compatibility as soon as struct
> > iommu_cache_invalidate_info changes size by its defined versioning
> > scheme. ie. a field gets added, the version is bumped, all existing
> > userspace breaks. Our minsz is offsetofend to the version field,
> > interpret the version to size, then reevaluate argsz.
>
> btw the version scheme is challenged by Christoph Hellwig. After
> some discussions, we need your guidance how to move forward.
> Jacob summarized available options below:
> https://lkml.org/lkml/2020/4/2/876
>
For this particular case, I don't quite get the difference between
minsz=flags and minsz=version. Our IOMMU version scheme will only
change size at the end where the variable union is used for vendor
specific extensions. Version bump does not change size (only re-purpose
padding) from the start of the UAPI structure to the union, i.e. version
will __always__ be after struct vfio_iommu_type1_cache_invalidate.flags.
> >
> > > +
> > > + if (copy_from_user(&cache_inv, (void __user
> > > *)arg, minsz))
> > > + return -EFAULT;
> > > +
> > > + if (cache_inv.argsz < minsz || cache_inv.flags)
> > > + return -EINVAL;
> > > +
> > > + /* Get the version of struct
> > > iommu_cache_invalidate_info */
> > > + if (copy_from_user(&version,
> > > + (void __user *) (arg + minsz),
> > > sizeof(version)))
> > > + return -EFAULT;
> > > +
> > > + info_size = iommu_uapi_get_data_size(
> > > + IOMMU_UAPI_CACHE_INVAL,
> > version);
> > > +
> > > + cache_info = kzalloc(info_size, GFP_KERNEL);
> > > + if (!cache_info)
> > > + return -ENOMEM;
> > > +
> > > + if (copy_from_user(cache_info,
> > > + (void __user *) (arg + minsz),
> > > info_size)) {
> > > + kfree(cache_info);
> > > + return -EFAULT;
> > > + }
> > > +
> > > + mutex_lock(&iommu->lock);
> > > + ret = vfio_iommu_for_each_dev(iommu,
> > > vfio_cache_inv_fn,
> > > + cache_info);
> >
> > How does a user respond when their cache invalidate fails? Isn't
> > this also another case where our for_each_dev can fail at an
> > arbitrary point leaving us with no idea whether each device even
> > had the opportunity to perform the invalidation request. I don't
> > see how we have any chance to maintain coherency after this
> > faults.
>
> Then can we make it simple to support singleton group only?
>
> >
> > > + mutex_unlock(&iommu->lock);
> > > + kfree(cache_info);
> > > + return ret;
> > > }
> > >
> > > return -ENOTTY;
> > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > index 2235bc6..62ca791 100644
> > > --- a/include/uapi/linux/vfio.h
> > > +++ b/include/uapi/linux/vfio.h
> > > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind {
> > > */
> > > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE
> > > + 23)
> > >
> > > +/**
> > > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24,
> > > + * struct
> > > vfio_iommu_type1_cache_invalidate)
> > > + *
> > > + * Propagate guest IOMMU cache invalidation to the host. The
> > > cache
> > > + * invalidation information is conveyed by @cache_info, the
> > > content
> > > + * format would be structures defined in uapi/linux/iommu.h. User
> > > + * should be aware of that the struct
> > > iommu_cache_invalidate_info
> > > + * has a @version field, vfio needs to parse this field before
> > > getting
> > > + * data from userspace.
> > > + *
> > > + * Availability of this IOCTL is after VFIO_SET_IOMMU.
> >
> > Is this a necessary qualifier? A user can try to call this ioctl at
> > any point, it only makes sense in certain configurations, but it
> > should always "do the right thing" relative to the container iommu
> > config.
> >
> > Also, I don't see anything in these last few patches testing the
> > operating IOMMU model, what happens when a user calls them when not
> > using the nesting IOMMU?
> >
> > Is this ioctl and the previous BIND ioctl only valid when configured
> > for the nesting IOMMU type?
>
> I think so. We should add the nesting check in those new ioctls.
>
I also added nesting domain attribute check in IOMMU driver, so bind
guest PASID will fail if nesting mode is not supported. There will be
no invalidation w/o bind.
> >
> > > + *
> > > + * returns: 0 on success, -errno on failure.
> > > + */
> > > +struct vfio_iommu_type1_cache_invalidate {
> > > + __u32 argsz;
> > > + __u32 flags;
> > > + struct iommu_cache_invalidate_info cache_info;
> > > +};
> > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE,
> > > VFIO_BASE
> > + 24)
> >
> > The future extension capabilities of this ioctl worry me, I wonder
> > if we should do another data[] with flag defining that data as
> > CACHE_INFO.
>
> Can you elaborate? Does it mean with this way we don't rely on iommu
> driver to provide version_to_size conversion and instead we just pass
> data[] to iommu driver for further audit?
>
I guess Alex meant we do something similar to:
struct vfio_irq_set {
__u32 argsz;
__u32 flags;
#define VFIO_IRQ_SET_DATA_NONE (1 << 0) /* Data not present */
#define VFIO_IRQ_SET_DATA_BOOL (1 << 1) /* Data is bool (u8) */
#define VFIO_IRQ_SET_DATA_EVENTFD (1 << 2) /* Data is eventfd (s32) */
#define VFIO_IRQ_SET_ACTION_MASK (1 << 3) /* Mask interrupt */
#define VFIO_IRQ_SET_ACTION_UNMASK (1 << 4) /* Unmask interrupt */
#define VFIO_IRQ_SET_ACTION_TRIGGER (1 << 5) /* Trigger interrupt */
__u32 index;
__u32 start;
__u32 count;
__u8 data[];
};
So we could do:
struct vfio_iommu_type1_cache_invalidate {
__u32 argsz;
#define VFIO_INVL_DATA_NONE
#define VFIO_INVL_DATA_CACHE_INFO (1 << 1)
__u32 flags;
__u8 data[];
}
We still need the version_to_size conversion, but under
if (flag & VFIO_INVL_DATA_CACHE_INFO)
get_size_from_version();
> >
> > > +
> > > /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU
> > > --------
> > */
> > >
> > > /*
>
> Thanks
> Kevin
[Jacob Pan]
Hi Alex,
> From: Alex Williamson <[email protected]>
> Sent: Friday, April 3, 2020 11:35 PM
> Subject: Re: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE
>
> On Fri, 3 Apr 2020 06:39:22 +0000
> "Tian, Kevin" <[email protected]> wrote:
>
> > > From: Alex Williamson <[email protected]>
> > > Sent: Friday, April 3, 2020 4:24 AM
> > >
> > > On Sun, 22 Mar 2020 05:32:04 -0700
> > > "Liu, Yi L" <[email protected]> wrote:
> > >
> > > > From: Liu Yi L <[email protected]>
> > > >
[...]
>
> > >
> > > > +
> > > > + if (copy_from_user(&cache_inv, (void __user *)arg, minsz))
> > > > + return -EFAULT;
> > > > +
> > > > + if (cache_inv.argsz < minsz || cache_inv.flags)
> > > > + return -EINVAL;
> > > > +
> > > > + /* Get the version of struct iommu_cache_invalidate_info */
> > > > + if (copy_from_user(&version,
> > > > + (void __user *) (arg + minsz), sizeof(version)))
> > > > + return -EFAULT;
> > > > +
> > > > + info_size = iommu_uapi_get_data_size(
> > > > + IOMMU_UAPI_CACHE_INVAL,
> > > version);
> > > > +
> > > > + cache_info = kzalloc(info_size, GFP_KERNEL);
> > > > + if (!cache_info)
> > > > + return -ENOMEM;
> > > > +
> > > > + if (copy_from_user(cache_info,
> > > > + (void __user *) (arg + minsz), info_size)) {
> > > > + kfree(cache_info);
> > > > + return -EFAULT;
> > > > + }
> > > > +
> > > > + mutex_lock(&iommu->lock);
> > > > + ret = vfio_iommu_for_each_dev(iommu, vfio_cache_inv_fn,
> > > > + cache_info);
> > >
> > > How does a user respond when their cache invalidate fails? Isn't this
> > > also another case where our for_each_dev can fail at an arbitrary point
> > > leaving us with no idea whether each device even had the opportunity to
> > > perform the invalidation request. I don't see how we have any chance
> > > to maintain coherency after this faults.
> >
> > Then can we make it simple to support singleton group only?
>
> Are you suggesting a single group per container or a single device per
> group? Unless we have both, aren't we always going to have this issue.
Agreed. we need both to avoid the potential for_each_dev() loop issue.
I suppose this is also the most typical and desired config for vSVA
support. I think it makes sense with below items:
a) one group per container
PASID and nested translation gives user-space a chance to attach their
page table (e.g. guest process page table) to host IOMMU, this is vSVA.
If adding multiple groups to a vSVA-capable container, then a SVA bind
on this container means bind it with all groups (devices are included)
within the container. This doesn't make sense, for three reasons: for
one the passthru devices are not necessary to be manipulated by same
guest application; for two passthru devices are not surely added in a
single guest group; for three not all passthru devices (either from
different group or same group) are sva capable.
As above, enforce one group per container makes sense to me.
b) one device per group
SVA support is limited to singleton group so far in bare-metal bind
per Jean's series. I think it'd be good to follow it in the passthru case.
https://patchwork.kernel.org/patch/10213877/
https://lkml.org/lkml/2019/4/10/663
As mentioned in a), group may have both SVA-capable device and non-SVA
-capable device, it would be a problem for VFIO to figure a way to isolate
them.
> OTOH, why should a cache invalidate fail?
there are sanity checks done by the vendor iommu driver against the invalidation
request from userspace, so it may fail if a sanity check fails. But I guess
it may be better to do something like abort instead of failing the request, isn't it?
>
> > > > + mutex_unlock(&iommu->lock);
> > > > + kfree(cache_info);
> > > > + return ret;
> > > > }
> > > >
> > > > return -ENOTTY;
> > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > > > index 2235bc6..62ca791 100644
> > > > --- a/include/uapi/linux/vfio.h
> > > > +++ b/include/uapi/linux/vfio.h
> > > > @@ -899,6 +899,28 @@ struct vfio_iommu_type1_bind {
> > > > */
> > > > #define VFIO_IOMMU_BIND _IO(VFIO_TYPE, VFIO_BASE + 23)
> > > >
> > > > +/**
> > > > + * VFIO_IOMMU_CACHE_INVALIDATE - _IOW(VFIO_TYPE, VFIO_BASE + 24,
> > > > + * struct vfio_iommu_type1_cache_invalidate)
> > > > + *
> > > > + * Propagate guest IOMMU cache invalidation to the host. The cache
> > > > + * invalidation information is conveyed by @cache_info, the content
> > > > + * format would be structures defined in uapi/linux/iommu.h. User
> > > > + * should be aware of that the struct iommu_cache_invalidate_info
> > > > + * has a @version field, vfio needs to parse this field before getting
> > > > + * data from userspace.
> > > > + *
> > > > + * Availability of this IOCTL is after VFIO_SET_IOMMU.
> > >
> > > Is this a necessary qualifier? A user can try to call this ioctl at
> > > any point, it only makes sense in certain configurations, but it should
> > > always "do the right thing" relative to the container iommu config.
> > >
> > > Also, I don't see anything in these last few patches testing the
> > > operating IOMMU model, what happens when a user calls them when not
> > > using the nesting IOMMU?
> > >
> > > Is this ioctl and the previous BIND ioctl only valid when configured
> > > for the nesting IOMMU type?
> >
> > I think so. We should add the nesting check in those new ioctls.
> >
> > >
> > > > + *
> > > > + * returns: 0 on success, -errno on failure.
> > > > + */
> > > > +struct vfio_iommu_type1_cache_invalidate {
> > > > + __u32 argsz;
> > > > + __u32 flags;
> > > > + struct iommu_cache_invalidate_info cache_info;
> > > > +};
> > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE, VFIO_BASE
> > > + 24)
> > >
> > > The future extension capabilities of this ioctl worry me, I wonder if
> > > we should do another data[] with flag defining that data as CACHE_INFO.
> >
> > Can you elaborate? Does it mean with this way we don't rely on iommu
> > driver to provide version_to_size conversion and instead we just pass
> > data[] to iommu driver for further audit?
>
> No, my concern is that this ioctl has a single function, strictly tied
> to the iommu uapi. If we replace cache_info with data[] then we can
> define a flag to specify that data[] is struct
> iommu_cache_invalidate_info, and if we need to, a different flag to
> identify data[] as something else. For example if we get stuck
> expanding cache_info to meet new demands and develop a new uapi to
> solve that, how would we expand this ioctl to support it rather than
> also create a new ioctl? There's also a trade-off in making the ioctl
> usage more difficult for the user. I'd still expect the vfio layer to
> check the flag and interpret data[] as indicated by the flag rather
> than just passing a blob of opaque data to the iommu layer though.
Ok, I think data[] is acceptable. BTW. Do you have any decision on the
uapi version open in Jacob's thread? I'd like to re-work my patch based
on your decision.
https://lkml.org/lkml/2020/4/2/876
thanks again for your help. :-)
Regards,
Yi Liu
> From: Liu, Yi L <[email protected]>
> Sent: Thursday, April 16, 2020 6:40 PM
>
> Hi Alex,
> Still have a direction question with you. Better get agreement with you
> before heading forward.
>
> > From: Alex Williamson <[email protected]>
> > Sent: Friday, April 3, 2020 11:35 PM
> [...]
> > > > > + *
> > > > > + * returns: 0 on success, -errno on failure.
> > > > > + */
> > > > > +struct vfio_iommu_type1_cache_invalidate {
> > > > > + __u32 argsz;
> > > > > + __u32 flags;
> > > > > + struct iommu_cache_invalidate_info cache_info;
> > > > > +};
> > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE,
> > VFIO_BASE
> > > > + 24)
> > > >
> > > > The future extension capabilities of this ioctl worry me, I wonder if
> > > > we should do another data[] with flag defining that data as
> CACHE_INFO.
> > >
> > > Can you elaborate? Does it mean with this way we don't rely on iommu
> > > driver to provide version_to_size conversion and instead we just pass
> > > data[] to iommu driver for further audit?
> >
> > No, my concern is that this ioctl has a single function, strictly tied
> > to the iommu uapi. If we replace cache_info with data[] then we can
> > define a flag to specify that data[] is struct
> > iommu_cache_invalidate_info, and if we need to, a different flag to
> > identify data[] as something else. For example if we get stuck
> > expanding cache_info to meet new demands and develop a new uapi to
> > solve that, how would we expand this ioctl to support it rather than
> > also create a new ioctl? There's also a trade-off in making the ioctl
> > usage more difficult for the user. I'd still expect the vfio layer to
> > check the flag and interpret data[] as indicated by the flag rather
> > than just passing a blob of opaque data to the iommu layer though.
> > Thanks,
>
> Based on your comments about defining a single ioctl and a unified
> vfio structure (with a @data[] field) for pasid_alloc/free, bind/
> unbind_gpasid, cache_inv. After some offline trying, I think it would
> be good for bind/unbind_gpasid and cache_inv as both of them use the
> iommu uapi definition. While the pasid alloc/free operation doesn't.
> It would be weird to put all of them together. So pasid alloc/free
> may have a separate ioctl. It would look as below. Does this direction
> look good per your opinion?
>
> ioctl #22: VFIO_IOMMU_PASID_REQUEST
> /**
> * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID
> * specify a pasid to be freed when flags == FREE_PASID
> * @range: specify the allocation range when flags == ALLOC_PASID
> */
> struct vfio_iommu_pasid_request {
> __u32 argsz;
> #define VFIO_IOMMU_ALLOC_PASID (1 << 0)
> #define VFIO_IOMMU_FREE_PASID (1 << 1)
> __u32 flags;
> __u32 pasid;
> struct {
> __u32 min;
> __u32 max;
> } range;
> };
>
> ioctl #23: VFIO_IOMMU_NESTING_OP
> struct vfio_iommu_type1_nesting_op {
> __u32 argsz;
> __u32 flags;
> __u32 op;
> __u8 data[];
> };
>
> /* Nesting Ops */
> #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0
> #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1
> #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2
>
Then why cannot we just put PASID into the header since the
majority of nested usage is associated with a pasid?
ioctl #23: VFIO_IOMMU_NESTING_OP
struct vfio_iommu_type1_nesting_op {
__u32 argsz;
__u32 flags;
__u32 op;
__u32 pasid;
__u8 data[];
};
In case of SMMUv2 which supports nested w/o PASID, this field can
be ignored for that specific case.
Thanks
Kevin
Hi Kevin,
On 4/16/20 2:09 PM, Tian, Kevin wrote:
>> From: Liu, Yi L <[email protected]>
>> Sent: Thursday, April 16, 2020 6:40 PM
>>
>> Hi Alex,
>> Still have a direction question with you. Better get agreement with you
>> before heading forward.
>>
>>> From: Alex Williamson <[email protected]>
>>> Sent: Friday, April 3, 2020 11:35 PM
>> [...]
>>>>>> + *
>>>>>> + * returns: 0 on success, -errno on failure.
>>>>>> + */
>>>>>> +struct vfio_iommu_type1_cache_invalidate {
>>>>>> + __u32 argsz;
>>>>>> + __u32 flags;
>>>>>> + struct iommu_cache_invalidate_info cache_info;
>>>>>> +};
>>>>>> +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE,
>>> VFIO_BASE
>>>>> + 24)
>>>>>
>>>>> The future extension capabilities of this ioctl worry me, I wonder if
>>>>> we should do another data[] with flag defining that data as
>> CACHE_INFO.
>>>>
>>>> Can you elaborate? Does it mean with this way we don't rely on iommu
>>>> driver to provide version_to_size conversion and instead we just pass
>>>> data[] to iommu driver for further audit?
>>>
>>> No, my concern is that this ioctl has a single function, strictly tied
>>> to the iommu uapi. If we replace cache_info with data[] then we can
>>> define a flag to specify that data[] is struct
>>> iommu_cache_invalidate_info, and if we need to, a different flag to
>>> identify data[] as something else. For example if we get stuck
>>> expanding cache_info to meet new demands and develop a new uapi to
>>> solve that, how would we expand this ioctl to support it rather than
>>> also create a new ioctl? There's also a trade-off in making the ioctl
>>> usage more difficult for the user. I'd still expect the vfio layer to
>>> check the flag and interpret data[] as indicated by the flag rather
>>> than just passing a blob of opaque data to the iommu layer though.
>>> Thanks,
>>
>> Based on your comments about defining a single ioctl and a unified
>> vfio structure (with a @data[] field) for pasid_alloc/free, bind/
>> unbind_gpasid, cache_inv. After some offline trying, I think it would
>> be good for bind/unbind_gpasid and cache_inv as both of them use the
>> iommu uapi definition. While the pasid alloc/free operation doesn't.
>> It would be weird to put all of them together. So pasid alloc/free
>> may have a separate ioctl. It would look as below. Does this direction
>> look good per your opinion?
>>
>> ioctl #22: VFIO_IOMMU_PASID_REQUEST
>> /**
>> * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID
>> * specify a pasid to be freed when flags == FREE_PASID
>> * @range: specify the allocation range when flags == ALLOC_PASID
>> */
>> struct vfio_iommu_pasid_request {
>> __u32 argsz;
>> #define VFIO_IOMMU_ALLOC_PASID (1 << 0)
>> #define VFIO_IOMMU_FREE_PASID (1 << 1)
>> __u32 flags;
>> __u32 pasid;
>> struct {
>> __u32 min;
>> __u32 max;
>> } range;
>> };
>>
>> ioctl #23: VFIO_IOMMU_NESTING_OP
>> struct vfio_iommu_type1_nesting_op {
>> __u32 argsz;
>> __u32 flags;
>> __u32 op;
>> __u8 data[];
>> };
>>
>> /* Nesting Ops */
>> #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0
>> #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1
>> #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2
>>
>
> Then why cannot we just put PASID into the header since the
> majority of nested usage is associated with a pasid?
>
> ioctl #23: VFIO_IOMMU_NESTING_OP
> struct vfio_iommu_type1_nesting_op {
> __u32 argsz;
> __u32 flags;
> __u32 op;
> __u32 pasid;
> __u8 data[];
> };
>
> In case of SMMUv2 which supports nested w/o PASID, this field can
> be ignored for that specific case.
On my side I would prefer keeping the pasid in the data[]. This is not
always used.
For instance, in iommu_cache_invalidate_info/iommu_inv_pasid_info we
devised flags to tell whether the PASID is used.
Thanks
Eric
>
> Thanks
> Kevin
>
> From: Auger Eric <[email protected]>
> Sent: Thursday, April 16, 2020 8:43 PM
>
> Hi Kevin,
> On 4/16/20 2:09 PM, Tian, Kevin wrote:
> >> From: Liu, Yi L <[email protected]>
> >> Sent: Thursday, April 16, 2020 6:40 PM
> >>
> >> Hi Alex,
> >> Still have a direction question with you. Better get agreement with you
> >> before heading forward.
> >>
> >>> From: Alex Williamson <[email protected]>
> >>> Sent: Friday, April 3, 2020 11:35 PM
> >> [...]
> >>>>>> + *
> >>>>>> + * returns: 0 on success, -errno on failure.
> >>>>>> + */
> >>>>>> +struct vfio_iommu_type1_cache_invalidate {
> >>>>>> + __u32 argsz;
> >>>>>> + __u32 flags;
> >>>>>> + struct iommu_cache_invalidate_info cache_info;
> >>>>>> +};
> >>>>>> +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE,
> >>> VFIO_BASE
> >>>>> + 24)
> >>>>>
> >>>>> The future extension capabilities of this ioctl worry me, I wonder if
> >>>>> we should do another data[] with flag defining that data as
> >> CACHE_INFO.
> >>>>
> >>>> Can you elaborate? Does it mean with this way we don't rely on iommu
> >>>> driver to provide version_to_size conversion and instead we just pass
> >>>> data[] to iommu driver for further audit?
> >>>
> >>> No, my concern is that this ioctl has a single function, strictly tied
> >>> to the iommu uapi. If we replace cache_info with data[] then we can
> >>> define a flag to specify that data[] is struct
> >>> iommu_cache_invalidate_info, and if we need to, a different flag to
> >>> identify data[] as something else. For example if we get stuck
> >>> expanding cache_info to meet new demands and develop a new uapi to
> >>> solve that, how would we expand this ioctl to support it rather than
> >>> also create a new ioctl? There's also a trade-off in making the ioctl
> >>> usage more difficult for the user. I'd still expect the vfio layer to
> >>> check the flag and interpret data[] as indicated by the flag rather
> >>> than just passing a blob of opaque data to the iommu layer though.
> >>> Thanks,
> >>
> >> Based on your comments about defining a single ioctl and a unified
> >> vfio structure (with a @data[] field) for pasid_alloc/free, bind/
> >> unbind_gpasid, cache_inv. After some offline trying, I think it would
> >> be good for bind/unbind_gpasid and cache_inv as both of them use the
> >> iommu uapi definition. While the pasid alloc/free operation doesn't.
> >> It would be weird to put all of them together. So pasid alloc/free
> >> may have a separate ioctl. It would look as below. Does this direction
> >> look good per your opinion?
> >>
> >> ioctl #22: VFIO_IOMMU_PASID_REQUEST
> >> /**
> >> * @pasid: used to return the pasid alloc result when flags ==
> ALLOC_PASID
> >> * specify a pasid to be freed when flags == FREE_PASID
> >> * @range: specify the allocation range when flags == ALLOC_PASID
> >> */
> >> struct vfio_iommu_pasid_request {
> >> __u32 argsz;
> >> #define VFIO_IOMMU_ALLOC_PASID (1 << 0)
> >> #define VFIO_IOMMU_FREE_PASID (1 << 1)
> >> __u32 flags;
> >> __u32 pasid;
> >> struct {
> >> __u32 min;
> >> __u32 max;
> >> } range;
> >> };
> >>
> >> ioctl #23: VFIO_IOMMU_NESTING_OP
> >> struct vfio_iommu_type1_nesting_op {
> >> __u32 argsz;
> >> __u32 flags;
> >> __u32 op;
> >> __u8 data[];
> >> };
> >>
> >> /* Nesting Ops */
> >> #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0
> >> #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1
> >> #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2
> >>
> >
> > Then why cannot we just put PASID into the header since the
> > majority of nested usage is associated with a pasid?
> >
> > ioctl #23: VFIO_IOMMU_NESTING_OP
> > struct vfio_iommu_type1_nesting_op {
> > __u32 argsz;
> > __u32 flags;
> > __u32 op;
> > __u32 pasid;
> > __u8 data[];
> > };
> >
> > In case of SMMUv2 which supports nested w/o PASID, this field can
> > be ignored for that specific case.
> On my side I would prefer keeping the pasid in the data[]. This is not
> always used.
>
> For instance, in iommu_cache_invalidate_info/iommu_inv_pasid_info we
> devised flags to tell whether the PASID is used.
>
But don't we include a PASID in both invalidate structures already?
struct iommu_inv_addr_info {
#define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0)
#define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1)
#define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2)
__u32 flags;
__u32 archid;
__u64 pasid;
__u64 addr;
__u64 granule_size;
__u64 nb_granules;
};
struct iommu_inv_pasid_info {
#define IOMMU_INV_PASID_FLAGS_PASID (1 << 0)
#define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1)
__u32 flags;
__u32 archid;
__u64 pasid;
};
Then consolidating the pasid field into the generic header doesn't
hurt; the specific handler still relies on flags to tell whether it
is used?
Thanks
Kevin
On Thu, 16 Apr 2020 08:40:31 -0600
Alex Williamson <[email protected]> wrote:
> On Thu, 16 Apr 2020 10:40:03 +0000
> "Liu, Yi L" <[email protected]> wrote:
>
> > Hi Alex,
> > Still have a direction question with you. Better get agreement with you
> > before heading forward.
> >
> > > From: Alex Williamson <[email protected]>
> > > Sent: Friday, April 3, 2020 11:35 PM
> > [...]
> > > > > > + *
> > > > > > + * returns: 0 on success, -errno on failure.
> > > > > > + */
> > > > > > +struct vfio_iommu_type1_cache_invalidate {
> > > > > > + __u32 argsz;
> > > > > > + __u32 flags;
> > > > > > + struct iommu_cache_invalidate_info cache_info;
> > > > > > +};
> > > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE,
> > > VFIO_BASE
> > > > > + 24)
> > > > >
> > > > > The future extension capabilities of this ioctl worry me, I wonder if
> > > > > we should do another data[] with flag defining that data as CACHE_INFO.
> > > >
> > > > Can you elaborate? Does it mean with this way we don't rely on iommu
> > > > driver to provide version_to_size conversion and instead we just pass
> > > > data[] to iommu driver for further audit?
> > >
> > > No, my concern is that this ioctl has a single function, strictly tied
> > > to the iommu uapi. If we replace cache_info with data[] then we can
> > > define a flag to specify that data[] is struct
> > > iommu_cache_invalidate_info, and if we need to, a different flag to
> > > identify data[] as something else. For example if we get stuck
> > > expanding cache_info to meet new demands and develop a new uapi to
> > > solve that, how would we expand this ioctl to support it rather than
> > > also create a new ioctl? There's also a trade-off in making the ioctl
> > > usage more difficult for the user. I'd still expect the vfio layer to
> > > check the flag and interpret data[] as indicated by the flag rather
> > > than just passing a blob of opaque data to the iommu layer though.
> > > Thanks,
> >
> > Based on your comments about defining a single ioctl and a unified
> > vfio structure (with a @data[] field) for pasid_alloc/free, bind/
> > unbind_gpasid, cache_inv. After some offline trying, I think it would
> > be good for bind/unbind_gpasid and cache_inv as both of them use the
> > iommu uapi definition. While the pasid alloc/free operation doesn't.
> > It would be weird to put all of them together. So pasid alloc/free
> > may have a separate ioctl. It would look as below. Does this direction
> > look good per your opinion?
> >
> > ioctl #22: VFIO_IOMMU_PASID_REQUEST
> > /**
> > * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID
> > * specify a pasid to be freed when flags == FREE_PASID
> > * @range: specify the allocation range when flags == ALLOC_PASID
> > */
> > struct vfio_iommu_pasid_request {
> > __u32 argsz;
> > #define VFIO_IOMMU_ALLOC_PASID (1 << 0)
> > #define VFIO_IOMMU_FREE_PASID (1 << 1)
> > __u32 flags;
> > __u32 pasid;
> > struct {
> > __u32 min;
> > __u32 max;
> > } range;
> > };
>
> Can't the ioctl return the pasid valid on alloc (like GET_DEVICE_FD)?
s/valid/value/
> Would it be useful to support freeing a range of pasids? If so then we
> could simply use range for both, ie. allocate a pasid from this range
> and return it, or free all pasids in this range? vfio already needs to
> track pasids to free them on release, so presumably this is something
> we could support easily.
>
> > ioctl #23: VFIO_IOMMU_NESTING_OP
> > struct vfio_iommu_type1_nesting_op {
> > __u32 argsz;
> > __u32 flags;
> > __u32 op;
> > __u8 data[];
> > };
>
> data only has 4-byte alignment, I think we really want it at an 8-byte
> alignment. This is why I embedded the "op" into the flag for
> DEVICE_FEATURE. Thanks,
>
> Alex
>
> >
> > /* Nesting Ops */
> > #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0
> > #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1
> > #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2
> >
> > Thanks,
> > Yi Liu
> >
>
> _______________________________________________
> iommu mailing list
> [email protected]
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
>
Hi Kevin,
On 4/16/20 3:28 PM, Tian, Kevin wrote:
>> From: Auger Eric <[email protected]>
>> Sent: Thursday, April 16, 2020 8:43 PM
>>
>> Hi Kevin,
>> On 4/16/20 2:09 PM, Tian, Kevin wrote:
>>>> From: Liu, Yi L <[email protected]>
>>>> Sent: Thursday, April 16, 2020 6:40 PM
>>>>
>>>> Hi Alex,
>>>> Still have a direction question with you. Better get agreement with you
>>>> before heading forward.
>>>>
>>>>> From: Alex Williamson <[email protected]>
>>>>> Sent: Friday, April 3, 2020 11:35 PM
>>>> [...]
>>>>>>>> + *
>>>>>>>> + * returns: 0 on success, -errno on failure.
>>>>>>>> + */
>>>>>>>> +struct vfio_iommu_type1_cache_invalidate {
>>>>>>>> + __u32 argsz;
>>>>>>>> + __u32 flags;
>>>>>>>> + struct iommu_cache_invalidate_info cache_info;
>>>>>>>> +};
>>>>>>>> +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE,
>>>>> VFIO_BASE
>>>>>>> + 24)
>>>>>>>
>>>>>>> The future extension capabilities of this ioctl worry me, I wonder if
>>>>>>> we should do another data[] with flag defining that data as
>>>> CACHE_INFO.
>>>>>>
>>>>>> Can you elaborate? Does it mean with this way we don't rely on iommu
>>>>>> driver to provide version_to_size conversion and instead we just pass
>>>>>> data[] to iommu driver for further audit?
>>>>>
>>>>> No, my concern is that this ioctl has a single function, strictly tied
>>>>> to the iommu uapi. If we replace cache_info with data[] then we can
>>>>> define a flag to specify that data[] is struct
>>>>> iommu_cache_invalidate_info, and if we need to, a different flag to
>>>>> identify data[] as something else. For example if we get stuck
>>>>> expanding cache_info to meet new demands and develop a new uapi to
>>>>> solve that, how would we expand this ioctl to support it rather than
>>>>> also create a new ioctl? There's also a trade-off in making the ioctl
>>>>> usage more difficult for the user. I'd still expect the vfio layer to
>>>>> check the flag and interpret data[] as indicated by the flag rather
>>>>> than just passing a blob of opaque data to the iommu layer though.
>>>>> Thanks,
>>>>
>>>> Based on your comments about defining a single ioctl and a unified
>>>> vfio structure (with a @data[] field) for pasid_alloc/free, bind/
>>>> unbind_gpasid, cache_inv. After some offline trying, I think it would
>>>> be good for bind/unbind_gpasid and cache_inv as both of them use the
>>>> iommu uapi definition. While the pasid alloc/free operation doesn't.
>>>> It would be weird to put all of them together. So pasid alloc/free
>>>> may have a separate ioctl. It would look as below. Does this direction
>>>> look good per your opinion?
>>>>
>>>> ioctl #22: VFIO_IOMMU_PASID_REQUEST
>>>> /**
>>>> * @pasid: used to return the pasid alloc result when flags ==
>> ALLOC_PASID
>>>> * specify a pasid to be freed when flags == FREE_PASID
>>>> * @range: specify the allocation range when flags == ALLOC_PASID
>>>> */
>>>> struct vfio_iommu_pasid_request {
>>>> __u32 argsz;
>>>> #define VFIO_IOMMU_ALLOC_PASID (1 << 0)
>>>> #define VFIO_IOMMU_FREE_PASID (1 << 1)
>>>> __u32 flags;
>>>> __u32 pasid;
>>>> struct {
>>>> __u32 min;
>>>> __u32 max;
>>>> } range;
>>>> };
>>>>
>>>> ioctl #23: VFIO_IOMMU_NESTING_OP
>>>> struct vfio_iommu_type1_nesting_op {
>>>> __u32 argsz;
>>>> __u32 flags;
>>>> __u32 op;
>>>> __u8 data[];
>>>> };
>>>>
>>>> /* Nesting Ops */
>>>> #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0
>>>> #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1
>>>> #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2
>>>>
>>>
>>> Then why cannot we just put PASID into the header since the
>>> majority of nested usage is associated with a pasid?
>>>
>>> ioctl #23: VFIO_IOMMU_NESTING_OP
>>> struct vfio_iommu_type1_nesting_op {
>>> __u32 argsz;
>>> __u32 flags;
>>> __u32 op;
>>> __u32 pasid;
>>> __u8 data[];
>>> };
>>>
>>> In case of SMMUv2 which supports nested w/o PASID, this field can
>>> be ignored for that specific case.
>> On my side I would prefer keeping the pasid in the data[]. This is not
>> always used.
>>
>> For instance, in iommu_cache_invalidate_info/iommu_inv_pasid_info we
>> devised flags to tell whether the PASID is used.
>>
>
> But don't we include a PASID in both invalidate structures already?
The pasid presence is indicated by the IOMMU_INV_ADDR_FLAGS_PASID flag.
For instance, for nested-stage SMMUv3 I currently perform an ARCHID (asid)
based invalidation only.
Eric
>
> struct iommu_inv_addr_info {
> #define IOMMU_INV_ADDR_FLAGS_PASID (1 << 0)
> #define IOMMU_INV_ADDR_FLAGS_ARCHID (1 << 1)
> #define IOMMU_INV_ADDR_FLAGS_LEAF (1 << 2)
> __u32 flags;
> __u32 archid;
> __u64 pasid;
> __u64 addr;
> __u64 granule_size;
> __u64 nb_granules;
> };
>
> struct iommu_inv_pasid_info {
> #define IOMMU_INV_PASID_FLAGS_PASID (1 << 0)
> #define IOMMU_INV_PASID_FLAGS_ARCHID (1 << 1)
> __u32 flags;
> __u32 archid;
> __u64 pasid;
> };
>
> Then consolidating the pasid field into the generic header doesn't
> hurt; the specific handler still relies on flags to tell whether it
> is used?
>
> Thanks
> Kevin
>
Hi Alex,
Still have a direction question with you. Better get agreement with you
before heading forward.
> From: Alex Williamson <[email protected]>
> Sent: Friday, April 3, 2020 11:35 PM
[...]
> > > > + *
> > > > + * returns: 0 on success, -errno on failure.
> > > > + */
> > > > +struct vfio_iommu_type1_cache_invalidate {
> > > > + __u32 argsz;
> > > > + __u32 flags;
> > > > + struct iommu_cache_invalidate_info cache_info;
> > > > +};
> > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE,
> VFIO_BASE
> > > + 24)
> > >
> > > The future extension capabilities of this ioctl worry me, I wonder if
> > > we should do another data[] with flag defining that data as CACHE_INFO.
> >
> > Can you elaborate? Does it mean with this way we don't rely on iommu
> > driver to provide version_to_size conversion and instead we just pass
> > data[] to iommu driver for further audit?
>
> No, my concern is that this ioctl has a single function, strictly tied
> to the iommu uapi. If we replace cache_info with data[] then we can
> define a flag to specify that data[] is struct
> iommu_cache_invalidate_info, and if we need to, a different flag to
> identify data[] as something else. For example if we get stuck
> expanding cache_info to meet new demands and develop a new uapi to
> solve that, how would we expand this ioctl to support it rather than
> also create a new ioctl? There's also a trade-off in making the ioctl
> usage more difficult for the user. I'd still expect the vfio layer to
> check the flag and interpret data[] as indicated by the flag rather
> than just passing a blob of opaque data to the iommu layer though.
> Thanks,
Based on your comments about defining a single ioctl and a unified
vfio structure (with a @data[] field) for pasid_alloc/free, bind/
unbind_gpasid, cache_inv. After some offline trying, I think it would
be good for bind/unbind_gpasid and cache_inv as both of them use the
iommu uapi definition. While the pasid alloc/free operation doesn't.
It would be weird to put all of them together. So pasid alloc/free
may have a separate ioctl. It would look as below. Does this direction
look good per your opinion?
ioctl #22: VFIO_IOMMU_PASID_REQUEST
/**
* @pasid: used to return the pasid alloc result when flags == ALLOC_PASID
* specify a pasid to be freed when flags == FREE_PASID
* @range: specify the allocation range when flags == ALLOC_PASID
*/
struct vfio_iommu_pasid_request {
__u32 argsz;
#define VFIO_IOMMU_ALLOC_PASID (1 << 0)
#define VFIO_IOMMU_FREE_PASID (1 << 1)
__u32 flags;
__u32 pasid;
struct {
__u32 min;
__u32 max;
} range;
};
ioctl #23: VFIO_IOMMU_NESTING_OP
struct vfio_iommu_type1_nesting_op {
__u32 argsz;
__u32 flags;
__u32 op;
__u8 data[];
};
/* Nesting Ops */
#define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0
#define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1
#define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2
Thanks,
Yi Liu
On Thu, 16 Apr 2020 10:40:03 +0000
"Liu, Yi L" <[email protected]> wrote:
> Hi Alex,
> Still have a direction question with you. Better get agreement with you
> before heading forward.
>
> > From: Alex Williamson <[email protected]>
> > Sent: Friday, April 3, 2020 11:35 PM
> [...]
> > > > > + *
> > > > > + * returns: 0 on success, -errno on failure.
> > > > > + */
> > > > > +struct vfio_iommu_type1_cache_invalidate {
> > > > > + __u32 argsz;
> > > > > + __u32 flags;
> > > > > + struct iommu_cache_invalidate_info cache_info;
> > > > > +};
> > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE,
> > VFIO_BASE
> > > > + 24)
> > > >
> > > > The future extension capabilities of this ioctl worry me, I wonder if
> > > > we should do another data[] with flag defining that data as CACHE_INFO.
> > >
> > > Can you elaborate? Does it mean with this way we don't rely on iommu
> > > driver to provide version_to_size conversion and instead we just pass
> > > data[] to iommu driver for further audit?
> >
> > No, my concern is that this ioctl has a single function, strictly tied
> > to the iommu uapi. If we replace cache_info with data[] then we can
> > define a flag to specify that data[] is struct
> > iommu_cache_invalidate_info, and if we need to, a different flag to
> > identify data[] as something else. For example if we get stuck
> > expanding cache_info to meet new demands and develop a new uapi to
> > solve that, how would we expand this ioctl to support it rather than
> > also create a new ioctl? There's also a trade-off in making the ioctl
> > usage more difficult for the user. I'd still expect the vfio layer to
> > check the flag and interpret data[] as indicated by the flag rather
> > than just passing a blob of opaque data to the iommu layer though.
> > Thanks,
>
> Based on your comments about defining a single ioctl and a unified
> vfio structure (with a @data[] field) for pasid_alloc/free, bind/
> unbind_gpasid, cache_inv. After some offline trying, I think it would
> be good for bind/unbind_gpasid and cache_inv as both of them use the
> iommu uapi definition. While the pasid alloc/free operation doesn't.
> It would be weird to put all of them together. So pasid alloc/free
> may have a separate ioctl. It would look as below. Does this direction
> look good per your opinion?
>
> ioctl #22: VFIO_IOMMU_PASID_REQUEST
> /**
> * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID
> * specify a pasid to be freed when flags == FREE_PASID
> * @range: specify the allocation range when flags == ALLOC_PASID
> */
> struct vfio_iommu_pasid_request {
> __u32 argsz;
> #define VFIO_IOMMU_ALLOC_PASID (1 << 0)
> #define VFIO_IOMMU_FREE_PASID (1 << 1)
> __u32 flags;
> __u32 pasid;
> struct {
> __u32 min;
> __u32 max;
> } range;
> };
Can't the ioctl return the pasid value on alloc (like GET_DEVICE_FD)?
Would it be useful to support freeing a range of pasids? If so then we
could simply use range for both, ie. allocate a pasid from this range
and return it, or free all pasids in this range? vfio already needs to
track pasids to free them on release, so presumably this is something
we could support easily.
> ioctl #23: VFIO_IOMMU_NESTING_OP
> struct vfio_iommu_type1_nesting_op {
> __u32 argsz;
> __u32 flags;
> __u32 op;
> __u8 data[];
> };
data only has 4-byte alignment, I think we really want it at an 8-byte
alignment. This is why I embedded the "op" into the flag for
DEVICE_FEATURE. Thanks,
Alex
>
> /* Nesting Ops */
> #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0
> #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1
> #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2
>
> Thanks,
> Yi Liu
>
Hi Alex,
> From: Alex Williamson <[email protected]>
> Sent: Thursday, April 16, 2020 10:41 PM
> To: Liu, Yi L <[email protected]>
> Subject: Re: [PATCH v1 7/8] vfio/type1: Add VFIO_IOMMU_CACHE_INVALIDATE
>
> On Thu, 16 Apr 2020 10:40:03 +0000
> "Liu, Yi L" <[email protected]> wrote:
>
> > Hi Alex,
> > Still have a direction question with you. Better get agreement with you
> > before heading forward.
> >
> > > From: Alex Williamson <[email protected]>
> > > Sent: Friday, April 3, 2020 11:35 PM
> > [...]
> > > > > > + *
> > > > > > + * returns: 0 on success, -errno on failure.
> > > > > > + */
> > > > > > +struct vfio_iommu_type1_cache_invalidate {
> > > > > > + __u32 argsz;
> > > > > > + __u32 flags;
> > > > > > + struct iommu_cache_invalidate_info cache_info;
> > > > > > +};
> > > > > > +#define VFIO_IOMMU_CACHE_INVALIDATE _IO(VFIO_TYPE,
> > > VFIO_BASE
> > > > > + 24)
> > > > >
> > > > > The future extension capabilities of this ioctl worry me, I wonder if
> > > > > we should do another data[] with flag defining that data as CACHE_INFO.
> > > >
> > > > Can you elaborate? Does it mean with this way we don't rely on iommu
> > > > driver to provide version_to_size conversion and instead we just pass
> > > > data[] to iommu driver for further audit?
> > >
> > > No, my concern is that this ioctl has a single function, strictly tied
> > > to the iommu uapi. If we replace cache_info with data[] then we can
> > > define a flag to specify that data[] is struct
> > > iommu_cache_invalidate_info, and if we need to, a different flag to
> > > identify data[] as something else. For example if we get stuck
> > > expanding cache_info to meet new demands and develop a new uapi to
> > > solve that, how would we expand this ioctl to support it rather than
> > > also create a new ioctl? There's also a trade-off in making the ioctl
> > > usage more difficult for the user. I'd still expect the vfio layer to
> > > check the flag and interpret data[] as indicated by the flag rather
> > > than just passing a blob of opaque data to the iommu layer though.
> > > Thanks,
> >
> > Based on your comments about defining a single ioctl and a unified
> > vfio structure (with a @data[] field) for pasid_alloc/free, bind/
> > unbind_gpasid, cache_inv. After some offline trying, I think it would
> > be good for bind/unbind_gpasid and cache_inv as both of them use the
> > iommu uapi definition. While the pasid alloc/free operation doesn't.
> > It would be weird to put all of them together. So pasid alloc/free
> > may have a separate ioctl. It would look as below. Does this direction
> > look good per your opinion?
> >
> > ioctl #22: VFIO_IOMMU_PASID_REQUEST
> > /**
> > * @pasid: used to return the pasid alloc result when flags == ALLOC_PASID
> > * specify a pasid to be freed when flags == FREE_PASID
> > * @range: specify the allocation range when flags == ALLOC_PASID
> > */
> > struct vfio_iommu_pasid_request {
> > __u32 argsz;
> > #define VFIO_IOMMU_ALLOC_PASID (1 << 0)
> > #define VFIO_IOMMU_FREE_PASID (1 << 1)
> > __u32 flags;
> > __u32 pasid;
> > struct {
> > __u32 min;
> > __u32 max;
> > } range;
> > };
>
> Can't the ioctl return the pasid value on alloc (like GET_DEVICE_FD)?
Yep, I think you mentioned it before. At that time, I believed it would be
better to return the result via a __u32 buffer so as to make full use of
the 32 bits. But it looks like it doesn't make much difference. I'll follow
your suggestion.
> Would it be useful to support freeing a range of pasids? If so then we
> could simply use range for both, ie. allocate a pasid from this range
> and return it, or free all pasids in this range? vfio already needs to
> track pasids to free them on release, so presumably this is something
> we could support easily.
Yes, I think it is a nice thing. Then I can remove the @pasid field;
will do it.
> > ioctl #23: VFIO_IOMMU_NESTING_OP
> > struct vfio_iommu_type1_nesting_op {
> > __u32 argsz;
> > __u32 flags;
> > __u32 op;
> > __u8 data[];
> > };
>
> data only has 4-byte alignment, I think we really want it at an 8-byte
> alignment. This is why I embedded the "op" into the flag for
> DEVICE_FEATURE. Thanks,
Got it. I may also merge the op into the flags (maybe using the lower 16
bits for the op).
Thanks,
Yi Liu
> Alex
>
> >
> > /* Nesting Ops */
> > #define VFIO_IOMMU_NESTING_OP_BIND_PGTBL 0
> > #define VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL 1
> > #define VFIO_IOMMU_NESTING_OP_CACHE_INVLD 2
> >
> > Thanks,
> > Yi Liu
> >