[HACK. will fix in v2]
IOVA range is critical info for userspace to manage DMA for an I/O address
space. This patch reports the valid iova range info of a given device.
Due to aforementioned hack, this info comes from the hacked vfio type1
driver. To follow the same format in vfio, we also introduce a cap chain
format in IOMMU_DEVICE_GET_INFO to carry the iova range info.
Signed-off-by: Liu Yi L <[email protected]>
---
drivers/iommu/iommu.c | 2 ++
drivers/iommu/iommufd/iommufd.c | 41 +++++++++++++++++++++++++++-
drivers/vfio/vfio_iommu_type1.c | 47 ++++++++++++++++++++++++++++++---
include/linux/vfio.h | 2 ++
include/uapi/linux/iommu.h | 3 +++
5 files changed, 90 insertions(+), 5 deletions(-)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index b6178997aef1..44bba346ab52 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2755,6 +2755,7 @@ void iommu_get_resv_regions(struct device *dev, struct list_head *list)
if (ops && ops->get_resv_regions)
ops->get_resv_regions(dev, list);
}
+EXPORT_SYMBOL_GPL(iommu_get_resv_regions);
void iommu_put_resv_regions(struct device *dev, struct list_head *list)
{
@@ -2763,6 +2764,7 @@ void iommu_put_resv_regions(struct device *dev, struct list_head *list)
if (ops && ops->put_resv_regions)
ops->put_resv_regions(dev, list);
}
+EXPORT_SYMBOL_GPL(iommu_put_resv_regions);
/**
* generic_iommu_put_resv_regions - Reserved region driver helper
diff --git a/drivers/iommu/iommufd/iommufd.c b/drivers/iommu/iommufd/iommufd.c
index 25373a0e037a..cbf5e30062a6 100644
--- a/drivers/iommu/iommufd/iommufd.c
+++ b/drivers/iommu/iommufd/iommufd.c
@@ -19,6 +19,7 @@
#include <linux/iommufd.h>
#include <linux/xarray.h>
#include <asm-generic/bug.h>
+#include <linux/vfio.h>
/* Per iommufd */
struct iommufd_ctx {
@@ -298,6 +299,38 @@ iommu_find_device_from_cookie(struct iommufd_ctx *ictx, u64 dev_cookie)
return dev;
}
+static int iommu_device_add_cap_chain(struct device *dev, unsigned long arg,
+ struct iommu_device_info *info)
+{
+ struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+ int ret;
+
+ ret = vfio_device_add_iova_cap(dev, &caps);
+ if (ret)
+ return ret;
+
+ if (caps.size) {
+ info->flags |= IOMMU_DEVICE_INFO_CAPS;
+
+ if (info->argsz < sizeof(*info) + caps.size) {
+ info->argsz = sizeof(*info) + caps.size;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(*info));
+ if (copy_to_user((void __user *)arg +
+ sizeof(*info), caps.buf,
+ caps.size)) {
+ kfree(caps.buf);
+ info->flags &= ~IOMMU_DEVICE_INFO_CAPS;
+ return -EFAULT;
+ }
+ info->cap_offset = sizeof(*info);
+ }
+
+ kfree(caps.buf);
+ }
+ return 0;
+}
+
static void iommu_device_build_info(struct device *dev,
struct iommu_device_info *info)
{
@@ -324,8 +357,9 @@ static int iommufd_get_device_info(struct iommufd_ctx *ictx,
struct iommu_device_info info;
unsigned long minsz;
struct device *dev;
+ int ret;
- minsz = offsetofend(struct iommu_device_info, addr_width);
+ minsz = offsetofend(struct iommu_device_info, cap_offset);
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
@@ -341,6 +375,11 @@ static int iommufd_get_device_info(struct iommufd_ctx *ictx,
iommu_device_build_info(dev, &info);
+ info.cap_offset = 0;
+ ret = iommu_device_add_cap_chain(dev, arg, &info);
+ if (ret)
+ pr_info_ratelimited("No cap chain added, error %d\n", ret);
+
return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c1c6bc803d94..28c1699aed6b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2963,15 +2963,15 @@ static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
return 0;
}
-static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
- struct vfio_info_cap *caps)
+static int vfio_iova_list_build_caps(struct list_head *iova_list,
+ struct vfio_info_cap *caps)
{
struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
struct vfio_iova *iova;
size_t size;
int iovas = 0, i = 0, ret;
- list_for_each_entry(iova, &iommu->iova_list, list)
+ list_for_each_entry(iova, iova_list, list)
iovas++;
if (!iovas) {
@@ -2990,7 +2990,7 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
cap_iovas->nr_iovas = iovas;
- list_for_each_entry(iova, &iommu->iova_list, list) {
+ list_for_each_entry(iova, iova_list, list) {
cap_iovas->iova_ranges[i].start = iova->start;
cap_iovas->iova_ranges[i].end = iova->end;
i++;
@@ -3002,6 +3002,45 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
return ret;
}
+static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
+ struct vfio_info_cap *caps)
+{
+ return vfio_iova_list_build_caps(&iommu->iova_list, caps);
+}
+
+/* HACK: called by /dev/iommu core to build iova range cap for a device */
+int vfio_device_add_iova_cap(struct device *dev, struct vfio_info_cap *caps)
+{
+ u64 awidth;
+ dma_addr_t aperture_end;
+ LIST_HEAD(iova);
+ LIST_HEAD(dev_resv_regions);
+ int ret;
+
+ ret = iommu_device_get_info(dev, IOMMU_DEV_INFO_ADDR_WIDTH, &awidth);
+ if (ret)
+ return ret;
+
+ /* FIXME: needs to use geometry info reported by iommu core. */
+ aperture_end = ((dma_addr_t)1) << awidth;
+
+ ret = vfio_iommu_iova_insert(&iova, 0, aperture_end);
+ if (ret)
+ return ret;
+
+ iommu_get_resv_regions(dev, &dev_resv_regions);
+ ret = vfio_iommu_resv_exclude(&iova, &dev_resv_regions);
+ if (ret)
+ goto out;
+
+ ret = vfio_iova_list_build_caps(&iova, caps);
+out:
+ vfio_iommu_iova_free(&iova);
+ iommu_put_resv_regions(dev, &dev_resv_regions);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vfio_device_add_iova_cap);
+
static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
struct vfio_info_cap *caps)
{
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index d904ee5a68cc..605b8e828be4 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -212,6 +212,8 @@ extern int vfio_info_add_capability(struct vfio_info_cap *caps,
extern int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
int num_irqs, int max_irq_type,
size_t *data_size);
+extern int vfio_device_add_iova_cap(struct device *dev,
+ struct vfio_info_cap *caps);
struct pci_dev;
#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index 49731be71213..f408ad3c8ade 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -68,6 +68,7 @@
* +---------------+------------+
* ...
* @addr_width: the address width of supported I/O address spaces.
+ * @cap_offset: Offset within info struct of first cap
*
* Availability: after device is bound to iommufd
*/
@@ -77,9 +78,11 @@ struct iommu_device_info {
#define IOMMU_DEVICE_INFO_ENFORCE_SNOOP (1 << 0) /* IOMMU enforced snoop */
#define IOMMU_DEVICE_INFO_PGSIZES (1 << 1) /* supported page sizes */
#define IOMMU_DEVICE_INFO_ADDR_WIDTH (1 << 2) /* addr_wdith field valid */
+#define IOMMU_DEVICE_INFO_CAPS (1 << 3) /* info supports cap chain */
__u64 dev_cookie;
__u64 pgsize_bitmap;
__u32 addr_width;
+ __u32 cap_offset;
};
#define IOMMU_DEVICE_GET_INFO _IO(IOMMU_TYPE, IOMMU_BASE + 1)
--
2.25.1
On Sun, Sep 19, 2021 at 02:38:45PM +0800, Liu Yi L wrote:
> [HACK. will fix in v2]
>
> IOVA range is critical info for userspace to manage DMA for an I/O address
> space. This patch reports the valid iova range info of a given device.
>
> Due to aforementioned hack, this info comes from the hacked vfio type1
> driver. To follow the same format in vfio, we also introduce a cap chain
> format in IOMMU_DEVICE_GET_INFO to carry the iova range info.
[...]
> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> index 49731be71213..f408ad3c8ade 100644
> --- a/include/uapi/linux/iommu.h
> +++ b/include/uapi/linux/iommu.h
> @@ -68,6 +68,7 @@
> * +---------------+------------+
> * ...
> * @addr_width: the address width of supported I/O address spaces.
> + * @cap_offset: Offset within info struct of first cap
> *
> * Availability: after device is bound to iommufd
> */
> @@ -77,9 +78,11 @@ struct iommu_device_info {
> #define IOMMU_DEVICE_INFO_ENFORCE_SNOOP (1 << 0) /* IOMMU enforced snoop */
> #define IOMMU_DEVICE_INFO_PGSIZES (1 << 1) /* supported page sizes */
> #define IOMMU_DEVICE_INFO_ADDR_WIDTH (1 << 2) /* addr_wdith field valid */
> +#define IOMMU_DEVICE_INFO_CAPS (1 << 3) /* info supports cap chain */
> __u64 dev_cookie;
> __u64 pgsize_bitmap;
> __u32 addr_width;
> + __u32 cap_offset;
We can also add vendor-specific page table and PASID table properties as
capabilities, otherwise we'll need giant unions in the iommu_device_info
struct. That made me wonder whether pgsize and addr_width should also be
separate capabilities for consistency, but this way might be good enough.
There won't be many more generic capabilities. I have "output address
width" and "PASID width", the rest is specific to Arm and SMMU table
formats.
Thanks,
Jean
> };
>
> #define IOMMU_DEVICE_GET_INFO _IO(IOMMU_TYPE, IOMMU_BASE + 1)
> --
> 2.25.1
>
> From: Jean-Philippe Brucker <[email protected]>
> Sent: Wednesday, September 22, 2021 10:49 PM
>
> On Sun, Sep 19, 2021 at 02:38:45PM +0800, Liu Yi L wrote:
> > [HACK. will fix in v2]
> >
> > IOVA range is critical info for userspace to manage DMA for an I/O address
> > space. This patch reports the valid iova range info of a given device.
> >
> > Due to aforementioned hack, this info comes from the hacked vfio type1
> > driver. To follow the same format in vfio, we also introduce a cap chain
> > format in IOMMU_DEVICE_GET_INFO to carry the iova range info.
> [...]
> > diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> > index 49731be71213..f408ad3c8ade 100644
> > --- a/include/uapi/linux/iommu.h
> > +++ b/include/uapi/linux/iommu.h
> > @@ -68,6 +68,7 @@
> > * +---------------+------------+
> > * ...
> > * @addr_width: the address width of supported I/O address spaces.
> > + * @cap_offset: Offset within info struct of first cap
> > *
> > * Availability: after device is bound to iommufd
> > */
> > @@ -77,9 +78,11 @@ struct iommu_device_info {
> > #define IOMMU_DEVICE_INFO_ENFORCE_SNOOP (1 << 0) /* IOMMU
> enforced snoop */
> > #define IOMMU_DEVICE_INFO_PGSIZES (1 << 1) /* supported page
> sizes */
> > #define IOMMU_DEVICE_INFO_ADDR_WIDTH (1 << 2) /*
> addr_wdith field valid */
> > +#define IOMMU_DEVICE_INFO_CAPS (1 << 3) /* info
> supports cap chain */
> > __u64 dev_cookie;
> > __u64 pgsize_bitmap;
> > __u32 addr_width;
> > + __u32 cap_offset;
>
> We can also add vendor-specific page table and PASID table properties as
> capabilities, otherwise we'll need giant unions in the iommu_device_info
> struct. That made me wonder whether pgsize and addr_width should also
> be
> separate capabilities for consistency, but this way might be good enough.
> There won't be many more generic capabilities. I have "output address
> width"
what do you mean by "output address width"? Is it the output address
of stage-1 translation?
>
and "PASID width", the rest is specific to Arm and SMMU table
> formats.
When coming to nested translation support, the stage-1 related info are
likely to be vendor-specific, and will be reported in cap chain.
Regards,
Yi Liu
> Thanks,
> Jean
>
> > };
> >
> > #define IOMMU_DEVICE_GET_INFO _IO(IOMMU_TYPE, IOMMU_BASE +
> 1)
> > --
> > 2.25.1
> >
On Wed, Sep 29, 2021 at 10:44:01AM +0000, Liu, Yi L wrote:
> > From: Jean-Philippe Brucker <[email protected]>
> > Sent: Wednesday, September 22, 2021 10:49 PM
> >
> > On Sun, Sep 19, 2021 at 02:38:45PM +0800, Liu Yi L wrote:
> > > [HACK. will fix in v2]
> > >
> > > IOVA range is critical info for userspace to manage DMA for an I/O address
> > > space. This patch reports the valid iova range info of a given device.
> > >
> > > Due to aforementioned hack, this info comes from the hacked vfio type1
> > > driver. To follow the same format in vfio, we also introduce a cap chain
> > > format in IOMMU_DEVICE_GET_INFO to carry the iova range info.
> > [...]
> > > diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> > > index 49731be71213..f408ad3c8ade 100644
> > > --- a/include/uapi/linux/iommu.h
> > > +++ b/include/uapi/linux/iommu.h
> > > @@ -68,6 +68,7 @@
> > > * +---------------+------------+
> > > * ...
> > > * @addr_width: the address width of supported I/O address spaces.
> > > + * @cap_offset: Offset within info struct of first cap
> > > *
> > > * Availability: after device is bound to iommufd
> > > */
> > > @@ -77,9 +78,11 @@ struct iommu_device_info {
> > > #define IOMMU_DEVICE_INFO_ENFORCE_SNOOP (1 << 0) /* IOMMU
> > enforced snoop */
> > > #define IOMMU_DEVICE_INFO_PGSIZES (1 << 1) /* supported page
> > sizes */
> > > #define IOMMU_DEVICE_INFO_ADDR_WIDTH (1 << 2) /*
> > addr_wdith field valid */
> > > +#define IOMMU_DEVICE_INFO_CAPS (1 << 3) /* info
> > supports cap chain */
> > > __u64 dev_cookie;
> > > __u64 pgsize_bitmap;
> > > __u32 addr_width;
> > > + __u32 cap_offset;
> >
> > We can also add vendor-specific page table and PASID table properties as
> > capabilities, otherwise we'll need giant unions in the iommu_device_info
> > struct. That made me wonder whether pgsize and addr_width should also
> > be
> > separate capabilities for consistency, but this way might be good enough.
> > There won't be many more generic capabilities. I have "output address
> > width"
>
> what do you mean by "output address width"? Is it the output address
> of stage-1 translation?
Yes, so the guest knows the size of GPA it can write into the page table.
For Arm SMMU the GPA size is determined by both the SMMU implementation
and the host kernel configuration. But maybe that could also be
vendor-specific, if other architectures don't need to communicate it.
> >
> and "PASID width", the rest is specific to Arm and SMMU table
> > formats.
>
> When coming to nested translation support, the stage-1 related info are
> likely to be vendor-specific, and will be reported in cap chain.
Agreed
Thanks,
Jean
On Wed, Sep 29, 2021 at 01:07:56PM +0100, Jean-Philippe Brucker wrote:
> Yes, so the guest knows the size of GPA it can write into the page table.
> For Arm SMMU the GPA size is determined by both the SMMU implementation
> and the host kernel configuration. But maybe that could also be
> vendor-specific, if other architectures don't need to communicate it.
I think there should be a dedicated query to return HW specific
parmaters for a user page table format. Somehow I think there will be
a lot of these.
So 'user page table format arm smmu v1' can be queried to return its
own unique struct that has everything needed to operate that format of
page table.
Userspace already needs to know how to form that specific HW PTEs,
so processing a HW specific query is not a problem.
Jason