2021-09-19 12:15:22

by Yi Liu

[permalink] [raw]
Subject: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

This patch adds VFIO_DEVICE_BIND_IOMMUFD for userspace to bind the vfio
device to an iommufd. No VFIO_DEVICE_UNBIND_IOMMUFD interface is provided
because it's implicitly done when the device fd is closed.

In concept a vfio device can be bound to multiple iommufds, each hosting
a subset of I/O address spaces attached by this device. However as a
starting point (matching current vfio), only one I/O address space is
supported per vfio device. It implies one device can only be attached
to one iommufd at this point.

Signed-off-by: Liu Yi L <[email protected]>
---
drivers/vfio/pci/Kconfig | 1 +
drivers/vfio/pci/vfio_pci.c | 72 ++++++++++++++++++++++++++++-
drivers/vfio/pci/vfio_pci_private.h | 8 ++++
include/uapi/linux/vfio.h | 30 ++++++++++++
4 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 5e2e1b9a9fd3..3abfb098b4dc 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -5,6 +5,7 @@ config VFIO_PCI
depends on MMU
select VFIO_VIRQFD
select IRQ_BYPASS_MANAGER
+ select IOMMUFD
help
Support for the PCI VFIO bus driver. This is required to make
use of PCI drivers using the VFIO framework.
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 145addde983b..20006bb66430 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -552,6 +552,16 @@ static void vfio_pci_release(struct vfio_device *core_vdev)
vdev->req_trigger = NULL;
}
mutex_unlock(&vdev->igate);
+
+ mutex_lock(&vdev->videv_lock);
+ if (vdev->videv) {
+ struct vfio_iommufd_device *videv = vdev->videv;
+
+ vdev->videv = NULL;
+ iommufd_unbind_device(videv->idev);
+ kfree(videv);
+ }
+ mutex_unlock(&vdev->videv_lock);
}

mutex_unlock(&vdev->reflck->lock);
@@ -780,7 +790,66 @@ static long vfio_pci_ioctl(struct vfio_device *core_vdev,
container_of(core_vdev, struct vfio_pci_device, vdev);
unsigned long minsz;

- if (cmd == VFIO_DEVICE_GET_INFO) {
+ if (cmd == VFIO_DEVICE_BIND_IOMMUFD) {
+ struct vfio_device_iommu_bind_data bind_data;
+ unsigned long minsz;
+ struct iommufd_device *idev;
+ struct vfio_iommufd_device *videv;
+
+ /*
+ * Reject the request if the device is already opened and
+ * attached to a container.
+ */
+ if (vfio_device_in_container(core_vdev))
+ return -ENOTTY;
+
+ minsz = offsetofend(struct vfio_device_iommu_bind_data, dev_cookie);
+
+ if (copy_from_user(&bind_data, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (bind_data.argsz < minsz ||
+ bind_data.flags || bind_data.iommu_fd < 0)
+ return -EINVAL;
+
+ mutex_lock(&vdev->videv_lock);
+ /*
+ * Allow only one iommufd per device until multiple
+ * address spaces (e.g. vSVA) support is introduced
+ * in the future.
+ */
+ if (vdev->videv) {
+ mutex_unlock(&vdev->videv_lock);
+ return -EBUSY;
+ }
+
+ idev = iommufd_bind_device(bind_data.iommu_fd,
+ &vdev->pdev->dev,
+ bind_data.dev_cookie);
+ if (IS_ERR(idev)) {
+ mutex_unlock(&vdev->videv_lock);
+ return PTR_ERR(idev);
+ }
+
+ videv = kzalloc(sizeof(*videv), GFP_KERNEL);
+ if (!videv) {
+ iommufd_unbind_device(idev);
+ mutex_unlock(&vdev->videv_lock);
+ return -ENOMEM;
+ }
+ videv->idev = idev;
+ videv->iommu_fd = bind_data.iommu_fd;
+ /*
+ * A security context has been established. Unblock
+ * user access.
+ */
+ if (atomic_read(&vdev->block_access))
+ atomic_set(&vdev->block_access, 0);
+ vdev->videv = videv;
+ mutex_unlock(&vdev->videv_lock);
+
+ return 0;
+ } else if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;
struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
unsigned long capsz;
@@ -2031,6 +2100,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
mutex_init(&vdev->vma_lock);
INIT_LIST_HEAD(&vdev->vma_list);
init_rwsem(&vdev->memory_lock);
+ mutex_init(&vdev->videv_lock);

ret = vfio_pci_reflck_attach(vdev);
if (ret)
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index f12012e30b53..bd784accac35 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/uuid.h>
#include <linux/notifier.h>
+#include <linux/iommufd.h>

#ifndef VFIO_PCI_PRIVATE_H
#define VFIO_PCI_PRIVATE_H
@@ -99,6 +100,11 @@ struct vfio_pci_mmap_vma {
struct list_head vma_next;
};

+struct vfio_iommufd_device {
+ struct iommufd_device *idev;
+ int iommu_fd;
+};
+
struct vfio_pci_device {
struct vfio_device vdev;
struct pci_dev *pdev;
@@ -144,6 +150,8 @@ struct vfio_pci_device {
struct list_head vma_list;
struct rw_semaphore memory_lock;
atomic_t block_access;
+ struct mutex videv_lock;
+ struct vfio_iommufd_device *videv;
};

#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ef33ea002b0b..c902abd60339 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -190,6 +190,36 @@ struct vfio_group_status {

/* --------------- IOCTLs for DEVICE file descriptors --------------- */

+/*
+ * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
+ * struct vfio_device_iommu_bind_data)
+ *
+ * Bind a vfio_device to the specified iommufd
+ *
+ * The user should provide a device cookie when calling this ioctl. The
+ * cookie is later used in iommufd for capability query, iotlb invalidation
+ * and I/O fault handling.
+ *
+ * User is not allowed to access the device before the binding operation
+ * is completed.
+ *
+ * Unbind is automatically conducted when device fd is closed.
+ *
+ * Input parameters:
+ * - iommu_fd;
+ * - dev_cookie;
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_iommu_bind_data {
+ __u32 argsz;
+ __u32 flags;
+ __s32 iommu_fd;
+ __u64 dev_cookie;
+};
+
+#define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 19)
+
/**
* VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
* struct vfio_device_info)
--
2.25.1


2021-09-21 18:43:34

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

On Sun, Sep 19, 2021 at 02:38:36PM +0800, Liu Yi L wrote:
> This patch adds VFIO_DEVICE_BIND_IOMMUFD for userspace to bind the vfio
> device to an iommufd. No VFIO_DEVICE_UNBIND_IOMMUFD interface is provided
> because it's implicitly done when the device fd is closed.
>
> In concept a vfio device can be bound to multiple iommufds, each hosting
> a subset of I/O address spaces attached by this device. However as a
> starting point (matching current vfio), only one I/O address space is
> supported per vfio device. It implies one device can only be attached
> to one iommufd at this point.
>
> Signed-off-by: Liu Yi L <[email protected]>
> drivers/vfio/pci/Kconfig | 1 +
> drivers/vfio/pci/vfio_pci.c | 72 ++++++++++++++++++++++++++++-
> drivers/vfio/pci/vfio_pci_private.h | 8 ++++
> include/uapi/linux/vfio.h | 30 ++++++++++++
> 4 files changed, 110 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 5e2e1b9a9fd3..3abfb098b4dc 100644
> +++ b/drivers/vfio/pci/Kconfig
> @@ -5,6 +5,7 @@ config VFIO_PCI
> depends on MMU
> select VFIO_VIRQFD
> select IRQ_BYPASS_MANAGER
> + select IOMMUFD
> help
> Support for the PCI VFIO bus driver. This is required to make
> use of PCI drivers using the VFIO framework.
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 145addde983b..20006bb66430 100644
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -552,6 +552,16 @@ static void vfio_pci_release(struct vfio_device *core_vdev)
> vdev->req_trigger = NULL;
> }
> mutex_unlock(&vdev->igate);
> +
> + mutex_lock(&vdev->videv_lock);
> + if (vdev->videv) {
> + struct vfio_iommufd_device *videv = vdev->videv;
> +
> + vdev->videv = NULL;
> + iommufd_unbind_device(videv->idev);
> + kfree(videv);
> + }
> + mutex_unlock(&vdev->videv_lock);
> }
>
> mutex_unlock(&vdev->reflck->lock);
> @@ -780,7 +790,66 @@ static long vfio_pci_ioctl(struct vfio_device *core_vdev,
> container_of(core_vdev, struct vfio_pci_device, vdev);
> unsigned long minsz;
>
> - if (cmd == VFIO_DEVICE_GET_INFO) {
> + if (cmd == VFIO_DEVICE_BIND_IOMMUFD) {

Choosing to implement this through the ioctl multiplexor is what is
causing so much ugly gyration in the previous patches

This should be a straightforward new function and ops:

struct iommufd_device *vfio_pci_bind_iommufd(struct vfio_device *)
{
iommu_dev = iommufd_bind_device(bind_data.iommu_fd,
&vdev->pdev->dev,
bind_data.dev_cookie);
if (!iommu_dev) return ERR
vdev->iommu_dev = iommu_dev;
}
static const struct vfio_device_ops vfio_pci_ops = {
.bind_iommufd = &*vfio_pci_bind_iommufd

If you do the other stuff I said then you'll notice that the
iommufd_bind_device() will provide automatic exclusivity.

The thread that sees ops->bind_device succeed will know it is the only
thread that can see that (by definition, the iommu enable user stuff
has to be exclusive and race free) thus it can go ahead and store the
iommu pointer.

The other half of the problem '&vdev->block_access' is solved by
manipulating the filp->f_ops. Start with a fops that can ONLY call the
above op. When the above op succeeds switch the fops to the normal
full ops.

The same flow happens when the group fd spawns the device fd, just
parts of iommufd_bind_device are open coded into the vfio code, but the
whole flow and sequence should be the same.

> + /*
> + * Reject the request if the device is already opened and
> + * attached to a container.
> + */
> + if (vfio_device_in_container(core_vdev))
> + return -ENOTTY;

This is wrongly locked

> +
> + minsz = offsetofend(struct vfio_device_iommu_bind_data, dev_cookie);
> +
> + if (copy_from_user(&bind_data, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (bind_data.argsz < minsz ||
> + bind_data.flags || bind_data.iommu_fd < 0)
> + return -EINVAL;
> +
> + mutex_lock(&vdev->videv_lock);
> + /*
> + * Allow only one iommufd per device until multiple
> + * address spaces (e.g. vSVA) support is introduced
> + * in the future.
> + */
> + if (vdev->videv) {
> + mutex_unlock(&vdev->videv_lock);
> + return -EBUSY;
> + }
> +
> + idev = iommufd_bind_device(bind_data.iommu_fd,
> + &vdev->pdev->dev,
> + bind_data.dev_cookie);
> + if (IS_ERR(idev)) {
> + mutex_unlock(&vdev->videv_lock);
> + return PTR_ERR(idev);
> + }
> +
> + videv = kzalloc(sizeof(*videv), GFP_KERNEL);
> + if (!videv) {
> + iommufd_unbind_device(idev);
> + mutex_unlock(&vdev->videv_lock);
> + return -ENOMEM;
> + }
> + videv->idev = idev;
> + videv->iommu_fd = bind_data.iommu_fd;

No need for more memory, a struct vfio_device can be attached to a
single iommu context. If idev then the context and all the other
information is valid.

> + if (atomic_read(&vdev->block_access))
> + atomic_set(&vdev->block_access, 0);

I'm sure I'll tell you this is all wrongly locked too if I look
closely.

> +/*
> + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
> + * struct vfio_device_iommu_bind_data)
> + *
> + * Bind a vfio_device to the specified iommufd
> + *
> + * The user should provide a device cookie when calling this ioctl. The
> + * cookie is later used in iommufd for capability query, iotlb invalidation
> + * and I/O fault handling.
> + *
> + * User is not allowed to access the device before the binding operation
> + * is completed.
> + *
> + * Unbind is automatically conducted when device fd is closed.
> + *
> + * Input parameters:
> + * - iommu_fd;
> + * - dev_cookie;
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_iommu_bind_data {
> + __u32 argsz;
> + __u32 flags;
> + __s32 iommu_fd;
> + __u64 dev_cookie;

Missing explicit padding

Always use __aligned_u64 in uapi headers, fix all the patches.

Jason

2021-09-22 21:05:59

by Alex Williamson

[permalink] [raw]
Subject: Re: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

On Tue, 21 Sep 2021 14:29:39 -0300
Jason Gunthorpe <[email protected]> wrote:

> On Sun, Sep 19, 2021 at 02:38:36PM +0800, Liu Yi L wrote:
> > +struct vfio_device_iommu_bind_data {
> > + __u32 argsz;
> > + __u32 flags;
> > + __s32 iommu_fd;
> > + __u64 dev_cookie;
>
> Missing explicit padding
>
> Always use __aligned_u64 in uapi headers, fix all the patches.

We don't need padding or explicit alignment if we just swap the order
of iommu_fd and dev_cookie. Thanks,

Alex

2021-09-22 23:03:40

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

On Wed, Sep 22, 2021 at 03:01:01PM -0600, Alex Williamson wrote:
> On Tue, 21 Sep 2021 14:29:39 -0300
> Jason Gunthorpe <[email protected]> wrote:
>
> > On Sun, Sep 19, 2021 at 02:38:36PM +0800, Liu Yi L wrote:
> > > +struct vfio_device_iommu_bind_data {
> > > + __u32 argsz;
> > > + __u32 flags;
> > > + __s32 iommu_fd;
> > > + __u64 dev_cookie;
> >
> > Missing explicit padding
> >
> > Always use __aligned_u64 in uapi headers, fix all the patches.
>
> We don't need padding or explicit alignment if we just swap the order
> of iommu_fd and dev_cookie. Thanks,

Yes, the padding should all be checked and minimized

But it is always good practice to always use __aligned_u64 in the uapi
headers just in case someone messes it up someday - it prevents small
mistakes from becoming an ABI mess.

Jason

2021-09-29 06:43:31

by Tian, Kevin

[permalink] [raw]
Subject: RE: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

> From: David Gibson <[email protected]>
> Sent: Wednesday, September 29, 2021 2:01 PM
>
> On Sun, Sep 19, 2021 at 02:38:36PM +0800, Liu Yi L wrote:
> > This patch adds VFIO_DEVICE_BIND_IOMMUFD for userspace to bind the
> vfio
> > device to an iommufd. No VFIO_DEVICE_UNBIND_IOMMUFD interface is
> provided
> > because it's implicitly done when the device fd is closed.
> >
> > In concept a vfio device can be bound to multiple iommufds, each hosting
> > a subset of I/O address spaces attached by this device.
>
> I really feel like this many<->many mapping between devices is going
> to be super-confusing, and therefore make it really hard to be
> confident we have all the rules right for proper isolation.

Based on new discussion on group ownership part (patch06), I feel this
many<->many relationship will disappear. The context fd (either container
or iommufd) will uniquely mark the ownership on a physical device and
its group. With this design it's impractical to have one device bound
to multiple iommufds. Actually I don't think this is a compelling usage
in reality. The previous rationale was that there was no need to impose such
a restriction without a special reason... and now we have a reason. :-)

Jason, are you OK with this simplification?

>
> That's why I was suggesting a concept like endpoints, to break this
> into two many<->one relationships. I'm ok if that isn't visible in
> the user API, but I think this is going to be really hard to keep
> track of if it isn't explicit somewhere in the internals.
>

I think this endpoint concept is represented by ioas_device_info in
patch14:

+/*
+ * An ioas_device_info object is created per each successful attaching
+ * request. A list of objects are maintained per ioas when the address
+ * space is shared by multiple devices.
+ */
+struct ioas_device_info {
+ struct iommufd_device *idev;
+ struct list_head next;
};

currently it's a 1:1 mapping between this object and iommufd_device,
because no pasid support yet.

We can rename it to struct ioas_endpoint if it makes you feel better.

Thanks
Kevin

2021-09-29 10:37:14

by David Gibson

[permalink] [raw]
Subject: Re: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

On Sun, Sep 19, 2021 at 02:38:36PM +0800, Liu Yi L wrote:
> This patch adds VFIO_DEVICE_BIND_IOMMUFD for userspace to bind the vfio
> device to an iommufd. No VFIO_DEVICE_UNBIND_IOMMUFD interface is provided
> because it's implicitly done when the device fd is closed.
>
> In concept a vfio device can be bound to multiple iommufds, each hosting
> a subset of I/O address spaces attached by this device.

I really feel like this many<->many mapping between devices is going
to be super-confusing, and therefore make it really hard to be
confident we have all the rules right for proper isolation.

That's why I was suggesting a concept like endpoints, to break this
into two many<->one relationships. I'm ok if that isn't visible in
the user API, but I think this is going to be really hard to keep
track of if it isn't explicit somewhere in the internals.

> However as a
> starting point (matching current vfio), only one I/O address space is
> supported per vfio device. It implies one device can only be attached
> to one iommufd at this point.
>
> Signed-off-by: Liu Yi L <[email protected]>
> ---
> drivers/vfio/pci/Kconfig | 1 +
> drivers/vfio/pci/vfio_pci.c | 72 ++++++++++++++++++++++++++++-
> drivers/vfio/pci/vfio_pci_private.h | 8 ++++
> include/uapi/linux/vfio.h | 30 ++++++++++++
> 4 files changed, 110 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 5e2e1b9a9fd3..3abfb098b4dc 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -5,6 +5,7 @@ config VFIO_PCI
> depends on MMU
> select VFIO_VIRQFD
> select IRQ_BYPASS_MANAGER
> + select IOMMUFD
> help
> Support for the PCI VFIO bus driver. This is required to make
> use of PCI drivers using the VFIO framework.
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 145addde983b..20006bb66430 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -552,6 +552,16 @@ static void vfio_pci_release(struct vfio_device *core_vdev)
> vdev->req_trigger = NULL;
> }
> mutex_unlock(&vdev->igate);
> +
> + mutex_lock(&vdev->videv_lock);
> + if (vdev->videv) {
> + struct vfio_iommufd_device *videv = vdev->videv;
> +
> + vdev->videv = NULL;
> + iommufd_unbind_device(videv->idev);
> + kfree(videv);
> + }
> + mutex_unlock(&vdev->videv_lock);
> }
>
> mutex_unlock(&vdev->reflck->lock);
> @@ -780,7 +790,66 @@ static long vfio_pci_ioctl(struct vfio_device *core_vdev,
> container_of(core_vdev, struct vfio_pci_device, vdev);
> unsigned long minsz;
>
> - if (cmd == VFIO_DEVICE_GET_INFO) {
> + if (cmd == VFIO_DEVICE_BIND_IOMMUFD) {
> + struct vfio_device_iommu_bind_data bind_data;
> + unsigned long minsz;
> + struct iommufd_device *idev;
> + struct vfio_iommufd_device *videv;
> +
> + /*
> + * Reject the request if the device is already opened and
> + * attached to a container.
> + */
> + if (vfio_device_in_container(core_vdev))

Usually one would do argument sanity checks before checks that
actually depend on machine state.

> + return -ENOTTY;

This doesn't seem like the right error code. It's a perfectly valid
operation for this device - just not available right now.

> +
> + minsz = offsetofend(struct vfio_device_iommu_bind_data, dev_cookie);
> +
> + if (copy_from_user(&bind_data, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (bind_data.argsz < minsz ||
> + bind_data.flags || bind_data.iommu_fd < 0)
> + return -EINVAL;
> +
> + mutex_lock(&vdev->videv_lock);
> + /*
> + * Allow only one iommufd per device until multiple
> + * address spaces (e.g. vSVA) support is introduced
> + * in the future.
> + */
> + if (vdev->videv) {
> + mutex_unlock(&vdev->videv_lock);
> + return -EBUSY;
> + }
> +
> + idev = iommufd_bind_device(bind_data.iommu_fd,
> + &vdev->pdev->dev,
> + bind_data.dev_cookie);
> + if (IS_ERR(idev)) {
> + mutex_unlock(&vdev->videv_lock);
> + return PTR_ERR(idev);
> + }
> +
> + videv = kzalloc(sizeof(*videv), GFP_KERNEL);
> + if (!videv) {
> + iommufd_unbind_device(idev);
> + mutex_unlock(&vdev->videv_lock);
> + return -ENOMEM;
> + }
> + videv->idev = idev;
> + videv->iommu_fd = bind_data.iommu_fd;
> + /*
> + * A security context has been established. Unblock
> + * user access.
> + */
> + if (atomic_read(&vdev->block_access))
> + atomic_set(&vdev->block_access, 0);
> + vdev->videv = videv;
> + mutex_unlock(&vdev->videv_lock);
> +
> + return 0;
> + } else if (cmd == VFIO_DEVICE_GET_INFO) {
> struct vfio_device_info info;
> struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> unsigned long capsz;
> @@ -2031,6 +2100,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> mutex_init(&vdev->vma_lock);
> INIT_LIST_HEAD(&vdev->vma_list);
> init_rwsem(&vdev->memory_lock);
> + mutex_init(&vdev->videv_lock);
>
> ret = vfio_pci_reflck_attach(vdev);
> if (ret)
> diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
> index f12012e30b53..bd784accac35 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -14,6 +14,7 @@
> #include <linux/types.h>
> #include <linux/uuid.h>
> #include <linux/notifier.h>
> +#include <linux/iommufd.h>
>
> #ifndef VFIO_PCI_PRIVATE_H
> #define VFIO_PCI_PRIVATE_H
> @@ -99,6 +100,11 @@ struct vfio_pci_mmap_vma {
> struct list_head vma_next;
> };
>
> +struct vfio_iommufd_device {
> + struct iommufd_device *idev;

Could this be embedded to avoid multiple layers of pointers?

> + int iommu_fd;
> +};
> +
> struct vfio_pci_device {
> struct vfio_device vdev;
> struct pci_dev *pdev;
> @@ -144,6 +150,8 @@ struct vfio_pci_device {
> struct list_head vma_list;
> struct rw_semaphore memory_lock;
> atomic_t block_access;
> + struct mutex videv_lock;
> + struct vfio_iommufd_device *videv;
> };
>
> #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index ef33ea002b0b..c902abd60339 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -190,6 +190,36 @@ struct vfio_group_status {
>
> /* --------------- IOCTLs for DEVICE file descriptors --------------- */
>
> +/*
> + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 19,
> + * struct vfio_device_iommu_bind_data)
> + *
> + * Bind a vfio_device to the specified iommufd
> + *
> + * The user should provide a device cookie when calling this ioctl. The
> + * cookie is later used in iommufd for capability query, iotlb invalidation
> + * and I/O fault handling.
> + *
> + * User is not allowed to access the device before the binding operation
> + * is completed.
> + *
> + * Unbind is automatically conducted when device fd is closed.
> + *
> + * Input parameters:
> + * - iommu_fd;
> + * - dev_cookie;
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_iommu_bind_data {
> + __u32 argsz;
> + __u32 flags;
> + __s32 iommu_fd;
> + __u64 dev_cookie;
> +};
> +
> +#define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 19)
> +
> /**
> * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
> * struct vfio_device_info)

--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


Attachments:
(No filename) (7.60 kB)
signature.asc (849.00 B)
Download all attachments

2021-09-29 12:30:50

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

On Wed, Sep 29, 2021 at 06:41:00AM +0000, Tian, Kevin wrote:
> > From: David Gibson <[email protected]>
> > Sent: Wednesday, September 29, 2021 2:01 PM
> >
> > On Sun, Sep 19, 2021 at 02:38:36PM +0800, Liu Yi L wrote:
> > > This patch adds VFIO_DEVICE_BIND_IOMMUFD for userspace to bind the
> > vfio
> > > device to an iommufd. No VFIO_DEVICE_UNBIND_IOMMUFD interface is
> > provided
> > > because it's implicitly done when the device fd is closed.
> > >
> > > In concept a vfio device can be bound to multiple iommufds, each hosting
> > > a subset of I/O address spaces attached by this device.
> >
> > I really feel like this many<->many mapping between devices is going
> > to be super-confusing, and therefore make it really hard to be
> > confident we have all the rules right for proper isolation.
>
> Based on new discussion on group ownership part (patch06), I feel this
> many<->many relationship will disappear. The context fd (either container
> or iommufd) will uniquely mark the ownership on a physical device and
> its group. With this design it's impractical to have one device bound
> to multiple iommufds.

That should be a requirement! We have no way to prove that two
iommufds are the same security domain, so devices/groups cannot be
shared.

That is why the API I suggested takes in a struct file to ID the user
security context. A group is accessible only from that single struct
file and no more.

If the first series goes the way I outlined then I think David's
concern about security is strongly solved as the IOMMU layer is
directly managing it with a very clear responsibility and semantic.

Jason

2021-09-29 23:51:34

by Tian, Kevin

[permalink] [raw]
Subject: RE: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, September 29, 2021 8:28 PM
>
> On Wed, Sep 29, 2021 at 06:41:00AM +0000, Tian, Kevin wrote:
> > > From: David Gibson <[email protected]>
> > > Sent: Wednesday, September 29, 2021 2:01 PM
> > >
> > > On Sun, Sep 19, 2021 at 02:38:36PM +0800, Liu Yi L wrote:
> > > > This patch adds VFIO_DEVICE_BIND_IOMMUFD for userspace to bind
> the
> > > vfio
> > > > device to an iommufd. No VFIO_DEVICE_UNBIND_IOMMUFD interface
> is
> > > provided
> > > > because it's implicitly done when the device fd is closed.
> > > >
> > > > In concept a vfio device can be bound to multiple iommufds, each
> hosting
> > > > a subset of I/O address spaces attached by this device.
> > >
> > > I really feel like this many<->many mapping between devices is going
> > > to be super-confusing, and therefore make it really hard to be
> > > confident we have all the rules right for proper isolation.
> >
> > Based on new discussion on group ownership part (patch06), I feel this
> > many<->many relationship will disappear. The context fd (either container
> > or iommufd) will uniquely mark the ownership on a physical device and
> > its group. With this design it's impractical to have one device bound
> > to multiple iommufds.
>
> That should be a requirement! We have no way to prove that two
> iommufds are the same security domain, so devices/groups cannot be
> shared.
>
> That is why the API I suggested takes in a struct file to ID the user
> security context. A group is accessible only from that single struct
> file and no more.
>
> If the first series goes the way I outlined then I think David's
> concern about security is strongly solved as the IOMMU layer is
> directly managing it with a very clear responsiblity and semantic.
>

Yes, this is also my understanding now.

2021-09-30 04:27:59

by David Gibson

[permalink] [raw]
Subject: Re: [RFC 08/20] vfio/pci: Add VFIO_DEVICE_BIND_IOMMUFD

On Wed, Sep 29, 2021 at 06:41:00AM +0000, Tian, Kevin wrote:
> > From: David Gibson <[email protected]>
> > Sent: Wednesday, September 29, 2021 2:01 PM
> >
> > On Sun, Sep 19, 2021 at 02:38:36PM +0800, Liu Yi L wrote:
> > > This patch adds VFIO_DEVICE_BIND_IOMMUFD for userspace to bind the
> > vfio
> > > device to an iommufd. No VFIO_DEVICE_UNBIND_IOMMUFD interface is
> > provided
> > > because it's implicitly done when the device fd is closed.
> > >
> > > In concept a vfio device can be bound to multiple iommufds, each hosting
> > > a subset of I/O address spaces attached by this device.
> >
> > I really feel like this many<->many mapping between devices is going
> > to be super-confusing, and therefore make it really hard to be
> > confident we have all the rules right for proper isolation.
>
> Based on new discussion on group ownership part (patch06), I feel this
> many<->many relationship will disappear. The context fd (either container
> or iommufd) will uniquely mark the ownership on a physical device and
> its group. With this design it's impractical to have one device bound
> to multiple iommufds. Actually I don't think this is a compelling usage
> in reality. The previous rationale was that no need to impose such restriction
> if no special reason... and now we have a reason. ????
>
> Jason, are you OK with this simplification?
>
> >
> > That's why I was suggesting a concept like endpoints, to break this
> > into two many<->one relationships. I'm ok if that isn't visible in
> > the user API, but I think this is going to be really hard to keep
> > track of if it isn't explicit somewhere in the internals.
> >
>
> I think this endpoint concept is represented by ioas_device_info in
> patch14:
>
> +/*
> + * An ioas_device_info object is created per each successful attaching
> + * request. A list of objects are maintained per ioas when the address
> + * space is shared by multiple devices.
> + */
> +struct ioas_device_info {
> + struct iommufd_device *idev;
> + struct list_head next;
> };
>
> currently it's 1:1 mapping before this object and iommufd_device,
> because no pasid support yet.

Ok, I haven't read that far in the series yet.

> We can rename it to struct ioas_endpoint if it makes you feel
> better.

Meh. The concept is much more important than the name.

--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


Attachments:
(No filename) (2.52 kB)
signature.asc (849.00 B)
Download all attachments