2021-09-19 12:11:57

by Yi Liu

[permalink] [raw]
Subject: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
userspace to directly open a vfio device w/o relying on container/group
(/dev/vfio/$GROUP). Anything related to group is now hidden behind
iommufd (more specifically in iommu core by this RFC) in a device-centric
manner.

In case a device is exposed in both legacy and new interfaces (see next
patch for how to decide it), this patch also ensures that when the device
is already opened via one interface then the other one must be blocked.

Signed-off-by: Liu Yi L <[email protected]>
---
drivers/vfio/vfio.c | 228 +++++++++++++++++++++++++++++++++++++++----
include/linux/vfio.h | 2 +
2 files changed, 213 insertions(+), 17 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 02cc51ce6891..84436d7abedd 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -46,6 +46,12 @@ static struct vfio {
struct mutex group_lock;
struct cdev group_cdev;
dev_t group_devt;
+ /* Fields for /dev/vfio/devices interface */
+ struct class *device_class;
+ struct cdev device_cdev;
+ dev_t device_devt;
+ struct mutex device_lock;
+ struct idr device_idr;
} vfio;

struct vfio_iommu_driver {
@@ -81,9 +87,11 @@ struct vfio_group {
struct list_head container_next;
struct list_head unbound_list;
struct mutex unbound_lock;
- atomic_t opened;
- wait_queue_head_t container_q;
+ struct mutex opened_lock;
+ u32 opened;
+ bool opened_by_nongroup_dev;
bool noiommu;
+ wait_queue_head_t container_q;
unsigned int dev_counter;
struct kvm *kvm;
struct blocking_notifier_head notifier;
@@ -327,7 +335,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
INIT_LIST_HEAD(&group->unbound_list);
mutex_init(&group->unbound_lock);
atomic_set(&group->container_users, 0);
- atomic_set(&group->opened, 0);
+ mutex_init(&group->opened_lock);
init_waitqueue_head(&group->container_q);
group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
@@ -1489,10 +1497,53 @@ static long vfio_group_fops_unl_ioctl(struct file *filep,
return ret;
}

+/*
+ * group->opened is used to ensure that the group can be opened only via
+ * one of the two interfaces (/dev/vfio/$GROUP and /dev/vfio/devices/
+ * $DEVICE) instead of both.
+ *
+ * We also introduce a new group flag to indicate whether this group is
+ * opened via /dev/vfio/devices/$DEVICE. For multi-devices group,
+ * group->opened also tracks how many devices have been opened in the
+ * group if the new flag is true.
+ *
+ * Also add a new lock since two flags are operated here.
+ */
+static int vfio_group_try_open(struct vfio_group *group, bool nongroup_dev)
+{
+ int ret = 0;
+
+ mutex_lock(&group->opened_lock);
+ if (group->opened) {
+ if (nongroup_dev && group->opened_by_nongroup_dev)
+ group->opened++;
+ else
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /*
+ * Is something still in use from a previous open? Should
+ * not allow new open if it is such case.
+ */
+ if (group->container) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ group->opened = 1;
+ group->opened_by_nongroup_dev = nongroup_dev;
+
+out:
+ mutex_unlock(&group->opened_lock);
+
+ return ret;
+}
+
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
struct vfio_group *group;
- int opened;
+ int ret;

group = vfio_group_get_from_minor(iminor(inode));
if (!group)
@@ -1503,18 +1554,10 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep)
return -EPERM;
}

- /* Do we need multiple instances of the group open? Seems not. */
- opened = atomic_cmpxchg(&group->opened, 0, 1);
- if (opened) {
- vfio_group_put(group);
- return -EBUSY;
- }
-
- /* Is something still in use from a previous open? */
- if (group->container) {
- atomic_dec(&group->opened);
+ ret = vfio_group_try_open(group, false);
+ if (ret) {
vfio_group_put(group);
- return -EBUSY;
+ return ret;
}

/* Warn if previous user didn't cleanup and re-init to drop them */
@@ -1534,7 +1577,9 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep)

vfio_group_try_dissolve_container(group);

- atomic_dec(&group->opened);
+ mutex_lock(&group->opened_lock);
+ group->opened--;
+ mutex_unlock(&group->opened_lock);

vfio_group_put(group);

@@ -1552,6 +1597,92 @@ static const struct file_operations vfio_group_fops = {
/**
* VFIO Device fd
*/
+static struct vfio_device *vfio_device_get_from_minor(int minor)
+{
+ struct vfio_device *device;
+
+ mutex_lock(&vfio.device_lock);
+ device = idr_find(&vfio.device_idr, minor);
+ if (!device || !vfio_device_try_get(device)) {
+ mutex_unlock(&vfio.device_lock);
+ return NULL;
+ }
+ mutex_unlock(&vfio.device_lock);
+
+ return device;
+}
+
+static int vfio_device_fops_open(struct inode *inode, struct file *filep)
+{
+ struct vfio_device *device;
+ struct vfio_group *group;
+ int ret, opened;
+
+ device = vfio_device_get_from_minor(iminor(inode));
+ if (!device)
+ return -ENODEV;
+
+ /*
+ * Check whether the user has opened this device via the legacy
+ * container/group interface. If yes, then prevent the user from
+ * opening it via device node in /dev/vfio/devices. Otherwise,
+ * mark the group as opened to block the group interface. either
+ * way, we must ensure only one interface is used to open the
+ * device when it supports both legacy and new interfaces.
+ */
+ group = vfio_group_try_get(device->group);
+ if (group) {
+ ret = vfio_group_try_open(group, true);
+ if (ret)
+ goto err_group_try_open;
+ }
+
+ /*
+ * No support of multiple instances of the device open, similar to
+ * the policy on the group open.
+ */
+ opened = atomic_cmpxchg(&device->opened, 0, 1);
+ if (opened) {
+ ret = -EBUSY;
+ goto err_device_try_open;
+ }
+
+ if (!try_module_get(device->dev->driver->owner)) {
+ ret = -ENODEV;
+ goto err_module_get;
+ }
+
+ ret = device->ops->open(device);
+ if (ret)
+ goto err_device_open;
+
+ filep->private_data = device;
+
+ if (group)
+ vfio_group_put(group);
+ return 0;
+err_device_open:
+ module_put(device->dev->driver->owner);
+err_module_get:
+ atomic_dec(&device->opened);
+err_device_try_open:
+ if (group) {
+ mutex_lock(&group->opened_lock);
+ group->opened--;
+ mutex_unlock(&group->opened_lock);
+ }
+err_group_try_open:
+ if (group)
+ vfio_group_put(group);
+ vfio_device_put(device);
+ return ret;
+}
+
+static bool vfio_device_in_container(struct vfio_device *device)
+{
+ return !!(device->group && device->group->container);
+}
+
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
struct vfio_device *device = filep->private_data;
@@ -1560,7 +1691,16 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)

module_put(device->dev->driver->owner);

- vfio_group_try_dissolve_container(device->group);
+ if (vfio_device_in_container(device)) {
+ vfio_group_try_dissolve_container(device->group);
+ } else {
+ atomic_dec(&device->opened);
+ if (device->group) {
+ mutex_lock(&device->group->opened_lock);
+ device->group->opened--;
+ mutex_unlock(&device->group->opened_lock);
+ }
+ }

vfio_device_put(device);

@@ -1613,6 +1753,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)

static const struct file_operations vfio_device_fops = {
.owner = THIS_MODULE,
+ .open = vfio_device_fops_open,
.release = vfio_device_fops_release,
.read = vfio_device_fops_read,
.write = vfio_device_fops_write,
@@ -2295,6 +2436,52 @@ static struct miscdevice vfio_dev = {
.mode = S_IRUGO | S_IWUGO,
};

+static char *vfio_device_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
+}
+
+static int vfio_init_device_class(void)
+{
+ int ret;
+
+ mutex_init(&vfio.device_lock);
+ idr_init(&vfio.device_idr);
+
+ /* /dev/vfio/devices/$DEVICE */
+ vfio.device_class = class_create(THIS_MODULE, "vfio-device");
+ if (IS_ERR(vfio.device_class))
+ return PTR_ERR(vfio.device_class);
+
+ vfio.device_class->devnode = vfio_device_devnode;
+
+ ret = alloc_chrdev_region(&vfio.device_devt, 0, MINORMASK + 1, "vfio-device");
+ if (ret)
+ goto err_alloc_chrdev;
+
+ cdev_init(&vfio.device_cdev, &vfio_device_fops);
+ ret = cdev_add(&vfio.device_cdev, vfio.device_devt, MINORMASK + 1);
+ if (ret)
+ goto err_cdev_add;
+ return 0;
+
+err_cdev_add:
+ unregister_chrdev_region(vfio.device_devt, MINORMASK + 1);
+err_alloc_chrdev:
+ class_destroy(vfio.device_class);
+ vfio.device_class = NULL;
+ return ret;
+}
+
+static void vfio_destroy_device_class(void)
+{
+ cdev_del(&vfio.device_cdev);
+ unregister_chrdev_region(vfio.device_devt, MINORMASK + 1);
+ class_destroy(vfio.device_class);
+ vfio.device_class = NULL;
+ idr_destroy(&vfio.device_idr);
+}
+
static int __init vfio_init(void)
{
int ret;
@@ -2329,6 +2516,10 @@ static int __init vfio_init(void)
if (ret)
goto err_cdev_add;

+ ret = vfio_init_device_class();
+ if (ret)
+ goto err_init_device_class;
+
pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
@@ -2336,6 +2527,8 @@ static int __init vfio_init(void)
#endif
return 0;

+err_init_device_class:
+ cdev_del(&vfio.group_cdev);
err_cdev_add:
unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
@@ -2358,6 +2551,7 @@ static void __exit vfio_cleanup(void)
unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
class_destroy(vfio.class);
vfio.class = NULL;
+ vfio_destroy_device_class();
misc_deregister(&vfio_dev);
}

diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index a2c5b30e1763..4a5f3f99eab2 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -24,6 +24,8 @@ struct vfio_device {
refcount_t refcount;
struct completion comp;
struct list_head group_next;
+ int minor;
+ atomic_t opened;
};

/**
--
2.25.1


2021-09-21 15:58:32

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Sun, Sep 19, 2021 at 02:38:30PM +0800, Liu Yi L wrote:
> This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> userspace to directly open a vfio device w/o relying on container/group
> (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> iommufd (more specifically in iommu core by this RFC) in a device-centric
> manner.
>
> In case a device is exposed in both legacy and new interfaces (see next
> patch for how to decide it), this patch also ensures that when the device
> is already opened via one interface then the other one must be blocked.
>
> Signed-off-by: Liu Yi L <[email protected]>
> ---
> drivers/vfio/vfio.c | 228 +++++++++++++++++++++++++++++++++++++++----
> include/linux/vfio.h | 2 +
> 2 files changed, 213 insertions(+), 17 deletions(-)

> +static int vfio_init_device_class(void)
> +{
> + int ret;
> +
> + mutex_init(&vfio.device_lock);
> + idr_init(&vfio.device_idr);
> +
> + /* /dev/vfio/devices/$DEVICE */
> + vfio.device_class = class_create(THIS_MODULE, "vfio-device");
> + if (IS_ERR(vfio.device_class))
> + return PTR_ERR(vfio.device_class);
> +
> + vfio.device_class->devnode = vfio_device_devnode;
> +
> + ret = alloc_chrdev_region(&vfio.device_devt, 0, MINORMASK + 1, "vfio-device");
> + if (ret)
> + goto err_alloc_chrdev;
> +
> + cdev_init(&vfio.device_cdev, &vfio_device_fops);
> + ret = cdev_add(&vfio.device_cdev, vfio.device_devt, MINORMASK + 1);
> + if (ret)
> + goto err_cdev_add;

Huh? This is not how cdevs are used. This patch needs rewriting.

The struct vfio_device should gain a 'struct device' and 'struct cdev'
as non-pointer members

vfio register path should end up doing cdev_device_add() for each
vfio_device

vfio_unregister path should do cdev_device_del()

No idr should be needed, an ida is used to allocate minor numbers

The struct device release function should trigger a kfree which
requires some reworking of the callers

vfio_init_group_dev() should do a device_initialize()
vfio_uninit_group_dev() should do a device_put()

The opened atomic is aweful. A newly created fd should start in a
state where it has a disabled fops

The only thing the disabled fops can do is register the device to the
iommu fd. When successfully registered the device gets the normal fops.

The registration steps should be done under a normal lock inside the
vfio_device. If a vfio_device is already registered then further
registration should fail.

Getting the device fd via the group fd triggers the same sequence as
above.

Jason

2021-09-21 22:25:52

by Alex Williamson

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Sun, 19 Sep 2021 14:38:30 +0800
Liu Yi L <[email protected]> wrote:

> This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> userspace to directly open a vfio device w/o relying on container/group
> (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> iommufd (more specifically in iommu core by this RFC) in a device-centric
> manner.
>
> In case a device is exposed in both legacy and new interfaces (see next
> patch for how to decide it), this patch also ensures that when the device
> is already opened via one interface then the other one must be blocked.
>
> Signed-off-by: Liu Yi L <[email protected]>
> ---
> drivers/vfio/vfio.c | 228 +++++++++++++++++++++++++++++++++++++++----
> include/linux/vfio.h | 2 +
> 2 files changed, 213 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 02cc51ce6891..84436d7abedd 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
...
> @@ -2295,6 +2436,52 @@ static struct miscdevice vfio_dev = {
> .mode = S_IRUGO | S_IWUGO,
> };
>
> +static char *vfio_device_devnode(struct device *dev, umode_t *mode)
> +{
> + return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> +}

dev_name() doesn't provide us with any uniqueness guarantees, so this
could potentially generate naming conflicts. The similar scheme for
devices within an iommu group appends an instance number if a conflict
occurs, but that solution doesn't work here where the name isn't just a
link to the actual device. Devices within an iommu group are also
likely associated within a bus_type, so the potential for conflict is
pretty negligible, that's not the case as vfio is adopted for new
device types. Thanks,

Alex

2021-09-22 02:05:20

by Tian, Kevin

[permalink] [raw]
Subject: RE: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

> From: Jason Gunthorpe <[email protected]>
> Sent: Tuesday, September 21, 2021 11:57 PM
>
> On Sun, Sep 19, 2021 at 02:38:30PM +0800, Liu Yi L wrote:
> > This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> > userspace to directly open a vfio device w/o relying on container/group
> > (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> > iommufd (more specifically in iommu core by this RFC) in a device-centric
> > manner.
> >
> > In case a device is exposed in both legacy and new interfaces (see next
> > patch for how to decide it), this patch also ensures that when the device
> > is already opened via one interface then the other one must be blocked.
> >
> > Signed-off-by: Liu Yi L <[email protected]>
> > ---
> > drivers/vfio/vfio.c | 228 +++++++++++++++++++++++++++++++++++++++----
> > include/linux/vfio.h | 2 +
> > 2 files changed, 213 insertions(+), 17 deletions(-)
>
> > +static int vfio_init_device_class(void)
> > +{
> > + int ret;
> > +
> > + mutex_init(&vfio.device_lock);
> > + idr_init(&vfio.device_idr);
> > +
> > + /* /dev/vfio/devices/$DEVICE */
> > + vfio.device_class = class_create(THIS_MODULE, "vfio-device");
> > + if (IS_ERR(vfio.device_class))
> > + return PTR_ERR(vfio.device_class);
> > +
> > + vfio.device_class->devnode = vfio_device_devnode;
> > +
> > + ret = alloc_chrdev_region(&vfio.device_devt, 0, MINORMASK + 1,
> "vfio-device");
> > + if (ret)
> > + goto err_alloc_chrdev;
> > +
> > + cdev_init(&vfio.device_cdev, &vfio_device_fops);
> > + ret = cdev_add(&vfio.device_cdev, vfio.device_devt, MINORMASK +
> 1);
> > + if (ret)
> > + goto err_cdev_add;
>
> Huh? This is not how cdevs are used. This patch needs rewriting.
>
> The struct vfio_device should gain a 'struct device' and 'struct cdev'
> as non-pointer members
>
> vfio register path should end up doing cdev_device_add() for each
> vfio_device
>
> vfio_unregister path should do cdev_device_del()
>
> No idr should be needed, an ida is used to allocate minor numbers
>
> The struct device release function should trigger a kfree which
> requires some reworking of the callers
>
> vfio_init_group_dev() should do a device_initialize()
> vfio_uninit_group_dev() should do a device_put()

All above are good suggestions!

>
> The opened atomic is aweful. A newly created fd should start in a
> state where it has a disabled fops
>
> The only thing the disabled fops can do is register the device to the
> iommu fd. When successfully registered the device gets the normal fops.
>
> The registration steps should be done under a normal lock inside the
> vfio_device. If a vfio_device is already registered then further
> registration should fail.
>
> Getting the device fd via the group fd triggers the same sequence as
> above.
>

Above works if the group interface is also connected to iommufd, i.e.
making vfio type1 as a shim. In this case we can use the registration
status as the exclusive switch. But if we keep vfio type1 separate as
today, then a new atomic is still necessary. This all depends on how
we want to deal with vfio type1 and iommufd, and possibly what's
discussed here just adds another pound to the shim option...

Thanks
Kevin

2021-09-22 02:08:21

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Tue, Sep 21, 2021 at 11:56:06PM +0000, Tian, Kevin wrote:
> > The opened atomic is aweful. A newly created fd should start in a
> > state where it has a disabled fops
> >
> > The only thing the disabled fops can do is register the device to the
> > iommu fd. When successfully registered the device gets the normal fops.
> >
> > The registration steps should be done under a normal lock inside the
> > vfio_device. If a vfio_device is already registered then further
> > registration should fail.
> >
> > Getting the device fd via the group fd triggers the same sequence as
> > above.
> >
>
> Above works if the group interface is also connected to iommufd, i.e.
> making vfio type1 as a shim. In this case we can use the registration
> status as the exclusive switch. But if we keep vfio type1 separate as
> today, then a new atomic is still necessary. This all depends on how
> we want to deal with vfio type1 and iommufd, and possibly what's
> discussed here just adds another pound to the shim option...

No, it works the same either way, the group FD path is identical to
the normal FD path, it just triggers some of the state transitions
automatically internally instead of requiring external ioctls.

The device FDs starts disabled, an internal API binds it to the iommu
via open coding with the group API, and then the rest of the APIs can
be enabled. Same as today.

Jason

2021-09-22 02:10:11

by Tian, Kevin

[permalink] [raw]
Subject: RE: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, September 22, 2021 8:55 AM
>
> On Tue, Sep 21, 2021 at 11:56:06PM +0000, Tian, Kevin wrote:
> > > The opened atomic is aweful. A newly created fd should start in a
> > > state where it has a disabled fops
> > >
> > > The only thing the disabled fops can do is register the device to the
> > > iommu fd. When successfully registered the device gets the normal fops.
> > >
> > > The registration steps should be done under a normal lock inside the
> > > vfio_device. If a vfio_device is already registered then further
> > > registration should fail.
> > >
> > > Getting the device fd via the group fd triggers the same sequence as
> > > above.
> > >
> >
> > Above works if the group interface is also connected to iommufd, i.e.
> > making vfio type1 as a shim. In this case we can use the registration
> > status as the exclusive switch. But if we keep vfio type1 separate as
> > today, then a new atomic is still necessary. This all depends on how
> > we want to deal with vfio type1 and iommufd, and possibly what's
> > discussed here just adds another pound to the shim option...
>
> No, it works the same either way, the group FD path is identical to
> the normal FD path, it just triggers some of the state transitions
> automatically internally instead of requiring external ioctls.
>
> The device FDs starts disabled, an internal API binds it to the iommu
> via open coding with the group API, and then the rest of the APIs can
> be enabled. Same as today.
>

Still a bit confused. if vfio type1 also connects to iommufd, whether
the device is registered can be centrally checked based on whether
an iommu_ctx is recorded. But if type1 doesn't talk to iommufd at
all, don't we still need introduce a new state (calling it 'opened' or
'registered') to protect the two interfaces? In this case what is the
point of keeping device FD disabled even for the group path?

2021-09-22 02:11:35

by Tian, Kevin

[permalink] [raw]
Subject: RE: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

> From: Alex Williamson <[email protected]>
> Sent: Wednesday, September 22, 2021 3:56 AM
>
> On Sun, 19 Sep 2021 14:38:30 +0800
> Liu Yi L <[email protected]> wrote:
>
> > This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> > userspace to directly open a vfio device w/o relying on container/group
> > (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> > iommufd (more specifically in iommu core by this RFC) in a device-centric
> > manner.
> >
> > In case a device is exposed in both legacy and new interfaces (see next
> > patch for how to decide it), this patch also ensures that when the device
> > is already opened via one interface then the other one must be blocked.
> >
> > Signed-off-by: Liu Yi L <[email protected]>
> > ---
> > drivers/vfio/vfio.c | 228 +++++++++++++++++++++++++++++++++++++++----
> > include/linux/vfio.h | 2 +
> > 2 files changed, 213 insertions(+), 17 deletions(-)
> >
> > diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> > index 02cc51ce6891..84436d7abedd 100644
> > --- a/drivers/vfio/vfio.c
> > +++ b/drivers/vfio/vfio.c
> ...
> > @@ -2295,6 +2436,52 @@ static struct miscdevice vfio_dev = {
> > .mode = S_IRUGO | S_IWUGO,
> > };
> >
> > +static char *vfio_device_devnode(struct device *dev, umode_t *mode)
> > +{
> > + return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> > +}
>
> dev_name() doesn't provide us with any uniqueness guarantees, so this
> could potentially generate naming conflicts. The similar scheme for
> devices within an iommu group appends an instance number if a conflict
> occurs, but that solution doesn't work here where the name isn't just a
> link to the actual device. Devices within an iommu group are also
> likely associated within a bus_type, so the potential for conflict is
> pretty negligible, that's not the case as vfio is adopted for new
> device types. Thanks,
>

This is also our concern. Thanks for confirming it. Appreciate if you
can help think out some better alternative to deal with it.

Thanks
Kevin

2021-09-22 03:25:23

by Tian, Kevin

[permalink] [raw]
Subject: RE: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

> From: Tian, Kevin
> Sent: Wednesday, September 22, 2021 9:07 AM
>
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Wednesday, September 22, 2021 8:55 AM
> >
> > On Tue, Sep 21, 2021 at 11:56:06PM +0000, Tian, Kevin wrote:
> > > > The opened atomic is aweful. A newly created fd should start in a
> > > > state where it has a disabled fops
> > > >
> > > > The only thing the disabled fops can do is register the device to the
> > > > iommu fd. When successfully registered the device gets the normal fops.
> > > >
> > > > The registration steps should be done under a normal lock inside the
> > > > vfio_device. If a vfio_device is already registered then further
> > > > registration should fail.
> > > >
> > > > Getting the device fd via the group fd triggers the same sequence as
> > > > above.
> > > >
> > >
> > > Above works if the group interface is also connected to iommufd, i.e.
> > > making vfio type1 as a shim. In this case we can use the registration
> > > status as the exclusive switch. But if we keep vfio type1 separate as
> > > today, then a new atomic is still necessary. This all depends on how
> > > we want to deal with vfio type1 and iommufd, and possibly what's
> > > discussed here just adds another pound to the shim option...
> >
> > No, it works the same either way, the group FD path is identical to
> > the normal FD path, it just triggers some of the state transitions
> > automatically internally instead of requiring external ioctls.
> >
> > The device FDs starts disabled, an internal API binds it to the iommu
> > via open coding with the group API, and then the rest of the APIs can
> > be enabled. Same as today.
> >

After reading your comments on patch08, I may have a clearer picture
on your suggestion. The key is to handle exclusive access at the binding
time (based on vdev->iommu_dev). Please see whether below makes
sense:

Shared sequence:

1) initialize the device with a parked fops;
2) need binding (explicit or implicit) to move away from parked fops;
3) switch to normal fops after successful binding;

1) happens at device probe.

for nongroup 2) and 3) are done together in VFIO_DEVICE_GET_IOMMUFD:

- 2) is done by calling .bind_iommufd() callback;
- 3) could be done within .bind_iommufd(), or via a new callback e.g.
.finalize_device(). The latter may be preferred for the group interface;
- Two threads may open the same device simultaneously, with exclusive
access guaranteed by iommufd_bind_device();
- Open() after successful binding is rejected, since normal fops has been
activated. This is checked upon vdev->iommu_dev;

for group 2/3) are done together in VFIO_GROUP_GET_DEVICE_FD:

- 2) is done by open coding bind_iommufd + attach_ioas. Create an
iommufd_device object and record it to vdev->iommu_dev
- 3) is done by calling .finalize_device();
- open() after a valid vdev->iommu_dev is rejected. this also ensures
exclusive ownership with the nongroup path.

If Alex also agrees with it, this might be another mini-series to be merged
(just for group path) before this one. Doing so sort of nullifies the existing
group/container attaching process, where attach_ioas will be skipped and
now the security context is established when the device is opened.

Thanks
Kevin

2021-09-22 12:33:25

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Wed, Sep 22, 2021 at 01:07:11AM +0000, Tian, Kevin wrote:
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Wednesday, September 22, 2021 8:55 AM
> >
> > On Tue, Sep 21, 2021 at 11:56:06PM +0000, Tian, Kevin wrote:
> > > > The opened atomic is aweful. A newly created fd should start in a
> > > > state where it has a disabled fops
> > > >
> > > > The only thing the disabled fops can do is register the device to the
> > > > iommu fd. When successfully registered the device gets the normal fops.
> > > >
> > > > The registration steps should be done under a normal lock inside the
> > > > vfio_device. If a vfio_device is already registered then further
> > > > registration should fail.
> > > >
> > > > Getting the device fd via the group fd triggers the same sequence as
> > > > above.
> > > >
> > >
> > > Above works if the group interface is also connected to iommufd, i.e.
> > > making vfio type1 as a shim. In this case we can use the registration
> > > status as the exclusive switch. But if we keep vfio type1 separate as
> > > today, then a new atomic is still necessary. This all depends on how
> > > we want to deal with vfio type1 and iommufd, and possibly what's
> > > discussed here just adds another pound to the shim option...
> >
> > No, it works the same either way, the group FD path is identical to
> > the normal FD path, it just triggers some of the state transitions
> > automatically internally instead of requiring external ioctls.
> >
> > The device FDs starts disabled, an internal API binds it to the iommu
> > via open coding with the group API, and then the rest of the APIs can
> > be enabled. Same as today.
> >
>
> Still a bit confused. if vfio type1 also connects to iommufd, whether
> the device is registered can be centrally checked based on whether
> an iommu_ctx is recorded. But if type1 doesn't talk to iommufd at
> all, don't we still need introduce a new state (calling it 'opened' or
> 'registered') to protect the two interfaces?

The "new state" is if the fops are pointing at the real fops or the
pre-fops, which in turn protects everything. You could imagine this as
some state in front of every fop call if you want.

> In this case what is the point of keeping device FD disabled even
> for the group path?

I have a feeling when you go through the APIs it will make sense to
have some symmetry here.

eg creating a device FD should have basically the same flow no matter
what triggers it, not confusing special cases where the group code
skips steps

Jason

2021-09-22 12:52:52

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Wed, Sep 22, 2021 at 03:22:42AM +0000, Tian, Kevin wrote:
> > From: Tian, Kevin
> > Sent: Wednesday, September 22, 2021 9:07 AM
> >
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Wednesday, September 22, 2021 8:55 AM
> > >
> > > On Tue, Sep 21, 2021 at 11:56:06PM +0000, Tian, Kevin wrote:
> > > > > The opened atomic is aweful. A newly created fd should start in a
> > > > > state where it has a disabled fops
> > > > >
> > > > > The only thing the disabled fops can do is register the device to the
> > > > > iommu fd. When successfully registered the device gets the normal fops.
> > > > >
> > > > > The registration steps should be done under a normal lock inside the
> > > > > vfio_device. If a vfio_device is already registered then further
> > > > > registration should fail.
> > > > >
> > > > > Getting the device fd via the group fd triggers the same sequence as
> > > > > above.
> > > > >
> > > >
> > > > Above works if the group interface is also connected to iommufd, i.e.
> > > > making vfio type1 as a shim. In this case we can use the registration
> > > > status as the exclusive switch. But if we keep vfio type1 separate as
> > > > today, then a new atomic is still necessary. This all depends on how
> > > > we want to deal with vfio type1 and iommufd, and possibly what's
> > > > discussed here just adds another pound to the shim option...
> > >
> > > No, it works the same either way, the group FD path is identical to
> > > the normal FD path, it just triggers some of the state transitions
> > > automatically internally instead of requiring external ioctls.
> > >
> > > The device FDs starts disabled, an internal API binds it to the iommu
> > > via open coding with the group API, and then the rest of the APIs can
> > > be enabled. Same as today.
> > >
>
> After reading your comments on patch08, I may have a clearer picture
> on your suggestion. The key is to handle exclusive access at the binding
> time (based on vdev->iommu_dev). Please see whether below makes
> sense:
>
> Shared sequence:
>
> 1) initialize the device with a parked fops;
> 2) need binding (explicit or implicit) to move away from parked fops;
> 3) switch to normal fops after successful binding;
>
> 1) happens at device probe.

1 happens when the cdev is setup with the parked fops, yes. I'd say it
happens at fd open time.

> for nongroup 2) and 3) are done together in VFIO_DEVICE_GET_IOMMUFD:
>
> - 2) is done by calling .bind_iommufd() callback;
> - 3) could be done within .bind_iommufd(), or via a new callback e.g.
> .finalize_device(). The latter may be preferred for the group interface;
> - Two threads may open the same device simultaneously, with exclusive
> access guaranteed by iommufd_bind_device();
> - Open() after successful binding is rejected, since normal fops has been
> activated. This is checked upon vdev->iommu_dev;

Almost, open is always successful, what fails is
VFIO_DEVICE_GET_IOMMUFD (or the group equivilant). The user ends up
with a FD that is useless, cannot reach the ops and thus cannot impact
the device it doesn't own in any way.

It is similar to opening a group FD

> for group 2/3) are done together in VFIO_GROUP_GET_DEVICE_FD:
>
> - 2) is done by open coding bind_iommufd + attach_ioas. Create an
> iommufd_device object and record it to vdev->iommu_dev
> - 3) is done by calling .finalize_device();
> - open() after a valid vdev->iommu_dev is rejected. this also ensures
> exclusive ownership with the nongroup path.

Same comment as above, groups should go through the same sequence of
steps, create a FD, attempt to bind, if successuful make the FD
operational.

The only difference is that failure in these steps does not call
fd_install(). For this reason alone the FD could start out with
operational fops, but it feels like a needless optimization.

> If Alex also agrees with it, this might be another mini-series to be merged
> (just for group path) before this one. Doing so sort of nullifies the existing
> group/container attaching process, where attach_ioas will be skipped and
> now the security context is established when the device is opened.

I think it is really important to unify DMA exclusion model and lower
to the core iommu code. If there is a reason the exclusion must be
triggered on group fd open then the iommu core code should provide an
API to do that which interworks with the device API iommufd will work.

But I would start here because it is much simpler to understand..

Jason

2021-09-22 14:13:16

by Tian, Kevin

[permalink] [raw]
Subject: RE: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, September 22, 2021 8:51 PM
>
> On Wed, Sep 22, 2021 at 03:22:42AM +0000, Tian, Kevin wrote:
> > > From: Tian, Kevin
> > > Sent: Wednesday, September 22, 2021 9:07 AM
> > >
> > > > From: Jason Gunthorpe <[email protected]>
> > > > Sent: Wednesday, September 22, 2021 8:55 AM
> > > >
> > > > On Tue, Sep 21, 2021 at 11:56:06PM +0000, Tian, Kevin wrote:
> > > > > > The opened atomic is aweful. A newly created fd should start in a
> > > > > > state where it has a disabled fops
> > > > > >
> > > > > > The only thing the disabled fops can do is register the device to the
> > > > > > iommu fd. When successfully registered the device gets the normal
> fops.
> > > > > >
> > > > > > The registration steps should be done under a normal lock inside
> the
> > > > > > vfio_device. If a vfio_device is already registered then further
> > > > > > registration should fail.
> > > > > >
> > > > > > Getting the device fd via the group fd triggers the same sequence as
> > > > > > above.
> > > > > >
> > > > >
> > > > > Above works if the group interface is also connected to iommufd, i.e.
> > > > > making vfio type1 as a shim. In this case we can use the registration
> > > > > status as the exclusive switch. But if we keep vfio type1 separate as
> > > > > today, then a new atomic is still necessary. This all depends on how
> > > > > we want to deal with vfio type1 and iommufd, and possibly what's
> > > > > discussed here just adds another pound to the shim option...
> > > >
> > > > No, it works the same either way, the group FD path is identical to
> > > > the normal FD path, it just triggers some of the state transitions
> > > > automatically internally instead of requiring external ioctls.
> > > >
> > > > The device FDs starts disabled, an internal API binds it to the iommu
> > > > via open coding with the group API, and then the rest of the APIs can
> > > > be enabled. Same as today.
> > > >
> >
> > After reading your comments on patch08, I may have a clearer picture
> > on your suggestion. The key is to handle exclusive access at the binding
> > time (based on vdev->iommu_dev). Please see whether below makes
> > sense:
> >
> > Shared sequence:
> >
> > 1) initialize the device with a parked fops;
> > 2) need binding (explicit or implicit) to move away from parked fops;
> > 3) switch to normal fops after successful binding;
> >
> > 1) happens at device probe.
>
> 1 happens when the cdev is setup with the parked fops, yes. I'd say it
> happens at fd open time.
>
> > for nongroup 2) and 3) are done together in VFIO_DEVICE_GET_IOMMUFD:
> >
> > - 2) is done by calling .bind_iommufd() callback;
> > - 3) could be done within .bind_iommufd(), or via a new callback e.g.
> > .finalize_device(). The latter may be preferred for the group interface;
> > - Two threads may open the same device simultaneously, with exclusive
> > access guaranteed by iommufd_bind_device();
> > - Open() after successful binding is rejected, since normal fops has been
> > activated. This is checked upon vdev->iommu_dev;
>
> Almost, open is always successful, what fails is
> VFIO_DEVICE_GET_IOMMUFD (or the group equivilant). The user ends up
> with a FD that is useless, cannot reach the ops and thus cannot impact
> the device it doesn't own in any way.

make sense. I had an wrong impression that once a normal fops is
activated it is also visible to other threads. But in concept this fops
replacement should be local to each thread thus another thread
opening the device always gets a parked fops.

>
> It is similar to opening a group FD
>
> > for group 2/3) are done together in VFIO_GROUP_GET_DEVICE_FD:
> >
> > - 2) is done by open coding bind_iommufd + attach_ioas. Create an
> > iommufd_device object and record it to vdev->iommu_dev
> > - 3) is done by calling .finalize_device();
> > - open() after a valid vdev->iommu_dev is rejected. this also ensures
> > exclusive ownership with the nongroup path.
>
> Same comment as above, groups should go through the same sequence of
> steps, create a FD, attempt to bind, if successuful make the FD
> operational.
>
> The only difference is that failure in these steps does not call
> fd_install(). For this reason alone the FD could start out with
> operational fops, but it feels like a needless optimization.
>
> > If Alex also agrees with it, this might be another mini-series to be merged
> > (just for group path) before this one. Doing so sort of nullifies the existing
> > group/container attaching process, where attach_ioas will be skipped and
> > now the security context is established when the device is opened.
>
> I think it is really important to unify DMA exclusion model and lower
> to the core iommu code. If there is a reason the exclusion must be
> triggered on group fd open then the iommu core code should provide an
> API to do that which interworks with the device API iommufd will work.
>
> But I would start here because it is much simpler to understand..
>

Let's work on this task first and figure out what's the cleaner way to unify
it. My current impression is that having an iommu api for group fd open
might be simpler. Currently vfio iommu drivers are coupled with container
with group-granular operations. Adapting them to device fd open will
require more changes to handle device<->group. anyway we'll see...

2021-09-29 03:29:54

by David Gibson

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Sun, Sep 19, 2021 at 02:38:30PM +0800, Liu Yi L wrote:
> This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> userspace to directly open a vfio device w/o relying on container/group
> (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> iommufd (more specifically in iommu core by this RFC) in a device-centric
> manner.
>
> In case a device is exposed in both legacy and new interfaces (see next
> patch for how to decide it), this patch also ensures that when the device
> is already opened via one interface then the other one must be blocked.
>
> Signed-off-by: Liu Yi L <[email protected]>
[snip]

> +static bool vfio_device_in_container(struct vfio_device *device)
> +{
> + return !!(device->group && device->group->container);

You don't need !! here. && is already a logical operation, so returns
a valid bool.

> +}
> +
> static int vfio_device_fops_release(struct inode *inode, struct file *filep)
> {
> struct vfio_device *device = filep->private_data;
> @@ -1560,7 +1691,16 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
>
> module_put(device->dev->driver->owner);
>
> - vfio_group_try_dissolve_container(device->group);
> + if (vfio_device_in_container(device)) {
> + vfio_group_try_dissolve_container(device->group);
> + } else {
> + atomic_dec(&device->opened);
> + if (device->group) {
> + mutex_lock(&device->group->opened_lock);
> + device->group->opened--;
> + mutex_unlock(&device->group->opened_lock);
> + }
> + }
>
> vfio_device_put(device);
>
> @@ -1613,6 +1753,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
>
> static const struct file_operations vfio_device_fops = {
> .owner = THIS_MODULE,
> + .open = vfio_device_fops_open,
> .release = vfio_device_fops_release,
> .read = vfio_device_fops_read,
> .write = vfio_device_fops_write,
> @@ -2295,6 +2436,52 @@ static struct miscdevice vfio_dev = {
> .mode = S_IRUGO | S_IWUGO,
> };
>
> +static char *vfio_device_devnode(struct device *dev, umode_t *mode)
> +{
> + return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));

Others have pointed out some problems with the use of dev_name()
here. I'll add that I think you'll make things much easier if instead
of using one huge "devices" subdir, you use a separate subdir for each
vfio sub-driver (so, one for PCI, one for each type of mdev, one for
platform, etc.). That should make avoiding name conflicts a lot simpler.

--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


Attachments:
(No filename) (2.71 kB)
signature.asc (849.00 B)
Download all attachments

2021-09-29 19:43:03

by Alex Williamson

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Wed, 29 Sep 2021 12:08:59 +1000
David Gibson <[email protected]> wrote:

> On Sun, Sep 19, 2021 at 02:38:30PM +0800, Liu Yi L wrote:
> > This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> > userspace to directly open a vfio device w/o relying on container/group
> > (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> > iommufd (more specifically in iommu core by this RFC) in a device-centric
> > manner.
> >
> > In case a device is exposed in both legacy and new interfaces (see next
> > patch for how to decide it), this patch also ensures that when the device
> > is already opened via one interface then the other one must be blocked.
> >
> > Signed-off-by: Liu Yi L <[email protected]>
> [snip]
>
> > +static bool vfio_device_in_container(struct vfio_device *device)
> > +{
> > + return !!(device->group && device->group->container);
>
> You don't need !! here. && is already a logical operation, so returns
> a valid bool.
>
> > +}
> > +
> > static int vfio_device_fops_release(struct inode *inode, struct file *filep)
> > {
> > struct vfio_device *device = filep->private_data;
> > @@ -1560,7 +1691,16 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
> >
> > module_put(device->dev->driver->owner);
> >
> > - vfio_group_try_dissolve_container(device->group);
> > + if (vfio_device_in_container(device)) {
> > + vfio_group_try_dissolve_container(device->group);
> > + } else {
> > + atomic_dec(&device->opened);
> > + if (device->group) {
> > + mutex_lock(&device->group->opened_lock);
> > + device->group->opened--;
> > + mutex_unlock(&device->group->opened_lock);
> > + }
> > + }
> >
> > vfio_device_put(device);
> >
> > @@ -1613,6 +1753,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
> >
> > static const struct file_operations vfio_device_fops = {
> > .owner = THIS_MODULE,
> > + .open = vfio_device_fops_open,
> > .release = vfio_device_fops_release,
> > .read = vfio_device_fops_read,
> > .write = vfio_device_fops_write,
> > @@ -2295,6 +2436,52 @@ static struct miscdevice vfio_dev = {
> > .mode = S_IRUGO | S_IWUGO,
> > };
> >
> > +static char *vfio_device_devnode(struct device *dev, umode_t *mode)
> > +{
> > + return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
>
> Others have pointed out some problems with the use of dev_name()
> here. I'll add that I think you'll make things much easier if instead
> of using one huge "devices" subdir, you use a separate subdir for each
> vfio sub-driver (so, one for PCI, one for each type of mdev, one for
> platform, etc.). That should make avoiding name conflicts a lot simpler.

It seems like this is unnecessary if we use the vfioX naming approach.
Conflicts are trivial to ignore if we don't involve dev_name() and
looking for the correct major:minor chardev in the correct subdirectory
seems like a hassle for userspace. Thanks,

Alex

2021-09-30 03:10:10

by David Gibson

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Wed, Sep 29, 2021 at 01:05:21PM -0600, Alex Williamson wrote:
> On Wed, 29 Sep 2021 12:08:59 +1000
> David Gibson <[email protected]> wrote:
>
> > On Sun, Sep 19, 2021 at 02:38:30PM +0800, Liu Yi L wrote:
> > > This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> > > userspace to directly open a vfio device w/o relying on container/group
> > > (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> > > iommufd (more specifically in iommu core by this RFC) in a device-centric
> > > manner.
> > >
> > > In case a device is exposed in both legacy and new interfaces (see next
> > > patch for how to decide it), this patch also ensures that when the device
> > > is already opened via one interface then the other one must be blocked.
> > >
> > > Signed-off-by: Liu Yi L <[email protected]>
> > [snip]
> >
> > > +static bool vfio_device_in_container(struct vfio_device *device)
> > > +{
> > > + return !!(device->group && device->group->container);
> >
> > You don't need !! here. && is already a logical operation, so returns
> > a valid bool.
> >
> > > +}
> > > +
> > > static int vfio_device_fops_release(struct inode *inode, struct file *filep)
> > > {
> > > struct vfio_device *device = filep->private_data;
> > > @@ -1560,7 +1691,16 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
> > >
> > > module_put(device->dev->driver->owner);
> > >
> > > - vfio_group_try_dissolve_container(device->group);
> > > + if (vfio_device_in_container(device)) {
> > > + vfio_group_try_dissolve_container(device->group);
> > > + } else {
> > > + atomic_dec(&device->opened);
> > > + if (device->group) {
> > > + mutex_lock(&device->group->opened_lock);
> > > + device->group->opened--;
> > > + mutex_unlock(&device->group->opened_lock);
> > > + }
> > > + }
> > >
> > > vfio_device_put(device);
> > >
> > > @@ -1613,6 +1753,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
> > >
> > > static const struct file_operations vfio_device_fops = {
> > > .owner = THIS_MODULE,
> > > + .open = vfio_device_fops_open,
> > > .release = vfio_device_fops_release,
> > > .read = vfio_device_fops_read,
> > > .write = vfio_device_fops_write,
> > > @@ -2295,6 +2436,52 @@ static struct miscdevice vfio_dev = {
> > > .mode = S_IRUGO | S_IWUGO,
> > > };
> > >
> > > +static char *vfio_device_devnode(struct device *dev, umode_t *mode)
> > > +{
> > > + return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev));
> >
> > Others have pointed out some problems with the use of dev_name()
> > here. I'll add that I think you'll make things much easier if instead
> > of using one huge "devices" subdir, you use a separate subdir for each
> > vfio sub-driver (so, one for PCI, one for each type of mdev, one for
> > platform, etc.). That should make avoiding name conflicts a lot simpler.
>
> It seems like this is unnecessary if we use the vfioX naming approach.
> Conflicts are trivial to ignore if we don't involve dev_name() and
> looking for the correct major:minor chardev in the correct subdirectory
> seems like a hassle for userspace. Thanks,

Right.. it does sound like a hassle, but AFAICT that's *more*
necessary with /dev/vfio/vfioXX than with /dev/vfio/pci/DDDD:BB:SS.F,
since you have to look up a meaningful name in sysfs to find the right
devnode.

--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


Attachments:
(No filename) (3.60 kB)
signature.asc (849.00 B)
Download all attachments

2021-10-20 12:41:25

by Yi Liu

[permalink] [raw]
Subject: RE: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

> From: David Gibson <[email protected]>
> Sent: Wednesday, September 29, 2021 10:09 AM
>
> On Sun, Sep 19, 2021 at 02:38:30PM +0800, Liu Yi L wrote:
> > This patch introduces a new interface (/dev/vfio/devices/$DEVICE) for
> > userspace to directly open a vfio device w/o relying on container/group
> > (/dev/vfio/$GROUP). Anything related to group is now hidden behind
> > iommufd (more specifically in iommu core by this RFC) in a device-centric
> > manner.
> >
> > In case a device is exposed in both legacy and new interfaces (see next
> > patch for how to decide it), this patch also ensures that when the device
> > is already opened via one interface then the other one must be blocked.
> >
> > Signed-off-by: Liu Yi L <[email protected]>
> [snip]
>
> > +static bool vfio_device_in_container(struct vfio_device *device)
> > +{
> > + return !!(device->group && device->group->container);
>
> You don't need !! here. && is already a logical operation, so returns
> a valid bool.

got it. thanks.

Regards,
Yi Liu

2021-10-25 14:11:27

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Mon, Oct 25, 2021 at 06:28:09AM +0000, Liu, Yi L wrote:
> thanks for the guiding. will also refer to your vfio_group_cdev series.
>
> Need to double confirm here. Not quite following on the kfree. Is
> this kfree to free the vfio_device structure? But now the
> vfio_device pointer is provided by callers (e.g. vfio-pci). Do
> you want to let vfio core allocate the vfio_device struct and
> return the pointer to callers?

There are several common patterns for this problem, two that would be
suitable:

- Require each driver to provide a release op inside vfio_device_ops
that does the kfree. Have the core provide a struct device release
op that calls this one. Keep the kalloc/kfree in the drivers

- Move the kalloc into the core and have the core provide the kfree
with an optional release callback for anydriver specific cleanup

This requires some macro to make the memory layout work. RDMA has
a version of this:

struct ib_device *_ib_alloc_device(size_t size);
#define ib_alloc_device(drv_struct, member) \
container_of(_ib_alloc_device(sizeof(struct drv_struct) + \
BUILD_BUG_ON_ZERO(offsetof( \
struct drv_struct, member))), \
struct drv_struct, member)


In part the choice is how many drivers require a release callback
anyhow, if they all do then the first is easier to understand. If only
few or none do then the latter is less code in drivers, and never
exposes the driver to the tricky transition from alloc to refcount
cleanup.

Jason

2021-10-29 09:51:54

by Yi Liu

[permalink] [raw]
Subject: RE: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

Hi Jason,

> From: Jason Gunthorpe <[email protected]>
> Sent: Monday, October 25, 2021 8:53 PM
>
> On Mon, Oct 25, 2021 at 06:28:09AM +0000, Liu, Yi L wrote:
> > thanks for the guiding. will also refer to your vfio_group_cdev series.
> >
> > Need to double confirm here. Not quite following on the kfree. Is
> > this kfree to free the vfio_device structure? But now the
> > vfio_device pointer is provided by callers (e.g. vfio-pci). Do
> > you want to let vfio core allocate the vfio_device struct and
> > return the pointer to callers?
>
> There are several common patterns for this problem, two that would be
> suitable:
>
> - Require each driver to provide a release op inside vfio_device_ops
> that does the kfree. Have the core provide a struct device release
> op that calls this one. Keep the kalloc/kfree in the drivers

this way sees to suit the existing vfio registration manner listed
below. right? But device drivers needs to do the kfree in the
newly added release op instead of doing it on their own (e.g.
doing kfree in remove).

vfio_init_group_dev()
vfio_register_group_dev()
vfio_unregister_group_dev()
vfio_uninit_group_dev()

> - Move the kalloc into the core and have the core provide the kfree
> with an optional release callback for anydriver specific cleanup
>
> This requires some macro to make the memory layout work. RDMA has
> a version of this:
>
> struct ib_device *_ib_alloc_device(size_t size);
> #define ib_alloc_device(drv_struct, member) \
> container_of(_ib_alloc_device(sizeof(struct drv_struct) + \
> BUILD_BUG_ON_ZERO(offsetof( \
> struct drv_struct, member))), \
> struct drv_struct, member)
>

thanks for the example. If this way, still requires driver to provide
a release op inside vfio_device_ops. right?

> In part the choice is how many drivers require a release callback
> anyhow, if they all do then the first is easier to understand. If only
> few or none do then the latter is less code in drivers, and never
> exposes the driver to the tricky transition from alloc to refcount
> cleanup.

I'm not quite sure. But per my understanding, since the vfio_device
is expected to be embedded in the device state struct (e.g.
vfio_pci_core_device), I guess most of the drivers will require callback
to do driver specific cleanup. Seems like option #1 may make sense?

Regards,
Yi Liu

2021-11-01 12:51:30

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Fri, Oct 29, 2021 at 09:47:27AM +0000, Liu, Yi L wrote:
> Hi Jason,
>
> > From: Jason Gunthorpe <[email protected]>
> > Sent: Monday, October 25, 2021 8:53 PM
> >
> > On Mon, Oct 25, 2021 at 06:28:09AM +0000, Liu, Yi L wrote:
> > > thanks for the guiding. will also refer to your vfio_group_cdev series.
> > >
> > > Need to double confirm here. Not quite following on the kfree. Is
> > > this kfree to free the vfio_device structure? But now the
> > > vfio_device pointer is provided by callers (e.g. vfio-pci). Do
> > > you want to let vfio core allocate the vfio_device struct and
> > > return the pointer to callers?
> >
> > There are several common patterns for this problem, two that would be
> > suitable:
> >
> > - Require each driver to provide a release op inside vfio_device_ops
> > that does the kfree. Have the core provide a struct device release
> > op that calls this one. Keep the kalloc/kfree in the drivers
>
> this way sees to suit the existing vfio registration manner listed
> below. right?

Not really, most drivers are just doing kfree. The need for release
comes if the drivers are doing more stuff.

> But device drivers needs to do the kfree in the
> newly added release op instead of doing it on their own (e.g.
> doing kfree in remove).

Yes

> > struct ib_device *_ib_alloc_device(size_t size);
> > #define ib_alloc_device(drv_struct, member) \
> > container_of(_ib_alloc_device(sizeof(struct drv_struct) + \
> > BUILD_BUG_ON_ZERO(offsetof( \
> > struct drv_struct, member))), \
> > struct drv_struct, member)
> >
>
> thanks for the example. If this way, still requires driver to provide
> a release op inside vfio_device_ops. right?

No, it would optional. It would contain the stuff the driver is doing
before kfree()

For instance mdev looks like the only driver that cares:

vfio_uninit_group_dev(&mdev_state->vdev);
kfree(mdev_state->pages);
kfree(mdev_state->vconfig);
kfree(mdev_state);

pages/vconfig would logically be in a release function

On the other hand ccw needs to rcu free the vfio_device, so that would
have to be global overhead with this api design.

Jason

2021-11-02 09:56:53

by Yi Liu

[permalink] [raw]
Subject: RE: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

> From: Jason Gunthorpe <[email protected]>
> Sent: Monday, November 1, 2021 8:50 PM
>
> On Fri, Oct 29, 2021 at 09:47:27AM +0000, Liu, Yi L wrote:
> > Hi Jason,
> >
> > > From: Jason Gunthorpe <[email protected]>
> > > Sent: Monday, October 25, 2021 8:53 PM
> > >
> > > On Mon, Oct 25, 2021 at 06:28:09AM +0000, Liu, Yi L wrote:
> > > > thanks for the guiding. will also refer to your vfio_group_cdev series.
> > > >
> > > > Need to double confirm here. Not quite following on the kfree. Is
> > > > this kfree to free the vfio_device structure? But now the
> > > > vfio_device pointer is provided by callers (e.g. vfio-pci). Do
> > > > you want to let vfio core allocate the vfio_device struct and
> > > > return the pointer to callers?
> > >
> > > There are several common patterns for this problem, two that would be
> > > suitable:
> > >
> > > - Require each driver to provide a release op inside vfio_device_ops
> > > that does the kfree. Have the core provide a struct device release
> > > op that calls this one. Keep the kalloc/kfree in the drivers
> >
> > this way sees to suit the existing vfio registration manner listed
> > below. right?
>
> Not really, most drivers are just doing kfree. The need for release
> comes if the drivers are doing more stuff.
>
> > But device drivers needs to do the kfree in the
> > newly added release op instead of doing it on their own (e.g.
> > doing kfree in remove).
>
> Yes
>
> > > struct ib_device *_ib_alloc_device(size_t size);
> > > #define ib_alloc_device(drv_struct, member) \
> > > container_of(_ib_alloc_device(sizeof(struct drv_struct) + \
> > > BUILD_BUG_ON_ZERO(offsetof( \
> > > struct drv_struct, member))), \
> > > struct drv_struct, member)
> > >
> >
> > thanks for the example. If this way, still requires driver to provide
> > a release op inside vfio_device_ops. right?
>
> No, it would optional. It would contain the stuff the driver is doing
> before kfree()
>
> For instance mdev looks like the only driver that cares:
>
> vfio_uninit_group_dev(&mdev_state->vdev);
> kfree(mdev_state->pages);
> kfree(mdev_state->vconfig);
> kfree(mdev_state);
>
> pages/vconfig would logically be in a release function

I see. So the criteria is: the pointer fields pointing to a memory buffer
allocated by the device driver should be logically be free in a release
function. right? I can see there are such fields in struct vfio_pci_core_device
and mdev_state (both mbochs and mdpy). So we may go with your option #2.
Is it? otherwise, needs to add release callback for all the related drivers.

struct vfio_pci_core_device {
struct vifo_device vdev;
...
u8 *pci_config_map;
u8 *vconfig;
...
};

struct mdev_state {
struct vifo_device vdev;
...
u8 *vconfig;
struct page **pages;
...
};

> On the other hand ccw needs to rcu free the vfio_device, so that would
> have to be global overhead with this api design.

not quite get. why ccw is special here? could you elaborate?

Thanks,
Yi Liu

2021-11-03 13:30:32

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

On Tue, Nov 02, 2021 at 09:53:29AM +0000, Liu, Yi L wrote:

> > vfio_uninit_group_dev(&mdev_state->vdev);
> > kfree(mdev_state->pages);
> > kfree(mdev_state->vconfig);
> > kfree(mdev_state);
> >
> > pages/vconfig would logically be in a release function
>
> I see. So the criteria is: the pointer fields pointing to a memory buffer
> allocated by the device driver should be logically be free in a release
> function. right?

Often yes, that is usually a good idea

>I can see there are such fields in struct vfio_pci_core_device
> and mdev_state (both mbochs and mdpy). So we may go with your option #2.
> Is it? otherwise, needs to add release callback for all the related drivers.

Yes, that is the approx trade off

> > On the other hand ccw needs to rcu free the vfio_device, so that would
> > have to be global overhead with this api design.
>
> not quite get. why ccw is special here? could you elaborate?

I added a rcu usage to it in order to fix a race

+static inline struct vfio_ccw_private *vfio_ccw_get_priv(struct subchannel *sch)
+{
+ struct vfio_ccw_private *private;
+
+ rcu_read_lock();
+ private = dev_get_drvdata(&sch->dev);
+ if (private && !vfio_device_try_get(&private->vdev))
+ private = NULL;
+ rcu_read_unlock();
+ return private;
+}


Jason

2021-11-11 12:32:10

by Yi Liu

[permalink] [raw]
Subject: RE: [RFC 02/20] vfio: Add device class for /dev/vfio/devices

Hi Jason,

> From: Jason Gunthorpe <[email protected]>
> Sent: Wednesday, November 3, 2021 9:26 PM
>
> On Tue, Nov 02, 2021 at 09:53:29AM +0000, Liu, Yi L wrote:
>
> > > vfio_uninit_group_dev(&mdev_state->vdev);
> > > kfree(mdev_state->pages);
> > > kfree(mdev_state->vconfig);
> > > kfree(mdev_state);
> > >
> > > pages/vconfig would logically be in a release function
> >
> > I see. So the criteria is: the pointer fields pointing to a memory buffer
> > allocated by the device driver should be logically be free in a release
> > function. right?
>
> Often yes, that is usually a good idea
>
> >I can see there are such fields in struct vfio_pci_core_device
> > and mdev_state (both mbochs and mdpy). So we may go with your option
> #2.
> > Is it? otherwise, needs to add release callback for all the related drivers.
>
> Yes, that is the approx trade off
>
> > > On the other hand ccw needs to rcu free the vfio_device, so that would
> > > have to be global overhead with this api design.
> >
> > not quite get. why ccw is special here? could you elaborate?
>
> I added a rcu usage to it in order to fix a race
>
> +static inline struct vfio_ccw_private *vfio_ccw_get_priv(struct subchannel
> *sch)
> +{
> + struct vfio_ccw_private *private;
> +
> + rcu_read_lock();
> + private = dev_get_drvdata(&sch->dev);
> + if (private && !vfio_device_try_get(&private->vdev))
> + private = NULL;
> + rcu_read_unlock();
> + return private;
> +}

you are right. After checking your ccw patch, the private free triggered
by vfio_ccw_free_private() should use kfree_rcu(). So it is not quite
same with other vfio_device users which only need kfree() to free the
vfio_device. So how can I address the difference when moving the vfio_device
alloc/free into vfio core? any suggestion?

@@ -164,14 +173,14 @@ static void vfio_ccw_free_private(struct vfio_ccw_private *private)
kmem_cache_free(vfio_ccw_io_region, private->io_region);
kfree(private->cp.guest_cp);
mutex_destroy(&private->io_mutex);
- kfree(private);
+ vfio_uninit_group_dev(&private->vdev);
+ kfree_rcu(private, rcu);
}

https://lore.kernel.org/kvm/[email protected]/

Regards,
Yi Liu