2013-08-14 20:10:33

by Alex Williamson

[permalink] [raw]
Subject: [PATCH] vfio-pci: PCI hot reset interface

The current VFIO_DEVICE_RESET interface only maps to PCI use cases
where we can isolate the reset to the individual PCI function. This
means the device must support FLR (PCIe or AF), PM reset on D3hot->D0
transition, device specific reset, or be a singleton device on a bus
for a secondary bus reset. FLR does not have widespread support,
PM reset is not very reliable, and bus topology is dictated by the
system and device design. We need to provide a means for a user to
induce a bus reset in cases where the existing mechanisms are not
available or not reliable.

This device specific extension to VFIO provides the user with this
ability. Two new ioctls are introduced:
- VFIO_DEVICE_PCI_GET_HOT_RESET_INFO
- VFIO_DEVICE_PCI_HOT_RESET

The first provides the user with information about the extent of
devices affected by a hot reset. This is essentially a list of
devices and the IOMMU groups they belong to. The user may then
initiate a hot reset by calling the second ioctl. We must be
careful that the user has ownership of all the affected devices
found via the first ioctl, so the second ioctl takes a list of file
descriptors for the VFIO groups affected by the reset. Each group
must have IOMMU protection established for the ioctl to succeed.

Signed-off-by: Alex Williamson <[email protected]>
---

This patch is dependent on v5 "pci: bus and slot reset interfaces" as
well as "pci: Add probe functions for bus and slot reset".

drivers/vfio/pci/vfio_pci.c | 272 +++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/vfio.h | 38 ++++++
2 files changed, 309 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index cef6002..eb69bf3 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -227,6 +227,97 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
return 0;
}

+static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
+{
+ (*(int *)data)++;
+ return 0;
+}
+
+struct vfio_pci_fill_info {
+ int max;
+ int cur;
+ struct vfio_pci_dependent_device *devices;
+};
+
+static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
+{
+ struct vfio_pci_fill_info *info = data;
+ struct iommu_group *iommu_group;
+
+ if (info->cur == info->max)
+ return -EAGAIN; /* Something changed, try again */
+
+ iommu_group = iommu_group_get(&pdev->dev);
+ if (!iommu_group)
+ return -EPERM; /* Cannot reset non-isolated devices */
+
+ info->devices[info->cur].group_id = iommu_group_id(iommu_group);
+ info->devices[info->cur].segment = pci_domain_nr(pdev->bus);
+ info->devices[info->cur].bus = pdev->bus->number;
+ info->devices[info->cur].devfn = pdev->devfn;
+ info->cur++;
+ iommu_group_put(iommu_group);
+ return 0;
+}
+
+struct vfio_pci_group {
+ struct vfio_group *group;
+ int id;
+};
+
+struct vfio_pci_group_info {
+ int count;
+ struct vfio_pci_group *groups;
+};
+
+static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
+{
+ struct vfio_pci_group_info *info = data;
+ struct iommu_group *group;
+ int id, i;
+
+ group = iommu_group_get(&pdev->dev);
+ if (!group)
+ return -EPERM;
+
+ id = iommu_group_id(group);
+
+ for (i = 0; i < info->count; i++)
+ if (info->groups[i].id == id)
+ break;
+
+ iommu_group_put(group);
+
+ return (i == info->count) ? -EINVAL : 0;
+}
+
+static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
+ int (*fn)(struct pci_dev *,
+ void *data), void *data,
+ bool slot)
+{
+ struct pci_dev *tmp;
+ int ret = 0;
+
+ list_for_each_entry(tmp, &pdev->bus->devices, bus_list) {
+ if (slot && tmp->slot != pdev->slot)
+ continue;
+
+ ret = fn(tmp, data);
+ if (ret)
+ break;
+
+ if (tmp->subordinate) {
+ ret = vfio_pci_for_each_slot_or_bus(tmp, fn,
+ data, false);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
static long vfio_pci_ioctl(void *device_data,
unsigned int cmd, unsigned long arg)
{
@@ -407,10 +498,189 @@ static long vfio_pci_ioctl(void *device_data,

return ret;

- } else if (cmd == VFIO_DEVICE_RESET)
+ } else if (cmd == VFIO_DEVICE_RESET) {
return vdev->reset_works ?
pci_reset_function(vdev->pdev) : -EINVAL;

+ } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
+ struct vfio_pci_hot_reset_info hdr;
+ struct vfio_pci_fill_info fill = { 0 };
+ struct vfio_pci_dependent_device *devices = NULL;
+ bool slot = false;
+ int ret = 0;
+
+ minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
+
+ if (copy_from_user(&hdr, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (hdr.argsz < minsz)
+ return -EINVAL;
+
+ hdr.flags = 0;
+
+ /* Can we do a slot or bus reset or neither? */
+ if (!pci_probe_reset_slot(vdev->pdev->slot))
+ slot = true;
+ else if (pci_probe_reset_bus(vdev->pdev->bus))
+ return -ENODEV;
+
+ /* How many devices are affected? */
+ ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
+ vfio_pci_count_devs,
+ &fill.max, slot);
+ if (ret)
+ return ret;
+
+ WARN_ON(!fill.max); /* Should always be at least one */
+
+ /*
+ * If there's enough space, fill it now, otherwise return
+ * -ENOSPC and the number of devices affected.
+ */
+ if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
+ ret = -ENOSPC;
+ hdr.count = fill.max;
+ goto reset_info_exit;
+ }
+
+ devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
+ if (!devices)
+ return -ENOMEM;
+
+ fill.devices = devices;
+
+ ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
+ vfio_pci_fill_devs,
+ &fill, slot);
+
+ /*
+ * If a device was removed between counting and filling,
+ * we may come up short of fill.max. If a device was
+ * added, we'll have a return of -EAGAIN above.
+ */
+ if (!ret)
+ hdr.count = fill.cur;
+
+reset_info_exit:
+ if (copy_to_user((void __user *)arg, &hdr, minsz))
+ ret = -EFAULT;
+
+ if (!ret) {
+ if (copy_to_user((void __user *)(arg + minsz), devices,
+ hdr.count * sizeof(*devices)))
+ ret = -EFAULT;
+ }
+
+ kfree(devices);
+ return ret;
+
+ } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
+ struct vfio_pci_hot_reset hdr;
+ int32_t *group_fds;
+ struct vfio_pci_group *groups;
+ struct vfio_pci_group_info info;
+ bool slot = false;
+ int i, count = 0, ret = 0;
+
+ minsz = offsetofend(struct vfio_pci_hot_reset, count);
+
+ if (copy_from_user(&hdr, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (hdr.argsz < minsz || hdr.flags)
+ return -EINVAL;
+
+ /* Can we do a slot or bus reset or neither? */
+ if (!pci_probe_reset_slot(vdev->pdev->slot))
+ slot = true;
+ else if (pci_probe_reset_bus(vdev->pdev->bus))
+ return -ENODEV;
+
+ /*
+ * We can't let userspace give us an arbitrarily large
+ * buffer to copy, so verify how many we think there
+ * could be. Note groups can have multiple devices so
+ * one group per device is the max.
+ */
+ ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
+ vfio_pci_count_devs,
+ &count, slot);
+ if (ret)
+ return ret;
+
+ /* Somewhere between 1 and count is OK */
+ if (!hdr.count || hdr.count > count)
+ return -EINVAL;
+
+ group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
+ groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
+ if (!group_fds || !groups) {
+ kfree(group_fds);
+ kfree(groups);
+ return -ENOMEM;
+ }
+
+ if (copy_from_user(group_fds, (void __user *)(arg + minsz),
+ hdr.count * sizeof(*group_fds))) {
+ kfree(group_fds);
+ kfree(groups);
+ return -EFAULT;
+ }
+
+ /*
+ * For each group_fd, get the group through the vfio external
+ * user interface and store the group and iommu ID. This
+ * ensures the group is held across the reset.
+ */
+ for (i = 0; i < hdr.count; i++) {
+ struct vfio_group *group;
+ struct fd f = fdget(group_fds[i]);
+ if (!f.file) {
+ ret = -EBADF;
+ break;
+ }
+
+ group = vfio_group_get_external_user(f.file);
+ fdput(f);
+ if (IS_ERR(group)) {
+ ret = PTR_ERR(group);
+ break;
+ }
+
+ groups[i].group = group;
+ groups[i].id = vfio_external_user_iommu_id(group);
+ }
+
+ kfree(group_fds);
+
+ /* release reference to groups on error */
+ if (ret)
+ goto hot_reset_release;
+
+ info.count = hdr.count;
+ info.groups = groups;
+
+ /*
+ * Test whether all the affected devices are contained
+ * by the set of groups provided by the user.
+ */
+ ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
+ vfio_pci_validate_devs,
+ &info, slot);
+ if (!ret)
+ /* User has access, do the reset */
+ ret = slot ? pci_reset_slot(vdev->pdev->slot) :
+ pci_reset_bus(vdev->pdev->bus);
+
+hot_reset_release:
+ for (i--; i >= 0; i--)
+ vfio_group_put_external_user(groups[i].group);
+
+ kfree(groups);
+ return ret;;
+ }
+
return -ENOTTY;
}

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 916e444..0fd47f5 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -324,6 +324,44 @@ enum {
VFIO_PCI_NUM_IRQS
};

+/**
+ * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12,
+ * struct vfio_pci_hot_reset_info)
+ *
+ * Return: 0 on success, -errno on failure:
+ * -enospc = insufficient buffer, -enodev = unsupported for device.
+ */
+struct vfio_pci_dependent_device {
+ __u32 group_id;
+ __u16 segment;
+ __u8 bus;
+ __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */
+};
+
+struct vfio_pci_hot_reset_info {
+ __u32 argsz;
+ __u32 flags;
+ __u32 count;
+ struct vfio_pci_dependent_device devices[];
+};
+
+#define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/**
+ * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
+ * struct vfio_pci_hot_reset)
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_pci_hot_reset {
+ __u32 argsz;
+ __u32 flags;
+ __u32 count;
+ __s32 group_fds[];
+};
+
+#define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13)
+
/* -------- API for Type1 VFIO IOMMU -------- */

/**


2013-08-14 22:42:38

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

[+cc Al, linux-fsdevel for fdget/fdput usage]

On Wed, Aug 14, 2013 at 2:10 PM, Alex Williamson
<[email protected]> wrote:
> The current VFIO_DEVICE_RESET interface only maps to PCI use cases
> where we can isolate the reset to the individual PCI function. This
> means the device must support FLR (PCIe or AF), PM reset on D3hot->D0
> transition, device specific reset, or be a singleton device on a bus
> for a secondary bus reset. FLR does not have widespread support,
> PM reset is not very reliable, and bus topology is dictated by the
> system and device design. We need to provide a means for a user to
> induce a bus reset in cases where the existing mechanisms are not
> available or not reliable.
>
> This device specific extension to VFIO provides the user with this
> ability. Two new ioctls are introduced:
> - VFIO_DEVICE_PCI_GET_HOT_RESET_INFO
> - VFIO_DEVICE_PCI_HOT_RESET
>
> The first provides the user with information about the extent of
> devices affected by a hot reset. This is essentially a list of
> devices and the IOMMU groups they belong to. The user may then
> initiate a hot reset by calling the second ioctl. We must be
> careful that the user has ownership of all the affected devices
> found via the first ioctl, so the second ioctl takes a list of file
> descriptors for the VFIO groups affected by the reset. Each group
> must have IOMMU protection established for the ioctl to succeed.
>
> Signed-off-by: Alex Williamson <[email protected]>
> ---
>
> This patch is dependent on v5 "pci: bus and slot reset interfaces" as
> well as "pci: Add probe functions for bus and slot reset".
>
> drivers/vfio/pci/vfio_pci.c | 272 +++++++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/vfio.h | 38 ++++++
> 2 files changed, 309 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index cef6002..eb69bf3 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -227,6 +227,97 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
> return 0;
> }
>
> +static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
> +{
> + (*(int *)data)++;
> + return 0;
> +}
> +
> +struct vfio_pci_fill_info {
> + int max;
> + int cur;
> + struct vfio_pci_dependent_device *devices;
> +};
> +
> +static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> +{
> + struct vfio_pci_fill_info *info = data;
> + struct iommu_group *iommu_group;
> +
> + if (info->cur == info->max)
> + return -EAGAIN; /* Something changed, try again */
> +
> + iommu_group = iommu_group_get(&pdev->dev);
> + if (!iommu_group)
> + return -EPERM; /* Cannot reset non-isolated devices */
> +
> + info->devices[info->cur].group_id = iommu_group_id(iommu_group);
> + info->devices[info->cur].segment = pci_domain_nr(pdev->bus);
> + info->devices[info->cur].bus = pdev->bus->number;
> + info->devices[info->cur].devfn = pdev->devfn;
> + info->cur++;
> + iommu_group_put(iommu_group);
> + return 0;
> +}
> +
> +struct vfio_pci_group {
> + struct vfio_group *group;
> + int id;
> +};
> +
> +struct vfio_pci_group_info {
> + int count;
> + struct vfio_pci_group *groups;
> +};
> +
> +static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
> +{
> + struct vfio_pci_group_info *info = data;
> + struct iommu_group *group;
> + int id, i;
> +
> + group = iommu_group_get(&pdev->dev);
> + if (!group)
> + return -EPERM;
> +
> + id = iommu_group_id(group);
> +
> + for (i = 0; i < info->count; i++)
> + if (info->groups[i].id == id)
> + break;
> +
> + iommu_group_put(group);
> +
> + return (i == info->count) ? -EINVAL : 0;
> +}
> +
> +static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
> + int (*fn)(struct pci_dev *,
> + void *data), void *data,
> + bool slot)
> +{
> + struct pci_dev *tmp;
> + int ret = 0;
> +
> + list_for_each_entry(tmp, &pdev->bus->devices, bus_list) {
> + if (slot && tmp->slot != pdev->slot)
> + continue;
> +
> + ret = fn(tmp, data);
> + if (ret)
> + break;
> +
> + if (tmp->subordinate) {
> + ret = vfio_pci_for_each_slot_or_bus(tmp, fn,
> + data, false);
> + if (ret)
> + break;
> + }
> + }
> +
> + return ret;
> +}

vfio_pci_for_each_slot_or_bus() isn't really vfio-specific, is it? I
mean, traversing the PCI hierarchy doesn't require vfio knowledge. I
think this loop (walking the bus->devices list) skips devices on
"virtual buses" that may be added for SR-IOV. I'm not sure that
pci_walk_bus() handles that correctly either, but at least if you used
that, we could fix the problem in one place.

> +
> static long vfio_pci_ioctl(void *device_data,
> unsigned int cmd, unsigned long arg)
> {
> @@ -407,10 +498,189 @@ static long vfio_pci_ioctl(void *device_data,
>
> return ret;
>
> - } else if (cmd == VFIO_DEVICE_RESET)
> + } else if (cmd == VFIO_DEVICE_RESET) {
> return vdev->reset_works ?
> pci_reset_function(vdev->pdev) : -EINVAL;
>
> + } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
> + struct vfio_pci_hot_reset_info hdr;
> + struct vfio_pci_fill_info fill = { 0 };
> + struct vfio_pci_dependent_device *devices = NULL;
> + bool slot = false;
> + int ret = 0;
> +
> + minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
> +
> + if (copy_from_user(&hdr, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (hdr.argsz < minsz)
> + return -EINVAL;
> +
> + hdr.flags = 0;
> +
> + /* Can we do a slot or bus reset or neither? */
> + if (!pci_probe_reset_slot(vdev->pdev->slot))
> + slot = true;
> + else if (pci_probe_reset_bus(vdev->pdev->bus))
> + return -ENODEV;
> +
> + /* How many devices are affected? */
> + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
> + vfio_pci_count_devs,
> + &fill.max, slot);
> + if (ret)
> + return ret;
> +
> + WARN_ON(!fill.max); /* Should always be at least one */
> +
> + /*
> + * If there's enough space, fill it now, otherwise return
> + * -ENOSPC and the number of devices affected.
> + */
> + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
> + ret = -ENOSPC;
> + hdr.count = fill.max;
> + goto reset_info_exit;
> + }
> +
> + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
> + if (!devices)
> + return -ENOMEM;
> +
> + fill.devices = devices;
> +
> + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
> + vfio_pci_fill_devs,
> + &fill, slot);
> +
> + /*
> + * If a device was removed between counting and filling,
> + * we may come up short of fill.max. If a device was
> + * added, we'll have a return of -EAGAIN above.
> + */
> + if (!ret)
> + hdr.count = fill.cur;
> +
> +reset_info_exit:
> + if (copy_to_user((void __user *)arg, &hdr, minsz))
> + ret = -EFAULT;
> +
> + if (!ret) {
> + if (copy_to_user((void __user *)(arg + minsz), devices,
> + hdr.count * sizeof(*devices)))
> + ret = -EFAULT;
> + }
> +
> + kfree(devices);
> + return ret;
> +
> + } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
> + struct vfio_pci_hot_reset hdr;
> + int32_t *group_fds;
> + struct vfio_pci_group *groups;
> + struct vfio_pci_group_info info;
> + bool slot = false;
> + int i, count = 0, ret = 0;
> +
> + minsz = offsetofend(struct vfio_pci_hot_reset, count);
> +
> + if (copy_from_user(&hdr, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (hdr.argsz < minsz || hdr.flags)
> + return -EINVAL;
> +
> + /* Can we do a slot or bus reset or neither? */
> + if (!pci_probe_reset_slot(vdev->pdev->slot))
> + slot = true;
> + else if (pci_probe_reset_bus(vdev->pdev->bus))
> + return -ENODEV;
> +
> + /*
> + * We can't let userspace give us an arbitrarily large
> + * buffer to copy, so verify how many we think there
> + * could be. Note groups can have multiple devices so
> + * one group per device is the max.
> + */
> + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
> + vfio_pci_count_devs,
> + &count, slot);
> + if (ret)
> + return ret;
> +
> + /* Somewhere between 1 and count is OK */
> + if (!hdr.count || hdr.count > count)
> + return -EINVAL;
> +
> + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
> + groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
> + if (!group_fds || !groups) {
> + kfree(group_fds);
> + kfree(groups);
> + return -ENOMEM;
> + }
> +
> + if (copy_from_user(group_fds, (void __user *)(arg + minsz),
> + hdr.count * sizeof(*group_fds))) {
> + kfree(group_fds);
> + kfree(groups);
> + return -EFAULT;
> + }
> +
> + /*
> + * For each group_fd, get the group through the vfio external
> + * user interface and store the group and iommu ID. This
> + * ensures the group is held across the reset.
> + */
> + for (i = 0; i < hdr.count; i++) {
> + struct vfio_group *group;
> + struct fd f = fdget(group_fds[i]);
> + if (!f.file) {
> + ret = -EBADF;
> + break;
> + }
> +
> + group = vfio_group_get_external_user(f.file);
> + fdput(f);
> + if (IS_ERR(group)) {
> + ret = PTR_ERR(group);
> + break;
> + }
> +
> + groups[i].group = group;
> + groups[i].id = vfio_external_user_iommu_id(group);
> + }
> +
> + kfree(group_fds);
> +
> + /* release reference to groups on error */
> + if (ret)
> + goto hot_reset_release;
> +
> + info.count = hdr.count;
> + info.groups = groups;
> +
> + /*
> + * Test whether all the affected devices are contained
> + * by the set of groups provided by the user.
> + */
> + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
> + vfio_pci_validate_devs,
> + &info, slot);
> + if (!ret)
> + /* User has access, do the reset */
> + ret = slot ? pci_reset_slot(vdev->pdev->slot) :
> + pci_reset_bus(vdev->pdev->bus);
> +
> +hot_reset_release:
> + for (i--; i >= 0; i--)
> + vfio_group_put_external_user(groups[i].group);
> +
> + kfree(groups);
> + return ret;;
> + }
> +
> return -ENOTTY;
> }
>
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 916e444..0fd47f5 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -324,6 +324,44 @@ enum {
> VFIO_PCI_NUM_IRQS
> };
>
> +/**
> + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12,
> + * struct vfio_pci_hot_reset_info)
> + *
> + * Return: 0 on success, -errno on failure:
> + * -enospc = insufficient buffer, -enodev = unsupported for device.
> + */
> +struct vfio_pci_dependent_device {
> + __u32 group_id;
> + __u16 segment;
> + __u8 bus;
> + __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */
> +};
> +
> +struct vfio_pci_hot_reset_info {
> + __u32 argsz;
> + __u32 flags;
> + __u32 count;
> + struct vfio_pci_dependent_device devices[];
> +};
> +
> +#define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +/**
> + * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
> + * struct vfio_pci_hot_reset)
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_pci_hot_reset {
> + __u32 argsz;
> + __u32 flags;
> + __u32 count;
> + __s32 group_fds[];
> +};
> +
> +#define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13)
> +
> /* -------- API for Type1 VFIO IOMMU -------- */
>
> /**
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2013-08-14 23:06:25

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Wed, 2013-08-14 at 16:42 -0600, Bjorn Helgaas wrote:
> [+cc Al, linux-fsdevel for fdget/fdput usage]
>
> On Wed, Aug 14, 2013 at 2:10 PM, Alex Williamson
> <[email protected]> wrote:
> > The current VFIO_DEVICE_RESET interface only maps to PCI use cases
> > where we can isolate the reset to the individual PCI function. This
> > means the device must support FLR (PCIe or AF), PM reset on D3hot->D0
> > transition, device specific reset, or be a singleton device on a bus
> > for a secondary bus reset. FLR does not have widespread support,
> > PM reset is not very reliable, and bus topology is dictated by the
> > system and device design. We need to provide a means for a user to
> > induce a bus reset in cases where the existing mechanisms are not
> > available or not reliable.
> >
> > This device specific extension to VFIO provides the user with this
> > ability. Two new ioctls are introduced:
> > - VFIO_DEVICE_PCI_GET_HOT_RESET_INFO
> > - VFIO_DEVICE_PCI_HOT_RESET
> >
> > The first provides the user with information about the extent of
> > devices affected by a hot reset. This is essentially a list of
> > devices and the IOMMU groups they belong to. The user may then
> > initiate a hot reset by calling the second ioctl. We must be
> > careful that the user has ownership of all the affected devices
> > found via the first ioctl, so the second ioctl takes a list of file
> > descriptors for the VFIO groups affected by the reset. Each group
> > must have IOMMU protection established for the ioctl to succeed.
> >
> > Signed-off-by: Alex Williamson <[email protected]>
> > ---
> >
> > This patch is dependent on v5 "pci: bus and slot reset interfaces" as
> > well as "pci: Add probe functions for bus and slot reset".
> >
> > drivers/vfio/pci/vfio_pci.c | 272 +++++++++++++++++++++++++++++++++++++++++++
> > include/uapi/linux/vfio.h | 38 ++++++
> > 2 files changed, 309 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> > index cef6002..eb69bf3 100644
> > --- a/drivers/vfio/pci/vfio_pci.c
> > +++ b/drivers/vfio/pci/vfio_pci.c
> > @@ -227,6 +227,97 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
> > return 0;
> > }
> >
> > +static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
> > +{
> > + (*(int *)data)++;
> > + return 0;
> > +}
> > +
> > +struct vfio_pci_fill_info {
> > + int max;
> > + int cur;
> > + struct vfio_pci_dependent_device *devices;
> > +};
> > +
> > +static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> > +{
> > + struct vfio_pci_fill_info *info = data;
> > + struct iommu_group *iommu_group;
> > +
> > + if (info->cur == info->max)
> > + return -EAGAIN; /* Something changed, try again */
> > +
> > + iommu_group = iommu_group_get(&pdev->dev);
> > + if (!iommu_group)
> > + return -EPERM; /* Cannot reset non-isolated devices */
> > +
> > + info->devices[info->cur].group_id = iommu_group_id(iommu_group);
> > + info->devices[info->cur].segment = pci_domain_nr(pdev->bus);
> > + info->devices[info->cur].bus = pdev->bus->number;
> > + info->devices[info->cur].devfn = pdev->devfn;
> > + info->cur++;
> > + iommu_group_put(iommu_group);
> > + return 0;
> > +}
> > +
> > +struct vfio_pci_group {
> > + struct vfio_group *group;
> > + int id;
> > +};
> > +
> > +struct vfio_pci_group_info {
> > + int count;
> > + struct vfio_pci_group *groups;
> > +};
> > +
> > +static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
> > +{
> > + struct vfio_pci_group_info *info = data;
> > + struct iommu_group *group;
> > + int id, i;
> > +
> > + group = iommu_group_get(&pdev->dev);
> > + if (!group)
> > + return -EPERM;
> > +
> > + id = iommu_group_id(group);
> > +
> > + for (i = 0; i < info->count; i++)
> > + if (info->groups[i].id == id)
> > + break;
> > +
> > + iommu_group_put(group);
> > +
> > + return (i == info->count) ? -EINVAL : 0;
> > +}
> > +
> > +static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
> > + int (*fn)(struct pci_dev *,
> > + void *data), void *data,
> > + bool slot)
> > +{
> > + struct pci_dev *tmp;
> > + int ret = 0;
> > +
> > + list_for_each_entry(tmp, &pdev->bus->devices, bus_list) {
> > + if (slot && tmp->slot != pdev->slot)
> > + continue;
> > +
> > + ret = fn(tmp, data);
> > + if (ret)
> > + break;
> > +
> > + if (tmp->subordinate) {
> > + ret = vfio_pci_for_each_slot_or_bus(tmp, fn,
> > + data, false);
> > + if (ret)
> > + break;
> > + }
> > + }
> > +
> > + return ret;
> > +}
>
> vfio_pci_for_each_slot_or_bus() isn't really vfio-specific, is it?

It's not, I originally has callbacks split out as PCI patches but I was
able to simplify some things in the code by customizing it to my usage,
so I left it here.

> I mean, traversing the PCI hierarchy doesn't require vfio knowledge. I
> think this loop (walking the bus->devices list) skips devices on
> "virtual buses" that may be added for SR-IOV. I'm not sure that
> pci_walk_bus() handles that correctly either, but at least if you used
> that, we could fix the problem in one place.

I didn't know about pci_walk_bus(), I'll look into switching to it.
Thanks,

Alex

2013-08-19 18:41:30

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Wed, 2013-08-14 at 17:06 -0600, Alex Williamson wrote:
> On Wed, 2013-08-14 at 16:42 -0600, Bjorn Helgaas wrote:
> > [+cc Al, linux-fsdevel for fdget/fdput usage]
> >
> > On Wed, Aug 14, 2013 at 2:10 PM, Alex Williamson
> > <[email protected]> wrote:
> > > The current VFIO_DEVICE_RESET interface only maps to PCI use cases
> > > where we can isolate the reset to the individual PCI function. This
> > > means the device must support FLR (PCIe or AF), PM reset on D3hot->D0
> > > transition, device specific reset, or be a singleton device on a bus
> > > for a secondary bus reset. FLR does not have widespread support,
> > > PM reset is not very reliable, and bus topology is dictated by the
> > > system and device design. We need to provide a means for a user to
> > > induce a bus reset in cases where the existing mechanisms are not
> > > available or not reliable.
> > >
> > > This device specific extension to VFIO provides the user with this
> > > ability. Two new ioctls are introduced:
> > > - VFIO_DEVICE_PCI_GET_HOT_RESET_INFO
> > > - VFIO_DEVICE_PCI_HOT_RESET
> > >
> > > The first provides the user with information about the extent of
> > > devices affected by a hot reset. This is essentially a list of
> > > devices and the IOMMU groups they belong to. The user may then
> > > initiate a hot reset by calling the second ioctl. We must be
> > > careful that the user has ownership of all the affected devices
> > > found via the first ioctl, so the second ioctl takes a list of file
> > > descriptors for the VFIO groups affected by the reset. Each group
> > > must have IOMMU protection established for the ioctl to succeed.
> > >
> > > Signed-off-by: Alex Williamson <[email protected]>
> > > ---
> > >
> > > This patch is dependent on v5 "pci: bus and slot reset interfaces" as
> > > well as "pci: Add probe functions for bus and slot reset".
> > >
> > > drivers/vfio/pci/vfio_pci.c | 272 +++++++++++++++++++++++++++++++++++++++++++
> > > include/uapi/linux/vfio.h | 38 ++++++
> > > 2 files changed, 309 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> > > index cef6002..eb69bf3 100644
> > > --- a/drivers/vfio/pci/vfio_pci.c
> > > +++ b/drivers/vfio/pci/vfio_pci.c
> > > @@ -227,6 +227,97 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
> > > return 0;
> > > }
> > >
> > > +static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
> > > +{
> > > + (*(int *)data)++;
> > > + return 0;
> > > +}
> > > +
> > > +struct vfio_pci_fill_info {
> > > + int max;
> > > + int cur;
> > > + struct vfio_pci_dependent_device *devices;
> > > +};
> > > +
> > > +static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
> > > +{
> > > + struct vfio_pci_fill_info *info = data;
> > > + struct iommu_group *iommu_group;
> > > +
> > > + if (info->cur == info->max)
> > > + return -EAGAIN; /* Something changed, try again */
> > > +
> > > + iommu_group = iommu_group_get(&pdev->dev);
> > > + if (!iommu_group)
> > > + return -EPERM; /* Cannot reset non-isolated devices */
> > > +
> > > + info->devices[info->cur].group_id = iommu_group_id(iommu_group);
> > > + info->devices[info->cur].segment = pci_domain_nr(pdev->bus);
> > > + info->devices[info->cur].bus = pdev->bus->number;
> > > + info->devices[info->cur].devfn = pdev->devfn;
> > > + info->cur++;
> > > + iommu_group_put(iommu_group);
> > > + return 0;
> > > +}
> > > +
> > > +struct vfio_pci_group {
> > > + struct vfio_group *group;
> > > + int id;
> > > +};
> > > +
> > > +struct vfio_pci_group_info {
> > > + int count;
> > > + struct vfio_pci_group *groups;
> > > +};
> > > +
> > > +static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
> > > +{
> > > + struct vfio_pci_group_info *info = data;
> > > + struct iommu_group *group;
> > > + int id, i;
> > > +
> > > + group = iommu_group_get(&pdev->dev);
> > > + if (!group)
> > > + return -EPERM;
> > > +
> > > + id = iommu_group_id(group);
> > > +
> > > + for (i = 0; i < info->count; i++)
> > > + if (info->groups[i].id == id)
> > > + break;
> > > +
> > > + iommu_group_put(group);
> > > +
> > > + return (i == info->count) ? -EINVAL : 0;
> > > +}
> > > +
> > > +static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
> > > + int (*fn)(struct pci_dev *,
> > > + void *data), void *data,
> > > + bool slot)
> > > +{
> > > + struct pci_dev *tmp;
> > > + int ret = 0;
> > > +
> > > + list_for_each_entry(tmp, &pdev->bus->devices, bus_list) {
> > > + if (slot && tmp->slot != pdev->slot)
> > > + continue;
> > > +
> > > + ret = fn(tmp, data);
> > > + if (ret)
> > > + break;
> > > +
> > > + if (tmp->subordinate) {
> > > + ret = vfio_pci_for_each_slot_or_bus(tmp, fn,
> > > + data, false);
> > > + if (ret)
> > > + break;
> > > + }
> > > + }
> > > +
> > > + return ret;
> > > +}
> >
> > vfio_pci_for_each_slot_or_bus() isn't really vfio-specific, is it?
>
> It's not, I originally has callbacks split out as PCI patches but I was
> able to simplify some things in the code by customizing it to my usage,
> so I left it here.
>
> > I mean, traversing the PCI hierarchy doesn't require vfio knowledge. I
> > think this loop (walking the bus->devices list) skips devices on
> > "virtual buses" that may be added for SR-IOV. I'm not sure that
> > pci_walk_bus() handles that correctly either, but at least if you used
> > that, we could fix the problem in one place.
>
> I didn't know about pci_walk_bus(), I'll look into switching to it.

It looks like pci_walk_bus() is a poor replacement for when dealing with
slots. There might be multiple slots on a bus or a mix of slots and
non-slots, so for each device pci_walk_bus() finds on a subordinate bus
I'd need to walk up the tree to find the parent bridge on the original
bus to figure out if it's in the same slot. Should we have a
pci_walk_slot() function? Thanks,

Alex

2013-08-19 20:03:05

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Mon, Aug 19, 2013 at 12:41 PM, Alex Williamson
<[email protected]> wrote:
> On Wed, 2013-08-14 at 17:06 -0600, Alex Williamson wrote:
>> On Wed, 2013-08-14 at 16:42 -0600, Bjorn Helgaas wrote:
>> > On Wed, Aug 14, 2013 at 2:10 PM, Alex Williamson
>> > <[email protected]> wrote:

>> > > +static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
>> > > + int (*fn)(struct pci_dev *,
>> > > + void *data), void *data,
>> > > + bool slot)
>> > > +{
>> > > + struct pci_dev *tmp;
>> > > + int ret = 0;
>> > > +
>> > > + list_for_each_entry(tmp, &pdev->bus->devices, bus_list) {
>> > > + if (slot && tmp->slot != pdev->slot)
>> > > + continue;
>> > > +
>> > > + ret = fn(tmp, data);
>> > > + if (ret)
>> > > + break;
>> > > +
>> > > + if (tmp->subordinate) {
>> > > + ret = vfio_pci_for_each_slot_or_bus(tmp, fn,
>> > > + data, false);
>> > > + if (ret)
>> > > + break;
>> > > + }
>> > > + }
>> > > +
>> > > + return ret;
>> > > +}
>> >
>> > vfio_pci_for_each_slot_or_bus() isn't really vfio-specific, is it?
>>
>> It's not, I originally has callbacks split out as PCI patches but I was
>> able to simplify some things in the code by customizing it to my usage,
>> so I left it here.
>>
>> > I mean, traversing the PCI hierarchy doesn't require vfio knowledge. I
>> > think this loop (walking the bus->devices list) skips devices on
>> > "virtual buses" that may be added for SR-IOV. I'm not sure that
>> > pci_walk_bus() handles that correctly either, but at least if you used
>> > that, we could fix the problem in one place.
>>
>> I didn't know about pci_walk_bus(), I'll look into switching to it.
>
> It looks like pci_walk_bus() is a poor replacement for when dealing with
> slots. There might be multiple slots on a bus or a mix of slots and
> non-slots, so for each device pci_walk_bus() finds on a subordinate bus
> I'd need to walk up the tree to find the parent bridge on the original
> bus to figure out if it's in the same slot.

Do you really care about that scenario? PCIe only supports a single
slot per bus, as far as I know.

> Should we have a pci_walk_slot() function?

I guess. And supply the pci_slot rather than the pci_dev? I'm a
little bit worried because the idea of a "slot" is not well-defined in
the spec, and we have sort of an ad hoc method of discovering and
managing them, e.g., acpiphp and pciehp might discover the same slot.
But I guess that's no reason to bury generic code in vfio.

Bjorn

2013-08-19 20:20:40

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Mon, 2013-08-19 at 14:02 -0600, Bjorn Helgaas wrote:
> On Mon, Aug 19, 2013 at 12:41 PM, Alex Williamson
> <[email protected]> wrote:
> > On Wed, 2013-08-14 at 17:06 -0600, Alex Williamson wrote:
> >> On Wed, 2013-08-14 at 16:42 -0600, Bjorn Helgaas wrote:
> >> > On Wed, Aug 14, 2013 at 2:10 PM, Alex Williamson
> >> > <[email protected]> wrote:
>
> >> > > +static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
> >> > > + int (*fn)(struct pci_dev *,
> >> > > + void *data), void *data,
> >> > > + bool slot)
> >> > > +{
> >> > > + struct pci_dev *tmp;
> >> > > + int ret = 0;
> >> > > +
> >> > > + list_for_each_entry(tmp, &pdev->bus->devices, bus_list) {
> >> > > + if (slot && tmp->slot != pdev->slot)
> >> > > + continue;
> >> > > +
> >> > > + ret = fn(tmp, data);
> >> > > + if (ret)
> >> > > + break;
> >> > > +
> >> > > + if (tmp->subordinate) {
> >> > > + ret = vfio_pci_for_each_slot_or_bus(tmp, fn,
> >> > > + data, false);
> >> > > + if (ret)
> >> > > + break;
> >> > > + }
> >> > > + }
> >> > > +
> >> > > + return ret;
> >> > > +}
> >> >
> >> > vfio_pci_for_each_slot_or_bus() isn't really vfio-specific, is it?
> >>
> >> It's not, I originally has callbacks split out as PCI patches but I was
> >> able to simplify some things in the code by customizing it to my usage,
> >> so I left it here.
> >>
> >> > I mean, traversing the PCI hierarchy doesn't require vfio knowledge. I
> >> > think this loop (walking the bus->devices list) skips devices on
> >> > "virtual buses" that may be added for SR-IOV. I'm not sure that
> >> > pci_walk_bus() handles that correctly either, but at least if you used
> >> > that, we could fix the problem in one place.
> >>
> >> I didn't know about pci_walk_bus(), I'll look into switching to it.
> >
> > It looks like pci_walk_bus() is a poor replacement for when dealing with
> > slots. There might be multiple slots on a bus or a mix of slots and
> > non-slots, so for each device pci_walk_bus() finds on a subordinate bus
> > I'd need to walk up the tree to find the parent bridge on the original
> > bus to figure out if it's in the same slot.
>
> Do you really care about that scenario? PCIe only supports a single
> slot per bus, as far as I know.

I believe that's true for pciehp, but I can easily imagine that it's not
the case for other hotplug controllers. I don't run into this scenario
on any of my hardware, but I also don't want to embed any pciehp
assumptions either. So I care for the sake of completeness, but I'm not
targeting specific hardware that needs this.

> > Should we have a pci_walk_slot() function?
>
> I guess. And supply the pci_slot rather than the pci_dev? I'm a
> little bit worried because the idea of a "slot" is not well-defined in
> the spec, and we have sort of an ad hoc method of discovering and
> managing them, e.g., acpiphp and pciehp might discover the same slot.
> But I guess that's no reason to bury generic code in vfio.

I try to handle the slot as opaque, only caring that the slot pointer
matches, so I think our implementation is ok... so long as we only get
one driver claiming to manage a slot, but that's not a vfio problem ;)
Thanks,

Alex

2013-08-19 22:43:03

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Mon, 2013-08-19 at 14:02 -0600, Bjorn Helgaas wrote:
> I guess. And supply the pci_slot rather than the pci_dev? I'm a
> little bit worried because the idea of a "slot" is not well-defined in
> the spec, and we have sort of an ad hoc method of discovering and
> managing them, e.g., acpiphp and pciehp might discover the same slot.
> But I guess that's no reason to bury generic code in vfio.

And I don't have pci_slot's at all yet on powerpc "powernv" (the host
platform for KVM) since at this stage we don't support physical hotplug
on the target machines...

Alex, why specifically looking for "slots" here ? I don't quite
understand. It makes sense to be able to reset individual devices
whether they are on the otherboard, behind extension chassis or directly
on slots...

Cheers,
Ben.

2013-08-19 22:44:50

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Mon, 2013-08-19 at 14:20 -0600, Alex Williamson wrote:
> I try to handle the slot as opaque, only caring that the slot pointer
> matches, so I think our implementation is ok... so long as we only get
> one driver claiming to manage a slot, but that's not a vfio problem ;)
> Thanks,

By why bother with slots ? Why do you even think about slots in that
context ? slots are a badly defined thing in our current PCI stack,
pretty much intricated with hotplug. I don't see why the reset semantics
would be tied to slots at all.

The only case where it *might* make some sense (and even then ...) is if
you want to start exposing slot power control and PERST but that would
imply a pile of platform specific gunk anyway.

Ben.

2013-08-19 22:59:44

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Tue, 2013-08-20 at 08:42 +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2013-08-19 at 14:02 -0600, Bjorn Helgaas wrote:
> > I guess. And supply the pci_slot rather than the pci_dev? I'm a
> > little bit worried because the idea of a "slot" is not well-defined in
> > the spec, and we have sort of an ad hoc method of discovering and
> > managing them, e.g., acpiphp and pciehp might discover the same slot.
> > But I guess that's no reason to bury generic code in vfio.
>
> And I don't have pci_slot's at all yet on powerpc "powernv" (the host
> platform for KVM) since at this stage we don't support physical hotplug
> on the target machines...
>
> Alex, why specifically looking for "slots" here ? I don't quite
> understand. It makes sense to be able to reset individual devices
> whether they are on the otherboard, behind extension chassis or directly
> on slots...

a) resetting a slot may have a smaller footprint than resetting a bus,
b) hotplug controllers sometimes need to be involved in a bus reset.
For b) I have a specific example where my Lenovo S20 workstation has an
onboard tg3 NIC attached to a root port supporting pciehp (go figure
since the tg3 is soldered onto the motherboard) and doing a secondary
bus reset at the root port triggers a presence detection change and
therefore tries to do a surprise removal. By doing a "slot" reset, I
have the hotplug controller code manage the bus reset by disabling
presence detection around the bus reset. If you don't have slots and
you don't need anything special around a secondary bus reset, you're
fine. It's just an opportunity to provide a hook for the hotplug
controller to participate. Thanks,

Alex

2013-08-19 23:02:52

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Tue, 2013-08-20 at 08:44 +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2013-08-19 at 14:20 -0600, Alex Williamson wrote:
> > I try to handle the slot as opaque, only caring that the slot pointer
> > matches, so I think our implementation is ok... so long as we only get
> > one driver claiming to manage a slot, but that's not a vfio problem ;)
> > Thanks,
>
> By why bother with slots ? Why do you even think about slots in that
> context ? slots are a badly defined thing in our current PCI stack,
> pretty much intricated with hotplug. I don't see why the reset semantics
> would be tied to slots at all.

See my other reply, hotplug presence detection and secondary bus resets
don't just work.

> The only case where it *might* make some sense (and even then ...) is if
> you want to start exposing slot power control and PERST but that would
> imply a pile of platform specific gunk anyway.

But that platform specific gunk can be hidden away in the hotplug
controller. We just need to be able to ask if it can reset a slot and
tell it to do it. If it happens via a slot power control or a secondary
bus reset, do we care? Thanks,

Alex

2013-08-19 23:52:20

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Mon, 2013-08-19 at 16:59 -0600, Alex Williamson wrote:
> On Tue, 2013-08-20 at 08:42 +1000, Benjamin Herrenschmidt wrote:
> > On Mon, 2013-08-19 at 14:02 -0600, Bjorn Helgaas wrote:
> > > I guess. And supply the pci_slot rather than the pci_dev? I'm a
> > > little bit worried because the idea of a "slot" is not well-defined in
> > > the spec, and we have sort of an ad hoc method of discovering and
> > > managing them, e.g., acpiphp and pciehp might discover the same slot.
> > > But I guess that's no reason to bury generic code in vfio.
> >
> > And I don't have pci_slot's at all yet on powerpc "powernv" (the host
> > platform for KVM) since at this stage we don't support physical hotplug
> > on the target machines...
> >
> > Alex, why specifically looking for "slots" here ? I don't quite
> > understand. It makes sense to be able to reset individual devices
> > whether they are on the otherboard, behind extension chassis or directly
> > on slots...
>
> a) resetting a slot may have a smaller footprint than resetting a bus,

*May* ... at least on PCIe there is no difference. I suppose PCI pre-E
slots might have individual reset controls.... though the way to get
them is fairly platform specific.

> b) hotplug controllers sometimes need to be involved in a bus reset.
> For b) I have a specific example where my Lenovo S20 workstation has an
> onboard tg3 NIC attached to a root port supporting pciehp (go figure
> since the tg3 is soldered onto the motherboard) and doing a secondary
> bus reset at the root port triggers a presence detection change and
> therefore tries to do a surprise removal. By doing a "slot" reset, I
> have the hotplug controller code manage the bus reset by disabling
> presence detection around the bus reset. If you don't have slots and
> you don't need anything special around a secondary bus reset, you're
> fine. It's just an opportunity to provide a hook for the hotplug
> controller to participate. Thanks,

Yuck, junk HW again ... oh well, I suppose that's never going to end...

As long as the code works without the slots I'm fine :-) As I mentioned,
we might have to do a whole different infrastructure for EEH anyway
(which sucks but we have little choice in the matter).

Cheers,
Ben.

> Alex
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2013-08-20 03:18:29

by Al Viro

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Wed, Aug 14, 2013 at 04:42:14PM -0600, Bjorn Helgaas wrote:
> [+cc Al, linux-fsdevel for fdget/fdput usage]

fdget/fdput use looks sane, the only thing is that I would rather
have an explicit include of linux/file.h instead of relying upon
linux/eventfd.h pulling it. Incidentally, there are only 5 files
that include the latter without an explicit include of the former -
drivers/vfio/pci/vfio_pci.c, drivers/vhost/scsi.c, kernel/cgroup.c,
mm/memcontrol.c and mm/vmpressure.c. And only kernel/cgroup.c (and,
with this patch, vfio_pci.c) really wants anything from linux/file.h,
so I'd rather kill that indirect include in eventfd.h and slapped
an explicit include of file.h in these two files...

BTW, most of the eventfd_fget() users might as well be using fget()
(or fdget(), for that matter). They tend to be immediately followed
by eventfd_ctx_fileget(), which repeats the "is that an eventfd file?"
check anyway.

Completely untested patch below does that to kernel/cgroup.c; Tejun,
Davide - do you have any objections against the following?

Kill indirect include of file.h from eventfd.h, use fdget() in cgroup.c

kernel/cgroup.c is the only place in the tree that relies on eventfd.h
pulling file.h; move that include there. Switch from eventfd_fget()/fput()
to fdget()/fdput(), while we are at it - eventfd_ctx_fileget() will fail
on non-eventfd descriptors just fine, no need to do that check twice...

Signed-off-by: Al Viro <[email protected]>
---
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index cf5d2af..ff0b981 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -9,7 +9,6 @@
#define _LINUX_EVENTFD_H

#include <linux/fcntl.h>
-#include <linux/file.h>
#include <linux/wait.h>

/*
@@ -26,6 +25,8 @@
#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)

+struct file;
+
#ifdef CONFIG_EVENTFD

struct file *eventfd_file_create(unsigned int count, int flags);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 781845a..f88ecaf 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>
+#include <linux/file.h>

#include <linux/atomic.h>

@@ -3969,8 +3970,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
struct cgroup_event *event = NULL;
struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
- struct file *efile = NULL;
- struct file *cfile = NULL;
+ struct fd efile;
+ struct fd cfile;
char *endp;
int ret;

@@ -3993,31 +3994,31 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
INIT_WORK(&event->remove, cgroup_event_remove);

- efile = eventfd_fget(efd);
- if (IS_ERR(efile)) {
- ret = PTR_ERR(efile);
- goto fail;
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
+ goto fail1;
}

- event->eventfd = eventfd_ctx_fileget(efile);
+ event->eventfd = eventfd_ctx_fileget(efile.file);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
- goto fail;
+ goto fail2;
}

- cfile = fget(cfd);
- if (!cfile) {
+ cfile = fdget(cfd);
+ if (!cfile.file) {
ret = -EBADF;
- goto fail;
+ goto fail3;
}

/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(file_inode(cfile), MAY_READ);
+ ret = inode_permission(file_inode(cfile.file), MAY_READ);
if (ret < 0)
goto fail;

- event->cft = __file_cft(cfile);
+ event->cft = __file_cft(cfile.file);
if (IS_ERR(event->cft)) {
ret = PTR_ERR(event->cft);
goto fail;
@@ -4027,7 +4028,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
* The file to be monitored must be in the same cgroup as
* cgroup.event_control is.
*/
- cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+ cgrp_cfile = __d_cgrp(cfile.file->f_dentry->d_parent);
if (cgrp_cfile != cgrp) {
ret = -EINVAL;
goto fail;
@@ -4043,7 +4044,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
if (ret)
goto fail;

- efile->f_op->poll(efile, &event->pt);
+ efile.file->f_op->poll(efile.file, &event->pt);

/*
* Events should be removed after rmdir of cgroup directory, but before
@@ -4056,21 +4057,18 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
list_add(&event->list, &cgrp->event_list);
spin_unlock(&cgrp->event_list_lock);

- fput(cfile);
- fput(efile);
+ fdput(cfile);
+ fdput(efile);

return 0;

fail:
- if (cfile)
- fput(cfile);
-
- if (event && event->eventfd && !IS_ERR(event->eventfd))
- eventfd_ctx_put(event->eventfd);
-
- if (!IS_ERR_OR_NULL(efile))
- fput(efile);
-
+ fdput(cfile);
+fail3:
+ eventfd_ctx_put(event->eventfd);
+fail2:
+ fdput(efile);
+fail1:
kfree(event);

return ret;

2013-08-20 03:53:20

by Alex Williamson

[permalink] [raw]
Subject: Re: [PATCH] vfio-pci: PCI hot reset interface

On Tue, 2013-08-20 at 04:18 +0100, Al Viro wrote:
> On Wed, Aug 14, 2013 at 04:42:14PM -0600, Bjorn Helgaas wrote:
> > [+cc Al, linux-fsdevel for fdget/fdput usage]
>
> fdget/fdput use looks sane, the only thing is that I would rather
> have an explicit include of linux/file.h instead of relying upon
> linux/eventfd.h pulling it.

Thanks for reviewing, I'll add an explicit include.

> Incidentally, there are only 5 files
> that include the latter without an explicit include of the former -
> drivers/vfio/pci/vfio_pci.c, drivers/vhost/scsi.c, kernel/cgroup.c,
> mm/memcontrol.c and mm/vmpressure.c. And only kernel/cgroup.c (and,
> with this patch, vfio_pci.c) really wants anything from linux/file.h,
> so I'd rather kill that indirect include in eventfd.h and slapped
> an explicit include of file.h in these two files...
>
> BTW, most of the eventfd_fget() users might as well be using fget()
> (or fdget(), for that matter). They tend to be immediately followed
> by eventfd_ctx_fileget(), which repeats the "is that an eventfd file?"
> check anyway.

Hmm, I've got one of those elsewhere in vfio code too. Thanks for the
tip.

Alex

> Completely untested patch below does that to kernel/cgroup.c; Tejun,
> Davide - do you have any objections against the following?
>
> Kill indirect include of file.h from eventfd.h, use fdget() in cgroup.c
>
> kernel/cgroup.c is the only place in the tree that relies on eventfd.h
> pulling file.h; move that include there. Switch from eventfd_fget()/fput()
> to fdget()/fdput(), while we are at it - eventfd_ctx_fileget() will fail
> on non-eventfd descriptors just fine, no need to do that check twice...
>
> Signed-off-by: Al Viro <[email protected]>
> ---
> diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
> index cf5d2af..ff0b981 100644
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -9,7 +9,6 @@
> #define _LINUX_EVENTFD_H
>
> #include <linux/fcntl.h>
> -#include <linux/file.h>
> #include <linux/wait.h>
>
> /*
> @@ -26,6 +25,8 @@
> #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
> #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
>
> +struct file;
> +
> #ifdef CONFIG_EVENTFD
>
> struct file *eventfd_file_create(unsigned int count, int flags);
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 781845a..f88ecaf 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -60,6 +60,7 @@
> #include <linux/poll.h>
> #include <linux/flex_array.h> /* used in cgroup_attach_task */
> #include <linux/kthread.h>
> +#include <linux/file.h>
>
> #include <linux/atomic.h>
>
> @@ -3969,8 +3970,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
> struct cgroup_event *event = NULL;
> struct cgroup *cgrp_cfile;
> unsigned int efd, cfd;
> - struct file *efile = NULL;
> - struct file *cfile = NULL;
> + struct fd efile;
> + struct fd cfile;
> char *endp;
> int ret;
>
> @@ -3993,31 +3994,31 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
> init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
> INIT_WORK(&event->remove, cgroup_event_remove);
>
> - efile = eventfd_fget(efd);
> - if (IS_ERR(efile)) {
> - ret = PTR_ERR(efile);
> - goto fail;
> + efile = fdget(efd);
> + if (!efile.file) {
> + ret = -EBADF;
> + goto fail1;
> }
>
> - event->eventfd = eventfd_ctx_fileget(efile);
> + event->eventfd = eventfd_ctx_fileget(efile.file);
> if (IS_ERR(event->eventfd)) {
> ret = PTR_ERR(event->eventfd);
> - goto fail;
> + goto fail2;
> }
>
> - cfile = fget(cfd);
> - if (!cfile) {
> + cfile = fdget(cfd);
> + if (!cfile.file) {
> ret = -EBADF;
> - goto fail;
> + goto fail3;
> }
>
> /* the process need read permission on control file */
> /* AV: shouldn't we check that it's been opened for read instead? */
> - ret = inode_permission(file_inode(cfile), MAY_READ);
> + ret = inode_permission(file_inode(cfile.file), MAY_READ);
> if (ret < 0)
> goto fail;
>
> - event->cft = __file_cft(cfile);
> + event->cft = __file_cft(cfile.file);
> if (IS_ERR(event->cft)) {
> ret = PTR_ERR(event->cft);
> goto fail;
> @@ -4027,7 +4028,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
> * The file to be monitored must be in the same cgroup as
> * cgroup.event_control is.
> */
> - cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
> + cgrp_cfile = __d_cgrp(cfile.file->f_dentry->d_parent);
> if (cgrp_cfile != cgrp) {
> ret = -EINVAL;
> goto fail;
> @@ -4043,7 +4044,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
> if (ret)
> goto fail;
>
> - efile->f_op->poll(efile, &event->pt);
> + efile.file->f_op->poll(efile.file, &event->pt);
>
> /*
> * Events should be removed after rmdir of cgroup directory, but before
> @@ -4056,21 +4057,18 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
> list_add(&event->list, &cgrp->event_list);
> spin_unlock(&cgrp->event_list_lock);
>
> - fput(cfile);
> - fput(efile);
> + fdput(cfile);
> + fdput(efile);
>
> return 0;
>
> fail:
> - if (cfile)
> - fput(cfile);
> -
> - if (event && event->eventfd && !IS_ERR(event->eventfd))
> - eventfd_ctx_put(event->eventfd);
> -
> - if (!IS_ERR_OR_NULL(efile))
> - fput(efile);
> -
> + fdput(cfile);
> +fail3:
> + eventfd_ctx_put(event->eventfd);
> +fail2:
> + fdput(efile);
> +fail1:
> kfree(event);
>
> return ret;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/