The ioeventfd here is actually irqfd handling of an ioeventfd, such as is
supported in KVM. A user is able to pre-program a device write to
occur when the eventfd triggers. This is yet another instance of
eventfd-irqfd triggering between KVM and vfio. The impetus for this
is high-frequency writes to pages which are virtualized in QEMU.
Enabling this near-direct write path for selected registers within
the virtualized page can improve performance and reduce overhead.
Specifically, this is initially targeted at NVIDIA graphics cards, where
the driver issues a write to an MMIO register within a virtualized
region in order to allow the MSI interrupt to re-trigger.
Signed-off-by: Alex Williamson <[email protected]>
---
drivers/vfio/pci/vfio_pci.c | 33 +++++++
drivers/vfio/pci/vfio_pci_private.h | 14 +++
drivers/vfio/pci/vfio_pci_rdwr.c | 165 ++++++++++++++++++++++++++++++++---
include/uapi/linux/vfio.h | 24 +++++
4 files changed, 224 insertions(+), 12 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index f041b1a6cf66..c8e7297a61a3 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -302,6 +302,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
struct pci_dev *pdev = vdev->pdev;
struct vfio_pci_dummy_resource *dummy_res, *tmp;
+ struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
int i, bar;
/* Stop the device from further DMA */
@@ -311,6 +312,14 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
VFIO_IRQ_SET_ACTION_TRIGGER,
vdev->irq_type, 0, 0, NULL);
+ /* Device closed, don't need mutex here */
+ list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
+ &vdev->ioeventfds_list, next) {
+ vfio_virqfd_disable(&ioeventfd->virqfd);
+ list_del(&ioeventfd->next);
+ kfree(ioeventfd);
+ }
+
vdev->virq_disabled = false;
for (i = 0; i < vdev->num_regions; i++)
@@ -1039,6 +1048,28 @@ static long vfio_pci_ioctl(void *device_data,
kfree(groups);
return ret;
+ } else if (cmd == VFIO_DEVICE_IOEVENTFD) {
+ struct vfio_device_ioeventfd ioeventfd;
+ int count;
+
+ minsz = offsetofend(struct vfio_device_ioeventfd, fd);
+
+ if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (ioeventfd.argsz < minsz)
+ return -EINVAL;
+
+ if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
+ return -EINVAL;
+
+ count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
+
+ if (hweight8(count) != 1 || ioeventfd.fd < -1)
+ return -EINVAL;
+
+ return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
+ ioeventfd.data, count, ioeventfd.fd);
}
return -ENOTTY;
@@ -1217,6 +1248,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
vdev->irq_type = VFIO_PCI_NUM_IRQS;
mutex_init(&vdev->igate);
spin_lock_init(&vdev->irqlock);
+ mutex_init(&vdev->ioeventfds_lock);
+ INIT_LIST_HEAD(&vdev->ioeventfds_list);
ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
if (ret) {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index f561ac1c78a0..23797622396e 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -29,6 +29,15 @@
#define PCI_CAP_ID_INVALID 0xFF /* default raw access */
#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */
+struct vfio_pci_ioeventfd {
+ struct list_head next;
+ struct virqfd *virqfd;
+ loff_t pos;
+ uint64_t data;
+ int bar;
+ int count;
+};
+
struct vfio_pci_irq_ctx {
struct eventfd_ctx *trigger;
struct virqfd *unmask;
@@ -95,6 +104,8 @@ struct vfio_pci_device {
struct eventfd_ctx *err_trigger;
struct eventfd_ctx *req_trigger;
struct list_head dummy_resources_list;
+ struct mutex ioeventfds_lock;
+ struct list_head ioeventfds_list;
};
#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
@@ -120,6 +131,9 @@ extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite);
+extern long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
+ uint64_t data, int count, int fd);
+
extern int vfio_pci_init_perm_bits(void);
extern void vfio_pci_uninit_perm_bits(void);
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 357243d76f10..55bb4517d4ba 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -17,6 +17,7 @@
#include <linux/pci.h>
#include <linux/uaccess.h>
#include <linux/io.h>
+#include <linux/vfio.h>
#include <linux/vgaarb.h>
#include "vfio_pci_private.h"
@@ -113,6 +114,30 @@ static ssize_t do_io_rw(void __iomem *io, char __user *buf,
return done;
}
+static int vfio_pci_setup_barmap(struct vfio_pci_device *vdev, int bar)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ int ret;
+ void __iomem *io;
+
+ if (vdev->barmap[bar])
+ return 0;
+
+ ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
+ if (ret)
+ return ret;
+
+ io = pci_iomap(pdev, bar, 0);
+ if (!io) {
+ pci_release_selected_regions(pdev, 1 << bar);
+ return -ENOMEM;
+ }
+
+ vdev->barmap[bar] = io;
+
+ return 0;
+}
+
ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
{
@@ -147,22 +172,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
if (!io)
return -ENOMEM;
x_end = end;
- } else if (!vdev->barmap[bar]) {
- int ret;
-
- ret = pci_request_selected_regions(pdev, 1 << bar, "vfio");
+ } else {
+ int ret = vfio_pci_setup_barmap(vdev, bar);
if (ret)
return ret;
- io = pci_iomap(pdev, bar, 0);
- if (!io) {
- pci_release_selected_regions(pdev, 1 << bar);
- return -ENOMEM;
- }
-
- vdev->barmap[bar] = io;
- } else
io = vdev->barmap[bar];
+ }
if (bar == vdev->msix_bar) {
x_start = vdev->msix_offset;
@@ -242,3 +258,128 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf,
return done;
}
+
+static int vfio_pci_ioeventfd_handler8(void *opaque, void *data)
+{
+ iowrite8((unsigned long)data, opaque);
+ return 0;
+}
+
+static int vfio_pci_ioeventfd_handler16(void *opaque, void *data)
+{
+ iowrite16((unsigned long)data, opaque);
+ return 0;
+}
+
+static int vfio_pci_ioeventfd_handler32(void *opaque, void *data)
+{
+ iowrite32((unsigned long)data, opaque);
+ return 0;
+}
+
+#ifdef iowrite64
+static int vfio_pci_ioeventfd_handler64(void *opaque, void *data)
+{
+ iowrite64((unsigned long)data, opaque);
+ return 0;
+}
+#endif
+
+long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
+ uint64_t data, int count, int fd)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ loff_t pos = offset & VFIO_PCI_OFFSET_MASK;
+ int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset);
+ struct vfio_pci_ioeventfd *ioeventfd;
+ int (*handler)(void *, void *);
+ unsigned long val;
+
+ /* Only support ioeventfds into BARs */
+ if (bar > VFIO_PCI_BAR5_REGION_INDEX)
+ return -EINVAL;
+
+ if (pos + count > pci_resource_len(pdev, bar))
+ return -EINVAL;
+
+ /* Disallow ioeventfds working around MSI-X table writes */
+ if (bar == vdev->msix_bar &&
+ !(pos + count <= vdev->msix_offset ||
+ pos >= vdev->msix_offset + vdev->msix_size))
+ return -EINVAL;
+
+ switch (count) {
+ case 1:
+ handler = &vfio_pci_ioeventfd_handler8;
+ val = data;
+ break;
+ case 2:
+ handler = &vfio_pci_ioeventfd_handler16;
+ val = le16_to_cpu(data);
+ break;
+ case 4:
+ handler = &vfio_pci_ioeventfd_handler32;
+ val = le32_to_cpu(data);
+ break;
+#ifdef iowrite64
+ case 8:
+ handler = &vfio_pci_ioeventfd_handler64;
+ val = le64_to_cpu(data);
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ ret = vfio_pci_setup_barmap(vdev, bar);
+ if (ret)
+ return ret;
+
+ mutex_lock(&vdev->ioeventfds_lock);
+
+ list_for_each_entry(ioeventfd, &vdev->ioeventfds_list, next) {
+ if (ioeventfd->pos == pos && ioeventfd->bar == bar &&
+ ioeventfd->data == data && ioeventfd->count == count) {
+ if (fd == -1) {
+ vfio_virqfd_disable(&ioeventfd->virqfd);
+ list_del(&ioeventfd->next);
+ kfree(ioeventfd);
+ ret = 0;
+ } else
+ ret = -EEXIST;
+
+ goto out_unlock;
+ }
+ }
+
+ if (fd < 0) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL);
+ if (!ioeventfd) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ ioeventfd->pos = pos;
+ ioeventfd->bar = bar;
+ ioeventfd->data = data;
+ ioeventfd->count = count;
+
+ ret = vfio_virqfd_enable(vdev->barmap[ioeventfd->bar] + ioeventfd->pos,
+ handler, NULL, (void *)val,
+ &ioeventfd->virqfd, fd);
+ if (ret) {
+ kfree(ioeventfd);
+ goto out_unlock;
+ }
+
+ list_add(&ioeventfd->next, &vdev->ioeventfds_list);
+
+out_unlock:
+ mutex_unlock(&vdev->ioeventfds_lock);
+
+ return ret;
+}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e3301dbd27d4..07966a5f0832 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -503,6 +503,30 @@ struct vfio_pci_hot_reset {
#define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13)
+/**
+ * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 14,
+ * struct vfio_device_ioeventfd)
+ *
+ * Perform a write to the device at the specified device fd offset, with
+ * the specified data and width when the provided eventfd is triggered.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_ioeventfd {
+ __u32 argsz;
+ __u32 flags;
+#define VFIO_DEVICE_IOEVENTFD_8 (1 << 0) /* 1-byte write */
+#define VFIO_DEVICE_IOEVENTFD_16 (1 << 1) /* 2-byte write */
+#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */
+#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */
+#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf)
+ __u64 offset; /* device fd offset of write */
+ __u64 data; /* data to be written */
+ __s32 fd; /* -1 for de-assignment */
+};
+
+#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 14)
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**
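
For illustration, here is a minimal userspace sketch of the proposed
interface; the helper name and the way the offset is computed are
illustrative assumptions, not part of the patch:

  #include <stdint.h>
  #include <unistd.h>
  #include <sys/eventfd.h>
  #include <sys/ioctl.h>
  #include <linux/vfio.h>

  /*
   * Arm a 4-byte ioeventfd: once armed, signaling the returned eventfd
   * makes the kernel perform the pre-programmed write of @data at
   * @offset within the device fd (the region offset reported by
   * VFIO_DEVICE_GET_REGION_INFO plus the register offset in the BAR).
   */
  static int arm_ioeventfd(int device_fd, uint64_t offset, uint64_t data)
  {
          struct vfio_device_ioeventfd ie = {
                  .argsz  = sizeof(ie),
                  .flags  = VFIO_DEVICE_IOEVENTFD_32,
                  .offset = offset,
                  .data   = data,
                  .fd     = eventfd(0, EFD_CLOEXEC),
          };

          if (ie.fd < 0)
                  return -1;

          if (ioctl(device_fd, VFIO_DEVICE_IOEVENTFD, &ie)) {
                  close(ie.fd);   /* offset not supported for ioeventfds */
                  return -1;
          }
          return ie.fd;
  }

De-assignment passes the same offset/data/size with fd = -1.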
On 07/02/18 11:08, Alex Williamson wrote:
> [...]
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index e3301dbd27d4..07966a5f0832 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -503,6 +503,30 @@ struct vfio_pci_hot_reset {
>
> #define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13)
>
> +/**
> + * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 14,
> + * struct vfio_device_ioeventfd)
> + *
> + * Perform a write to the device at the specified device fd offset, with
> + * the specified data and width when the provided eventfd is triggered.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_ioeventfd {
> + __u32 argsz;
> + __u32 flags;
> +#define VFIO_DEVICE_IOEVENTFD_8 (1 << 0) /* 1-byte write */
> +#define VFIO_DEVICE_IOEVENTFD_16 (1 << 1) /* 2-byte write */
> +#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */
> +#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */
> +#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf)
> + __u64 offset; /* device fd offset of write */
> + __u64 data; /* data to be written */
> + __s32 fd; /* -1 for de-assignment */
> +};
> +
> +#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 14)
Is this the first ioctl with endianness fixed to little-endian? I'd suggest
commenting on that, as things like vfio_info_cap_header do use the host
endianness.
--
Alexey
On Wed, 7 Feb 2018 15:09:22 +1100
Alexey Kardashevskiy <[email protected]> wrote:
> On 07/02/18 11:08, Alex Williamson wrote:
> > [...]
>
> Is this the first ioctl with endianness fixed to little-endian? I'd suggest
> commenting on that, as things like vfio_info_cap_header do use the host
> endianness.
Look at our current read and write interface, we call leXX_to_cpu
before calling iowriteXX there and I think a user would logically
expect to use the same data format here as they would there. Also note
that iowriteXX does a cpu_to_leXX, so are we really defining the
interface as little-endian or are we just trying to make ourselves
endian neutral and counter that implicit conversion? Thanks,
Alex
On 07/02/18 15:25, Alex Williamson wrote:
> On Wed, 7 Feb 2018 15:09:22 +1100
> Alexey Kardashevskiy <[email protected]> wrote:
>> On 07/02/18 11:08, Alex Williamson wrote:
>>> [...]
>>
>> Is this the first ioctl with endianness fixed to little-endian? I'd suggest
>> commenting on that, as things like vfio_info_cap_header do use the host
>> endianness.
>
> Look at our current read and write interface, we call leXX_to_cpu
> before calling iowriteXX there and I think a user would logically
> expect to use the same data format here as they would there.
If the data is "char data[8]" (i.e. bytestream), then it can be expected to
be device/bus endian (i.e. PCI == little endian), but if it is u64 - then I
am not so sure really, and this made me look around. It could be "__le64
data" too.
> Also note
> that iowriteXX does a cpu_to_leXX, so are we really defining the
> interface as little-endian or are we just trying to make ourselves
> endian neutral and counter that implicit conversion? Thanks,
Defining it LE is fine, I just find it a bit confusing when
vfio_info_cap_header is host endian but vfio_device_ioeventfd is not.
--
Alexey
On Wed, 7 Feb 2018 15:48:26 +1100
Alexey Kardashevskiy <[email protected]> wrote:
> On 07/02/18 15:25, Alex Williamson wrote:
> > On Wed, 7 Feb 2018 15:09:22 +1100
> > Alexey Kardashevskiy <[email protected]> wrote:
> >> On 07/02/18 11:08, Alex Williamson wrote:
> >>> [...]
> >>
> >> Is this the first ioctl with endianness fixed to little-endian? I'd suggest
> >> commenting on that, as things like vfio_info_cap_header do use the host
> >> endianness.
> >
> > Look at our current read and write interface, we call leXX_to_cpu
> > before calling iowriteXX there and I think a user would logically
> > expect to use the same data format here as they would there.
>
> If the data is "char data[8]" (i.e. bytestream), then it can be expected to
> be device/bus endian (i.e. PCI == little endian), but if it is u64 - then I
> am not so sure really, and this made me look around. It could be "__le64
> data" too.
>
> > Also note
> > that iowriteXX does a cpu_to_leXX, so are we really defining the
> > interface as little-endian or are we just trying to make ourselves
> > endian neutral and counter that implicit conversion? Thanks,
>
> Defining it LE is fine, I just find it a bit confusing when
> vfio_info_cap_header is host endian but vfio_device_ioeventfd is not.
But I don't think we are defining the interface as little-endian.
iowriteXX does a cpu_to_leXX byteswap. Therefore in order to maintain
endian neutrality, if the data does a cpu->le swap on the way out, I
need to do a le->cpu swap on the way in, right? Please defend the
assertion that we're creating a little-endian interface. Thanks,
Alex
Hi Alex,
On 07/02/18 01:08, Alex Williamson wrote:
> [...]
> Signed-off-by: Alex Williamson <[email protected]>
fyi it does not apply anymore on master (conflict in
include/uapi/linux/vfio.h on GFX stuff)
> [...]
> + ioeventfd->pos = pos;
> + ioeventfd->bar = bar;
> + ioeventfd->data = data;
> + ioeventfd->count = count;
> +
> + ret = vfio_virqfd_enable(vdev->barmap[ioeventfd->bar] + ioeventfd->pos,
nit: bar and pos could be used directly
> [...]
> +/**
> + * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 14,
> + * struct vfio_device_ioeventfd)
> + *
> + * Perform a write to the device at the specified device fd offset, with
> + * the specified data and width when the provided eventfd is triggered.
Don't you want to document the limitation to BARs only, excluding the
MSI-X table?
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +struct vfio_device_ioeventfd {
> + __u32 argsz;
> + __u32 flags;
> +#define VFIO_DEVICE_IOEVENTFD_8 (1 << 0) /* 1-byte write */
> +#define VFIO_DEVICE_IOEVENTFD_16 (1 << 1) /* 2-byte write */
> +#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */
> +#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */
> +#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf)
kvm_ioeventfd uses a __u32 len instead, but maybe flags are simpler here
for testing sizes?
> + __u64 offset; /* device fd offset of write */
> + __u64 data; /* data to be written */
> + __s32 fd; /* -1 for de-assignment */
> +};
Otherwise it looks good to me.
Thanks
Eric
On Wed, 7 Feb 2018 16:46:19 +0100
Auger Eric <[email protected]> wrote:
> Hi Alex,
>
> On 07/02/18 01:08, Alex Williamson wrote:
> > [...]
> > Signed-off-by: Alex Williamson <[email protected]>
>
> fyi it does not apply anymore on master (conflict in
> include/uapi/linux/vfio.h on GFX stuff)
Sorry, I should have noted that this was against v4.15; I didn't want
the churn of the merge window since I was benchmarking against this.
Will update for the non-RFC.
...
> > [...]
> > + ioeventfd->pos = pos;
> > + ioeventfd->bar = bar;
> > + ioeventfd->data = data;
> > + ioeventfd->count = count;
> > +
> > + ret = vfio_virqfd_enable(vdev->barmap[ioeventfd->bar] + ioeventfd->pos,
> nit: bar and pos could be used directly
Indeed, probably leftover from development. Fixed and re-wrapped the
following lines.
> > [...]
> > +/**
> > + * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 14,
> > + * struct vfio_device_ioeventfd)
> > + *
> > + * Perform a write to the device at the specified device fd offset, with
> > + * the specified data and width when the provided eventfd is triggered.
> Don't you want to document the limitation to BARs only, excluding the
> MSI-X table?
Sure, we could. This is also just an acceleration interface; it's not
required for functionality, so a user can probe the capabilities by
trying to enable an ioeventfd to test for support. I don't really want
to add a flag to each region to identify support or create yet another
sparse map identifying which sections of regions are supported. We
could enable this for PCI config space, but I didn't think it really
made sense (and I was too lazy). Real PCI config space (not MMIO
mirrors of config space on GPUs) should never be a performance path,
therefore I question if there's any ROI for the code to support it.
Device specific regions would need to be handled on a case by case
basis, and I don't think we have any cases yet where it makes sense
(IIRC the IGD regions are read-only). Of course ROMs are read-only, so
it doesn't make sense to support them.
I also note that this patch of course only supports directly assigned
vfio-pci devices, not vfio-platform and not mdev devices. Since the
ioctl is intended to be device agnostic, maybe we could have a default
handler that simply uses the device file write interface. At least one
issue there is that those expect a user buffer. I'll look into how I
might add the support more generically, if perhaps less optimized.
Does it seem like a sufficient user API to try an ioeventfd and be
prepared for it to fail? Given that this is a new ioctl, users need to
do that for backwards compatibility anyway.
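For example (a rough sketch; reg_offset and reg_data are placeholders,
and pwrite() on the device fd is the existing, slower region write path):

  int efd = eventfd(0, EFD_CLOEXEC);
  struct vfio_device_ioeventfd ie = {
          .argsz  = sizeof(ie),
          .flags  = VFIO_DEVICE_IOEVENTFD_32,
          .offset = reg_offset,
          .data   = reg_data,
          .fd     = efd,
  };

  if (efd >= 0 && !ioctl(device_fd, VFIO_DEVICE_IOEVENTFD, &ie)) {
          /* accelerated: hand efd to KVM as an ioeventfd for the GPA */
  } else {
          close(efd);
          /* unsupported here: keep trapping and pwrite() the register */
  }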
Something I forgot to mention in my rush to send this out is that I
debated whether to create a new ioctl or re-use the SET_IRQS ioctl. In
the end, I couldn't resolve how to handle the index, start, and count
fields, so I created this new ioctl. Would we create an ioeventfd
index? We couldn't simply pass an array of ioeventfds since each needs
to be associated with an offset, data, and size, so we'd need a new
data type with a structure encompassing those fields. In the end it
obviously didn't make sense to me to re-use SET_IRQS, but if others can
come up with something that makes sense, I'm open to it. Thanks,
Alex
On 08/02/18 01:12, Alex Williamson wrote:
> On Wed, 7 Feb 2018 15:48:26 +1100
> Alexey Kardashevskiy <[email protected]> wrote:
>
>> On 07/02/18 15:25, Alex Williamson wrote:
>>> On Wed, 7 Feb 2018 15:09:22 +1100
>>> Alexey Kardashevskiy <[email protected]> wrote:
>>>> On 07/02/18 11:08, Alex Williamson wrote:
>>>>> [...]
>>>>
>>>> Is this the first ioctl with endianness fixed to little-endian? I'd suggest
>>>> commenting on that, as things like vfio_info_cap_header do use the host
>>>> endianness.
>>>
>>> Look at our current read and write interface, we call leXX_to_cpu
>>> before calling iowriteXX there and I think a user would logically
>>> expect to use the same data format here as they would there.
>>
>> If the data is "char data[8]" (i.e. bytestream), then it can be expected to
>> be device/bus endian (i.e. PCI == little endian), but if it is u64 - then I
>> am not so sure really, and this made me look around. It could be "__le64
>> data" too.
>>
>>> Also note
>>> that iowriteXX does a cpu_to_leXX, so are we really defining the
>>> interface as little-endian or are we just trying to make ourselves
>>> endian neutral and counter that implicit conversion? Thanks,
>>
>> Defining it LE is fine, I just find it a bit confusing when
>> vfio_info_cap_header is host endian but vfio_device_ioeventfd is not.
>
> But I don't think we are defining the interface as little-endian.
> iowriteXX does a cpu_to_leXX byteswap. Therefore in order to maintain
> endian neutrality, if the data does a cpu->le swap on the way out, I
> need to do a le->cpu swap on the way in, right? Please defend the
> assertion that we're creating a little-endian interface. Thanks,
vfio_pci_ioctl() passes "endian-neutral" ioeventfd.data to
vfio_pci_ioeventfd(), which immediately does the leXX_to_cpu() conversion
(and uses the result later on in iowriteXX(), which is not VFIO API), so I
read it as the ioctl really expecting LE.
The QEMU part - the vfio_nvidia_mirror_quirk MR - does not swap bytes, but
the MR itself is declared DEVICE_LITTLE_ENDIAN, which means
vfio_nvidia_quirk_mirror_write() receives byteswapped @data in the host
endianness == big endian on a big-endian host. So the ioctl() handler will
receive a BE value, do byteswap #1 in leXX_to_cpu(), and then byteswap #2
in iowriteXX(), so in the end a BE value will be written to the device. So
I'd say we do not really need leXX_to_cpu() in vfio_pci_ioeventfd().
Correct me where I am wrong. Thanks,
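
For reference, the two conversion chains under discussion, in simplified
form (a sketch of the call paths, not code from the patch):

  /* existing region write path (do_io_rw), 4-byte case: */
  copy_from_user(&val, buf, 4);
  iowrite32(le32_to_cpu(val), io + off); /* iowrite32() itself does a
                                            cpu_to_le32() */

  /* this patch's ioeventfd path, 4-byte case: */
  val = le32_to_cpu(data);               /* swap #1, a no-op on LE hosts */
  iowrite32(val, io);                    /* swap #2, inside iowrite32() */

Both paths apply leXX_to_cpu() before iowriteXX(), and iowriteXX()
applies cpu_to_leXX() internally; whether that makes the interface
little-endian or merely endian-neutral is the question being debated
here.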
--
Alexey
Hi Alex,
On 07/02/18 17:57, Alex Williamson wrote:
> On Wed, 7 Feb 2018 16:46:19 +0100
> Auger Eric <[email protected]> wrote:
> [...]
> I also note that this patch of course only supports directly assigned
> vfio-pci devices, not vfio-platform and not mdev devices. Since the
> ioctl is intended to be device agnostic, maybe we could have a default
> handler that simply uses the device file write interface. At least one
> issue there is that those expect a user buffer. I'll look into how I
> might add the support more generically, if perhaps less optimized.
OK
>
> Does it seem like a sufficient user API to try an ioeventfd and be
> prepared for it to fail? Given that this is a new ioctl, users need to
> do that for backwards compatibility anyway.
looks OK to me.
>
> Something I forgot to mention in my rush to send this out is that I
> debated whether to create a new ioctl or re-use the SET_IRQS ioctl. In
> the end, I couldn't resolve how to handle the index, start, and count
> fields, so I created this new ioctl. Would we create an ioeventfd
> index? We couldn't simply pass an array of ioeventfds since each needs
> to be associated with an offset, data, and size, so we'd need a new
> data type with a structure encompassing those fields. In the end it
> obviously didn't make sense to me to re-use SET_IRQS, but if others can
> come up with something that makes sense, I'm open to it. Thanks,
as far as I am concerned, I prefer this API rather than the SET_IRQS one ;-)
Thanks
Eric
On Tue, Feb 06, 2018 at 05:08:14PM -0700, Alex Williamson wrote:
[...]
> +long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
> + uint64_t data, int count, int fd)
> [...]
> + list_add(&ioeventfd->next, &vdev->ioeventfds_list);
Is there a limit on how many ioeventfds can be created?
IIUC we'll create this eventfd "automatically" if an MMIO addr/data is
triggered continuously N=10 times, so would it be safer to have a
limit on the maximum number of eventfds? Otherwise, couldn't a
malicious guest consume host memory by sending:
- addr1/data1, 10 times
- addr2/data2, 10 times
- ...
to create unlimited ioeventfds? Thanks,
--
Peter Xu
On Fri, 9 Feb 2018 15:05:11 +0800
Peter Xu <[email protected]> wrote:
> On Tue, Feb 06, 2018 at 05:08:14PM -0700, Alex Williamson wrote:
>
> [...]
>
> > [...]
> > + list_add(&ioeventfd->next, &vdev->ioeventfds_list);
>
> Is there a limit on how many ioeventfds can be created?
>
> IIUC we'll create this eventfd "automatically" if an MMIO addr/data is
> triggered continuously N=10 times, so would it be safer to have a
> limit on the maximum number of eventfds? Otherwise, couldn't a
> malicious guest consume host memory by sending:
>
> - addr1/data1, 10 times
> - addr2/data2, 10 times
> - ...
>
> to create unlimited ioeventfds? Thanks,
Good question. It is somewhat exploitable from within the guest the way
it's written; however, a user process does have an open file limit and
each eventfd consumes a file handle, so unless someone is running QEMU
with unlimited file handles, there is a built-in limit. Two problems
remain though:
First, is it still a bad idea that a user within a guest can target
this device page to consume all of the QEMU process' open file handles,
even if ultimately they're only harming themselves? What would a
reasonable cap on file descriptors for this purpose be? How would we
know which are actively used and which could be expired? Currently
only 2 are registered, the MSI-ACK address and some unknown secondary
one that's low frequency, but enough to trigger the algorithm here (and
doesn't seem harmful to let it get enabled). We could therefore
arbitrarily pick 5 as an upper limit here, maybe with a warning if the
code hits that limit.
Second, is there still an exploit in the proposed vfio interface in that
a user could re-use a single file descriptor for multiple vfio
ioeventfds? I don't know. I thought about looking to see whether a
file descriptor is re-used, but then I wondered if that might actually
be kind of a neat and potentially useful interface, where a single
eventfd could trigger multiple device writes. It looks like KVM
arbitrarily sets a limit of 1000 iobus devices, where each ioeventfd
would be such a device. Perhaps we take the same approach and pick an
arbitrary high upper limit per vfio device. What's your opinion?
Thanks,
Alex
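For reference, the kind of per-device cap floated above could be as
small as the following diff-style sketch (VFIO_PCI_IOEVENTFD_MAX, the
-ENOSPC choice and the ioeventfds_nr counter are hypothetical here, not
part of the posted patch):

 #define VFIO_PCI_IOEVENTFD_MAX	1000	/* arbitrary, like KVM's iobus cap */

 	if (fd < 0) {
 		ret = -ENODEV;
 		goto out_unlock;
 	}

+	/*
+	 * Bound eventfd consumption per device; ioeventfds_nr would be
+	 * incremented on list_add, decremented on removal, and reset in
+	 * vfio_pci_disable().
+	 */
+	if (vdev->ioeventfds_nr >= VFIO_PCI_IOEVENTFD_MAX) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
 	ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL);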
On Fri, Feb 09, 2018 at 02:45:41PM -0700, Alex Williamson wrote:
> On Fri, 9 Feb 2018 15:05:11 +0800
> Peter Xu <[email protected]> wrote:
>
> > On Tue, Feb 06, 2018 at 05:08:14PM -0700, Alex Williamson wrote:
> >
> > [...]
> >
> > > [...]
> > > + list_add(&ioeventfd->next, &vdev->ioeventfds_list);
> >
> > Is there a limit on how many ioeventfds can be created?
> >
> > IIUC we'll create this eventfd "automatically" if an MMIO addr/data is
> > triggered continuously N=10 times, so would it be safer to have a
> > limit on the maximum number of eventfds? Otherwise, couldn't a
> > malicious guest consume host memory by sending:
> >
> > - addr1/data1, 10 times
> > - addr2/data2, 10 times
> > - ...
> >
> > to create unlimited ioeventfds? Thanks,
>
> Good question. It is somewhat exploitable from within the guest the way
> it's written; however, a user process does have an open file limit and
> each eventfd consumes a file handle, so unless someone is running QEMU
> with unlimited file handles, there is a built-in limit. Two problems
> remain though:
>
> First, is it still a bad idea that a user within a guest can target
> this device page to consume all of the QEMU process' open file handles,
> even if ultimately they're only harming themselves? What would a
> reasonable cap on file descriptors for this purpose be? How would we
> know which are actively used and which could be expired? Currently
> only 2 are registered, the MSI-ACK address and some unknown secondary
> one that's low frequency, but enough to trigger the algorithm here (and
> doesn't seem harmful to let it get enabled). We could therefore
> arbitrarily pick 5 as an upper limit here, maybe with a warning if the
> code hits that limit.
>
> Second, is there still an exploit in the proposed vfio interface in that
> a user could re-use a single file descriptor for multiple vfio
> ioeventfds? I don't know. I thought about looking to see whether a
> file descriptor is re-used, but then I wondered if that might actually
> be kind of a neat and potentially useful interface, where a single
> eventfd could trigger multiple device writes.
Do you know of any possible scenario for this? It sounds like
an interesting idea.
> It looks like KVM
> arbitrarily sets a limit of 1000 iobus devices, where each ioeventfd
> would be such a device. Perhaps we take the same approach and pick an
> arbitrary high upper limit per vfio device. What's your opinion?
I did a "ulimit -n" and got 1024 as the default. So even if someone
granted the QEMU process more/unlimited file handles, we'll still
have a hard limit of ~1000 vfio-ioeventfds, which sounds safe.
But I agree that we can still have an even smaller limit for vfio,
especially since we know explicitly how many we may need in most cases.
I'll follow your choice of number because I have no good idea
of my own. If we are going to have a limit, I'm wondering
whether it would be good to have a WARN_ONCE when that limit is reached.
Thanks,
--
Peter Xu
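As a side note on fd re-use: nothing in the proposed uapi prevents it,
so one signal could in principle fan out to several programmed writes,
roughly as below (purely hypothetical usage with the same includes as
the earlier sketch; the offsets are illustrative and the kernel
accepting a re-used fd is an assumption):

  /* Arm one eventfd for two distinct device writes */
  static int arm_fanout(int device, __u64 ack_offset, __u64 db_offset)
  {
  	struct vfio_device_ioeventfd ioeventfd = {
  		.argsz = sizeof(ioeventfd),
  		.flags = VFIO_DEVICE_IOEVENTFD_32,
  	};
  	int fd = eventfd(0, EFD_CLOEXEC);

  	if (fd < 0)
  		return -1;

  	ioeventfd.fd = fd;

  	ioeventfd.offset = ack_offset;
  	ioeventfd.data = 1;
  	if (ioctl(device, VFIO_DEVICE_IOEVENTFD, &ioeventfd))
  		goto err;

  	ioeventfd.offset = db_offset;
  	ioeventfd.data = 2;
  	if (ioctl(device, VFIO_DEVICE_IOEVENTFD, &ioeventfd))
  		goto err;

  	return fd;	/* eventfd_write(fd, 1) would fire both writes */
  err:
  	close(fd);
  	return -1;
  }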
Hi,
On 08/02/18 02:22, Alexey Kardashevskiy wrote:
> On 08/02/18 01:12, Alex Williamson wrote:
>> On Wed, 7 Feb 2018 15:48:26 +1100
>> Alexey Kardashevskiy <[email protected]> wrote:
>>
>>> On 07/02/18 15:25, Alex Williamson wrote:
>>>> On Wed, 7 Feb 2018 15:09:22 +1100
>>>> Alexey Kardashevskiy <[email protected]> wrote:
>>>>> On 07/02/18 11:08, Alex Williamson wrote:
>>>>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>>>>> index e3301dbd27d4..07966a5f0832 100644
>>>>>> --- a/include/uapi/linux/vfio.h
>>>>>> +++ b/include/uapi/linux/vfio.h
>>>>>> @@ -503,6 +503,30 @@ struct vfio_pci_hot_reset {
>>>>>>
>>>>>> #define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13)
>>>>>>
>>>>>> +/**
>>>>>> + * VFIO_DEVICE_IOEVENTFD - _IOW(VFIO_TYPE, VFIO_BASE + 14,
>>>>>> + * struct vfio_device_ioeventfd)
>>>>>> + *
>>>>>> + * Perform a write to the device at the specified device fd offset, with
>>>>>> + * the specified data and width when the provided eventfd is triggered.
>>>>>> + *
>>>>>> + * Return: 0 on success, -errno on failure.
>>>>>> + */
>>>>>> +struct vfio_device_ioeventfd {
>>>>>> + __u32 argsz;
>>>>>> + __u32 flags;
>>>>>> +#define VFIO_DEVICE_IOEVENTFD_8 (1 << 0) /* 1-byte write */
>>>>>> +#define VFIO_DEVICE_IOEVENTFD_16 (1 << 1) /* 2-byte write */
>>>>>> +#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */
>>>>>> +#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */
>>>>>> +#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf)
>>>>>> + __u64 offset; /* device fd offset of write */
>>>>>> + __u64 data; /* data to be written */
>>>>>> + __s32 fd; /* -1 for de-assignment */
>>>>>> +};
>>>>>> +
>>>>>> +#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 14)
>>>>>
>>>>>
>>>>> Is this the first ioctl with endianness fixed to little-endian? I'd
>>>>> suggest commenting on that, as things like vfio_info_cap_header use the
>>>>> host endianness.
>>>>
>>>> Look at our current read and write interface: we call leXX_to_cpu
>>>> before calling iowriteXX there and I think a user would logically
>>>> expect to use the same data format here as they would there.
>>>
>>> If the data is "char data[8]" (i.e. bytestream), then it can be expected to
>>> be device/bus endian (i.e. PCI == little endian), but if it is u64 - then I
>>> am not so sure really, and this made me look around. It could be "__le64
>>> data" too.
>>>
>>>> Also note
>>>> that iowriteXX does a cpu_to_leXX, so are we really defining the
>>>> interface as little-endian or are we just trying to make ourselves
>>>> endian neutral and counter that implicit conversion? Thanks,
>>>
>>> Defining it LE is fine; I just find it a bit confusing when
>>> vfio_info_cap_header is host endian but vfio_device_ioeventfd is not.
>>
>> But I don't think we are defining the interface as little-endian.
>> iowriteXX does a cpu_to_leXX byteswap. Therefore in order to maintain
>> endian neutrality, if the data does a cpu->le swap on the way out, I
>> need to do a le->cpu swap on the way in, right? Please defend the
>> assertion that we're creating a little-endian interface. Thanks,
>
>
> vfio_pci_ioctl() passes "endian-neutral" ioeventfd.data to
> vfio_pci_ioeventfd() which immediately does the leXX_to_cpu() conversion
> (and uses the result later on in iowriteXX(), which is not VFIO API) so I
> read it as the ioctl really expects LE.
>
> The QEMU part - vfio_nvidia_mirror_quirk MR - does not swap bytes but the
> MR itself is declared DEVICE_LITTLE_ENDIAN, which means
> vfio_nvidia_quirk_mirror_write() receives byteswapped @data in the host
> endian == big endian on a big endian host. So the ioctl() handler will
> receive a BE value, do byteswap #1 in leXX_to_cpu(), and then do byteswap
> #2 in iowriteXX(), so in the end a BE value will be written to the device.
> So I'd say we rather do not need leXX_to_cpu() in vfio_pci_ioeventfd().
> Correct me where I am wrong. Thanks,
It is not crystal clear to me what the outcome of this discussion is.
Can you please clarify?
At the beginning I understood we had a chain of leXX_to_cpu and
cpu_to_leXX (in iowriteXX), so it was neutral. Now I am lost about what
we want.
Thanks
Eric
On Tue, 13 Mar 2018 13:38:00 +0100
Auger Eric <[email protected]> wrote:
> [...]
>
> It is not crystal clear to me what the outcome of this discussion is.
> Can you please clarify?
>
> At the beginning I understood we had a chain of leXX_to_cpu and
> cpu_to_leXX (in iowriteXX), so it was neutral. Now I am lost about what
> we want.
I've tried to address this with patch 2/3 in the newer series, adding
helpers such that the implicit endianness of the io{read,write}
functions is hidden and no extraneous swapping is done. Therefore the
leXX_to_cpu() is gone, as Alexey wanted. Unless there's a new
objection, this is what I intend to go with. Thanks,
Alex
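To sketch the direction described (helper and field names hypothetical;
the point is that the user-supplied data is stored untouched and the
only byteswap is the one implicit in iowrite32()):

  /* Endian-neutral helper hiding iowrite32()'s implicit cpu_to_le32() */
  static void vfio_pci_iowrite32(u32 val, void __iomem *io)
  {
  	iowrite32(val, io);
  }

  static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
  {
  	struct vfio_pci_ioeventfd *ioeventfd = opaque;

  	/*
  	 * ioeventfd->data is exactly what userspace passed in; there is
  	 * no leXX_to_cpu() anywhere on this path.  The addr field is a
  	 * stand-in; the posted patch instead passes the mapped address
  	 * as the virqfd opaque.
  	 */
  	vfio_pci_iowrite32(ioeventfd->data, ioeventfd->addr);
  	return 0;
  }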