This RFC is to demonstrate below ideas,
a) Build vhost-mdev on top of the same abstraction defined in
the virtio-mdev series [1];
b) Introduce /dev/vhost-mdev to do vhost ioctls and support
setting mdev device as backend;
Now the userspace API looks like this:
- Userspace generates a compatible mdev device;
- Userspace opens this mdev device with VFIO API (including
doing IOMMU programming for this mdev device with VFIO's
container/group based interface);
- Userspace opens /dev/vhost-mdev and gets vhost fd;
- Userspace uses vhost ioctls to setup vhost (userspace should
do VHOST_MDEV_SET_BACKEND ioctl with VFIO group fd and device
fd first before doing other vhost ioctls);
Only compile test has been done for this series for now.
RFCv3: https://patchwork.kernel.org/patch/11117785/
[1] https://lkml.org/lkml/2019/9/10/135
Tiwei Bie (3):
vfio: support getting vfio device from device fd
vfio: support checking vfio driver by device ops
vhost: introduce mdev based hardware backend
drivers/vfio/mdev/vfio_mdev.c | 3 +-
drivers/vfio/vfio.c | 32 +++
drivers/vhost/Kconfig | 9 +
drivers/vhost/Makefile | 3 +
drivers/vhost/mdev.c | 462 +++++++++++++++++++++++++++++++
drivers/vhost/vhost.c | 39 ++-
drivers/vhost/vhost.h | 6 +
include/linux/vfio.h | 11 +
include/uapi/linux/vhost.h | 10 +
include/uapi/linux/vhost_types.h | 5 +
10 files changed, 573 insertions(+), 7 deletions(-)
create mode 100644 drivers/vhost/mdev.c
--
2.17.1
This patch introduces the support for getting VFIO device
from VFIO device fd. With this support, it's possible for
vhost to get VFIO device from the group fd and device fd
set by the userspace.
Signed-off-by: Tiwei Bie <[email protected]>
---
drivers/vfio/vfio.c | 25 +++++++++++++++++++++++++
include/linux/vfio.h | 4 ++++
2 files changed, 29 insertions(+)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 388597930b64..697fd079bb3f 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -890,6 +890,31 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
return device;
}
+struct vfio_device *vfio_device_get_from_fd(struct vfio_group *group,
+ int device_fd)
+{
+ struct fd f;
+ struct vfio_device *it, *device = ERR_PTR(-ENODEV);
+
+ f = fdget(device_fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ mutex_lock(&group->device_lock);
+ list_for_each_entry(it, &group->device_list, group_next) {
+ if (it == f.file->private_data) {
+ device = it;
+ vfio_device_get(device);
+ break;
+ }
+ }
+ mutex_unlock(&group->device_lock);
+
+ fdput(f);
+ return device;
+}
+EXPORT_SYMBOL_GPL(vfio_device_get_from_fd);
+
/*
* Caller must hold a reference to the vfio_device
*/
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index e42a711a2800..e75b24fd7c5c 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -15,6 +15,8 @@
#include <linux/poll.h>
#include <uapi/linux/vfio.h>
+struct vfio_group;
+
/**
* struct vfio_device_ops - VFIO bus driver device callbacks
*
@@ -50,6 +52,8 @@ extern int vfio_add_group_dev(struct device *dev,
extern void *vfio_del_group_dev(struct device *dev);
extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
+extern struct vfio_device *vfio_device_get_from_fd(struct vfio_group *group,
+ int device_fd);
extern void vfio_device_put(struct vfio_device *device);
extern void *vfio_device_data(struct vfio_device *device);
--
2.17.1
More details about this patch can be found from the cover
letter for now. Only compile test has been done for now.
Signed-off-by: Tiwei Bie <[email protected]>
---
drivers/vhost/Kconfig | 9 +
drivers/vhost/Makefile | 3 +
drivers/vhost/mdev.c | 462 +++++++++++++++++++++++++++++++
drivers/vhost/vhost.c | 39 ++-
drivers/vhost/vhost.h | 6 +
include/uapi/linux/vhost.h | 10 +
include/uapi/linux/vhost_types.h | 5 +
7 files changed, 528 insertions(+), 6 deletions(-)
create mode 100644 drivers/vhost/mdev.c
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 3d03ccbd1adc..ef9783156d2e 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -34,6 +34,15 @@ config VHOST_VSOCK
To compile this driver as a module, choose M here: the module will be called
vhost_vsock.
+config VHOST_MDEV
+ tristate "Mediated device based hardware vhost accelerator"
+ depends on EVENTFD && VFIO && VFIO_MDEV
+ select VHOST
+ default n
+ ---help---
+ Say Y here to enable the vhost_mdev module
+ for use with hardware vhost accelerators
+
config VHOST
tristate
---help---
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 6c6df24f770c..ad9c0f8c6d8c 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -10,4 +10,7 @@ vhost_vsock-y := vsock.o
obj-$(CONFIG_VHOST_RING) += vringh.o
+obj-$(CONFIG_VHOST_MDEV) += vhost_mdev.o
+vhost_mdev-y := mdev.o
+
obj-$(CONFIG_VHOST) += vhost.o
diff --git a/drivers/vhost/mdev.c b/drivers/vhost/mdev.c
new file mode 100644
index 000000000000..8c6597aff45e
--- /dev/null
+++ b/drivers/vhost/mdev.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018-2019 Intel Corporation.
+ */
+
+#include <linux/compat.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/mdev.h>
+#include <linux/module.h>
+#include <linux/vfio.h>
+#include <linux/vhost.h>
+#include <linux/virtio_mdev.h>
+
+#include "vhost.h"
+
+struct vhost_mdev {
+ struct mutex mutex;
+ struct vhost_dev dev;
+ struct vhost_virtqueue *vqs;
+ int nvqs;
+ u64 state;
+ u64 features;
+ u64 acked_features;
+ struct vfio_group *vfio_group;
+ struct vfio_device *vfio_device;
+ struct mdev_device *mdev;
+};
+
+/*
+ * XXX
+ * We assume virtio_mdev.ko exposes below symbols for now, as we
+ * don't have a proper way to access parent ops directly yet.
+ *
+ * virtio_mdev_readl()
+ * virtio_mdev_writel()
+ */
+extern u32 virtio_mdev_readl(struct mdev_device *mdev, loff_t off);
+extern void virtio_mdev_writel(struct mdev_device *mdev, loff_t off, u32 val);
+
+static u8 mdev_get_status(struct mdev_device *mdev)
+{
+ return virtio_mdev_readl(mdev, VIRTIO_MDEV_STATUS);
+}
+
+static void mdev_set_status(struct mdev_device *mdev, u8 status)
+{
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_STATUS, status);
+}
+
+static void mdev_add_status(struct mdev_device *mdev, u8 status)
+{
+ status |= mdev_get_status(mdev);
+ mdev_set_status(mdev, status);
+}
+
+static void mdev_reset(struct mdev_device *mdev)
+{
+ mdev_set_status(mdev, 0);
+}
+
+static void handle_vq_kick(struct vhost_work *work)
+{
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_mdev *m = container_of(vq->dev, struct vhost_mdev, dev);
+
+ virtio_mdev_writel(m->mdev, VIRTIO_MDEV_QUEUE_NOTIFY, vq - m->vqs);
+}
+
+static long vhost_mdev_start_backend(struct vhost_mdev *m)
+{
+ struct mdev_device *mdev = m->mdev;
+ u64 features = m->acked_features;
+ u64 addr;
+ struct vhost_virtqueue *vq;
+ int queue_id;
+
+ features |= 1ULL << VIRTIO_F_IOMMU_PLATFORM;
+
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_DRIVER_FEATURES_SEL, 1);
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_DRIVER_FEATURES,
+ (u32)(features >> 32));
+
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_DRIVER_FEATURES_SEL, 0);
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_DRIVER_FEATURES,
+ (u32)features);
+
+ mdev_add_status(mdev, VIRTIO_CONFIG_S_FEATURES_OK);
+ if (!(mdev_get_status(mdev) & VIRTIO_CONFIG_S_FEATURES_OK))
+ return -ENODEV;
+
+ for (queue_id = 0; queue_id < m->nvqs; queue_id++) {
+ vq = &m->vqs[queue_id];
+
+ if (!vq->desc || !vq->avail || !vq->used)
+ break;
+
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_NUM, vq->num);
+
+ if (!vhost_translate_ring_addr(vq, (u64)vq->desc,
+ vhost_get_desc_size(vq, vq->num),
+ &addr))
+ return -EINVAL;
+
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_DESC_LOW, addr);
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_DESC_HIGH,
+ (addr >> 32));
+
+ if (!vhost_translate_ring_addr(vq, (u64)vq->avail,
+ vhost_get_avail_size(vq, vq->num),
+ &addr))
+ return -EINVAL;
+
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_AVAIL_LOW, addr);
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_AVAIL_HIGH,
+ (addr >> 32));
+
+ if (!vhost_translate_ring_addr(vq, (u64)vq->used,
+ vhost_get_used_size(vq, vq->num),
+ &addr))
+ return -EINVAL;
+
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_USED_LOW, addr);
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_USED_HIGH,
+ (addr >> 32));
+
+ // XXX: we need to support set_vring_base
+
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_READY, 1);
+ }
+
+ // XXX: we need to setup interrupt as well
+
+ mdev_add_status(mdev, VIRTIO_CONFIG_S_DRIVER_OK);
+ return 0;
+}
+
+static long vhost_mdev_stop_backend(struct vhost_mdev *m)
+{
+ struct mdev_device *mdev = m->mdev;
+
+ mdev_reset(mdev);
+ mdev_add_status(mdev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+ mdev_add_status(mdev, VIRTIO_CONFIG_S_DRIVER);
+ return 0;
+}
+
+static long vhost_set_state(struct vhost_mdev *m, u64 __user *statep)
+{
+ u64 state;
+ long r;
+
+ if (copy_from_user(&state, statep, sizeof(state)))
+ return -EFAULT;
+
+ if (state >= VHOST_MDEV_S_MAX)
+ return -EINVAL;
+
+ if (m->state == state)
+ return 0;
+
+ m->state = state;
+
+ switch (m->state) {
+ case VHOST_MDEV_S_RUNNING:
+ r = vhost_mdev_start_backend(m);
+ break;
+ case VHOST_MDEV_S_STOPPED:
+ r = vhost_mdev_stop_backend(m);
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+
+ return r;
+}
+
+static long vhost_get_features(struct vhost_mdev *m, u64 __user *featurep)
+{
+ if (copy_to_user(featurep, &m->features, sizeof(m->features)))
+ return -EFAULT;
+ return 0;
+}
+
+static long vhost_set_features(struct vhost_mdev *m, u64 __user *featurep)
+{
+ u64 features;
+
+ if (copy_from_user(&features, featurep, sizeof(features)))
+ return -EFAULT;
+
+ if (features & ~m->features)
+ return -EINVAL;
+
+ m->acked_features = features;
+
+ return 0;
+}
+
+static long vhost_get_vring_base(struct vhost_mdev *m, void __user *argp)
+{
+ struct vhost_virtqueue *vq;
+ u32 idx;
+ long r;
+
+ r = get_user(idx, (u32 __user *)argp);
+ if (r < 0)
+ return r;
+ if (idx >= m->nvqs)
+ return -ENOBUFS;
+
+ vq = &m->vqs[idx];
+
+ // XXX: we need to support get_vring_base
+ //vq->last_avail_idx = virtio_mdev_readl(b->mdev, ...);
+
+ return vhost_vring_ioctl(&m->dev, VHOST_GET_VRING_BASE, argp);
+}
+
+static void vhost_mdev_release_backend(struct vhost_mdev *m)
+{
+ if (!m->mdev)
+ return;
+
+ if (m->state != VHOST_MDEV_S_STOPPED) {
+ m->state = VHOST_MDEV_S_STOPPED;
+ vhost_mdev_stop_backend(m);
+ }
+
+ vhost_dev_stop(&m->dev);
+ vhost_dev_cleanup(&m->dev);
+
+ kfree(m->dev.vqs);
+ kfree(m->vqs);
+
+ vfio_device_put(m->vfio_device);
+ vfio_group_put_external_user(m->vfio_group);
+
+ m->mdev = NULL;
+}
+
+static long vhost_mdev_set_backend(struct vhost_mdev *m,
+ struct vhost_mdev_backend __user *argp)
+{
+ struct vhost_mdev_backend backend;
+ struct mdev_device *mdev;
+ struct vhost_dev *dev;
+ struct vhost_virtqueue **vqs;
+ struct file *file;
+ struct vfio_device *device;
+ struct vfio_group *group;
+ unsigned long magic;
+ u64 features;
+ int i, nvqs;
+ long r;
+
+ vhost_mdev_release_backend(m);
+
+ if (copy_from_user(&backend, argp, sizeof(backend))) {
+ r = -EFAULT;
+ goto err;
+ }
+
+ file = fget(backend.group_fd);
+ if (!file) {
+ r = -EBADF;
+ goto err;
+ }
+
+ group = vfio_group_get_external_user(file);
+ fput(file);
+ if (IS_ERR(group)) {
+ r = PTR_ERR(group);
+ goto err;
+ }
+
+ device = vfio_device_get_from_fd(group, backend.device_fd);
+ if (!IS_ERR(device)) {
+ r = PTR_ERR(device);
+ goto err_put_group;
+ }
+
+ if (!vfio_device_ops_match(device, &vfio_mdev_dev_ops)) {
+ r = -EINVAL;
+ goto err_put_device;
+ }
+
+ mdev = vfio_device_data(m->vfio_device);
+
+ magic = virtio_mdev_readl(mdev, VIRTIO_MDEV_MAGIC_VALUE);
+ if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) {
+ r = -ENODEV;
+ goto err_put_device;
+ }
+
+ mdev_reset(mdev);
+ mdev_add_status(mdev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
+ mdev_add_status(mdev, VIRTIO_CONFIG_S_DRIVER);
+
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_DEVICE_FEATURES_SEL, 1);
+ features = virtio_mdev_readl(mdev, VIRTIO_MDEV_DEVICE_FEATURES);
+ features <<= 32;
+
+ virtio_mdev_writel(mdev, VIRTIO_MDEV_DEVICE_FEATURES_SEL, 0);
+ features |= virtio_mdev_readl(mdev, VIRTIO_MDEV_DEVICE_FEATURES);
+
+ if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
+ r = -EINVAL;
+ goto err_put_device;
+ }
+
+ m->features = features;
+
+ nvqs = virtio_mdev_readl(mdev, VIRTIO_MDEV_QUEUE_NUM_MAX);
+ m->nvqs = nvqs;
+
+ m->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue),
+ GFP_KERNEL);
+ if (!m->vqs) {
+ r = -ENOMEM;
+ goto err_put_device;
+ }
+
+ vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
+ if (!vqs) {
+ r = -ENOMEM;
+ goto err_free_vqs;
+ }
+
+ dev = &m->dev;
+ for (i = 0; i < nvqs; i++) {
+ vqs[i] = &m->vqs[i];
+ vqs[i]->handle_kick = handle_vq_kick;
+ }
+ vhost_dev_init(dev, vqs, nvqs, 0, 0, 0);
+
+ m->vfio_group = group;
+ m->vfio_device = device;
+ m->mdev = mdev;
+
+ return 0;
+
+err_free_vqs:
+ kfree(m->vqs);
+err_put_device:
+ vfio_device_put(device);
+err_put_group:
+ vfio_group_put_external_user(group);
+err:
+ return r;
+}
+
+static int vhost_mdev_open(struct inode *inode, struct file *f)
+{
+ struct vhost_mdev *m;
+
+ m = kzalloc(sizeof(*m), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!m)
+ return -ENOMEM;
+
+ mutex_init(&m->mutex);
+ f->private_data = m;
+
+ return 0;
+}
+
+static int vhost_mdev_release(struct inode *inode, struct file *f)
+{
+ struct vhost_mdev *m = f->private_data;
+
+ vhost_mdev_release_backend(m);
+ mutex_destroy(&m->mutex);
+ kfree(m);
+
+ return 0;
+}
+
+static long vhost_mdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ struct vhost_mdev *m = f->private_data;
+ long r;
+
+ mutex_lock(&m->mutex);
+
+ if (cmd == VHOST_MDEV_SET_BACKEND) {
+ r = vhost_mdev_set_backend(m, argp);
+ goto done;
+ }
+
+ if (!m->mdev) {
+ r = -EINVAL;
+ goto done;
+ }
+
+ switch (cmd) {
+ case VHOST_MDEV_SET_STATE:
+ r = vhost_set_state(m, argp);
+ break;
+ case VHOST_GET_FEATURES:
+ r = vhost_get_features(m, argp);
+ break;
+ case VHOST_SET_FEATURES:
+ r = vhost_set_features(m, argp);
+ break;
+ case VHOST_GET_VRING_BASE:
+ r = vhost_get_vring_base(m, argp);
+ break;
+ default:
+ r = vhost_dev_ioctl(&m->dev, cmd, argp);
+ if (r == -ENOIOCTLCMD)
+ r = vhost_vring_ioctl(&m->dev, cmd, argp);
+ }
+
+done:
+ mutex_lock(&m->mutex);
+ return r;
+}
+
+#ifdef CONFIG_COMPAT
+static long vhost_mdev_compat_ioctl(struct file *f, unsigned int ioctl,
+ unsigned long arg)
+{
+ return vhost_mdev_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations vhost_mdev_fops = {
+ .owner = THIS_MODULE,
+ .release = vhost_mdev_release,
+ .unlocked_ioctl = vhost_mdev_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = vhost_mdev_compat_ioctl,
+#endif
+ .open = vhost_mdev_open,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice vhost_mdev_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "vhost-mdev",
+ .fops = &vhost_mdev_fops,
+};
+
+static int __init vhost_mdev_init(void)
+{
+ return misc_register(&vhost_mdev_misc);
+}
+module_init(vhost_mdev_init);
+
+static void __exit vhost_mdev_exit(void)
+{
+ misc_deregister(&vhost_mdev_misc);
+}
+module_exit(vhost_mdev_exit);
+
+MODULE_VERSION("0.0.0");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Hardware vhost accelerator abstraction");
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5dc174ac8cac..0f7236a17a56 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -426,8 +426,7 @@ bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
}
EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
-static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
- unsigned int num)
+size_t vhost_get_avail_size(struct vhost_virtqueue *vq, unsigned int num)
{
size_t event __maybe_unused =
vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
@@ -435,9 +434,9 @@ static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
return sizeof(*vq->avail) +
sizeof(*vq->avail->ring) * num + event;
}
+EXPORT_SYMBOL_GPL(vhost_get_avail_size);
-static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
- unsigned int num)
+size_t vhost_get_used_size(struct vhost_virtqueue *vq, unsigned int num)
{
size_t event __maybe_unused =
vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
@@ -445,12 +444,13 @@ static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
return sizeof(*vq->used) +
sizeof(*vq->used->ring) * num + event;
}
+EXPORT_SYMBOL_GPL(vhost_get_used_size);
-static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
- unsigned int num)
+size_t vhost_get_desc_size(struct vhost_virtqueue *vq, unsigned int num)
{
return sizeof(*vq->desc) * num;
}
+EXPORT_SYMBOL_GPL(vhost_get_desc_size);
void vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue **vqs, int nvqs,
@@ -2617,6 +2617,33 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
}
EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
+bool vhost_translate_ring_addr(struct vhost_virtqueue *vq, u64 ring_addr,
+ u64 len, u64 *addr)
+{
+ struct vhost_umem *umem = vq->umem;
+ struct vhost_umem_node *u;
+
+ if (vhost_overflow(ring_addr, len))
+ return false;
+
+ if (vq->iotlb) {
+ /* Ring address is already IOVA */
+ *addr = ring_addr;
+ return true;
+ }
+
+ /* Ring address is host virtual address. */
+ list_for_each_entry(u, &umem->umem_list, link) {
+ if (u->userspace_addr <= ring_addr &&
+ u->userspace_addr + u->size >= ring_addr + len) {
+ *addr = ring_addr - u->userspace_addr + u->start;
+ return true;
+ }
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(vhost_translate_ring_addr);
static int __init vhost_init(void)
{
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index e9ed2722b633..294a6bcb6adf 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -189,6 +189,12 @@ long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);
+bool vhost_translate_ring_addr(struct vhost_virtqueue *vq, u64 ring_addr,
+ u64 len, u64 *addr);
+
+size_t vhost_get_avail_size(struct vhost_virtqueue *vq, unsigned int num);
+size_t vhost_get_used_size(struct vhost_virtqueue *vq, unsigned int num);
+size_t vhost_get_desc_size(struct vhost_virtqueue *vq, unsigned int num);
int vhost_get_vq_desc(struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 40d028eed645..7213aedc8506 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -116,4 +116,14 @@
#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64)
#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int)
+/* VHOST_MDEV specific defines */
+
+#define VHOST_MDEV_SET_BACKEND _IOW(VHOST_VIRTIO, 0x70, \
+ struct vhost_mdev_backend)
+#define VHOST_MDEV_SET_STATE _IOW(VHOST_VIRTIO, 0x71, __u64)
+
+#define VHOST_MDEV_S_STOPPED 0
+#define VHOST_MDEV_S_RUNNING 1
+#define VHOST_MDEV_S_MAX 2
+
#endif
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index c907290ff065..f06f0dbb7e51 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -119,6 +119,11 @@ struct vhost_scsi_target {
unsigned short reserved;
};
+struct vhost_mdev_backend {
+ int group_fd;
+ int device_fd;
+};
+
/* Feature bits */
/* Log all write descriptors. Can be changed while device is active. */
#define VHOST_F_LOG_ALL 26
--
2.17.1
On 2019/9/17 上午9:02, Tiwei Bie wrote:
> More details about this patch can be found from the cover
> letter for now. Only compile test has been done for now.
>
> Signed-off-by: Tiwei Bie <[email protected]>
> ---
> drivers/vhost/Kconfig | 9 +
> drivers/vhost/Makefile | 3 +
> drivers/vhost/mdev.c | 462 +++++++++++++++++++++++++++++++
> drivers/vhost/vhost.c | 39 ++-
> drivers/vhost/vhost.h | 6 +
> include/uapi/linux/vhost.h | 10 +
> include/uapi/linux/vhost_types.h | 5 +
> 7 files changed, 528 insertions(+), 6 deletions(-)
> create mode 100644 drivers/vhost/mdev.c
>
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index 3d03ccbd1adc..ef9783156d2e 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -34,6 +34,15 @@ config VHOST_VSOCK
> To compile this driver as a module, choose M here: the module will be called
> vhost_vsock.
>
> +config VHOST_MDEV
> + tristate "Mediated device based hardware vhost accelerator"
> + depends on EVENTFD && VFIO && VFIO_MDEV
> + select VHOST
> + default n
> + ---help---
> + Say Y here to enable the vhost_mdev module
> + for use with hardware vhost accelerators
> +
> config VHOST
> tristate
> ---help---
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index 6c6df24f770c..ad9c0f8c6d8c 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -10,4 +10,7 @@ vhost_vsock-y := vsock.o
>
> obj-$(CONFIG_VHOST_RING) += vringh.o
>
> +obj-$(CONFIG_VHOST_MDEV) += vhost_mdev.o
> +vhost_mdev-y := mdev.o
> +
> obj-$(CONFIG_VHOST) += vhost.o
> diff --git a/drivers/vhost/mdev.c b/drivers/vhost/mdev.c
> new file mode 100644
> index 000000000000..8c6597aff45e
> --- /dev/null
> +++ b/drivers/vhost/mdev.c
> @@ -0,0 +1,462 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018-2019 Intel Corporation.
> + */
> +
> +#include <linux/compat.h>
> +#include <linux/kernel.h>
> +#include <linux/miscdevice.h>
> +#include <linux/mdev.h>
> +#include <linux/module.h>
> +#include <linux/vfio.h>
> +#include <linux/vhost.h>
> +#include <linux/virtio_mdev.h>
> +
> +#include "vhost.h"
> +
> +struct vhost_mdev {
> + struct mutex mutex;
> + struct vhost_dev dev;
> + struct vhost_virtqueue *vqs;
> + int nvqs;
> + u64 state;
> + u64 features;
> + u64 acked_features;
> + struct vfio_group *vfio_group;
> + struct vfio_device *vfio_device;
> + struct mdev_device *mdev;
> +};
> +
> +/*
> + * XXX
> + * We assume virtio_mdev.ko exposes below symbols for now, as we
> + * don't have a proper way to access parent ops directly yet.
> + *
> + * virtio_mdev_readl()
> + * virtio_mdev_writel()
> + */
> +extern u32 virtio_mdev_readl(struct mdev_device *mdev, loff_t off);
> +extern void virtio_mdev_writel(struct mdev_device *mdev, loff_t off, u32 val);
Need to consider a better approach, I feel we should do it through some
kind of mdev driver instead of talk to mdev device directly.
> +
> +static u8 mdev_get_status(struct mdev_device *mdev)
> +{
> + return virtio_mdev_readl(mdev, VIRTIO_MDEV_STATUS);
> +}
> +
> +static void mdev_set_status(struct mdev_device *mdev, u8 status)
> +{
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_STATUS, status);
> +}
> +
> +static void mdev_add_status(struct mdev_device *mdev, u8 status)
> +{
> + status |= mdev_get_status(mdev);
> + mdev_set_status(mdev, status);
> +}
> +
> +static void mdev_reset(struct mdev_device *mdev)
> +{
> + mdev_set_status(mdev, 0);
> +}
> +
> +static void handle_vq_kick(struct vhost_work *work)
> +{
> + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
> + poll.work);
> + struct vhost_mdev *m = container_of(vq->dev, struct vhost_mdev, dev);
> +
> + virtio_mdev_writel(m->mdev, VIRTIO_MDEV_QUEUE_NOTIFY, vq - m->vqs);
> +}
> +
> +static long vhost_mdev_start_backend(struct vhost_mdev *m)
> +{
> + struct mdev_device *mdev = m->mdev;
> + u64 features = m->acked_features;
> + u64 addr;
> + struct vhost_virtqueue *vq;
> + int queue_id;
> +
> + features |= 1ULL << VIRTIO_F_IOMMU_PLATFORM;
Is it better to get this feature from backend?
> +
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_DRIVER_FEATURES_SEL, 1);
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_DRIVER_FEATURES,
> + (u32)(features >> 32));
> +
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_DRIVER_FEATURES_SEL, 0);
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_DRIVER_FEATURES,
> + (u32)features);
> +
> + mdev_add_status(mdev, VIRTIO_CONFIG_S_FEATURES_OK);
> + if (!(mdev_get_status(mdev) & VIRTIO_CONFIG_S_FEATURES_OK))
> + return -ENODEV;
> +
> + for (queue_id = 0; queue_id < m->nvqs; queue_id++) {
> + vq = &m->vqs[queue_id];
> +
> + if (!vq->desc || !vq->avail || !vq->used)
> + break;
> +
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_NUM, vq->num);
> +
> + if (!vhost_translate_ring_addr(vq, (u64)vq->desc,
> + vhost_get_desc_size(vq, vq->num),
> + &addr))
> + return -EINVAL;
Interesting, any reason for doing such kinds of translation to HVA? I
believe the add should already an IOVA that has been map by VFIO.
> +
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_DESC_LOW, addr);
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_DESC_HIGH,
> + (addr >> 32));
> +
> + if (!vhost_translate_ring_addr(vq, (u64)vq->avail,
> + vhost_get_avail_size(vq, vq->num),
> + &addr))
> + return -EINVAL;
> +
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_AVAIL_LOW, addr);
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_AVAIL_HIGH,
> + (addr >> 32));
> +
> + if (!vhost_translate_ring_addr(vq, (u64)vq->used,
> + vhost_get_used_size(vq, vq->num),
> + &addr))
> + return -EINVAL;
> +
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_USED_LOW, addr);
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_USED_HIGH,
> + (addr >> 32));
> +
> + // XXX: we need to support set_vring_base
I'm working on V2 that support this API.
> +
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_READY, 1);
> + }
> +
> + // XXX: we need to setup interrupt as well
V2 will have a better interrupt API like some kind of callback. It could
be as simple as a wrapper of eventfd_signal().
> +
> + mdev_add_status(mdev, VIRTIO_CONFIG_S_DRIVER_OK);
> + return 0;
> +}
> +
> +static long vhost_mdev_stop_backend(struct vhost_mdev *m)
> +{
> + struct mdev_device *mdev = m->mdev;
> +
> + mdev_reset(mdev);
> + mdev_add_status(mdev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> + mdev_add_status(mdev, VIRTIO_CONFIG_S_DRIVER);
> + return 0;
> +}
> +
> +static long vhost_set_state(struct vhost_mdev *m, u64 __user *statep)
> +{
> + u64 state;
> + long r;
> +
> + if (copy_from_user(&state, statep, sizeof(state)))
> + return -EFAULT;
> +
> + if (state >= VHOST_MDEV_S_MAX)
> + return -EINVAL;
> +
> + if (m->state == state)
> + return 0;
> +
> + m->state = state;
> +
> + switch (m->state) {
> + case VHOST_MDEV_S_RUNNING:
> + r = vhost_mdev_start_backend(m);
> + break;
> + case VHOST_MDEV_S_STOPPED:
> + r = vhost_mdev_stop_backend(m);
> + break;
> + default:
> + r = -EINVAL;
> + break;
> + }
> +
> + return r;
> +}
> +
> +static long vhost_get_features(struct vhost_mdev *m, u64 __user *featurep)
> +{
> + if (copy_to_user(featurep, &m->features, sizeof(m->features)))
> + return -EFAULT;
> + return 0;
> +}
> +
> +static long vhost_set_features(struct vhost_mdev *m, u64 __user *featurep)
> +{
> + u64 features;
> +
> + if (copy_from_user(&features, featurep, sizeof(features)))
> + return -EFAULT;
> +
> + if (features & ~m->features)
> + return -EINVAL;
> +
> + m->acked_features = features;
> +
> + return 0;
> +}
> +
> +static long vhost_get_vring_base(struct vhost_mdev *m, void __user *argp)
> +{
> + struct vhost_virtqueue *vq;
> + u32 idx;
> + long r;
> +
> + r = get_user(idx, (u32 __user *)argp);
> + if (r < 0)
> + return r;
> + if (idx >= m->nvqs)
> + return -ENOBUFS;
> +
> + vq = &m->vqs[idx];
> +
> + // XXX: we need to support get_vring_base
> + //vq->last_avail_idx = virtio_mdev_readl(b->mdev, ...);
> +
> + return vhost_vring_ioctl(&m->dev, VHOST_GET_VRING_BASE, argp);
> +}
> +
> +static void vhost_mdev_release_backend(struct vhost_mdev *m)
> +{
> + if (!m->mdev)
> + return;
> +
> + if (m->state != VHOST_MDEV_S_STOPPED) {
> + m->state = VHOST_MDEV_S_STOPPED;
> + vhost_mdev_stop_backend(m);
> + }
> +
> + vhost_dev_stop(&m->dev);
> + vhost_dev_cleanup(&m->dev);
> +
> + kfree(m->dev.vqs);
> + kfree(m->vqs);
> +
> + vfio_device_put(m->vfio_device);
> + vfio_group_put_external_user(m->vfio_group);
> +
> + m->mdev = NULL;
> +}
> +
> +static long vhost_mdev_set_backend(struct vhost_mdev *m,
> + struct vhost_mdev_backend __user *argp)
> +{
> + struct vhost_mdev_backend backend;
> + struct mdev_device *mdev;
> + struct vhost_dev *dev;
> + struct vhost_virtqueue **vqs;
> + struct file *file;
> + struct vfio_device *device;
> + struct vfio_group *group;
> + unsigned long magic;
> + u64 features;
> + int i, nvqs;
> + long r;
> +
> + vhost_mdev_release_backend(m);
> +
> + if (copy_from_user(&backend, argp, sizeof(backend))) {
> + r = -EFAULT;
> + goto err;
> + }
> +
> + file = fget(backend.group_fd);
> + if (!file) {
> + r = -EBADF;
> + goto err;
> + }
> +
> + group = vfio_group_get_external_user(file);
> + fput(file);
> + if (IS_ERR(group)) {
> + r = PTR_ERR(group);
> + goto err;
> + }
> +
> + device = vfio_device_get_from_fd(group, backend.device_fd);
> + if (!IS_ERR(device)) {
> + r = PTR_ERR(device);
> + goto err_put_group;
> + }
> +
> + if (!vfio_device_ops_match(device, &vfio_mdev_dev_ops)) {
> + r = -EINVAL;
> + goto err_put_device;
> + }
It looks to me that we can avoid to expose vfio_mdev_dev_ops here.
> +
> + mdev = vfio_device_data(m->vfio_device);
> +
> + magic = virtio_mdev_readl(mdev, VIRTIO_MDEV_MAGIC_VALUE);
> + if (magic != ('v' | 'i' << 8 | 'r' << 16 | 't' << 24)) {
> + r = -ENODEV;
> + goto err_put_device;
> + }
> +
> + mdev_reset(mdev);
> + mdev_add_status(mdev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
> + mdev_add_status(mdev, VIRTIO_CONFIG_S_DRIVER);
> +
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_DEVICE_FEATURES_SEL, 1);
> + features = virtio_mdev_readl(mdev, VIRTIO_MDEV_DEVICE_FEATURES);
> + features <<= 32;
> +
> + virtio_mdev_writel(mdev, VIRTIO_MDEV_DEVICE_FEATURES_SEL, 0);
> + features |= virtio_mdev_readl(mdev, VIRTIO_MDEV_DEVICE_FEATURES);
> +
> + if (!(features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))) {
> + r = -EINVAL;
> + goto err_put_device;
> + }
> +
> + m->features = features;
> +
> + nvqs = virtio_mdev_readl(mdev, VIRTIO_MDEV_QUEUE_NUM_MAX);
> + m->nvqs = nvqs;
> +
> + m->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue),
> + GFP_KERNEL);
> + if (!m->vqs) {
> + r = -ENOMEM;
> + goto err_put_device;
> + }
> +
> + vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
> + if (!vqs) {
> + r = -ENOMEM;
> + goto err_free_vqs;
> + }
> +
> + dev = &m->dev;
> + for (i = 0; i < nvqs; i++) {
> + vqs[i] = &m->vqs[i];
> + vqs[i]->handle_kick = handle_vq_kick;
> + }
> + vhost_dev_init(dev, vqs, nvqs, 0, 0, 0);
> +
> + m->vfio_group = group;
> + m->vfio_device = device;
> + m->mdev = mdev;
> +
> + return 0;
> +
> +err_free_vqs:
> + kfree(m->vqs);
> +err_put_device:
> + vfio_device_put(device);
> +err_put_group:
> + vfio_group_put_external_user(group);
> +err:
> + return r;
> +}
> +
> +static int vhost_mdev_open(struct inode *inode, struct file *f)
> +{
> + struct vhost_mdev *m;
> +
> + m = kzalloc(sizeof(*m), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> + if (!m)
> + return -ENOMEM;
> +
> + mutex_init(&m->mutex);
> + f->private_data = m;
> +
> + return 0;
> +}
> +
> +static int vhost_mdev_release(struct inode *inode, struct file *f)
> +{
> + struct vhost_mdev *m = f->private_data;
> +
> + vhost_mdev_release_backend(m);
> + mutex_destroy(&m->mutex);
> + kfree(m);
> +
> + return 0;
> +}
> +
> +static long vhost_mdev_ioctl(struct file *f, unsigned int cmd,
> + unsigned long arg)
> +{
> + void __user *argp = (void __user *)arg;
> + struct vhost_mdev *m = f->private_data;
> + long r;
> +
> + mutex_lock(&m->mutex);
> +
> + if (cmd == VHOST_MDEV_SET_BACKEND) {
> + r = vhost_mdev_set_backend(m, argp);
> + goto done;
> + }
> +
> + if (!m->mdev) {
> + r = -EINVAL;
> + goto done;
> + }
> +
> + switch (cmd) {
> + case VHOST_MDEV_SET_STATE:
> + r = vhost_set_state(m, argp);
> + break;
> + case VHOST_GET_FEATURES:
> + r = vhost_get_features(m, argp);
> + break;
> + case VHOST_SET_FEATURES:
> + r = vhost_set_features(m, argp);
> + break;
> + case VHOST_GET_VRING_BASE:
> + r = vhost_get_vring_base(m, argp);
> + break;
> + default:
> + r = vhost_dev_ioctl(&m->dev, cmd, argp);
> + if (r == -ENOIOCTLCMD)
> + r = vhost_vring_ioctl(&m->dev, cmd, argp);
> + }
> +
> +done:
> + mutex_lock(&m->mutex);
> + return r;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static long vhost_mdev_compat_ioctl(struct file *f, unsigned int ioctl,
> + unsigned long arg)
> +{
> + return vhost_mdev_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
> +}
> +#endif
> +
> +static const struct file_operations vhost_mdev_fops = {
> + .owner = THIS_MODULE,
> + .release = vhost_mdev_release,
> + .unlocked_ioctl = vhost_mdev_ioctl,
> +#ifdef CONFIG_COMPAT
> + .compat_ioctl = vhost_mdev_compat_ioctl,
> +#endif
> + .open = vhost_mdev_open,
> + .llseek = noop_llseek,
> +};
> +
> +static struct miscdevice vhost_mdev_misc = {
> + .minor = MISC_DYNAMIC_MINOR,
> + .name = "vhost-mdev",
> + .fops = &vhost_mdev_fops,
> +};
> +
> +static int __init vhost_mdev_init(void)
> +{
> + return misc_register(&vhost_mdev_misc);
> +}
> +module_init(vhost_mdev_init);
> +
> +static void __exit vhost_mdev_exit(void)
> +{
> + misc_deregister(&vhost_mdev_misc);
> +}
> +module_exit(vhost_mdev_exit);
> +
> +MODULE_VERSION("0.0.0");
> +MODULE_LICENSE("GPL v2");
> +MODULE_DESCRIPTION("Hardware vhost accelerator abstraction");
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 5dc174ac8cac..0f7236a17a56 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -426,8 +426,7 @@ bool vhost_exceeds_weight(struct vhost_virtqueue *vq,
> }
> EXPORT_SYMBOL_GPL(vhost_exceeds_weight);
>
> -static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
> - unsigned int num)
> +size_t vhost_get_avail_size(struct vhost_virtqueue *vq, unsigned int num)
> {
> size_t event __maybe_unused =
> vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
> @@ -435,9 +434,9 @@ static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
> return sizeof(*vq->avail) +
> sizeof(*vq->avail->ring) * num + event;
> }
> +EXPORT_SYMBOL_GPL(vhost_get_avail_size);
>
> -static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
> - unsigned int num)
> +size_t vhost_get_used_size(struct vhost_virtqueue *vq, unsigned int num)
> {
> size_t event __maybe_unused =
> vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
> @@ -445,12 +444,13 @@ static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
> return sizeof(*vq->used) +
> sizeof(*vq->used->ring) * num + event;
> }
> +EXPORT_SYMBOL_GPL(vhost_get_used_size);
>
> -static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
> - unsigned int num)
> +size_t vhost_get_desc_size(struct vhost_virtqueue *vq, unsigned int num)
> {
> return sizeof(*vq->desc) * num;
> }
> +EXPORT_SYMBOL_GPL(vhost_get_desc_size);
>
> void vhost_dev_init(struct vhost_dev *dev,
> struct vhost_virtqueue **vqs, int nvqs,
> @@ -2617,6 +2617,33 @@ struct vhost_msg_node *vhost_dequeue_msg(struct vhost_dev *dev,
> }
> EXPORT_SYMBOL_GPL(vhost_dequeue_msg);
>
> +bool vhost_translate_ring_addr(struct vhost_virtqueue *vq, u64 ring_addr,
> + u64 len, u64 *addr)
> +{
> + struct vhost_umem *umem = vq->umem;
> + struct vhost_umem_node *u;
> +
> + if (vhost_overflow(ring_addr, len))
> + return false;
> +
> + if (vq->iotlb) {
> + /* Ring address is already IOVA */
> + *addr = ring_addr;
> + return true;
> + }
> +
> + /* Ring address is host virtual address. */
> + list_for_each_entry(u, &umem->umem_list, link) {
> + if (u->userspace_addr <= ring_addr &&
> + u->userspace_addr + u->size >= ring_addr + len) {
> + *addr = ring_addr - u->userspace_addr + u->start;
> + return true;
> + }
> + }
> +
> + return false;
> +}
> +EXPORT_SYMBOL_GPL(vhost_translate_ring_addr);
As we've discussed, this is necessary.
Thanks
>
> static int __init vhost_init(void)
> {
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index e9ed2722b633..294a6bcb6adf 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -189,6 +189,12 @@ long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
> long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
> bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
> bool vhost_log_access_ok(struct vhost_dev *);
> +bool vhost_translate_ring_addr(struct vhost_virtqueue *vq, u64 ring_addr,
> + u64 len, u64 *addr);
> +
> +size_t vhost_get_avail_size(struct vhost_virtqueue *vq, unsigned int num);
> +size_t vhost_get_used_size(struct vhost_virtqueue *vq, unsigned int num);
> +size_t vhost_get_desc_size(struct vhost_virtqueue *vq, unsigned int num);
>
> int vhost_get_vq_desc(struct vhost_virtqueue *,
> struct iovec iov[], unsigned int iov_count,
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index 40d028eed645..7213aedc8506 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -116,4 +116,14 @@
> #define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64)
> #define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int)
>
> +/* VHOST_MDEV specific defines */
> +
> +#define VHOST_MDEV_SET_BACKEND _IOW(VHOST_VIRTIO, 0x70, \
> + struct vhost_mdev_backend)
> +#define VHOST_MDEV_SET_STATE _IOW(VHOST_VIRTIO, 0x71, __u64)
> +
> +#define VHOST_MDEV_S_STOPPED 0
> +#define VHOST_MDEV_S_RUNNING 1
> +#define VHOST_MDEV_S_MAX 2
> +
> #endif
> diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
> index c907290ff065..f06f0dbb7e51 100644
> --- a/include/uapi/linux/vhost_types.h
> +++ b/include/uapi/linux/vhost_types.h
> @@ -119,6 +119,11 @@ struct vhost_scsi_target {
> unsigned short reserved;
> };
>
> +struct vhost_mdev_backend {
> + int group_fd;
> + int device_fd;
> +};
> +
> /* Feature bits */
> /* Log all write descriptors. Can be changed while device is active. */
> #define VHOST_F_LOG_ALL 26
On Tue, Sep 17, 2019 at 11:32:03AM +0800, Jason Wang wrote:
> On 2019/9/17 上午9:02, Tiwei Bie wrote:
> > This RFC is to demonstrate below ideas,
> >
> > a) Build vhost-mdev on top of the same abstraction defined in
> > the virtio-mdev series [1];
> >
> > b) Introduce /dev/vhost-mdev to do vhost ioctls and support
> > setting mdev device as backend;
> >
> > Now the userspace API looks like this:
> >
> > - Userspace generates a compatible mdev device;
> >
> > - Userspace opens this mdev device with VFIO API (including
> > doing IOMMU programming for this mdev device with VFIO's
> > container/group based interface);
> >
> > - Userspace opens /dev/vhost-mdev and gets vhost fd;
> >
> > - Userspace uses vhost ioctls to setup vhost (userspace should
> > do VHOST_MDEV_SET_BACKEND ioctl with VFIO group fd and device
> > fd first before doing other vhost ioctls);
> >
> > Only compile test has been done for this series for now.
>
>
> Have a hard thought on the architecture:
Thanks a lot! Do appreciate it!
>
> 1) Create a vhost char device and pass vfio mdev device fd to it as a
> backend and translate vhost-mdev ioctl to virtio mdev transport (e.g
> read/write). DMA was done through the VFIO DMA mapping on the container that
> is attached.
Yeah, that's what we are doing in this series.
>
> We have two more choices:
>
> 2) Use vfio-mdev but do not create vhost-mdev device, instead, just
> implement vhost ioctl on vfio_device_ops, and translate them into
> virtio-mdev transport or just pass ioctl to parent.
Yeah. Instead of introducing /dev/vhost-mdev char device, do
vhost ioctls on VFIO device fd directly. That's what we did
in RFC v3.
>
> 3) Don't use vfio-mdev, create a new vhost-mdev driver, during probe still
> try to add dev to vfio group and talk to parent with device specific ops
If my understanding is correct, this means we need to introduce
a new VFIO device driver to replace the existing vfio-mdev driver
in our case. Below is a quick draft just to show my understanding:
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <linux/mdev.h>
#include "mdev_private.h"
/* XXX: we need a proper way to include below vhost header. */
#include "../../vhost/vhost.h"
static int vfio_vhost_mdev_open(void *device_data)
{
if (!try_module_get(THIS_MODULE))
return -ENODEV;
/* ... */
vhost_dev_init(...);
return 0;
}
static void vfio_vhost_mdev_release(void *device_data)
{
/* ... */
module_put(THIS_MODULE);
}
static long vfio_vhost_mdev_unlocked_ioctl(void *device_data,
unsigned int cmd, unsigned long arg)
{
struct mdev_device *mdev = device_data;
struct mdev_parent *parent = mdev->parent;
/*
* Use vhost ioctls.
*
* We will have a different parent_ops design.
* And potentially, we can share the same parent_ops
* with virtio_mdev.
*/
switch (cmd) {
case VHOST_GET_FEATURES:
parent->ops->get_features(mdev, ...);
break;
/* ... */
}
return 0;
}
static ssize_t vfio_vhost_mdev_read(void *device_data, char __user *buf,
size_t count, loff_t *ppos)
{
/* ... */
return 0;
}
static ssize_t vfio_vhost_mdev_write(void *device_data, const char __user *buf,
size_t count, loff_t *ppos)
{
/* ... */
return 0;
}
static int vfio_vhost_mdev_mmap(void *device_data, struct vm_area_struct *vma)
{
/* ... */
return 0;
}
static const struct vfio_device_ops vfio_vhost_mdev_dev_ops = {
.name = "vfio-vhost-mdev",
.open = vfio_vhost_mdev_open,
.release = vfio_vhost_mdev_release,
.ioctl = vfio_vhost_mdev_unlocked_ioctl,
.read = vfio_vhost_mdev_read,
.write = vfio_vhost_mdev_write,
.mmap = vfio_vhost_mdev_mmap,
};
static int vfio_vhost_mdev_probe(struct device *dev)
{
struct mdev_device *mdev = to_mdev_device(dev);
/* ... */
return vfio_add_group_dev(dev, &vfio_vhost_mdev_dev_ops, mdev);
}
static void vfio_vhost_mdev_remove(struct device *dev)
{
/* ... */
vfio_del_group_dev(dev);
}
static struct mdev_driver vfio_vhost_mdev_driver = {
.name = "vfio_vhost_mdev",
.probe = vfio_vhost_mdev_probe,
.remove = vfio_vhost_mdev_remove,
};
static int __init vfio_vhost_mdev_init(void)
{
return mdev_register_driver(&vfio_vhost_mdev_driver, THIS_MODULE);
}
module_init(vfio_vhost_mdev_init)
static void __exit vfio_vhost_mdev_exit(void)
{
mdev_unregister_driver(&vfio_vhost_mdev_driver);
}
module_exit(vfio_vhost_mdev_exit)
>
> So I have some questions:
>
> 1) Compared to method 2, what's the advantage of creating a new vhost char
> device? I guess it's for keep the API compatibility?
One benefit is that we can avoid doing vhost ioctls on
VFIO device fd.
>
> 2) For method 2, is there any easy way for user/admin to distinguish e.g
> ordinary vfio-mdev for vhost from ordinary vfio-mdev?
I think device-api could be a choice.
> I saw you introduce
> ops matching helper but it's not friendly to management.
The ops matching helper is just to check whether a given
vfio-device is based on a mdev device.
>
> 3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
> assumes the parameter comes from userspace, it prevents support kernel
> virtio drivers.
>
> 4) So comes the idea of method 3, since it register a new vhost-mdev driver,
> we can use device specific ops instead of VFIO ones, then we can have a
> common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
As the above draft shows, this requires introducing a new
VFIO device driver. I think Alex's opinion matters here.
Thanks,
Tiwei
>
> What's your thoughts?
>
> Thanks
>
>
> >
> > RFCv3: https://patchwork.kernel.org/patch/11117785/
> >
> > [1] https://lkml.org/lkml/2019/9/10/135
> >
> > Tiwei Bie (3):
> > vfio: support getting vfio device from device fd
> > vfio: support checking vfio driver by device ops
> > vhost: introduce mdev based hardware backend
> >
> > drivers/vfio/mdev/vfio_mdev.c | 3 +-
> > drivers/vfio/vfio.c | 32 +++
> > drivers/vhost/Kconfig | 9 +
> > drivers/vhost/Makefile | 3 +
> > drivers/vhost/mdev.c | 462 +++++++++++++++++++++++++++++++
> > drivers/vhost/vhost.c | 39 ++-
> > drivers/vhost/vhost.h | 6 +
> > include/linux/vfio.h | 11 +
> > include/uapi/linux/vhost.h | 10 +
> > include/uapi/linux/vhost_types.h | 5 +
> > 10 files changed, 573 insertions(+), 7 deletions(-)
> > create mode 100644 drivers/vhost/mdev.c
> >
This patch introduces the support for checking the VFIO driver
by device ops. And vfio-mdev's device ops is also exported to
make it possible to check whether a VFIO device is based on a
mdev device.
Signed-off-by: Tiwei Bie <[email protected]>
---
drivers/vfio/mdev/vfio_mdev.c | 3 ++-
drivers/vfio/vfio.c | 7 +++++++
include/linux/vfio.h | 7 +++++++
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c
index 30964a4e0a28..e0f31c5a5db2 100644
--- a/drivers/vfio/mdev/vfio_mdev.c
+++ b/drivers/vfio/mdev/vfio_mdev.c
@@ -98,7 +98,7 @@ static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma)
return parent->ops->mmap(mdev, vma);
}
-static const struct vfio_device_ops vfio_mdev_dev_ops = {
+const struct vfio_device_ops vfio_mdev_dev_ops = {
.name = "vfio-mdev",
.open = vfio_mdev_open,
.release = vfio_mdev_release,
@@ -107,6 +107,7 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = {
.write = vfio_mdev_write,
.mmap = vfio_mdev_mmap,
};
+EXPORT_SYMBOL_GPL(vfio_mdev_dev_ops);
static int vfio_mdev_probe(struct device *dev)
{
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 697fd079bb3f..1145110909e4 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1806,6 +1806,13 @@ long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
+bool vfio_device_ops_match(struct vfio_device *device,
+ const struct vfio_device_ops *ops)
+{
+ return device->ops == ops;
+}
+EXPORT_SYMBOL_GPL(vfio_device_ops_match);
+
/**
* Sub-module support
*/
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index e75b24fd7c5c..741c5bb567a8 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -56,6 +56,8 @@ extern struct vfio_device *vfio_device_get_from_fd(struct vfio_group *group,
int device_fd);
extern void vfio_device_put(struct vfio_device *device);
extern void *vfio_device_data(struct vfio_device *device);
+extern bool vfio_device_ops_match(struct vfio_device *device,
+ const struct vfio_device_ops *ops);
/**
* struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
@@ -199,4 +201,9 @@ extern int vfio_virqfd_enable(void *opaque,
void *data, struct virqfd **pvirqfd, int fd);
extern void vfio_virqfd_disable(struct virqfd **pvirqfd);
+/*
+ * VFIO device ops
+ */
+extern const struct vfio_device_ops vfio_mdev_dev_ops;
+
#endif /* VFIO_H */
--
2.17.1
On 2019/9/17 上午9:02, Tiwei Bie wrote:
> This RFC is to demonstrate below ideas,
>
> a) Build vhost-mdev on top of the same abstraction defined in
> the virtio-mdev series [1];
>
> b) Introduce /dev/vhost-mdev to do vhost ioctls and support
> setting mdev device as backend;
>
> Now the userspace API looks like this:
>
> - Userspace generates a compatible mdev device;
>
> - Userspace opens this mdev device with VFIO API (including
> doing IOMMU programming for this mdev device with VFIO's
> container/group based interface);
>
> - Userspace opens /dev/vhost-mdev and gets vhost fd;
>
> - Userspace uses vhost ioctls to setup vhost (userspace should
> do VHOST_MDEV_SET_BACKEND ioctl with VFIO group fd and device
> fd first before doing other vhost ioctls);
>
> Only compile test has been done for this series for now.
Have a hard thought on the architecture:
1) Create a vhost char device and pass vfio mdev device fd to it as a
backend and translate vhost-mdev ioctl to virtio mdev transport (e.g
read/write). DMA was done through the VFIO DMA mapping on the container
that is attached.
We have two more choices:
2) Use vfio-mdev but do not create vhost-mdev device, instead, just
implement vhost ioctl on vfio_device_ops, and translate them into
virtio-mdev transport or just pass ioctl to parent.
3) Don't use vfio-mdev, create a new vhost-mdev driver, during probe
still try to add dev to vfio group and talk to parent with device
specific ops
So I have some questions:
1) Compared to method 2, what's the advantage of creating a new vhost
char device? I guess it's for keep the API compatibility?
2) For method 2, is there any easy way for user/admin to distinguish e.g
ordinary vfio-mdev for vhost from ordinary vfio-mdev? I saw you
introduce ops matching helper but it's not friendly to management.
3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
assumes the parameter comes from userspace, it prevents support kernel
virtio drivers.
4) So comes the idea of method 3, since it register a new vhost-mdev
driver, we can use device specific ops instead of VFIO ones, then we can
have a common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
What's your thoughts?
Thanks
>
> RFCv3: https://patchwork.kernel.org/patch/11117785/
>
> [1] https://lkml.org/lkml/2019/9/10/135
>
> Tiwei Bie (3):
> vfio: support getting vfio device from device fd
> vfio: support checking vfio driver by device ops
> vhost: introduce mdev based hardware backend
>
> drivers/vfio/mdev/vfio_mdev.c | 3 +-
> drivers/vfio/vfio.c | 32 +++
> drivers/vhost/Kconfig | 9 +
> drivers/vhost/Makefile | 3 +
> drivers/vhost/mdev.c | 462 +++++++++++++++++++++++++++++++
> drivers/vhost/vhost.c | 39 ++-
> drivers/vhost/vhost.h | 6 +
> include/linux/vfio.h | 11 +
> include/uapi/linux/vhost.h | 10 +
> include/uapi/linux/vhost_types.h | 5 +
> 10 files changed, 573 insertions(+), 7 deletions(-)
> create mode 100644 drivers/vhost/mdev.c
>
On 2019/9/17 下午6:58, Tiwei Bie wrote:
> On Tue, Sep 17, 2019 at 11:32:03AM +0800, Jason Wang wrote:
>> On 2019/9/17 上午9:02, Tiwei Bie wrote:
>>> This RFC is to demonstrate below ideas,
>>>
>>> a) Build vhost-mdev on top of the same abstraction defined in
>>> the virtio-mdev series [1];
>>>
>>> b) Introduce /dev/vhost-mdev to do vhost ioctls and support
>>> setting mdev device as backend;
>>>
>>> Now the userspace API looks like this:
>>>
>>> - Userspace generates a compatible mdev device;
>>>
>>> - Userspace opens this mdev device with VFIO API (including
>>> doing IOMMU programming for this mdev device with VFIO's
>>> container/group based interface);
>>>
>>> - Userspace opens /dev/vhost-mdev and gets vhost fd;
>>>
>>> - Userspace uses vhost ioctls to setup vhost (userspace should
>>> do VHOST_MDEV_SET_BACKEND ioctl with VFIO group fd and device
>>> fd first before doing other vhost ioctls);
>>>
>>> Only compile test has been done for this series for now.
>>
>> Have a hard thought on the architecture:
> Thanks a lot! Do appreciate it!
>
>> 1) Create a vhost char device and pass vfio mdev device fd to it as a
>> backend and translate vhost-mdev ioctl to virtio mdev transport (e.g
>> read/write). DMA was done through the VFIO DMA mapping on the container that
>> is attached.
> Yeah, that's what we are doing in this series.
>
>> We have two more choices:
>>
>> 2) Use vfio-mdev but do not create vhost-mdev device, instead, just
>> implement vhost ioctl on vfio_device_ops, and translate them into
>> virtio-mdev transport or just pass ioctl to parent.
> Yeah. Instead of introducing /dev/vhost-mdev char device, do
> vhost ioctls on VFIO device fd directly. That's what we did
> in RFC v3.
>
>> 3) Don't use vfio-mdev, create a new vhost-mdev driver, during probe still
>> try to add dev to vfio group and talk to parent with device specific ops
> If my understanding is correct, this means we need to introduce
> a new VFIO device driver to replace the existing vfio-mdev driver
> in our case. Below is a quick draft just to show my understanding:
>
> #include <linux/init.h>
> #include <linux/module.h>
> #include <linux/device.h>
> #include <linux/kernel.h>
> #include <linux/slab.h>
> #include <linux/vfio.h>
> #include <linux/mdev.h>
>
> #include "mdev_private.h"
>
> /* XXX: we need a proper way to include below vhost header. */
> #include "../../vhost/vhost.h"
>
> static int vfio_vhost_mdev_open(void *device_data)
> {
> if (!try_module_get(THIS_MODULE))
> return -ENODEV;
>
> /* ... */
> vhost_dev_init(...);
>
> return 0;
> }
>
> static void vfio_vhost_mdev_release(void *device_data)
> {
> /* ... */
> module_put(THIS_MODULE);
> }
>
> static long vfio_vhost_mdev_unlocked_ioctl(void *device_data,
> unsigned int cmd, unsigned long arg)
> {
> struct mdev_device *mdev = device_data;
> struct mdev_parent *parent = mdev->parent;
>
> /*
> * Use vhost ioctls.
> *
> * We will have a different parent_ops design.
> * And potentially, we can share the same parent_ops
> * with virtio_mdev.
> */
> switch (cmd) {
> case VHOST_GET_FEATURES:
> parent->ops->get_features(mdev, ...);
> break;
> /* ... */
> }
>
> return 0;
> }
>
> static ssize_t vfio_vhost_mdev_read(void *device_data, char __user *buf,
> size_t count, loff_t *ppos)
> {
> /* ... */
> return 0;
> }
>
> static ssize_t vfio_vhost_mdev_write(void *device_data, const char __user *buf,
> size_t count, loff_t *ppos)
> {
> /* ... */
> return 0;
> }
>
> static int vfio_vhost_mdev_mmap(void *device_data, struct vm_area_struct *vma)
> {
> /* ... */
> return 0;
> }
>
> static const struct vfio_device_ops vfio_vhost_mdev_dev_ops = {
> .name = "vfio-vhost-mdev",
> .open = vfio_vhost_mdev_open,
> .release = vfio_vhost_mdev_release,
> .ioctl = vfio_vhost_mdev_unlocked_ioctl,
> .read = vfio_vhost_mdev_read,
> .write = vfio_vhost_mdev_write,
> .mmap = vfio_vhost_mdev_mmap,
> };
>
> static int vfio_vhost_mdev_probe(struct device *dev)
> {
> struct mdev_device *mdev = to_mdev_device(dev);
>
> /* ... */
> return vfio_add_group_dev(dev, &vfio_vhost_mdev_dev_ops, mdev);
> }
>
> static void vfio_vhost_mdev_remove(struct device *dev)
> {
> /* ... */
> vfio_del_group_dev(dev);
> }
>
> static struct mdev_driver vfio_vhost_mdev_driver = {
> .name = "vfio_vhost_mdev",
> .probe = vfio_vhost_mdev_probe,
> .remove = vfio_vhost_mdev_remove,
> };
>
> static int __init vfio_vhost_mdev_init(void)
> {
> return mdev_register_driver(&vfio_vhost_mdev_driver, THIS_MODULE);
> }
> module_init(vfio_vhost_mdev_init)
>
> static void __exit vfio_vhost_mdev_exit(void)
> {
> mdev_unregister_driver(&vfio_vhost_mdev_driver);
> }
> module_exit(vfio_vhost_mdev_exit)
Yes, something like this basically.
>> So I have some questions:
>>
>> 1) Compared to method 2, what's the advantage of creating a new vhost char
>> device? I guess it's for keep the API compatibility?
> One benefit is that we can avoid doing vhost ioctls on
> VFIO device fd.
Yes, but any benefit from doing this?
>
>> 2) For method 2, is there any easy way for user/admin to distinguish e.g
>> ordinary vfio-mdev for vhost from ordinary vfio-mdev?
> I think device-api could be a choice.
Ok.
>
>> I saw you introduce
>> ops matching helper but it's not friendly to management.
> The ops matching helper is just to check whether a given
> vfio-device is based on a mdev device.
>
>> 3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
>> assumes the parameter comes from userspace, it prevents support kernel
>> virtio drivers.
>>
>> 4) So comes the idea of method 3, since it register a new vhost-mdev driver,
>> we can use device specific ops instead of VFIO ones, then we can have a
>> common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
> As the above draft shows, this requires introducing a new
> VFIO device driver. I think Alex's opinion matters here.
Yes, it is.
Thanks
> Thanks,
> Tiwei
>
>> What's your thoughts?
>>
>> Thanks
>>
>>
>>> RFCv3: https://patchwork.kernel.org/patch/11117785/
>>>
>>> [1] https://lkml.org/lkml/2019/9/10/135
>>>
>>> Tiwei Bie (3):
>>> vfio: support getting vfio device from device fd
>>> vfio: support checking vfio driver by device ops
>>> vhost: introduce mdev based hardware backend
>>>
>>> drivers/vfio/mdev/vfio_mdev.c | 3 +-
>>> drivers/vfio/vfio.c | 32 +++
>>> drivers/vhost/Kconfig | 9 +
>>> drivers/vhost/Makefile | 3 +
>>> drivers/vhost/mdev.c | 462 +++++++++++++++++++++++++++++++
>>> drivers/vhost/vhost.c | 39 ++-
>>> drivers/vhost/vhost.h | 6 +
>>> include/linux/vfio.h | 11 +
>>> include/uapi/linux/vhost.h | 10 +
>>> include/uapi/linux/vhost_types.h | 5 +
>>> 10 files changed, 573 insertions(+), 7 deletions(-)
>>> create mode 100644 drivers/vhost/mdev.c
>>>
On Wed, Sep 18, 2019 at 01:51:21PM +0800, Jason Wang wrote:
>
> On 2019/9/17 下午6:58, Tiwei Bie wrote:
> > On Tue, Sep 17, 2019 at 11:32:03AM +0800, Jason Wang wrote:
> > > On 2019/9/17 上午9:02, Tiwei Bie wrote:
> > > > This RFC is to demonstrate below ideas,
> > > >
> > > > a) Build vhost-mdev on top of the same abstraction defined in
> > > > the virtio-mdev series [1];
> > > >
> > > > b) Introduce /dev/vhost-mdev to do vhost ioctls and support
> > > > setting mdev device as backend;
> > > >
> > > > Now the userspace API looks like this:
> > > >
> > > > - Userspace generates a compatible mdev device;
> > > >
> > > > - Userspace opens this mdev device with VFIO API (including
> > > > doing IOMMU programming for this mdev device with VFIO's
> > > > container/group based interface);
> > > >
> > > > - Userspace opens /dev/vhost-mdev and gets vhost fd;
> > > >
> > > > - Userspace uses vhost ioctls to setup vhost (userspace should
> > > > do VHOST_MDEV_SET_BACKEND ioctl with VFIO group fd and device
> > > > fd first before doing other vhost ioctls);
> > > >
> > > > Only compile test has been done for this series for now.
> > >
> > > Have a hard thought on the architecture:
> > Thanks a lot! Do appreciate it!
> >
> > > 1) Create a vhost char device and pass vfio mdev device fd to it as a
> > > backend and translate vhost-mdev ioctl to virtio mdev transport (e.g
> > > read/write). DMA was done through the VFIO DMA mapping on the container that
> > > is attached.
> > Yeah, that's what we are doing in this series.
> >
> > > We have two more choices:
> > >
> > > 2) Use vfio-mdev but do not create vhost-mdev device, instead, just
> > > implement vhost ioctl on vfio_device_ops, and translate them into
> > > virtio-mdev transport or just pass ioctl to parent.
> > Yeah. Instead of introducing /dev/vhost-mdev char device, do
> > vhost ioctls on VFIO device fd directly. That's what we did
> > in RFC v3.
> >
> > > 3) Don't use vfio-mdev, create a new vhost-mdev driver, during probe still
> > > try to add dev to vfio group and talk to parent with device specific ops
> > If my understanding is correct, this means we need to introduce
> > a new VFIO device driver to replace the existing vfio-mdev driver
> > in our case. Below is a quick draft just to show my understanding:
> >
> > #include <linux/init.h>
> > #include <linux/module.h>
> > #include <linux/device.h>
> > #include <linux/kernel.h>
> > #include <linux/slab.h>
> > #include <linux/vfio.h>
> > #include <linux/mdev.h>
> >
> > #include "mdev_private.h"
> >
> > /* XXX: we need a proper way to include below vhost header. */
> > #include "../../vhost/vhost.h"
> >
> > static int vfio_vhost_mdev_open(void *device_data)
> > {
> > if (!try_module_get(THIS_MODULE))
> > return -ENODEV;
> >
> > /* ... */
> > vhost_dev_init(...);
> >
> > return 0;
> > }
> >
> > static void vfio_vhost_mdev_release(void *device_data)
> > {
> > /* ... */
> > module_put(THIS_MODULE);
> > }
> >
> > static long vfio_vhost_mdev_unlocked_ioctl(void *device_data,
> > unsigned int cmd, unsigned long arg)
> > {
> > struct mdev_device *mdev = device_data;
> > struct mdev_parent *parent = mdev->parent;
> >
> > /*
> > * Use vhost ioctls.
> > *
> > * We will have a different parent_ops design.
> > * And potentially, we can share the same parent_ops
> > * with virtio_mdev.
> > */
> > switch (cmd) {
> > case VHOST_GET_FEATURES:
> > parent->ops->get_features(mdev, ...);
> > break;
> > /* ... */
> > }
> >
> > return 0;
> > }
> >
> > static ssize_t vfio_vhost_mdev_read(void *device_data, char __user *buf,
> > size_t count, loff_t *ppos)
> > {
> > /* ... */
> > return 0;
> > }
> >
> > static ssize_t vfio_vhost_mdev_write(void *device_data, const char __user *buf,
> > size_t count, loff_t *ppos)
> > {
> > /* ... */
> > return 0;
> > }
> >
> > static int vfio_vhost_mdev_mmap(void *device_data, struct vm_area_struct *vma)
> > {
> > /* ... */
> > return 0;
> > }
> >
> > static const struct vfio_device_ops vfio_vhost_mdev_dev_ops = {
> > .name = "vfio-vhost-mdev",
> > .open = vfio_vhost_mdev_open,
> > .release = vfio_vhost_mdev_release,
> > .ioctl = vfio_vhost_mdev_unlocked_ioctl,
> > .read = vfio_vhost_mdev_read,
> > .write = vfio_vhost_mdev_write,
> > .mmap = vfio_vhost_mdev_mmap,
> > };
> >
> > static int vfio_vhost_mdev_probe(struct device *dev)
> > {
> > struct mdev_device *mdev = to_mdev_device(dev);
> >
> > /* ... */
> > return vfio_add_group_dev(dev, &vfio_vhost_mdev_dev_ops, mdev);
> > }
> >
> > static void vfio_vhost_mdev_remove(struct device *dev)
> > {
> > /* ... */
> > vfio_del_group_dev(dev);
> > }
> >
> > static struct mdev_driver vfio_vhost_mdev_driver = {
> > .name = "vfio_vhost_mdev",
> > .probe = vfio_vhost_mdev_probe,
> > .remove = vfio_vhost_mdev_remove,
> > };
> >
> > static int __init vfio_vhost_mdev_init(void)
> > {
> > return mdev_register_driver(&vfio_vhost_mdev_driver, THIS_MODULE);
> > }
> > module_init(vfio_vhost_mdev_init)
> >
> > static void __exit vfio_vhost_mdev_exit(void)
> > {
> > mdev_unregister_driver(&vfio_vhost_mdev_driver);
> > }
> > module_exit(vfio_vhost_mdev_exit)
>
>
> Yes, something like this basically.
>
>
> > > So I have some questions:
> > >
> > > 1) Compared to method 2, what's the advantage of creating a new vhost char
> > > device? I guess it's for keep the API compatibility?
> > One benefit is that we can avoid doing vhost ioctls on
> > VFIO device fd.
>
>
> Yes, but any benefit from doing this?
It does seem a bit more modular, but it's certainly not a big deal.
> >
> > > 2) For method 2, is there any easy way for user/admin to distinguish e.g
> > > ordinary vfio-mdev for vhost from ordinary vfio-mdev?
> > I think device-api could be a choice.
>
>
> Ok.
>
>
> >
> > > I saw you introduce
> > > ops matching helper but it's not friendly to management.
> > The ops matching helper is just to check whether a given
> > vfio-device is based on a mdev device.
> >
> > > 3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
> > > assumes the parameter comes from userspace, it prevents support kernel
> > > virtio drivers.
> > >
> > > 4) So comes the idea of method 3, since it register a new vhost-mdev driver,
> > > we can use device specific ops instead of VFIO ones, then we can have a
> > > common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
> > As the above draft shows, this requires introducing a new
> > VFIO device driver. I think Alex's opinion matters here.
>
>
> Yes, it is.
>
> Thanks
>
>
> > Thanks,
> > Tiwei
> >
> > > What's your thoughts?
> > >
> > > Thanks
> > >
> > >
> > > > RFCv3: https://patchwork.kernel.org/patch/11117785/
> > > >
> > > > [1] https://lkml.org/lkml/2019/9/10/135
> > > >
> > > > Tiwei Bie (3):
> > > > vfio: support getting vfio device from device fd
> > > > vfio: support checking vfio driver by device ops
> > > > vhost: introduce mdev based hardware backend
> > > >
> > > > drivers/vfio/mdev/vfio_mdev.c | 3 +-
> > > > drivers/vfio/vfio.c | 32 +++
> > > > drivers/vhost/Kconfig | 9 +
> > > > drivers/vhost/Makefile | 3 +
> > > > drivers/vhost/mdev.c | 462 +++++++++++++++++++++++++++++++
> > > > drivers/vhost/vhost.c | 39 ++-
> > > > drivers/vhost/vhost.h | 6 +
> > > > include/linux/vfio.h | 11 +
> > > > include/uapi/linux/vhost.h | 10 +
> > > > include/uapi/linux/vhost_types.h | 5 +
> > > > 10 files changed, 573 insertions(+), 7 deletions(-)
> > > > create mode 100644 drivers/vhost/mdev.c
> > > >
On 2019/9/18 下午10:32, Michael S. Tsirkin wrote:
>>>> So I have some questions:
>>>>
>>>> 1) Compared to method 2, what's the advantage of creating a new vhost char
>>>> device? I guess it's for keep the API compatibility?
>>> One benefit is that we can avoid doing vhost ioctls on
>>> VFIO device fd.
>> Yes, but any benefit from doing this?
> It does seem a bit more modular, but it's certainly not a big deal.
Ok, if we go this way, it could be as simple as provide some callback to
vhost, then vhost can just forward the ioctl through parent_ops.
>
>>>> 2) For method 2, is there any easy way for user/admin to distinguish e.g
>>>> ordinary vfio-mdev for vhost from ordinary vfio-mdev?
>>> I think device-api could be a choice.
>> Ok.
>>
>>
>>>> I saw you introduce
>>>> ops matching helper but it's not friendly to management.
>>> The ops matching helper is just to check whether a given
>>> vfio-device is based on a mdev device.
>>>
>>>> 3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
>>>> assumes the parameter comes from userspace, it prevents support kernel
>>>> virtio drivers.
>>>>
>>>> 4) So comes the idea of method 3, since it register a new vhost-mdev driver,
>>>> we can use device specific ops instead of VFIO ones, then we can have a
>>>> common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
>>> As the above draft shows, this requires introducing a new
>>> VFIO device driver. I think Alex's opinion matters here.
Just to clarify, a new type of mdev driver but provides dummy
vfio_device_ops for VFIO to make container DMA ioctl work.
Thanks
>> Yes, it is.
>>
>> Thanks
>>
>>
On Thu, Sep 19, 2019 at 09:08:11PM +0800, Jason Wang wrote:
> On 2019/9/18 下午10:32, Michael S. Tsirkin wrote:
> > > > > So I have some questions:
> > > > >
> > > > > 1) Compared to method 2, what's the advantage of creating a new vhost char
> > > > > device? I guess it's for keep the API compatibility?
> > > > One benefit is that we can avoid doing vhost ioctls on
> > > > VFIO device fd.
> > > Yes, but any benefit from doing this?
> > It does seem a bit more modular, but it's certainly not a big deal.
>
> Ok, if we go this way, it could be as simple as provide some callback to
> vhost, then vhost can just forward the ioctl through parent_ops.
>
> >
> > > > > 2) For method 2, is there any easy way for user/admin to distinguish e.g
> > > > > ordinary vfio-mdev for vhost from ordinary vfio-mdev?
> > > > I think device-api could be a choice.
> > > Ok.
> > >
> > >
> > > > > I saw you introduce
> > > > > ops matching helper but it's not friendly to management.
> > > > The ops matching helper is just to check whether a given
> > > > vfio-device is based on a mdev device.
> > > >
> > > > > 3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
> > > > > assumes the parameter comes from userspace, it prevents support kernel
> > > > > virtio drivers.
> > > > >
> > > > > 4) So comes the idea of method 3, since it register a new vhost-mdev driver,
> > > > > we can use device specific ops instead of VFIO ones, then we can have a
> > > > > common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
> > > > As the above draft shows, this requires introducing a new
> > > > VFIO device driver. I think Alex's opinion matters here.
>
> Just to clarify, a new type of mdev driver but provides dummy
> vfio_device_ops for VFIO to make container DMA ioctl work.
I see. Thanks! IIUC, you mean we can provide a very tiny
VFIO device driver in drivers/vhost/mdev.c, e.g.:
static int vfio_vhost_mdev_open(void *device_data)
{
if (!try_module_get(THIS_MODULE))
return -ENODEV;
return 0;
}
static void vfio_vhost_mdev_release(void *device_data)
{
module_put(THIS_MODULE);
}
static const struct vfio_device_ops vfio_vhost_mdev_dev_ops = {
.name = "vfio-vhost-mdev",
.open = vfio_vhost_mdev_open,
.release = vfio_vhost_mdev_release,
};
static int vhost_mdev_probe(struct device *dev)
{
struct mdev_device *mdev = to_mdev_device(dev);
... Check the mdev device_id proposed in ...
... https://lkml.org/lkml/2019/9/12/151 ...
return vfio_add_group_dev(dev, &vfio_vhost_mdev_dev_ops, mdev);
}
static void vhost_mdev_remove(struct device *dev)
{
vfio_del_group_dev(dev);
}
static struct mdev_driver vhost_mdev_driver = {
.name = "vhost_mdev",
.probe = vhost_mdev_probe,
.remove = vhost_mdev_remove,
};
So we can bind above mdev driver to the virtio-mdev compatible
mdev devices when we want to use vhost-mdev.
After binding above driver to the mdev device, we can setup IOMMU
via VFIO and get VFIO device fd of this mdev device, and pass it
to vhost fd (/dev/vhost-mdev) with a SET_BACKEND ioctl.
Thanks,
Tiwei
>
> Thanks
>
>
> > > Yes, it is.
> > >
> > > Thanks
> > >
> > >
On 2019/9/19 下午11:45, Tiwei Bie wrote:
> On Thu, Sep 19, 2019 at 09:08:11PM +0800, Jason Wang wrote:
>> On 2019/9/18 下午10:32, Michael S. Tsirkin wrote:
>>>>>> So I have some questions:
>>>>>>
>>>>>> 1) Compared to method 2, what's the advantage of creating a new vhost char
>>>>>> device? I guess it's for keep the API compatibility?
>>>>> One benefit is that we can avoid doing vhost ioctls on
>>>>> VFIO device fd.
>>>> Yes, but any benefit from doing this?
>>> It does seem a bit more modular, but it's certainly not a big deal.
>> Ok, if we go this way, it could be as simple as provide some callback to
>> vhost, then vhost can just forward the ioctl through parent_ops.
>>
>>>>>> 2) For method 2, is there any easy way for user/admin to distinguish e.g
>>>>>> ordinary vfio-mdev for vhost from ordinary vfio-mdev?
>>>>> I think device-api could be a choice.
>>>> Ok.
>>>>
>>>>
>>>>>> I saw you introduce
>>>>>> ops matching helper but it's not friendly to management.
>>>>> The ops matching helper is just to check whether a given
>>>>> vfio-device is based on a mdev device.
>>>>>
>>>>>> 3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
>>>>>> assumes the parameter comes from userspace, it prevents support kernel
>>>>>> virtio drivers.
>>>>>>
>>>>>> 4) So comes the idea of method 3, since it register a new vhost-mdev driver,
>>>>>> we can use device specific ops instead of VFIO ones, then we can have a
>>>>>> common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
>>>>> As the above draft shows, this requires introducing a new
>>>>> VFIO device driver. I think Alex's opinion matters here.
>> Just to clarify, a new type of mdev driver but provides dummy
>> vfio_device_ops for VFIO to make container DMA ioctl work.
> I see. Thanks! IIUC, you mean we can provide a very tiny
> VFIO device driver in drivers/vhost/mdev.c, e.g.:
>
> static int vfio_vhost_mdev_open(void *device_data)
> {
> if (!try_module_get(THIS_MODULE))
> return -ENODEV;
> return 0;
> }
>
> static void vfio_vhost_mdev_release(void *device_data)
> {
> module_put(THIS_MODULE);
> }
>
> static const struct vfio_device_ops vfio_vhost_mdev_dev_ops = {
> .name = "vfio-vhost-mdev",
> .open = vfio_vhost_mdev_open,
> .release = vfio_vhost_mdev_release,
> };
>
> static int vhost_mdev_probe(struct device *dev)
> {
> struct mdev_device *mdev = to_mdev_device(dev);
>
> ... Check the mdev device_id proposed in ...
> ... https://lkml.org/lkml/2019/9/12/151 ...
To clarify, this should be done through the id_table fields in
vhost_mdev_driver, and it should claim it supports virtio-mdev device only:
static struct mdev_class_id id_table[] = {
{ MDEV_ID_VIRTIO },
{ 0 },
};
static struct mdev_driver vhost_mdev_driver = {
...
.id_table = id_table,
}
>
> return vfio_add_group_dev(dev, &vfio_vhost_mdev_dev_ops, mdev);
And in vfio_vhost_mdev_ops, all its need is to just implement vhost-net
ioctl and translate them to virtio-mdev transport (e.g device_ops I
proposed or ioctls other whatever other method) API. And it could have a
dummy ops implementation for the other device_ops.
> }
>
> static void vhost_mdev_remove(struct device *dev)
> {
> vfio_del_group_dev(dev);
> }
>
> static struct mdev_driver vhost_mdev_driver = {
> .name = "vhost_mdev",
> .probe = vhost_mdev_probe,
> .remove = vhost_mdev_remove,
> };
>
> So we can bind above mdev driver to the virtio-mdev compatible
> mdev devices when we want to use vhost-mdev.
>
> After binding above driver to the mdev device, we can setup IOMMU
> via VFIO and get VFIO device fd of this mdev device, and pass it
> to vhost fd (/dev/vhost-mdev) with a SET_BACKEND ioctl.
Then what vhost-mdev char device did is just forwarding ioctl back to
this vfio device fd which seems a overkill. It's simpler that just do
ioctl on the device ops directly.
Thanks
>
> Thanks,
> Tiwei
>
>> Thanks
>>
>>
>>>> Yes, it is.
>>>>
>>>> Thanks
>>>>
>>>>
On Fri, Sep 20, 2019 at 09:30:58AM +0800, Jason Wang wrote:
> On 2019/9/19 下午11:45, Tiwei Bie wrote:
> > On Thu, Sep 19, 2019 at 09:08:11PM +0800, Jason Wang wrote:
> > > On 2019/9/18 下午10:32, Michael S. Tsirkin wrote:
> > > > > > > So I have some questions:
> > > > > > >
> > > > > > > 1) Compared to method 2, what's the advantage of creating a new vhost char
> > > > > > > device? I guess it's for keep the API compatibility?
> > > > > > One benefit is that we can avoid doing vhost ioctls on
> > > > > > VFIO device fd.
> > > > > Yes, but any benefit from doing this?
> > > > It does seem a bit more modular, but it's certainly not a big deal.
> > > Ok, if we go this way, it could be as simple as provide some callback to
> > > vhost, then vhost can just forward the ioctl through parent_ops.
> > >
> > > > > > > 2) For method 2, is there any easy way for user/admin to distinguish e.g
> > > > > > > ordinary vfio-mdev for vhost from ordinary vfio-mdev?
> > > > > > I think device-api could be a choice.
> > > > > Ok.
> > > > >
> > > > >
> > > > > > > I saw you introduce
> > > > > > > ops matching helper but it's not friendly to management.
> > > > > > The ops matching helper is just to check whether a given
> > > > > > vfio-device is based on a mdev device.
> > > > > >
> > > > > > > 3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
> > > > > > > assumes the parameter comes from userspace, it prevents support kernel
> > > > > > > virtio drivers.
> > > > > > >
> > > > > > > 4) So comes the idea of method 3, since it register a new vhost-mdev driver,
> > > > > > > we can use device specific ops instead of VFIO ones, then we can have a
> > > > > > > common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
> > > > > > As the above draft shows, this requires introducing a new
> > > > > > VFIO device driver. I think Alex's opinion matters here.
> > > Just to clarify, a new type of mdev driver but provides dummy
> > > vfio_device_ops for VFIO to make container DMA ioctl work.
> > I see. Thanks! IIUC, you mean we can provide a very tiny
> > VFIO device driver in drivers/vhost/mdev.c, e.g.:
> >
> > static int vfio_vhost_mdev_open(void *device_data)
> > {
> > if (!try_module_get(THIS_MODULE))
> > return -ENODEV;
> > return 0;
> > }
> >
> > static void vfio_vhost_mdev_release(void *device_data)
> > {
> > module_put(THIS_MODULE);
> > }
> >
> > static const struct vfio_device_ops vfio_vhost_mdev_dev_ops = {
> > .name = "vfio-vhost-mdev",
> > .open = vfio_vhost_mdev_open,
> > .release = vfio_vhost_mdev_release,
> > };
> >
> > static int vhost_mdev_probe(struct device *dev)
> > {
> > struct mdev_device *mdev = to_mdev_device(dev);
> >
> > ... Check the mdev device_id proposed in ...
> > ... https://lkml.org/lkml/2019/9/12/151 ...
>
>
> To clarify, this should be done through the id_table fields in
> vhost_mdev_driver, and it should claim it supports virtio-mdev device only:
>
>
> static struct mdev_class_id id_table[] = {
> { MDEV_ID_VIRTIO },
> { 0 },
> };
>
>
> static struct mdev_driver vhost_mdev_driver = {
> ...
> .id_table = id_table,
> }
In this way, both of virtio-mdev and vhost-mdev will try to
take this device. We may want a way to let vhost-mdev take this
device only when users explicitly ask it to do it. Or maybe we
can have a different MDEV_ID for vhost-mdev but share the device
ops with virtio-mdev.
>
>
> >
> > return vfio_add_group_dev(dev, &vfio_vhost_mdev_dev_ops, mdev);
>
>
> And in vfio_vhost_mdev_ops, all its need is to just implement vhost-net
> ioctl and translate them to virtio-mdev transport (e.g device_ops I proposed
> or ioctls other whatever other method) API.
I see, so my previous understanding is basically correct:
https://lkml.org/lkml/2019/9/17/332
I.e. we won't have a separate vhost fd and we will do all vhost
ioctls on the VFIO device fd backed by this new VFIO driver.
> And it could have a dummy ops
> implementation for the other device_ops.
>
>
> > }
> >
> > static void vhost_mdev_remove(struct device *dev)
> > {
> > vfio_del_group_dev(dev);
> > }
> >
> > static struct mdev_driver vhost_mdev_driver = {
> > .name = "vhost_mdev",
> > .probe = vhost_mdev_probe,
> > .remove = vhost_mdev_remove,
> > };
> >
> > So we can bind above mdev driver to the virtio-mdev compatible
> > mdev devices when we want to use vhost-mdev.
> >
> > After binding above driver to the mdev device, we can setup IOMMU
> > via VFIO and get VFIO device fd of this mdev device, and pass it
> > to vhost fd (/dev/vhost-mdev) with a SET_BACKEND ioctl.
>
>
> Then what vhost-mdev char device did is just forwarding ioctl back to this
> vfio device fd which seems a overkill. It's simpler that just do ioctl on
> the device ops directly.
Yes.
Thanks,
Tiwei
>
> Thanks
>
>
> >
> > Thanks,
> > Tiwei
> >
> > > Thanks
> > >
> > >
> > > > > Yes, it is.
> > > > >
> > > > > Thanks
> > > > >
> > > > >
On 2019/9/20 上午10:16, Tiwei Bie wrote:
> On Fri, Sep 20, 2019 at 09:30:58AM +0800, Jason Wang wrote:
>> On 2019/9/19 下午11:45, Tiwei Bie wrote:
>>> On Thu, Sep 19, 2019 at 09:08:11PM +0800, Jason Wang wrote:
>>>> On 2019/9/18 下午10:32, Michael S. Tsirkin wrote:
>>>>>>>> So I have some questions:
>>>>>>>>
>>>>>>>> 1) Compared to method 2, what's the advantage of creating a new vhost char
>>>>>>>> device? I guess it's for keep the API compatibility?
>>>>>>> One benefit is that we can avoid doing vhost ioctls on
>>>>>>> VFIO device fd.
>>>>>> Yes, but any benefit from doing this?
>>>>> It does seem a bit more modular, but it's certainly not a big deal.
>>>> Ok, if we go this way, it could be as simple as provide some callback to
>>>> vhost, then vhost can just forward the ioctl through parent_ops.
>>>>
>>>>>>>> 2) For method 2, is there any easy way for user/admin to distinguish e.g
>>>>>>>> ordinary vfio-mdev for vhost from ordinary vfio-mdev?
>>>>>>> I think device-api could be a choice.
>>>>>> Ok.
>>>>>>
>>>>>>
>>>>>>>> I saw you introduce
>>>>>>>> ops matching helper but it's not friendly to management.
>>>>>>> The ops matching helper is just to check whether a given
>>>>>>> vfio-device is based on a mdev device.
>>>>>>>
>>>>>>>> 3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
>>>>>>>> assumes the parameter comes from userspace, it prevents support kernel
>>>>>>>> virtio drivers.
>>>>>>>>
>>>>>>>> 4) So comes the idea of method 3, since it register a new vhost-mdev driver,
>>>>>>>> we can use device specific ops instead of VFIO ones, then we can have a
>>>>>>>> common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
>>>>>>> As the above draft shows, this requires introducing a new
>>>>>>> VFIO device driver. I think Alex's opinion matters here.
>>>> Just to clarify, a new type of mdev driver but provides dummy
>>>> vfio_device_ops for VFIO to make container DMA ioctl work.
>>> I see. Thanks! IIUC, you mean we can provide a very tiny
>>> VFIO device driver in drivers/vhost/mdev.c, e.g.:
>>>
>>> static int vfio_vhost_mdev_open(void *device_data)
>>> {
>>> if (!try_module_get(THIS_MODULE))
>>> return -ENODEV;
>>> return 0;
>>> }
>>>
>>> static void vfio_vhost_mdev_release(void *device_data)
>>> {
>>> module_put(THIS_MODULE);
>>> }
>>>
>>> static const struct vfio_device_ops vfio_vhost_mdev_dev_ops = {
>>> .name = "vfio-vhost-mdev",
>>> .open = vfio_vhost_mdev_open,
>>> .release = vfio_vhost_mdev_release,
>>> };
>>>
>>> static int vhost_mdev_probe(struct device *dev)
>>> {
>>> struct mdev_device *mdev = to_mdev_device(dev);
>>>
>>> ... Check the mdev device_id proposed in ...
>>> ... https://lkml.org/lkml/2019/9/12/151 ...
>>
>> To clarify, this should be done through the id_table fields in
>> vhost_mdev_driver, and it should claim it supports virtio-mdev device only:
>>
>>
>> static struct mdev_class_id id_table[] = {
>> { MDEV_ID_VIRTIO },
>> { 0 },
>> };
>>
>>
>> static struct mdev_driver vhost_mdev_driver = {
>> ...
>> .id_table = id_table,
>> }
> In this way, both of virtio-mdev and vhost-mdev will try to
> take this device. We may want a way to let vhost-mdev take this
> device only when users explicitly ask it to do it. Or maybe we
> can have a different MDEV_ID for vhost-mdev but share the device
> ops with virtio-mdev.
I think it's similar to virtio-pci vs vfio-pci. User can choose to
switch the driver through bind/unbind.
>
>>
>>> return vfio_add_group_dev(dev, &vfio_vhost_mdev_dev_ops, mdev);
>>
>> And in vfio_vhost_mdev_ops, all its need is to just implement vhost-net
>> ioctl and translate them to virtio-mdev transport (e.g device_ops I proposed
>> or ioctls other whatever other method) API.
> I see, so my previous understanding is basically correct:
>
> https://lkml.org/lkml/2019/9/17/332
>
> I.e. we won't have a separate vhost fd and we will do all vhost
> ioctls on the VFIO device fd backed by this new VFIO driver.
Yes.
Thanks
>
>> And it could have a dummy ops
>> implementation for the other device_ops.
>>
>>
>>> }
>>>
>>> static void vhost_mdev_remove(struct device *dev)
>>> {
>>> vfio_del_group_dev(dev);
>>> }
>>>
>>> static struct mdev_driver vhost_mdev_driver = {
>>> .name = "vhost_mdev",
>>> .probe = vhost_mdev_probe,
>>> .remove = vhost_mdev_remove,
>>> };
>>>
>>> So we can bind above mdev driver to the virtio-mdev compatible
>>> mdev devices when we want to use vhost-mdev.
>>>
>>> After binding above driver to the mdev device, we can setup IOMMU
>>> via VFIO and get VFIO device fd of this mdev device, and pass it
>>> to vhost fd (/dev/vhost-mdev) with a SET_BACKEND ioctl.
>>
>> Then what vhost-mdev char device did is just forwarding ioctl back to this
>> vfio device fd which seems a overkill. It's simpler that just do ioctl on
>> the device ops directly.
> Yes.
>
> Thanks,
> Tiwei
>
>
>> Thanks
>>
>>
>>> Thanks,
>>> Tiwei
>>>
>>>> Thanks
>>>>
>>>>
>>>>>> Yes, it is.
>>>>>>
>>>>>> Thanks
>>>>>>
>>>>>>
On Tue, Sep 17, 2019 at 03:26:30PM +0800, Jason Wang wrote:
> On 2019/9/17 上午9:02, Tiwei Bie wrote:
> > diff --git a/drivers/vhost/mdev.c b/drivers/vhost/mdev.c
> > new file mode 100644
> > index 000000000000..8c6597aff45e
> > --- /dev/null
> > +++ b/drivers/vhost/mdev.c
> > @@ -0,0 +1,462 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright (C) 2018-2019 Intel Corporation.
> > + */
> > +
> > +#include <linux/compat.h>
> > +#include <linux/kernel.h>
> > +#include <linux/miscdevice.h>
> > +#include <linux/mdev.h>
> > +#include <linux/module.h>
> > +#include <linux/vfio.h>
> > +#include <linux/vhost.h>
> > +#include <linux/virtio_mdev.h>
> > +
> > +#include "vhost.h"
> > +
> > +struct vhost_mdev {
> > + struct mutex mutex;
> > + struct vhost_dev dev;
> > + struct vhost_virtqueue *vqs;
> > + int nvqs;
> > + u64 state;
> > + u64 features;
> > + u64 acked_features;
> > + struct vfio_group *vfio_group;
> > + struct vfio_device *vfio_device;
> > + struct mdev_device *mdev;
> > +};
> > +
> > +/*
> > + * XXX
> > + * We assume virtio_mdev.ko exposes below symbols for now, as we
> > + * don't have a proper way to access parent ops directly yet.
> > + *
> > + * virtio_mdev_readl()
> > + * virtio_mdev_writel()
> > + */
> > +extern u32 virtio_mdev_readl(struct mdev_device *mdev, loff_t off);
> > +extern void virtio_mdev_writel(struct mdev_device *mdev, loff_t off, u32 val);
>
>
> Need to consider a better approach, I feel we should do it through some kind
> of mdev driver instead of talk to mdev device directly.
Yeah, a better approach is really needed here.
Besides, we may want a way to allow accessing the mdev
device_ops proposed in below series outside the
drivers/vfio/mdev/ directory.
https://lkml.org/lkml/2019/9/12/151
I.e. allow putting mdev drivers outside above directory.
> > +
> > + for (queue_id = 0; queue_id < m->nvqs; queue_id++) {
> > + vq = &m->vqs[queue_id];
> > +
> > + if (!vq->desc || !vq->avail || !vq->used)
> > + break;
> > +
> > + virtio_mdev_writel(mdev, VIRTIO_MDEV_QUEUE_NUM, vq->num);
> > +
> > + if (!vhost_translate_ring_addr(vq, (u64)vq->desc,
> > + vhost_get_desc_size(vq, vq->num),
> > + &addr))
> > + return -EINVAL;
>
>
> Interesting, any reason for doing such kinds of translation to HVA? I
> believe the add should already an IOVA that has been map by VFIO.
Currently, in the software based vhost-kernel and vhost-user
backends, QEMU will pass ring addresses as HVA in SET_VRING_ADDR
ioctl when iotlb isn't enabled. If it's OK to let QEMU pass GPA
in vhost-mdev in this case, then this translation won't be needed.
Thanks,
Tiwei
On 2019/9/19 下午11:45, Tiwei Bie wrote:
> On Thu, Sep 19, 2019 at 09:08:11PM +0800, Jason Wang wrote:
>> On 2019/9/18 下午10:32, Michael S. Tsirkin wrote:
>>>>>> So I have some questions:
>>>>>>
>>>>>> 1) Compared to method 2, what's the advantage of creating a new vhost char
>>>>>> device? I guess it's for keep the API compatibility?
>>>>> One benefit is that we can avoid doing vhost ioctls on
>>>>> VFIO device fd.
>>>> Yes, but any benefit from doing this?
>>> It does seem a bit more modular, but it's certainly not a big deal.
>> Ok, if we go this way, it could be as simple as provide some callback to
>> vhost, then vhost can just forward the ioctl through parent_ops.
>>
>>>>>> 2) For method 2, is there any easy way for user/admin to distinguish e.g
>>>>>> ordinary vfio-mdev for vhost from ordinary vfio-mdev?
>>>>> I think device-api could be a choice.
>>>> Ok.
>>>>
>>>>
>>>>>> I saw you introduce
>>>>>> ops matching helper but it's not friendly to management.
>>>>> The ops matching helper is just to check whether a given
>>>>> vfio-device is based on a mdev device.
>>>>>
>>>>>> 3) A drawback of 1) and 2) is that it must follow vfio_device_ops that
>>>>>> assumes the parameter comes from userspace, it prevents support kernel
>>>>>> virtio drivers.
>>>>>>
>>>>>> 4) So comes the idea of method 3, since it register a new vhost-mdev driver,
>>>>>> we can use device specific ops instead of VFIO ones, then we can have a
>>>>>> common API between vDPA parent and vhost-mdev/virtio-mdev drivers.
>>>>> As the above draft shows, this requires introducing a new
>>>>> VFIO device driver. I think Alex's opinion matters here.
>> Just to clarify, a new type of mdev driver but provides dummy
>> vfio_device_ops for VFIO to make container DMA ioctl work.
> I see. Thanks! IIUC, you mean we can provide a very tiny
> VFIO device driver in drivers/vhost/mdev.c, e.g.:
>
> static int vfio_vhost_mdev_open(void *device_data)
> {
> if (!try_module_get(THIS_MODULE))
> return -ENODEV;
> return 0;
> }
>
> static void vfio_vhost_mdev_release(void *device_data)
> {
> module_put(THIS_MODULE);
> }
>
> static const struct vfio_device_ops vfio_vhost_mdev_dev_ops = {
> .name = "vfio-vhost-mdev",
> .open = vfio_vhost_mdev_open,
> .release = vfio_vhost_mdev_release,
> };
>
> static int vhost_mdev_probe(struct device *dev)
> {
> struct mdev_device *mdev = to_mdev_device(dev);
>
> ... Check the mdev device_id proposed in ...
> ... https://lkml.org/lkml/2019/9/12/151 ...
>
> return vfio_add_group_dev(dev, &vfio_vhost_mdev_dev_ops, mdev);
> }
>
> static void vhost_mdev_remove(struct device *dev)
> {
> vfio_del_group_dev(dev);
> }
>
> static struct mdev_driver vhost_mdev_driver = {
> .name = "vhost_mdev",
> .probe = vhost_mdev_probe,
> .remove = vhost_mdev_remove,
> };
>
> So we can bind above mdev driver to the virtio-mdev compatible
> mdev devices when we want to use vhost-mdev.
>
> After binding above driver to the mdev device, we can setup IOMMU
> via VFIO and get VFIO device fd of this mdev device, and pass it
> to vhost fd (/dev/vhost-mdev) with a SET_BACKEND ioctl.
>
> Thanks,
> Tiwei
Yes, something like this.
Thanks
>> Thanks
>>
>>
>>>> Yes, it is.
>>>>
>>>> Thanks
>>>>
>>>>