2020-01-31 03:38:44

by Tiwei Bie

[permalink] [raw]
Subject: [PATCH] vhost: introduce vDPA based backend

This patch introduces a vDPA based vhost backend. This
backend is built on top of the same interface defined
in virtio-vDPA and provides a generic vhost interface
for userspace to accelerate the virtio devices in guest.

This backend is implemented as a vDPA device driver on
top of the same ops used in virtio-vDPA. It will create
char device entry named vhost-vdpa/$vdpa_device_index
for userspace to use. Userspace can use vhost ioctls on
top of this char device to setup the backend.

Signed-off-by: Tiwei Bie <[email protected]>
---
This patch depends on below series:
https://lkml.org/lkml/2020/1/16/353

Please note that _SET_MEM_TABLE isn't fully supported yet.
Comments would be appreciated!

Changes since last patch (https://lkml.org/lkml/2019/11/18/1068)
- Switch to the vDPA bus;
- Switch to vhost's own chardev;

drivers/vhost/Kconfig | 12 +
drivers/vhost/Makefile | 3 +
drivers/vhost/vdpa.c | 705 +++++++++++++++++++++++++++++++
include/uapi/linux/vhost.h | 21 +
include/uapi/linux/vhost_types.h | 8 +
5 files changed, 749 insertions(+)
create mode 100644 drivers/vhost/vdpa.c

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index f21c45aa5e07..13e6a94d0243 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -34,6 +34,18 @@ config VHOST_VSOCK
To compile this driver as a module, choose M here: the module will be called
vhost_vsock.

+config VHOST_VDPA
+ tristate "Vhost driver for vDPA based backend"
+ depends on EVENTFD && VDPA
+ select VHOST
+ default n
+ ---help---
+ This kernel module can be loaded in host kernel to accelerate
+ guest virtio devices with the vDPA based backends.
+
+ To compile this driver as a module, choose M here: the module
+ will be called vhost_vdpa.
+
config VHOST
tristate
depends on VHOST_IOTLB
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index df99756fbb26..a65e9f4a2c0a 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -10,6 +10,9 @@ vhost_vsock-y := vsock.o

obj-$(CONFIG_VHOST_RING) += vringh.o

+obj-$(CONFIG_VHOST_VDPA) += vhost_vdpa.o
+vhost_vdpa-y := vdpa.o
+
obj-$(CONFIG_VHOST) += vhost.o

obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
new file mode 100644
index 000000000000..631d994d37ac
--- /dev/null
+++ b/drivers/vhost/vdpa.c
@@ -0,0 +1,705 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018-2020 Intel Corporation.
+ *
+ * Author: Tiwei Bie <[email protected]>
+ *
+ * Thanks to Jason Wang and Michael S. Tsirkin for the valuable
+ * comments and suggestions. And thanks to Cunming Liang and
+ * Zhihong Wang for all their supports.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/uuid.h>
+#include <linux/vdpa.h>
+#include <linux/nospec.h>
+#include <linux/vhost.h>
+#include <linux/virtio_net.h>
+
+#include "vhost.h"
+
+enum {
+ VHOST_VDPA_FEATURES =
+ (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
+ (1ULL << VIRTIO_F_ANY_LAYOUT) |
+ (1ULL << VIRTIO_F_VERSION_1) |
+ (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+ (1ULL << VIRTIO_F_RING_PACKED) |
+ (1ULL << VIRTIO_F_ORDER_PLATFORM) |
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
+ (1ULL << VIRTIO_RING_F_EVENT_IDX),
+
+ VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES |
+ (1ULL << VIRTIO_NET_F_CSUM) |
+ (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
+ (1ULL << VIRTIO_NET_F_MTU) |
+ (1ULL << VIRTIO_NET_F_MAC) |
+ (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
+ (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
+ (1ULL << VIRTIO_NET_F_GUEST_ECN) |
+ (1ULL << VIRTIO_NET_F_GUEST_UFO) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO4) |
+ (1ULL << VIRTIO_NET_F_HOST_TSO6) |
+ (1ULL << VIRTIO_NET_F_HOST_ECN) |
+ (1ULL << VIRTIO_NET_F_HOST_UFO) |
+ (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+ (1ULL << VIRTIO_NET_F_STATUS) |
+ (1ULL << VIRTIO_NET_F_SPEED_DUPLEX),
+};
+
+/* Currently, only network backend w/o multiqueue is supported. */
+#define VHOST_VDPA_VQ_MAX 2
+
+struct vhost_vdpa {
+ /* The lock is to protect this structure. */
+ struct mutex mutex;
+ struct vhost_dev vdev;
+ struct vhost_virtqueue *vqs;
+ struct vdpa_device *vdpa;
+ struct device *dev;
+ atomic_t opened;
+ int nvqs;
+ int virtio_id;
+ int minor;
+};
+
+static struct {
+ /* The lock is to protect this structure. */
+ struct mutex mutex;
+ struct class *class;
+ struct idr idr;
+ struct cdev cdev;
+ dev_t devt;
+ wait_queue_head_t release_q;
+} vhost_vdpa;
+
+static const u64 vhost_vdpa_features[] = {
+ [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES,
+};
+
+static int vhost_vdpa_alloc_minor(struct vhost_vdpa *v)
+{
+ return idr_alloc(&vhost_vdpa.idr, v, 0, MINORMASK + 1,
+ GFP_KERNEL);
+}
+
+static void vhost_vdpa_free_minor(int minor)
+{
+ idr_remove(&vhost_vdpa.idr, minor);
+}
+
+static struct vhost_vdpa *vhost_vdpa_get_from_minor(int minor)
+{
+ struct vhost_vdpa *v;
+
+ mutex_lock(&vhost_vdpa.mutex);
+ v = idr_find(&vhost_vdpa.idr, minor);
+ mutex_unlock(&vhost_vdpa.mutex);
+
+ return v;
+}
+
+static void handle_vq_kick(struct vhost_work *work)
+{
+ struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
+ poll.work);
+ struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev);
+ const struct vdpa_config_ops *ops = v->vdpa->config;
+
+ ops->kick_vq(v->vdpa, vq - v->vqs);
+}
+
+static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
+{
+ struct vhost_virtqueue *vq = private;
+ struct eventfd_ctx *call_ctx = vq->call_ctx;
+
+ if (call_ctx)
+ eventfd_signal(call_ctx, 1);
+
+ return IRQ_HANDLED;
+}
+
+static void vhost_vdpa_reset(struct vhost_vdpa *v)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+
+ ops->set_status(vdpa, 0);
+}
+
+static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u32 device_id;
+
+ device_id = ops->get_device_id(vdpa);
+
+ if (copy_to_user(argp, &device_id, sizeof(device_id)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u8 status;
+
+ status = ops->get_status(vdpa);
+
+ if (copy_to_user(statusp, &status, sizeof(status)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u8 status;
+
+ if (copy_from_user(&status, statusp, sizeof(status)))
+ return -EFAULT;
+
+ /*
+ * Userspace shouldn't remove status bits unless reset the
+ * status to 0.
+ */
+ if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
+ return -EINVAL;
+
+ ops->set_status(vdpa, status);
+
+ return 0;
+}
+
+static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
+ struct vhost_vdpa_config *c)
+{
+ long size = 0;
+
+ switch (v->virtio_id) {
+ case VIRTIO_ID_NET:
+ size = sizeof(struct virtio_net_config);
+ break;
+ }
+
+ if (c->len == 0)
+ return -EINVAL;
+
+ if (c->len > size - c->off)
+ return -E2BIG;
+
+ return 0;
+}
+
+static long vhost_vdpa_get_config(struct vhost_vdpa *v,
+ struct vhost_vdpa_config __user *c)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vhost_vdpa_config config;
+ unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+ u8 *buf;
+
+ if (copy_from_user(&config, c, size))
+ return -EFAULT;
+ if (vhost_vdpa_config_validate(v, &config))
+ return -EINVAL;
+ buf = kvzalloc(config.len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ ops->get_config(vdpa, config.off, buf, config.len);
+
+ if (copy_to_user(c->buf, buf, config.len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ kvfree(buf);
+ return 0;
+}
+
+static long vhost_vdpa_set_config(struct vhost_vdpa *v,
+ struct vhost_vdpa_config __user *c)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vhost_vdpa_config config;
+ unsigned long size = offsetof(struct vhost_vdpa_config, buf);
+ u8 *buf;
+
+ if (copy_from_user(&config, c, size))
+ return -EFAULT;
+ if (vhost_vdpa_config_validate(v, &config))
+ return -EINVAL;
+ buf = kvzalloc(config.len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, c->buf, config.len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ ops->set_config(vdpa, config.off, buf, config.len);
+
+ kvfree(buf);
+ return 0;
+}
+
+static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u64 features;
+
+ features = ops->get_features(vdpa);
+ features &= vhost_vdpa_features[v->virtio_id];
+
+ if (copy_to_user(featurep, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u64 features;
+
+ /*
+ * It's not allowed to change the features after they have
+ * been negotiated.
+ */
+ if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
+ return -EBUSY;
+
+ if (copy_from_user(&features, featurep, sizeof(features)))
+ return -EFAULT;
+
+ if (features & ~vhost_vdpa_features[v->virtio_id])
+ return -EINVAL;
+
+ if (ops->set_features(vdpa, features))
+ return -EINVAL;
+
+ return 0;
+}
+
+static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ u16 num;
+
+ num = ops->get_vq_num_max(vdpa);
+
+ if (copy_to_user(argp, &num, sizeof(num)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
+ void __user *argp)
+{
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vdpa_callback cb;
+ struct vhost_virtqueue *vq;
+ struct vhost_vring_state s;
+ u8 status;
+ u32 idx;
+ long r;
+
+ r = get_user(idx, (u32 __user *)argp);
+ if (r < 0)
+ return r;
+ if (idx >= v->nvqs)
+ return -ENOBUFS;
+
+ idx = array_index_nospec(idx, v->nvqs);
+ vq = &v->vqs[idx];
+
+ status = ops->get_status(vdpa);
+
+ /*
+ * It's not allowed to detect and program vqs before
+ * features negotiation or after enabling driver.
+ */
+ if (!(status & VIRTIO_CONFIG_S_FEATURES_OK) ||
+ (status & VIRTIO_CONFIG_S_DRIVER_OK))
+ return -EBUSY;
+
+ if (cmd == VHOST_VDPA_SET_VRING_ENABLE) {
+ if (copy_from_user(&s, argp, sizeof(s)))
+ return -EFAULT;
+ ops->set_vq_ready(vdpa, idx, s.num);
+ return 0;
+ }
+
+ /*
+ * It's not allowed to detect and program vqs with
+ * vqs enabled.
+ */
+ if (ops->get_vq_ready(vdpa, idx))
+ return -EBUSY;
+
+ if (cmd == VHOST_GET_VRING_BASE)
+ vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx);
+
+ r = vhost_vring_ioctl(&v->vdev, cmd, argp);
+ if (r)
+ return r;
+
+ switch (cmd) {
+ case VHOST_SET_VRING_ADDR:
+ if (ops->set_vq_address(vdpa, idx, (u64)vq->desc,
+ (u64)vq->avail, (u64)vq->used))
+ r = -EINVAL;
+ break;
+
+ case VHOST_SET_VRING_BASE:
+ if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx))
+ r = -EINVAL;
+ break;
+
+ case VHOST_SET_VRING_CALL:
+ if (vq->call_ctx) {
+ cb.callback = vhost_vdpa_virtqueue_cb;
+ cb.private = vq;
+ } else {
+ cb.callback = NULL;
+ cb.private = NULL;
+ }
+ ops->set_vq_cb(vdpa, idx, &cb);
+ break;
+
+ case VHOST_SET_VRING_NUM:
+ ops->set_vq_num(vdpa, idx, vq->num);
+ break;
+ }
+
+ return r;
+}
+
+static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v)
+{
+ /* TODO: fix this */
+ return -ENOTSUPP;
+}
+
+static int vhost_vdpa_open(struct inode *inode, struct file *filep)
+{
+ struct vhost_vdpa *v;
+ struct vhost_dev *dev;
+ struct vhost_virtqueue **vqs;
+ int nvqs, i, r, opened;
+
+ v = vhost_vdpa_get_from_minor(iminor(inode));
+ if (!v)
+ return -ENODEV;
+
+ opened = atomic_cmpxchg(&v->opened, 0, 1);
+ if (opened) {
+ r = -EBUSY;
+ goto err;
+ }
+
+ nvqs = v->nvqs;
+ vhost_vdpa_reset(v);
+
+ vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
+ if (!vqs) {
+ r = -ENOMEM;
+ goto err;
+ }
+
+ dev = &v->vdev;
+ for (i = 0; i < nvqs; i++) {
+ vqs[i] = &v->vqs[i];
+ vqs[i]->handle_kick = handle_vq_kick;
+ }
+ vhost_dev_init(dev, vqs, nvqs, 0, 0, 0);
+
+ filep->private_data = v;
+
+ return 0;
+
+err:
+ atomic_dec(&v->opened);
+ return r;
+}
+
+static int vhost_vdpa_release(struct inode *inode, struct file *filep)
+{
+ struct vhost_vdpa *v = filep->private_data;
+
+ mutex_lock(&v->mutex);
+ filep->private_data = NULL;
+ vhost_vdpa_reset(v);
+ vhost_dev_stop(&v->vdev);
+ vhost_dev_cleanup(&v->vdev);
+ kfree(v->vdev.vqs);
+ mutex_unlock(&v->mutex);
+
+ atomic_dec(&v->opened);
+ wake_up(&vhost_vdpa.release_q);
+
+ return 0;
+}
+
+static long vhost_vdpa_unlocked_ioctl(struct file *filep,
+ unsigned int cmd, unsigned long arg)
+{
+ struct vhost_vdpa *v = filep->private_data;
+ struct vdpa_device *vdpa = v->vdpa;
+ const struct vdpa_config_ops *ops = vdpa->config;
+ void __user *argp = (void __user *)arg;
+ long r;
+
+ mutex_lock(&v->mutex);
+
+ switch (cmd) {
+ case VHOST_VDPA_GET_DEVICE_ID:
+ r = vhost_vdpa_get_device_id(v, argp);
+ break;
+ case VHOST_VDPA_GET_STATUS:
+ r = vhost_vdpa_get_status(v, argp);
+ break;
+ case VHOST_VDPA_SET_STATUS:
+ r = vhost_vdpa_set_status(v, argp);
+ break;
+ case VHOST_VDPA_GET_CONFIG:
+ r = vhost_vdpa_get_config(v, argp);
+ break;
+ case VHOST_VDPA_SET_CONFIG:
+ r = vhost_vdpa_set_config(v, argp);
+ break;
+ case VHOST_GET_FEATURES:
+ r = vhost_vdpa_get_features(v, argp);
+ break;
+ case VHOST_SET_FEATURES:
+ r = vhost_vdpa_set_features(v, argp);
+ break;
+ case VHOST_VDPA_GET_VRING_NUM:
+ r = vhost_vdpa_get_vring_num(v, argp);
+ break;
+ case VHOST_SET_LOG_BASE:
+ case VHOST_SET_LOG_FD:
+ r = -ENOIOCTLCMD;
+ break;
+ default:
+ mutex_lock(&v->vdev.mutex);
+ r = vhost_dev_ioctl(&v->vdev, cmd, argp);
+ if (r == -ENOIOCTLCMD)
+ r = vhost_vdpa_vring_ioctl(v, cmd, argp);
+ mutex_unlock(&v->vdev.mutex);
+ break;
+ }
+
+ if (r)
+ goto out;
+
+ switch (cmd) {
+ case VHOST_SET_MEM_TABLE:
+ if (ops->set_map)
+ r = ops->set_map(vdpa, v->vdev.umem);
+ else
+ r = vhost_vdpa_do_dma_mapping(v);
+ break;
+ }
+
+out:
+ mutex_unlock(&v->mutex);
+ return r;
+}
+
+static const struct file_operations vhost_vdpa_fops = {
+ .owner = THIS_MODULE,
+ .open = vhost_vdpa_open,
+ .release = vhost_vdpa_release,
+ .unlocked_ioctl = vhost_vdpa_unlocked_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
+
+static int vhost_vdpa_probe(struct device *dev)
+{
+ struct vdpa_device *vdpa = dev_to_vdpa(dev);
+ const struct vdpa_config_ops *ops = vdpa->config;
+ struct vhost_vdpa *v;
+ struct device *d;
+ int minor, nvqs;
+ int r;
+
+ /* Currently, we only accept the network devices. */
+ if (ops->get_device_id(vdpa) != VIRTIO_ID_NET) {
+ r = -ENOTSUPP;
+ goto err;
+ }
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!v) {
+ r = -ENOMEM;
+ goto err;
+ }
+
+ nvqs = VHOST_VDPA_VQ_MAX;
+
+ v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue),
+ GFP_KERNEL);
+ if (!v->vqs) {
+ r = -ENOMEM;
+ goto err_alloc_vqs;
+ }
+
+ mutex_init(&v->mutex);
+ atomic_set(&v->opened, 0);
+
+ v->vdpa = vdpa;
+ v->nvqs = nvqs;
+ v->virtio_id = ops->get_device_id(vdpa);
+
+ mutex_lock(&vhost_vdpa.mutex);
+
+ minor = vhost_vdpa_alloc_minor(v);
+ if (minor < 0) {
+ r = minor;
+ goto err_alloc_minor;
+ }
+
+ d = device_create(vhost_vdpa.class, NULL,
+ MKDEV(MAJOR(vhost_vdpa.devt), minor),
+ v, "%d", vdpa->index);
+ if (IS_ERR(d)) {
+ r = PTR_ERR(d);
+ goto err_device_create;
+ }
+
+ mutex_unlock(&vhost_vdpa.mutex);
+
+ v->dev = d;
+ v->minor = minor;
+ dev_set_drvdata(dev, v);
+
+ return 0;
+
+err_device_create:
+ vhost_vdpa_free_minor(minor);
+err_alloc_minor:
+ mutex_unlock(&vhost_vdpa.mutex);
+ kfree(v->vqs);
+err_alloc_vqs:
+ kfree(v);
+err:
+ return r;
+}
+
+static void vhost_vdpa_remove(struct device *dev)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct vhost_vdpa *v = dev_get_drvdata(dev);
+ int opened;
+
+ add_wait_queue(&vhost_vdpa.release_q, &wait);
+
+ do {
+ opened = atomic_cmpxchg(&v->opened, 0, 1);
+ if (!opened)
+ break;
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
+ } while (1);
+
+ remove_wait_queue(&vhost_vdpa.release_q, &wait);
+
+ mutex_lock(&vhost_vdpa.mutex);
+ device_destroy(vhost_vdpa.class,
+ MKDEV(MAJOR(vhost_vdpa.devt), v->minor));
+ vhost_vdpa_free_minor(v->minor);
+ mutex_unlock(&vhost_vdpa.mutex);
+ kfree(v->vqs);
+ kfree(v);
+}
+
+static struct vdpa_driver vhost_vdpa_driver = {
+ .drv = {
+ .name = "vhost_vdpa",
+ },
+ .probe = vhost_vdpa_probe,
+ .remove = vhost_vdpa_remove,
+};
+
+static char *vhost_vdpa_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "vhost-vdpa/%s", dev_name(dev));
+}
+
+static int __init vhost_vdpa_init(void)
+{
+ int r;
+
+ idr_init(&vhost_vdpa.idr);
+ mutex_init(&vhost_vdpa.mutex);
+ init_waitqueue_head(&vhost_vdpa.release_q);
+
+ /* /dev/vhost-vdpa/$vdpa_device_index */
+ vhost_vdpa.class = class_create(THIS_MODULE, "vhost-vdpa");
+ if (IS_ERR(vhost_vdpa.class)) {
+ r = PTR_ERR(vhost_vdpa.class);
+ goto err_class;
+ }
+
+ vhost_vdpa.class->devnode = vhost_vdpa_devnode;
+
+ r = alloc_chrdev_region(&vhost_vdpa.devt, 0, MINORMASK + 1,
+ "vhost-vdpa");
+ if (r)
+ goto err_alloc_chrdev;
+
+ cdev_init(&vhost_vdpa.cdev, &vhost_vdpa_fops);
+ r = cdev_add(&vhost_vdpa.cdev, vhost_vdpa.devt, MINORMASK + 1);
+ if (r)
+ goto err_cdev_add;
+
+ r = register_vdpa_driver(&vhost_vdpa_driver);
+ if (r)
+ goto err_register_vdpa_driver;
+
+ return 0;
+
+err_register_vdpa_driver:
+ cdev_del(&vhost_vdpa.cdev);
+err_cdev_add:
+ unregister_chrdev_region(vhost_vdpa.devt, MINORMASK + 1);
+err_alloc_chrdev:
+ class_destroy(vhost_vdpa.class);
+err_class:
+ idr_destroy(&vhost_vdpa.idr);
+ return r;
+}
+module_init(vhost_vdpa_init);
+
+static void __exit vhost_vdpa_exit(void)
+{
+ unregister_vdpa_driver(&vhost_vdpa_driver);
+ cdev_del(&vhost_vdpa.cdev);
+ unregister_chrdev_region(vhost_vdpa.devt, MINORMASK + 1);
+ class_destroy(vhost_vdpa.class);
+ idr_destroy(&vhost_vdpa.idr);
+}
+module_exit(vhost_vdpa_exit);
+
+MODULE_VERSION("0.0.1");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("vDPA based vhost backend for virtio");
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index 40d028eed645..23f6db2106f5 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -116,4 +116,25 @@
#define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64)
#define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int)

+/* VHOST_VDPA specific defines */
+
+/* Get the device id. The device ids follow the same definition of
+ * the device id defined in virtio-spec. */
+#define VHOST_VDPA_GET_DEVICE_ID _IOR(VHOST_VIRTIO, 0x70, __u32)
+/* Get and set the status. The status bits follow the same definition
+ * of the device status defined in virtio-spec. */
+#define VHOST_VDPA_GET_STATUS _IOR(VHOST_VIRTIO, 0x71, __u8)
+#define VHOST_VDPA_SET_STATUS _IOW(VHOST_VIRTIO, 0x72, __u8)
+/* Get and set the device config. The device config follows the same
+ * definition of the device config defined in virtio-spec. */
+#define VHOST_VDPA_GET_CONFIG _IOR(VHOST_VIRTIO, 0x73, \
+ struct vhost_vdpa_config)
+#define VHOST_VDPA_SET_CONFIG _IOW(VHOST_VIRTIO, 0x74, \
+ struct vhost_vdpa_config)
+/* Enable/disable the ring. */
+#define VHOST_VDPA_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x75, \
+ struct vhost_vring_state)
+/* Get the max ring size. */
+#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16)
+
#endif
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index c907290ff065..669457ce5c48 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -119,6 +119,14 @@ struct vhost_scsi_target {
unsigned short reserved;
};

+/* VHOST_VDPA specific definitions */
+
+struct vhost_vdpa_config {
+ __u32 off;
+ __u32 len;
+ __u8 buf[0];
+};
+
/* Feature bits */
/* Log all write descriptors. Can be changed while device is active. */
#define VHOST_F_LOG_ALL 26
--
2.24.1


2020-01-31 03:58:02

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

Hi,

On 1/30/20 7:36 PM, Tiwei Bie wrote:
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index f21c45aa5e07..13e6a94d0243 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -34,6 +34,18 @@ config VHOST_VSOCK
> To compile this driver as a module, choose M here: the module will be called
> vhost_vsock.
>
> +config VHOST_VDPA
> + tristate "Vhost driver for vDPA based backend"
> + depends on EVENTFD && VDPA
> + select VHOST
> + default n
> + ---help---
> + This kernel module can be loaded in host kernel to accelerate
> + guest virtio devices with the vDPA based backends.

vDPA-based

> +
> + To compile this driver as a module, choose M here: the module
> + will be called vhost_vdpa.
> +

The preferred Kconfig style nowadays is
(a) use "help" instead of "---help---"
(b) indent the help text with one tab + 2 spaces

and don't use "default n" since that is already the default.

> config VHOST
> tristate
> depends on VHOST_IOTLB

thanks.
--
~Randy

2020-01-31 05:14:30

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On 1/30/20 7:56 PM, Randy Dunlap wrote:
> Hi,
>
> On 1/30/20 7:36 PM, Tiwei Bie wrote:
>> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
>> index f21c45aa5e07..13e6a94d0243 100644
>> --- a/drivers/vhost/Kconfig
>> +++ b/drivers/vhost/Kconfig
>> @@ -34,6 +34,18 @@ config VHOST_VSOCK
>> To compile this driver as a module, choose M here: the module will be called
>> vhost_vsock.
>>
>> +config VHOST_VDPA
>> + tristate "Vhost driver for vDPA based backend"

oops, missed this one:
vDPA-based

>> + depends on EVENTFD && VDPA
>> + select VHOST
>> + default n
>> + ---help---
>> + This kernel module can be loaded in host kernel to accelerate
>> + guest virtio devices with the vDPA based backends.
>
> vDPA-based
>
>> +
>> + To compile this driver as a module, choose M here: the module
>> + will be called vhost_vdpa.
>> +
>
> The preferred Kconfig style nowadays is
> (a) use "help" instead of "---help---"
> (b) indent the help text with one tab + 2 spaces
>
> and don't use "default n" since that is already the default.
>
>> config VHOST
>> tristate
>> depends on VHOST_IOTLB
>
> thanks.
>


--
~Randy

2020-01-31 05:53:30

by Tiwei Bie

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Thu, Jan 30, 2020 at 07:56:43PM -0800, Randy Dunlap wrote:
> Hi,
>
> On 1/30/20 7:36 PM, Tiwei Bie wrote:
> > diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> > index f21c45aa5e07..13e6a94d0243 100644
> > --- a/drivers/vhost/Kconfig
> > +++ b/drivers/vhost/Kconfig
> > @@ -34,6 +34,18 @@ config VHOST_VSOCK
> > To compile this driver as a module, choose M here: the module will be called
> > vhost_vsock.
> >
> > +config VHOST_VDPA
> > + tristate "Vhost driver for vDPA based backend"
> > + depends on EVENTFD && VDPA
> > + select VHOST
> > + default n
> > + ---help---
> > + This kernel module can be loaded in host kernel to accelerate
> > + guest virtio devices with the vDPA based backends.
>
> vDPA-based

Will fix this and other similar ones in the patch. Thanks!

>
> > +
> > + To compile this driver as a module, choose M here: the module
> > + will be called vhost_vdpa.
> > +
>
> The preferred Kconfig style nowadays is
> (a) use "help" instead of "---help---"
> (b) indent the help text with one tab + 2 spaces
>
> and don't use "default n" since that is already the default.

Will fix in the next version.

Thanks,
Tiwei

>
> > config VHOST
> > tristate
> > depends on VHOST_IOTLB
>
> thanks.
> --
> ~Randy
>

2020-01-31 05:55:22

by Tiwei Bie

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Thu, Jan 30, 2020 at 09:12:57PM -0800, Randy Dunlap wrote:
> On 1/30/20 7:56 PM, Randy Dunlap wrote:
> > Hi,
> >
> > On 1/30/20 7:36 PM, Tiwei Bie wrote:
> >> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> >> index f21c45aa5e07..13e6a94d0243 100644
> >> --- a/drivers/vhost/Kconfig
> >> +++ b/drivers/vhost/Kconfig
> >> @@ -34,6 +34,18 @@ config VHOST_VSOCK
> >> To compile this driver as a module, choose M here: the module will be called
> >> vhost_vsock.
> >>
> >> +config VHOST_VDPA
> >> + tristate "Vhost driver for vDPA based backend"
>
> oops, missed this one:
> vDPA-based

Will fix. Thanks!

>
> >> + depends on EVENTFD && VDPA
> >> + select VHOST
> >> + default n
> >> + ---help---
> >> + This kernel module can be loaded in host kernel to accelerate
> >> + guest virtio devices with the vDPA based backends.
> >
> > vDPA-based
> >
> >> +
> >> + To compile this driver as a module, choose M here: the module
> >> + will be called vhost_vdpa.
> >> +
> >
> > The preferred Kconfig style nowadays is
> > (a) use "help" instead of "---help---"
> > (b) indent the help text with one tab + 2 spaces
> >
> > and don't use "default n" since that is already the default.
> >
> >> config VHOST
> >> tristate
> >> depends on VHOST_IOTLB
> >
> > thanks.
> >
>
>
> --
> ~Randy
>

2020-02-04 03:32:54

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/1/31 上午11:36, Tiwei Bie wrote:
> This patch introduces a vDPA based vhost backend. This
> backend is built on top of the same interface defined
> in virtio-vDPA and provides a generic vhost interface
> for userspace to accelerate the virtio devices in guest.
>
> This backend is implemented as a vDPA device driver on
> top of the same ops used in virtio-vDPA. It will create
> char device entry named vhost-vdpa/$vdpa_device_index
> for userspace to use. Userspace can use vhost ioctls on
> top of this char device to setup the backend.
>
> Signed-off-by: Tiwei Bie <[email protected]>
> ---
> This patch depends on below series:
> https://lkml.org/lkml/2020/1/16/353
>
> Please note that _SET_MEM_TABLE isn't fully supported yet.
> Comments would be appreciated!
>
> Changes since last patch (https://lkml.org/lkml/2019/11/18/1068)
> - Switch to the vDPA bus;
> - Switch to vhost's own chardev;
>
> drivers/vhost/Kconfig | 12 +
> drivers/vhost/Makefile | 3 +
> drivers/vhost/vdpa.c | 705 +++++++++++++++++++++++++++++++
> include/uapi/linux/vhost.h | 21 +
> include/uapi/linux/vhost_types.h | 8 +
> 5 files changed, 749 insertions(+)
> create mode 100644 drivers/vhost/vdpa.c
>
> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> index f21c45aa5e07..13e6a94d0243 100644
> --- a/drivers/vhost/Kconfig
> +++ b/drivers/vhost/Kconfig
> @@ -34,6 +34,18 @@ config VHOST_VSOCK
> To compile this driver as a module, choose M here: the module will be called
> vhost_vsock.
>
> +config VHOST_VDPA
> + tristate "Vhost driver for vDPA based backend"
> + depends on EVENTFD && VDPA
> + select VHOST
> + default n
> + ---help---
> + This kernel module can be loaded in host kernel to accelerate
> + guest virtio devices with the vDPA based backends.
> +
> + To compile this driver as a module, choose M here: the module
> + will be called vhost_vdpa.
> +
> config VHOST
> tristate
> depends on VHOST_IOTLB
> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> index df99756fbb26..a65e9f4a2c0a 100644
> --- a/drivers/vhost/Makefile
> +++ b/drivers/vhost/Makefile
> @@ -10,6 +10,9 @@ vhost_vsock-y := vsock.o
>
> obj-$(CONFIG_VHOST_RING) += vringh.o
>
> +obj-$(CONFIG_VHOST_VDPA) += vhost_vdpa.o
> +vhost_vdpa-y := vdpa.o
> +
> obj-$(CONFIG_VHOST) += vhost.o
>
> obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> new file mode 100644
> index 000000000000..631d994d37ac
> --- /dev/null
> +++ b/drivers/vhost/vdpa.c
> @@ -0,0 +1,705 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018-2020 Intel Corporation.
> + *
> + * Author: Tiwei Bie <[email protected]>
> + *
> + * Thanks to Jason Wang and Michael S. Tsirkin for the valuable
> + * comments and suggestions. And thanks to Cunming Liang and
> + * Zhihong Wang for all their supports.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/cdev.h>
> +#include <linux/device.h>
> +#include <linux/uuid.h>
> +#include <linux/vdpa.h>
> +#include <linux/nospec.h>
> +#include <linux/vhost.h>
> +#include <linux/virtio_net.h>
> +
> +#include "vhost.h"
> +
> +enum {
> + VHOST_VDPA_FEATURES =
> + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
> + (1ULL << VIRTIO_F_ANY_LAYOUT) |
> + (1ULL << VIRTIO_F_VERSION_1) |
> + (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> + (1ULL << VIRTIO_F_RING_PACKED) |
> + (1ULL << VIRTIO_F_ORDER_PLATFORM) |
> + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
> + (1ULL << VIRTIO_RING_F_EVENT_IDX),
> +
> + VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES |
> + (1ULL << VIRTIO_NET_F_CSUM) |
> + (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
> + (1ULL << VIRTIO_NET_F_MTU) |
> + (1ULL << VIRTIO_NET_F_MAC) |
> + (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
> + (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
> + (1ULL << VIRTIO_NET_F_GUEST_ECN) |
> + (1ULL << VIRTIO_NET_F_GUEST_UFO) |
> + (1ULL << VIRTIO_NET_F_HOST_TSO4) |
> + (1ULL << VIRTIO_NET_F_HOST_TSO6) |
> + (1ULL << VIRTIO_NET_F_HOST_ECN) |
> + (1ULL << VIRTIO_NET_F_HOST_UFO) |
> + (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
> + (1ULL << VIRTIO_NET_F_STATUS) |
> + (1ULL << VIRTIO_NET_F_SPEED_DUPLEX),
> +};
> +
> +/* Currently, only network backend w/o multiqueue is supported. */
> +#define VHOST_VDPA_VQ_MAX 2
> +
> +struct vhost_vdpa {
> + /* The lock is to protect this structure. */
> + struct mutex mutex;
> + struct vhost_dev vdev;
> + struct vhost_virtqueue *vqs;
> + struct vdpa_device *vdpa;
> + struct device *dev;
> + atomic_t opened;
> + int nvqs;
> + int virtio_id;
> + int minor;
> +};
> +
> +static struct {
> + /* The lock is to protect this structure. */
> + struct mutex mutex;
> + struct class *class;
> + struct idr idr;
> + struct cdev cdev;
> + dev_t devt;
> + wait_queue_head_t release_q;
> +} vhost_vdpa;
> +
> +static const u64 vhost_vdpa_features[] = {
> + [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES,
> +};
> +
> +static int vhost_vdpa_alloc_minor(struct vhost_vdpa *v)
> +{
> + return idr_alloc(&vhost_vdpa.idr, v, 0, MINORMASK + 1,
> + GFP_KERNEL);
> +}
> +
> +static void vhost_vdpa_free_minor(int minor)
> +{
> + idr_remove(&vhost_vdpa.idr, minor);
> +}
> +
> +static struct vhost_vdpa *vhost_vdpa_get_from_minor(int minor)
> +{
> + struct vhost_vdpa *v;
> +
> + mutex_lock(&vhost_vdpa.mutex);
> + v = idr_find(&vhost_vdpa.idr, minor);
> + mutex_unlock(&vhost_vdpa.mutex);
> +
> + return v;
> +}
> +
> +static void handle_vq_kick(struct vhost_work *work)
> +{
> + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
> + poll.work);
> + struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev);
> + const struct vdpa_config_ops *ops = v->vdpa->config;
> +
> + ops->kick_vq(v->vdpa, vq - v->vqs);
> +}
> +
> +static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
> +{
> + struct vhost_virtqueue *vq = private;
> + struct eventfd_ctx *call_ctx = vq->call_ctx;
> +
> + if (call_ctx)
> + eventfd_signal(call_ctx, 1);
> +
> + return IRQ_HANDLED;
> +}
> +
> +static void vhost_vdpa_reset(struct vhost_vdpa *v)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> +
> + ops->set_status(vdpa, 0);
> +}
> +
> +static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + u32 device_id;
> +
> + device_id = ops->get_device_id(vdpa);
> +
> + if (copy_to_user(argp, &device_id, sizeof(device_id)))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + u8 status;
> +
> + status = ops->get_status(vdpa);
> +
> + if (copy_to_user(statusp, &status, sizeof(status)))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + u8 status;
> +
> + if (copy_from_user(&status, statusp, sizeof(status)))
> + return -EFAULT;
> +
> + /*
> + * Userspace shouldn't remove status bits unless reset the
> + * status to 0.
> + */
> + if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
> + return -EINVAL;
> +
> + ops->set_status(vdpa, status);
> +
> + return 0;
> +}
> +
> +static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
> + struct vhost_vdpa_config *c)
> +{
> + long size = 0;
> +
> + switch (v->virtio_id) {
> + case VIRTIO_ID_NET:
> + size = sizeof(struct virtio_net_config);
> + break;
> + }
> +
> + if (c->len == 0)
> + return -EINVAL;
> +
> + if (c->len > size - c->off)
> + return -E2BIG;
> +
> + return 0;
> +}
> +
> +static long vhost_vdpa_get_config(struct vhost_vdpa *v,
> + struct vhost_vdpa_config __user *c)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + struct vhost_vdpa_config config;
> + unsigned long size = offsetof(struct vhost_vdpa_config, buf);
> + u8 *buf;
> +
> + if (copy_from_user(&config, c, size))
> + return -EFAULT;
> + if (vhost_vdpa_config_validate(v, &config))
> + return -EINVAL;
> + buf = kvzalloc(config.len, GFP_KERNEL);
> + if (!buf)
> + return -ENOMEM;
> +
> + ops->get_config(vdpa, config.off, buf, config.len);
> +
> + if (copy_to_user(c->buf, buf, config.len)) {
> + kvfree(buf);
> + return -EFAULT;
> + }
> +
> + kvfree(buf);
> + return 0;
> +}
> +
> +static long vhost_vdpa_set_config(struct vhost_vdpa *v,
> + struct vhost_vdpa_config __user *c)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + struct vhost_vdpa_config config;
> + unsigned long size = offsetof(struct vhost_vdpa_config, buf);
> + u8 *buf;
> +
> + if (copy_from_user(&config, c, size))
> + return -EFAULT;
> + if (vhost_vdpa_config_validate(v, &config))
> + return -EINVAL;
> + buf = kvzalloc(config.len, GFP_KERNEL);
> + if (!buf)
> + return -ENOMEM;
> +
> + if (copy_from_user(buf, c->buf, config.len)) {
> + kvfree(buf);
> + return -EFAULT;
> + }
> +
> + ops->set_config(vdpa, config.off, buf, config.len);
> +
> + kvfree(buf);
> + return 0;
> +}
> +
> +static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + u64 features;
> +
> + features = ops->get_features(vdpa);
> + features &= vhost_vdpa_features[v->virtio_id];
> +
> + if (copy_to_user(featurep, &features, sizeof(features)))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + u64 features;
> +
> + /*
> + * It's not allowed to change the features after they have
> + * been negotiated.
> + */
> + if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
> + return -EBUSY;
> +
> + if (copy_from_user(&features, featurep, sizeof(features)))
> + return -EFAULT;
> +
> + if (features & ~vhost_vdpa_features[v->virtio_id])
> + return -EINVAL;
> +
> + if (ops->set_features(vdpa, features))
> + return -EINVAL;
> +
> + return 0;
> +}
> +
> +static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + u16 num;
> +
> + num = ops->get_vq_num_max(vdpa);
> +
> + if (copy_to_user(argp, &num, sizeof(num)))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
> + void __user *argp)
> +{
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + struct vdpa_callback cb;
> + struct vhost_virtqueue *vq;
> + struct vhost_vring_state s;
> + u8 status;
> + u32 idx;
> + long r;
> +
> + r = get_user(idx, (u32 __user *)argp);
> + if (r < 0)
> + return r;
> + if (idx >= v->nvqs)
> + return -ENOBUFS;
> +
> + idx = array_index_nospec(idx, v->nvqs);
> + vq = &v->vqs[idx];
> +
> + status = ops->get_status(vdpa);
> +
> + /*
> + * It's not allowed to detect and program vqs before
> + * features negotiation or after enabling driver.
> + */
> + if (!(status & VIRTIO_CONFIG_S_FEATURES_OK) ||
> + (status & VIRTIO_CONFIG_S_DRIVER_OK))
> + return -EBUSY;
> +
> + if (cmd == VHOST_VDPA_SET_VRING_ENABLE) {
> + if (copy_from_user(&s, argp, sizeof(s)))
> + return -EFAULT;
> + ops->set_vq_ready(vdpa, idx, s.num);
> + return 0;
> + }
> +
> + /*
> + * It's not allowed to detect and program vqs with
> + * vqs enabled.
> + */
> + if (ops->get_vq_ready(vdpa, idx))
> + return -EBUSY;
> +
> + if (cmd == VHOST_GET_VRING_BASE)
> + vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx);
> +
> + r = vhost_vring_ioctl(&v->vdev, cmd, argp);
> + if (r)
> + return r;
> +
> + switch (cmd) {
> + case VHOST_SET_VRING_ADDR:
> + if (ops->set_vq_address(vdpa, idx, (u64)vq->desc,
> + (u64)vq->avail, (u64)vq->used))
> + r = -EINVAL;
> + break;
> +
> + case VHOST_SET_VRING_BASE:
> + if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx))
> + r = -EINVAL;
> + break;
> +
> + case VHOST_SET_VRING_CALL:
> + if (vq->call_ctx) {
> + cb.callback = vhost_vdpa_virtqueue_cb;
> + cb.private = vq;
> + } else {
> + cb.callback = NULL;
> + cb.private = NULL;
> + }
> + ops->set_vq_cb(vdpa, idx, &cb);
> + break;
> +
> + case VHOST_SET_VRING_NUM:
> + ops->set_vq_num(vdpa, idx, vq->num);
> + break;
> + }
> +
> + return r;
> +}
> +
> +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v)
> +{
> + /* TODO: fix this */


Before trying to do this it looks to me we need the following during the
probe

1) if set_map() is not supported by the vDPA device probe the IOMMU that
is supported by the vDPA device
2) allocate IOMMU domain

And then:

3) pin pages through GUP and do proper accounting
4) store GPA->HPA mapping in the umem
5) generate diffs of memory table and using IOMMU API to setup the dma
mapping in this method

For 1), I'm not sure parent is sufficient for to doing this or need to
introduce new API like iommu_device in mdev.


> + return -ENOTSUPP;
> +}
> +
> +static int vhost_vdpa_open(struct inode *inode, struct file *filep)
> +{
> + struct vhost_vdpa *v;
> + struct vhost_dev *dev;
> + struct vhost_virtqueue **vqs;
> + int nvqs, i, r, opened;
> +
> + v = vhost_vdpa_get_from_minor(iminor(inode));
> + if (!v)
> + return -ENODEV;
> +
> + opened = atomic_cmpxchg(&v->opened, 0, 1);
> + if (opened) {
> + r = -EBUSY;
> + goto err;
> + }
> +
> + nvqs = v->nvqs;
> + vhost_vdpa_reset(v);
> +
> + vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
> + if (!vqs) {
> + r = -ENOMEM;
> + goto err;
> + }
> +
> + dev = &v->vdev;
> + for (i = 0; i < nvqs; i++) {
> + vqs[i] = &v->vqs[i];
> + vqs[i]->handle_kick = handle_vq_kick;
> + }
> + vhost_dev_init(dev, vqs, nvqs, 0, 0, 0);
> +
> + filep->private_data = v;
> +
> + return 0;
> +
> +err:
> + atomic_dec(&v->opened);
> + return r;
> +}
> +
> +static int vhost_vdpa_release(struct inode *inode, struct file *filep)
> +{
> + struct vhost_vdpa *v = filep->private_data;
> +
> + mutex_lock(&v->mutex);
> + filep->private_data = NULL;
> + vhost_vdpa_reset(v);
> + vhost_dev_stop(&v->vdev);
> + vhost_dev_cleanup(&v->vdev);
> + kfree(v->vdev.vqs);
> + mutex_unlock(&v->mutex);
> +
> + atomic_dec(&v->opened);
> + wake_up(&vhost_vdpa.release_q);
> +
> + return 0;
> +}
> +
> +static long vhost_vdpa_unlocked_ioctl(struct file *filep,
> + unsigned int cmd, unsigned long arg)
> +{
> + struct vhost_vdpa *v = filep->private_data;
> + struct vdpa_device *vdpa = v->vdpa;
> + const struct vdpa_config_ops *ops = vdpa->config;
> + void __user *argp = (void __user *)arg;
> + long r;
> +
> + mutex_lock(&v->mutex);
> +
> + switch (cmd) {
> + case VHOST_VDPA_GET_DEVICE_ID:
> + r = vhost_vdpa_get_device_id(v, argp);
> + break;
> + case VHOST_VDPA_GET_STATUS:
> + r = vhost_vdpa_get_status(v, argp);
> + break;
> + case VHOST_VDPA_SET_STATUS:
> + r = vhost_vdpa_set_status(v, argp);
> + break;
> + case VHOST_VDPA_GET_CONFIG:
> + r = vhost_vdpa_get_config(v, argp);
> + break;
> + case VHOST_VDPA_SET_CONFIG:
> + r = vhost_vdpa_set_config(v, argp);
> + break;
> + case VHOST_GET_FEATURES:
> + r = vhost_vdpa_get_features(v, argp);
> + break;
> + case VHOST_SET_FEATURES:
> + r = vhost_vdpa_set_features(v, argp);
> + break;
> + case VHOST_VDPA_GET_VRING_NUM:
> + r = vhost_vdpa_get_vring_num(v, argp);
> + break;
> + case VHOST_SET_LOG_BASE:
> + case VHOST_SET_LOG_FD:
> + r = -ENOIOCTLCMD;
> + break;
> + default:
> + mutex_lock(&v->vdev.mutex);
> + r = vhost_dev_ioctl(&v->vdev, cmd, argp);
> + if (r == -ENOIOCTLCMD)
> + r = vhost_vdpa_vring_ioctl(v, cmd, argp);
> + mutex_unlock(&v->vdev.mutex);
> + break;
> + }
> +
> + if (r)
> + goto out;
> +
> + switch (cmd) {
> + case VHOST_SET_MEM_TABLE:
> + if (ops->set_map)
> + r = ops->set_map(vdpa, v->vdev.umem);


I think we need do the following to make sure set_map() works:

1) pin pages through GUP and do proper accounting for e.g ulimit
2) using GPA->HPA mapping for dev->umem
3) then there's no need to deal VHOST_SET_MEM_TABLE in
vhost_dev_ioctl(), or add some specific callbacks in vhost_set_memory().

Thanks


> + else
> + r = vhost_vdpa_do_dma_mapping(v);
> + break;
> + }
> +
> +out:
> + mutex_unlock(&v->mutex);
> + return r;
> +}
> +
> +static const struct file_operations vhost_vdpa_fops = {
> + .owner = THIS_MODULE,
> + .open = vhost_vdpa_open,
> + .release = vhost_vdpa_release,
> + .unlocked_ioctl = vhost_vdpa_unlocked_ioctl,
> + .compat_ioctl = compat_ptr_ioctl,
> +};
> +
> +static int vhost_vdpa_probe(struct device *dev)
> +{
> + struct vdpa_device *vdpa = dev_to_vdpa(dev);
> + const struct vdpa_config_ops *ops = vdpa->config;
> + struct vhost_vdpa *v;
> + struct device *d;
> + int minor, nvqs;
> + int r;
> +
> + /* Currently, we only accept the network devices. */
> + if (ops->get_device_id(vdpa) != VIRTIO_ID_NET) {
> + r = -ENOTSUPP;
> + goto err;
> + }
> +
> + v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> + if (!v) {
> + r = -ENOMEM;
> + goto err;
> + }
> +
> + nvqs = VHOST_VDPA_VQ_MAX;
> +
> + v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue),
> + GFP_KERNEL);
> + if (!v->vqs) {
> + r = -ENOMEM;
> + goto err_alloc_vqs;
> + }
> +
> + mutex_init(&v->mutex);
> + atomic_set(&v->opened, 0);
> +
> + v->vdpa = vdpa;
> + v->nvqs = nvqs;
> + v->virtio_id = ops->get_device_id(vdpa);
> +
> + mutex_lock(&vhost_vdpa.mutex);
> +
> + minor = vhost_vdpa_alloc_minor(v);
> + if (minor < 0) {
> + r = minor;
> + goto err_alloc_minor;
> + }
> +
> + d = device_create(vhost_vdpa.class, NULL,
> + MKDEV(MAJOR(vhost_vdpa.devt), minor),
> + v, "%d", vdpa->index);
> + if (IS_ERR(d)) {
> + r = PTR_ERR(d);
> + goto err_device_create;
> + }
> +
> + mutex_unlock(&vhost_vdpa.mutex);
> +
> + v->dev = d;
> + v->minor = minor;
> + dev_set_drvdata(dev, v);
> +
> + return 0;
> +
> +err_device_create:
> + vhost_vdpa_free_minor(minor);
> +err_alloc_minor:
> + mutex_unlock(&vhost_vdpa.mutex);
> + kfree(v->vqs);
> +err_alloc_vqs:
> + kfree(v);
> +err:
> + return r;
> +}
> +
> +static void vhost_vdpa_remove(struct device *dev)
> +{
> + DEFINE_WAIT_FUNC(wait, woken_wake_function);
> + struct vhost_vdpa *v = dev_get_drvdata(dev);
> + int opened;
> +
> + add_wait_queue(&vhost_vdpa.release_q, &wait);
> +
> + do {
> + opened = atomic_cmpxchg(&v->opened, 0, 1);
> + if (!opened)
> + break;
> + wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
> + } while (1);
> +
> + remove_wait_queue(&vhost_vdpa.release_q, &wait);
> +
> + mutex_lock(&vhost_vdpa.mutex);
> + device_destroy(vhost_vdpa.class,
> + MKDEV(MAJOR(vhost_vdpa.devt), v->minor));
> + vhost_vdpa_free_minor(v->minor);
> + mutex_unlock(&vhost_vdpa.mutex);
> + kfree(v->vqs);
> + kfree(v);
> +}
> +
> +static struct vdpa_driver vhost_vdpa_driver = {
> + .drv = {
> + .name = "vhost_vdpa",
> + },
> + .probe = vhost_vdpa_probe,
> + .remove = vhost_vdpa_remove,
> +};
> +
> +static char *vhost_vdpa_devnode(struct device *dev, umode_t *mode)
> +{
> + return kasprintf(GFP_KERNEL, "vhost-vdpa/%s", dev_name(dev));
> +}
> +
> +static int __init vhost_vdpa_init(void)
> +{
> + int r;
> +
> + idr_init(&vhost_vdpa.idr);
> + mutex_init(&vhost_vdpa.mutex);
> + init_waitqueue_head(&vhost_vdpa.release_q);
> +
> + /* /dev/vhost-vdpa/$vdpa_device_index */
> + vhost_vdpa.class = class_create(THIS_MODULE, "vhost-vdpa");
> + if (IS_ERR(vhost_vdpa.class)) {
> + r = PTR_ERR(vhost_vdpa.class);
> + goto err_class;
> + }
> +
> + vhost_vdpa.class->devnode = vhost_vdpa_devnode;
> +
> + r = alloc_chrdev_region(&vhost_vdpa.devt, 0, MINORMASK + 1,
> + "vhost-vdpa");
> + if (r)
> + goto err_alloc_chrdev;
> +
> + cdev_init(&vhost_vdpa.cdev, &vhost_vdpa_fops);
> + r = cdev_add(&vhost_vdpa.cdev, vhost_vdpa.devt, MINORMASK + 1);
> + if (r)
> + goto err_cdev_add;
> +
> + r = register_vdpa_driver(&vhost_vdpa_driver);
> + if (r)
> + goto err_register_vdpa_driver;
> +
> + return 0;
> +
> +err_register_vdpa_driver:
> + cdev_del(&vhost_vdpa.cdev);
> +err_cdev_add:
> + unregister_chrdev_region(vhost_vdpa.devt, MINORMASK + 1);
> +err_alloc_chrdev:
> + class_destroy(vhost_vdpa.class);
> +err_class:
> + idr_destroy(&vhost_vdpa.idr);
> + return r;
> +}
> +module_init(vhost_vdpa_init);
> +
> +static void __exit vhost_vdpa_exit(void)
> +{
> + unregister_vdpa_driver(&vhost_vdpa_driver);
> + cdev_del(&vhost_vdpa.cdev);
> + unregister_chrdev_region(vhost_vdpa.devt, MINORMASK + 1);
> + class_destroy(vhost_vdpa.class);
> + idr_destroy(&vhost_vdpa.idr);
> +}
> +module_exit(vhost_vdpa_exit);
> +
> +MODULE_VERSION("0.0.1");
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Intel Corporation");
> +MODULE_DESCRIPTION("vDPA based vhost backend for virtio");
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index 40d028eed645..23f6db2106f5 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -116,4 +116,25 @@
> #define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64)
> #define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int)
>
> +/* VHOST_VDPA specific defines */
> +
> +/* Get the device id. The device ids follow the same definition of
> + * the device id defined in virtio-spec. */
> +#define VHOST_VDPA_GET_DEVICE_ID _IOR(VHOST_VIRTIO, 0x70, __u32)
> +/* Get and set the status. The status bits follow the same definition
> + * of the device status defined in virtio-spec. */
> +#define VHOST_VDPA_GET_STATUS _IOR(VHOST_VIRTIO, 0x71, __u8)
> +#define VHOST_VDPA_SET_STATUS _IOW(VHOST_VIRTIO, 0x72, __u8)
> +/* Get and set the device config. The device config follows the same
> + * definition of the device config defined in virtio-spec. */
> +#define VHOST_VDPA_GET_CONFIG _IOR(VHOST_VIRTIO, 0x73, \
> + struct vhost_vdpa_config)
> +#define VHOST_VDPA_SET_CONFIG _IOW(VHOST_VIRTIO, 0x74, \
> + struct vhost_vdpa_config)
> +/* Enable/disable the ring. */
> +#define VHOST_VDPA_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x75, \
> + struct vhost_vring_state)
> +/* Get the max ring size. */
> +#define VHOST_VDPA_GET_VRING_NUM _IOR(VHOST_VIRTIO, 0x76, __u16)
> +
> #endif
> diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
> index c907290ff065..669457ce5c48 100644
> --- a/include/uapi/linux/vhost_types.h
> +++ b/include/uapi/linux/vhost_types.h
> @@ -119,6 +119,14 @@ struct vhost_scsi_target {
> unsigned short reserved;
> };
>
> +/* VHOST_VDPA specific definitions */
> +
> +struct vhost_vdpa_config {
> + __u32 off;
> + __u32 len;
> + __u8 buf[0];
> +};
> +
> /* Feature bits */
> /* Log all write descriptors. Can be changed while device is active. */
> #define VHOST_F_LOG_ALL 26

2020-02-04 06:04:30

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> 5) generate diffs of memory table and using IOMMU API to setup the dma
> mapping in this method

Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?

--
MST

2020-02-04 06:47:51

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
>> 5) generate diffs of memory table and using IOMMU API to setup the dma
>> mapping in this method
> Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
>

Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?

Thanks


2020-02-05 02:04:12

by Tiwei Bie

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> On 2020/1/31 上午11:36, Tiwei Bie wrote:
> > This patch introduces a vDPA based vhost backend. This
> > backend is built on top of the same interface defined
> > in virtio-vDPA and provides a generic vhost interface
> > for userspace to accelerate the virtio devices in guest.
> >
> > This backend is implemented as a vDPA device driver on
> > top of the same ops used in virtio-vDPA. It will create
> > char device entry named vhost-vdpa/$vdpa_device_index
> > for userspace to use. Userspace can use vhost ioctls on
> > top of this char device to setup the backend.
> >
> > Signed-off-by: Tiwei Bie <[email protected]>
> > ---
> > This patch depends on below series:
> > https://lkml.org/lkml/2020/1/16/353
> >
> > Please note that _SET_MEM_TABLE isn't fully supported yet.
> > Comments would be appreciated!
> >
> > Changes since last patch (https://lkml.org/lkml/2019/11/18/1068)
> > - Switch to the vDPA bus;
> > - Switch to vhost's own chardev;
> >
> > drivers/vhost/Kconfig | 12 +
> > drivers/vhost/Makefile | 3 +
> > drivers/vhost/vdpa.c | 705 +++++++++++++++++++++++++++++++
> > include/uapi/linux/vhost.h | 21 +
> > include/uapi/linux/vhost_types.h | 8 +
> > 5 files changed, 749 insertions(+)
> > create mode 100644 drivers/vhost/vdpa.c
> >
> > diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> > index f21c45aa5e07..13e6a94d0243 100644
> > --- a/drivers/vhost/Kconfig
> > +++ b/drivers/vhost/Kconfig
> > @@ -34,6 +34,18 @@ config VHOST_VSOCK
> > To compile this driver as a module, choose M here: the module will be called
> > vhost_vsock.
> > +config VHOST_VDPA
> > + tristate "Vhost driver for vDPA based backend"
> > + depends on EVENTFD && VDPA
> > + select VHOST
> > + default n
> > + ---help---
> > + This kernel module can be loaded in host kernel to accelerate
> > + guest virtio devices with the vDPA based backends.
> > +
> > + To compile this driver as a module, choose M here: the module
> > + will be called vhost_vdpa.
> > +
> > config VHOST
> > tristate
> > depends on VHOST_IOTLB
> > diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> > index df99756fbb26..a65e9f4a2c0a 100644
> > --- a/drivers/vhost/Makefile
> > +++ b/drivers/vhost/Makefile
> > @@ -10,6 +10,9 @@ vhost_vsock-y := vsock.o
> > obj-$(CONFIG_VHOST_RING) += vringh.o
> > +obj-$(CONFIG_VHOST_VDPA) += vhost_vdpa.o
> > +vhost_vdpa-y := vdpa.o
> > +
> > obj-$(CONFIG_VHOST) += vhost.o
> > obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o
> > diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> > new file mode 100644
> > index 000000000000..631d994d37ac
> > --- /dev/null
> > +++ b/drivers/vhost/vdpa.c
> > @@ -0,0 +1,705 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Copyright (C) 2018-2020 Intel Corporation.
> > + *
> > + * Author: Tiwei Bie <[email protected]>
> > + *
> > + * Thanks to Jason Wang and Michael S. Tsirkin for the valuable
> > + * comments and suggestions. And thanks to Cunming Liang and
> > + * Zhihong Wang for all their supports.
> > + */
> > +
> > +#include <linux/kernel.h>
> > +#include <linux/module.h>
> > +#include <linux/cdev.h>
> > +#include <linux/device.h>
> > +#include <linux/uuid.h>
> > +#include <linux/vdpa.h>
> > +#include <linux/nospec.h>
> > +#include <linux/vhost.h>
> > +#include <linux/virtio_net.h>
> > +
> > +#include "vhost.h"
> > +
> > +enum {
> > + VHOST_VDPA_FEATURES =
> > + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
> > + (1ULL << VIRTIO_F_ANY_LAYOUT) |
> > + (1ULL << VIRTIO_F_VERSION_1) |
> > + (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
> > + (1ULL << VIRTIO_F_RING_PACKED) |
> > + (1ULL << VIRTIO_F_ORDER_PLATFORM) |
> > + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
> > + (1ULL << VIRTIO_RING_F_EVENT_IDX),
> > +
> > + VHOST_VDPA_NET_FEATURES = VHOST_VDPA_FEATURES |
> > + (1ULL << VIRTIO_NET_F_CSUM) |
> > + (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
> > + (1ULL << VIRTIO_NET_F_MTU) |
> > + (1ULL << VIRTIO_NET_F_MAC) |
> > + (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
> > + (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
> > + (1ULL << VIRTIO_NET_F_GUEST_ECN) |
> > + (1ULL << VIRTIO_NET_F_GUEST_UFO) |
> > + (1ULL << VIRTIO_NET_F_HOST_TSO4) |
> > + (1ULL << VIRTIO_NET_F_HOST_TSO6) |
> > + (1ULL << VIRTIO_NET_F_HOST_ECN) |
> > + (1ULL << VIRTIO_NET_F_HOST_UFO) |
> > + (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
> > + (1ULL << VIRTIO_NET_F_STATUS) |
> > + (1ULL << VIRTIO_NET_F_SPEED_DUPLEX),
> > +};
> > +
> > +/* Currently, only network backend w/o multiqueue is supported. */
> > +#define VHOST_VDPA_VQ_MAX 2
> > +
> > +struct vhost_vdpa {
> > + /* The lock is to protect this structure. */
> > + struct mutex mutex;
> > + struct vhost_dev vdev;
> > + struct vhost_virtqueue *vqs;
> > + struct vdpa_device *vdpa;
> > + struct device *dev;
> > + atomic_t opened;
> > + int nvqs;
> > + int virtio_id;
> > + int minor;
> > +};
> > +
> > +static struct {
> > + /* The lock is to protect this structure. */
> > + struct mutex mutex;
> > + struct class *class;
> > + struct idr idr;
> > + struct cdev cdev;
> > + dev_t devt;
> > + wait_queue_head_t release_q;
> > +} vhost_vdpa;
> > +
> > +static const u64 vhost_vdpa_features[] = {
> > + [VIRTIO_ID_NET] = VHOST_VDPA_NET_FEATURES,
> > +};
> > +
> > +static int vhost_vdpa_alloc_minor(struct vhost_vdpa *v)
> > +{
> > + return idr_alloc(&vhost_vdpa.idr, v, 0, MINORMASK + 1,
> > + GFP_KERNEL);
> > +}
> > +
> > +static void vhost_vdpa_free_minor(int minor)
> > +{
> > + idr_remove(&vhost_vdpa.idr, minor);
> > +}
> > +
> > +static struct vhost_vdpa *vhost_vdpa_get_from_minor(int minor)
> > +{
> > + struct vhost_vdpa *v;
> > +
> > + mutex_lock(&vhost_vdpa.mutex);
> > + v = idr_find(&vhost_vdpa.idr, minor);
> > + mutex_unlock(&vhost_vdpa.mutex);
> > +
> > + return v;
> > +}
> > +
> > +static void handle_vq_kick(struct vhost_work *work)
> > +{
> > + struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
> > + poll.work);
> > + struct vhost_vdpa *v = container_of(vq->dev, struct vhost_vdpa, vdev);
> > + const struct vdpa_config_ops *ops = v->vdpa->config;
> > +
> > + ops->kick_vq(v->vdpa, vq - v->vqs);
> > +}
> > +
> > +static irqreturn_t vhost_vdpa_virtqueue_cb(void *private)
> > +{
> > + struct vhost_virtqueue *vq = private;
> > + struct eventfd_ctx *call_ctx = vq->call_ctx;
> > +
> > + if (call_ctx)
> > + eventfd_signal(call_ctx, 1);
> > +
> > + return IRQ_HANDLED;
> > +}
> > +
> > +static void vhost_vdpa_reset(struct vhost_vdpa *v)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > +
> > + ops->set_status(vdpa, 0);
> > +}
> > +
> > +static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + u32 device_id;
> > +
> > + device_id = ops->get_device_id(vdpa);
> > +
> > + if (copy_to_user(argp, &device_id, sizeof(device_id)))
> > + return -EFAULT;
> > +
> > + return 0;
> > +}
> > +
> > +static long vhost_vdpa_get_status(struct vhost_vdpa *v, u8 __user *statusp)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + u8 status;
> > +
> > + status = ops->get_status(vdpa);
> > +
> > + if (copy_to_user(statusp, &status, sizeof(status)))
> > + return -EFAULT;
> > +
> > + return 0;
> > +}
> > +
> > +static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + u8 status;
> > +
> > + if (copy_from_user(&status, statusp, sizeof(status)))
> > + return -EFAULT;
> > +
> > + /*
> > + * Userspace shouldn't remove status bits unless reset the
> > + * status to 0.
> > + */
> > + if (status != 0 && (ops->get_status(vdpa) & ~status) != 0)
> > + return -EINVAL;
> > +
> > + ops->set_status(vdpa, status);
> > +
> > + return 0;
> > +}
> > +
> > +static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
> > + struct vhost_vdpa_config *c)
> > +{
> > + long size = 0;
> > +
> > + switch (v->virtio_id) {
> > + case VIRTIO_ID_NET:
> > + size = sizeof(struct virtio_net_config);
> > + break;
> > + }
> > +
> > + if (c->len == 0)
> > + return -EINVAL;
> > +
> > + if (c->len > size - c->off)
> > + return -E2BIG;
> > +
> > + return 0;
> > +}
> > +
> > +static long vhost_vdpa_get_config(struct vhost_vdpa *v,
> > + struct vhost_vdpa_config __user *c)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + struct vhost_vdpa_config config;
> > + unsigned long size = offsetof(struct vhost_vdpa_config, buf);
> > + u8 *buf;
> > +
> > + if (copy_from_user(&config, c, size))
> > + return -EFAULT;
> > + if (vhost_vdpa_config_validate(v, &config))
> > + return -EINVAL;
> > + buf = kvzalloc(config.len, GFP_KERNEL);
> > + if (!buf)
> > + return -ENOMEM;
> > +
> > + ops->get_config(vdpa, config.off, buf, config.len);
> > +
> > + if (copy_to_user(c->buf, buf, config.len)) {
> > + kvfree(buf);
> > + return -EFAULT;
> > + }
> > +
> > + kvfree(buf);
> > + return 0;
> > +}
> > +
> > +static long vhost_vdpa_set_config(struct vhost_vdpa *v,
> > + struct vhost_vdpa_config __user *c)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + struct vhost_vdpa_config config;
> > + unsigned long size = offsetof(struct vhost_vdpa_config, buf);
> > + u8 *buf;
> > +
> > + if (copy_from_user(&config, c, size))
> > + return -EFAULT;
> > + if (vhost_vdpa_config_validate(v, &config))
> > + return -EINVAL;
> > + buf = kvzalloc(config.len, GFP_KERNEL);
> > + if (!buf)
> > + return -ENOMEM;
> > +
> > + if (copy_from_user(buf, c->buf, config.len)) {
> > + kvfree(buf);
> > + return -EFAULT;
> > + }
> > +
> > + ops->set_config(vdpa, config.off, buf, config.len);
> > +
> > + kvfree(buf);
> > + return 0;
> > +}
> > +
> > +static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + u64 features;
> > +
> > + features = ops->get_features(vdpa);
> > + features &= vhost_vdpa_features[v->virtio_id];
> > +
> > + if (copy_to_user(featurep, &features, sizeof(features)))
> > + return -EFAULT;
> > +
> > + return 0;
> > +}
> > +
> > +static long vhost_vdpa_set_features(struct vhost_vdpa *v, u64 __user *featurep)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + u64 features;
> > +
> > + /*
> > + * It's not allowed to change the features after they have
> > + * been negotiated.
> > + */
> > + if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_FEATURES_OK)
> > + return -EBUSY;
> > +
> > + if (copy_from_user(&features, featurep, sizeof(features)))
> > + return -EFAULT;
> > +
> > + if (features & ~vhost_vdpa_features[v->virtio_id])
> > + return -EINVAL;
> > +
> > + if (ops->set_features(vdpa, features))
> > + return -EINVAL;
> > +
> > + return 0;
> > +}
> > +
> > +static long vhost_vdpa_get_vring_num(struct vhost_vdpa *v, u16 __user *argp)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + u16 num;
> > +
> > + num = ops->get_vq_num_max(vdpa);
> > +
> > + if (copy_to_user(argp, &num, sizeof(num)))
> > + return -EFAULT;
> > +
> > + return 0;
> > +}
> > +
> > +static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
> > + void __user *argp)
> > +{
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + struct vdpa_callback cb;
> > + struct vhost_virtqueue *vq;
> > + struct vhost_vring_state s;
> > + u8 status;
> > + u32 idx;
> > + long r;
> > +
> > + r = get_user(idx, (u32 __user *)argp);
> > + if (r < 0)
> > + return r;
> > + if (idx >= v->nvqs)
> > + return -ENOBUFS;
> > +
> > + idx = array_index_nospec(idx, v->nvqs);
> > + vq = &v->vqs[idx];
> > +
> > + status = ops->get_status(vdpa);
> > +
> > + /*
> > + * It's not allowed to detect and program vqs before
> > + * features negotiation or after enabling driver.
> > + */
> > + if (!(status & VIRTIO_CONFIG_S_FEATURES_OK) ||
> > + (status & VIRTIO_CONFIG_S_DRIVER_OK))
> > + return -EBUSY;
> > +
> > + if (cmd == VHOST_VDPA_SET_VRING_ENABLE) {
> > + if (copy_from_user(&s, argp, sizeof(s)))
> > + return -EFAULT;
> > + ops->set_vq_ready(vdpa, idx, s.num);
> > + return 0;
> > + }
> > +
> > + /*
> > + * It's not allowed to detect and program vqs with
> > + * vqs enabled.
> > + */
> > + if (ops->get_vq_ready(vdpa, idx))
> > + return -EBUSY;
> > +
> > + if (cmd == VHOST_GET_VRING_BASE)
> > + vq->last_avail_idx = ops->get_vq_state(v->vdpa, idx);
> > +
> > + r = vhost_vring_ioctl(&v->vdev, cmd, argp);
> > + if (r)
> > + return r;
> > +
> > + switch (cmd) {
> > + case VHOST_SET_VRING_ADDR:
> > + if (ops->set_vq_address(vdpa, idx, (u64)vq->desc,
> > + (u64)vq->avail, (u64)vq->used))
> > + r = -EINVAL;
> > + break;
> > +
> > + case VHOST_SET_VRING_BASE:
> > + if (ops->set_vq_state(vdpa, idx, vq->last_avail_idx))
> > + r = -EINVAL;
> > + break;
> > +
> > + case VHOST_SET_VRING_CALL:
> > + if (vq->call_ctx) {
> > + cb.callback = vhost_vdpa_virtqueue_cb;
> > + cb.private = vq;
> > + } else {
> > + cb.callback = NULL;
> > + cb.private = NULL;
> > + }
> > + ops->set_vq_cb(vdpa, idx, &cb);
> > + break;
> > +
> > + case VHOST_SET_VRING_NUM:
> > + ops->set_vq_num(vdpa, idx, vq->num);
> > + break;
> > + }
> > +
> > + return r;
> > +}
> > +
> > +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v)
> > +{
> > + /* TODO: fix this */
>
>
> Before trying to do this it looks to me we need the following during the
> probe
>
> 1) if set_map() is not supported by the vDPA device probe the IOMMU that is
> supported by the vDPA device
> 2) allocate IOMMU domain
>
> And then:
>
> 3) pin pages through GUP and do proper accounting
> 4) store GPA->HPA mapping in the umem
> 5) generate diffs of memory table and using IOMMU API to setup the dma
> mapping in this method
>
> For 1), I'm not sure parent is sufficient for to doing this or need to
> introduce new API like iommu_device in mdev.

Agree. We may also need to introduce something like
the iommu_device.

>
>
> > + return -ENOTSUPP;
> > +}
> > +
> > +static int vhost_vdpa_open(struct inode *inode, struct file *filep)
> > +{
> > + struct vhost_vdpa *v;
> > + struct vhost_dev *dev;
> > + struct vhost_virtqueue **vqs;
> > + int nvqs, i, r, opened;
> > +
> > + v = vhost_vdpa_get_from_minor(iminor(inode));
> > + if (!v)
> > + return -ENODEV;
> > +
> > + opened = atomic_cmpxchg(&v->opened, 0, 1);
> > + if (opened) {
> > + r = -EBUSY;
> > + goto err;
> > + }
> > +
> > + nvqs = v->nvqs;
> > + vhost_vdpa_reset(v);
> > +
> > + vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL);
> > + if (!vqs) {
> > + r = -ENOMEM;
> > + goto err;
> > + }
> > +
> > + dev = &v->vdev;
> > + for (i = 0; i < nvqs; i++) {
> > + vqs[i] = &v->vqs[i];
> > + vqs[i]->handle_kick = handle_vq_kick;
> > + }
> > + vhost_dev_init(dev, vqs, nvqs, 0, 0, 0);
> > +
> > + filep->private_data = v;
> > +
> > + return 0;
> > +
> > +err:
> > + atomic_dec(&v->opened);
> > + return r;
> > +}
> > +
> > +static int vhost_vdpa_release(struct inode *inode, struct file *filep)
> > +{
> > + struct vhost_vdpa *v = filep->private_data;
> > +
> > + mutex_lock(&v->mutex);
> > + filep->private_data = NULL;
> > + vhost_vdpa_reset(v);
> > + vhost_dev_stop(&v->vdev);
> > + vhost_dev_cleanup(&v->vdev);
> > + kfree(v->vdev.vqs);
> > + mutex_unlock(&v->mutex);
> > +
> > + atomic_dec(&v->opened);
> > + wake_up(&vhost_vdpa.release_q);
> > +
> > + return 0;
> > +}
> > +
> > +static long vhost_vdpa_unlocked_ioctl(struct file *filep,
> > + unsigned int cmd, unsigned long arg)
> > +{
> > + struct vhost_vdpa *v = filep->private_data;
> > + struct vdpa_device *vdpa = v->vdpa;
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + void __user *argp = (void __user *)arg;
> > + long r;
> > +
> > + mutex_lock(&v->mutex);
> > +
> > + switch (cmd) {
> > + case VHOST_VDPA_GET_DEVICE_ID:
> > + r = vhost_vdpa_get_device_id(v, argp);
> > + break;
> > + case VHOST_VDPA_GET_STATUS:
> > + r = vhost_vdpa_get_status(v, argp);
> > + break;
> > + case VHOST_VDPA_SET_STATUS:
> > + r = vhost_vdpa_set_status(v, argp);
> > + break;
> > + case VHOST_VDPA_GET_CONFIG:
> > + r = vhost_vdpa_get_config(v, argp);
> > + break;
> > + case VHOST_VDPA_SET_CONFIG:
> > + r = vhost_vdpa_set_config(v, argp);
> > + break;
> > + case VHOST_GET_FEATURES:
> > + r = vhost_vdpa_get_features(v, argp);
> > + break;
> > + case VHOST_SET_FEATURES:
> > + r = vhost_vdpa_set_features(v, argp);
> > + break;
> > + case VHOST_VDPA_GET_VRING_NUM:
> > + r = vhost_vdpa_get_vring_num(v, argp);
> > + break;
> > + case VHOST_SET_LOG_BASE:
> > + case VHOST_SET_LOG_FD:
> > + r = -ENOIOCTLCMD;
> > + break;
> > + default:
> > + mutex_lock(&v->vdev.mutex);
> > + r = vhost_dev_ioctl(&v->vdev, cmd, argp);
> > + if (r == -ENOIOCTLCMD)
> > + r = vhost_vdpa_vring_ioctl(v, cmd, argp);
> > + mutex_unlock(&v->vdev.mutex);
> > + break;
> > + }
> > +
> > + if (r)
> > + goto out;
> > +
> > + switch (cmd) {
> > + case VHOST_SET_MEM_TABLE:
> > + if (ops->set_map)
> > + r = ops->set_map(vdpa, v->vdev.umem);
>
>
> I think we need do the following to make sure set_map() works:
>
> 1) pin pages through GUP and do proper accounting for e.g ulimit
> 2) using GPA->HPA mapping for dev->umem
> 3) then there's no need to deal VHOST_SET_MEM_TABLE in vhost_dev_ioctl(), or
> add some specific callbacks in vhost_set_memory().

OK.

Thanks!
Tiwei

>
> Thanks
>
>

2020-02-05 02:07:18

by Tiwei Bie

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Tue, Feb 04, 2020 at 02:46:16PM +0800, Jason Wang wrote:
> On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
> > On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> > > 5) generate diffs of memory table and using IOMMU API to setup the dma
> > > mapping in this method
> > Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
> >
>
> Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?

Do you mean we let userspace to only use VHOST_IOTLB_UPDATE/INVALIDATE
to do the DMA mapping in vhost-vdpa case? When vIOMMU isn't available,
userspace will set msg->iova to GPA, otherwise userspace will set
msg->iova to GIOVA, and vhost-vdpa module will get HPA from msg->uaddr?

Thanks,
Tiwei

>
> Thanks
>
>

2020-02-05 03:13:59

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 上午10:05, Tiwei Bie wrote:
> On Tue, Feb 04, 2020 at 02:46:16PM +0800, Jason Wang wrote:
>> On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
>>> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
>>>> 5) generate diffs of memory table and using IOMMU API to setup the dma
>>>> mapping in this method
>>> Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
>>>
>> Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?
> Do you mean we let userspace to only use VHOST_IOTLB_UPDATE/INVALIDATE
> to do the DMA mapping in vhost-vdpa case? When vIOMMU isn't available,
> userspace will set msg->iova to GPA, otherwise userspace will set
> msg->iova to GIOVA, and vhost-vdpa module will get HPA from msg->uaddr?
>
> Thanks,
> Tiwei


I think so. Michael, do you think this makes sense?

Thanks


>
>> Thanks
>>
>>

2020-02-05 03:14:18

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 上午10:02, Tiwei Bie wrote:
>> Before trying to do this it looks to me we need the following during the
>> probe
>>
>> 1) if set_map() is not supported by the vDPA device probe the IOMMU that is
>> supported by the vDPA device
>> 2) allocate IOMMU domain
>>
>> And then:
>>
>> 3) pin pages through GUP and do proper accounting
>> 4) store GPA->HPA mapping in the umem
>> 5) generate diffs of memory table and using IOMMU API to setup the dma
>> mapping in this method
>>
>> For 1), I'm not sure parent is sufficient for to doing this or need to
>> introduce new API like iommu_device in mdev.
> Agree. We may also need to introduce something like
> the iommu_device.
>

Right, this is what I plan to do in next version.

Thanks

2020-02-05 05:38:02

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 05, 2020 at 11:12:21AM +0800, Jason Wang wrote:
>
> On 2020/2/5 上午10:05, Tiwei Bie wrote:
> > On Tue, Feb 04, 2020 at 02:46:16PM +0800, Jason Wang wrote:
> > > On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
> > > > On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> > > > > 5) generate diffs of memory table and using IOMMU API to setup the dma
> > > > > mapping in this method
> > > > Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
> > > >
> > > Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?
> > Do you mean we let userspace to only use VHOST_IOTLB_UPDATE/INVALIDATE
> > to do the DMA mapping in vhost-vdpa case? When vIOMMU isn't available,
> > userspace will set msg->iova to GPA, otherwise userspace will set
> > msg->iova to GIOVA, and vhost-vdpa module will get HPA from msg->uaddr?
> >
> > Thanks,
> > Tiwei
>
>
> I think so. Michael, do you think this makes sense?
>
> Thanks

to make sure, could you post the suggested argument format for
these ioctls?

>
> >
> > > Thanks
> > >
> > >

2020-02-05 05:52:26

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 下午1:31, Michael S. Tsirkin wrote:
> On Wed, Feb 05, 2020 at 11:12:21AM +0800, Jason Wang wrote:
>> On 2020/2/5 上午10:05, Tiwei Bie wrote:
>>> On Tue, Feb 04, 2020 at 02:46:16PM +0800, Jason Wang wrote:
>>>> On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
>>>>> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
>>>>>> 5) generate diffs of memory table and using IOMMU API to setup the dma
>>>>>> mapping in this method
>>>>> Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
>>>>>
>>>> Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?
>>> Do you mean we let userspace to only use VHOST_IOTLB_UPDATE/INVALIDATE
>>> to do the DMA mapping in vhost-vdpa case? When vIOMMU isn't available,
>>> userspace will set msg->iova to GPA, otherwise userspace will set
>>> msg->iova to GIOVA, and vhost-vdpa module will get HPA from msg->uaddr?
>>>
>>> Thanks,
>>> Tiwei
>> I think so. Michael, do you think this makes sense?
>>
>> Thanks
> to make sure, could you post the suggested argument format for
> these ioctls?
>

It's the existed uapi:

/* no alignment requirement */
struct vhost_iotlb_msg {
    __u64 iova;
    __u64 size;
    __u64 uaddr;
#define VHOST_ACCESS_RO      0x1
#define VHOST_ACCESS_WO      0x2
#define VHOST_ACCESS_RW      0x3
    __u8 perm;
#define VHOST_IOTLB_MISS           1
#define VHOST_IOTLB_UPDATE         2
#define VHOST_IOTLB_INVALIDATE     3
#define VHOST_IOTLB_ACCESS_FAIL    4
    __u8 type;
};

#define VHOST_IOTLB_MSG 0x1
#define VHOST_IOTLB_MSG_V2 0x2

struct vhost_msg {
    int type;
    union {
        struct vhost_iotlb_msg iotlb;
        __u8 padding[64];
    };
};

struct vhost_msg_v2 {
    __u32 type;
    __u32 reserved;
    union {
        struct vhost_iotlb_msg iotlb;
        __u8 padding[64];
    };
};

2020-02-05 06:31:47

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 05, 2020 at 01:50:28PM +0800, Jason Wang wrote:
>
> On 2020/2/5 下午1:31, Michael S. Tsirkin wrote:
> > On Wed, Feb 05, 2020 at 11:12:21AM +0800, Jason Wang wrote:
> > > On 2020/2/5 上午10:05, Tiwei Bie wrote:
> > > > On Tue, Feb 04, 2020 at 02:46:16PM +0800, Jason Wang wrote:
> > > > > On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
> > > > > > On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> > > > > > > 5) generate diffs of memory table and using IOMMU API to setup the dma
> > > > > > > mapping in this method
> > > > > > Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
> > > > > >
> > > > > Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?
> > > > Do you mean we let userspace to only use VHOST_IOTLB_UPDATE/INVALIDATE
> > > > to do the DMA mapping in vhost-vdpa case? When vIOMMU isn't available,
> > > > userspace will set msg->iova to GPA, otherwise userspace will set
> > > > msg->iova to GIOVA, and vhost-vdpa module will get HPA from msg->uaddr?
> > > >
> > > > Thanks,
> > > > Tiwei
> > > I think so. Michael, do you think this makes sense?
> > >
> > > Thanks
> > to make sure, could you post the suggested argument format for
> > these ioctls?
> >
>
> It's the existed uapi:
>
> /* no alignment requirement */
> struct vhost_iotlb_msg {
>     __u64 iova;
>     __u64 size;
>     __u64 uaddr;
> #define VHOST_ACCESS_RO      0x1
> #define VHOST_ACCESS_WO      0x2
> #define VHOST_ACCESS_RW      0x3
>     __u8 perm;
> #define VHOST_IOTLB_MISS           1
> #define VHOST_IOTLB_UPDATE         2
> #define VHOST_IOTLB_INVALIDATE     3
> #define VHOST_IOTLB_ACCESS_FAIL    4
>     __u8 type;
> };
>
> #define VHOST_IOTLB_MSG 0x1
> #define VHOST_IOTLB_MSG_V2 0x2
>
> struct vhost_msg {
>     int type;
>     union {
>         struct vhost_iotlb_msg iotlb;
>         __u8 padding[64];
>     };
> };
>
> struct vhost_msg_v2 {
>     __u32 type;
>     __u32 reserved;
>     union {
>         struct vhost_iotlb_msg iotlb;
>         __u8 padding[64];
>     };
> };

Oh ok. So with a real device, I suspect we do not want to wait for each
change to be processed by device completely, so we might want an asynchronous variant
and then some kind of flush that tells device "you better apply these now".

--
MST

2020-02-05 06:54:29

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 下午2:30, Michael S. Tsirkin wrote:
> On Wed, Feb 05, 2020 at 01:50:28PM +0800, Jason Wang wrote:
>> On 2020/2/5 下午1:31, Michael S. Tsirkin wrote:
>>> On Wed, Feb 05, 2020 at 11:12:21AM +0800, Jason Wang wrote:
>>>> On 2020/2/5 上午10:05, Tiwei Bie wrote:
>>>>> On Tue, Feb 04, 2020 at 02:46:16PM +0800, Jason Wang wrote:
>>>>>> On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
>>>>>>> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
>>>>>>>> 5) generate diffs of memory table and using IOMMU API to setup the dma
>>>>>>>> mapping in this method
>>>>>>> Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
>>>>>>>
>>>>>> Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?
>>>>> Do you mean we let userspace to only use VHOST_IOTLB_UPDATE/INVALIDATE
>>>>> to do the DMA mapping in vhost-vdpa case? When vIOMMU isn't available,
>>>>> userspace will set msg->iova to GPA, otherwise userspace will set
>>>>> msg->iova to GIOVA, and vhost-vdpa module will get HPA from msg->uaddr?
>>>>>
>>>>> Thanks,
>>>>> Tiwei
>>>> I think so. Michael, do you think this makes sense?
>>>>
>>>> Thanks
>>> to make sure, could you post the suggested argument format for
>>> these ioctls?
>>>
>> It's the existed uapi:
>>
>> /* no alignment requirement */
>> struct vhost_iotlb_msg {
>>     __u64 iova;
>>     __u64 size;
>>     __u64 uaddr;
>> #define VHOST_ACCESS_RO      0x1
>> #define VHOST_ACCESS_WO      0x2
>> #define VHOST_ACCESS_RW      0x3
>>     __u8 perm;
>> #define VHOST_IOTLB_MISS           1
>> #define VHOST_IOTLB_UPDATE         2
>> #define VHOST_IOTLB_INVALIDATE     3
>> #define VHOST_IOTLB_ACCESS_FAIL    4
>>     __u8 type;
>> };
>>
>> #define VHOST_IOTLB_MSG 0x1
>> #define VHOST_IOTLB_MSG_V2 0x2
>>
>> struct vhost_msg {
>>     int type;
>>     union {
>>         struct vhost_iotlb_msg iotlb;
>>         __u8 padding[64];
>>     };
>> };
>>
>> struct vhost_msg_v2 {
>>     __u32 type;
>>     __u32 reserved;
>>     union {
>>         struct vhost_iotlb_msg iotlb;
>>         __u8 padding[64];
>>     };
>> };
> Oh ok. So with a real device, I suspect we do not want to wait for each
> change to be processed by device completely, so we might want an asynchronous variant
> and then some kind of flush that tells device "you better apply these now".


Let me explain:

There are two types of devices:

1) device without on-chip IOMMU, DMA was done via IOMMU API which only
support incremental map/unmap
2) device with on-chip IOMMU, DMA could be done by device driver itself,
and we could choose to pass the whole mappings to the driver at one time
through vDPA bus operation (set_map)

For vhost-vpda, there're two types of memory mapping:

a) memory table, setup by userspace through VHOST_SET_MEM_TABLE, the
whole mapping is updated in this way
b) IOTLB API, incrementally done by userspace through vhost message
(IOTLB_UPDATE/IOTLB_INVALIDATE)

The current design is:

- Reuse VHOST_SET_MEM_TABLE, and for type 1), we can choose to send
diffs through IOMMU API or flush all the mappings then map new ones. For
type 2), just send the whole mapping through set_map()
- Reuse vhost IOTLB, so for type 1), simply forward update/invalidate
request via IOMMU API, for type 2), send IOTLB to vDPA device driver via
set_map(), device driver may choose to send diffs or rebuild all mapping
at their will

Technically we can use vhost IOTLB API (map/umap) for building
VHOST_SET_MEM_TABLE, but to avoid device to process the each request, it
looks to me we need new UAPI which seems sub optimal.

What's you thought?

Thanks


>

2020-02-05 07:17:21

by Shahaf Shuler

[permalink] [raw]
Subject: RE: [PATCH] vhost: introduce vDPA based backend

Wednesday, February 5, 2020 4:03 AM, Tiwei Bie:
> Subject: Re: [PATCH] vhost: introduce vDPA based backend
>
> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> > On 2020/1/31 上午11:36, Tiwei Bie wrote:
> > > This patch introduces a vDPA based vhost backend. This backend is
> > > built on top of the same interface defined in virtio-vDPA and
> > > provides a generic vhost interface for userspace to accelerate the
> > > virtio devices in guest.
> > >
> > > This backend is implemented as a vDPA device driver on top of the
> > > same ops used in virtio-vDPA. It will create char device entry named
> > > vhost-vdpa/$vdpa_device_index for userspace to use. Userspace can
> > > use vhost ioctls on top of this char device to setup the backend.
> > >
> > > Signed-off-by: Tiwei Bie <[email protected]>

[...]

> > > +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v) {
> > > + /* TODO: fix this */
> >
> >
> > Before trying to do this it looks to me we need the following during
> > the probe
> >
> > 1) if set_map() is not supported by the vDPA device probe the IOMMU
> > that is supported by the vDPA device
> > 2) allocate IOMMU domain
> >
> > And then:
> >
> > 3) pin pages through GUP and do proper accounting
> > 4) store GPA->HPA mapping in the umem
> > 5) generate diffs of memory table and using IOMMU API to setup the dma
> > mapping in this method
> >
> > For 1), I'm not sure parent is sufficient for to doing this or need to
> > introduce new API like iommu_device in mdev.
>
> Agree. We may also need to introduce something like the iommu_device.
>

Would it be better for the map/umnap logic to happen inside each device ?
Devices that needs the IOMMU will call iommu APIs from inside the driver callback.
Devices that has other ways to do the DMA mapping will call the proprietary APIs.

2020-02-05 07:18:13

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 05, 2020 at 02:49:31PM +0800, Jason Wang wrote:
>
> On 2020/2/5 下午2:30, Michael S. Tsirkin wrote:
> > On Wed, Feb 05, 2020 at 01:50:28PM +0800, Jason Wang wrote:
> > > On 2020/2/5 下午1:31, Michael S. Tsirkin wrote:
> > > > On Wed, Feb 05, 2020 at 11:12:21AM +0800, Jason Wang wrote:
> > > > > On 2020/2/5 上午10:05, Tiwei Bie wrote:
> > > > > > On Tue, Feb 04, 2020 at 02:46:16PM +0800, Jason Wang wrote:
> > > > > > > On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
> > > > > > > > On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> > > > > > > > > 5) generate diffs of memory table and using IOMMU API to setup the dma
> > > > > > > > > mapping in this method
> > > > > > > > Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
> > > > > > > >
> > > > > > > Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?
> > > > > > Do you mean we let userspace to only use VHOST_IOTLB_UPDATE/INVALIDATE
> > > > > > to do the DMA mapping in vhost-vdpa case? When vIOMMU isn't available,
> > > > > > userspace will set msg->iova to GPA, otherwise userspace will set
> > > > > > msg->iova to GIOVA, and vhost-vdpa module will get HPA from msg->uaddr?
> > > > > >
> > > > > > Thanks,
> > > > > > Tiwei
> > > > > I think so. Michael, do you think this makes sense?
> > > > >
> > > > > Thanks
> > > > to make sure, could you post the suggested argument format for
> > > > these ioctls?
> > > >
> > > It's the existed uapi:
> > >
> > > /* no alignment requirement */
> > > struct vhost_iotlb_msg {
> > >     __u64 iova;
> > >     __u64 size;
> > >     __u64 uaddr;
> > > #define VHOST_ACCESS_RO      0x1
> > > #define VHOST_ACCESS_WO      0x2
> > > #define VHOST_ACCESS_RW      0x3
> > >     __u8 perm;
> > > #define VHOST_IOTLB_MISS           1
> > > #define VHOST_IOTLB_UPDATE         2
> > > #define VHOST_IOTLB_INVALIDATE     3
> > > #define VHOST_IOTLB_ACCESS_FAIL    4
> > >     __u8 type;
> > > };
> > >
> > > #define VHOST_IOTLB_MSG 0x1
> > > #define VHOST_IOTLB_MSG_V2 0x2
> > >
> > > struct vhost_msg {
> > >     int type;
> > >     union {
> > >         struct vhost_iotlb_msg iotlb;
> > >         __u8 padding[64];
> > >     };
> > > };
> > >
> > > struct vhost_msg_v2 {
> > >     __u32 type;
> > >     __u32 reserved;
> > >     union {
> > >         struct vhost_iotlb_msg iotlb;
> > >         __u8 padding[64];
> > >     };
> > > };
> > Oh ok. So with a real device, I suspect we do not want to wait for each
> > change to be processed by device completely, so we might want an asynchronous variant
> > and then some kind of flush that tells device "you better apply these now".
>
>
> Let me explain:
>
> There are two types of devices:
>
> 1) device without on-chip IOMMU, DMA was done via IOMMU API which only
> support incremental map/unmap

Most IOMMUs have queues nowdays though. Whether APIs within kernel
expose that matters but we are better off on emulating
hardware not specific guest behaviour.

> 2) device with on-chip IOMMU, DMA could be done by device driver itself, and
> we could choose to pass the whole mappings to the driver at one time through
> vDPA bus operation (set_map)
>
> For vhost-vpda, there're two types of memory mapping:
>
> a) memory table, setup by userspace through VHOST_SET_MEM_TABLE, the whole
> mapping is updated in this way
> b) IOTLB API, incrementally done by userspace through vhost message
> (IOTLB_UPDATE/IOTLB_INVALIDATE)
>
> The current design is:
>
> - Reuse VHOST_SET_MEM_TABLE, and for type 1), we can choose to send diffs
> through IOMMU API or flush all the mappings then map new ones. For type 2),
> just send the whole mapping through set_map()

I know that at least for RDMA based things, you can't change
a mapping if it's active. So drivers will need to figure out the
differences which just looks ugly: userspace knows what
it was changing (really just adding/removing some guest memory).



> - Reuse vhost IOTLB, so for type 1), simply forward update/invalidate
> request via IOMMU API, for type 2), send IOTLB to vDPA device driver via
> set_map(), device driver may choose to send diffs or rebuild all mapping at
> their will
>
> Technically we can use vhost IOTLB API (map/umap) for building
> VHOST_SET_MEM_TABLE, but to avoid device to process the each request, it
> looks to me we need new UAPI which seems sub optimal.
>
> What's you thought?
>
> Thanks

I suspect we can't completely avoid a new UAPI.

>
> >

2020-02-05 07:44:25

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 下午3:16, Michael S. Tsirkin wrote:
> On Wed, Feb 05, 2020 at 02:49:31PM +0800, Jason Wang wrote:
>> On 2020/2/5 下午2:30, Michael S. Tsirkin wrote:
>>> On Wed, Feb 05, 2020 at 01:50:28PM +0800, Jason Wang wrote:
>>>> On 2020/2/5 下午1:31, Michael S. Tsirkin wrote:
>>>>> On Wed, Feb 05, 2020 at 11:12:21AM +0800, Jason Wang wrote:
>>>>>> On 2020/2/5 上午10:05, Tiwei Bie wrote:
>>>>>>> On Tue, Feb 04, 2020 at 02:46:16PM +0800, Jason Wang wrote:
>>>>>>>> On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
>>>>>>>>> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
>>>>>>>>>> 5) generate diffs of memory table and using IOMMU API to setup the dma
>>>>>>>>>> mapping in this method
>>>>>>>>> Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
>>>>>>>>>
>>>>>>>> Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?
>>>>>>> Do you mean we let userspace to only use VHOST_IOTLB_UPDATE/INVALIDATE
>>>>>>> to do the DMA mapping in vhost-vdpa case? When vIOMMU isn't available,
>>>>>>> userspace will set msg->iova to GPA, otherwise userspace will set
>>>>>>> msg->iova to GIOVA, and vhost-vdpa module will get HPA from msg->uaddr?
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Tiwei
>>>>>> I think so. Michael, do you think this makes sense?
>>>>>>
>>>>>> Thanks
>>>>> to make sure, could you post the suggested argument format for
>>>>> these ioctls?
>>>>>
>>>> It's the existed uapi:
>>>>
>>>> /* no alignment requirement */
>>>> struct vhost_iotlb_msg {
>>>>     __u64 iova;
>>>>     __u64 size;
>>>>     __u64 uaddr;
>>>> #define VHOST_ACCESS_RO      0x1
>>>> #define VHOST_ACCESS_WO      0x2
>>>> #define VHOST_ACCESS_RW      0x3
>>>>     __u8 perm;
>>>> #define VHOST_IOTLB_MISS           1
>>>> #define VHOST_IOTLB_UPDATE         2
>>>> #define VHOST_IOTLB_INVALIDATE     3
>>>> #define VHOST_IOTLB_ACCESS_FAIL    4
>>>>     __u8 type;
>>>> };
>>>>
>>>> #define VHOST_IOTLB_MSG 0x1
>>>> #define VHOST_IOTLB_MSG_V2 0x2
>>>>
>>>> struct vhost_msg {
>>>>     int type;
>>>>     union {
>>>>         struct vhost_iotlb_msg iotlb;
>>>>         __u8 padding[64];
>>>>     };
>>>> };
>>>>
>>>> struct vhost_msg_v2 {
>>>>     __u32 type;
>>>>     __u32 reserved;
>>>>     union {
>>>>         struct vhost_iotlb_msg iotlb;
>>>>         __u8 padding[64];
>>>>     };
>>>> };
>>> Oh ok. So with a real device, I suspect we do not want to wait for each
>>> change to be processed by device completely, so we might want an asynchronous variant
>>> and then some kind of flush that tells device "you better apply these now".
>>
>> Let me explain:
>>
>> There are two types of devices:
>>
>> 1) device without on-chip IOMMU, DMA was done via IOMMU API which only
>> support incremental map/unmap
> Most IOMMUs have queues nowdays though. Whether APIs within kernel
> expose that matters but we are better off on emulating
> hardware not specific guest behaviour.


Last time I checked Intel IOMMU driver, I see the async QI is not used
there. And I'm not sure how queue will help much here. Qemu still need
to wait for all the DMA is setup to let guest work.


>
>> 2) device with on-chip IOMMU, DMA could be done by device driver itself, and
>> we could choose to pass the whole mappings to the driver at one time through
>> vDPA bus operation (set_map)
>>
>> For vhost-vpda, there're two types of memory mapping:
>>
>> a) memory table, setup by userspace through VHOST_SET_MEM_TABLE, the whole
>> mapping is updated in this way
>> b) IOTLB API, incrementally done by userspace through vhost message
>> (IOTLB_UPDATE/IOTLB_INVALIDATE)
>>
>> The current design is:
>>
>> - Reuse VHOST_SET_MEM_TABLE, and for type 1), we can choose to send diffs
>> through IOMMU API or flush all the mappings then map new ones. For type 2),
>> just send the whole mapping through set_map()
> I know that at least for RDMA based things, you can't change
> a mapping if it's active. So drivers will need to figure out the
> differences which just looks ugly: userspace knows what
> it was changing (really just adding/removing some guest memory).


Two methods:

1) using IOTLB message VHOST_IOTLB_UPDATE/INVALIDATE
2) let vhost differs from two memory tables which should not be too hard
(compare two rb trees)


>
>
>
>> - Reuse vhost IOTLB, so for type 1), simply forward update/invalidate
>> request via IOMMU API, for type 2), send IOTLB to vDPA device driver via
>> set_map(), device driver may choose to send diffs or rebuild all mapping at
>> their will
>>
>> Technically we can use vhost IOTLB API (map/umap) for building
>> VHOST_SET_MEM_TABLE, but to avoid device to process the each request, it
>> looks to me we need new UAPI which seems sub optimal.
>>
>> What's you thought?
>>
>> Thanks
> I suspect we can't completely avoid a new UAPI.


AFAIK, memory table usually contain just few entries, the performance
cost should be fine. (At least should be the same as the case of VFIO).

So in qemu, simply hooking add_region/remove_region to
VHOST_IOTLB_UPDATE/VHOST_IOTLB_INVALIDATE should work?

If we introduce API like you proposed previously (memory listener style):

begin
add
remove
commit

I suspect it will be too heavweight for the case of vIOMMU and for the
driver that want to build new mapping, we need addnop etc...

Thanks


>

2020-02-05 07:53:02

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 下午3:15, Shahaf Shuler wrote:
> Wednesday, February 5, 2020 4:03 AM, Tiwei Bie:
>> Subject: Re: [PATCH] vhost: introduce vDPA based backend
>>
>> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
>>> On 2020/1/31 上午11:36, Tiwei Bie wrote:
>>>> This patch introduces a vDPA based vhost backend. This backend is
>>>> built on top of the same interface defined in virtio-vDPA and
>>>> provides a generic vhost interface for userspace to accelerate the
>>>> virtio devices in guest.
>>>>
>>>> This backend is implemented as a vDPA device driver on top of the
>>>> same ops used in virtio-vDPA. It will create char device entry named
>>>> vhost-vdpa/$vdpa_device_index for userspace to use. Userspace can
>>>> use vhost ioctls on top of this char device to setup the backend.
>>>>
>>>> Signed-off-by: Tiwei Bie <[email protected]>
> [...]
>
>>>> +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v) {
>>>> + /* TODO: fix this */
>>>
>>> Before trying to do this it looks to me we need the following during
>>> the probe
>>>
>>> 1) if set_map() is not supported by the vDPA device probe the IOMMU
>>> that is supported by the vDPA device
>>> 2) allocate IOMMU domain
>>>
>>> And then:
>>>
>>> 3) pin pages through GUP and do proper accounting
>>> 4) store GPA->HPA mapping in the umem
>>> 5) generate diffs of memory table and using IOMMU API to setup the dma
>>> mapping in this method
>>>
>>> For 1), I'm not sure parent is sufficient for to doing this or need to
>>> introduce new API like iommu_device in mdev.
>> Agree. We may also need to introduce something like the iommu_device.
>>
> Would it be better for the map/umnap logic to happen inside each device ?
> Devices that needs the IOMMU will call iommu APIs from inside the driver callback.


Technically, this can work. But if it can be done by vhost-vpda it will
make the vDPA driver more compact and easier to be implemented.


> Devices that has other ways to do the DMA mapping will call the proprietary APIs.


To confirm, do you prefer:

1) map/unmap

or

2) pass all maps at one time?

Thanks


>

2020-02-05 09:25:06

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 05, 2020 at 03:42:18PM +0800, Jason Wang wrote:
>
> On 2020/2/5 下午3:16, Michael S. Tsirkin wrote:
> > On Wed, Feb 05, 2020 at 02:49:31PM +0800, Jason Wang wrote:
> > > On 2020/2/5 下午2:30, Michael S. Tsirkin wrote:
> > > > On Wed, Feb 05, 2020 at 01:50:28PM +0800, Jason Wang wrote:
> > > > > On 2020/2/5 下午1:31, Michael S. Tsirkin wrote:
> > > > > > On Wed, Feb 05, 2020 at 11:12:21AM +0800, Jason Wang wrote:
> > > > > > > On 2020/2/5 上午10:05, Tiwei Bie wrote:
> > > > > > > > On Tue, Feb 04, 2020 at 02:46:16PM +0800, Jason Wang wrote:
> > > > > > > > > On 2020/2/4 下午2:01, Michael S. Tsirkin wrote:
> > > > > > > > > > On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> > > > > > > > > > > 5) generate diffs of memory table and using IOMMU API to setup the dma
> > > > > > > > > > > mapping in this method
> > > > > > > > > > Frankly I think that's a bunch of work. Why not a MAP/UNMAP interface?
> > > > > > > > > >
> > > > > > > > > Sure, so that basically VHOST_IOTLB_UPDATE/INVALIDATE I think?
> > > > > > > > Do you mean we let userspace to only use VHOST_IOTLB_UPDATE/INVALIDATE
> > > > > > > > to do the DMA mapping in vhost-vdpa case? When vIOMMU isn't available,
> > > > > > > > userspace will set msg->iova to GPA, otherwise userspace will set
> > > > > > > > msg->iova to GIOVA, and vhost-vdpa module will get HPA from msg->uaddr?
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Tiwei
> > > > > > > I think so. Michael, do you think this makes sense?
> > > > > > >
> > > > > > > Thanks
> > > > > > to make sure, could you post the suggested argument format for
> > > > > > these ioctls?
> > > > > >
> > > > > It's the existed uapi:
> > > > >
> > > > > /* no alignment requirement */
> > > > > struct vhost_iotlb_msg {
> > > > >     __u64 iova;
> > > > >     __u64 size;
> > > > >     __u64 uaddr;
> > > > > #define VHOST_ACCESS_RO      0x1
> > > > > #define VHOST_ACCESS_WO      0x2
> > > > > #define VHOST_ACCESS_RW      0x3
> > > > >     __u8 perm;
> > > > > #define VHOST_IOTLB_MISS           1
> > > > > #define VHOST_IOTLB_UPDATE         2
> > > > > #define VHOST_IOTLB_INVALIDATE     3
> > > > > #define VHOST_IOTLB_ACCESS_FAIL    4
> > > > >     __u8 type;
> > > > > };
> > > > >
> > > > > #define VHOST_IOTLB_MSG 0x1
> > > > > #define VHOST_IOTLB_MSG_V2 0x2
> > > > >
> > > > > struct vhost_msg {
> > > > >     int type;
> > > > >     union {
> > > > >         struct vhost_iotlb_msg iotlb;
> > > > >         __u8 padding[64];
> > > > >     };
> > > > > };
> > > > >
> > > > > struct vhost_msg_v2 {
> > > > >     __u32 type;
> > > > >     __u32 reserved;
> > > > >     union {
> > > > >         struct vhost_iotlb_msg iotlb;
> > > > >         __u8 padding[64];
> > > > >     };
> > > > > };
> > > > Oh ok. So with a real device, I suspect we do not want to wait for each
> > > > change to be processed by device completely, so we might want an asynchronous variant
> > > > and then some kind of flush that tells device "you better apply these now".
> > >
> > > Let me explain:
> > >
> > > There are two types of devices:
> > >
> > > 1) device without on-chip IOMMU, DMA was done via IOMMU API which only
> > > support incremental map/unmap
> > Most IOMMUs have queues nowdays though. Whether APIs within kernel
> > expose that matters but we are better off on emulating
> > hardware not specific guest behaviour.
>
>
> Last time I checked Intel IOMMU driver, I see the async QI is not used
> there. And I'm not sure how queue will help much here. Qemu still need to
> wait for all the DMA is setup to let guest work.
>
> >
> > > 2) device with on-chip IOMMU, DMA could be done by device driver itself, and
> > > we could choose to pass the whole mappings to the driver at one time through
> > > vDPA bus operation (set_map)
> > >
> > > For vhost-vpda, there're two types of memory mapping:
> > >
> > > a) memory table, setup by userspace through VHOST_SET_MEM_TABLE, the whole
> > > mapping is updated in this way
> > > b) IOTLB API, incrementally done by userspace through vhost message
> > > (IOTLB_UPDATE/IOTLB_INVALIDATE)
> > >
> > > The current design is:
> > >
> > > - Reuse VHOST_SET_MEM_TABLE, and for type 1), we can choose to send diffs
> > > through IOMMU API or flush all the mappings then map new ones. For type 2),
> > > just send the whole mapping through set_map()
> > I know that at least for RDMA based things, you can't change
> > a mapping if it's active. So drivers will need to figure out the
> > differences which just looks ugly: userspace knows what
> > it was changing (really just adding/removing some guest memory).
>
>
> Two methods:
>
> 1) using IOTLB message VHOST_IOTLB_UPDATE/INVALIDATE
> 2) let vhost differs from two memory tables which should not be too hard
> (compare two rb trees)


Right but 2 is just such an obvious waste of cyclces. userspace knows what changed
why does vhost need to re-calculate it? No?

>
> >
> >
> >
> > > - Reuse vhost IOTLB, so for type 1), simply forward update/invalidate
> > > request via IOMMU API, for type 2), send IOTLB to vDPA device driver via
> > > set_map(), device driver may choose to send diffs or rebuild all mapping at
> > > their will
> > >
> > > Technically we can use vhost IOTLB API (map/umap) for building
> > > VHOST_SET_MEM_TABLE, but to avoid device to process the each request, it
> > > looks to me we need new UAPI which seems sub optimal.
> > >
> > > What's you thought?
> > >
> > > Thanks
> > I suspect we can't completely avoid a new UAPI.
>
>
> AFAIK, memory table usually contain just few entries, the performance cost
> should be fine. (At least should be the same as the case of VFIO).
>
> So in qemu, simply hooking add_region/remove_region to
> VHOST_IOTLB_UPDATE/VHOST_IOTLB_INVALIDATE should work?
>
> If we introduce API like you proposed previously (memory listener style):
>
> begin
> add
> remove
> commit
>
> I suspect it will be too heavweight for the case of vIOMMU and for the
> driver that want to build new mapping, we need addnop etc...
>
> Thanks
>

I feel this can help some workloads but this can wait, for sure.


> >

2020-02-05 09:27:09

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 05, 2020 at 03:50:14PM +0800, Jason Wang wrote:
>
> On 2020/2/5 下午3:15, Shahaf Shuler wrote:
> > Wednesday, February 5, 2020 4:03 AM, Tiwei Bie:
> > > Subject: Re: [PATCH] vhost: introduce vDPA based backend
> > >
> > > On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> > > > On 2020/1/31 上午11:36, Tiwei Bie wrote:
> > > > > This patch introduces a vDPA based vhost backend. This backend is
> > > > > built on top of the same interface defined in virtio-vDPA and
> > > > > provides a generic vhost interface for userspace to accelerate the
> > > > > virtio devices in guest.
> > > > >
> > > > > This backend is implemented as a vDPA device driver on top of the
> > > > > same ops used in virtio-vDPA. It will create char device entry named
> > > > > vhost-vdpa/$vdpa_device_index for userspace to use. Userspace can
> > > > > use vhost ioctls on top of this char device to setup the backend.
> > > > >
> > > > > Signed-off-by: Tiwei Bie <[email protected]>
> > [...]
> >
> > > > > +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v) {
> > > > > + /* TODO: fix this */
> > > >
> > > > Before trying to do this it looks to me we need the following during
> > > > the probe
> > > >
> > > > 1) if set_map() is not supported by the vDPA device probe the IOMMU
> > > > that is supported by the vDPA device
> > > > 2) allocate IOMMU domain
> > > >
> > > > And then:
> > > >
> > > > 3) pin pages through GUP and do proper accounting
> > > > 4) store GPA->HPA mapping in the umem
> > > > 5) generate diffs of memory table and using IOMMU API to setup the dma
> > > > mapping in this method
> > > >
> > > > For 1), I'm not sure parent is sufficient for to doing this or need to
> > > > introduce new API like iommu_device in mdev.
> > > Agree. We may also need to introduce something like the iommu_device.
> > >
> > Would it be better for the map/umnap logic to happen inside each device ?
> > Devices that needs the IOMMU will call iommu APIs from inside the driver callback.
>
>
> Technically, this can work. But if it can be done by vhost-vpda it will make
> the vDPA driver more compact and easier to be implemented.
>
>
> > Devices that has other ways to do the DMA mapping will call the proprietary APIs.
>
>
> To confirm, do you prefer:
>
> 1) map/unmap
>
> or
>
> 2) pass all maps at one time?
>
> Thanks
>
>

I mean we really already have both right? ATM 1 is used with an iommu
and 2 without. I guess we can also have drivers ask for either or both
...

--
MST

2020-02-05 09:31:23

by Shahaf Shuler

[permalink] [raw]
Subject: RE: [PATCH] vhost: introduce vDPA based backend

Wednesday, February 5, 2020 9:50 AM, Jason Wang:
> Subject: Re: [PATCH] vhost: introduce vDPA based backend
> On 2020/2/5 下午3:15, Shahaf Shuler wrote:
> > Wednesday, February 5, 2020 4:03 AM, Tiwei Bie:
> >> Subject: Re: [PATCH] vhost: introduce vDPA based backend
> >>
> >> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> >>> On 2020/1/31 上午11:36, Tiwei Bie wrote:
> >>>> This patch introduces a vDPA based vhost backend. This backend is
> >>>> built on top of the same interface defined in virtio-vDPA and
> >>>> provides a generic vhost interface for userspace to accelerate the
> >>>> virtio devices in guest.
> >>>>
> >>>> This backend is implemented as a vDPA device driver on top of the
> >>>> same ops used in virtio-vDPA. It will create char device entry
> >>>> named vhost-vdpa/$vdpa_device_index for userspace to use.
> Userspace
> >>>> can use vhost ioctls on top of this char device to setup the backend.
> >>>>
> >>>> Signed-off-by: Tiwei Bie <[email protected]>
> > [...]
> >
> >>>> +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v) {
> >>>> + /* TODO: fix this */
> >>>
> >>> Before trying to do this it looks to me we need the following during
> >>> the probe
> >>>
> >>> 1) if set_map() is not supported by the vDPA device probe the IOMMU
> >>> that is supported by the vDPA device
> >>> 2) allocate IOMMU domain
> >>>
> >>> And then:
> >>>
> >>> 3) pin pages through GUP and do proper accounting
> >>> 4) store GPA->HPA mapping in the umem
> >>> 5) generate diffs of memory table and using IOMMU API to setup the
> >>> dma mapping in this method
> >>>
> >>> For 1), I'm not sure parent is sufficient for to doing this or need
> >>> to introduce new API like iommu_device in mdev.
> >> Agree. We may also need to introduce something like the iommu_device.
> >>
> > Would it be better for the map/umnap logic to happen inside each device ?
> > Devices that needs the IOMMU will call iommu APIs from inside the driver
> callback.
>
>
> Technically, this can work. But if it can be done by vhost-vpda it will make the
> vDPA driver more compact and easier to be implemented.

Need to see the layering of such proposal but am not sure.
Vhost-vdpa is generic framework, while the DMA mapping is vendor specific.
Maybe vhost-vdpa can have some shared code needed to operate on iommu, so drivers can re-use it. to me it seems simpler than exposing a new iommu device.

>
>
> > Devices that has other ways to do the DMA mapping will call the
> proprietary APIs.
>
>
> To confirm, do you prefer:
>
> 1) map/unmap

It is not only that. AFAIR there also flush and invalidate calls, right?

>
> or
>
> 2) pass all maps at one time?

To me this seems more straight forward.
It is correct that under hotplug and large number of memory segments the driver will need to understand the diff (or not and just reload the new configuration). However, my assumption here is that memory hotplug is heavy flow anyway, and the driver extra cycles will not be that visible

>
> Thanks
>
>
> >

2020-02-05 10:35:06

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 05, 2020 at 09:30:14AM +0000, Shahaf Shuler wrote:
> Wednesday, February 5, 2020 9:50 AM, Jason Wang:
> > Subject: Re: [PATCH] vhost: introduce vDPA based backend
> > On 2020/2/5 下午3:15, Shahaf Shuler wrote:
> > > Wednesday, February 5, 2020 4:03 AM, Tiwei Bie:
> > >> Subject: Re: [PATCH] vhost: introduce vDPA based backend
> > >>
> > >> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
> > >>> On 2020/1/31 上午11:36, Tiwei Bie wrote:
> > >>>> This patch introduces a vDPA based vhost backend. This backend is
> > >>>> built on top of the same interface defined in virtio-vDPA and
> > >>>> provides a generic vhost interface for userspace to accelerate the
> > >>>> virtio devices in guest.
> > >>>>
> > >>>> This backend is implemented as a vDPA device driver on top of the
> > >>>> same ops used in virtio-vDPA. It will create char device entry
> > >>>> named vhost-vdpa/$vdpa_device_index for userspace to use.
> > Userspace
> > >>>> can use vhost ioctls on top of this char device to setup the backend.
> > >>>>
> > >>>> Signed-off-by: Tiwei Bie <[email protected]>
> > > [...]
> > >
> > >>>> +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v) {
> > >>>> + /* TODO: fix this */
> > >>>
> > >>> Before trying to do this it looks to me we need the following during
> > >>> the probe
> > >>>
> > >>> 1) if set_map() is not supported by the vDPA device probe the IOMMU
> > >>> that is supported by the vDPA device
> > >>> 2) allocate IOMMU domain
> > >>>
> > >>> And then:
> > >>>
> > >>> 3) pin pages through GUP and do proper accounting
> > >>> 4) store GPA->HPA mapping in the umem
> > >>> 5) generate diffs of memory table and using IOMMU API to setup the
> > >>> dma mapping in this method
> > >>>
> > >>> For 1), I'm not sure parent is sufficient for to doing this or need
> > >>> to introduce new API like iommu_device in mdev.
> > >> Agree. We may also need to introduce something like the iommu_device.
> > >>
> > > Would it be better for the map/umnap logic to happen inside each device ?
> > > Devices that needs the IOMMU will call iommu APIs from inside the driver
> > callback.
> >
> >
> > Technically, this can work. But if it can be done by vhost-vpda it will make the
> > vDPA driver more compact and easier to be implemented.
>
> Need to see the layering of such proposal but am not sure.
> Vhost-vdpa is generic framework, while the DMA mapping is vendor specific.
> Maybe vhost-vdpa can have some shared code needed to operate on iommu, so drivers can re-use it. to me it seems simpler than exposing a new iommu device.
>
> >
> >
> > > Devices that has other ways to do the DMA mapping will call the
> > proprietary APIs.
> >
> >
> > To confirm, do you prefer:
> >
> > 1) map/unmap
>
> It is not only that. AFAIR there also flush and invalidate calls, right?
>
> >
> > or
> >
> > 2) pass all maps at one time?
>
> To me this seems more straight forward.
> It is correct that under hotplug and large number of memory segments
> the driver will need to understand the diff (or not and just reload
> the new configuration).
> However, my assumption here is that memory
> hotplug is heavy flow anyway, and the driver extra cycles will not be
> that visible

I think we can just allow both, after all vhost already has both interfaces ...
We just need a flag that tells userspace whether it needs to
update all maps aggressively or can wait for a fault.

> >
> > Thanks
> >
> >
> > >
>

2020-02-05 12:59:03

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 05, 2020 at 03:50:14PM +0800, Jason Wang wrote:
> > Would it be better for the map/umnap logic to happen inside each device ?
> > Devices that needs the IOMMU will call iommu APIs from inside the driver callback.
>
> Technically, this can work. But if it can be done by vhost-vpda it will make
> the vDPA driver more compact and easier to be implemented.

Generally speaking, in the kernel, it is normal to not hoist code of
out drivers into subsystems until 2-3 drivers are duplicating that
code. It helps ensure the right design is used

Jason

2020-02-05 13:17:28

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 05, 2020 at 08:56:48AM -0400, Jason Gunthorpe wrote:
> On Wed, Feb 05, 2020 at 03:50:14PM +0800, Jason Wang wrote:
> > > Would it be better for the map/umnap logic to happen inside each device ?
> > > Devices that needs the IOMMU will call iommu APIs from inside the driver callback.
> >
> > Technically, this can work. But if it can be done by vhost-vpda it will make
> > the vDPA driver more compact and easier to be implemented.
>
> Generally speaking, in the kernel, it is normal to not hoist code of
> out drivers into subsystems until 2-3 drivers are duplicating that
> code. It helps ensure the right design is used
>
> Jason

That's up to the sybsystem maintainer really, as there's also some
intuition involved in guessing a specific API is widely useful.
In-kernel APIs are flexible, if we find something isn't needed we just
drop it.

--
MST

2020-02-06 03:10:00

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 下午5:23, Michael S. Tsirkin wrote:
> On Wed, Feb 05, 2020 at 03:50:14PM +0800, Jason Wang wrote:
>> On 2020/2/5 下午3:15, Shahaf Shuler wrote:
>>> Wednesday, February 5, 2020 4:03 AM, Tiwei Bie:
>>>> Subject: Re: [PATCH] vhost: introduce vDPA based backend
>>>>
>>>> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
>>>>> On 2020/1/31 上午11:36, Tiwei Bie wrote:
>>>>>> This patch introduces a vDPA based vhost backend. This backend is
>>>>>> built on top of the same interface defined in virtio-vDPA and
>>>>>> provides a generic vhost interface for userspace to accelerate the
>>>>>> virtio devices in guest.
>>>>>>
>>>>>> This backend is implemented as a vDPA device driver on top of the
>>>>>> same ops used in virtio-vDPA. It will create char device entry named
>>>>>> vhost-vdpa/$vdpa_device_index for userspace to use. Userspace can
>>>>>> use vhost ioctls on top of this char device to setup the backend.
>>>>>>
>>>>>> Signed-off-by: Tiwei Bie<[email protected]>
>>> [...]
>>>
>>>>>> +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v) {
>>>>>> + /* TODO: fix this */
>>>>> Before trying to do this it looks to me we need the following during
>>>>> the probe
>>>>>
>>>>> 1) if set_map() is not supported by the vDPA device probe the IOMMU
>>>>> that is supported by the vDPA device
>>>>> 2) allocate IOMMU domain
>>>>>
>>>>> And then:
>>>>>
>>>>> 3) pin pages through GUP and do proper accounting
>>>>> 4) store GPA->HPA mapping in the umem
>>>>> 5) generate diffs of memory table and using IOMMU API to setup the dma
>>>>> mapping in this method
>>>>>
>>>>> For 1), I'm not sure parent is sufficient for to doing this or need to
>>>>> introduce new API like iommu_device in mdev.
>>>> Agree. We may also need to introduce something like the iommu_device.
>>>>
>>> Would it be better for the map/umnap logic to happen inside each device ?
>>> Devices that needs the IOMMU will call iommu APIs from inside the driver callback.
>> Technically, this can work. But if it can be done by vhost-vpda it will make
>> the vDPA driver more compact and easier to be implemented.
>>
>>
>>> Devices that has other ways to do the DMA mapping will call the proprietary APIs.
>> To confirm, do you prefer:
>>
>> 1) map/unmap
>>
>> or
>>
>> 2) pass all maps at one time?
>>
>> Thanks
>>
>>
> I mean we really already have both right? ATM 1 is used with an iommu
> and 2 without. I guess we can also have drivers ask for either or both
> ...


Yes, that looks better.

Thanks

2020-02-06 03:22:26

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 下午6:33, Michael S. Tsirkin wrote:
> On Wed, Feb 05, 2020 at 09:30:14AM +0000, Shahaf Shuler wrote:
>> Wednesday, February 5, 2020 9:50 AM, Jason Wang:
>>> Subject: Re: [PATCH] vhost: introduce vDPA based backend
>>> On 2020/2/5 下午3:15, Shahaf Shuler wrote:
>>>> Wednesday, February 5, 2020 4:03 AM, Tiwei Bie:
>>>>> Subject: Re: [PATCH] vhost: introduce vDPA based backend
>>>>>
>>>>> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
>>>>>> On 2020/1/31 上午11:36, Tiwei Bie wrote:
>>>>>>> This patch introduces a vDPA based vhost backend. This backend is
>>>>>>> built on top of the same interface defined in virtio-vDPA and
>>>>>>> provides a generic vhost interface for userspace to accelerate the
>>>>>>> virtio devices in guest.
>>>>>>>
>>>>>>> This backend is implemented as a vDPA device driver on top of the
>>>>>>> same ops used in virtio-vDPA. It will create char device entry
>>>>>>> named vhost-vdpa/$vdpa_device_index for userspace to use.
>>> Userspace
>>>>>>> can use vhost ioctls on top of this char device to setup the backend.
>>>>>>>
>>>>>>> Signed-off-by: Tiwei Bie<[email protected]>
>>>> [...]
>>>>
>>>>>>> +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v) {
>>>>>>> + /* TODO: fix this */
>>>>>> Before trying to do this it looks to me we need the following during
>>>>>> the probe
>>>>>>
>>>>>> 1) if set_map() is not supported by the vDPA device probe the IOMMU
>>>>>> that is supported by the vDPA device
>>>>>> 2) allocate IOMMU domain
>>>>>>
>>>>>> And then:
>>>>>>
>>>>>> 3) pin pages through GUP and do proper accounting
>>>>>> 4) store GPA->HPA mapping in the umem
>>>>>> 5) generate diffs of memory table and using IOMMU API to setup the
>>>>>> dma mapping in this method
>>>>>>
>>>>>> For 1), I'm not sure parent is sufficient for to doing this or need
>>>>>> to introduce new API like iommu_device in mdev.
>>>>> Agree. We may also need to introduce something like the iommu_device.
>>>>>
>>>> Would it be better for the map/umnap logic to happen inside each device ?
>>>> Devices that needs the IOMMU will call iommu APIs from inside the driver
>>> callback.
>>>
>>>
>>> Technically, this can work. But if it can be done by vhost-vpda it will make the
>>> vDPA driver more compact and easier to be implemented.
>> Need to see the layering of such proposal but am not sure.
>> Vhost-vdpa is generic framework, while the DMA mapping is vendor specific.
>> Maybe vhost-vdpa can have some shared code needed to operate on iommu, so drivers can re-use it. to me it seems simpler than exposing a new iommu device.
>>
>>>> Devices that has other ways to do the DMA mapping will call the
>>> proprietary APIs.
>>>
>>>
>>> To confirm, do you prefer:
>>>
>>> 1) map/unmap
>> It is not only that. AFAIR there also flush and invalidate calls, right?
>>
>>> or
>>>
>>> 2) pass all maps at one time?
>> To me this seems more straight forward.
>> It is correct that under hotplug and large number of memory segments
>> the driver will need to understand the diff (or not and just reload
>> the new configuration).
>> However, my assumption here is that memory
>> hotplug is heavy flow anyway, and the driver extra cycles will not be
>> that visible
> I think we can just allow both, after all vhost already has both interfaces ...
> We just need a flag that tells userspace whether it needs to
> update all maps aggressively or can wait for a fault.


It looks to me such flag is not a must and we can introduce it later
when device support page fault.

Thanks


2020-02-06 03:24:44

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 下午9:14, Michael S. Tsirkin wrote:
> On Wed, Feb 05, 2020 at 08:56:48AM -0400, Jason Gunthorpe wrote:
>> On Wed, Feb 05, 2020 at 03:50:14PM +0800, Jason Wang wrote:
>>>> Would it be better for the map/umnap logic to happen inside each device ?
>>>> Devices that needs the IOMMU will call iommu APIs from inside the driver callback.
>>> Technically, this can work. But if it can be done by vhost-vpda it will make
>>> the vDPA driver more compact and easier to be implemented.
>> Generally speaking, in the kernel, it is normal to not hoist code of
>> out drivers into subsystems until 2-3 drivers are duplicating that
>> code. It helps ensure the right design is used
>>
>> Jason
> That's up to the sybsystem maintainer really, as there's also some
> intuition involved in guessing a specific API is widely useful.
> In-kernel APIs are flexible, if we find something isn't needed we just
> drop it.
>

If I understand correctly. At least Intel (Ling Shan) and Brodcom (Rob)
doesn't want to deal with DMA stuffs in their driver.

Anyway since the DMA bus operations is optional, driver may still choose
to do DMA by itself if they want even if it requires platform IOMMU to work.

Thanks

2020-02-06 03:25:58

by Zhu Lingshan

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2/6/2020 11:11 AM, Jason Wang wrote:
>
> On 2020/2/5 下午9:14, Michael S. Tsirkin wrote:
>> On Wed, Feb 05, 2020 at 08:56:48AM -0400, Jason Gunthorpe wrote:
>>> On Wed, Feb 05, 2020 at 03:50:14PM +0800, Jason Wang wrote:
>>>>> Would it be better for the map/umnap logic to happen inside each
>>>>> device ?
>>>>> Devices that needs the IOMMU will call iommu APIs from inside the
>>>>> driver callback.
>>>> Technically, this can work. But if it can be done by vhost-vpda it
>>>> will make
>>>> the vDPA driver more compact and easier to be implemented.
>>> Generally speaking, in the kernel, it is normal to not hoist code of
>>> out drivers into subsystems until 2-3 drivers are duplicating that
>>> code. It helps ensure the right design is used
>>>
>>> Jason
>> That's up to the sybsystem maintainer really, as there's also some
>> intuition involved in guessing a specific API is widely useful.
>> In-kernel APIs are flexible, if we find something isn't needed we just
>> drop it.
>>
>
> If I understand correctly. At least Intel (Ling Shan) and Brodcom
> (Rob) doesn't want to deal with DMA stuffs in their driver.
>
> Anyway since the DMA bus operations is optional, driver may still
> choose to do DMA by itself if they want even if it requires platform
> IOMMU to work.
>
> Thanks
>
Many Thanks if this could be done. The parent device has DMA
capabilities and dma ops implemented, we hope can make use of it, as
discussed in the vdpa thread.

2020-02-06 03:38:43

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend


On 2020/2/5 下午5:30, Shahaf Shuler wrote:
> Wednesday, February 5, 2020 9:50 AM, Jason Wang:
>> Subject: Re: [PATCH] vhost: introduce vDPA based backend
>> On 2020/2/5 下午3:15, Shahaf Shuler wrote:
>>> Wednesday, February 5, 2020 4:03 AM, Tiwei Bie:
>>>> Subject: Re: [PATCH] vhost: introduce vDPA based backend
>>>>
>>>> On Tue, Feb 04, 2020 at 11:30:11AM +0800, Jason Wang wrote:
>>>>> On 2020/1/31 上午11:36, Tiwei Bie wrote:
>>>>>> This patch introduces a vDPA based vhost backend. This backend is
>>>>>> built on top of the same interface defined in virtio-vDPA and
>>>>>> provides a generic vhost interface for userspace to accelerate the
>>>>>> virtio devices in guest.
>>>>>>
>>>>>> This backend is implemented as a vDPA device driver on top of the
>>>>>> same ops used in virtio-vDPA. It will create char device entry
>>>>>> named vhost-vdpa/$vdpa_device_index for userspace to use.
>> Userspace
>>>>>> can use vhost ioctls on top of this char device to setup the backend.
>>>>>>
>>>>>> Signed-off-by: Tiwei Bie <[email protected]>
>>> [...]
>>>
>>>>>> +static long vhost_vdpa_do_dma_mapping(struct vhost_vdpa *v) {
>>>>>> + /* TODO: fix this */
>>>>> Before trying to do this it looks to me we need the following during
>>>>> the probe
>>>>>
>>>>> 1) if set_map() is not supported by the vDPA device probe the IOMMU
>>>>> that is supported by the vDPA device
>>>>> 2) allocate IOMMU domain
>>>>>
>>>>> And then:
>>>>>
>>>>> 3) pin pages through GUP and do proper accounting
>>>>> 4) store GPA->HPA mapping in the umem
>>>>> 5) generate diffs of memory table and using IOMMU API to setup the
>>>>> dma mapping in this method
>>>>>
>>>>> For 1), I'm not sure parent is sufficient for to doing this or need
>>>>> to introduce new API like iommu_device in mdev.
>>>> Agree. We may also need to introduce something like the iommu_device.
>>>>
>>> Would it be better for the map/umnap logic to happen inside each device ?
>>> Devices that needs the IOMMU will call iommu APIs from inside the driver
>> callback.
>>
>>
>> Technically, this can work. But if it can be done by vhost-vpda it will make the
>> vDPA driver more compact and easier to be implemented.
> Need to see the layering of such proposal but am not sure.
> Vhost-vdpa is generic framework, while the DMA mapping is vendor specific.
> Maybe vhost-vdpa can have some shared code needed to operate on iommu, so drivers can re-use it. to me it seems simpler than exposing a new iommu device.


I think you mean on-chip IOMMU here. For shared code, I guess this only
thing that could be shared is the mapping (rbtree) and some helpers. Or
is there any other in your mind?


>>
>>> Devices that has other ways to do the DMA mapping will call the
>> proprietary APIs.
>>
>>
>> To confirm, do you prefer:
>>
>> 1) map/unmap
> It is not only that. AFAIR there also flush and invalidate calls, right?


unmap will accept a range so it it can do flush and invalidate.


>
>> or
>>
>> 2) pass all maps at one time?
> To me this seems more straight forward.
> It is correct that under hotplug and large number of memory segments the driver will need to understand the diff (or not and just reload the new configuration). However, my assumption here is that memory hotplug is heavy flow anyway, and the driver extra cycles will not be that visible


Yes, and vhost can provide helpers to generate the diffs.

Thanks


>
>> Thanks
>>
>>

2020-02-18 13:54:47

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Fri, Jan 31, 2020 at 11:36:51AM +0800, Tiwei Bie wrote:

> +static int vhost_vdpa_alloc_minor(struct vhost_vdpa *v)
> +{
> + return idr_alloc(&vhost_vdpa.idr, v, 0, MINORMASK + 1,
> + GFP_KERNEL);
> +}

Please don't use idr in new code, use xarray directly

> +static int vhost_vdpa_probe(struct device *dev)
> +{
> + struct vdpa_device *vdpa = dev_to_vdpa(dev);
> + const struct vdpa_config_ops *ops = vdpa->config;
> + struct vhost_vdpa *v;
> + struct device *d;
> + int minor, nvqs;
> + int r;
> +
> + /* Currently, we only accept the network devices. */
> + if (ops->get_device_id(vdpa) != VIRTIO_ID_NET) {
> + r = -ENOTSUPP;
> + goto err;
> + }
> +
> + v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> + if (!v) {
> + r = -ENOMEM;
> + goto err;
> + }
> +
> + nvqs = VHOST_VDPA_VQ_MAX;
> +
> + v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue),
> + GFP_KERNEL);
> + if (!v->vqs) {
> + r = -ENOMEM;
> + goto err_alloc_vqs;
> + }
> +
> + mutex_init(&v->mutex);
> + atomic_set(&v->opened, 0);
> +
> + v->vdpa = vdpa;
> + v->nvqs = nvqs;
> + v->virtio_id = ops->get_device_id(vdpa);
> +
> + mutex_lock(&vhost_vdpa.mutex);
> +
> + minor = vhost_vdpa_alloc_minor(v);
> + if (minor < 0) {
> + r = minor;
> + goto err_alloc_minor;
> + }
> +
> + d = device_create(vhost_vdpa.class, NULL,
> + MKDEV(MAJOR(vhost_vdpa.devt), minor),
> + v, "%d", vdpa->index);
> + if (IS_ERR(d)) {
> + r = PTR_ERR(d);
> + goto err_device_create;
> + }
> +

I can't understand what this messing around with major/minor numbers
does. Without allocating a cdev via cdev_add/etc there is only a
single char dev in existence here. This and the stuff in
vhost_vdpa_open() looks non-functional.

> +static void vhost_vdpa_remove(struct device *dev)
> +{
> + DEFINE_WAIT_FUNC(wait, woken_wake_function);
> + struct vhost_vdpa *v = dev_get_drvdata(dev);
> + int opened;
> +
> + add_wait_queue(&vhost_vdpa.release_q, &wait);
> +
> + do {
> + opened = atomic_cmpxchg(&v->opened, 0, 1);
> + if (!opened)
> + break;
> + wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
> + } while (1);
> +
> + remove_wait_queue(&vhost_vdpa.release_q, &wait);

*barf* use the normal refcount pattern please

read side:

refcount_inc_not_zero(uses)
//stuff
if (refcount_dec_and_test(uses))
complete(completer)

destroy side:
if (refcount_dec_and_test(uses))
complete(completer)
wait_for_completion(completer)
// refcount now permanently == 0

Use a completion in driver code

> + mutex_lock(&vhost_vdpa.mutex);
> + device_destroy(vhost_vdpa.class,
> + MKDEV(MAJOR(vhost_vdpa.devt), v->minor));
> + vhost_vdpa_free_minor(v->minor);
> + mutex_unlock(&vhost_vdpa.mutex);
> + kfree(v->vqs);
> + kfree(v);

This use after-fress vs vhost_vdpa_open prior to it setting the open
bit. Maybe use xarray, rcu and kfree_rcu ..

> +static int __init vhost_vdpa_init(void)
> +{
> + int r;
> +
> + idr_init(&vhost_vdpa.idr);
> + mutex_init(&vhost_vdpa.mutex);
> + init_waitqueue_head(&vhost_vdpa.release_q);
> +
> + /* /dev/vhost-vdpa/$vdpa_device_index */
> + vhost_vdpa.class = class_create(THIS_MODULE, "vhost-vdpa");
> + if (IS_ERR(vhost_vdpa.class)) {
> + r = PTR_ERR(vhost_vdpa.class);
> + goto err_class;
> + }
> +
> + vhost_vdpa.class->devnode = vhost_vdpa_devnode;
> +
> + r = alloc_chrdev_region(&vhost_vdpa.devt, 0, MINORMASK + 1,
> + "vhost-vdpa");
> + if (r)
> + goto err_alloc_chrdev;
> +
> + cdev_init(&vhost_vdpa.cdev, &vhost_vdpa_fops);
> + r = cdev_add(&vhost_vdpa.cdev, vhost_vdpa.devt, MINORMASK + 1);
> + if (r)
> + goto err_cdev_add;

It is very strange, is the intention to create a single global char
dev?

If so, why is there this:

+static int vhost_vdpa_open(struct inode *inode, struct file *filep)
+{
+ struct vhost_vdpa *v;
+ struct vhost_dev *dev;
+ struct vhost_virtqueue **vqs;
+ int nvqs, i, r, opened;
+
+ v = vhost_vdpa_get_from_minor(iminor(inode));

?

If the idea is to create a per-vdpa char dev then this stuff belongs
in vhost_vdpa_probe(), the cdev should be part of the vhost_vdpa, and
the above should be container_of not an idr lookup.

Jason

2020-02-19 02:53:44

by Tiwei Bie

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Tue, Feb 18, 2020 at 09:53:59AM -0400, Jason Gunthorpe wrote:
> On Fri, Jan 31, 2020 at 11:36:51AM +0800, Tiwei Bie wrote:
>
> > +static int vhost_vdpa_alloc_minor(struct vhost_vdpa *v)
> > +{
> > + return idr_alloc(&vhost_vdpa.idr, v, 0, MINORMASK + 1,
> > + GFP_KERNEL);
> > +}
>
> Please don't use idr in new code, use xarray directly
>
> > +static int vhost_vdpa_probe(struct device *dev)
> > +{
> > + struct vdpa_device *vdpa = dev_to_vdpa(dev);
> > + const struct vdpa_config_ops *ops = vdpa->config;
> > + struct vhost_vdpa *v;
> > + struct device *d;
> > + int minor, nvqs;
> > + int r;
> > +
> > + /* Currently, we only accept the network devices. */
> > + if (ops->get_device_id(vdpa) != VIRTIO_ID_NET) {
> > + r = -ENOTSUPP;
> > + goto err;
> > + }
> > +
> > + v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> > + if (!v) {
> > + r = -ENOMEM;
> > + goto err;
> > + }
> > +
> > + nvqs = VHOST_VDPA_VQ_MAX;
> > +
> > + v->vqs = kmalloc_array(nvqs, sizeof(struct vhost_virtqueue),
> > + GFP_KERNEL);
> > + if (!v->vqs) {
> > + r = -ENOMEM;
> > + goto err_alloc_vqs;
> > + }
> > +
> > + mutex_init(&v->mutex);
> > + atomic_set(&v->opened, 0);
> > +
> > + v->vdpa = vdpa;
> > + v->nvqs = nvqs;
> > + v->virtio_id = ops->get_device_id(vdpa);
> > +
> > + mutex_lock(&vhost_vdpa.mutex);
> > +
> > + minor = vhost_vdpa_alloc_minor(v);
> > + if (minor < 0) {
> > + r = minor;
> > + goto err_alloc_minor;
> > + }
> > +
> > + d = device_create(vhost_vdpa.class, NULL,
> > + MKDEV(MAJOR(vhost_vdpa.devt), minor),
> > + v, "%d", vdpa->index);
> > + if (IS_ERR(d)) {
> > + r = PTR_ERR(d);
> > + goto err_device_create;
> > + }
> > +
>
> I can't understand what this messing around with major/minor numbers
> does. Without allocating a cdev via cdev_add/etc there is only a
> single char dev in existence here. This and the stuff in
> vhost_vdpa_open() looks non-functional.

I followed the code in VFIO. Please see more details below.

>
> > +static void vhost_vdpa_remove(struct device *dev)
> > +{
> > + DEFINE_WAIT_FUNC(wait, woken_wake_function);
> > + struct vhost_vdpa *v = dev_get_drvdata(dev);
> > + int opened;
> > +
> > + add_wait_queue(&vhost_vdpa.release_q, &wait);
> > +
> > + do {
> > + opened = atomic_cmpxchg(&v->opened, 0, 1);
> > + if (!opened)
> > + break;
> > + wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
> > + } while (1);
> > +
> > + remove_wait_queue(&vhost_vdpa.release_q, &wait);
>
> *barf* use the normal refcount pattern please
>
> read side:
>
> refcount_inc_not_zero(uses)
> //stuff
> if (refcount_dec_and_test(uses))
> complete(completer)
>
> destroy side:
> if (refcount_dec_and_test(uses))
> complete(completer)
> wait_for_completion(completer)
> // refcount now permanently == 0
>
> Use a completion in driver code
>
> > + mutex_lock(&vhost_vdpa.mutex);
> > + device_destroy(vhost_vdpa.class,
> > + MKDEV(MAJOR(vhost_vdpa.devt), v->minor));
> > + vhost_vdpa_free_minor(v->minor);
> > + mutex_unlock(&vhost_vdpa.mutex);
> > + kfree(v->vqs);
> > + kfree(v);
>
> This use after-fress vs vhost_vdpa_open prior to it setting the open
> bit. Maybe use xarray, rcu and kfree_rcu ..
>
> > +static int __init vhost_vdpa_init(void)
> > +{
> > + int r;
> > +
> > + idr_init(&vhost_vdpa.idr);
> > + mutex_init(&vhost_vdpa.mutex);
> > + init_waitqueue_head(&vhost_vdpa.release_q);
> > +
> > + /* /dev/vhost-vdpa/$vdpa_device_index */
> > + vhost_vdpa.class = class_create(THIS_MODULE, "vhost-vdpa");
> > + if (IS_ERR(vhost_vdpa.class)) {
> > + r = PTR_ERR(vhost_vdpa.class);
> > + goto err_class;
> > + }
> > +
> > + vhost_vdpa.class->devnode = vhost_vdpa_devnode;
> > +
> > + r = alloc_chrdev_region(&vhost_vdpa.devt, 0, MINORMASK + 1,
> > + "vhost-vdpa");
> > + if (r)
> > + goto err_alloc_chrdev;
> > +
> > + cdev_init(&vhost_vdpa.cdev, &vhost_vdpa_fops);
> > + r = cdev_add(&vhost_vdpa.cdev, vhost_vdpa.devt, MINORMASK + 1);
> > + if (r)
> > + goto err_cdev_add;
>
> It is very strange, is the intention to create a single global char
> dev?

No. It's to create a per-vdpa char dev named
vhost-vdpa/$vdpa_device_index in dev.

I followed the code in VFIO which creates char dev
vfio/$GROUP dynamically, e.g.:

https://github.com/torvalds/linux/blob/b1da3acc781c/drivers/vfio/vfio.c#L2164-L2180
https://github.com/torvalds/linux/blob/b1da3acc781c/drivers/vfio/vfio.c#L373-L387
https://github.com/torvalds/linux/blob/b1da3acc781c/drivers/vfio/vfio.c#L1553

Is it something unwanted?

Thanks for the review.

Regards,
Tiwei

>
> If so, why is there this:
>
> +static int vhost_vdpa_open(struct inode *inode, struct file *filep)
> +{
> + struct vhost_vdpa *v;
> + struct vhost_dev *dev;
> + struct vhost_virtqueue **vqs;
> + int nvqs, i, r, opened;
> +
> + v = vhost_vdpa_get_from_minor(iminor(inode));
>
> ?
>
> If the idea is to create a per-vdpa char dev then this stuff belongs
> in vhost_vdpa_probe(), the cdev should be part of the vhost_vdpa, and
> the above should be container_of not an idr lookup.
>
> Jason

2020-02-19 13:11:21

by Jason Gunthorpe

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 19, 2020 at 10:52:38AM +0800, Tiwei Bie wrote:
> > > +static int __init vhost_vdpa_init(void)
> > > +{
> > > + int r;
> > > +
> > > + idr_init(&vhost_vdpa.idr);
> > > + mutex_init(&vhost_vdpa.mutex);
> > > + init_waitqueue_head(&vhost_vdpa.release_q);
> > > +
> > > + /* /dev/vhost-vdpa/$vdpa_device_index */
> > > + vhost_vdpa.class = class_create(THIS_MODULE, "vhost-vdpa");
> > > + if (IS_ERR(vhost_vdpa.class)) {
> > > + r = PTR_ERR(vhost_vdpa.class);
> > > + goto err_class;
> > > + }
> > > +
> > > + vhost_vdpa.class->devnode = vhost_vdpa_devnode;
> > > +
> > > + r = alloc_chrdev_region(&vhost_vdpa.devt, 0, MINORMASK + 1,
> > > + "vhost-vdpa");
> > > + if (r)
> > > + goto err_alloc_chrdev;
> > > +
> > > + cdev_init(&vhost_vdpa.cdev, &vhost_vdpa_fops);
> > > + r = cdev_add(&vhost_vdpa.cdev, vhost_vdpa.devt, MINORMASK + 1);
> > > + if (r)
> > > + goto err_cdev_add;
> >
> > It is very strange, is the intention to create a single global char
> > dev?
>
> No. It's to create a per-vdpa char dev named
> vhost-vdpa/$vdpa_device_index in dev.
>
> I followed the code in VFIO which creates char dev
> vfio/$GROUP dynamically, e.g.:
>
> https://github.com/torvalds/linux/blob/b1da3acc781c/drivers/vfio/vfio.c#L2164-L2180
> https://github.com/torvalds/linux/blob/b1da3acc781c/drivers/vfio/vfio.c#L373-L387
> https://github.com/torvalds/linux/blob/b1da3acc781c/drivers/vfio/vfio.c#L1553
>
> Is it something unwanted?

Yes it is unwanted. This is some special pattern for vfio's unique
needs.

Since this has a struct device for each char dev instance please use
the normal cdev_device_add() driven pattern here, or justify why it
needs to be special like this.

Jason

2020-02-20 02:43:06

by Tiwei Bie

[permalink] [raw]
Subject: Re: [PATCH] vhost: introduce vDPA based backend

On Wed, Feb 19, 2020 at 09:11:02AM -0400, Jason Gunthorpe wrote:
> On Wed, Feb 19, 2020 at 10:52:38AM +0800, Tiwei Bie wrote:
> > > > +static int __init vhost_vdpa_init(void)
> > > > +{
> > > > + int r;
> > > > +
> > > > + idr_init(&vhost_vdpa.idr);
> > > > + mutex_init(&vhost_vdpa.mutex);
> > > > + init_waitqueue_head(&vhost_vdpa.release_q);
> > > > +
> > > > + /* /dev/vhost-vdpa/$vdpa_device_index */
> > > > + vhost_vdpa.class = class_create(THIS_MODULE, "vhost-vdpa");
> > > > + if (IS_ERR(vhost_vdpa.class)) {
> > > > + r = PTR_ERR(vhost_vdpa.class);
> > > > + goto err_class;
> > > > + }
> > > > +
> > > > + vhost_vdpa.class->devnode = vhost_vdpa_devnode;
> > > > +
> > > > + r = alloc_chrdev_region(&vhost_vdpa.devt, 0, MINORMASK + 1,
> > > > + "vhost-vdpa");
> > > > + if (r)
> > > > + goto err_alloc_chrdev;
> > > > +
> > > > + cdev_init(&vhost_vdpa.cdev, &vhost_vdpa_fops);
> > > > + r = cdev_add(&vhost_vdpa.cdev, vhost_vdpa.devt, MINORMASK + 1);
> > > > + if (r)
> > > > + goto err_cdev_add;
> > >
> > > It is very strange, is the intention to create a single global char
> > > dev?
> >
> > No. It's to create a per-vdpa char dev named
> > vhost-vdpa/$vdpa_device_index in dev.
> >
> > I followed the code in VFIO which creates char dev
> > vfio/$GROUP dynamically, e.g.:
> >
> > https://github.com/torvalds/linux/blob/b1da3acc781c/drivers/vfio/vfio.c#L2164-L2180
> > https://github.com/torvalds/linux/blob/b1da3acc781c/drivers/vfio/vfio.c#L373-L387
> > https://github.com/torvalds/linux/blob/b1da3acc781c/drivers/vfio/vfio.c#L1553
> >
> > Is it something unwanted?
>
> Yes it is unwanted. This is some special pattern for vfio's unique
> needs.
>
> Since this has a struct device for each char dev instance please use
> the normal cdev_device_add() driven pattern here, or justify why it
> needs to be special like this.

I see. Thanks! I will embed the cdev in each vhost_vdpa
structure directly.

Regards,
Tiwei

>
> Jason