Hi all,
This series introduces some ways to improve VDUSE performance.
Patches 1 ~ 7 bring the current interrupt affinity spreading mechanism
to the vduse device and make it possible for the virtio-blk driver
to build its blk-mq queues based on it. This is useful to mitigate
the virtqueue lock contention in the virtio-blk driver. In our tests,
we saw a ~50% improvement (600k IOPS -> 900k IOPS) when using
per-CPU virtqueues.
Patch 8 adds a sysfs interface for each vduse virtqueue to show
the affinity and effective affinity of its IRQ callback. The
effective affinity interface is also writable so that we can
do some performance tuning when the affinity mask contains more
than one CPU.
Patch 9 adds a sysfs interface for each virtqueue to control whether
a workqueue is used to inject the irq or not. The vhost-vdpa case
can benefit from it.
Patches 10 and 11 add a sysfs interface to support specifying the
bounce buffer size in the virtio-vdpa case. High-throughput workloads
can benefit from it, and it can also be used to reduce the memory
overhead for low-throughput workloads.
Please review, thanks!
V1 to V2:
- Export irq_create_affinity_masks()
- Add set/get_vq_affinity and set_irq_affinity callbacks in vDPA
framework
- Add automatic irq callback affinity support in VDUSE driver [Jason]
- Add more background information in the commit logs [Jason]
- Only support changing effective affinity when the value is a subset
of the IRQ callback affinity mask.
Xie Yongji (11):
genirq/affinity: Export irq_create_affinity_masks()
vdpa: Add set/get_vq_affinity callbacks in vdpa_config_ops
vdpa: Add set_irq_affinity callback in vdpa_config_ops
vduse: Refactor allocation for vduse virtqueues
vduse: Introduce bound workqueue for irq injection
vduse: Support automatic irq callback affinity
vduse: Support set/get_vq_affinity callbacks
vduse: Add sysfs interface for irq callback affinity
vduse: Add enable_irq_wq sysfs interface for virtqueues
vduse: Delay iova domain creation
vduse: Support specifying bounce buffer size via sysfs
drivers/vdpa/vdpa_user/vduse_dev.c | 497 +++++++++++++++++++++++++----
drivers/virtio/virtio_vdpa.c | 32 ++
include/linux/vdpa.h | 21 ++
kernel/irq/affinity.c | 1 +
4 files changed, 488 insertions(+), 63 deletions(-)
--
2.20.1
Allocate memory for vduse virtqueues one by one instead of
doing one allocation for all of them.
This is a preparation for adding a sysfs interface for virtqueues.
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 98 ++++++++++++++++++++----------
1 file changed, 66 insertions(+), 32 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 35dceee3ed56..37809bfcb7ef 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -76,7 +76,7 @@ struct vduse_umem {
struct vduse_dev {
struct vduse_vdpa *vdev;
struct device *dev;
- struct vduse_virtqueue *vqs;
+ struct vduse_virtqueue **vqs;
struct vduse_iova_domain *domain;
char *name;
struct mutex lock;
@@ -434,7 +434,7 @@ static void vduse_dev_reset(struct vduse_dev *dev)
flush_work(&dev->inject);
for (i = 0; i < dev->vq_num; i++) {
- struct vduse_virtqueue *vq = &dev->vqs[i];
+ struct vduse_virtqueue *vq = dev->vqs[i];
vq->ready = false;
vq->desc_addr = 0;
@@ -466,7 +466,7 @@ static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
u64 device_area)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- struct vduse_virtqueue *vq = &dev->vqs[idx];
+ struct vduse_virtqueue *vq = dev->vqs[idx];
vq->desc_addr = desc_area;
vq->driver_addr = driver_area;
@@ -500,7 +500,7 @@ static void vduse_vq_kick_work(struct work_struct *work)
static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- struct vduse_virtqueue *vq = &dev->vqs[idx];
+ struct vduse_virtqueue *vq = dev->vqs[idx];
if (!eventfd_signal_allowed()) {
schedule_work(&vq->kick);
@@ -513,7 +513,7 @@ static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
struct vdpa_callback *cb)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- struct vduse_virtqueue *vq = &dev->vqs[idx];
+ struct vduse_virtqueue *vq = dev->vqs[idx];
spin_lock(&vq->irq_lock);
vq->cb.callback = cb->callback;
@@ -524,7 +524,7 @@ static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- struct vduse_virtqueue *vq = &dev->vqs[idx];
+ struct vduse_virtqueue *vq = dev->vqs[idx];
vq->num = num;
}
@@ -533,7 +533,7 @@ static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
u16 idx, bool ready)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- struct vduse_virtqueue *vq = &dev->vqs[idx];
+ struct vduse_virtqueue *vq = dev->vqs[idx];
vq->ready = ready;
}
@@ -541,7 +541,7 @@ static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- struct vduse_virtqueue *vq = &dev->vqs[idx];
+ struct vduse_virtqueue *vq = dev->vqs[idx];
return vq->ready;
}
@@ -550,7 +550,7 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
const struct vdpa_vq_state *state)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- struct vduse_virtqueue *vq = &dev->vqs[idx];
+ struct vduse_virtqueue *vq = dev->vqs[idx];
if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
vq->state.packed.last_avail_counter =
@@ -569,7 +569,7 @@ static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
struct vdpa_vq_state *state)
{
struct vduse_dev *dev = vdpa_to_vduse(vdpa);
- struct vduse_virtqueue *vq = &dev->vqs[idx];
+ struct vduse_virtqueue *vq = dev->vqs[idx];
if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
@@ -624,8 +624,8 @@ static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
int i;
for (i = 0; i < dev->vq_num; i++)
- if (num_max < dev->vqs[i].num_max)
- num_max = dev->vqs[i].num_max;
+ if (num_max < dev->vqs[i]->num_max)
+ num_max = dev->vqs[i]->num_max;
return num_max;
}
@@ -863,7 +863,7 @@ static int vduse_kickfd_setup(struct vduse_dev *dev,
return -EINVAL;
index = array_index_nospec(eventfd->index, dev->vq_num);
- vq = &dev->vqs[index];
+ vq = dev->vqs[index];
if (eventfd->fd >= 0) {
ctx = eventfd_ctx_fdget(eventfd->fd);
if (IS_ERR(ctx))
@@ -889,7 +889,7 @@ static bool vduse_dev_is_ready(struct vduse_dev *dev)
int i;
for (i = 0; i < dev->vq_num; i++)
- if (!dev->vqs[i].num_max)
+ if (!dev->vqs[i]->num_max)
return false;
return true;
@@ -1130,7 +1130,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
break;
index = array_index_nospec(config.index, dev->vq_num);
- dev->vqs[index].num_max = config.max_size;
+ dev->vqs[index]->num_max = config.max_size;
ret = 0;
break;
}
@@ -1148,7 +1148,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
break;
index = array_index_nospec(vq_info.index, dev->vq_num);
- vq = &dev->vqs[index];
+ vq = dev->vqs[index];
vq_info.desc_addr = vq->desc_addr;
vq_info.driver_addr = vq->driver_addr;
vq_info.device_addr = vq->device_addr;
@@ -1198,7 +1198,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
break;
index = array_index_nospec(index, dev->vq_num);
- ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
+ ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject);
break;
}
case VDUSE_IOTLB_REG_UMEM: {
@@ -1339,6 +1339,49 @@ static const struct file_operations vduse_dev_fops = {
.llseek = noop_llseek,
};
+static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
+{
+ int i;
+
+ if (!dev->vqs)
+ return;
+
+ for (i = 0; i < dev->vq_num; i++)
+ kfree(dev->vqs[i]);
+ kfree(dev->vqs);
+}
+
+static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
+{
+ int i;
+
+ dev->vq_align = vq_align;
+ dev->vq_num = vq_num;
+ dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
+ if (!dev->vqs)
+ return -ENOMEM;
+
+ for (i = 0; i < vq_num; i++) {
+ dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
+ if (!dev->vqs[i])
+ goto err;
+
+ dev->vqs[i]->index = i;
+ INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
+ INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
+ spin_lock_init(&dev->vqs[i]->kick_lock);
+ spin_lock_init(&dev->vqs[i]->irq_lock);
+ }
+
+ return 0;
+err:
+ while (i--)
+ kfree(dev->vqs[i]);
+ kfree(dev->vqs);
+ dev->vqs = NULL;
+ return -ENOMEM;
+}
+
static struct vduse_dev *vduse_dev_create(void)
{
struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
@@ -1396,7 +1439,7 @@ static int vduse_destroy_dev(char *name)
device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
idr_remove(&vduse_idr, dev->minor);
kvfree(dev->config);
- kfree(dev->vqs);
+ vduse_dev_deinit_vqs(dev);
vduse_domain_destroy(dev->domain);
kfree(dev->name);
vduse_dev_destroy(dev);
@@ -1483,7 +1526,7 @@ ATTRIBUTE_GROUPS(vduse_dev);
static int vduse_create_dev(struct vduse_dev_config *config,
void *config_buf, u64 api_version)
{
- int i, ret;
+ int ret;
struct vduse_dev *dev;
ret = -EEXIST;
@@ -1510,19 +1553,10 @@ static int vduse_create_dev(struct vduse_dev_config *config,
dev->config = config_buf;
dev->config_size = config->config_size;
- dev->vq_align = config->vq_align;
- dev->vq_num = config->vq_num;
- dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
- if (!dev->vqs)
- goto err_vqs;
- for (i = 0; i < dev->vq_num; i++) {
- dev->vqs[i].index = i;
- INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject);
- INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work);
- spin_lock_init(&dev->vqs[i].kick_lock);
- spin_lock_init(&dev->vqs[i].irq_lock);
- }
+ ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
+ if (ret)
+ goto err_vqs;
ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
if (ret < 0)
@@ -1543,7 +1577,7 @@ static int vduse_create_dev(struct vduse_dev_config *config,
err_dev:
idr_remove(&vduse_idr, dev->minor);
err_idr:
- kfree(dev->vqs);
+ vduse_dev_deinit_vqs(dev);
err_vqs:
vduse_domain_destroy(dev->domain);
err_domain:
--
2.20.1
This brings the current interrupt affinity spreading mechanism to the
vduse device. We make use of irq_create_affinity_masks() to create
an irq callback affinity mask for each virtqueue of the vduse device.
Then we choose the CPU with the lowest number of allocated interrupts
within the affinity mask to run the irq callback.
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 50 ++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index d126f3e32a20..90c2896039d9 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -23,6 +23,7 @@
#include <linux/nospec.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
+#include <linux/interrupt.h>
#include <uapi/linux/vduse.h>
#include <uapi/linux/vdpa.h>
#include <uapi/linux/virtio_config.h>
@@ -58,6 +59,8 @@ struct vduse_virtqueue {
struct work_struct inject;
struct work_struct kick;
int irq_effective_cpu;
+ struct cpumask irq_affinity;
+ spinlock_t irq_affinity_lock;
};
struct vduse_dev;
@@ -123,6 +126,7 @@ struct vduse_control {
static DEFINE_MUTEX(vduse_lock);
static DEFINE_IDR(vduse_idr);
+static DEFINE_PER_CPU(unsigned long, vduse_allocated_irq);
static dev_t vduse_major;
static struct class *vduse_class;
@@ -710,6 +714,49 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
return dev->generation;
}
+static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
+{
+ unsigned int cpu, best_cpu;
+ unsigned long allocated, allocated_min = UINT_MAX;
+
+ spin_lock(&vq->irq_affinity_lock);
+
+ best_cpu = vq->irq_effective_cpu;
+ if (best_cpu != -1)
+ per_cpu(vduse_allocated_irq, best_cpu) -= 1;
+
+ for_each_cpu(cpu, &vq->irq_affinity) {
+ allocated = per_cpu(vduse_allocated_irq, cpu);
+ if (!cpu_online(cpu) || allocated >= allocated_min)
+ continue;
+
+ best_cpu = cpu;
+ allocated_min = allocated;
+ }
+ vq->irq_effective_cpu = best_cpu;
+ per_cpu(vduse_allocated_irq, best_cpu) += 1;
+
+ spin_unlock(&vq->irq_affinity_lock);
+}
+
+static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
+ struct irq_affinity *desc)
+{
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+ struct irq_affinity_desc *affd = NULL;
+ int i;
+
+ affd = irq_create_affinity_masks(dev->vq_num, desc);
+ if (!affd)
+ return;
+
+ for (i = 0; i < dev->vq_num; i++) {
+ cpumask_copy(&dev->vqs[i]->irq_affinity, &affd[i].mask);
+ vduse_vq_update_effective_cpu(dev->vqs[i]);
+ }
+ kfree(affd);
+}
+
static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
unsigned int asid,
struct vhost_iotlb *iotlb)
@@ -760,6 +807,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
.get_config = vduse_vdpa_get_config,
.set_config = vduse_vdpa_set_config,
.get_generation = vduse_vdpa_get_generation,
+ .set_irq_affinity = vduse_vdpa_set_irq_affinity,
.reset = vduse_vdpa_reset,
.set_map = vduse_vdpa_set_map,
.free = vduse_vdpa_free,
@@ -1380,6 +1428,8 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
spin_lock_init(&dev->vqs[i]->kick_lock);
spin_lock_init(&dev->vqs[i]->irq_lock);
+ spin_lock_init(&dev->vqs[i]->irq_affinity_lock);
+ cpumask_setall(&dev->vqs[i]->irq_affinity);
}
return 0;
--
2.20.1
Since we already support irq callback affinity management,
let's implement the set/get_vq_affinity callbacks to make
it possible for the virtio device driver to change or be
aware of the affinity. This would make it possible for the
virtio-blk driver to build the blk-mq queues based on the
irq callback affinity.
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 90c2896039d9..6507a78abc9d 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -739,6 +739,28 @@ static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
spin_unlock(&vq->irq_affinity_lock);
}
+static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx,
+ const struct cpumask *cpu_mask)
+{
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+ cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask);
+ vduse_vq_update_effective_cpu(dev->vqs[idx]);
+
+ return 0;
+}
+
+static const struct cpumask *
+vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx)
+{
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+ if (dev->vqs[idx]->irq_effective_cpu == -1)
+ return NULL;
+
+ return &dev->vqs[idx]->irq_affinity;
+}
+
static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
struct irq_affinity *desc)
{
@@ -807,6 +829,8 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
.get_config = vduse_vdpa_get_config,
.set_config = vduse_vdpa_set_config,
.get_generation = vduse_vdpa_get_generation,
+ .set_vq_affinity = vduse_vdpa_set_vq_affinity,
+ .get_vq_affinity = vduse_vdpa_get_vq_affinity,
.set_irq_affinity = vduse_vdpa_set_irq_affinity,
.reset = vduse_vdpa_reset,
.set_map = vduse_vdpa_set_map,
--
2.20.1
This introduces a bound workqueue to support running the
irq callback on a specified CPU.
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 29 ++++++++++++++++++++++-------
1 file changed, 22 insertions(+), 7 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 37809bfcb7ef..d126f3e32a20 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -57,6 +57,7 @@ struct vduse_virtqueue {
struct vdpa_callback cb;
struct work_struct inject;
struct work_struct kick;
+ int irq_effective_cpu;
};
struct vduse_dev;
@@ -128,6 +129,7 @@ static struct class *vduse_class;
static struct cdev vduse_ctrl_cdev;
static struct cdev vduse_cdev;
static struct workqueue_struct *vduse_irq_wq;
+static struct workqueue_struct *vduse_irq_bound_wq;
static u32 allowed_device_id[] = {
VIRTIO_ID_BLOCK,
@@ -917,7 +919,8 @@ static void vduse_vq_irq_inject(struct work_struct *work)
}
static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
- struct work_struct *irq_work)
+ struct work_struct *irq_work,
+ int irq_effective_cpu)
{
int ret = -EINVAL;
@@ -926,7 +929,11 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
goto unlock;
ret = 0;
- queue_work(vduse_irq_wq, irq_work);
+ if (irq_effective_cpu == -1)
+ queue_work(vduse_irq_wq, irq_work);
+ else
+ queue_work_on(irq_effective_cpu,
+ vduse_irq_bound_wq, irq_work);
unlock:
up_read(&dev->rwsem);
@@ -1111,7 +1118,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
break;
}
case VDUSE_DEV_INJECT_CONFIG_IRQ:
- ret = vduse_dev_queue_irq_work(dev, &dev->inject);
+ ret = vduse_dev_queue_irq_work(dev, &dev->inject, -1);
break;
case VDUSE_VQ_SETUP: {
struct vduse_vq_config config;
@@ -1198,7 +1205,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
break;
index = array_index_nospec(index, dev->vq_num);
- ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject);
+ ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject,
+ dev->vqs[index]->irq_effective_cpu);
break;
}
case VDUSE_IOTLB_REG_UMEM: {
@@ -1367,6 +1375,7 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
goto err;
dev->vqs[i]->index = i;
+ dev->vqs[i]->irq_effective_cpu = -1;
INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
spin_lock_init(&dev->vqs[i]->kick_lock);
@@ -1855,12 +1864,15 @@ static int vduse_init(void)
if (ret)
goto err_cdev;
+ ret = -ENOMEM;
vduse_irq_wq = alloc_workqueue("vduse-irq",
WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0);
- if (!vduse_irq_wq) {
- ret = -ENOMEM;
+ if (!vduse_irq_wq)
goto err_wq;
- }
+
+ vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0);
+ if (!vduse_irq_bound_wq)
+ goto err_bound_wq;
ret = vduse_domain_init();
if (ret)
@@ -1874,6 +1886,8 @@ static int vduse_init(void)
err_mgmtdev:
vduse_domain_exit();
err_domain:
+ destroy_workqueue(vduse_irq_bound_wq);
+err_bound_wq:
destroy_workqueue(vduse_irq_wq);
err_wq:
cdev_del(&vduse_cdev);
@@ -1893,6 +1907,7 @@ static void vduse_exit(void)
{
vduse_mgmtdev_exit();
vduse_domain_exit();
+ destroy_workqueue(vduse_irq_bound_wq);
destroy_workqueue(vduse_irq_wq);
cdev_del(&vduse_cdev);
device_destroy(vduse_class, vduse_major);
--
2.20.1
This introduces a set_irq_affinity callback in
vdpa_config_ops so that the vdpa device driver can
get the interrupt affinity hint from the virtio
device driver. The interrupt affinity hint is
needed by the interrupt affinity spreading
mechanism.
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/virtio/virtio_vdpa.c | 4 ++++
include/linux/vdpa.h | 8 ++++++++
2 files changed, 12 insertions(+)
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 08084b49e5a1..4731e4616ee0 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -275,9 +275,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
struct vdpa_device *vdpa = vd_get_vdpa(vdev);
const struct vdpa_config_ops *ops = vdpa->config;
+ struct irq_affinity default_affd = { 0 };
struct vdpa_callback cb;
int i, err, queue_idx = 0;
+ if (ops->set_irq_affinity)
+ ops->set_irq_affinity(vdpa, desc ? desc : &default_affd);
+
for (i = 0; i < nvqs; ++i) {
if (!names[i]) {
vqs[i] = NULL;
diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 0ff6c9363356..482ff7d0206f 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -256,6 +256,12 @@ struct vdpa_map_file {
* @vdev: vdpa device
* @idx: virtqueue index
* Returns the irq affinity mask
+ * @set_irq_affinity: Pass the irq affinity hint from the virtio
+ * device driver to vdpa driver (optional).
+ * Needed by the interrupt affinity spreading
+ * mechanism.
+ * @vdev: vdpa device
+ * @desc: irq affinity hint
* @set_group_asid: Set address space identifier for a
* virtqueue group (optional)
* @vdev: vdpa device
@@ -344,6 +350,8 @@ struct vdpa_config_ops {
const struct cpumask *cpu_mask);
const struct cpumask *(*get_vq_affinity)(struct vdpa_device *vdev,
u16 idx);
+ void (*set_irq_affinity)(struct vdpa_device *vdev,
+ struct irq_affinity *desc);
/* DMA ops */
int (*set_map)(struct vdpa_device *vdev, unsigned int asid,
--
2.20.1
As discussed in [1], this adds a sysfs interface to support
specifying the bounce buffer size in the virtio-vdpa case. It would
be a performance tuning parameter for high-throughput workloads.
[1] https://www.spinics.net/lists/netdev/msg754288.html
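A hypothetical usage (the device name and sysfs path are illustrative)
might look like the following. Note that the value can only be changed
before the device is bound to the vdpa bus, i.e. before the iova domain
is created:

  # use a 32 MB bounce buffer instead of the default 64 MB
  echo 33554432 > /sys/class/vduse/vduse0/bounce_size
  cat /sys/class/vduse/vduse0/bounce_size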
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 45 +++++++++++++++++++++++++++++-
1 file changed, 44 insertions(+), 1 deletion(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index bd1ba6c33e09..0458d61cee1f 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -38,8 +38,11 @@
#define DRV_LICENSE "GPL v2"
#define VDUSE_DEV_MAX (1U << MINORBITS)
+#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
+#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
-#define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
+/* 128 MB reserved for virtqueue creation */
+#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
#define VDUSE_MSG_DEFAULT_TIMEOUT 30
struct vduse_virtqueue {
@@ -1795,8 +1798,48 @@ static ssize_t msg_timeout_store(struct device *device,
static DEVICE_ATTR_RW(msg_timeout);
+static ssize_t bounce_size_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct vduse_dev *dev = dev_get_drvdata(device);
+
+ return sysfs_emit(buf, "%u\n", dev->bounce_size);
+}
+
+static ssize_t bounce_size_store(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct vduse_dev *dev = dev_get_drvdata(device);
+ unsigned int bounce_size;
+ int ret;
+
+ ret = -EPERM;
+ mutex_lock(&dev->domain_lock);
+ if (dev->domain)
+ goto unlock;
+
+ ret = kstrtouint(buf, 10, &bounce_size);
+ if (ret < 0)
+ goto unlock;
+
+ ret = -EINVAL;
+ if (bounce_size > VDUSE_MAX_BOUNCE_SIZE ||
+ bounce_size < VDUSE_MIN_BOUNCE_SIZE)
+ goto unlock;
+
+ dev->bounce_size = bounce_size & PAGE_MASK;
+ ret = count;
+unlock:
+ mutex_unlock(&dev->domain_lock);
+ return ret;
+}
+
+static DEVICE_ATTR_RW(bounce_size);
+
static struct attribute *vduse_dev_attrs[] = {
&dev_attr_msg_timeout.attr,
+ &dev_attr_bounce_size.attr,
NULL
};
--
2.20.1
Add an enable_irq_wq sysfs interface to control whether
a workqueue is used to inject the irq or not. The vhost-vdpa
case can benefit from it.
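A hypothetical usage (the device name and sysfs path are illustrative)
might be:

  # inject the irq directly from the ioctl context instead of
  # bouncing through the bound workqueue
  echo 0 > /sys/class/vduse/vduse0/vq0/enable_irq_wq
  cat /sys/class/vduse/vduse0/vq0/enable_irq_wq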
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 50 +++++++++++++++++++++++++++++-
1 file changed, 49 insertions(+), 1 deletion(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index c65f84100e30..ed06c7afd484 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -62,6 +62,7 @@ struct vduse_virtqueue {
struct cpumask irq_affinity;
spinlock_t irq_affinity_lock;
struct kobject kobj;
+ bool enable_irq_wq;
};
struct vduse_dev;
@@ -1013,6 +1014,26 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
return ret;
}
+static int vduse_dev_inject_vq_irq(struct vduse_dev *dev,
+ struct vduse_virtqueue *vq)
+{
+ int ret = -EINVAL;
+
+ down_read(&dev->rwsem);
+ if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
+ goto unlock;
+
+ ret = 0;
+ spin_lock_irq(&vq->irq_lock);
+ if (vq->ready && vq->cb.callback)
+ vq->cb.callback(vq->cb.private);
+ spin_unlock_irq(&vq->irq_lock);
+unlock:
+ up_read(&dev->rwsem);
+
+ return ret;
+}
+
static int vduse_dev_dereg_umem(struct vduse_dev *dev,
u64 iova, u64 size)
{
@@ -1278,8 +1299,12 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
break;
index = array_index_nospec(index, dev->vq_num);
- ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject,
+ if (dev->vqs[index]->enable_irq_wq)
+ ret = vduse_dev_queue_irq_work(dev,
+ &dev->vqs[index]->inject,
dev->vqs[index]->irq_effective_cpu);
+ else
+ ret = vduse_dev_inject_vq_irq(dev, dev->vqs[index]);
break;
}
case VDUSE_IOTLB_REG_UMEM: {
@@ -1420,6 +1445,26 @@ static const struct file_operations vduse_dev_fops = {
.llseek = noop_llseek,
};
+static ssize_t enable_irq_wq_show(struct vduse_virtqueue *vq, char *buf)
+{
+ return sprintf(buf, "%d\n", vq->enable_irq_wq);
+}
+
+static ssize_t enable_irq_wq_store(struct vduse_virtqueue *vq,
+ const char *buf, size_t count)
+{
+ bool enabled;
+ int ret;
+
+ ret = kstrtobool(buf, &enabled);
+ if (ret)
+ return ret;
+
+ vq->enable_irq_wq = enabled;
+
+ return count;
+}
+
static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
{
return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
@@ -1480,10 +1525,12 @@ struct vq_sysfs_entry {
static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RO(irq_cb_affinity);
static struct vq_sysfs_entry irq_cb_effective_affinity_attr =
__ATTR_RW(irq_cb_effective_affinity);
+static struct vq_sysfs_entry enable_irq_wq_attr = __ATTR_RW(enable_irq_wq);
static struct attribute *vq_attrs[] = {
&irq_cb_affinity_attr.attr,
&irq_cb_effective_affinity_attr.attr,
+ &enable_irq_wq_attr.attr,
NULL,
};
ATTRIBUTE_GROUPS(vq);
@@ -1565,6 +1612,7 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
dev->vqs[i]->index = i;
dev->vqs[i]->irq_effective_cpu = -1;
+ dev->vqs[i]->enable_irq_wq = true;
INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
spin_lock_init(&dev->vqs[i]->kick_lock);
--
2.20.1
Delay creating the iova domain until the vduse device is bound
to the vdpa bus.
This is a preparation for adding a sysfs interface to support
specifying the bounce buffer size for the iova domain.
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 75 +++++++++++++++++++++---------
1 file changed, 53 insertions(+), 22 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index ed06c7afd484..bd1ba6c33e09 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -112,6 +112,8 @@ struct vduse_dev {
u32 vq_align;
struct vduse_umem *umem;
struct mutex mem_lock;
+ unsigned int bounce_size;
+ struct mutex domain_lock;
};
struct vduse_dev_msg {
@@ -427,7 +429,7 @@ static void vduse_dev_reset(struct vduse_dev *dev)
struct vduse_iova_domain *domain = dev->domain;
/* The coherent mappings are handled in vduse_dev_free_coherent() */
- if (domain->bounce_map)
+ if (domain && domain->bounce_map)
vduse_domain_reset_bounce_map(domain);
down_write(&dev->rwsem);
@@ -1045,6 +1047,9 @@ static int vduse_dev_dereg_umem(struct vduse_dev *dev,
goto unlock;
ret = -EINVAL;
+ if (!dev->domain)
+ goto unlock;
+
if (dev->umem->iova != iova || size != dev->domain->bounce_size)
goto unlock;
@@ -1071,7 +1076,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
unsigned long npages, lock_limit;
int ret;
- if (!dev->domain->bounce_map ||
+ if (!dev->domain || !dev->domain->bounce_map ||
size != dev->domain->bounce_size ||
iova != 0 || uaddr & ~PAGE_MASK)
return -EINVAL;
@@ -1145,7 +1150,6 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
struct vduse_iotlb_entry entry;
struct vhost_iotlb_map *map;
struct vdpa_map_file *map_file;
- struct vduse_iova_domain *domain = dev->domain;
struct file *f = NULL;
ret = -EFAULT;
@@ -1156,8 +1160,13 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
if (entry.start > entry.last)
break;
- spin_lock(&domain->iotlb_lock);
- map = vhost_iotlb_itree_first(domain->iotlb,
+ mutex_lock(&dev->domain_lock);
+ if (!dev->domain) {
+ mutex_unlock(&dev->domain_lock);
+ break;
+ }
+ spin_lock(&dev->domain->iotlb_lock);
+ map = vhost_iotlb_itree_first(dev->domain->iotlb,
entry.start, entry.last);
if (map) {
map_file = (struct vdpa_map_file *)map->opaque;
@@ -1167,7 +1176,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
entry.last = map->last;
entry.perm = map->perm;
}
- spin_unlock(&domain->iotlb_lock);
+ spin_unlock(&dev->domain->iotlb_lock);
+ mutex_unlock(&dev->domain_lock);
ret = -EINVAL;
if (!f)
break;
@@ -1319,8 +1329,10 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
sizeof(umem.reserved)))
break;
+ mutex_lock(&dev->domain_lock);
ret = vduse_dev_reg_umem(dev, umem.iova,
umem.uaddr, umem.size);
+ mutex_unlock(&dev->domain_lock);
break;
}
case VDUSE_IOTLB_DEREG_UMEM: {
@@ -1334,15 +1346,15 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
if (!is_mem_zero((const char *)umem.reserved,
sizeof(umem.reserved)))
break;
-
+ mutex_lock(&dev->domain_lock);
ret = vduse_dev_dereg_umem(dev, umem.iova,
umem.size);
+ mutex_unlock(&dev->domain_lock);
break;
}
case VDUSE_IOTLB_GET_INFO: {
struct vduse_iova_info info;
struct vhost_iotlb_map *map;
- struct vduse_iova_domain *domain = dev->domain;
ret = -EFAULT;
if (copy_from_user(&info, argp, sizeof(info)))
@@ -1356,18 +1368,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
sizeof(info.reserved)))
break;
- spin_lock(&domain->iotlb_lock);
- map = vhost_iotlb_itree_first(domain->iotlb,
+ mutex_lock(&dev->domain_lock);
+ if (!dev->domain) {
+ mutex_unlock(&dev->domain_lock);
+ break;
+ }
+ spin_lock(&dev->domain->iotlb_lock);
+ map = vhost_iotlb_itree_first(dev->domain->iotlb,
info.start, info.last);
if (map) {
info.start = map->start;
info.last = map->last;
info.capability = 0;
- if (domain->bounce_map && map->start == 0 &&
- map->last == domain->bounce_size - 1)
+ if (dev->domain->bounce_map && map->start == 0 &&
+ map->last == dev->domain->bounce_size - 1)
info.capability |= VDUSE_IOVA_CAP_UMEM;
}
- spin_unlock(&domain->iotlb_lock);
+ spin_unlock(&dev->domain->iotlb_lock);
+ mutex_unlock(&dev->domain_lock);
if (!map)
break;
@@ -1390,7 +1408,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
{
struct vduse_dev *dev = file->private_data;
- vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
+ mutex_lock(&dev->domain_lock);
+ if (dev->domain)
+ vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
+ mutex_unlock(&dev->domain_lock);
spin_lock(&dev->msg_lock);
/* Make sure the inflight messages can processed after reconncection */
list_splice_init(&dev->recv_list, &dev->send_list);
@@ -1647,6 +1668,7 @@ static struct vduse_dev *vduse_dev_create(void)
mutex_init(&dev->lock);
mutex_init(&dev->mem_lock);
+ mutex_init(&dev->domain_lock);
spin_lock_init(&dev->msg_lock);
INIT_LIST_HEAD(&dev->send_list);
INIT_LIST_HEAD(&dev->recv_list);
@@ -1696,7 +1718,8 @@ static int vduse_destroy_dev(char *name)
idr_remove(&vduse_idr, dev->minor);
kvfree(dev->config);
vduse_dev_deinit_vqs(dev);
- vduse_domain_destroy(dev->domain);
+ if (dev->domain)
+ vduse_domain_destroy(dev->domain);
kfree(dev->name);
vduse_dev_destroy(dev);
module_put(THIS_MODULE);
@@ -1802,11 +1825,7 @@ static int vduse_create_dev(struct vduse_dev_config *config,
if (!dev->name)
goto err_str;
- dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
- VDUSE_BOUNCE_SIZE);
- if (!dev->domain)
- goto err_domain;
-
+ dev->bounce_size = VDUSE_BOUNCE_SIZE;
dev->config = config_buf;
dev->config_size = config->config_size;
@@ -1836,8 +1855,6 @@ static int vduse_create_dev(struct vduse_dev_config *config,
err_dev:
idr_remove(&vduse_idr, dev->minor);
err_idr:
- vduse_domain_destroy(dev->domain);
-err_domain:
kfree(dev->name);
err_str:
vduse_dev_destroy(dev);
@@ -2004,9 +2021,23 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
if (ret)
return ret;
+ mutex_lock(&dev->domain_lock);
+ if (!dev->domain)
+ dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
+ dev->bounce_size);
+ mutex_unlock(&dev->domain_lock);
+ if (!dev->domain) {
+ put_device(&dev->vdev->vdpa.dev);
+ return -ENOMEM;
+ }
+
ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
if (ret) {
put_device(&dev->vdev->vdpa.dev);
+ mutex_lock(&dev->domain_lock);
+ vduse_domain_destroy(dev->domain);
+ dev->domain = NULL;
+ mutex_unlock(&dev->domain_lock);
return ret;
}
--
2.20.1
Add a sysfs interface for each vduse virtqueue to
show the affinity and effective affinity of its irq
callback.
We can also use this interface to change the effective
affinity, which must be a subset of the irq callback
affinity mask for the virtqueue. This might be useful
for performance tuning when the irq callback affinity
mask contains more than one CPU.
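A hypothetical usage (the device name and sysfs path are illustrative;
the value uses the same hex cpumask format as the read side) might be:

  # show the irq callback affinity and effective affinity of vq0
  cat /sys/class/vduse/vduse0/vq0/irq_cb_affinity
  cat /sys/class/vduse/vduse0/vq0/irq_cb_effective_affinity
  # run the irq callback of vq0 on CPU 2 only (hex cpumask "4"),
  # which must intersect the irq callback affinity mask
  echo 4 > /sys/class/vduse/vduse0/vq0/irq_cb_effective_affinity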
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/vdpa/vdpa_user/vduse_dev.c | 148 ++++++++++++++++++++++++++---
1 file changed, 137 insertions(+), 11 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 6507a78abc9d..c65f84100e30 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -61,6 +61,7 @@ struct vduse_virtqueue {
int irq_effective_cpu;
struct cpumask irq_affinity;
spinlock_t irq_affinity_lock;
+ struct kobject kobj;
};
struct vduse_dev;
@@ -1419,6 +1420,120 @@ static const struct file_operations vduse_dev_fops = {
.llseek = noop_llseek,
};
+static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
+{
+ return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
+}
+
+static ssize_t irq_cb_effective_affinity_show(struct vduse_virtqueue *vq,
+ char *buf)
+{
+ struct cpumask all_mask = CPU_MASK_ALL;
+ const struct cpumask *mask = &all_mask;
+
+ if (vq->irq_effective_cpu != -1)
+ mask = get_cpu_mask(vq->irq_effective_cpu);
+
+ return sprintf(buf, "%*pb\n", cpumask_pr_args(mask));
+}
+
+static ssize_t irq_cb_effective_affinity_store(struct vduse_virtqueue *vq,
+ const char *buf, size_t count)
+{
+ cpumask_var_t new_value;
+ int ret;
+
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = cpumask_parse(buf, new_value);
+ if (ret)
+ goto free_mask;
+
+ ret = -EINVAL;
+ if (!cpumask_intersects(new_value, &vq->irq_affinity))
+ goto free_mask;
+
+ spin_lock(&vq->irq_affinity_lock);
+
+ if (vq->irq_effective_cpu != -1)
+ per_cpu(vduse_allocated_irq, vq->irq_effective_cpu) -= 1;
+
+ vq->irq_effective_cpu = cpumask_first(new_value);
+ per_cpu(vduse_allocated_irq, vq->irq_effective_cpu) += 1;
+
+ spin_unlock(&vq->irq_affinity_lock);
+ ret = count;
+
+free_mask:
+ free_cpumask_var(new_value);
+ return ret;
+}
+
+struct vq_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
+ ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
+ size_t count);
+};
+
+static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RO(irq_cb_affinity);
+static struct vq_sysfs_entry irq_cb_effective_affinity_attr =
+ __ATTR_RW(irq_cb_effective_affinity);
+
+static struct attribute *vq_attrs[] = {
+ &irq_cb_affinity_attr.attr,
+ &irq_cb_effective_affinity_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(vq);
+
+static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct vduse_virtqueue *vq = container_of(kobj,
+ struct vduse_virtqueue, kobj);
+ struct vq_sysfs_entry *entry = container_of(attr,
+ struct vq_sysfs_entry, attr);
+
+ if (!entry->show)
+ return -EIO;
+
+ return entry->show(vq, buf);
+}
+
+static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct vduse_virtqueue *vq = container_of(kobj,
+ struct vduse_virtqueue, kobj);
+ struct vq_sysfs_entry *entry = container_of(attr,
+ struct vq_sysfs_entry, attr);
+
+ if (!entry->store)
+ return -EIO;
+
+ return entry->store(vq, buf, count);
+}
+
+static const struct sysfs_ops vq_sysfs_ops = {
+ .show = vq_attr_show,
+ .store = vq_attr_store,
+};
+
+static void vq_release(struct kobject *kobj)
+{
+ struct vduse_virtqueue *vq = container_of(kobj,
+ struct vduse_virtqueue, kobj);
+ kfree(vq);
+}
+
+static struct kobj_type vq_type = {
+ .release = vq_release,
+ .sysfs_ops = &vq_sysfs_ops,
+ .default_groups = vq_groups,
+};
+
static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
{
int i;
@@ -1427,13 +1542,13 @@ static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
return;
for (i = 0; i < dev->vq_num; i++)
- kfree(dev->vqs[i]);
+ kobject_put(&dev->vqs[i]->kobj);
kfree(dev->vqs);
}
static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
{
- int i;
+ int ret, i;
dev->vq_align = vq_align;
dev->vq_num = vq_num;
@@ -1443,8 +1558,10 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
for (i = 0; i < vq_num; i++) {
dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
- if (!dev->vqs[i])
+ if (!dev->vqs[i]) {
+ ret = -ENOMEM;
goto err;
+ }
dev->vqs[i]->index = i;
dev->vqs[i]->irq_effective_cpu = -1;
@@ -1454,15 +1571,23 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
spin_lock_init(&dev->vqs[i]->irq_lock);
spin_lock_init(&dev->vqs[i]->irq_affinity_lock);
cpumask_setall(&dev->vqs[i]->irq_affinity);
+
+ kobject_init(&dev->vqs[i]->kobj, &vq_type);
+ ret = kobject_add(&dev->vqs[i]->kobj,
+ &dev->dev->kobj, "vq%d", i);
+ if (ret) {
+ kfree(dev->vqs[i]);
+ goto err;
+ }
}
return 0;
err:
while (i--)
- kfree(dev->vqs[i]);
+ kobject_put(&dev->vqs[i]->kobj);
kfree(dev->vqs);
dev->vqs = NULL;
- return -ENOMEM;
+ return ret;
}
static struct vduse_dev *vduse_dev_create(void)
@@ -1637,10 +1762,6 @@ static int vduse_create_dev(struct vduse_dev_config *config,
dev->config = config_buf;
dev->config_size = config->config_size;
- ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
- if (ret)
- goto err_vqs;
-
ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
if (ret < 0)
goto err_idr;
@@ -1654,14 +1775,19 @@ static int vduse_create_dev(struct vduse_dev_config *config,
ret = PTR_ERR(dev->dev);
goto err_dev;
}
+
+ ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
+ if (ret)
+ goto err_vqs;
+
__module_get(THIS_MODULE);
return 0;
+err_vqs:
+ device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
err_dev:
idr_remove(&vduse_idr, dev->minor);
err_idr:
- vduse_dev_deinit_vqs(dev);
-err_vqs:
vduse_domain_destroy(dev->domain);
err_domain:
kfree(dev->name);
--
2.20.1
On Mon, Dec 5, 2022 at 4:44 PM Xie Yongji <[email protected]> wrote:
>
> This introduces a bound workqueue to support running
> irq callback in a specified cpu.
>
> Signed-off-by: Xie Yongji <[email protected]>
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 29 ++++++++++++++++++++++-------
> 1 file changed, 22 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index 37809bfcb7ef..d126f3e32a20 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -57,6 +57,7 @@ struct vduse_virtqueue {
> struct vdpa_callback cb;
> struct work_struct inject;
> struct work_struct kick;
> + int irq_effective_cpu;
I wonder why it's a cpu number instead of a cpumask. The latter seems
more flexible, e.g. when using NUMA.
> };
>
> struct vduse_dev;
> @@ -128,6 +129,7 @@ static struct class *vduse_class;
> static struct cdev vduse_ctrl_cdev;
> static struct cdev vduse_cdev;
> static struct workqueue_struct *vduse_irq_wq;
> +static struct workqueue_struct *vduse_irq_bound_wq;
>
> static u32 allowed_device_id[] = {
> VIRTIO_ID_BLOCK,
> @@ -917,7 +919,8 @@ static void vduse_vq_irq_inject(struct work_struct *work)
> }
>
> static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
> - struct work_struct *irq_work)
> + struct work_struct *irq_work,
> + int irq_effective_cpu)
> {
> int ret = -EINVAL;
>
> @@ -926,7 +929,11 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
> goto unlock;
>
> ret = 0;
> - queue_work(vduse_irq_wq, irq_work);
> + if (irq_effective_cpu == -1)
Is it better to have a macro for this magic number?
Thanks
On Mon, Dec 5, 2022 at 4:44 PM Xie Yongji <[email protected]> wrote:
>
> Allocate memory for vduse virtqueues one by one instead of
> doing one allocation for all of them.
>
> This is a preparation for adding sysfs interface for virtqueues.
>
> Signed-off-by: Xie Yongji <[email protected]>
Acked-by: Jason Wang <[email protected]>
Thanks
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 98 ++++++++++++++++++++----------
> 1 file changed, 66 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index 35dceee3ed56..37809bfcb7ef 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -76,7 +76,7 @@ struct vduse_umem {
> struct vduse_dev {
> struct vduse_vdpa *vdev;
> struct device *dev;
> - struct vduse_virtqueue *vqs;
> + struct vduse_virtqueue **vqs;
> struct vduse_iova_domain *domain;
> char *name;
> struct mutex lock;
> @@ -434,7 +434,7 @@ static void vduse_dev_reset(struct vduse_dev *dev)
> flush_work(&dev->inject);
>
> for (i = 0; i < dev->vq_num; i++) {
> - struct vduse_virtqueue *vq = &dev->vqs[i];
> + struct vduse_virtqueue *vq = dev->vqs[i];
>
> vq->ready = false;
> vq->desc_addr = 0;
> @@ -466,7 +466,7 @@ static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
> u64 device_area)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> vq->desc_addr = desc_area;
> vq->driver_addr = driver_area;
> @@ -500,7 +500,7 @@ static void vduse_vq_kick_work(struct work_struct *work)
> static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> if (!eventfd_signal_allowed()) {
> schedule_work(&vq->kick);
> @@ -513,7 +513,7 @@ static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
> struct vdpa_callback *cb)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> spin_lock(&vq->irq_lock);
> vq->cb.callback = cb->callback;
> @@ -524,7 +524,7 @@ static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
> static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> vq->num = num;
> }
> @@ -533,7 +533,7 @@ static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
> u16 idx, bool ready)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> vq->ready = ready;
> }
> @@ -541,7 +541,7 @@ static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
> static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> return vq->ready;
> }
> @@ -550,7 +550,7 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
> const struct vdpa_vq_state *state)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
> vq->state.packed.last_avail_counter =
> @@ -569,7 +569,7 @@ static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> struct vdpa_vq_state *state)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
> return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
> @@ -624,8 +624,8 @@ static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
> int i;
>
> for (i = 0; i < dev->vq_num; i++)
> - if (num_max < dev->vqs[i].num_max)
> - num_max = dev->vqs[i].num_max;
> + if (num_max < dev->vqs[i]->num_max)
> + num_max = dev->vqs[i]->num_max;
>
> return num_max;
> }
> @@ -863,7 +863,7 @@ static int vduse_kickfd_setup(struct vduse_dev *dev,
> return -EINVAL;
>
> index = array_index_nospec(eventfd->index, dev->vq_num);
> - vq = &dev->vqs[index];
> + vq = dev->vqs[index];
> if (eventfd->fd >= 0) {
> ctx = eventfd_ctx_fdget(eventfd->fd);
> if (IS_ERR(ctx))
> @@ -889,7 +889,7 @@ static bool vduse_dev_is_ready(struct vduse_dev *dev)
> int i;
>
> for (i = 0; i < dev->vq_num; i++)
> - if (!dev->vqs[i].num_max)
> + if (!dev->vqs[i]->num_max)
> return false;
>
> return true;
> @@ -1130,7 +1130,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> break;
>
> index = array_index_nospec(config.index, dev->vq_num);
> - dev->vqs[index].num_max = config.max_size;
> + dev->vqs[index]->num_max = config.max_size;
> ret = 0;
> break;
> }
> @@ -1148,7 +1148,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> break;
>
> index = array_index_nospec(vq_info.index, dev->vq_num);
> - vq = &dev->vqs[index];
> + vq = dev->vqs[index];
> vq_info.desc_addr = vq->desc_addr;
> vq_info.driver_addr = vq->driver_addr;
> vq_info.device_addr = vq->device_addr;
> @@ -1198,7 +1198,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> break;
>
> index = array_index_nospec(index, dev->vq_num);
> - ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
> + ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject);
> break;
> }
> case VDUSE_IOTLB_REG_UMEM: {
> @@ -1339,6 +1339,49 @@ static const struct file_operations vduse_dev_fops = {
> .llseek = noop_llseek,
> };
>
> +static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
> +{
> + int i;
> +
> + if (!dev->vqs)
> + return;
> +
> + for (i = 0; i < dev->vq_num; i++)
> + kfree(dev->vqs[i]);
> + kfree(dev->vqs);
> +}
> +
> +static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
> +{
> + int i;
> +
> + dev->vq_align = vq_align;
> + dev->vq_num = vq_num;
> + dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
> + if (!dev->vqs)
> + return -ENOMEM;
> +
> + for (i = 0; i < vq_num; i++) {
> + dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
> + if (!dev->vqs[i])
> + goto err;
> +
> + dev->vqs[i]->index = i;
> + INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
> + INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
> + spin_lock_init(&dev->vqs[i]->kick_lock);
> + spin_lock_init(&dev->vqs[i]->irq_lock);
> + }
> +
> + return 0;
> +err:
> + while (i--)
> + kfree(dev->vqs[i]);
> + kfree(dev->vqs);
> + dev->vqs = NULL;
> + return -ENOMEM;
> +}
> +
> static struct vduse_dev *vduse_dev_create(void)
> {
> struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> @@ -1396,7 +1439,7 @@ static int vduse_destroy_dev(char *name)
> device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
> idr_remove(&vduse_idr, dev->minor);
> kvfree(dev->config);
> - kfree(dev->vqs);
> + vduse_dev_deinit_vqs(dev);
> vduse_domain_destroy(dev->domain);
> kfree(dev->name);
> vduse_dev_destroy(dev);
> @@ -1483,7 +1526,7 @@ ATTRIBUTE_GROUPS(vduse_dev);
> static int vduse_create_dev(struct vduse_dev_config *config,
> void *config_buf, u64 api_version)
> {
> - int i, ret;
> + int ret;
> struct vduse_dev *dev;
>
> ret = -EEXIST;
> @@ -1510,19 +1553,10 @@ static int vduse_create_dev(struct vduse_dev_config *config,
>
> dev->config = config_buf;
> dev->config_size = config->config_size;
> - dev->vq_align = config->vq_align;
> - dev->vq_num = config->vq_num;
> - dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
> - if (!dev->vqs)
> - goto err_vqs;
>
> - for (i = 0; i < dev->vq_num; i++) {
> - dev->vqs[i].index = i;
> - INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject);
> - INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work);
> - spin_lock_init(&dev->vqs[i].kick_lock);
> - spin_lock_init(&dev->vqs[i].irq_lock);
> - }
> + ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
> + if (ret)
> + goto err_vqs;
>
> ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
> if (ret < 0)
> @@ -1543,7 +1577,7 @@ static int vduse_create_dev(struct vduse_dev_config *config,
> err_dev:
> idr_remove(&vduse_idr, dev->minor);
> err_idr:
> - kfree(dev->vqs);
> + vduse_dev_deinit_vqs(dev);
> err_vqs:
> vduse_domain_destroy(dev->domain);
> err_domain:
> --
> 2.20.1
>
On Mon, Dec 5, 2022 at 4:43 PM Xie Yongji <[email protected]> wrote:
>
> This introduces set_irq_affinity callback in
> vdpa_config_ops so that vdpa device driver can
> get the interrupt affinity hint from the virtio
> device driver. The interrupt affinity hint would
> be needed by the interrupt affinity spreading
> mechanism.
>
> Signed-off-by: Xie Yongji <[email protected]>
> ---
> drivers/virtio/virtio_vdpa.c | 4 ++++
> include/linux/vdpa.h | 8 ++++++++
> 2 files changed, 12 insertions(+)
>
> diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
> index 08084b49e5a1..4731e4616ee0 100644
> --- a/drivers/virtio/virtio_vdpa.c
> +++ b/drivers/virtio/virtio_vdpa.c
> @@ -275,9 +275,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
> struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
> struct vdpa_device *vdpa = vd_get_vdpa(vdev);
> const struct vdpa_config_ops *ops = vdpa->config;
> + struct irq_affinity default_affd = { 0 };
> struct vdpa_callback cb;
> int i, err, queue_idx = 0;
>
> + if (ops->set_irq_affinity)
> + ops->set_irq_affinity(vdpa, desc ? desc : &default_affd);
I wonder if we need to do this in vhost-vDPA, or whether it's better to
have a default affinity set by the vDPA parent itself.
(Looking at virtio-pci, it doesn't do something like this).
Thanks
> +
> for (i = 0; i < nvqs; ++i) {
> if (!names[i]) {
> vqs[i] = NULL;
> diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
> index 0ff6c9363356..482ff7d0206f 100644
> --- a/include/linux/vdpa.h
> +++ b/include/linux/vdpa.h
> @@ -256,6 +256,12 @@ struct vdpa_map_file {
> * @vdev: vdpa device
> * @idx: virtqueue index
> * Returns the irq affinity mask
> + * @set_irq_affinity: Pass the irq affinity hint from the virtio
> + * device driver to vdpa driver (optional).
> + * Needed by the interrupt affinity spreading
> + * mechanism.
> + * @vdev: vdpa device
> + * @desc: irq affinity hint
> * @set_group_asid: Set address space identifier for a
> * virtqueue group (optional)
> * @vdev: vdpa device
> @@ -344,6 +350,8 @@ struct vdpa_config_ops {
> const struct cpumask *cpu_mask);
> const struct cpumask *(*get_vq_affinity)(struct vdpa_device *vdev,
> u16 idx);
> + void (*set_irq_affinity)(struct vdpa_device *vdev,
> + struct irq_affinity *desc);
>
> /* DMA ops */
> int (*set_map)(struct vdpa_device *vdev, unsigned int asid,
> --
> 2.20.1
>
On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <[email protected]> wrote:
>
> Add sysfs interface for each vduse virtqueue to
> show the affinity and effective affinity for irq
> callback.
>
> And we can also use this interface to change the
> effective affinity which must be a subset of the
> irq callback affinity mask for the virtqueue. This
> might be useful for performance tuning when the irq
> callback affinity mask contains more than one CPU.
>
> Signed-off-by: Xie Yongji <[email protected]>
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 148 ++++++++++++++++++++++++++---
> 1 file changed, 137 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index 6507a78abc9d..c65f84100e30 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -61,6 +61,7 @@ struct vduse_virtqueue {
> int irq_effective_cpu;
> struct cpumask irq_affinity;
> spinlock_t irq_affinity_lock;
> + struct kobject kobj;
> };
>
> struct vduse_dev;
> @@ -1419,6 +1420,120 @@ static const struct file_operations vduse_dev_fops = {
> .llseek = noop_llseek,
> };
>
> +static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
> +{
> + return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
> +}
> +
> +static ssize_t irq_cb_effective_affinity_show(struct vduse_virtqueue *vq,
> + char *buf)
> +{
> + struct cpumask all_mask = CPU_MASK_ALL;
> + const struct cpumask *mask = &all_mask;
> +
> + if (vq->irq_effective_cpu != -1)
> + mask = get_cpu_mask(vq->irq_effective_cpu);
Shouldn't this be vq->irq_affinity?
> +
> + return sprintf(buf, "%*pb\n", cpumask_pr_args(mask));
> +}
> +
> +static ssize_t irq_cb_effective_affinity_store(struct vduse_virtqueue *vq,
> + const char *buf, size_t count)
> +{
> + cpumask_var_t new_value;
> + int ret;
> +
> + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
> + return -ENOMEM;
> +
> + ret = cpumask_parse(buf, new_value);
> + if (ret)
> + goto free_mask;
> +
> + ret = -EINVAL;
> + if (!cpumask_intersects(new_value, &vq->irq_affinity))
> + goto free_mask;
> +
> + spin_lock(&vq->irq_affinity_lock);
> +
> + if (vq->irq_effective_cpu != -1)
> + per_cpu(vduse_allocated_irq, vq->irq_effective_cpu) -= 1;
> +
> + vq->irq_effective_cpu = cpumask_first(new_value);
Does this mean that, except for the first cpu, the rest of the mask is unused?
Thanks
> + per_cpu(vduse_allocated_irq, vq->irq_effective_cpu) += 1;
> +
> + spin_unlock(&vq->irq_affinity_lock);
> + ret = count;
> +
> +free_mask:
> + free_cpumask_var(new_value);
> + return ret;
> +}
> +
> +struct vq_sysfs_entry {
> + struct attribute attr;
> + ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
> + ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
> + size_t count);
> +};
> +
> +static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RO(irq_cb_affinity);
> +static struct vq_sysfs_entry irq_cb_effective_affinity_attr =
> + __ATTR_RW(irq_cb_effective_affinity);
> +
> +static struct attribute *vq_attrs[] = {
> + &irq_cb_affinity_attr.attr,
> + &irq_cb_effective_affinity_attr.attr,
> + NULL,
> +};
> +ATTRIBUTE_GROUPS(vq);
> +
> +static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
> + char *buf)
> +{
> + struct vduse_virtqueue *vq = container_of(kobj,
> + struct vduse_virtqueue, kobj);
> + struct vq_sysfs_entry *entry = container_of(attr,
> + struct vq_sysfs_entry, attr);
> +
> + if (!entry->show)
> + return -EIO;
> +
> + return entry->show(vq, buf);
> +}
> +
> +static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct vduse_virtqueue *vq = container_of(kobj,
> + struct vduse_virtqueue, kobj);
> + struct vq_sysfs_entry *entry = container_of(attr,
> + struct vq_sysfs_entry, attr);
> +
> + if (!entry->store)
> + return -EIO;
> +
> + return entry->store(vq, buf, count);
> +}
> +
> +static const struct sysfs_ops vq_sysfs_ops = {
> + .show = vq_attr_show,
> + .store = vq_attr_store,
> +};
> +
> +static void vq_release(struct kobject *kobj)
> +{
> + struct vduse_virtqueue *vq = container_of(kobj,
> + struct vduse_virtqueue, kobj);
> + kfree(vq);
> +}
> +
> +static struct kobj_type vq_type = {
> + .release = vq_release,
> + .sysfs_ops = &vq_sysfs_ops,
> + .default_groups = vq_groups,
> +};
> +
> static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
> {
> int i;
> @@ -1427,13 +1542,13 @@ static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
> return;
>
> for (i = 0; i < dev->vq_num; i++)
> - kfree(dev->vqs[i]);
> + kobject_put(&dev->vqs[i]->kobj);
> kfree(dev->vqs);
> }
>
> static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
> {
> - int i;
> + int ret, i;
>
> dev->vq_align = vq_align;
> dev->vq_num = vq_num;
> @@ -1443,8 +1558,10 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
>
> for (i = 0; i < vq_num; i++) {
> dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
> - if (!dev->vqs[i])
> + if (!dev->vqs[i]) {
> + ret = -ENOMEM;
> goto err;
> + }
>
> dev->vqs[i]->index = i;
> dev->vqs[i]->irq_effective_cpu = -1;
> @@ -1454,15 +1571,23 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
> spin_lock_init(&dev->vqs[i]->irq_lock);
> spin_lock_init(&dev->vqs[i]->irq_affinity_lock);
> cpumask_setall(&dev->vqs[i]->irq_affinity);
> +
> + kobject_init(&dev->vqs[i]->kobj, &vq_type);
> + ret = kobject_add(&dev->vqs[i]->kobj,
> + &dev->dev->kobj, "vq%d", i);
> + if (ret) {
> + kfree(dev->vqs[i]);
> + goto err;
> + }
> }
>
> return 0;
> err:
> while (i--)
> - kfree(dev->vqs[i]);
> + kobject_put(&dev->vqs[i]->kobj);
> kfree(dev->vqs);
> dev->vqs = NULL;
> - return -ENOMEM;
> + return ret;
> }
>
> static struct vduse_dev *vduse_dev_create(void)
> @@ -1637,10 +1762,6 @@ static int vduse_create_dev(struct vduse_dev_config *config,
> dev->config = config_buf;
> dev->config_size = config->config_size;
>
> - ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
> - if (ret)
> - goto err_vqs;
> -
> ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
> if (ret < 0)
> goto err_idr;
> @@ -1654,14 +1775,19 @@ static int vduse_create_dev(struct vduse_dev_config *config,
> ret = PTR_ERR(dev->dev);
> goto err_dev;
> }
> +
> + ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
> + if (ret)
> + goto err_vqs;
> +
> __module_get(THIS_MODULE);
>
> return 0;
> +err_vqs:
> + device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
> err_dev:
> idr_remove(&vduse_idr, dev->minor);
> err_idr:
> - vduse_dev_deinit_vqs(dev);
> -err_vqs:
> vduse_domain_destroy(dev->domain);
> err_domain:
> kfree(dev->name);
> --
> 2.20.1
>
On Mon, Dec 5, 2022 at 4:59 PM Xie Yongji <[email protected]> wrote:
>
> This brings current interrupt affinity spreading mechanism
> to vduse device. We will make use of irq_create_affinity_masks()
> to create an irq callback affinity mask for each virtqueue of
> vduse device. Then we will choose the CPU which has the lowest
> number of interrupt allocated in the affinity mask to run the
> irq callback.
This seems to be a balancing mechanism, but that might not be the
semantics of affinity. Or is there any other reason we need to do this?
I guess we should at least use round-robin in this case.
>
> Signed-off-by: Xie Yongji <[email protected]>
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 50 ++++++++++++++++++++++++++++++
> 1 file changed, 50 insertions(+)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index d126f3e32a20..90c2896039d9 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -23,6 +23,7 @@
> #include <linux/nospec.h>
> #include <linux/vmalloc.h>
> #include <linux/sched/mm.h>
> +#include <linux/interrupt.h>
> #include <uapi/linux/vduse.h>
> #include <uapi/linux/vdpa.h>
> #include <uapi/linux/virtio_config.h>
> @@ -58,6 +59,8 @@ struct vduse_virtqueue {
> struct work_struct inject;
> struct work_struct kick;
> int irq_effective_cpu;
> + struct cpumask irq_affinity;
> + spinlock_t irq_affinity_lock;
Ok, I'd suggest squashing this into patch 5 to make it easier to review.
> };
>
> struct vduse_dev;
> @@ -123,6 +126,7 @@ struct vduse_control {
>
> static DEFINE_MUTEX(vduse_lock);
> static DEFINE_IDR(vduse_idr);
> +static DEFINE_PER_CPU(unsigned long, vduse_allocated_irq);
>
> static dev_t vduse_major;
> static struct class *vduse_class;
> @@ -710,6 +714,49 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
> return dev->generation;
> }
>
> +static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
> +{
> + unsigned int cpu, best_cpu;
> + unsigned long allocated, allocated_min = UINT_MAX;
> +
> + spin_lock(&vq->irq_affinity_lock);
> +
> + best_cpu = vq->irq_effective_cpu;
> + if (best_cpu != -1)
> + per_cpu(vduse_allocated_irq, best_cpu) -= 1;
> +
> + for_each_cpu(cpu, &vq->irq_affinity) {
> + allocated = per_cpu(vduse_allocated_irq, cpu);
> + if (!cpu_online(cpu) || allocated >= allocated_min)
> + continue;
> +
> + best_cpu = cpu;
> + allocated_min = allocated;
> + }
> + vq->irq_effective_cpu = best_cpu;
> + per_cpu(vduse_allocated_irq, best_cpu) += 1;
> +
> + spin_unlock(&vq->irq_affinity_lock);
> +}
> +
> +static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
> + struct irq_affinity *desc)
> +{
> + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> + struct irq_affinity_desc *affd = NULL;
> + int i;
> +
> + affd = irq_create_affinity_masks(dev->vq_num, desc);
> + if (!affd)
Let's add a comment on the vdpa config ops to say set_irq_affinity()
is best effort.
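Something along these lines in include/linux/vdpa.h (the wording is just
a suggestion):

	/**
	 * @set_irq_affinity:		Pass the irq affinity hint from the
	 *				virtio device driver to the parent
	 *				(optional).
	 *				This is best effort: the parent may
	 *				ignore the hint or apply it only
	 *				partially.
	 */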
Thanks
> + return;
> +
> + for (i = 0; i < dev->vq_num; i++) {
> + cpumask_copy(&dev->vqs[i]->irq_affinity, &affd[i].mask);
> + vduse_vq_update_effective_cpu(dev->vqs[i]);
> + }
> + kfree(affd);
> +}
> +
> static int vduse_vdpa_set_map(struct vdpa_device *vdpa,
> unsigned int asid,
> struct vhost_iotlb *iotlb)
> @@ -760,6 +807,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = {
> .get_config = vduse_vdpa_get_config,
> .set_config = vduse_vdpa_set_config,
> .get_generation = vduse_vdpa_get_generation,
> + .set_irq_affinity = vduse_vdpa_set_irq_affinity,
> .reset = vduse_vdpa_reset,
> .set_map = vduse_vdpa_set_map,
> .free = vduse_vdpa_free,
> @@ -1380,6 +1428,8 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
> INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
> spin_lock_init(&dev->vqs[i]->kick_lock);
> spin_lock_init(&dev->vqs[i]->irq_lock);
> + spin_lock_init(&dev->vqs[i]->irq_affinity_lock);
> + cpumask_setall(&dev->vqs[i]->irq_affinity);
> }
>
> return 0;
> --
> 2.20.1
>
On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <[email protected]> wrote:
>
> Delay creating iova domain until the vduse device is binded
> to vdpa bus.
s/binded/registered/
Other than this.
Acked-by: Jason Wang <[email protected]>
Thanks
>
> This is a preparation for adding sysfs interface to support
> specifying bounce buffer size for the iova domain.
>
> Signed-off-by: Xie Yongji <[email protected]>
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 75 +++++++++++++++++++++---------
> 1 file changed, 53 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index ed06c7afd484..bd1ba6c33e09 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -112,6 +112,8 @@ struct vduse_dev {
> u32 vq_align;
> struct vduse_umem *umem;
> struct mutex mem_lock;
> + unsigned int bounce_size;
> + struct mutex domain_lock;
> };
>
> struct vduse_dev_msg {
> @@ -427,7 +429,7 @@ static void vduse_dev_reset(struct vduse_dev *dev)
> struct vduse_iova_domain *domain = dev->domain;
>
> /* The coherent mappings are handled in vduse_dev_free_coherent() */
> - if (domain->bounce_map)
> + if (domain && domain->bounce_map)
> vduse_domain_reset_bounce_map(domain);
>
> down_write(&dev->rwsem);
> @@ -1045,6 +1047,9 @@ static int vduse_dev_dereg_umem(struct vduse_dev *dev,
> goto unlock;
>
> ret = -EINVAL;
> + if (!dev->domain)
> + goto unlock;
> +
> if (dev->umem->iova != iova || size != dev->domain->bounce_size)
> goto unlock;
>
> @@ -1071,7 +1076,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev,
> unsigned long npages, lock_limit;
> int ret;
>
> - if (!dev->domain->bounce_map ||
> + if (!dev->domain || !dev->domain->bounce_map ||
> size != dev->domain->bounce_size ||
> iova != 0 || uaddr & ~PAGE_MASK)
> return -EINVAL;
> @@ -1145,7 +1150,6 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> struct vduse_iotlb_entry entry;
> struct vhost_iotlb_map *map;
> struct vdpa_map_file *map_file;
> - struct vduse_iova_domain *domain = dev->domain;
> struct file *f = NULL;
>
> ret = -EFAULT;
> @@ -1156,8 +1160,13 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> if (entry.start > entry.last)
> break;
>
> - spin_lock(&domain->iotlb_lock);
> - map = vhost_iotlb_itree_first(domain->iotlb,
> + mutex_lock(&dev->domain_lock);
> + if (!dev->domain) {
> + mutex_unlock(&dev->domain_lock);
> + break;
> + }
> + spin_lock(&dev->domain->iotlb_lock);
> + map = vhost_iotlb_itree_first(dev->domain->iotlb,
> entry.start, entry.last);
> if (map) {
> map_file = (struct vdpa_map_file *)map->opaque;
> @@ -1167,7 +1176,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> entry.last = map->last;
> entry.perm = map->perm;
> }
> - spin_unlock(&domain->iotlb_lock);
> + spin_unlock(&dev->domain->iotlb_lock);
> + mutex_unlock(&dev->domain_lock);
> ret = -EINVAL;
> if (!f)
> break;
> @@ -1319,8 +1329,10 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> sizeof(umem.reserved)))
> break;
>
> + mutex_lock(&dev->domain_lock);
> ret = vduse_dev_reg_umem(dev, umem.iova,
> umem.uaddr, umem.size);
> + mutex_unlock(&dev->domain_lock);
> break;
> }
> case VDUSE_IOTLB_DEREG_UMEM: {
> @@ -1334,15 +1346,15 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> if (!is_mem_zero((const char *)umem.reserved,
> sizeof(umem.reserved)))
> break;
> -
> + mutex_lock(&dev->domain_lock);
> ret = vduse_dev_dereg_umem(dev, umem.iova,
> umem.size);
> + mutex_unlock(&dev->domain_lock);
> break;
> }
> case VDUSE_IOTLB_GET_INFO: {
> struct vduse_iova_info info;
> struct vhost_iotlb_map *map;
> - struct vduse_iova_domain *domain = dev->domain;
>
> ret = -EFAULT;
> if (copy_from_user(&info, argp, sizeof(info)))
> @@ -1356,18 +1368,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> sizeof(info.reserved)))
> break;
>
> - spin_lock(&domain->iotlb_lock);
> - map = vhost_iotlb_itree_first(domain->iotlb,
> + mutex_lock(&dev->domain_lock);
> + if (!dev->domain) {
> + mutex_unlock(&dev->domain_lock);
> + break;
> + }
> + spin_lock(&dev->domain->iotlb_lock);
> + map = vhost_iotlb_itree_first(dev->domain->iotlb,
> info.start, info.last);
> if (map) {
> info.start = map->start;
> info.last = map->last;
> info.capability = 0;
> - if (domain->bounce_map && map->start == 0 &&
> - map->last == domain->bounce_size - 1)
> + if (dev->domain->bounce_map && map->start == 0 &&
> + map->last == dev->domain->bounce_size - 1)
> info.capability |= VDUSE_IOVA_CAP_UMEM;
> }
> - spin_unlock(&domain->iotlb_lock);
> + spin_unlock(&dev->domain->iotlb_lock);
> + mutex_unlock(&dev->domain_lock);
> if (!map)
> break;
>
> @@ -1390,7 +1408,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
> {
> struct vduse_dev *dev = file->private_data;
>
> - vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
> + mutex_lock(&dev->domain_lock);
> + if (dev->domain)
> + vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
> + mutex_unlock(&dev->domain_lock);
> spin_lock(&dev->msg_lock);
> /* Make sure the inflight messages can processed after reconncection */
> list_splice_init(&dev->recv_list, &dev->send_list);
> @@ -1647,6 +1668,7 @@ static struct vduse_dev *vduse_dev_create(void)
>
> mutex_init(&dev->lock);
> mutex_init(&dev->mem_lock);
> + mutex_init(&dev->domain_lock);
> spin_lock_init(&dev->msg_lock);
> INIT_LIST_HEAD(&dev->send_list);
> INIT_LIST_HEAD(&dev->recv_list);
> @@ -1696,7 +1718,8 @@ static int vduse_destroy_dev(char *name)
> idr_remove(&vduse_idr, dev->minor);
> kvfree(dev->config);
> vduse_dev_deinit_vqs(dev);
> - vduse_domain_destroy(dev->domain);
> + if (dev->domain)
> + vduse_domain_destroy(dev->domain);
> kfree(dev->name);
> vduse_dev_destroy(dev);
> module_put(THIS_MODULE);
> @@ -1802,11 +1825,7 @@ static int vduse_create_dev(struct vduse_dev_config *config,
> if (!dev->name)
> goto err_str;
>
> - dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
> - VDUSE_BOUNCE_SIZE);
> - if (!dev->domain)
> - goto err_domain;
> -
> + dev->bounce_size = VDUSE_BOUNCE_SIZE;
> dev->config = config_buf;
> dev->config_size = config->config_size;
>
> @@ -1836,8 +1855,6 @@ static int vduse_create_dev(struct vduse_dev_config *config,
> err_dev:
> idr_remove(&vduse_idr, dev->minor);
> err_idr:
> - vduse_domain_destroy(dev->domain);
> -err_domain:
> kfree(dev->name);
> err_str:
> vduse_dev_destroy(dev);
> @@ -2004,9 +2021,23 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
> if (ret)
> return ret;
>
> + mutex_lock(&dev->domain_lock);
> + if (!dev->domain)
> + dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1,
> + dev->bounce_size);
> + mutex_unlock(&dev->domain_lock);
> + if (!dev->domain) {
> + put_device(&dev->vdev->vdpa.dev);
> + return -ENOMEM;
> + }
> +
> ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num);
> if (ret) {
> put_device(&dev->vdev->vdpa.dev);
> + mutex_lock(&dev->domain_lock);
> + vduse_domain_destroy(dev->domain);
> + dev->domain = NULL;
> + mutex_unlock(&dev->domain_lock);
> return ret;
> }
>
> --
> 2.20.1
>
On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <[email protected]> wrote:
>
> As discussed in [1], this adds sysfs interface to support
> specifying bounce buffer size in virtio-vdpa case. It would
> be a performance tuning parameter for high throughput workloads.
>
> [1] https://www.spinics.net/lists/netdev/msg754288.html
>
> Signed-off-by: Xie Yongji <[email protected]>
Acked-by: Jason Wang <[email protected]>
Thanks
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 45 +++++++++++++++++++++++++++++-
> 1 file changed, 44 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index bd1ba6c33e09..0458d61cee1f 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -38,8 +38,11 @@
> #define DRV_LICENSE "GPL v2"
>
> #define VDUSE_DEV_MAX (1U << MINORBITS)
> +#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> +#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> -#define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
> +/* 128 MB reserved for virtqueue creation */
> +#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
> #define VDUSE_MSG_DEFAULT_TIMEOUT 30
>
> struct vduse_virtqueue {
> @@ -1795,8 +1798,48 @@ static ssize_t msg_timeout_store(struct device *device,
>
> static DEVICE_ATTR_RW(msg_timeout);
>
> +static ssize_t bounce_size_show(struct device *device,
> + struct device_attribute *attr, char *buf)
> +{
> + struct vduse_dev *dev = dev_get_drvdata(device);
> +
> + return sysfs_emit(buf, "%u\n", dev->bounce_size);
> +}
> +
> +static ssize_t bounce_size_store(struct device *device,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct vduse_dev *dev = dev_get_drvdata(device);
> + unsigned int bounce_size;
> + int ret;
> +
> + ret = -EPERM;
> + mutex_lock(&dev->domain_lock);
> + if (dev->domain)
> + goto unlock;
> +
> + ret = kstrtouint(buf, 10, &bounce_size);
> + if (ret < 0)
> + goto unlock;
> +
> + ret = -EINVAL;
> + if (bounce_size > VDUSE_MAX_BOUNCE_SIZE ||
> + bounce_size < VDUSE_MIN_BOUNCE_SIZE)
> + goto unlock;
> +
> + dev->bounce_size = bounce_size & PAGE_MASK;
> + ret = count;
> +unlock:
> + mutex_unlock(&dev->domain_lock);
> + return ret;
> +}
> +
> +static DEVICE_ATTR_RW(bounce_size);
> +
> static struct attribute *vduse_dev_attrs[] = {
> &dev_attr_msg_timeout.attr,
> + &dev_attr_bounce_size.attr,
> NULL
> };
>
> --
> 2.20.1
>
On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <[email protected]> wrote:
>
> Add enable_irq_wq sysfs interface to control whether
> use workqueue to inject irq or not. The vhost-vdpa case
> can benefit from it.
Do we have a benchmark result for this?
Or I wonder if we can extend set_vq_cb() by associating an eventfd so
that VDUSE can signal that eventfd directly?
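Just a rough sketch of what I mean (untested, the trigger field and its
handling are made up here):

	struct vdpa_callback {
		void (*callback)(void *data);
		void *private;
		/* Optional: if the bus driver sets this, the parent
		 * (e.g. VDUSE) can signal the eventfd directly instead
		 * of bouncing through a workqueue to call ->callback.
		 */
		struct eventfd_ctx *trigger;
	};

	/* in the VDUSE irq path, roughly: */
	if (vq->cb.trigger)
		eventfd_signal(vq->cb.trigger, 1);
	else if (vq->cb.callback)
		vq->cb.callback(vq->cb.private);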
Thanks
>
> Signed-off-by: Xie Yongji <[email protected]>
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 50 +++++++++++++++++++++++++++++-
> 1 file changed, 49 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index c65f84100e30..ed06c7afd484 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -62,6 +62,7 @@ struct vduse_virtqueue {
> struct cpumask irq_affinity;
> spinlock_t irq_affinity_lock;
> struct kobject kobj;
> + bool enable_irq_wq;
> };
>
> struct vduse_dev;
> @@ -1013,6 +1014,26 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
> return ret;
> }
>
> +static int vduse_dev_inject_vq_irq(struct vduse_dev *dev,
> + struct vduse_virtqueue *vq)
> +{
> + int ret = -EINVAL;
> +
> + down_read(&dev->rwsem);
> + if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
> + goto unlock;
> +
> + ret = 0;
> + spin_lock_irq(&vq->irq_lock);
> + if (vq->ready && vq->cb.callback)
> + vq->cb.callback(vq->cb.private);
> + spin_unlock_irq(&vq->irq_lock);
> +unlock:
> + up_read(&dev->rwsem);
> +
> + return ret;
> +}
> +
> static int vduse_dev_dereg_umem(struct vduse_dev *dev,
> u64 iova, u64 size)
> {
> @@ -1278,8 +1299,12 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> break;
>
> index = array_index_nospec(index, dev->vq_num);
> - ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject,
> + if (dev->vqs[index]->enable_irq_wq)
> + ret = vduse_dev_queue_irq_work(dev,
> + &dev->vqs[index]->inject,
> dev->vqs[index]->irq_effective_cpu);
> + else
> + ret = vduse_dev_inject_vq_irq(dev, dev->vqs[index]);
> break;
> }
> case VDUSE_IOTLB_REG_UMEM: {
> @@ -1420,6 +1445,26 @@ static const struct file_operations vduse_dev_fops = {
> .llseek = noop_llseek,
> };
>
> +static ssize_t enable_irq_wq_show(struct vduse_virtqueue *vq, char *buf)
> +{
> + return sprintf(buf, "%d\n", vq->enable_irq_wq);
> +}
> +
> +static ssize_t enable_irq_wq_store(struct vduse_virtqueue *vq,
> + const char *buf, size_t count)
> +{
> + bool enabled;
> + int ret;
> +
> + ret = kstrtobool(buf, &enabled);
> + if (ret)
> + return ret;
> +
> + vq->enable_irq_wq = enabled;
> +
> + return count;
> +}
> +
> static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
> {
> return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
> @@ -1480,10 +1525,12 @@ struct vq_sysfs_entry {
> static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RO(irq_cb_affinity);
> static struct vq_sysfs_entry irq_cb_effective_affinity_attr =
> __ATTR_RW(irq_cb_effective_affinity);
> +static struct vq_sysfs_entry enable_irq_wq_attr = __ATTR_RW(enable_irq_wq);
>
> static struct attribute *vq_attrs[] = {
> &irq_cb_affinity_attr.attr,
> &irq_cb_effective_affinity_attr.attr,
> + &enable_irq_wq_attr.attr,
> NULL,
> };
> ATTRIBUTE_GROUPS(vq);
> @@ -1565,6 +1612,7 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
>
> dev->vqs[i]->index = i;
> dev->vqs[i]->irq_effective_cpu = -1;
> + dev->vqs[i]->enable_irq_wq = true;
> INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
> INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
> spin_lock_init(&dev->vqs[i]->kick_lock);
> --
> 2.20.1
>
On Fri, Dec 16, 2022 at 1:30 PM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 5, 2022 at 4:59 PM Xie Yongji <[email protected]> wrote:
> >
> > This brings current interrupt affinity spreading mechanism
> > to vduse device. We will make use of irq_create_affinity_masks()
> > to create an irq callback affinity mask for each virtqueue of
> > vduse device. Then we will choose the CPU which has the lowest
> > number of interrupt allocated in the affinity mask to run the
> > irq callback.
>
> This seems a balance mechanism but it might not be the semantic of the
> affinity or any reason we need to do this? I guess we should use at
> least round-robin in this case.
>
Here we try to follow the PCI interrupt management mechanism. In the VM
case, the interrupt should always be delivered to one specific CPU
rather than to each CPU in turn.
> >
> > Signed-off-by: Xie Yongji <[email protected]>
> > ---
> > drivers/vdpa/vdpa_user/vduse_dev.c | 50 ++++++++++++++++++++++++++++++
> > 1 file changed, 50 insertions(+)
> >
> > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > index d126f3e32a20..90c2896039d9 100644
> > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > @@ -23,6 +23,7 @@
> > #include <linux/nospec.h>
> > #include <linux/vmalloc.h>
> > #include <linux/sched/mm.h>
> > +#include <linux/interrupt.h>
> > #include <uapi/linux/vduse.h>
> > #include <uapi/linux/vdpa.h>
> > #include <uapi/linux/virtio_config.h>
> > @@ -58,6 +59,8 @@ struct vduse_virtqueue {
> > struct work_struct inject;
> > struct work_struct kick;
> > int irq_effective_cpu;
> > + struct cpumask irq_affinity;
> > + spinlock_t irq_affinity_lock;
>
> Ok, I'd suggest to squash this into patch 5 to make it more easier to
> be reviewed.
>
OK.
> > };
> >
> > struct vduse_dev;
> > @@ -123,6 +126,7 @@ struct vduse_control {
> >
> > static DEFINE_MUTEX(vduse_lock);
> > static DEFINE_IDR(vduse_idr);
> > +static DEFINE_PER_CPU(unsigned long, vduse_allocated_irq);
> >
> > static dev_t vduse_major;
> > static struct class *vduse_class;
> > @@ -710,6 +714,49 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
> > return dev->generation;
> > }
> >
> > +static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
> > +{
> > + unsigned int cpu, best_cpu;
> > + unsigned long allocated, allocated_min = UINT_MAX;
> > +
> > + spin_lock(&vq->irq_affinity_lock);
> > +
> > + best_cpu = vq->irq_effective_cpu;
> > + if (best_cpu != -1)
> > + per_cpu(vduse_allocated_irq, best_cpu) -= 1;
> > +
> > + for_each_cpu(cpu, &vq->irq_affinity) {
> > + allocated = per_cpu(vduse_allocated_irq, cpu);
> > + if (!cpu_online(cpu) || allocated >= allocated_min)
> > + continue;
> > +
> > + best_cpu = cpu;
> > + allocated_min = allocated;
> > + }
> > + vq->irq_effective_cpu = best_cpu;
> > + per_cpu(vduse_allocated_irq, best_cpu) += 1;
> > +
> > + spin_unlock(&vq->irq_affinity_lock);
> > +}
> > +
> > +static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
> > + struct irq_affinity *desc)
> > +{
> > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > + struct irq_affinity_desc *affd = NULL;
> > + int i;
> > +
> > + affd = irq_create_affinity_masks(dev->vq_num, desc);
> > + if (!affd)
>
> Let's add a comment on the vdpa config ops to say set_irq_affinity()
> is best effort.
>
OK.
Thanks,
Yongji
On Fri, Dec 16, 2022 at 11:58 AM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 5, 2022 at 4:43 PM Xie Yongji <[email protected]> wrote:
> >
> > This introduces set_irq_affinity callback in
> > vdpa_config_ops so that vdpa device driver can
> > get the interrupt affinity hint from the virtio
> > device driver. The interrupt affinity hint would
> > be needed by the interrupt affinity spreading
> > mechanism.
> >
> > Signed-off-by: Xie Yongji <[email protected]>
> > ---
> > drivers/virtio/virtio_vdpa.c | 4 ++++
> > include/linux/vdpa.h | 8 ++++++++
> > 2 files changed, 12 insertions(+)
> >
> > diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
> > index 08084b49e5a1..4731e4616ee0 100644
> > --- a/drivers/virtio/virtio_vdpa.c
> > +++ b/drivers/virtio/virtio_vdpa.c
> > @@ -275,9 +275,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
> > struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
> > struct vdpa_device *vdpa = vd_get_vdpa(vdev);
> > const struct vdpa_config_ops *ops = vdpa->config;
> > + struct irq_affinity default_affd = { 0 };
> > struct vdpa_callback cb;
> > int i, err, queue_idx = 0;
> >
> > + if (ops->set_irq_affinity)
> > + ops->set_irq_affinity(vdpa, desc ? desc : &default_affd);
>
> I wonder if we need to do this in vhost-vDPA.
I don't get why we need to do this in vhost-vDPA? Should this be done in VM?
> Or it's better to have a
> default affinity by the vDPA parent
>
I think both are OK. But the default value should always be zero, so I
put it in a common place.
> (Looking at virtio-pci, it doesn't do something like this).
>
Yes, but we did something like this in the pci layer:
pci_alloc_irq_vectors_affinity().
Thanks,
Yongji
On Fri, Dec 16, 2022 at 12:02 PM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 5, 2022 at 4:44 PM Xie Yongji <[email protected]> wrote:
> >
> > This introduces a bound workqueue to support running
> > irq callback in a specified cpu.
> >
> > Signed-off-by: Xie Yongji <[email protected]>
> > ---
> > drivers/vdpa/vdpa_user/vduse_dev.c | 29 ++++++++++++++++++++++-------
> > 1 file changed, 22 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > index 37809bfcb7ef..d126f3e32a20 100644
> > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > @@ -57,6 +57,7 @@ struct vduse_virtqueue {
> > struct vdpa_callback cb;
> > struct work_struct inject;
> > struct work_struct kick;
> > + int irq_effective_cpu;
>
> I wonder why it's a cpu number instead of a cpumask. The latter seems
more flexible, e.g. when using NUMA.
>
This variable represents the CPU that runs the interrupt callback
rather than CPU affinity.
> > };
> >
> > struct vduse_dev;
> > @@ -128,6 +129,7 @@ static struct class *vduse_class;
> > static struct cdev vduse_ctrl_cdev;
> > static struct cdev vduse_cdev;
> > static struct workqueue_struct *vduse_irq_wq;
> > +static struct workqueue_struct *vduse_irq_bound_wq;
> >
> > static u32 allowed_device_id[] = {
> > VIRTIO_ID_BLOCK,
> > @@ -917,7 +919,8 @@ static void vduse_vq_irq_inject(struct work_struct *work)
> > }
> >
> > static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
> > - struct work_struct *irq_work)
> > + struct work_struct *irq_work,
> > + int irq_effective_cpu)
> > {
> > int ret = -EINVAL;
> >
> > @@ -926,7 +929,11 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
> > goto unlock;
> >
> > ret = 0;
> > - queue_work(vduse_irq_wq, irq_work);
> > + if (irq_effective_cpu == -1)
>
> Is it better to have a macro for this magic number?
>
It makes sense to me.
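Something like this (the name is just a placeholder):

	/* no CPU is bound for the vq's irq callback */
	#define VDUSE_IRQ_UNBOUND	-1

and then use it instead of the bare -1 in vduse_dev_queue_irq_work()
and vduse_dev_init_vqs().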
Thanks,
Yongji
On Fri, Dec 16, 2022 at 1:35 PM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <[email protected]> wrote:
> >
> > Add sysfs interface for each vduse virtqueue to
> > show the affinity and effective affinity for irq
> > callback.
> >
> > And we can also use this interface to change the
> > effective affinity which must be a subset of the
> > irq callback affinity mask for the virtqueue. This
> > might be useful for performance tuning when the irq
> > callback affinity mask contains more than one CPU.
> >
> > Signed-off-by: Xie Yongji <[email protected]>
> > ---
> > drivers/vdpa/vdpa_user/vduse_dev.c | 148 ++++++++++++++++++++++++++---
> > 1 file changed, 137 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > index 6507a78abc9d..c65f84100e30 100644
> > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > @@ -61,6 +61,7 @@ struct vduse_virtqueue {
> > int irq_effective_cpu;
> > struct cpumask irq_affinity;
> > spinlock_t irq_affinity_lock;
> > + struct kobject kobj;
> > };
> >
> > struct vduse_dev;
> > @@ -1419,6 +1420,120 @@ static const struct file_operations vduse_dev_fops = {
> > .llseek = noop_llseek,
> > };
> >
> > +static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
> > +{
> > + return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
> > +}
> > +
> > +static ssize_t irq_cb_effective_affinity_show(struct vduse_virtqueue *vq,
> > + char *buf)
> > +{
> > + struct cpumask all_mask = CPU_MASK_ALL;
> > + const struct cpumask *mask = &all_mask;
> > +
> > + if (vq->irq_effective_cpu != -1)
> > + mask = get_cpu_mask(vq->irq_effective_cpu);
>
> Shouldn't this be vq->irq_affinity?
>
This sysfs interface is provided for effective irq affinity rather
than irq affinity. We created another read-only sysfs interface for
irq affinity.
> > +
> > + return sprintf(buf, "%*pb\n", cpumask_pr_args(mask));
> > +}
> > +
> > +static ssize_t irq_cb_effective_affinity_store(struct vduse_virtqueue *vq,
> > + const char *buf, size_t count)
> > +{
> > + cpumask_var_t new_value;
> > + int ret;
> > +
> > + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
> > + return -ENOMEM;
> > +
> > + ret = cpumask_parse(buf, new_value);
> > + if (ret)
> > + goto free_mask;
> > +
> > + ret = -EINVAL;
> > + if (!cpumask_intersects(new_value, &vq->irq_affinity))
> > + goto free_mask;
> > +
> > + spin_lock(&vq->irq_affinity_lock);
> > +
> > + if (vq->irq_effective_cpu != -1)
> > + per_cpu(vduse_allocated_irq, vq->irq_effective_cpu) -= 1;
> > +
> > + vq->irq_effective_cpu = cpumask_first(new_value);
>
> Does this mean except for the first cpu, the rest of the mask is unused?
>
Yes, but the user should always specify a mask that only contains one
CPU to make it work as expected. This sysfs interface is used to
specify the effective irq affinity rather than irq affinity.
Thanks,
Yongji
On Mon, Dec 19, 2022 at 12:39 PM Yongji Xie <[email protected]> wrote:
>
> On Fri, Dec 16, 2022 at 11:58 AM Jason Wang <[email protected]> wrote:
> >
> > On Mon, Dec 5, 2022 at 4:43 PM Xie Yongji <[email protected]> wrote:
> > >
> > > This introduces set_irq_affinity callback in
> > > vdpa_config_ops so that vdpa device driver can
> > > get the interrupt affinity hint from the virtio
> > > device driver. The interrupt affinity hint would
> > > be needed by the interrupt affinity spreading
> > > mechanism.
> > >
> > > Signed-off-by: Xie Yongji <[email protected]>
> > > ---
> > > drivers/virtio/virtio_vdpa.c | 4 ++++
> > > include/linux/vdpa.h | 8 ++++++++
> > > 2 files changed, 12 insertions(+)
> > >
> > > diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
> > > index 08084b49e5a1..4731e4616ee0 100644
> > > --- a/drivers/virtio/virtio_vdpa.c
> > > +++ b/drivers/virtio/virtio_vdpa.c
> > > @@ -275,9 +275,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
> > > struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
> > > struct vdpa_device *vdpa = vd_get_vdpa(vdev);
> > > const struct vdpa_config_ops *ops = vdpa->config;
> > > + struct irq_affinity default_affd = { 0 };
> > > struct vdpa_callback cb;
> > > int i, err, queue_idx = 0;
> > >
> > > + if (ops->set_irq_affinity)
> > > + ops->set_irq_affinity(vdpa, desc ? desc : &default_affd);
> >
> > I wonder if we need to do this in vhost-vDPA.
>
> I don't get why we need to do this in vhost-vDPA? Should this be done in VM?
If I was not wrong, this tries to set affinity on the host instead of
the guest. More below.
>
> > Or it's better to have a
> > default affinity by the vDPA parent
> >
>
> I think both are OK. But the default value should always be zero, so I
> put it in a common place.
I think we should either:
1) document the zero default value in vdpa.c
2) set the zero in both vhost-vdpa and virtio-vdpa, or in the vdpa core
>
> > (Looking at virtio-pci, it doesn't do something like this).
> >
>
> Yes, but we did something like this in the pci layer:
> pci_alloc_irq_vectors_affinity().
Ok.
Thanks
>
> Thanks,
> Yongji
>
On Fri, Dec 16, 2022 at 1:43 PM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <[email protected]> wrote:
> >
> > Add enable_irq_wq sysfs interface to control whether
> > use workqueue to inject irq or not. The vhost-vdpa case
> > can benefit from it.
>
> Do we have a benchmark result for this?
>
It can reduce the latency by 2~3 us in our sync I/O test.
> Or I wonder if we can extend set_vq_cb() by associating an eventfd
> then VDUSE can signal that eventfd directly?
>
It looks good to me. I can try this way in v3.
Thanks,
Yongji
On Mon, Dec 19, 2022 at 2:06 PM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 19, 2022 at 12:39 PM Yongji Xie <[email protected]> wrote:
> >
> > On Fri, Dec 16, 2022 at 11:58 AM Jason Wang <[email protected]> wrote:
> > >
> > > On Mon, Dec 5, 2022 at 4:43 PM Xie Yongji <[email protected]> wrote:
> > > >
> > > > This introduces set_irq_affinity callback in
> > > > vdpa_config_ops so that vdpa device driver can
> > > > get the interrupt affinity hint from the virtio
> > > > device driver. The interrupt affinity hint would
> > > > be needed by the interrupt affinity spreading
> > > > mechanism.
> > > >
> > > > Signed-off-by: Xie Yongji <[email protected]>
> > > > ---
> > > > drivers/virtio/virtio_vdpa.c | 4 ++++
> > > > include/linux/vdpa.h | 8 ++++++++
> > > > 2 files changed, 12 insertions(+)
> > > >
> > > > diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
> > > > index 08084b49e5a1..4731e4616ee0 100644
> > > > --- a/drivers/virtio/virtio_vdpa.c
> > > > +++ b/drivers/virtio/virtio_vdpa.c
> > > > @@ -275,9 +275,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
> > > > struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
> > > > struct vdpa_device *vdpa = vd_get_vdpa(vdev);
> > > > const struct vdpa_config_ops *ops = vdpa->config;
> > > > + struct irq_affinity default_affd = { 0 };
> > > > struct vdpa_callback cb;
> > > > int i, err, queue_idx = 0;
> > > >
> > > > + if (ops->set_irq_affinity)
> > > > + ops->set_irq_affinity(vdpa, desc ? desc : &default_affd);
> > >
> > > I wonder if we need to do this in vhost-vDPA.
> >
> > I don't get why we need to do this in vhost-vDPA? Should this be done in VM?
>
> If I was not wrong, this tries to set affinity on the host instead of
> the guest. More below.
>
Yes, it's host stuff. This is used by the virtio device driver to pass
the irq affinity hint (tell which irq vectors don't need affinity
management) to the irq affinity manager. In the VM case, it should
only be related to the guest's virtio device driver and pci irq
affinity manager. So I don't get why we need to do this in vhost-vDPA.
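For example (just to illustrate how the hint is used, not taken from
this series): a driver can put a config/control vector into pre_vectors
so that it is excluded from the automatic spreading:

	struct irq_affinity desc = {
		/* the first vector gets no managed affinity */
		.pre_vectors = 1,
	};

	/* only the remaining (per-VQ) vectors are then spread by
	 * irq_create_affinity_masks(nvecs, &desc)
	 */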
> >
> > > Or it's better to have a
> > > default affinity by the vDPA parent
> > >
> >
> > I think both are OK. But the default value should always be zero, so I
> > put it in a common place.
>
> I think we should either:
>
> 1) document the zero default value in vdpa.c
> 2) set the zero in both vhost-vdpa and virtio-vdpa, or in the vdpa core
>
Can we only call it in the virtio-vdpa case? Thus the vdpa device
driver can know whether it needs to do the automatic irq affinity
management or not. In the vhost-vdpa case, we actually don't need the
irq affinity management.
Thanks,
Yongji
On Mon, Dec 19, 2022 at 1:04 PM Yongji Xie <[email protected]> wrote:
>
> On Fri, Dec 16, 2022 at 12:02 PM Jason Wang <[email protected]> wrote:
> >
> > On Mon, Dec 5, 2022 at 4:44 PM Xie Yongji <[email protected]> wrote:
> > >
> > > This introduces a bound workqueue to support running
> > > irq callback in a specified cpu.
> > >
> > > Signed-off-by: Xie Yongji <[email protected]>
> > > ---
> > > drivers/vdpa/vdpa_user/vduse_dev.c | 29 ++++++++++++++++++++++-------
> > > 1 file changed, 22 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > index 37809bfcb7ef..d126f3e32a20 100644
> > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > @@ -57,6 +57,7 @@ struct vduse_virtqueue {
> > > struct vdpa_callback cb;
> > > struct work_struct inject;
> > > struct work_struct kick;
> > > + int irq_effective_cpu;
> >
> > I wonder why it's a cpu number instead of a cpumask. The latter seems
> > more flexible, e.g when using NUMA.
> >
>
> This variable represents the CPU that runs the interrupt callback
> rather than CPU affinity.
Ok, but for some reason it only gets updated when a new affinity is set?
(Btw, I don't see how the code deals with cpu hotplug, do we need
a cpu hotplug notifier?)
Thanks
>
> > > };
> > >
> > > struct vduse_dev;
> > > @@ -128,6 +129,7 @@ static struct class *vduse_class;
> > > static struct cdev vduse_ctrl_cdev;
> > > static struct cdev vduse_cdev;
> > > static struct workqueue_struct *vduse_irq_wq;
> > > +static struct workqueue_struct *vduse_irq_bound_wq;
> > >
> > > static u32 allowed_device_id[] = {
> > > VIRTIO_ID_BLOCK,
> > > @@ -917,7 +919,8 @@ static void vduse_vq_irq_inject(struct work_struct *work)
> > > }
> > >
> > > static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
> > > - struct work_struct *irq_work)
> > > + struct work_struct *irq_work,
> > > + int irq_effective_cpu)
> > > {
> > > int ret = -EINVAL;
> > >
> > > @@ -926,7 +929,11 @@ static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
> > > goto unlock;
> > >
> > > ret = 0;
> > > - queue_work(vduse_irq_wq, irq_work);
> > > + if (irq_effective_cpu == -1)
> >
> > Is it better to have a macro for this magic number?
> >
>
> It makes sense to me.
>
> Thanks,
> Yongji
>
On Mon, Dec 19, 2022 at 1:16 PM Yongji Xie <[email protected]> wrote:
>
> On Fri, Dec 16, 2022 at 1:35 PM Jason Wang <[email protected]> wrote:
> >
> > On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <[email protected]> wrote:
> > >
> > > Add sysfs interface for each vduse virtqueue to
> > > show the affinity and effective affinity for irq
> > > callback.
> > >
> > > And we can also use this interface to change the
> > > effective affinity which must be a subset of the
> > > irq callback affinity mask for the virtqueue. This
> > > might be useful for performance tuning when the irq
> > > callback affinity mask contains more than one CPU.
> > >
> > > Signed-off-by: Xie Yongji <[email protected]>
> > > ---
> > > drivers/vdpa/vdpa_user/vduse_dev.c | 148 ++++++++++++++++++++++++++---
> > > 1 file changed, 137 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > index 6507a78abc9d..c65f84100e30 100644
> > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > @@ -61,6 +61,7 @@ struct vduse_virtqueue {
> > > int irq_effective_cpu;
> > > struct cpumask irq_affinity;
> > > spinlock_t irq_affinity_lock;
> > > + struct kobject kobj;
> > > };
> > >
> > > struct vduse_dev;
> > > @@ -1419,6 +1420,120 @@ static const struct file_operations vduse_dev_fops = {
> > > .llseek = noop_llseek,
> > > };
> > >
> > > +static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
> > > +{
> > > + return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
> > > +}
> > > +
> > > +static ssize_t irq_cb_effective_affinity_show(struct vduse_virtqueue *vq,
> > > + char *buf)
> > > +{
> > > + struct cpumask all_mask = CPU_MASK_ALL;
> > > + const struct cpumask *mask = &all_mask;
> > > +
> > > + if (vq->irq_effective_cpu != -1)
> > > + mask = get_cpu_mask(vq->irq_effective_cpu);
> >
> > Shouldn't this be vq->irq_affinity?
> >
>
> This sysfs interface is provided for effective irq affinity rather
> than irq affinity. We created another read-only sysfs interface for
> irq affinity.
>
> > > +
> > > + return sprintf(buf, "%*pb\n", cpumask_pr_args(mask));
> > > +}
> > > +
> > > +static ssize_t irq_cb_effective_affinity_store(struct vduse_virtqueue *vq,
> > > + const char *buf, size_t count)
> > > +{
> > > + cpumask_var_t new_value;
> > > + int ret;
> > > +
> > > + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
> > > + return -ENOMEM;
> > > +
> > > + ret = cpumask_parse(buf, new_value);
> > > + if (ret)
> > > + goto free_mask;
> > > +
> > > + ret = -EINVAL;
> > > + if (!cpumask_intersects(new_value, &vq->irq_affinity))
> > > + goto free_mask;
> > > +
> > > + spin_lock(&vq->irq_affinity_lock);
> > > +
> > > + if (vq->irq_effective_cpu != -1)
> > > + per_cpu(vduse_allocated_irq, vq->irq_effective_cpu) -= 1;
> > > +
> > > + vq->irq_effective_cpu = cpumask_first(new_value);
> >
> > Does this mean except for the first cpu, the rest of the mask is unused?
> >
>
> Yes, but the user should always specify a mask that only contains one
> CPU to make it work as expected.
This doesn't seem to be the way that the IRQ affinity{hint} exported
via /sys works. Any reason for doing this?
(E.g. we may have the requirement to limit the IRQ/callback to a NUMA
node instead of a specific cpu.)
Thanks
> This sysfs interface is used to
> specify the effective irq affinity rather than irq affinity.
>
> Thanks,
> Yongji
>
On Mon, Dec 19, 2022 at 12:56 PM Yongji Xie <[email protected]> wrote:
>
> On Fri, Dec 16, 2022 at 1:30 PM Jason Wang <[email protected]> wrote:
> >
> > On Mon, Dec 5, 2022 at 4:59 PM Xie Yongji <[email protected]> wrote:
> > >
> > > This brings current interrupt affinity spreading mechanism
> > > to vduse device. We will make use of irq_create_affinity_masks()
> > > to create an irq callback affinity mask for each virtqueue of
> > > vduse device. Then we will choose the CPU which has the lowest
> > > number of interrupt allocated in the affinity mask to run the
> > > irq callback.
> >
> > This seems a balance mechanism but it might not be the semantic of the
> > affinity or any reason we need to do this? I guess we should use at
> > least round-robin in this case.
> >
>
> Here we try to follow the pci interrupt management mechanism. In VM
> cases, the interrupt should always be triggered to one specific CPU
> rather than to each CPU in turn.
If I was not wrong, when using MSI, most arch allows not only the
cpuid as the destination but policy like rr and low priority first.
Thanks
>
> > >
> > > Signed-off-by: Xie Yongji <[email protected]>
> > > ---
> > > drivers/vdpa/vdpa_user/vduse_dev.c | 50 ++++++++++++++++++++++++++++++
> > > 1 file changed, 50 insertions(+)
> > >
> > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > index d126f3e32a20..90c2896039d9 100644
> > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > @@ -23,6 +23,7 @@
> > > #include <linux/nospec.h>
> > > #include <linux/vmalloc.h>
> > > #include <linux/sched/mm.h>
> > > +#include <linux/interrupt.h>
> > > #include <uapi/linux/vduse.h>
> > > #include <uapi/linux/vdpa.h>
> > > #include <uapi/linux/virtio_config.h>
> > > @@ -58,6 +59,8 @@ struct vduse_virtqueue {
> > > struct work_struct inject;
> > > struct work_struct kick;
> > > int irq_effective_cpu;
> > > + struct cpumask irq_affinity;
> > > + spinlock_t irq_affinity_lock;
> >
> > Ok, I'd suggest to squash this into patch 5 to make it more easier to
> > be reviewed.
> >
>
> OK.
>
> > > };
> > >
> > > struct vduse_dev;
> > > @@ -123,6 +126,7 @@ struct vduse_control {
> > >
> > > static DEFINE_MUTEX(vduse_lock);
> > > static DEFINE_IDR(vduse_idr);
> > > +static DEFINE_PER_CPU(unsigned long, vduse_allocated_irq);
> > >
> > > static dev_t vduse_major;
> > > static struct class *vduse_class;
> > > @@ -710,6 +714,49 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
> > > return dev->generation;
> > > }
> > >
> > > +static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
> > > +{
> > > + unsigned int cpu, best_cpu;
> > > + unsigned long allocated, allocated_min = UINT_MAX;
> > > +
> > > + spin_lock(&vq->irq_affinity_lock);
> > > +
> > > + best_cpu = vq->irq_effective_cpu;
> > > + if (best_cpu != -1)
> > > + per_cpu(vduse_allocated_irq, best_cpu) -= 1;
> > > +
> > > + for_each_cpu(cpu, &vq->irq_affinity) {
> > > + allocated = per_cpu(vduse_allocated_irq, cpu);
> > > + if (!cpu_online(cpu) || allocated >= allocated_min)
> > > + continue;
> > > +
> > > + best_cpu = cpu;
> > > + allocated_min = allocated;
> > > + }
> > > + vq->irq_effective_cpu = best_cpu;
> > > + per_cpu(vduse_allocated_irq, best_cpu) += 1;
> > > +
> > > + spin_unlock(&vq->irq_affinity_lock);
> > > +}
> > > +
> > > +static void vduse_vdpa_set_irq_affinity(struct vdpa_device *vdpa,
> > > + struct irq_affinity *desc)
> > > +{
> > > + struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> > > + struct irq_affinity_desc *affd = NULL;
> > > + int i;
> > > +
> > > + affd = irq_create_affinity_masks(dev->vq_num, desc);
> > > + if (!affd)
> >
> > Let's add a comment on the vdpa config ops to say set_irq_affinity()
> > is best effort.
> >
>
> OK.
>
> Thanks,
> Yongji
>
On Mon, Dec 19, 2022 at 3:12 PM Yongji Xie <[email protected]> wrote:
>
> On Mon, Dec 19, 2022 at 2:06 PM Jason Wang <[email protected]> wrote:
> >
> > On Mon, Dec 19, 2022 at 12:39 PM Yongji Xie <[email protected]> wrote:
> > >
> > > On Fri, Dec 16, 2022 at 11:58 AM Jason Wang <[email protected]> wrote:
> > > >
> > > > On Mon, Dec 5, 2022 at 4:43 PM Xie Yongji <[email protected]> wrote:
> > > > >
> > > > > This introduces set_irq_affinity callback in
> > > > > vdpa_config_ops so that vdpa device driver can
> > > > > get the interrupt affinity hint from the virtio
> > > > > device driver. The interrupt affinity hint would
> > > > > be needed by the interrupt affinity spreading
> > > > > mechanism.
> > > > >
> > > > > Signed-off-by: Xie Yongji <[email protected]>
> > > > > ---
> > > > > drivers/virtio/virtio_vdpa.c | 4 ++++
> > > > > include/linux/vdpa.h | 8 ++++++++
> > > > > 2 files changed, 12 insertions(+)
> > > > >
> > > > > diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
> > > > > index 08084b49e5a1..4731e4616ee0 100644
> > > > > --- a/drivers/virtio/virtio_vdpa.c
> > > > > +++ b/drivers/virtio/virtio_vdpa.c
> > > > > @@ -275,9 +275,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
> > > > > struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
> > > > > struct vdpa_device *vdpa = vd_get_vdpa(vdev);
> > > > > const struct vdpa_config_ops *ops = vdpa->config;
> > > > > + struct irq_affinity default_affd = { 0 };
> > > > > struct vdpa_callback cb;
> > > > > int i, err, queue_idx = 0;
> > > > >
> > > > > + if (ops->set_irq_affinity)
> > > > > + ops->set_irq_affinity(vdpa, desc ? desc : &default_affd);
> > > >
> > > > I wonder if we need to do this in vhost-vDPA.
> > >
> > > I don't get why we need to do this in vhost-vDPA? Should this be done in VM?
> >
> > If I was not wrong, this tries to set affinity on the host instead of
> > the guest. More below.
> >
>
> Yes, it's host stuff. This is used by the virtio device driver to pass
> the irq affinity hint (tell which irq vectors don't need affinity
> management) to the irq affinity manager. In the VM case, it should
> only be related to the guest's virtio device driver and pci irq
> affinity manager. So I don't get why we need to do this in vhost-vDPA.
It's not necessarily the VM, do we have the same requirement for
userspace (like DPDK) drivers?
Thanks
>
> > >
> > > > Or it's better to have a
> > > > default affinity by the vDPA parent
> > > >
> > >
> > > I think both are OK. But the default value should always be zero, so I
> > > put it in a common place.
> >
> > I think we should either:
> >
> > 1) document the zero default value in vdpa.c
> > 2) set the zero in both vhost-vdpa and virtio-vdpa, or in the vdpa core
> >
>
> Can we only call it in the virtio-vdpa case? Thus the vdpa device
> driver can know whether it needs to do the automatic irq affinity
> management or not. In the vhost-vdpa case, we actually don't need the
> irq affinity management.
>
> Thanks,
> Yongji
>
On Tue, Dec 20, 2022 at 2:29 PM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 19, 2022 at 1:16 PM Yongji Xie <[email protected]> wrote:
> >
> > On Fri, Dec 16, 2022 at 1:35 PM Jason Wang <[email protected]> wrote:
> > >
> > > On Mon, Dec 5, 2022 at 5:03 PM Xie Yongji <[email protected]> wrote:
> > > >
> > > > Add sysfs interface for each vduse virtqueue to
> > > > show the affinity and effective affinity for irq
> > > > callback.
> > > >
> > > > And we can also use this interface to change the
> > > > effective affinity which must be a subset of the
> > > > irq callback affinity mask for the virtqueue. This
> > > > might be useful for performance tuning when the irq
> > > > callback affinity mask contains more than one CPU.
> > > >
> > > > Signed-off-by: Xie Yongji <[email protected]>
> > > > ---
> > > > drivers/vdpa/vdpa_user/vduse_dev.c | 148 ++++++++++++++++++++++++++---
> > > > 1 file changed, 137 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > index 6507a78abc9d..c65f84100e30 100644
> > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > @@ -61,6 +61,7 @@ struct vduse_virtqueue {
> > > > int irq_effective_cpu;
> > > > struct cpumask irq_affinity;
> > > > spinlock_t irq_affinity_lock;
> > > > + struct kobject kobj;
> > > > };
> > > >
> > > > struct vduse_dev;
> > > > @@ -1419,6 +1420,120 @@ static const struct file_operations vduse_dev_fops = {
> > > > .llseek = noop_llseek,
> > > > };
> > > >
> > > > +static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf)
> > > > +{
> > > > + return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity));
> > > > +}
> > > > +
> > > > +static ssize_t irq_cb_effective_affinity_show(struct vduse_virtqueue *vq,
> > > > + char *buf)
> > > > +{
> > > > + struct cpumask all_mask = CPU_MASK_ALL;
> > > > + const struct cpumask *mask = &all_mask;
> > > > +
> > > > + if (vq->irq_effective_cpu != -1)
> > > > + mask = get_cpu_mask(vq->irq_effective_cpu);
> > >
> > > Shouldn't this be vq->irq_affinity?
> > >
> >
> > This sysfs interface is provided for effective irq affinity rather
> > than irq affinity. We created another read-only sysfs interface for
> > irq affinity.
> >
> > > > +
> > > > + return sprintf(buf, "%*pb\n", cpumask_pr_args(mask));
> > > > +}
> > > > +
> > > > +static ssize_t irq_cb_effective_affinity_store(struct vduse_virtqueue *vq,
> > > > + const char *buf, size_t count)
> > > > +{
> > > > + cpumask_var_t new_value;
> > > > + int ret;
> > > > +
> > > > + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
> > > > + return -ENOMEM;
> > > > +
> > > > + ret = cpumask_parse(buf, new_value);
> > > > + if (ret)
> > > > + goto free_mask;
> > > > +
> > > > + ret = -EINVAL;
> > > > + if (!cpumask_intersects(new_value, &vq->irq_affinity))
> > > > + goto free_mask;
> > > > +
> > > > + spin_lock(&vq->irq_affinity_lock);
> > > > +
> > > > + if (vq->irq_effective_cpu != -1)
> > > > + per_cpu(vduse_allocated_irq, vq->irq_effective_cpu) -= 1;
> > > > +
> > > > + vq->irq_effective_cpu = cpumask_first(new_value);
> > >
> > > Does this mean except for the first cpu, the rest of the mask is unused?
> > >
> >
> > Yes, but the user should always specify a mask that only contains one
> > CPU to make it work as expected.
>
> This doesn't seem to be the way that the IRQ affinity{hint} exported
> via /sys work. Any reason for doing this?
>
> (E.g we may have the require to limit the IRQ/callback to a NUMA node
> instead of a specific cpu)
>
Yes, I think we need to make the sysfs interface for irq affinity
writable. The effective irq affinity can be removed now since we
choose to use round-robin to spread IRQs between CPUs.
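Roughly something like this for the writable interface (untested
sketch):

	static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq,
					     const char *buf, size_t count)
	{
		cpumask_var_t new_value;
		int ret;

		if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
			return -ENOMEM;

		ret = cpumask_parse(buf, new_value);
		if (ret)
			goto free_mask;

		ret = -EINVAL;
		if (!cpumask_intersects(new_value, cpu_online_mask))
			goto free_mask;

		spin_lock(&vq->irq_affinity_lock);
		cpumask_copy(&vq->irq_affinity, new_value);
		spin_unlock(&vq->irq_affinity_lock);
		ret = count;
	free_mask:
		free_cpumask_var(new_value);
		return ret;
	}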
Thanks,
Yongji
On Tue, Dec 20, 2022 at 2:32 PM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 19, 2022 at 12:56 PM Yongji Xie <[email protected]> wrote:
> >
> > On Fri, Dec 16, 2022 at 1:30 PM Jason Wang <[email protected]> wrote:
> > >
> > > On Mon, Dec 5, 2022 at 4:59 PM Xie Yongji <[email protected]> wrote:
> > > >
> > > > This brings current interrupt affinity spreading mechanism
> > > > to vduse device. We will make use of irq_create_affinity_masks()
> > > > to create an irq callback affinity mask for each virtqueue of
> > > > vduse device. Then we will choose the CPU which has the lowest
> > > > number of interrupt allocated in the affinity mask to run the
> > > > irq callback.
> > >
> > > This seems a balance mechanism but it might not be the semantic of the
> > > affinity or any reason we need to do this? I guess we should use at
> > > least round-robin in this case.
> > >
> >
> > Here we try to follow the pci interrupt management mechanism. In VM
> > cases, the interrupt should always be triggered to one specific CPU
> > rather than to each CPU in turn.
>
> If I was not wrong, when using MSI, most arch allows not only the
> cpuid as the destination but policy like rr and low priority first.
>
I see. I think we can remove the irq effective affinity and just use
the irq affinity. If the irq affinity mask contains more than one CPU,
we can use round-robin to spread IRQs between CPUs. But the sysfs
interface for irq affinity should be writable so that someone who wants
to stop the round-robin can pick one CPU to run the irq callback.
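Something like this is what I have in mind for the round-robin part
(untested sketch; irq_effective_cpu is kept only as internal bookkeeping
of which CPU gets the next callback):

	static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq)
	{
		int cpu = vq->irq_effective_cpu;

		do {
			/* next online CPU in the affinity mask; fall back
			 * to unbound (-1) when we run out and start over
			 * on the next update
			 */
			cpu = cpumask_next(cpu, &vq->irq_affinity);
			if (cpu >= nr_cpu_ids)
				cpu = -1;
		} while (cpu != -1 && !cpu_online(cpu));

		vq->irq_effective_cpu = cpu;
	}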
Thanks,
Yongji
On Tue, Dec 20, 2022 at 2:31 PM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 19, 2022 at 3:12 PM Yongji Xie <[email protected]> wrote:
> >
> > On Mon, Dec 19, 2022 at 2:06 PM Jason Wang <[email protected]> wrote:
> > >
> > > On Mon, Dec 19, 2022 at 12:39 PM Yongji Xie <[email protected]> wrote:
> > > >
> > > > On Fri, Dec 16, 2022 at 11:58 AM Jason Wang <[email protected]> wrote:
> > > > >
> > > > > On Mon, Dec 5, 2022 at 4:43 PM Xie Yongji <[email protected]> wrote:
> > > > > >
> > > > > > This introduces set_irq_affinity callback in
> > > > > > vdpa_config_ops so that vdpa device driver can
> > > > > > get the interrupt affinity hint from the virtio
> > > > > > device driver. The interrupt affinity hint would
> > > > > > be needed by the interrupt affinity spreading
> > > > > > mechanism.
> > > > > >
> > > > > > Signed-off-by: Xie Yongji <[email protected]>
> > > > > > ---
> > > > > > drivers/virtio/virtio_vdpa.c | 4 ++++
> > > > > > include/linux/vdpa.h | 8 ++++++++
> > > > > > 2 files changed, 12 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
> > > > > > index 08084b49e5a1..4731e4616ee0 100644
> > > > > > --- a/drivers/virtio/virtio_vdpa.c
> > > > > > +++ b/drivers/virtio/virtio_vdpa.c
> > > > > > @@ -275,9 +275,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
> > > > > > struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
> > > > > > struct vdpa_device *vdpa = vd_get_vdpa(vdev);
> > > > > > const struct vdpa_config_ops *ops = vdpa->config;
> > > > > > + struct irq_affinity default_affd = { 0 };
> > > > > > struct vdpa_callback cb;
> > > > > > int i, err, queue_idx = 0;
> > > > > >
> > > > > > + if (ops->set_irq_affinity)
> > > > > > + ops->set_irq_affinity(vdpa, desc ? desc : &default_affd);
> > > > >
> > > > > I wonder if we need to do this in vhost-vDPA.
> > > >
> > > > I don't get why we need to do this in vhost-vDPA? Should this be done in VM?
> > >
> > > If I was not wrong, this tries to set affinity on the host instead of
> > > the guest. More below.
> > >
> >
> > Yes, it's host stuff. This is used by the virtio device driver to pass
> > the irq affinity hint (tell which irq vectors don't need affinity
> > management) to the irq affinity manager. In the VM case, it should
> > only be related to the guest's virtio device driver and pci irq
> > affinity manager. So I don't get why we need to do this in vhost-vDPA.
>
> It's not necessarily the VM, do we have the same requirement for
> userspace (like DPDK) drivers?
>
IIUC the vhost-vdpa's irq callback just signals the eventfd. I didn't
see how to use the irq affinity hint in vdpa device driver. The real
irq callback should be called in DPDK internally.
Thanks,
Yongji
On Tue, Dec 20, 2022 at 2:28 PM Jason Wang <[email protected]> wrote:
>
> On Mon, Dec 19, 2022 at 1:04 PM Yongji Xie <[email protected]> wrote:
> >
> > On Fri, Dec 16, 2022 at 12:02 PM Jason Wang <[email protected]> wrote:
> > >
> > > On Mon, Dec 5, 2022 at 4:44 PM Xie Yongji <[email protected]> wrote:
> > > >
> > > > This introduces a bound workqueue to support running
> > > > irq callback in a specified cpu.
> > > >
> > > > Signed-off-by: Xie Yongji <[email protected]>
> > > > ---
> > > > drivers/vdpa/vdpa_user/vduse_dev.c | 29 ++++++++++++++++++++++-------
> > > > 1 file changed, 22 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > index 37809bfcb7ef..d126f3e32a20 100644
> > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > @@ -57,6 +57,7 @@ struct vduse_virtqueue {
> > > > struct vdpa_callback cb;
> > > > struct work_struct inject;
> > > > struct work_struct kick;
> > > > + int irq_effective_cpu;
> > >
> > > I wonder why it's a cpu number instead of a cpumask. The latter seems
> > > more flexible, e.g when using NUMA.
> > >
> >
> > This variable represents the CPU that runs the interrupt callback
> > rather than CPU affinity.
>
> Ok, but for some reason it only gets updated when a new affinity is set?
>
Yes, since we don't use round-robin now. And if no affinity is set,
we fall back to the default behavior (using the unbound workqueue to
run the irq callback).
> (Btw, I don't see how the code deals with cpu hotplug, do we need
> cpuhot notifier?)
>
Currently the queue_work_on() can handle the cpu hotplug case, so I
think we can simply check whether the CPU is online each time we queue
the work, and then update the affinity if needed.
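I.e. something along these lines when queuing (untested sketch):

	static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
					    struct work_struct *irq_work,
					    int irq_effective_cpu)
	{
		int ret = -EINVAL;

		down_read(&dev->rwsem);
		if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
			goto unlock;

		ret = 0;
		/* fall back to the unbound workqueue if no CPU is bound
		 * or if the bound CPU went offline in the meantime
		 */
		if (irq_effective_cpu == -1 || !cpu_online(irq_effective_cpu))
			queue_work(vduse_irq_wq, irq_work);
		else
			queue_work_on(irq_effective_cpu, vduse_irq_bound_wq,
				      irq_work);
	unlock:
		up_read(&dev->rwsem);

		return ret;
	}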
Thanks,
Yongji
On Tue, Dec 20, 2022 at 6:14 PM Yongji Xie <[email protected]> wrote:
>
> On Tue, Dec 20, 2022 at 2:31 PM Jason Wang <[email protected]> wrote:
> >
> > On Mon, Dec 19, 2022 at 3:12 PM Yongji Xie <[email protected]> wrote:
> > >
> > > On Mon, Dec 19, 2022 at 2:06 PM Jason Wang <[email protected]> wrote:
> > > >
> > > > On Mon, Dec 19, 2022 at 12:39 PM Yongji Xie <[email protected]> wrote:
> > > > >
> > > > > On Fri, Dec 16, 2022 at 11:58 AM Jason Wang <[email protected]> wrote:
> > > > > >
> > > > > > On Mon, Dec 5, 2022 at 4:43 PM Xie Yongji <[email protected]> wrote:
> > > > > > >
> > > > > > > This introduces set_irq_affinity callback in
> > > > > > > vdpa_config_ops so that vdpa device driver can
> > > > > > > get the interrupt affinity hint from the virtio
> > > > > > > device driver. The interrupt affinity hint would
> > > > > > > be needed by the interrupt affinity spreading
> > > > > > > mechanism.
> > > > > > >
> > > > > > > Signed-off-by: Xie Yongji <[email protected]>
> > > > > > > ---
> > > > > > > drivers/virtio/virtio_vdpa.c | 4 ++++
> > > > > > > include/linux/vdpa.h | 8 ++++++++
> > > > > > > 2 files changed, 12 insertions(+)
> > > > > > >
> > > > > > > diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
> > > > > > > index 08084b49e5a1..4731e4616ee0 100644
> > > > > > > --- a/drivers/virtio/virtio_vdpa.c
> > > > > > > +++ b/drivers/virtio/virtio_vdpa.c
> > > > > > > @@ -275,9 +275,13 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
> > > > > > > struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
> > > > > > > struct vdpa_device *vdpa = vd_get_vdpa(vdev);
> > > > > > > const struct vdpa_config_ops *ops = vdpa->config;
> > > > > > > + struct irq_affinity default_affd = { 0 };
> > > > > > > struct vdpa_callback cb;
> > > > > > > int i, err, queue_idx = 0;
> > > > > > >
> > > > > > > + if (ops->set_irq_affinity)
> > > > > > > + ops->set_irq_affinity(vdpa, desc ? desc : &default_affd);
> > > > > >
> > > > > > I wonder if we need to do this in vhost-vDPA.
> > > > >
> > > > > I don't get why we need to do this in vhost-vDPA? Should this be done in VM?
> > > >
> > > > If I was not wrong, this tries to set affinity on the host instead of
> > > > the guest. More below.
> > > >
> > >
> > > Yes, it's host stuff. This is used by the virtio device driver to pass
> > > the irq affinity hint (tell which irq vectors don't need affinity
> > > management) to the irq affinity manager. In the VM case, it should
> > > only be related to the guest's virtio device driver and pci irq
> > > affinity manager. So I don't get why we need to do this in vhost-vDPA.
> >
> > It's not necessarily the VM, do we have the same requirement for
> > userspace (like DPDK) drivers?
> >
>
> IIUC, vhost-vdpa's irq callback just signals the eventfd. I don't see
> how the irq affinity hint could be used in the vDPA device driver. The
> real irq callback should be handled inside DPDK itself.
I agree.
Thanks
>
> Thanks,
> Yongji
>
On Tue, Dec 20, 2022 at 6:02 PM Yongji Xie <[email protected]> wrote:
>
> On Tue, Dec 20, 2022 at 2:28 PM Jason Wang <[email protected]> wrote:
> >
> > On Mon, Dec 19, 2022 at 1:04 PM Yongji Xie <[email protected]> wrote:
> > >
> > > On Fri, Dec 16, 2022 at 12:02 PM Jason Wang <[email protected]> wrote:
> > > >
> > > > On Mon, Dec 5, 2022 at 4:44 PM Xie Yongji <[email protected]> wrote:
> > > > >
> > > > > This introduces a bound workqueue to support running
> > > > > irq callback in a specified cpu.
> > > > >
> > > > > Signed-off-by: Xie Yongji <[email protected]>
> > > > > ---
> > > > > drivers/vdpa/vdpa_user/vduse_dev.c | 29 ++++++++++++++++++++++-------
> > > > > 1 file changed, 22 insertions(+), 7 deletions(-)
> > > > >
> > > > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > index 37809bfcb7ef..d126f3e32a20 100644
> > > > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > > > @@ -57,6 +57,7 @@ struct vduse_virtqueue {
> > > > > struct vdpa_callback cb;
> > > > > struct work_struct inject;
> > > > > struct work_struct kick;
> > > > > + int irq_effective_cpu;
> > > >
> > > > I wonder why it's a cpu number instead of a cpumask. The latter seems
> > > > more flexible, e.g when using NUMA.
> > > >
> > >
> > > This variable represents the CPU that runs the interrupt callback
> > > rather than CPU affinity.
> >
> > Ok, but for some reason it only gets updated when a new affinity is set?
> >
>
> Yes, since we don't use round-robin now. And if the affinity is not
> set, we fall back to the default behavior (using an unbound workqueue
> to run the irq callback).
>
> > (Btw, I don't see how the code deals with cpu hotplug, do we need
> > cpuhot notifier?)
> >
>
> Currently queue_work_on() can handle the cpu hotplug case, so I think
> we can simply check whether the CPU is online each time we queue the
> work, and update the affinity if needed.
Right.
Thanks
>
> Thanks,
> Yongji
>