This adds infrastructure required for supporting
multiple ring formats.
The idea is as follows: we first convert descriptors to an
independent format, and only convert that to an iov later.
The used ring is similar: we fetch into an independent struct
first and convert that to an IOV later.
The point is that we have a tight loop that fetches
descriptors, which is good for cache utilization.
This will also allow all kinds of batching tricks -
e.g. it seems possible to keep SMAP disabled while
we are fetching multiple descriptors.
For used descriptors, this allows keeping track of the buffer length
without the need to rescan the IOV.
In a microbenchmark, this seems to perform exactly the same
as the original code.
Lightly tested.
More testing would be very much appreciated.
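For illustration, here is a standalone toy model of the two-phase
flow (userspace C; all names and layouts are made up for the example
and do not match the actual patches):

#include <stdint.h>
#include <stdio.h>
#include <sys/uio.h>

#define F_NEXT 0x1	/* chain continues in the next descriptor */

/* format-independent descriptor: the ring format is parsed once */
struct toy_desc {
	uint64_t addr;
	uint32_t len;
	uint16_t flags;
	uint16_t next;	/* index of the next descriptor if F_NEXT */
};

static struct toy_desc cache[64];
static int first_desc, ndescs;

/* Phase 1: tight loop copying one chain from the ring into the
 * cache - no translation yet, cache-friendly, easy to batch.
 * Bounds and loop-detection checks omitted for brevity. */
static void fetch_chain(const struct toy_desc *ring, unsigned int head)
{
	unsigned int i = head;

	for (;;) {
		cache[ndescs++] = ring[i];
		if (!(ring[i].flags & F_NEXT))
			break;
		i = ring[i].next;
	}
}

/* Phase 2: convert one cached buffer to an iovec, later and
 * independently of whatever ring format filled the cache. */
static int buf_to_iov(struct iovec *iov, int iov_size)
{
	int n = 0;

	while (first_desc < ndescs && n < iov_size) {
		struct toy_desc *d = &cache[first_desc++];

		iov[n].iov_base = (void *)(uintptr_t)d->addr;
		iov[n].iov_len = d->len;
		n++;
		if (!(d->flags & F_NEXT))
			break;
	}
	return n;
}

int main(void)
{
	struct toy_desc ring[3] = {
		{ 0x1000, 128, F_NEXT, 1 },
		{ 0x2000, 256, 0, 0 },
		{ 0x3000, 64, 0, 0 },
	};
	struct iovec iov[8];

	fetch_chain(ring, 0);	/* two-descriptor chain */
	fetch_chain(ring, 2);	/* single descriptor */
	printf("first buffer: %d iovs\n", buf_to_iov(iov, 8));
	printf("second buffer: %d iovs\n", buf_to_iov(iov, 8));
	return 0;
}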
Changes from v5:
- addressed comments by Jason: squashed API changes, fixed up discard
Changes from v4:
- added used descriptor format independence
- addressed comments by Jason
- fixed a crash detected by the lkp robot
Changes from v3:
- fixed error handling in case of indirect descriptors
- added a BUG_ON to detect buffer overflow in case of bugs,
  in response to a comment by Jason Wang
- minor code tweaks
Changes from v2:
- fixed indirect descriptor batching,
  reported by Jason Wang
Changes from v1:
- typo fixes
Michael S. Tsirkin (11):
vhost: option to fetch descriptors through an independent struct
vhost: use batched get_vq_desc version
vhost/net: pass net specific struct pointer
vhost: reorder functions
vhost: format-independent API for used buffers
vhost/net: convert to new API: heads->bufs
vhost/net: avoid iov length math
vhost/test: convert to the buf API
vhost/scsi: switch to buf APIs
vhost/vsock: switch to the buf API
vhost: drop head based APIs
drivers/vhost/net.c | 174 ++++++++++---------
drivers/vhost/scsi.c | 73 ++++----
drivers/vhost/test.c | 22 +--
drivers/vhost/vhost.c | 382 +++++++++++++++++++++++++++---------------
drivers/vhost/vhost.h | 44 +++--
drivers/vhost/vsock.c | 30 ++--
6 files changed, 443 insertions(+), 282 deletions(-)
--
MST
As testing shows no performance change, switch to the batched version now.
Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Eugenio Pérez <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/test.c | 2 +-
drivers/vhost/vhost.c | 318 ++++++++----------------------------------
drivers/vhost/vhost.h | 7 +-
3 files changed, 65 insertions(+), 262 deletions(-)
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 0466921f4772..7d69778aaa26 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -119,7 +119,7 @@ static int vhost_test_open(struct inode *inode, struct file *f)
dev = &n->dev;
vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
- vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
+ vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV + 64,
VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);
f->private_data = n;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 180b7b58c76b..41d6b132c234 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -304,6 +304,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
{
vq->num = 1;
vq->ndescs = 0;
+ vq->first_desc = 0;
vq->desc = NULL;
vq->avail = NULL;
vq->used = NULL;
@@ -372,6 +373,11 @@ static int vhost_worker(void *data)
return 0;
}
+static int vhost_vq_num_batch_descs(struct vhost_virtqueue *vq)
+{
+ return vq->max_descs - UIO_MAXIOV;
+}
+
static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
kfree(vq->descs);
@@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
vq->max_descs = dev->iov_limit;
+ if (vhost_vq_num_batch_descs(vq) < 0) {
+ return -EINVAL;
+ }
vq->descs = kmalloc_array(vq->max_descs,
sizeof(*vq->descs),
GFP_KERNEL);
@@ -1610,6 +1619,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
vq->last_avail_idx = s.num;
/* Forget the cached index value. */
vq->avail_idx = vq->last_avail_idx;
+ vq->ndescs = vq->first_desc = 0;
break;
case VHOST_GET_VRING_BASE:
s.index = idx;
@@ -2078,253 +2088,6 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
return next;
}
-static int get_indirect(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num,
- struct vring_desc *indirect)
-{
- struct vring_desc desc;
- unsigned int i = 0, count, found = 0;
- u32 len = vhost32_to_cpu(vq, indirect->len);
- struct iov_iter from;
- int ret, access;
-
- /* Sanity check */
- if (unlikely(len % sizeof desc)) {
- vq_err(vq, "Invalid length in indirect descriptor: "
- "len 0x%llx not multiple of 0x%zx\n",
- (unsigned long long)len,
- sizeof desc);
- return -EINVAL;
- }
-
- ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
- UIO_MAXIOV, VHOST_ACCESS_RO);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Translation failure %d in indirect.\n", ret);
- return ret;
- }
- iov_iter_init(&from, READ, vq->indirect, ret, len);
-
- /* We will use the result as an address to read from, so most
- * architectures only need a compiler barrier here. */
- read_barrier_depends();
-
- count = len / sizeof desc;
- /* Buffers are chained via a 16 bit next field, so
- * we can have at most 2^16 of these. */
- if (unlikely(count > USHRT_MAX + 1)) {
- vq_err(vq, "Indirect buffer length too big: %d\n",
- indirect->len);
- return -E2BIG;
- }
-
- do {
- unsigned iov_count = *in_num + *out_num;
- if (unlikely(++found > count)) {
- vq_err(vq, "Loop detected: last one at %u "
- "indirect size %u\n",
- i, count);
- return -EINVAL;
- }
- if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
- vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
- i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
- return -EINVAL;
- }
- if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
- vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
- i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
- return -EINVAL;
- }
-
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
- access = VHOST_ACCESS_WO;
- else
- access = VHOST_ACCESS_RO;
-
- ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
- vhost32_to_cpu(vq, desc.len), iov + iov_count,
- iov_size - iov_count, access);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Translation failure %d indirect idx %d\n",
- ret, i);
- return ret;
- }
- /* If this is an input descriptor, increment that count. */
- if (access == VHOST_ACCESS_WO) {
- *in_num += ret;
- if (unlikely(log && ret)) {
- log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
- log[*log_num].len = vhost32_to_cpu(vq, desc.len);
- ++*log_num;
- }
- } else {
- /* If it's an output descriptor, they're all supposed
- * to come before any input descriptors. */
- if (unlikely(*in_num)) {
- vq_err(vq, "Indirect descriptor "
- "has out after in: idx %d\n", i);
- return -EINVAL;
- }
- *out_num += ret;
- }
- } while ((i = next_desc(vq, &desc)) != -1);
- return 0;
-}
-
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
-{
- struct vring_desc desc;
- unsigned int i, head, found = 0;
- u16 last_avail_idx;
- __virtio16 avail_idx;
- __virtio16 ring_head;
- int ret, access;
-
- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
-
- if (vq->avail_idx == vq->last_avail_idx) {
- if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return -EFAULT;
- }
- vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-
- if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return -EFAULT;
- }
-
- /* If there's nothing new since last we looked, return
- * invalid.
- */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
-
- /* Only get avail ring entries after they have been
- * exposed by guest.
- */
- smp_rmb();
- }
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return -EFAULT;
- }
-
- head = vhost16_to_cpu(vq, ring_head);
-
- /* If their number is silly, that's an error. */
- if (unlikely(head >= vq->num)) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return -EINVAL;
- }
-
- /* When we start there are none of either input nor output. */
- *out_num = *in_num = 0;
- if (unlikely(log))
- *log_num = 0;
-
- i = head;
- do {
- unsigned iov_count = *in_num + *out_num;
- if (unlikely(i >= vq->num)) {
- vq_err(vq, "Desc index is %u > %u, head = %u",
- i, vq->num, head);
- return -EINVAL;
- }
- if (unlikely(++found > vq->num)) {
- vq_err(vq, "Loop detected: last one at %u "
- "vq size %u head %u\n",
- i, vq->num, head);
- return -EINVAL;
- }
- ret = vhost_get_desc(vq, &desc, i);
- if (unlikely(ret)) {
- vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
- i, vq->desc + i);
- return -EFAULT;
- }
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
- ret = get_indirect(vq, iov, iov_size,
- out_num, in_num,
- log, log_num, &desc);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Failure detected "
- "in indirect descriptor at idx %d\n", i);
- return ret;
- }
- continue;
- }
-
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
- access = VHOST_ACCESS_WO;
- else
- access = VHOST_ACCESS_RO;
- ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
- vhost32_to_cpu(vq, desc.len), iov + iov_count,
- iov_size - iov_count, access);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Translation failure %d descriptor idx %d\n",
- ret, i);
- return ret;
- }
- if (access == VHOST_ACCESS_WO) {
- /* If this is an input descriptor,
- * increment that count. */
- *in_num += ret;
- if (unlikely(log && ret)) {
- log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
- log[*log_num].len = vhost32_to_cpu(vq, desc.len);
- ++*log_num;
- }
- } else {
- /* If it's an output descriptor, they're all supposed
- * to come before any input descriptors. */
- if (unlikely(*in_num)) {
- vq_err(vq, "Descriptor has out after in: "
- "idx %d\n", i);
- return -EINVAL;
- }
- *out_num += ret;
- }
- } while ((i = next_desc(vq, &desc)) != -1);
-
- /* On success, increment avail index. */
- vq->last_avail_idx++;
-
- /* Assume notifications from guest are disabled at this point,
- * if they aren't we would need to update avail_event index. */
- BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
- return head;
-}
-EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
-
static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
{
BUG_ON(!vq->ndescs);
@@ -2428,7 +2191,7 @@ static int fetch_indirect_descs(struct vhost_virtqueue *vq,
/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
* A negative code is returned on error. */
-static int fetch_descs(struct vhost_virtqueue *vq)
+static int fetch_buf(struct vhost_virtqueue *vq)
{
unsigned int i, head, found = 0;
struct vhost_desc *last;
@@ -2441,7 +2204,11 @@ static int fetch_descs(struct vhost_virtqueue *vq)
/* Check it isn't doing very strange things with descriptor numbers. */
last_avail_idx = vq->last_avail_idx;
- if (vq->avail_idx == vq->last_avail_idx) {
+ if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
+ /* If we already have work to do, don't bother re-checking. */
+ if (likely(vq->ndescs))
+ return 1;
+
if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
vq_err(vq, "Failed to access avail idx at %p\n",
&vq->avail->idx);
@@ -2532,6 +2299,41 @@ static int fetch_descs(struct vhost_virtqueue *vq)
return 1;
}
+/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
+ * A negative code is returned on error. */
+static int fetch_descs(struct vhost_virtqueue *vq)
+{
+ int ret;
+
+ if (unlikely(vq->first_desc >= vq->ndescs)) {
+ vq->first_desc = 0;
+ vq->ndescs = 0;
+ }
+
+ if (vq->ndescs)
+ return 1;
+
+ for (ret = 1;
+ ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
+ ret = fetch_buf(vq))
+ ;
+
+ /* On success we expect some descs */
+ BUG_ON(ret > 0 && !vq->ndescs);
+ return ret;
+}
+
+/* Reverse the effects of fetch_descs */
+static void unfetch_descs(struct vhost_virtqueue *vq)
+{
+ int i;
+
+ for (i = vq->first_desc; i < vq->ndescs; ++i)
+ if (!(vq->descs[i].flags & VRING_DESC_F_NEXT))
+ vq->last_avail_idx -= 1;
+ vq->ndescs = 0;
+}
+
/* This looks in the virtqueue and for the first available buffer, and converts
* it to an iovec for convenient access. Since descriptors consist of some
* number of output then some number of input descriptors, it's actually two
@@ -2540,7 +2342,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
* This function returns the descriptor number found, or vq->num (which is
* never a valid descriptor number) if none was found. A negative code is
* returned on error. */
-int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num)
@@ -2549,7 +2351,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
int i;
if (ret <= 0)
- goto err_fetch;
+ goto err;
/* Now convert to IOV */
/* When we start there are none of either input nor output. */
@@ -2557,7 +2359,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
if (unlikely(log))
*log_num = 0;
- for (i = 0; i < vq->ndescs; ++i) {
+ for (i = vq->first_desc; i < vq->ndescs; ++i) {
unsigned iov_count = *in_num + *out_num;
struct vhost_desc *desc = &vq->descs[i];
int access;
@@ -2603,24 +2405,26 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
}
ret = desc->id;
+
+ if (!(desc->flags & VRING_DESC_F_NEXT))
+ break;
}
- vq->ndescs = 0;
+ vq->first_desc = i + 1;
return ret;
err:
- vhost_discard_vq_desc(vq, 1);
-err_fetch:
- vq->ndescs = 0;
+ unfetch_descs(vq);
return ret;
}
-EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
+EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
{
+ unfetch_descs(vq);
vq->last_avail_idx -= n;
}
EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 87089d51490d..fed36af5c444 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -81,6 +81,7 @@ struct vhost_virtqueue {
struct vhost_desc *descs;
int ndescs;
+ int first_desc;
int max_descs;
struct file *kick;
@@ -189,10 +190,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);
-int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
- struct iovec iov[], unsigned int iov_count,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num);
int vhost_get_vq_desc(struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
@@ -261,6 +258,8 @@ static inline void vhost_vq_set_backend(struct vhost_virtqueue *vq,
void *private_data)
{
vq->private_data = private_data;
+ vq->ndescs = 0;
+ vq->first_desc = 0;
}
/**
--
MST
In preparation for further cleanup, pass net specific pointer
to ubuf callbacks so we can move net specific fields
out to net structures.
Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/net.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index bf5e1d81ae25..ff594eec8ae3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -94,7 +94,7 @@ struct vhost_net_ubuf_ref {
*/
atomic_t refcount;
wait_queue_head_t wait;
- struct vhost_virtqueue *vq;
+ struct vhost_net_virtqueue *nvq;
};
#define VHOST_NET_BATCH 64
@@ -231,7 +231,7 @@ static void vhost_net_enable_zcopy(int vq)
}
static struct vhost_net_ubuf_ref *
-vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
+vhost_net_ubuf_alloc(struct vhost_net_virtqueue *nvq, bool zcopy)
{
struct vhost_net_ubuf_ref *ubufs;
/* No zero copy backend? Nothing to count. */
@@ -242,7 +242,7 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
return ERR_PTR(-ENOMEM);
atomic_set(&ubufs->refcount, 1);
init_waitqueue_head(&ubufs->wait);
- ubufs->vq = vq;
+ ubufs->nvq = nvq;
return ubufs;
}
@@ -384,13 +384,13 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
{
struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
- struct vhost_virtqueue *vq = ubufs->vq;
+ struct vhost_net_virtqueue *nvq = ubufs->nvq;
int cnt;
rcu_read_lock_bh();
/* set len to mark this desc buffers done DMA */
- vq->heads[ubuf->desc].len = success ?
+ nvq->vq.heads[ubuf->desc].in_len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
cnt = vhost_net_ubuf_put(ubufs);
@@ -402,7 +402,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
* less than 10% of times).
*/
if (cnt <= 1 || !(cnt % 16))
- vhost_poll_queue(&vq->poll);
+ vhost_poll_queue(&nvq->vq.poll);
rcu_read_unlock_bh();
}
@@ -1525,7 +1525,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
/* start polling new socket */
oldsock = vhost_vq_get_backend(vq);
if (sock != oldsock) {
- ubufs = vhost_net_ubuf_alloc(vq,
+ ubufs = vhost_net_ubuf_alloc(nvq,
sock && vhost_sock_zcopy(sock));
if (IS_ERR(ubufs)) {
r = PTR_ERR(ubufs);
--
MST
Hi Michael,
On Mon, Jun 08, 2020 at 08:52:51AM -0400, Michael S. Tsirkin wrote:
>
>
> This adds infrastructure required for supporting
> multiple ring formats.
>
> The idea is as follows: we first convert descriptors to an
> independent format, and only convert that to an iov later.
>
> The used ring is similar: we fetch into an independent struct
> first and convert that to an IOV later.
>
> The point is that we have a tight loop that fetches
> descriptors, which is good for cache utilization.
> This will also allow all kinds of batching tricks -
> e.g. it seems possible to keep SMAP disabled while
> we are fetching multiple descriptors.
>
> For used descriptors, this allows keeping track of the buffer length
> without the need to rescan the IOV.
>
> In a microbenchmark, this seems to perform exactly the same
> as the original code.
> Lightly tested.
> More testing would be very much appreciated.
While testing vhost-vsock, I found some issues in vhost-net (the VM
also had a virtio-net device).
This is the dmesg of the host (it is a QEMU VM):
[ 171.860074] CPU: 0 PID: 16613 Comm: vhost-16595 Not tainted 5.7.0-ste-12703-gaf7b4801030c-dirty #6
[ 171.862210] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[ 171.865998] Call Trace:
[ 171.866440] <IRQ>
[ 171.866817] dump_stack+0x57/0x7a
[ 171.867440] nmi_cpu_backtrace.cold+0x14/0x54
[ 171.868233] ? lapic_can_unplug_cpu.cold+0x3b/0x3b
[ 171.869153] nmi_trigger_cpumask_backtrace+0x85/0x92
[ 171.870143] arch_trigger_cpumask_backtrace+0x19/0x20
[ 171.871134] rcu_dump_cpu_stacks+0xa0/0xd2
[ 171.872203] rcu_sched_clock_irq.cold+0x23a/0x41c
[ 171.873098] update_process_times+0x2c/0x60
[ 171.874119] tick_sched_timer+0x59/0x160
[ 171.874777] ? tick_switch_to_oneshot.cold+0x79/0x79
[ 171.875602] __hrtimer_run_queues+0x10d/0x290
[ 171.876317] hrtimer_interrupt+0x109/0x220
[ 171.877025] smp_apic_timer_interrupt+0x76/0x150
[ 171.877875] apic_timer_interrupt+0xf/0x20
[ 171.878563] </IRQ>
[ 171.878897] RIP: 0010:vhost_get_avail_buf+0x5f8/0x860 [vhost]
[ 171.879951] Code: 48 8b bb 88 00 00 00 48 85 ff 0f 84 ad 00 00 00 be 01 00 00 00 44 89 45 80 e8 24 52 08 c1 8b 43 68 44 8b 45 80 e9 e9 fb ff ff <45> 85 c0 0f 85 48 fd ff ff 48 8b 43 38 48 83 bb 38 45 00 00 00 48
[ 171.889938] RSP: 0018:ffffc90000397c40 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
[ 171.896828] RAX: 0000000000000040 RBX: ffff88822c3f4688 RCX: ffff888231090000
[ 171.898903] RDX: 0000000000000440 RSI: ffff888231090000 RDI: ffffc90000397c80
[ 171.901025] RBP: ffffc90000397ce8 R08: 0000000000000001 R09: ffffc90000397dc4
[ 171.903136] R10: 000000231edc461f R11: 0000000000000003 R12: 0000000000000001
[ 171.905213] R13: 0000000000000001 R14: ffffc90000397dd4 R15: ffff88822c3f87a8
[ 171.907553] get_tx_bufs+0x49/0x180 [vhost_net]
[ 171.909142] handle_tx_copy+0xb4/0x5c0 [vhost_net]
[ 171.911495] ? update_curr+0x67/0x160
[ 171.913376] handle_tx+0xb0/0xe0 [vhost_net]
[ 171.916451] handle_tx_kick+0x15/0x20 [vhost_net]
[ 171.919912] vhost_worker+0xb3/0x110 [vhost]
[ 171.923379] kthread+0x106/0x140
[ 171.925314] ? __vhost_add_used_n+0x1c0/0x1c0 [vhost]
[ 171.933388] ? kthread_park+0x90/0x90
[ 171.936148] ret_from_fork+0x22/0x30
[ 234.859212] rcu: INFO: rcu_sched self-detected stall on CPU
[ 234.860036] rcu: 0-....: (20981 ticks this GP) idle=962/1/0x4000000000000002 softirq=15513/15513 fqs=10340
[ 234.861547] (t=21003 jiffies g=24773 q=2390)
[ 234.862158] NMI backtrace for cpu 0
[ 234.862638] CPU: 0 PID: 16613 Comm: vhost-16595 Not tainted 5.7.0-ste-12703-gaf7b4801030c-dirty #6
[ 234.864008] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[ 234.866084] Call Trace:
[ 234.866395] <IRQ>
[ 234.866648] dump_stack+0x57/0x7a
[ 234.867079] nmi_cpu_backtrace.cold+0x14/0x54
[ 234.867679] ? lapic_can_unplug_cpu.cold+0x3b/0x3b
[ 234.868322] nmi_trigger_cpumask_backtrace+0x85/0x92
[ 234.869013] arch_trigger_cpumask_backtrace+0x19/0x20
[ 234.869747] rcu_dump_cpu_stacks+0xa0/0xd2
[ 234.870267] rcu_sched_clock_irq.cold+0x23a/0x41c
[ 234.870960] update_process_times+0x2c/0x60
[ 234.871578] tick_sched_timer+0x59/0x160
[ 234.872148] ? tick_switch_to_oneshot.cold+0x79/0x79
[ 234.872949] __hrtimer_run_queues+0x10d/0x290
[ 234.873711] hrtimer_interrupt+0x109/0x220
[ 234.874271] smp_apic_timer_interrupt+0x76/0x150
[ 234.874913] apic_timer_interrupt+0xf/0x20
[ 234.876507] </IRQ>
[ 234.876799] RIP: 0010:vhost_get_avail_buf+0x8a/0x860 [vhost]
[ 234.877828] Code: 8d 72 06 00 00 85 c0 0f 85 fb 02 00 00 8b 57 70 89 d0 2d 00 04 00 00 0f 88 72 06 00 00 45 31 c0 4c 8d bb 20 41 00 00 4d 89 ee <44> 0f b7 a3 08 01 00 00 66 44 3b a3 0a 01 00 00 0f 84 58 05 00 00
[ 234.882059] RSP: 0018:ffffc90000397c40 EFLAGS: 00000283 ORIG_RAX: ffffffffffffff13
[ 234.883227] RAX: 0000000000000040 RBX: ffff88822c3f4688 RCX: ffff888231090000
[ 234.884317] RDX: 0000000000000440 RSI: ffff888231090000 RDI: ffffc90000397c80
[ 234.886531] RBP: ffffc90000397ce8 R08: 0000000000000001 R09: ffffc90000397dc4
[ 234.891840] R10: 000000231edc461f R11: 0000000000000003 R12: 0000000000000001
[ 234.896670] R13: 0000000000000001 R14: ffffc90000397dd4 R15: ffff88822c3f87a8
[ 234.900918] get_tx_bufs+0x49/0x180 [vhost_net]
[ 234.904280] handle_tx_copy+0xb4/0x5c0 [vhost_net]
[ 234.916402] ? update_curr+0x67/0x160
[ 234.917688] handle_tx+0xb0/0xe0 [vhost_net]
[ 234.918865] handle_tx_kick+0x15/0x20 [vhost_net]
[ 234.920366] vhost_worker+0xb3/0x110 [vhost]
[ 234.921500] kthread+0x106/0x140
[ 234.922219] ? __vhost_add_used_n+0x1c0/0x1c0 [vhost]
[ 234.923595] ? kthread_park+0x90/0x90
[ 234.924442] ret_from_fork+0x22/0x30
[ 297.870095] rcu: INFO: rcu_sched self-detected stall on CPU
[ 297.871352] rcu: 0-....: (36719 ticks this GP) idle=962/1/0x4000000000000002 softirq=15513/15513 fqs=18087
[ 297.873585] (t=36756 jiffies g=24773 q=2853)
[ 297.874478] NMI backtrace for cpu 0
[ 297.875229] CPU: 0 PID: 16613 Comm: vhost-16595 Not tainted 5.7.0-ste-12703-gaf7b4801030c-dirty #6
[ 297.877204] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
[ 297.881644] Call Trace:
[ 297.882185] <IRQ>
[ 297.882621] dump_stack+0x57/0x7a
[ 297.883387] nmi_cpu_backtrace.cold+0x14/0x54
[ 297.884390] ? lapic_can_unplug_cpu.cold+0x3b/0x3b
[ 297.885568] nmi_trigger_cpumask_backtrace+0x85/0x92
[ 297.886746] arch_trigger_cpumask_backtrace+0x19/0x20
[ 297.888260] rcu_dump_cpu_stacks+0xa0/0xd2
[ 297.889508] rcu_sched_clock_irq.cold+0x23a/0x41c
[ 297.890803] update_process_times+0x2c/0x60
[ 297.893357] tick_sched_timer+0x59/0x160
[ 297.895143] ? tick_switch_to_oneshot.cold+0x79/0x79
[ 297.897832] __hrtimer_run_queues+0x10d/0x290
[ 297.899841] hrtimer_interrupt+0x109/0x220
[ 297.900909] smp_apic_timer_interrupt+0x76/0x150
[ 297.903543] apic_timer_interrupt+0xf/0x20
[ 297.906509] </IRQ>
[ 297.908004] RIP: 0010:vhost_get_avail_buf+0x92/0x860 [vhost]
[ 297.911536] Code: 85 fb 02 00 00 8b 57 70 89 d0 2d 00 04 00 00 0f 88 72 06 00 00 45 31 c0 4c 8d bb 20 41 00 00 4d 89 ee 44 0f b7 a3 08 01 00 00 <66> 44 3b a3 0a 01 00 00 0f 84 58 05 00 00 8b 43 28 83 e8 01 41 21
[ 297.930274] RSP: 0018:ffffc90000397c40 EFLAGS: 00000283 ORIG_RAX: ffffffffffffff13
[ 297.934056] RAX: 0000000000000040 RBX: ffff88822c3f4688 RCX: ffff888231090000
[ 297.938371] RDX: 0000000000000440 RSI: ffff888231090000 RDI: ffffc90000397c80
[ 297.944222] RBP: ffffc90000397ce8 R08: 0000000000000001 R09: ffffc90000397dc4
[ 297.953817] R10: 000000231edc461f R11: 0000000000000003 R12: 0000000000000001
[ 297.956453] R13: 0000000000000001 R14: ffffc90000397dd4 R15: ffff88822c3f87a8
[ 297.960873] get_tx_bufs+0x49/0x180 [vhost_net]
[ 297.964163] handle_tx_copy+0xb4/0x5c0 [vhost_net]
[ 297.965871] ? update_curr+0x67/0x160
[ 297.966893] handle_tx+0xb0/0xe0 [vhost_net]
[ 297.968442] handle_tx_kick+0x15/0x20 [vhost_net]
[ 297.971327] vhost_worker+0xb3/0x110 [vhost]
[ 297.974275] kthread+0x106/0x140
[ 297.976141] ? __vhost_add_used_n+0x1c0/0x1c0 [vhost]
[ 297.979518] ? kthread_park+0x90/0x90
[ 297.981665] ret_from_fork+0x22/0x30
On 2020/6/8 8:52 PM, Michael S. Tsirkin wrote:
> As testing shows no performance change, switch to the batched version now.
>
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> Signed-off-by: Eugenio Pérez <[email protected]>
> Link: https://lore.kernel.org/r/[email protected]
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> ---
> drivers/vhost/test.c | 2 +-
> drivers/vhost/vhost.c | 318 ++++++++----------------------------------
> drivers/vhost/vhost.h | 7 +-
> 3 files changed, 65 insertions(+), 262 deletions(-)
>
> diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
> index 0466921f4772..7d69778aaa26 100644
> --- a/drivers/vhost/test.c
> +++ b/drivers/vhost/test.c
> @@ -119,7 +119,7 @@ static int vhost_test_open(struct inode *inode, struct file *f)
> dev = &n->dev;
> vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
> n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
> - vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
> + vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV + 64,
> VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);
>
> f->private_data = n;
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 180b7b58c76b..41d6b132c234 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -304,6 +304,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> {
> vq->num = 1;
> vq->ndescs = 0;
> + vq->first_desc = 0;
> vq->desc = NULL;
> vq->avail = NULL;
> vq->used = NULL;
> @@ -372,6 +373,11 @@ static int vhost_worker(void *data)
> return 0;
> }
>
> +static int vhost_vq_num_batch_descs(struct vhost_virtqueue *vq)
> +{
> + return vq->max_descs - UIO_MAXIOV;
> +}
> +
> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> {
> kfree(vq->descs);
> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> for (i = 0; i < dev->nvqs; ++i) {
> vq = dev->vqs[i];
> vq->max_descs = dev->iov_limit;
> + if (vhost_vq_num_batch_descs(vq) < 0) {
> + return -EINVAL;
> + }
> vq->descs = kmalloc_array(vq->max_descs,
> sizeof(*vq->descs),
> GFP_KERNEL);
> @@ -1610,6 +1619,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> vq->last_avail_idx = s.num;
> /* Forget the cached index value. */
> vq->avail_idx = vq->last_avail_idx;
> + vq->ndescs = vq->first_desc = 0;
> break;
> case VHOST_GET_VRING_BASE:
> s.index = idx;
> @@ -2078,253 +2088,6 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
> return next;
> }
>
> -static int get_indirect(struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num,
> - struct vring_desc *indirect)
> -{
> - struct vring_desc desc;
> - unsigned int i = 0, count, found = 0;
> - u32 len = vhost32_to_cpu(vq, indirect->len);
> - struct iov_iter from;
> - int ret, access;
> -
> - /* Sanity check */
> - if (unlikely(len % sizeof desc)) {
> - vq_err(vq, "Invalid length in indirect descriptor: "
> - "len 0x%llx not multiple of 0x%zx\n",
> - (unsigned long long)len,
> - sizeof desc);
> - return -EINVAL;
> - }
> -
> - ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
> - UIO_MAXIOV, VHOST_ACCESS_RO);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d in indirect.\n", ret);
> - return ret;
> - }
> - iov_iter_init(&from, READ, vq->indirect, ret, len);
> -
> - /* We will use the result as an address to read from, so most
> - * architectures only need a compiler barrier here. */
> - read_barrier_depends();
> -
> - count = len / sizeof desc;
> - /* Buffers are chained via a 16 bit next field, so
> - * we can have at most 2^16 of these. */
> - if (unlikely(count > USHRT_MAX + 1)) {
> - vq_err(vq, "Indirect buffer length too big: %d\n",
> - indirect->len);
> - return -E2BIG;
> - }
> -
> - do {
> - unsigned iov_count = *in_num + *out_num;
> - if (unlikely(++found > count)) {
> - vq_err(vq, "Loop detected: last one at %u "
> - "indirect size %u\n",
> - i, count);
> - return -EINVAL;
> - }
> - if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
> - vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
> - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> - return -EINVAL;
> - }
> - if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
> - vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
> - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> - return -EINVAL;
> - }
> -
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> - access = VHOST_ACCESS_WO;
> - else
> - access = VHOST_ACCESS_RO;
> -
> - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> - iov_size - iov_count, access);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d indirect idx %d\n",
> - ret, i);
> - return ret;
> - }
> - /* If this is an input descriptor, increment that count. */
> - if (access == VHOST_ACCESS_WO) {
> - *in_num += ret;
> - if (unlikely(log && ret)) {
> - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> - ++*log_num;
> - }
> - } else {
> - /* If it's an output descriptor, they're all supposed
> - * to come before any input descriptors. */
> - if (unlikely(*in_num)) {
> - vq_err(vq, "Indirect descriptor "
> - "has out after in: idx %d\n", i);
> - return -EINVAL;
> - }
> - *out_num += ret;
> - }
> - } while ((i = next_desc(vq, &desc)) != -1);
> - return 0;
> -}
> -
> -/* This looks in the virtqueue and for the first available buffer, and converts
> - * it to an iovec for convenient access. Since descriptors consist of some
> - * number of output then some number of input descriptors, it's actually two
> - * iovecs, but we pack them into one and note how many of each there were.
> - *
> - * This function returns the descriptor number found, or vq->num (which is
> - * never a valid descriptor number) if none was found. A negative code is
> - * returned on error. */
> -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num)
> -{
> - struct vring_desc desc;
> - unsigned int i, head, found = 0;
> - u16 last_avail_idx;
> - __virtio16 avail_idx;
> - __virtio16 ring_head;
> - int ret, access;
> -
> - /* Check it isn't doing very strange things with descriptor numbers. */
> - last_avail_idx = vq->last_avail_idx;
> -
> - if (vq->avail_idx == vq->last_avail_idx) {
> - if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> - vq_err(vq, "Failed to access avail idx at %p\n",
> - &vq->avail->idx);
> - return -EFAULT;
> - }
> - vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> -
> - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
> - vq_err(vq, "Guest moved used index from %u to %u",
> - last_avail_idx, vq->avail_idx);
> - return -EFAULT;
> - }
> -
> - /* If there's nothing new since last we looked, return
> - * invalid.
> - */
> - if (vq->avail_idx == last_avail_idx)
> - return vq->num;
> -
> - /* Only get avail ring entries after they have been
> - * exposed by guest.
> - */
> - smp_rmb();
> - }
> -
> - /* Grab the next descriptor number they're advertising, and increment
> - * the index we've seen. */
> - if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
> - vq_err(vq, "Failed to read head: idx %d address %p\n",
> - last_avail_idx,
> - &vq->avail->ring[last_avail_idx % vq->num]);
> - return -EFAULT;
> - }
> -
> - head = vhost16_to_cpu(vq, ring_head);
> -
> - /* If their number is silly, that's an error. */
> - if (unlikely(head >= vq->num)) {
> - vq_err(vq, "Guest says index %u > %u is available",
> - head, vq->num);
> - return -EINVAL;
> - }
> -
> - /* When we start there are none of either input nor output. */
> - *out_num = *in_num = 0;
> - if (unlikely(log))
> - *log_num = 0;
> -
> - i = head;
> - do {
> - unsigned iov_count = *in_num + *out_num;
> - if (unlikely(i >= vq->num)) {
> - vq_err(vq, "Desc index is %u > %u, head = %u",
> - i, vq->num, head);
> - return -EINVAL;
> - }
> - if (unlikely(++found > vq->num)) {
> - vq_err(vq, "Loop detected: last one at %u "
> - "vq size %u head %u\n",
> - i, vq->num, head);
> - return -EINVAL;
> - }
> - ret = vhost_get_desc(vq, &desc, i);
> - if (unlikely(ret)) {
> - vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
> - i, vq->desc + i);
> - return -EFAULT;
> - }
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
> - ret = get_indirect(vq, iov, iov_size,
> - out_num, in_num,
> - log, log_num, &desc);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Failure detected "
> - "in indirect descriptor at idx %d\n", i);
> - return ret;
> - }
> - continue;
> - }
> -
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> - access = VHOST_ACCESS_WO;
> - else
> - access = VHOST_ACCESS_RO;
> - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> - iov_size - iov_count, access);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d descriptor idx %d\n",
> - ret, i);
> - return ret;
> - }
> - if (access == VHOST_ACCESS_WO) {
> - /* If this is an input descriptor,
> - * increment that count. */
> - *in_num += ret;
> - if (unlikely(log && ret)) {
> - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> - ++*log_num;
> - }
> - } else {
> - /* If it's an output descriptor, they're all supposed
> - * to come before any input descriptors. */
> - if (unlikely(*in_num)) {
> - vq_err(vq, "Descriptor has out after in: "
> - "idx %d\n", i);
> - return -EINVAL;
> - }
> - *out_num += ret;
> - }
> - } while ((i = next_desc(vq, &desc)) != -1);
> -
> - /* On success, increment avail index. */
> - vq->last_avail_idx++;
> -
> - /* Assume notifications from guest are disabled at this point,
> - * if they aren't we would need to update avail_event index. */
> - BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> - return head;
> -}
> -EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> -
> static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
> {
> BUG_ON(!vq->ndescs);
> @@ -2428,7 +2191,7 @@ static int fetch_indirect_descs(struct vhost_virtqueue *vq,
>
> /* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> * A negative code is returned on error. */
> -static int fetch_descs(struct vhost_virtqueue *vq)
> +static int fetch_buf(struct vhost_virtqueue *vq)
> {
> unsigned int i, head, found = 0;
> struct vhost_desc *last;
> @@ -2441,7 +2204,11 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> /* Check it isn't doing very strange things with descriptor numbers. */
> last_avail_idx = vq->last_avail_idx;
>
> - if (vq->avail_idx == vq->last_avail_idx) {
> + if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
> + /* If we already have work to do, don't bother re-checking. */
> + if (likely(vq->ndescs))
> + return 1;
> +
> if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> vq_err(vq, "Failed to access avail idx at %p\n",
> &vq->avail->idx);
> @@ -2532,6 +2299,41 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> return 1;
> }
>
> +/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> + * A negative code is returned on error. */
> +static int fetch_descs(struct vhost_virtqueue *vq)
> +{
> + int ret;
> +
> + if (unlikely(vq->first_desc >= vq->ndescs)) {
> + vq->first_desc = 0;
> + vq->ndescs = 0;
> + }
> +
> + if (vq->ndescs)
> + return 1;
> +
> + for (ret = 1;
> + ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
> + ret = fetch_buf(vq))
> + ;
> +
> + /* On success we expect some descs */
> + BUG_ON(ret > 0 && !vq->ndescs);
> + return ret;
> +}
> +
> +/* Reverse the effects of fetch_descs */
> +static void unfetch_descs(struct vhost_virtqueue *vq)
> +{
> + int i;
> +
> + for (i = vq->first_desc; i < vq->ndescs; ++i)
> + if (!(vq->descs[i].flags & VRING_DESC_F_NEXT))
> + vq->last_avail_idx -= 1;
> + vq->ndescs = 0;
> +}
Is it better to set first_desc to zero here?
> +
> /* This looks in the virtqueue and for the first available buffer, and converts
> * it to an iovec for convenient access. Since descriptors consist of some
> * number of output then some number of input descriptors, it's actually two
> @@ -2540,7 +2342,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> * This function returns the descriptor number found, or vq->num (which is
> * never a valid descriptor number) if none was found. A negative code is
> * returned on error. */
> -int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> struct iovec iov[], unsigned int iov_size,
> unsigned int *out_num, unsigned int *in_num,
> struct vhost_log *log, unsigned int *log_num)
> @@ -2549,7 +2351,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> int i;
>
> if (ret <= 0)
> - goto err_fetch;
> + goto err;
>
> /* Now convert to IOV */
> /* When we start there are none of either input nor output. */
> @@ -2557,7 +2359,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> if (unlikely(log))
> *log_num = 0;
>
> - for (i = 0; i < vq->ndescs; ++i) {
> + for (i = vq->first_desc; i < vq->ndescs; ++i) {
> unsigned iov_count = *in_num + *out_num;
> struct vhost_desc *desc = &vq->descs[i];
> int access;
> @@ -2603,24 +2405,26 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> }
>
> ret = desc->id;
> +
> + if (!(desc->flags & VRING_DESC_F_NEXT))
> + break;
> }
>
> - vq->ndescs = 0;
> + vq->first_desc = i + 1;
>
> return ret;
>
> err:
> - vhost_discard_vq_desc(vq, 1);
> -err_fetch:
> - vq->ndescs = 0;
> + unfetch_descs(vq);
>
> return ret;
> }
> -EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
> +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>
> /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> {
> + unfetch_descs(vq);
> vq->last_avail_idx -= n;
So unfetch_descs() has decreased last_avail_idx.
Can we fix this by letting unfetch_descs() return the number of
buffers it rewound, and then do:
int d = unfetch_descs(vq);
vq->last_avail_idx -= (n > d) ? n - d : 0;
Thanks
> }
> EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 87089d51490d..fed36af5c444 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -81,6 +81,7 @@ struct vhost_virtqueue {
>
> struct vhost_desc *descs;
> int ndescs;
> + int first_desc;
> int max_descs;
>
> struct file *kick;
> @@ -189,10 +190,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
> bool vhost_log_access_ok(struct vhost_dev *);
>
> -int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
> - struct iovec iov[], unsigned int iov_count,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num);
> int vhost_get_vq_desc(struct vhost_virtqueue *,
> struct iovec iov[], unsigned int iov_count,
> unsigned int *out_num, unsigned int *in_num,
> @@ -261,6 +258,8 @@ static inline void vhost_vq_set_backend(struct vhost_virtqueue *vq,
> void *private_data)
> {
> vq->private_data = private_data;
> + vq->ndescs = 0;
> + vq->first_desc = 0;
> }
>
> /**
On Wed, Jun 10, 2020 at 11:14:49AM +0800, Jason Wang wrote:
>
On 2020/6/8 8:52 PM, Michael S. Tsirkin wrote:
> > As testing shows no performance change, switch to the batched version now.
> >
> > Signed-off-by: Michael S. Tsirkin <[email protected]>
> > Signed-off-by: Eugenio Pérez <[email protected]>
> > Link: https://lore.kernel.org/r/[email protected]
> > Signed-off-by: Michael S. Tsirkin <[email protected]>
> > ---
> > drivers/vhost/test.c | 2 +-
> > drivers/vhost/vhost.c | 318 ++++++++----------------------------------
> > drivers/vhost/vhost.h | 7 +-
> > 3 files changed, 65 insertions(+), 262 deletions(-)
> >
> > diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
> > index 0466921f4772..7d69778aaa26 100644
> > --- a/drivers/vhost/test.c
> > +++ b/drivers/vhost/test.c
> > @@ -119,7 +119,7 @@ static int vhost_test_open(struct inode *inode, struct file *f)
> > dev = &n->dev;
> > vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
> > n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
> > - vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
> > + vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV + 64,
> > VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);
> > f->private_data = n;
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index 180b7b58c76b..41d6b132c234 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -304,6 +304,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> > {
> > vq->num = 1;
> > vq->ndescs = 0;
> > + vq->first_desc = 0;
> > vq->desc = NULL;
> > vq->avail = NULL;
> > vq->used = NULL;
> > @@ -372,6 +373,11 @@ static int vhost_worker(void *data)
> > return 0;
> > }
> > +static int vhost_vq_num_batch_descs(struct vhost_virtqueue *vq)
> > +{
> > + return vq->max_descs - UIO_MAXIOV;
> > +}
> > +
> > static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> > {
> > kfree(vq->descs);
> > @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> > for (i = 0; i < dev->nvqs; ++i) {
> > vq = dev->vqs[i];
> > vq->max_descs = dev->iov_limit;
> > + if (vhost_vq_num_batch_descs(vq) < 0) {
> > + return -EINVAL;
> > + }
> > vq->descs = kmalloc_array(vq->max_descs,
> > sizeof(*vq->descs),
> > GFP_KERNEL);
> > @@ -1610,6 +1619,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> > vq->last_avail_idx = s.num;
> > /* Forget the cached index value. */
> > vq->avail_idx = vq->last_avail_idx;
> > + vq->ndescs = vq->first_desc = 0;
> > break;
> > case VHOST_GET_VRING_BASE:
> > s.index = idx;
> > @@ -2078,253 +2088,6 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
> > return next;
> > }
> > -static int get_indirect(struct vhost_virtqueue *vq,
> > - struct iovec iov[], unsigned int iov_size,
> > - unsigned int *out_num, unsigned int *in_num,
> > - struct vhost_log *log, unsigned int *log_num,
> > - struct vring_desc *indirect)
> > -{
> > - struct vring_desc desc;
> > - unsigned int i = 0, count, found = 0;
> > - u32 len = vhost32_to_cpu(vq, indirect->len);
> > - struct iov_iter from;
> > - int ret, access;
> > -
> > - /* Sanity check */
> > - if (unlikely(len % sizeof desc)) {
> > - vq_err(vq, "Invalid length in indirect descriptor: "
> > - "len 0x%llx not multiple of 0x%zx\n",
> > - (unsigned long long)len,
> > - sizeof desc);
> > - return -EINVAL;
> > - }
> > -
> > - ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
> > - UIO_MAXIOV, VHOST_ACCESS_RO);
> > - if (unlikely(ret < 0)) {
> > - if (ret != -EAGAIN)
> > - vq_err(vq, "Translation failure %d in indirect.\n", ret);
> > - return ret;
> > - }
> > - iov_iter_init(&from, READ, vq->indirect, ret, len);
> > -
> > - /* We will use the result as an address to read from, so most
> > - * architectures only need a compiler barrier here. */
> > - read_barrier_depends();
> > -
> > - count = len / sizeof desc;
> > - /* Buffers are chained via a 16 bit next field, so
> > - * we can have at most 2^16 of these. */
> > - if (unlikely(count > USHRT_MAX + 1)) {
> > - vq_err(vq, "Indirect buffer length too big: %d\n",
> > - indirect->len);
> > - return -E2BIG;
> > - }
> > -
> > - do {
> > - unsigned iov_count = *in_num + *out_num;
> > - if (unlikely(++found > count)) {
> > - vq_err(vq, "Loop detected: last one at %u "
> > - "indirect size %u\n",
> > - i, count);
> > - return -EINVAL;
> > - }
> > - if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
> > - vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
> > - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> > - return -EINVAL;
> > - }
> > - if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
> > - vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
> > - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> > - return -EINVAL;
> > - }
> > -
> > - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> > - access = VHOST_ACCESS_WO;
> > - else
> > - access = VHOST_ACCESS_RO;
> > -
> > - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> > - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> > - iov_size - iov_count, access);
> > - if (unlikely(ret < 0)) {
> > - if (ret != -EAGAIN)
> > - vq_err(vq, "Translation failure %d indirect idx %d\n",
> > - ret, i);
> > - return ret;
> > - }
> > - /* If this is an input descriptor, increment that count. */
> > - if (access == VHOST_ACCESS_WO) {
> > - *in_num += ret;
> > - if (unlikely(log && ret)) {
> > - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> > - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> > - ++*log_num;
> > - }
> > - } else {
> > - /* If it's an output descriptor, they're all supposed
> > - * to come before any input descriptors. */
> > - if (unlikely(*in_num)) {
> > - vq_err(vq, "Indirect descriptor "
> > - "has out after in: idx %d\n", i);
> > - return -EINVAL;
> > - }
> > - *out_num += ret;
> > - }
> > - } while ((i = next_desc(vq, &desc)) != -1);
> > - return 0;
> > -}
> > -
> > -/* This looks in the virtqueue and for the first available buffer, and converts
> > - * it to an iovec for convenient access. Since descriptors consist of some
> > - * number of output then some number of input descriptors, it's actually two
> > - * iovecs, but we pack them into one and note how many of each there were.
> > - *
> > - * This function returns the descriptor number found, or vq->num (which is
> > - * never a valid descriptor number) if none was found. A negative code is
> > - * returned on error. */
> > -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > - struct iovec iov[], unsigned int iov_size,
> > - unsigned int *out_num, unsigned int *in_num,
> > - struct vhost_log *log, unsigned int *log_num)
> > -{
> > - struct vring_desc desc;
> > - unsigned int i, head, found = 0;
> > - u16 last_avail_idx;
> > - __virtio16 avail_idx;
> > - __virtio16 ring_head;
> > - int ret, access;
> > -
> > - /* Check it isn't doing very strange things with descriptor numbers. */
> > - last_avail_idx = vq->last_avail_idx;
> > -
> > - if (vq->avail_idx == vq->last_avail_idx) {
> > - if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> > - vq_err(vq, "Failed to access avail idx at %p\n",
> > - &vq->avail->idx);
> > - return -EFAULT;
> > - }
> > - vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> > -
> > - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
> > - vq_err(vq, "Guest moved used index from %u to %u",
> > - last_avail_idx, vq->avail_idx);
> > - return -EFAULT;
> > - }
> > -
> > - /* If there's nothing new since last we looked, return
> > - * invalid.
> > - */
> > - if (vq->avail_idx == last_avail_idx)
> > - return vq->num;
> > -
> > - /* Only get avail ring entries after they have been
> > - * exposed by guest.
> > - */
> > - smp_rmb();
> > - }
> > -
> > - /* Grab the next descriptor number they're advertising, and increment
> > - * the index we've seen. */
> > - if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
> > - vq_err(vq, "Failed to read head: idx %d address %p\n",
> > - last_avail_idx,
> > - &vq->avail->ring[last_avail_idx % vq->num]);
> > - return -EFAULT;
> > - }
> > -
> > - head = vhost16_to_cpu(vq, ring_head);
> > -
> > - /* If their number is silly, that's an error. */
> > - if (unlikely(head >= vq->num)) {
> > - vq_err(vq, "Guest says index %u > %u is available",
> > - head, vq->num);
> > - return -EINVAL;
> > - }
> > -
> > - /* When we start there are none of either input nor output. */
> > - *out_num = *in_num = 0;
> > - if (unlikely(log))
> > - *log_num = 0;
> > -
> > - i = head;
> > - do {
> > - unsigned iov_count = *in_num + *out_num;
> > - if (unlikely(i >= vq->num)) {
> > - vq_err(vq, "Desc index is %u > %u, head = %u",
> > - i, vq->num, head);
> > - return -EINVAL;
> > - }
> > - if (unlikely(++found > vq->num)) {
> > - vq_err(vq, "Loop detected: last one at %u "
> > - "vq size %u head %u\n",
> > - i, vq->num, head);
> > - return -EINVAL;
> > - }
> > - ret = vhost_get_desc(vq, &desc, i);
> > - if (unlikely(ret)) {
> > - vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
> > - i, vq->desc + i);
> > - return -EFAULT;
> > - }
> > - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
> > - ret = get_indirect(vq, iov, iov_size,
> > - out_num, in_num,
> > - log, log_num, &desc);
> > - if (unlikely(ret < 0)) {
> > - if (ret != -EAGAIN)
> > - vq_err(vq, "Failure detected "
> > - "in indirect descriptor at idx %d\n", i);
> > - return ret;
> > - }
> > - continue;
> > - }
> > -
> > - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> > - access = VHOST_ACCESS_WO;
> > - else
> > - access = VHOST_ACCESS_RO;
> > - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> > - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> > - iov_size - iov_count, access);
> > - if (unlikely(ret < 0)) {
> > - if (ret != -EAGAIN)
> > - vq_err(vq, "Translation failure %d descriptor idx %d\n",
> > - ret, i);
> > - return ret;
> > - }
> > - if (access == VHOST_ACCESS_WO) {
> > - /* If this is an input descriptor,
> > - * increment that count. */
> > - *in_num += ret;
> > - if (unlikely(log && ret)) {
> > - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> > - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> > - ++*log_num;
> > - }
> > - } else {
> > - /* If it's an output descriptor, they're all supposed
> > - * to come before any input descriptors. */
> > - if (unlikely(*in_num)) {
> > - vq_err(vq, "Descriptor has out after in: "
> > - "idx %d\n", i);
> > - return -EINVAL;
> > - }
> > - *out_num += ret;
> > - }
> > - } while ((i = next_desc(vq, &desc)) != -1);
> > -
> > - /* On success, increment avail index. */
> > - vq->last_avail_idx++;
> > -
> > - /* Assume notifications from guest are disabled at this point,
> > - * if they aren't we would need to update avail_event index. */
> > - BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> > - return head;
> > -}
> > -EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > -
> > static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
> > {
> > BUG_ON(!vq->ndescs);
> > @@ -2428,7 +2191,7 @@ static int fetch_indirect_descs(struct vhost_virtqueue *vq,
> > /* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> > * A negative code is returned on error. */
> > -static int fetch_descs(struct vhost_virtqueue *vq)
> > +static int fetch_buf(struct vhost_virtqueue *vq)
> > {
> > unsigned int i, head, found = 0;
> > struct vhost_desc *last;
> > @@ -2441,7 +2204,11 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> > /* Check it isn't doing very strange things with descriptor numbers. */
> > last_avail_idx = vq->last_avail_idx;
> > - if (vq->avail_idx == vq->last_avail_idx) {
> > + if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
> > + /* If we already have work to do, don't bother re-checking. */
> > + if (likely(vq->ndescs))
> > + return 1;
> > +
> > if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> > vq_err(vq, "Failed to access avail idx at %p\n",
> > &vq->avail->idx);
> > @@ -2532,6 +2299,41 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> > return 1;
> > }
> > +/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> > + * A negative code is returned on error. */
> > +static int fetch_descs(struct vhost_virtqueue *vq)
> > +{
> > + int ret;
> > +
> > + if (unlikely(vq->first_desc >= vq->ndescs)) {
> > + vq->first_desc = 0;
> > + vq->ndescs = 0;
> > + }
> > +
> > + if (vq->ndescs)
> > + return 1;
> > +
> > + for (ret = 1;
> > + ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
> > + ret = fetch_buf(vq))
> > + ;
> > +
> > + /* On success we expect some descs */
> > + BUG_ON(ret > 0 && !vq->ndescs);
> > + return ret;
> > +}
> > +
> > +/* Reverse the effects of fetch_descs */
> > +static void unfetch_descs(struct vhost_virtqueue *vq)
> > +{
> > + int i;
> > +
> > + for (i = vq->first_desc; i < vq->ndescs; ++i)
> > + if (!(vq->descs[i].flags & VRING_DESC_F_NEXT))
> > + vq->last_avail_idx -= 1;
> > + vq->ndescs = 0;
> > +}
>
>
> Is it better to set first_desc to zero here?
>
>
> > +
> > /* This looks in the virtqueue and for the first available buffer, and converts
> > * it to an iovec for convenient access. Since descriptors consist of some
> > * number of output then some number of input descriptors, it's actually two
> > @@ -2540,7 +2342,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> > * This function returns the descriptor number found, or vq->num (which is
> > * never a valid descriptor number) if none was found. A negative code is
> > * returned on error. */
> > -int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> > +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> > struct iovec iov[], unsigned int iov_size,
> > unsigned int *out_num, unsigned int *in_num,
> > struct vhost_log *log, unsigned int *log_num)
> > @@ -2549,7 +2351,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> > int i;
> > if (ret <= 0)
> > - goto err_fetch;
> > + goto err;
> > /* Now convert to IOV */
> > /* When we start there are none of either input nor output. */
> > @@ -2557,7 +2359,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> > if (unlikely(log))
> > *log_num = 0;
> > - for (i = 0; i < vq->ndescs; ++i) {
> > + for (i = vq->first_desc; i < vq->ndescs; ++i) {
> > unsigned iov_count = *in_num + *out_num;
> > struct vhost_desc *desc = &vq->descs[i];
> > int access;
> > @@ -2603,24 +2405,26 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> > }
> > ret = desc->id;
> > +
> > + if (!(desc->flags & VRING_DESC_F_NEXT))
> > + break;
> > }
> > - vq->ndescs = 0;
> > + vq->first_desc = i + 1;
> > return ret;
> > err:
> > - vhost_discard_vq_desc(vq, 1);
> > -err_fetch:
> > - vq->ndescs = 0;
> > + unfetch_descs(vq);
> > return ret;
> > }
> > -EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
> > +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> > {
> > + unfetch_descs(vq);
> > vq->last_avail_idx -= n;
>
>
> So unfetch_descs() has decreased last_avail_idx.
> Can we fix this by letting unfetch_descs() return the number and then we can
> do:
>
> int d = unfetch_descs(vq);
> vq->last_avail_idx -= (n > d) ? n - d: 0;
>
> Thanks
That's intentional I think - we need both.
Unfetch_descs drops the descriptors in the cache that were
*not returned to caller* through get_vq_desc.
vhost_discard_vq_desc drops the ones that were returned through get_vq_desc.
Did I miss anything?
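As a concrete example: if fetch_descs() cached 3 buffers and get_vq_desc()
handed 1 of them to the caller, unfetch_descs() rewinds last_avail_idx for
the 2 cached-but-unreturned buffers (first_desc..ndescs), while the
explicit last_avail_idx -= n in vhost_discard_vq_desc() rewinds the 1
buffer the caller is discarding. Each buffer is counted exactly once.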
>
> > }
> > EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
> > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> > index 87089d51490d..fed36af5c444 100644
> > --- a/drivers/vhost/vhost.h
> > +++ b/drivers/vhost/vhost.h
> > @@ -81,6 +81,7 @@ struct vhost_virtqueue {
> > struct vhost_desc *descs;
> > int ndescs;
> > + int first_desc;
> > int max_descs;
> > struct file *kick;
> > @@ -189,10 +190,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> > bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
> > bool vhost_log_access_ok(struct vhost_dev *);
> > -int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
> > - struct iovec iov[], unsigned int iov_count,
> > - unsigned int *out_num, unsigned int *in_num,
> > - struct vhost_log *log, unsigned int *log_num);
> > int vhost_get_vq_desc(struct vhost_virtqueue *,
> > struct iovec iov[], unsigned int iov_count,
> > unsigned int *out_num, unsigned int *in_num,
> > @@ -261,6 +258,8 @@ static inline void vhost_vq_set_backend(struct vhost_virtqueue *vq,
> > void *private_data)
> > {
> > vq->private_data = private_data;
> > + vq->ndescs = 0;
> > + vq->first_desc = 0;
> > }
> > /**
On Mon, Jun 8, 2020 at 2:53 PM Michael S. Tsirkin <[email protected]> wrote:
>
> As testing shows no performance change, switch to that now.
>
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> Signed-off-by: Eugenio Pérez <[email protected]>
> Link: https://lore.kernel.org/r/[email protected]
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> ---
> drivers/vhost/test.c | 2 +-
> drivers/vhost/vhost.c | 318 ++++++++----------------------------------
> drivers/vhost/vhost.h | 7 +-
> 3 files changed, 65 insertions(+), 262 deletions(-)
>
> diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
> index 0466921f4772..7d69778aaa26 100644
> --- a/drivers/vhost/test.c
> +++ b/drivers/vhost/test.c
> @@ -119,7 +119,7 @@ static int vhost_test_open(struct inode *inode, struct file *f)
> dev = &n->dev;
> vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
> n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
> - vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
> + vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV + 64,
> VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);
>
> f->private_data = n;
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 180b7b58c76b..41d6b132c234 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -304,6 +304,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
> {
> vq->num = 1;
> vq->ndescs = 0;
> + vq->first_desc = 0;
> vq->desc = NULL;
> vq->avail = NULL;
> vq->used = NULL;
> @@ -372,6 +373,11 @@ static int vhost_worker(void *data)
> return 0;
> }
>
> +static int vhost_vq_num_batch_descs(struct vhost_virtqueue *vq)
> +{
> + return vq->max_descs - UIO_MAXIOV;
> +}
> +
> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> {
> kfree(vq->descs);
> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> for (i = 0; i < dev->nvqs; ++i) {
> vq = dev->vqs[i];
> vq->max_descs = dev->iov_limit;
> + if (vhost_vq_num_batch_descs(vq) < 0) {
> + return -EINVAL;
> + }
> vq->descs = kmalloc_array(vq->max_descs,
> sizeof(*vq->descs),
> GFP_KERNEL);
> @@ -1610,6 +1619,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> vq->last_avail_idx = s.num;
> /* Forget the cached index value. */
> vq->avail_idx = vq->last_avail_idx;
> + vq->ndescs = vq->first_desc = 0;
> break;
> case VHOST_GET_VRING_BASE:
> s.index = idx;
> @@ -2078,253 +2088,6 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
> return next;
> }
>
> -static int get_indirect(struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num,
> - struct vring_desc *indirect)
> -{
> - struct vring_desc desc;
> - unsigned int i = 0, count, found = 0;
> - u32 len = vhost32_to_cpu(vq, indirect->len);
> - struct iov_iter from;
> - int ret, access;
> -
> - /* Sanity check */
> - if (unlikely(len % sizeof desc)) {
> - vq_err(vq, "Invalid length in indirect descriptor: "
> - "len 0x%llx not multiple of 0x%zx\n",
> - (unsigned long long)len,
> - sizeof desc);
> - return -EINVAL;
> - }
> -
> - ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
> - UIO_MAXIOV, VHOST_ACCESS_RO);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d in indirect.\n", ret);
> - return ret;
> - }
> - iov_iter_init(&from, READ, vq->indirect, ret, len);
> -
> - /* We will use the result as an address to read from, so most
> - * architectures only need a compiler barrier here. */
> - read_barrier_depends();
> -
> - count = len / sizeof desc;
> - /* Buffers are chained via a 16 bit next field, so
> - * we can have at most 2^16 of these. */
> - if (unlikely(count > USHRT_MAX + 1)) {
> - vq_err(vq, "Indirect buffer length too big: %d\n",
> - indirect->len);
> - return -E2BIG;
> - }
> -
> - do {
> - unsigned iov_count = *in_num + *out_num;
> - if (unlikely(++found > count)) {
> - vq_err(vq, "Loop detected: last one at %u "
> - "indirect size %u\n",
> - i, count);
> - return -EINVAL;
> - }
> - if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
> - vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
> - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> - return -EINVAL;
> - }
> - if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
> - vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
> - i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
> - return -EINVAL;
> - }
> -
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> - access = VHOST_ACCESS_WO;
> - else
> - access = VHOST_ACCESS_RO;
> -
> - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> - iov_size - iov_count, access);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d indirect idx %d\n",
> - ret, i);
> - return ret;
> - }
> - /* If this is an input descriptor, increment that count. */
> - if (access == VHOST_ACCESS_WO) {
> - *in_num += ret;
> - if (unlikely(log && ret)) {
> - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> - ++*log_num;
> - }
> - } else {
> - /* If it's an output descriptor, they're all supposed
> - * to come before any input descriptors. */
> - if (unlikely(*in_num)) {
> - vq_err(vq, "Indirect descriptor "
> - "has out after in: idx %d\n", i);
> - return -EINVAL;
> - }
> - *out_num += ret;
> - }
> - } while ((i = next_desc(vq, &desc)) != -1);
> - return 0;
> -}
> -
> -/* This looks in the virtqueue and for the first available buffer, and converts
> - * it to an iovec for convenient access. Since descriptors consist of some
> - * number of output then some number of input descriptors, it's actually two
> - * iovecs, but we pack them into one and note how many of each there were.
> - *
> - * This function returns the descriptor number found, or vq->num (which is
> - * never a valid descriptor number) if none was found. A negative code is
> - * returned on error. */
> -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num)
> -{
> - struct vring_desc desc;
> - unsigned int i, head, found = 0;
> - u16 last_avail_idx;
> - __virtio16 avail_idx;
> - __virtio16 ring_head;
> - int ret, access;
> -
> - /* Check it isn't doing very strange things with descriptor numbers. */
> - last_avail_idx = vq->last_avail_idx;
> -
> - if (vq->avail_idx == vq->last_avail_idx) {
> - if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> - vq_err(vq, "Failed to access avail idx at %p\n",
> - &vq->avail->idx);
> - return -EFAULT;
> - }
> - vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
> -
> - if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
> - vq_err(vq, "Guest moved used index from %u to %u",
> - last_avail_idx, vq->avail_idx);
> - return -EFAULT;
> - }
> -
> - /* If there's nothing new since last we looked, return
> - * invalid.
> - */
> - if (vq->avail_idx == last_avail_idx)
> - return vq->num;
> -
> - /* Only get avail ring entries after they have been
> - * exposed by guest.
> - */
> - smp_rmb();
> - }
> -
> - /* Grab the next descriptor number they're advertising, and increment
> - * the index we've seen. */
> - if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
> - vq_err(vq, "Failed to read head: idx %d address %p\n",
> - last_avail_idx,
> - &vq->avail->ring[last_avail_idx % vq->num]);
> - return -EFAULT;
> - }
> -
> - head = vhost16_to_cpu(vq, ring_head);
> -
> - /* If their number is silly, that's an error. */
> - if (unlikely(head >= vq->num)) {
> - vq_err(vq, "Guest says index %u > %u is available",
> - head, vq->num);
> - return -EINVAL;
> - }
> -
> - /* When we start there are none of either input nor output. */
> - *out_num = *in_num = 0;
> - if (unlikely(log))
> - *log_num = 0;
> -
> - i = head;
> - do {
> - unsigned iov_count = *in_num + *out_num;
> - if (unlikely(i >= vq->num)) {
> - vq_err(vq, "Desc index is %u > %u, head = %u",
> - i, vq->num, head);
> - return -EINVAL;
> - }
> - if (unlikely(++found > vq->num)) {
> - vq_err(vq, "Loop detected: last one at %u "
> - "vq size %u head %u\n",
> - i, vq->num, head);
> - return -EINVAL;
> - }
> - ret = vhost_get_desc(vq, &desc, i);
> - if (unlikely(ret)) {
> - vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
> - i, vq->desc + i);
> - return -EFAULT;
> - }
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
> - ret = get_indirect(vq, iov, iov_size,
> - out_num, in_num,
> - log, log_num, &desc);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Failure detected "
> - "in indirect descriptor at idx %d\n", i);
> - return ret;
> - }
> - continue;
> - }
> -
> - if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
> - access = VHOST_ACCESS_WO;
> - else
> - access = VHOST_ACCESS_RO;
> - ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
> - vhost32_to_cpu(vq, desc.len), iov + iov_count,
> - iov_size - iov_count, access);
> - if (unlikely(ret < 0)) {
> - if (ret != -EAGAIN)
> - vq_err(vq, "Translation failure %d descriptor idx %d\n",
> - ret, i);
> - return ret;
> - }
> - if (access == VHOST_ACCESS_WO) {
> - /* If this is an input descriptor,
> - * increment that count. */
> - *in_num += ret;
> - if (unlikely(log && ret)) {
> - log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
> - log[*log_num].len = vhost32_to_cpu(vq, desc.len);
> - ++*log_num;
> - }
> - } else {
> - /* If it's an output descriptor, they're all supposed
> - * to come before any input descriptors. */
> - if (unlikely(*in_num)) {
> - vq_err(vq, "Descriptor has out after in: "
> - "idx %d\n", i);
> - return -EINVAL;
> - }
> - *out_num += ret;
> - }
> - } while ((i = next_desc(vq, &desc)) != -1);
> -
> - /* On success, increment avail index. */
> - vq->last_avail_idx++;
> -
> - /* Assume notifications from guest are disabled at this point,
> - * if they aren't we would need to update avail_event index. */
> - BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
> - return head;
> -}
> -EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> -
> static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
> {
> BUG_ON(!vq->ndescs);
> @@ -2428,7 +2191,7 @@ static int fetch_indirect_descs(struct vhost_virtqueue *vq,
>
> /* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> * A negative code is returned on error. */
> -static int fetch_descs(struct vhost_virtqueue *vq)
> +static int fetch_buf(struct vhost_virtqueue *vq)
> {
> unsigned int i, head, found = 0;
> struct vhost_desc *last;
> @@ -2441,7 +2204,11 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> /* Check it isn't doing very strange things with descriptor numbers. */
> last_avail_idx = vq->last_avail_idx;
>
> - if (vq->avail_idx == vq->last_avail_idx) {
> + if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
> + /* If we already have work to do, don't bother re-checking. */
> + if (likely(vq->ndescs))
> + return 1;
If this path is taken while vq->ndescs <= vhost_vq_num_batch_descs(vq),
the for {} loop in fetch_descs() will never exit: fetch_buf() keeps
returning 1 without making any progress. Do we want to accumulate as many
buffers as possible, or return them early? Maybe we should find a proper
threshold?
(Still testing next commits)
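One way to guarantee forward progress, purely as a sketch (this is not in
the patch, and whether to stop early or keep accumulating is exactly the
open question above): have fetch_buf() report the caught-up ring as "no
progress" instead of success, and let fetch_descs() treat a non-empty
cache as success on its own. The elided part is the existing avail-idx
refresh:

        /* In fetch_buf(), sketch: */
        if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
                /* Nothing new in the ring: return 0 so the batching
                 * loop in fetch_descs() can terminate. */
                if (likely(vq->ndescs))
                        return 0;
                ...
        }

        /* At the end of fetch_descs(), sketch: */
        if (unlikely(ret < 0))
                return ret;
        return vq->ndescs ? 1 : 0;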
> +
> if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
> vq_err(vq, "Failed to access avail idx at %p\n",
> &vq->avail->idx);
> @@ -2532,6 +2299,41 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> return 1;
> }
>
> +/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> + * A negative code is returned on error. */
> +static int fetch_descs(struct vhost_virtqueue *vq)
> +{
> + int ret;
> +
> + if (unlikely(vq->first_desc >= vq->ndescs)) {
> + vq->first_desc = 0;
> + vq->ndescs = 0;
> + }
> +
> + if (vq->ndescs)
> + return 1;
> +
> + for (ret = 1;
> + ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
> + ret = fetch_buf(vq))
> + ;
> +
> + /* On success we expect some descs */
> + BUG_ON(ret > 0 && !vq->ndescs);
> + return ret;
> +}
> +
> +/* Reverse the effects of fetch_descs */
> +static void unfetch_descs(struct vhost_virtqueue *vq)
> +{
> + int i;
> +
> + for (i = vq->first_desc; i < vq->ndescs; ++i)
> + if (!(vq->descs[i].flags & VRING_DESC_F_NEXT))
> + vq->last_avail_idx -= 1;
> + vq->ndescs = 0;
> +}
> +
> /* This looks in the virtqueue and for the first available buffer, and converts
> * it to an iovec for convenient access. Since descriptors consist of some
> * number of output then some number of input descriptors, it's actually two
> @@ -2540,7 +2342,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
> * This function returns the descriptor number found, or vq->num (which is
> * never a valid descriptor number) if none was found. A negative code is
> * returned on error. */
> -int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> struct iovec iov[], unsigned int iov_size,
> unsigned int *out_num, unsigned int *in_num,
> struct vhost_log *log, unsigned int *log_num)
> @@ -2549,7 +2351,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> int i;
>
> if (ret <= 0)
> - goto err_fetch;
> + goto err;
This return needs to differentiate between fetch_descs() returning 0 (in
that case, it needs to return vq->num) and fetch_descs() returning < 0
(goto err). (Already noted by you in a different thread; pointing it out
here just so it is not forgotten.)
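A minimal sketch of that differentiation, reusing the ret variable and
err label already in the function:

        ret = fetch_descs(vq);

        if (unlikely(ret < 0))
                goto err;
        if (!ret)
                return vq->num; /* ring empty: not an error */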
>
> /* Now convert to IOV */
> /* When we start there are none of either input nor output. */
> @@ -2557,7 +2359,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> if (unlikely(log))
> *log_num = 0;
>
> - for (i = 0; i < vq->ndescs; ++i) {
> + for (i = vq->first_desc; i < vq->ndescs; ++i) {
> unsigned iov_count = *in_num + *out_num;
> struct vhost_desc *desc = &vq->descs[i];
> int access;
> @@ -2603,24 +2405,26 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
> }
>
> ret = desc->id;
> +
> + if (!(desc->flags & VRING_DESC_F_NEXT))
> + break;
> }
>
> - vq->ndescs = 0;
> + vq->first_desc = i + 1;
>
> return ret;
>
> err:
> - vhost_discard_vq_desc(vq, 1);
> -err_fetch:
> - vq->ndescs = 0;
> + unfetch_descs(vq);
>
> return ret;
> }
> -EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
> +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>
> /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> {
> + unfetch_descs(vq);
> vq->last_avail_idx -= n;
> }
> EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 87089d51490d..fed36af5c444 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -81,6 +81,7 @@ struct vhost_virtqueue {
>
> struct vhost_desc *descs;
> int ndescs;
> + int first_desc;
> int max_descs;
>
> struct file *kick;
> @@ -189,10 +190,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
> bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
> bool vhost_log_access_ok(struct vhost_dev *);
>
> -int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
> - struct iovec iov[], unsigned int iov_count,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num);
> int vhost_get_vq_desc(struct vhost_virtqueue *,
> struct iovec iov[], unsigned int iov_count,
> unsigned int *out_num, unsigned int *in_num,
> @@ -261,6 +258,8 @@ static inline void vhost_vq_set_backend(struct vhost_virtqueue *vq,
> void *private_data)
> {
> vq->private_data = private_data;
> + vq->ndescs = 0;
> + vq->first_desc = 0;
> }
>
> /**
> --
> MST
>
On 2020/6/10 7:05 PM, Michael S. Tsirkin wrote:
>>> +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>>> /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
>>> void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
>>> {
>>> + unfetch_descs(vq);
>>> vq->last_avail_idx -= n;
>> So unfetch_descs() has decreased last_avail_idx.
>> Can we fix this by letting unfetch_descs() return the number and then we can
>> do:
>>
>> int d = unfetch_descs(vq);
>> vq->last_avail_idx -= (n > d) ? n - d: 0;
>>
>> Thanks
> That's intentional I think - we need both.
Yes, but:
>
> Unfetch_descs drops the descriptors in the cache that were
> *not returned to caller* through get_vq_desc.
>
> vhost_discard_vq_desc drops the ones that were returned through get_vq_desc.
>
> Did I miss anything?
We could count some descriptors twice. Consider the case where, e.g., we
only cache one descriptor:

fetch_descs()
    fetch_buf()
        last_avail_idx++;

Then we want to discard it:

vhost_discard_avail_buf(1)
    unfetch_descs()
        last_avail_idx--;
    last_avail_idx -= 1;
Thanks
On Thu, Jun 11, 2020 at 11:02:57AM +0800, Jason Wang wrote:
>
> On 2020/6/10 7:05 PM, Michael S. Tsirkin wrote:
> > > > +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
> > > > /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
> > > > void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
> > > > {
> > > > + unfetch_descs(vq);
> > > > vq->last_avail_idx -= n;
> > > So unfetch_descs() has decreased last_avail_idx.
> > > Can we fix this by letting unfetch_descs() return the number and then we can
> > > do:
> > >
> > > int d = unfetch_descs(vq);
> > > vq->last_avail_idx -= (n > d) ? n - d: 0;
> > >
> > > Thanks
> > That's intentional I think - we need both.
>
>
> Yes, but:
>
>
> >
> > Unfetch_descs drops the descriptors in the cache that were
> > *not returned to caller* through get_vq_desc.
> >
> > vhost_discard_vq_desc drops the ones that were returned through get_vq_desc.
> >
> > Did I miss anything?
>
> We could count some descriptors twice. Consider the case where, e.g., we
> only cache one descriptor:
>
> fetch_descs()
>     fetch_buf()
>         last_avail_idx++;
>
> Then we want to discard it:
>
> vhost_discard_avail_buf(1)
>     unfetch_descs()
>         last_avail_idx--;
>     last_avail_idx -= 1;
>
> Thanks
I don't think that happens. vhost_discard_avail_buf(1) is only called
after vhost_get_avail_buf(). vhost_get_avail_buf() increments
first_desc. unfetch_descs only counts from first_desc to ndescs.
If I'm wrong, could you show values of first_desc and ndescs in this
scenario?
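Spelling the values out for the one-cached-buffer case with the code
above: after fetch_descs(), ndescs = 1, first_desc = 0, and
last_avail_idx has been incremented once. vhost_get_vq_desc() returns
that buffer and sets first_desc = 1. vhost_discard_vq_desc(1) then calls
unfetch_descs(), whose loop runs from first_desc (1) to ndescs (1) and
thus decrements nothing, and finally does last_avail_idx -= 1. The index
is rewound exactly once.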
--
MST
On 2020/6/11 5:06 PM, Michael S. Tsirkin wrote:
> On Thu, Jun 11, 2020 at 11:02:57AM +0800, Jason Wang wrote:
>> On 2020/6/10 7:05 PM, Michael S. Tsirkin wrote:
>>>>> +EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>>>>> /* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
>>>>> void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
>>>>> {
>>>>> + unfetch_descs(vq);
>>>>> vq->last_avail_idx -= n;
>>>> So unfetch_descs() has decreased last_avail_idx.
>>>> Can we fix this by letting unfetch_descs() return the number and then we can
>>>> do:
>>>>
>>>> int d = unfetch_descs(vq);
>>>> vq->last_avail_idx -= (n > d) ? n - d: 0;
>>>>
>>>> Thanks
>>> That's intentional I think - we need both.
>>
>> Yes, but:
>>
>>
>>> Unfetch_descs drops the descriptors in the cache that were
>>> *not returned to caller* through get_vq_desc.
>>>
>>> vhost_discard_vq_desc drops the ones that were returned through get_vq_desc.
>>>
>>> Did I miss anything?
>> We could count some descriptors twice. Consider the case where, e.g., we
>> only cache one descriptor:
>>
>> fetch_descs()
>>     fetch_buf()
>>         last_avail_idx++;
>>
>> Then we want to discard it:
>>
>> vhost_discard_avail_buf(1)
>>     unfetch_descs()
>>         last_avail_idx--;
>>     last_avail_idx -= 1;
>>
>> Thanks
>
> I don't think that happens. vhost_discard_avail_buf(1) is only called
> after vhost_get_avail_buf(). vhost_get_avail_buf() increments
> first_desc. unfetch_descs only counts from first_desc to ndescs.
>
> If I'm wrong, could you show values of first_desc and ndescs in this
> scenario?
You're right, fetch_descs() cannot be called directly from the
device code.
Thanks
>