Hello everyone,
This RFC implements packed ring support in the virtio driver.
Some simple functional tests have been done with Jason's
packed ring implementation in vhost:
https://lkml.org/lkml/2018/4/23/12
Both ping and netperf worked as expected (with EVENT_IDX
disabled).
TODO:
- Refinements (for code and commit log);
- More tests;
- Bug fixes;
RFC v3 -> RFC v4:
- Make ID allocation support out-of-order (Jason);
- Various fixes for EVENT_IDX support;
RFC v2 -> RFC v3:
- Split into small patches (Jason);
- Add helper virtqueue_use_indirect() (Jason);
- Just set id for the last descriptor of a list (Jason);
- Calculate the prev in virtqueue_add_packed() (Jason);
- Fix/improve desc suppression code (Jason/MST);
- Refine the code layout for XXX_split/packed and wrappers (MST);
- Fix the comments and API in uapi (MST);
- Remove the BUG_ON() for indirect (Jason);
- Some other refinements and bug fixes;
RFC v1 -> RFC v2:
- Add indirect descriptor support - compile test only;
- Add event suppression support - compile test only;
- Move vring_packed_init() out of uapi (Jason, MST);
- Merge two loops into one in virtqueue_add_packed() (Jason);
- Split vring_unmap_one() for packed ring and split ring (Jason);
- Avoid using '%' operator (Jason);
- Rename free_head -> next_avail_idx (Jason);
- Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
- Some other refinements and bug fixes;
Thanks!
Tiwei Bie (5):
virtio: add packed ring definitions
virtio_ring: support creating packed ring
virtio_ring: add packed ring support
virtio_ring: add event idx support in packed ring
virtio_ring: enable packed ring
drivers/virtio/virtio_ring.c | 1338 ++++++++++++++++++++++------
include/linux/virtio_ring.h | 8 +-
include/uapi/linux/virtio_config.h | 12 +-
include/uapi/linux/virtio_ring.h | 36 +
4 files changed, 1116 insertions(+), 278 deletions(-)
--
2.17.0
This commit introduces support for creating a packed ring.
All split-ring-specific functions are given a _split suffix.
Some necessary stubs for the packed ring are also added.
Signed-off-by: Tiwei Bie <[email protected]>
---
drivers/virtio/virtio_ring.c | 764 +++++++++++++++++++++++------------
include/linux/virtio_ring.h | 8 +-
2 files changed, 513 insertions(+), 259 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 71458f493cf8..62d7c407841a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -64,8 +64,8 @@ struct vring_desc_state {
struct vring_virtqueue {
struct virtqueue vq;
- /* Actual memory layout for this queue */
- struct vring vring;
+ /* Is this a packed ring? */
+ bool packed;
/* Can we use weak barriers? */
bool weak_barriers;
@@ -79,19 +79,45 @@ struct vring_virtqueue {
/* Host publishes avail event idx */
bool event;
- /* Head of free buffer list. */
- unsigned int free_head;
/* Number we've added since last sync. */
unsigned int num_added;
/* Last used index we've seen. */
u16 last_used_idx;
- /* Last written value to avail->flags */
- u16 avail_flags_shadow;
+ union {
+ /* Available for split ring */
+ struct {
+ /* Actual memory layout for this queue. */
+ struct vring vring;
- /* Last written value to avail->idx in guest byte order */
- u16 avail_idx_shadow;
+ /* Head of free buffer list. */
+ unsigned int free_head;
+
+ /* Last written value to avail->flags */
+ u16 avail_flags_shadow;
+
+ /* Last written value to avail->idx in
+ * guest byte order. */
+ u16 avail_idx_shadow;
+ };
+
+ /* Available for packed ring */
+ struct {
+ /* Actual memory layout for this queue. */
+ struct vring_packed vring_packed;
+
+ /* Driver ring wrap counter. */
+ u8 wrap_counter;
+
+ /* Index of the next avail descriptor. */
+ u16 next_avail_idx;
+
+ /* Last written value to driver->flags in
+ * guest byte order. */
+ u16 event_flags_shadow;
+ };
+ };
/* How to notify other side. FIXME: commonalize hcalls! */
bool (*notify)(struct virtqueue *vq);
@@ -201,8 +227,17 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
cpu_addr, size, direction);
}
-static void vring_unmap_one(const struct vring_virtqueue *vq,
- struct vring_desc *desc)
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+ dma_addr_t addr)
+{
+ if (!vring_use_dma_api(vq->vq.vdev))
+ return 0;
+
+ return dma_mapping_error(vring_dma_dev(vq), addr);
+}
+
+static void vring_unmap_one_split(const struct vring_virtqueue *vq,
+ struct vring_desc *desc)
{
u16 flags;
@@ -226,17 +261,9 @@ static void vring_unmap_one(const struct vring_virtqueue *vq,
}
}
-static int vring_mapping_error(const struct vring_virtqueue *vq,
- dma_addr_t addr)
-{
- if (!vring_use_dma_api(vq->vq.vdev))
- return 0;
-
- return dma_mapping_error(vring_dma_dev(vq), addr);
-}
-
-static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
- unsigned int total_sg, gfp_t gfp)
+static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
+ unsigned int total_sg,
+ gfp_t gfp)
{
struct vring_desc *desc;
unsigned int i;
@@ -257,14 +284,14 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
return desc;
}
-static inline int virtqueue_add(struct virtqueue *_vq,
- struct scatterlist *sgs[],
- unsigned int total_sg,
- unsigned int out_sgs,
- unsigned int in_sgs,
- void *data,
- void *ctx,
- gfp_t gfp)
+static inline int virtqueue_add_split(struct virtqueue *_vq,
+ struct scatterlist *sgs[],
+ unsigned int total_sg,
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ void *data,
+ void *ctx,
+ gfp_t gfp)
{
struct vring_virtqueue *vq = to_vvq(_vq);
struct scatterlist *sg;
@@ -303,7 +330,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
/* If the host supports indirect descriptor tables, and we have multiple
* buffers, then go indirect. FIXME: tune this threshold */
if (vq->indirect && total_sg > 1 && vq->vq.num_free)
- desc = alloc_indirect(_vq, total_sg, gfp);
+ desc = alloc_indirect_split(_vq, total_sg, gfp);
else {
desc = NULL;
WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
@@ -424,7 +451,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
for (n = 0; n < total_sg; n++) {
if (i == err_idx)
break;
- vring_unmap_one(vq, &desc[i]);
+ vring_unmap_one_split(vq, &desc[i]);
i = virtio16_to_cpu(_vq->vdev, vq->vring.desc[i].next);
}
@@ -435,6 +462,355 @@ static inline int virtqueue_add(struct virtqueue *_vq,
return -EIO;
}
+static bool virtqueue_kick_prepare_split(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ u16 new, old;
+ bool needs_kick;
+
+ START_USE(vq);
+ /* We need to expose available array entries before checking avail
+ * event. */
+ virtio_mb(vq->weak_barriers);
+
+ old = vq->avail_idx_shadow - vq->num_added;
+ new = vq->avail_idx_shadow;
+ vq->num_added = 0;
+
+#ifdef DEBUG
+ if (vq->last_add_time_valid) {
+ WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
+ vq->last_add_time)) > 100);
+ }
+ vq->last_add_time_valid = false;
+#endif
+
+ if (vq->event) {
+ needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
+ new, old);
+ } else {
+ needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
+ }
+ END_USE(vq);
+ return needs_kick;
+}
+
+static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
+ void **ctx)
+{
+ unsigned int i, j;
+ __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
+
+ /* Clear data ptr. */
+ vq->desc_state[head].data = NULL;
+
+ /* Put back on free list: unmap first-level descriptors and find end */
+ i = head;
+
+ while (vq->vring.desc[i].flags & nextflag) {
+ vring_unmap_one_split(vq, &vq->vring.desc[i]);
+ i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
+ vq->vq.num_free++;
+ }
+
+ vring_unmap_one_split(vq, &vq->vring.desc[i]);
+ vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
+ vq->free_head = head;
+
+ /* Plus final descriptor */
+ vq->vq.num_free++;
+
+ if (vq->indirect) {
+ struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
+ u32 len;
+
+ /* Free the indirect table, if any, now that it's unmapped. */
+ if (!indir_desc)
+ return;
+
+ len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
+
+ BUG_ON(!(vq->vring.desc[head].flags &
+ cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
+ BUG_ON(len == 0 || len % sizeof(struct vring_desc));
+
+ for (j = 0; j < len / sizeof(struct vring_desc); j++)
+ vring_unmap_one_split(vq, &indir_desc[j]);
+
+ kfree(indir_desc);
+ vq->desc_state[head].indir_desc = NULL;
+ } else if (ctx) {
+ *ctx = vq->desc_state[head].indir_desc;
+ }
+}
+
+static inline bool more_used_split(const struct vring_virtqueue *vq)
+{
+ return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
+}
+
+static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq,
+ unsigned int *len,
+ void **ctx)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ void *ret;
+ unsigned int i;
+ u16 last_used;
+
+ START_USE(vq);
+
+ if (unlikely(vq->broken)) {
+ END_USE(vq);
+ return NULL;
+ }
+
+ if (!more_used_split(vq)) {
+ pr_debug("No more buffers in queue\n");
+ END_USE(vq);
+ return NULL;
+ }
+
+ /* Only get used array entries after they have been exposed by host. */
+ virtio_rmb(vq->weak_barriers);
+
+ last_used = (vq->last_used_idx & (vq->vring.num - 1));
+ i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
+ *len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
+
+ if (unlikely(i >= vq->vring.num)) {
+ BAD_RING(vq, "id %u out of range\n", i);
+ return NULL;
+ }
+ if (unlikely(!vq->desc_state[i].data)) {
+ BAD_RING(vq, "id %u is not a head!\n", i);
+ return NULL;
+ }
+
+ /* detach_buf_split clears data, so grab it now. */
+ ret = vq->desc_state[i].data;
+ detach_buf_split(vq, i, ctx);
+ vq->last_used_idx++;
+ /* If we expect an interrupt for the next entry, tell host
+ * by writing event index and flush out the write before
+ * the read in the next get_buf call. */
+ if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
+ virtio_store_mb(vq->weak_barriers,
+ &vring_used_event(&vq->vring),
+ cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
+
+#ifdef DEBUG
+ vq->last_add_time_valid = false;
+#endif
+
+ END_USE(vq);
+ return ret;
+}
+
+static void virtqueue_disable_cb_split(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
+ vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+ if (!vq->event)
+ vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+ }
+}
+
+static unsigned virtqueue_enable_cb_prepare_split(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ u16 last_used_idx;
+
+ START_USE(vq);
+
+ /* We optimistically turn back on interrupts, then check if there was
+ * more to do. */
+ /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
+ * either clear the flags bit or point the event index at the next
+ * entry. Always do both to keep code simple. */
+ if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+ vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+ if (!vq->event)
+ vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+ }
+ vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
+ END_USE(vq);
+ return last_used_idx;
+}
+
+static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned last_used_idx)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ virtio_mb(vq->weak_barriers);
+ return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
+}
+
+static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ u16 bufs;
+
+ START_USE(vq);
+
+ /* We optimistically turn back on interrupts, then check if there was
+ * more to do. */
+ /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+ * either clear the flags bit or point the event index at the next
+ * entry. Always update the event index to keep code simple. */
+ if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
+ vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
+ if (!vq->event)
+ vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
+ }
+ /* TODO: tune this threshold */
+ bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
+
+ virtio_store_mb(vq->weak_barriers,
+ &vring_used_event(&vq->vring),
+ cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
+
+ if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
+ END_USE(vq);
+ return false;
+ }
+
+ END_USE(vq);
+ return true;
+}
+
+static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ unsigned int i;
+ void *buf;
+
+ START_USE(vq);
+
+ for (i = 0; i < vq->vring.num; i++) {
+ if (!vq->desc_state[i].data)
+ continue;
+ /* detach_buf clears data, so grab it now. */
+ buf = vq->desc_state[i].data;
+ detach_buf_split(vq, i, NULL);
+ vq->avail_idx_shadow--;
+ vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
+ END_USE(vq);
+ return buf;
+ }
+ /* That should have freed everything. */
+ BUG_ON(vq->vq.num_free != vq->vring.num);
+
+ END_USE(vq);
+ return NULL;
+}
+
+/*
+ * The layout for the packed ring is a continuous chunk of memory
+ * which looks like this.
+ *
+ * struct vring_packed {
+ * // The actual descriptors (16 bytes each)
+ * struct vring_packed_desc desc[num];
+ *
+ * // Padding to the next align boundary.
+ * char pad[];
+ *
+ * // Driver Event Suppression
+ * struct vring_packed_desc_event driver;
+ *
+ * // Device Event Suppression
+ * struct vring_packed_desc_event device;
+ * };
+ */
+static inline void vring_init_packed(struct vring_packed *vr, unsigned int num,
+ void *p, unsigned long align)
+{
+ vr->num = num;
+ vr->desc = p;
+ vr->driver = (void *)(((uintptr_t)p + sizeof(struct vring_packed_desc)
+ * num + align - 1) & ~(align - 1));
+ vr->device = vr->driver + 1;
+}
+
+static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
+{
+ return ((sizeof(struct vring_packed_desc) * num + align - 1)
+ & ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
+}
+
+static inline int virtqueue_add_packed(struct virtqueue *_vq,
+ struct scatterlist *sgs[],
+ unsigned int total_sg,
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ void *data,
+ void *ctx,
+ gfp_t gfp)
+{
+ return -EIO;
+}
+
+static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
+{
+ return false;
+}
+
+static inline bool more_used_packed(const struct vring_virtqueue *vq)
+{
+ return false;
+}
+
+static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
+ unsigned int *len,
+ void **ctx)
+{
+ return NULL;
+}
+
+static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
+{
+}
+
+static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
+{
+ return 0;
+}
+
+static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
+{
+ return false;
+}
+
+static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
+{
+ return false;
+}
+
+static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
+{
+ return NULL;
+}
+
+static inline int virtqueue_add(struct virtqueue *_vq,
+ struct scatterlist *sgs[],
+ unsigned int total_sg,
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ void *data,
+ void *ctx,
+ gfp_t gfp)
+{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
+ in_sgs, data, ctx, gfp) :
+ virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
+ in_sgs, data, ctx, gfp);
+}
+
/**
* virtqueue_add_sgs - expose buffers to other end
* @vq: the struct virtqueue we're talking about.
@@ -551,34 +927,9 @@ EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx);
bool virtqueue_kick_prepare(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- u16 new, old;
- bool needs_kick;
- START_USE(vq);
- /* We need to expose available array entries before checking avail
- * event. */
- virtio_mb(vq->weak_barriers);
-
- old = vq->avail_idx_shadow - vq->num_added;
- new = vq->avail_idx_shadow;
- vq->num_added = 0;
-
-#ifdef DEBUG
- if (vq->last_add_time_valid) {
- WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
- vq->last_add_time)) > 100);
- }
- vq->last_add_time_valid = false;
-#endif
-
- if (vq->event) {
- needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
- new, old);
- } else {
- needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
- }
- END_USE(vq);
- return needs_kick;
+ return vq->packed ? virtqueue_kick_prepare_packed(_vq) :
+ virtqueue_kick_prepare_split(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
@@ -626,58 +977,9 @@ bool virtqueue_kick(struct virtqueue *vq)
}
EXPORT_SYMBOL_GPL(virtqueue_kick);
-static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
- void **ctx)
-{
- unsigned int i, j;
- __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
-
- /* Clear data ptr. */
- vq->desc_state[head].data = NULL;
-
- /* Put back on free list: unmap first-level descriptors and find end */
- i = head;
-
- while (vq->vring.desc[i].flags & nextflag) {
- vring_unmap_one(vq, &vq->vring.desc[i]);
- i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
- vq->vq.num_free++;
- }
-
- vring_unmap_one(vq, &vq->vring.desc[i]);
- vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
- vq->free_head = head;
-
- /* Plus final descriptor */
- vq->vq.num_free++;
-
- if (vq->indirect) {
- struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
- u32 len;
-
- /* Free the indirect table, if any, now that it's unmapped. */
- if (!indir_desc)
- return;
-
- len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
-
- BUG_ON(!(vq->vring.desc[head].flags &
- cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
- BUG_ON(len == 0 || len % sizeof(struct vring_desc));
-
- for (j = 0; j < len / sizeof(struct vring_desc); j++)
- vring_unmap_one(vq, &indir_desc[j]);
-
- kfree(indir_desc);
- vq->desc_state[head].indir_desc = NULL;
- } else if (ctx) {
- *ctx = vq->desc_state[head].indir_desc;
- }
-}
-
static inline bool more_used(const struct vring_virtqueue *vq)
{
- return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
+ return vq->packed ? more_used_packed(vq) : more_used_split(vq);
}
/**
@@ -700,57 +1002,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
void **ctx)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- void *ret;
- unsigned int i;
- u16 last_used;
- START_USE(vq);
-
- if (unlikely(vq->broken)) {
- END_USE(vq);
- return NULL;
- }
-
- if (!more_used(vq)) {
- pr_debug("No more buffers in queue\n");
- END_USE(vq);
- return NULL;
- }
-
- /* Only get used array entries after they have been exposed by host. */
- virtio_rmb(vq->weak_barriers);
-
- last_used = (vq->last_used_idx & (vq->vring.num - 1));
- i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
- *len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
-
- if (unlikely(i >= vq->vring.num)) {
- BAD_RING(vq, "id %u out of range\n", i);
- return NULL;
- }
- if (unlikely(!vq->desc_state[i].data)) {
- BAD_RING(vq, "id %u is not a head!\n", i);
- return NULL;
- }
-
- /* detach_buf clears data, so grab it now. */
- ret = vq->desc_state[i].data;
- detach_buf(vq, i, ctx);
- vq->last_used_idx++;
- /* If we expect an interrupt for the next entry, tell host
- * by writing event index and flush out the write before
- * the read in the next get_buf call. */
- if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
- virtio_store_mb(vq->weak_barriers,
- &vring_used_event(&vq->vring),
- cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
-
-#ifdef DEBUG
- vq->last_add_time_valid = false;
-#endif
-
- END_USE(vq);
- return ret;
+ return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
+ virtqueue_get_buf_ctx_split(_vq, len, ctx);
}
EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
@@ -772,12 +1026,10 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
- vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
- if (!vq->event)
- vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
- }
-
+ if (vq->packed)
+ virtqueue_disable_cb_packed(_vq);
+ else
+ virtqueue_disable_cb_split(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
@@ -796,23 +1048,9 @@ EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- u16 last_used_idx;
- START_USE(vq);
-
- /* We optimistically turn back on interrupts, then check if there was
- * more to do. */
- /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
- * either clear the flags bit or point the event index at the next
- * entry. Always do both to keep code simple. */
- if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
- vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
- if (!vq->event)
- vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
- }
- vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
- END_USE(vq);
- return last_used_idx;
+ return vq->packed ? virtqueue_enable_cb_prepare_packed(_vq) :
+ virtqueue_enable_cb_prepare_split(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
@@ -829,8 +1067,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- virtio_mb(vq->weak_barriers);
- return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
+ return vq->packed ? virtqueue_poll_packed(_vq, last_used_idx) :
+ virtqueue_poll_split(_vq, last_used_idx);
}
EXPORT_SYMBOL_GPL(virtqueue_poll);
@@ -868,34 +1106,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- u16 bufs;
- START_USE(vq);
-
- /* We optimistically turn back on interrupts, then check if there was
- * more to do. */
- /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
- * either clear the flags bit or point the event index at the next
- * entry. Always update the event index to keep code simple. */
- if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
- vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
- if (!vq->event)
- vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
- }
- /* TODO: tune this threshold */
- bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
-
- virtio_store_mb(vq->weak_barriers,
- &vring_used_event(&vq->vring),
- cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
-
- if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
- END_USE(vq);
- return false;
- }
-
- END_USE(vq);
- return true;
+ return vq->packed ? virtqueue_enable_cb_delayed_packed(_vq) :
+ virtqueue_enable_cb_delayed_split(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
@@ -910,27 +1123,9 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- unsigned int i;
- void *buf;
- START_USE(vq);
-
- for (i = 0; i < vq->vring.num; i++) {
- if (!vq->desc_state[i].data)
- continue;
- /* detach_buf clears data, so grab it now. */
- buf = vq->desc_state[i].data;
- detach_buf(vq, i, NULL);
- vq->avail_idx_shadow--;
- vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
- END_USE(vq);
- return buf;
- }
- /* That should have freed everything. */
- BUG_ON(vq->vq.num_free != vq->vring.num);
-
- END_USE(vq);
- return NULL;
+ return vq->packed ? virtqueue_detach_unused_buf_packed(_vq) :
+ virtqueue_detach_unused_buf_split(_vq);
}
EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
@@ -955,7 +1150,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
EXPORT_SYMBOL_GPL(vring_interrupt);
struct virtqueue *__vring_new_virtqueue(unsigned int index,
- struct vring vring,
+ union vring_union vring,
+ bool packed,
struct virtio_device *vdev,
bool weak_barriers,
bool context,
@@ -963,19 +1159,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
void (*callback)(struct virtqueue *),
const char *name)
{
- unsigned int i;
+ unsigned int num, i;
struct vring_virtqueue *vq;
- vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
+ num = packed ? vring.vring_packed.num : vring.vring_split.num;
+
+ vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
GFP_KERNEL);
if (!vq)
return NULL;
- vq->vring = vring;
vq->vq.callback = callback;
vq->vq.vdev = vdev;
vq->vq.name = name;
- vq->vq.num_free = vring.num;
+ vq->vq.num_free = num;
vq->vq.index = index;
vq->we_own_ring = false;
vq->queue_dma_addr = 0;
@@ -984,9 +1181,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
vq->weak_barriers = weak_barriers;
vq->broken = false;
vq->last_used_idx = 0;
- vq->avail_flags_shadow = 0;
- vq->avail_idx_shadow = 0;
vq->num_added = 0;
+ vq->packed = packed;
list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
vq->in_use = false;
@@ -997,18 +1193,37 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
!context;
vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
+ if (vq->packed) {
+ vq->vring_packed = vring.vring_packed;
+ vq->next_avail_idx = 0;
+ vq->wrap_counter = 1;
+ vq->event_flags_shadow = 0;
+ } else {
+ vq->vring = vring.vring_split;
+ vq->avail_flags_shadow = 0;
+ vq->avail_idx_shadow = 0;
+
+ /* Put everything in free lists. */
+ vq->free_head = 0;
+ for (i = 0; i < num-1; i++)
+ vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
+ }
+
/* No callback? Tell other side not to bother us. */
if (!callback) {
- vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
- if (!vq->event)
- vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
+ if (packed) {
+ vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
+ vq->vring_packed.driver->flags = cpu_to_virtio16(vdev,
+ vq->event_flags_shadow);
+ } else {
+ vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
+ if (!vq->event)
+ vq->vring.avail->flags = cpu_to_virtio16(vdev,
+ vq->avail_flags_shadow);
+ }
}
- /* Put everything in free lists. */
- vq->free_head = 0;
- for (i = 0; i < vring.num-1; i++)
- vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
- memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
+ memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
return &vq->vq;
}
@@ -1056,6 +1271,12 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
}
}
+static inline int
+__vring_size(unsigned int num, unsigned long align, bool packed)
+{
+ return packed ? vring_size_packed(num, align) : vring_size(num, align);
+}
+
struct virtqueue *vring_create_virtqueue(
unsigned int index,
unsigned int num,
@@ -1072,7 +1293,8 @@ struct virtqueue *vring_create_virtqueue(
void *queue = NULL;
dma_addr_t dma_addr;
size_t queue_size_in_bytes;
- struct vring vring;
+ union vring_union vring;
+ bool packed;
/* We assume num is a power of 2. */
if (num & (num - 1)) {
@@ -1080,9 +1302,13 @@ struct virtqueue *vring_create_virtqueue(
return NULL;
}
+ packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+
/* TODO: allocate each queue chunk individually */
- for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
- queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+ for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
+ num /= 2) {
+ queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+ packed),
&dma_addr,
GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
if (queue)
@@ -1094,17 +1320,21 @@ struct virtqueue *vring_create_virtqueue(
if (!queue) {
/* Try to get a single page. You are my only hope! */
- queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
+ queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
+ packed),
&dma_addr, GFP_KERNEL|__GFP_ZERO);
}
if (!queue)
return NULL;
- queue_size_in_bytes = vring_size(num, vring_align);
- vring_init(&vring, num, queue, vring_align);
+ queue_size_in_bytes = __vring_size(num, vring_align, packed);
+ if (packed)
+ vring_init_packed(&vring.vring_packed, num, queue, vring_align);
+ else
+ vring_init(&vring.vring_split, num, queue, vring_align);
- vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
- notify, callback, name);
+ vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+ context, notify, callback, name);
if (!vq) {
vring_free_queue(vdev, queue_size_in_bytes, queue,
dma_addr);
@@ -1130,10 +1360,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
void (*callback)(struct virtqueue *vq),
const char *name)
{
- struct vring vring;
- vring_init(&vring, num, pages, vring_align);
- return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
- notify, callback, name);
+ union vring_union vring;
+ bool packed;
+
+ packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
+ if (packed)
+ vring_init_packed(&vring.vring_packed, num, pages, vring_align);
+ else
+ vring_init(&vring.vring_split, num, pages, vring_align);
+
+ return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
+ context, notify, callback, name);
}
EXPORT_SYMBOL_GPL(vring_new_virtqueue);
@@ -1143,7 +1380,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
if (vq->we_own_ring) {
vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
- vq->vring.desc, vq->queue_dma_addr);
+ vq->packed ? (void *)vq->vring_packed.desc :
+ (void *)vq->vring.desc,
+ vq->queue_dma_addr);
}
list_del(&_vq->list);
kfree(vq);
@@ -1185,7 +1424,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
struct vring_virtqueue *vq = to_vvq(_vq);
- return vq->vring.num;
+ return vq->packed ? vq->vring_packed.num : vq->vring.num;
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
@@ -1228,6 +1467,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
BUG_ON(!vq->we_own_ring);
+ if (vq->packed)
+ return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
+ (char *)vq->vring_packed.desc);
+
return vq->queue_dma_addr +
((char *)vq->vring.avail - (char *)vq->vring.desc);
}
@@ -1239,11 +1482,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
BUG_ON(!vq->we_own_ring);
+ if (vq->packed)
+ return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
+ (char *)vq->vring_packed.desc);
+
return vq->queue_dma_addr +
((char *)vq->vring.used - (char *)vq->vring.desc);
}
EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
+/* Only available for split ring */
const struct vring *virtqueue_get_vring(struct virtqueue *vq)
{
return &to_vvq(vq)->vring;
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index bbf32524ab27..a0075894ad16 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
struct virtio_device;
struct virtqueue;
+union vring_union {
+ struct vring vring_split;
+ struct vring_packed vring_packed;
+};
+
/*
* Creates a virtqueue and allocates the descriptor ring. If
* may_reduce_num is set, then this may allocate a smaller ring than
@@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
/* Creates a virtqueue with a custom layout. */
struct virtqueue *__vring_new_virtqueue(unsigned int index,
- struct vring vring,
+ union vring_union vring,
+ bool packed,
struct virtio_device *vdev,
bool weak_barriers,
bool ctx,
--
2.17.0
This commit introduces the basic support (without EVENT_IDX)
for packed ring.
Signed-off-by: Tiwei Bie <[email protected]>
---
drivers/virtio/virtio_ring.c | 491 ++++++++++++++++++++++++++++++++++-
1 file changed, 481 insertions(+), 10 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 62d7c407841a..c6c5deb0e3ae 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -58,7 +58,8 @@
struct vring_desc_state {
void *data; /* Data for callback. */
- struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
+ void *indir_desc; /* Indirect descriptor table, if any; the packed add path stores ctx here otherwise. */
+ int num; /* Ring descriptors used by the buffer (set on the packed add path). */
};
struct vring_virtqueue {
@@ -116,6 +117,9 @@ struct vring_virtqueue {
/* Last written value to driver->flags in
* guest byte order. */
u16 event_flags_shadow;
+
+ /* ID allocation. */
+ struct idr buffer_id;
};
};
@@ -142,6 +146,16 @@ struct vring_virtqueue {
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+static inline bool virtqueue_use_indirect(struct virtqueue *_vq,
+ unsigned int total_sg)
+{ /* Shared by the split and packed add paths to decide whether to build an indirect table. */
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ /* If the host supports indirect descriptor tables, and we have multiple
+ * buffers, then go indirect. FIXME: tune this threshold */
+ return (vq->indirect && total_sg > 1 && vq->vq.num_free); /* Also requires at least one free ring slot. */
+}
+
/*
* Modern virtio devices have feature bits to specify whether they need a
* quirk and bypass the IOMMU. If not there, just use the DMA API.
@@ -327,9 +341,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
head = vq->free_head;
- /* If the host supports indirect descriptor tables, and we have multiple
- * buffers, then go indirect. FIXME: tune this threshold */
- if (vq->indirect && total_sg > 1 && vq->vq.num_free)
+ if (virtqueue_use_indirect(_vq, total_sg))
desc = alloc_indirect_split(_vq, total_sg, gfp);
else {
desc = NULL;
@@ -741,6 +753,63 @@ static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
}
+static void vring_unmap_one_packed(const struct vring_virtqueue *vq,
+ struct vring_packed_desc *desc)
+{ /* Undo the DMA mapping of one packed descriptor, using the addr, len and flags stored in the descriptor itself. */
+ u16 flags;
+
+ if (!vring_use_dma_api(vq->vq.vdev))
+ return; /* Skip unmapping when the DMA API is not in use. */
+
+ flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+
+ if (flags & VRING_DESC_F_INDIRECT) { /* Indirect tables are mapped with vring_map_single(), so unmap as 'single'. */
+ dma_unmap_single(vring_dma_dev(vq),
+ virtio64_to_cpu(vq->vq.vdev, desc->addr),
+ virtio32_to_cpu(vq->vq.vdev, desc->len),
+ (flags & VRING_DESC_F_WRITE) ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ } else { /* Every other descriptor is unmapped as a page mapping. */
+ dma_unmap_page(vring_dma_dev(vq),
+ virtio64_to_cpu(vq->vq.vdev, desc->addr),
+ virtio32_to_cpu(vq->vq.vdev, desc->len),
+ (flags & VRING_DESC_F_WRITE) ?
+ DMA_FROM_DEVICE : DMA_TO_DEVICE);
+ }
+}
+
+static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
+ unsigned int total_sg,
+ gfp_t gfp)
+{ /* Allocate an uninitialised table of total_sg packed descriptors; returns NULL if kmalloc() fails. */
+ struct vring_packed_desc *desc;
+
+ /*
+ * We require lowmem mappings for the descriptors because
+ * otherwise virt_to_phys will give us bogus addresses in the
+ * virtqueue.
+ */
+ gfp &= ~__GFP_HIGHMEM;
+
+ desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp); /* NOTE(review): unchecked multiply; kmalloc_array() would guard against overflow. */
+
+ return desc;
+}
+
+static u16 alloc_id_packed(struct vring_virtqueue *vq)
+{ /* Reserve a free buffer ID in [0, vring_packed.num) from the per-queue IDR. */
+ u16 id; /* NOTE(review): idr_alloc() returns int; a negative error code would be truncated into a bogus ID. */
+
+ id = idr_alloc(&vq->buffer_id, NULL, 0, vq->vring_packed.num,
+ GFP_KERNEL); /* NOTE(review): GFP_KERNEL may sleep and ignores the gfp passed to the add path - confirm this is safe for all callers. */
+ return id;
+}
+
+static void free_id_packed(struct vring_virtqueue *vq, u16 id)
+{ /* Release a buffer ID back to the per-queue IDR. */
+ idr_remove(&vq->buffer_id, id);
+}
+
static inline int virtqueue_add_packed(struct virtqueue *_vq,
struct scatterlist *sgs[],
unsigned int total_sg,
@@ -750,47 +819,446 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
void *ctx,
gfp_t gfp)
{
+ struct vring_virtqueue *vq = to_vvq(_vq); /* Adds one buffer (out_sgs device-readable lists, then in_sgs device-writable lists) to the packed ring; returns 0, -ENOSPC or -EIO. */
+ struct vring_packed_desc *desc;
+ struct scatterlist *sg;
+ unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
+ __virtio16 uninitialized_var(head_flags), flags;
+ u16 head, wrap_counter, id;
+ bool indirect;
+
+ START_USE(vq);
+
+ BUG_ON(data == NULL);
+ BUG_ON(ctx && vq->indirect);
+
+ if (unlikely(vq->broken)) {
+ END_USE(vq);
+ return -EIO;
+ }
+
+#ifdef DEBUG
+ {
+ ktime_t now = ktime_get();
+
+ /* No kick or get, with .1 second between? Warn. */
+ if (vq->last_add_time_valid)
+ WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
+ > 100);
+ vq->last_add_time = now;
+ vq->last_add_time_valid = true;
+ }
+#endif
+
+ BUG_ON(total_sg == 0);
+
+ head = vq->next_avail_idx;
+ wrap_counter = vq->wrap_counter; /* Saved for the indirect head flags and so the error path can restore it. */
+
+ if (virtqueue_use_indirect(_vq, total_sg))
+ desc = alloc_indirect_packed(_vq, total_sg, gfp);
+ else {
+ desc = NULL;
+ WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
+ }
+
+ if (desc) {
+ /* Use a single buffer which doesn't continue */
+ indirect = true;
+ /* Set up rest to use this indirect table. */
+ i = 0;
+ descs_used = 1;
+ } else { /* Also taken when the indirect table allocation failed: fall back to ring slots. */
+ indirect = false;
+ desc = vq->vring_packed.desc;
+ i = head;
+ descs_used = total_sg;
+ }
+
+ if (vq->vq.num_free < descs_used) {
+ pr_debug("Can't add buf len %i - avail = %i\n",
+ descs_used, vq->vq.num_free);
+ /* FIXME: for historical reasons, we force a notify here if
+ * there are outgoing parts to the buffer. Presumably the
+ * host should service the ring ASAP. */
+ if (out_sgs)
+ vq->notify(&vq->vq);
+ if (indirect)
+ kfree(desc);
+ END_USE(vq);
+ return -ENOSPC;
+ }
+
+ id = alloc_id_packed(vq);
+
+ for (n = 0; n < out_sgs + in_sgs; n++) {
+ for (sg = sgs[n]; sg; sg = sg_next(sg)) {
+ dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ if (vring_mapping_error(vq, addr))
+ goto unmap_release;
+
+ flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
+ (n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
+ VRING_DESC_F_AVAIL(vq->wrap_counter) |
+ VRING_DESC_F_USED(!vq->wrap_counter));
+ if (!indirect && i == head)
+ head_flags = flags; /* Held back; written to the ring last, after the barrier below. */
+ else
+ desc[i].flags = flags;
+
+ desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
+ desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
+ i++;
+ if (!indirect && i >= vq->vring_packed.num) {
+ i = 0;
+ vq->wrap_counter ^= 1;
+ }
+ }
+ }
+
+ prev = (i > 0 ? i : vq->vring_packed.num) - 1; /* Index of the last descriptor written. */
+ desc[prev].id = cpu_to_virtio16(_vq->vdev, id);
+
+ /* Last one doesn't continue. */
+ if (total_sg == 1)
+ head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
+ else
+ desc[prev].flags &= cpu_to_virtio16(_vq->vdev,
+ ~VRING_DESC_F_NEXT);
+
+ if (indirect) {
+ /* Now that the indirect table is filled in, map it. */
+ dma_addr_t addr = vring_map_single(
+ vq, desc, total_sg * sizeof(struct vring_packed_desc),
+ DMA_TO_DEVICE);
+ if (vring_mapping_error(vq, addr))
+ goto unmap_release;
+
+ head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
+ VRING_DESC_F_AVAIL(wrap_counter) |
+ VRING_DESC_F_USED(!wrap_counter));
+ vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev,
+ addr);
+ vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
+ total_sg * sizeof(struct vring_packed_desc));
+ vq->vring_packed.desc[head].id = cpu_to_virtio16(_vq->vdev, id);
+ }
+
+ /* We're using some buffers from the free list. */
+ vq->vq.num_free -= descs_used;
+
+ /* Update free pointer */
+ if (indirect) {
+ n = head + 1;
+ if (n >= vq->vring_packed.num) {
+ n = 0;
+ vq->wrap_counter ^= 1;
+ }
+ vq->next_avail_idx = n;
+ } else
+ vq->next_avail_idx = i;
+
+ /* Store token and indirect buffer state. */
+ vq->desc_state[id].num = descs_used;
+ vq->desc_state[id].data = data;
+ if (indirect)
+ vq->desc_state[id].indir_desc = desc;
+ else
+ vq->desc_state[id].indir_desc = ctx;
+
+ /* A driver MUST NOT make the first descriptor in the list
+ * available before all subsequent descriptors comprising
+ * the list are made available. */
+ virtio_wmb(vq->weak_barriers);
+ vq->vring_packed.desc[head].flags = head_flags;
+ vq->num_added += descs_used;
+
+ pr_debug("Added buffer head %i to %p\n", head, vq);
+ END_USE(vq);
+
+ return 0;
+
+unmap_release:
+ err_idx = i;
+ i = indirect ? 0 : head; /* Indirect tables are filled from index 0, ring slots from head. */
+
+ for (n = 0; n < total_sg; n++) {
+ if (i == err_idx)
+ break;
+ vring_unmap_one_packed(vq, &desc[i]); /* NOTE(review): for the ring's head slot the new flags only live in head_flags, so this reads the slot's previous flags - confirm. */
+ i++;
+ if (!indirect && i >= vq->vring_packed.num)
+ i = 0;
+ }
+
+ vq->wrap_counter = wrap_counter; /* Undo any wrap toggles made while filling descriptors. */
+
+ if (indirect)
+ kfree(desc);
+
+ free_id_packed(vq, id);
+
+ END_USE(vq);
return -EIO;
}
static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
{
- return false;
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ u16 flags;
+ bool needs_kick;
+ u32 snapshot;
+
+ START_USE(vq);
+ /* We need to expose the new flags value before checking notification
+ * suppressions. */
+ virtio_mb(vq->weak_barriers);
+
+ snapshot = *(u32 *)vq->vring_packed.device; /* Read the device event area (off_wrap followed by flags) in one 32-bit load. */
+ flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3; /* NOTE(review): taking flags from the upper 16 bits assumes a little-endian CPU - confirm. */
+
+#ifdef DEBUG
+ if (vq->last_add_time_valid) {
+ WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
+ vq->last_add_time)) > 100);
+ }
+ vq->last_add_time_valid = false;
+#endif
+
+ needs_kick = (flags != VRING_EVENT_F_DISABLE); /* Kick unless the device event flags say DISABLE. */
+ END_USE(vq);
+ return needs_kick;
+}
+
+static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
+ unsigned int id, void **ctx)
+{ /* Release buffer 'id' whose first ring slot is 'head': unmap its slots, give them back to num_free, free any indirect table and release the ID. */
+ struct vring_packed_desc *desc;
+ unsigned int i, j;
+
+ /* Clear data ptr. */
+ vq->desc_state[id].data = NULL;
+
+ i = head;
+
+ for (j = 0; j < vq->desc_state[id].num; j++) { /* Unmap every ring slot the buffer occupies, wrapping at the ring end. */
+ desc = &vq->vring_packed.desc[i];
+ vring_unmap_one_packed(vq, desc);
+ i++;
+ if (i >= vq->vring_packed.num)
+ i = 0;
+ }
+
+ vq->vq.num_free += vq->desc_state[id].num;
+
+ if (vq->indirect) {
+ u32 len;
+
+ /* Free the indirect table, if any, now that it's unmapped. */
+ desc = vq->desc_state[id].indir_desc;
+ if (!desc)
+ goto out; /* No indirect table was recorded for this buffer. */
+
+ len = virtio32_to_cpu(vq->vq.vdev,
+ vq->vring_packed.desc[head].len); /* NOTE(review): table size is read back from the ring slot; confirm the device cannot have rewritten len. */
+
+ for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
+ vring_unmap_one_packed(vq, &desc[j]);
+
+ kfree(desc);
+ vq->desc_state[id].indir_desc = NULL;
+ } else if (ctx) { /* Without indirect support, indir_desc holds the ctx stored at add time. */
+ *ctx = vq->desc_state[id].indir_desc;
+ }
+
+out:
+ free_id_packed(vq, id);
}
static inline bool more_used_packed(const struct vring_virtqueue *vq)
{
- return false;
+ u16 last_used, flags;
+ bool avail, used;
+
+ if (vq->vq.num_free == vq->vring_packed.num)
+ return false; /* Every slot is free, so no buffer is outstanding. */
+
+ last_used = vq->last_used_idx;
+ flags = virtio16_to_cpu(vq->vq.vdev,
+ vq->vring_packed.desc[last_used].flags);
+ avail = flags & VRING_DESC_F_AVAIL(1);
+ used = flags & VRING_DESC_F_USED(1);
+
+ return avail == used; /* Reported as used once the AVAIL and USED bits of the slot at last_used_idx agree. */
}
static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
unsigned int *len,
void **ctx)
{
- return NULL;
+ struct vring_virtqueue *vq = to_vvq(_vq); /* Returns the data token of the next used buffer and stores its length in *len; NULL if the queue is broken, nothing is used, or the ring looks corrupt. */
+ u16 last_used, id;
+ void *ret;
+
+ START_USE(vq);
+
+ if (unlikely(vq->broken)) {
+ END_USE(vq);
+ return NULL;
+ }
+
+ if (!more_used_packed(vq)) {
+ pr_debug("No more buffers in queue\n");
+ END_USE(vq);
+ return NULL;
+ }
+
+ /* Only get used elements after they have been exposed by host. */
+ virtio_rmb(vq->weak_barriers);
+
+ last_used = vq->last_used_idx;
+ id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
+ *len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
+
+ if (unlikely(id >= vq->vring_packed.num)) {
+ BAD_RING(vq, "id %u out of range\n", id);
+ return NULL;
+ }
+ if (unlikely(!vq->desc_state[id].data)) {
+ BAD_RING(vq, "id %u is not a head!\n", id);
+ return NULL;
+ }
+
+ vq->last_used_idx += vq->desc_state[id].num; /* Step over every ring slot this buffer occupied. */
+ if (vq->last_used_idx >= vq->vring_packed.num)
+ vq->last_used_idx -= vq->vring_packed.num;
+
+ /* detach_buf_packed clears data, so grab it now. */
+ ret = vq->desc_state[id].data;
+ detach_buf_packed(vq, last_used, id, ctx); /* last_used still holds the buffer's first slot. */
+
+#ifdef DEBUG
+ vq->last_add_time_valid = false;
+#endif
+
+ END_USE(vq);
+ return ret;
}
static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
{
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) { /* The shadow copy lets us skip the ring write when already disabled. */
+ vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
+ vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+ vq->event_flags_shadow);
+ }
}
static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
{
- return 0;
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ START_USE(vq);
+
+ /* We optimistically turn back on interrupts, then check if there was
+ * more to do. */
+
+ if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) { /* Only write the ring when the shadow says events are currently disabled. */
+ virtio_wmb(vq->weak_barriers);
+ vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+ vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+ vq->event_flags_shadow);
+ }
+
+ END_USE(vq);
+ return vq->last_used_idx; /* Slot index that virtqueue_poll_packed() accepts as last_used_idx. */
}
static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
{
- return false;
+ struct vring_virtqueue *vq = to_vvq(_vq);
+ bool avail, used;
+ u16 flags;
+
+ virtio_mb(vq->weak_barriers);
+ flags = virtio16_to_cpu(vq->vq.vdev,
+ vq->vring_packed.desc[last_used_idx].flags);
+ avail = flags & VRING_DESC_F_AVAIL(1);
+ used = flags & VRING_DESC_F_USED(1);
+ return avail == used; /* Same AVAIL-equals-USED test as more_used_packed(), applied to the given slot. */
}
static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
{
- return false;
+ struct vring_virtqueue *vq = to_vvq(_vq);
+
+ START_USE(vq);
+
+ /* We optimistically turn back on interrupts, then check if there was
+ * more to do. */
+
+ if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) { /* Only write the ring when the shadow says events are currently disabled. */
+ virtio_wmb(vq->weak_barriers);
+ vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+ vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
+ vq->event_flags_shadow);
+ }
+
+ if (more_used_packed(vq)) {
+ END_USE(vq);
+ return false; /* A used buffer is already waiting. */
+ }
+
+ END_USE(vq);
+ return true;
}
static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
{
+ struct vring_virtqueue *vq = to_vvq(_vq); /* Detaches one outstanding buffer per call and returns its data token; NULL once none remain. */
+ u16 flags, head, id, i;
+ unsigned int len;
+ void *buf;
+
+ START_USE(vq);
+
+ /* Detach the used descriptors. */
+ if (more_used_packed(vq)) {
+ buf = virtqueue_get_buf_ctx_packed(_vq, &len, NULL); /* NOTE(review): the callee does its own START_USE/END_USE while this function is already inside START_USE - confirm this is fine with DEBUG. */
+ END_USE(vq);
+ return buf;
+ }
+
+ /* Detach the available descriptors. */
+ for (i = vq->last_used_idx; i != vq->next_avail_idx;
+ i = (i + 1) % vq->vring_packed.num) {
+ flags = virtio16_to_cpu(vq->vq.vdev,
+ vq->vring_packed.desc[i].flags);
+ while (flags & VRING_DESC_F_NEXT) { /* Walk to the last descriptor of the chain, where the add path stored the ID. */
+ i = (i + 1) % vq->vring_packed.num;
+ flags = virtio16_to_cpu(vq->vq.vdev,
+ vq->vring_packed.desc[i].flags);
+ }
+ id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[i].id);
+ if (!vq->desc_state[id].data)
+ continue;
+
+ len = vq->desc_state[id].num - 1; /* Number of slots that precede the last one. */
+ head = (i < len ? i + vq->vring_packed.num : i) - len; /* Step back to the first slot, wrapping if needed. */
+
+ /* detach_buf clears data, so grab it now. */
+ buf = vq->desc_state[id].data;
+ detach_buf_packed(vq, head, id, NULL);
+ END_USE(vq);
+ return buf;
+ }
+ /* That should have freed everything. */
+ BUG_ON(vq->vq.num_free != vq->vring_packed.num);
+
+ END_USE(vq);
return NULL;
}
@@ -1198,6 +1666,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
vq->next_avail_idx = 0;
vq->wrap_counter = 1;
vq->event_flags_shadow = 0;
+ idr_init(&vq->buffer_id);
} else {
vq->vring = vring.vring_split;
vq->avail_flags_shadow = 0;
@@ -1384,6 +1853,8 @@ void vring_del_virtqueue(struct virtqueue *_vq)
(void *)vq->vring.desc,
vq->queue_dma_addr);
}
+ if (vq->packed)
+ idr_destroy(&vq->buffer_id);
list_del(&_vq->list);
kfree(vq);
}
--
2.17.0
Signed-off-by: Tiwei Bie <[email protected]>
---
drivers/virtio/virtio_ring.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index de3839f3621a..b158692263b0 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1940,6 +1940,8 @@ void vring_transport_features(struct virtio_device *vdev)
break;
case VIRTIO_F_IOMMU_PLATFORM:
break;
+ case VIRTIO_F_RING_PACKED:
+ break;
default:
/* We don't understand this bit. */
__virtio_clear_bit(vdev, i);
--
2.17.0
This commit introduces EVENT_IDX support for the
packed ring.
Signed-off-by: Tiwei Bie <[email protected]>
---
drivers/virtio/virtio_ring.c | 75 +++++++++++++++++++++++++++++++++---
1 file changed, 70 insertions(+), 5 deletions(-)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index c6c5deb0e3ae..de3839f3621a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1006,7 +1006,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- u16 flags;
+ u16 new, old, off_wrap, flags, wrap_counter, event_idx;
bool needs_kick;
u32 snapshot;
@@ -1015,9 +1015,19 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
* suppressions. */
virtio_mb(vq->weak_barriers);
+ old = vq->next_avail_idx - vq->num_added;
+ new = vq->next_avail_idx;
+ vq->num_added = 0;
+
snapshot = *(u32 *)vq->vring_packed.device;
+ off_wrap = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot & 0xffff));
flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;
+ wrap_counter = off_wrap >> 15;
+ event_idx = off_wrap & ~(1<<15);
+ if (wrap_counter != vq->wrap_counter)
+ event_idx -= vq->vring_packed.num;
+
#ifdef DEBUG
if (vq->last_add_time_valid) {
WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
@@ -1026,7 +1036,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
vq->last_add_time_valid = false;
#endif
- needs_kick = (flags != VRING_EVENT_F_DISABLE);
+ if (flags == VRING_EVENT_F_DESC)
+ needs_kick = vring_need_event(event_idx, new, old);
+ else
+ needs_kick = (flags != VRING_EVENT_F_DISABLE);
END_USE(vq);
return needs_kick;
}
@@ -1098,7 +1111,7 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
void **ctx)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- u16 last_used, id;
+ u16 wrap_counter, last_used, id;
void *ret;
START_USE(vq);
@@ -1138,6 +1151,19 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
ret = vq->desc_state[id].data;
detach_buf_packed(vq, last_used, id, ctx);
+ wrap_counter = vq->wrap_counter;
+ if (vq->last_used_idx > vq->next_avail_idx)
+ wrap_counter ^= 1;
+
+ /* If we expect an interrupt for the next entry, tell host
+ * by writing event index and flush out the write before
+ * the read in the next get_buf call. */
+ if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
+ virtio_store_mb(vq->weak_barriers,
+ &vq->vring_packed.driver->off_wrap,
+ cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
+ (wrap_counter << 15)));
+
#ifdef DEBUG
vq->last_add_time_valid = false;
#endif
@@ -1160,15 +1186,27 @@ static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
+ u16 wrap_counter;
START_USE(vq);
/* We optimistically turn back on interrupts, then check if there was
* more to do. */
+ /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+ * either clear the flags bit or point the event index at the next
+ * entry. Always update the event index to keep code simple. */
+
+ wrap_counter = vq->wrap_counter;
+ if (vq->last_used_idx > vq->next_avail_idx)
+ wrap_counter ^= 1;
+
+ vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
+ vq->last_used_idx | (wrap_counter << 15));
if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
virtio_wmb(vq->weak_barriers);
- vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+ vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
+ VRING_EVENT_F_ENABLE;
vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
vq->event_flags_shadow);
}
@@ -1194,15 +1232,40 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
{
struct vring_virtqueue *vq = to_vvq(_vq);
+ u16 bufs, used_idx, wrap_counter;
START_USE(vq);
/* We optimistically turn back on interrupts, then check if there was
* more to do. */
+ /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
+ * either clear the flags bit or point the event index at the next
+ * entry. Always update the event index to keep code simple. */
+
+ /* TODO: tune this threshold */
+ if (vq->next_avail_idx < vq->last_used_idx)
+ bufs = (vq->vring_packed.num + vq->next_avail_idx -
+ vq->last_used_idx) * 3 / 4;
+ else
+ bufs = (vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
+
+ wrap_counter = vq->wrap_counter;
+ if (vq->last_used_idx > vq->next_avail_idx)
+ wrap_counter ^= 1;
+
+ used_idx = vq->last_used_idx + bufs;
+ if (used_idx >= vq->vring_packed.num) {
+ used_idx -= vq->vring_packed.num;
+ wrap_counter ^= 1;
+ }
+
+ vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
+ used_idx | (wrap_counter << 15));
if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
virtio_wmb(vq->weak_barriers);
- vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
+ vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
+ VRING_EVENT_F_ENABLE;
vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
vq->event_flags_shadow);
}
@@ -1869,8 +1932,10 @@ void vring_transport_features(struct virtio_device *vdev)
switch (i) {
case VIRTIO_RING_F_INDIRECT_DESC:
break;
+#if 0
case VIRTIO_RING_F_EVENT_IDX:
break;
+#endif
case VIRTIO_F_VERSION_1:
break;
case VIRTIO_F_IOMMU_PLATFORM:
--
2.17.0
Signed-off-by: Tiwei Bie <[email protected]>
---
include/uapi/linux/virtio_config.h | 12 +++++++++-
include/uapi/linux/virtio_ring.h | 36 ++++++++++++++++++++++++++++++
2 files changed, 47 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 308e2096291f..a6e392325e3a 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -49,7 +49,7 @@
* transport being used (eg. virtio_ring), the rest are per-device feature
* bits. */
#define VIRTIO_TRANSPORT_F_START 28
-#define VIRTIO_TRANSPORT_F_END 34
+#define VIRTIO_TRANSPORT_F_END 36
#ifndef VIRTIO_CONFIG_NO_LEGACY
/* Do we get callbacks when the ring is completely used, even if we've
@@ -71,4 +71,14 @@
* this is for compatibility with legacy systems.
*/
#define VIRTIO_F_IOMMU_PLATFORM 33
+
+/* This feature indicates support for the packed virtqueue layout. */
+#define VIRTIO_F_RING_PACKED 34
+
+/*
+ * This feature indicates that all buffers are used by the device
+ * in the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER 35
+
#endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5faa989b..3932cb80c347 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -44,6 +44,9 @@
/* This means the buffer contains a list of buffer descriptors. */
#define VRING_DESC_F_INDIRECT 4
+#define VRING_DESC_F_AVAIL(b) ((b) << 7)
+#define VRING_DESC_F_USED(b) ((b) << 15)
+
/* The Host uses this in used->flags to advise the Guest: don't kick me when
* you add a buffer. It's unreliable, so it's simply an optimization. Guest
* will still kick if it's out of buffers. */
@@ -53,6 +56,10 @@
* optimization. */
#define VRING_AVAIL_F_NO_INTERRUPT 1
+#define VRING_EVENT_F_ENABLE 0x0
+#define VRING_EVENT_F_DISABLE 0x1
+#define VRING_EVENT_F_DESC 0x2
+
/* We support indirect buffer descriptors */
#define VIRTIO_RING_F_INDIRECT_DESC 28
@@ -171,4 +178,33 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
}
+struct vring_packed_desc_event { /* Event suppression area; the ring has one for the driver and one for the device. */
+ /* __virtio16 off : 15; // Descriptor Event Offset
+ * __virtio16 wrap : 1; // Descriptor Event Wrap Counter */
+ __virtio16 off_wrap;
+ /* __virtio16 flags : 2; // Descriptor Event Flags */
+ __virtio16 flags; /* One of the VRING_EVENT_F_* values. */
+};
+
+struct vring_packed_desc { /* One descriptor of the packed ring (also used for indirect table entries). */
+ /* Buffer Address. */
+ __virtio64 addr;
+ /* Buffer Length. */
+ __virtio32 len;
+ /* Buffer ID. */
+ __virtio16 id;
+ /* The flags depending on descriptor type. */
+ __virtio16 flags; /* VRING_DESC_F_* bits, including VRING_DESC_F_AVAIL (bit 7) and VRING_DESC_F_USED (bit 15). */
+};
+
+struct vring_packed { /* Pointers to the parts of one packed virtqueue. */
+ unsigned int num; /* Number of descriptors in the ring. */
+
+ struct vring_packed_desc *desc; /* The descriptor ring itself. */
+
+ struct vring_packed_desc_event *driver; /* Driver event suppression area (its flags are written by the driver). */
+
+ struct vring_packed_desc_event *device; /* Device event suppression area (read by the driver before kicking). */
+};
+
#endif /* _UAPI_LINUX_VIRTIO_RING_H */
--
2.17.0
On Wed, May 16, 2018 at 01:15:48PM +0300, Sergei Shtylyov wrote:
> On 5/16/2018 11:37 AM, Tiwei Bie wrote:
>
> > Signed-off-by: Tiwei Bie <[email protected]>
> > ---
> > drivers/virtio/virtio_ring.c | 2 ++
> > 1 file changed, 2 insertions(+)
> >
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index de3839f3621a..b158692263b0 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -1940,6 +1940,8 @@ void vring_transport_features(struct virtio_device *vdev)
> > break;
> > case VIRTIO_F_IOMMU_PLATFORM:
> > break;
> > + case VIRTIO_F_RING_PACKED:
> > + break;
>
> Why not just add this *case* under the previous *case*?
Do you mean fallthrough? Something like:
case VIRTIO_F_IOMMU_PLATFORM:
case VIRTIO_F_RING_PACKED:
break;
Best regards,
Tiwei Bie
>
> > default:
> > /* We don't understand this bit. */
> > __virtio_clear_bit(vdev, i);
>
> MBR, Sergei
On 5/16/2018 11:37 AM, Tiwei Bie wrote:
> Signed-off-by: Tiwei Bie <[email protected]>
> ---
> drivers/virtio/virtio_ring.c | 2 ++
> 1 file changed, 2 insertions(+)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index de3839f3621a..b158692263b0 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1940,6 +1940,8 @@ void vring_transport_features(struct virtio_device *vdev)
> break;
> case VIRTIO_F_IOMMU_PLATFORM:
> break;
> + case VIRTIO_F_RING_PACKED:
> + break;
Why not just add this *case* under the previous *case*?
> default:
> /* We don't understand this bit. */
> __virtio_clear_bit(vdev, i);
MBR, Sergei
On 05/16/2018 01:21 PM, Tiwei Bie wrote:
>>> Signed-off-by: Tiwei Bie <[email protected]>
>>> ---
>>> drivers/virtio/virtio_ring.c | 2 ++
>>> 1 file changed, 2 insertions(+)
>>>
>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
>>> index de3839f3621a..b158692263b0 100644
>>> --- a/drivers/virtio/virtio_ring.c
>>> +++ b/drivers/virtio/virtio_ring.c
>>> @@ -1940,6 +1940,8 @@ void vring_transport_features(struct virtio_device *vdev)
>>> break;
>>> case VIRTIO_F_IOMMU_PLATFORM:
>>> break;
>>> + case VIRTIO_F_RING_PACKED:
>>> + break;
>>
>> Why not just add this *case* under the previous *case*?
>
> Do you mean fallthrough? Something like:
>
> case VIRTIO_F_IOMMU_PLATFORM:
> case VIRTIO_F_RING_PACKED:
> break;
Yes, exactly. :-)
> Best regards,
> Tiwei Bie
[...]
MBR, Sergei
On 2018年05月16日 16:37, Tiwei Bie wrote:
> This commit introduces the basic support (without EVENT_IDX)
> for packed ring.
>
> Signed-off-by: Tiwei Bie <[email protected]>
> ---
> drivers/virtio/virtio_ring.c | 491 ++++++++++++++++++++++++++++++++++-
> 1 file changed, 481 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 62d7c407841a..c6c5deb0e3ae 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -58,7 +58,8 @@
>
> struct vring_desc_state {
> void *data; /* Data for callback. */
> - struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
> + void *indir_desc; /* Indirect descriptor, if any. */
> + int num; /* Descriptor list length. */
> };
>
> struct vring_virtqueue {
> @@ -116,6 +117,9 @@ struct vring_virtqueue {
> /* Last written value to driver->flags in
> * guest byte order. */
> u16 event_flags_shadow;
> +
> + /* ID allocation. */
> + struct idr buffer_id;
I'm not sure idr is a good fit for the performance-critical path here. We need to
measure its performance impact, especially when there are only a few unused slots.
> };
> };
>
> @@ -142,6 +146,16 @@ struct vring_virtqueue {
>
> #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
>
> +static inline bool virtqueue_use_indirect(struct virtqueue *_vq,
> + unsigned int total_sg)
> +{
> + struct vring_virtqueue *vq = to_vvq(_vq);
> +
> + /* If the host supports indirect descriptor tables, and we have multiple
> + * buffers, then go indirect. FIXME: tune this threshold */
> + return (vq->indirect && total_sg > 1 && vq->vq.num_free);
> +}
> +
> /*
> * Modern virtio devices have feature bits to specify whether they need a
> * quirk and bypass the IOMMU. If not there, just use the DMA API.
> @@ -327,9 +341,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq,
>
> head = vq->free_head;
>
> - /* If the host supports indirect descriptor tables, and we have multiple
> - * buffers, then go indirect. FIXME: tune this threshold */
> - if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> + if (virtqueue_use_indirect(_vq, total_sg))
> desc = alloc_indirect_split(_vq, total_sg, gfp);
> else {
> desc = NULL;
> @@ -741,6 +753,63 @@ static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
> & ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> }
>
> +static void vring_unmap_one_packed(const struct vring_virtqueue *vq,
> + struct vring_packed_desc *desc)
> +{
> + u16 flags;
> +
> + if (!vring_use_dma_api(vq->vq.vdev))
> + return;
> +
> + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +
> + if (flags & VRING_DESC_F_INDIRECT) {
> + dma_unmap_single(vring_dma_dev(vq),
> + virtio64_to_cpu(vq->vq.vdev, desc->addr),
> + virtio32_to_cpu(vq->vq.vdev, desc->len),
> + (flags & VRING_DESC_F_WRITE) ?
> + DMA_FROM_DEVICE : DMA_TO_DEVICE);
> + } else {
> + dma_unmap_page(vring_dma_dev(vq),
> + virtio64_to_cpu(vq->vq.vdev, desc->addr),
> + virtio32_to_cpu(vq->vq.vdev, desc->len),
> + (flags & VRING_DESC_F_WRITE) ?
> + DMA_FROM_DEVICE : DMA_TO_DEVICE);
> + }
> +}
> +
> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> + unsigned int total_sg,
> + gfp_t gfp)
> +{
> + struct vring_packed_desc *desc;
> +
> + /*
> + * We require lowmem mappings for the descriptors because
> + * otherwise virt_to_phys will give us bogus addresses in the
> + * virtqueue.
> + */
> + gfp &= ~__GFP_HIGHMEM;
> +
> + desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> +
> + return desc;
> +}
> +
> +static u16 alloc_id_packed(struct vring_virtqueue *vq)
> +{
> + u16 id;
> +
> + id = idr_alloc(&vq->buffer_id, NULL, 0, vq->vring_packed.num,
> + GFP_KERNEL);
> + return id;
> +}
> +
> +static void free_id_packed(struct vring_virtqueue *vq, u16 id)
> +{
> + idr_remove(&vq->buffer_id, id);
> +}
> +
> static inline int virtqueue_add_packed(struct virtqueue *_vq,
> struct scatterlist *sgs[],
> unsigned int total_sg,
> @@ -750,47 +819,446 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> void *ctx,
> gfp_t gfp)
> {
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + struct vring_packed_desc *desc;
> + struct scatterlist *sg;
> + unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> + __virtio16 uninitialized_var(head_flags), flags;
> + u16 head, wrap_counter, id;
> + bool indirect;
> +
> + START_USE(vq);
> +
> + BUG_ON(data == NULL);
> + BUG_ON(ctx && vq->indirect);
> +
> + if (unlikely(vq->broken)) {
> + END_USE(vq);
> + return -EIO;
> + }
> +
> +#ifdef DEBUG
> + {
> + ktime_t now = ktime_get();
> +
> + /* No kick or get, with .1 second between? Warn. */
> + if (vq->last_add_time_valid)
> + WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> + > 100);
> + vq->last_add_time = now;
> + vq->last_add_time_valid = true;
> + }
> +#endif
> +
> + BUG_ON(total_sg == 0);
> +
> + head = vq->next_avail_idx;
> + wrap_counter = vq->wrap_counter;
> +
> + if (virtqueue_use_indirect(_vq, total_sg))
> + desc = alloc_indirect_packed(_vq, total_sg, gfp);
> + else {
> + desc = NULL;
> + WARN_ON_ONCE(total_sg > vq->vring_packed.num && !vq->indirect);
> + }
> +
> + if (desc) {
> + /* Use a single buffer which doesn't continue */
> + indirect = true;
> + /* Set up rest to use this indirect table. */
> + i = 0;
> + descs_used = 1;
> + } else {
> + indirect = false;
> + desc = vq->vring_packed.desc;
> + i = head;
> + descs_used = total_sg;
> + }
> +
> + if (vq->vq.num_free < descs_used) {
> + pr_debug("Can't add buf len %i - avail = %i\n",
> + descs_used, vq->vq.num_free);
> + /* FIXME: for historical reasons, we force a notify here if
> + * there are outgoing parts to the buffer. Presumably the
> + * host should service the ring ASAP. */
> + if (out_sgs)
> + vq->notify(&vq->vq);
> + if (indirect)
> + kfree(desc);
> + END_USE(vq);
> + return -ENOSPC;
> + }
> +
> + id = alloc_id_packed(vq);
> +
> + for (n = 0; n < out_sgs + in_sgs; n++) {
> + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> + dma_addr_t addr = vring_map_one_sg(vq, sg, n < out_sgs ?
> + DMA_TO_DEVICE : DMA_FROM_DEVICE);
> + if (vring_mapping_error(vq, addr))
> + goto unmap_release;
> +
> + flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> + (n < out_sgs ? 0 : VRING_DESC_F_WRITE) |
> + VRING_DESC_F_AVAIL(vq->wrap_counter) |
> + VRING_DESC_F_USED(!vq->wrap_counter));
> + if (!indirect && i == head)
> + head_flags = flags;
> + else
> + desc[i].flags = flags;
> +
> + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> + i++;
> + if (!indirect && i >= vq->vring_packed.num) {
> + i = 0;
> + vq->wrap_counter ^= 1;
> + }
> + }
> + }
> +
> + prev = (i > 0 ? i : vq->vring_packed.num) - 1;
> + desc[prev].id = cpu_to_virtio16(_vq->vdev, id);
> +
> + /* Last one doesn't continue. */
> + if (total_sg == 1)
> + head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> + else
> + desc[prev].flags &= cpu_to_virtio16(_vq->vdev,
> + ~VRING_DESC_F_NEXT);
> +
> + if (indirect) {
> + /* Now that the indirect table is filled in, map it. */
> + dma_addr_t addr = vring_map_single(
> + vq, desc, total_sg * sizeof(struct vring_packed_desc),
> + DMA_TO_DEVICE);
> + if (vring_mapping_error(vq, addr))
> + goto unmap_release;
> +
> + head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> + VRING_DESC_F_AVAIL(wrap_counter) |
> + VRING_DESC_F_USED(!wrap_counter));
> + vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev,
> + addr);
> + vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> + total_sg * sizeof(struct vring_packed_desc));
> + vq->vring_packed.desc[head].id = cpu_to_virtio16(_vq->vdev, id);
> + }
> +
> + /* We're using some buffers from the free list. */
> + vq->vq.num_free -= descs_used;
> +
> + /* Update free pointer */
> + if (indirect) {
> + n = head + 1;
> + if (n >= vq->vring_packed.num) {
> + n = 0;
> + vq->wrap_counter ^= 1;
> + }
> + vq->next_avail_idx = n;
> + } else
> + vq->next_avail_idx = i;
> +
> + /* Store token and indirect buffer state. */
> + vq->desc_state[id].num = descs_used;
> + vq->desc_state[id].data = data;
> + if (indirect)
> + vq->desc_state[id].indir_desc = desc;
> + else
> + vq->desc_state[id].indir_desc = ctx;
> +
> + /* A driver MUST NOT make the first descriptor in the list
> + * available before all subsequent descriptors comprising
> + * the list are made available. */
> + virtio_wmb(vq->weak_barriers);
> + vq->vring_packed.desc[head].flags = head_flags;
> + vq->num_added += descs_used;
> +
> + pr_debug("Added buffer head %i to %p\n", head, vq);
> + END_USE(vq);
> +
> + return 0;
> +
> +unmap_release:
> + err_idx = i;
> + i = head;
> +
> + for (n = 0; n < total_sg; n++) {
> + if (i == err_idx)
> + break;
> + vring_unmap_one_packed(vq, &desc[i]);
> + i++;
> + if (!indirect && i >= vq->vring_packed.num)
> + i = 0;
> + }
> +
> + vq->wrap_counter = wrap_counter;
> +
> + if (indirect)
> + kfree(desc);
> +
> + free_id_packed(vq, id);
> +
> + END_USE(vq);
> return -EIO;
> }
>
> static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> {
> - return false;
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 flags;
> + bool needs_kick;
> + u32 snapshot;
> +
> + START_USE(vq);
> + /* We need to expose the new flags value before checking notification
> + * suppressions. */
> + virtio_mb(vq->weak_barriers);
> +
> + snapshot = *(u32 *)vq->vring_packed.device;
> + flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;
> +
> +#ifdef DEBUG
> + if (vq->last_add_time_valid) {
> + WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> + vq->last_add_time)) > 100);
> + }
> + vq->last_add_time_valid = false;
> +#endif
> +
> + needs_kick = (flags != VRING_EVENT_F_DISABLE);
> + END_USE(vq);
> + return needs_kick;
> +}
> +
> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> + unsigned int id, void **ctx)
> +{
> + struct vring_packed_desc *desc;
> + unsigned int i, j;
> +
> + /* Clear data ptr. */
> + vq->desc_state[id].data = NULL;
> +
> + i = head;
> +
> + for (j = 0; j < vq->desc_state[id].num; j++) {
> + desc = &vq->vring_packed.desc[i];
> + vring_unmap_one_packed(vq, desc);
As mentioned in the previous discussion, this probably won't work for the
case of out-of-order completion, since it depends on the information in
the descriptor ring. We probably need to extend ctx to record such
information.
Thanks
> + i++;
> + if (i >= vq->vring_packed.num)
> + i = 0;
> + }
> +
> + vq->vq.num_free += vq->desc_state[id].num;
> +
> + if (vq->indirect) {
> + u32 len;
> +
> + /* Free the indirect table, if any, now that it's unmapped. */
> + desc = vq->desc_state[id].indir_desc;
> + if (!desc)
> + goto out;
> +
> + len = virtio32_to_cpu(vq->vq.vdev,
> + vq->vring_packed.desc[head].len);
> +
> + for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> + vring_unmap_one_packed(vq, &desc[j]);
> +
> + kfree(desc);
> + vq->desc_state[id].indir_desc = NULL;
> + } else if (ctx) {
> + *ctx = vq->desc_state[id].indir_desc;
> + }
> +
> +out:
> + free_id_packed(vq, id);
> }
>
> static inline bool more_used_packed(const struct vring_virtqueue *vq)
> {
> - return false;
> + u16 last_used, flags;
> + bool avail, used;
> +
> + if (vq->vq.num_free == vq->vring_packed.num)
> + return false;
> +
> + last_used = vq->last_used_idx;
> + flags = virtio16_to_cpu(vq->vq.vdev,
> + vq->vring_packed.desc[last_used].flags);
> + avail = flags & VRING_DESC_F_AVAIL(1);
> + used = flags & VRING_DESC_F_USED(1);
> +
> + return avail == used;
> }
>
> static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> unsigned int *len,
> void **ctx)
> {
> - return NULL;
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 last_used, id;
> + void *ret;
> +
> + START_USE(vq);
> +
> + if (unlikely(vq->broken)) {
> + END_USE(vq);
> + return NULL;
> + }
> +
> + if (!more_used_packed(vq)) {
> + pr_debug("No more buffers in queue\n");
> + END_USE(vq);
> + return NULL;
> + }
> +
> + /* Only get used elements after they have been exposed by host. */
> + virtio_rmb(vq->weak_barriers);
> +
> + last_used = vq->last_used_idx;
> + id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
> + *len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
> +
> + if (unlikely(id >= vq->vring_packed.num)) {
> + BAD_RING(vq, "id %u out of range\n", id);
> + return NULL;
> + }
> + if (unlikely(!vq->desc_state[id].data)) {
> + BAD_RING(vq, "id %u is not a head!\n", id);
> + return NULL;
> + }
> +
> + vq->last_used_idx += vq->desc_state[id].num;
> + if (vq->last_used_idx >= vq->vring_packed.num)
> + vq->last_used_idx -= vq->vring_packed.num;
> +
> + /* detach_buf_packed clears data, so grab it now. */
> + ret = vq->desc_state[id].data;
> + detach_buf_packed(vq, last_used, id, ctx);
> +
> +#ifdef DEBUG
> + vq->last_add_time_valid = false;
> +#endif
> +
> + END_USE(vq);
> + return ret;
> }
>
> static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
> {
> + struct vring_virtqueue *vq = to_vvq(_vq);
> +
> + if (vq->event_flags_shadow != VRING_EVENT_F_DISABLE) {
> + vq->event_flags_shadow = VRING_EVENT_F_DISABLE;
> + vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> + vq->event_flags_shadow);
> + }
> }
>
> static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> {
> - return 0;
> + struct vring_virtqueue *vq = to_vvq(_vq);
> +
> + START_USE(vq);
> +
> + /* We optimistically turn back on interrupts, then check if there was
> + * more to do. */
> +
> + if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> + virtio_wmb(vq->weak_barriers);
> + vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> + vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> + vq->event_flags_shadow);
> + }
> +
> + END_USE(vq);
> + return vq->last_used_idx;
> }
>
> static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> {
> - return false;
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + bool avail, used;
> + u16 flags;
> +
> + virtio_mb(vq->weak_barriers);
> + flags = virtio16_to_cpu(vq->vq.vdev,
> + vq->vring_packed.desc[last_used_idx].flags);
> + avail = flags & VRING_DESC_F_AVAIL(1);
> + used = flags & VRING_DESC_F_USED(1);
> + return avail == used;
> }
>
> static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> {
> - return false;
> + struct vring_virtqueue *vq = to_vvq(_vq);
> +
> + START_USE(vq);
> +
> + /* We optimistically turn back on interrupts, then check if there was
> + * more to do. */
> +
> + if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> + virtio_wmb(vq->weak_barriers);
> + vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> + vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> + vq->event_flags_shadow);
> + }
> +
> + if (more_used_packed(vq)) {
> + END_USE(vq);
> + return false;
> + }
> +
> + END_USE(vq);
> + return true;
> }
>
> static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq)
> {
> + struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 flags, head, id, i;
> + unsigned int len;
> + void *buf;
> +
> + START_USE(vq);
> +
> + /* Detach the used descriptors. */
> + if (more_used_packed(vq)) {
> + buf = virtqueue_get_buf_ctx_packed(_vq, &len, NULL);
> + END_USE(vq);
> + return buf;
> + }
> +
> + /* Detach the available descriptors. */
> + for (i = vq->last_used_idx; i != vq->next_avail_idx;
> + i = (i + 1) % vq->vring_packed.num) {
> + flags = virtio16_to_cpu(vq->vq.vdev,
> + vq->vring_packed.desc[i].flags);
> + while (flags & VRING_DESC_F_NEXT) {
> + i = (i + 1) % vq->vring_packed.num;
> + flags = virtio16_to_cpu(vq->vq.vdev,
> + vq->vring_packed.desc[i].flags);
> + }
> + id = virtio16_to_cpu(_vq->vdev, vq->vring_packed.desc[i].id);
> + if (!vq->desc_state[id].data)
> + continue;
> +
> + len = vq->desc_state[id].num - 1;
> + head = (i < len ? i + vq->vring_packed.num : i) - len;
> +
> + /* detach_buf clears data, so grab it now. */
> + buf = vq->desc_state[id].data;
> + detach_buf_packed(vq, head, id, NULL);
> + END_USE(vq);
> + return buf;
> + }
> + /* That should have freed everything. */
> + BUG_ON(vq->vq.num_free != vq->vring_packed.num);
> +
> + END_USE(vq);
> return NULL;
> }
>
> @@ -1198,6 +1666,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> vq->next_avail_idx = 0;
> vq->wrap_counter = 1;
> vq->event_flags_shadow = 0;
> + idr_init(&vq->buffer_id);
> } else {
> vq->vring = vring.vring_split;
> vq->avail_flags_shadow = 0;
> @@ -1384,6 +1853,8 @@ void vring_del_virtqueue(struct virtqueue *_vq)
> (void *)vq->vring.desc,
> vq->queue_dma_addr);
> }
> + if (vq->packed)
> + idr_destroy(&vq->buffer_id);
> list_del(&_vq->list);
> kfree(vq);
> }
On 2018年05月16日 16:37, Tiwei Bie wrote:
> This commit introduces the event idx support in
> packed ring.
>
> Signed-off-by: Tiwei Bie <[email protected]>
> ---
> drivers/virtio/virtio_ring.c | 75 +++++++++++++++++++++++++++++++++---
> 1 file changed, 70 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index c6c5deb0e3ae..de3839f3621a 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -1006,7 +1006,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
> static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> - u16 flags;
> + u16 new, old, off_wrap, flags, wrap_counter, event_idx;
> bool needs_kick;
> u32 snapshot;
>
> @@ -1015,9 +1015,19 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> * suppressions. */
> virtio_mb(vq->weak_barriers);
>
> + old = vq->next_avail_idx - vq->num_added;
> + new = vq->next_avail_idx;
> + vq->num_added = 0;
> +
> snapshot = *(u32 *)vq->vring_packed.device;
> + off_wrap = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot & 0xffff));
> flags = virtio16_to_cpu(_vq->vdev, (__virtio16)(snapshot >> 16)) & 0x3;
>
> + wrap_counter = off_wrap >> 15;
> + event_idx = off_wrap & ~(1<<15);
> + if (wrap_counter != vq->wrap_counter)
> + event_idx -= vq->vring_packed.num;
> +
> #ifdef DEBUG
> if (vq->last_add_time_valid) {
> WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
> @@ -1026,7 +1036,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
> vq->last_add_time_valid = false;
> #endif
>
> - needs_kick = (flags != VRING_EVENT_F_DISABLE);
> + if (flags == VRING_EVENT_F_DESC)
> + needs_kick = vring_need_event(event_idx, new, old);
> + else
> + needs_kick = (flags != VRING_EVENT_F_DISABLE);
> END_USE(vq);
> return needs_kick;
> }
> @@ -1098,7 +1111,7 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> void **ctx)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> - u16 last_used, id;
> + u16 wrap_counter, last_used, id;
> void *ret;
>
> START_USE(vq);
> @@ -1138,6 +1151,19 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
> ret = vq->desc_state[id].data;
> detach_buf_packed(vq, last_used, id, ctx);
>
> + wrap_counter = vq->wrap_counter;
> + if (vq->last_used_idx > vq->next_avail_idx)
> + wrap_counter ^= 1;
> +
> + /* If we expect an interrupt for the next entry, tell host
> + * by writing event index and flush out the write before
> + * the read in the next get_buf call. */
> + if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> + virtio_store_mb(vq->weak_barriers,
> + &vq->vring_packed.driver->off_wrap,
> + cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> + (wrap_counter << 15)));
> +
> #ifdef DEBUG
> vq->last_add_time_valid = false;
> #endif
> @@ -1160,15 +1186,27 @@ static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
> static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 wrap_counter;
>
> START_USE(vq);
>
> /* We optimistically turn back on interrupts, then check if there was
> * more to do. */
> + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> + * either clear the flags bit or point the event index at the next
> + * entry. Always update the event index to keep code simple. */
> +
> + wrap_counter = vq->wrap_counter;
> + if (vq->last_used_idx > vq->next_avail_idx)
Should this be ">=", considering that rx refill may try to completely fill the ring?
> + wrap_counter ^= 1;
> +
> + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> + vq->last_used_idx | (wrap_counter << 15));
>
> if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> virtio_wmb(vq->weak_barriers);
> - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> + VRING_EVENT_F_ENABLE;
> vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> vq->event_flags_shadow);
> }
> @@ -1194,15 +1232,40 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> {
> struct vring_virtqueue *vq = to_vvq(_vq);
> + u16 bufs, used_idx, wrap_counter;
>
> START_USE(vq);
>
> /* We optimistically turn back on interrupts, then check if there was
> * more to do. */
> + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> + * either clear the flags bit or point the event index at the next
> + * entry. Always update the event index to keep code simple. */
> +
> + /* TODO: tune this threshold */
> + if (vq->next_avail_idx < vq->last_used_idx)
> + bufs = (vq->vring_packed.num + vq->next_avail_idx -
> + vq->last_used_idx) * 3 / 4;
> + else
> + bufs = (vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> +
> + wrap_counter = vq->wrap_counter;
> + if (vq->last_used_idx > vq->next_avail_idx)
> + wrap_counter ^= 1;
> +
> + used_idx = vq->last_used_idx + bufs;
> + if (used_idx >= vq->vring_packed.num) {
> + used_idx -= vq->vring_packed.num;
> + wrap_counter ^= 1;
> + }
Looks correct but maybe it's better to add some comments for such logic.
> +
> + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> + used_idx | (wrap_counter << 15));
>
> if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> virtio_wmb(vq->weak_barriers);
> - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> + VRING_EVENT_F_ENABLE;
> vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> vq->event_flags_shadow);
> }
> @@ -1869,8 +1932,10 @@ void vring_transport_features(struct virtio_device *vdev)
> switch (i) {
> case VIRTIO_RING_F_INDIRECT_DESC:
> break;
> +#if 0
> case VIRTIO_RING_F_EVENT_IDX:
> break;
> +#endif
Maybe it's time to remove this #if 0.
Thanks
> case VIRTIO_F_VERSION_1:
> break;
> case VIRTIO_F_IOMMU_PLATFORM:
On Wed, May 16, 2018 at 02:42:53PM +0300, Sergei Shtylyov wrote:
> On 05/16/2018 01:21 PM, Tiwei Bie wrote:
>
> >>> Signed-off-by: Tiwei Bie <[email protected]>
> >>> ---
> >>> drivers/virtio/virtio_ring.c | 2 ++
> >>> 1 file changed, 2 insertions(+)
> >>>
> >>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> >>> index de3839f3621a..b158692263b0 100644
> >>> --- a/drivers/virtio/virtio_ring.c
> >>> +++ b/drivers/virtio/virtio_ring.c
> >>> @@ -1940,6 +1940,8 @@ void vring_transport_features(struct virtio_device *vdev)
> >>> break;
> >>> case VIRTIO_F_IOMMU_PLATFORM:
> >>> break;
> >>> + case VIRTIO_F_RING_PACKED:
> >>> + break;
> >>
> >> Why not just add this *case* under the previous *case*?
> >
> > Do you mean fallthrough? Something like:
> >
> > case VIRTIO_F_IOMMU_PLATFORM:
> > case VIRTIO_F_RING_PACKED:
> > break;
>
> Yes, exactly. :-)
Using fallthrough in this case will make the code more
compact. I like such coding style. But unfortunately,
it's not consistent with the existing code. :(
The whole function will become something like this:
void vring_transport_features(struct virtio_device *vdev)
{
unsigned int i;
for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
switch (i) {
case VIRTIO_RING_F_INDIRECT_DESC:
break;
case VIRTIO_RING_F_EVENT_IDX:
break;
case VIRTIO_F_VERSION_1:
break;
case VIRTIO_F_IOMMU_PLATFORM:
case VIRTIO_F_RING_PACKED:
break;
default:
/* We don't understand this bit. */
__virtio_clear_bit(vdev, i);
}
}
}
Best regards,
Tiwei Bie
>
> > Best regards,
> > Tiwei Bie
>
> [...]
>
> MBR, Sergei
>
On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
> On 2018年05月16日 16:37, Tiwei Bie wrote:
[...]
> > struct vring_virtqueue {
> > @@ -116,6 +117,9 @@ struct vring_virtqueue {
> > /* Last written value to driver->flags in
> > * guest byte order. */
> > u16 event_flags_shadow;
> > +
> > + /* ID allocation. */
> > + struct idr buffer_id;
>
> I'm not sure idr is fit for the performance critical case here. Need to
> measure its performance impact, especially if we have few unused slots.
I'm also not sure.. But fortunately, it should be quite easy
to replace it with something else without changing other code.
If it really hurts the performance, I'll change it.
>
> > };
> > };
[...]
> > +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > + unsigned int id, void **ctx)
> > +{
> > + struct vring_packed_desc *desc;
> > + unsigned int i, j;
> > +
> > + /* Clear data ptr. */
> > + vq->desc_state[id].data = NULL;
> > +
> > + i = head;
> > +
> > + for (j = 0; j < vq->desc_state[id].num; j++) {
> > + desc = &vq->vring_packed.desc[i];
> > + vring_unmap_one_packed(vq, desc);
>
> As mentioned in previous discussion, this probably won't work for the case
> of out of order completion since it depends on the information in the
> descriptor ring. We probably need to extend ctx to record such information.
The above code doesn't depend on the information in the descriptor
ring. The vq->desc_state[] is the extended ctx.
Best regards,
Tiwei Bie
On 2018年05月16日 20:39, Tiwei Bie wrote:
> On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
>> On 2018年05月16日 16:37, Tiwei Bie wrote:
> [...]
>>> struct vring_virtqueue {
>>> @@ -116,6 +117,9 @@ struct vring_virtqueue {
>>> /* Last written value to driver->flags in
>>> * guest byte order. */
>>> u16 event_flags_shadow;
>>> +
>>> + /* ID allocation. */
>>> + struct idr buffer_id;
>> I'm not sure idr is fit for the performance critical case here. Need to
>> measure its performance impact, especially if we have few unused slots.
> I'm also not sure.. But fortunately, it should be quite easy
> to replace it with something else without changing other code.
> If it will really hurt the performance, I'll change it.
We may want to do some benchmarking/profiling to see.
>
>>> };
>>> };
> [...]
>>> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>> + unsigned int id, void **ctx)
>>> +{
>>> + struct vring_packed_desc *desc;
>>> + unsigned int i, j;
>>> +
>>> + /* Clear data ptr. */
>>> + vq->desc_state[id].data = NULL;
>>> +
>>> + i = head;
>>> +
>>> + for (j = 0; j < vq->desc_state[id].num; j++) {
>>> + desc = &vq->vring_packed.desc[i];
>>> + vring_unmap_one_packed(vq, desc);
>> As mentioned in previous discussion, this probably won't work for the case
>> of out of order completion since it depends on the information in the
>> descriptor ring. We probably need to extend ctx to record such information.
> Above code doesn't depend on the information in the descriptor
> ring. The vq->desc_state[] is the extended ctx.
>
> Best regards,
> Tiwei Bie
Yes, but desc is a pointer into the descriptor ring, so I think
vring_unmap_one_packed() still depends on the content of the descriptor ring?
Thanks
On Wed, May 16, 2018 at 08:17:21PM +0800, Jason Wang wrote:
> On 2018年05月16日 16:37, Tiwei Bie wrote:
[...]
> > @@ -1160,15 +1186,27 @@ static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
> > static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> > {
> > struct vring_virtqueue *vq = to_vvq(_vq);
> > + u16 wrap_counter;
> > START_USE(vq);
> > /* We optimistically turn back on interrupts, then check if there was
> > * more to do. */
> > + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > + * either clear the flags bit or point the event index at the next
> > + * entry. Always update the event index to keep code simple. */
> > +
> > + wrap_counter = vq->wrap_counter;
> > + if (vq->last_used_idx > vq->next_avail_idx)
>
> Should this be ">=" consider rx refill may try to completely fill the ring?
It seems that there are two cases in which last_used_idx
equals next_avail_idx. The first one is that the
ring is empty. And the second one is that the ring
is full. Although in the first case, most probably,
the driver won't enable the interrupt.
Maybe I really should track the used_wrap_counter
instead of calculating it each time I need it.. I'll
give it a try..
>
> > + wrap_counter ^= 1;
> > +
> > + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > + vq->last_used_idx | (wrap_counter << 15));
> > if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > virtio_wmb(vq->weak_barriers);
> > - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > + VRING_EVENT_F_ENABLE;
> > vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > vq->event_flags_shadow);
> > }
> > @@ -1194,15 +1232,40 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> > static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> > {
> > struct vring_virtqueue *vq = to_vvq(_vq);
> > + u16 bufs, used_idx, wrap_counter;
> > START_USE(vq);
> > /* We optimistically turn back on interrupts, then check if there was
> > * more to do. */
> > + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > + * either clear the flags bit or point the event index at the next
> > + * entry. Always update the event index to keep code simple. */
> > +
> > + /* TODO: tune this threshold */
> > + if (vq->next_avail_idx < vq->last_used_idx)
> > + bufs = (vq->vring_packed.num + vq->next_avail_idx -
> > + vq->last_used_idx) * 3 / 4;
> > + else
> > + bufs = (vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> > +
> > + wrap_counter = vq->wrap_counter;
> > + if (vq->last_used_idx > vq->next_avail_idx)
> > + wrap_counter ^= 1;
> > +
> > + used_idx = vq->last_used_idx + bufs;
> > + if (used_idx >= vq->vring_packed.num) {
> > + used_idx -= vq->vring_packed.num;
> > + wrap_counter ^= 1;
> > + }
>
> Looks correct but maybe it's better to add some comments for such logic.
Make sense.
>
> > +
> > + vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > + used_idx | (wrap_counter << 15));
> > if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > virtio_wmb(vq->weak_barriers);
> > - vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > + vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > + VRING_EVENT_F_ENABLE;
> > vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > vq->event_flags_shadow);
> > }
> > @@ -1869,8 +1932,10 @@ void vring_transport_features(struct virtio_device *vdev)
> > switch (i) {
> > case VIRTIO_RING_F_INDIRECT_DESC:
> > break;
> > +#if 0
> > case VIRTIO_RING_F_EVENT_IDX:
> > break;
> > +#endif
>
> Maybe it's time to remove this #if 0.
Will do it.
Thanks for the review!
Best regards,
Tiwei Bie
On 2018年05月16日 20:58, Tiwei Bie wrote:
> On Wed, May 16, 2018 at 08:17:21PM +0800, Jason Wang wrote:
>> On 2018年05月16日 16:37, Tiwei Bie wrote:
> [...]
>>> @@ -1160,15 +1186,27 @@ static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
>>> static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>>> {
>>> struct vring_virtqueue *vq = to_vvq(_vq);
>>> + u16 wrap_counter;
>>> START_USE(vq);
>>> /* We optimistically turn back on interrupts, then check if there was
>>> * more to do. */
>>> + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
>>> + * either clear the flags bit or point the event index at the next
>>> + * entry. Always update the event index to keep code simple. */
>>> +
>>> + wrap_counter = vq->wrap_counter;
>>> + if (vq->last_used_idx > vq->next_avail_idx)
>> Should this be ">=" consider rx refill may try to completely fill the ring?
> It seems that there are two cases that last_used_idx
> equals to next_avail_idx. The first one is that the
> ring is empty. And the second one is that the ring
> is full. Although in the first case, most probably,
> the driver won't enable the interrupt.
>
> Maybe I really should track the used_wrap_counter
> instead of calculating it each time I need it.. I'll
> give it a try..
>
Right, good to know and this will match spec sample code.
Thanks
On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
> On 2018年05月16日 20:39, Tiwei Bie wrote:
> > On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
> > > On 2018年05月16日 16:37, Tiwei Bie wrote:
> > [...]
> > > > struct vring_virtqueue {
> > > > @@ -116,6 +117,9 @@ struct vring_virtqueue {
> > > > /* Last written value to driver->flags in
> > > > * guest byte order. */
> > > > u16 event_flags_shadow;
> > > > +
> > > > + /* ID allocation. */
> > > > + struct idr buffer_id;
> > > I'm not sure idr is fit for the performance critical case here. Need to
> > > measure its performance impact, especially if we have few unused slots.
> > I'm also not sure.. But fortunately, it should be quite easy
> > to replace it with something else without changing other code.
> > If it will really hurt the performance, I'll change it.
>
> We may want to do some benchmarking/profiling to see.
Yeah!
>
> >
> > > > };
> > > > };
> > [...]
> > > > +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > + unsigned int id, void **ctx)
> > > > +{
> > > > + struct vring_packed_desc *desc;
> > > > + unsigned int i, j;
> > > > +
> > > > + /* Clear data ptr. */
> > > > + vq->desc_state[id].data = NULL;
> > > > +
> > > > + i = head;
> > > > +
> > > > + for (j = 0; j < vq->desc_state[id].num; j++) {
> > > > + desc = &vq->vring_packed.desc[i];
> > > > + vring_unmap_one_packed(vq, desc);
> > > As mentioned in previous discussion, this probably won't work for the case
> > > of out of order completion since it depends on the information in the
> > > descriptor ring. We probably need to extend ctx to record such information.
> > Above code doesn't depend on the information in the descriptor
> > ring. The vq->desc_state[] is the extended ctx.
> >
> > Best regards,
> > Tiwei Bie
>
> Yes, but desc is a pointer to descriptor ring I think so
> vring_unmap_one_packed() still depends on the content of descriptor ring?
>
I got your point now. I think it makes sense to reserve
the bits of the addr field. The driver shouldn't try to get
addrs from the descriptors when cleaning up the descriptors,
no matter whether we support out-of-order or not.
But combining it with the out-of-order support, it will
mean that the driver still needs to maintain a desc/ctx
list that is very similar to the desc ring in the split
ring. I'm not quite sure whether it's something we want.
If it is true, I'll do it. So do you think we also want
to maintain such a desc/ctx list for packed ring?
Best regards,
Tiwei Bie
On 2018年05月16日 21:45, Tiwei Bie wrote:
> On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
>> On 2018年05月16日 20:39, Tiwei Bie wrote:
>>> On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
>>>> On 2018年05月16日 16:37, Tiwei Bie wrote:
>>> [...]
>>>>> struct vring_virtqueue {
>>>>> @@ -116,6 +117,9 @@ struct vring_virtqueue {
>>>>> /* Last written value to driver->flags in
>>>>> * guest byte order. */
>>>>> u16 event_flags_shadow;
>>>>> +
>>>>> + /* ID allocation. */
>>>>> + struct idr buffer_id;
>>>> I'm not sure idr is fit for the performance critical case here. Need to
>>>> measure its performance impact, especially if we have few unused slots.
>>> I'm also not sure.. But fortunately, it should be quite easy
>>> to replace it with something else without changing other code.
>>> If it will really hurt the performance, I'll change it.
>> We may want to do some benchmarking/profiling to see.
> Yeah!
>
>>>>> };
>>>>> };
>>> [...]
>>>>> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>>>> + unsigned int id, void **ctx)
>>>>> +{
>>>>> + struct vring_packed_desc *desc;
>>>>> + unsigned int i, j;
>>>>> +
>>>>> + /* Clear data ptr. */
>>>>> + vq->desc_state[id].data = NULL;
>>>>> +
>>>>> + i = head;
>>>>> +
>>>>> + for (j = 0; j < vq->desc_state[id].num; j++) {
>>>>> + desc = &vq->vring_packed.desc[i];
>>>>> + vring_unmap_one_packed(vq, desc);
>>>> As mentioned in previous discussion, this probably won't work for the case
>>>> of out of order completion since it depends on the information in the
>>>> descriptor ring. We probably need to extend ctx to record such information.
>>> Above code doesn't depend on the information in the descriptor
>>> ring. The vq->desc_state[] is the extended ctx.
>>>
>>> Best regards,
>>> Tiwei Bie
>> Yes, but desc is a pointer to descriptor ring I think so
>> vring_unmap_one_packed() still depends on the content of descriptor ring?
>>
> I got your point now. I think it makes sense to reserve
> the bits of the addr field. Driver shouldn't try to get
> addrs from the descriptors when cleanup the descriptors
> no matter whether we support out-of-order or not.
Maybe I was wrong, but I remember spec mentioned something like this.
>
> But combining it with the out-of-order support, it will
> mean that the driver still needs to maintain a desc/ctx
> list that is very similar to the desc ring in the split
> ring. I'm not quite sure whether it's something we want.
> If it is true, I'll do it. So do you think we also want
> to maintain such a desc/ctx list for packed ring?
To make it work for OOO backends I think we need something like this
(hardware NIC drivers usually have something like this).
Not for this patch, but it looks like having an OUT_OF_ORDER feature bit
is much simpler to start with.
Thanks
>
> Best regards,
> Tiwei Bie
On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
> On 2018年05月16日 21:45, Tiwei Bie wrote:
> > On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
> > > On 2018年05月16日 20:39, Tiwei Bie wrote:
> > > > On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
> > > > > On 2018年05月16日 16:37, Tiwei Bie wrote:
[...]
> > > > > > +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > > > + unsigned int id, void **ctx)
> > > > > > +{
> > > > > > + struct vring_packed_desc *desc;
> > > > > > + unsigned int i, j;
> > > > > > +
> > > > > > + /* Clear data ptr. */
> > > > > > + vq->desc_state[id].data = NULL;
> > > > > > +
> > > > > > + i = head;
> > > > > > +
> > > > > > + for (j = 0; j < vq->desc_state[id].num; j++) {
> > > > > > + desc = &vq->vring_packed.desc[i];
> > > > > > + vring_unmap_one_packed(vq, desc);
> > > > > As mentioned in previous discussion, this probably won't work for the case
> > > > > of out of order completion since it depends on the information in the
> > > > > descriptor ring. We probably need to extend ctx to record such information.
> > > > Above code doesn't depend on the information in the descriptor
> > > > ring. The vq->desc_state[] is the extended ctx.
> > > >
> > > > Best regards,
> > > > Tiwei Bie
> > > Yes, but desc is a pointer to descriptor ring I think so
> > > vring_unmap_one_packed() still depends on the content of descriptor ring?
> > >
> > I got your point now. I think it makes sense to reserve
> > the bits of the addr field. Driver shouldn't try to get
> > addrs from the descriptors when cleanup the descriptors
> > no matter whether we support out-of-order or not.
>
> Maybe I was wrong, but I remember spec mentioned something like this.
You're right. Spec mentioned this. I was just repeating
the spec to emphasize that it does make sense. :)
>
> >
> > But combining it with the out-of-order support, it will
> > mean that the driver still needs to maintain a desc/ctx
> > list that is very similar to the desc ring in the split
> > ring. I'm not quite sure whether it's something we want.
> > If it is true, I'll do it. So do you think we also want
> > to maintain such a desc/ctx list for packed ring?
>
> To make it work for OOO backends I think we need something like this
> (hardware NIC drivers are usually have something like this).
Which hardware NIC drivers have this?
>
> Not for the patch, but it looks like having a OUT_OF_ORDER feature bit is
> much more simpler to be started with.
+1
Best regards,
Tiwei Bie
On 2018年05月16日 22:33, Tiwei Bie wrote:
> On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
>> On 2018年05月16日 21:45, Tiwei Bie wrote:
>>> On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
>>>> On 2018年05月16日 20:39, Tiwei Bie wrote:
>>>>> On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
>>>>>> On 2018年05月16日 16:37, Tiwei Bie wrote:
> [...]
>>>>>>> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>>>>>> + unsigned int id, void **ctx)
>>>>>>> +{
>>>>>>> + struct vring_packed_desc *desc;
>>>>>>> + unsigned int i, j;
>>>>>>> +
>>>>>>> + /* Clear data ptr. */
>>>>>>> + vq->desc_state[id].data = NULL;
>>>>>>> +
>>>>>>> + i = head;
>>>>>>> +
>>>>>>> + for (j = 0; j < vq->desc_state[id].num; j++) {
>>>>>>> + desc = &vq->vring_packed.desc[i];
>>>>>>> + vring_unmap_one_packed(vq, desc);
>>>>>> As mentioned in previous discussion, this probably won't work for the case
>>>>>> of out of order completion since it depends on the information in the
>>>>>> descriptor ring. We probably need to extend ctx to record such information.
>>>>> Above code doesn't depend on the information in the descriptor
>>>>> ring. The vq->desc_state[] is the extended ctx.
>>>>>
>>>>> Best regards,
>>>>> Tiwei Bie
>>>> Yes, but desc is a pointer to descriptor ring I think so
>>>> vring_unmap_one_packed() still depends on the content of descriptor ring?
>>>>
>>> I got your point now. I think it makes sense to reserve
>>> the bits of the addr field. Driver shouldn't try to get
>>> addrs from the descriptors when cleanup the descriptors
>>> no matter whether we support out-of-order or not.
>> Maybe I was wrong, but I remember spec mentioned something like this.
> You're right. Spec mentioned this. I was just repeating
> the spec to emphasize that it does make sense. :)
>
>>> But combining it with the out-of-order support, it will
>>> mean that the driver still needs to maintain a desc/ctx
>>> list that is very similar to the desc ring in the split
>>> ring. I'm not quite sure whether it's something we want.
>>> If it is true, I'll do it. So do you think we also want
>>> to maintain such a desc/ctx list for packed ring?
>> To make it work for OOO backends I think we need something like this
>> (hardware NIC drivers are usually have something like this).
> Which hardware NIC drivers have this?
It's quite common I think, e.g. drivers track the DMA addr and page frag
somewhere, such as the ring->rx_info in the mlx4 driver.
Thanks
>
>> Not for the patch, but it looks like having a OUT_OF_ORDER feature bit is
>> much more simpler to be started with.
> +1
>
> Best regards,
> Tiwei Bie
On Thu, May 17, 2018 at 08:01:52PM +0800, Jason Wang wrote:
> On 2018年05月16日 22:33, Tiwei Bie wrote:
> > On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
> > > On 2018年05月16日 21:45, Tiwei Bie wrote:
> > > > On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
> > > > > On 2018年05月16日 20:39, Tiwei Bie wrote:
> > > > > > On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
> > > > > > > On 2018年05月16日 16:37, Tiwei Bie wrote:
> > [...]
> > > > > > > > +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > > > > > + unsigned int id, void **ctx)
> > > > > > > > +{
> > > > > > > > + struct vring_packed_desc *desc;
> > > > > > > > + unsigned int i, j;
> > > > > > > > +
> > > > > > > > + /* Clear data ptr. */
> > > > > > > > + vq->desc_state[id].data = NULL;
> > > > > > > > +
> > > > > > > > + i = head;
> > > > > > > > +
> > > > > > > > + for (j = 0; j < vq->desc_state[id].num; j++) {
> > > > > > > > + desc = &vq->vring_packed.desc[i];
> > > > > > > > + vring_unmap_one_packed(vq, desc);
> > > > > > > As mentioned in previous discussion, this probably won't work for the case
> > > > > > > of out of order completion since it depends on the information in the
> > > > > > > descriptor ring. We probably need to extend ctx to record such information.
> > > > > > Above code doesn't depend on the information in the descriptor
> > > > > > ring. The vq->desc_state[] is the extended ctx.
> > > > > >
> > > > > > Best regards,
> > > > > > Tiwei Bie
> > > > > Yes, but desc is a pointer to descriptor ring I think so
> > > > > vring_unmap_one_packed() still depends on the content of descriptor ring?
> > > > >
> > > > I got your point now. I think it makes sense to reserve
> > > > the bits of the addr field. Driver shouldn't try to get
> > > > addrs from the descriptors when cleanup the descriptors
> > > > no matter whether we support out-of-order or not.
> > > Maybe I was wrong, but I remember spec mentioned something like this.
> > You're right. Spec mentioned this. I was just repeating
> > the spec to emphasize that it does make sense. :)
> >
> > > > But combining it with the out-of-order support, it will
> > > > mean that the driver still needs to maintain a desc/ctx
> > > > list that is very similar to the desc ring in the split
> > > > ring. I'm not quite sure whether it's something we want.
> > > > If it is true, I'll do it. So do you think we also want
> > > > to maintain such a desc/ctx list for packed ring?
> > > To make it work for OOO backends I think we need something like this
> > > (hardware NIC drivers are usually have something like this).
> > Which hardware NIC drivers have this?
>
> It's quite common I think, e.g driver track e.g dma addr and page frag
> somewhere. e.g the ring->rx_info in mlx4 driver.
It seems that I misunderstood your
previous comments. I know it's quite common for
drivers to track e.g. DMA addrs somewhere (and
I think one reason behind this is that they want
to reuse the bits of the addr field). But tracking
addrs somewhere doesn't mean supporting OOO.
I thought you were saying it's quite common for
hardware NIC drivers to support OOO (i.e. NICs
will return the descriptors OOO).
I'm not familiar with mlx4, maybe I'm wrong.
I just had a quick glance. And I found below
comments in mlx4_en_process_rx_cq():
```
/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
* descriptor offset can be deduced from the CQE index instead of
* reading 'cqe->index' */
index = cq->mcq.cons_index & ring->size_mask;
cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
```
It seems that although they have a completion
queue, they are still using the ring in order.
I guess maybe storage device may want OOO.
Best regards,
Tiwei Bie
>
> Thanks
>
> >
> > > Not for the patch, but it looks like having a OUT_OF_ORDER feature bit is
> > > much more simpler to be started with.
> > +1
> >
> > Best regards,
> > Tiwei Bie
>
On 2018年05月18日 19:29, Tiwei Bie wrote:
> On Thu, May 17, 2018 at 08:01:52PM +0800, Jason Wang wrote:
>> On 2018年05月16日 22:33, Tiwei Bie wrote:
>>> On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
>>>> On 2018年05月16日 21:45, Tiwei Bie wrote:
>>>>> On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
>>>>>> On 2018年05月16日 20:39, Tiwei Bie wrote:
>>>>>>> On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
>>>>>>>> On 2018年05月16日 16:37, Tiwei Bie wrote:
>>> [...]
>>>>>>>>> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>>>>>>>> + unsigned int id, void **ctx)
>>>>>>>>> +{
>>>>>>>>> + struct vring_packed_desc *desc;
>>>>>>>>> + unsigned int i, j;
>>>>>>>>> +
>>>>>>>>> + /* Clear data ptr. */
>>>>>>>>> + vq->desc_state[id].data = NULL;
>>>>>>>>> +
>>>>>>>>> + i = head;
>>>>>>>>> +
>>>>>>>>> + for (j = 0; j < vq->desc_state[id].num; j++) {
>>>>>>>>> + desc = &vq->vring_packed.desc[i];
>>>>>>>>> + vring_unmap_one_packed(vq, desc);
>>>>>>>> As mentioned in previous discussion, this probably won't work for the case
>>>>>>>> of out of order completion since it depends on the information in the
>>>>>>>> descriptor ring. We probably need to extend ctx to record such information.
>>>>>>> Above code doesn't depend on the information in the descriptor
>>>>>>> ring. The vq->desc_state[] is the extended ctx.
>>>>>>>
>>>>>>> Best regards,
>>>>>>> Tiwei Bie
>>>>>> Yes, but desc is a pointer to descriptor ring I think so
>>>>>> vring_unmap_one_packed() still depends on the content of descriptor ring?
>>>>>>
>>>>> I got your point now. I think it makes sense to reserve
>>>>> the bits of the addr field. Driver shouldn't try to get
>>>>> addrs from the descriptors when cleanup the descriptors
>>>>> no matter whether we support out-of-order or not.
>>>> Maybe I was wrong, but I remember spec mentioned something like this.
>>> You're right. Spec mentioned this. I was just repeating
>>> the spec to emphasize that it does make sense. :)
>>>
>>>>> But combining it with the out-of-order support, it will
>>>>> mean that the driver still needs to maintain a desc/ctx
>>>>> list that is very similar to the desc ring in the split
>>>>> ring. I'm not quite sure whether it's something we want.
>>>>> If it is true, I'll do it. So do you think we also want
>>>>> to maintain such a desc/ctx list for packed ring?
>>>> To make it work for OOO backends I think we need something like this
>>>> (hardware NIC drivers are usually have something like this).
>>> Which hardware NIC drivers have this?
>> It's quite common I think, e.g driver track e.g dma addr and page frag
>> somewhere. e.g the ring->rx_info in mlx4 driver.
> It seems that I had a misunderstanding on your
> previous comments. I know it's quite common for
> drivers to track e.g. DMA addrs somewhere (and
> I think one reason behind this is that they want
> to reuse the bits of addr field).
Yes, we may want this for virtio-net as well in the future.
> But tracking
> addrs somewhere doesn't means supporting OOO.
> I thought you were saying it's quite common for
> hardware NIC drivers to support OOO (i.e. NICs
> will return the descriptors OOO):
>
> I'm not familiar with mlx4, maybe I'm wrong.
> I just had a quick glance. And I found below
> comments in mlx4_en_process_rx_cq():
>
> ```
> /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
> * descriptor offset can be deduced from the CQE index instead of
> * reading 'cqe->index' */
> index = cq->mcq.cons_index & ring->size_mask;
> cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
> ```
>
> It seems that although they have a completion
> queue, they are still using the ring in order.
I guess so (at least from the above bits). Running git grep -i "out of order" in
drivers/net gives some hints. It looks like there are a few devices that do this.
> I guess maybe storage device may want OOO.
Right, some iSCSI did.
But tracking them elsewhere is not only for OOO.
Spec said:
for element address
"
In a used descriptor, Element Address is unused.
"
for Next flag:
"
For example, if descriptors are used in the same order in which they are
made available, this will result in
the used descriptor overwriting the first available descriptor in the
list, the used descriptor for the next list
overwriting the first available descriptor in the next list, etc.
"
for in order completion:
"
This will result in the used descriptor overwriting the first available
descriptor in the batch, the used descriptor
for the next batch overwriting the first available descriptor in the
next batch, etc.
"
So:
- It's an alignment to the spec
- the device may (or should) overwrite the descriptor, which also makes the address
field useless.
Thanks
>
> Best regards,
> Tiwei Bie
>
>> Thanks
>>
>>>> Not for the patch, but it looks like having a OUT_OF_ORDER feature bit is
>>>> much more simpler to be started with.
>>> +1
>>>
>>> Best regards,
>>> Tiwei Bie
On Fri, May 18, 2018 at 09:17:05PM +0800, Jason Wang wrote:
> On 2018年05月18日 19:29, Tiwei Bie wrote:
> > On Thu, May 17, 2018 at 08:01:52PM +0800, Jason Wang wrote:
> > > On 2018年05月16日 22:33, Tiwei Bie wrote:
> > > > On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
> > > > > On 2018年05月16日 21:45, Tiwei Bie wrote:
> > > > > > On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
> > > > > > > On 2018年05月16日 20:39, Tiwei Bie wrote:
> > > > > > > > On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
> > > > > > > > > On 2018年05月16日 16:37, Tiwei Bie wrote:
> > > > [...]
> > > > > > > > > > +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > > > > > > > + unsigned int id, void **ctx)
> > > > > > > > > > +{
> > > > > > > > > > + struct vring_packed_desc *desc;
> > > > > > > > > > + unsigned int i, j;
> > > > > > > > > > +
> > > > > > > > > > + /* Clear data ptr. */
> > > > > > > > > > + vq->desc_state[id].data = NULL;
> > > > > > > > > > +
> > > > > > > > > > + i = head;
> > > > > > > > > > +
> > > > > > > > > > + for (j = 0; j < vq->desc_state[id].num; j++) {
> > > > > > > > > > + desc = &vq->vring_packed.desc[i];
> > > > > > > > > > + vring_unmap_one_packed(vq, desc);
> > > > > > > > > As mentioned in previous discussion, this probably won't work for the case
> > > > > > > > > of out of order completion since it depends on the information in the
> > > > > > > > > descriptor ring. We probably need to extend ctx to record such information.
> > > > > > > > Above code doesn't depend on the information in the descriptor
> > > > > > > > ring. The vq->desc_state[] is the extended ctx.
> > > > > > > >
> > > > > > > > Best regards,
> > > > > > > > Tiwei Bie
> > > > > > > Yes, but desc is a pointer to descriptor ring I think so
> > > > > > > vring_unmap_one_packed() still depends on the content of descriptor ring?
> > > > > > >
> > > > > > I got your point now. I think it makes sense to reserve
> > > > > > the bits of the addr field. Driver shouldn't try to get
> > > > > > addrs from the descriptors when cleanup the descriptors
> > > > > > no matter whether we support out-of-order or not.
> > > > > Maybe I was wrong, but I remember spec mentioned something like this.
> > > > You're right. Spec mentioned this. I was just repeating
> > > > the spec to emphasize that it does make sense. :)
> > > >
> > > > > > But combining it with the out-of-order support, it will
> > > > > > mean that the driver still needs to maintain a desc/ctx
> > > > > > list that is very similar to the desc ring in the split
> > > > > > ring. I'm not quite sure whether it's something we want.
> > > > > > If it is true, I'll do it. So do you think we also want
> > > > > > to maintain such a desc/ctx list for packed ring?
> > > > > To make it work for OOO backends I think we need something like this
> > > > > (hardware NIC drivers are usually have something like this).
> > > > Which hardware NIC drivers have this?
> > > It's quite common I think, e.g driver track e.g dma addr and page frag
> > > somewhere. e.g the ring->rx_info in mlx4 driver.
> > It seems that I had a misunderstanding on your
> > previous comments. I know it's quite common for
> > drivers to track e.g. DMA addrs somewhere (and
> > I think one reason behind this is that they want
> > to reuse the bits of addr field).
>
> Yes, we may want this for virtio-net as well in the future.
>
> > But tracking
> > addrs somewhere doesn't means supporting OOO.
> > I thought you were saying it's quite common for
> > hardware NIC drivers to support OOO (i.e. NICs
> > will return the descriptors OOO):
> >
> > I'm not familiar with mlx4, maybe I'm wrong.
> > I just had a quick glance. And I found below
> > comments in mlx4_en_process_rx_cq():
> >
> > ```
> > /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
> > * descriptor offset can be deduced from the CQE index instead of
> > * reading 'cqe->index' */
> > index = cq->mcq.cons_index & ring->size_mask;
> > cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
> > ```
> >
> > It seems that although they have a completion
> > queue, they are still using the ring in order.
>
> I guess so (at least from the above bits). Git grep -i "out of order" in
> drivers/net gives some hints. Looks like there're few deivces do this.
>
> > I guess maybe storage device may want OOO.
>
> Right, some iSCSI did.
>
> But tracking them elsewhere is not only for OOO.
>
> Spec said:
>
> for element address
>
> "
> In a used descriptor, Element Address is unused.
> "
>
> for Next flag:
>
> "
> For example, if descriptors are used in the same order in which they are
> made available, this will result in
> the used descriptor overwriting the first available descriptor in the list,
> the used descriptor for the next list
> overwriting the first available descriptor in the next list, etc.
> "
>
> for in order completion:
>
> "
> This will result in the used descriptor overwriting the first available
> descriptor in the batch, the used descriptor
> for the next batch overwriting the first available descriptor in the next
> batch, etc.
> "
>
> So:
>
> - It's an alignment to the spec
> - device may (or should) overwrite the descriptor make also make address
> field useless.
You didn't get my point...
I agreed driver should track the DMA addrs or some
other necessary things from the very beginning. And
I also repeated the spec to emphasize that it does
make sense. And I'd like to do that.
What I was saying is that, to support OOO, we may
need to manage these contexts (which save DMA addrs
etc.) via a list which is similar to the desc list
maintained via `next` in split ring instead of an
array whose elements can always be indexed directly.
The desc ring in split ring is an array, but its
free entries are managed as list via next. I was
just wondering, do we want to manage such a list
because of OOO. It's just a very simple question
that I want to hear your opinion on... (It doesn't
mean anything, e.g. it doesn't mean I don't want
to support OOO. It's just a simple question...)
Best regards,
Tiwei Bie
>
> Thanks
>
> >
> > Best regards,
> > Tiwei Bie
> >
> > > Thanks
> > >
> > > > > Not for the patch, but it looks like having a OUT_OF_ORDER feature bit is
> > > > > much more simpler to be started with.
> > > > +1
> > > >
> > > > Best regards,
> > > > Tiwei Bie
>
On 2018年05月18日 22:33, Tiwei Bie wrote:
> On Fri, May 18, 2018 at 09:17:05PM +0800, Jason Wang wrote:
>> On 2018年05月18日 19:29, Tiwei Bie wrote:
>>> On Thu, May 17, 2018 at 08:01:52PM +0800, Jason Wang wrote:
>>>> On 2018年05月16日 22:33, Tiwei Bie wrote:
>>>>> On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
>>>>>> On 2018年05月16日 21:45, Tiwei Bie wrote:
>>>>>>> On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
>>>>>>>> On 2018年05月16日 20:39, Tiwei Bie wrote:
>>>>>>>>> On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
>>>>>>>>>> On 2018年05月16日 16:37, Tiwei Bie wrote:
>>>>> [...]
>>>>>>>>>>> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>>>>>>>>>> + unsigned int id, void **ctx)
>>>>>>>>>>> +{
>>>>>>>>>>> + struct vring_packed_desc *desc;
>>>>>>>>>>> + unsigned int i, j;
>>>>>>>>>>> +
>>>>>>>>>>> + /* Clear data ptr. */
>>>>>>>>>>> + vq->desc_state[id].data = NULL;
>>>>>>>>>>> +
>>>>>>>>>>> + i = head;
>>>>>>>>>>> +
>>>>>>>>>>> + for (j = 0; j < vq->desc_state[id].num; j++) {
>>>>>>>>>>> + desc = &vq->vring_packed.desc[i];
>>>>>>>>>>> + vring_unmap_one_packed(vq, desc);
>>>>>>>>>> As mentioned in previous discussion, this probably won't work for the case
>>>>>>>>>> of out of order completion since it depends on the information in the
>>>>>>>>>> descriptor ring. We probably need to extend ctx to record such information.
>>>>>>>>> Above code doesn't depend on the information in the descriptor
>>>>>>>>> ring. The vq->desc_state[] is the extended ctx.
>>>>>>>>>
>>>>>>>>> Best regards,
>>>>>>>>> Tiwei Bie
>>>>>>>> Yes, but desc is a pointer to descriptor ring I think so
>>>>>>>> vring_unmap_one_packed() still depends on the content of descriptor ring?
>>>>>>>>
>>>>>>> I got your point now. I think it makes sense to reserve
>>>>>>> the bits of the addr field. Driver shouldn't try to get
>>>>>>> addrs from the descriptors when cleanup the descriptors
>>>>>>> no matter whether we support out-of-order or not.
>>>>>> Maybe I was wrong, but I remember spec mentioned something like this.
>>>>> You're right. Spec mentioned this. I was just repeating
>>>>> the spec to emphasize that it does make sense. :)
>>>>>
>>>>>>> But combining it with the out-of-order support, it will
>>>>>>> mean that the driver still needs to maintain a desc/ctx
>>>>>>> list that is very similar to the desc ring in the split
>>>>>>> ring. I'm not quite sure whether it's something we want.
>>>>>>> If it is true, I'll do it. So do you think we also want
>>>>>>> to maintain such a desc/ctx list for packed ring?
>>>>>> To make it work for OOO backends I think we need something like this
>>>>>> (hardware NIC drivers are usually have something like this).
>>>>> Which hardware NIC drivers have this?
>>>> It's quite common I think, e.g driver track e.g dma addr and page frag
>>>> somewhere. e.g the ring->rx_info in mlx4 driver.
>>> It seems that I had a misunderstanding on your
>>> previous comments. I know it's quite common for
>>> drivers to track e.g. DMA addrs somewhere (and
>>> I think one reason behind this is that they want
>>> to reuse the bits of addr field).
>> Yes, we may want this for virtio-net as well in the future.
>>
>>> But tracking
>>> addrs somewhere doesn't means supporting OOO.
>>> I thought you were saying it's quite common for
>>> hardware NIC drivers to support OOO (i.e. NICs
>>> will return the descriptors OOO):
>>>
>>> I'm not familiar with mlx4, maybe I'm wrong.
>>> I just had a quick glance. And I found below
>>> comments in mlx4_en_process_rx_cq():
>>>
>>> ```
>>> /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
>>> * descriptor offset can be deduced from the CQE index instead of
>>> * reading 'cqe->index' */
>>> index = cq->mcq.cons_index & ring->size_mask;
>>> cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
>>> ```
>>>
>>> It seems that although they have a completion
>>> queue, they are still using the ring in order.
>> I guess so (at least from the above bits). Git grep -i "out of order" in
>> drivers/net gives some hints. Looks like there're few deivces do this.
>>
>>> I guess maybe storage device may want OOO.
>> Right, some iSCSI did.
>>
>> But tracking them elsewhere is not only for OOO.
>>
>> Spec said:
>>
>> for element address
>>
>> "
>> In a used descriptor, Element Address is unused.
>> "
>>
>> for Next flag:
>>
>> "
>> For example, if descriptors are used in the same order in which they are
>> made available, this will result in
>> the used descriptor overwriting the first available descriptor in the list,
>> the used descriptor for the next list
>> overwriting the first available descriptor in the next list, etc.
>> "
>>
>> for in order completion:
>>
>> "
>> This will result in the used descriptor overwriting the first available
>> descriptor in the batch, the used descriptor
>> for the next batch overwriting the first available descriptor in the next
>> batch, etc.
>> "
>>
>> So:
>>
>> - It's an alignment to the spec
>> - device may (or should) overwrite the descriptor make also make address
>> field useless.
> You didn't get my point...
I hope that's not the case.
> I agreed driver should track the DMA addrs or some
> other necessary things from the very beginning. And
> I also repeated the spec to emphasize that it does
> make sense. And I'd like to do that.
>
> What I was saying is that, to support OOO, we may
> need to manage these context (which saves DMA addrs
> etc) via a list which is similar to the desc list
> maintained via `next` in split ring instead of an
> array whose elements always can be indexed directly.
My point is that this context is a must (not only for OOO).
>
> The desc ring in split ring is an array, but its
> free entries are managed as list via next. I was
> just wondering, do we want to manage such a list
> because of OOO. It's just a very simple question
> that I want to hear your opinion... (It doesn't
> means anything, e.g. It doesn't mean I don't want
> to support OOO. It's just a simple question...)
So the answer is yes. But I admit I don't have a better idea than
what you propose here (something like split ring, which is a little bit
sad). Maybe Michael has one.
Thanks
>
> Best regards,
> Tiwei Bie
>
>> Thanks
>>
>>> Best regards,
>>> Tiwei Bie
>>>
>>>> Thanks
>>>>
>>>>>> Not for the patch, but it looks like having a OUT_OF_ORDER feature bit is
>>>>>> much more simpler to be started with.
>>>>> +1
>>>>>
>>>>> Best regards,
>>>>> Tiwei Bie
On Sat, May 19, 2018 at 09:12:30AM +0800, Jason Wang wrote:
> On 2018年05月18日 22:33, Tiwei Bie wrote:
> > On Fri, May 18, 2018 at 09:17:05PM +0800, Jason Wang wrote:
> > > On 2018年05月18日 19:29, Tiwei Bie wrote:
> > > > On Thu, May 17, 2018 at 08:01:52PM +0800, Jason Wang wrote:
> > > > > On 2018年05月16日 22:33, Tiwei Bie wrote:
> > > > > > On Wed, May 16, 2018 at 10:05:44PM +0800, Jason Wang wrote:
> > > > > > > On 2018年05月16日 21:45, Tiwei Bie wrote:
> > > > > > > > On Wed, May 16, 2018 at 08:51:43PM +0800, Jason Wang wrote:
> > > > > > > > > On 2018年05月16日 20:39, Tiwei Bie wrote:
> > > > > > > > > > On Wed, May 16, 2018 at 07:50:16PM +0800, Jason Wang wrote:
> > > > > > > > > > > On 2018年05月16日 16:37, Tiwei Bie wrote:
> > > > > > [...]
> > > > > > > > > > > > +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > > > > > > > > > > > + unsigned int id, void **ctx)
> > > > > > > > > > > > +{
> > > > > > > > > > > > + struct vring_packed_desc *desc;
> > > > > > > > > > > > + unsigned int i, j;
> > > > > > > > > > > > +
> > > > > > > > > > > > + /* Clear data ptr. */
> > > > > > > > > > > > + vq->desc_state[id].data = NULL;
> > > > > > > > > > > > +
> > > > > > > > > > > > + i = head;
> > > > > > > > > > > > +
> > > > > > > > > > > > + for (j = 0; j < vq->desc_state[id].num; j++) {
> > > > > > > > > > > > + desc = &vq->vring_packed.desc[i];
> > > > > > > > > > > > + vring_unmap_one_packed(vq, desc);
> > > > > > > > > > > As mentioned in previous discussion, this probably won't work for the case
> > > > > > > > > > > of out of order completion since it depends on the information in the
> > > > > > > > > > > descriptor ring. We probably need to extend ctx to record such information.
> > > > > > > > > > Above code doesn't depend on the information in the descriptor
> > > > > > > > > > ring. The vq->desc_state[] is the extended ctx.
> > > > > > > > > >
> > > > > > > > > > Best regards,
> > > > > > > > > > Tiwei Bie
> > > > > > > > > Yes, but desc is a pointer to descriptor ring I think so
> > > > > > > > > vring_unmap_one_packed() still depends on the content of descriptor ring?
> > > > > > > > >
> > > > > > > > I got your point now. I think it makes sense to reserve
> > > > > > > > the bits of the addr field. Driver shouldn't try to get
> > > > > > > > addrs from the descriptors when cleanup the descriptors
> > > > > > > > no matter whether we support out-of-order or not.
> > > > > > > Maybe I was wrong, but I remember spec mentioned something like this.
> > > > > > You're right. Spec mentioned this. I was just repeating
> > > > > > the spec to emphasize that it does make sense. :)
> > > > > >
> > > > > > > > But combining it with the out-of-order support, it will
> > > > > > > > mean that the driver still needs to maintain a desc/ctx
> > > > > > > > list that is very similar to the desc ring in the split
> > > > > > > > ring. I'm not quite sure whether it's something we want.
> > > > > > > > If it is true, I'll do it. So do you think we also want
> > > > > > > > to maintain such a desc/ctx list for packed ring?
> > > > > > > To make it work for OOO backends I think we need something like this
> > > > > > > (hardware NIC drivers are usually have something like this).
> > > > > > Which hardware NIC drivers have this?
> > > > > It's quite common I think, e.g driver track e.g dma addr and page frag
> > > > > somewhere. e.g the ring->rx_info in mlx4 driver.
> > > > It seems that I had a misunderstanding on your
> > > > previous comments. I know it's quite common for
> > > > drivers to track e.g. DMA addrs somewhere (and
> > > > I think one reason behind this is that they want
> > > > to reuse the bits of addr field).
> > > Yes, we may want this for virtio-net as well in the future.
> > >
> > > > But tracking
> > > > addrs somewhere doesn't means supporting OOO.
> > > > I thought you were saying it's quite common for
> > > > hardware NIC drivers to support OOO (i.e. NICs
> > > > will return the descriptors OOO):
> > > >
> > > > I'm not familiar with mlx4, maybe I'm wrong.
> > > > I just had a quick glance. And I found below
> > > > comments in mlx4_en_process_rx_cq():
> > > >
> > > > ```
> > > > /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
> > > > * descriptor offset can be deduced from the CQE index instead of
> > > > * reading 'cqe->index' */
> > > > index = cq->mcq.cons_index & ring->size_mask;
> > > > cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
> > > > ```
> > > >
> > > > It seems that although they have a completion
> > > > queue, they are still using the ring in order.
> > > I guess so (at least from the above bits). Git grep -i "out of order" in
> > > drivers/net gives some hints. Looks like there're few deivces do this.
> > >
> > > > I guess maybe storage device may want OOO.
> > > Right, some iSCSI did.
> > >
> > > But tracking them elsewhere is not only for OOO.
> > >
> > > Spec said:
> > >
> > > for element address
> > >
> > > "
> > > In a used descriptor, Element Address is unused.
> > > "
> > >
> > > for Next flag:
> > >
> > > "
> > > For example, if descriptors are used in the same order in which they are
> > > made available, this will result in
> > > the used descriptor overwriting the first available descriptor in the list,
> > > the used descriptor for the next list
> > > overwriting the first available descriptor in the next list, etc.
> > > "
> > >
> > > for in order completion:
> > >
> > > "
> > > This will result in the used descriptor overwriting the first available
> > > descriptor in the batch, the used descriptor
> > > for the next batch overwriting the first available descriptor in the next
> > > batch, etc.
> > > "
> > >
> > > So:
> > >
> > > - It's an alignment to the spec
> > > > - device may (or should) overwrite the descriptor, which also makes the
> > > > address field useless.
> > You didn't get my point...
>
> > I hope not.
>
> > I agreed driver should track the DMA addrs or some
> > other necessary things from the very beginning. And
> > I also repeated the spec to emphasize that it does
> > make sense. And I'd like to do that.
> >
> > What I was saying is that, to support OOO, we may
> > > need to manage this context (which saves DMA addrs
> > etc) via a list which is similar to the desc list
> > maintained via `next` in split ring instead of an
> > array whose elements always can be indexed directly.
>
> > My point is that this context is a must (not only for OOO).
> Yeah, and I have exactly the same point after you
> pointed out that I shouldn't get the addrs from descs.
I do think it makes sense. I'll do it in the next
version. I don't have any doubt about it. All my
questions are about the OOO, instead of whether we
should save context or not. It just seems that you
thought I don't want to do it, and were trying to
convince me that I should do it.
>
> >
> > The desc ring in split ring is an array, but its
> > free entries are managed as list via next. I was
> > just wondering, do we want to manage such a list
> > because of OOO. It's just a very simple question
> > that I want to hear your opinion... (It doesn't
> > > mean anything, e.g. It doesn't mean I don't want
> > to support OOO. It's just a simple question...)
>
> > So the answer is yes. But I admit I don't have a better idea other than what
> > you propose here (something like split ring which is a little bit sad).
> > Maybe Michael has.
Yeah, that's why I asked this question. It will
make the packed ring a bit similar to split ring
at least in the driver part. So I want to draw
> your attention to this to make sure that we're
on the same page.
Best regards,
Tiwei Bie
>
> Thanks
>
> >
> > Best regards,
> > Tiwei Bie
> >
> > > Thanks
> > >
> > > > Best regards,
> > > > Tiwei Bie
> > > >
> > > > > Thanks
> > > > >
> > > > > > > > > Not for the patch, but it looks like having a OUT_OF_ORDER feature bit is
> > > > > > > > > much simpler to start with.
> > > > > > +1
> > > > > >
> > > > > > Best regards,
> > > > > > Tiwei Bie
>
On 2018年05月19日 10:29, Tiwei Bie wrote:
>> I don't hope so.
>>
>>> I agreed driver should track the DMA addrs or some
>>> other necessary things from the very beginning. And
>>> I also repeated the spec to emphasize that it does
>>> make sense. And I'd like to do that.
>>>
>>> What I was saying is that, to support OOO, we may
>>> need to manage these context (which saves DMA addrs
>>> etc) via a list which is similar to the desc list
>>> maintained via `next` in split ring instead of an
>>> array whose elements always can be indexed directly.
>> My point is these context is a must (not only for OOO).
> Yeah, and I have the exactly same point after you
> pointed that I shouldn't get the addrs from descs.
> I do think it makes sense. I'll do it in the next
> version. I don't have any doubt about it. All my
> questions are about the OOO, instead of whether we
> should save context or not. It just seems that you
> thought I don't want to do it, and were trying to
> convince me that I should do it.
Right, but looks like I was wrong :)
>
>>> The desc ring in split ring is an array, but its
>>> free entries are managed as list via next. I was
>>> just wondering, do we want to manage such a list
>>> because of OOO. It's just a very simple question
>>> that I want to hear your opinion... (It doesn't
>>> means anything, e.g. It doesn't mean I don't want
>>> to support OOO. It's just a simple question...)
>> So the answer is yes. But I admit I don't have a better idea other than what
>> you propose here (something like split ring which is a little bit sad).
>> Maybe Michael has.
> Yeah, that's why I asked this question. It will
> make the packed ring a bit similar to split ring
> at least in the driver part. So I want to draw
> your attention on this to make sure that we're
> on the same page.
Yes. I think we are.
Thanks
> Best regards,
> Tiwei Bie
>
On Mon, May 21, 2018 at 10:30:51AM +0800, Jason Wang wrote:
> On 2018年05月19日 10:29, Tiwei Bie wrote:
> > > I don't hope so.
> > >
> > > > I agreed driver should track the DMA addrs or some
> > > > other necessary things from the very beginning. And
> > > > I also repeated the spec to emphasize that it does
> > > > make sense. And I'd like to do that.
> > > >
> > > > What I was saying is that, to support OOO, we may
> > > > need to manage these context (which saves DMA addrs
> > > > etc) via a list which is similar to the desc list
> > > > maintained via `next` in split ring instead of an
> > > > array whose elements always can be indexed directly.
> > > My point is these context is a must (not only for OOO).
> > Yeah, and I have the exactly same point after you
> > pointed that I shouldn't get the addrs from descs.
> > I do think it makes sense. I'll do it in the next
> > version. I don't have any doubt about it. All my
> > questions are about the OOO, instead of whether we
> > should save context or not. It just seems that you
> > thought I don't want to do it, and were trying to
> > convince me that I should do it.
>
> Right, but looks like I was wrong :)
>
> >
> > > > The desc ring in split ring is an array, but its
> > > > free entries are managed as list via next. I was
> > > > just wondering, do we want to manage such a list
> > > > because of OOO. It's just a very simple question
> > > > that I want to hear your opinion... (It doesn't
> > > > means anything, e.g. It doesn't mean I don't want
> > > > to support OOO. It's just a simple question...)
> > > > So the answer is yes. But I admit I don't have a better idea other than what
> > > > you propose here (something like split ring which is a little bit sad).
> > > > Maybe Michael has.
> > Yeah, that's why I asked this question. It will
> > make the packed ring a bit similar to split ring
> > at least in the driver part. So I want to draw
> > your attention on this to make sure that we're
> > on the same page.
>
> Yes. I think we are.
Cool. Glad to hear that! Thanks! :)
Best regards,
Tiwei Bie
>
> Thanks
>
> > Best regards,
> > Tiwei Bie
> >
>