LinuxLists.cc - [PATCH RFC v8 00/11] vhost: ring format independence

[permalink] [raw]

Subject: [PATCH RFC v8 06/11] vhost/net: convert to new API: heads->bufs

Convert vhost net to use the new format-agnostic API.
In particular, don't poke at vq internals such as the
heads array.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/net.c | 154 +++++++++++++++++++++++---------------------
1 file changed, 82 insertions(+), 72 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ff594eec8ae3..830fe84912a5 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -59,13 +59,13 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
* status internally; used for zerocopy tx only.
*/
/* Lower device DMA failed */
-#define VHOST_DMA_FAILED_LEN ((__force __virtio32)3)
+#define VHOST_DMA_FAILED_LEN (3)
/* Lower device DMA done */
-#define VHOST_DMA_DONE_LEN ((__force __virtio32)2)
+#define VHOST_DMA_DONE_LEN (2)
/* Lower device DMA in progress */
-#define VHOST_DMA_IN_PROGRESS ((__force __virtio32)1)
+#define VHOST_DMA_IN_PROGRESS (1)
/* Buffer unused */
-#define VHOST_DMA_CLEAR_LEN ((__force __virtio32)0)
+#define VHOST_DMA_CLEAR_LEN (0)

#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)

@@ -112,9 +112,12 @@ struct vhost_net_virtqueue {
/* last used idx for outstanding DMA zerocopy buffers */
int upend_idx;
/* For TX, first used idx for DMA done zerocopy buffers
- * For RX, number of batched heads
+ * For RX, number of batched bufs
*/
int done_idx;
+ /* Outstanding user bufs. UIO_MAXIOV in length. */
+ /* TODO: we can make this smaller for sure. */
+ struct vhost_buf *bufs;
/* Number of XDP frames batched */
int batched_xdp;
/* an array of userspace buffers info */
@@ -271,6 +274,8 @@ static void vhost_net_clear_ubuf_info(struct vhost_net *n)
int i;

for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
+ kfree(n->vqs[i].bufs);
+ n->vqs[i].bufs = NULL;
kfree(n->vqs[i].ubuf_info);
n->vqs[i].ubuf_info = NULL;
}
@@ -282,6 +287,12 @@ static int vhost_net_set_ubuf_info(struct vhost_net *n)
int i;

for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
+ n->vqs[i].bufs = kmalloc_array(UIO_MAXIOV,
+ sizeof(*n->vqs[i].bufs),
+ GFP_KERNEL);
+ if (!n->vqs[i].bufs)
+ goto err;
+
zcopy = vhost_net_zcopy_mask & (0x1 << i);
if (!zcopy)
continue;
@@ -364,18 +375,18 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
int j = 0;

for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
- if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+ if (nvq->bufs[i].in_len == VHOST_DMA_FAILED_LEN)
vhost_net_tx_err(net);
- if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
- vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+ if (VHOST_DMA_IS_DONE(nvq->bufs[i].in_len)) {
+ nvq->bufs[i].in_len = VHOST_DMA_CLEAR_LEN;
++j;
} else
break;
}
while (j) {
add = min(UIO_MAXIOV - nvq->done_idx, j);
- vhost_add_used_and_signal_n(vq->dev, vq,
- &vq->heads[nvq->done_idx], add);
+ vhost_put_used_n_bufs(vq, &nvq->bufs[nvq->done_idx], add);
+ vhost_signal(vq->dev, vq);
nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
j -= add;
}
@@ -390,7 +401,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
rcu_read_lock_bh();

/* set len to mark this desc buffers done DMA */
- nvq->vq.heads[ubuf->desc].in_len = success ?
+ nvq->bufs[ubuf->desc].in_len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
cnt = vhost_net_ubuf_put(ubufs);

@@ -452,7 +463,8 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
if (!nvq->done_idx)
return;

- vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
+ vhost_put_used_n_bufs(vq, nvq->bufs, nvq->done_idx);
+ vhost_signal(dev, vq);
nvq->done_idx = 0;
}

@@ -558,6 +570,7 @@ static void vhost_net_busy_poll(struct vhost_net *net,

static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_net_virtqueue *tnvq,
+ struct vhost_buf *buf,
unsigned int *out_num, unsigned int *in_num,
struct msghdr *msghdr, bool *busyloop_intr)
{
@@ -565,10 +578,10 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *rvq = &rnvq->vq;
struct vhost_virtqueue *tvq = &tnvq->vq;

- int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
- out_num, in_num, NULL, NULL);
+ int r = vhost_get_avail_buf(tvq, buf, tvq->iov, ARRAY_SIZE(tvq->iov),
+ out_num, in_num, NULL, NULL);

- if (r == tvq->num && tvq->busyloop_timeout) {
+ if (!r && tvq->busyloop_timeout) {
/* Flush batched packets first */
if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
vhost_tx_batch(net, tnvq,
@@ -577,8 +590,8 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,

vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);

- r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
- out_num, in_num, NULL, NULL);
+ r = vhost_get_avail_buf(tvq, buf, tvq->iov, ARRAY_SIZE(tvq->iov),
+ out_num, in_num, NULL, NULL);
}

return r;
@@ -607,6 +620,7 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,

static int get_tx_bufs(struct vhost_net *net,
struct vhost_net_virtqueue *nvq,
+ struct vhost_buf *buf,
struct msghdr *msg,
unsigned int *out, unsigned int *in,
size_t *len, bool *busyloop_intr)
@@ -614,9 +628,9 @@ static int get_tx_bufs(struct vhost_net *net,
struct vhost_virtqueue *vq = &nvq->vq;
int ret;

- ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
+ ret = vhost_net_tx_get_vq_desc(net, nvq, buf, out, in, msg, busyloop_intr);

- if (ret < 0 || ret == vq->num)
+ if (ret <= 0)
return ret;

if (*in) {
@@ -761,7 +775,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned out, in;
- int head;
+ int ret;
struct msghdr msg = {
.msg_name = NULL,
.msg_namelen = 0,
@@ -773,6 +787,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
int err;
int sent_pkts = 0;
bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
+ struct vhost_buf buf;

do {
bool busyloop_intr = false;
@@ -780,13 +795,13 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
if (nvq->done_idx == VHOST_NET_BATCH)
vhost_tx_batch(net, nvq, sock, &msg);

- head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
- &busyloop_intr);
+ ret = get_tx_bufs(net, nvq, &buf, &msg, &out, &in, &len,
+ &busyloop_intr);
/* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
+ if (unlikely(ret < 0))
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (head == vq->num) {
+ if (!ret) {
if (unlikely(busyloop_intr)) {
vhost_poll_queue(&vq->poll);
} else if (unlikely(vhost_enable_notify(&net->dev,
@@ -808,7 +823,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
goto done;
} else if (unlikely(err != -ENOSPC)) {
vhost_tx_batch(net, nvq, sock, &msg);
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_avail_bufs(vq, &buf, 1);
vhost_net_enable_vq(net, vq);
break;
}
@@ -829,7 +844,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(sock, &msg, len);
if (unlikely(err < 0)) {
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_avail_bufs(vq, &buf, 1);
vhost_net_enable_vq(net, vq);
break;
}
@@ -837,8 +852,7 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
pr_debug("Truncated TX packet: len %d != %zd\n",
err, len);
done:
- vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
- vq->heads[nvq->done_idx].len = 0;
+ nvq->bufs[nvq->done_idx] = buf;
++nvq->done_idx;
} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));

@@ -850,7 +864,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned out, in;
- int head;
+ int ret;
struct msghdr msg = {
.msg_name = NULL,
.msg_namelen = 0,
@@ -864,6 +878,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
bool zcopy_used;
int sent_pkts = 0;
+ struct vhost_buf buf;

do {
bool busyloop_intr;
@@ -872,13 +887,13 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
vhost_zerocopy_signal_used(net, vq);

busyloop_intr = false;
- head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
- &busyloop_intr);
+ ret = get_tx_bufs(net, nvq, &buf, &msg, &out, &in, &len,
+ &busyloop_intr);
/* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
+ if (unlikely(ret < 0))
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (head == vq->num) {
+ if (!ret) {
if (unlikely(busyloop_intr)) {
vhost_poll_queue(&vq->poll);
} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
@@ -897,8 +912,8 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;

- vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
- vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
+ nvq->bufs[nvq->upend_idx] = buf;
+ nvq->bufs[nvq->upend_idx].in_len = VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
@@ -930,17 +945,19 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
% UIO_MAXIOV;
}
- vhost_discard_vq_desc(vq, 1);
+ vhost_discard_avail_bufs(vq, &buf, 1);
vhost_net_enable_vq(net, vq);
break;
}
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- if (!zcopy_used)
- vhost_add_used_and_signal(&net->dev, vq, head, 0);
- else
+ if (!zcopy_used) {
+ vhost_put_used_buf(vq, &buf);
+ vhost_signal(&net->dev, vq);
+ } else {
vhost_zerocopy_signal_used(net, vq);
+ }
vhost_net_tx_packet(net);
} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
}
@@ -1004,7 +1021,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
int len = peek_head_len(rnvq, sk);

if (!len && rvq->busyloop_timeout) {
- /* Flush batched heads first */
+ /* Flush batched bufs first */
vhost_net_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
@@ -1022,11 +1039,11 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
* @iovcount - returned count of io vectors we fill
* @log - vhost log
* @log_num - log offset
- * @quota - headcount quota, 1 for big buffer
- * returns number of buffer heads allocated, negative on error
+ * @quota - bufcount quota, 1 for big buffer
+ * returns number of buffers allocated, negative on error
*/
static int get_rx_bufs(struct vhost_virtqueue *vq,
- struct vring_used_elem *heads,
+ struct vhost_buf *bufs,
int datalen,
unsigned *iovcount,
struct vhost_log *log,
@@ -1035,30 +1052,24 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
{
unsigned int out, in;
int seg = 0;
- int headcount = 0;
- unsigned d;
+ int bufcount = 0;
int r, nlogs = 0;
/* len is always initialized before use since we are always called with
* datalen > 0.
*/
u32 uninitialized_var(len);

- while (datalen > 0 && headcount < quota) {
+ while (datalen > 0 && bufcount < quota) {
if (unlikely(seg >= UIO_MAXIOV)) {
r = -ENOBUFS;
goto err;
}
- r = vhost_get_vq_desc(vq, vq->iov + seg,
- ARRAY_SIZE(vq->iov) - seg, &out,
- &in, log, log_num);
- if (unlikely(r < 0))
+ r = vhost_get_avail_buf(vq, bufs + bufcount, vq->iov + seg,
+ ARRAY_SIZE(vq->iov) - seg, &out,
+ &in, log, log_num);
+ if (unlikely(r <= 0))
goto err;

- d = r;
- if (d == vq->num) {
- r = 0;
- goto err;
- }
if (unlikely(out || in <= 0)) {
vq_err(vq, "unexpected descriptor format for RX: "
"out %d, in %d\n", out, in);
@@ -1069,14 +1080,12 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
- heads[headcount].id = cpu_to_vhost32(vq, d);
len = iov_length(vq->iov + seg, in);
- heads[headcount].len = cpu_to_vhost32(vq, len);
datalen -= len;
- ++headcount;
+ ++bufcount;
seg += in;
}
- heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
+ bufs[bufcount - 1].in_len = len + datalen;
*iovcount = seg;
if (unlikely(log))
*log_num = nlogs;
@@ -1086,9 +1095,9 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
r = UIO_MAXIOV + 1;
goto err;
}
- return headcount;
+ return bufcount;
err:
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_avail_bufs(vq, bufs, bufcount);
return r;
}

@@ -1113,7 +1122,7 @@ static void handle_rx(struct vhost_net *net)
};
size_t total_len = 0;
int err, mergeable;
- s16 headcount;
+ int bufcount;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
bool busyloop_intr = false;
@@ -1147,14 +1156,14 @@ static void handle_rx(struct vhost_net *net)
break;
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
- headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
- vhost_len, &in, vq_log, &log,
- likely(mergeable) ? UIO_MAXIOV : 1);
+ bufcount = get_rx_bufs(vq, nvq->bufs + nvq->done_idx,
+ vhost_len, &in, vq_log, &log,
+ likely(mergeable) ? UIO_MAXIOV : 1);
/* On error, stop handling until the next kick. */
- if (unlikely(headcount < 0))
+ if (unlikely(bufcount < 0))
goto out;
/* OK, now we need to know about added descriptors. */
- if (!headcount) {
+ if (!bufcount) {
if (unlikely(busyloop_intr)) {
vhost_poll_queue(&vq->poll);
} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
@@ -1171,7 +1180,7 @@ static void handle_rx(struct vhost_net *net)
if (nvq->rx_ring)
msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
/* On overrun, truncate and discard */
- if (unlikely(headcount > UIO_MAXIOV)) {
+ if (unlikely(bufcount > UIO_MAXIOV)) {
iov_iter_init(&msg.msg_iter, READ, vq->iov, 1, 1);
err = sock->ops->recvmsg(sock, &msg,
1, MSG_DONTWAIT | MSG_TRUNC);
@@ -1195,7 +1204,7 @@ static void handle_rx(struct vhost_net *net)
if (unlikely(err != sock_len)) {
pr_debug("Discarded rx packet: "
" len %d, expected %zd\n", err, sock_len);
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_avail_bufs(vq, nvq->bufs + nvq->done_idx, bufcount);
continue;
}
/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
@@ -1214,15 +1223,15 @@ static void handle_rx(struct vhost_net *net)
}
/* TODO: Should check and handle checksum. */

- num_buffers = cpu_to_vhost16(vq, headcount);
+ num_buffers = cpu_to_vhost16(vq, bufcount);
if (likely(mergeable) &&
copy_to_iter(&num_buffers, sizeof num_buffers,
&fixup) != sizeof num_buffers) {
vq_err(vq, "Failed num_buffers write");
- vhost_discard_vq_desc(vq, headcount);
+ vhost_discard_avail_bufs(vq, nvq->bufs + nvq->done_idx, bufcount);
goto out;
}
- nvq->done_idx += headcount;
+ nvq->done_idx += bufcount;
if (nvq->done_idx > VHOST_NET_BATCH)
vhost_net_signal_used(nvq);
if (unlikely(vq_log))
@@ -1314,6 +1323,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
+ n->vqs[i].bufs = NULL;
n->vqs[i].ubufs = NULL;
n->vqs[i].ubuf_info = NULL;
n->vqs[i].upend_idx = 0;
--
MST

2020-06-11 11:38:16

[permalink] [raw]

Subject: [PATCH RFC v8 08/11] vhost/test: convert to the buf API

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/test.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 7d69778aaa26..12304eb8da15 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -44,9 +44,10 @@ static void handle_vq(struct vhost_test *n)
{
struct vhost_virtqueue *vq = &n->vqs[VHOST_TEST_VQ];
unsigned out, in;
- int head;
+ int ret;
size_t len, total_len = 0;
void *private;
+ struct vhost_buf buf;

mutex_lock(&vq->mutex);
private = vhost_vq_get_backend(vq);
@@ -58,15 +59,15 @@ static void handle_vq(struct vhost_test *n)
vhost_disable_notify(&n->dev, vq);

for (;;) {
- head = vhost_get_vq_desc(vq, vq->iov,
- ARRAY_SIZE(vq->iov),
- &out, &in,
- NULL, NULL);
+ ret = vhost_get_avail_buf(vq, &buf, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in,
+ NULL, NULL);
/* On error, stop handling until the next kick. */
- if (unlikely(head < 0))
+ if (unlikely(ret < 0))
break;
/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (head == vq->num) {
+ if (!ret) {
if (unlikely(vhost_enable_notify(&n->dev, vq))) {
vhost_disable_notify(&n->dev, vq);
continue;
@@ -78,13 +79,14 @@ static void handle_vq(struct vhost_test *n)
"out %d, int %d\n", out, in);
break;
}
- len = iov_length(vq->iov, out);
+ len = buf.out_len;
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected 0 len for TX\n");
break;
}
- vhost_add_used_and_signal(&n->dev, vq, head, 0);
+ vhost_put_used_buf(vq, &buf);
+ vhost_signal(&n->dev, vq);
total_len += len;
if (unlikely(vhost_exceeds_weight(vq, 0, total_len)))
break;
--
MST

2020-06-11 11:38:24

[permalink] [raw]

Subject: [PATCH RFC v8 10/11] vhost/vsock: switch to the buf API

A straight-forward conversion.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/vsock.c | 30 ++++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index a483cec31d5c..61c6d3dd2ae3 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -103,7 +103,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
unsigned out, in;
size_t nbytes;
size_t iov_len, payload_len;
- int head;
+ struct vhost_buf buf;
+ int ret;

spin_lock_bh(&vsock->send_pkt_list_lock);
if (list_empty(&vsock->send_pkt_list)) {
@@ -117,16 +118,17 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
list_del_init(&pkt->list);
spin_unlock_bh(&vsock->send_pkt_list_lock);

- head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- &out, &in, NULL, NULL);
- if (head < 0) {
+ ret = vhost_get_avail_buf(vq, &buf,
+ vq->iov, ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (ret < 0) {
spin_lock_bh(&vsock->send_pkt_list_lock);
list_add(&pkt->list, &vsock->send_pkt_list);
spin_unlock_bh(&vsock->send_pkt_list_lock);
break;
}

- if (head == vq->num) {
+ if (!ret) {
spin_lock_bh(&vsock->send_pkt_list_lock);
list_add(&pkt->list, &vsock->send_pkt_list);
spin_unlock_bh(&vsock->send_pkt_list_lock);
@@ -186,7 +188,8 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
*/
virtio_transport_deliver_tap_pkt(pkt);

- vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
+ buf.in_len = sizeof(pkt->hdr) + payload_len;
+ vhost_put_used_buf(vq, &buf);
added = true;

pkt->off += payload_len;
@@ -440,7 +443,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
dev);
struct virtio_vsock_pkt *pkt;
- int head, pkts = 0, total_len = 0;
+ int ret, pkts = 0, total_len = 0;
+ struct vhost_buf buf;
unsigned int out, in;
bool added = false;

@@ -461,12 +465,13 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
goto no_more_replies;
}

- head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- &out, &in, NULL, NULL);
- if (head < 0)
+ ret = vhost_get_avail_buf(vq, &buf,
+ vq->iov, ARRAY_SIZE(vq->iov),
+ &out, &in, NULL, NULL);
+ if (ret < 0)
break;

- if (head == vq->num) {
+ if (!ret) {
if (unlikely(vhost_enable_notify(&vsock->dev, vq))) {
vhost_disable_notify(&vsock->dev, vq);
continue;
@@ -494,7 +499,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
virtio_transport_free_pkt(pkt);

len += sizeof(pkt->hdr);
- vhost_add_used(vq, head, len);
+ buf.in_len = len;
+ vhost_put_used_buf(vq, &buf);
total_len += len;
added = true;
} while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
--
MST

2020-06-11 11:39:20

[permalink] [raw]

Subject: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

As testing shows no performance change, switch to that now.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Eugenio Pérez <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/test.c | 2 +-
drivers/vhost/vhost.c | 314 ++++++++----------------------------------
drivers/vhost/vhost.h | 7 +-
3 files changed, 61 insertions(+), 262 deletions(-)

diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 0466921f4772..7d69778aaa26 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -119,7 +119,7 @@ static int vhost_test_open(struct inode *inode, struct file *f)
dev = &n->dev;
vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
- vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
+ vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV + 64,
VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);

f->private_data = n;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 11433d709651..dfcdb36d4227 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -304,6 +304,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
{
vq->num = 1;
vq->ndescs = 0;
+ vq->first_desc = 0;
vq->desc = NULL;
vq->avail = NULL;
vq->used = NULL;
@@ -372,6 +373,11 @@ static int vhost_worker(void *data)
return 0;
}

+static int vhost_vq_num_batch_descs(struct vhost_virtqueue *vq)
+{
+ return vq->max_descs - UIO_MAXIOV;
+}
+
static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
kfree(vq->descs);
@@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
vq->max_descs = dev->iov_limit;
+ if (vhost_vq_num_batch_descs(vq) < 0) {
+ return -EINVAL;
+ }
vq->descs = kmalloc_array(vq->max_descs,
sizeof(*vq->descs),
GFP_KERNEL);
@@ -1610,6 +1619,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
vq->last_avail_idx = s.num;
/* Forget the cached index value. */
vq->avail_idx = vq->last_avail_idx;
+ vq->ndescs = vq->first_desc = 0;
break;
case VHOST_GET_VRING_BASE:
s.index = idx;
@@ -2078,253 +2088,6 @@ static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc)
return next;
}

-static int get_indirect(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num,
- struct vring_desc *indirect)
-{
- struct vring_desc desc;
- unsigned int i = 0, count, found = 0;
- u32 len = vhost32_to_cpu(vq, indirect->len);
- struct iov_iter from;
- int ret, access;
-
- /* Sanity check */
- if (unlikely(len % sizeof desc)) {
- vq_err(vq, "Invalid length in indirect descriptor: "
- "len 0x%llx not multiple of 0x%zx\n",
- (unsigned long long)len,
- sizeof desc);
- return -EINVAL;
- }
-
- ret = translate_desc(vq, vhost64_to_cpu(vq, indirect->addr), len, vq->indirect,
- UIO_MAXIOV, VHOST_ACCESS_RO);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Translation failure %d in indirect.\n", ret);
- return ret;
- }
- iov_iter_init(&from, READ, vq->indirect, ret, len);
-
- /* We will use the result as an address to read from, so most
- * architectures only need a compiler barrier here. */
- read_barrier_depends();
-
- count = len / sizeof desc;
- /* Buffers are chained via a 16 bit next field, so
- * we can have at most 2^16 of these. */
- if (unlikely(count > USHRT_MAX + 1)) {
- vq_err(vq, "Indirect buffer length too big: %d\n",
- indirect->len);
- return -E2BIG;
- }
-
- do {
- unsigned iov_count = *in_num + *out_num;
- if (unlikely(++found > count)) {
- vq_err(vq, "Loop detected: last one at %u "
- "indirect size %u\n",
- i, count);
- return -EINVAL;
- }
- if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
- vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
- i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
- return -EINVAL;
- }
- if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
- vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
- i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc);
- return -EINVAL;
- }
-
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
- access = VHOST_ACCESS_WO;
- else
- access = VHOST_ACCESS_RO;
-
- ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
- vhost32_to_cpu(vq, desc.len), iov + iov_count,
- iov_size - iov_count, access);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Translation failure %d indirect idx %d\n",
- ret, i);
- return ret;
- }
- /* If this is an input descriptor, increment that count. */
- if (access == VHOST_ACCESS_WO) {
- *in_num += ret;
- if (unlikely(log && ret)) {
- log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
- log[*log_num].len = vhost32_to_cpu(vq, desc.len);
- ++*log_num;
- }
- } else {
- /* If it's an output descriptor, they're all supposed
- * to come before any input descriptors. */
- if (unlikely(*in_num)) {
- vq_err(vq, "Indirect descriptor "
- "has out after in: idx %d\n", i);
- return -EINVAL;
- }
- *out_num += ret;
- }
- } while ((i = next_desc(vq, &desc)) != -1);
- return 0;
-}
-
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
-{
- struct vring_desc desc;
- unsigned int i, head, found = 0;
- u16 last_avail_idx;
- __virtio16 avail_idx;
- __virtio16 ring_head;
- int ret, access;
-
- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vq->last_avail_idx;
-
- if (vq->avail_idx == vq->last_avail_idx) {
- if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
- vq_err(vq, "Failed to access avail idx at %p\n",
- &vq->avail->idx);
- return -EFAULT;
- }
- vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
-
- if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
- vq_err(vq, "Guest moved used index from %u to %u",
- last_avail_idx, vq->avail_idx);
- return -EFAULT;
- }
-
- /* If there's nothing new since last we looked, return
- * invalid.
- */
- if (vq->avail_idx == last_avail_idx)
- return vq->num;
-
- /* Only get avail ring entries after they have been
- * exposed by guest.
- */
- smp_rmb();
- }
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
- vq_err(vq, "Failed to read head: idx %d address %p\n",
- last_avail_idx,
- &vq->avail->ring[last_avail_idx % vq->num]);
- return -EFAULT;
- }
-
- head = vhost16_to_cpu(vq, ring_head);
-
- /* If their number is silly, that's an error. */
- if (unlikely(head >= vq->num)) {
- vq_err(vq, "Guest says index %u > %u is available",
- head, vq->num);
- return -EINVAL;
- }
-
- /* When we start there are none of either input nor output. */
- *out_num = *in_num = 0;
- if (unlikely(log))
- *log_num = 0;
-
- i = head;
- do {
- unsigned iov_count = *in_num + *out_num;
- if (unlikely(i >= vq->num)) {
- vq_err(vq, "Desc index is %u > %u, head = %u",
- i, vq->num, head);
- return -EINVAL;
- }
- if (unlikely(++found > vq->num)) {
- vq_err(vq, "Loop detected: last one at %u "
- "vq size %u head %u\n",
- i, vq->num, head);
- return -EINVAL;
- }
- ret = vhost_get_desc(vq, &desc, i);
- if (unlikely(ret)) {
- vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
- i, vq->desc + i);
- return -EFAULT;
- }
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT)) {
- ret = get_indirect(vq, iov, iov_size,
- out_num, in_num,
- log, log_num, &desc);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Failure detected "
- "in indirect descriptor at idx %d\n", i);
- return ret;
- }
- continue;
- }
-
- if (desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_WRITE))
- access = VHOST_ACCESS_WO;
- else
- access = VHOST_ACCESS_RO;
- ret = translate_desc(vq, vhost64_to_cpu(vq, desc.addr),
- vhost32_to_cpu(vq, desc.len), iov + iov_count,
- iov_size - iov_count, access);
- if (unlikely(ret < 0)) {
- if (ret != -EAGAIN)
- vq_err(vq, "Translation failure %d descriptor idx %d\n",
- ret, i);
- return ret;
- }
- if (access == VHOST_ACCESS_WO) {
- /* If this is an input descriptor,
- * increment that count. */
- *in_num += ret;
- if (unlikely(log && ret)) {
- log[*log_num].addr = vhost64_to_cpu(vq, desc.addr);
- log[*log_num].len = vhost32_to_cpu(vq, desc.len);
- ++*log_num;
- }
- } else {
- /* If it's an output descriptor, they're all supposed
- * to come before any input descriptors. */
- if (unlikely(*in_num)) {
- vq_err(vq, "Descriptor has out after in: "
- "idx %d\n", i);
- return -EINVAL;
- }
- *out_num += ret;
- }
- } while ((i = next_desc(vq, &desc)) != -1);
-
- /* On success, increment avail index. */
- vq->last_avail_idx++;
-
- /* Assume notifications from guest are disabled at this point,
- * if they aren't we would need to update avail_event index. */
- BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
- return head;
-}
-EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
-
static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
{
BUG_ON(!vq->ndescs);
@@ -2428,7 +2191,7 @@ static int fetch_indirect_descs(struct vhost_virtqueue *vq,

/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
* A negative code is returned on error. */
-static int fetch_descs(struct vhost_virtqueue *vq)
+static int fetch_buf(struct vhost_virtqueue *vq)
{
unsigned int i, head, found = 0;
struct vhost_desc *last;
@@ -2441,7 +2204,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
/* Check it isn't doing very strange things with descriptor numbers. */
last_avail_idx = vq->last_avail_idx;

- if (vq->avail_idx == vq->last_avail_idx) {
+ if (unlikely(vq->avail_idx == vq->last_avail_idx)) {
if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
vq_err(vq, "Failed to access avail idx at %p\n",
&vq->avail->idx);
@@ -2532,6 +2295,41 @@ static int fetch_descs(struct vhost_virtqueue *vq)
return 1;
}

+/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
+ * A negative code is returned on error. */
+static int fetch_descs(struct vhost_virtqueue *vq)
+{
+ int ret;
+
+ if (unlikely(vq->first_desc >= vq->ndescs)) {
+ vq->first_desc = 0;
+ vq->ndescs = 0;
+ }
+
+ if (vq->ndescs)
+ return 1;
+
+ for (ret = 1;
+ ret > 0 && vq->ndescs <= vhost_vq_num_batch_descs(vq);
+ ret = fetch_buf(vq))
+ ;
+
+ /* On success we expect some descs */
+ BUG_ON(ret > 0 && !vq->ndescs);
+ return ret;
+}
+
+/* Reverse the effects of fetch_descs */
+static void unfetch_descs(struct vhost_virtqueue *vq)
+{
+ int i;
+
+ for (i = vq->first_desc; i < vq->ndescs; ++i)
+ if (!(vq->descs[i].flags & VRING_DESC_F_NEXT))
+ vq->last_avail_idx -= 1;
+ vq->ndescs = 0;
+}
+
/* This looks in the virtqueue and for the first available buffer, and converts
* it to an iovec for convenient access. Since descriptors consist of some
* number of output then some number of input descriptors, it's actually two
@@ -2540,7 +2338,7 @@ static int fetch_descs(struct vhost_virtqueue *vq)
* This function returns the descriptor number found, or vq->num (which is
* never a valid descriptor number) if none was found. A negative code is
* returned on error. */
-int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num)
@@ -2549,7 +2347,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
int i;

if (ret <= 0)
- goto err_fetch;
+ goto err;

/* Now convert to IOV */
/* When we start there are none of either input nor output. */
@@ -2557,7 +2355,7 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
if (unlikely(log))
*log_num = 0;

- for (i = 0; i < vq->ndescs; ++i) {
+ for (i = vq->first_desc; i < vq->ndescs; ++i) {
unsigned iov_count = *in_num + *out_num;
struct vhost_desc *desc = &vq->descs[i];
int access;
@@ -2603,24 +2401,26 @@ int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
}

ret = desc->id;
+
+ if (!(desc->flags & VRING_DESC_F_NEXT))
+ break;
}

- vq->ndescs = 0;
+ vq->first_desc = i + 1;

return ret;

err:
- vhost_discard_vq_desc(vq, 1);
-err_fetch:
- vq->ndescs = 0;
+ unfetch_descs(vq);

return ret ? ret : vq->num;
}
-EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
+EXPORT_SYMBOL_GPL(vhost_get_vq_desc);

/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
{
+ unfetch_descs(vq);
vq->last_avail_idx -= n;
}
EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 87089d51490d..fed36af5c444 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -81,6 +81,7 @@ struct vhost_virtqueue {

struct vhost_desc *descs;
int ndescs;
+ int first_desc;
int max_descs;

struct file *kick;
@@ -189,10 +190,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);

-int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
- struct iovec iov[], unsigned int iov_count,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num);
int vhost_get_vq_desc(struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
@@ -261,6 +258,8 @@ static inline void vhost_vq_set_backend(struct vhost_virtqueue *vq,
void *private_data)
{
vq->private_data = private_data;
+ vq->ndescs = 0;
+ vq->first_desc = 0;
}

/**
--
MST

2020-06-11 11:39:21

[permalink] [raw]

Subject: [PATCH RFC v8 03/11] vhost/net: pass net specific struct pointer

In preparation for further cleanup, pass net specific pointer
to ubuf callbacks so we can move net specific fields
out to net structures.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/net.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index bf5e1d81ae25..ff594eec8ae3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -94,7 +94,7 @@ struct vhost_net_ubuf_ref {
*/
atomic_t refcount;
wait_queue_head_t wait;
- struct vhost_virtqueue *vq;
+ struct vhost_net_virtqueue *nvq;
};

#define VHOST_NET_BATCH 64
@@ -231,7 +231,7 @@ static void vhost_net_enable_zcopy(int vq)
}

static struct vhost_net_ubuf_ref *
-vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
+vhost_net_ubuf_alloc(struct vhost_net_virtqueue *nvq, bool zcopy)
{
struct vhost_net_ubuf_ref *ubufs;
/* No zero copy backend? Nothing to count. */
@@ -242,7 +242,7 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
return ERR_PTR(-ENOMEM);
atomic_set(&ubufs->refcount, 1);
init_waitqueue_head(&ubufs->wait);
- ubufs->vq = vq;
+ ubufs->nvq = nvq;
return ubufs;
}

@@ -384,13 +384,13 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
{
struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
- struct vhost_virtqueue *vq = ubufs->vq;
+ struct vhost_net_virtqueue *nvq = ubufs->nvq;
int cnt;

rcu_read_lock_bh();

/* set len to mark this desc buffers done DMA */
- vq->heads[ubuf->desc].len = success ?
+ nvq->vq.heads[ubuf->desc].in_len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
cnt = vhost_net_ubuf_put(ubufs);

@@ -402,7 +402,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
* less than 10% of times).
*/
if (cnt <= 1 || !(cnt % 16))
- vhost_poll_queue(&vq->poll);
+ vhost_poll_queue(&nvq->vq.poll);

rcu_read_unlock_bh();
}
@@ -1525,7 +1525,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
/* start polling new socket */
oldsock = vhost_vq_get_backend(vq);
if (sock != oldsock) {
- ubufs = vhost_net_ubuf_alloc(vq,
+ ubufs = vhost_net_ubuf_alloc(nvq,
sock && vhost_sock_zcopy(sock));
if (IS_ERR(ubufs)) {
r = PTR_ERR(ubufs);
--
MST

2020-06-11 11:39:22

[permalink] [raw]

Subject: [PATCH RFC v8 01/11] vhost: option to fetch descriptors through an independent struct

The idea is to support multiple ring formats by converting
to a format-independent array of descriptors.

This costs extra cycles, but we gain in ability
to fetch a batch of descriptors in one go, which
is good for code cache locality.

When used, this causes a minor performance degradation,
it's been kept as simple as possible for ease of review.
A follow-up patch gets us back the performance by adding batching.

To simplify benchmarking, I kept the old code around so one can switch
back and forth between old and new code. This will go away in the final
submission.

Signed-off-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Eugenio Pérez <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/vhost.c | 305 +++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/vhost.h | 16 +++
2 files changed, 320 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 172da092107e..11433d709651 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -303,6 +303,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
struct vhost_virtqueue *vq)
{
vq->num = 1;
+ vq->ndescs = 0;
vq->desc = NULL;
vq->avail = NULL;
vq->used = NULL;
@@ -373,6 +374,9 @@ static int vhost_worker(void *data)

static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
+ kfree(vq->descs);
+ vq->descs = NULL;
+ vq->max_descs = 0;
kfree(vq->indirect);
vq->indirect = NULL;
kfree(vq->log);
@@ -389,6 +393,10 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)

for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
+ vq->max_descs = dev->iov_limit;
+ vq->descs = kmalloc_array(vq->max_descs,
+ sizeof(*vq->descs),
+ GFP_KERNEL);
vq->indirect = kmalloc_array(UIO_MAXIOV,
sizeof(*vq->indirect),
GFP_KERNEL);
@@ -396,7 +404,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
GFP_KERNEL);
vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads),
GFP_KERNEL);
- if (!vq->indirect || !vq->log || !vq->heads)
+ if (!vq->indirect || !vq->log || !vq->heads || !vq->descs)
goto err_nomem;
}
return 0;
@@ -488,6 +496,8 @@ void vhost_dev_init(struct vhost_dev *dev,

for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
+ vq->descs = NULL;
+ vq->max_descs = 0;
vq->log = NULL;
vq->indirect = NULL;
vq->heads = NULL;
@@ -2315,6 +2325,299 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);

+static struct vhost_desc *peek_split_desc(struct vhost_virtqueue *vq)
+{
+ BUG_ON(!vq->ndescs);
+ return &vq->descs[vq->ndescs - 1];
+}
+
+static void pop_split_desc(struct vhost_virtqueue *vq)
+{
+ BUG_ON(!vq->ndescs);
+ --vq->ndescs;
+}
+
+#define VHOST_DESC_FLAGS (VRING_DESC_F_INDIRECT | VRING_DESC_F_WRITE | \
+ VRING_DESC_F_NEXT)
+static int push_split_desc(struct vhost_virtqueue *vq, struct vring_desc *desc, u16 id)
+{
+ struct vhost_desc *h;
+
+ if (unlikely(vq->ndescs >= vq->max_descs))
+ return -EINVAL;
+ h = &vq->descs[vq->ndescs++];
+ h->addr = vhost64_to_cpu(vq, desc->addr);
+ h->len = vhost32_to_cpu(vq, desc->len);
+ h->flags = vhost16_to_cpu(vq, desc->flags) & VHOST_DESC_FLAGS;
+ h->id = id;
+
+ return 0;
+}
+
+static int fetch_indirect_descs(struct vhost_virtqueue *vq,
+ struct vhost_desc *indirect,
+ u16 head)
+{
+ struct vring_desc desc;
+ unsigned int i = 0, count, found = 0;
+ u32 len = indirect->len;
+ struct iov_iter from;
+ int ret;
+
+ /* Sanity check */
+ if (unlikely(len % sizeof desc)) {
+ vq_err(vq, "Invalid length in indirect descriptor: "
+ "len 0x%llx not multiple of 0x%zx\n",
+ (unsigned long long)len,
+ sizeof desc);
+ return -EINVAL;
+ }
+
+ ret = translate_desc(vq, indirect->addr, len, vq->indirect,
+ UIO_MAXIOV, VHOST_ACCESS_RO);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d in indirect.\n", ret);
+ return ret;
+ }
+ iov_iter_init(&from, READ, vq->indirect, ret, len);
+
+ /* We will use the result as an address to read from, so most
+ * architectures only need a compiler barrier here. */
+ read_barrier_depends();
+
+ count = len / sizeof desc;
+ /* Buffers are chained via a 16 bit next field, so
+ * we can have at most 2^16 of these. */
+ if (unlikely(count > USHRT_MAX + 1)) {
+ vq_err(vq, "Indirect buffer length too big: %d\n",
+ indirect->len);
+ return -E2BIG;
+ }
+ if (unlikely(vq->ndescs + count > vq->max_descs)) {
+ vq_err(vq, "Too many indirect + direct descs: %d + %d\n",
+ vq->ndescs, indirect->len);
+ return -E2BIG;
+ }
+
+ do {
+ if (unlikely(++found > count)) {
+ vq_err(vq, "Loop detected: last one at %u "
+ "indirect size %u\n",
+ i, count);
+ return -EINVAL;
+ }
+ if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) {
+ vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
+ i, (size_t)indirect->addr + i * sizeof desc);
+ return -EINVAL;
+ }
+ if (unlikely(desc.flags & cpu_to_vhost16(vq, VRING_DESC_F_INDIRECT))) {
+ vq_err(vq, "Nested indirect descriptor: idx %d, %zx\n",
+ i, (size_t)indirect->addr + i * sizeof desc);
+ return -EINVAL;
+ }
+
+ /* Note: push_split_desc can't fail here:
+ * we never fetch unless there's space. */
+ ret = push_split_desc(vq, &desc, head);
+ WARN_ON(ret);
+ } while ((i = next_desc(vq, &desc)) != -1);
+ return 0;
+}
+
+/* This function returns a value > 0 if a descriptor was found, or 0 if none were found.
+ * A negative code is returned on error. */
+static int fetch_descs(struct vhost_virtqueue *vq)
+{
+ unsigned int i, head, found = 0;
+ struct vhost_desc *last;
+ struct vring_desc desc;
+ __virtio16 avail_idx;
+ __virtio16 ring_head;
+ u16 last_avail_idx;
+ int ret;
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail_idx = vq->last_avail_idx;
+
+ if (vq->avail_idx == vq->last_avail_idx) {
+ if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
+ vq_err(vq, "Failed to access avail idx at %p\n",
+ &vq->avail->idx);
+ return -EFAULT;
+ }
+ vq->avail_idx = vhost16_to_cpu(vq, avail_idx);
+
+ if (unlikely((u16)(vq->avail_idx - last_avail_idx) > vq->num)) {
+ vq_err(vq, "Guest moved used index from %u to %u",
+ last_avail_idx, vq->avail_idx);
+ return -EFAULT;
+ }
+
+ /* If there's nothing new since last we looked, return
+ * invalid.
+ */
+ if (vq->avail_idx == last_avail_idx)
+ return 0;
+
+ /* Only get avail ring entries after they have been
+ * exposed by guest.
+ */
+ smp_rmb();
+ }
+
+ /* Grab the next descriptor number they're advertising */
+ if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) {
+ vq_err(vq, "Failed to read head: idx %d address %p\n",
+ last_avail_idx,
+ &vq->avail->ring[last_avail_idx % vq->num]);
+ return -EFAULT;
+ }
+
+ head = vhost16_to_cpu(vq, ring_head);
+
+ /* If their number is silly, that's an error. */
+ if (unlikely(head >= vq->num)) {
+ vq_err(vq, "Guest says index %u > %u is available",
+ head, vq->num);
+ return -EINVAL;
+ }
+
+ i = head;
+ do {
+ if (unlikely(i >= vq->num)) {
+ vq_err(vq, "Desc index is %u > %u, head = %u",
+ i, vq->num, head);
+ return -EINVAL;
+ }
+ if (unlikely(++found > vq->num)) {
+ vq_err(vq, "Loop detected: last one at %u "
+ "vq size %u head %u\n",
+ i, vq->num, head);
+ return -EINVAL;
+ }
+ ret = vhost_get_desc(vq, &desc, i);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
+ i, vq->desc + i);
+ return -EFAULT;
+ }
+ ret = push_split_desc(vq, &desc, head);
+ if (unlikely(ret)) {
+ vq_err(vq, "Failed to save descriptor: idx %d\n", i);
+ return -EINVAL;
+ }
+ } while ((i = next_desc(vq, &desc)) != -1);
+
+ last = peek_split_desc(vq);
+ if (unlikely(last->flags & VRING_DESC_F_INDIRECT)) {
+ pop_split_desc(vq);
+ ret = fetch_indirect_descs(vq, last, head);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Failure detected "
+ "in indirect descriptor at idx %d\n", head);
+ return ret;
+ }
+ }
+
+ /* Assume notifications from guest are disabled at this point,
+ * if they aren't we would need to update avail_event index. */
+ BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY));
+
+ /* On success, increment avail index. */
+ vq->last_avail_idx++;
+
+ return 1;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found. A negative code is
+ * returned on error. */
+int vhost_get_vq_desc_batch(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ int ret = fetch_descs(vq);
+ int i;
+
+ if (ret <= 0)
+ goto err_fetch;
+
+ /* Now convert to IOV */
+ /* When we start there are none of either input nor output. */
+ *out_num = *in_num = 0;
+ if (unlikely(log))
+ *log_num = 0;
+
+ for (i = 0; i < vq->ndescs; ++i) {
+ unsigned iov_count = *in_num + *out_num;
+ struct vhost_desc *desc = &vq->descs[i];
+ int access;
+
+ if (desc->flags & ~VHOST_DESC_FLAGS) {
+ vq_err(vq, "Unexpected flags: 0x%x at descriptor id 0x%x\n",
+ desc->flags, desc->id);
+ ret = -EINVAL;
+ goto err;
+ }
+ if (desc->flags & VRING_DESC_F_WRITE)
+ access = VHOST_ACCESS_WO;
+ else
+ access = VHOST_ACCESS_RO;
+ ret = translate_desc(vq, desc->addr,
+ desc->len, iov + iov_count,
+ iov_size - iov_count, access);
+ if (unlikely(ret < 0)) {
+ if (ret != -EAGAIN)
+ vq_err(vq, "Translation failure %d descriptor idx %d\n",
+ ret, i);
+ goto err;
+ }
+ if (access == VHOST_ACCESS_WO) {
+ /* If this is an input descriptor,
+ * increment that count. */
+ *in_num += ret;
+ if (unlikely(log && ret)) {
+ log[*log_num].addr = desc->addr;
+ log[*log_num].len = desc->len;
+ ++*log_num;
+ }
+ } else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors. */
+ if (unlikely(*in_num)) {
+ vq_err(vq, "Descriptor has out after in: "
+ "idx %d\n", i);
+ ret = -EINVAL;
+ goto err;
+ }
+ *out_num += ret;
+ }
+
+ ret = desc->id;
+ }
+
+ vq->ndescs = 0;
+
+ return ret;
+
+err:
+ vhost_discard_vq_desc(vq, 1);
+err_fetch:
+ vq->ndescs = 0;
+
+ return ret ? ret : vq->num;
+}
+EXPORT_SYMBOL_GPL(vhost_get_vq_desc_batch);
+
/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
{
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index c8e96a095d3b..87089d51490d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -60,6 +60,13 @@ enum vhost_uaddr_type {
VHOST_NUM_ADDRS = 3,
};

+struct vhost_desc {
+ u64 addr;
+ u32 len;
+ u16 flags; /* VRING_DESC_F_WRITE, VRING_DESC_F_NEXT */
+ u16 id;
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -71,6 +78,11 @@ struct vhost_virtqueue {
vring_avail_t __user *avail;
vring_used_t __user *used;
const struct vhost_iotlb_map *meta_iotlb[VHOST_NUM_ADDRS];
+
+ struct vhost_desc *descs;
+ int ndescs;
+ int max_descs;
+
struct file *kick;
struct eventfd_ctx *call_ctx;
struct eventfd_ctx *error_ctx;
@@ -177,6 +189,10 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);

+int vhost_get_vq_desc_batch(struct vhost_virtqueue *,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num);
int vhost_get_vq_desc(struct vhost_virtqueue *,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
--
MST

2020-06-11 11:39:47

[permalink] [raw]

Subject: [PATCH RFC v8 07/11] vhost/net: avoid iov length math

Now that API exposes buffer length, we no longer need to
scan IOVs to figure it out.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/net.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 830fe84912a5..0b509be8d7b1 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -607,11 +607,9 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
}

static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
- size_t hdr_size, int out)
+ size_t len, size_t hdr_size, int out)
{
/* Skip header. TODO: support TSO. */
- size_t len = iov_length(vq->iov, out);
-
iov_iter_init(iter, WRITE, vq->iov, out, len);
iov_iter_advance(iter, hdr_size);

@@ -640,7 +638,7 @@ static int get_tx_bufs(struct vhost_net *net,
}

/* Sanity check */
- *len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
+ *len = init_iov_iter(vq, &msg->msg_iter, buf->out_len, nvq->vhost_hlen, *out);
if (*len == 0) {
vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
*len, nvq->vhost_hlen);
@@ -1080,7 +1078,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
nlogs += *log_num;
log += *log_num;
}
- len = iov_length(vq->iov + seg, in);
+ len = bufs[bufcount].in_len;
datalen -= len;
++bufcount;
seg += in;
--
MST

2020-06-11 11:40:09

[permalink] [raw]

Subject: [PATCH RFC v8 11/11] vhost: drop head based APIs

Everyone's using buf APIs, no need for head based ones anymore.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/vhost.c | 64 ++++++-------------------------------------
drivers/vhost/vhost.h | 12 --------
2 files changed, 8 insertions(+), 68 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 03e6bca02288..9096bd291c91 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2425,39 +2425,11 @@ EXPORT_SYMBOL_GPL(vhost_get_avail_buf);
void vhost_discard_avail_bufs(struct vhost_virtqueue *vq,
struct vhost_buf *buf, unsigned count)
{
- vhost_discard_vq_desc(vq, count);
+ unfetch_descs(vq);
+ vq->last_avail_idx -= count;
}
EXPORT_SYMBOL_GPL(vhost_discard_avail_bufs);

-/* This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
-{
- struct vhost_buf buf;
- int ret = vhost_get_avail_buf(vq, &buf,
- iov, iov_size, out_num, in_num,
- log, log_num);
-
- if (likely(ret > 0))
- return buf->id;
- if (likely(!ret))
- return vq->num;
- return ret;
-}
-EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
-
-/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */
-void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
-{
- unfetch_descs(vq);
- vq->last_avail_idx -= n;
-}
-EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);
-
static int __vhost_add_used_n(struct vhost_virtqueue *vq,
struct vring_used_elem *heads,
unsigned count)
@@ -2490,8 +2462,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
return 0;
}

-/* After we've used one of their buffers, we tell them about it. We'll then
- * want to notify the guest, using eventfd. */
+static
int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
unsigned count)
{
@@ -2525,10 +2496,8 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
}
return r;
}
-EXPORT_SYMBOL_GPL(vhost_add_used_n);

-/* After we've used one of their buffers, we tell them about it. We'll then
- * want to notify the guest, using eventfd. */
+static
int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
{
struct vring_used_elem heads = {
@@ -2538,14 +2507,17 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)

return vhost_add_used_n(vq, &heads, 1);
}
-EXPORT_SYMBOL_GPL(vhost_add_used);

+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using vhost_signal. */
int vhost_put_used_buf(struct vhost_virtqueue *vq, struct vhost_buf *buf)
{
return vhost_add_used(vq, buf->id, buf->in_len);
}
EXPORT_SYMBOL_GPL(vhost_put_used_buf);

+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using vhost_signal. */
int vhost_put_used_n_bufs(struct vhost_virtqueue *vq,
struct vhost_buf *bufs, unsigned count)
{
@@ -2606,26 +2578,6 @@ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
}
EXPORT_SYMBOL_GPL(vhost_signal);

-/* And here's the combo meal deal. Supersize me! */
-void vhost_add_used_and_signal(struct vhost_dev *dev,
- struct vhost_virtqueue *vq,
- unsigned int head, int len)
-{
- vhost_add_used(vq, head, len);
- vhost_signal(dev, vq);
-}
-EXPORT_SYMBOL_GPL(vhost_add_used_and_signal);
-
-/* multi-buffer version of vhost_add_used_and_signal */
-void vhost_add_used_and_signal_n(struct vhost_dev *dev,
- struct vhost_virtqueue *vq,
- struct vring_used_elem *heads, unsigned count)
-{
- vhost_add_used_n(vq, heads, count);
- vhost_signal(dev, vq);
-}
-EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n);
-
/* return true if we're sure that avaiable ring is empty */
bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 28eea0155efb..264a2a2fae97 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -197,11 +197,6 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *arg
bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
bool vhost_log_access_ok(struct vhost_dev *);

-int vhost_get_vq_desc(struct vhost_virtqueue *,
- struct iovec iov[], unsigned int iov_count,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num);
-void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
int vhost_get_avail_buf(struct vhost_virtqueue *, struct vhost_buf *buf,
struct iovec iov[], unsigned int iov_count,
unsigned int *out_num, unsigned int *in_num,
@@ -209,13 +204,6 @@ int vhost_get_avail_buf(struct vhost_virtqueue *, struct vhost_buf *buf,
void vhost_discard_avail_bufs(struct vhost_virtqueue *,
struct vhost_buf *, unsigned count);
int vhost_vq_init_access(struct vhost_virtqueue *);
-int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
-int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
- unsigned count);
-void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
- unsigned int id, int len);
-void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
- struct vring_used_elem *heads, unsigned count);
int vhost_put_used_buf(struct vhost_virtqueue *, struct vhost_buf *buf);
int vhost_put_used_n_bufs(struct vhost_virtqueue *,
struct vhost_buf *bufs, unsigned count);
--
MST

2020-06-11 11:40:12

[permalink] [raw]

Subject: [PATCH RFC v8 09/11] vhost/scsi: switch to buf APIs

Switch to buf APIs. Doing this exposes a spec violation in vhost scsi:
all used bufs are marked with length 0.
Fix that is left for another day.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/scsi.c | 73 ++++++++++++++++++++++++++------------------
1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 0cbaa0b3893d..a5cdd4c01a3a 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -71,8 +71,8 @@ struct vhost_scsi_inflight {
};

struct vhost_scsi_cmd {
- /* Descriptor from vhost_get_vq_desc() for virt_queue segment */
- int tvc_vq_desc;
+ /* Descriptor from vhost_get_avail_buf() for virt_queue segment */
+ struct vhost_buf tvc_vq_desc;
/* virtio-scsi initiator task attribute */
int tvc_task_attr;
/* virtio-scsi response incoming iovecs */
@@ -213,7 +213,7 @@ struct vhost_scsi {
* Context for processing request and control queue operations.
*/
struct vhost_scsi_ctx {
- int head;
+ struct vhost_buf buf;
unsigned int out, in;
size_t req_size, rsp_size;
size_t out_size, in_size;
@@ -443,6 +443,20 @@ static int vhost_scsi_check_stop_free(struct se_cmd *se_cmd)
return target_put_sess_cmd(se_cmd);
}

+/* Signal to guest that request finished with no input buffer. */
+/* TODO calling this when writing into buffer and most likely a bug */
+static void vhost_scsi_signal_noinput(struct vhost_dev *vdev,
+ struct vhost_virtqueue *vq,
+ struct vhost_buf *bufp)
+{
+ struct vhost_buf buf = *bufp;
+
+ buf.in_len = 0;
+ vhost_put_used_buf(vq, &buf);
+ vhost_signal(vdev, vq);
+}
+
+
static void
vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
{
@@ -450,7 +464,8 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
struct virtio_scsi_event *event = &evt->event;
struct virtio_scsi_event __user *eventp;
unsigned out, in;
- int head, ret;
+ struct vhost_buf buf;
+ int ret;

if (!vhost_vq_get_backend(vq)) {
vs->vs_events_missed = true;
@@ -459,14 +474,14 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)

again:
vhost_disable_notify(&vs->dev, vq);
- head = vhost_get_vq_desc(vq, vq->iov,
- ARRAY_SIZE(vq->iov), &out, &in,
- NULL, NULL);
- if (head < 0) {
+ ret = vhost_get_avail_buf(vq, &buf,
+ vq->iov, ARRAY_SIZE(vq->iov), &out, &in,
+ NULL, NULL);
+ if (ret < 0) {
vs->vs_events_missed = true;
return;
}
- if (head == vq->num) {
+ if (!ret) {
if (vhost_enable_notify(&vs->dev, vq))
goto again;
vs->vs_events_missed = true;
@@ -488,7 +503,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt)
eventp = vq->iov[out].iov_base;
ret = __copy_to_user(eventp, event, sizeof(*event));
if (!ret)
- vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+ vhost_scsi_signal_noinput(&vs->dev, vq, &buf);
else
vq_err(vq, "Faulted on vhost_scsi_send_event\n");
}
@@ -549,7 +564,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
if (likely(ret == sizeof(v_rsp))) {
struct vhost_scsi_virtqueue *q;
- vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
+ vhost_put_used_buf(cmd->tvc_vq, &cmd->tvc_vq_desc);
q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
vq = q - vs->vqs;
__set_bit(vq, signal);
@@ -793,7 +808,7 @@ static void vhost_scsi_submission_work(struct work_struct *work)
static void
vhost_scsi_send_bad_target(struct vhost_scsi *vs,
struct vhost_virtqueue *vq,
- int head, unsigned out)
+ struct vhost_buf *buf, unsigned out)
{
struct virtio_scsi_cmd_resp __user *resp;
struct virtio_scsi_cmd_resp rsp;
@@ -804,7 +819,7 @@ vhost_scsi_send_bad_target(struct vhost_scsi *vs,
resp = vq->iov[out].iov_base;
ret = __copy_to_user(resp, &rsp, sizeof(rsp));
if (!ret)
- vhost_add_used_and_signal(&vs->dev, vq, head, 0);
+ vhost_scsi_signal_noinput(&vs->dev, vq, buf);
else
pr_err("Faulted on virtio_scsi_cmd_resp\n");
}
@@ -813,21 +828,21 @@ static int
vhost_scsi_get_desc(struct vhost_scsi *vs, struct vhost_virtqueue *vq,
struct vhost_scsi_ctx *vc)
{
- int ret = -ENXIO;
+ int r, ret = -ENXIO;

- vc->head = vhost_get_vq_desc(vq, vq->iov,
- ARRAY_SIZE(vq->iov), &vc->out, &vc->in,
- NULL, NULL);
+ r = vhost_get_avail_buf(vq, &vc->buf,
+ vq->iov, ARRAY_SIZE(vq->iov), &vc->out, &vc->in,
+ NULL, NULL);

- pr_debug("vhost_get_vq_desc: head: %d, out: %u in: %u\n",
- vc->head, vc->out, vc->in);
+ pr_debug("vhost_get_avail_buf: buf: %d, out: %u in: %u\n",
+ vc->buf.id, vc->out, vc->in);

/* On error, stop handling until the next kick. */
- if (unlikely(vc->head < 0))
+ if (unlikely(r < 0))
goto done;

/* Nothing new? Wait for eventfd to tell us they refilled. */
- if (vc->head == vq->num) {
+ if (!r) {
if (unlikely(vhost_enable_notify(&vs->dev, vq))) {
vhost_disable_notify(&vs->dev, vq);
ret = -EAGAIN;
@@ -1093,11 +1108,11 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
}
}
/*
- * Save the descriptor from vhost_get_vq_desc() to be used to
+ * Save the descriptor from vhost_get_avail_buf() to be used to
* complete the virtio-scsi request in TCM callback context via
* vhost_scsi_queue_data_in() and vhost_scsi_queue_status()
*/
- cmd->tvc_vq_desc = vc.head;
+ cmd->tvc_vq_desc = vc.buf;
/*
* Dispatch cmd descriptor for cmwq execution in process
* context provided by vhost_scsi_workqueue. This also ensures
@@ -1117,7 +1132,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
if (ret == -ENXIO)
break;
else if (ret == -EIO)
- vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
+ vhost_scsi_send_bad_target(vs, vq, &vc.buf, vc.out);
} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
out:
mutex_unlock(&vq->mutex);
@@ -1139,9 +1154,9 @@ vhost_scsi_send_tmf_reject(struct vhost_scsi *vs,
iov_iter_init(&iov_iter, READ, &vq->iov[vc->out], vc->in, sizeof(rsp));

ret = copy_to_iter(&rsp, sizeof(rsp), &iov_iter);
- if (likely(ret == sizeof(rsp)))
- vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0);
- else
+ if (likely(ret == sizeof(rsp))) {
+ vhost_scsi_signal_noinput(&vs->dev, vq, &vc->buf);
+ } else
pr_err("Faulted on virtio_scsi_ctrl_tmf_resp\n");
}

@@ -1162,7 +1177,7 @@ vhost_scsi_send_an_resp(struct vhost_scsi *vs,

ret = copy_to_iter(&rsp, sizeof(rsp), &iov_iter);
if (likely(ret == sizeof(rsp)))
- vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0);
+ vhost_scsi_signal_noinput(&vs->dev, vq, &vc->buf);
else
pr_err("Faulted on virtio_scsi_ctrl_an_resp\n");
}
@@ -1269,7 +1284,7 @@ vhost_scsi_ctl_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
if (ret == -ENXIO)
break;
else if (ret == -EIO)
- vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out);
+ vhost_scsi_send_bad_target(vs, vq, &vc.buf, vc.out);
} while (likely(!vhost_exceeds_weight(vq, ++c, 0)));
out:
mutex_unlock(&vq->mutex);
--
MST

2020-06-11 11:41:05

[permalink] [raw]

Subject: [PATCH RFC v8 04/11] vhost: reorder functions

Reorder functions in the file to not rely on forward
declarations, in preparation to making them static
down the road.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/vhost.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dfcdb36d4227..c38605b01080 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2425,19 +2425,6 @@ void vhost_discard_vq_desc(struct vhost_virtqueue *vq, int n)
}
EXPORT_SYMBOL_GPL(vhost_discard_vq_desc);

-/* After we've used one of their buffers, we tell them about it. We'll then
- * want to notify the guest, using eventfd. */
-int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
-{
- struct vring_used_elem heads = {
- cpu_to_vhost32(vq, head),
- cpu_to_vhost32(vq, len)
- };
-
- return vhost_add_used_n(vq, &heads, 1);
-}
-EXPORT_SYMBOL_GPL(vhost_add_used);
-
static int __vhost_add_used_n(struct vhost_virtqueue *vq,
struct vring_used_elem *heads,
unsigned count)
@@ -2507,6 +2494,19 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
}
EXPORT_SYMBOL_GPL(vhost_add_used_n);

+/* After we've used one of their buffers, we tell them about it. We'll then
+ * want to notify the guest, using eventfd. */
+int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
+{
+ struct vring_used_elem heads = {
+ cpu_to_vhost32(vq, head),
+ cpu_to_vhost32(vq, len)
+ };
+
+ return vhost_add_used_n(vq, &heads, 1);
+}
+EXPORT_SYMBOL_GPL(vhost_add_used);
+
static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
__u16 old, new;
--
MST

2020-06-11 15:24:57

by Konrad Rzeszutek Wilk

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> As testing shows no performance change, switch to that now.

What kind of testing? 100GiB? Low latency?

2020-06-11 18:47:18

[permalink] [raw]

Subject: [PATCH RFC v8 05/11] vhost: format-independent API for used buffers

Add a new API that doesn't assume used ring, heads, etc.
For now, we keep the old APIs around to make it easier
to convert drivers.

Signed-off-by: Michael S. Tsirkin <[email protected]>
---
drivers/vhost/vhost.c | 73 +++++++++++++++++++++++++++++++++++++------
drivers/vhost/vhost.h | 17 +++++++++-
2 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c38605b01080..03e6bca02288 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2335,13 +2335,12 @@ static void unfetch_descs(struct vhost_virtqueue *vq)
* number of output then some number of input descriptors, it's actually two
* iovecs, but we pack them into one and note how many of each there were.
*
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error. */
-int vhost_get_vq_desc(struct vhost_virtqueue *vq,
- struct iovec iov[], unsigned int iov_size,
- unsigned int *out_num, unsigned int *in_num,
- struct vhost_log *log, unsigned int *log_num)
+ * This function returns a value > 0 if a descriptor was found, or 0 if none were found.
+ * A negative code is returned on error. */
+int vhost_get_avail_buf(struct vhost_virtqueue *vq, struct vhost_buf *buf,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
{
int ret = fetch_descs(vq);
int i;
@@ -2354,6 +2353,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
*out_num = *in_num = 0;
if (unlikely(log))
*log_num = 0;
+ buf->in_len = buf->out_len = 0;
+ buf->descs = 0;

for (i = vq->first_desc; i < vq->ndescs; ++i) {
unsigned iov_count = *in_num + *out_num;
@@ -2383,6 +2384,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
/* If this is an input descriptor,
* increment that count. */
*in_num += ret;
+ buf->in_len += desc->len;
if (unlikely(log && ret)) {
log[*log_num].addr = desc->addr;
log[*log_num].len = desc->len;
@@ -2398,9 +2400,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
goto err;
}
*out_num += ret;
+ buf->out_len += desc->len;
}

- ret = desc->id;
+ buf->id = desc->id;
+ ++buf->descs;

if (!(desc->flags & VRING_DESC_F_NEXT))
break;
@@ -2408,12 +2412,41 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,

vq->first_desc = i + 1;

- return ret;
+ return 1;

err:
unfetch_descs(vq);

- return ret ? ret : vq->num;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vhost_get_avail_buf);
+
+/* Reverse the effect of vhost_get_avail_buf. Useful for error handling. */
+void vhost_discard_avail_bufs(struct vhost_virtqueue *vq,
+ struct vhost_buf *buf, unsigned count)
+{
+ vhost_discard_vq_desc(vq, count);
+}
+EXPORT_SYMBOL_GPL(vhost_discard_avail_bufs);
+
+/* This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found. A negative code is
+ * returned on error. */
+int vhost_get_vq_desc(struct vhost_virtqueue *vq,
+ struct iovec iov[], unsigned int iov_size,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num)
+{
+ struct vhost_buf buf;
+ int ret = vhost_get_avail_buf(vq, &buf,
+ iov, iov_size, out_num, in_num,
+ log, log_num);
+
+ if (likely(ret > 0))
+ return buf->id;
+ if (likely(!ret))
+ return vq->num;
+ return ret;
}
EXPORT_SYMBOL_GPL(vhost_get_vq_desc);

@@ -2507,6 +2540,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
}
EXPORT_SYMBOL_GPL(vhost_add_used);

+int vhost_put_used_buf(struct vhost_virtqueue *vq, struct vhost_buf *buf)
+{
+ return vhost_add_used(vq, buf->id, buf->in_len);
+}
+EXPORT_SYMBOL_GPL(vhost_put_used_buf);
+
+int vhost_put_used_n_bufs(struct vhost_virtqueue *vq,
+ struct vhost_buf *bufs, unsigned count)
+{
+ unsigned i;
+
+ for (i = 0; i < count; ++i) {
+ vq->heads[i].id = cpu_to_vhost32(vq, bufs[i].id);
+ vq->heads[i].len = cpu_to_vhost32(vq, bufs[i].in_len);
+ }
+
+ return vhost_add_used_n(vq, vq->heads, count);
+}
+EXPORT_SYMBOL_GPL(vhost_put_used_n_bufs);
+
static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
__u16 old, new;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index fed36af5c444..28eea0155efb 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -67,6 +67,13 @@ struct vhost_desc {
u16 id;
};

+struct vhost_buf {
+ u32 out_len;
+ u32 in_len;
+ u16 descs;
+ u16 id;
+};
+
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -195,7 +202,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
unsigned int *out_num, unsigned int *in_num,
struct vhost_log *log, unsigned int *log_num);
void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
-
+int vhost_get_avail_buf(struct vhost_virtqueue *, struct vhost_buf *buf,
+ struct iovec iov[], unsigned int iov_count,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vhost_log *log, unsigned int *log_num);
+void vhost_discard_avail_bufs(struct vhost_virtqueue *,
+ struct vhost_buf *, unsigned count);
int vhost_vq_init_access(struct vhost_virtqueue *);
int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
@@ -204,6 +216,9 @@ void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
unsigned int id, int len);
void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
struct vring_used_elem *heads, unsigned count);
+int vhost_put_used_buf(struct vhost_virtqueue *, struct vhost_buf *buf);
+int vhost_put_used_n_bufs(struct vhost_virtqueue *,
+ struct vhost_buf *bufs, unsigned count);
void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
--
MST

2020-06-15 16:11:39

[permalink] [raw]

Subject: Re: [PATCH RFC v8 03/11] vhost/net: pass net specific struct pointer

On Thu, Jun 11, 2020 at 1:34 PM Michael S. Tsirkin <[email protected]> wrote:
>
> In preparation for further cleanup, pass net specific pointer
> to ubuf callbacks so we can move net specific fields
> out to net structures.
>
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> ---
> drivers/vhost/net.c | 14 +++++++-------
> 1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index bf5e1d81ae25..ff594eec8ae3 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -94,7 +94,7 @@ struct vhost_net_ubuf_ref {
> */
> atomic_t refcount;
> wait_queue_head_t wait;
> - struct vhost_virtqueue *vq;
> + struct vhost_net_virtqueue *nvq;
> };
>
> #define VHOST_NET_BATCH 64
> @@ -231,7 +231,7 @@ static void vhost_net_enable_zcopy(int vq)
> }
>
> static struct vhost_net_ubuf_ref *
> -vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
> +vhost_net_ubuf_alloc(struct vhost_net_virtqueue *nvq, bool zcopy)
> {
> struct vhost_net_ubuf_ref *ubufs;
> /* No zero copy backend? Nothing to count. */
> @@ -242,7 +242,7 @@ vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
> return ERR_PTR(-ENOMEM);
> atomic_set(&ubufs->refcount, 1);
> init_waitqueue_head(&ubufs->wait);
> - ubufs->vq = vq;
> + ubufs->nvq = nvq;
> return ubufs;
> }
>
> @@ -384,13 +384,13 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
> static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
> {
> struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
> - struct vhost_virtqueue *vq = ubufs->vq;
> + struct vhost_net_virtqueue *nvq = ubufs->nvq;
> int cnt;
>
> rcu_read_lock_bh();
>
> /* set len to mark this desc buffers done DMA */
> - vq->heads[ubuf->desc].len = success ?
> + nvq->vq.heads[ubuf->desc].in_len = success ?

This change should access .len, not .in_len, until patch 6 (net:
convert to new API) in this series. Not very important, but make
easier to debug these intermediate commits.

Thanks!

> VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
> cnt = vhost_net_ubuf_put(ubufs);
>
> @@ -402,7 +402,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
> * less than 10% of times).
> */
> if (cnt <= 1 || !(cnt % 16))
> - vhost_poll_queue(&vq->poll);
> + vhost_poll_queue(&nvq->vq.poll);
>
> rcu_read_unlock_bh();
> }
> @@ -1525,7 +1525,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
> /* start polling new socket */
> oldsock = vhost_vq_get_backend(vq);
> if (sock != oldsock) {
> - ubufs = vhost_net_ubuf_alloc(vq,
> + ubufs = vhost_net_ubuf_alloc(nvq,
> sock && vhost_sock_zcopy(sock));
> if (IS_ERR(ubufs)) {
> r = PTR_ERR(ubufs);
> --
> MST
>

2020-06-15 16:14:51

[permalink] [raw]

Subject: Re: [PATCH RFC v8 05/11] vhost: format-independent API for used buffers

On Thu, Jun 11, 2020 at 1:34 PM Michael S. Tsirkin <[email protected]> wrote:
>
> Add a new API that doesn't assume used ring, heads, etc.
> For now, we keep the old APIs around to make it easier
> to convert drivers.
>
> Signed-off-by: Michael S. Tsirkin <[email protected]>
> ---
> drivers/vhost/vhost.c | 73 +++++++++++++++++++++++++++++++++++++------
> drivers/vhost/vhost.h | 17 +++++++++-
> 2 files changed, 79 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index c38605b01080..03e6bca02288 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -2335,13 +2335,12 @@ static void unfetch_descs(struct vhost_virtqueue *vq)
> * number of output then some number of input descriptors, it's actually two
> * iovecs, but we pack them into one and note how many of each there were.
> *
> - * This function returns the descriptor number found, or vq->num (which is
> - * never a valid descriptor number) if none was found. A negative code is
> - * returned on error. */
> -int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> - struct iovec iov[], unsigned int iov_size,
> - unsigned int *out_num, unsigned int *in_num,
> - struct vhost_log *log, unsigned int *log_num)
> + * This function returns a value > 0 if a descriptor was found, or 0 if none were found.
> + * A negative code is returned on error. */
> +int vhost_get_avail_buf(struct vhost_virtqueue *vq, struct vhost_buf *buf,
> + struct iovec iov[], unsigned int iov_size,
> + unsigned int *out_num, unsigned int *in_num,
> + struct vhost_log *log, unsigned int *log_num)
> {
> int ret = fetch_descs(vq);
> int i;
> @@ -2354,6 +2353,8 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> *out_num = *in_num = 0;
> if (unlikely(log))
> *log_num = 0;
> + buf->in_len = buf->out_len = 0;
> + buf->descs = 0;
>
> for (i = vq->first_desc; i < vq->ndescs; ++i) {
> unsigned iov_count = *in_num + *out_num;
> @@ -2383,6 +2384,7 @@ int (struct vhost_virtqueue *vq,
> /* If this is an input descriptor,
> * increment that count. */
> *in_num += ret;
> + buf->in_len += desc->len;
> if (unlikely(log && ret)) {
> log[*log_num].addr = desc->addr;
> log[*log_num].len = desc->len;
> @@ -2398,9 +2400,11 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> goto err;
> }
> *out_num += ret;
> + buf->out_len += desc->len;
> }
>
> - ret = desc->id;
> + buf->id = desc->id;
> + ++buf->descs;
>
> if (!(desc->flags & VRING_DESC_F_NEXT))
> break;
> @@ -2408,12 +2412,41 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
>
> vq->first_desc = i + 1;
>
> - return ret;
> + return 1;
>
> err:
> unfetch_descs(vq);
>
> - return ret ? ret : vq->num;
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(vhost_get_avail_buf);
> +
> +/* Reverse the effect of vhost_get_avail_buf. Useful for error handling. */
> +void vhost_discard_avail_bufs(struct vhost_virtqueue *vq,
> + struct vhost_buf *buf, unsigned count)
> +{
> + vhost_discard_vq_desc(vq, count);
> +}
> +EXPORT_SYMBOL_GPL(vhost_discard_avail_bufs);
> +
> +/* This function returns the descriptor number found, or vq->num (which is
> + * never a valid descriptor number) if none was found. A negative code is
> + * returned on error. */
> +int vhost_get_vq_desc(struct vhost_virtqueue *vq,
> + struct iovec iov[], unsigned int iov_size,
> + unsigned int *out_num, unsigned int *in_num,
> + struct vhost_log *log, unsigned int *log_num)
> +{
> + struct vhost_buf buf;
> + int ret = vhost_get_avail_buf(vq, &buf,
> + iov, iov_size, out_num, in_num,
> + log, log_num);
> +
> + if (likely(ret > 0))
> + return buf->id;

This should be buf.id, isn't it?

> + if (likely(!ret))
> + return vq->num;
> + return ret;
> }
> EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
>
> @@ -2507,6 +2540,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len)
> }
> EXPORT_SYMBOL_GPL(vhost_add_used);
>
> +int vhost_put_used_buf(struct vhost_virtqueue *vq, struct vhost_buf *buf)
> +{
> + return vhost_add_used(vq, buf->id, buf->in_len);
> +}
> +EXPORT_SYMBOL_GPL(vhost_put_used_buf);
> +
> +int vhost_put_used_n_bufs(struct vhost_virtqueue *vq,
> + struct vhost_buf *bufs, unsigned count)
> +{
> + unsigned i;
> +
> + for (i = 0; i < count; ++i) {
> + vq->heads[i].id = cpu_to_vhost32(vq, bufs[i].id);
> + vq->heads[i].len = cpu_to_vhost32(vq, bufs[i].in_len);
> + }
> +
> + return vhost_add_used_n(vq, vq->heads, count);
> +}
> +EXPORT_SYMBOL_GPL(vhost_put_used_n_bufs);
> +
> static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> {
> __u16 old, new;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index fed36af5c444..28eea0155efb 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -67,6 +67,13 @@ struct vhost_desc {
> u16 id;
> };
>
> +struct vhost_buf {
> + u32 out_len;
> + u32 in_len;
> + u16 descs;
> + u16 id;
> +};
> +
> /* The virtqueue structure describes a queue attached to a device. */
> struct vhost_virtqueue {
> struct vhost_dev *dev;
> @@ -195,7 +202,12 @@ int vhost_get_vq_desc(struct vhost_virtqueue *,
> unsigned int *out_num, unsigned int *in_num,
> struct vhost_log *log, unsigned int *log_num);
> void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
> -
> +int vhost_get_avail_buf(struct vhost_virtqueue *, struct vhost_buf *buf,
> + struct iovec iov[], unsigned int iov_count,
> + unsigned int *out_num, unsigned int *in_num,
> + struct vhost_log *log, unsigned int *log_num);
> +void vhost_discard_avail_bufs(struct vhost_virtqueue *,
> + struct vhost_buf *, unsigned count);
> int vhost_vq_init_access(struct vhost_virtqueue *);
> int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len);
> int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
> @@ -204,6 +216,9 @@ void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *,
> unsigned int id, int len);
> void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *,
> struct vring_used_elem *heads, unsigned count);
> +int vhost_put_used_buf(struct vhost_virtqueue *, struct vhost_buf *buf);
> +int vhost_put_used_n_bufs(struct vhost_virtqueue *,
> + struct vhost_buf *bufs, unsigned count);
> void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *);
> void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *);
> bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);
> --
> MST
>

2020-06-15 17:05:16

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
<[email protected]> wrote:
>
> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > As testing shows no performance change, switch to that now.
>
> What kind of testing? 100GiB? Low latency?
>

Hi Konrad.

I tested this version of the patch:
https://lkml.org/lkml/2019/10/13/42

It was tested for throughput with DPDK's testpmd (as described in
http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
and kernel pktgen. No latency tests were performed by me. Maybe it is
interesting to perform a latency test or just a different set of tests
over a recent version.

Thanks!

2020-06-17 03:22:01

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/6/11 下午7:34, Michael S. Tsirkin wrote:
> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> {
> kfree(vq->descs);
> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> for (i = 0; i < dev->nvqs; ++i) {
> vq = dev->vqs[i];
> vq->max_descs = dev->iov_limit;
> + if (vhost_vq_num_batch_descs(vq) < 0) {
> + return -EINVAL;
> + }

This check breaks vdpa which set iov_limit to zero. Consider iov_limit
is meaningless to vDPA, I wonder we can skip the test when device
doesn't use worker.

Thanks

2020-06-20 04:19:15

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Wed, Jun 17, 2020 at 5:19 AM Jason Wang <[email protected]> wrote:
>
>
> On 2020/6/11 下午7:34, Michael S. Tsirkin wrote:
> > static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> > {
> > kfree(vq->descs);
> > @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> > for (i = 0; i < dev->nvqs; ++i) {
> > vq = dev->vqs[i];
> > vq->max_descs = dev->iov_limit;
> > + if (vhost_vq_num_batch_descs(vq) < 0) {
> > + return -EINVAL;
> > + }
>
>
> This check breaks vdpa which set iov_limit to zero. Consider iov_limit
> is meaningless to vDPA, I wonder we can skip the test when device
> doesn't use worker.

I tested as

if (dev->use_worker && vhost_vq_num_batch_descs(vq) < 0)

In v9. Please let me know if that is ok for you.

Thanks!

>
> Thanks
>

2020-06-20 04:21:23

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
<[email protected]> wrote:
>
> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> <[email protected]> wrote:
> >
> > On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > > As testing shows no performance change, switch to that now.
> >
> > What kind of testing? 100GiB? Low latency?
> >
>
> Hi Konrad.
>
> I tested this version of the patch:
> https://lkml.org/lkml/2019/10/13/42
>
> It was tested for throughput with DPDK's testpmd (as described in
> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> and kernel pktgen. No latency tests were performed by me. Maybe it is
> interesting to perform a latency test or just a different set of tests
> over a recent version.
>
> Thanks!

I have repeated the tests with v9, and results are a little bit different:
* If I test opening it with testpmd, I see no change between versions
* If I forward packets between two vhost-net interfaces in the guest
using a linux bridge in the host:
- netperf UDP_STREAM shows a performance increase of 1.8, almost
doubling performance. This gets lower as frame size increase.
- rests of the test goes noticeably worse: UDP_RR goes from ~6347
transactions/sec to 5830
- TCP_STREAM goes from ~10.7 gbps to ~7Gbps
- TCP_RR from 6223.64 transactions/sec to 5739.44

2020-06-20 04:25:20

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Fri, Jun 19, 2020 at 8:07 PM Eugenio Perez Martin
<[email protected]> wrote:
>
> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> <[email protected]> wrote:
> >
> > On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> > <[email protected]> wrote:
> > >
> > > On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > > > As testing shows no performance change, switch to that now.
> > >
> > > What kind of testing? 100GiB? Low latency?
> > >
> >
> > Hi Konrad.
> >
> > I tested this version of the patch:
> > https://lkml.org/lkml/2019/10/13/42
> >
> > It was tested for throughput with DPDK's testpmd (as described in
> > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> > and kernel pktgen. No latency tests were performed by me. Maybe it is
> > interesting to perform a latency test or just a different set of tests
> > over a recent version.
> >
> > Thanks!
>
> I have repeated the tests with v9, and results are a little bit different:
> * If I test opening it with testpmd, I see no change between versions
> * If I forward packets between two vhost-net interfaces in the guest
> using a linux bridge in the host:
> - netperf UDP_STREAM shows a performance increase of 1.8, almost
> doubling performance. This gets lower as frame size increase.
> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> transactions/sec to 5830
> - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
> - TCP_RR from 6223.64 transactions/sec to 5739.44

And I forgot to add: It seems that avoiding IOV length math helps,
since performance increases in all tests from patch 02/11 ("vhost: use
batched get_vq_desc version") to 11/11 ("vhost: drop head based
APIs").

2020-06-22 09:10:33

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/6/20 上午2:07, Eugenio Perez Martin wrote:
> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> <[email protected]> wrote:
>> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
>> <[email protected]> wrote:
>>> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
>>>> As testing shows no performance change, switch to that now.
>>> What kind of testing? 100GiB? Low latency?
>>>
>> Hi Konrad.
>>
>> I tested this version of the patch:
>> https://lkml.org/lkml/2019/10/13/42
>>
>> It was tested for throughput with DPDK's testpmd (as described in
>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
>> and kernel pktgen. No latency tests were performed by me. Maybe it is
>> interesting to perform a latency test or just a different set of tests
>> over a recent version.
>>
>> Thanks!
> I have repeated the tests with v9, and results are a little bit different:
> * If I test opening it with testpmd, I see no change between versions
> * If I forward packets between two vhost-net interfaces in the guest
> using a linux bridge in the host:
> - netperf UDP_STREAM shows a performance increase of 1.8, almost
> doubling performance. This gets lower as frame size increase.
> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> transactions/sec to 5830
> - TCP_STREAM goes from ~10.7 gbps to ~7Gbps

Which direction did you mean here? Guest TX or RX?

> - TCP_RR from 6223.64 transactions/sec to 5739.44

Perf diff might help. I think we can start from the RR result which
should be easier. Maybe you can test it for each patch then you may see
which patch is the source of the regression.

Thanks

2020-06-22 10:47:06

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Mon, Jun 22, 2020 at 11:07 AM Jason Wang <[email protected]> wrote:
>
>
> On 2020/6/20 上午2:07, Eugenio Perez Martin wrote:
> > On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> > <[email protected]> wrote:
> >> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> >> <[email protected]> wrote:
> >>> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> >>>> As testing shows no performance change, switch to that now.
> >>> What kind of testing? 100GiB? Low latency?
> >>>
> >> Hi Konrad.
> >>
> >> I tested this version of the patch:
> >> https://lkml.org/lkml/2019/10/13/42
> >>
> >> It was tested for throughput with DPDK's testpmd (as described in
> >> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> >> and kernel pktgen. No latency tests were performed by me. Maybe it is
> >> interesting to perform a latency test or just a different set of tests
> >> over a recent version.
> >>
> >> Thanks!
> > I have repeated the tests with v9, and results are a little bit different:
> > * If I test opening it with testpmd, I see no change between versions
> > * If I forward packets between two vhost-net interfaces in the guest
> > using a linux bridge in the host:
> > - netperf UDP_STREAM shows a performance increase of 1.8, almost
> > doubling performance. This gets lower as frame size increase.
> > - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> > transactions/sec to 5830
> > - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
>
>
> Which direction did you mean here? Guest TX or RX?

Hi Jason.

For both I created a linux bridge in the host, attach two guest
interfaces with vhost-net, and make the netperf run on them.

>
>
> > - TCP_RR from 6223.64 transactions/sec to 5739.44
>
>
> Perf diff might help. I think we can start from the RR result which
> should be easier. Maybe you can test it for each patch then you may see
> which patch is the source of the regression.
>

Ok, I will look for differences.

Thanks!

> Thanks
>

2020-06-22 15:58:40

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> <[email protected]> wrote:
> >
> > On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> > <[email protected]> wrote:
> > >
> > > On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > > > As testing shows no performance change, switch to that now.
> > >
> > > What kind of testing? 100GiB? Low latency?
> > >
> >
> > Hi Konrad.
> >
> > I tested this version of the patch:
> > https://lkml.org/lkml/2019/10/13/42
> >
> > It was tested for throughput with DPDK's testpmd (as described in
> > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> > and kernel pktgen. No latency tests were performed by me. Maybe it is
> > interesting to perform a latency test or just a different set of tests
> > over a recent version.
> >
> > Thanks!
>
> I have repeated the tests with v9, and results are a little bit different:
> * If I test opening it with testpmd, I see no change between versions

OK that is testpmd on guest, right? And vhost-net on the host?

> * If I forward packets between two vhost-net interfaces in the guest
> using a linux bridge in the host:

And here I guess you mean virtio-net in the guest kernel?

> - netperf UDP_STREAM shows a performance increase of 1.8, almost
> doubling performance. This gets lower as frame size increase.
> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> transactions/sec to 5830

OK so it seems plausible that we still have a bug where an interrupt
is delayed. That is the main difference between pmd and virtio.
Let's try disabling event index, and see what happens - that's
the trickiest part of interrupts.

> - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
> - TCP_RR from 6223.64 transactions/sec to 5739.44

2020-06-22 16:03:36

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Wed, Jun 17, 2020 at 11:19:26AM +0800, Jason Wang wrote:
>
> On 2020/6/11 下午7:34, Michael S. Tsirkin wrote:
> > static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> > {
> > kfree(vq->descs);
> > @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> > for (i = 0; i < dev->nvqs; ++i) {
> > vq = dev->vqs[i];
> > vq->max_descs = dev->iov_limit;
> > + if (vhost_vq_num_batch_descs(vq) < 0) {
> > + return -EINVAL;
> > + }
>
>
> This check breaks vdpa which set iov_limit to zero. Consider iov_limit is
> meaningless to vDPA, I wonder we can skip the test when device doesn't use
> worker.
>
> Thanks

It doesn't need iovecs at all, right?

--
MST

2020-06-22 16:14:09

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
>
> On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> > On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> > <[email protected]> wrote:
> > >
> > > On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> > > <[email protected]> wrote:
> > > >
> > > > On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > > > > As testing shows no performance change, switch to that now.
> > > >
> > > > What kind of testing? 100GiB? Low latency?
> > > >
> > >
> > > Hi Konrad.
> > >
> > > I tested this version of the patch:
> > > https://lkml.org/lkml/2019/10/13/42
> > >
> > > It was tested for throughput with DPDK's testpmd (as described in
> > > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> > > and kernel pktgen. No latency tests were performed by me. Maybe it is
> > > interesting to perform a latency test or just a different set of tests
> > > over a recent version.
> > >
> > > Thanks!
> >
> > I have repeated the tests with v9, and results are a little bit different:
> > * If I test opening it with testpmd, I see no change between versions
>
>
> OK that is testpmd on guest, right? And vhost-net on the host?
>

Hi Michael.

No, sorry, as described in
http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
But I could add to test it in the guest too.

These kinds of raw packets "bursts" do not show performance
differences, but I could test deeper if you think it would be worth
it.

> > * If I forward packets between two vhost-net interfaces in the guest
> > using a linux bridge in the host:
>
> And here I guess you mean virtio-net in the guest kernel?

Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
the host. More precisely:
* Adding one of the interfaces to another namespace, assigning it an
IP, and starting netserver there.
* Assign another IP in the range manually to the other virtual net
interface, and start the desired test there.

If you think it would be better to perform then differently please let me know.

>
> > - netperf UDP_STREAM shows a performance increase of 1.8, almost
> > doubling performance. This gets lower as frame size increase.
> > - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> > transactions/sec to 5830
>
> OK so it seems plausible that we still have a bug where an interrupt
> is delayed. That is the main difference between pmd and virtio.
> Let's try disabling event index, and see what happens - that's
> the trickiest part of interrupts.
>

Got it, will get back with the results.

Thank you very much!

>
>
> > - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
> > - TCP_RR from 6223.64 transactions/sec to 5739.44
>

2020-06-22 16:32:36

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
> On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
> >
> > On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> > > On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> > > <[email protected]> wrote:
> > > >
> > > > On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> > > > <[email protected]> wrote:
> > > > >
> > > > > On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > > > > > As testing shows no performance change, switch to that now.
> > > > >
> > > > > What kind of testing? 100GiB? Low latency?
> > > > >
> > > >
> > > > Hi Konrad.
> > > >
> > > > I tested this version of the patch:
> > > > https://lkml.org/lkml/2019/10/13/42
> > > >
> > > > It was tested for throughput with DPDK's testpmd (as described in
> > > > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> > > > and kernel pktgen. No latency tests were performed by me. Maybe it is
> > > > interesting to perform a latency test or just a different set of tests
> > > > over a recent version.
> > > >
> > > > Thanks!
> > >
> > > I have repeated the tests with v9, and results are a little bit different:
> > > * If I test opening it with testpmd, I see no change between versions
> >
> >
> > OK that is testpmd on guest, right? And vhost-net on the host?
> >
>
> Hi Michael.
>
> No, sorry, as described in
> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
> But I could add to test it in the guest too.
>
> These kinds of raw packets "bursts" do not show performance
> differences, but I could test deeper if you think it would be worth
> it.

Oh ok, so this is without guest, with virtio-user.
It might be worth checking dpdk within guest too just
as another data point.

> > > * If I forward packets between two vhost-net interfaces in the guest
> > > using a linux bridge in the host:
> >
> > And here I guess you mean virtio-net in the guest kernel?
>
> Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
> the host. More precisely:
> * Adding one of the interfaces to another namespace, assigning it an
> IP, and starting netserver there.
> * Assign another IP in the range manually to the other virtual net
> interface, and start the desired test there.
>
> If you think it would be better to perform then differently please let me know.

Not sure why you bother with namespaces since you said you are
using L2 bridging. I guess it's unimportant.

> >
> > > - netperf UDP_STREAM shows a performance increase of 1.8, almost
> > > doubling performance. This gets lower as frame size increase.
> > > - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> > > transactions/sec to 5830
> >
> > OK so it seems plausible that we still have a bug where an interrupt
> > is delayed. That is the main difference between pmd and virtio.
> > Let's try disabling event index, and see what happens - that's
> > the trickiest part of interrupts.
> >
>
> Got it, will get back with the results.
>
> Thank you very much!
>
> >
> >
> > > - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
> > > - TCP_RR from 6223.64 transactions/sec to 5739.44
> >

2020-06-23 02:57:04

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/6/23 上午12:00, Michael S. Tsirkin wrote:
> On Wed, Jun 17, 2020 at 11:19:26AM +0800, Jason Wang wrote:
>> On 2020/6/11 下午7:34, Michael S. Tsirkin wrote:
>>> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
>>> {
>>> kfree(vq->descs);
>>> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>>> for (i = 0; i < dev->nvqs; ++i) {
>>> vq = dev->vqs[i];
>>> vq->max_descs = dev->iov_limit;
>>> + if (vhost_vq_num_batch_descs(vq) < 0) {
>>> + return -EINVAL;
>>> + }
>> This check breaks vdpa which set iov_limit to zero. Consider iov_limit is
>> meaningless to vDPA, I wonder we can skip the test when device doesn't use
>> worker.
>>
>> Thanks
> It doesn't need iovecs at all, right?
>
> -- MST

Yes, so we may choose to bypass the iovecs as well.

Thanks

2020-06-23 07:04:32

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Tue, Jun 23, 2020 at 4:51 AM Jason Wang <[email protected]> wrote:
>
>
> On 2020/6/23 上午12:00, Michael S. Tsirkin wrote:
> > On Wed, Jun 17, 2020 at 11:19:26AM +0800, Jason Wang wrote:
> >> On 2020/6/11 下午7:34, Michael S. Tsirkin wrote:
> >>> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> >>> {
> >>> kfree(vq->descs);
> >>> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> >>> for (i = 0; i < dev->nvqs; ++i) {
> >>> vq = dev->vqs[i];
> >>> vq->max_descs = dev->iov_limit;
> >>> + if (vhost_vq_num_batch_descs(vq) < 0) {
> >>> + return -EINVAL;
> >>> + }
> >> This check breaks vdpa which set iov_limit to zero. Consider iov_limit is
> >> meaningless to vDPA, I wonder we can skip the test when device doesn't use
> >> worker.
> >>
> >> Thanks
> > It doesn't need iovecs at all, right?
> >
> > -- MST
>
>
> Yes, so we may choose to bypass the iovecs as well.
>
> Thanks
>

I think that the kmalloc_array returns ZERO_SIZE_PTR for all of them
in that case, so I didn't bother to skip the kmalloc_array parts.
Would you prefer to skip them all and let them NULL? Or have I
misunderstood what you mean?

Thanks!

2020-06-23 07:20:24

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/6/23 下午3:00, Eugenio Perez Martin wrote:
> On Tue, Jun 23, 2020 at 4:51 AM Jason Wang <[email protected]> wrote:
>>
>> On 2020/6/23 上午12:00, Michael S. Tsirkin wrote:
>>> On Wed, Jun 17, 2020 at 11:19:26AM +0800, Jason Wang wrote:
>>>> On 2020/6/11 下午7:34, Michael S. Tsirkin wrote:
>>>>> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
>>>>> {
>>>>> kfree(vq->descs);
>>>>> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
>>>>> for (i = 0; i < dev->nvqs; ++i) {
>>>>> vq = dev->vqs[i];
>>>>> vq->max_descs = dev->iov_limit;
>>>>> + if (vhost_vq_num_batch_descs(vq) < 0) {
>>>>> + return -EINVAL;
>>>>> + }
>>>> This check breaks vdpa which set iov_limit to zero. Consider iov_limit is
>>>> meaningless to vDPA, I wonder we can skip the test when device doesn't use
>>>> worker.
>>>>
>>>> Thanks
>>> It doesn't need iovecs at all, right?
>>>
>>> -- MST
>>
>> Yes, so we may choose to bypass the iovecs as well.
>>
>> Thanks
>>
> I think that the kmalloc_array returns ZERO_SIZE_PTR for all of them
> in that case, so I didn't bother to skip the kmalloc_array parts.
> Would you prefer to skip them all and let them NULL? Or have I
> misunderstood what you mean?

I'm ok with either approach, but my understanding is that Michael wants
to skip them all.

Thanks

>
> Thanks!
>

2020-06-23 08:28:31

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Tue, Jun 23, 2020 at 09:00:57AM +0200, Eugenio Perez Martin wrote:
> On Tue, Jun 23, 2020 at 4:51 AM Jason Wang <[email protected]> wrote:
> >
> >
> > On 2020/6/23 上午12:00, Michael S. Tsirkin wrote:
> > > On Wed, Jun 17, 2020 at 11:19:26AM +0800, Jason Wang wrote:
> > >> On 2020/6/11 下午7:34, Michael S. Tsirkin wrote:
> > >>> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> > >>> {
> > >>> kfree(vq->descs);
> > >>> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> > >>> for (i = 0; i < dev->nvqs; ++i) {
> > >>> vq = dev->vqs[i];
> > >>> vq->max_descs = dev->iov_limit;
> > >>> + if (vhost_vq_num_batch_descs(vq) < 0) {
> > >>> + return -EINVAL;
> > >>> + }
> > >> This check breaks vdpa which set iov_limit to zero. Consider iov_limit is
> > >> meaningless to vDPA, I wonder we can skip the test when device doesn't use
> > >> worker.
> > >>
> > >> Thanks
> > > It doesn't need iovecs at all, right?
> > >
> > > -- MST
> >
> >
> > Yes, so we may choose to bypass the iovecs as well.
> >
> > Thanks
> >
>
> I think that the kmalloc_array returns ZERO_SIZE_PTR for all of them
> in that case, so I didn't bother to skip the kmalloc_array parts.
> Would you prefer to skip them all and let them NULL? Or have I
> misunderstood what you mean?
>
> Thanks!

Sorry about being unclear. I just meant that it seems cleaner
to check for iov_limit being 0 not for worker thread.

--
MST

2020-06-23 15:59:19

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Tue, Jun 23, 2020 at 10:25 AM Michael S. Tsirkin <[email protected]> wrote:
>
> On Tue, Jun 23, 2020 at 09:00:57AM +0200, Eugenio Perez Martin wrote:
> > On Tue, Jun 23, 2020 at 4:51 AM Jason Wang <[email protected]> wrote:
> > >
> > >
> > > On 2020/6/23 上午12:00, Michael S. Tsirkin wrote:
> > > > On Wed, Jun 17, 2020 at 11:19:26AM +0800, Jason Wang wrote:
> > > >> On 2020/6/11 下午7:34, Michael S. Tsirkin wrote:
> > > >>> static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
> > > >>> {
> > > >>> kfree(vq->descs);
> > > >>> @@ -394,6 +400,9 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
> > > >>> for (i = 0; i < dev->nvqs; ++i) {
> > > >>> vq = dev->vqs[i];
> > > >>> vq->max_descs = dev->iov_limit;
> > > >>> + if (vhost_vq_num_batch_descs(vq) < 0) {
> > > >>> + return -EINVAL;
> > > >>> + }
> > > >> This check breaks vdpa which set iov_limit to zero. Consider iov_limit is
> > > >> meaningless to vDPA, I wonder we can skip the test when device doesn't use
> > > >> worker.
> > > >>
> > > >> Thanks
> > > > It doesn't need iovecs at all, right?
> > > >
> > > > -- MST
> > >
> > >
> > > Yes, so we may choose to bypass the iovecs as well.
> > >
> > > Thanks
> > >
> >
> > I think that the kmalloc_array returns ZERO_SIZE_PTR for all of them
> > in that case, so I didn't bother to skip the kmalloc_array parts.
> > Would you prefer to skip them all and let them NULL? Or have I
> > misunderstood what you mean?
> >
> > Thanks!
>
> Sorry about being unclear. I just meant that it seems cleaner
> to check for iov_limit being 0 not for worker thread.

Actually yes, I also think that iov_limit == 0 is a better check.
Changing for the next revision if everyone agrees.

Thanks!

>
> --
> MST
>

2020-06-23 16:20:54

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
>
> On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
> > On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
> > >
> > > On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> > > > On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> > > > <[email protected]> wrote:
> > > > >
> > > > > On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> > > > > <[email protected]> wrote:
> > > > > >
> > > > > > On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > > > > > > As testing shows no performance change, switch to that now.
> > > > > >
> > > > > > What kind of testing? 100GiB? Low latency?
> > > > > >
> > > > >
> > > > > Hi Konrad.
> > > > >
> > > > > I tested this version of the patch:
> > > > > https://lkml.org/lkml/2019/10/13/42
> > > > >
> > > > > It was tested for throughput with DPDK's testpmd (as described in
> > > > > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> > > > > and kernel pktgen. No latency tests were performed by me. Maybe it is
> > > > > interesting to perform a latency test or just a different set of tests
> > > > > over a recent version.
> > > > >
> > > > > Thanks!
> > > >
> > > > I have repeated the tests with v9, and results are a little bit different:
> > > > * If I test opening it with testpmd, I see no change between versions
> > >
> > >
> > > OK that is testpmd on guest, right? And vhost-net on the host?
> > >
> >
> > Hi Michael.
> >
> > No, sorry, as described in
> > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
> > But I could add to test it in the guest too.
> >
> > These kinds of raw packets "bursts" do not show performance
> > differences, but I could test deeper if you think it would be worth
> > it.
>
> Oh ok, so this is without guest, with virtio-user.
> It might be worth checking dpdk within guest too just
> as another data point.
>

Ok, I will do it!

> > > > * If I forward packets between two vhost-net interfaces in the guest
> > > > using a linux bridge in the host:
> > >
> > > And here I guess you mean virtio-net in the guest kernel?
> >
> > Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
> > the host. More precisely:
> > * Adding one of the interfaces to another namespace, assigning it an
> > IP, and starting netserver there.
> > * Assign another IP in the range manually to the other virtual net
> > interface, and start the desired test there.
> >
> > If you think it would be better to perform then differently please let me know.
>
>
> Not sure why you bother with namespaces since you said you are
> using L2 bridging. I guess it's unimportant.
>

Sorry, I think I should have provided more context about that.

The only reason to use namespaces is to force the traffic of these
netperf tests to go through the external bridge. To test netperf
different possibilities than the testpmd (or pktgen or others "blast
of frames unconditionally" tests).

This way, I make sure that is the same version of everything in the
guest, and is a little bit easier to manage cpu affinity, start and
stop testing...

I could use a different VM for sending and receiving, but I find this
way a faster one and it should not introduce a lot of noise. I can
test with two VM if you think that this use of network namespace
introduces too much noise.

Thanks!

> > >
> > > > - netperf UDP_STREAM shows a performance increase of 1.8, almost
> > > > doubling performance. This gets lower as frame size increase.
> > > > - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> > > > transactions/sec to 5830
> > >
> > > OK so it seems plausible that we still have a bug where an interrupt
> > > is delayed. That is the main difference between pmd and virtio.
> > > Let's try disabling event index, and see what happens - that's
> > > the trickiest part of interrupts.
> > >
> >
> > Got it, will get back with the results.
> >
> > Thank you very much!
> >
> > >
> > >
> > > > - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
> > > > - TCP_RR from 6223.64 transactions/sec to 5739.44
> > >
>

2020-07-01 10:44:38

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
<[email protected]> wrote:
>
> On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
> >
> > On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
> > > On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
> > > >
> > > > On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> > > > > On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> > > > > <[email protected]> wrote:
> > > > > >
> > > > > > On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> > > > > > <[email protected]> wrote:
> > > > > > >
> > > > > > > On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > > > > > > > As testing shows no performance change, switch to that now.
> > > > > > >
> > > > > > > What kind of testing? 100GiB? Low latency?
> > > > > > >
> > > > > >
> > > > > > Hi Konrad.
> > > > > >
> > > > > > I tested this version of the patch:
> > > > > > https://lkml.org/lkml/2019/10/13/42
> > > > > >
> > > > > > It was tested for throughput with DPDK's testpmd (as described in
> > > > > > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> > > > > > and kernel pktgen. No latency tests were performed by me. Maybe it is
> > > > > > interesting to perform a latency test or just a different set of tests
> > > > > > over a recent version.
> > > > > >
> > > > > > Thanks!
> > > > >
> > > > > I have repeated the tests with v9, and results are a little bit different:
> > > > > * If I test opening it with testpmd, I see no change between versions
> > > >
> > > >
> > > > OK that is testpmd on guest, right? And vhost-net on the host?
> > > >
> > >
> > > Hi Michael.
> > >
> > > No, sorry, as described in
> > > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
> > > But I could add to test it in the guest too.
> > >
> > > These kinds of raw packets "bursts" do not show performance
> > > differences, but I could test deeper if you think it would be worth
> > > it.
> >
> > Oh ok, so this is without guest, with virtio-user.
> > It might be worth checking dpdk within guest too just
> > as another data point.
> >
>
> Ok, I will do it!
>
> > > > > * If I forward packets between two vhost-net interfaces in the guest
> > > > > using a linux bridge in the host:
> > > >
> > > > And here I guess you mean virtio-net in the guest kernel?
> > >
> > > Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
> > > the host. More precisely:
> > > * Adding one of the interfaces to another namespace, assigning it an
> > > IP, and starting netserver there.
> > > * Assign another IP in the range manually to the other virtual net
> > > interface, and start the desired test there.
> > >
> > > If you think it would be better to perform then differently please let me know.
> >
> >
> > Not sure why you bother with namespaces since you said you are
> > using L2 bridging. I guess it's unimportant.
> >
>
> Sorry, I think I should have provided more context about that.
>
> The only reason to use namespaces is to force the traffic of these
> netperf tests to go through the external bridge. To test netperf
> different possibilities than the testpmd (or pktgen or others "blast
> of frames unconditionally" tests).
>
> This way, I make sure that is the same version of everything in the
> guest, and is a little bit easier to manage cpu affinity, start and
> stop testing...
>
> I could use a different VM for sending and receiving, but I find this
> way a faster one and it should not introduce a lot of noise. I can
> test with two VM if you think that this use of network namespace
> introduces too much noise.
>
> Thanks!
>
> > > >
> > > > > - netperf UDP_STREAM shows a performance increase of 1.8, almost
> > > > > doubling performance. This gets lower as frame size increase.

Regarding UDP_STREAM:
* with event_idx=on: The performance difference is reduced a lot if
applied affinity properly (manually assigning CPU on host/guest and
setting IRQs on guest), making them perform equally with and without
the patch again. Maybe the batching makes the scheduler perform
better.

> > > > > - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> > > > > transactions/sec to 5830

* Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
them perform similarly again, only a very small performance drop
observed. It could be just noise.
** All of them perform better than vanilla if event_idx=off, not sure
why. I can try to repeat them if you suspect that can be a test
failure.

* With testpmd and event_idx=off, if I send from the VM to host, I see
a performance increment especially in small packets. The buf api also
increases performance compared with only batching: Sending the minimum
packet size in testpmd makes pps go from 356kpps to 473 kpps. Sending
1024 length UDP-PDU makes it go from 570kpps to 64 kpps.

Something strange I observe in these tests: I get more pps the bigger
the transmitted buffer size is. Not sure why.

** Sending from the host to the VM does not make a big change with the
patches in small packets scenario (minimum, 64 bytes, about 645
without the patch, ~625 with batch and batch+buf api). If the packets
are bigger, I can see a performance increase: with 256 bits, it goes
from 590kpps to about 600kpps, and in case of 1500 bytes payload it
gets from 348kpps to 528kpps, so it is clearly an improvement.

* with testpmd and event_idx=on, batching+buf api perform similarly in
both directions.

All of testpmd tests were performed with no linux bridge, just a
host's tap interface (<interface type='ethernet'> in xml), with a
testpmd txonly and another in rxonly forward mode, and using the
receiving side packets/bytes data. Guest's rps, xps and interrupts,
and host's vhost threads affinity were also tuned in each test to
schedule both testpmd and vhost in different processors.

I will send the v10 RFC with the small changes requested by Stefan and Jason.

Thanks!

> > > >
> > > > OK so it seems plausible that we still have a bug where an interrupt
> > > > is delayed. That is the main difference between pmd and virtio.
> > > > Let's try disabling event index, and see what happens - that's
> > > > the trickiest part of interrupts.
> > > >
> > >
> > > Got it, will get back with the results.
> > >
> > > Thank you very much!
> > >
> > > >
> > > >
> > > > > - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
> > > > > - TCP_RR from 6223.64 transactions/sec to 5739.44
> > > >
> >

2020-07-01 11:12:48

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Wed, Jul 01, 2020 at 12:43:09PM +0200, Eugenio Perez Martin wrote:
> On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
> <[email protected]> wrote:
> >
> > On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
> > >
> > > On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
> > > > On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
> > > > >
> > > > > On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> > > > > > On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> > > > > > <[email protected]> wrote:
> > > > > > >
> > > > > > > On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> > > > > > > <[email protected]> wrote:
> > > > > > > >
> > > > > > > > On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > > > > > > > > As testing shows no performance change, switch to that now.
> > > > > > > >
> > > > > > > > What kind of testing? 100GiB? Low latency?
> > > > > > > >
> > > > > > >
> > > > > > > Hi Konrad.
> > > > > > >
> > > > > > > I tested this version of the patch:
> > > > > > > https://lkml.org/lkml/2019/10/13/42
> > > > > > >
> > > > > > > It was tested for throughput with DPDK's testpmd (as described in
> > > > > > > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> > > > > > > and kernel pktgen. No latency tests were performed by me. Maybe it is
> > > > > > > interesting to perform a latency test or just a different set of tests
> > > > > > > over a recent version.
> > > > > > >
> > > > > > > Thanks!
> > > > > >
> > > > > > I have repeated the tests with v9, and results are a little bit different:
> > > > > > * If I test opening it with testpmd, I see no change between versions
> > > > >
> > > > >
> > > > > OK that is testpmd on guest, right? And vhost-net on the host?
> > > > >
> > > >
> > > > Hi Michael.
> > > >
> > > > No, sorry, as described in
> > > > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
> > > > But I could add to test it in the guest too.
> > > >
> > > > These kinds of raw packets "bursts" do not show performance
> > > > differences, but I could test deeper if you think it would be worth
> > > > it.
> > >
> > > Oh ok, so this is without guest, with virtio-user.
> > > It might be worth checking dpdk within guest too just
> > > as another data point.
> > >
> >
> > Ok, I will do it!
> >
> > > > > > * If I forward packets between two vhost-net interfaces in the guest
> > > > > > using a linux bridge in the host:
> > > > >
> > > > > And here I guess you mean virtio-net in the guest kernel?
> > > >
> > > > Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
> > > > the host. More precisely:
> > > > * Adding one of the interfaces to another namespace, assigning it an
> > > > IP, and starting netserver there.
> > > > * Assign another IP in the range manually to the other virtual net
> > > > interface, and start the desired test there.
> > > >
> > > > If you think it would be better to perform then differently please let me know.
> > >
> > >
> > > Not sure why you bother with namespaces since you said you are
> > > using L2 bridging. I guess it's unimportant.
> > >
> >
> > Sorry, I think I should have provided more context about that.
> >
> > The only reason to use namespaces is to force the traffic of these
> > netperf tests to go through the external bridge. To test netperf
> > different possibilities than the testpmd (or pktgen or others "blast
> > of frames unconditionally" tests).
> >
> > This way, I make sure that is the same version of everything in the
> > guest, and is a little bit easier to manage cpu affinity, start and
> > stop testing...
> >
> > I could use a different VM for sending and receiving, but I find this
> > way a faster one and it should not introduce a lot of noise. I can
> > test with two VM if you think that this use of network namespace
> > introduces too much noise.
> >
> > Thanks!
> >
> > > > >
> > > > > > - netperf UDP_STREAM shows a performance increase of 1.8, almost
> > > > > > doubling performance. This gets lower as frame size increase.
>
> Regarding UDP_STREAM:
> * with event_idx=on: The performance difference is reduced a lot if
> applied affinity properly (manually assigning CPU on host/guest and
> setting IRQs on guest), making them perform equally with and without
> the patch again. Maybe the batching makes the scheduler perform
> better.
>
> > > > > > - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> > > > > > transactions/sec to 5830
>
> * Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
> them perform similarly again, only a very small performance drop
> observed. It could be just noise.
> ** All of them perform better than vanilla if event_idx=off, not sure
> why. I can try to repeat them if you suspect that can be a test
> failure.
>
> * With testpmd and event_idx=off, if I send from the VM to host, I see
> a performance increment especially in small packets. The buf api also
> increases performance compared with only batching: Sending the minimum
> packet size in testpmd makes pps go from 356kpps to 473 kpps. Sending
> 1024 length UDP-PDU makes it go from 570kpps to 64 kpps.
>
> Something strange I observe in these tests: I get more pps the bigger
> the transmitted buffer size is. Not sure why.
>
> ** Sending from the host to the VM does not make a big change with the
> patches in small packets scenario (minimum, 64 bytes, about 645
> without the patch, ~625 with batch and batch+buf api). If the packets
> are bigger, I can see a performance increase: with 256 bits, it goes
> from 590kpps to about 600kpps, and in case of 1500 bytes payload it
> gets from 348kpps to 528kpps, so it is clearly an improvement.
>
> * with testpmd and event_idx=on, batching+buf api perform similarly in
> both directions.
>
> All of testpmd tests were performed with no linux bridge, just a
> host's tap interface (<interface type='ethernet'> in xml), with a
> testpmd txonly and another in rxonly forward mode, and using the
> receiving side packets/bytes data. Guest's rps, xps and interrupts,
> and host's vhost threads affinity were also tuned in each test to
> schedule both testpmd and vhost in different processors.
>
> I will send the v10 RFC with the small changes requested by Stefan and Jason.
>
> Thanks!
>

OK so there's a chance you are seeing effects of an aggressive power
management. which tuned profile are you using? It might be helpful
to disable PM/frequency scaling.

>
>
>
>
>
> > > > >
> > > > > OK so it seems plausible that we still have a bug where an interrupt
> > > > > is delayed. That is the main difference between pmd and virtio.
> > > > > Let's try disabling event index, and see what happens - that's
> > > > > the trickiest part of interrupts.
> > > > >
> > > >
> > > > Got it, will get back with the results.
> > > >
> > > > Thank you very much!
> > > >
> > > > >
> > > > >
> > > > > > - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
> > > > > > - TCP_RR from 6223.64 transactions/sec to 5739.44
> > > > >
> > >

2020-07-01 12:40:35

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/7/1 下午6:43, Eugenio Perez Martin wrote:
> On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
> <[email protected]> wrote:
>> On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
>>> On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
>>>> On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
>>>>> On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
>>>>>> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
>>>>>> <[email protected]> wrote:
>>>>>>> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
>>>>>>> <[email protected]> wrote:
>>>>>>>> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
>>>>>>>>> As testing shows no performance change, switch to that now.
>>>>>>>> What kind of testing? 100GiB? Low latency?
>>>>>>>>
>>>>>>> Hi Konrad.
>>>>>>>
>>>>>>> I tested this version of the patch:
>>>>>>> https://lkml.org/lkml/2019/10/13/42
>>>>>>>
>>>>>>> It was tested for throughput with DPDK's testpmd (as described in
>>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
>>>>>>> and kernel pktgen. No latency tests were performed by me. Maybe it is
>>>>>>> interesting to perform a latency test or just a different set of tests
>>>>>>> over a recent version.
>>>>>>>
>>>>>>> Thanks!
>>>>>> I have repeated the tests with v9, and results are a little bit different:
>>>>>> * If I test opening it with testpmd, I see no change between versions
>>>>>
>>>>> OK that is testpmd on guest, right? And vhost-net on the host?
>>>>>
>>>> Hi Michael.
>>>>
>>>> No, sorry, as described in
>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
>>>> But I could add to test it in the guest too.
>>>>
>>>> These kinds of raw packets "bursts" do not show performance
>>>> differences, but I could test deeper if you think it would be worth
>>>> it.
>>> Oh ok, so this is without guest, with virtio-user.
>>> It might be worth checking dpdk within guest too just
>>> as another data point.
>>>
>> Ok, I will do it!
>>
>>>>>> * If I forward packets between two vhost-net interfaces in the guest
>>>>>> using a linux bridge in the host:
>>>>> And here I guess you mean virtio-net in the guest kernel?
>>>> Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
>>>> the host. More precisely:
>>>> * Adding one of the interfaces to another namespace, assigning it an
>>>> IP, and starting netserver there.
>>>> * Assign another IP in the range manually to the other virtual net
>>>> interface, and start the desired test there.
>>>>
>>>> If you think it would be better to perform then differently please let me know.
>>>
>>> Not sure why you bother with namespaces since you said you are
>>> using L2 bridging. I guess it's unimportant.
>>>
>> Sorry, I think I should have provided more context about that.
>>
>> The only reason to use namespaces is to force the traffic of these
>> netperf tests to go through the external bridge. To test netperf
>> different possibilities than the testpmd (or pktgen or others "blast
>> of frames unconditionally" tests).
>>
>> This way, I make sure that is the same version of everything in the
>> guest, and is a little bit easier to manage cpu affinity, start and
>> stop testing...
>>
>> I could use a different VM for sending and receiving, but I find this
>> way a faster one and it should not introduce a lot of noise. I can
>> test with two VM if you think that this use of network namespace
>> introduces too much noise.
>>
>> Thanks!
>>
>>>>>> - netperf UDP_STREAM shows a performance increase of 1.8, almost
>>>>>> doubling performance. This gets lower as frame size increase.
> Regarding UDP_STREAM:
> * with event_idx=on: The performance difference is reduced a lot if
> applied affinity properly (manually assigning CPU on host/guest and
> setting IRQs on guest), making them perform equally with and without
> the patch again. Maybe the batching makes the scheduler perform
> better.

Note that for UDP_STREAM, the result is pretty trick to be analyzed. E.g
setting a sndbuf for TAP may help for the performance (reduce the drop).

>
>>>>>> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
>>>>>> transactions/sec to 5830
> * Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
> them perform similarly again, only a very small performance drop
> observed. It could be just noise.
> ** All of them perform better than vanilla if event_idx=off, not sure
> why. I can try to repeat them if you suspect that can be a test
> failure.
>
> * With testpmd and event_idx=off, if I send from the VM to host, I see
> a performance increment especially in small packets. The buf api also
> increases performance compared with only batching: Sending the minimum
> packet size in testpmd makes pps go from 356kpps to 473 kpps.

What's your setup for this. The number looks rather low. I'd expected
1-2 Mpps at least.

> Sending
> 1024 length UDP-PDU makes it go from 570kpps to 64 kpps.
>
> Something strange I observe in these tests: I get more pps the bigger
> the transmitted buffer size is. Not sure why.
>
> ** Sending from the host to the VM does not make a big change with the
> patches in small packets scenario (minimum, 64 bytes, about 645
> without the patch, ~625 with batch and batch+buf api). If the packets
> are bigger, I can see a performance increase: with 256 bits,

I think you meant bytes?

> it goes
> from 590kpps to about 600kpps, and in case of 1500 bytes payload it
> gets from 348kpps to 528kpps, so it is clearly an improvement.
>
> * with testpmd and event_idx=on, batching+buf api perform similarly in
> both directions.
>
> All of testpmd tests were performed with no linux bridge, just a
> host's tap interface (<interface type='ethernet'> in xml),

What DPDK driver did you use in the test (AF_PACKET?).

> with a
> testpmd txonly and another in rxonly forward mode, and using the
> receiving side packets/bytes data. Guest's rps, xps and interrupts,
> and host's vhost threads affinity were also tuned in each test to
> schedule both testpmd and vhost in different processors.

My feeling is that if we start from simple setup, it would be more
easier as a start. E.g start without an VM.

1) TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP
2) RX: pkgetn -> TAP -> vhost_net -> testpmd(rxonly)

Thanks

>
> I will send the v10 RFC with the small changes requested by Stefan and Jason.
>
> Thanks!
>
>
>
>
>
>
>
>>>>> OK so it seems plausible that we still have a bug where an interrupt
>>>>> is delayed. That is the main difference between pmd and virtio.
>>>>> Let's try disabling event index, and see what happens - that's
>>>>> the trickiest part of interrupts.
>>>>>
>>>> Got it, will get back with the results.
>>>>
>>>> Thank you very much!
>>>>
>>>>>
>>>>>> - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
>>>>>> - TCP_RR from 6223.64 transactions/sec to 5739.44

2020-07-01 12:58:16

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Wed, Jul 1, 2020 at 1:12 PM Michael S. Tsirkin <[email protected]> wrote:
>
> On Wed, Jul 01, 2020 at 12:43:09PM +0200, Eugenio Perez Martin wrote:
> > On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
> > <[email protected]> wrote:
> > >
> > > On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
> > > >
> > > > On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
> > > > > On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
> > > > > >
> > > > > > On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> > > > > > > On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> > > > > > > <[email protected]> wrote:
> > > > > > > >
> > > > > > > > On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> > > > > > > > <[email protected]> wrote:
> > > > > > > > >
> > > > > > > > > On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > > > > > > > > > As testing shows no performance change, switch to that now.
> > > > > > > > >
> > > > > > > > > What kind of testing? 100GiB? Low latency?
> > > > > > > > >
> > > > > > > >
> > > > > > > > Hi Konrad.
> > > > > > > >
> > > > > > > > I tested this version of the patch:
> > > > > > > > https://lkml.org/lkml/2019/10/13/42
> > > > > > > >
> > > > > > > > It was tested for throughput with DPDK's testpmd (as described in
> > > > > > > > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> > > > > > > > and kernel pktgen. No latency tests were performed by me. Maybe it is
> > > > > > > > interesting to perform a latency test or just a different set of tests
> > > > > > > > over a recent version.
> > > > > > > >
> > > > > > > > Thanks!
> > > > > > >
> > > > > > > I have repeated the tests with v9, and results are a little bit different:
> > > > > > > * If I test opening it with testpmd, I see no change between versions
> > > > > >
> > > > > >
> > > > > > OK that is testpmd on guest, right? And vhost-net on the host?
> > > > > >
> > > > >
> > > > > Hi Michael.
> > > > >
> > > > > No, sorry, as described in
> > > > > http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
> > > > > But I could add to test it in the guest too.
> > > > >
> > > > > These kinds of raw packets "bursts" do not show performance
> > > > > differences, but I could test deeper if you think it would be worth
> > > > > it.
> > > >
> > > > Oh ok, so this is without guest, with virtio-user.
> > > > It might be worth checking dpdk within guest too just
> > > > as another data point.
> > > >
> > >
> > > Ok, I will do it!
> > >
> > > > > > > * If I forward packets between two vhost-net interfaces in the guest
> > > > > > > using a linux bridge in the host:
> > > > > >
> > > > > > And here I guess you mean virtio-net in the guest kernel?
> > > > >
> > > > > Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
> > > > > the host. More precisely:
> > > > > * Adding one of the interfaces to another namespace, assigning it an
> > > > > IP, and starting netserver there.
> > > > > * Assign another IP in the range manually to the other virtual net
> > > > > interface, and start the desired test there.
> > > > >
> > > > > If you think it would be better to perform then differently please let me know.
> > > >
> > > >
> > > > Not sure why you bother with namespaces since you said you are
> > > > using L2 bridging. I guess it's unimportant.
> > > >
> > >
> > > Sorry, I think I should have provided more context about that.
> > >
> > > The only reason to use namespaces is to force the traffic of these
> > > netperf tests to go through the external bridge. To test netperf
> > > different possibilities than the testpmd (or pktgen or others "blast
> > > of frames unconditionally" tests).
> > >
> > > This way, I make sure that is the same version of everything in the
> > > guest, and is a little bit easier to manage cpu affinity, start and
> > > stop testing...
> > >
> > > I could use a different VM for sending and receiving, but I find this
> > > way a faster one and it should not introduce a lot of noise. I can
> > > test with two VM if you think that this use of network namespace
> > > introduces too much noise.
> > >
> > > Thanks!
> > >
> > > > > >
> > > > > > > - netperf UDP_STREAM shows a performance increase of 1.8, almost
> > > > > > > doubling performance. This gets lower as frame size increase.
> >
> > Regarding UDP_STREAM:
> > * with event_idx=on: The performance difference is reduced a lot if
> > applied affinity properly (manually assigning CPU on host/guest and
> > setting IRQs on guest), making them perform equally with and without
> > the patch again. Maybe the batching makes the scheduler perform
> > better.
> >
> > > > > > > - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> > > > > > > transactions/sec to 5830
> >
> > * Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
> > them perform similarly again, only a very small performance drop
> > observed. It could be just noise.
> > ** All of them perform better than vanilla if event_idx=off, not sure
> > why. I can try to repeat them if you suspect that can be a test
> > failure.
> >
> > * With testpmd and event_idx=off, if I send from the VM to host, I see
> > a performance increment especially in small packets. The buf api also
> > increases performance compared with only batching: Sending the minimum
> > packet size in testpmd makes pps go from 356kpps to 473 kpps. Sending
> > 1024 length UDP-PDU makes it go from 570kpps to 64 kpps.
> >
> > Something strange I observe in these tests: I get more pps the bigger
> > the transmitted buffer size is. Not sure why.
> >
> > ** Sending from the host to the VM does not make a big change with the
> > patches in small packets scenario (minimum, 64 bytes, about 645
> > without the patch, ~625 with batch and batch+buf api). If the packets
> > are bigger, I can see a performance increase: with 256 bits, it goes
> > from 590kpps to about 600kpps, and in case of 1500 bytes payload it
> > gets from 348kpps to 528kpps, so it is clearly an improvement.
> >
> > * with testpmd and event_idx=on, batching+buf api perform similarly in
> > both directions.
> >
> > All of testpmd tests were performed with no linux bridge, just a
> > host's tap interface (<interface type='ethernet'> in xml), with a
> > testpmd txonly and another in rxonly forward mode, and using the
> > receiving side packets/bytes data. Guest's rps, xps and interrupts,
> > and host's vhost threads affinity were also tuned in each test to
> > schedule both testpmd and vhost in different processors.
> >
> > I will send the v10 RFC with the small changes requested by Stefan and Jason.
> >
> > Thanks!
> >
>
> OK so there's a chance you are seeing effects of an aggressive power
> management. which tuned profile are you using? It might be helpful
> to disable PM/frequency scaling.
>

I didn't change the tuned profile.

I set all cpus involved in the test isolated with cmdline:
'isolcpus=1,3,5,7,9,11 nohz_full=1,3,5,7,9,11 rcu_nocbs=1,3,5,7,9,11
rcu_nocb_poll intel_pstate=disable'

Wil try to change them though tuned, thanks!

>
> >
> >
> >
> >
> >
> > > > > >
> > > > > > OK so it seems plausible that we still have a bug where an interrupt
> > > > > > is delayed. That is the main difference between pmd and virtio.
> > > > > > Let's try disabling event index, and see what happens - that's
> > > > > > the trickiest part of interrupts.
> > > > > >
> > > > >
> > > > > Got it, will get back with the results.
> > > > >
> > > > > Thank you very much!
> > > > >
> > > > > >
> > > > > >
> > > > > > > - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
> > > > > > > - TCP_RR from 6223.64 transactions/sec to 5739.44
> > > > > >
> > > >
>

2020-07-01 13:08:29

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Wed, Jul 1, 2020 at 2:40 PM Jason Wang <[email protected]> wrote:
>
>
> On 2020/7/1 下午6:43, Eugenio Perez Martin wrote:
> > On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
> > <[email protected]> wrote:
> >> On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
> >>> On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
> >>>> On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
> >>>>> On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> >>>>>> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> >>>>>> <[email protected]> wrote:
> >>>>>>> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> >>>>>>> <[email protected]> wrote:
> >>>>>>>> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> >>>>>>>>> As testing shows no performance change, switch to that now.
> >>>>>>>> What kind of testing? 100GiB? Low latency?
> >>>>>>>>
> >>>>>>> Hi Konrad.
> >>>>>>>
> >>>>>>> I tested this version of the patch:
> >>>>>>> https://lkml.org/lkml/2019/10/13/42
> >>>>>>>
> >>>>>>> It was tested for throughput with DPDK's testpmd (as described in
> >>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> >>>>>>> and kernel pktgen. No latency tests were performed by me. Maybe it is
> >>>>>>> interesting to perform a latency test or just a different set of tests
> >>>>>>> over a recent version.
> >>>>>>>
> >>>>>>> Thanks!
> >>>>>> I have repeated the tests with v9, and results are a little bit different:
> >>>>>> * If I test opening it with testpmd, I see no change between versions
> >>>>>
> >>>>> OK that is testpmd on guest, right? And vhost-net on the host?
> >>>>>
> >>>> Hi Michael.
> >>>>
> >>>> No, sorry, as described in
> >>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
> >>>> But I could add to test it in the guest too.
> >>>>
> >>>> These kinds of raw packets "bursts" do not show performance
> >>>> differences, but I could test deeper if you think it would be worth
> >>>> it.
> >>> Oh ok, so this is without guest, with virtio-user.
> >>> It might be worth checking dpdk within guest too just
> >>> as another data point.
> >>>
> >> Ok, I will do it!
> >>
> >>>>>> * If I forward packets between two vhost-net interfaces in the guest
> >>>>>> using a linux bridge in the host:
> >>>>> And here I guess you mean virtio-net in the guest kernel?
> >>>> Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
> >>>> the host. More precisely:
> >>>> * Adding one of the interfaces to another namespace, assigning it an
> >>>> IP, and starting netserver there.
> >>>> * Assign another IP in the range manually to the other virtual net
> >>>> interface, and start the desired test there.
> >>>>
> >>>> If you think it would be better to perform then differently please let me know.
> >>>
> >>> Not sure why you bother with namespaces since you said you are
> >>> using L2 bridging. I guess it's unimportant.
> >>>
> >> Sorry, I think I should have provided more context about that.
> >>
> >> The only reason to use namespaces is to force the traffic of these
> >> netperf tests to go through the external bridge. To test netperf
> >> different possibilities than the testpmd (or pktgen or others "blast
> >> of frames unconditionally" tests).
> >>
> >> This way, I make sure that is the same version of everything in the
> >> guest, and is a little bit easier to manage cpu affinity, start and
> >> stop testing...
> >>
> >> I could use a different VM for sending and receiving, but I find this
> >> way a faster one and it should not introduce a lot of noise. I can
> >> test with two VM if you think that this use of network namespace
> >> introduces too much noise.
> >>
> >> Thanks!
> >>
> >>>>>> - netperf UDP_STREAM shows a performance increase of 1.8, almost
> >>>>>> doubling performance. This gets lower as frame size increase.
> > Regarding UDP_STREAM:
> > * with event_idx=on: The performance difference is reduced a lot if
> > applied affinity properly (manually assigning CPU on host/guest and
> > setting IRQs on guest), making them perform equally with and without
> > the patch again. Maybe the batching makes the scheduler perform
> > better.
>
>
> Note that for UDP_STREAM, the result is pretty trick to be analyzed. E.g
> setting a sndbuf for TAP may help for the performance (reduce the drop).
>

Ok, will add that to the test. Thanks!

>
> >
> >>>>>> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> >>>>>> transactions/sec to 5830
> > * Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
> > them perform similarly again, only a very small performance drop
> > observed. It could be just noise.
> > ** All of them perform better than vanilla if event_idx=off, not sure
> > why. I can try to repeat them if you suspect that can be a test
> > failure.
> >
> > * With testpmd and event_idx=off, if I send from the VM to host, I see
> > a performance increment especially in small packets. The buf api also
> > increases performance compared with only batching: Sending the minimum
> > packet size in testpmd makes pps go from 356kpps to 473 kpps.
>
>
> What's your setup for this. The number looks rather low. I'd expected
> 1-2 Mpps at least.
>

Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz, 2 NUMA nodes of 16G memory
each, and no device assigned to the NUMA node I'm testing in. Too low
for testpmd AF_PACKET driver too?

>
> > Sending
> > 1024 length UDP-PDU makes it go from 570kpps to 64 kpps.
> >
> > Something strange I observe in these tests: I get more pps the bigger
> > the transmitted buffer size is. Not sure why.
> >
> > ** Sending from the host to the VM does not make a big change with the
> > patches in small packets scenario (minimum, 64 bytes, about 645
> > without the patch, ~625 with batch and batch+buf api). If the packets
> > are bigger, I can see a performance increase: with 256 bits,
>
>
> I think you meant bytes?
>

Yes, sorry.

>
> > it goes
> > from 590kpps to about 600kpps, and in case of 1500 bytes payload it
> > gets from 348kpps to 528kpps, so it is clearly an improvement.
> >
> > * with testpmd and event_idx=on, batching+buf api perform similarly in
> > both directions.
> >
> > All of testpmd tests were performed with no linux bridge, just a
> > host's tap interface (<interface type='ethernet'> in xml),
>
>
> What DPDK driver did you use in the test (AF_PACKET?).
>

Yes, both testpmd are using AF_PACKET driver.

>
> > with a
> > testpmd txonly and another in rxonly forward mode, and using the
> > receiving side packets/bytes data. Guest's rps, xps and interrupts,
> > and host's vhost threads affinity were also tuned in each test to
> > schedule both testpmd and vhost in different processors.
>
>
> My feeling is that if we start from simple setup, it would be more
> easier as a start. E.g start without an VM.
>
> 1) TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP
> 2) RX: pkgetn -> TAP -> vhost_net -> testpmd(rxonly)
>

Got it. Is there a reason to prefer pktgen over testpmd?

> Thanks
>
>
> >
> > I will send the v10 RFC with the small changes requested by Stefan and Jason.
> >
> > Thanks!
> >
> >
> >
> >
> >
> >
> >
> >>>>> OK so it seems plausible that we still have a bug where an interrupt
> >>>>> is delayed. That is the main difference between pmd and virtio.
> >>>>> Let's try disabling event index, and see what happens - that's
> >>>>> the trickiest part of interrupts.
> >>>>>
> >>>> Got it, will get back with the results.
> >>>>
> >>>> Thank you very much!
> >>>>
> >>>>>
> >>>>>> - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
> >>>>>> - TCP_RR from 6223.64 transactions/sec to 5739.44
>

2020-07-01 14:11:53

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/7/1 下午9:04, Eugenio Perez Martin wrote:
> On Wed, Jul 1, 2020 at 2:40 PM Jason Wang <[email protected]> wrote:
>>
>> On 2020/7/1 下午6:43, Eugenio Perez Martin wrote:
>>> On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
>>> <[email protected]> wrote:
>>>> On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
>>>>> On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
>>>>>> On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
>>>>>>> On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
>>>>>>>> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
>>>>>>>> <[email protected]> wrote:
>>>>>>>>> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
>>>>>>>>>>> As testing shows no performance change, switch to that now.
>>>>>>>>>> What kind of testing? 100GiB? Low latency?
>>>>>>>>>>
>>>>>>>>> Hi Konrad.
>>>>>>>>>
>>>>>>>>> I tested this version of the patch:
>>>>>>>>> https://lkml.org/lkml/2019/10/13/42
>>>>>>>>>
>>>>>>>>> It was tested for throughput with DPDK's testpmd (as described in
>>>>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
>>>>>>>>> and kernel pktgen. No latency tests were performed by me. Maybe it is
>>>>>>>>> interesting to perform a latency test or just a different set of tests
>>>>>>>>> over a recent version.
>>>>>>>>>
>>>>>>>>> Thanks!
>>>>>>>> I have repeated the tests with v9, and results are a little bit different:
>>>>>>>> * If I test opening it with testpmd, I see no change between versions
>>>>>>> OK that is testpmd on guest, right? And vhost-net on the host?
>>>>>>>
>>>>>> Hi Michael.
>>>>>>
>>>>>> No, sorry, as described in
>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
>>>>>> But I could add to test it in the guest too.
>>>>>>
>>>>>> These kinds of raw packets "bursts" do not show performance
>>>>>> differences, but I could test deeper if you think it would be worth
>>>>>> it.
>>>>> Oh ok, so this is without guest, with virtio-user.
>>>>> It might be worth checking dpdk within guest too just
>>>>> as another data point.
>>>>>
>>>> Ok, I will do it!
>>>>
>>>>>>>> * If I forward packets between two vhost-net interfaces in the guest
>>>>>>>> using a linux bridge in the host:
>>>>>>> And here I guess you mean virtio-net in the guest kernel?
>>>>>> Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
>>>>>> the host. More precisely:
>>>>>> * Adding one of the interfaces to another namespace, assigning it an
>>>>>> IP, and starting netserver there.
>>>>>> * Assign another IP in the range manually to the other virtual net
>>>>>> interface, and start the desired test there.
>>>>>>
>>>>>> If you think it would be better to perform then differently please let me know.
>>>>> Not sure why you bother with namespaces since you said you are
>>>>> using L2 bridging. I guess it's unimportant.
>>>>>
>>>> Sorry, I think I should have provided more context about that.
>>>>
>>>> The only reason to use namespaces is to force the traffic of these
>>>> netperf tests to go through the external bridge. To test netperf
>>>> different possibilities than the testpmd (or pktgen or others "blast
>>>> of frames unconditionally" tests).
>>>>
>>>> This way, I make sure that is the same version of everything in the
>>>> guest, and is a little bit easier to manage cpu affinity, start and
>>>> stop testing...
>>>>
>>>> I could use a different VM for sending and receiving, but I find this
>>>> way a faster one and it should not introduce a lot of noise. I can
>>>> test with two VM if you think that this use of network namespace
>>>> introduces too much noise.
>>>>
>>>> Thanks!
>>>>
>>>>>>>> - netperf UDP_STREAM shows a performance increase of 1.8, almost
>>>>>>>> doubling performance. This gets lower as frame size increase.
>>> Regarding UDP_STREAM:
>>> * with event_idx=on: The performance difference is reduced a lot if
>>> applied affinity properly (manually assigning CPU on host/guest and
>>> setting IRQs on guest), making them perform equally with and without
>>> the patch again. Maybe the batching makes the scheduler perform
>>> better.
>>
>> Note that for UDP_STREAM, the result is pretty trick to be analyzed. E.g
>> setting a sndbuf for TAP may help for the performance (reduce the drop).
>>
> Ok, will add that to the test. Thanks!

Actually, it's better to skip the UDP_STREAM test since:

- My understanding is very few application is using raw UDP stream
- It's hard to analyze (usually you need to count the drop ratio etc)

>
>>>>>>>> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
>>>>>>>> transactions/sec to 5830
>>> * Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
>>> them perform similarly again, only a very small performance drop
>>> observed. It could be just noise.
>>> ** All of them perform better than vanilla if event_idx=off, not sure
>>> why. I can try to repeat them if you suspect that can be a test
>>> failure.
>>>
>>> * With testpmd and event_idx=off, if I send from the VM to host, I see
>>> a performance increment especially in small packets. The buf api also
>>> increases performance compared with only batching: Sending the minimum
>>> packet size in testpmd makes pps go from 356kpps to 473 kpps.
>>
>> What's your setup for this. The number looks rather low. I'd expected
>> 1-2 Mpps at least.
>>
> Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz, 2 NUMA nodes of 16G memory
> each, and no device assigned to the NUMA node I'm testing in. Too low
> for testpmd AF_PACKET driver too?

I don't test AF_PACKET, I guess it should use the V3 which mmap based
zerocopy interface.

And it might worth to check the cpu utilization of vhost thread. It's
required to stress it as 100% otherwise there could be a bottleneck
somewhere.

>
>>> Sending
>>> 1024 length UDP-PDU makes it go from 570kpps to 64 kpps.
>>>
>>> Something strange I observe in these tests: I get more pps the bigger
>>> the transmitted buffer size is. Not sure why.
>>>
>>> ** Sending from the host to the VM does not make a big change with the
>>> patches in small packets scenario (minimum, 64 bytes, about 645
>>> without the patch, ~625 with batch and batch+buf api). If the packets
>>> are bigger, I can see a performance increase: with 256 bits,
>>
>> I think you meant bytes?
>>
> Yes, sorry.
>
>>> it goes
>>> from 590kpps to about 600kpps, and in case of 1500 bytes payload it
>>> gets from 348kpps to 528kpps, so it is clearly an improvement.
>>>
>>> * with testpmd and event_idx=on, batching+buf api perform similarly in
>>> both directions.
>>>
>>> All of testpmd tests were performed with no linux bridge, just a
>>> host's tap interface (<interface type='ethernet'> in xml),
>>
>> What DPDK driver did you use in the test (AF_PACKET?).
>>
> Yes, both testpmd are using AF_PACKET driver.

I see, using AF_PACKET means extra layers of issues need to be analyzed
which is probably not good.

>
>>> with a
>>> testpmd txonly and another in rxonly forward mode, and using the
>>> receiving side packets/bytes data. Guest's rps, xps and interrupts,
>>> and host's vhost threads affinity were also tuned in each test to
>>> schedule both testpmd and vhost in different processors.
>>
>> My feeling is that if we start from simple setup, it would be more
>> easier as a start. E.g start without an VM.
>>
>> 1) TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP
>> 2) RX: pkgetn -> TAP -> vhost_net -> testpmd(rxonly)
>>
> Got it. Is there a reason to prefer pktgen over testpmd?

I think the reason is using testpmd you must use a userspace kernel
interface (AF_PACKET), and it could not be as fast as pktgen since:

- it talks directly to xmit of TAP
- skb can be cloned

Thanks

>
>> Thanks
>>
>>
>>> I will send the v10 RFC with the small changes requested by Stefan and Jason.
>>>
>>> Thanks!
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>>>>> OK so it seems plausible that we still have a bug where an interrupt
>>>>>>> is delayed. That is the main difference between pmd and virtio.
>>>>>>> Let's try disabling event index, and see what happens - that's
>>>>>>> the trickiest part of interrupts.
>>>>>>>
>>>>>> Got it, will get back with the results.
>>>>>>
>>>>>> Thank you very much!
>>>>>>
>>>>>>>> - TCP_STREAM goes from ~10.7 gbps to ~7Gbps
>>>>>>>> - TCP_RR from 6223.64 transactions/sec to 5739.44

2020-07-09 16:47:39

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Wed, Jul 1, 2020 at 4:10 PM Jason Wang <[email protected]> wrote:
>
>
> On 2020/7/1 下午9:04, Eugenio Perez Martin wrote:
> > On Wed, Jul 1, 2020 at 2:40 PM Jason Wang <[email protected]> wrote:
> >>
> >> On 2020/7/1 下午6:43, Eugenio Perez Martin wrote:
> >>> On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
> >>> <[email protected]> wrote:
> >>>> On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
> >>>>> On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
> >>>>>> On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
> >>>>>>> On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> >>>>>>>> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> >>>>>>>> <[email protected]> wrote:
> >>>>>>>>> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> >>>>>>>>> <[email protected]> wrote:
> >>>>>>>>>> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> >>>>>>>>>>> As testing shows no performance change, switch to that now.
> >>>>>>>>>> What kind of testing? 100GiB? Low latency?
> >>>>>>>>>>
> >>>>>>>>> Hi Konrad.
> >>>>>>>>>
> >>>>>>>>> I tested this version of the patch:
> >>>>>>>>> https://lkml.org/lkml/2019/10/13/42
> >>>>>>>>>
> >>>>>>>>> It was tested for throughput with DPDK's testpmd (as described in
> >>>>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> >>>>>>>>> and kernel pktgen. No latency tests were performed by me. Maybe it is
> >>>>>>>>> interesting to perform a latency test or just a different set of tests
> >>>>>>>>> over a recent version.
> >>>>>>>>>
> >>>>>>>>> Thanks!
> >>>>>>>> I have repeated the tests with v9, and results are a little bit different:
> >>>>>>>> * If I test opening it with testpmd, I see no change between versions
> >>>>>>> OK that is testpmd on guest, right? And vhost-net on the host?
> >>>>>>>
> >>>>>> Hi Michael.
> >>>>>>
> >>>>>> No, sorry, as described in
> >>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
> >>>>>> But I could add to test it in the guest too.
> >>>>>>
> >>>>>> These kinds of raw packets "bursts" do not show performance
> >>>>>> differences, but I could test deeper if you think it would be worth
> >>>>>> it.
> >>>>> Oh ok, so this is without guest, with virtio-user.
> >>>>> It might be worth checking dpdk within guest too just
> >>>>> as another data point.
> >>>>>
> >>>> Ok, I will do it!
> >>>>
> >>>>>>>> * If I forward packets between two vhost-net interfaces in the guest
> >>>>>>>> using a linux bridge in the host:
> >>>>>>> And here I guess you mean virtio-net in the guest kernel?
> >>>>>> Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
> >>>>>> the host. More precisely:
> >>>>>> * Adding one of the interfaces to another namespace, assigning it an
> >>>>>> IP, and starting netserver there.
> >>>>>> * Assign another IP in the range manually to the other virtual net
> >>>>>> interface, and start the desired test there.
> >>>>>>
> >>>>>> If you think it would be better to perform then differently please let me know.
> >>>>> Not sure why you bother with namespaces since you said you are
> >>>>> using L2 bridging. I guess it's unimportant.
> >>>>>
> >>>> Sorry, I think I should have provided more context about that.
> >>>>
> >>>> The only reason to use namespaces is to force the traffic of these
> >>>> netperf tests to go through the external bridge. To test netperf
> >>>> different possibilities than the testpmd (or pktgen or others "blast
> >>>> of frames unconditionally" tests).
> >>>>
> >>>> This way, I make sure that is the same version of everything in the
> >>>> guest, and is a little bit easier to manage cpu affinity, start and
> >>>> stop testing...
> >>>>
> >>>> I could use a different VM for sending and receiving, but I find this
> >>>> way a faster one and it should not introduce a lot of noise. I can
> >>>> test with two VM if you think that this use of network namespace
> >>>> introduces too much noise.
> >>>>
> >>>> Thanks!
> >>>>
> >>>>>>>> - netperf UDP_STREAM shows a performance increase of 1.8, almost
> >>>>>>>> doubling performance. This gets lower as frame size increase.
> >>> Regarding UDP_STREAM:
> >>> * with event_idx=on: The performance difference is reduced a lot if
> >>> applied affinity properly (manually assigning CPU on host/guest and
> >>> setting IRQs on guest), making them perform equally with and without
> >>> the patch again. Maybe the batching makes the scheduler perform
> >>> better.
> >>
> >> Note that for UDP_STREAM, the result is pretty trick to be analyzed. E.g
> >> setting a sndbuf for TAP may help for the performance (reduce the drop).
> >>
> > Ok, will add that to the test. Thanks!
>
>
> Actually, it's better to skip the UDP_STREAM test since:
>
> - My understanding is very few application is using raw UDP stream
> - It's hard to analyze (usually you need to count the drop ratio etc)
>
>
> >
> >>>>>>>> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> >>>>>>>> transactions/sec to 5830
> >>> * Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
> >>> them perform similarly again, only a very small performance drop
> >>> observed. It could be just noise.
> >>> ** All of them perform better than vanilla if event_idx=off, not sure
> >>> why. I can try to repeat them if you suspect that can be a test
> >>> failure.
> >>>
> >>> * With testpmd and event_idx=off, if I send from the VM to host, I see
> >>> a performance increment especially in small packets. The buf api also
> >>> increases performance compared with only batching: Sending the minimum
> >>> packet size in testpmd makes pps go from 356kpps to 473 kpps.
> >>
> >> What's your setup for this. The number looks rather low. I'd expected
> >> 1-2 Mpps at least.
> >>
> > Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz, 2 NUMA nodes of 16G memory
> > each, and no device assigned to the NUMA node I'm testing in. Too low
> > for testpmd AF_PACKET driver too?
>
>
> I don't test AF_PACKET, I guess it should use the V3 which mmap based
> zerocopy interface.
>
> And it might worth to check the cpu utilization of vhost thread. It's
> required to stress it as 100% otherwise there could be a bottleneck
> somewhere.
>
>
> >
> >>> Sending
> >>> 1024 length UDP-PDU makes it go from 570kpps to 64 kpps.
> >>>
> >>> Something strange I observe in these tests: I get more pps the bigger
> >>> the transmitted buffer size is. Not sure why.
> >>>
> >>> ** Sending from the host to the VM does not make a big change with the
> >>> patches in small packets scenario (minimum, 64 bytes, about 645
> >>> without the patch, ~625 with batch and batch+buf api). If the packets
> >>> are bigger, I can see a performance increase: with 256 bits,
> >>
> >> I think you meant bytes?
> >>
> > Yes, sorry.
> >
> >>> it goes
> >>> from 590kpps to about 600kpps, and in case of 1500 bytes payload it
> >>> gets from 348kpps to 528kpps, so it is clearly an improvement.
> >>>
> >>> * with testpmd and event_idx=on, batching+buf api perform similarly in
> >>> both directions.
> >>>
> >>> All of testpmd tests were performed with no linux bridge, just a
> >>> host's tap interface (<interface type='ethernet'> in xml),
> >>
> >> What DPDK driver did you use in the test (AF_PACKET?).
> >>
> > Yes, both testpmd are using AF_PACKET driver.
>
>
> I see, using AF_PACKET means extra layers of issues need to be analyzed
> which is probably not good.
>
>
> >
> >>> with a
> >>> testpmd txonly and another in rxonly forward mode, and using the
> >>> receiving side packets/bytes data. Guest's rps, xps and interrupts,
> >>> and host's vhost threads affinity were also tuned in each test to
> >>> schedule both testpmd and vhost in different processors.
> >>
> >> My feeling is that if we start from simple setup, it would be more
> >> easier as a start. E.g start without an VM.
> >>
> >> 1) TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP
> >> 2) RX: pkgetn -> TAP -> vhost_net -> testpmd(rxonly)
> >>
> > Got it. Is there a reason to prefer pktgen over testpmd?
>
>
> I think the reason is using testpmd you must use a userspace kernel
> interface (AF_PACKET), and it could not be as fast as pktgen since:
>
> - it talks directly to xmit of TAP
> - skb can be cloned
>

Hi!

Here it is the result of the tests. Details on [1].

Tx:
===

For tx packets it seems that the batching patch makes things a little
bit worse, but the buf_api outperforms baseline by a 7%:

* We start with a baseline of 4208772.571 pps and 269361444.6 bytes/s [2].
* When we add the batching, I see a small performance decrease:
4133292.308 and 264530707.7 bytes/s.
* However, the buf api it outperform the baseline: 4551319.631pps,
291205178.1 bytes/s

I don't have numbers on the receiver side since it is just a XDP_DROP.
I think it would be interesting to see them.

Rx:
===

Regarding Rx, the reverse is observed: a small performance increase is
observed with batching (~2%), but buf_api makes tests perform equally
to baseline.

pktgen was called using pktgen_sample01_simple.sh, with the environment:
DEV="$tap_name" F_THREAD=1 DST_MAC=$MAC_ADDR COUNT=$((2500000*25))
SKB_CLONE=$((2**31))

And testpmd is the same as Tx but with forward-mode=rxonly.

Pktgen reports:
Baseline: 1853025pps 622Mb/sec (622616400bps) errors: 7915231
Batch: 1891404pps 635Mb/sec (635511744bps) errors: 4926093
Buf_api: 1844008pps 619Mb/sec (619586688bps) errors: 47766692

Testpmd reports:
Baseline: 1854448pps, 860464156 bps. [3]
Batch: 1892844.25pps, 878280070bps.
Buf_api: 1846139.75pps, 856609120bps.

Any thoughts?

Thanks!

[1]
Testpmd options: -l 1,3
--vdev=virtio_user0,mac=01:02:03:04:05:06,path=/dev/vhost-net,queue_size=1024
-- --auto-start --stats-period 5 --tx-offloads="$TX_OFFLOADS"
--rx-offloads="$RX_OFFLOADS" --txd=4096 --rxd=4096 --burst=512
--forward-mode=txonly

Where offloads were obtained manually running with
--[tr]x-offloads=0x8fff and examining testpmd response:
declare -r RX_OFFLOADS=0x81d
declare -r TX_OFFLOADS=0x802d

All of the tests results are an average of at least 3 samples of
testpmd, discarding the obvious deviations at start/end (like warming
up or waiting for pktgen to start). The result of pktgen is directly
c&p from its output.

The numbers do not change very much from one stats printing to another
of testpmd.

[2] Obtained subtracting each accumulated tx-packets from one stats
print to the previous one. If we attend testpmd output about Tx-pps,
it counts a little bit less performance, but it follows the same
pattern:

Testpmd pps/bps stats:
Baseline: 3510826.25 pps, 1797887912bps = 224735989bytes/sec
Batch: 3448515.571pps, 1765640226bps = 220705028.3bytes/sec
Buf api: 3794115.333pps, 1942587286bps = 242823410.8bytes/sec

[3] This is obtained using the rx-pps/rx-bps report of testpmd.

Seems strange to me that the relation between pps/bps is ~336 this
time, and between accumulated pkts/accumulated bytes is ~58. Also, the
relation between them is not even close to 8.

However, testpmd shows a lot of absolute packets received. If we see
the received packets in a period subtracting from the previous one,
testpmd tells that receive more pps than pktgen tx-pps:
Baseline: ~2222668.667pps 128914784.3bps.
Batch: 2269260.933pps, 131617134.9bps
Buf_api: 2213226.467pps, 128367135.9bp

2020-07-09 17:42:01

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Thu, Jul 09, 2020 at 06:46:13PM +0200, Eugenio Perez Martin wrote:
> On Wed, Jul 1, 2020 at 4:10 PM Jason Wang <[email protected]> wrote:
> >
> >
> > On 2020/7/1 下午9:04, Eugenio Perez Martin wrote:
> > > On Wed, Jul 1, 2020 at 2:40 PM Jason Wang <[email protected]> wrote:
> > >>
> > >> On 2020/7/1 下午6:43, Eugenio Perez Martin wrote:
> > >>> On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
> > >>> <[email protected]> wrote:
> > >>>> On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
> > >>>>> On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
> > >>>>>> On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
> > >>>>>>> On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> > >>>>>>>> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> > >>>>>>>> <[email protected]> wrote:
> > >>>>>>>>> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> > >>>>>>>>> <[email protected]> wrote:
> > >>>>>>>>>> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> > >>>>>>>>>>> As testing shows no performance change, switch to that now.
> > >>>>>>>>>> What kind of testing? 100GiB? Low latency?
> > >>>>>>>>>>
> > >>>>>>>>> Hi Konrad.
> > >>>>>>>>>
> > >>>>>>>>> I tested this version of the patch:
> > >>>>>>>>> https://lkml.org/lkml/2019/10/13/42
> > >>>>>>>>>
> > >>>>>>>>> It was tested for throughput with DPDK's testpmd (as described in
> > >>>>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> > >>>>>>>>> and kernel pktgen. No latency tests were performed by me. Maybe it is
> > >>>>>>>>> interesting to perform a latency test or just a different set of tests
> > >>>>>>>>> over a recent version.
> > >>>>>>>>>
> > >>>>>>>>> Thanks!
> > >>>>>>>> I have repeated the tests with v9, and results are a little bit different:
> > >>>>>>>> * If I test opening it with testpmd, I see no change between versions
> > >>>>>>> OK that is testpmd on guest, right? And vhost-net on the host?
> > >>>>>>>
> > >>>>>> Hi Michael.
> > >>>>>>
> > >>>>>> No, sorry, as described in
> > >>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
> > >>>>>> But I could add to test it in the guest too.
> > >>>>>>
> > >>>>>> These kinds of raw packets "bursts" do not show performance
> > >>>>>> differences, but I could test deeper if you think it would be worth
> > >>>>>> it.
> > >>>>> Oh ok, so this is without guest, with virtio-user.
> > >>>>> It might be worth checking dpdk within guest too just
> > >>>>> as another data point.
> > >>>>>
> > >>>> Ok, I will do it!
> > >>>>
> > >>>>>>>> * If I forward packets between two vhost-net interfaces in the guest
> > >>>>>>>> using a linux bridge in the host:
> > >>>>>>> And here I guess you mean virtio-net in the guest kernel?
> > >>>>>> Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
> > >>>>>> the host. More precisely:
> > >>>>>> * Adding one of the interfaces to another namespace, assigning it an
> > >>>>>> IP, and starting netserver there.
> > >>>>>> * Assign another IP in the range manually to the other virtual net
> > >>>>>> interface, and start the desired test there.
> > >>>>>>
> > >>>>>> If you think it would be better to perform then differently please let me know.
> > >>>>> Not sure why you bother with namespaces since you said you are
> > >>>>> using L2 bridging. I guess it's unimportant.
> > >>>>>
> > >>>> Sorry, I think I should have provided more context about that.
> > >>>>
> > >>>> The only reason to use namespaces is to force the traffic of these
> > >>>> netperf tests to go through the external bridge. To test netperf
> > >>>> different possibilities than the testpmd (or pktgen or others "blast
> > >>>> of frames unconditionally" tests).
> > >>>>
> > >>>> This way, I make sure that is the same version of everything in the
> > >>>> guest, and is a little bit easier to manage cpu affinity, start and
> > >>>> stop testing...
> > >>>>
> > >>>> I could use a different VM for sending and receiving, but I find this
> > >>>> way a faster one and it should not introduce a lot of noise. I can
> > >>>> test with two VM if you think that this use of network namespace
> > >>>> introduces too much noise.
> > >>>>
> > >>>> Thanks!
> > >>>>
> > >>>>>>>> - netperf UDP_STREAM shows a performance increase of 1.8, almost
> > >>>>>>>> doubling performance. This gets lower as frame size increase.
> > >>> Regarding UDP_STREAM:
> > >>> * with event_idx=on: The performance difference is reduced a lot if
> > >>> applied affinity properly (manually assigning CPU on host/guest and
> > >>> setting IRQs on guest), making them perform equally with and without
> > >>> the patch again. Maybe the batching makes the scheduler perform
> > >>> better.
> > >>
> > >> Note that for UDP_STREAM, the result is pretty trick to be analyzed. E.g
> > >> setting a sndbuf for TAP may help for the performance (reduce the drop).
> > >>
> > > Ok, will add that to the test. Thanks!
> >
> >
> > Actually, it's better to skip the UDP_STREAM test since:
> >
> > - My understanding is very few application is using raw UDP stream
> > - It's hard to analyze (usually you need to count the drop ratio etc)
> >
> >
> > >
> > >>>>>>>> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> > >>>>>>>> transactions/sec to 5830
> > >>> * Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
> > >>> them perform similarly again, only a very small performance drop
> > >>> observed. It could be just noise.
> > >>> ** All of them perform better than vanilla if event_idx=off, not sure
> > >>> why. I can try to repeat them if you suspect that can be a test
> > >>> failure.
> > >>>
> > >>> * With testpmd and event_idx=off, if I send from the VM to host, I see
> > >>> a performance increment especially in small packets. The buf api also
> > >>> increases performance compared with only batching: Sending the minimum
> > >>> packet size in testpmd makes pps go from 356kpps to 473 kpps.
> > >>
> > >> What's your setup for this. The number looks rather low. I'd expected
> > >> 1-2 Mpps at least.
> > >>
> > > Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz, 2 NUMA nodes of 16G memory
> > > each, and no device assigned to the NUMA node I'm testing in. Too low
> > > for testpmd AF_PACKET driver too?
> >
> >
> > I don't test AF_PACKET, I guess it should use the V3 which mmap based
> > zerocopy interface.
> >
> > And it might worth to check the cpu utilization of vhost thread. It's
> > required to stress it as 100% otherwise there could be a bottleneck
> > somewhere.
> >
> >
> > >
> > >>> Sending
> > >>> 1024 length UDP-PDU makes it go from 570kpps to 64 kpps.
> > >>>
> > >>> Something strange I observe in these tests: I get more pps the bigger
> > >>> the transmitted buffer size is. Not sure why.
> > >>>
> > >>> ** Sending from the host to the VM does not make a big change with the
> > >>> patches in small packets scenario (minimum, 64 bytes, about 645
> > >>> without the patch, ~625 with batch and batch+buf api). If the packets
> > >>> are bigger, I can see a performance increase: with 256 bits,
> > >>
> > >> I think you meant bytes?
> > >>
> > > Yes, sorry.
> > >
> > >>> it goes
> > >>> from 590kpps to about 600kpps, and in case of 1500 bytes payload it
> > >>> gets from 348kpps to 528kpps, so it is clearly an improvement.
> > >>>
> > >>> * with testpmd and event_idx=on, batching+buf api perform similarly in
> > >>> both directions.
> > >>>
> > >>> All of testpmd tests were performed with no linux bridge, just a
> > >>> host's tap interface (<interface type='ethernet'> in xml),
> > >>
> > >> What DPDK driver did you use in the test (AF_PACKET?).
> > >>
> > > Yes, both testpmd are using AF_PACKET driver.
> >
> >
> > I see, using AF_PACKET means extra layers of issues need to be analyzed
> > which is probably not good.
> >
> >
> > >
> > >>> with a
> > >>> testpmd txonly and another in rxonly forward mode, and using the
> > >>> receiving side packets/bytes data. Guest's rps, xps and interrupts,
> > >>> and host's vhost threads affinity were also tuned in each test to
> > >>> schedule both testpmd and vhost in different processors.
> > >>
> > >> My feeling is that if we start from simple setup, it would be more
> > >> easier as a start. E.g start without an VM.
> > >>
> > >> 1) TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP
> > >> 2) RX: pkgetn -> TAP -> vhost_net -> testpmd(rxonly)
> > >>
> > > Got it. Is there a reason to prefer pktgen over testpmd?
> >
> >
> > I think the reason is using testpmd you must use a userspace kernel
> > interface (AF_PACKET), and it could not be as fast as pktgen since:
> >
> > - it talks directly to xmit of TAP
> > - skb can be cloned
> >
>
> Hi!
>
> Here it is the result of the tests. Details on [1].
>
> Tx:
> ===
>
> For tx packets it seems that the batching patch makes things a little
> bit worse, but the buf_api outperforms baseline by a 7%:
>
> * We start with a baseline of 4208772.571 pps and 269361444.6 bytes/s [2].
> * When we add the batching, I see a small performance decrease:
> 4133292.308 and 264530707.7 bytes/s.
> * However, the buf api it outperform the baseline: 4551319.631pps,
> 291205178.1 bytes/s
>
> I don't have numbers on the receiver side since it is just a XDP_DROP.
> I think it would be interesting to see them.
>
> Rx:
> ===
>
> Regarding Rx, the reverse is observed: a small performance increase is
> observed with batching (~2%), but buf_api makes tests perform equally
> to baseline.
>
> pktgen was called using pktgen_sample01_simple.sh, with the environment:
> DEV="$tap_name" F_THREAD=1 DST_MAC=$MAC_ADDR COUNT=$((2500000*25))
> SKB_CLONE=$((2**31))
>
> And testpmd is the same as Tx but with forward-mode=rxonly.
>
> Pktgen reports:
> Baseline: 1853025pps 622Mb/sec (622616400bps) errors: 7915231
> Batch: 1891404pps 635Mb/sec (635511744bps) errors: 4926093
> Buf_api: 1844008pps 619Mb/sec (619586688bps) errors: 47766692
>
> Testpmd reports:
> Baseline: 1854448pps, 860464156 bps. [3]
> Batch: 1892844.25pps, 878280070bps.
> Buf_api: 1846139.75pps, 856609120bps.
>
> Any thoughts?
>
> Thanks!
>
> [1]
> Testpmd options: -l 1,3
> --vdev=virtio_user0,mac=01:02:03:04:05:06,path=/dev/vhost-net,queue_size=1024
> -- --auto-start --stats-period 5 --tx-offloads="$TX_OFFLOADS"
> --rx-offloads="$RX_OFFLOADS" --txd=4096 --rxd=4096 --burst=512
> --forward-mode=txonly
>
> Where offloads were obtained manually running with
> --[tr]x-offloads=0x8fff and examining testpmd response:
> declare -r RX_OFFLOADS=0x81d
> declare -r TX_OFFLOADS=0x802d
>
> All of the tests results are an average of at least 3 samples of
> testpmd, discarding the obvious deviations at start/end (like warming
> up or waiting for pktgen to start). The result of pktgen is directly
> c&p from its output.
>
> The numbers do not change very much from one stats printing to another
> of testpmd.
>
> [2] Obtained subtracting each accumulated tx-packets from one stats
> print to the previous one. If we attend testpmd output about Tx-pps,
> it counts a little bit less performance, but it follows the same
> pattern:
>
> Testpmd pps/bps stats:
> Baseline: 3510826.25 pps, 1797887912bps = 224735989bytes/sec
> Batch: 3448515.571pps, 1765640226bps = 220705028.3bytes/sec
> Buf api: 3794115.333pps, 1942587286bps = 242823410.8bytes/sec
>
> [3] This is obtained using the rx-pps/rx-bps report of testpmd.
>
> Seems strange to me that the relation between pps/bps is ~336 this
> time, and between accumulated pkts/accumulated bytes is ~58. Also, the
> relation between them is not even close to 8.
>
> However, testpmd shows a lot of absolute packets received. If we see
> the received packets in a period subtracting from the previous one,
> testpmd tells that receive more pps than pktgen tx-pps:
> Baseline: ~2222668.667pps 128914784.3bps.
> Batch: 2269260.933pps, 131617134.9bps
> Buf_api: 2213226.467pps, 128367135.9bp

How about playing with the batch size? Make it a mod parameter instead
of the hard coded 64, and measure for all values 1 to 64 ...

2020-07-10 03:57:05

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/7/10 上午1:37, Michael S. Tsirkin wrote:
> On Thu, Jul 09, 2020 at 06:46:13PM +0200, Eugenio Perez Martin wrote:
>> On Wed, Jul 1, 2020 at 4:10 PM Jason Wang <[email protected]> wrote:
>>>
>>> On 2020/7/1 下午9:04, Eugenio Perez Martin wrote:
>>>> On Wed, Jul 1, 2020 at 2:40 PM Jason Wang <[email protected]> wrote:
>>>>> On 2020/7/1 下午6:43, Eugenio Perez Martin wrote:
>>>>>> On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
>>>>>> <[email protected]> wrote:
>>>>>>> On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
>>>>>>>> On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
>>>>>>>>> On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
>>>>>>>>>> On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
>>>>>>>>>>> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
>>>>>>>>>>>> <[email protected]> wrote:
>>>>>>>>>>>>> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
>>>>>>>>>>>>>> As testing shows no performance change, switch to that now.
>>>>>>>>>>>>> What kind of testing? 100GiB? Low latency?
>>>>>>>>>>>>>
>>>>>>>>>>>> Hi Konrad.
>>>>>>>>>>>>
>>>>>>>>>>>> I tested this version of the patch:
>>>>>>>>>>>> https://lkml.org/lkml/2019/10/13/42
>>>>>>>>>>>>
>>>>>>>>>>>> It was tested for throughput with DPDK's testpmd (as described in
>>>>>>>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
>>>>>>>>>>>> and kernel pktgen. No latency tests were performed by me. Maybe it is
>>>>>>>>>>>> interesting to perform a latency test or just a different set of tests
>>>>>>>>>>>> over a recent version.
>>>>>>>>>>>>
>>>>>>>>>>>> Thanks!
>>>>>>>>>>> I have repeated the tests with v9, and results are a little bit different:
>>>>>>>>>>> * If I test opening it with testpmd, I see no change between versions
>>>>>>>>>> OK that is testpmd on guest, right? And vhost-net on the host?
>>>>>>>>>>
>>>>>>>>> Hi Michael.
>>>>>>>>>
>>>>>>>>> No, sorry, as described in
>>>>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
>>>>>>>>> But I could add to test it in the guest too.
>>>>>>>>>
>>>>>>>>> These kinds of raw packets "bursts" do not show performance
>>>>>>>>> differences, but I could test deeper if you think it would be worth
>>>>>>>>> it.
>>>>>>>> Oh ok, so this is without guest, with virtio-user.
>>>>>>>> It might be worth checking dpdk within guest too just
>>>>>>>> as another data point.
>>>>>>>>
>>>>>>> Ok, I will do it!
>>>>>>>
>>>>>>>>>>> * If I forward packets between two vhost-net interfaces in the guest
>>>>>>>>>>> using a linux bridge in the host:
>>>>>>>>>> And here I guess you mean virtio-net in the guest kernel?
>>>>>>>>> Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
>>>>>>>>> the host. More precisely:
>>>>>>>>> * Adding one of the interfaces to another namespace, assigning it an
>>>>>>>>> IP, and starting netserver there.
>>>>>>>>> * Assign another IP in the range manually to the other virtual net
>>>>>>>>> interface, and start the desired test there.
>>>>>>>>>
>>>>>>>>> If you think it would be better to perform then differently please let me know.
>>>>>>>> Not sure why you bother with namespaces since you said you are
>>>>>>>> using L2 bridging. I guess it's unimportant.
>>>>>>>>
>>>>>>> Sorry, I think I should have provided more context about that.
>>>>>>>
>>>>>>> The only reason to use namespaces is to force the traffic of these
>>>>>>> netperf tests to go through the external bridge. To test netperf
>>>>>>> different possibilities than the testpmd (or pktgen or others "blast
>>>>>>> of frames unconditionally" tests).
>>>>>>>
>>>>>>> This way, I make sure that is the same version of everything in the
>>>>>>> guest, and is a little bit easier to manage cpu affinity, start and
>>>>>>> stop testing...
>>>>>>>
>>>>>>> I could use a different VM for sending and receiving, but I find this
>>>>>>> way a faster one and it should not introduce a lot of noise. I can
>>>>>>> test with two VM if you think that this use of network namespace
>>>>>>> introduces too much noise.
>>>>>>>
>>>>>>> Thanks!
>>>>>>>
>>>>>>>>>>> - netperf UDP_STREAM shows a performance increase of 1.8, almost
>>>>>>>>>>> doubling performance. This gets lower as frame size increase.
>>>>>> Regarding UDP_STREAM:
>>>>>> * with event_idx=on: The performance difference is reduced a lot if
>>>>>> applied affinity properly (manually assigning CPU on host/guest and
>>>>>> setting IRQs on guest), making them perform equally with and without
>>>>>> the patch again. Maybe the batching makes the scheduler perform
>>>>>> better.
>>>>> Note that for UDP_STREAM, the result is pretty trick to be analyzed. E.g
>>>>> setting a sndbuf for TAP may help for the performance (reduce the drop).
>>>>>
>>>> Ok, will add that to the test. Thanks!
>>>
>>> Actually, it's better to skip the UDP_STREAM test since:
>>>
>>> - My understanding is very few application is using raw UDP stream
>>> - It's hard to analyze (usually you need to count the drop ratio etc)
>>>
>>>
>>>>>>>>>>> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
>>>>>>>>>>> transactions/sec to 5830
>>>>>> * Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
>>>>>> them perform similarly again, only a very small performance drop
>>>>>> observed. It could be just noise.
>>>>>> ** All of them perform better than vanilla if event_idx=off, not sure
>>>>>> why. I can try to repeat them if you suspect that can be a test
>>>>>> failure.
>>>>>>
>>>>>> * With testpmd and event_idx=off, if I send from the VM to host, I see
>>>>>> a performance increment especially in small packets. The buf api also
>>>>>> increases performance compared with only batching: Sending the minimum
>>>>>> packet size in testpmd makes pps go from 356kpps to 473 kpps.
>>>>> What's your setup for this. The number looks rather low. I'd expected
>>>>> 1-2 Mpps at least.
>>>>>
>>>> Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz, 2 NUMA nodes of 16G memory
>>>> each, and no device assigned to the NUMA node I'm testing in. Too low
>>>> for testpmd AF_PACKET driver too?
>>>
>>> I don't test AF_PACKET, I guess it should use the V3 which mmap based
>>> zerocopy interface.
>>>
>>> And it might worth to check the cpu utilization of vhost thread. It's
>>> required to stress it as 100% otherwise there could be a bottleneck
>>> somewhere.
>>>
>>>
>>>>>> Sending
>>>>>> 1024 length UDP-PDU makes it go from 570kpps to 64 kpps.
>>>>>>
>>>>>> Something strange I observe in these tests: I get more pps the bigger
>>>>>> the transmitted buffer size is. Not sure why.
>>>>>>
>>>>>> ** Sending from the host to the VM does not make a big change with the
>>>>>> patches in small packets scenario (minimum, 64 bytes, about 645
>>>>>> without the patch, ~625 with batch and batch+buf api). If the packets
>>>>>> are bigger, I can see a performance increase: with 256 bits,
>>>>> I think you meant bytes?
>>>>>
>>>> Yes, sorry.
>>>>
>>>>>> it goes
>>>>>> from 590kpps to about 600kpps, and in case of 1500 bytes payload it
>>>>>> gets from 348kpps to 528kpps, so it is clearly an improvement.
>>>>>>
>>>>>> * with testpmd and event_idx=on, batching+buf api perform similarly in
>>>>>> both directions.
>>>>>>
>>>>>> All of testpmd tests were performed with no linux bridge, just a
>>>>>> host's tap interface (<interface type='ethernet'> in xml),
>>>>> What DPDK driver did you use in the test (AF_PACKET?).
>>>>>
>>>> Yes, both testpmd are using AF_PACKET driver.
>>>
>>> I see, using AF_PACKET means extra layers of issues need to be analyzed
>>> which is probably not good.
>>>
>>>
>>>>>> with a
>>>>>> testpmd txonly and another in rxonly forward mode, and using the
>>>>>> receiving side packets/bytes data. Guest's rps, xps and interrupts,
>>>>>> and host's vhost threads affinity were also tuned in each test to
>>>>>> schedule both testpmd and vhost in different processors.
>>>>> My feeling is that if we start from simple setup, it would be more
>>>>> easier as a start. E.g start without an VM.
>>>>>
>>>>> 1) TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP
>>>>> 2) RX: pkgetn -> TAP -> vhost_net -> testpmd(rxonly)
>>>>>
>>>> Got it. Is there a reason to prefer pktgen over testpmd?
>>>
>>> I think the reason is using testpmd you must use a userspace kernel
>>> interface (AF_PACKET), and it could not be as fast as pktgen since:
>>>
>>> - it talks directly to xmit of TAP
>>> - skb can be cloned
>>>
>> Hi!
>>
>> Here it is the result of the tests. Details on [1].
>>
>> Tx:
>> ===
>>
>> For tx packets it seems that the batching patch makes things a little
>> bit worse, but the buf_api outperforms baseline by a 7%:
>>
>> * We start with a baseline of 4208772.571 pps and 269361444.6 bytes/s [2].
>> * When we add the batching, I see a small performance decrease:
>> 4133292.308 and 264530707.7 bytes/s.
>> * However, the buf api it outperform the baseline: 4551319.631pps,
>> 291205178.1 bytes/s
>>
>> I don't have numbers on the receiver side since it is just a XDP_DROP.
>> I think it would be interesting to see them.
>>
>> Rx:
>> ===
>>
>> Regarding Rx, the reverse is observed: a small performance increase is
>> observed with batching (~2%), but buf_api makes tests perform equally
>> to baseline.
>>
>> pktgen was called using pktgen_sample01_simple.sh, with the environment:
>> DEV="$tap_name" F_THREAD=1 DST_MAC=$MAC_ADDR COUNT=$((2500000*25))
>> SKB_CLONE=$((2**31))
>>
>> And testpmd is the same as Tx but with forward-mode=rxonly.
>>
>> Pktgen reports:
>> Baseline: 1853025pps 622Mb/sec (622616400bps) errors: 7915231
>> Batch: 1891404pps 635Mb/sec (635511744bps) errors: 4926093
>> Buf_api: 1844008pps 619Mb/sec (619586688bps) errors: 47766692
>>
>> Testpmd reports:
>> Baseline: 1854448pps, 860464156 bps. [3]
>> Batch: 1892844.25pps, 878280070bps.
>> Buf_api: 1846139.75pps, 856609120bps.
>>
>> Any thoughts?
>>
>> Thanks!
>>
>> [1]
>> Testpmd options: -l 1,3
>> --vdev=virtio_user0,mac=01:02:03:04:05:06,path=/dev/vhost-net,queue_size=1024
>> -- --auto-start --stats-period 5 --tx-offloads="$TX_OFFLOADS"
>> --rx-offloads="$RX_OFFLOADS" --txd=4096 --rxd=4096 --burst=512
>> --forward-mode=txonly
>>
>> Where offloads were obtained manually running with
>> --[tr]x-offloads=0x8fff and examining testpmd response:
>> declare -r RX_OFFLOADS=0x81d
>> declare -r TX_OFFLOADS=0x802d
>>
>> All of the tests results are an average of at least 3 samples of
>> testpmd, discarding the obvious deviations at start/end (like warming
>> up or waiting for pktgen to start). The result of pktgen is directly
>> c&p from its output.
>>
>> The numbers do not change very much from one stats printing to another
>> of testpmd.
>>
>> [2] Obtained subtracting each accumulated tx-packets from one stats
>> print to the previous one. If we attend testpmd output about Tx-pps,
>> it counts a little bit less performance, but it follows the same
>> pattern:
>>
>> Testpmd pps/bps stats:
>> Baseline: 3510826.25 pps, 1797887912bps = 224735989bytes/sec
>> Batch: 3448515.571pps, 1765640226bps = 220705028.3bytes/sec
>> Buf api: 3794115.333pps, 1942587286bps = 242823410.8bytes/sec
>>
>> [3] This is obtained using the rx-pps/rx-bps report of testpmd.
>>
>> Seems strange to me that the relation between pps/bps is ~336 this
>> time, and between accumulated pkts/accumulated bytes is ~58. Also, the
>> relation between them is not even close to 8.
>>
>> However, testpmd shows a lot of absolute packets received. If we see
>> the received packets in a period subtracting from the previous one,
>> testpmd tells that receive more pps than pktgen tx-pps:
>> Baseline: ~2222668.667pps 128914784.3bps.
>> Batch: 2269260.933pps, 131617134.9bps
>> Buf_api: 2213226.467pps, 128367135.9bp
> How about playing with the batch size? Make it a mod parameter instead
> of the hard coded 64, and measure for all values 1 to 64 ...

Right, according to the test result, 64 seems to be too aggressive in
the case of TX.

And it might also be worth to check:

1) Whether vhost thread is stressed as 100% CPU utilization, if not,
there's bottleneck elsewhere
2) For RX test, make sure pktgen kthread is running in the same NUMA
node with virtio-user

Thanks

>

2020-07-10 05:41:40

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Fri, Jul 10, 2020 at 5:56 AM Jason Wang <[email protected]> wrote:
>
>
> On 2020/7/10 上午1:37, Michael S. Tsirkin wrote:
> > On Thu, Jul 09, 2020 at 06:46:13PM +0200, Eugenio Perez Martin wrote:
> >> On Wed, Jul 1, 2020 at 4:10 PM Jason Wang <[email protected]> wrote:
> >>>
> >>> On 2020/7/1 下午9:04, Eugenio Perez Martin wrote:
> >>>> On Wed, Jul 1, 2020 at 2:40 PM Jason Wang <[email protected]> wrote:
> >>>>> On 2020/7/1 下午6:43, Eugenio Perez Martin wrote:
> >>>>>> On Tue, Jun 23, 2020 at 6:15 PM Eugenio Perez Martin
> >>>>>> <[email protected]> wrote:
> >>>>>>> On Mon, Jun 22, 2020 at 6:29 PM Michael S. Tsirkin <[email protected]> wrote:
> >>>>>>>> On Mon, Jun 22, 2020 at 06:11:21PM +0200, Eugenio Perez Martin wrote:
> >>>>>>>>> On Mon, Jun 22, 2020 at 5:55 PM Michael S. Tsirkin <[email protected]> wrote:
> >>>>>>>>>> On Fri, Jun 19, 2020 at 08:07:57PM +0200, Eugenio Perez Martin wrote:
> >>>>>>>>>>> On Mon, Jun 15, 2020 at 2:28 PM Eugenio Perez Martin
> >>>>>>>>>>> <[email protected]> wrote:
> >>>>>>>>>>>> On Thu, Jun 11, 2020 at 5:22 PM Konrad Rzeszutek Wilk
> >>>>>>>>>>>> <[email protected]> wrote:
> >>>>>>>>>>>>> On Thu, Jun 11, 2020 at 07:34:19AM -0400, Michael S. Tsirkin wrote:
> >>>>>>>>>>>>>> As testing shows no performance change, switch to that now.
> >>>>>>>>>>>>> What kind of testing? 100GiB? Low latency?
> >>>>>>>>>>>>>
> >>>>>>>>>>>> Hi Konrad.
> >>>>>>>>>>>>
> >>>>>>>>>>>> I tested this version of the patch:
> >>>>>>>>>>>> https://lkml.org/lkml/2019/10/13/42
> >>>>>>>>>>>>
> >>>>>>>>>>>> It was tested for throughput with DPDK's testpmd (as described in
> >>>>>>>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html)
> >>>>>>>>>>>> and kernel pktgen. No latency tests were performed by me. Maybe it is
> >>>>>>>>>>>> interesting to perform a latency test or just a different set of tests
> >>>>>>>>>>>> over a recent version.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Thanks!
> >>>>>>>>>>> I have repeated the tests with v9, and results are a little bit different:
> >>>>>>>>>>> * If I test opening it with testpmd, I see no change between versions
> >>>>>>>>>> OK that is testpmd on guest, right? And vhost-net on the host?
> >>>>>>>>>>
> >>>>>>>>> Hi Michael.
> >>>>>>>>>
> >>>>>>>>> No, sorry, as described in
> >>>>>>>>> http://doc.dpdk.org/guides/howto/virtio_user_as_exceptional_path.html.
> >>>>>>>>> But I could add to test it in the guest too.
> >>>>>>>>>
> >>>>>>>>> These kinds of raw packets "bursts" do not show performance
> >>>>>>>>> differences, but I could test deeper if you think it would be worth
> >>>>>>>>> it.
> >>>>>>>> Oh ok, so this is without guest, with virtio-user.
> >>>>>>>> It might be worth checking dpdk within guest too just
> >>>>>>>> as another data point.
> >>>>>>>>
> >>>>>>> Ok, I will do it!
> >>>>>>>
> >>>>>>>>>>> * If I forward packets between two vhost-net interfaces in the guest
> >>>>>>>>>>> using a linux bridge in the host:
> >>>>>>>>>> And here I guess you mean virtio-net in the guest kernel?
> >>>>>>>>> Yes, sorry: Two virtio-net interfaces connected with a linux bridge in
> >>>>>>>>> the host. More precisely:
> >>>>>>>>> * Adding one of the interfaces to another namespace, assigning it an
> >>>>>>>>> IP, and starting netserver there.
> >>>>>>>>> * Assign another IP in the range manually to the other virtual net
> >>>>>>>>> interface, and start the desired test there.
> >>>>>>>>>
> >>>>>>>>> If you think it would be better to perform then differently please let me know.
> >>>>>>>> Not sure why you bother with namespaces since you said you are
> >>>>>>>> using L2 bridging. I guess it's unimportant.
> >>>>>>>>
> >>>>>>> Sorry, I think I should have provided more context about that.
> >>>>>>>
> >>>>>>> The only reason to use namespaces is to force the traffic of these
> >>>>>>> netperf tests to go through the external bridge. To test netperf
> >>>>>>> different possibilities than the testpmd (or pktgen or others "blast
> >>>>>>> of frames unconditionally" tests).
> >>>>>>>
> >>>>>>> This way, I make sure that is the same version of everything in the
> >>>>>>> guest, and is a little bit easier to manage cpu affinity, start and
> >>>>>>> stop testing...
> >>>>>>>
> >>>>>>> I could use a different VM for sending and receiving, but I find this
> >>>>>>> way a faster one and it should not introduce a lot of noise. I can
> >>>>>>> test with two VM if you think that this use of network namespace
> >>>>>>> introduces too much noise.
> >>>>>>>
> >>>>>>> Thanks!
> >>>>>>>
> >>>>>>>>>>> - netperf UDP_STREAM shows a performance increase of 1.8, almost
> >>>>>>>>>>> doubling performance. This gets lower as frame size increase.
> >>>>>> Regarding UDP_STREAM:
> >>>>>> * with event_idx=on: The performance difference is reduced a lot if
> >>>>>> applied affinity properly (manually assigning CPU on host/guest and
> >>>>>> setting IRQs on guest), making them perform equally with and without
> >>>>>> the patch again. Maybe the batching makes the scheduler perform
> >>>>>> better.
> >>>>> Note that for UDP_STREAM, the result is pretty trick to be analyzed. E.g
> >>>>> setting a sndbuf for TAP may help for the performance (reduce the drop).
> >>>>>
> >>>> Ok, will add that to the test. Thanks!
> >>>
> >>> Actually, it's better to skip the UDP_STREAM test since:
> >>>
> >>> - My understanding is very few application is using raw UDP stream
> >>> - It's hard to analyze (usually you need to count the drop ratio etc)
> >>>
> >>>
> >>>>>>>>>>> - rests of the test goes noticeably worse: UDP_RR goes from ~6347
> >>>>>>>>>>> transactions/sec to 5830
> >>>>>> * Regarding UDP_RR, TCP_STREAM, and TCP_RR, proper CPU pinning makes
> >>>>>> them perform similarly again, only a very small performance drop
> >>>>>> observed. It could be just noise.
> >>>>>> ** All of them perform better than vanilla if event_idx=off, not sure
> >>>>>> why. I can try to repeat them if you suspect that can be a test
> >>>>>> failure.
> >>>>>>
> >>>>>> * With testpmd and event_idx=off, if I send from the VM to host, I see
> >>>>>> a performance increment especially in small packets. The buf api also
> >>>>>> increases performance compared with only batching: Sending the minimum
> >>>>>> packet size in testpmd makes pps go from 356kpps to 473 kpps.
> >>>>> What's your setup for this. The number looks rather low. I'd expected
> >>>>> 1-2 Mpps at least.
> >>>>>
> >>>> Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz, 2 NUMA nodes of 16G memory
> >>>> each, and no device assigned to the NUMA node I'm testing in. Too low
> >>>> for testpmd AF_PACKET driver too?
> >>>
> >>> I don't test AF_PACKET, I guess it should use the V3 which mmap based
> >>> zerocopy interface.
> >>>
> >>> And it might worth to check the cpu utilization of vhost thread. It's
> >>> required to stress it as 100% otherwise there could be a bottleneck
> >>> somewhere.
> >>>
> >>>
> >>>>>> Sending
> >>>>>> 1024 length UDP-PDU makes it go from 570kpps to 64 kpps.
> >>>>>>
> >>>>>> Something strange I observe in these tests: I get more pps the bigger
> >>>>>> the transmitted buffer size is. Not sure why.
> >>>>>>
> >>>>>> ** Sending from the host to the VM does not make a big change with the
> >>>>>> patches in small packets scenario (minimum, 64 bytes, about 645
> >>>>>> without the patch, ~625 with batch and batch+buf api). If the packets
> >>>>>> are bigger, I can see a performance increase: with 256 bits,
> >>>>> I think you meant bytes?
> >>>>>
> >>>> Yes, sorry.
> >>>>
> >>>>>> it goes
> >>>>>> from 590kpps to about 600kpps, and in case of 1500 bytes payload it
> >>>>>> gets from 348kpps to 528kpps, so it is clearly an improvement.
> >>>>>>
> >>>>>> * with testpmd and event_idx=on, batching+buf api perform similarly in
> >>>>>> both directions.
> >>>>>>
> >>>>>> All of testpmd tests were performed with no linux bridge, just a
> >>>>>> host's tap interface (<interface type='ethernet'> in xml),
> >>>>> What DPDK driver did you use in the test (AF_PACKET?).
> >>>>>
> >>>> Yes, both testpmd are using AF_PACKET driver.
> >>>
> >>> I see, using AF_PACKET means extra layers of issues need to be analyzed
> >>> which is probably not good.
> >>>
> >>>
> >>>>>> with a
> >>>>>> testpmd txonly and another in rxonly forward mode, and using the
> >>>>>> receiving side packets/bytes data. Guest's rps, xps and interrupts,
> >>>>>> and host's vhost threads affinity were also tuned in each test to
> >>>>>> schedule both testpmd and vhost in different processors.
> >>>>> My feeling is that if we start from simple setup, it would be more
> >>>>> easier as a start. E.g start without an VM.
> >>>>>
> >>>>> 1) TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP
> >>>>> 2) RX: pkgetn -> TAP -> vhost_net -> testpmd(rxonly)
> >>>>>
> >>>> Got it. Is there a reason to prefer pktgen over testpmd?
> >>>
> >>> I think the reason is using testpmd you must use a userspace kernel
> >>> interface (AF_PACKET), and it could not be as fast as pktgen since:
> >>>
> >>> - it talks directly to xmit of TAP
> >>> - skb can be cloned
> >>>
> >> Hi!
> >>
> >> Here it is the result of the tests. Details on [1].
> >>
> >> Tx:
> >> ===
> >>
> >> For tx packets it seems that the batching patch makes things a little
> >> bit worse, but the buf_api outperforms baseline by a 7%:
> >>
> >> * We start with a baseline of 4208772.571 pps and 269361444.6 bytes/s [2].
> >> * When we add the batching, I see a small performance decrease:
> >> 4133292.308 and 264530707.7 bytes/s.
> >> * However, the buf api it outperform the baseline: 4551319.631pps,
> >> 291205178.1 bytes/s
> >>
> >> I don't have numbers on the receiver side since it is just a XDP_DROP.
> >> I think it would be interesting to see them.
> >>
> >> Rx:
> >> ===
> >>
> >> Regarding Rx, the reverse is observed: a small performance increase is
> >> observed with batching (~2%), but buf_api makes tests perform equally
> >> to baseline.
> >>
> >> pktgen was called using pktgen_sample01_simple.sh, with the environment:
> >> DEV="$tap_name" F_THREAD=1 DST_MAC=$MAC_ADDR COUNT=$((2500000*25))
> >> SKB_CLONE=$((2**31))
> >>
> >> And testpmd is the same as Tx but with forward-mode=rxonly.
> >>
> >> Pktgen reports:
> >> Baseline: 1853025pps 622Mb/sec (622616400bps) errors: 7915231
> >> Batch: 1891404pps 635Mb/sec (635511744bps) errors: 4926093
> >> Buf_api: 1844008pps 619Mb/sec (619586688bps) errors: 47766692
> >>
> >> Testpmd reports:
> >> Baseline: 1854448pps, 860464156 bps. [3]
> >> Batch: 1892844.25pps, 878280070bps.
> >> Buf_api: 1846139.75pps, 856609120bps.
> >>
> >> Any thoughts?
> >>
> >> Thanks!
> >>
> >> [1]
> >> Testpmd options: -l 1,3
> >> --vdev=virtio_user0,mac=01:02:03:04:05:06,path=/dev/vhost-net,queue_size=1024
> >> -- --auto-start --stats-period 5 --tx-offloads="$TX_OFFLOADS"
> >> --rx-offloads="$RX_OFFLOADS" --txd=4096 --rxd=4096 --burst=512
> >> --forward-mode=txonly
> >>
> >> Where offloads were obtained manually running with
> >> --[tr]x-offloads=0x8fff and examining testpmd response:
> >> declare -r RX_OFFLOADS=0x81d
> >> declare -r TX_OFFLOADS=0x802d
> >>
> >> All of the tests results are an average of at least 3 samples of
> >> testpmd, discarding the obvious deviations at start/end (like warming
> >> up or waiting for pktgen to start). The result of pktgen is directly
> >> c&p from its output.
> >>
> >> The numbers do not change very much from one stats printing to another
> >> of testpmd.
> >>
> >> [2] Obtained subtracting each accumulated tx-packets from one stats
> >> print to the previous one. If we attend testpmd output about Tx-pps,
> >> it counts a little bit less performance, but it follows the same
> >> pattern:
> >>
> >> Testpmd pps/bps stats:
> >> Baseline: 3510826.25 pps, 1797887912bps = 224735989bytes/sec
> >> Batch: 3448515.571pps, 1765640226bps = 220705028.3bytes/sec
> >> Buf api: 3794115.333pps, 1942587286bps = 242823410.8bytes/sec
> >>
> >> [3] This is obtained using the rx-pps/rx-bps report of testpmd.
> >>
> >> Seems strange to me that the relation between pps/bps is ~336 this
> >> time, and between accumulated pkts/accumulated bytes is ~58. Also, the
> >> relation between them is not even close to 8.
> >>
> >> However, testpmd shows a lot of absolute packets received. If we see
> >> the received packets in a period subtracting from the previous one,
> >> testpmd tells that receive more pps than pktgen tx-pps:
> >> Baseline: ~2222668.667pps 128914784.3bps.
> >> Batch: 2269260.933pps, 131617134.9bps
> >> Buf_api: 2213226.467pps, 128367135.9bp
> > How about playing with the batch size? Make it a mod parameter instead
> > of the hard coded 64, and measure for all values 1 to 64 ...
>
>
> Right, according to the test result, 64 seems to be too aggressive in
> the case of TX.
>

Got it, thanks both!

> And it might also be worth to check:
>
> 1) Whether vhost thread is stressed as 100% CPU utilization, if not,
> there's bottleneck elsewhere

I forgot to check this, sorry. Will check in the next test.

> 2) For RX test, make sure pktgen kthread is running in the same NUMA
> node with virtio-user
>

It is allocated 1 thread in lcore 1 (F_THREAD=1) which belongs to the
same NUMA as testpmd. Actually, it is the testpmd master core, so it
should be a good idea to move it to another lcore of the same NUMA
node.

Is this enough for pktgen to allocate the memory in that numa node?
Since the script only write parameters to /proc, I assume that it has
no effect to run it under numactl/taskset, and pktgen will allocate
memory based on the lcore is running. Am I right?

Thanks!

> Thanks
>
>
> >
>

2020-07-10 06:01:15

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Fri, Jul 10, 2020 at 07:39:26AM +0200, Eugenio Perez Martin wrote:
> > > How about playing with the batch size? Make it a mod parameter instead
> > > of the hard coded 64, and measure for all values 1 to 64 ...
> >
> >
> > Right, according to the test result, 64 seems to be too aggressive in
> > the case of TX.
> >
>
> Got it, thanks both!

In particular I wonder whether with batch size 1
we get same performance as without batching
(would indicate 64 is too aggressive)
or not (would indicate one of the code changes
affects performance in an unexpected way).

--
MST

2020-07-10 06:46:51

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/7/10 下午1:39, Eugenio Perez Martin wrote:
> It is allocated 1 thread in lcore 1 (F_THREAD=1) which belongs to the
> same NUMA as testpmd. Actually, it is the testpmd master core, so it
> should be a good idea to move it to another lcore of the same NUMA
> node.
>
> Is this enough for pktgen to allocate the memory in that numa node?
> Since the script only write parameters to /proc, I assume that it has
> no effect to run it under numactl/taskset, and pktgen will allocate
> memory based on the lcore is running. Am I right?
>
> Thanks!
>

I think you're right.

Thanks

2020-07-16 17:17:48

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Fri, Jul 10, 2020 at 7:58 AM Michael S. Tsirkin <[email protected]> wrote:
>
> On Fri, Jul 10, 2020 at 07:39:26AM +0200, Eugenio Perez Martin wrote:
> > > > How about playing with the batch size? Make it a mod parameter instead
> > > > of the hard coded 64, and measure for all values 1 to 64 ...
> > >
> > >
> > > Right, according to the test result, 64 seems to be too aggressive in
> > > the case of TX.
> > >
> >
> > Got it, thanks both!
>
> In particular I wonder whether with batch size 1
> we get same performance as without batching
> (would indicate 64 is too aggressive)
> or not (would indicate one of the code changes
> affects performance in an unexpected way).
>
> --
> MST
>

Hi!

Varying batch_size as drivers/vhost/net.c:VHOST_NET_BATCH, and testing
the pps as previous mail says. This means that we have either only
vhost_net batching (in base testing, like previously to apply this
patch) or both batching sizes the same.

I've checked that vhost process (and pktgen) goes 100% cpu also.

For tx: Batching decrements always the performance, in all cases. Not
sure why bufapi made things better the last time.

Batching makes improvements until 64 bufs, I see increments of pps but like 1%.

For rx: Batching always improves performance. It seems that if we
batch little, bufapi decreases performance, but beyond 64, bufapi is
much better. The bufapi version keeps improving until I set a batching
of 1024. So I guess it is super good to have a bunch of buffers to
receive.

Since with this test I cannot disable event_idx or things like that,
what would be the next step for testing?

Thanks!

--
Results:
# Buf size: 1,16,32,64,128,256,512

# Tx
# ===
# Base
2293304.308,3396057.769,3540860.615,3636056.077,3332950.846,3694276.154,3689820
# Batch
2286723.857,3307191.643,3400346.571,3452527.786,3460766.857,3431042.5,3440722.286
# Batch + Bufapi
2257970.769,3151268.385,3260150.538,3379383.846,3424028.846,3433384.308,3385635.231,3406554.538

# Rx
# ==
# pktgen results (pps)
1223275,1668868,1728794,1769261,1808574,1837252,1846436
1456924,1797901,1831234,1868746,1877508,1931598,1936402
1368923,1719716,1794373,1865170,1884803,1916021,1975160

# Testpmd pps results
1222698.143,1670604,1731040.6,1769218,1811206,1839308.75,1848478.75
1450140.5,1799985.75,1834089.75,1871290,1880005.5,1934147.25,1939034
1370621,1721858,1796287.75,1866618.5,1885466.5,1918670.75,1976173.5,1988760.75,1978316

pktgen was run again for rx with 1024 and 2048 buf size, giving
1988760.75 and 1978316 pps. Testpmd goes the same way.

2020-07-20 08:56:10

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/7/17 上午1:16, Eugenio Perez Martin wrote:
> On Fri, Jul 10, 2020 at 7:58 AM Michael S. Tsirkin <[email protected]> wrote:
>> On Fri, Jul 10, 2020 at 07:39:26AM +0200, Eugenio Perez Martin wrote:
>>>>> How about playing with the batch size? Make it a mod parameter instead
>>>>> of the hard coded 64, and measure for all values 1 to 64 ...
>>>>
>>>> Right, according to the test result, 64 seems to be too aggressive in
>>>> the case of TX.
>>>>
>>> Got it, thanks both!
>> In particular I wonder whether with batch size 1
>> we get same performance as without batching
>> (would indicate 64 is too aggressive)
>> or not (would indicate one of the code changes
>> affects performance in an unexpected way).
>>
>> --
>> MST
>>
> Hi!
>
> Varying batch_size as drivers/vhost/net.c:VHOST_NET_BATCH,

Did you mean varying the value of VHOST_NET_BATCH itself or the number
of batched descriptors?

> and testing
> the pps as previous mail says. This means that we have either only
> vhost_net batching (in base testing, like previously to apply this
> patch) or both batching sizes the same.
>
> I've checked that vhost process (and pktgen) goes 100% cpu also.
>
> For tx: Batching decrements always the performance, in all cases. Not
> sure why bufapi made things better the last time.
>
> Batching makes improvements until 64 bufs, I see increments of pps but like 1%.
>
> For rx: Batching always improves performance. It seems that if we
> batch little, bufapi decreases performance, but beyond 64, bufapi is
> much better. The bufapi version keeps improving until I set a batching
> of 1024. So I guess it is super good to have a bunch of buffers to
> receive.
>
> Since with this test I cannot disable event_idx or things like that,
> what would be the next step for testing?
>
> Thanks!
>
> --
> Results:
> # Buf size: 1,16,32,64,128,256,512
>
> # Tx
> # ===
> # Base
> 2293304.308,3396057.769,3540860.615,3636056.077,3332950.846,3694276.154,3689820

What's the meaning of buf size in the context of "base"?

And I wonder maybe perf diff can help.

Thanks

> # Batch
> 2286723.857,3307191.643,3400346.571,3452527.786,3460766.857,3431042.5,3440722.286
> # Batch + Bufapi
> 2257970.769,3151268.385,3260150.538,3379383.846,3424028.846,3433384.308,3385635.231,3406554.538
>
> # Rx
> # ==
> # pktgen results (pps)
> 1223275,1668868,1728794,1769261,1808574,1837252,1846436
> 1456924,1797901,1831234,1868746,1877508,1931598,1936402
> 1368923,1719716,1794373,1865170,1884803,1916021,1975160
>
> # Testpmd pps results
> 1222698.143,1670604,1731040.6,1769218,1811206,1839308.75,1848478.75
> 1450140.5,1799985.75,1834089.75,1871290,1880005.5,1934147.25,1939034
> 1370621,1721858,1796287.75,1866618.5,1885466.5,1918670.75,1976173.5,1988760.75,1978316
>
> pktgen was run again for rx with 1024 and 2048 buf size, giving
> 1988760.75 and 1978316 pps. Testpmd goes the same way.
>

2020-07-20 09:29:55

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Thu, Jul 16, 2020 at 07:16:27PM +0200, Eugenio Perez Martin wrote:
> On Fri, Jul 10, 2020 at 7:58 AM Michael S. Tsirkin <[email protected]> wrote:
> >
> > On Fri, Jul 10, 2020 at 07:39:26AM +0200, Eugenio Perez Martin wrote:
> > > > > How about playing with the batch size? Make it a mod parameter instead
> > > > > of the hard coded 64, and measure for all values 1 to 64 ...
> > > >
> > > >
> > > > Right, according to the test result, 64 seems to be too aggressive in
> > > > the case of TX.
> > > >
> > >
> > > Got it, thanks both!
> >
> > In particular I wonder whether with batch size 1
> > we get same performance as without batching
> > (would indicate 64 is too aggressive)
> > or not (would indicate one of the code changes
> > affects performance in an unexpected way).
> >
> > --
> > MST
> >
>
> Hi!
>
> Varying batch_size as drivers/vhost/net.c:VHOST_NET_BATCH,

sorry this is not what I meant.

I mean something like this:

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 0b509be8d7b1..b94680e5721d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1279,6 +1279,10 @@ static void handle_rx_net(struct vhost_work *work)
handle_rx(net);
}

+MODULE_PARM_DESC(batch_num, "Number of batched descriptors. (offset from 64)");
+module_param(batch_num, int, 0644);
+static int batch_num = 0;
+
static int vhost_net_open(struct inode *inode, struct file *f)
{
struct vhost_net *n;
@@ -1333,7 +1337,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
vhost_net_buf_init(&n->vqs[i].rxq);
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
- UIO_MAXIOV + VHOST_NET_BATCH,
+ UIO_MAXIOV + VHOST_NET_BATCH + batch_num,
VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
NULL);

then you can try tweaking batching and playing with mod parameter without
recompiling.

VHOST_NET_BATCH affects lots of other things.

> and testing
> the pps as previous mail says. This means that we have either only
> vhost_net batching (in base testing, like previously to apply this
> patch) or both batching sizes the same.
>
> I've checked that vhost process (and pktgen) goes 100% cpu also.
>
> For tx: Batching decrements always the performance, in all cases. Not
> sure why bufapi made things better the last time.
>
> Batching makes improvements until 64 bufs, I see increments of pps but like 1%.
>
> For rx: Batching always improves performance. It seems that if we
> batch little, bufapi decreases performance, but beyond 64, bufapi is
> much better. The bufapi version keeps improving until I set a batching
> of 1024. So I guess it is super good to have a bunch of buffers to
> receive.
>
> Since with this test I cannot disable event_idx or things like that,
> what would be the next step for testing?
>
> Thanks!
>
> --
> Results:
> # Buf size: 1,16,32,64,128,256,512
>
> # Tx
> # ===
> # Base
> 2293304.308,3396057.769,3540860.615,3636056.077,3332950.846,3694276.154,3689820
> # Batch
> 2286723.857,3307191.643,3400346.571,3452527.786,3460766.857,3431042.5,3440722.286
> # Batch + Bufapi
> 2257970.769,3151268.385,3260150.538,3379383.846,3424028.846,3433384.308,3385635.231,3406554.538
>
> # Rx
> # ==
> # pktgen results (pps)
> 1223275,1668868,1728794,1769261,1808574,1837252,1846436
> 1456924,1797901,1831234,1868746,1877508,1931598,1936402
> 1368923,1719716,1794373,1865170,1884803,1916021,1975160
>
> # Testpmd pps results
> 1222698.143,1670604,1731040.6,1769218,1811206,1839308.75,1848478.75
> 1450140.5,1799985.75,1834089.75,1871290,1880005.5,1934147.25,1939034
> 1370621,1721858,1796287.75,1866618.5,1885466.5,1918670.75,1976173.5,1988760.75,1978316
>
> pktgen was run again for rx with 1024 and 2048 buf size, giving
> 1988760.75 and 1978316 pps. Testpmd goes the same way.

Don't really understand what does this data mean.
Which number of descs is batched for each run?

--
MST

2020-07-20 11:17:37

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Mon, Jul 20, 2020 at 11:27 AM Michael S. Tsirkin <[email protected]> wrote:
> On Thu, Jul 16, 2020 at 07:16:27PM +0200, Eugenio Perez Martin wrote:
> > On Fri, Jul 10, 2020 at 7:58 AM Michael S. Tsirkin <[email protected]> wrote:
> > > On Fri, Jul 10, 2020 at 07:39:26AM +0200, Eugenio Perez Martin wrote:
> > > > > > How about playing with the batch size? Make it a mod parameter instead
> > > > > > of the hard coded 64, and measure for all values 1 to 64 ...
> > > > >
> > > > > Right, according to the test result, 64 seems to be too aggressive in
> > > > > the case of TX.
> > > > >
> > > >
> > > > Got it, thanks both!
> > >
> > > In particular I wonder whether with batch size 1
> > > we get same performance as without batching
> > > (would indicate 64 is too aggressive)
> > > or not (would indicate one of the code changes
> > > affects performance in an unexpected way).
> > >
> > > --
> > > MST
> > >
> >
> > Hi!
> >
> > Varying batch_size as drivers/vhost/net.c:VHOST_NET_BATCH,
>
> sorry this is not what I meant.
>
> I mean something like this:
>
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 0b509be8d7b1..b94680e5721d 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -1279,6 +1279,10 @@ static void handle_rx_net(struct vhost_work *work)
> handle_rx(net);
> }
>
> +MODULE_PARM_DESC(batch_num, "Number of batched descriptors. (offset from 64)");
> +module_param(batch_num, int, 0644);
> +static int batch_num = 0;
> +
> static int vhost_net_open(struct inode *inode, struct file *f)
> {
> struct vhost_net *n;
> @@ -1333,7 +1337,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
> vhost_net_buf_init(&n->vqs[i].rxq);
> }
> vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
> - UIO_MAXIOV + VHOST_NET_BATCH,
> + UIO_MAXIOV + VHOST_NET_BATCH + batch_num,
> VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
> NULL);
>
>
> then you can try tweaking batching and playing with mod parameter without
> recompiling.
>
>
> VHOST_NET_BATCH affects lots of other things.
>

Ok, got it. Since they were aligned from the start, I thought it was a good idea to maintain them in-sync.

> > and testing
> > the pps as previous mail says. This means that we have either only
> > vhost_net batching (in base testing, like previously to apply this
> > patch) or both batching sizes the same.
> >
> > I've checked that vhost process (and pktgen) goes 100% cpu also.
> >
> > For tx: Batching decrements always the performance, in all cases. Not
> > sure why bufapi made things better the last time.
> >
> > Batching makes improvements until 64 bufs, I see increments of pps but like 1%.
> >
> > For rx: Batching always improves performance. It seems that if we
> > batch little, bufapi decreases performance, but beyond 64, bufapi is
> > much better. The bufapi version keeps improving until I set a batching
> > of 1024. So I guess it is super good to have a bunch of buffers to
> > receive.
> >
> > Since with this test I cannot disable event_idx or things like that,
> > what would be the next step for testing?
> >
> > Thanks!
> >
> > --
> > Results:
> > # Buf size: 1,16,32,64,128,256,512
> >
> > # Tx
> > # ===
> > # Base
> > 2293304.308,3396057.769,3540860.615,3636056.077,3332950.846,3694276.154,3689820
> > # Batch
> > 2286723.857,3307191.643,3400346.571,3452527.786,3460766.857,3431042.5,3440722.286
> > # Batch + Bufapi
> > 2257970.769,3151268.385,3260150.538,3379383.846,3424028.846,3433384.308,3385635.231,3406554.538
> >
> > # Rx
> > # ==
> > # pktgen results (pps)
> > 1223275,1668868,1728794,1769261,1808574,1837252,1846436
> > 1456924,1797901,1831234,1868746,1877508,1931598,1936402
> > 1368923,1719716,1794373,1865170,1884803,1916021,1975160
> >
> > # Testpmd pps results
> > 1222698.143,1670604,1731040.6,1769218,1811206,1839308.75,1848478.75
> > 1450140.5,1799985.75,1834089.75,1871290,1880005.5,1934147.25,1939034
> > 1370621,1721858,1796287.75,1866618.5,1885466.5,1918670.75,1976173.5,1988760.75,1978316
> >
> > pktgen was run again for rx with 1024 and 2048 buf size, giving
> > 1988760.75 and 1978316 pps. Testpmd goes the same way.
>
> Don't really understand what does this data mean.
> Which number of descs is batched for each run?
>

Sorry, I should have explained better. I will expand here, but feel free to skip it since we are going to discard the
data anyway. Or to propose a better way to tell them.

Is a CSV with the values I've obtained, in pps, from pktgen and testpmd. This way is easy to plot them.

Maybe is easier as tables, if mail readers/gmail does not misalign them.

> > # Tx
> > # ===

Base: With the previous code, not integrating any patch. testpmd is txonly mode, tap interface is XDP_DROP everything.
We vary VHOST_NET_BATCH (1, 16, 32, ...). As Jason put in a previous mail:

TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP

1 | 16 | 32 | 64 | 128 | 256 | 512 |
2293304.308| 3396057.769| 3540860.615| 3636056.077| 3332950.846| 3694276.154| 3689820|

If we add the batching part of the series, but not the bufapi:

1 | 16 | 32 | 64 | 128 | 256 | 512 |
2286723.857 | 3307191.643| 3400346.571| 3452527.786| 3460766.857| 3431042.5 | 3440722.286|

And if we add the bufapi part, i.e., all the series:

1 | 16 | 32 | 64 | 128 | 256 | 512 | 1024
2257970.769| 3151268.385| 3260150.538| 3379383.846| 3424028.846| 3433384.308| 3385635.231| 3406554.538

For easier treatment, all in the same table:

1 | 16 | 32 | 64 | 128 | 256 | 512 | 1024
------------+-------------+-------------+-------------+-------------+-------------+------------+------------
2293304.308 | 3396057.769 | 3540860.615 | 3636056.077 | 3332950.846 | 3694276.154 | 3689820 |
2286723.857 | 3307191.643 | 3400346.571 | 3452527.786 | 3460766.857 | 3431042.5 | 3440722.286|
2257970.769 | 3151268.385 | 3260150.538 | 3379383.846 | 3424028.846 | 3433384.308 | 3385635.231| 3406554.538

> > # Rx
> > # ==

The rx tests are done with pktgen injecting packets in tap interface, and testpmd in rxonly forward mode. Again, each
column is a different value of VHOST_NET_BATCH, and each row is base, +batching, and +buf_api:

> > # pktgen results (pps)

(Didn't record extreme cases like >512 bufs batching)

1 | 16 | 32 | 64 | 128 | 256 | 512
-------+--------+--------+--------+--------+--------+--------
1223275| 1668868| 1728794| 1769261| 1808574| 1837252| 1846436
1456924| 1797901| 1831234| 1868746| 1877508| 1931598| 1936402
1368923| 1719716| 1794373| 1865170| 1884803| 1916021| 1975160

> > # Testpmd pps results

1 | 16 | 32 | 64 | 128 | 256 | 512 | 1024 | 2048
------------+------------+------------+-----------+-----------+------------+------------+------------+---------
1222698.143 | 1670604 | 1731040.6 | 1769218 | 1811206 | 1839308.75 | 1848478.75 |
1450140.5 | 1799985.75 | 1834089.75 | 1871290 | 1880005.5 | 1934147.25 | 1939034 |
1370621 | 1721858 | 1796287.75 | 1866618.5 | 1885466.5 | 1918670.75 | 1976173.5 | 1988760.75 | 1978316

The last extreme cases (>512 bufs batched) were recorded just for the bufapi case.

Does that make sense now?

Thanks!

2020-07-20 11:46:32

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Mon, Jul 20, 2020 at 01:16:47PM +0200, Eugenio P?rez wrote:
>
> On Mon, Jul 20, 2020 at 11:27 AM Michael S. Tsirkin <[email protected]> wrote:
> > On Thu, Jul 16, 2020 at 07:16:27PM +0200, Eugenio Perez Martin wrote:
> > > On Fri, Jul 10, 2020 at 7:58 AM Michael S. Tsirkin <[email protected]> wrote:
> > > > On Fri, Jul 10, 2020 at 07:39:26AM +0200, Eugenio Perez Martin wrote:
> > > > > > > How about playing with the batch size? Make it a mod parameter instead
> > > > > > > of the hard coded 64, and measure for all values 1 to 64 ...
> > > > > >
> > > > > > Right, according to the test result, 64 seems to be too aggressive in
> > > > > > the case of TX.
> > > > > >
> > > > >
> > > > > Got it, thanks both!
> > > >
> > > > In particular I wonder whether with batch size 1
> > > > we get same performance as without batching
> > > > (would indicate 64 is too aggressive)
> > > > or not (would indicate one of the code changes
> > > > affects performance in an unexpected way).
> > > >
> > > > --
> > > > MST
> > > >
> > >
> > > Hi!
> > >
> > > Varying batch_size as drivers/vhost/net.c:VHOST_NET_BATCH,
> >
> > sorry this is not what I meant.
> >
> > I mean something like this:
> >
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 0b509be8d7b1..b94680e5721d 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -1279,6 +1279,10 @@ static void handle_rx_net(struct vhost_work *work)
> > handle_rx(net);
> > }
> >
> > +MODULE_PARM_DESC(batch_num, "Number of batched descriptors. (offset from 64)");
> > +module_param(batch_num, int, 0644);
> > +static int batch_num = 0;
> > +
> > static int vhost_net_open(struct inode *inode, struct file *f)
> > {
> > struct vhost_net *n;
> > @@ -1333,7 +1337,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
> > vhost_net_buf_init(&n->vqs[i].rxq);
> > }
> > vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
> > - UIO_MAXIOV + VHOST_NET_BATCH,
> > + UIO_MAXIOV + VHOST_NET_BATCH + batch_num,
> > VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
> > NULL);
> >
> >
> > then you can try tweaking batching and playing with mod parameter without
> > recompiling.
> >
> >
> > VHOST_NET_BATCH affects lots of other things.
> >
>
> Ok, got it. Since they were aligned from the start, I thought it was a good idea to maintain them in-sync.
>
> > > and testing
> > > the pps as previous mail says. This means that we have either only
> > > vhost_net batching (in base testing, like previously to apply this
> > > patch) or both batching sizes the same.
> > >
> > > I've checked that vhost process (and pktgen) goes 100% cpu also.
> > >
> > > For tx: Batching decrements always the performance, in all cases. Not
> > > sure why bufapi made things better the last time.
> > >
> > > Batching makes improvements until 64 bufs, I see increments of pps but like 1%.
> > >
> > > For rx: Batching always improves performance. It seems that if we
> > > batch little, bufapi decreases performance, but beyond 64, bufapi is
> > > much better. The bufapi version keeps improving until I set a batching
> > > of 1024. So I guess it is super good to have a bunch of buffers to
> > > receive.
> > >
> > > Since with this test I cannot disable event_idx or things like that,
> > > what would be the next step for testing?
> > >
> > > Thanks!
> > >
> > > --
> > > Results:
> > > # Buf size: 1,16,32,64,128,256,512
> > >
> > > # Tx
> > > # ===
> > > # Base
> > > 2293304.308,3396057.769,3540860.615,3636056.077,3332950.846,3694276.154,3689820
> > > # Batch
> > > 2286723.857,3307191.643,3400346.571,3452527.786,3460766.857,3431042.5,3440722.286
> > > # Batch + Bufapi
> > > 2257970.769,3151268.385,3260150.538,3379383.846,3424028.846,3433384.308,3385635.231,3406554.538
> > >
> > > # Rx
> > > # ==
> > > # pktgen results (pps)
> > > 1223275,1668868,1728794,1769261,1808574,1837252,1846436
> > > 1456924,1797901,1831234,1868746,1877508,1931598,1936402
> > > 1368923,1719716,1794373,1865170,1884803,1916021,1975160
> > >
> > > # Testpmd pps results
> > > 1222698.143,1670604,1731040.6,1769218,1811206,1839308.75,1848478.75
> > > 1450140.5,1799985.75,1834089.75,1871290,1880005.5,1934147.25,1939034
> > > 1370621,1721858,1796287.75,1866618.5,1885466.5,1918670.75,1976173.5,1988760.75,1978316
> > >
> > > pktgen was run again for rx with 1024 and 2048 buf size, giving
> > > 1988760.75 and 1978316 pps. Testpmd goes the same way.
> >
> > Don't really understand what does this data mean.
> > Which number of descs is batched for each run?
> >
>
> Sorry, I should have explained better. I will expand here, but feel free to skip it since we are going to discard the
> data anyway. Or to propose a better way to tell them.
>
> Is a CSV with the values I've obtained, in pps, from pktgen and testpmd. This way is easy to plot them.
>
> Maybe is easier as tables, if mail readers/gmail does not misalign them.
>
> > > # Tx
> > > # ===
>
> Base: With the previous code, not integrating any patch. testpmd is txonly mode, tap interface is XDP_DROP everything.
> We vary VHOST_NET_BATCH (1, 16, 32, ...). As Jason put in a previous mail:
>
> TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP
>
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512 |
> 2293304.308| 3396057.769| 3540860.615| 3636056.077| 3332950.846| 3694276.154| 3689820|
>
> If we add the batching part of the series, but not the bufapi:
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512 |
> 2286723.857 | 3307191.643| 3400346.571| 3452527.786| 3460766.857| 3431042.5 | 3440722.286|
>
> And if we add the bufapi part, i.e., all the series:
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512 | 1024
> 2257970.769| 3151268.385| 3260150.538| 3379383.846| 3424028.846| 3433384.308| 3385635.231| 3406554.538
>
> For easier treatment, all in the same table:
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512 | 1024
> ------------+-------------+-------------+-------------+-------------+-------------+------------+------------
> 2293304.308 | 3396057.769 | 3540860.615 | 3636056.077 | 3332950.846 | 3694276.154 | 3689820 |
> 2286723.857 | 3307191.643 | 3400346.571 | 3452527.786 | 3460766.857 | 3431042.5 | 3440722.286|
> 2257970.769 | 3151268.385 | 3260150.538 | 3379383.846 | 3424028.846 | 3433384.308 | 3385635.231| 3406554.538
>
> > > # Rx
> > > # ==
>
> The rx tests are done with pktgen injecting packets in tap interface, and testpmd in rxonly forward mode. Again, each
> column is a different value of VHOST_NET_BATCH, and each row is base, +batching, and +buf_api:
>
> > > # pktgen results (pps)
>
> (Didn't record extreme cases like >512 bufs batching)
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512
> -------+--------+--------+--------+--------+--------+--------
> 1223275| 1668868| 1728794| 1769261| 1808574| 1837252| 1846436
> 1456924| 1797901| 1831234| 1868746| 1877508| 1931598| 1936402
> 1368923| 1719716| 1794373| 1865170| 1884803| 1916021| 1975160
>
> > > # Testpmd pps results
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512 | 1024 | 2048
> ------------+------------+------------+-----------+-----------+------------+------------+------------+---------
> 1222698.143 | 1670604 | 1731040.6 | 1769218 | 1811206 | 1839308.75 | 1848478.75 |
> 1450140.5 | 1799985.75 | 1834089.75 | 1871290 | 1880005.5 | 1934147.25 | 1939034 |
> 1370621 | 1721858 | 1796287.75 | 1866618.5 | 1885466.5 | 1918670.75 | 1976173.5 | 1988760.75 | 1978316
>
> The last extreme cases (>512 bufs batched) were recorded just for the bufapi case.
>
> Does that make sense now?
>
> Thanks!

yes, thanks!

2020-07-20 13:11:25

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On Mon, Jul 20, 2020 at 10:55 AM Jason Wang <[email protected]> wrote:
>
>
> On 2020/7/17 上午1:16, Eugenio Perez Martin wrote:
> > On Fri, Jul 10, 2020 at 7:58 AM Michael S. Tsirkin <[email protected]> wrote:
> >> On Fri, Jul 10, 2020 at 07:39:26AM +0200, Eugenio Perez Martin wrote:
> >>>>> How about playing with the batch size? Make it a mod parameter instead
> >>>>> of the hard coded 64, and measure for all values 1 to 64 ...
> >>>>
> >>>> Right, according to the test result, 64 seems to be too aggressive in
> >>>> the case of TX.
> >>>>
> >>> Got it, thanks both!
> >> In particular I wonder whether with batch size 1
> >> we get same performance as without batching
> >> (would indicate 64 is too aggressive)
> >> or not (would indicate one of the code changes
> >> affects performance in an unexpected way).
> >>
> >> --
> >> MST
> >>
> > Hi!
> >
> > Varying batch_size as drivers/vhost/net.c:VHOST_NET_BATCH,
>
>
> Did you mean varying the value of VHOST_NET_BATCH itself or the number
> of batched descriptors?
>
>
> > and testing
> > the pps as previous mail says. This means that we have either only
> > vhost_net batching (in base testing, like previously to apply this
> > patch) or both batching sizes the same.
> >
> > I've checked that vhost process (and pktgen) goes 100% cpu also.
> >
> > For tx: Batching decrements always the performance, in all cases. Not
> > sure why bufapi made things better the last time.
> >
> > Batching makes improvements until 64 bufs, I see increments of pps but like 1%.
> >
> > For rx: Batching always improves performance. It seems that if we
> > batch little, bufapi decreases performance, but beyond 64, bufapi is
> > much better. The bufapi version keeps improving until I set a batching
> > of 1024. So I guess it is super good to have a bunch of buffers to
> > receive.
> >
> > Since with this test I cannot disable event_idx or things like that,
> > what would be the next step for testing?
> >
> > Thanks!
> >
> > --
> > Results:
> > # Buf size: 1,16,32,64,128,256,512
> >
> > # Tx
> > # ===
> > # Base
> > 2293304.308,3396057.769,3540860.615,3636056.077,3332950.846,3694276.154,3689820
>
>
> What's the meaning of buf size in the context of "base"?
>

Hi Jason.

I think that all the previous questions have been answered in the
response to MST, please let me know if I missed something.

> And I wonder maybe perf diff can help.

Great, I will run it too.

Thanks!

>
> Thanks
>
>
> > # Batch
> > 2286723.857,3307191.643,3400346.571,3452527.786,3460766.857,3431042.5,3440722.286
> > # Batch + Bufapi
> > 2257970.769,3151268.385,3260150.538,3379383.846,3424028.846,3433384.308,3385635.231,3406554.538
> >
> > # Rx
> > # ==
> > # pktgen results (pps)
> > 1223275,1668868,1728794,1769261,1808574,1837252,1846436
> > 1456924,1797901,1831234,1868746,1877508,1931598,1936402
> > 1368923,1719716,1794373,1865170,1884803,1916021,1975160
> >
> > # Testpmd pps results
> > 1222698.143,1670604,1731040.6,1769218,1811206,1839308.75,1848478.75
> > 1450140.5,1799985.75,1834089.75,1871290,1880005.5,1934147.25,1939034
> > 1370621,1721858,1796287.75,1866618.5,1885466.5,1918670.75,1976173.5,1988760.75,1978316
> >
> > pktgen was run again for rx with 1024 and 2048 buf size, giving
> > 1988760.75 and 1978316 pps. Testpmd goes the same way.
> >
>

2020-07-21 02:56:03

[permalink] [raw]

Subject: Re: [PATCH RFC v8 02/11] vhost: use batched get_vq_desc version

On 2020/7/20 下午7:16, Eugenio Pérez wrote:
> On Mon, Jul 20, 2020 at 11:27 AM Michael S. Tsirkin <[email protected]> wrote:
>> On Thu, Jul 16, 2020 at 07:16:27PM +0200, Eugenio Perez Martin wrote:
>>> On Fri, Jul 10, 2020 at 7:58 AM Michael S. Tsirkin <[email protected]> wrote:
>>>> On Fri, Jul 10, 2020 at 07:39:26AM +0200, Eugenio Perez Martin wrote:
>>>>>>> How about playing with the batch size? Make it a mod parameter instead
>>>>>>> of the hard coded 64, and measure for all values 1 to 64 ...
>>>>>> Right, according to the test result, 64 seems to be too aggressive in
>>>>>> the case of TX.
>>>>>>
>>>>> Got it, thanks both!
>>>> In particular I wonder whether with batch size 1
>>>> we get same performance as without batching
>>>> (would indicate 64 is too aggressive)
>>>> or not (would indicate one of the code changes
>>>> affects performance in an unexpected way).
>>>>
>>>> --
>>>> MST
>>>>
>>> Hi!
>>>
>>> Varying batch_size as drivers/vhost/net.c:VHOST_NET_BATCH,
>> sorry this is not what I meant.
>>
>> I mean something like this:
>>
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 0b509be8d7b1..b94680e5721d 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -1279,6 +1279,10 @@ static void handle_rx_net(struct vhost_work *work)
>> handle_rx(net);
>> }
>>
>> +MODULE_PARM_DESC(batch_num, "Number of batched descriptors. (offset from 64)");
>> +module_param(batch_num, int, 0644);
>> +static int batch_num = 0;
>> +
>> static int vhost_net_open(struct inode *inode, struct file *f)
>> {
>> struct vhost_net *n;
>> @@ -1333,7 +1337,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>> vhost_net_buf_init(&n->vqs[i].rxq);
>> }
>> vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
>> - UIO_MAXIOV + VHOST_NET_BATCH,
>> + UIO_MAXIOV + VHOST_NET_BATCH + batch_num,
>> VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
>> NULL);
>>
>>
>> then you can try tweaking batching and playing with mod parameter without
>> recompiling.
>>
>>
>> VHOST_NET_BATCH affects lots of other things.
>>
> Ok, got it. Since they were aligned from the start, I thought it was a good idea to maintain them in-sync.
>
>>> and testing
>>> the pps as previous mail says. This means that we have either only
>>> vhost_net batching (in base testing, like previously to apply this
>>> patch) or both batching sizes the same.
>>>
>>> I've checked that vhost process (and pktgen) goes 100% cpu also.
>>>
>>> For tx: Batching decrements always the performance, in all cases. Not
>>> sure why bufapi made things better the last time.
>>>
>>> Batching makes improvements until 64 bufs, I see increments of pps but like 1%.
>>>
>>> For rx: Batching always improves performance. It seems that if we
>>> batch little, bufapi decreases performance, but beyond 64, bufapi is
>>> much better. The bufapi version keeps improving until I set a batching
>>> of 1024. So I guess it is super good to have a bunch of buffers to
>>> receive.
>>>
>>> Since with this test I cannot disable event_idx or things like that,
>>> what would be the next step for testing?
>>>
>>> Thanks!
>>>
>>> --
>>> Results:
>>> # Buf size: 1,16,32,64,128,256,512
>>>
>>> # Tx
>>> # ===
>>> # Base
>>> 2293304.308,3396057.769,3540860.615,3636056.077,3332950.846,3694276.154,3689820
>>> # Batch
>>> 2286723.857,3307191.643,3400346.571,3452527.786,3460766.857,3431042.5,3440722.286
>>> # Batch + Bufapi
>>> 2257970.769,3151268.385,3260150.538,3379383.846,3424028.846,3433384.308,3385635.231,3406554.538
>>>
>>> # Rx
>>> # ==
>>> # pktgen results (pps)
>>> 1223275,1668868,1728794,1769261,1808574,1837252,1846436
>>> 1456924,1797901,1831234,1868746,1877508,1931598,1936402
>>> 1368923,1719716,1794373,1865170,1884803,1916021,1975160
>>>
>>> # Testpmd pps results
>>> 1222698.143,1670604,1731040.6,1769218,1811206,1839308.75,1848478.75
>>> 1450140.5,1799985.75,1834089.75,1871290,1880005.5,1934147.25,1939034
>>> 1370621,1721858,1796287.75,1866618.5,1885466.5,1918670.75,1976173.5,1988760.75,1978316
>>>
>>> pktgen was run again for rx with 1024 and 2048 buf size, giving
>>> 1988760.75 and 1978316 pps. Testpmd goes the same way.
>> Don't really understand what does this data mean.
>> Which number of descs is batched for each run?
>>
> Sorry, I should have explained better. I will expand here, but feel free to skip it since we are going to discard the
> data anyway. Or to propose a better way to tell them.
>
> Is a CSV with the values I've obtained, in pps, from pktgen and testpmd. This way is easy to plot them.
>
> Maybe is easier as tables, if mail readers/gmail does not misalign them.
>
>>> # Tx
>>> # ===
> Base: With the previous code, not integrating any patch. testpmd is txonly mode, tap interface is XDP_DROP everything.
> We vary VHOST_NET_BATCH (1, 16, 32, ...). As Jason put in a previous mail:
>
> TX: testpmd(txonly) -> virtio-user -> vhost_net -> XDP_DROP on TAP
>
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512 |
> 2293304.308| 3396057.769| 3540860.615| 3636056.077| 3332950.846| 3694276.154| 3689820|
>
> If we add the batching part of the series, but not the bufapi:
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512 |
> 2286723.857 | 3307191.643| 3400346.571| 3452527.786| 3460766.857| 3431042.5 | 3440722.286|
>
> And if we add the bufapi part, i.e., all the series:
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512 | 1024
> 2257970.769| 3151268.385| 3260150.538| 3379383.846| 3424028.846| 3433384.308| 3385635.231| 3406554.538
>
> For easier treatment, all in the same table:
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512 | 1024
> ------------+-------------+-------------+-------------+-------------+-------------+------------+------------
> 2293304.308 | 3396057.769 | 3540860.615 | 3636056.077 | 3332950.846 | 3694276.154 | 3689820 |
> 2286723.857 | 3307191.643 | 3400346.571 | 3452527.786 | 3460766.857 | 3431042.5 | 3440722.286|
> 2257970.769 | 3151268.385 | 3260150.538 | 3379383.846 | 3424028.846 | 3433384.308 | 3385635.231| 3406554.538
>
>>> # Rx
>>> # ==
> The rx tests are done with pktgen injecting packets in tap interface, and testpmd in rxonly forward mode. Again, each
> column is a different value of VHOST_NET_BATCH, and each row is base, +batching, and +buf_api:
>
>>> # pktgen results (pps)
> (Didn't record extreme cases like >512 bufs batching)
>
> 1 | 16 | 32 | 64 | 128 | 256 | 512
> -------+--------+--------+--------+--------+--------+--------
> 1223275| 1668868| 1728794| 1769261| 1808574| 1837252| 1846436
> 1456924| 1797901| 1831234| 1868746| 1877508| 1931598| 1936402
> 1368923| 1719716| 1794373| 1865170| 1884803| 1916021| 1975160
>
>>> # Testpmd pps results
> 1 | 16 | 32 | 64 | 128 | 256 | 512 | 1024 | 2048
> ------------+------------+------------+-----------+-----------+------------+------------+------------+---------
> 1222698.143 | 1670604 | 1731040.6 | 1769218 | 1811206 | 1839308.75 | 1848478.75 |
> 1450140.5 | 1799985.75 | 1834089.75 | 1871290 | 1880005.5 | 1934147.25 | 1939034 |
> 1370621 | 1721858 | 1796287.75 | 1866618.5 | 1885466.5 | 1918670.75 | 1976173.5 | 1988760.75 | 1978316
>
> The last extreme cases (>512 bufs batched) were recorded just for the bufapi case.
>
> Does that make sense now?
>
> Thanks!

I wonder why we saw huge difference between TX and RX pps. Have you used
samples/pktgen/XXX for doing the test? Maybe you can paste the perf
record result for the pktgen thread + vhost thread.

Thanks

>

2020-07-29 18:42:13