Hi all:
We do not support XDP batching for TUN since it can only receive one
packet at a time from vhost_net. This series tries to remove this
limitation by:
- introduce a TUN specific msg_control that can hold a pointer to an
array of XDP buffs
- try copy and build XDP buff in vhost_net
- store XDP buffs in an array and submit them once for every N packets
from vhost_net
- since TUN can only do native XDP for datacopy packets, to simplify
the logic, split out the datacopy logic and only do batching for
datacopy.
With this series, TX PPS can improve about 34% from 2.9Mpps to
3.9Mpps when doing xdp_redirect_map between TAP and ixgbe.
Thanks
Jason Wang (12):
vhost_net: introduce helper to initialize tx iov iter
vhost_net: introduce vhost_exceeds_weight()
vhost_net: introduce vhost_has_more_pkts()
vhost_net: split out datacopy logic
vhost_net: batch update used ring for datacopy TX
tuntap: enable preemption early
tuntap: simplify error handling in tun_build_skb()
tuntap: tweak on the path of non-xdp case in tun_build_skb()
tuntap: split out XDP logic
vhost_net: build xdp buff
vhost_net: passing raw xdp buff to tun
vhost_net: batch submitting XDP buffers to underlying sockets
drivers/net/tun.c | 226 +++++++++++++++++++++++++++----------
drivers/vhost/net.c | 297 ++++++++++++++++++++++++++++++++++++++++++++-----
include/linux/if_tun.h | 7 ++
3 files changed, 444 insertions(+), 86 deletions(-)
--
2.7.4
Signed-off-by: Jason Wang <[email protected]>
---
drivers/vhost/net.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 15d191a..de544ee 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -479,6 +479,12 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
return len;
}
+static bool vhost_exceeds_weight(int pkts, int total_len)
+{
+ return unlikely(total_len >= VHOST_NET_WEIGHT) ||
+ unlikely(pkts >= VHOST_NET_PKT_WEIGHT);
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
@@ -570,7 +576,6 @@ static void handle_tx(struct vhost_net *net)
msg.msg_control = NULL;
ubufs = NULL;
}
-
total_len += len;
if (total_len < VHOST_NET_WEIGHT &&
!vhost_vq_avail_empty(&net->dev, vq) &&
@@ -600,8 +605,7 @@ static void handle_tx(struct vhost_net *net)
else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
- if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
- unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
+ if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
vhost_poll_queue(&vq->poll);
break;
}
@@ -887,8 +891,7 @@ static void handle_rx(struct vhost_net *net)
if (unlikely(vq_log))
vhost_log_write(vq, vq_log, log, vhost_len);
total_len += vhost_len;
- if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
- unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
+ if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
vhost_poll_queue(&vq->poll);
goto out;
}
--
2.7.4
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 36 ++++++++++++++----------------------
1 file changed, 14 insertions(+), 22 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 24ecd82..f6e0f96 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1612,7 +1612,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
int len, int *skb_xdp)
{
struct page_frag *alloc_frag = ¤t->task_frag;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
unsigned int delta = 0;
@@ -1638,6 +1638,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
if (copied != len)
return ERR_PTR(-EFAULT);
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
/* There's a small window that XDP may be set after the check
* of xdp_prog above, this should be rare and for simplicity
* we do XDP on skb in case the headroom is not enough.
@@ -1665,24 +1668,16 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
switch (act) {
case XDP_REDIRECT:
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
xdp_do_flush_map();
if (err)
- goto err_redirect;
- rcu_read_unlock();
- preempt_enable();
- return NULL;
+ goto err_xdp;
+ goto out;
case XDP_TX:
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
if (tun_xdp_tx(tun->dev, &xdp))
- goto err_redirect;
+ goto err_xdp;
tun_xdp_flush(tun->dev);
- rcu_read_unlock();
- preempt_enable();
- return NULL;
+ goto out;
case XDP_PASS:
delta = orig_data - xdp.data;
len = xdp.data_end - xdp.data;
@@ -1702,25 +1697,22 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
skb = build_skb(buf, buflen);
if (!skb) {
- rcu_read_unlock();
- preempt_enable();
- return ERR_PTR(-ENOMEM);
+ skb = ERR_PTR(-ENOMEM);
+ goto out;
}
skb_reserve(skb, pad - delta);
skb_put(skb, len);
- get_page(alloc_frag->page);
- alloc_frag->offset += buflen;
return skb;
-err_redirect:
- put_page(alloc_frag->page);
err_xdp:
+ alloc_frag->offset -= buflen;
+ put_page(alloc_frag->page);
+out:
rcu_read_unlock();
preempt_enable();
- this_cpu_inc(tun->pcpu_stats->rx_dropped);
- return NULL;
+ return skb;
}
/* Get packet from user space buffer */
--
2.7.4
If we're sure not to go through native XDP, there's no need for several
things like touching the preemption counter and RCU machinery.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f6e0f96..ed04291 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1645,10 +1645,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
* of xdp_prog above, this should be rare and for simplicity
* we do XDP on skb in case the headroom is not enough.
*/
- if (hdr->gso_type || !xdp_prog)
+ if (hdr->gso_type || !xdp_prog) {
*skb_xdp = 1;
- else
- *skb_xdp = 0;
+ goto build;
+ }
+
+ *skb_xdp = 0;
preempt_disable();
rcu_read_lock();
@@ -1695,6 +1697,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
rcu_read_unlock();
preempt_enable();
+build:
skb = build_skb(buf, buflen);
if (!skb) {
skb = ERR_PTR(-ENOMEM);
--
2.7.4
This patch implements XDP batching for vhost_net with tun. This is
done by batching XDP buffs in vhost and submitting them when:
- vhost_net can not build an XDP buff (mostly because of the size of packet)
- #batched exceeds the limitation (VHOST_NET_RX_BATCH).
- tun accepts a batch of XDP buffs through msg_control and processes them
in a batch
With this tun XDP can benefit from e.g batch transmission during
XDP_REDIRECT or XDP_TX.
Tests shows 21% improvement on TX pps (from ~3.2Mpps to ~3.9Mpps)
while transmitting through testpmd from guest to host by
xdp_redirect_map between tap0 and ixgbe.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 36 +++++++++++++++++----------
drivers/vhost/net.c | 71 ++++++++++++++++++++++++++++++++++++-----------------
2 files changed, 71 insertions(+), 36 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index b586b3f..5d16d18 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1616,7 +1616,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
switch (act) {
case XDP_REDIRECT:
*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
- xdp_do_flush_map();
if (*err)
break;
goto out;
@@ -1624,7 +1623,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
*err = tun_xdp_tx(tun->dev, xdp);
if (*err)
break;
- tun_xdp_flush(tun->dev);
goto out;
case XDP_PASS:
goto out;
@@ -2400,9 +2398,6 @@ static int tun_xdp_one(struct tun_struct *tun,
int err = 0;
bool skb_xdp = false;
- preempt_disable();
- rcu_read_lock();
-
xdp_prog = rcu_dereference(tun->xdp_prog);
if (xdp_prog) {
if (gso->gso_type) {
@@ -2461,15 +2456,12 @@ static int tun_xdp_one(struct tun_struct *tun,
tun_flow_update(tun, rxhash, tfile);
out:
- rcu_read_unlock();
- preempt_enable();
-
return err;
}
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
- int ret;
+ int ret, i;
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
struct tun_msg_ctl *ctl = m->msg_control;
@@ -2477,10 +2469,28 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
if (!tun)
return -EBADFD;
- if (ctl && ctl->type == TUN_MSG_PTR) {
- ret = tun_xdp_one(tun, tfile, ctl->ptr);
- if (!ret)
- ret = total_len;
+ if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
+ int n = ctl->type >> 16;
+
+ preempt_disable();
+ rcu_read_lock();
+
+ for (i = 0; i < n; i++) {
+ struct xdp_buff *x = (struct xdp_buff *)ctl->ptr;
+ struct xdp_buff *xdp = &x[i];
+
+ xdp_set_data_meta_invalid(xdp);
+ xdp->rxq = &tfile->xdp_rxq;
+ tun_xdp_one(tun, tfile, xdp);
+ }
+
+ xdp_do_flush_map();
+ tun_xdp_flush(tun->dev);
+
+ rcu_read_unlock();
+ preempt_enable();
+
+ ret = total_len;
goto out;
}
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 0d84de6..bec4109 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -118,6 +118,7 @@ struct vhost_net_virtqueue {
struct ptr_ring *rx_ring;
struct vhost_net_buf rxq;
struct xdp_buff xdp[VHOST_RX_BATCH];
+ struct vring_used_elem heads[VHOST_RX_BATCH];
};
struct vhost_net {
@@ -511,7 +512,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
void *buf;
int copied;
- if (len < nvq->sock_hlen)
+ if (unlikely(len < nvq->sock_hlen))
return -EFAULT;
if (SKB_DATA_ALIGN(len + pad) +
@@ -567,11 +568,37 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
return 0;
}
+static void vhost_tx_batch(struct vhost_net *net,
+ struct vhost_net_virtqueue *nvq,
+ struct socket *sock,
+ struct msghdr *msghdr, int n)
+{
+ struct tun_msg_ctl ctl = {
+ .type = n << 16 | TUN_MSG_PTR,
+ .ptr = nvq->xdp,
+ };
+ int err;
+
+ if (n == 0)
+ return;
+
+ msghdr->msg_control = &ctl;
+ err = sock->ops->sendmsg(sock, msghdr, 0);
+
+ if (unlikely(err < 0)) {
+ /* FIXME vq_err() */
+ vq_err(&nvq->vq, "sendmsg err!\n");
+ return;
+ }
+ vhost_add_used_and_signal_n(&net->dev, &nvq->vq, nvq->vq.heads, n);
+}
+
+/* Expects to be always run from workqueue - which acts as
+ * read-size critical section for our kind of RCU. */
static void handle_tx_copy(struct vhost_net *net)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
- struct xdp_buff xdp;
unsigned out, in;
int head;
struct msghdr msg = {
@@ -586,7 +613,6 @@ static void handle_tx_copy(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
- struct tun_msg_ctl ctl;
int sent_pkts = 0;
s16 nheads = 0;
@@ -631,22 +657,24 @@ static void handle_tx_copy(struct vhost_net *net)
vq->heads[nheads].id = cpu_to_vhost32(vq, head);
vq->heads[nheads].len = 0;
- err = vhost_net_build_xdp(nvq, &msg.msg_iter, &xdp);
- if (!err) {
- ctl.type = TUN_MSG_PTR;
- ctl.ptr = &xdp;
- msg.msg_control = &ctl;
- } else
- msg.msg_control = NULL;
-
total_len += len;
- if (total_len < VHOST_NET_WEIGHT &&
- vhost_has_more_pkts(net, vq)) {
- msg.msg_flags |= MSG_MORE;
- } else {
- msg.msg_flags &= ~MSG_MORE;
+ err = vhost_net_build_xdp(nvq, &msg.msg_iter,
+ &nvq->xdp[nheads]);
+ if (!err) {
+ if (++nheads == VHOST_RX_BATCH) {
+ vhost_tx_batch(net, nvq, sock, &msg, nheads);
+ nheads = 0;
+ }
+ goto done;
+ } else if (unlikely(err != -ENOSPC)) {
+ vq_err(vq, "Fail to build XDP buffer\n");
+ break;
}
+ vhost_tx_batch(net, nvq, sock, &msg, nheads);
+ msg.msg_control = NULL;
+ nheads = 0;
+
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(sock, &msg, len);
if (unlikely(err < 0)) {
@@ -657,11 +685,9 @@ static void handle_tx_copy(struct vhost_net *net)
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- if (++nheads == VHOST_RX_BATCH) {
- vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
- nheads);
- nheads = 0;
- }
+
+ vhost_add_used_and_signal(&net->dev, vq, head, 0);
+done:
if (vhost_exceeds_weight(++sent_pkts, total_len)) {
vhost_poll_queue(&vq->poll);
break;
@@ -669,8 +695,7 @@ static void handle_tx_copy(struct vhost_net *net)
}
out:
if (nheads)
- vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
- nheads);
+ vhost_tx_batch(net, nvq, sock, &msg, nheads);
mutex_unlock(&vq->mutex);
}
--
2.7.4
This patch implements a TUN-specific msg_control:
#define TUN_MSG_UBUF 1
#define TUN_MSG_PTR 2
struct tun_msg_ctl {
int type;
void *ptr;
};
The first supported type is ubuf, which is already used by the vhost_net
zerocopy code. The second is XDP buff, which allows vhost_net to pass an
XDP buff to TUN. This could be used to implement accepting an array of
XDP buffs from vhost_net in the following patches.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++-
drivers/vhost/net.c | 21 ++++++++++--
include/linux/if_tun.h | 7 ++++
3 files changed, 116 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2560378..b586b3f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2387,18 +2387,107 @@ static void tun_sock_write_space(struct sock *sk)
kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}
+static int tun_xdp_one(struct tun_struct *tun,
+ struct tun_file *tfile,
+ struct xdp_buff *xdp)
+{
+ struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
+ struct tun_pcpu_stats *stats;
+ struct bpf_prog *xdp_prog;
+ struct sk_buff *skb = NULL;
+ u32 rxhash = 0, act;
+ int buflen = *(int *)xdp->data_hard_start;
+ int err = 0;
+ bool skb_xdp = false;
+
+ preempt_disable();
+ rcu_read_lock();
+
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ if (gso->gso_type) {
+ skb_xdp = true;
+ goto build;
+ }
+ xdp_set_data_meta_invalid(xdp);
+ xdp->rxq = &tfile->xdp_rxq;
+ act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err);
+ if (err)
+ goto out;
+ if (act != XDP_PASS)
+ goto out;
+ }
+
+build:
+ skb = build_skb(xdp->data_hard_start, buflen);
+ if (!skb) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (skb_xdp) {
+ err = do_xdp_generic(xdp_prog, skb);
+ if (err != XDP_PASS)
+ goto out;
+ }
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ skb_put(skb, xdp->data_end - xdp->data);
+
+ if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
+ this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
+ kfree_skb(skb);
+ err = -EINVAL;
+ goto out;
+ }
+
+ skb->protocol = eth_type_trans(skb, tun->dev);
+ skb_reset_network_header(skb);
+ skb_probe_transport_header(skb, 0);
+
+ if (!rcu_dereference(tun->steering_prog))
+ rxhash = __skb_get_hash_symmetric(skb);
+
+ netif_receive_skb(skb);
+
+ stats = get_cpu_ptr(tun->pcpu_stats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ u64_stats_update_end(&stats->syncp);
+ put_cpu_ptr(stats);
+
+ if (rxhash)
+ tun_flow_update(tun, rxhash, tfile);
+
+out:
+ rcu_read_unlock();
+ preempt_enable();
+
+ return err;
+}
+
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
int ret;
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
struct tun_struct *tun = tun_get(tfile);
+ struct tun_msg_ctl *ctl = m->msg_control;
if (!tun)
return -EBADFD;
- ret = tun_get_user(tun, tfile, m->msg_control, &m->msg_iter,
+ if (ctl && ctl->type == TUN_MSG_PTR) {
+ ret = tun_xdp_one(tun, tfile, ctl->ptr);
+ if (!ret)
+ ret = total_len;
+ goto out;
+ }
+
+ ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
m->msg_flags & MSG_DONTWAIT,
m->msg_flags & MSG_MORE);
+out:
tun_put(tun);
return ret;
}
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1209e84..0d84de6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -117,6 +117,7 @@ struct vhost_net_virtqueue {
struct vhost_net_ubuf_ref *ubufs;
struct ptr_ring *rx_ring;
struct vhost_net_buf rxq;
+ struct xdp_buff xdp[VHOST_RX_BATCH];
};
struct vhost_net {
@@ -570,6 +571,7 @@ static void handle_tx_copy(struct vhost_net *net)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
+ struct xdp_buff xdp;
unsigned out, in;
int head;
struct msghdr msg = {
@@ -584,6 +586,7 @@ static void handle_tx_copy(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+ struct tun_msg_ctl ctl;
int sent_pkts = 0;
s16 nheads = 0;
@@ -628,6 +631,14 @@ static void handle_tx_copy(struct vhost_net *net)
vq->heads[nheads].id = cpu_to_vhost32(vq, head);
vq->heads[nheads].len = 0;
+ err = vhost_net_build_xdp(nvq, &msg.msg_iter, &xdp);
+ if (!err) {
+ ctl.type = TUN_MSG_PTR;
+ ctl.ptr = &xdp;
+ msg.msg_control = &ctl;
+ } else
+ msg.msg_control = NULL;
+
total_len += len;
if (total_len < VHOST_NET_WEIGHT &&
vhost_has_more_pkts(net, vq)) {
@@ -734,16 +745,21 @@ static void handle_tx_zerocopy(struct vhost_net *net)
/* use msg_control to pass vhost zerocopy ubuf info to skb */
if (zcopy_used) {
struct ubuf_info *ubuf;
+ struct tun_msg_ctl ctl;
+
ubuf = nvq->ubuf_info + nvq->upend_idx;
+ ctl.type = TUN_MSG_UBUF;
+ ctl.ptr = ubuf;
+
vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
refcount_set(&ubuf->refcnt, 1);
- msg.msg_control = ubuf;
- msg.msg_controllen = sizeof(ubuf);
+ msg.msg_control = &ctl;
+ msg.msg_controllen = sizeof(ctl);
ubufs = nvq->ubufs;
atomic_inc(&ubufs->refcount);
nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
@@ -751,6 +767,7 @@ static void handle_tx_zerocopy(struct vhost_net *net)
msg.msg_control = NULL;
ubufs = NULL;
}
+
total_len += len;
if (total_len < VHOST_NET_WEIGHT &&
vhost_has_more_pkts(net, vq)) {
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 3d2996d..ba46dce 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,6 +19,13 @@
#define TUN_XDP_FLAG 0x1UL
+#define TUN_MSG_UBUF 1
+#define TUN_MSG_PTR 2
+struct tun_msg_ctl {
+ int type;
+ void *ptr;
+};
+
#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *);
struct ptr_ring *tun_get_tx_ring(struct file *file);
--
2.7.4
This patch splits out the XDP logic into a single function. This makes
it reusable by the XDP batching path in the following patch.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 85 +++++++++++++++++++++++++++++++++----------------------
1 file changed, 51 insertions(+), 34 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ed04291..2560378 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1605,6 +1605,44 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
return true;
}
+static u32 tun_do_xdp(struct tun_struct *tun,
+ struct tun_file *tfile,
+ struct bpf_prog *xdp_prog,
+ struct xdp_buff *xdp,
+ int *err)
+{
+ u32 act = bpf_prog_run_xdp(xdp_prog, xdp);
+
+ switch (act) {
+ case XDP_REDIRECT:
+ *err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+ xdp_do_flush_map();
+ if (*err)
+ break;
+ goto out;
+ case XDP_TX:
+ *err = tun_xdp_tx(tun->dev, xdp);
+ if (*err)
+ break;
+ tun_xdp_flush(tun->dev);
+ goto out;
+ case XDP_PASS:
+ goto out;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ break;
+ }
+
+ put_page(virt_to_head_page(xdp->data_hard_start));
+out:
+ return act;
+}
+
static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct tun_file *tfile,
struct iov_iter *from,
@@ -1615,10 +1653,10 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct sk_buff *skb = NULL;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- unsigned int delta = 0;
char *buf;
size_t copied;
- int err, pad = TUN_RX_PAD;
+ int pad = TUN_RX_PAD;
+ int err = 0;
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
@@ -1655,9 +1693,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
preempt_disable();
rcu_read_lock();
xdp_prog = rcu_dereference(tun->xdp_prog);
- if (xdp_prog && !*skb_xdp) {
+ if (xdp_prog) {
struct xdp_buff xdp;
- void *orig_data;
u32 act;
xdp.data_hard_start = buf;
@@ -1665,34 +1702,14 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + len;
xdp.rxq = &tfile->xdp_rxq;
- orig_data = xdp.data;
- act = bpf_prog_run_xdp(xdp_prog, &xdp);
-
- switch (act) {
- case XDP_REDIRECT:
- err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
- xdp_do_flush_map();
- if (err)
- goto err_xdp;
- goto out;
- case XDP_TX:
- if (tun_xdp_tx(tun->dev, &xdp))
- goto err_xdp;
- tun_xdp_flush(tun->dev);
- goto out;
- case XDP_PASS:
- delta = orig_data - xdp.data;
- len = xdp.data_end - xdp.data;
- break;
- default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
- case XDP_ABORTED:
- trace_xdp_exception(tun->dev, xdp_prog, act);
- /* fall through */
- case XDP_DROP:
+ act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
+ if (err)
goto err_xdp;
- }
+ if (act != XDP_PASS)
+ goto out;
+
+ pad = xdp.data - xdp.data_hard_start;
+ len = xdp.data_end - xdp.data;
}
rcu_read_unlock();
preempt_enable();
@@ -1700,18 +1717,18 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
build:
skb = build_skb(buf, buflen);
if (!skb) {
+ put_page(alloc_frag->page);
skb = ERR_PTR(-ENOMEM);
goto out;
}
- skb_reserve(skb, pad - delta);
+ skb_reserve(skb, pad);
skb_put(skb, len);
return skb;
err_xdp:
- alloc_frag->offset -= buflen;
- put_page(alloc_frag->page);
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
out:
rcu_read_unlock();
preempt_enable();
--
2.7.4
This patch implements building XDP buffers in vhost_net. The idea is to do the
userspace copy in vhost_net and build an XDP buff based on the
page. Vhost_net can then submit one or an array of XDP buffs to the
underlying socket (e.g TUN). TUN can choose to do XDP or call
build_skb() to build an skb. To support building an skb, the vnet header is also
stored into the header of the XDP buff.
This userspace copy and XDP buff building is key to achieving XDP
batching in TUN, since TUN does not need to care about the userspace copy
and then can disable preemption for several XDP buffs to achieve
batching from XDP.
TODO: reserve headroom based on the TUN XDP.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/vhost/net.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 74 insertions(+)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f0639d7..1209e84 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -492,6 +492,80 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
likely(!vhost_exceeds_maxpend(net));
}
+#define VHOST_NET_HEADROOM 256
+#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
+static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
+ struct iov_iter *from,
+ struct xdp_buff *xdp)
+{
+ struct vhost_virtqueue *vq = &nvq->vq;
+ struct page_frag *alloc_frag = ¤t->task_frag;
+ struct virtio_net_hdr *gso;
+ size_t len = iov_iter_count(from);
+ int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + VHOST_NET_HEADROOM
+ + nvq->sock_hlen);
+ int sock_hlen = nvq->sock_hlen;
+ void *buf;
+ int copied;
+
+ if (len < nvq->sock_hlen)
+ return -EFAULT;
+
+ if (SKB_DATA_ALIGN(len + pad) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+ return -ENOSPC;
+
+ buflen += SKB_DATA_ALIGN(len + pad);
+ alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
+ if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+ return -ENOMEM;
+
+ buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+
+ /* We store two kinds of metadata in the header which will be
+ * used for XDP_PASS to do build_skb():
+ * offset 0: buflen
+ * offset sizeof(int): vnet header
+ */
+ copied = copy_page_from_iter(alloc_frag->page,
+ alloc_frag->offset + sizeof(int), sock_hlen, from);
+ if (copied != sock_hlen)
+ return -EFAULT;
+
+ gso = (struct virtio_net_hdr *)(buf + sizeof(int));
+
+ if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+ vhost16_to_cpu(vq, gso->csum_start) +
+ vhost16_to_cpu(vq, gso->csum_offset) + 2 >
+ vhost16_to_cpu(vq, gso->hdr_len)) {
+ gso->hdr_len = cpu_to_vhost16(vq,
+ vhost16_to_cpu(vq, gso->csum_start) +
+ vhost16_to_cpu(vq, gso->csum_offset) + 2);
+
+ if (vhost16_to_cpu(vq, gso->hdr_len) > len)
+ return -EINVAL;
+ }
+
+ len -= sock_hlen;
+ copied = copy_page_from_iter(alloc_frag->page,
+ alloc_frag->offset + pad,
+ len, from);
+ if (copied != len)
+ return -EFAULT;
+
+ xdp->data_hard_start = buf;
+ xdp->data = buf + pad;
+ xdp->data_end = xdp->data + len;
+ *(int *)(xdp->data_hard_start)= buflen;
+
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
+ return 0;
+}
+
static void handle_tx_copy(struct vhost_net *net)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
--
2.7.4
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 44d4f3d..24ecd82 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1697,6 +1697,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
goto err_xdp;
}
}
+ rcu_read_unlock();
+ preempt_enable();
skb = build_skb(buf, buflen);
if (!skb) {
@@ -1710,9 +1712,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
- rcu_read_unlock();
- preempt_enable();
-
return skb;
err_redirect:
--
2.7.4
Like commit e2b3b35eb989 ("vhost_net: batch used ring update in rx"),
this patch implements batched used ring updates for datacopy TX
(zerocopy has already done some kind of batching).
Testpmd transmission from guest to ixgbe via XDP_REDIRECT shows about
15% improvement from 2.8Mpps to 3.2Mpps.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/vhost/net.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4682fcc..f0639d7 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -511,6 +511,7 @@ static void handle_tx_copy(struct vhost_net *net)
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
int sent_pkts = 0;
+ s16 nheads = 0;
mutex_lock(&vq->mutex);
sock = vq->private_data;
@@ -550,6 +551,9 @@ static void handle_tx_copy(struct vhost_net *net)
if (len < 0)
break;
+ vq->heads[nheads].id = cpu_to_vhost32(vq, head);
+ vq->heads[nheads].len = 0;
+
total_len += len;
if (total_len < VHOST_NET_WEIGHT &&
vhost_has_more_pkts(net, vq)) {
@@ -568,13 +572,20 @@ static void handle_tx_copy(struct vhost_net *net)
if (err != len)
pr_debug("Truncated TX packet: "
" len %d != %zd\n", err, len);
- vhost_add_used_and_signal(&net->dev, vq, head, 0);
+ if (++nheads == VHOST_RX_BATCH) {
+ vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+ nheads);
+ nheads = 0;
+ }
if (vhost_exceeds_weight(++sent_pkts, total_len)) {
vhost_poll_queue(&vq->poll);
break;
}
}
out:
+ if (nheads)
+ vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
+ nheads);
mutex_unlock(&vq->mutex);
}
--
2.7.4
Instead of mixing zerocopy and datacopy logic, this patch tries to
split the datacopy logic out. This results in more compact code, and
specific optimizations can be done on top more easily.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/vhost/net.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 102 insertions(+), 9 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 4ebac76..4682fcc 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -492,9 +492,95 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
likely(!vhost_exceeds_maxpend(net));
}
+static void handle_tx_copy(struct vhost_net *net)
+{
+ struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *vq = &nvq->vq;
+ unsigned out, in;
+ int head;
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_flags = MSG_DONTWAIT,
+ };
+ size_t len, total_len = 0;
+ int err;
+ size_t hdr_size;
+ struct socket *sock;
+ struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+ int sent_pkts = 0;
+
+ mutex_lock(&vq->mutex);
+ sock = vq->private_data;
+ if (!sock)
+ goto out;
+
+ if (!vq_iotlb_prefetch(vq))
+ goto out;
+
+ vhost_disable_notify(&net->dev, vq);
+ vhost_net_disable_vq(net, vq);
+
+ hdr_size = nvq->vhost_hlen;
+
+ for (;;) {
+ head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
+ ARRAY_SIZE(vq->iov),
+ &out, &in);
+ /* On error, stop handling until the next kick. */
+ if (unlikely(head < 0))
+ break;
+ /* Nothing new? Wait for eventfd to tell us they refilled. */
+ if (head == vq->num) {
+ if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ vhost_disable_notify(&net->dev, vq);
+ continue;
+ }
+ break;
+ }
+ if (in) {
+ vq_err(vq, "Unexpected descriptor format for TX: "
+ "out %d, int %d\n", out, in);
+ break;
+ }
+
+ len = init_iov_iter(vq, &msg.msg_iter, hdr_size, out);
+ if (len < 0)
+ break;
+
+ total_len += len;
+ if (total_len < VHOST_NET_WEIGHT &&
+ vhost_has_more_pkts(net, vq)) {
+ msg.msg_flags |= MSG_MORE;
+ } else {
+ msg.msg_flags &= ~MSG_MORE;
+ }
+
+ /* TODO: Check specific error and bomb out unless ENOBUFS? */
+ err = sock->ops->sendmsg(sock, &msg, len);
+ if (unlikely(err < 0)) {
+ vhost_discard_vq_desc(vq, 1);
+ vhost_net_enable_vq(net, vq);
+ break;
+ }
+ if (err != len)
+ pr_debug("Truncated TX packet: "
+ " len %d != %zd\n", err, len);
+ vhost_add_used_and_signal(&net->dev, vq, head, 0);
+ if (vhost_exceeds_weight(++sent_pkts, total_len)) {
+ vhost_poll_queue(&vq->poll);
+ break;
+ }
+ }
+out:
+ mutex_unlock(&vq->mutex);
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
-static void handle_tx(struct vhost_net *net)
+static void handle_tx_zerocopy(struct vhost_net *net)
{
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
@@ -512,7 +598,7 @@ static void handle_tx(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
- bool zcopy, zcopy_used;
+ bool zcopy_used;
int sent_pkts = 0;
mutex_lock(&vq->mutex);
@@ -527,13 +613,10 @@ static void handle_tx(struct vhost_net *net)
vhost_net_disable_vq(net, vq);
hdr_size = nvq->vhost_hlen;
- zcopy = nvq->ubufs;
for (;;) {
/* Release DMAs done buffers first */
- if (zcopy)
- vhost_zerocopy_signal_used(net, vq);
-
+ vhost_zerocopy_signal_used(net, vq);
head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
ARRAY_SIZE(vq->iov),
@@ -559,9 +642,9 @@ static void handle_tx(struct vhost_net *net)
if (len < 0)
break;
- zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
- && !vhost_exceeds_maxpend(net)
- && vhost_net_tx_select_zcopy(net);
+ zcopy_used = len >= VHOST_GOODCOPY_LEN
+ && !vhost_exceeds_maxpend(net)
+ && vhost_net_tx_select_zcopy(net);
/* use msg_control to pass vhost zerocopy ubuf info to skb */
if (zcopy_used) {
@@ -620,6 +703,16 @@ static void handle_tx(struct vhost_net *net)
mutex_unlock(&vq->mutex);
}
+static void handle_tx(struct vhost_net *net)
+{
+ struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+
+ if (nvq->ubufs)
+ handle_tx_zerocopy(net);
+ else
+ handle_tx_copy(net);
+}
+
static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
{
struct sk_buff *head;
--
2.7.4
Signed-off-by: Jason Wang <[email protected]>
---
drivers/vhost/net.c | 34 +++++++++++++++++++++++-----------
1 file changed, 23 insertions(+), 11 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c4b49fc..15d191a 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -459,6 +459,26 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
}
+static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
+ size_t hdr_size, int out)
+{
+ /* Skip header. TODO: support TSO. */
+ size_t len = iov_length(vq->iov, out);
+
+ iov_iter_init(iter, WRITE, vq->iov, out, len);
+ iov_iter_advance(iter, hdr_size);
+ /* Sanity check */
+ if (!iov_iter_count(iter)) {
+ vq_err(vq, "Unexpected header len for TX: "
+ "%zd expected %zd\n",
+ len, hdr_size);
+ return -EFAULT;
+ }
+ len = iov_iter_count(iter);
+
+ return len;
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
@@ -521,18 +541,10 @@ static void handle_tx(struct vhost_net *net)
"out %d, int %d\n", out, in);
break;
}
- /* Skip header. TODO: support TSO. */
- len = iov_length(vq->iov, out);
- iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
- iov_iter_advance(&msg.msg_iter, hdr_size);
- /* Sanity check */
- if (!msg_data_left(&msg)) {
- vq_err(vq, "Unexpected header len for TX: "
- "%zd expected %zd\n",
- len, hdr_size);
+
+ len = init_iov_iter(vq, &msg.msg_iter, hdr_size, out);
+ if (len < 0)
break;
- }
- len = msg_data_left(&msg);
zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
&& !vhost_exceeds_maxpend(net)
--
2.7.4
Signed-off-by: Jason Wang <[email protected]>
---
drivers/vhost/net.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index de544ee..4ebac76 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -485,6 +485,13 @@ static bool vhost_exceeds_weight(int pkts, int total_len)
unlikely(pkts >= VHOST_NET_PKT_WEIGHT);
}
+static bool vhost_has_more_pkts(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ return !vhost_vq_avail_empty(&net->dev, vq) &&
+ likely(!vhost_exceeds_maxpend(net));
+}
+
/* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
@@ -578,8 +585,7 @@ static void handle_tx(struct vhost_net *net)
}
total_len += len;
if (total_len < VHOST_NET_WEIGHT &&
- !vhost_vq_avail_empty(&net->dev, vq) &&
- likely(!vhost_exceeds_maxpend(net))) {
+ vhost_has_more_pkts(net, vq)) {
msg.msg_flags |= MSG_MORE;
} else {
msg.msg_flags &= ~MSG_MORE;
@@ -605,7 +611,7 @@ static void handle_tx(struct vhost_net *net)
else
vhost_zerocopy_signal_used(net, vq);
vhost_net_tx_packet(net);
- if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
+ if (vhost_exceeds_weight(++sent_pkts, total_len)) {
vhost_poll_queue(&vq->poll);
break;
}
--
2.7.4
On Mon, May 21, 2018 at 05:04:27PM +0800, Jason Wang wrote:
> Signed-off-by: Jason Wang <[email protected]>
typo in subject
> ---
> drivers/net/tun.c | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 44d4f3d..24ecd82 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1697,6 +1697,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
> goto err_xdp;
> }
> }
> + rcu_read_unlock();
> + preempt_enable();
>
> skb = build_skb(buf, buflen);
> if (!skb) {
> @@ -1710,9 +1712,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
> get_page(alloc_frag->page);
> alloc_frag->offset += buflen;
>
> - rcu_read_unlock();
> - preempt_enable();
> -
> return skb;
>
> err_redirect:
> --
> 2.7.4
On Mon, May 21, 2018 at 05:04:33PM +0800, Jason Wang wrote:
> This patch implements XDP batching for vhost_net with tun. This is
> done by batching XDP buffs in vhost and submit them when:
>
> - vhost_net can not build XDP buff (mostly because of the size of packet)
> - #batched exceeds the limitation (VHOST_NET_RX_BATCH).
> - tun accept a batch of XDP buff through msg_control and process them
> in a batch
>
> With this tun XDP can benefit from e.g batch transmission during
> XDP_REDIRECT or XDP_TX.
>
> Tests shows 21% improvement on TX pps (from ~3.2Mpps to ~3.9Mpps)
> while transmitting through testpmd from guest to host by
> xdp_redirect_map between tap0 and ixgbe.
>
> Signed-off-by: Jason Wang <[email protected]>
s/underlayer/underlying/ ?
> ---
> drivers/net/tun.c | 36 +++++++++++++++++----------
> drivers/vhost/net.c | 71 ++++++++++++++++++++++++++++++++++++-----------------
> 2 files changed, 71 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index b586b3f..5d16d18 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1616,7 +1616,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
> switch (act) {
> case XDP_REDIRECT:
> *err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
> - xdp_do_flush_map();
> if (*err)
> break;
> goto out;
> @@ -1624,7 +1623,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
> *err = tun_xdp_tx(tun->dev, xdp);
> if (*err)
> break;
> - tun_xdp_flush(tun->dev);
> goto out;
> case XDP_PASS:
> goto out;
> @@ -2400,9 +2398,6 @@ static int tun_xdp_one(struct tun_struct *tun,
> int err = 0;
> bool skb_xdp = false;
>
> - preempt_disable();
> - rcu_read_lock();
> -
> xdp_prog = rcu_dereference(tun->xdp_prog);
> if (xdp_prog) {
> if (gso->gso_type) {
> @@ -2461,15 +2456,12 @@ static int tun_xdp_one(struct tun_struct *tun,
> tun_flow_update(tun, rxhash, tfile);
>
> out:
> - rcu_read_unlock();
> - preempt_enable();
> -
> return err;
> }
>
> static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
> {
> - int ret;
> + int ret, i;
> struct tun_file *tfile = container_of(sock, struct tun_file, socket);
> struct tun_struct *tun = tun_get(tfile);
> struct tun_msg_ctl *ctl = m->msg_control;
> @@ -2477,10 +2469,28 @@ static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
> if (!tun)
> return -EBADFD;
>
> - if (ctl && ctl->type == TUN_MSG_PTR) {
> - ret = tun_xdp_one(tun, tfile, ctl->ptr);
> - if (!ret)
> - ret = total_len;
> + if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
> + int n = ctl->type >> 16;
> +
> + preempt_disable();
> + rcu_read_lock();
> +
> + for (i = 0; i < n; i++) {
> + struct xdp_buff *x = (struct xdp_buff *)ctl->ptr;
> + struct xdp_buff *xdp = &x[i];
> +
> + xdp_set_data_meta_invalid(xdp);
> + xdp->rxq = &tfile->xdp_rxq;
> + tun_xdp_one(tun, tfile, xdp);
> + }
> +
> + xdp_do_flush_map();
> + tun_xdp_flush(tun->dev);
> +
> + rcu_read_unlock();
> + preempt_enable();
> +
> + ret = total_len;
> goto out;
> }
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 0d84de6..bec4109 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -118,6 +118,7 @@ struct vhost_net_virtqueue {
> struct ptr_ring *rx_ring;
> struct vhost_net_buf rxq;
> struct xdp_buff xdp[VHOST_RX_BATCH];
> + struct vring_used_elem heads[VHOST_RX_BATCH];
> };
>
> struct vhost_net {
> @@ -511,7 +512,7 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> void *buf;
> int copied;
>
> - if (len < nvq->sock_hlen)
> + if (unlikely(len < nvq->sock_hlen))
> return -EFAULT;
>
> if (SKB_DATA_ALIGN(len + pad) +
> @@ -567,11 +568,37 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> return 0;
> }
>
> +static void vhost_tx_batch(struct vhost_net *net,
> + struct vhost_net_virtqueue *nvq,
> + struct socket *sock,
> + struct msghdr *msghdr, int n)
> +{
> + struct tun_msg_ctl ctl = {
> + .type = n << 16 | TUN_MSG_PTR,
> + .ptr = nvq->xdp,
> + };
> + int err;
> +
> + if (n == 0)
> + return;
> +
> + msghdr->msg_control = &ctl;
> + err = sock->ops->sendmsg(sock, msghdr, 0);
> +
> + if (unlikely(err < 0)) {
> + /* FIXME vq_err() */
> + vq_err(&nvq->vq, "sendmsg err!\n");
> + return;
> + }
> + vhost_add_used_and_signal_n(&net->dev, &nvq->vq, nvq->vq.heads, n);
> +}
> +
> +/* Expects to be always run from workqueue - which acts as
> + * read-size critical section for our kind of RCU. */
> static void handle_tx_copy(struct vhost_net *net)
> {
> struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> struct vhost_virtqueue *vq = &nvq->vq;
> - struct xdp_buff xdp;
> unsigned out, in;
> int head;
> struct msghdr msg = {
> @@ -586,7 +613,6 @@ static void handle_tx_copy(struct vhost_net *net)
> size_t hdr_size;
> struct socket *sock;
> struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
> - struct tun_msg_ctl ctl;
> int sent_pkts = 0;
> s16 nheads = 0;
>
> @@ -631,22 +657,24 @@ static void handle_tx_copy(struct vhost_net *net)
> vq->heads[nheads].id = cpu_to_vhost32(vq, head);
> vq->heads[nheads].len = 0;
>
> - err = vhost_net_build_xdp(nvq, &msg.msg_iter, &xdp);
> - if (!err) {
> - ctl.type = TUN_MSG_PTR;
> - ctl.ptr = &xdp;
> - msg.msg_control = &ctl;
> - } else
> - msg.msg_control = NULL;
> -
> total_len += len;
> - if (total_len < VHOST_NET_WEIGHT &&
> - vhost_has_more_pkts(net, vq)) {
> - msg.msg_flags |= MSG_MORE;
> - } else {
> - msg.msg_flags &= ~MSG_MORE;
> + err = vhost_net_build_xdp(nvq, &msg.msg_iter,
> + &nvq->xdp[nheads]);
> + if (!err) {
> + if (++nheads == VHOST_RX_BATCH) {
> + vhost_tx_batch(net, nvq, sock, &msg, nheads);
> + nheads = 0;
> + }
> + goto done;
> + } else if (unlikely(err != -ENOSPC)) {
> + vq_err(vq, "Fail to build XDP buffer\n");
> + break;
> }
>
> + vhost_tx_batch(net, nvq, sock, &msg, nheads);
> + msg.msg_control = NULL;
> + nheads = 0;
> +
> /* TODO: Check specific error and bomb out unless ENOBUFS? */
> err = sock->ops->sendmsg(sock, &msg, len);
> if (unlikely(err < 0)) {
> @@ -657,11 +685,9 @@ static void handle_tx_copy(struct vhost_net *net)
> if (err != len)
> pr_debug("Truncated TX packet: "
> " len %d != %zd\n", err, len);
> - if (++nheads == VHOST_RX_BATCH) {
> - vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> - nheads);
> - nheads = 0;
> - }
> +
> + vhost_add_used_and_signal(&net->dev, vq, head, 0);
> +done:
> if (vhost_exceeds_weight(++sent_pkts, total_len)) {
> vhost_poll_queue(&vq->poll);
> break;
> @@ -669,8 +695,7 @@ static void handle_tx_copy(struct vhost_net *net)
> }
> out:
> if (nheads)
> - vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
> - nheads);
> + vhost_tx_batch(net, nvq, sock, &msg, nheads);
> mutex_unlock(&vq->mutex);
> }
>
> --
> 2.7.4
Hi Jason, a few nits.
On Mon, 21 May 2018 17:04:22 +0800 Jason wrote:
> Signed-off-by: Jason Wang <[email protected]>
> ---
> drivers/vhost/net.c | 34 +++++++++++++++++++++++-----------
> 1 file changed, 23 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index c4b49fc..15d191a 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -459,6 +459,26 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
> min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
> }
>
> +static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
> + size_t hdr_size, int out)
> +{
> + /* Skip header. TODO: support TSO. */
> + size_t len = iov_length(vq->iov, out);
> +
> + iov_iter_init(iter, WRITE, vq->iov, out, len);
> + iov_iter_advance(iter, hdr_size);
> + /* Sanity check */
> + if (!iov_iter_count(iter)) {
> + vq_err(vq, "Unexpected header len for TX: "
> + "%zd expected %zd\n",
> + len, hdr_size);
ok, it was like this before, but please unwrap the string in " ", there
should be no line breaks in string declarations and they are allowed to
go over 80 characters.
> + return -EFAULT;
> + }
> + len = iov_iter_count(iter);
> +
> + return len;
> +}
> +
> /* Expects to be always run from workqueue - which acts as
> * read-size critical section for our kind of RCU. */
> static void handle_tx(struct vhost_net *net)
> @@ -521,18 +541,10 @@ static void handle_tx(struct vhost_net *net)
> "out %d, int %d\n", out, in);
> break;
> }
> - /* Skip header. TODO: support TSO. */
> - len = iov_length(vq->iov, out);
> - iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
> - iov_iter_advance(&msg.msg_iter, hdr_size);
> - /* Sanity check */
> - if (!msg_data_left(&msg)) {
> - vq_err(vq, "Unexpected header len for TX: "
> - "%zd expected %zd\n",
> - len, hdr_size);
> +
> + len = init_iov_iter(vq, &msg.msg_iter, hdr_size, out);
> + if (len < 0)
len is declared as size_t, which is unsigned, and can never be
negative. I'm pretty sure this is a bug.
> break;
> - }
> - len = msg_data_left(&msg);
>
> zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
> && !vhost_exceeds_maxpend(net)
On Mon, 21 May 2018 17:04:23 +0800 Jason wrote:
> Signed-off-by: Jason Wang <[email protected]>
> ---
> drivers/vhost/net.c | 13 ++++++++-----
> 1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 15d191a..de544ee 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -479,6 +479,12 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
> return len;
> }
>
> +static bool vhost_exceeds_weight(int pkts, int total_len)
> +{
> + return unlikely(total_len >= VHOST_NET_WEIGHT) ||
> + unlikely(pkts >= VHOST_NET_PKT_WEIGHT);
I was going to say just one unlikely, but then the caller of this
function also says unlikely(vhost_exceeds...), so I think you should
just drop the unlikely statements here (both of them)
> +}
> +
> /* Expects to be always run from workqueue - which acts as
> * read-size critical section for our kind of RCU. */
> static void handle_tx(struct vhost_net *net)
> @@ -570,7 +576,6 @@ static void handle_tx(struct vhost_net *net)
> msg.msg_control = NULL;
> ubufs = NULL;
> }
> -
unrelated whitespace changes?
> total_len += len;
> if (total_len < VHOST_NET_WEIGHT &&
> !vhost_vq_avail_empty(&net->dev, vq) &&
> @@ -600,8 +605,7 @@ static void handle_tx(struct vhost_net *net)
> else
> vhost_zerocopy_signal_used(net, vq);
> vhost_net_tx_packet(net);
> - if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
> - unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
> + if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
> vhost_poll_queue(&vq->poll);
> break;
> }
> @@ -887,8 +891,7 @@ static void handle_rx(struct vhost_net *net)
> if (unlikely(vq_log))
> vhost_log_write(vq, vq_log, log, vhost_len);
> total_len += vhost_len;
> - if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
> - unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
> + if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
> vhost_poll_queue(&vq->poll);
> goto out;
> }
On Mon, 21 May 2018 17:04:24 +0800 Jason wrote:
> Signed-off-by: Jason Wang <[email protected]>
> ---
> drivers/vhost/net.c | 12 +++++++++---
> 1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index de544ee..4ebac76 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -485,6 +485,13 @@ static bool vhost_exceeds_weight(int pkts, int total_len)
> unlikely(pkts >= VHOST_NET_PKT_WEIGHT);
> }
>
> +static bool vhost_has_more_pkts(struct vhost_net *net,
> + struct vhost_virtqueue *vq)
> +{
> + return !vhost_vq_avail_empty(&net->dev, vq) &&
> + likely(!vhost_exceeds_maxpend(net));
This really seems like mis-use of likely/unlikely, in the middle of a
sequence of operations that will always be run when this function is
called. I think you should remove the likely from this helper,
especially, and control the branch from the branch point.
> +}
> +
> /* Expects to be always run from workqueue - which acts as
> * read-size critical section for our kind of RCU. */
> static void handle_tx(struct vhost_net *net)
> @@ -578,8 +585,7 @@ static void handle_tx(struct vhost_net *net)
> }
> total_len += len;
> if (total_len < VHOST_NET_WEIGHT &&
> - !vhost_vq_avail_empty(&net->dev, vq) &&
> - likely(!vhost_exceeds_maxpend(net))) {
> + vhost_has_more_pkts(net, vq)) {
Yes, I know it came from here, but likely/unlikely are for branch
control, so they should encapsulate everything inside the if, unless
I'm mistaken.
> msg.msg_flags |= MSG_MORE;
> } else {
> msg.msg_flags &= ~MSG_MORE;
> @@ -605,7 +611,7 @@ static void handle_tx(struct vhost_net *net)
> else
> vhost_zerocopy_signal_used(net, vq);
> vhost_net_tx_packet(net);
> - if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
> + if (vhost_exceeds_weight(++sent_pkts, total_len)) {
You should have kept the unlikely here, and not had it inside the
helper (as per the previous patch). Also, why wasn't this change part
of the previous patch?
> vhost_poll_queue(&vq->poll);
> break;
> }
On Mon, 21 May 2018 17:04:25 +0800 Jason wrote:
> Instead of mixing zerocopy and datacopy logics, this patch tries to
> split datacopy logic out. This results for a more compact code and
> specific optimization could be done on top more easily.
>
> Signed-off-by: Jason Wang <[email protected]>
> ---
> drivers/vhost/net.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 102 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 4ebac76..4682fcc 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -492,9 +492,95 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
> likely(!vhost_exceeds_maxpend(net));
> }
>
> +static void handle_tx_copy(struct vhost_net *net)
> +{
> + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> + struct vhost_virtqueue *vq = &nvq->vq;
> + unsigned out, in;
move "out" and "in" down to inside the for loop as well.
> + int head;
move this "head" declaration inside for-loop below.
> + struct msghdr msg = {
> + .msg_name = NULL,
> + .msg_namelen = 0,
> + .msg_control = NULL,
> + .msg_controllen = 0,
> + .msg_flags = MSG_DONTWAIT,
> + };
> + size_t len, total_len = 0;
> + int err;
> + size_t hdr_size;
> + struct socket *sock;
> + struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
> + int sent_pkts = 0;
why do we need so many locals?
> +
> + mutex_lock(&vq->mutex);
> + sock = vq->private_data;
> + if (!sock)
> + goto out;
> +
> + if (!vq_iotlb_prefetch(vq))
> + goto out;
> +
> + vhost_disable_notify(&net->dev, vq);
> + vhost_net_disable_vq(net, vq);
> +
> + hdr_size = nvq->vhost_hlen;
> +
> + for (;;) {
> + head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
> + ARRAY_SIZE(vq->iov),
> + &out, &in);
> + /* On error, stop handling until the next kick. */
> + if (unlikely(head < 0))
> + break;
> + /* Nothing new? Wait for eventfd to tell us they refilled. */
> + if (head == vq->num) {
> + if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> + vhost_disable_notify(&net->dev, vq);
> + continue;
> + }
> + break;
> + }
> + if (in) {
> + vq_err(vq, "Unexpected descriptor format for TX: "
> + "out %d, int %d\n", out, in);
don't break strings, keep all the bits between " " together, even if it
is longer than 80 chars.
> + break;
> + }
> +
> + len = init_iov_iter(vq, &msg.msg_iter, hdr_size, out);
> + if (len < 0)
> + break;
same comment as previous patch, len is a size_t which is unsigned.
> +
> + total_len += len;
> + if (total_len < VHOST_NET_WEIGHT &&
> + vhost_has_more_pkts(net, vq)) {
> + msg.msg_flags |= MSG_MORE;
> + } else {
> + msg.msg_flags &= ~MSG_MORE;
> + }
don't need { } here.
> +
> + /* TODO: Check specific error and bomb out unless ENOBUFS? */
> + err = sock->ops->sendmsg(sock, &msg, len);
> + if (unlikely(err < 0)) {
> + vhost_discard_vq_desc(vq, 1);
> + vhost_net_enable_vq(net, vq);
> + break;
> + }
> + if (err != len)
> + pr_debug("Truncated TX packet: "
> + " len %d != %zd\n", err, len);
single line " " string please
On Mon, 21 May 2018 17:04:31 +0800 Jason wrote:
> This patch implement build XDP buffers in vhost_net. The idea is do
> userspace copy in vhost_net and build XDP buff based on the
> page. Vhost_net can then submit one or an array of XDP buffs to
> underlayer socket (e.g TUN). TUN can choose to do XDP or call
> build_skb() to build skb. To support build skb, vnet header were also
> stored into the header of the XDP buff.
>
> This userspace copy and XDP buffs building is key to achieve XDP
> batching in TUN, since TUN does not need to care about userspace copy
> and then can disable premmption for several XDP buffs to achieve
> batching from XDP.
>
> TODO: reserve headroom based on the TUN XDP.
>
> Signed-off-by: Jason Wang <[email protected]>
> ---
> drivers/vhost/net.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 74 insertions(+)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index f0639d7..1209e84 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -492,6 +492,80 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
> likely(!vhost_exceeds_maxpend(net));
> }
>
> +#define VHOST_NET_HEADROOM 256
> +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> +
> +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> + struct iov_iter *from,
> + struct xdp_buff *xdp)
> +{
> + struct vhost_virtqueue *vq = &nvq->vq;
> + struct page_frag *alloc_frag = ¤t->task_frag;
> + struct virtio_net_hdr *gso;
> + size_t len = iov_iter_count(from);
> + int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> + int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + VHOST_NET_HEADROOM
> + + nvq->sock_hlen);
> + int sock_hlen = nvq->sock_hlen;
> + void *buf;
> + int copied;
> +
> + if (len < nvq->sock_hlen)
> + return -EFAULT;
> +
> + if (SKB_DATA_ALIGN(len + pad) +
> + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
> + return -ENOSPC;
> +
> + buflen += SKB_DATA_ALIGN(len + pad);
maybe store the result of SKB_DATA_ALIGN in a local instead of doing
the work twice?
> + alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
> + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
> + return -ENOMEM;
> +
> + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +
> + /* We store two kinds of metadata in the header which will be
> + * used for XDP_PASS to do build_skb():
> + * offset 0: buflen
> + * offset sizeof(int): vnet header
> + */
> + copied = copy_page_from_iter(alloc_frag->page,
> + alloc_frag->offset + sizeof(int), sock_hlen, from);
> + if (copied != sock_hlen)
> + return -EFAULT;
> +
> + gso = (struct virtio_net_hdr *)(buf + sizeof(int));
> +
> + if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> + vhost16_to_cpu(vq, gso->csum_start) +
> + vhost16_to_cpu(vq, gso->csum_offset) + 2 >
> + vhost16_to_cpu(vq, gso->hdr_len)) {
> + gso->hdr_len = cpu_to_vhost16(vq,
> + vhost16_to_cpu(vq, gso->csum_start) +
> + vhost16_to_cpu(vq, gso->csum_offset) + 2);
> +
> + if (vhost16_to_cpu(vq, gso->hdr_len) > len)
> + return -EINVAL;
> + }
> +
> + len -= sock_hlen;
> + copied = copy_page_from_iter(alloc_frag->page,
> + alloc_frag->offset + pad,
> + len, from);
> + if (copied != len)
> + return -EFAULT;
> +
> + xdp->data_hard_start = buf;
> + xdp->data = buf + pad;
> + xdp->data_end = xdp->data + len;
> + *(int *)(xdp->data_hard_start)= buflen;
space before =
> +
> + get_page(alloc_frag->page);
> + alloc_frag->offset += buflen;
> +
> + return 0;
> +}
> +
> static void handle_tx_copy(struct vhost_net *net)
> {
> struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
On Mon, May 21, 2018 at 09:56:11AM -0700, Jesse Brandeburg wrote:
> On Mon, 21 May 2018 17:04:31 +0800 Jason wrote:
> > This patch implement build XDP buffers in vhost_net. The idea is do
> > userspace copy in vhost_net and build XDP buff based on the
> > page. Vhost_net can then submit one or an array of XDP buffs to
> > underlayer socket (e.g TUN). TUN can choose to do XDP or call
> > build_skb() to build skb. To support build skb, vnet header were also
> > stored into the header of the XDP buff.
> >
> > This userspace copy and XDP buffs building is key to achieve XDP
> > batching in TUN, since TUN does not need to care about userspace copy
> > and then can disable premmption for several XDP buffs to achieve
> > batching from XDP.
> >
> > TODO: reserve headroom based on the TUN XDP.
> >
> > Signed-off-by: Jason Wang <[email protected]>
> > ---
> > drivers/vhost/net.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 74 insertions(+)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index f0639d7..1209e84 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -492,6 +492,80 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
> > likely(!vhost_exceeds_maxpend(net));
> > }
> >
> > +#define VHOST_NET_HEADROOM 256
> > +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> > +
> > +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
> > + struct iov_iter *from,
> > + struct xdp_buff *xdp)
> > +{
> > + struct vhost_virtqueue *vq = &nvq->vq;
> > + struct page_frag *alloc_frag = ¤t->task_frag;
> > + struct virtio_net_hdr *gso;
> > + size_t len = iov_iter_count(from);
> > + int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> > + int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + VHOST_NET_HEADROOM
> > + + nvq->sock_hlen);
> > + int sock_hlen = nvq->sock_hlen;
> > + void *buf;
> > + int copied;
> > +
> > + if (len < nvq->sock_hlen)
> > + return -EFAULT;
> > +
> > + if (SKB_DATA_ALIGN(len + pad) +
> > + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
> > + return -ENOSPC;
> > +
> > + buflen += SKB_DATA_ALIGN(len + pad);
>
> maybe store the result of SKB_DATA_ALIGN in a local instead of doing
> the work twice?
I don't mind, but I guess gcc can always do it itself?
> > + alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
> > + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
> > + return -ENOMEM;
> > +
> > + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > +
> > + /* We store two kinds of metadata in the header which will be
> > + * used for XDP_PASS to do build_skb():
> > + * offset 0: buflen
> > + * offset sizeof(int): vnet header
> > + */
> > + copied = copy_page_from_iter(alloc_frag->page,
> > + alloc_frag->offset + sizeof(int), sock_hlen, from);
> > + if (copied != sock_hlen)
> > + return -EFAULT;
> > +
> > + gso = (struct virtio_net_hdr *)(buf + sizeof(int));
> > +
> > + if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
> > + vhost16_to_cpu(vq, gso->csum_start) +
> > + vhost16_to_cpu(vq, gso->csum_offset) + 2 >
> > + vhost16_to_cpu(vq, gso->hdr_len)) {
> > + gso->hdr_len = cpu_to_vhost16(vq,
> > + vhost16_to_cpu(vq, gso->csum_start) +
> > + vhost16_to_cpu(vq, gso->csum_offset) + 2);
> > +
> > + if (vhost16_to_cpu(vq, gso->hdr_len) > len)
> > + return -EINVAL;
> > + }
> > +
> > + len -= sock_hlen;
> > + copied = copy_page_from_iter(alloc_frag->page,
> > + alloc_frag->offset + pad,
> > + len, from);
> > + if (copied != len)
> > + return -EFAULT;
> > +
> > + xdp->data_hard_start = buf;
> > + xdp->data = buf + pad;
> > + xdp->data_end = xdp->data + len;
> > + *(int *)(xdp->data_hard_start)= buflen;
>
> space before =
>
> > +
> > + get_page(alloc_frag->page);
> > + alloc_frag->offset += buflen;
> > +
> > + return 0;
> > +}
> > +
> > static void handle_tx_copy(struct vhost_net *net)
> > {
> > struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
On 2018年05月22日 00:24, Jesse Brandeburg wrote:
> Hi Jason, a few nits.
>
> On Mon, 21 May 2018 17:04:22 +0800 Jason wrote:
>> Signed-off-by: Jason Wang <[email protected]>
>> ---
>> drivers/vhost/net.c | 34 +++++++++++++++++++++++-----------
>> 1 file changed, 23 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index c4b49fc..15d191a 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -459,6 +459,26 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net)
>> min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
>> }
>>
>> +static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
>> + size_t hdr_size, int out)
>> +{
>> + /* Skip header. TODO: support TSO. */
>> + size_t len = iov_length(vq->iov, out);
>> +
>> + iov_iter_init(iter, WRITE, vq->iov, out, len);
>> + iov_iter_advance(iter, hdr_size);
>> + /* Sanity check */
>> + if (!iov_iter_count(iter)) {
>> + vq_err(vq, "Unexpected header len for TX: "
>> + "%zd expected %zd\n",
>> + len, hdr_size);
> ok, it was like this before, but please unwrap the string in " ", there
> should be no line breaks in string declarations and they are allowed to
> go over 80 characters.
Ok.
>
>> + return -EFAULT;
>> + }
>> + len = iov_iter_count(iter);
>> +
>> + return len;
>> +}
>> +
>> /* Expects to be always run from workqueue - which acts as
>> * read-size critical section for our kind of RCU. */
>> static void handle_tx(struct vhost_net *net)
>> @@ -521,18 +541,10 @@ static void handle_tx(struct vhost_net *net)
>> "out %d, int %d\n", out, in);
>> break;
>> }
>> - /* Skip header. TODO: support TSO. */
>> - len = iov_length(vq->iov, out);
>> - iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len);
>> - iov_iter_advance(&msg.msg_iter, hdr_size);
>> - /* Sanity check */
>> - if (!msg_data_left(&msg)) {
>> - vq_err(vq, "Unexpected header len for TX: "
>> - "%zd expected %zd\n",
>> - len, hdr_size);
>> +
>> + len = init_iov_iter(vq, &msg.msg_iter, hdr_size, out);
>> + if (len < 0)
> len is declared as size_t, which is unsigned, and can never be
> negative. I'm pretty sure this is a bug.
Yes, let me fix it in next version.
Thanks
>
>
>> break;
>> - }
>> - len = msg_data_left(&msg);
>>
>> zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN
>> && !vhost_exceeds_maxpend(net)
On 2018年05月22日 00:29, Jesse Brandeburg wrote:
> On Mon, 21 May 2018 17:04:23 +0800 Jason wrote:
>> Signed-off-by: Jason Wang <[email protected]>
>> ---
>> drivers/vhost/net.c | 13 ++++++++-----
>> 1 file changed, 8 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 15d191a..de544ee 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -479,6 +479,12 @@ static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
>> return len;
>> }
>>
>> +static bool vhost_exceeds_weight(int pkts, int total_len)
>> +{
>> + return unlikely(total_len >= VHOST_NET_WEIGHT) ||
>> + unlikely(pkts >= VHOST_NET_PKT_WEIGHT);
> I was going to say just one unlikely, but then the caller of this
> function also says unlikely(vhost_exceeds...), so I think you should
> just drop the unlikely statements here (both of them)
Ok.
>
>> +}
>> +
>> /* Expects to be always run from workqueue - which acts as
>> * read-size critical section for our kind of RCU. */
>> static void handle_tx(struct vhost_net *net)
>> @@ -570,7 +576,6 @@ static void handle_tx(struct vhost_net *net)
>> msg.msg_control = NULL;
>> ubufs = NULL;
>> }
>> -
> unrelated whitespace changes?
Yes.
Thanks
>
>> total_len += len;
>> if (total_len < VHOST_NET_WEIGHT &&
>> !vhost_vq_avail_empty(&net->dev, vq) &&
>> @@ -600,8 +605,7 @@ static void handle_tx(struct vhost_net *net)
>> else
>> vhost_zerocopy_signal_used(net, vq);
>> vhost_net_tx_packet(net);
>> - if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
>> - unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT)) {
>> + if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
>> vhost_poll_queue(&vq->poll);
>> break;
>> }
>> @@ -887,8 +891,7 @@ static void handle_rx(struct vhost_net *net)
>> if (unlikely(vq_log))
>> vhost_log_write(vq, vq_log, log, vhost_len);
>> total_len += vhost_len;
>> - if (unlikely(total_len >= VHOST_NET_WEIGHT) ||
>> - unlikely(++recv_pkts >= VHOST_NET_PKT_WEIGHT)) {
>> + if (unlikely(vhost_exceeds_weight(++recv_pkts, total_len))) {
>> vhost_poll_queue(&vq->poll);
>> goto out;
>> }
On 2018年05月22日 00:39, Jesse Brandeburg wrote:
> On Mon, 21 May 2018 17:04:24 +0800 Jason wrote:
>> Signed-off-by: Jason Wang <[email protected]>
>> ---
>> drivers/vhost/net.c | 12 +++++++++---
>> 1 file changed, 9 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index de544ee..4ebac76 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -485,6 +485,13 @@ static bool vhost_exceeds_weight(int pkts, int total_len)
>> unlikely(pkts >= VHOST_NET_PKT_WEIGHT);
>> }
>>
>> +static bool vhost_has_more_pkts(struct vhost_net *net,
>> + struct vhost_virtqueue *vq)
>> +{
>> + return !vhost_vq_avail_empty(&net->dev, vq) &&
>> + likely(!vhost_exceeds_maxpend(net));
> This really seems like mis-use of likely/unlikely, in the middle of a
> sequence of operations that will always be run when this function is
> called. I think you should remove the likely from this helper,
> especially, and control the branch from the branch point.
Yes, so I'm considering making it a macro in the next version.
>
>
>> +}
>> +
>> /* Expects to be always run from workqueue - which acts as
>> * read-size critical section for our kind of RCU. */
>> static void handle_tx(struct vhost_net *net)
>> @@ -578,8 +585,7 @@ static void handle_tx(struct vhost_net *net)
>> }
>> total_len += len;
>> if (total_len < VHOST_NET_WEIGHT &&
>> - !vhost_vq_avail_empty(&net->dev, vq) &&
>> - likely(!vhost_exceeds_maxpend(net))) {
>> + vhost_has_more_pkts(net, vq)) {
> Yes, I know it came from here, but likely/unlikely are for branch
> control, so they should encapsulate everything inside the if, unless
> I'm mistaken.
Ok.
>
>> msg.msg_flags |= MSG_MORE;
>> } else {
>> msg.msg_flags &= ~MSG_MORE;
>> @@ -605,7 +611,7 @@ static void handle_tx(struct vhost_net *net)
>> else
>> vhost_zerocopy_signal_used(net, vq);
>> vhost_net_tx_packet(net);
>> - if (unlikely(vhost_exceeds_weight(++sent_pkts, total_len))) {
>> + if (vhost_exceeds_weight(++sent_pkts, total_len)) {
> You should have kept the unlikely here, and not had it inside the
> helper (as per the previous patch. Also, why wasn't this change part
> of the previous patch?
Yes, will squash the above into previous one.
Thanks
>
>> vhost_poll_queue(&vq->poll);
>> break;
>> }
On 2018年05月22日 00:46, Jesse Brandeburg wrote:
> On Mon, 21 May 2018 17:04:25 +0800 Jason wrote:
>> Instead of mixing zerocopy and datacopy logics, this patch tries to
>> split datacopy logic out. This results for a more compact code and
>> specific optimization could be done on top more easily.
>>
>> Signed-off-by: Jason Wang <[email protected]>
>> ---
>> drivers/vhost/net.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++-----
>> 1 file changed, 102 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index 4ebac76..4682fcc 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -492,9 +492,95 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
>> likely(!vhost_exceeds_maxpend(net));
>> }
>>
>> +static void handle_tx_copy(struct vhost_net *net)
>> +{
>> + struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>> + struct vhost_virtqueue *vq = &nvq->vq;
>> + unsigned out, in;
> move "out" and "in" down to inside the for loop as well.
>
>> + int head;
> move this "head" declaration inside for-loop below.
Will do these in the next version. Basically this function was just copied
from handle_tx_zerocopy with the unnecessary parts removed, so I'm thinking of
an independent patch from the beginning to avoid changes in two places.
>
>> + struct msghdr msg = {
>> + .msg_name = NULL,
>> + .msg_namelen = 0,
>> + .msg_control = NULL,
>> + .msg_controllen = 0,
>> + .msg_flags = MSG_DONTWAIT,
>> + };
>> + size_t len, total_len = 0;
>> + int err;
>> + size_t hdr_size;
>> + struct socket *sock;
>> + struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
>> + int sent_pkts = 0;
> why do we need so many locals?
So it looks to me ubufs is unnecessary but all other is really needed.
>
>> +
>> + mutex_lock(&vq->mutex);
>> + sock = vq->private_data;
>> + if (!sock)
>> + goto out;
>> +
>> + if (!vq_iotlb_prefetch(vq))
>> + goto out;
>> +
>> + vhost_disable_notify(&net->dev, vq);
>> + vhost_net_disable_vq(net, vq);
>> +
>> + hdr_size = nvq->vhost_hlen;
>> +
>> + for (;;) {
>> + head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
>> + ARRAY_SIZE(vq->iov),
>> + &out, &in);
>> + /* On error, stop handling until the next kick. */
>> + if (unlikely(head < 0))
>> + break;
>> + /* Nothing new? Wait for eventfd to tell us they refilled. */
>> + if (head == vq->num) {
>> + if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>> + vhost_disable_notify(&net->dev, vq);
>> + continue;
>> + }
>> + break;
>> + }
>> + if (in) {
>> + vq_err(vq, "Unexpected descriptor format for TX: "
>> + "out %d, int %d\n", out, in);
> don't break strings, keep all the bits between " " together, even if it
> is longer than 80 chars.
Ok.
>
>> + break;
>> + }
>> +
>> + len = init_iov_iter(vq, &msg.msg_iter, hdr_size, out);
>> + if (len < 0)
>> + break;
> same comment as previous patch, len is a size_t which is unsigned.
Yes.
>
>> +
>> + total_len += len;
>> + if (total_len < VHOST_NET_WEIGHT &&
>> + vhost_has_more_pkts(net, vq)) {
>> + msg.msg_flags |= MSG_MORE;
>> + } else {
>> + msg.msg_flags &= ~MSG_MORE;
>> + }
> don't need { } here.
Right.
>> +
>> + /* TODO: Check specific error and bomb out unless ENOBUFS? */
>> + err = sock->ops->sendmsg(sock, &msg, len);
>> + if (unlikely(err < 0)) {
>> + vhost_discard_vq_desc(vq, 1);
>> + vhost_net_enable_vq(net, vq);
>> + break;
>> + }
>> + if (err != len)
>> + pr_debug("Truncated TX packet: "
>> + " len %d != %zd\n", err, len);
> single line " " string please
>
Will fix.
Thanks
On 2018年05月22日 00:56, Jesse Brandeburg wrote:
> On Mon, 21 May 2018 17:04:31 +0800 Jason wrote:
>> This patch implement build XDP buffers in vhost_net. The idea is do
>> userspace copy in vhost_net and build XDP buff based on the
>> page. Vhost_net can then submit one or an array of XDP buffs to
>> underlayer socket (e.g TUN). TUN can choose to do XDP or call
>> build_skb() to build skb. To support build skb, vnet header were also
>> stored into the header of the XDP buff.
>>
>> This userspace copy and XDP buffs building is key to achieve XDP
>> batching in TUN, since TUN does not need to care about userspace copy
>> and then can disable preemption for several XDP buffs to achieve
>> batching from XDP.
>>
>> TODO: reserve headroom based on the TUN XDP.
>>
>> Signed-off-by: Jason Wang <[email protected]>
>> ---
>> drivers/vhost/net.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 74 insertions(+)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index f0639d7..1209e84 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -492,6 +492,80 @@ static bool vhost_has_more_pkts(struct vhost_net *net,
>> likely(!vhost_exceeds_maxpend(net));
>> }
>>
>> +#define VHOST_NET_HEADROOM 256
>> +#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
>> +
>> +static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
>> + struct iov_iter *from,
>> + struct xdp_buff *xdp)
>> +{
>> + struct vhost_virtqueue *vq = &nvq->vq;
>> + struct page_frag *alloc_frag = &current->task_frag;
>> + struct virtio_net_hdr *gso;
>> + size_t len = iov_iter_count(from);
>> + int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> + int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + VHOST_NET_HEADROOM
>> + + nvq->sock_hlen);
>> + int sock_hlen = nvq->sock_hlen;
>> + void *buf;
>> + int copied;
>> +
>> + if (len < nvq->sock_hlen)
>> + return -EFAULT;
>> +
>> + if (SKB_DATA_ALIGN(len + pad) +
>> + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
>> + return -ENOSPC;
>> +
>> + buflen += SKB_DATA_ALIGN(len + pad);
> maybe store the result of SKB_DATA_ALIGN in a local instead of doing
> the work twice?
Ok.
>
>> + alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
>> + if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
>> + return -ENOMEM;
>> +
>> + buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>> +
>> + /* We store two kinds of metadata in the header which will be
>> + * used for XDP_PASS to do build_skb():
>> + * offset 0: buflen
>> + * offset sizeof(int): vnet header
>> + */
>> + copied = copy_page_from_iter(alloc_frag->page,
>> + alloc_frag->offset + sizeof(int), sock_hlen, from);
>> + if (copied != sock_hlen)
>> + return -EFAULT;
>> +
>> + gso = (struct virtio_net_hdr *)(buf + sizeof(int));
>> +
>> + if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
>> + vhost16_to_cpu(vq, gso->csum_start) +
>> + vhost16_to_cpu(vq, gso->csum_offset) + 2 >
>> + vhost16_to_cpu(vq, gso->hdr_len)) {
>> + gso->hdr_len = cpu_to_vhost16(vq,
>> + vhost16_to_cpu(vq, gso->csum_start) +
>> + vhost16_to_cpu(vq, gso->csum_offset) + 2);
>> +
>> + if (vhost16_to_cpu(vq, gso->hdr_len) > len)
>> + return -EINVAL;
>> + }
>> +
>> + len -= sock_hlen;
>> + copied = copy_page_from_iter(alloc_frag->page,
>> + alloc_frag->offset + pad,
>> + len, from);
>> + if (copied != len)
>> + return -EFAULT;
>> +
>> + xdp->data_hard_start = buf;
>> + xdp->data = buf + pad;
>> + xdp->data_end = xdp->data + len;
>> + *(int *)(xdp->data_hard_start)= buflen;
> space before =
Yes.
Thanks
>
>> +
>> + get_page(alloc_frag->page);
>> + alloc_frag->offset += buflen;
>> +
>> + return 0;
>> +}
>> +
>> static void handle_tx_copy(struct vhost_net *net)
>> {
>> struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
On Mon, May 21, 2018 at 05:04:21PM +0800, Jason Wang wrote:
> Hi all:
>
> We do not support XDP batching for TUN since it can only receive one
> packet at a time from vhost_net. This series tries to remove this
> limitation by:
>
> - introduce a TUN specific msg_control that can hold a pointer to an
> array of XDP buffs
> - try copy and build XDP buff in vhost_net
> - store XDP buffs in an array and submit them once for every N packets
> from vhost_net
> - since TUN can only do native XDP for datacopy packet, to simplify
> the logic, split datacopy out logic and only do batching for
> datacopy.
I like how this rework looks. Pls go ahead and repost as
non-RFC.
> With this series, TX PPS can improve about 34% from 2.9Mpps to
> 3.9Mpps when doing xdp_redirect_map between TAP and ixgbe.
>
> Thanks
>
> Jason Wang (12):
> vhost_net: introduce helper to initialize tx iov iter
> vhost_net: introduce vhost_exceeds_weight()
> vhost_net: introduce vhost_has_more_pkts()
> vhost_net: split out datacopy logic
> vhost_net: batch update used ring for datacopy TX
> tuntap: enable preemption early
> tuntap: simplify error handling in tun_build_skb()
> tuntap: tweak on the path of non-xdp case in tun_build_skb()
> tuntap: split out XDP logic
> vhost_net: build xdp buff
> vhost_net: passing raw xdp buff to tun
> vhost_net: batch submitting XDP buffers to underlayer sockets
>
> drivers/net/tun.c | 226 +++++++++++++++++++++++++++----------
> drivers/vhost/net.c | 297 ++++++++++++++++++++++++++++++++++++++++++++-----
> include/linux/if_tun.h | 7 ++
> 3 files changed, 444 insertions(+), 86 deletions(-)
>
> --
> 2.7.4