This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
which can significantly reduce CPU utilization for XDP programs.
Since commit fc72d1d54dd9 ("tuntap: XDP transmission"), the pointer
ring has been utilized to queue different types of pointers by encoding
the type into the lower bits. Therefore, we introduce a new flag,
TUN_XDP_DESC_FLAG(0x2UL), which allows us to enqueue XDP descriptors
and differentiate them from XDP buffers and sk_buffs. Additionally, a
spin lock is added for enabling and disabling operations on the xsk pool.
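
As an illustrative sketch of the consumer side (not part of the patch;
handle_xdp_frame()/handle_xdp_desc()/handle_skb() are hypothetical
placeholders, the real consumers being tun_do_read() and vhost-net), each
pointer popped from the ptr_ring is classified by its low bits before use:

	void *ptr = ptr_ring_consume(&tfile->tx_ring);

	if (tun_is_xdp_frame(ptr))		/* bit 0x1: XDP frame */
		handle_xdp_frame(tun_ptr_to_xdp(ptr));
	else if (tun_is_xdp_desc_frame(ptr))	/* bit 0x2: AF_XDP Tx desc (new) */
		handle_xdp_desc(tun_ptr_to_xdp_desc(ptr));
	else					/* untagged: sk_buff */
		handle_skb((struct sk_buff *)ptr);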
The performance testing was performed on an Intel E5-2620 2.40GHz machine.
Traffic was generated and sent through TUN (testpmd txonly with AF_XDP)
to a VM (testpmd rxonly in the guest).
+-------+---------+-----------+---------+
| size  |  copy   | zero-copy | speedup |
| (UDP) | (Mpps)  |  (Mpps)   |         |
+-------+---------+-----------+---------+
|   64  |   2.5   |    4.0    |   60%   |
|  512  |   2.1   |    3.6    |   71%   |
|  1024 |   1.9   |    3.3    |   73%   |
+-------+---------+-----------+---------+
Signed-off-by: Yunjian Wang <[email protected]>
---
drivers/net/tun.c | 177 +++++++++++++++++++++++++++++++++++++++--
drivers/vhost/net.c | 4 +
include/linux/if_tun.h | 32 ++++++++
3 files changed, 208 insertions(+), 5 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bc80fc1d576e..7f4ff50b532c 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -63,6 +63,7 @@
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <net/xdp.h>
+#include <net/xdp_sock_drv.h>
#include <net/ip_tunnels.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
@@ -86,6 +87,7 @@ static void tun_default_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *cmd);
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+#define TUN_XDP_BATCH 64
/* TUN device flags */
@@ -146,6 +148,9 @@ struct tun_file {
struct tun_struct *detached;
struct ptr_ring tx_ring;
struct xdp_rxq_info xdp_rxq;
+ struct xsk_buff_pool *xsk_pool;
+ spinlock_t pool_lock; /* Protects xsk pool enable/disable */
+ u32 nb_descs;
};
struct tun_page {
@@ -614,6 +619,8 @@ void tun_ptr_free(void *ptr)
struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
xdp_return_frame(xdpf);
+ } else if (tun_is_xdp_desc_frame(ptr)) {
+ return;
} else {
__skb_array_destroy_skb(ptr);
}
@@ -631,6 +638,37 @@ static void tun_queue_purge(struct tun_file *tfile)
skb_queue_purge(&tfile->sk.sk_error_queue);
}
+static void tun_set_xsk_pool(struct tun_file *tfile, struct xsk_buff_pool *pool)
+{
+ if (!pool)
+ return;
+
+ spin_lock(&tfile->pool_lock);
+ xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
+ tfile->xsk_pool = pool;
+ spin_unlock(&tfile->pool_lock);
+}
+
+static void tun_clean_xsk_pool(struct tun_file *tfile)
+{
+ spin_lock(&tfile->pool_lock);
+ if (tfile->xsk_pool) {
+ void *ptr;
+
+ while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
+ tun_ptr_free(ptr);
+
+ if (tfile->nb_descs) {
+ xsk_tx_completed(tfile->xsk_pool, tfile->nb_descs);
+ if (xsk_uses_need_wakeup(tfile->xsk_pool))
+ xsk_set_tx_need_wakeup(tfile->xsk_pool);
+ tfile->nb_descs = 0;
+ }
+ tfile->xsk_pool = NULL;
+ }
+ spin_unlock(&tfile->pool_lock);
+}
+
static void __tun_detach(struct tun_file *tfile, bool clean)
{
struct tun_file *ntfile;
@@ -648,6 +686,11 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
u16 index = tfile->queue_index;
BUG_ON(index >= tun->numqueues);
+ ntfile = rtnl_dereference(tun->tfiles[tun->numqueues - 1]);
+ /* Stop xsk zc xmit */
+ tun_clean_xsk_pool(tfile);
+ tun_clean_xsk_pool(ntfile);
+
rcu_assign_pointer(tun->tfiles[index],
tun->tfiles[tun->numqueues - 1]);
ntfile = rtnl_dereference(tun->tfiles[index]);
@@ -668,6 +711,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
tun_flow_delete_by_queue(tun, tun->numqueues + 1);
/* Drop read queue */
tun_queue_purge(tfile);
+ tun_set_xsk_pool(ntfile, xsk_get_pool_from_qid(tun->dev, index));
tun_set_real_num_queues(tun);
} else if (tfile->detached && clean) {
tun = tun_enable_queue(tfile);
@@ -801,6 +845,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
if (tfile->xdp_rxq.queue_index != tfile->queue_index)
tfile->xdp_rxq.queue_index = tfile->queue_index;
+ tun_set_xsk_pool(tfile, xsk_get_pool_from_qid(dev, tfile->queue_index));
} else {
/* Setup XDP RX-queue info, for new tfile getting attached */
err = xdp_rxq_info_reg(&tfile->xdp_rxq,
@@ -1221,11 +1266,50 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
return 0;
}
+static int tun_xsk_pool_enable(struct net_device *netdev,
+ struct xsk_buff_pool *pool,
+ u16 qid)
+{
+ struct tun_struct *tun = netdev_priv(netdev);
+ struct tun_file *tfile;
+
+ if (qid >= tun->numqueues)
+ return -EINVAL;
+
+ tfile = rtnl_dereference(tun->tfiles[qid]);
+ tun_set_xsk_pool(tfile, pool);
+
+ return 0;
+}
+
+static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
+{
+ struct tun_struct *tun = netdev_priv(netdev);
+ struct tun_file *tfile;
+
+ if (qid >= MAX_TAP_QUEUES)
+ return -EINVAL;
+
+ tfile = rtnl_dereference(tun->tfiles[qid]);
+ if (tfile)
+ tun_clean_xsk_pool(tfile);
+ return 0;
+}
+
+static int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
+ u16 qid)
+{
+ return pool ? tun_xsk_pool_enable(dev, pool, qid) :
+ tun_xsk_pool_disable(dev, qid);
+}
+
static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
switch (xdp->command) {
case XDP_SETUP_PROG:
return tun_xdp_set(dev, xdp->prog, xdp->extack);
+ case XDP_SETUP_XSK_POOL:
+ return tun_xsk_pool_setup(dev, xdp->xsk.pool, xdp->xsk.queue_id);
default:
return -EINVAL;
}
@@ -1330,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
return nxmit;
}
+static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ struct tun_file *tfile;
+
+ rcu_read_lock();
+ tfile = rcu_dereference(tun->tfiles[qid]);
+ if (tfile)
+ __tun_xdp_flush_tfile(tfile);
+ rcu_read_unlock();
+ return 0;
+}
+
static const struct net_device_ops tap_netdev_ops = {
.ndo_init = tun_net_init,
.ndo_uninit = tun_net_uninit,
@@ -1346,6 +1443,7 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_get_stats64 = dev_get_tstats64,
.ndo_bpf = tun_xdp,
.ndo_xdp_xmit = tun_xdp_xmit,
+ .ndo_xsk_wakeup = tun_xsk_wakeup,
.ndo_change_carrier = tun_net_change_carrier,
};
@@ -1403,7 +1501,8 @@ static void tun_net_initialize(struct net_device *dev)
/* Currently tun does not support XDP, only tap does. */
dev->xdp_features = NETDEV_XDP_ACT_BASIC |
NETDEV_XDP_ACT_REDIRECT |
- NETDEV_XDP_ACT_NDO_XMIT;
+ NETDEV_XDP_ACT_NDO_XMIT |
+ NETDEV_XDP_ACT_XSK_ZEROCOPY;
break;
}
@@ -2058,11 +2157,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t tun_put_user_xdp(struct tun_struct *tun,
struct tun_file *tfile,
- struct xdp_frame *xdp_frame,
+ void *addr,
+ size_t size,
struct iov_iter *iter)
{
int vnet_hdr_sz = 0;
- size_t size = xdp_frame->len;
size_t ret;
if (tun->flags & IFF_VNET_HDR) {
@@ -2077,7 +2176,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
}
- ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
+ ret = copy_to_iter(addr, size, iter) + vnet_hdr_sz;
preempt_disable();
dev_sw_netstats_tx_add(tun->dev, 1, ret);
@@ -2240,8 +2339,20 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
if (tun_is_xdp_frame(ptr)) {
struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
- ret = tun_put_user_xdp(tun, tfile, xdpf, to);
+ ret = tun_put_user_xdp(tun, tfile, xdpf->data, xdpf->len, to);
xdp_return_frame(xdpf);
+ } else if (tun_is_xdp_desc_frame(ptr)) {
+ struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+ void *data;
+
+ spin_lock(&tfile->pool_lock);
+ if (tfile->xsk_pool) {
+ data = xsk_buff_raw_get_data(tfile->xsk_pool, desc->addr);
+ ret = tun_put_user_xdp(tun, tfile, data, desc->len, to);
+ } else {
+ ret = 0;
+ }
+ spin_unlock(&tfile->pool_lock);
} else {
struct sk_buff *skb = ptr;
@@ -2654,6 +2765,10 @@ static int tun_ptr_peek_len(void *ptr)
struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
return xdpf->len;
+ } else if (tun_is_xdp_desc_frame(ptr)) {
+ struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+
+ return desc->len;
}
return __skb_array_len_with_tag(ptr);
} else {
@@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
}
}
+static void tun_peek_xsk(struct tun_file *tfile)
+{
+ struct xsk_buff_pool *pool;
+ u32 i, batch, budget;
+ void *frame;
+
+ if (!ptr_ring_empty(&tfile->tx_ring))
+ return;
+
+ spin_lock(&tfile->pool_lock);
+ pool = tfile->xsk_pool;
+ if (!pool) {
+ spin_unlock(&tfile->pool_lock);
+ return;
+ }
+
+ if (tfile->nb_descs) {
+ xsk_tx_completed(pool, tfile->nb_descs);
+ if (xsk_uses_need_wakeup(pool))
+ xsk_set_tx_need_wakeup(pool);
+ }
+
+ spin_lock(&tfile->tx_ring.producer_lock);
+ budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
+
+ batch = xsk_tx_peek_release_desc_batch(pool, budget);
+ if (!batch) {
+ tfile->nb_descs = 0;
+ spin_unlock(&tfile->tx_ring.producer_lock);
+ spin_unlock(&tfile->pool_lock);
+ return;
+ }
+
+ tfile->nb_descs = batch;
+ for (i = 0; i < batch; i++) {
+ /* Encode the XDP DESC flag into the pointer's low bits so the
+ * consumer can distinguish an XDP desc from an XDP frame and an
+ * sk_buff.
+ */
+ frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
+ /* The budget must be less than or equal to tx_ring.size,
+ * so enqueuing will not fail.
+ */
+ __ptr_ring_produce(&tfile->tx_ring, frame);
+ }
+ spin_unlock(&tfile->tx_ring.producer_lock);
+ spin_unlock(&tfile->pool_lock);
+}
+
static int tun_peek_len(struct socket *sock)
{
struct tun_file *tfile = container_of(sock, struct tun_file, socket);
@@ -2671,6 +2834,9 @@ static int tun_peek_len(struct socket *sock)
if (!tun)
return 0;
+ if (sock_flag(&tfile->sk, SOCK_XDP))
+ tun_peek_xsk(tfile);
+
ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
tun_put(tun);
@@ -3473,6 +3639,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
}
mutex_init(&tfile->napi_mutex);
+ spin_lock_init(&tfile->pool_lock);
RCU_INIT_POINTER(tfile->tun, NULL);
tfile->flags = 0;
tfile->ifindex = 0;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 077e74421558..eb83764be26c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -202,6 +202,10 @@ static int vhost_net_buf_peek_len(void *ptr)
struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
return xdpf->len;
+ } else if (tun_is_xdp_desc_frame(ptr)) {
+ struct xdp_desc *desc = tun_ptr_to_xdp_desc(ptr);
+
+ return desc->len;
}
return __skb_array_len_with_tag(ptr);
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 043d442994b0..4142453b5e52 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -6,10 +6,12 @@
#ifndef __IF_TUN_H
#define __IF_TUN_H
+#include <uapi/linux/if_xdp.h>
#include <uapi/linux/if_tun.h>
#include <uapi/linux/virtio_net.h>
#define TUN_XDP_FLAG 0x1UL
+#define TUN_XDP_DESC_FLAG 0x2UL
#define TUN_MSG_UBUF 1
#define TUN_MSG_PTR 2
@@ -43,6 +45,21 @@ static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
}
+static inline bool tun_is_xdp_desc_frame(void *ptr)
+{
+ return (unsigned long)ptr & TUN_XDP_DESC_FLAG;
+}
+
+static inline void *tun_xdp_desc_to_ptr(struct xdp_desc *desc)
+{
+ return (void *)((unsigned long)desc | TUN_XDP_DESC_FLAG);
+}
+
+static inline struct xdp_desc *tun_ptr_to_xdp_desc(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~TUN_XDP_DESC_FLAG);
+}
+
void tun_ptr_free(void *ptr);
#else
#include <linux/err.h>
@@ -75,6 +92,21 @@ static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
return NULL;
}
+static inline bool tun_is_xdp_desc_frame(void *ptr)
+{
+ return false;
+}
+
+static inline void *tun_xdp_desc_to_ptr(struct xdp_desc *desc)
+{
+ return NULL;
+}
+
+static inline struct xdp_desc *tun_ptr_to_xdp_desc(void *ptr)
+{
+ return NULL;
+}
+
static inline void tun_ptr_free(void *ptr)
{
}
--
2.41.0
On Fri, Mar 01, 2024 at 11:45:52AM +0000, wangyunjian wrote:
> On Thu, Feb 29, 2024 at 7:13 PM, Paolo Abeni <[email protected]> wrote:
> >
> > On Wed, 2024-02-28 at 19:05 +0800, Yunjian Wang wrote:
> > > @@ -2661,6 +2776,54 @@ static int tun_ptr_peek_len(void *ptr)
> > > }
> > > }
> > >
> > > +static void tun_peek_xsk(struct tun_file *tfile) {
> > > + struct xsk_buff_pool *pool;
> > > + u32 i, batch, budget;
> > > + void *frame;
> > > +
> > > + if (!ptr_ring_empty(&tfile->tx_ring))
> > > + return;
> > > +
> > > + spin_lock(&tfile->pool_lock);
> > > + pool = tfile->xsk_pool;
> > > + if (!pool) {
> > > + spin_unlock(&tfile->pool_lock);
> > > + return;
> > > + }
> > > +
> > > + if (tfile->nb_descs) {
> > > + xsk_tx_completed(pool, tfile->nb_descs);
> > > + if (xsk_uses_need_wakeup(pool))
> > > + xsk_set_tx_need_wakeup(pool);
> > > + }
> > > +
> > > + spin_lock(&tfile->tx_ring.producer_lock);
> > > + budget = min_t(u32, tfile->tx_ring.size, TUN_XDP_BATCH);
> > > +
> > > + batch = xsk_tx_peek_release_desc_batch(pool, budget);
> > > + if (!batch) {
> >
> > This branch looks like an unneeded "optimization". The generic loop below
> > should have the same effect with no measurable perf delta - and smaller code.
> > Just remove this.
> >
> > > + tfile->nb_descs = 0;
> > > + spin_unlock(&tfile->tx_ring.producer_lock);
> > > + spin_unlock(&tfile->pool_lock);
> > > + return;
> > > + }
> > > +
> > > + tfile->nb_descs = batch;
> > > + for (i = 0; i < batch; i++) {
> > > + /* Encode the XDP DESC flag into lowest bit for consumer to differ
> > > + * XDP desc from XDP buffer and sk_buff.
> > > + */
> > > + frame = tun_xdp_desc_to_ptr(&pool->tx_descs[i]);
> > > + /* The budget must be less than or equal to tx_ring.size,
> > > + * so enqueuing will not fail.
> > > + */
> > > + __ptr_ring_produce(&tfile->tx_ring, frame);
> > > + }
> > > + spin_unlock(&tfile->tx_ring.producer_lock);
> > > + spin_unlock(&tfile->pool_lock);
> >
> > More related to the general design: it looks wrong. What if
> > get_rx_bufs() fails (ENOBUF) after a successful peek? With no more
> > incoming packets, a later peek will return 0, and it looks like the
> > half-processed packets will stay in the ring forever???
> >
> > I think the 'ring produce' part should be moved into tun_do_read().
>
> Currently, vhost-net obtains a batch of descriptors/sk_buffs from the
> ptr_ring and enqueues them to the virtqueue's queue, and then consumes
> the descriptors/sk_buffs from the virtqueue's queue in sequence. As a
> result, TUN does not know whether the batch of descriptors has been
> used up, and thus does not know when to return the batch descriptors.
>
> So, I think it's reasonable that when vhost-net finds the ptr_ring
> empty, it calls peek_len to get new xsk descs and return the descriptors.
>
> Thanks
What you need to think about is that if you peek, another call
in parallel can get the same value at the same time.
On Wed, Feb 28, 2024 at 07:05:56PM +0800, Yunjian Wang wrote:
> This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> which can significantly reduce CPU utilization for XDP programs.
Why no Rx ZC support though? What will happen if I try rxdrop xdpsock
against tun with this patch? You clearly allow for that.
Maciej Fijalkowski wrote:
> On Wed, Feb 28, 2024 at 07:05:56PM +0800, Yunjian Wang wrote:
> > This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> > which can significantly reduce CPU utilization for XDP programs.
>
> Why no Rx ZC support though? What will happen if I try rxdrop xdpsock
> against tun with this patch? You clearly allow for that.
This is AF_XDP receive zerocopy, right?
The naming is always confusing with tun, but even though from a tun
PoV this happens on ndo_start_xmit, it is the AF_XDP equivalent to
tun_put_user.
So the implementation is more like other devices' Rx ZC.

I would have preferred that name, but I think Jason asked for this,
and given tun's weird status, there is something to be said for either.
On Wed, Feb 28, 2024 at 7:06 PM Yunjian Wang <[email protected]> wrote:
> [...]
> +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> +{
> + struct tun_struct *tun = netdev_priv(dev);
> + struct tun_file *tfile;
> +
> + rcu_read_lock();
> + tfile = rcu_dereference(tun->tfiles[qid]);
> + if (tfile)
> + __tun_xdp_flush_tfile(tfile);
> + rcu_read_unlock();
> + return 0;
> +}
I may be missing something, but why not simply queue xdp frames into the
ptr_ring? Then we don't need tricks for peek.
Thanks
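
A rough sketch of that alternative (hypothetical, not code from the patch:
tun_xsk_desc_to_frame() is an invented helper, and the copy mirrors what
xdp_convert_zc_to_xdp_frame() does for XSK memory, so it gives up the
zero-copy win in exchange for a ptr_ring whose consumers stay unchanged):

	static int tun_xsk_desc_to_frame(struct tun_file *tfile,
					 struct xsk_buff_pool *pool,
					 struct xdp_desc *desc)
	{
		void *data = xsk_buff_raw_get_data(pool, desc->addr);
		struct xdp_frame *xdpf;
		struct page *page;

		page = dev_alloc_page();
		if (!page)
			return -ENOMEM;

		/* Lay the frame metadata at the start of the page and the
		 * payload right behind it, then copy the data out of the
		 * umem so the descriptor can be completed immediately.
		 */
		xdpf = page_address(page);
		xdpf->data = (void *)xdpf + sizeof(*xdpf);
		memcpy(xdpf->data, data, desc->len);
		xdpf->len = desc->len;
		xdpf->headroom = 0;
		xdpf->metasize = 0;
		xdpf->frame_sz = PAGE_SIZE;
		xdpf->flags = 0;
		xdpf->mem = (struct xdp_mem_info){ .type = MEM_TYPE_PAGE_ORDER0 };

		/* Queue it with the existing TUN_XDP_FLAG encoding, so
		 * tun_ptr_peek_len()/tun_do_read() need no new case.
		 */
		if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(xdpf))) {
			put_page(page);
			return -ENOSPC;
		}
		return 0;
	}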
On Fri, Mar 1, 2024 at 7:53 PM, Michael S. Tsirkin <[email protected]> wrote:
>
> On Fri, Mar 01, 2024 at 11:45:52AM +0000, wangyunjian wrote:
> > [...]
>
> What you need to think about is that if you peek, another call in parallel can get
> the same value at the same time.

Thank you. I have identified a problem. The tx_descs array is allocated
within xsk's pool. When the xsk is freed, the pool and tx_descs are also
freed. However, some descs may remain in the virtqueue's queue, which could
lead to a use-after-free scenario. Currently, I do not have a good way to
solve this concurrency problem and believe this scenario may not be
appropriate for reusing the ptr_ring.

Thanks
On Mon, Mar 4, 2024 at 2:56 PM, Jason Wang <[email protected]> wrote:
>
> On Wed, Feb 28, 2024 at 7:06 PM Yunjian Wang <[email protected]>
> wrote:
> > [...]
>
> I may be missing something, but why not simply queue xdp frames into the
> ptr_ring? Then we don't need tricks for peek.
The current implementation follows other NICs' drivers (ixgbe, ice, dpaa2,
mlx5), which use XDP descriptors directly. Converting XDP descriptors to
XDP frames does not seem very beneficial.

Thanks
On Mon, Mar 4, 2024 at 7:24 PM wangyunjian <[email protected]> wrote:
>
> On Mon, Mar 4, 2024 at 2:56 PM, Jason Wang <[email protected]> wrote:
> >
> > On Wed, Feb 28, 2024 at 7:06 PM Yunjian Wang <[email protected]>
> > wrote:
> > > [...]
> >
> > I may be missing something, but why not simply queue xdp frames into the
> > ptr_ring? Then we don't need tricks for peek.
>
> The current implementation follows other NICs' drivers (ixgbe, ice,
> dpaa2, mlx5), which use XDP descriptors directly.
Well, they all do the same thing, which is to translate XDP descriptors
into the vendor-specific descriptor format.

For TUN, the ptr_ring is the "vendor specific" format.
> Converting XDP
> descriptors to XDP frames does not seem to be very beneficial.
It gets rid of the trick for peek that is done in this patch.
Thanks
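
A peek-free variant could then drain the xsk Tx ring from the wakeup path
instead, roughly like this (again hypothetical, reusing the invented
tun_xsk_desc_to_frame() helper sketched earlier in the thread; locking
against pool disable is elided):

	static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
	{
		struct tun_struct *tun = netdev_priv(dev);
		struct xsk_buff_pool *pool;
		struct tun_file *tfile;
		u32 i, batch;

		rcu_read_lock();
		tfile = rcu_dereference(tun->tfiles[qid]);
		pool = tfile ? tfile->xsk_pool : NULL;
		if (!pool) {
			rcu_read_unlock();
			return 0;
		}

		/* Convert up to TUN_XDP_BATCH descriptors into xdp_frames.
		 * The payload is copied out, so the descriptors can be
		 * completed right here and tun_peek_len()/tun_do_read()
		 * stay free of side effects.
		 */
		batch = xsk_tx_peek_release_desc_batch(pool, TUN_XDP_BATCH);
		for (i = 0; i < batch; i++)
			tun_xsk_desc_to_frame(tfile, pool, &pool->tx_descs[i]);
		xsk_tx_completed(pool, batch);

		__tun_xdp_flush_tfile(tfile);	/* wake the reader side */
		rcu_read_unlock();
		return 0;
	}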
On Sat, Mar 2, 2024 at 2:40 AM Willem de Bruijn
<[email protected]> wrote:
>
> Maciej Fijalkowski wrote:
> > On Wed, Feb 28, 2024 at 07:05:56PM +0800, Yunjian Wang wrote:
> > > This patch set allows TUN to support the AF_XDP Tx zero-copy feature,
> > > which can significantly reduce CPU utilization for XDP programs.
> >
> > Why no Rx ZC support though? What will happen if I try rxdrop xdpsock
> > against tun with this patch? You clearly allow for that.
>
> This is AF_XDP receive zerocopy, right?
>
> The naming is always confusing with tun, but even though from a tun
> PoV this happens on ndo_start_xmit, it is the AF_XDP equivalent to
> tun_put_user.
>
> So the implementation is more like other devices' Rx ZC.
>
> I would have preferred that name, but I think Jason asked for this,
> and given tun's weird status, there is something to be said for either.
>
From the view of the AF_XDP userspace program, it's the TX path, and as
you said it happens on the TUN xmit path as well. When used with a VM,
it's the RX path.

So TX seems better.

Thanks
On Mon, Mar 4, 2024 at 9:45 PM wangyunjian <[email protected]> wrote:
>
> On Fri, Mar 1, 2024 at 7:53 PM, Michael S. Tsirkin <[email protected]> wrote:
> >
> > On Fri, Mar 01, 2024 at 11:45:52AM +0000, wangyunjian wrote:
> > > [...]
> >
> > What you need to think about is that if you peek, another call in parallel can get
> > the same value at the same time.
>
> Thank you. I have identified a problem. The tx_descs array is allocated within xsk's pool.
> When the xsk is freed, the pool and tx_descs are also freed. However, some descs may
> remain in the virtqueue's queue, which could lead to a use-after-free scenario.
This can probably be solved by signaling vhost_net to drop those
descriptors when the xsk pool is disabled.

Thanks
> Currently, I do not have a good way to solve this concurrency problem and
> believe this scenario may not be appropriate for reusing the ptr_ring.
>
> Thanks
On Mon, Mar 11, 2024 at 9:28 PM wangyunjian <[email protected]> wrote:
>
> On Mon, Mar 11, 2024 at 12:01 PM, Jason Wang <[email protected]> wrote:
> >
> > On Mon, Mar 4, 2024 at 9:45 PM wangyunjian <[email protected]>
> > wrote:
> > > [...]
> > > > What you need to think about is that if you peek, another call in
> > > > parallel can get the same value at the same time.
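
To spell out the race Michael is pointing at (an illustrative timeline,
not real code): peek does not claim an entry, only consume does, so two
parallel peekers observe the same descriptor unless an outer lock
serializes them.

	CPU0: ptr = __ptr_ring_peek(ring);	/* sees entry X */
	CPU1: ptr = __ptr_ring_peek(ring);	/* also sees entry X */
	CPU0: acts on X
	CPU1: acts on the same X
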
> > >
> > > Thank you. I have identified a problem. The tx_descs array was
> > > created within xsk's pool. When the xsk is freed, the pool and
> > > tx_descs are also freed. However, some descs may remain in the
> > > virtqueue's queue, which could lead to a use-after-free scenario.
> >
> > This can probably be solved by signaling vhost_net to drop those
> > descriptors when the xsk pool is disabled.
>
> I think TUN can notify vhost_net to drop these descriptors through netdev events.
Great, actually, the "issue" described above exists in this patch as
well. For example, you did:

	spin_lock(&tfile->pool_lock);
	if (tfile->pool) {
		ret = tun_put_user_desc(tun, tfile,
					&tfile->desc, to);

You did copy_to_user() under a spinlock, which is actually a bug.
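
The usual shape of a fix is to snapshot whatever the lock protects and do
the faultable copy afterwards. A hedged sketch using the names quoted
above (tun_put_user_desc() and tfile->desc come from the v2 patch; the
signature and error value are assumptions). Note this only cures the
sleep-under-spinlock problem, not the descriptor lifetime issue discussed
below.

static ssize_t tun_read_desc_sketch(struct tun_struct *tun,
				    struct tun_file *tfile,
				    struct iov_iter *to)
{
	struct xdp_desc desc;
	bool have_desc = false;

	spin_lock(&tfile->pool_lock);
	if (tfile->xsk_pool) {
		desc = tfile->desc;	/* snapshot while the pool is pinned */
		have_desc = true;
	}
	spin_unlock(&tfile->pool_lock);

	if (!have_desc)
		return -EAGAIN;

	/* The faultable copy now happens with the spinlock released. */
	return tun_put_user_desc(tun, tfile, &desc, to);
}
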
> However, there is a potential concurrency problem. When handling netdev
> events and packets, vhost_net contends for the 'vq->mutex' lock, leading
> to unstable performance.
I think we don't need to care about the perf in this case.
And we gain a lot:
1) no tricks in the peek path
2) batching support
...
Thanks
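
A hedged sketch of the notifier shape the netdev-event idea would take on
the vhost_net side. Everything specific here is assumed: the nb field in
struct vhost_net, the vhost_net_backend_dev() and vhost_net_flush_xdp_descs()
helpers, and the choice of NETDEV_UNREGISTER as the trigger are all
hypothetical placeholders, since the thread leaves the exact event open.

static int vhost_net_netdev_event_sketch(struct notifier_block *nb,
					 unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct vhost_net *net = container_of(nb, struct vhost_net, nb);

	/* NETDEV_UNREGISTER stands in for whatever event TUN would raise
	 * when the xsk pool is torn down.
	 */
	if (event != NETDEV_UNREGISTER || dev != vhost_net_backend_dev(net))
		return NOTIFY_DONE;

	/* Drop every xsk descriptor still queued in the virtqueue so no
	 * desc can outlive the pool's tx_descs array.
	 */
	vhost_net_flush_xdp_descs(net);
	return NOTIFY_OK;
}
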
>
> Thanks
> >
> > Thanks
> >
> > > Currently, I do not have an idea for solving this concurrency
> > > problem, and I believe this scenario may not be appropriate for
> > > reusing the ptr_ring.
> > >
> > > Thanks
> > >
> > > >
> > > >
> > > > > >
> > > > > > Cheers,
> > > > > >
> > > > > > Paolo
> > > > >
> > >