2023-08-03 16:24:07

by 黄杰

[permalink] [raw]
Subject: [RFC Optimizing veth xsk performance 05/10] veth: use send queue tx napi to xmit xsk tx desc

Signed-off-by: huangjie.albert <[email protected]>
---
drivers/net/veth.c | 265 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 264 insertions(+), 1 deletion(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 63c3ebe4c5d0..944761807ca4 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -27,6 +27,8 @@
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>
#include <net/page_pool.h>
+#include <net/xdp_sock_drv.h>
+#include <net/xdp.h>

#define DRV_NAME "veth"
#define DRV_VERSION "1.0"
@@ -1061,6 +1063,176 @@ static int veth_poll(struct napi_struct *napi, int budget)
return done;
}

+/*
+ * veth_xsk_tx_xmit - move descriptors from an AF_XDP TX ring to the peer
+ * @sq: send queue whose TX NAPI is being polled
+ * @xsk_pool: buffer pool currently bound to @sq
+ * @budget: maximum number of descriptors to consume in this call
+ *
+ * Every descriptor payload is copied into a freshly allocated page that is
+ * laid out as a struct xdp_frame followed by the packet data, and the frame
+ * is handed to the peer receive queue that has the same index as @sq.
+ * Descriptors whose payload cannot fit in one page together with the
+ * xdp_frame header and the skb_shared_info tail are dropped.
+ *
+ * The caller must hold rcu_read_lock(): priv->peer is RCU protected.
+ *
+ * Return: number of frames handed to the peer (dropped descriptors are not
+ * counted).
+ */
+static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool, int budget)
+{
+	struct veth_priv *priv, *peer_priv;
+	struct veth_stats peer_stats = {};
+	struct veth_stats stats = {};
+	struct net_device *peer_dev;
+	struct veth_xdp_tx_bq bq;
+	struct veth_rq *peer_rq;
+	struct xdp_desc desc;
+	int consumed = 0;
+	int done = 0;
+
+	bq.count = 0;
+	priv = netdev_priv(sq->dev);
+
+	/* set xsk wake up flag, to do: where to disable */
+	if (xsk_uses_need_wakeup(xsk_pool))
+		xsk_set_tx_need_wakeup(xsk_pool);
+
+	/* The peer may already have been unlinked; nothing to transmit to then. */
+	peer_dev = rcu_dereference(priv->peer);
+	if (unlikely(!peer_dev))
+		return 0;
+
+	/* Only hand frames to a peer receive queue that is actually in use. */
+	if (unlikely(sq->queue_index >= peer_dev->real_num_rx_queues))
+		return 0;
+
+	peer_priv = netdev_priv(peer_dev);
+	peer_rq = &peer_priv->rq[sq->queue_index];
+
+	while (budget-- > 0) {
+		struct xdp_frame *p_frame;
+		unsigned int truesize;
+		struct page *page;
+		void *xdpf;
+		void *addr;
+
+		/* Fetch the next descriptor from the xsk TX ring. */
+		if (!xsk_tx_peek_desc(xsk_pool, &desc))
+			break;
+		consumed++;
+
+		/* desc.addr is an offset into the pool; convert it to a
+		 * kernel virtual address.
+		 */
+		addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
+
+		/* Header, payload and skb_shared_info must share one page. */
+		truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
+			   desc.len + sizeof(struct xdp_frame);
+		if (truesize > PAGE_SIZE) {
+			stats.xdp_drops++;
+			xsk_tx_completed_addr(xsk_pool, desc.addr);
+			continue;
+		}
+
+		page = dev_alloc_page();
+		if (!page) {
+			/* Out of memory: complete the descriptor, count the
+			 * drop and stop polling for now.
+			 */
+			xsk_tx_completed_addr(xsk_pool, desc.addr);
+			stats.xdp_drops++;
+			break;
+		}
+
+		/* The page is not zeroed by the allocator, so clear the
+		 * header first; headroom, metasize and flags stay 0.
+		 */
+		p_frame = page_to_virt(page);
+		memset(p_frame, 0, sizeof(*p_frame));
+		p_frame->data = (void *)(p_frame + 1);
+		p_frame->len = desc.len;
+
+		/* frame_sz must cover the whole page: struct skb_shared_info
+		 * is large, and if an skb is later built from this frame,
+		 * skb->tail could otherwise end up beyond skb->end and
+		 * trigger skb_panic.
+		 */
+		p_frame->frame_sz = PAGE_SIZE;
+		p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
+		memcpy(p_frame->data, addr, p_frame->len);
+		xsk_tx_completed_addr(xsk_pool, desc.addr);
+
+		/* Run the frame through veth_xdp_rcv_one() on the peer queue. */
+		p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+		/* A frame handed back is delivered to the peer as an skb. */
+		if (p_frame) {
+			xdpf = p_frame;
+			veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
+		}
+
+		stats.xdp_bytes += desc.len;
+		done++;
+	}
+
+	if (done)
+		napi_schedule(&peer_rq->xdp_napi);
+
+	/* Release the ring for every consumed descriptor, including the
+	 * dropped ones, so the consumer index always advances.
+	 */
+	if (consumed)
+		xsk_tx_release(xsk_pool);
+
+	/* just for peer rq */
+	if (peer_stats.xdp_tx > 0)
+		veth_xdp_flush(peer_rq, &bq);
+	if (peer_stats.xdp_redirect > 0)
+		xdp_do_flush();
+
+	/* update peer rq stats, or maybe we do not need to do this */
+	u64_stats_update_begin(&peer_rq->stats.syncp);
+	peer_rq->stats.vs.xdp_redirect += peer_stats.xdp_redirect;
+	peer_rq->stats.vs.xdp_packets += done;
+	peer_rq->stats.vs.xdp_bytes += stats.xdp_bytes;
+	peer_rq->stats.vs.xdp_drops += peer_stats.xdp_drops;
+	peer_rq->stats.vs.rx_drops += peer_stats.rx_drops;
+	peer_rq->stats.vs.xdp_tx += peer_stats.xdp_tx;
+	u64_stats_update_end(&peer_rq->stats.syncp);
+
+	/* update sq stats */
+	u64_stats_update_begin(&sq->stats.syncp);
+	sq->stats.vs.xdp_packets += done;
+	sq->stats.vs.xdp_bytes += stats.xdp_bytes;
+	sq->stats.vs.xdp_drops += stats.xdp_drops;
+	u64_stats_update_end(&sq->stats.syncp);
+
+	return done;
+}
+
+/*
+ * veth_poll_tx - NAPI poll handler for a veth send queue
+ * @napi: NAPI context embedded in a struct veth_sq
+ * @budget: maximum amount of work allowed in this poll
+ *
+ * Transmits pending AF_XDP descriptors of the pool bound to the queue, if
+ * any, and completes the NAPI when less than @budget work was done.
+ *
+ * Return: number of frames transmitted.
+ */
+static int veth_poll_tx(struct napi_struct *napi, int budget)
+{
+	struct veth_sq *txq = container_of(napi, struct veth_sq, xdp_napi);
+	struct xsk_buff_pool *xsk_pool;
+	int work;
+
+	xdp_set_return_frame_no_direct();
+
+	/* Remember which CPU ran the poll for this queue. */
+	txq->xsk.last_cpu = smp_processor_id();
+
+	/* The pool pointer is RCU protected; keep the read section open for
+	 * the whole transmit pass.
+	 */
+	rcu_read_lock();
+	xsk_pool = rcu_dereference(txq->xsk.pool);
+	work = xsk_pool ? veth_xsk_tx_xmit(txq, xsk_pool, budget) : 0;
+	rcu_read_unlock();
+
+	/* Less work than budget: stop polling until rescheduled. */
+	if (work < budget)
+		napi_complete_done(napi, work);
+
+	xdp_clear_return_frame_no_direct();
+
+	return work;
+}
+
+
+/*
+ * veth_napi_add_tx - register and enable the TX NAPI of every send queue
+ * @dev: veth device whose send queues get a NAPI context
+ *
+ * The loop is bounded by the number of active TX queues because priv->sq[]
+ * is indexed by TX queue id (the same bound the xsk pool setup uses to
+ * validate its queue id).
+ *
+ * NOTE(review): this registers all queues at once; calling it again without
+ * veth_napi_del_tx() in between would register the same NAPIs twice -
+ * confirm callers never do that.
+ *
+ * Return: always 0.
+ */
+static int veth_napi_add_tx(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	unsigned int i;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		struct veth_sq *sq = &priv->sq[i];
+
+		netif_napi_add(dev, &sq->xdp_napi, veth_poll_tx);
+		napi_enable(&sq->xdp_napi);
+	}
+
+	return 0;
+}
+
static int veth_create_page_pool(struct veth_rq *rq)
{
struct page_pool_params pp_params = {
@@ -1153,6 +1325,19 @@ static void veth_napi_del_range(struct net_device *dev, int start, int end)
}
}

+/*
+ * veth_napi_del_tx - disable and unregister the TX NAPI of every send queue
+ * @dev: veth device whose send-queue NAPIs are torn down
+ *
+ * Walks the same range of queues as veth_napi_add_tx().
+ */
+static void veth_napi_del_tx(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	unsigned int i;
+
+	for (i = 0; i < dev->real_num_tx_queues; i++) {
+		struct veth_sq *sq = &priv->sq[i];
+
+		napi_disable(&sq->xdp_napi);
+		__netif_napi_del(&sq->xdp_napi);
+	}
+
+	/* __netif_napi_del() leaves the RCU grace period to the caller;
+	 * wait once for all queues before the NAPIs can be reused.
+	 */
+	synchronize_net();
+}
+
static void veth_napi_del(struct net_device *dev)
{
veth_napi_del_range(dev, 0, dev->real_num_rx_queues);
@@ -1360,7 +1545,7 @@ static void veth_set_xdp_features(struct net_device *dev)
struct veth_priv *priv_peer = netdev_priv(peer);
xdp_features_t val = NETDEV_XDP_ACT_BASIC |
NETDEV_XDP_ACT_REDIRECT |
- NETDEV_XDP_ACT_RX_SG;
+ NETDEV_XDP_ACT_RX_SG | NETDEV_XDP_ACT_XSK_ZEROCOPY;

if (priv_peer->_xdp_prog || veth_gro_requested(peer))
val |= NETDEV_XDP_ACT_NDO_XMIT |
@@ -1737,11 +1922,89 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
return err;
}

+/*
+ * veth_xsk_pool_enable - bind an AF_XDP buffer pool to a veth send queue
+ * @dev: veth device the pool is attached to
+ * @pool: buffer pool to bind
+ * @qid: queue id the pool is bound to
+ *
+ * Runs under RTNL. Makes sure the peer has its receive NAPI running,
+ * publishes @pool on send queue @qid and starts the TX NAPIs of @dev.
+ *
+ * Return: 0 on success, -EINVAL if @qid is out of range on either end or
+ * the device has no peer, or the error returned by veth_napi_enable().
+ */
+static int veth_xsk_pool_enable(struct net_device *dev, struct xsk_buff_pool *pool, u16 qid)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct veth_priv *peer_priv;
+	struct net_device *peer_dev;
+	int err;
+
+	if (qid >= dev->real_num_tx_queues)
+		return -EINVAL;
+
+	/* priv->peer is RCU protected; RTNL is held here. */
+	peer_dev = rtnl_dereference(priv->peer);
+	if (!peer_dev)
+		return -EINVAL;
+
+	/* Frames sent on queue @qid are delivered to peer rq[@qid], so that
+	 * receive queue has to be active as well.
+	 */
+	if (qid >= peer_dev->real_num_rx_queues)
+		return -EINVAL;
+
+	/* veth does no DMA, so the xsk zero-copy DMA check is skipped. */
+	pool->dma_check_skip = true;
+
+	peer_priv = netdev_priv(peer_dev);
+	/*
+	 * The peer needs NAPI to receive the frames. If it has neither an
+	 * XDP program nor GRO requested, enable its NAPI here.
+	 * to do: check whether this side already has XDP enabled; it may
+	 * not have an XDP program.
+	 */
+	if (!peer_priv->_xdp_prog && !veth_gro_requested(peer_dev)) {
+		err = veth_napi_enable(peer_dev);
+		if (err)
+			return err;
+	}
+
+	/* Publish the pool before the TX NAPI can start polling it. */
+	rcu_assign_pointer(priv->sq[qid].xsk.pool, pool);
+
+	/* veth_napi_add_tx() currently cannot fail. */
+	veth_napi_add_tx(dev);
+
+	return 0;
+}
+
+/*
+ * veth_xsk_pool_disable - unbind the AF_XDP buffer pool from a send queue
+ * @dev: veth device the pool is attached to
+ * @qid: queue id the pool was bound to
+ *
+ * Runs under RTNL. The local teardown (TX NAPI and pool pointer) is done
+ * even when the peer is already gone, so no stale pool pointer is left
+ * behind on the send queue.
+ *
+ * NOTE(review): the peer NAPI is torn down as soon as one pool is disabled;
+ * confirm this is correct when pools are bound to several queues.
+ *
+ * Return: 0 on success, -EINVAL if @qid is out of range.
+ */
+static int veth_xsk_pool_disable(struct net_device *dev, u16 qid)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer_dev;
+
+	if (qid >= dev->real_num_tx_queues)
+		return -EINVAL;
+
+	/* priv->peer is RCU protected; RTNL is held here. */
+	peer_dev = rtnl_dereference(priv->peer);
+	if (peer_dev) {
+		struct veth_priv *peer_priv = netdev_priv(peer_dev);
+
+		/* Peer NAPI is only dropped when neither an XDP program
+		 * nor GRO keeps it in use.
+		 * to do: this may fail
+		 */
+		if (!peer_priv->_xdp_prog && !veth_gro_requested(peer_dev))
+			veth_napi_del(peer_dev);
+	}
+
+	/* Stop polling before the pool pointer is cleared. */
+	veth_napi_del_tx(dev);
+
+	rcu_assign_pointer(priv->sq[qid].xsk.pool, NULL);
+
+	return 0;
+}
+
+/*
+ * veth_xsk_pool_setup - handle an XDP_SETUP_XSK_POOL request
+ * @dev: veth device
+ * @xdp: request; a non-NULL xsk.pool enables the pool on xsk.queue_id,
+ *       a NULL xsk.pool disables it
+ *
+ * Return: result of the enable or disable helper.
+ */
+static int veth_xsk_pool_setup(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	struct xsk_buff_pool *pool = xdp->xsk.pool;
+
+	if (!pool)
+		return veth_xsk_pool_disable(dev, xdp->xsk.queue_id);
+
+	return veth_xsk_pool_enable(dev, pool, xdp->xsk.queue_id);
+}
+
static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
switch (xdp->command) {
case XDP_SETUP_PROG:
return veth_xdp_set(dev, xdp->prog, xdp->extack);
+ case XDP_SETUP_XSK_POOL:
+ return veth_xsk_pool_setup(dev, xdp);
default:
return -EINVAL;
}
--
2.20.1



2023-08-04 21:13:21

by Simon Horman

[permalink] [raw]
Subject: Re: [RFC Optimizing veth xsk performance 05/10] veth: use send queue tx napi to xmit xsk tx desc

On Thu, Aug 03, 2023 at 10:04:31PM +0800, huangjie.albert wrote:

Please include a patch description.

> Signed-off-by: huangjie.albert <[email protected]>

Please consider formatting this as:

... Albert Huang <[email protected]>

> ---
> drivers/net/veth.c | 265 ++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 264 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 63c3ebe4c5d0..944761807ca4 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -27,6 +27,8 @@
> #include <linux/bpf_trace.h>
> #include <linux/net_tstamp.h>
> #include <net/page_pool.h>
> +#include <net/xdp_sock_drv.h>
> +#include <net/xdp.h>
>
> #define DRV_NAME "veth"
> #define DRV_VERSION "1.0"

> @@ -1061,6 +1063,176 @@ static int veth_poll(struct napi_struct *napi, int budget)
> return done;
> }
>
> +static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool, int budget)
> +{
> + struct veth_priv *priv, *peer_priv;
> + struct net_device *dev, *peer_dev;
> + struct veth_rq *peer_rq;
> + struct veth_stats peer_stats = {};
> + struct veth_stats stats = {};
> + struct veth_xdp_tx_bq bq;
> + struct xdp_desc desc;
> + void *xdpf;
> + int done = 0;

Please try to use reverse xmas tree ordering - longest line to shortest -
for local variable declarations in new Networking code.

https://github.com/ecree-solarflare/xmastree is your friend here.

> +
> + bq.count = 0;
> + dev = sq->dev;
> + priv = netdev_priv(dev);
> + peer_dev = priv->peer;

Sparse seems a bit unhappy about this.

.../veth.c:1081:18: warning: incorrect type in assignment (different address spaces)
.../veth.c:1081:18: expected struct net_device *peer_dev
.../veth.c:1081:18: got struct net_device [noderef] __rcu *peer

Looking over existing code in this file, perhaps this is appropriate:

peer_dev = rtnl_dereference(priv->peer);

Likewise in a few other places in this patch.

...