AF_XDP is a kernel bypass technology that can greatly improve performance.
However, for virtual devices like veth, even with the use of AF_XDP sockets,
there are still many additional software paths that consume CPU resources.
This patch series focuses on optimizing the performance of AF_XDP sockets
for veth virtual devices. Patches 1 to 4 mainly involve preparatory work.
Patch 5 introduces tx queue and tx napi for packet transmission, while
patch 9 primarily implements zero-copy, and patch 10 adds support for
batch sending of IPv4 UDP packets. These optimizations significantly reduce
the software path and support checksum offload.
I tested these features with the typical topology shown below:
veth<-->veth-peer veth1-peer<--->veth1
1 | | 7
|2 6|
| |
bridge<------->eth0(mlnx5)- switch -eth1(mlnx5)<--->bridge1
3 4 5
(machine1) (machine2)
AF_XDP sockets are attached to veth and veth1, and send packets to the physical NIC (eth0).
veth:(172.17.0.2/24)
bridge:(172.17.0.1/24)
eth0:(192.168.156.66/24)
eth1(172.17.0.2/24)
bridge1:(172.17.0.1/24)
eth0:(192.168.156.88/24)
After setting up the default route, SNAT, and DNAT, we can run tests
to get the performance results.
Packets sent from veth to veth1:
af_xdp test tool:
link:https://github.com/cclinuxer/libxudp
send:(veth)
./objs/xudpperf send --dst 192.168.156.88:6002 -l 1300
recv:(veth1)
./objs/xudpperf recv --src 172.17.0.2:6002
udp test tool:iperf3
send:(veth)
iperf3 -c 192.168.156.88 -p 6002 -l 1300 -b 60G -u
recv:(veth1)
iperf3 -s -p 6002
performance:
performance: (tested with the libxdp lib)
UDP : 250 Kpps (with 100% cpu)
AF_XDP no zerocopy + no batch : 480 Kpps (with ksoftirqd 100% cpu)
AF_XDP with zerocopy + no batch : 540 Kpps (with ksoftirqd 100% cpu)
AF_XDP with batch + zerocopy : 1.5 Mpps (with ksoftirqd 15% cpu)
With af_xdp batch, the libxdp user-space program reaches a bottleneck.
Therefore, the softirq did not reach the limit.
This is just an RFC patch series, and some code details still need
further consideration. Please review this proposal.
thanks!
huangjie.albert (10):
veth: Implement ethtool's get_ringparam() callback
xsk: add dma_check_skip for skipping dma check
veth: add support for send queue
xsk: add xsk_tx_completed_addr function
veth: use send queue tx napi to xmit xsk tx desc
veth: add ndo_xsk_wakeup callback for veth
sk_buff: add destructor_arg_xsk_pool for zero copy
xdp: add xdp_mem_type MEM_TYPE_XSK_BUFF_POOL_TX
veth: support zero copy for af xdp
veth: af_xdp tx batch support for ipv4 udp
drivers/net/veth.c | 729 +++++++++++++++++++++++++++++++++++-
include/linux/skbuff.h | 1 +
include/net/xdp.h | 1 +
include/net/xdp_sock_drv.h | 1 +
include/net/xsk_buff_pool.h | 1 +
net/xdp/xsk.c | 6 +
net/xdp/xsk_buff_pool.c | 3 +-
net/xdp/xsk_queue.h | 11 +
8 files changed, 751 insertions(+), 2 deletions(-)
--
2.20.1
The following conditions need to be satisfied to achieve zero-copy:
1. The tx desc has enough space to store the xdp_frame and skb_shared_info.
2. The buffer pointed to by the tx desc is contained within a single page.
Tested zero copy with libxdp.
Performance:
|MSS (bytes) | Packet rate (PPS)
AF_XDP | 1300 | 480k
AF_XDP with zero copy| 1300 | 540K
Signed-off-by: huangjie.albert <[email protected]>
---
drivers/net/veth.c | 207 ++++++++++++++++++++++++++++++++++++++-------
1 file changed, 178 insertions(+), 29 deletions(-)
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 600225e27e9e..e4f1a8345f42 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -103,6 +103,11 @@ struct veth_xdp_tx_bq {
unsigned int count;
};
+struct veth_seg_info {
+ u32 segs;
+ u64 desc[] ____cacheline_aligned_in_smp;
+};
+
/*
* ethtool interface
*/
@@ -645,6 +650,100 @@ static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
return 0;
}
+static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
+ int buflen)
+{
+ struct sk_buff *skb;
+
+ skb = build_skb(head, buflen);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, headroom);
+ skb_put(skb, len);
+
+ return skb;
+}
+
+static void veth_xsk_destruct_skb(struct sk_buff *skb)
+{
+ struct veth_seg_info *seg_info = (struct veth_seg_info *)skb_shinfo(skb)->destructor_arg;
+ struct xsk_buff_pool *pool = (struct xsk_buff_pool *)skb_shinfo(skb)->destructor_arg_xsk_pool;
+ unsigned long flags;
+ u32 index = 0;
+ u64 addr;
+
+ /* release cq */
+ spin_lock_irqsave(&pool->cq_lock, flags);
+ for (index = 0; index < seg_info->segs; index++) {
+ addr = (u64)(long)seg_info->desc[index];
+ xsk_tx_completed_addr(pool, addr);
+ }
+ spin_unlock_irqrestore(&pool->cq_lock, flags);
+
+ kfree(seg_info);
+ skb_shinfo(skb)->destructor_arg = NULL;
+ skb_shinfo(skb)->destructor_arg_xsk_pool = NULL;
+}
+
+static struct sk_buff *veth_build_skb_zerocopy(struct net_device *dev, struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
+{
+ struct veth_seg_info *seg_info;
+ struct sk_buff *skb;
+ struct page *page;
+ void *hard_start;
+ u32 len, ts;
+ void *buffer;
+ int headroom;
+ u64 addr;
+ u32 index;
+
+ addr = desc->addr;
+ len = desc->len;
+ buffer = xsk_buff_raw_get_data(pool, addr);
+ ts = pool->unaligned ? len : pool->chunk_size;
+
+ headroom = offset_in_page(buffer);
+
+ /* offset in umem pool buffer */
+ addr = buffer - pool->addrs;
+
+ /* get the page of the desc */
+ page = pool->umem->pgs[addr >> PAGE_SHIFT];
+
+ /* in order to avoid to get freed by kfree_skb */
+ get_page(page);
+
+ hard_start = page_to_virt(page);
+
+ skb = veth_build_skb(hard_start, headroom, len, ts);
+ seg_info = (struct veth_seg_info *)kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS), GFP_KERNEL);
+ if (!seg_info)
+ {
+ printk("here must to deal with\n");
+ }
+
+ /* later we will support gso for this */
+ index = skb_shinfo(skb)->gso_segs;
+ seg_info->desc[index] = desc->addr;
+ seg_info->segs = ++index;
+
+ skb->truesize += ts;
+ skb->dev = dev;
+ skb_shinfo(skb)->destructor_arg = (void *)(long)seg_info;
+ skb_shinfo(skb)->destructor_arg_xsk_pool = (void *)(long)pool;
+ skb->destructor = veth_xsk_destruct_skb;
+
+ /* set the mac header */
+ skb->protocol = eth_type_trans(skb, dev);
+
+ /* to do, add skb to sock. may be there is no need to do for this
+ * refcount_add(ts, &xs->sk.sk_wmem_alloc);
+ */
+ return skb;
+}
+
static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq,
struct xdp_frame *frame,
struct veth_xdp_tx_bq *bq,
@@ -1063,6 +1162,20 @@ static int veth_poll(struct napi_struct *napi, int budget)
return done;
}
+/* if buffer contain in a page */
+static inline bool buffer_in_page(void *buffer, u32 len)
+{
+ u32 offset;
+
+ offset = offset_in_page(buffer);
+
+ if(PAGE_SIZE - offset >= len) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool, int budget)
{
struct veth_priv *priv, *peer_priv;
@@ -1073,6 +1186,9 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
struct veth_xdp_tx_bq bq;
struct xdp_desc desc;
void *xdpf;
+ struct sk_buff *skb = NULL;
+ bool zc = xsk_pool->umem->zc;
+ u32 xsk_headroom = xsk_pool->headroom;
int done = 0;
bq.count = 0;
@@ -1102,12 +1218,6 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
break;
}
- /*
- * Get a xmit addr
- * desc.addr is a offset, so we should to convert to real virtual address
- */
- addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
-
/* can not hold all data in a page */
truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + desc.len + sizeof(struct xdp_frame);
if (truesize > PAGE_SIZE) {
@@ -1116,16 +1226,39 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
continue;
}
- page = dev_alloc_page();
- if (!page) {
- /*
- * error , release xdp frame and increase drops
- */
- xsk_tx_completed_addr(xsk_pool, desc.addr);
- stats.xdp_drops++;
- break;
+ /*
+ * Get a xmit addr
+ * desc.addr is a offset, so we should to convert to real virtual address
+ */
+ addr = xsk_buff_raw_get_data(xsk_pool, desc.addr);
+
+ /*
+ * in order to support zero copy, headroom must have enough space to hold xdp_frame
+ */
+ if (zc && (xsk_headroom < sizeof(struct xdp_frame)))
+ zc = false;
+
+ /*
+ * if desc not contain in a page, also do not support zero copy
+ */
+ if (!buffer_in_page(addr, desc.len))
+ zc = false;
+
+ if (zc) {
+ /* headroom is reserved for xdp_frame */
+ new_addr = addr - sizeof(struct xdp_frame);
+ } else {
+ page = dev_alloc_page();
+ if (!page) {
+ /*
+ * error , release xdp frame and increase drops
+ */
+ xsk_tx_completed_addr(xsk_pool, desc.addr);
+ stats.xdp_drops++;
+ break;
+ }
+ new_addr = page_to_virt(page);
}
- new_addr = page_to_virt(page);
p_frame = new_addr;
new_addr += sizeof(struct xdp_frame);
@@ -1137,19 +1270,37 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
*/
p_frame->headroom = 0;
p_frame->metasize = 0;
- p_frame->frame_sz = PAGE_SIZE;
p_frame->flags = 0;
- p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
- memcpy(p_frame->data, addr, p_frame->len);
- xsk_tx_completed_addr(xsk_pool, desc.addr);
-
- /* if peer have xdp prog, if it has ,just send to peer */
- p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
- /* if no xdp with this queue, convert to skb to xmit*/
- if (p_frame) {
- xdpf = p_frame;
- veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
- p_frame = NULL;
+
+ if (zc) {
+ p_frame->frame_sz = xsk_pool->frame_len;
+ /* to do: if there is a xdp, how to recycle the tx desc */
+ p_frame->mem.type = MEM_TYPE_XSK_BUFF_POOL_TX;
+ /* no need to copy address for af+xdp */
+ p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+ if (p_frame) {
+ skb = veth_build_skb_zerocopy(peer_dev, xsk_pool, &desc);
+ if (skb) {
+ napi_gro_receive(&peer_rq->xdp_napi, skb);
+ skb = NULL;
+ } else {
+ xsk_tx_completed_addr(xsk_pool, desc.addr);
+ }
+ }
+ } else {
+ p_frame->frame_sz = PAGE_SIZE;
+ p_frame->mem.type = MEM_TYPE_PAGE_SHARED;
+ memcpy(p_frame->data, addr, p_frame->len);
+ xsk_tx_completed_addr(xsk_pool, desc.addr);
+
+ /* if peer have xdp prog, if it has ,just send to peer */
+ p_frame = veth_xdp_rcv_one(peer_rq, p_frame, &bq, &peer_stats);
+ /* if no xdp with this queue, convert to skb to xmit*/
+ if (p_frame) {
+ xdpf = p_frame;
+ veth_xdp_rcv_bulk_skb(peer_rq, &xdpf, 1, &bq, &peer_stats);
+ p_frame = NULL;
+ }
}
stats.xdp_bytes += desc.len;
@@ -1163,8 +1314,6 @@ static int veth_xsk_tx_xmit(struct veth_sq *sq, struct xsk_buff_pool *xsk_pool,
xsk_tx_release(xsk_pool);
}
-
-
/* just for peer rq */
if (peer_stats.xdp_tx > 0)
veth_xdp_flush(peer_rq, &bq);
--
2.20.1
On Thu, Aug 03, 2023 at 10:04:35PM +0800, huangjie.albert wrote:
...
> +static struct sk_buff *veth_build_skb_zerocopy(struct net_device *dev, struct xsk_buff_pool *pool,
> + struct xdp_desc *desc)
> +{
> + struct veth_seg_info *seg_info;
> + struct sk_buff *skb;
> + struct page *page;
> + void *hard_start;
> + u32 len, ts;
> + void *buffer;
> + int headroom;
> + u64 addr;
> + u32 index;
> +
> + addr = desc->addr;
> + len = desc->len;
> + buffer = xsk_buff_raw_get_data(pool, addr);
> + ts = pool->unaligned ? len : pool->chunk_size;
> +
> + headroom = offset_in_page(buffer);
> +
> + /* offset in umem pool buffer */
> + addr = buffer - pool->addrs;
> +
> + /* get the page of the desc */
> + page = pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + /* in order to avoid to get freed by kfree_skb */
> + get_page(page);
> +
> + hard_start = page_to_virt(page);
> +
> + skb = veth_build_skb(hard_start, headroom, len, ts);
> + seg_info = (struct veth_seg_info *)kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS), GFP_KERNEL);
There is no need to explicitly cast the return value of kmalloc,
as it returns void *.
seg_info = kmalloc(struct_size(seg_info, desc, MAX_SKB_FRAGS),
GFP_KERNEL);
...