Hi all:
This series tries to implement XDP support for tap. Two paths were
implemented:
- fast path: small & non-gso packets. For performance reasons we do XDP
at the page level and use build_skb() to create the skb if necessary.
- slow path: big or gso packets. We don't want to lose capability
compared to generic XDP, so we export some generic XDP helpers and
run XDP after the skb has been created.
xdp1 shows about a 41% improvement and xdp_redirect shows about a 60%
improvement.
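Roughly, the dispatch between the two paths looks like the sketch below
(condensed from patch 3 in this series, with error handling omitted; it
is not the literal driver code):
/* Sketch only: small, non-gso packets get the XDP program run on the
 * raw page buffer before an skb exists; everything else falls back to
 * the exported generic XDP helpers once the skb has been built.
 */
if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
	/* fast path: XDP at page level, build_skb() only if needed */
	skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
	if (!skb)
		return total_len; /* consumed by XDP_TX/REDIRECT or dropped */
} else {
	/* slow path: allocate and copy the skb as before; XDP runs
	 * later in tun_get_user() via do_xdp_generic().
	 */
	skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
}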
Changes from V1:
- fix the race between xdp set and free
- don't hold extra refcount
- add XDP_REDIRECT support
Please review.
Jason Wang (3):
tap: use build_skb() for small packet
net: export some generic xdp helpers
tap: XDP support
drivers/net/tun.c | 247 ++++++++++++++++++++++++++++++++++++++++++----
include/linux/netdevice.h | 2 +
net/core/dev.c | 14 +--
3 files changed, 236 insertions(+), 27 deletions(-)
--
2.7.4
This patch exports some generic XDP helpers to drivers. This lets a
driver run XDP on a specific skb, which is useful when the packet is
hard to process at the page level directly (e.g. a jumbo/GSO frame).
With this patch, there's no need for a driver to forbid setting an XDP
program when the configuration is not suitable. Instead, it can defer
XDP for hard-to-process packets until after the skb has been created.
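The intended driver-side usage is roughly the sketch below (modelled on
the tap changes later in this series; priv->xdp_prog stands in for
wherever the driver keeps its program pointer):
struct bpf_prog *xdp_prog;

rcu_read_lock();
xdp_prog = rcu_dereference(priv->xdp_prog);
if (xdp_prog) {
	/* Run the program on an skb that could not be handled at page
	 * level (e.g. a jumbo/GSO frame).  Anything other than XDP_PASS
	 * means the skb was transmitted, redirected or dropped for us.
	 */
	if (do_xdp_generic(xdp_prog, skb) != XDP_PASS) {
		rcu_read_unlock();
		return;
	}
}
rcu_read_unlock();
/* XDP_PASS: continue with the normal receive path */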
Signed-off-by: Jason Wang <[email protected]>
---
include/linux/netdevice.h | 2 ++
net/core/dev.c | 14 ++++++++------
2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1d238d5..0f1c4cb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3243,6 +3243,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
}
+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 3f69f6e..1a6657a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3920,7 +3920,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
/* When doing generic XDP we have to bypass the qdisc layer and the
* network taps in order to match in-driver-XDP behavior.
*/
-static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -3941,13 +3941,12 @@ static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
kfree_skb(skb);
}
}
+EXPORT_SYMBOL_GPL(generic_xdp_tx);
static struct static_key generic_xdp_needed __read_mostly;
-static int do_xdp_generic(struct sk_buff *skb)
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
{
- struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog);
-
if (xdp_prog) {
u32 act = netif_receive_generic_xdp(skb, xdp_prog);
int err;
@@ -3972,6 +3971,7 @@ static int do_xdp_generic(struct sk_buff *skb)
kfree_skb(skb);
return XDP_DROP;
}
+EXPORT_SYMBOL_GPL(do_xdp_generic);
static int netif_rx_internal(struct sk_buff *skb)
{
@@ -3982,7 +3982,8 @@ static int netif_rx_internal(struct sk_buff *skb)
trace_netif_rx(skb);
if (static_key_false(&generic_xdp_needed)) {
- int ret = do_xdp_generic(skb);
+ int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ skb);
/* Consider XDP consuming the packet a success from
* the netdev point of view we do not want to count
@@ -4503,7 +4504,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
rcu_read_lock();
if (static_key_false(&generic_xdp_needed)) {
- int ret = do_xdp_generic(skb);
+ int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ skb);
if (ret != XDP_PASS) {
rcu_read_unlock();
--
2.7.4
This patch tries to implement XDP for tun. The implementation is
split into two parts:
- fast path: small and non-gso packets. We try to do XDP at the page
level before build_skb(). For XDP_TX, since creating/destroying queues
is completely under the control of userspace, it is implemented
through the generic XDP helper after the skb has been built. This
could be optimized in the future.
- slow path: big or gso packets. We do XDP after the skb has been
created, through the generic XDP helpers.
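A condensed view of the fast-path decision (sketch only; the complete
logic, including refcounting and statistics, is in the tun_build_skb()
hunk below):
act = bpf_prog_run_xdp(xdp_prog, &xdp);
switch (act) {
case XDP_REDIRECT:
	/* hand the page to the target device, no skb is built */
	xdp_do_redirect(tun->dev, &xdp, xdp_prog);
	return NULL;
case XDP_TX:
	xdp_xmit = true;	/* build the skb, then generic_xdp_tx() it */
	/* fall through */
case XDP_PASS:
	delta = orig_data - xdp.data;	/* honour any head adjustment */
	break;
case XDP_ABORTED:
case XDP_DROP:
default:
	goto drop;
}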
Tests were done through pktgen with small packets.
xdp1 test shows ~41.1% improvement:
Before: ~1.7Mpps
After: ~2.3Mpps
xdp_redirect to ixgbe shows ~60% improvement:
Before: ~0.8Mpps
After: ~1.38Mpps
Suggested-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 142 insertions(+), 7 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9736df4..5892284 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -73,6 +73,8 @@
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
#include <linux/uaccess.h>
@@ -105,7 +107,8 @@ do { \
} while (0)
#endif
-#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+#define TUN_HEADROOM 256
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM)
/* TUN device flags */
@@ -224,6 +227,7 @@ struct tun_struct {
u32 flow_count;
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
+ struct bpf_prog __rcu *xdp_prog;
};
#ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -590,6 +594,7 @@ static void tun_detach(struct tun_file *tfile, bool clean)
static void tun_detach_all(struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
+ struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
struct tun_file *tfile, *tmp;
int i, n = tun->numqueues;
@@ -622,6 +627,9 @@ static void tun_detach_all(struct net_device *dev)
}
BUG_ON(tun->numdisabled != 0);
+ if (xdp_prog)
+ bpf_prog_put(xdp_prog);
+
if (tun->flags & IFF_PERSIST)
module_put(THIS_MODULE);
}
@@ -1008,6 +1016,46 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
stats->tx_dropped = tx_dropped;
}
+static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+ struct netlink_ext_ack *extack)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ struct bpf_prog *old_prog;
+
+ old_prog = rtnl_dereference(tun->xdp_prog);
+ rcu_assign_pointer(tun->xdp_prog, prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+
+ return 0;
+}
+
+static u32 tun_xdp_query(struct net_device *dev)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ const struct bpf_prog *xdp_prog;
+
+ xdp_prog = rtnl_dereference(tun->xdp_prog);
+ if (xdp_prog)
+ return xdp_prog->aux->id;
+
+ return 0;
+}
+
+static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+ switch (xdp->command) {
+ case XDP_SETUP_PROG:
+ return tun_xdp_set(dev, xdp->prog, xdp->extack);
+ case XDP_QUERY_PROG:
+ xdp->prog_id = tun_xdp_query(dev);
+ xdp->prog_attached = !!xdp->prog_id;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
@@ -1038,6 +1086,7 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
+ .ndo_xdp = tun_xdp,
};
static void tun_flow_init(struct tun_struct *tun)
@@ -1217,16 +1266,22 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
return true;
}
-static struct sk_buff *tun_build_skb(struct tun_file *tfile,
+static struct sk_buff *tun_build_skb(struct tun_struct *tun,
+ struct tun_file *tfile,
struct iov_iter *from,
- int len)
+ struct virtio_net_hdr *hdr,
+ int len, int *generic_xdp)
{
struct page_frag *alloc_frag = &tfile->alloc_frag;
struct sk_buff *skb;
+ struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ unsigned int delta = 0;
char *buf;
size_t copied;
+ bool xdp_xmit = false;
+ int err;
if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
return ERR_PTR(-ENOMEM);
@@ -1238,16 +1293,77 @@ static struct sk_buff *tun_build_skb(struct tun_file *tfile,
if (copied != len)
return ERR_PTR(-EFAULT);
+ if (hdr->gso_type)
+ *generic_xdp = 1;
+ else
+ *generic_xdp = 0;
+
+ rcu_read_lock();
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog && !*generic_xdp) {
+ struct xdp_buff xdp;
+ void *orig_data;
+ u32 act;
+
+ xdp.data_hard_start = buf;
+ xdp.data = buf + TUN_RX_PAD;
+ xdp.data_end = xdp.data + len;
+ orig_data = xdp.data;
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+ switch (act) {
+ case XDP_REDIRECT:
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+ err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
+ if (err)
+ goto err_redirect;
+ return NULL;
+ case XDP_TX:
+ xdp_xmit = true;
+ /* fall through */
+ case XDP_PASS:
+ delta = orig_data - xdp.data;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ goto err_xdp;
+ }
+ }
+
skb = build_skb(buf, buflen);
- if (!skb)
+ if (!skb) {
+ rcu_read_unlock();
return ERR_PTR(-ENOMEM);
+ }
- skb_reserve(skb, TUN_RX_PAD);
- skb_put(skb, len);
+ skb_reserve(skb, TUN_RX_PAD - delta);
+ skb_put(skb, len + delta);
get_page(alloc_frag->page);
alloc_frag->offset += buflen;
+ if (xdp_xmit) {
+ skb->dev = tun->dev;
+ generic_xdp_tx(skb, xdp_prog);
+ rcu_read_lock();
+ return NULL;
+ }
+
+ rcu_read_unlock();
+
return skb;
+
+err_redirect:
+ put_page(alloc_frag->page);
+err_xdp:
+ rcu_read_unlock();
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ return NULL;
}
/* Get packet from user space buffer */
@@ -1266,6 +1382,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
bool zerocopy = false;
int err;
u32 rxhash;
+ int generic_xdp = 1;
if (!(tun->dev->flags & IFF_UP))
return -EIO;
@@ -1324,11 +1441,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
}
if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
- skb = tun_build_skb(tfile, from, len);
+ skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
if (IS_ERR(skb)) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
return PTR_ERR(skb);
}
+ if (!skb)
+ return total_len;
} else {
if (!zerocopy) {
copylen = len;
@@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
skb_reset_network_header(skb);
skb_probe_transport_header(skb, 0);
+ if (generic_xdp) {
+ struct bpf_prog *xdp_prog;
+ int ret;
+
+ rcu_read_lock();
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ ret = do_xdp_generic(xdp_prog, skb);
+ if (ret != XDP_PASS) {
+ rcu_read_unlock();
+ return total_len;
+ }
+ }
+ rcu_read_unlock();
+ }
+
rxhash = __skb_get_hash_symmetric(skb);
#ifndef CONFIG_4KSTACKS
tun_rx_batched(tun, tfile, skb, more);
--
2.7.4
In the past we used tun_alloc_skb(), which calls sock_alloc_send_pskb(),
to allocate the skb. This socket based method is not suitable for high
speed userspace such as virtualization, which usually:
- ignores sk_sndbuf (it is INT_MAX) and expects to receive packets as
fast as possible
- doesn't want to be blocked at sendmsg()
To eliminate the above overheads, this patch tries to use build_skb()
for small packets. We will do this only when the following conditions
are all met:
- TAP instead of TUN
- sk_sndbuf is INT_MAX
- the caller doesn't want to be blocked
- zerocopy is not used
- the packet is small enough to use build_skb()
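In code, the last condition is the size check below (mirroring the
tun_can_build_skb() helper added in this patch; the exact cut-off
depends on NET_SKB_PAD, struct skb_shared_info and PAGE_SIZE):
/* build_skb() needs headroom + data + struct skb_shared_info to fit
 * into a single page frag; anything bigger falls back to the old
 * tun_alloc_skb() path.
 */
if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
	return false;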
Pktgen from guest to host shows ~11% improvement for rx pps of tap:
Before: ~1.70Mpps
After : ~1.88Mpps
More importantly, this makes it possible to implement XDP for tap
before creating skbs.
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 112 ++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 91 insertions(+), 21 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d21510d..9736df4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -105,6 +105,8 @@ do { \
} while (0)
#endif
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
/* TUN device flags */
/* IFF_ATTACH_QUEUE is never stored in device flags,
@@ -170,6 +172,7 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
+ struct page_frag alloc_frag;
};
struct tun_flow_entry {
@@ -571,6 +574,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
}
if (tun)
skb_array_cleanup(&tfile->tx_array);
+ if (tfile->alloc_frag.page)
+ put_page(tfile->alloc_frag.page);
sock_put(&tfile->sk);
}
}
@@ -1190,6 +1195,61 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
}
}
+static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
+ int len, int noblock, bool zerocopy)
+{
+ if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+ return false;
+
+ if (tfile->socket.sk->sk_sndbuf != INT_MAX)
+ return false;
+
+ if (!noblock)
+ return false;
+
+ if (zerocopy)
+ return false;
+
+ if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+ return false;
+
+ return true;
+}
+
+static struct sk_buff *tun_build_skb(struct tun_file *tfile,
+ struct iov_iter *from,
+ int len)
+{
+ struct page_frag *alloc_frag = &tfile->alloc_frag;
+ struct sk_buff *skb;
+ int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ char *buf;
+ size_t copied;
+
+ if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+ return ERR_PTR(-ENOMEM);
+
+ buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+ copied = copy_page_from_iter(alloc_frag->page,
+ alloc_frag->offset + TUN_RX_PAD,
+ len, from);
+ if (copied != len)
+ return ERR_PTR(-EFAULT);
+
+ skb = build_skb(buf, buflen);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ skb_reserve(skb, TUN_RX_PAD);
+ skb_put(skb, len);
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
+ return skb;
+}
+
/* Get packet from user space buffer */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
void *msg_control, struct iov_iter *from,
@@ -1263,30 +1323,38 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
zerocopy = true;
}
- if (!zerocopy) {
- copylen = len;
- if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
- linear = good_linear;
- else
- linear = tun16_to_cpu(tun, gso.hdr_len);
- }
-
- skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
- if (IS_ERR(skb)) {
- if (PTR_ERR(skb) != -EAGAIN)
+ if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+ skb = tun_build_skb(tfile, from, len);
+ if (IS_ERR(skb)) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
- return PTR_ERR(skb);
- }
+ return PTR_ERR(skb);
+ }
+ } else {
+ if (!zerocopy) {
+ copylen = len;
+ if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
+ linear = good_linear;
+ else
+ linear = tun16_to_cpu(tun, gso.hdr_len);
+ }
- if (zerocopy)
- err = zerocopy_sg_from_iter(skb, from);
- else
- err = skb_copy_datagram_from_iter(skb, 0, from, len);
+ skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+ if (IS_ERR(skb)) {
+ if (PTR_ERR(skb) != -EAGAIN)
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ return PTR_ERR(skb);
+ }
- if (err) {
- this_cpu_inc(tun->pcpu_stats->rx_dropped);
- kfree_skb(skb);
- return -EFAULT;
+ if (zerocopy)
+ err = zerocopy_sg_from_iter(skb, from);
+ else
+ err = skb_copy_datagram_from_iter(skb, 0, from, len);
+
+ if (err) {
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
}
if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
@@ -2377,6 +2445,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->sk.sk_write_space = tun_sock_write_space;
tfile->sk.sk_sndbuf = INT_MAX;
+ tfile->alloc_frag.page = NULL;
+
file->private_data = tfile;
INIT_LIST_HEAD(&tfile->next);
--
2.7.4
On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
> This patch tries to implement XDP for tun. The implementation was
> split into two parts:
>
> - fast path: small and no gso packet. We try to do XDP at page level
> before build_skb(). For XDP_TX, since creating/destroying queues
> were completely under control of userspace, it was implemented
> through generic XDP helper after skb has been built. This could be
> optimized in the future.
> - slow path: big or gso packet. We try to do it after skb was created
> through generic XDP helpers.
>
> Test were done through pktgen with small packets.
>
> xdp1 test shows ~41.1% improvement:
>
> Before: ~1.7Mpps
> After: ~2.3Mpps
>
> xdp_redirect to ixgbe shows ~60% improvement:
>
> Before: ~0.8Mpps
> After: ~1.38Mpps
>
> Suggested-by: Michael S. Tsirkin <[email protected]>
> Signed-off-by: Jason Wang <[email protected]>
Looks OK to me now :)
Out of curiosity, you say the build_skb() is for "small packets", and it
seems you are always reserving the 256B regardless of XDP being
installed. Does this have no performance impact on non-XDP case?
On 2017年08月12日 07:12, Jakub Kicinski wrote:
> On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
>> This patch tries to implement XDP for tun. The implementation was
>> split into two parts:
>>
>> - fast path: small and no gso packet. We try to do XDP at page level
>> before build_skb(). For XDP_TX, since creating/destroying queues
>> were completely under control of userspace, it was implemented
>> through generic XDP helper after skb has been built. This could be
>> optimized in the future.
>> - slow path: big or gso packet. We try to do it after skb was created
>> through generic XDP helpers.
>>
>> Test were done through pktgen with small packets.
>>
>> xdp1 test shows ~41.1% improvement:
>>
>> Before: ~1.7Mpps
>> After: ~2.3Mpps
>>
>> xdp_redirect to ixgbe shows ~60% improvement:
>>
>> Before: ~0.8Mpps
>> After: ~1.38Mpps
>>
>> Suggested-by: Michael S. Tsirkin <[email protected]>
>> Signed-off-by: Jason Wang <[email protected]>
> Looks OK to me now :)
>
> Out of curiosity, you say the build_skb() is for "small packets", and it
> seems you are always reserving the 256B regardless of XDP being
> installed. Does this have no performance impact on non-XDP case?
I ran a test; only a less than 1% difference was noticed, which I think
can be ignored.
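For reference, the headroom arithmetic in question, taken from the
patches (TUN_HEADROOM is reserved on the build_skb() path whether or
not an XDP program is attached):
/* patch 1 */
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* patch 3: 256 bytes of extra headroom are always reserved */
#define TUN_HEADROOM 256
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM)

buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
	 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));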
Thanks
From: Jason Wang <[email protected]>
Date: Fri, 11 Aug 2017 19:41:15 +0800
> Hi all:
>
> This series tries to implement XDP support for tap. Two path were
> implemented:
>
> - fast path: small & non-gso packet, For performance reason we do it
> at page level and use build_skb() to create skb if necessary.
> - slow path: big or gso packet, we don't want to lose the capability
> compared to generic XDP, so we export some generic xdp helpers and
> do it after skb was created.
>
> xdp1 shows about 41% improvement, xdp_redirect shows about 60%
> improvement.
>
> Changes from V1:
> - fix the race between xdp set and free
> - don't hold extra refcount
> - add XDP_REDIRECT support
>
> Please review.
Series applied, thanks Jason.
On 08/11/2017 01:41 PM, Jason Wang wrote:
> This patch tries to implement XDP for tun. The implementation was
> split into two parts:
[...]
> @@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
> skb_reset_network_header(skb);
> skb_probe_transport_header(skb, 0);
>
> + if (generic_xdp) {
> + struct bpf_prog *xdp_prog;
> + int ret;
> +
> + rcu_read_lock();
> + xdp_prog = rcu_dereference(tun->xdp_prog);
The name generic_xdp is a bit confusing in this context given this
is 'native' XDP, perhaps above if (generic_xdp) should have a comment
explaining semantics for tun and how it relates to actual generic xdp
that sits at dev->xdp_prog, and gets run from netif_rx_ni(). Or just
name the bool xdp_handle_gso with a comment that we let the generic
XDP infrastructure deal with non-linear skbs instead of having to
re-implement the do_xdp_generic() internals, plus a statement that
the actual generic XDP comes a bit later in the path. That would at
least make it more obvious to read, imho.
> + if (xdp_prog) {
> + ret = do_xdp_generic(xdp_prog, skb);
> + if (ret != XDP_PASS) {
> + rcu_read_unlock();
> + return total_len;
> + }
> + }
> + rcu_read_unlock();
> + }
> +
> rxhash = __skb_get_hash_symmetric(skb);
> #ifndef CONFIG_4KSTACKS
> tun_rx_batched(tun, tfile, skb, more);
>
On Sat, Aug 12, 2017 at 10:48:49AM +0800, Jason Wang wrote:
>
>
> On 2017年08月12日 07:12, Jakub Kicinski wrote:
> > On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
> > > This patch tries to implement XDP for tun. The implementation was
> > > split into two parts:
> > >
> > > - fast path: small and no gso packet. We try to do XDP at page level
> > > before build_skb(). For XDP_TX, since creating/destroying queues
> > > were completely under control of userspace, it was implemented
> > > through generic XDP helper after skb has been built. This could be
> > > optimized in the future.
> > > - slow path: big or gso packet. We try to do it after skb was created
> > > through generic XDP helpers.
> > >
> > > Test were done through pktgen with small packets.
> > >
> > > xdp1 test shows ~41.1% improvement:
> > >
> > > Before: ~1.7Mpps
> > > After: ~2.3Mpps
> > >
> > > xdp_redirect to ixgbe shows ~60% improvement:
> > >
> > > Before: ~0.8Mpps
> > > After: ~1.38Mpps
> > >
> > > Suggested-by: Michael S. Tsirkin <[email protected]>
> > > Signed-off-by: Jason Wang <[email protected]>
> > Looks OK to me now :)
> >
> > Out of curiosity, you say the build_skb() is for "small packets", and it
> > seems you are always reserving the 256B regardless of XDP being
> > installed. Does this have no performance impact on non-XDP case?
>
> Have a test, only less than 1% were noticed which I think could be ignored.
>
> Thanks
What did you test btw? The biggest issue would be with something like
UDP with short packets.
--
MST
On 2017年08月14日 16:43, Daniel Borkmann wrote:
> On 08/11/2017 01:41 PM, Jason Wang wrote:
>> This patch tries to implement XDP for tun. The implementation was
>> split into two parts:
> [...]
>> @@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct
>> *tun, struct tun_file *tfile,
>> skb_reset_network_header(skb);
>> skb_probe_transport_header(skb, 0);
>>
>> + if (generic_xdp) {
>> + struct bpf_prog *xdp_prog;
>> + int ret;
>> +
>> + rcu_read_lock();
>> + xdp_prog = rcu_dereference(tun->xdp_prog);
>
> The name generic_xdp is a bit confusing in this context given this
> is 'native' XDP, perhaps above if (generic_xdp) should have a comment
> explaining semantics for tun and how it relates to actual generic xdp
> that sits at dev->xdp_prog, and gets run from netif_rx_ni(). Or just
> name the bool xdp_handle_gso with a comment that we let the generic
> XDP infrastructure deal with non-linear skbs instead of having to
> re-implement the do_xdp_generic() internals, plus a statement that
> the actual generic XDP comes a bit later in the path. That would at
> least make it more obvious to read, imho.
Ok, since non-gso packets (e.g. jumbo packets) may go this way too,
something like "xdp_handle_skb" is better. Will send a patch.
Thanks
>
>> + if (xdp_prog) {
>> + ret = do_xdp_generic(xdp_prog, skb);
>> + if (ret != XDP_PASS) {
>> + rcu_read_unlock();
>> + return total_len;
>> + }
>> + }
>> + rcu_read_unlock();
>> + }
>> +
>> rxhash = __skb_get_hash_symmetric(skb);
>> #ifndef CONFIG_4KSTACKS
>> tun_rx_batched(tun, tfile, skb, more);
>>
>
On 2017年08月15日 00:01, Michael S. Tsirkin wrote:
> On Sat, Aug 12, 2017 at 10:48:49AM +0800, Jason Wang wrote:
>>
>> On 2017年08月12日 07:12, Jakub Kicinski wrote:
>>> On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
>>>> This patch tries to implement XDP for tun. The implementation was
>>>> split into two parts:
>>>>
>>>> - fast path: small and no gso packet. We try to do XDP at page level
>>>> before build_skb(). For XDP_TX, since creating/destroying queues
>>>> were completely under control of userspace, it was implemented
>>>> through generic XDP helper after skb has been built. This could be
>>>> optimized in the future.
>>>> - slow path: big or gso packet. We try to do it after skb was created
>>>> through generic XDP helpers.
>>>>
>>>> Test were done through pktgen with small packets.
>>>>
>>>> xdp1 test shows ~41.1% improvement:
>>>>
>>>> Before: ~1.7Mpps
>>>> After: ~2.3Mpps
>>>>
>>>> xdp_redirect to ixgbe shows ~60% improvement:
>>>>
>>>> Before: ~0.8Mpps
>>>> After: ~1.38Mpps
>>>>
>>>> Suggested-by: Michael S. Tsirkin <[email protected]>
>>>> Signed-off-by: Jason Wang <[email protected]>
>>> Looks OK to me now :)
>>>
>>> Out of curiosity, you say the build_skb() is for "small packets", and it
>>> seems you are always reserving the 256B regardless of XDP being
>>> installed. Does this have no performance impact on non-XDP case?
>> Have a test, only less than 1% were noticed which I think could be ignored.
>>
>> Thanks
> What did you test btw?
Pktgen
> The biggest issue would be with something like
> UDP with short packets.
>
Note that we do this only when sndbuf is INT_MAX, so this is probably
not an issue. The only thing that matters is more stress on the page
allocator, but according to the pktgen results the impact was small
enough to be ignored.
Thanks
On Tue, Aug 15, 2017 at 01:02:05PM +0800, Jason Wang wrote:
>
>
> On 2017年08月15日 00:01, Michael S. Tsirkin wrote:
> > On Sat, Aug 12, 2017 at 10:48:49AM +0800, Jason Wang wrote:
> > >
> > > On 2017年08月12日 07:12, Jakub Kicinski wrote:
> > > > On Fri, 11 Aug 2017 19:41:18 +0800, Jason Wang wrote:
> > > > > This patch tries to implement XDP for tun. The implementation was
> > > > > split into two parts:
> > > > >
> > > > > - fast path: small and no gso packet. We try to do XDP at page level
> > > > > before build_skb(). For XDP_TX, since creating/destroying queues
> > > > > were completely under control of userspace, it was implemented
> > > > > through generic XDP helper after skb has been built. This could be
> > > > > optimized in the future.
> > > > > - slow path: big or gso packet. We try to do it after skb was created
> > > > > through generic XDP helpers.
> > > > >
> > > > > Test were done through pktgen with small packets.
> > > > >
> > > > > xdp1 test shows ~41.1% improvement:
> > > > >
> > > > > Before: ~1.7Mpps
> > > > > After: ~2.3Mpps
> > > > >
> > > > > xdp_redirect to ixgbe shows ~60% improvement:
> > > > >
> > > > > Before: ~0.8Mpps
> > > > > After: ~1.38Mpps
> > > > >
> > > > > Suggested-by: Michael S. Tsirkin <[email protected]>
> > > > > Signed-off-by: Jason Wang <[email protected]>
> > > > Looks OK to me now :)
> > > >
> > > > Out of curiosity, you say the build_skb() is for "small packets", and it
> > > > seems you are always reserving the 256B regardless of XDP being
> > > > installed. Does this have no performance impact on non-XDP case?
> > > Have a test, only less than 1% were noticed which I think could be ignored.
> > >
> > > Thanks
> > What did you test btw?
>
> Pktgen
>
> > The biggest issue would be with something like
> > UDP with short packets.
> >
>
> Note that we do this only when sndbuf is INT_MAX. So this is probably not an
> issue.
I'd expect to see the issue for guest to host traffic when the packets
are queued at a UDP socket in the host.
> The only thing matter is more stress to page allocator, but according
> to the result of pktgen it was very small that could be ignored.
>
> Thanks
Besides guest to host, for bridging in the host a bigger truesize might
affect byte queue counts as well.
--
MST
On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
> skb in the past. This socket based method is not suitable for high
> speed userspace like virtualization which usually:
>
> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
> possible
> - don't want to be block at sendmsg()
>
> To eliminate the above overheads, this patch tries to use build_skb()
> for small packet. We will do this only when the following conditions
> are all met:
>
> - TAP instead of TUN
> - sk_sndbuf is INT_MAX
> - caller don't want to be blocked
> - zerocopy is not used
> - packet size is smaller enough to use build_skb()
>
> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>
> Before: ~1.70Mpps
> After : ~1.88Mpps
>
> What's more important, this makes it possible to implement XDP for tap
> before creating skbs.
Well well well.
You do realize that tun_build_skb() is not thread safe ?
general protection fault: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
(ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 3982 Comm: syz-executor0 Not tainted 4.13.0-rc5-next-20170815+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
task: ffff880069f265c0 task.stack: ffff880067688000
RIP: 0010:__read_once_size include/linux/compiler.h:276 [inline]
RIP: 0010:compound_head include/linux/page-flags.h:146 [inline]
RIP: 0010:put_page include/linux/mm.h:811 [inline]
RIP: 0010:__skb_frag_unref include/linux/skbuff.h:2743 [inline]
RIP: 0010:skb_release_data+0x26c/0x790 net/core/skbuff.c:568
RSP: 0018:ffff88006768ef20 EFLAGS: 00010206
RAX: 00d70cb5b39acdeb RBX: dffffc0000000000 RCX: 1ffff1000ced1e13
RDX: 0000000000000000 RSI: ffff88003ec28c38 RDI: 06b865ad9cd66f59
RBP: ffff88006768f040 R08: ffffea0000ee74a0 R09: ffffed0007ab4200
R10: 0000000000028c28 R11: 0000000000000010 R12: ffff88003c5581b0
R13: ffffed000ced1dfb R14: 1ffff1000ced1df3 R15: 06b865ad9cd66f39
FS: 00007ffbc9ef7700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000002001aff0 CR3: 000000003d623000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
skb_release_all+0x4a/0x60 net/core/skbuff.c:631
__kfree_skb net/core/skbuff.c:645 [inline]
kfree_skb+0x15d/0x4c0 net/core/skbuff.c:663
__netif_receive_skb_core+0x10f8/0x33d0 net/core/dev.c:4425
__netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4456
netif_receive_skb_internal+0x10b/0x5e0 net/core/dev.c:4527
netif_receive_skb+0xae/0x390 net/core/dev.c:4551
tun_rx_batched.isra.43+0x5e7/0x860 drivers/net/tun.c:1221
tun_get_user+0x11dd/0x2150 drivers/net/tun.c:1542
tun_chr_write_iter+0xd8/0x190 drivers/net/tun.c:1568
call_write_iter include/linux/fs.h:1742 [inline]
new_sync_write fs/read_write.c:457 [inline]
__vfs_write+0x684/0x970 fs/read_write.c:470
vfs_write+0x189/0x510 fs/read_write.c:518
SYSC_write fs/read_write.c:565 [inline]
SyS_write+0xef/0x220 fs/read_write.c:557
entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x40bab1
RSP: 002b:00007ffbc9ef6c00 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000000036 RCX: 000000000040bab1
RDX: 0000000000000036 RSI: 0000000020002000 RDI: 0000000000000003
RBP: 0000000000a5f870 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
R13: 0000000000000000 R14: 00007ffbc9ef79c0 R15: 00007ffbc9ef7700
Code: c6 e8 c9 78 8d fd 4c 89 e0 48 c1 e8 03 80 3c 18 00 0f 85 93 04 00 00 4d 8b 3c 24 41 c6 45 00 00 49 8d 7f 20 48 89 f8 48 c1 e8 03 <80> 3c 18 00 0f 85 6b 04 00 00 41 80 7d 00 00 49 8b 47 20 0f 85
RIP: __read_once_size include/linux/compiler.h:276 [inline] RSP: ffff88006768ef20
RIP: compound_head include/linux/page-flags.h:146 [inline] RSP: ffff88006768ef20
RIP: put_page include/linux/mm.h:811 [inline] RSP: ffff88006768ef20
RIP: __skb_frag_unref include/linux/skbuff.h:2743 [inline] RSP: ffff88006768ef20
RIP: skb_release_data+0x26c/0x790 net/core/skbuff.c:568 RSP: ffff88006768ef20
---[ end trace 54050eb1ec52ff83 ]---
On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
> > We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
> > skb in the past. This socket based method is not suitable for high
> > speed userspace like virtualization which usually:
> >
> > - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
> > possible
> > - don't want to be block at sendmsg()
> >
> > To eliminate the above overheads, this patch tries to use build_skb()
> > for small packet. We will do this only when the following conditions
> > are all met:
> >
> > - TAP instead of TUN
> > - sk_sndbuf is INT_MAX
> > - caller don't want to be blocked
> > - zerocopy is not used
> > - packet size is smaller enough to use build_skb()
> >
> > Pktgen from guest to host shows ~11% improvement for rx pps of tap:
> >
> > Before: ~1.70Mpps
> > After : ~1.88Mpps
> >
> > What's more important, this makes it possible to implement XDP for tap
> > before creating skbs.
>
>
> Well well well.
>
> You do realize that tun_build_skb() is not thread safe ?
The issue is alloc frag, isn't it?
I guess for now we can limit this to XDP mode only, and
just allocate full pages in that mode.
> general protection fault: 0000 [#1] SMP KASAN
> Dumping ftrace buffer:
> (ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 3982 Comm: syz-executor0 Not tainted 4.13.0-rc5-next-20170815+ #3
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: ffff880069f265c0 task.stack: ffff880067688000
> RIP: 0010:__read_once_size include/linux/compiler.h:276 [inline]
> RIP: 0010:compound_head include/linux/page-flags.h:146 [inline]
> RIP: 0010:put_page include/linux/mm.h:811 [inline]
> RIP: 0010:__skb_frag_unref include/linux/skbuff.h:2743 [inline]
> RIP: 0010:skb_release_data+0x26c/0x790 net/core/skbuff.c:568
> RSP: 0018:ffff88006768ef20 EFLAGS: 00010206
> RAX: 00d70cb5b39acdeb RBX: dffffc0000000000 RCX: 1ffff1000ced1e13
> RDX: 0000000000000000 RSI: ffff88003ec28c38 RDI: 06b865ad9cd66f59
> RBP: ffff88006768f040 R08: ffffea0000ee74a0 R09: ffffed0007ab4200
> R10: 0000000000028c28 R11: 0000000000000010 R12: ffff88003c5581b0
> R13: ffffed000ced1dfb R14: 1ffff1000ced1df3 R15: 06b865ad9cd66f39
> FS: 00007ffbc9ef7700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000000002001aff0 CR3: 000000003d623000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
> skb_release_all+0x4a/0x60 net/core/skbuff.c:631
> __kfree_skb net/core/skbuff.c:645 [inline]
> kfree_skb+0x15d/0x4c0 net/core/skbuff.c:663
> __netif_receive_skb_core+0x10f8/0x33d0 net/core/dev.c:4425
> __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4456
> netif_receive_skb_internal+0x10b/0x5e0 net/core/dev.c:4527
> netif_receive_skb+0xae/0x390 net/core/dev.c:4551
> tun_rx_batched.isra.43+0x5e7/0x860 drivers/net/tun.c:1221
> tun_get_user+0x11dd/0x2150 drivers/net/tun.c:1542
> tun_chr_write_iter+0xd8/0x190 drivers/net/tun.c:1568
> call_write_iter include/linux/fs.h:1742 [inline]
> new_sync_write fs/read_write.c:457 [inline]
> __vfs_write+0x684/0x970 fs/read_write.c:470
> vfs_write+0x189/0x510 fs/read_write.c:518
> SYSC_write fs/read_write.c:565 [inline]
> SyS_write+0xef/0x220 fs/read_write.c:557
> entry_SYSCALL_64_fastpath+0x1f/0xbe
> RIP: 0033:0x40bab1
> RSP: 002b:00007ffbc9ef6c00 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
> RAX: ffffffffffffffda RBX: 0000000000000036 RCX: 000000000040bab1
> RDX: 0000000000000036 RSI: 0000000020002000 RDI: 0000000000000003
> RBP: 0000000000a5f870 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
> R13: 0000000000000000 R14: 00007ffbc9ef79c0 R15: 00007ffbc9ef7700
> Code: c6 e8 c9 78 8d fd 4c 89 e0 48 c1 e8 03 80 3c 18 00 0f 85 93 04 00 00 4d 8b 3c 24 41 c6 45 00 00 49 8d 7f 20 48 89 f8 48 c1 e8 03 <80> 3c 18 00 0f 85 6b 04 00 00 41 80 7d 00 00 49 8b 47 20 0f 85
> RIP: __read_once_size include/linux/compiler.h:276 [inline] RSP: ffff88006768ef20
> RIP: compound_head include/linux/page-flags.h:146 [inline] RSP: ffff88006768ef20
> RIP: put_page include/linux/mm.h:811 [inline] RSP: ffff88006768ef20
> RIP: __skb_frag_unref include/linux/skbuff.h:2743 [inline] RSP: ffff88006768ef20
> RIP: skb_release_data+0x26c/0x790 net/core/skbuff.c:568 RSP: ffff88006768ef20
> ---[ end trace 54050eb1ec52ff83 ]---
On 2017年08月16日 11:45, Eric Dumazet wrote:
> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
>> skb in the past. This socket based method is not suitable for high
>> speed userspace like virtualization which usually:
>>
>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>> possible
>> - don't want to be block at sendmsg()
>>
>> To eliminate the above overheads, this patch tries to use build_skb()
>> for small packet. We will do this only when the following conditions
>> are all met:
>>
>> - TAP instead of TUN
>> - sk_sndbuf is INT_MAX
>> - caller don't want to be blocked
>> - zerocopy is not used
>> - packet size is smaller enough to use build_skb()
>>
>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>
>> Before: ~1.70Mpps
>> After : ~1.88Mpps
>>
>> What's more important, this makes it possible to implement XDP for tap
>> before creating skbs.
>
> Well well well.
>
> You do realize that tun_build_skb() is not thread safe ?
Ok, I think the issue is in skb_page_frag_refill(); it probably needs a
spinlock. Will prepare a patch.
Thanks
>
> general protection fault: 0000 [#1] SMP KASAN
> Dumping ftrace buffer:
> (ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 3982 Comm: syz-executor0 Not tainted 4.13.0-rc5-next-20170815+ #3
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: ffff880069f265c0 task.stack: ffff880067688000
> RIP: 0010:__read_once_size include/linux/compiler.h:276 [inline]
> RIP: 0010:compound_head include/linux/page-flags.h:146 [inline]
> RIP: 0010:put_page include/linux/mm.h:811 [inline]
> RIP: 0010:__skb_frag_unref include/linux/skbuff.h:2743 [inline]
> RIP: 0010:skb_release_data+0x26c/0x790 net/core/skbuff.c:568
> RSP: 0018:ffff88006768ef20 EFLAGS: 00010206
> RAX: 00d70cb5b39acdeb RBX: dffffc0000000000 RCX: 1ffff1000ced1e13
> RDX: 0000000000000000 RSI: ffff88003ec28c38 RDI: 06b865ad9cd66f59
> RBP: ffff88006768f040 R08: ffffea0000ee74a0 R09: ffffed0007ab4200
> R10: 0000000000028c28 R11: 0000000000000010 R12: ffff88003c5581b0
> R13: ffffed000ced1dfb R14: 1ffff1000ced1df3 R15: 06b865ad9cd66f39
> FS: 00007ffbc9ef7700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000000002001aff0 CR3: 000000003d623000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
> skb_release_all+0x4a/0x60 net/core/skbuff.c:631
> __kfree_skb net/core/skbuff.c:645 [inline]
> kfree_skb+0x15d/0x4c0 net/core/skbuff.c:663
> __netif_receive_skb_core+0x10f8/0x33d0 net/core/dev.c:4425
> __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4456
> netif_receive_skb_internal+0x10b/0x5e0 net/core/dev.c:4527
> netif_receive_skb+0xae/0x390 net/core/dev.c:4551
> tun_rx_batched.isra.43+0x5e7/0x860 drivers/net/tun.c:1221
> tun_get_user+0x11dd/0x2150 drivers/net/tun.c:1542
> tun_chr_write_iter+0xd8/0x190 drivers/net/tun.c:1568
> call_write_iter include/linux/fs.h:1742 [inline]
> new_sync_write fs/read_write.c:457 [inline]
> __vfs_write+0x684/0x970 fs/read_write.c:470
> vfs_write+0x189/0x510 fs/read_write.c:518
> SYSC_write fs/read_write.c:565 [inline]
> SyS_write+0xef/0x220 fs/read_write.c:557
> entry_SYSCALL_64_fastpath+0x1f/0xbe
> RIP: 0033:0x40bab1
> RSP: 002b:00007ffbc9ef6c00 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
> RAX: ffffffffffffffda RBX: 0000000000000036 RCX: 000000000040bab1
> RDX: 0000000000000036 RSI: 0000000020002000 RDI: 0000000000000003
> RBP: 0000000000a5f870 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
> R13: 0000000000000000 R14: 00007ffbc9ef79c0 R15: 00007ffbc9ef7700
> Code: c6 e8 c9 78 8d fd 4c 89 e0 48 c1 e8 03 80 3c 18 00 0f 85 93 04 00 00 4d 8b 3c 24 41 c6 45 00 00 49 8d 7f 20 48 89 f8 48 c1 e8 03 <80> 3c 18 00 0f 85 6b 04 00 00 41 80 7d 00 00 49 8b 47 20 0f 85
> RIP: __read_once_size include/linux/compiler.h:276 [inline] RSP: ffff88006768ef20
> RIP: compound_head include/linux/page-flags.h:146 [inline] RSP: ffff88006768ef20
> RIP: put_page include/linux/mm.h:811 [inline] RSP: ffff88006768ef20
> RIP: __skb_frag_unref include/linux/skbuff.h:2743 [inline] RSP: ffff88006768ef20
> RIP: skb_release_data+0x26c/0x790 net/core/skbuff.c:568 RSP: ffff88006768ef20
> ---[ end trace 54050eb1ec52ff83 ]---
>
On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
> On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
>> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
>>> skb in the past. This socket based method is not suitable for high
>>> speed userspace like virtualization which usually:
>>>
>>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>>> possible
>>> - don't want to be block at sendmsg()
>>>
>>> To eliminate the above overheads, this patch tries to use build_skb()
>>> for small packet. We will do this only when the following conditions
>>> are all met:
>>>
>>> - TAP instead of TUN
>>> - sk_sndbuf is INT_MAX
>>> - caller don't want to be blocked
>>> - zerocopy is not used
>>> - packet size is smaller enough to use build_skb()
>>>
>>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>>
>>> Before: ~1.70Mpps
>>> After : ~1.88Mpps
>>>
>>> What's more important, this makes it possible to implement XDP for tap
>>> before creating skbs.
>> Well well well.
>>
>> You do realize that tun_build_skb() is not thread safe ?
> The issue is alloc frag, isn't it?
> I guess for now we can limit this to XDP mode only, and
> just allocate full pages in that mode.
>
>
Limiting this to XDP mode only does not prevent users from sending
packets to the same queue in parallel, I think?
Thanks
On Wed, Aug 16, 2017 at 11:57:51AM +0800, Jason Wang wrote:
>
>
> On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
> > On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
> > > On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
> > > > We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
> > > > skb in the past. This socket based method is not suitable for high
> > > > speed userspace like virtualization which usually:
> > > >
> > > > - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
> > > > possible
> > > > - don't want to be block at sendmsg()
> > > >
> > > > To eliminate the above overheads, this patch tries to use build_skb()
> > > > for small packet. We will do this only when the following conditions
> > > > are all met:
> > > >
> > > > - TAP instead of TUN
> > > > - sk_sndbuf is INT_MAX
> > > > - caller don't want to be blocked
> > > > - zerocopy is not used
> > > > - packet size is smaller enough to use build_skb()
> > > >
> > > > Pktgen from guest to host shows ~11% improvement for rx pps of tap:
> > > >
> > > > Before: ~1.70Mpps
> > > > After : ~1.88Mpps
> > > >
> > > > What's more important, this makes it possible to implement XDP for tap
> > > > before creating skbs.
> > > Well well well.
> > >
> > > You do realize that tun_build_skb() is not thread safe ?
> > The issue is alloc frag, isn't it?
> > I guess for now we can limit this to XDP mode only, and
> > just allocate full pages in that mode.
> >
> >
>
> Limit this to XDP mode only does not prevent user from sending packets to
> same queue in parallel I think?
>
> Thanks
Yes but then you can just drop the page frag allocator since
XDP is assumed not to care about truesize for most packets.
--
MST
On 2017年08月16日 11:59, Michael S. Tsirkin wrote:
> On Wed, Aug 16, 2017 at 11:57:51AM +0800, Jason Wang wrote:
>>
>> On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
>>> On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
>>>> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>>>>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
>>>>> skb in the past. This socket based method is not suitable for high
>>>>> speed userspace like virtualization which usually:
>>>>>
>>>>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>>>>> possible
>>>>> - don't want to be block at sendmsg()
>>>>>
>>>>> To eliminate the above overheads, this patch tries to use build_skb()
>>>>> for small packet. We will do this only when the following conditions
>>>>> are all met:
>>>>>
>>>>> - TAP instead of TUN
>>>>> - sk_sndbuf is INT_MAX
>>>>> - caller don't want to be blocked
>>>>> - zerocopy is not used
>>>>> - packet size is smaller enough to use build_skb()
>>>>>
>>>>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>>>>
>>>>> Before: ~1.70Mpps
>>>>> After : ~1.88Mpps
>>>>>
>>>>> What's more important, this makes it possible to implement XDP for tap
>>>>> before creating skbs.
>>>> Well well well.
>>>>
>>>> You do realize that tun_build_skb() is not thread safe ?
>>> The issue is alloc frag, isn't it?
>>> I guess for now we can limit this to XDP mode only, and
>>> just allocate full pages in that mode.
>>>
>>>
>> Limit this to XDP mode only does not prevent user from sending packets to
>> same queue in parallel I think?
>>
>> Thanks
> Yes but then you can just drop the page frag allocator since
> XDP is assumed not to care about truesize for most packets.
>
Ok, let me do some tests to see the numbers for the two methods first.
Thanks
On 2017年08月16日 12:07, Jason Wang wrote:
>
>
> On 2017年08月16日 11:59, Michael S. Tsirkin wrote:
>> On Wed, Aug 16, 2017 at 11:57:51AM +0800, Jason Wang wrote:
>>>
>>> On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
>>>> On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
>>>>> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>>>>>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to
>>>>>> allocate
>>>>>> skb in the past. This socket based method is not suitable for high
>>>>>> speed userspace like virtualization which usually:
>>>>>>
>>>>>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as
>>>>>> fast as
>>>>>> possible
>>>>>> - don't want to be block at sendmsg()
>>>>>>
>>>>>> To eliminate the above overheads, this patch tries to use
>>>>>> build_skb()
>>>>>> for small packet. We will do this only when the following conditions
>>>>>> are all met:
>>>>>>
>>>>>> - TAP instead of TUN
>>>>>> - sk_sndbuf is INT_MAX
>>>>>> - caller don't want to be blocked
>>>>>> - zerocopy is not used
>>>>>> - packet size is smaller enough to use build_skb()
>>>>>>
>>>>>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>>>>>
>>>>>> Before: ~1.70Mpps
>>>>>> After : ~1.88Mpps
>>>>>>
>>>>>> What's more important, this makes it possible to implement XDP
>>>>>> for tap
>>>>>> before creating skbs.
>>>>> Well well well.
>>>>>
>>>>> You do realize that tun_build_skb() is not thread safe ?
>>>> The issue is alloc frag, isn't it?
>>>> I guess for now we can limit this to XDP mode only, and
>>>> just allocate full pages in that mode.
>>>>
>>>>
>>> Limit this to XDP mode only does not prevent user from sending
>>> packets to
>>> same queue in parallel I think?
>>>
>>> Thanks
>> Yes but then you can just drop the page frag allocator since
>> XDP is assumed not to care about truesize for most packets.
>>
>
> Ok, let me do some test to see the numbers between the two methods first.
>
> Thanks
It looks like full page allocation just produces too much stress on the
page allocator.
I get 1.58Mpps (full page) vs 1.95Mpps (page frag) with the patches
attached.
Since the non-XDP case can also benefit from build_skb(), I tend to use
a spinlock instead of full pages in this case.
Thanks
On Wed, 2017-08-16 at 11:55 +0800, Jason Wang wrote:
>
> On 2017年08月16日 11:45, Eric Dumazet wrote:
> >
> > You do realize that tun_build_skb() is not thread safe ?
>
> Ok, I think the issue if skb_page_frag_refill(), need a spinlock
> probably. Will prepare a patch.
But since tun is used from process context, why don't you use the
per-thread generator (no lock involved)?
tcp_sendmsg() uses this for GFP_KERNEL allocations.
Untested patch :
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5892284eb8d05b0678d820bad3d0d2c61a879aeb..c38cd840cc0b7fecf182b23976e36f709cacca1f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -175,7 +175,6 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
- struct page_frag alloc_frag;
};
struct tun_flow_entry {
@@ -578,8 +577,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
}
if (tun)
skb_array_cleanup(&tfile->tx_array);
- if (tfile->alloc_frag.page)
- put_page(tfile->alloc_frag.page);
sock_put(&tfile->sk);
}
}
@@ -1272,7 +1269,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
struct virtio_net_hdr *hdr,
int len, int *generic_xdp)
{
- struct page_frag *alloc_frag = &tfile->alloc_frag;
+ struct page_frag *alloc_frag = &current->task_frag;
struct sk_buff *skb;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
@@ -2580,8 +2577,6 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->sk.sk_write_space = tun_sock_write_space;
tfile->sk.sk_sndbuf = INT_MAX;
- tfile->alloc_frag.page = NULL;
-
file->private_data = tfile;
INIT_LIST_HEAD(&tfile->next);
On 2017年08月16日 18:24, Eric Dumazet wrote:
> On Wed, 2017-08-16 at 11:55 +0800, Jason Wang wrote:
>> On 2017年08月16日 11:45, Eric Dumazet wrote:
>>> You do realize that tun_build_skb() is not thread safe ?
>> Ok, I think the issue if skb_page_frag_refill(), need a spinlock
>> probably. Will prepare a patch.
> But since tun is used from process context, why don't you use the
> per-thread generator (no lock involved)
Haven't noticed this before.
>
> tcp_sendmsg() uses this for GFP_KERNEL allocations.
>
> Untested patch :
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 5892284eb8d05b0678d820bad3d0d2c61a879aeb..c38cd840cc0b7fecf182b23976e36f709cacca1f 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -175,7 +175,6 @@ struct tun_file {
> struct list_head next;
> struct tun_struct *detached;
> struct skb_array tx_array;
> - struct page_frag alloc_frag;
> };
>
> struct tun_flow_entry {
> @@ -578,8 +577,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
> }
> if (tun)
> skb_array_cleanup(&tfile->tx_array);
> - if (tfile->alloc_frag.page)
> - put_page(tfile->alloc_frag.page);
> sock_put(&tfile->sk);
> }
> }
> @@ -1272,7 +1269,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
> struct virtio_net_hdr *hdr,
> int len, int *generic_xdp)
> {
> - struct page_frag *alloc_frag = &tfile->alloc_frag;
> + struct page_frag *alloc_frag = &current->task_frag;
> struct sk_buff *skb;
> struct bpf_prog *xdp_prog;
> int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
> @@ -2580,8 +2577,6 @@ static int tun_chr_open(struct inode *inode, struct file * file)
> tfile->sk.sk_write_space = tun_sock_write_space;
> tfile->sk.sk_sndbuf = INT_MAX;
>
> - tfile->alloc_frag.page = NULL;
> -
> file->private_data = tfile;
> INIT_LIST_HEAD(&tfile->next);
>
>
>
>
>
Tested-by: Jason Wang <[email protected]>
Acked-by: Jason Wang <[email protected]>
From: Jason Wang <[email protected]>
Date: Wed, 16 Aug 2017 17:17:45 +0800
> It looks like full page allocation just produce too much stress on the
> page allocator.
>
> I get 1.58Mpps (full page) vs 1.95Mpps (page frag) with the patches
> attached.
Yes, this is why drivers doing XDP tend to drift towards implementing
a local cache of pages.