2017-07-27 09:25:42

by Jason Wang

[permalink] [raw]
Subject: [PATCH net-next 0/3] XDP support for tap

Hi all:

This series tries to implement XDP support for tap. Two path were
implemented:

- fast path: small & non-gso packet, For performance reason we do it
at page level and use build_skb() to create skb if necessary.
- slow path: big or gso packet, we don't want to lose the capability
compared to generic XDP, so we export some generic xdp helpers and
do it after skb was created.

xdp1 shows about 47% improvement.

Please review.

Jason Wang (3):
tap: use build_skb() for small packet
net: export some generic xdp helpers
tap: XDP support

drivers/net/tun.c | 247 ++++++++++++++++++++++++++++++++++++++++++----
include/linux/netdevice.h | 2 +
net/core/dev.c | 14 +--
3 files changed, 236 insertions(+), 27 deletions(-)

--
2.7.4


2017-07-27 09:25:47

by Jason Wang

[permalink] [raw]
Subject: [PATCH net-next 1/3] tap: use build_skb() for small packet

We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
skb in the past. This socket based method is not suitable for high
speed userspace like virtualization which usually:

- ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
possible
- don't want to be block at sendmsg()

To eliminate the above overheads, this patch tries to use build_skb()
for small packet. We will do this only when the following conditions
are all met:

- TAP instead of TUN
- sk_sndbuf is INT_MAX
- caller don't want to be blocked
- zerocopy is not used
- packet size is smaller enough to use build_skb()

Pktgen from guest to host shows ~15% improvement for rx pps of tap:

Before: ~1.9Mpps
After : ~2.2Mpps

What's more important, this makes it possible to implement XDP for tap
before creating skbs.

Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 112 ++++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 91 insertions(+), 21 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a93392d..de1a2ef 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -105,6 +105,8 @@ do { \
} while (0)
#endif

+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
/* TUN device flags */

/* IFF_ATTACH_QUEUE is never stored in device flags,
@@ -170,6 +172,7 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
+ struct page_frag alloc_frag;
};

struct tun_flow_entry {
@@ -571,6 +574,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
}
if (tun)
skb_array_cleanup(&tfile->tx_array);
+ if (tfile->alloc_frag.page)
+ put_page(tfile->alloc_frag.page);
sock_put(&tfile->sk);
}
}
@@ -1190,6 +1195,61 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
}
}

+static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
+ int len, int noblock, bool zerocopy)
+{
+ if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+ return false;
+
+ if (tfile->socket.sk->sk_sndbuf != INT_MAX)
+ return false;
+
+ if (!noblock)
+ return false;
+
+ if (zerocopy)
+ return false;
+
+ if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+ return false;
+
+ return true;
+}
+
+static struct sk_buff *tun_build_skb(struct tun_file *tfile,
+ struct iov_iter *from,
+ int len)
+{
+ struct page_frag *alloc_frag = &tfile->alloc_frag;
+ struct sk_buff *skb;
+ int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ char *buf;
+ size_t copied;
+
+ if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+ return ERR_PTR(-ENOMEM);
+
+ buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+ copied = copy_page_from_iter(alloc_frag->page,
+ alloc_frag->offset + TUN_RX_PAD,
+ len, from);
+ if (copied != len)
+ return ERR_PTR(-EFAULT);
+
+ skb = build_skb(buf, buflen);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ skb_reserve(skb, TUN_RX_PAD);
+ skb_put(skb, len);
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+
+ return skb;
+}
+
/* Get packet from user space buffer */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
void *msg_control, struct iov_iter *from,
@@ -1263,30 +1323,38 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
zerocopy = true;
}

- if (!zerocopy) {
- copylen = len;
- if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
- linear = good_linear;
- else
- linear = tun16_to_cpu(tun, gso.hdr_len);
- }
-
- skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
- if (IS_ERR(skb)) {
- if (PTR_ERR(skb) != -EAGAIN)
+ if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+ skb = tun_build_skb(tfile, from, len);
+ if (IS_ERR(skb)) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
- return PTR_ERR(skb);
- }
+ return PTR_ERR(skb);
+ }
+ } else {
+ if (!zerocopy) {
+ copylen = len;
+ if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
+ linear = good_linear;
+ else
+ linear = tun16_to_cpu(tun, gso.hdr_len);
+ }

- if (zerocopy)
- err = zerocopy_sg_from_iter(skb, from);
- else
- err = skb_copy_datagram_from_iter(skb, 0, from, len);
+ skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+ if (IS_ERR(skb)) {
+ if (PTR_ERR(skb) != -EAGAIN)
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ return PTR_ERR(skb);
+ }

- if (err) {
- this_cpu_inc(tun->pcpu_stats->rx_dropped);
- kfree_skb(skb);
- return -EFAULT;
+ if (zerocopy)
+ err = zerocopy_sg_from_iter(skb, from);
+ else
+ err = skb_copy_datagram_from_iter(skb, 0, from, len);
+
+ if (err) {
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
}

if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
@@ -2377,6 +2445,8 @@ static int tun_chr_open(struct inode *inode, struct file * file)
tfile->sk.sk_write_space = tun_sock_write_space;
tfile->sk.sk_sndbuf = INT_MAX;

+ tfile->alloc_frag.page = NULL;
+
file->private_data = tfile;
INIT_LIST_HEAD(&tfile->next);

--
2.7.4

2017-07-27 09:25:51

by Jason Wang

[permalink] [raw]
Subject: [PATCH net-next 2/3] net: export some generic xdp helpers

This patch tries to export some generic xdp helpers to drivers. This
can let driver to do XDP for a specific skb. This is useful for the
case when the packet is hard to be processed at page level directly
(e.g jumbo/GSO frame).

With this patch, there's no need for driver to forbid the XDP set when
configuration is not suitable. Instead, it can defer the XDP for
packets that is hard to be processed directly after skb is created.

Signed-off-by: Jason Wang <[email protected]>
---
include/linux/netdevice.h | 2 ++
net/core/dev.c | 14 ++++++++------
2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3a3cdc1..a736b7c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3261,6 +3261,8 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
}

+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
int netif_rx(struct sk_buff *skb);
int netif_rx_ni(struct sk_buff *skb);
int netif_receive_skb(struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 8ea6b4b..eb0d937 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3920,7 +3920,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
/* When doing generic XDP we have to bypass the qdisc layer and the
* network taps in order to match in-driver-XDP behavior.
*/
-static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -3941,13 +3941,12 @@ static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
kfree_skb(skb);
}
}
+EXPORT_SYMBOL_GPL(generic_xdp_tx);

static struct static_key generic_xdp_needed __read_mostly;

-static int do_xdp_generic(struct sk_buff *skb)
+int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
{
- struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog);
-
if (xdp_prog) {
u32 act = netif_receive_generic_xdp(skb, xdp_prog);
int err;
@@ -3972,6 +3971,7 @@ static int do_xdp_generic(struct sk_buff *skb)
kfree_skb(skb);
return XDP_DROP;
}
+EXPORT_SYMBOL_GPL(do_xdp_generic);

static int netif_rx_internal(struct sk_buff *skb)
{
@@ -3982,7 +3982,8 @@ static int netif_rx_internal(struct sk_buff *skb)
trace_netif_rx(skb);

if (static_key_false(&generic_xdp_needed)) {
- int ret = do_xdp_generic(skb);
+ int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ skb);

/* Consider XDP consuming the packet a success from
* the netdev point of view we do not want to count
@@ -4503,7 +4504,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
rcu_read_lock();

if (static_key_false(&generic_xdp_needed)) {
- int ret = do_xdp_generic(skb);
+ int ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ skb);

if (ret != XDP_PASS) {
rcu_read_unlock();
--
2.7.4

2017-07-27 09:25:58

by Jason Wang

[permalink] [raw]
Subject: [PATCH net-next 3/3] tap: XDP support

This patch tries to implement XDP for tun. The implementation was
split into two parts:

- fast path: small and no gso packet. We try to do XDP at page level
before build_skb(). For XDP_TX, since creating/destroying queues
were completely under control of userspace, it was implemented
through generic XDP helper after skb has been built. This could be
optimized in the future.
- slow path: big or gso packet. We try to do it after skb was created
through generic XDP helpers.

XDP_REDIRECT was not implemented, it could be done on top.

xdp1 test shows 47.6% improvement:

Before: ~2.1Mpps
After: ~3.1Mpps

Suggested-by: Michael S. Tsirkin <[email protected]>
Signed-off-by: Jason Wang <[email protected]>
---
drivers/net/tun.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 142 insertions(+), 7 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index de1a2ef..13abe23 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -73,6 +73,8 @@
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>

#include <linux/uaccess.h>

@@ -105,7 +107,8 @@ do { \
} while (0)
#endif

-#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+#define TUN_HEADROOM 256
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD + TUN_HEADROOM)

/* TUN device flags */

@@ -224,6 +227,7 @@ struct tun_struct {
u32 flow_count;
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
+ struct bpf_prog __rcu *xdp_prog;
};

#ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -590,6 +594,7 @@ static void tun_detach(struct tun_file *tfile, bool clean)
static void tun_detach_all(struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
+ struct bpf_prog *xdp_prog = rtnl_dereference(tun->xdp_prog);
struct tun_file *tfile, *tmp;
int i, n = tun->numqueues;

@@ -622,6 +627,9 @@ static void tun_detach_all(struct net_device *dev)
}
BUG_ON(tun->numdisabled != 0);

+ if (xdp_prog)
+ bpf_prog_put(xdp_prog);
+
if (tun->flags & IFF_PERSIST)
module_put(THIS_MODULE);
}
@@ -1008,6 +1016,56 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
stats->tx_dropped = tx_dropped;
}

+static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+ struct netlink_ext_ack *extack)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ struct bpf_prog *old_prog;
+
+ /* We will shift the packet that can't be handled to generic
+ * XDP layer.
+ */
+
+ old_prog = rtnl_dereference(tun->xdp_prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ rcu_assign_pointer(tun->xdp_prog, prog);
+
+ if (prog) {
+ prog = bpf_prog_add(prog, 1);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ }
+
+ return 0;
+}
+
+static u32 tun_xdp_query(struct net_device *dev)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ const struct bpf_prog *xdp_prog;
+
+ xdp_prog = rtnl_dereference(tun->xdp_prog);
+ if (xdp_prog)
+ return xdp_prog->aux->id;
+
+ return 0;
+}
+
+static int tun_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+ switch (xdp->command) {
+ case XDP_SETUP_PROG:
+ return tun_xdp_set(dev, xdp->prog, xdp->extack);
+ case XDP_QUERY_PROG:
+ xdp->prog_id = tun_xdp_query(dev);
+ xdp->prog_attached = !!xdp->prog_id;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
@@ -1038,6 +1096,7 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = tun_set_headroom,
.ndo_get_stats64 = tun_net_get_stats64,
+ .ndo_xdp = tun_xdp,
};

static void tun_flow_init(struct tun_struct *tun)
@@ -1217,16 +1276,21 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
return true;
}

-static struct sk_buff *tun_build_skb(struct tun_file *tfile,
+static struct sk_buff *tun_build_skb(struct tun_struct *tun,
+ struct tun_file *tfile,
struct iov_iter *from,
- int len)
+ struct virtio_net_hdr *hdr,
+ int len, int *generic_xdp)
{
struct page_frag *alloc_frag = &tfile->alloc_frag;
struct sk_buff *skb;
+ struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ unsigned int delta = 0;
char *buf;
size_t copied;
+ bool xdp_xmit = false;

if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
return ERR_PTR(-ENOMEM);
@@ -1238,16 +1302,68 @@ static struct sk_buff *tun_build_skb(struct tun_file *tfile,
if (copied != len)
return ERR_PTR(-EFAULT);

+ if (hdr->gso_type)
+ *generic_xdp = 1;
+ else
+ *generic_xdp = 0;
+
+ rcu_read_lock();
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog && !*generic_xdp) {
+ struct xdp_buff xdp;
+ void *orig_data;
+ u32 act;
+
+ xdp.data_hard_start = buf;
+ xdp.data = buf + TUN_RX_PAD;
+ xdp.data_end = xdp.data + len;
+ orig_data = xdp.data;
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+ switch (act) {
+ case XDP_TX:
+ xdp_xmit = true;
+ /* fall through */
+ case XDP_PASS:
+ delta = orig_data - xdp.data;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(act);
+ /* fall through */
+ case XDP_ABORTED:
+ trace_xdp_exception(tun->dev, xdp_prog, act);
+ /* fall through */
+ case XDP_DROP:
+ goto err_xdp;
+ }
+ }
+
skb = build_skb(buf, buflen);
- if (!skb)
+ if (!skb) {
+ rcu_read_unlock();
return ERR_PTR(-ENOMEM);
+ }

- skb_reserve(skb, TUN_RX_PAD);
- skb_put(skb, len);
+ skb_reserve(skb, TUN_RX_PAD - delta);
+ skb_put(skb, len + delta);
get_page(alloc_frag->page);
alloc_frag->offset += buflen;

+ if (xdp_xmit) {
+ skb->dev = tun->dev;
+ generic_xdp_tx(skb, xdp_prog);
+ rcu_read_lock();
+ return NULL;
+ }
+
+ rcu_read_unlock();
+
return skb;
+
+err_xdp:
+ rcu_read_unlock();
+ this_cpu_inc(tun->pcpu_stats->rx_dropped);
+ return NULL;
}

/* Get packet from user space buffer */
@@ -1266,6 +1382,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
bool zerocopy = false;
int err;
u32 rxhash;
+ int generic_xdp = 1;

if (!(tun->dev->flags & IFF_UP))
return -EIO;
@@ -1324,11 +1441,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
}

if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
- skb = tun_build_skb(tfile, from, len);
+ skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
if (IS_ERR(skb)) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
return PTR_ERR(skb);
}
+ if (!skb)
+ return total_len;
} else {
if (!zerocopy) {
copylen = len;
@@ -1402,6 +1521,22 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
skb_reset_network_header(skb);
skb_probe_transport_header(skb, 0);

+ if (generic_xdp) {
+ struct bpf_prog *xdp_prog;
+ int ret;
+
+ rcu_read_lock();
+ xdp_prog = rcu_dereference(tun->xdp_prog);
+ if (xdp_prog) {
+ ret = do_xdp_generic(xdp_prog, skb);
+ if (ret != XDP_PASS) {
+ rcu_read_unlock();
+ return total_len;
+ }
+ }
+ rcu_read_unlock();
+ }
+
rxhash = __skb_get_hash_symmetric(skb);
#ifndef CONFIG_4KSTACKS
tun_rx_batched(tun, tfile, skb, more);
--
2.7.4

2017-07-28 03:20:25

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] tap: XDP support

On Thu, 27 Jul 2017 17:25:33 +0800, Jason Wang wrote:
> This patch tries to implement XDP for tun. The implementation was
> split into two parts:
>
> - fast path: small and no gso packet. We try to do XDP at page level
> before build_skb(). For XDP_TX, since creating/destroying queues
> were completely under control of userspace, it was implemented
> through generic XDP helper after skb has been built. This could be
> optimized in the future.
> - slow path: big or gso packet. We try to do it after skb was created
> through generic XDP helpers.
>
> XDP_REDIRECT was not implemented, it could be done on top.
>
> xdp1 test shows 47.6% improvement:
>
> Before: ~2.1Mpps
> After: ~3.1Mpps
>
> Suggested-by: Michael S. Tsirkin <[email protected]>
> Signed-off-by: Jason Wang <[email protected]>

> @@ -1008,6 +1016,56 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
> stats->tx_dropped = tx_dropped;
> }
>
> +static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
> + struct netlink_ext_ack *extack)
> +{
> + struct tun_struct *tun = netdev_priv(dev);
> + struct bpf_prog *old_prog;
> +
> + /* We will shift the packet that can't be handled to generic
> + * XDP layer.
> + */
> +
> + old_prog = rtnl_dereference(tun->xdp_prog);
> + if (old_prog)
> + bpf_prog_put(old_prog);
> + rcu_assign_pointer(tun->xdp_prog, prog);

Is this OK? Could this lead to the program getting freed and then
datapath accessing a stale pointer? I mean in the scenario where the
process gets pre-empted between the bpf_prog_put() and
rcu_assign_pointer()?

> + if (prog) {
> + prog = bpf_prog_add(prog, 1);
> + if (IS_ERR(prog))
> + return PTR_ERR(prog);
> + }

I don't think you need this extra reference here. dev_change_xdp_fd()
will call bpf_prog_get_type() which means driver gets the program with
a reference already taken, drivers does have to free that reference when
program is removed (or device is freed, as you correctly do).

> + return 0;
> +}
> +

2017-07-28 03:29:01

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] tap: XDP support



On 2017年07月28日 11:13, Jakub Kicinski wrote:
> On Thu, 27 Jul 2017 17:25:33 +0800, Jason Wang wrote:
>> This patch tries to implement XDP for tun. The implementation was
>> split into two parts:
>>
>> - fast path: small and no gso packet. We try to do XDP at page level
>> before build_skb(). For XDP_TX, since creating/destroying queues
>> were completely under control of userspace, it was implemented
>> through generic XDP helper after skb has been built. This could be
>> optimized in the future.
>> - slow path: big or gso packet. We try to do it after skb was created
>> through generic XDP helpers.
>>
>> XDP_REDIRECT was not implemented, it could be done on top.
>>
>> xdp1 test shows 47.6% improvement:
>>
>> Before: ~2.1Mpps
>> After: ~3.1Mpps
>>
>> Suggested-by: Michael S. Tsirkin <[email protected]>
>> Signed-off-by: Jason Wang <[email protected]>
>> @@ -1008,6 +1016,56 @@ tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
>> stats->tx_dropped = tx_dropped;
>> }
>>
>> +static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>> + struct netlink_ext_ack *extack)
>> +{
>> + struct tun_struct *tun = netdev_priv(dev);
>> + struct bpf_prog *old_prog;
>> +
>> + /* We will shift the packet that can't be handled to generic
>> + * XDP layer.
>> + */
>> +
>> + old_prog = rtnl_dereference(tun->xdp_prog);
>> + if (old_prog)
>> + bpf_prog_put(old_prog);
>> + rcu_assign_pointer(tun->xdp_prog, prog);
> Is this OK? Could this lead to the program getting freed and then
> datapath accessing a stale pointer? I mean in the scenario where the
> process gets pre-empted between the bpf_prog_put() and
> rcu_assign_pointer()?

Will call bpf_prog_put() after rcu_assign_pointer().

>
>> + if (prog) {
>> + prog = bpf_prog_add(prog, 1);
>> + if (IS_ERR(prog))
>> + return PTR_ERR(prog);
>> + }
> I don't think you need this extra reference here. dev_change_xdp_fd()
> will call bpf_prog_get_type() which means driver gets the program with
> a reference already taken, drivers does have to free that reference when
> program is removed (or device is freed, as you correctly do).

I see, will drop this in next version.

Thanks.

>
>> + return 0;
>> +}
>> +

2017-07-28 03:46:46

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] tap: XDP support

On Fri, Jul 28, 2017 at 11:28:54AM +0800, Jason Wang wrote:
> > > + old_prog = rtnl_dereference(tun->xdp_prog);
> > > + if (old_prog)
> > > + bpf_prog_put(old_prog);
> > > + rcu_assign_pointer(tun->xdp_prog, prog);
> > Is this OK? Could this lead to the program getting freed and then
> > datapath accessing a stale pointer? I mean in the scenario where the
> > process gets pre-empted between the bpf_prog_put() and
> > rcu_assign_pointer()?
>
> Will call bpf_prog_put() after rcu_assign_pointer().

I suspect you need to sync RCU or something before that.

2017-07-28 03:50:54

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] tap: XDP support



On 2017年07月28日 11:46, Michael S. Tsirkin wrote:
> On Fri, Jul 28, 2017 at 11:28:54AM +0800, Jason Wang wrote:
>>>> + old_prog = rtnl_dereference(tun->xdp_prog);
>>>> + if (old_prog)
>>>> + bpf_prog_put(old_prog);
>>>> + rcu_assign_pointer(tun->xdp_prog, prog);
>>> Is this OK? Could this lead to the program getting freed and then
>>> datapath accessing a stale pointer? I mean in the scenario where the
>>> process gets pre-empted between the bpf_prog_put() and
>>> rcu_assign_pointer()?
>> Will call bpf_prog_put() after rcu_assign_pointer().
> I suspect you need to sync RCU or something before that.

__bpf_prog_put() will do call_rcu(), so looks like it was ok.

Thanks

2017-07-28 03:51:50

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] tap: XDP support

On Fri, 28 Jul 2017 06:46:40 +0300, Michael S. Tsirkin wrote:
> On Fri, Jul 28, 2017 at 11:28:54AM +0800, Jason Wang wrote:
> > > > + old_prog = rtnl_dereference(tun->xdp_prog);
> > > > + if (old_prog)
> > > > + bpf_prog_put(old_prog);
> > > > + rcu_assign_pointer(tun->xdp_prog, prog);
> > > Is this OK? Could this lead to the program getting freed and then
> > > datapath accessing a stale pointer? I mean in the scenario where the
> > > process gets pre-empted between the bpf_prog_put() and
> > > rcu_assign_pointer()?
> >
> > Will call bpf_prog_put() after rcu_assign_pointer().
>
> I suspect you need to sync RCU or something before that.

I think the bpf_prog_put() will use call_rcu() to do the actual free:

static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
trace_bpf_prog_put_rcu(prog);
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
}

It's just that we are only under the rtnl here, RCU lock is not held, so
grace period may elapse between bpf_prog_put() and rcu_assign_pointer().

2017-07-28 04:05:06

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] tap: XDP support

On Fri, Jul 28, 2017 at 11:50:45AM +0800, Jason Wang wrote:
>
>
> On 2017年07月28日 11:46, Michael S. Tsirkin wrote:
> > On Fri, Jul 28, 2017 at 11:28:54AM +0800, Jason Wang wrote:
> > > > > + old_prog = rtnl_dereference(tun->xdp_prog);
> > > > > + if (old_prog)
> > > > > + bpf_prog_put(old_prog);
> > > > > + rcu_assign_pointer(tun->xdp_prog, prog);
> > > > Is this OK? Could this lead to the program getting freed and then
> > > > datapath accessing a stale pointer? I mean in the scenario where the
> > > > process gets pre-empted between the bpf_prog_put() and
> > > > rcu_assign_pointer()?
> > > Will call bpf_prog_put() after rcu_assign_pointer().
> > I suspect you need to sync RCU or something before that.
>
> __bpf_prog_put() will do call_rcu(), so looks like it was ok.
>
> Thanks

True - I missed that.

--
MST

2017-07-28 15:11:47

by Daniel Borkmann

[permalink] [raw]
Subject: Re: [PATCH net-next 3/3] tap: XDP support

On 07/28/2017 05:46 AM, Michael S. Tsirkin wrote:
> On Fri, Jul 28, 2017 at 11:28:54AM +0800, Jason Wang wrote:
>>>> + old_prog = rtnl_dereference(tun->xdp_prog);
>>>> + if (old_prog)
>>>> + bpf_prog_put(old_prog);
>>>> + rcu_assign_pointer(tun->xdp_prog, prog);
>>> Is this OK? Could this lead to the program getting freed and then
>>> datapath accessing a stale pointer? I mean in the scenario where the
>>> process gets pre-empted between the bpf_prog_put() and
>>> rcu_assign_pointer()?
>>
>> Will call bpf_prog_put() after rcu_assign_pointer().

+1, good catch Jakub.

> I suspect you need to sync RCU or something before that.

No, see for example generic_xdp_install().