2019-06-27 10:16:13

by Ilya Maximets

[permalink] [raw]
Subject: [PATCH bpf v5 2/2] xdp: fix hang while unregistering device bound to xdp socket

A device that is bound to an XDP socket will not have a zero refcount until
the userspace application closes the socket. This leads to a hang inside
'netdev_wait_allrefs()' if unregistration of the device is requested:

# ip link del p1
< hang on recvmsg on netlink socket >

# ps -x | grep ip
5126 pts/0 D+ 0:00 ip link del p1

# journalctl -b

Jun 05 07:19:16 kernel:
unregister_netdevice: waiting for p1 to become free. Usage count = 1

Jun 05 07:19:27 kernel:
unregister_netdevice: waiting for p1 to become free. Usage count = 1
...

Fix that by implementing a NETDEV_UNREGISTER event notification handler
to properly clean up all the resources and unref the device.

This should also allow socket killing via the ss(8) utility.

Fixes: 965a99098443 ("xsk: add support for bind for Rx")
Signed-off-by: Ilya Maximets <[email protected]>
---
include/net/xdp_sock.h | 5 +++
net/xdp/xdp_umem.c | 10 ++---
net/xdp/xdp_umem.h | 1 +
net/xdp/xsk.c | 87 ++++++++++++++++++++++++++++++++++++------
4 files changed, 87 insertions(+), 16 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index d074b6d60f8a..82d153a637c7 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -61,6 +61,11 @@ struct xdp_sock {
struct xsk_queue *tx ____cacheline_aligned_in_smp;
struct list_head list;
bool zc;
+ enum {
+ XSK_UNINITIALIZED = 0,
+ XSK_BINDED,
+ XSK_UNBINDED,
+ } state;
/* Protects multiple processes in the control path */
struct mutex mutex;
/* Mutual exclusion of NAPI TX thread and sendmsg error paths
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 267b82a4cbcf..20c91f02d3d8 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -140,11 +140,13 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
return err;
}

-static void xdp_umem_clear_dev(struct xdp_umem *umem)
+void xdp_umem_clear_dev(struct xdp_umem *umem)
{
struct netdev_bpf bpf;
int err;

+ ASSERT_RTNL();
+
if (!umem->dev)
return;

@@ -153,17 +155,13 @@ static void xdp_umem_clear_dev(struct xdp_umem *umem)
bpf.xsk.umem = NULL;
bpf.xsk.queue_id = umem->queue_id;

- rtnl_lock();
err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
- rtnl_unlock();

if (err)
WARN(1, "failed to disable umem!\n");
}

- rtnl_lock();
xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
- rtnl_unlock();

dev_put(umem->dev);
umem->dev = NULL;
@@ -195,7 +193,9 @@ static void xdp_umem_unaccount_pages(struct xdp_umem *umem)

static void xdp_umem_release(struct xdp_umem *umem)
{
+ rtnl_lock();
xdp_umem_clear_dev(umem);
+ rtnl_unlock();

ida_simple_remove(&umem_ida, umem->id);

diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 27603227601b..a63a9fb251f5 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -10,6 +10,7 @@

int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
u16 queue_id, u16 flags);
+void xdp_umem_clear_dev(struct xdp_umem *umem);
bool xdp_umem_validate_queues(struct xdp_umem *umem);
void xdp_get_umem(struct xdp_umem *umem);
void xdp_put_umem(struct xdp_umem *umem);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index a14e8864e4fa..336723948a36 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -335,6 +335,22 @@ static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
return 0;
}

+static void xsk_unbind_dev(struct xdp_sock *xs)
+{
+ struct net_device *dev = xs->dev;
+
+ if (!dev || xs->state != XSK_BINDED)
+ return;
+
+ xs->state = XSK_UNBINDED;
+
+ /* Wait for driver to stop using the xdp socket. */
+ xdp_del_sk_umem(xs->umem, xs);
+ xs->dev = NULL;
+ synchronize_net();
+ dev_put(dev);
+}
+
static int xsk_release(struct socket *sock)
{
struct sock *sk = sock->sk;
@@ -354,15 +370,7 @@ static int xsk_release(struct socket *sock)
sock_prot_inuse_add(net, sk->sk_prot, -1);
local_bh_enable();

- if (xs->dev) {
- struct net_device *dev = xs->dev;
-
- /* Wait for driver to stop using the xdp socket. */
- xdp_del_sk_umem(xs->umem, xs);
- xs->dev = NULL;
- synchronize_net();
- dev_put(dev);
- }
+ xsk_unbind_dev(xs);

xskq_destroy(xs->rx);
xskq_destroy(xs->tx);
@@ -412,7 +420,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
return -EINVAL;

mutex_lock(&xs->mutex);
- if (xs->dev) {
+ if (xs->state != XSK_UNINITIALIZED) {
err = -EBUSY;
goto out_release;
}
@@ -492,6 +500,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
out_unlock:
if (err)
dev_put(dev);
+ else
+ xs->state = XSK_BINDED;
out_release:
mutex_unlock(&xs->mutex);
return err;
@@ -520,6 +530,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT;

mutex_lock(&xs->mutex);
+ if (xs->state != XSK_UNINITIALIZED) {
+ mutex_unlock(&xs->mutex);
+ return -EBUSY;
+ }
q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
err = xsk_init_queue(entries, q, false);
mutex_unlock(&xs->mutex);
@@ -534,7 +548,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT;

mutex_lock(&xs->mutex);
- if (xs->umem) {
+ if (xs->state != XSK_UNINITIALIZED || xs->umem) {
mutex_unlock(&xs->mutex);
return -EBUSY;
}
@@ -561,6 +575,10 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
return -EFAULT;

mutex_lock(&xs->mutex);
+ if (xs->state != XSK_UNINITIALIZED) {
+ mutex_unlock(&xs->mutex);
+ return -EBUSY;
+ }
if (!xs->umem) {
mutex_unlock(&xs->mutex);
return -EINVAL;
@@ -662,6 +680,9 @@ static int xsk_mmap(struct file *file, struct socket *sock,
unsigned long pfn;
struct page *qpg;

+ if (xs->state != XSK_UNINITIALIZED)
+ return -EBUSY;
+
if (offset == XDP_PGOFF_RX_RING) {
q = READ_ONCE(xs->rx);
} else if (offset == XDP_PGOFF_TX_RING) {
@@ -693,6 +714,38 @@ static int xsk_mmap(struct file *file, struct socket *sock,
size, vma->vm_page_prot);
}

+static int xsk_notifier(struct notifier_block *this,
+ unsigned long msg, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct sock *sk;
+
+ switch (msg) {
+ case NETDEV_UNREGISTER:
+ mutex_lock(&net->xdp.lock);
+ sk_for_each(sk, &net->xdp.list) {
+ struct xdp_sock *xs = xdp_sk(sk);
+
+ mutex_lock(&xs->mutex);
+ if (xs->dev == dev) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_error_report(sk);
+
+ xsk_unbind_dev(xs);
+
+ /* Clear device references in umem. */
+ xdp_umem_clear_dev(xs->umem);
+ }
+ mutex_unlock(&xs->mutex);
+ }
+ mutex_unlock(&net->xdp.lock);
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
static struct proto xsk_proto = {
.name = "XDP",
.owner = THIS_MODULE,
@@ -764,6 +817,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
sock_set_flag(sk, SOCK_RCU_FREE);

xs = xdp_sk(sk);
+ xs->state = XSK_UNINITIALIZED;
mutex_init(&xs->mutex);
spin_lock_init(&xs->tx_completion_lock);

@@ -784,6 +838,10 @@ static const struct net_proto_family xsk_family_ops = {
.owner = THIS_MODULE,
};

+static struct notifier_block xsk_netdev_notifier = {
+ .notifier_call = xsk_notifier,
+};
+
static int __net_init xsk_net_init(struct net *net)
{
mutex_init(&net->xdp.lock);
@@ -816,8 +874,15 @@ static int __init xsk_init(void)
err = register_pernet_subsys(&xsk_net_ops);
if (err)
goto out_sk;
+
+ err = register_netdevice_notifier(&xsk_netdev_notifier);
+ if (err)
+ goto out_pernet;
+
return 0;

+out_pernet:
+ unregister_pernet_subsys(&xsk_net_ops);
out_sk:
sock_unregister(PF_XDP);
out_proto:
--
2.17.1


2019-06-27 22:06:32

by Jonathan Lemon

[permalink] [raw]
Subject: Re: [PATCH bpf v5 2/2] xdp: fix hang while unregistering device bound to xdp socket

On 27 Jun 2019, at 3:15, Ilya Maximets wrote:

> Device that bound to XDP socket will not have zero refcount until the
> userspace application will not close it. This leads to hang inside
> 'netdev_wait_allrefs()' if device unregistering requested:
>
> # ip link del p1
> < hang on recvmsg on netlink socket >
>
> # ps -x | grep ip
> 5126 pts/0 D+ 0:00 ip link del p1
>
> # journalctl -b
>
> Jun 05 07:19:16 kernel:
> unregister_netdevice: waiting for p1 to become free. Usage count = 1
>
> Jun 05 07:19:27 kernel:
> unregister_netdevice: waiting for p1 to become free. Usage count = 1
> ...
>
> Fix that by implementing NETDEV_UNREGISTER event notification handler
> to properly clean up all the resources and unref device.
>
> This should also allow socket killing via ss(8) utility.
>
> Fixes: 965a99098443 ("xsk: add support for bind for Rx")
> Signed-off-by: Ilya Maximets <[email protected]>
> ---
> include/net/xdp_sock.h | 5 +++
> net/xdp/xdp_umem.c | 10 ++---
> net/xdp/xdp_umem.h | 1 +
> net/xdp/xsk.c | 87
> ++++++++++++++++++++++++++++++++++++------
> 4 files changed, 87 insertions(+), 16 deletions(-)
>
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index d074b6d60f8a..82d153a637c7 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -61,6 +61,11 @@ struct xdp_sock {
> struct xsk_queue *tx ____cacheline_aligned_in_smp;
> struct list_head list;
> bool zc;
> + enum {
> + XSK_UNINITIALIZED = 0,
> + XSK_BINDED,
> + XSK_UNBINDED,
> + } state;

I'd prefer that these were named better, perhaps:
XSK_READY,
XSK_BOUND,
XSK_UNBOUND,


Other than that:
Acked-by: Jonathan Lemon <[email protected]>

--
Jonathan



> /* Protects multiple processes in the control path */
> struct mutex mutex;
> /* Mutual exclusion of NAPI TX thread and sendmsg error paths
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 267b82a4cbcf..20c91f02d3d8 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -140,11 +140,13 @@ int xdp_umem_assign_dev(struct xdp_umem *umem,
> struct net_device *dev,
> return err;
> }
>
> -static void xdp_umem_clear_dev(struct xdp_umem *umem)
> +void xdp_umem_clear_dev(struct xdp_umem *umem)
> {
> struct netdev_bpf bpf;
> int err;
>
> + ASSERT_RTNL();
> +
> if (!umem->dev)
> return;
>
> @@ -153,17 +155,13 @@ static void xdp_umem_clear_dev(struct xdp_umem
> *umem)
> bpf.xsk.umem = NULL;
> bpf.xsk.queue_id = umem->queue_id;
>
> - rtnl_lock();
> err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
> - rtnl_unlock();
>
> if (err)
> WARN(1, "failed to disable umem!\n");
> }
>
> - rtnl_lock();
> xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
> - rtnl_unlock();
>
> dev_put(umem->dev);
> umem->dev = NULL;
> @@ -195,7 +193,9 @@ static void xdp_umem_unaccount_pages(struct
> xdp_umem *umem)
>
> static void xdp_umem_release(struct xdp_umem *umem)
> {
> + rtnl_lock();
> xdp_umem_clear_dev(umem);
> + rtnl_unlock();
>
> ida_simple_remove(&umem_ida, umem->id);
>
> diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
> index 27603227601b..a63a9fb251f5 100644
> --- a/net/xdp/xdp_umem.h
> +++ b/net/xdp/xdp_umem.h
> @@ -10,6 +10,7 @@
>
> int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device
> *dev,
> u16 queue_id, u16 flags);
> +void xdp_umem_clear_dev(struct xdp_umem *umem);
> bool xdp_umem_validate_queues(struct xdp_umem *umem);
> void xdp_get_umem(struct xdp_umem *umem);
> void xdp_put_umem(struct xdp_umem *umem);
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index a14e8864e4fa..336723948a36 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -335,6 +335,22 @@ static int xsk_init_queue(u32 entries, struct
> xsk_queue **queue,
> return 0;
> }
>
> +static void xsk_unbind_dev(struct xdp_sock *xs)
> +{
> + struct net_device *dev = xs->dev;
> +
> + if (!dev || xs->state != XSK_BINDED)
> + return;
> +
> + xs->state = XSK_UNBINDED;
> +
> + /* Wait for driver to stop using the xdp socket. */
> + xdp_del_sk_umem(xs->umem, xs);
> + xs->dev = NULL;
> + synchronize_net();
> + dev_put(dev);
> +}
> +
> static int xsk_release(struct socket *sock)
> {
> struct sock *sk = sock->sk;
> @@ -354,15 +370,7 @@ static int xsk_release(struct socket *sock)
> sock_prot_inuse_add(net, sk->sk_prot, -1);
> local_bh_enable();
>
> - if (xs->dev) {
> - struct net_device *dev = xs->dev;
> -
> - /* Wait for driver to stop using the xdp socket. */
> - xdp_del_sk_umem(xs->umem, xs);
> - xs->dev = NULL;
> - synchronize_net();
> - dev_put(dev);
> - }
> + xsk_unbind_dev(xs);
>
> xskq_destroy(xs->rx);
> xskq_destroy(xs->tx);
> @@ -412,7 +420,7 @@ static int xsk_bind(struct socket *sock, struct
> sockaddr *addr, int addr_len)
> return -EINVAL;
>
> mutex_lock(&xs->mutex);
> - if (xs->dev) {
> + if (xs->state != XSK_UNINITIALIZED) {
> err = -EBUSY;
> goto out_release;
> }
> @@ -492,6 +500,8 @@ static int xsk_bind(struct socket *sock, struct
> sockaddr *addr, int addr_len)
> out_unlock:
> if (err)
> dev_put(dev);
> + else
> + xs->state = XSK_BINDED;
> out_release:
> mutex_unlock(&xs->mutex);
> return err;
> @@ -520,6 +530,10 @@ static int xsk_setsockopt(struct socket *sock,
> int level, int optname,
> return -EFAULT;
>
> mutex_lock(&xs->mutex);
> + if (xs->state != XSK_UNINITIALIZED) {
> + mutex_unlock(&xs->mutex);
> + return -EBUSY;
> + }
> q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
> err = xsk_init_queue(entries, q, false);
> mutex_unlock(&xs->mutex);
> @@ -534,7 +548,7 @@ static int xsk_setsockopt(struct socket *sock, int
> level, int optname,
> return -EFAULT;
>
> mutex_lock(&xs->mutex);
> - if (xs->umem) {
> + if (xs->state != XSK_UNINITIALIZED || xs->umem) {
> mutex_unlock(&xs->mutex);
> return -EBUSY;
> }
> @@ -561,6 +575,10 @@ static int xsk_setsockopt(struct socket *sock,
> int level, int optname,
> return -EFAULT;
>
> mutex_lock(&xs->mutex);
> + if (xs->state != XSK_UNINITIALIZED) {
> + mutex_unlock(&xs->mutex);
> + return -EBUSY;
> + }
> if (!xs->umem) {
> mutex_unlock(&xs->mutex);
> return -EINVAL;
> @@ -662,6 +680,9 @@ static int xsk_mmap(struct file *file, struct
> socket *sock,
> unsigned long pfn;
> struct page *qpg;
>
> + if (xs->state != XSK_UNINITIALIZED)
> + return -EBUSY;
> +
> if (offset == XDP_PGOFF_RX_RING) {
> q = READ_ONCE(xs->rx);
> } else if (offset == XDP_PGOFF_TX_RING) {
> @@ -693,6 +714,38 @@ static int xsk_mmap(struct file *file, struct
> socket *sock,
> size, vma->vm_page_prot);
> }
>
> +static int xsk_notifier(struct notifier_block *this,
> + unsigned long msg, void *ptr)
> +{
> + struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> + struct net *net = dev_net(dev);
> + struct sock *sk;
> +
> + switch (msg) {
> + case NETDEV_UNREGISTER:
> + mutex_lock(&net->xdp.lock);
> + sk_for_each(sk, &net->xdp.list) {
> + struct xdp_sock *xs = xdp_sk(sk);
> +
> + mutex_lock(&xs->mutex);
> + if (xs->dev == dev) {
> + sk->sk_err = ENETDOWN;
> + if (!sock_flag(sk, SOCK_DEAD))
> + sk->sk_error_report(sk);
> +
> + xsk_unbind_dev(xs);
> +
> + /* Clear device references in umem. */
> + xdp_umem_clear_dev(xs->umem);
> + }
> + mutex_unlock(&xs->mutex);
> + }
> + mutex_unlock(&net->xdp.lock);
> + break;
> + }
> + return NOTIFY_DONE;
> +}
> +
> static struct proto xsk_proto = {
> .name = "XDP",
> .owner = THIS_MODULE,
> @@ -764,6 +817,7 @@ static int xsk_create(struct net *net, struct
> socket *sock, int protocol,
> sock_set_flag(sk, SOCK_RCU_FREE);
>
> xs = xdp_sk(sk);
> + xs->state = XSK_UNINITIALIZED;
> mutex_init(&xs->mutex);
> spin_lock_init(&xs->tx_completion_lock);
>
> @@ -784,6 +838,10 @@ static const struct net_proto_family
> xsk_family_ops = {
> .owner = THIS_MODULE,
> };
>
> +static struct notifier_block xsk_netdev_notifier = {
> + .notifier_call = xsk_notifier,
> +};
> +
> static int __net_init xsk_net_init(struct net *net)
> {
> mutex_init(&net->xdp.lock);
> @@ -816,8 +874,15 @@ static int __init xsk_init(void)
> err = register_pernet_subsys(&xsk_net_ops);
> if (err)
> goto out_sk;
> +
> + err = register_netdevice_notifier(&xsk_netdev_notifier);
> + if (err)
> + goto out_pernet;
> +
> return 0;
>
> +out_pernet:
> + unregister_pernet_subsys(&xsk_net_ops);
> out_sk:
> sock_unregister(PF_XDP);
> out_proto:
> --
> 2.17.1

2019-06-28 08:02:09

by Ilya Maximets

[permalink] [raw]
Subject: Re: [PATCH bpf v5 2/2] xdp: fix hang while unregistering device bound to xdp socket

On 28.06.2019 1:04, Jonathan Lemon wrote:
> On 27 Jun 2019, at 3:15, Ilya Maximets wrote:
>
>> Device that bound to XDP socket will not have zero refcount until the
>> userspace application will not close it. This leads to hang inside
>> 'netdev_wait_allrefs()' if device unregistering requested:
>>
>>   # ip link del p1
>>   < hang on recvmsg on netlink socket >
>>
>>   # ps -x | grep ip
>>   5126  pts/0    D+   0:00 ip link del p1
>>
>>   # journalctl -b
>>
>>   Jun 05 07:19:16 kernel:
>>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>>
>>   Jun 05 07:19:27 kernel:
>>   unregister_netdevice: waiting for p1 to become free. Usage count = 1
>>   ...
>>
>> Fix that by implementing NETDEV_UNREGISTER event notification handler
>> to properly clean up all the resources and unref device.
>>
>> This should also allow socket killing via ss(8) utility.
>>
>> Fixes: 965a99098443 ("xsk: add support for bind for Rx")
>> Signed-off-by: Ilya Maximets <[email protected]>
>> ---
>>  include/net/xdp_sock.h |  5 +++
>>  net/xdp/xdp_umem.c     | 10 ++---
>>  net/xdp/xdp_umem.h     |  1 +
>>  net/xdp/xsk.c          | 87 ++++++++++++++++++++++++++++++++++++------
>>  4 files changed, 87 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
>> index d074b6d60f8a..82d153a637c7 100644
>> --- a/include/net/xdp_sock.h
>> +++ b/include/net/xdp_sock.h
>> @@ -61,6 +61,11 @@ struct xdp_sock {
>>      struct xsk_queue *tx ____cacheline_aligned_in_smp;
>>      struct list_head list;
>>      bool zc;
>> +    enum {
>> +        XSK_UNINITIALIZED = 0,
>> +        XSK_BINDED,
>> +        XSK_UNBINDED,
>> +    } state;
>
> I'd prefer that these were named better, perhaps:
>    XSK_READY,
>    XSK_BOUND,
>    XSK_UNBOUND,

Sure. Thanks for the suggestion!

>
> Other than that:
> Acked-by: Jonathan Lemon <[email protected]>
>

I'll send a new version with the new state names, keeping your ACK.

Best regards, Ilya Maximets.