2021-11-15 15:32:39

by Andrey Ryabinin

[permalink] [raw]
Subject: [PATCH 6/6] vhost_net: use RCU callbacks instead of synchronize_rcu()

Currently vhost_net_release() uses synchronize_rcu() to synchronize
freeing with vhost_zerocopy_callback(). However synchronize_rcu()
is quite a costly operation. It takes more than 10 seconds
to shut down qemu launched with a couple of net devices like this:
-netdev tap,id=tap0,..,vhost=on,queues=80
because we end up calling synchronize_rcu() netdev_count*queues times.

Free vhost net structures in rcu callback instead of using
synchronize_rcu() to fix the problem.

Signed-off-by: Andrey Ryabinin <[email protected]>
---
drivers/vhost/net.c | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 97a209d6a527..0699d30e83d5 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -132,6 +132,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+ struct rcu_head rcu;
/* Number of TX recently submitted.
* Protected by tx vq lock. */
unsigned tx_packets;
@@ -1389,6 +1390,18 @@ static void vhost_net_flush(struct vhost_net *n)
}
}

+static void vhost_net_free(struct rcu_head *rcu_head)
+{
+ struct vhost_net *n = container_of(rcu_head, struct vhost_net, rcu);
+
+ kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
+ kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
+ kfree(n->dev.vqs);
+ if (n->page_frag.page)
+ __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
+ kvfree(n);
+}
+
static int vhost_net_release(struct inode *inode, struct file *f)
{
struct vhost_net *n = f->private_data;
@@ -1404,15 +1417,8 @@ static int vhost_net_release(struct inode *inode, struct file *f)
sockfd_put(tx_sock);
if (rx_sock)
sockfd_put(rx_sock);
- /* Make sure no callbacks are outstanding */
- synchronize_rcu();

- kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
- kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
- kfree(n->dev.vqs);
- if (n->page_frag.page)
- __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
- kvfree(n);
+ call_rcu(&n->rcu, vhost_net_free);
return 0;
}

--
2.32.0



2021-11-16 05:01:46

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH 6/6] vhost_net: use RCU callbacks instead of synchronize_rcu()

On Mon, Nov 15, 2021 at 11:32 PM Andrey Ryabinin <[email protected]> wrote:
>
> Currently vhost_net_release() uses synchronize_rcu() to synchronize
> freeing with vhost_zerocopy_callback(). However synchronize_rcu()
> is quite costly operation. It take more than 10 seconds
> to shutdown qemu launched with couple net devices like this:
> -netdev tap,id=tap0,..,vhost=on,queues=80
> because we end up calling synchronize_rcu() netdev_count*queues times.
>
> Free vhost net structures in rcu callback instead of using
> synchronize_rcu() to fix the problem.

I admit the release code is somehow hard to understand. But I wonder
if the following case can still happen with this:

CPU 0 (vhost_dev_cleanup) CPU1
(vhost_net_zerocopy_callback()->vhost_work_queue())
if (!dev->worker)
dev->worker = NULL

wake_up_process(dev->worker)

If this is true, it seems the fix is to move the RCU synchronization stuff
into vhost_net_ubuf_put_and_wait()?

Thanks

>
> Signed-off-by: Andrey Ryabinin <[email protected]>
> ---
> drivers/vhost/net.c | 22 ++++++++++++++--------
> 1 file changed, 14 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 97a209d6a527..0699d30e83d5 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -132,6 +132,7 @@ struct vhost_net {
> struct vhost_dev dev;
> struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
> struct vhost_poll poll[VHOST_NET_VQ_MAX];
> + struct rcu_head rcu;
> /* Number of TX recently submitted.
> * Protected by tx vq lock. */
> unsigned tx_packets;
> @@ -1389,6 +1390,18 @@ static void vhost_net_flush(struct vhost_net *n)
> }
> }
>
> +static void vhost_net_free(struct rcu_head *rcu_head)
> +{
> + struct vhost_net *n = container_of(rcu_head, struct vhost_net, rcu);
> +
> + kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
> + kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
> + kfree(n->dev.vqs);
> + if (n->page_frag.page)
> + __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
> + kvfree(n);
> +}
> +
> static int vhost_net_release(struct inode *inode, struct file *f)
> {
> struct vhost_net *n = f->private_data;
> @@ -1404,15 +1417,8 @@ static int vhost_net_release(struct inode *inode, struct file *f)
> sockfd_put(tx_sock);
> if (rx_sock)
> sockfd_put(rx_sock);
> - /* Make sure no callbacks are outstanding */
> - synchronize_rcu();
>
> - kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
> - kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
> - kfree(n->dev.vqs);
> - if (n->page_frag.page)
> - __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
> - kvfree(n);
> + call_rcu(&n->rcu, vhost_net_free);
> return 0;
> }
>
> --
> 2.32.0
>


2021-11-19 11:30:48

by Andrey Ryabinin

[permalink] [raw]
Subject: Re: [PATCH 6/6] vhost_net: use RCU callbacks instead of synchronize_rcu()



On 11/16/21 8:00 AM, Jason Wang wrote:
> On Mon, Nov 15, 2021 at 11:32 PM Andrey Ryabinin <[email protected]> wrote:
>>
>> Currently vhost_net_release() uses synchronize_rcu() to synchronize
>> freeing with vhost_zerocopy_callback(). However synchronize_rcu()
>> is quite costly operation. It take more than 10 seconds
>> to shutdown qemu launched with couple net devices like this:
>> -netdev tap,id=tap0,..,vhost=on,queues=80
>> because we end up calling synchronize_rcu() netdev_count*queues times.
>>
>> Free vhost net structures in rcu callback instead of using
>> synchronize_rcu() to fix the problem.
>
> I admit the release code is somehow hard to understand. But I wonder
> if the following case can still happen with this:
>
> CPU 0 (vhost_dev_cleanup) CPU1
> (vhost_net_zerocopy_callback()->vhost_work_queue())
> if (!dev->worker)
> dev->worker = NULL
>
> wake_up_process(dev->worker)
>
> If this is true. It seems the fix is to move RCU synchronization stuff
> in vhost_net_ubuf_put_and_wait()?
>

It all depends whether vhost_zerocopy_callback() can be called outside of vhost
thread context or not. If it can run after the vhost thread has stopped, then the race you
describe seems possible and the fix in commit b0c057ca7e83 ("vhost: fix a theoretical race in device cleanup")
wasn't complete. I would fix it by calling synchronize_rcu() after vhost_net_flush()
and before vhost_dev_cleanup().

As for the performance problem, it can be solved by replacing synchronize_rcu() with synchronize_rcu_expedited().

But now I'm not sure that this race actually exists or that synchronize_rcu() is needed at all.
I did a bit of testing and I only see callback being called from vhost thread:

vhost-3724 3733 [002] 2701.768731: probe:vhost_zerocopy_callback: (ffffffff81af8c10)
ffffffff81af8c11 vhost_zerocopy_callback+0x1 ([kernel.kallsyms])
ffffffff81bb34f6 skb_copy_ubufs+0x256 ([kernel.kallsyms])
ffffffff81bce621 __netif_receive_skb_core.constprop.0+0xac1 ([kernel.kallsyms])
ffffffff81bd062d __netif_receive_skb_one_core+0x3d ([kernel.kallsyms])
ffffffff81bd0748 netif_receive_skb+0x38 ([kernel.kallsyms])
ffffffff819a2a1e tun_get_user+0xdce ([kernel.kallsyms])
ffffffff819a2cf4 tun_sendmsg+0xa4 ([kernel.kallsyms])
ffffffff81af9229 handle_tx_zerocopy+0x149 ([kernel.kallsyms])
ffffffff81afaf05 handle_tx+0xc5 ([kernel.kallsyms])
ffffffff81afce86 vhost_worker+0x76 ([kernel.kallsyms])
ffffffff811581e9 kthread+0x169 ([kernel.kallsyms])
ffffffff810018cf ret_from_fork+0x1f ([kernel.kallsyms])
0 [unknown] ([unknown])

This means that the callback can't run after kthread_stop() in vhost_dev_cleanup(), so no synchronize_rcu() is needed.

I'm not confident that my quite limited testing covers all possible vhost_zerocopy_callback() callstacks.

2021-11-22 02:48:58

by Jason Wang

[permalink] [raw]
Subject: Re: [PATCH 6/6] vhost_net: use RCU callbacks instead of synchronize_rcu()

On Fri, Nov 19, 2021 at 7:31 PM Andrey Ryabinin <[email protected]> wrote:
>
>
>
> On 11/16/21 8:00 AM, Jason Wang wrote:
> > On Mon, Nov 15, 2021 at 11:32 PM Andrey Ryabinin <[email protected]> wrote:
> >>
> >> Currently vhost_net_release() uses synchronize_rcu() to synchronize
> >> freeing with vhost_zerocopy_callback(). However synchronize_rcu()
> >> is quite costly operation. It take more than 10 seconds
> >> to shutdown qemu launched with couple net devices like this:
> >> -netdev tap,id=tap0,..,vhost=on,queues=80
> >> because we end up calling synchronize_rcu() netdev_count*queues times.
> >>
> >> Free vhost net structures in rcu callback instead of using
> >> synchronize_rcu() to fix the problem.
> >
> > I admit the release code is somehow hard to understand. But I wonder
> > if the following case can still happen with this:
> >
> > CPU 0 (vhost_dev_cleanup) CPU1
> > (vhost_net_zerocopy_callback()->vhost_work_queue())
> > if (!dev->worker)
> > dev->worker = NULL
> >
> > wake_up_process(dev->worker)
> >
> > If this is true. It seems the fix is to move RCU synchronization stuff
> > in vhost_net_ubuf_put_and_wait()?
> >
>
> It all depends whether vhost_zerocopy_callback() can be called outside of vhost
> thread context or not.

I think the answer is yes, the callback will be mainly used in the
zerocopy path when the underlying NIC finishes the DMA of a packet.

> If it can run after vhost thread stopped, than the race you
> describe seems possible and the fix in commit b0c057ca7e83 ("vhost: fix a theoretical race in device cleanup")
> wasn't complete. I would fix it by calling synchronize_rcu() after vhost_net_flush()
> and before vhost_dev_cleanup().
>
> As for the performance problem, it can be solved by replacing synchronize_rcu() with synchronize_rcu_expedited().

Yes, that's another way, but see below.

>
> But now I'm not sure that this race is actually exists and that synchronize_rcu() needed at all.
> I did a bit of testing and I only see callback being called from vhost thread:
>
> vhost-3724 3733 [002] 2701.768731: probe:vhost_zerocopy_callback: (ffffffff81af8c10)
> ffffffff81af8c11 vhost_zerocopy_callback+0x1 ([kernel.kallsyms])
> ffffffff81bb34f6 skb_copy_ubufs+0x256 ([kernel.kallsyms])
> ffffffff81bce621 __netif_receive_skb_core.constprop.0+0xac1 ([kernel.kallsyms])
> ffffffff81bd062d __netif_receive_skb_one_core+0x3d ([kernel.kallsyms])
> ffffffff81bd0748 netif_receive_skb+0x38 ([kernel.kallsyms])
> ffffffff819a2a1e tun_get_user+0xdce ([kernel.kallsyms])
> ffffffff819a2cf4 tun_sendmsg+0xa4 ([kernel.kallsyms])
> ffffffff81af9229 handle_tx_zerocopy+0x149 ([kernel.kallsyms])
> ffffffff81afaf05 handle_tx+0xc5 ([kernel.kallsyms])
> ffffffff81afce86 vhost_worker+0x76 ([kernel.kallsyms])
> ffffffff811581e9 kthread+0x169 ([kernel.kallsyms])
> ffffffff810018cf ret_from_fork+0x1f ([kernel.kallsyms])
> 0 [unknown] ([unknown])
>

From the call trace it looks like you send packets between two TAP
devices. Since the TX of TAP is synchronous, we can't see the callback
being called outside of the vhost thread.

In order to test it, we need to 1) enable zerocopy
(experimental_zcopytx=1) and 2) send the packet to a real NIC
with bridge or macvlan.

Zerocopy was disabled due to a lot of issues (098eadce3c62 "vhost_net:
disable zerocopy by default"). So if we fix by moving it to
vhost_net_ubuf_put_and_wait(), there won't be a synchronize_rcu() in
the non-zerocopy path which seems to be sufficient. And we can use
synchronize_rcu_expedited() on top if it is really needed.

Thanks

> This means that the callback can't run after kthread_stop() in vhost_dev_cleanup() and no synchronize_rcu() needed.
>
> I'm not confident that my quite limited testing cover all possible vhost_zerocopy_callback() callstacks.
>


2021-11-22 09:37:59

by Michael S. Tsirkin

[permalink] [raw]
Subject: Re: [PATCH 6/6] vhost_net: use RCU callbacks instead of synchronize_rcu()

On Fri, Nov 19, 2021 at 02:32:05PM +0300, Andrey Ryabinin wrote:
>
>
> On 11/16/21 8:00 AM, Jason Wang wrote:
> > On Mon, Nov 15, 2021 at 11:32 PM Andrey Ryabinin <[email protected]> wrote:
> >>
> >> Currently vhost_net_release() uses synchronize_rcu() to synchronize
> >> freeing with vhost_zerocopy_callback(). However synchronize_rcu()
> >> is quite costly operation. It take more than 10 seconds
> >> to shutdown qemu launched with couple net devices like this:
> >> -netdev tap,id=tap0,..,vhost=on,queues=80
> >> because we end up calling synchronize_rcu() netdev_count*queues times.
> >>
> >> Free vhost net structures in rcu callback instead of using
> >> synchronize_rcu() to fix the problem.
> >
> > I admit the release code is somehow hard to understand. But I wonder
> > if the following case can still happen with this:
> >
> > CPU 0 (vhost_dev_cleanup) CPU1
> > (vhost_net_zerocopy_callback()->vhost_work_queue())
> > if (!dev->worker)
> > dev->worker = NULL
> >
> > wake_up_process(dev->worker)
> >
> > If this is true. It seems the fix is to move RCU synchronization stuff
> > in vhost_net_ubuf_put_and_wait()?
> >
>
> It all depends whether vhost_zerocopy_callback() can be called outside of vhost
> thread context or not. If it can run after vhost thread stopped, than the race you
> describe seems possible and the fix in commit b0c057ca7e83 ("vhost: fix a theoretical race in device cleanup")
> wasn't complete. I would fix it by calling synchronize_rcu() after vhost_net_flush()
> and before vhost_dev_cleanup().
>
> As for the performance problem, it can be solved by replacing synchronize_rcu() with synchronize_rcu_expedited().

expedited causes a storm of IPIs though, so it's problematic to
do it upon a userspace syscall.

> But now I'm not sure that this race is actually exists and that synchronize_rcu() needed at all.
> I did a bit of testing and I only see callback being called from vhost thread:
>
> vhost-3724 3733 [002] 2701.768731: probe:vhost_zerocopy_callback: (ffffffff81af8c10)
> ffffffff81af8c11 vhost_zerocopy_callback+0x1 ([kernel.kallsyms])
> ffffffff81bb34f6 skb_copy_ubufs+0x256 ([kernel.kallsyms])
> ffffffff81bce621 __netif_receive_skb_core.constprop.0+0xac1 ([kernel.kallsyms])
> ffffffff81bd062d __netif_receive_skb_one_core+0x3d ([kernel.kallsyms])
> ffffffff81bd0748 netif_receive_skb+0x38 ([kernel.kallsyms])
> ffffffff819a2a1e tun_get_user+0xdce ([kernel.kallsyms])
> ffffffff819a2cf4 tun_sendmsg+0xa4 ([kernel.kallsyms])
> ffffffff81af9229 handle_tx_zerocopy+0x149 ([kernel.kallsyms])
> ffffffff81afaf05 handle_tx+0xc5 ([kernel.kallsyms])
> ffffffff81afce86 vhost_worker+0x76 ([kernel.kallsyms])
> ffffffff811581e9 kthread+0x169 ([kernel.kallsyms])
> ffffffff810018cf ret_from_fork+0x1f ([kernel.kallsyms])
> 0 [unknown] ([unknown])
>
> This means that the callback can't run after kthread_stop() in vhost_dev_cleanup() and no synchronize_rcu() needed.
>
> I'm not confident that my quite limited testing cover all possible vhost_zerocopy_callback() callstacks.


2021-11-22 15:11:34

by Andrey Ryabinin

[permalink] [raw]
Subject: Re: [PATCH 6/6] vhost_net: use RCU callbacks instead of synchronize_rcu()



On 11/22/21 12:37 PM, Michael S. Tsirkin wrote:
> On Fri, Nov 19, 2021 at 02:32:05PM +0300, Andrey Ryabinin wrote:
>>
>>
>> On 11/16/21 8:00 AM, Jason Wang wrote:
>>> On Mon, Nov 15, 2021 at 11:32 PM Andrey Ryabinin <[email protected]> wrote:
>>>>
>>>> Currently vhost_net_release() uses synchronize_rcu() to synchronize
>>>> freeing with vhost_zerocopy_callback(). However synchronize_rcu()
>>>> is quite costly operation. It take more than 10 seconds
>>>> to shutdown qemu launched with couple net devices like this:
>>>> -netdev tap,id=tap0,..,vhost=on,queues=80
>>>> because we end up calling synchronize_rcu() netdev_count*queues times.
>>>>
>>>> Free vhost net structures in rcu callback instead of using
>>>> synchronize_rcu() to fix the problem.
>>>
>>> I admit the release code is somehow hard to understand. But I wonder
>>> if the following case can still happen with this:
>>>
>>> CPU 0 (vhost_dev_cleanup) CPU1
>>> (vhost_net_zerocopy_callback()->vhost_work_queue())
>>> if (!dev->worker)
>>> dev->worker = NULL
>>>
>>> wake_up_process(dev->worker)
>>>
>>> If this is true. It seems the fix is to move RCU synchronization stuff
>>> in vhost_net_ubuf_put_and_wait()?
>>>
>>
>> It all depends whether vhost_zerocopy_callback() can be called outside of vhost
>> thread context or not. If it can run after vhost thread stopped, than the race you
>> describe seems possible and the fix in commit b0c057ca7e83 ("vhost: fix a theoretical race in device cleanup")
>> wasn't complete. I would fix it by calling synchronize_rcu() after vhost_net_flush()
>> and before vhost_dev_cleanup().
>>
>> As for the performance problem, it can be solved by replacing synchronize_rcu() with synchronize_rcu_expedited().
>
> expedited causes a stop of IPIs though, so it's problematic to
> do it upon a userspace syscall.
>

How about something like this?


---
drivers/vhost/net.c | 40 ++++++++++++++++++++++++++--------------
1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 97a209d6a527..556df26c584d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -144,6 +144,10 @@ struct vhost_net {
struct page_frag page_frag;
/* Refcount bias of page frag */
int refcnt_bias;
+
+ struct socket *tx_sock;
+ struct socket *rx_sock;
+ struct rcu_work rwork;
};

static unsigned vhost_net_zcopy_mask __read_mostly;
@@ -1389,6 +1393,24 @@ static void vhost_net_flush(struct vhost_net *n)
}
}

+static void vhost_net_cleanup(struct work_struct *work)
+{
+ struct vhost_net *n =
+ container_of(to_rcu_work(work), struct vhost_net, rwork);
+ vhost_dev_cleanup(&n->dev);
+ vhost_net_vq_reset(n);
+ if (n->tx_sock)
+ sockfd_put(n->tx_sock);
+ if (n->rx_sock)
+ sockfd_put(n->rx_sock);
+ kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
+ kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
+ kfree(n->dev.vqs);
+ if (n->page_frag.page)
+ __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
+ kvfree(n);
+}
+
static int vhost_net_release(struct inode *inode, struct file *f)
{
struct vhost_net *n = f->private_data;
@@ -1398,21 +1420,11 @@ static int vhost_net_release(struct inode *inode, struct file *f)
vhost_net_stop(n, &tx_sock, &rx_sock);
vhost_net_flush(n);
vhost_dev_stop(&n->dev);
- vhost_dev_cleanup(&n->dev);
- vhost_net_vq_reset(n);
- if (tx_sock)
- sockfd_put(tx_sock);
- if (rx_sock)
- sockfd_put(rx_sock);
- /* Make sure no callbacks are outstanding */
- synchronize_rcu();
+ n->tx_sock = tx_sock;
+ n->rx_sock = rx_sock;

- kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
- kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
- kfree(n->dev.vqs);
- if (n->page_frag.page)
- __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
- kvfree(n);
+ INIT_RCU_WORK(&n->rwork, vhost_net_cleanup);
+ queue_rcu_work(system_wq, &n->rwork);
return 0;
}

--