2023-05-18 00:21:43

by Mike Christie

Subject: [RFC PATCH 5/8] vhost: Add callback that stops new work and waits on running ones

When the vhost_task gets a SIGKILL we want to stop new work from being
queued and also wait for and handle completions for running work. For the
latter, we still need the vhost_task to handle the completing work, so we
can't just exit right away. Instead, this patch kicks off the stopping and
flushing of the device/vhost_task/worker on the system workqueue while the
vhost_task handles completions. When all completions are done we then call
vhost_task_stop and exit.
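
To make the new hook concrete, below is a rough, hypothetical sketch of a
driver-side stop_dev_work callback. Every vhost_foo_* name is made up for
illustration only; this patch just adds the hook and passes NULL for all
existing drivers:

static void vhost_foo_stop_dev_work(struct vhost_dev *dev)
{
        struct vhost_foo *foo = container_of(dev, struct vhost_foo, dev);

        /* 1. Stop new work: disable vq kicks/polling so nothing else
         *    gets queued to the worker.
         */
        vhost_foo_stop_vqs(foo);                /* hypothetical helper */

        /* 2. Wait for in-flight commands. The vhost_task is still running
         *    and handles their completions while we wait here, which is
         *    why this runs from the system workqueue and not the task.
         */
        vhost_foo_flush_inflight(foo);          /* hypothetical helper */

        /* 3. Once everything has completed, the release path finishes
         *    with vhost_task_stop() and the vhost_task exits.
         */
}

/* Registered at open time via the new vhost_dev_init() parameter: */
        vhost_dev_init(&foo->dev, vqs, nvqs, UIO_MAXIOV, weight, byte_weight,
                       true, NULL, vhost_foo_stop_dev_work);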

Signed-off-by: Mike Christie <[email protected]>
---
drivers/vhost/net.c | 2 +-
drivers/vhost/scsi.c | 4 ++--
drivers/vhost/test.c | 3 ++-
drivers/vhost/vdpa.c | 2 +-
drivers/vhost/vhost.c | 48 ++++++++++++++++++++++++++++++++++++-------
drivers/vhost/vhost.h | 10 ++++++++-
drivers/vhost/vsock.c | 4 ++--
7 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8557072ff05e..90c25127b3f8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1409,7 +1409,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
UIO_MAXIOV + VHOST_NET_BATCH,
VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
- NULL);
+ NULL, NULL);

vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index bb10fa4bb4f6..40f9135e1a62 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1820,8 +1820,8 @@ static int vhost_scsi_open(struct inode *inode, struct file *f)
vqs[i] = &vs->vqs[i].vq;
vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
}
- vhost_dev_init(&vs->dev, vqs, nvqs, UIO_MAXIOV,
- VHOST_SCSI_WEIGHT, 0, true, NULL);
+ vhost_dev_init(&vs->dev, vqs, nvqs, UIO_MAXIOV, VHOST_SCSI_WEIGHT, 0,
+ true, NULL, NULL);

vhost_scsi_init_inflight(vs, NULL);

diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 42c955a5b211..11a2823d7532 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -120,7 +120,8 @@ static int vhost_test_open(struct inode *inode, struct file *f)
vqs[VHOST_TEST_VQ] = &n->vqs[VHOST_TEST_VQ];
n->vqs[VHOST_TEST_VQ].handle_kick = handle_vq_kick;
vhost_dev_init(dev, vqs, VHOST_TEST_VQ_MAX, UIO_MAXIOV,
- VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL);
+ VHOST_TEST_PKT_WEIGHT, VHOST_TEST_WEIGHT, true, NULL,
+ NULL);

f->private_data = n;

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 8c1aefc865f0..de9a83ecb70d 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -1279,7 +1279,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep)
vqs[i]->handle_kick = handle_vq_kick;
}
vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
- vhost_vdpa_process_iotlb_msg);
+ vhost_vdpa_process_iotlb_msg, NULL);

r = vhost_vdpa_alloc_domain(v);
if (r)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1ba9e068b2ab..4163c86db50c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -336,6 +336,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
static int vhost_worker(void *data)
{
struct vhost_worker *worker = data;
+ struct vhost_dev *dev = worker->dev;
struct vhost_work *work, *work_next;
struct llist_node *node;

@@ -352,12 +353,13 @@ static int vhost_worker(void *data)
if (!node) {
schedule();
/*
- * When we get a SIGKILL our release function will
- * be called. That will stop new IOs from being queued
- * and check for outstanding cmd responses. It will then
- * call vhost_task_stop to exit us.
+ * When we get a SIGKILL we kick off a work to
+ * run the driver's helper to stop new work and
+ * handle completions. When they are done they will
+ * call vhost_task_stop to tell us to exit.
*/
- vhost_task_get_signal();
+ if (vhost_task_get_signal())
+ schedule_work(&dev->destroy_worker);
}

node = llist_reverse_order(node);
@@ -376,6 +378,33 @@ static int vhost_worker(void *data)
return 0;
}

+static void __vhost_dev_stop_work(struct vhost_dev *dev)
+{
+ mutex_lock(&dev->stop_work_mutex);
+ if (dev->work_stopped)
+ goto done;
+
+ if (dev->stop_dev_work)
+ dev->stop_dev_work(dev);
+ dev->work_stopped = true;
+done:
+ mutex_unlock(&dev->stop_work_mutex);
+}
+
+void vhost_dev_stop_work(struct vhost_dev *dev)
+{
+ __vhost_dev_stop_work(dev);
+ flush_work(&dev->destroy_worker);
+}
+EXPORT_SYMBOL_GPL(vhost_dev_stop_work);
+
+static void vhost_worker_destroy(struct work_struct *work)
+{
+ struct vhost_dev *dev = container_of(work, struct vhost_dev,
+ destroy_worker);
+ __vhost_dev_stop_work(dev);
+}
+
static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
{
kfree(vq->indirect);
@@ -464,7 +493,8 @@ void vhost_dev_init(struct vhost_dev *dev,
int iov_limit, int weight, int byte_weight,
bool use_worker,
int (*msg_handler)(struct vhost_dev *dev, u32 asid,
- struct vhost_iotlb_msg *msg))
+ struct vhost_iotlb_msg *msg),
+ void (*stop_dev_work)(struct vhost_dev *dev))
{
struct vhost_virtqueue *vq;
int i;
@@ -472,6 +502,7 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->vqs = vqs;
dev->nvqs = nvqs;
mutex_init(&dev->mutex);
+ mutex_init(&dev->stop_work_mutex);
dev->log_ctx = NULL;
dev->umem = NULL;
dev->iotlb = NULL;
@@ -482,12 +513,14 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->byte_weight = byte_weight;
dev->use_worker = use_worker;
dev->msg_handler = msg_handler;
+ dev->work_stopped = false;
+ dev->stop_dev_work = stop_dev_work;
+ INIT_WORK(&dev->destroy_worker, vhost_worker_destroy);
init_waitqueue_head(&dev->wait);
INIT_LIST_HEAD(&dev->read_list);
INIT_LIST_HEAD(&dev->pending_list);
spin_lock_init(&dev->iotlb_lock);

-
for (i = 0; i < dev->nvqs; ++i) {
vq = dev->vqs[i];
vq->log = NULL;
@@ -572,6 +605,7 @@ static int vhost_worker_create(struct vhost_dev *dev)
if (!worker)
return -ENOMEM;

+ worker->dev = dev;
dev->worker = worker;
worker->kcov_handle = kcov_common_handle();
init_llist_head(&worker->work_list);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0308638cdeee..325e5e52c7ae 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -17,6 +17,7 @@

struct vhost_work;
struct vhost_task;
+struct vhost_dev;
typedef void (*vhost_work_fn_t)(struct vhost_work *work);

#define VHOST_WORK_QUEUED 1
@@ -28,6 +29,7 @@ struct vhost_work {

struct vhost_worker {
struct vhost_task *vtsk;
+ struct vhost_dev *dev;
struct llist_head work_list;
u64 kcov_handle;
};
@@ -165,8 +167,12 @@ struct vhost_dev {
int weight;
int byte_weight;
bool use_worker;
+ struct mutex stop_work_mutex;
+ bool work_stopped;
+ struct work_struct destroy_worker;
int (*msg_handler)(struct vhost_dev *dev, u32 asid,
struct vhost_iotlb_msg *msg);
+ void (*stop_dev_work)(struct vhost_dev *dev);
};

bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
@@ -174,7 +180,8 @@ void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs,
int nvqs, int iov_limit, int weight, int byte_weight,
bool use_worker,
int (*msg_handler)(struct vhost_dev *dev, u32 asid,
- struct vhost_iotlb_msg *msg));
+ struct vhost_iotlb_msg *msg),
+ void (*stop_dev_work)(struct vhost_dev *dev));
long vhost_dev_set_owner(struct vhost_dev *dev);
bool vhost_dev_has_owner(struct vhost_dev *dev);
long vhost_dev_check_owner(struct vhost_dev *);
@@ -182,6 +189,7 @@ struct vhost_iotlb *vhost_dev_reset_owner_prepare(void);
void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *iotlb);
void vhost_dev_cleanup(struct vhost_dev *);
void vhost_dev_stop(struct vhost_dev *);
+void vhost_dev_stop_work(struct vhost_dev *dev);
long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 6578db78f0ae..1ef53722d494 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -664,8 +664,8 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick;

vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs),
- UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT,
- VHOST_VSOCK_WEIGHT, true, NULL);
+ UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT, VHOST_VSOCK_WEIGHT,
+ true, NULL, NULL);

file->private_data = vsock;
skb_queue_head_init(&vsock->send_pkt_queue);
--
2.25.1



2023-05-18 14:41:45

by Christian Brauner

Subject: Re: [RFC PATCH 5/8] vhost: Add callback that stops new work and waits on running ones

On Wed, May 17, 2023 at 07:09:17PM -0500, Mike Christie wrote:
> [...]
> @@ -352,12 +353,13 @@ static int vhost_worker(void *data)
> if (!node) {
> schedule();
> /*
> - * When we get a SIGKILL our release function will
> - * be called. That will stop new IOs from being queued
> - * and check for outstanding cmd responses. It will then
> - * call vhost_task_stop to exit us.
> + * When we get a SIGKILL we kick off a work to
> + * run the driver's helper to stop new work and
> + * handle completions. When they are done they will
> + * call vhost_task_stop to tell us to exit.
> */
> - vhost_task_get_signal();
> + if (vhost_task_get_signal())
> + schedule_work(&dev->destroy_worker);
> }

I'm pretty sure you still need to actually call exit here. Basically
mirror what's done in io_worker_exit() minus the io specific bits.

2023-05-18 15:29:31

by Christian Brauner

Subject: Re: [RFC PATCH 5/8] vhost: Add callback that stops new work and waits on running ones

On Thu, May 18, 2023 at 10:03:32AM -0500, Mike Christie wrote:
> On 5/18/23 9:18 AM, Christian Brauner wrote:
> >> @@ -352,12 +353,13 @@ static int vhost_worker(void *data)
> >> if (!node) {
> >> schedule();
> >> /*
> >> - * When we get a SIGKILL our release function will
> >> - * be called. That will stop new IOs from being queued
> >> - * and check for outstanding cmd responses. It will then
> >> - * call vhost_task_stop to exit us.
> >> + * When we get a SIGKILL we kick off a work to
> >> + * run the driver's helper to stop new work and
> >> + * handle completions. When they are done they will
> >> + * call vhost_task_stop to tell us to exit.
> >> */
> >> - vhost_task_get_signal();
> >> + if (vhost_task_get_signal())
> >> + schedule_work(&dev->destroy_worker);
> >> }
> >
> > I'm pretty sure you still need to actually call exit here. Basically
> > mirror what's done in io_worker_exit() minus the io specific bits.
>
> We do call do_exit(). Once destroy_worker has flushed the device and
> all outstanding IO has completed, it calls vhost_task_stop(). vhost_worker()
> above then breaks out of the loop and returns, and vhost_task_fn() does
> do_exit().

Ah, that callchain wasn't obvious. Thanks.

2023-05-18 15:41:56

by Mike Christie

Subject: Re: [RFC PATCH 5/8] vhost: Add callback that stops new work and waits on running ones

On 5/18/23 9:18 AM, Christian Brauner wrote:
>> @@ -352,12 +353,13 @@ static int vhost_worker(void *data)
>> if (!node) {
>> schedule();
>> /*
>> - * When we get a SIGKILL our release function will
>> - * be called. That will stop new IOs from being queued
>> - * and check for outstanding cmd responses. It will then
>> - * call vhost_task_stop to exit us.
>> + * When we get a SIGKILL we kick off a work to
>> + * run the driver's helper to stop new work and
>> + * handle completions. When they are done they will
>> + * call vhost_task_stop to tell us to exit.
>> */
>> - vhost_task_get_signal();
>> + if (vhost_task_get_signal())
>> + schedule_work(&dev->destroy_worker);
>> }
>
> I'm pretty sure you still need to actually call exit here. Basically
> mirror what's done in io_worker_exit() minus the io specific bits.

We do call do_exit(). Once destroy_worker has flushed the device and
all outstanding IO has completed, it calls vhost_task_stop(). vhost_worker()
above then breaks out of the loop and returns, and vhost_task_fn() does
do_exit().
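
For clarity, the teardown chain described above, using the names from the
patch, is roughly (a sketch, not verbatim code from the series):

/*
 * vhost_task (vhost_worker):
 *     SIGKILL -> vhost_task_get_signal() returns true
 *             -> schedule_work(&dev->destroy_worker)
 *
 * system workqueue (vhost_worker_destroy -> __vhost_dev_stop_work):
 *     dev->stop_dev_work(dev)  - driver stops new work and waits for
 *                                in-flight IO; the vhost_task keeps
 *                                handling completions in the meantime
 *     once the flush completes, vhost_task_stop() is called
 *
 * vhost_task again:
 *     vhost_worker() breaks out of its loop and returns;
 *     vhost_task_fn() then calls do_exit()
 */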

2023-05-18 19:18:05

by Eric W. Biederman

Subject: Re: [RFC PATCH 5/8] vhost: Add callback that stops new work and waits on running ones

Mike Christie <[email protected]> writes:

> On 5/18/23 9:18 AM, Christian Brauner wrote:
>>> @@ -352,12 +353,13 @@ static int vhost_worker(void *data)
>>> if (!node) {
>>> schedule();
>>> /*
>>> - * When we get a SIGKILL our release function will
>>> - * be called. That will stop new IOs from being queued
>>> - * and check for outstanding cmd responses. It will then
>>> - * call vhost_task_stop to exit us.
>>> + * When we get a SIGKILL we kick off a work to
>>> + * run the driver's helper to stop new work and
>>> + * handle completions. When they are done they will
>>> + * call vhost_task_stop to tell us to exit.
>>> */
>>> - vhost_task_get_signal();
>>> + if (vhost_task_get_signal())
>>> + schedule_work(&dev->destroy_worker);
>>> }
>>
>> I'm pretty sure you still need to actually call exit here. Basically
>> mirror what's done in io_worker_exit() minus the io specific bits.
>
> We do call do_exit(). Once destroy_worker has flushed the device and
> all outstanding IO has completed, it calls vhost_task_stop(). vhost_worker()
> above then breaks out of the loop and returns, and vhost_task_fn() does
> do_exit().

I am not certain how you want to structure this, but you really should
not call get_signal again after it has returned positive and before you
call do_exit.

You are in completely uncharted and untested waters calling get_signal
multiple times when get_signal has decided the proper response is to
call do_exit itself.

Eric
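
Concretely, the pattern being warned about could be avoided with a one-shot
guard along these lines (untested sketch; the signal_seen field is
hypothetical and not part of the posted patch):

        if (!node) {
                schedule();
                /* Only consult vhost_task_get_signal() until it first
                 * reports a fatal signal; after that, remember the fact
                 * and head toward the single do_exit() in vhost_task_fn()
                 * once the remaining work is drained.
                 */
                if (!worker->signal_seen && vhost_task_get_signal()) {
                        worker->signal_seen = true;
                        schedule_work(&dev->destroy_worker);
                }
        }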