2021-12-27 09:13:34

by Yongji Xie

Subject: [PATCH v2] nbd: Don't use workqueue to handle recv work

The rescuer thread might take over the works queued on
the workqueue when worker thread creation times out.
If this happens, we have no chance to create multiple
recv threads, which causes I/O to hang on this nbd device.

To fix it, we cannot simply remove the WQ_MEM_RECLAIM
flag since the recv work is in the memory reclaim path.
So this patch creates kthreads directly to handle the
recv work instead of using a workqueue.

Fixes: 124d6db07c3b ("nbd: use our own workqueue for recv threads")
Signed-off-by: Xie Yongji <[email protected]>
---
drivers/block/nbd.c | 95 +++++++++++++++++++++++++++++++----------------------
1 file changed, 56 insertions(+), 39 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 5a1f98494ddd..e572d1dc20b4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -63,7 +63,6 @@ struct nbd_sock {
};

struct recv_thread_args {
- struct work_struct work;
struct nbd_device *nbd;
int index;
};
@@ -97,6 +96,7 @@ struct nbd_config {

atomic_t recv_threads;
wait_queue_head_t recv_wq;
+ spinlock_t recv_lock;
unsigned int blksize_bits;
loff_t bytesize;
#if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -118,7 +118,6 @@ struct nbd_device {
struct nbd_config *config;
struct mutex config_lock;
struct gendisk *disk;
- struct workqueue_struct *recv_workq;
struct work_struct remove_work;

struct list_head list;
@@ -260,7 +259,6 @@ static void nbd_dev_remove(struct nbd_device *nbd)
mutex_lock(&nbd_index_mutex);
idr_remove(&nbd_index_idr, nbd->index);
mutex_unlock(&nbd_index_mutex);
- destroy_workqueue(nbd->recv_workq);
kfree(nbd);
}

@@ -818,11 +816,19 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
return ret ? ERR_PTR(ret) : cmd;
}

-static void recv_work(struct work_struct *work)
+static void flush_recv_works(struct nbd_device *nbd)
{
- struct recv_thread_args *args = container_of(work,
- struct recv_thread_args,
- work);
+ wait_event(nbd->config->recv_wq,
+ atomic_read(&nbd->config->recv_threads) == 0);
+
+ /* Make sure recv threads have no reference to nbd->config */
+ spin_lock(&nbd->config->recv_lock);
+ spin_unlock(&nbd->config->recv_lock);
+}
+
+static int recv_work(void *data)
+{
+ struct recv_thread_args *args = (struct recv_thread_args *)data;
struct nbd_device *nbd = args->nbd;
struct nbd_config *config = nbd->config;
struct request_queue *q = nbd->disk->queue;
@@ -866,9 +872,14 @@ static void recv_work(struct work_struct *work)
mutex_unlock(&nsock->tx_lock);

nbd_config_put(nbd);
+
+ spin_lock(&config->recv_lock);
atomic_dec(&config->recv_threads);
wake_up(&config->recv_wq);
+ spin_unlock(&config->recv_lock);
kfree(args);
+
+ return 0;
}

static bool nbd_clear_req(struct request *req, void *data, bool reserved)
@@ -1176,6 +1187,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)

for (i = 0; i < config->num_connections; i++) {
struct nbd_sock *nsock = config->socks[i];
+ struct task_struct *worker;

if (!nsock->dead)
continue;
@@ -1185,6 +1197,14 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
mutex_unlock(&nsock->tx_lock);
continue;
}
+ worker = kthread_create(recv_work, args, "knbd%d.%d-recv",
+ nbd->index, i);
+ if (IS_ERR(worker)) {
+ sockfd_put(sock);
+ kfree(args);
+ return -ENOMEM;
+ }
+
sk_set_memalloc(sock->sk);
if (nbd->tag_set.timeout)
sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
@@ -1194,7 +1214,6 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
nsock->fallback_index = -1;
nsock->sock = sock;
nsock->dead = false;
- INIT_WORK(&args->work, recv_work);
args->index = i;
args->nbd = nbd;
nsock->cookie++;
@@ -1206,7 +1225,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
/* We take the tx_mutex in an error path in the recv_work, so we
* need to queue_work outside of the tx_mutex.
*/
- queue_work(nbd->recv_workq, &args->work);
+ wake_up_process(worker);

atomic_inc(&config->live_connections);
wake_up(&config->conn_wait);
@@ -1359,34 +1378,42 @@ static int nbd_start_device(struct nbd_device *nbd)
nbd_dev_dbg_init(nbd);
for (i = 0; i < num_connections; i++) {
struct recv_thread_args *args;
+ struct task_struct *worker;

args = kzalloc(sizeof(*args), GFP_KERNEL);
- if (!args) {
- sock_shutdown(nbd);
- /*
- * If num_connections is m (2 < m),
- * and NO.1 ~ NO.n(1 < n < m) kzallocs are successful.
- * But NO.(n + 1) failed. We still have n recv threads.
- * So, add flush_workqueue here to prevent recv threads
- * dropping the last config_refs and trying to destroy
- * the workqueue from inside the workqueue.
- */
- if (i)
- flush_workqueue(nbd->recv_workq);
- return -ENOMEM;
+ if (!args)
+ goto err;
+
+ worker = kthread_create(recv_work, args, "knbd%d.%d-recv",
+ nbd->index, i);
+ if (IS_ERR(worker)) {
+ kfree(args);
+ goto err;
}
+
sk_set_memalloc(config->socks[i]->sock->sk);
if (nbd->tag_set.timeout)
config->socks[i]->sock->sk->sk_sndtimeo =
nbd->tag_set.timeout;
atomic_inc(&config->recv_threads);
refcount_inc(&nbd->config_refs);
- INIT_WORK(&args->work, recv_work);
args->nbd = nbd;
args->index = i;
- queue_work(nbd->recv_workq, &args->work);
+ wake_up_process(worker);
}
return nbd_set_size(nbd, config->bytesize, nbd_blksize(config));
+err:
+ sock_shutdown(nbd);
+ /*
+ * If num_connections is m (2 < m),
+ * and NO.1 ~ NO.n(1 < n < m) connections are successful.
+ * But NO.(n + 1) failed. We still have n recv threads.
+ * So, add flush_recv_works here to prevent recv threads
+ * dropping the last config_refs.
+ */
+ flush_recv_works(nbd);
+
+ return -ENOMEM;
}

static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
@@ -1405,7 +1432,7 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *b
atomic_read(&config->recv_threads) == 0);
if (ret)
sock_shutdown(nbd);
- flush_workqueue(nbd->recv_workq);
+ flush_recv_works(nbd);

mutex_lock(&nbd->config_lock);
nbd_bdev_reset(bdev);
@@ -1525,6 +1552,7 @@ static struct nbd_config *nbd_alloc_config(void)
atomic_set(&config->recv_threads, 0);
init_waitqueue_head(&config->recv_wq);
init_waitqueue_head(&config->conn_wait);
+ spin_lock_init(&config->recv_lock);
config->blksize_bits = NBD_DEF_BLKSIZE_BITS;
atomic_set(&config->live_connections, 0);
try_module_get(THIS_MODULE);
@@ -1769,15 +1797,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
}
nbd->disk = disk;

- nbd->recv_workq = alloc_workqueue("nbd%d-recv",
- WQ_MEM_RECLAIM | WQ_HIGHPRI |
- WQ_UNBOUND, 0, nbd->index);
- if (!nbd->recv_workq) {
- dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
- err = -ENOMEM;
- goto out_err_disk;
- }
-
/*
* Tell the block layer that we are not a rotational device
*/
@@ -1808,7 +1827,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
disk->first_minor = index << part_shift;
if (disk->first_minor < index || disk->first_minor > MINORMASK) {
err = -EINVAL;
- goto out_free_work;
+ goto out_err_disk;
}

disk->minors = 1 << part_shift;
@@ -1817,7 +1836,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
sprintf(disk->disk_name, "nbd%d", index);
err = add_disk(disk);
if (err)
- goto out_free_work;
+ goto out_err_disk;

/*
* Now publish the device.
@@ -1826,8 +1845,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
nbd_total_devices++;
return nbd;

-out_free_work:
- destroy_workqueue(nbd->recv_workq);
out_err_disk:
blk_cleanup_disk(disk);
out_free_idr:
@@ -2086,7 +2103,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd)
* Make sure recv thread has finished, we can safely call nbd_clear_que()
* to cancel the inflight I/Os.
*/
- flush_workqueue(nbd->recv_workq);
+ flush_recv_works(nbd);
nbd_clear_que(nbd);
nbd->task_setup = NULL;
mutex_unlock(&nbd->config_lock);
--
2.11.0



2021-12-29 17:35:57

by Christoph Hellwig

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> The rescuer thread might take over the works queued on
> the workqueue when the worker thread creation timed out.
> If this happens, we have no chance to create multiple
> recv threads which causes I/O hung on this nbd device.

If a workqueue is used there aren't really 'receive threads'.
What is the deadlock here?

2021-12-30 04:01:38

by Yongji Xie

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Thu, Dec 30, 2021 at 1:35 AM Christoph Hellwig <[email protected]> wrote:
>
> On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> > The rescuer thread might take over the works queued on
> > the workqueue when the worker thread creation timed out.
> > If this happens, we have no chance to create multiple
> > recv threads which causes I/O hung on this nbd device.
>
> If a workqueue is used there aren't really 'receive threads'.
> What is the deadlock here?

We might have multiple recv works, and those recv works won't quit
unless the socket is closed. If the rescuer thread takes over those
works, only the first recv work can run. The I/O that should be handled
by the other recv works would hang since no thread can handle it.

In that case, we can see the following stack in the rescuer thread:

__schedule
schedule
schedule_timeout
unix_stream_read_generic
unix_stream_recvmsg
sock_xmit
nbd_read_stat
recv_work
process_one_work
rescuer_thread
kthread
ret_from_fork
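
To make it concrete, the driver currently queues one long-running recv
work per connection, roughly (simplified from nbd_start_device() as it
is today):

    for (i = 0; i < num_connections; i++) {
        struct recv_thread_args *args;

        args = kzalloc(sizeof(*args), GFP_KERNEL);
        ...
        /* recv_work() loops in recvmsg() and only returns
         * once its socket is closed */
        INIT_WORK(&args->work, recv_work);
        queue_work(nbd->recv_workq, &args->work);
    }

So once the rescuer picks up the first recv work it is stuck in it, and
the remaining queued recv works never get a thread to run on.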

Thanks,
Yongji

2022-01-03 16:10:53

by Josef Bacik

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Thu, Dec 30, 2021 at 12:01:23PM +0800, Yongji Xie wrote:
> On Thu, Dec 30, 2021 at 1:35 AM Christoph Hellwig <[email protected]> wrote:
> >
> > On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> > > The rescuer thread might take over the works queued on
> > > the workqueue when the worker thread creation timed out.
> > > If this happens, we have no chance to create multiple
> > > recv threads which causes I/O hung on this nbd device.
> >
> > If a workqueue is used there aren't really 'receive threads'.
> > What is the deadlock here?
>
> We might have multiple recv works, and those recv works won't quit
> unless the socket is closed. If the rescuer thread takes over those
> works, only the first recv work can run. The I/O needed to be handled
> in other recv works would be hung since no thread can handle them.
>

I'm not following this explanation. What is the rescuer thread you're talking
about? If there's an error, we close the socket, which will error out the
recvmsg, which will make the recv workqueue close down.

> In that case, we can see below stacks in rescuer thread:
>
> __schedule
> schedule
> scheule_timeout
> unix_stream_read_generic
> unix_stream_recvmsg
> sock_xmit
> nbd_read_stat
> recv_work
> process_one_work
> rescuer_thread
> kthread
> ret_from_fork

This is just the thing hanging waiting for an incoming request, so this doesn't
tell me anything. Thanks,

Josef

2022-01-04 05:32:01

by Yongji Xie

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Tue, Jan 4, 2022 at 12:10 AM Josef Bacik <[email protected]> wrote:
>
> On Thu, Dec 30, 2021 at 12:01:23PM +0800, Yongji Xie wrote:
> > On Thu, Dec 30, 2021 at 1:35 AM Christoph Hellwig <[email protected]> wrote:
> > >
> > > On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> > > > The rescuer thread might take over the works queued on
> > > > the workqueue when the worker thread creation timed out.
> > > > If this happens, we have no chance to create multiple
> > > > recv threads which causes I/O hung on this nbd device.
> > >
> > > If a workqueue is used there aren't really 'receive threads'.
> > > What is the deadlock here?
> >
> > We might have multiple recv works, and those recv works won't quit
> > unless the socket is closed. If the rescuer thread takes over those
> > works, only the first recv work can run. The I/O needed to be handled
> > in other recv works would be hung since no thread can handle them.
> >
>
> I'm not following this explanation. What is the rescuer thread you're talking

https://www.kernel.org/doc/html/latest/core-api/workqueue.html#c.rescuer_thread

> about? If there's an error we close the socket which will error out the recvmsg
> which will make the recv workqueue close down.

When would the socket be closed? The nbd daemon doesn't know what happens
in the kernel.

>
> > In that case, we can see below stacks in rescuer thread:
> >
> > __schedule
> > schedule
> > scheule_timeout
> > unix_stream_read_generic
> > unix_stream_recvmsg
> > sock_xmit
> > nbd_read_stat
> > recv_work
> > process_one_work
> > rescuer_thread
> > kthread
> > ret_from_fork
>
> This is just the thing hanging waiting for an incoming request, so this doesn't
> tell me anything. Thanks,
>

The point is that the *recv_work* is handled in the *rescuer_thread*.
Normally it should be handled in a *worker_thread*, like:

__schedule
schedule
schedule_timeout
unix_stream_read_generic
unix_stream_recvmsg
sock_xmit
nbd_read_stat
recv_work
process_one_work
*worker_thread*
kthread
ret_from_fork

Thanks,
Yongji

2022-01-04 18:06:37

by Josef Bacik

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Tue, Jan 04, 2022 at 01:31:47PM +0800, Yongji Xie wrote:
> On Tue, Jan 4, 2022 at 12:10 AM Josef Bacik <[email protected]> wrote:
> >
> > On Thu, Dec 30, 2021 at 12:01:23PM +0800, Yongji Xie wrote:
> > > On Thu, Dec 30, 2021 at 1:35 AM Christoph Hellwig <[email protected]> wrote:
> > > >
> > > > On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> > > > > The rescuer thread might take over the works queued on
> > > > > the workqueue when the worker thread creation timed out.
> > > > > If this happens, we have no chance to create multiple
> > > > > recv threads which causes I/O hung on this nbd device.
> > > >
> > > > If a workqueue is used there aren't really 'receive threads'.
> > > > What is the deadlock here?
> > >
> > > We might have multiple recv works, and those recv works won't quit
> > > unless the socket is closed. If the rescuer thread takes over those
> > > works, only the first recv work can run. The I/O needed to be handled
> > > in other recv works would be hung since no thread can handle them.
> > >
> >
> > I'm not following this explanation. What is the rescuer thread you're talking
>
> https://www.kernel.org/doc/html/latest/core-api/workqueue.html#c.rescuer_thread
>

Ahhh ok now I see, thanks, I didn't know this is how this worked.

So what happens is we do the queue_work(); this needs to do a GFP_KERNEL
allocation internally, we are unable to satisfy it, and thus the work gets
pushed onto the rescuer thread.

Then the rescuer thread can't be used in the future because it's doing this
long-running thing.

I think the correct thing to do here is simply drop the WQ_MEM_RECLAIM bit. It
makes sense for workqueues that handle short-lived works in the memory reclaim
path. That's not what these workers are doing: yes, they are in the reclaim
path, but they run the entire time the device is up. The actual work happens
as they process incoming requests. AFAICT WQ_MEM_RECLAIM doesn't affect the
actual allocations that the worker thread needs to do, which is what I think
the intention was in using WQ_MEM_RECLAIM, and that isn't really what it's
used for.
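
I.e. something like this (untested):

    nbd->recv_workq = alloc_workqueue("nbd%d-recv",
                                      WQ_HIGHPRI | WQ_UNBOUND,
                                      0, nbd->index);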

tl;dr, just remove the WQ_MEM_RECLAIM flag completely and I think that's good
enough? Thanks,

Josef

2022-01-05 05:36:23

by Yongji Xie

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Wed, Jan 5, 2022 at 2:06 AM Josef Bacik <[email protected]> wrote:
>
> On Tue, Jan 04, 2022 at 01:31:47PM +0800, Yongji Xie wrote:
> > On Tue, Jan 4, 2022 at 12:10 AM Josef Bacik <[email protected]> wrote:
> > >
> > > On Thu, Dec 30, 2021 at 12:01:23PM +0800, Yongji Xie wrote:
> > > > On Thu, Dec 30, 2021 at 1:35 AM Christoph Hellwig <[email protected]> wrote:
> > > > >
> > > > > On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> > > > > > The rescuer thread might take over the works queued on
> > > > > > the workqueue when the worker thread creation timed out.
> > > > > > If this happens, we have no chance to create multiple
> > > > > > recv threads which causes I/O hung on this nbd device.
> > > > >
> > > > > If a workqueue is used there aren't really 'receive threads'.
> > > > > What is the deadlock here?
> > > >
> > > > We might have multiple recv works, and those recv works won't quit
> > > > unless the socket is closed. If the rescuer thread takes over those
> > > > works, only the first recv work can run. The I/O needed to be handled
> > > > in other recv works would be hung since no thread can handle them.
> > > >
> > >
> > > I'm not following this explanation. What is the rescuer thread you're talking
> >
> > https://www.kernel.org/doc/html/latest/core-api/workqueue.html#c.rescuer_thread
> >
>
> Ahhh ok now I see, thanks, I didn't know this is how this worked.
>
> So what happens is we do the queue_work(), this needs to do a GFP_KERNEL
> allocation internally, we are unable to satisfy this, and thus the work gets
> pushed onto the rescuer thread.
>
> Then the rescuer thread can't be used in the future because it's doing this long
> running thing.
>

Yes.

> I think the correct thing to do here is simply drop the WQ_MEM_RECLAIM bit. It
> makes sense for workqueue's that are handling the work of short lived works that
> are in the memory reclaim path. That's not what these workers are doing, yes
> they are in the reclaim path, but they run the entire time the device is up.
> The actual work happens as they process incoming requests. AFAICT
> WQ_MEM_RECLAIM doesn't affect the actual allocations that the worker thread
> needs to do, which is what I think the intention was in using WQ_MEM_RECLAIM,
> which isn't really what it's used for.
>
> tl;dr, just remove thee WQ_MEM_RECLAIM flag completely and I think that's good
> enough? Thanks,
>

In the reconnect case, we still need to call queue_work() while the
device is running. So it looks like we can't simply remove the
WQ_MEM_RECLAIM flag.
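
E.g. nbd_reconnect_socket() (reached through the netlink reconfigure
command while the device is live, possibly under memory pressure) ends
up doing roughly:

    sk_set_memalloc(sock->sk);
    ...
    queue_work(nbd->recv_workq, &args->work);

and without WQ_MEM_RECLAIM there is no rescuer, so that queued recv work
may never get a worker thread if worker creation itself stalls in reclaim.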

Thanks,
Yongji

2022-01-22 00:42:24

by Yongji Xie

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

Ping.

On Wed, Jan 5, 2022 at 1:36 PM Yongji Xie <[email protected]> wrote:
>
> On Wed, Jan 5, 2022 at 2:06 AM Josef Bacik <[email protected]> wrote:
> >
> > On Tue, Jan 04, 2022 at 01:31:47PM +0800, Yongji Xie wrote:
> > > On Tue, Jan 4, 2022 at 12:10 AM Josef Bacik <[email protected]> wrote:
> > > >
> > > > On Thu, Dec 30, 2021 at 12:01:23PM +0800, Yongji Xie wrote:
> > > > > On Thu, Dec 30, 2021 at 1:35 AM Christoph Hellwig <[email protected]> wrote:
> > > > > >
> > > > > > On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> > > > > > > The rescuer thread might take over the works queued on
> > > > > > > the workqueue when the worker thread creation timed out.
> > > > > > > If this happens, we have no chance to create multiple
> > > > > > > recv threads which causes I/O hung on this nbd device.
> > > > > >
> > > > > > If a workqueue is used there aren't really 'receive threads'.
> > > > > > What is the deadlock here?
> > > > >
> > > > > We might have multiple recv works, and those recv works won't quit
> > > > > unless the socket is closed. If the rescuer thread takes over those
> > > > > works, only the first recv work can run. The I/O needed to be handled
> > > > > in other recv works would be hung since no thread can handle them.
> > > > >
> > > >
> > > > I'm not following this explanation. What is the rescuer thread you're talking
> > >
> > > https://www.kernel.org/doc/html/latest/core-api/workqueue.html#c.rescuer_thread
> > >
> >
> > Ahhh ok now I see, thanks, I didn't know this is how this worked.
> >
> > So what happens is we do the queue_work(), this needs to do a GFP_KERNEL
> > allocation internally, we are unable to satisfy this, and thus the work gets
> > pushed onto the rescuer thread.
> >
> > Then the rescuer thread can't be used in the future because it's doing this long
> > running thing.
> >
>
> Yes.
>
> > I think the correct thing to do here is simply drop the WQ_MEM_RECLAIM bit. It
> > makes sense for workqueue's that are handling the work of short lived works that
> > are in the memory reclaim path. That's not what these workers are doing, yes
> > they are in the reclaim path, but they run the entire time the device is up.
> > The actual work happens as they process incoming requests. AFAICT
> > WQ_MEM_RECLAIM doesn't affect the actual allocations that the worker thread
> > needs to do, which is what I think the intention was in using WQ_MEM_RECLAIM,
> > which isn't really what it's used for.
> >
> > tl;dr, just remove thee WQ_MEM_RECLAIM flag completely and I think that's good
> > enough? Thanks,
> >
>
> In the reconnect case, we still need to call queue_work() while the
> device is running. So it looks like we can't simply remove the
> WQ_MEM_RECLAIM flag.
>
> Thanks,
> Yongji

2022-02-15 15:28:17

by Yongji Xie

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

Ping again.

Hi Josef, could you take a look?

On Fri, Jan 21, 2022 at 4:34 PM Yongji Xie <[email protected]> wrote:
>
> Ping.
>
> On Wed, Jan 5, 2022 at 1:36 PM Yongji Xie <[email protected]> wrote:
> >
> > On Wed, Jan 5, 2022 at 2:06 AM Josef Bacik <[email protected]> wrote:
> > >
> > > On Tue, Jan 04, 2022 at 01:31:47PM +0800, Yongji Xie wrote:
> > > > On Tue, Jan 4, 2022 at 12:10 AM Josef Bacik <[email protected]> wrote:
> > > > >
> > > > > On Thu, Dec 30, 2021 at 12:01:23PM +0800, Yongji Xie wrote:
> > > > > > On Thu, Dec 30, 2021 at 1:35 AM Christoph Hellwig <[email protected]> wrote:
> > > > > > >
> > > > > > > On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> > > > > > > > The rescuer thread might take over the works queued on
> > > > > > > > the workqueue when the worker thread creation timed out.
> > > > > > > > If this happens, we have no chance to create multiple
> > > > > > > > recv threads which causes I/O hung on this nbd device.
> > > > > > >
> > > > > > > If a workqueue is used there aren't really 'receive threads'.
> > > > > > > What is the deadlock here?
> > > > > >
> > > > > > We might have multiple recv works, and those recv works won't quit
> > > > > > unless the socket is closed. If the rescuer thread takes over those
> > > > > > works, only the first recv work can run. The I/O needed to be handled
> > > > > > in other recv works would be hung since no thread can handle them.
> > > > > >
> > > > >
> > > > > I'm not following this explanation. What is the rescuer thread you're talking
> > > >
> > > > https://www.kernel.org/doc/html/latest/core-api/workqueue.html#c.rescuer_thread
> > > >
> > >
> > > Ahhh ok now I see, thanks, I didn't know this is how this worked.
> > >
> > > So what happens is we do the queue_work(), this needs to do a GFP_KERNEL
> > > allocation internally, we are unable to satisfy this, and thus the work gets
> > > pushed onto the rescuer thread.
> > >
> > > Then the rescuer thread can't be used in the future because it's doing this long
> > > running thing.
> > >
> >
> > Yes.
> >
> > > I think the correct thing to do here is simply drop the WQ_MEM_RECLAIM bit. It
> > > makes sense for workqueue's that are handling the work of short lived works that
> > > are in the memory reclaim path. That's not what these workers are doing, yes
> > > they are in the reclaim path, but they run the entire time the device is up.
> > > The actual work happens as they process incoming requests. AFAICT
> > > WQ_MEM_RECLAIM doesn't affect the actual allocations that the worker thread
> > > needs to do, which is what I think the intention was in using WQ_MEM_RECLAIM,
> > > which isn't really what it's used for.
> > >
> > > tl;dr, just remove thee WQ_MEM_RECLAIM flag completely and I think that's good
> > > enough? Thanks,
> > >
> >
> > In the reconnect case, we still need to call queue_work() while the
> > device is running. So it looks like we can't simply remove the
> > WQ_MEM_RECLAIM flag.
> >
> > Thanks,
> > Yongji

2022-02-18 15:36:08

by Josef Bacik

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Tue, Feb 15, 2022 at 09:17:37PM +0800, Yongji Xie wrote:
> Ping again.
>
> Hi Josef, could you take a look?

Sorry Yongji, this got lost. Again, in the reconnect case we're still setting
up a long-running thread, so it's not like it'll happen during a normal
reclaim path; it'll be acted upon by userspace. Thanks,

Josef

2022-02-19 13:31:19

by Yongji Xie

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Fri, Feb 18, 2022 at 11:21 PM Josef Bacik <[email protected]> wrote:
>
> On Tue, Feb 15, 2022 at 09:17:37PM +0800, Yongji Xie wrote:
> > Ping again.
> >
> > Hi Josef, could you take a look?
>
> Sorry Yongji this got lost. Again in the reconnect case we're still setting up
> a long running thread, so it's not like it'll happen during a normal reclaim
> path thing, it'll be acted upon by userspace. Thanks,
>

While creating this long-running thread, we might trigger memory
reclaim since the workqueue will use a GFP_KERNEL allocation for it.
Then a deadlock can occur if that memory reclaim happens to hit the
page cache on this reconnecting nbd device.
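
I.e. the dependency loop looks like:

queue_work() on reconnect
 -> worker pool needs a new kworker (GFP_KERNEL allocation)
  -> direct reclaim tries to write back dirty pages on this nbd device
   -> that writeback can't complete until the new recv work is running
    -> which is exactly what the allocation is waiting for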

Thanks,
Yongji

2022-03-22 08:33:19

by Yongji Xie

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

Ping.

Hi Josef, any other comments on this approach?

On Sat, Feb 19, 2022 at 9:04 PM Yongji Xie <[email protected]> wrote:
>
> On Fri, Feb 18, 2022 at 11:21 PM Josef Bacik <[email protected]> wrote:
> >
> > On Tue, Feb 15, 2022 at 09:17:37PM +0800, Yongji Xie wrote:
> > > Ping again.
> > >
> > > Hi Josef, could you take a look?
> >
> > Sorry Yongji this got lost. Again in the reconnect case we're still setting up
> > a long running thread, so it's not like it'll happen during a normal reclaim
> > path thing, it'll be acted upon by userspace. Thanks,
> >
>
> During creating this long running thread, we might trigger memory
> reclaim since workqueue will use GFP_KERNEL allocation for it. Then a
> deadlock can occur if the memory reclaim happens to hit the page cache
> on this reconnecting nbd device.
>
> Thanks,
> Yongji

2022-03-23 08:29:02

by Josef Bacik

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> The rescuer thread might take over the works queued on
> the workqueue when the worker thread creation timed out.
> If this happens, we have no chance to create multiple
> recv threads which causes I/O hung on this nbd device.
>
> To fix it, we can not simply remove the WQ_MEM_RECLAIM
> flag since the recv work is in the memory reclaim path.
> So this patch tries to create kthreads directly to
> handle the recv work instead of using workqueue.
>

I still don't understand why we can't drop WQ_MEM_RECLAIM. IIRC your argument
is that we need it because a reconnect could happen under memory pressure and we
need to be able to queue work for that. However your code makes it so we're
just doing a kthread_create(), which isn't coming out of some emergency pool, so
it's just as likely to fail as a !WQ_MEM_RECLAIM workqueue. Thanks,

Josef

2022-03-25 04:27:20

by Yongji Xie

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

On Wed, Mar 23, 2022 at 4:11 AM Josef Bacik <[email protected]> wrote:
>
> On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> > The rescuer thread might take over the works queued on
> > the workqueue when the worker thread creation timed out.
> > If this happens, we have no chance to create multiple
> > recv threads which causes I/O hung on this nbd device.
> >
> > To fix it, we can not simply remove the WQ_MEM_RECLAIM
> > flag since the recv work is in the memory reclaim path.
> > So this patch tries to create kthreads directly to
> > handle the recv work instead of using workqueue.
> >
>
> I still don't understand why we can't drop WQ_MEM_RECLAIM. IIRC your argument
> is that we need it because a reconnect could happen under memory pressure and we
> need to be able to queue work for that. However your code makes it so we're
> just doing a kthread_create(), which isn't coming out of some emergency pool, so
> it's just as likely to fail as a !WQ_MEM_RECLAIM workqueue. Thanks,
>

I think the key point is the context in which the worker thread is
created. With kthread_create(), the thread is created (which might do
some allocation) in the context of the nbd process, so we can benefit
from the PR_SET_IO_FLUSHER flag and memory reclaim will never hit the
page cache on the nbd device. But with queue_work(), the actual thread
creation happens in the context of the workqueue rather than the nbd
process, so we can't rely on the PR_SET_IO_FLUSHER flag to avoid the
deadlock.
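
For reference, this is roughly what the userspace side (the nbd daemon,
or whatever drives the reconnect) would do before (re)connecting the
device; a minimal sketch, assuming Linux >= 5.6 and CAP_SYS_RESOURCE:

    #include <err.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_IO_FLUSHER
    #define PR_SET_IO_FLUSHER 57    /* in case of older uapi headers */
    #endif

    /* Memory reclaim from this thread's context will no longer issue
     * new I/O (PF_MEMALLOC_NOIO semantics), so it can't recurse into
     * the nbd device this thread is serving. */
    if (prctl(PR_SET_IO_FLUSHER, 1, 0, 0, 0))
        err(1, "PR_SET_IO_FLUSHER");

    /* ... then issue the ioctl/netlink commands that (re)connect the
     * nbd device from this thread ... */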

Thanks,
Yongji

2022-05-17 01:11:43

by Yongji Xie

Subject: Re: [PATCH v2] nbd: Don't use workqueue to handle recv work

Ping.

On Wed, Mar 23, 2022 at 7:21 PM Yongji Xie <[email protected]> wrote:
>
> On Wed, Mar 23, 2022 at 4:11 AM Josef Bacik <[email protected]> wrote:
> >
> > On Mon, Dec 27, 2021 at 05:12:41PM +0800, Xie Yongji wrote:
> > > The rescuer thread might take over the works queued on
> > > the workqueue when the worker thread creation timed out.
> > > If this happens, we have no chance to create multiple
> > > recv threads which causes I/O hung on this nbd device.
> > >
> > > To fix it, we can not simply remove the WQ_MEM_RECLAIM
> > > flag since the recv work is in the memory reclaim path.
> > > So this patch tries to create kthreads directly to
> > > handle the recv work instead of using workqueue.
> > >
> >
> > I still don't understand why we can't drop WQ_MEM_RECLAIM. IIRC your argument
> > is that we need it because a reconnect could happen under memory pressure and we
> > need to be able to queue work for that. However your code makes it so we're
> > just doing a kthread_create(), which isn't coming out of some emergency pool, so
> > it's just as likely to fail as a !WQ_MEM_RECLAIM workqueue. Thanks,
> >
>
> I think the key point is the context in which the work thread is
> created. It's the context of the nbd process if using kthread_create()
> to create a workthread (might do some allocation). Then we can benefit
> from the PR_SET_IO_FLUSHER flag, so memory reclaim would never hit the
> page cache on the nbd device. But using queue_work() to create a
> workthread, the actual thread creation happens in the context of the
> work thread rather than the nbd process, so we can't rely on the
> PR_SET_IO_FLUSHER flag to avoid deadlock.
>
> Thanks,
> Yongji