After a write to the SSD completes, bcache schedules journal_write work
on system_wq, which is a public system workqueue without the
WQ_MEM_RECLAIM flag. system_wq is also a bound wq, and there may be no
idle kworker on the current processor. Creating a new kworker may
unfortunately need to reclaim memory first, by shrinking the cache and
slab used by vfs, which depend on the bcache device. That's a deadlock.
This patch creates a new workqueue for journal_write with the
WQ_MEM_RECLAIM flag. Its rescuer thread will work to avoid the deadlock.
Signed-off-by: guoju <[email protected]>
---
drivers/md/bcache/bcache.h | 1 +
drivers/md/bcache/journal.c | 6 +++---
drivers/md/bcache/super.c | 8 ++++++++
3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 83504dd..954dad2 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -965,6 +965,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
extern struct workqueue_struct *bcache_wq;
+extern struct workqueue_struct *bch_journal_wq;
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 6116bbf..522c742 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -485,7 +485,7 @@ static void do_journal_discard(struct cache *ca)
closure_get(&ca->set->cl);
INIT_WORK(&ja->discard_work, journal_discard_work);
- schedule_work(&ja->discard_work);
+ queue_work(bch_journal_wq, &ja->discard_work);
}
}
@@ -592,7 +592,7 @@ static void journal_write_done(struct closure *cl)
: &j->w[0];
__closure_wake_up(&w->wait);
- continue_at_nobarrier(cl, journal_write, system_wq);
+ continue_at_nobarrier(cl, journal_write, bch_journal_wq);
}
static void journal_write_unlock(struct closure *cl)
@@ -627,7 +627,7 @@ static void journal_write_unlocked(struct closure *cl)
spin_unlock(&c->journal.lock);
btree_flush_write(c);
- continue_at(cl, journal_write, system_wq);
+ continue_at(cl, journal_write, bch_journal_wq);
return;
}
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 3ede144..64715a8 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -47,6 +47,7 @@
static DEFINE_IDA(bcache_device_idx);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
+struct workqueue_struct *bch_journal_wq;
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
/* limitation of partitions number on single bcache device */
@@ -2344,6 +2345,9 @@ static void bcache_exit(void)
kobject_put(bcache_kobj);
if (bcache_wq)
destroy_workqueue(bcache_wq);
+ if (bch_journal_wq)
+ destroy_workqueue(bch_journal_wq);
+
if (bcache_major)
unregister_blkdev(bcache_major, "bcache");
unregister_reboot_notifier(&reboot);
@@ -2373,6 +2377,10 @@ static int __init bcache_init(void)
if (!bcache_wq)
goto err;
+ bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
+ if (!bch_journal_wq)
+ goto err;
+
bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
if (!bcache_kobj)
goto err;
--
1.8.3.1
On 9/27/18 9:45 PM, guoju wrote:
> After write SSD completed, bcache schedule journal_write work to
> system_wq, that is a public workqueue in system, without WQ_MEM_RECLAIM
> flag. system_wq is also a bound wq, and there may be no idle kworker on
> current processor. Creating a new kworker may unfortunately need to
> reclaim memory first, by shrinking cache and slab used by vfs, which
> depends on bcache device. That's a deadlock.
>
> This patch create a new workqueue for journal_write with WQ_MEM_RECLAIM
> flag. It's rescuer thread will work to avoid the deadlock.
>
> Signed-off-by: guoju <[email protected]>
Nice catch, this fix is quite important. I will try to submit to Jens ASAP.
Thanks.
Coly Li
> ---
> drivers/md/bcache/bcache.h | 1 +
> drivers/md/bcache/journal.c | 6 +++---
> drivers/md/bcache/super.c | 8 ++++++++
> 3 files changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 83504dd..954dad2 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -965,6 +965,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
> void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
>
> extern struct workqueue_struct *bcache_wq;
> +extern struct workqueue_struct *bch_journal_wq;
> extern struct mutex bch_register_lock;
> extern struct list_head bch_cache_sets;
>
> diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
> index 6116bbf..522c742 100644
> --- a/drivers/md/bcache/journal.c
> +++ b/drivers/md/bcache/journal.c
> @@ -485,7 +485,7 @@ static void do_journal_discard(struct cache *ca)
>
> closure_get(&ca->set->cl);
> INIT_WORK(&ja->discard_work, journal_discard_work);
> - schedule_work(&ja->discard_work);
> + queue_work(bch_journal_wq, &ja->discard_work);
> }
> }
>
> @@ -592,7 +592,7 @@ static void journal_write_done(struct closure *cl)
> : &j->w[0];
>
> __closure_wake_up(&w->wait);
> - continue_at_nobarrier(cl, journal_write, system_wq);
> + continue_at_nobarrier(cl, journal_write, bch_journal_wq);
> }
>
> static void journal_write_unlock(struct closure *cl)
> @@ -627,7 +627,7 @@ static void journal_write_unlocked(struct closure *cl)
> spin_unlock(&c->journal.lock);
>
> btree_flush_write(c);
> - continue_at(cl, journal_write, system_wq);
> + continue_at(cl, journal_write, bch_journal_wq);
> return;
> }
>
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 3ede144..64715a8 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -47,6 +47,7 @@
> static DEFINE_IDA(bcache_device_idx);
> static wait_queue_head_t unregister_wait;
> struct workqueue_struct *bcache_wq;
> +struct workqueue_struct *bch_journal_wq;
>
> #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
> /* limitation of partitions number on single bcache device */
> @@ -2344,6 +2345,9 @@ static void bcache_exit(void)
> kobject_put(bcache_kobj);
> if (bcache_wq)
> destroy_workqueue(bcache_wq);
> + if (bch_journal_wq)
> + destroy_workqueue(bch_journal_wq);
> +
> if (bcache_major)
> unregister_blkdev(bcache_major, "bcache");
> unregister_reboot_notifier(&reboot);
> @@ -2373,6 +2377,10 @@ static int __init bcache_init(void)
> if (!bcache_wq)
> goto err;
>
> + bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
> + if (!bch_journal_wq)
> + goto err;
> +
> bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
> if (!bcache_kobj)
> goto err;
On 27/09/18 16:23, Coly Li wrote:
>
> On 9/27/18 9:45 PM, guoju wrote:
>> After write SSD completed, bcache schedule journal_write work to
>> system_wq, that is a public workqueue in system, without WQ_MEM_RECLAIM
>> flag. system_wq is also a bound wq, and there may be no idle kworker on
>> current processor. Creating a new kworker may unfortunately need to
>> reclaim memory first, by shrinking cache and slab used by vfs, which
>> depends on bcache device. That's a deadlock.
>>
>> This patch create a new workqueue for journal_write with WQ_MEM_RECLAIM
>> flag. It's rescuer thread will work to avoid the deadlock.
>>
>> Signed-off-by: guoju <[email protected]>
>
> Nice catch, this fix is quite important. I will try to submit to Jens ASAP.
>
> Thanks.
>
> Coly Li
Once this goes into 4.19, would this be a candidate for backporting to
any stable kernels, or does it only fix something introduced in this cycle?
thanks,
Eddie
Hi Coly,
is this the deadlock I reported some weeks ago?
Greets,
Stefan
Excuse my typo sent from my mobile phone.
Am 27.09.2018 um 17:53 schrieb Eddie Chapman <[email protected]
<mailto:[email protected]>>:
> On 27/09/18 16:23, Coly Li wrote:
>> On 9/27/18 9:45 PM, guoju wrote:
>>> After write SSD completed, bcache schedule journal_write work to
>>> system_wq, that is a public workqueue in system, without WQ_MEM_RECLAIM
>>> flag. system_wq is also a bound wq, and there may be no idle kworker on
>>> current processor. Creating a new kworker may unfortunately need to
>>> reclaim memory first, by shrinking cache and slab used by vfs, which
>>> depends on bcache device. That's a deadlock.
>>>
>>> This patch create a new workqueue for journal_write with WQ_MEM_RECLAIM
>>> flag. It's rescuer thread will work to avoid the deadlock.
>>>
>>> Signed-off-by: guoju <[email protected] <mailto:[email protected]>>
>> Nice catch, this fix is quite important. I will try to submit to Jens
>> ASAP.
>> Thanks.
>> Coly Li
>
> Once this goes into 4.19, would this be a candidate for backporting to
> any stable kernels, or does it only fix something introduced in this
> cycle?
>
> thanks,
> Eddie
Hi Stefan,
This bug was triggered by the following conditions:
1, few system memory available to allocate
2, journal delayed its operations to system_wq, which needs to allocate
memory to execute.
3, Due to lack of memory, kernel starts to reclaim system memory, and
trigger writeback to file system on top of bcache device
4, the memory writeback I/O hitting bcache device via upper layer file
system, requiring more bcache journal operations
5, a loop-blocking issue happens in bcache journal
If your system is under heavy memory pressure, this deadlock may also
happen in your environment. Anyway, this is a patch I suggest applying
because it fixes a real deadlock which can happen when system memory is
exhausted.
Thanks.
Coly Li
On 9/28/18 1:16 AM, Stefan Priebe - Profihost AG wrote:
> Hi Coly,
>
> is this the deadlock I reported some weeks ago?
>
> Greets,
> Stefan
>
> Excuse my typo sent from my mobile phone.
>
> Am 27.09.2018 um 17:53 schrieb Eddie Chapman <[email protected]
> <mailto:[email protected]>>:
>
>> On 27/09/18 16:23, Coly Li wrote:
>>> On 9/27/18 9:45 PM, guoju wrote:
>>>> After write SSD completed, bcache schedule journal_write work to
>>>> system_wq, that is a public workqueue in system, without WQ_MEM_RECLAIM
>>>> flag. system_wq is also a bound wq, and there may be no idle kworker on
>>>> current processor. Creating a new kworker may unfortunately need to
>>>> reclaim memory first, by shrinking cache and slab used by vfs, which
>>>> depends on bcache device. That's a deadlock.
>>>>
>>>> This patch create a new workqueue for journal_write with WQ_MEM_RECLAIM
>>>> flag. It's rescuer thread will work to avoid the deadlock.
>>>>
>>>> Signed-off-by: guoju <[email protected] <mailto:[email protected]>>
>>> Nice catch, this fix is quite important. I will try to submit to
>>> Jens ASAP.
>>> Thanks.
>>> Coly Li
>>
>> Once this goes into 4.19, would this be a candidate for backporting
>> to any stable kernels, or does it only fix something introduced in
>> this cycle?
>>
>> thanks,
>> Eddie
On 9/27/18 11:53 PM, Eddie Chapman wrote:
> On 27/09/18 16:23, Coly Li wrote:
>>
>> On 9/27/18 9:45 PM, guoju wrote:
>>> After write SSD completed, bcache schedule journal_write work to
>>> system_wq, that is a public workqueue in system, without WQ_MEM_RECLAIM
>>> flag. system_wq is also a bound wq, and there may be no idle kworker on
>>> current processor. Creating a new kworker may unfortunately need to
>>> reclaim memory first, by shrinking cache and slab used by vfs, which
>>> depends on bcache device. That's a deadlock.
>>>
>>> This patch create a new workqueue for journal_write with WQ_MEM_RECLAIM
>>> flag. It's rescuer thread will work to avoid the deadlock.
>>>
>>> Signed-off-by: guoju <[email protected]>
>>
>> Nice catch, this fix is quite important. I will try to submit to Jens
>> ASAP.
>>
>> Thanks.
>>
>> Coly Li
>
> Once this goes into 4.19, would this be a candidate for backporting to
> any stable kernels, or does it only fix something introduced in this
> cycle?
>
This bug has existed upstream for quite a long time; the fix should be
applied to all stable kernels to which it can be applied. And it is
Cc'ed to [email protected] already.
Coly Li
On 28/09/18 03:32, Coly Li wrote:
>
> On 9/27/18 11:53 PM, Eddie Chapman wrote:
>> On 27/09/18 16:23, Coly Li wrote:
>>>
>>> On 9/27/18 9:45 PM, guoju wrote:
>>>> After write SSD completed, bcache schedule journal_write work to
>>>> system_wq, that is a public workqueue in system, without WQ_MEM_RECLAIM
>>>> flag. system_wq is also a bound wq, and there may be no idle kworker on
>>>> current processor. Creating a new kworker may unfortunately need to
>>>> reclaim memory first, by shrinking cache and slab used by vfs, which
>>>> depends on bcache device. That's a deadlock.
>>>>
>>>> This patch create a new workqueue for journal_write with WQ_MEM_RECLAIM
>>>> flag. It's rescuer thread will work to avoid the deadlock.
>>>>
>>>> Signed-off-by: guoju <[email protected]>
>>>
>>> Nice catch, this fix is quite important. I will try to submit to Jens
>>> ASAP.
>>>
>>> Thanks.
>>>
>>> Coly Li
>>
>> Once this goes into 4.19, would this be a candidate for backporting to
>> any stable kernels, or does it only fix something introduced in this
>> cycle?
>>
> This bug exists in upstream for quite long time, it should be applied to
> all stable kernels which it can be applied. And it is Cced to
> [email protected] already.
>
> Coly Li
Thanks Coly! :-)
Just to let you know, I applied this (and couple of other cherry picks)
to a couple of 4.14 boxes last night, so far so good, running without
issues. However, this one needed this recent commit upstream as a
pre-requisite:
16c1fdf4cfd6c0091e59b93ec2cb7e99973f8244
bcache: do not assign in if condition in bcache_init()
in order to be able to apply it.
This is because the context of the second hunk for
drivers/md/bcache/super.c (in this journal_write workqueue patch)
contains code added by that commit 16c1fdf4cfd6c0091e59b93ec2cb7e99973f8244.
So I guess either 16c1fdf4cfd6c0091e59b93ec2cb7e99973f8244 also needs
tagging for stable, or perhaps a backport of this journal_write
workqueue will have to be created for earlier kernels, with different
context for that hunk?
Eddie
On 2018/10/4 下午10:07, Eddie Chapman wrote:
> On 28/09/18 03:32, Coly Li wrote:
>>
>> On 9/27/18 11:53 PM, Eddie Chapman wrote:
>>> On 27/09/18 16:23, Coly Li wrote:
>>>>
>>>> On 9/27/18 9:45 PM, guoju wrote:
>>>>> After write SSD completed, bcache schedule journal_write work to
>>>>> system_wq, that is a public workqueue in system, without
>>>>> WQ_MEM_RECLAIM>>>> flag. system_wq is also a bound wq, and there
>>>>> may be no idle kworker on
>>>>> current processor. Creating a new kworker may unfortunately need to
>>>>> reclaim memory first, by shrinking cache and slab used by vfs, which
>>>>> depends on bcache device. That's a deadlock.
>>>>>
>>>>> This patch create a new workqueue for journal_write with
>>>>> WQ_MEM_RECLAIM
>>>>> flag. It's rescuer thread will work to avoid the deadlock.
>>>>>
>>>>> Signed-off-by: guoju <[email protected]>
>>>>
>>>> Nice catch, this fix is quite important. I will try to submit to
>>>> Jens ASAP.
>>>>
>>>> Thanks.
>>>>
>>>> Coly Li
>>>
>>> Once this goes into 4.19, would this be a candidate for backporting
>>> to any stable kernels, or does it only fix something introduced in
>>> this cycle?
>>>
>> This bug exists in upstream for quite long time, it should be applied
>> to all stable kernels which it can be applied. And it is Cced to
>> [email protected] already.
>>
>> Coly Li
>
> Thanks Coly! :-)
>
> Just to let you know, I applied this (and couple of other cherry picks)
> to a couple of 4.14 boxes last night, so far so good, running without
> issues. However, this one needed this recent commit upstream as a
> pre-requisite:
>
> 16c1fdf4cfd6c0091e59b93ec2cb7e99973f8244
> bcache: do not assign in if condition in bcache_init()
>
> in order to be able to apply it.
>
> This is because the context of the second hunk for
> drivers/md/bcache/super.c (in this journal_write workqueue patch)
> contains code added by that commit
> 16c1fdf4cfd6c0091e59b93ec2cb7e99973f8244.
>
> So I guess either 16c1fdf4cfd6c0091e59b93ec2cb7e99973f8244 also needs
> tagging for stable, or perhaps a backport of this journal_write
> workqueue will have to be created for earlier kernels, with different
> context for that hunk?
>
> Eddie
Hi Eddie,
Yes I missed the patch dependency, thanks for the hint:-)
Guoju Fang or I will take care of the back port to stable tree.
Coly Li