2022-05-10 18:14:25

by Zhihao Cheng

[permalink] [raw]
Subject: [PATCH v3 1/1] fs-writeback: writeback_sb_inodes: Recalculate 'wrote' according to skipped pages

Commit 505a666ee3fc ("writeback: plug writeback in wb_writeback() and
writeback_inodes_wb()") has us holding a plug during wb_writeback, which
may cause a potential ABBA deadlock:

wb_writeback                    fat_file_fsync
blk_start_plug(&plug)
for (;;) {
  iter i-1: some reqs have been added into plug->mq_list // LOCK A
  iter i:
    progress = __writeback_inodes_wb(wb, work)
    . writeback_sb_inodes // fat's bdev
    .   __writeback_single_inode
    .   . generic_writepages
    .   . __block_write_full_page
    .   . . .                   __generic_file_fsync
    .   . . .                     sync_inode_metadata
    .   . . .                       writeback_single_inode
    .   . . .                         __writeback_single_inode
    .   . . .                           fat_write_inode
    .   . . .                             __fat_write_inode
    .   . . .                               sync_dirty_buffer // fat's bdev
    .   . . .                                 lock_buffer(bh) // LOCK B
    .   . . .                                   submit_bh
    .   . . .                                     blk_mq_get_tag // LOCK A
    .   . . trylock_buffer(bh) // LOCK B
    .   . .   redirty_page_for_writepage
    .   . .     wbc->pages_skipped++
    .   . --wbc->nr_to_write
    . wrote += write_chunk - wbc.nr_to_write // wrote > 0
    . requeue_inode
    .   redirty_tail_locked
    if (progress) // progress > 0
      continue;
  iter i+1:
    queue_io
    // similar process with iter i, infinite for-loop !
}
blk_finish_plug(&plug) // flush plug won't be called

Above process triggers a hungtask like:
[ 399.044861] INFO: task bb:2607 blocked for more than 30 seconds.
[ 399.046824] Not tainted 5.18.0-rc1-00005-gefae4d9eb6a2-dirty
[ 399.051539] task:bb state:D stack: 0 pid: 2607 ppid: 2426 flags:0x00004000
[ 399.051556] Call Trace:
[ 399.051570] __schedule+0x480/0x1050
[ 399.051592] schedule+0x92/0x1a0
[ 399.051602] io_schedule+0x22/0x50
[ 399.051613] blk_mq_get_tag+0x1d3/0x3c0
[ 399.051640] __blk_mq_alloc_requests+0x21d/0x3f0
[ 399.051657] blk_mq_submit_bio+0x68d/0xca0
[ 399.051674] __submit_bio+0x1b5/0x2d0
[ 399.051708] submit_bio_noacct+0x34e/0x720
[ 399.051718] submit_bio+0x3b/0x150
[ 399.051725] submit_bh_wbc+0x161/0x230
[ 399.051734] __sync_dirty_buffer+0xd1/0x420
[ 399.051744] sync_dirty_buffer+0x17/0x20
[ 399.051750] __fat_write_inode+0x289/0x310
[ 399.051766] fat_write_inode+0x2a/0xa0
[ 399.051783] __writeback_single_inode+0x53c/0x6f0
[ 399.051795] writeback_single_inode+0x145/0x200
[ 399.051803] sync_inode_metadata+0x45/0x70
[ 399.051856] __generic_file_fsync+0xa3/0x150
[ 399.051880] fat_file_fsync+0x1d/0x80
[ 399.051895] vfs_fsync_range+0x40/0xb0
[ 399.051929] __x64_sys_fsync+0x18/0x30

In my test, 'need_resched()' (which was introduced by commit 590dca3a71
"fs-writeback: unplug before cond_resched in writeback_sb_inodes") in function
'writeback_sb_inodes()' seldom becomes true, unless cond_resched() is deleted
from write_cache_pages().

Fix it by correcting the 'wrote' count according to the number of skipped
pages in writeback_sb_inodes().

See the Link below for a reproducer.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=215837
Cc: [email protected] # v4.3
Signed-off-by: Zhihao Cheng <[email protected]>
---
v2->v3:
Don't update 'work->nr_pages' (This variable means how many pages
to be processed).
fs/fs-writeback.c | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 591fe9cf1659..b20b70de9143 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1775,11 +1775,12 @@ static long writeback_sb_inodes(struct super_block *sb,
};
unsigned long start_time = jiffies;
long write_chunk;
- long wrote = 0; /* count both pages and inodes */
+ long total_wrote = 0; /* count both pages and inodes */

while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct bdi_writeback *tmp_wb;
+ long wrote;

if (inode->i_sb != sb) {
if (work->sb) {
@@ -1855,7 +1856,9 @@ static long writeback_sb_inodes(struct super_block *sb,

wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
- wrote += write_chunk - wbc.nr_to_write;
+ wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
+ wrote = wrote < 0 ? 0 : wrote;
+ total_wrote += wrote;

if (need_resched()) {
/*
@@ -1877,7 +1880,7 @@ static long writeback_sb_inodes(struct super_block *sb,
tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
- wrote++;
+ total_wrote++;
requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
@@ -1891,14 +1894,14 @@ static long writeback_sb_inodes(struct super_block *sb,
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
*/
- if (wrote) {
+ if (total_wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
- return wrote;
+ return total_wrote;
}

static long __writeback_inodes_wb(struct bdi_writeback *wb,
--
2.31.1



2022-05-19 21:55:12

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH v3 1/1] fs-writeback: writeback_sb_inodes: Recalculate 'wrote' according to skipped pages

On Tue 10-05-22 21:38:05, Zhihao Cheng wrote:
> Commit 505a666ee3fc ("writeback: plug writeback in wb_writeback() and
> writeback_inodes_wb()") has us holding a plug during wb_writeback, which
> may cause a potential ABBA dead lock:
>
> wb_writeback fat_file_fsync
> blk_start_plug(&plug)
> for (;;) {
> iter i-1: some reqs have been added into plug->mq_list // LOCK A
> iter i:
> progress = __writeback_inodes_wb(wb, work)
> . writeback_sb_inodes // fat's bdev
> . __writeback_single_inode
> . . generic_writepages
> . . __block_write_full_page
> . . . . __generic_file_fsync
> . . . . sync_inode_metadata
> . . . . writeback_single_inode
> . . . . __writeback_single_inode
> . . . . fat_write_inode
> . . . . __fat_write_inode
> . . . . sync_dirty_buffer // fat's bdev
> . . . . lock_buffer(bh) // LOCK B
> . . . . submit_bh
> . . . . blk_mq_get_tag // LOCK A
> . . . trylock_buffer(bh) // LOCK B
> . . . redirty_page_for_writepage
> . . . wbc->pages_skipped++
> . . --wbc->nr_to_write
> . wrote += write_chunk - wbc.nr_to_write // wrote > 0
> . requeue_inode
> . redirty_tail_locked
> if (progress) // progress > 0
> continue;
> iter i+1:
> queue_io
> // similar process with iter i, infinite for-loop !
> }
> blk_finish_plug(&plug) // flush plug won't be called
>
> Above process triggers a hungtask like:
> [ 399.044861] INFO: task bb:2607 blocked for more than 30 seconds.
> [ 399.046824] Not tainted 5.18.0-rc1-00005-gefae4d9eb6a2-dirty
> [ 399.051539] task:bb state:D stack: 0 pid: 2607 ppid:
> 2426 flags:0x00004000
> [ 399.051556] Call Trace:
> [ 399.051570] __schedule+0x480/0x1050
> [ 399.051592] schedule+0x92/0x1a0
> [ 399.051602] io_schedule+0x22/0x50
> [ 399.051613] blk_mq_get_tag+0x1d3/0x3c0
> [ 399.051640] __blk_mq_alloc_requests+0x21d/0x3f0
> [ 399.051657] blk_mq_submit_bio+0x68d/0xca0
> [ 399.051674] __submit_bio+0x1b5/0x2d0
> [ 399.051708] submit_bio_noacct+0x34e/0x720
> [ 399.051718] submit_bio+0x3b/0x150
> [ 399.051725] submit_bh_wbc+0x161/0x230
> [ 399.051734] __sync_dirty_buffer+0xd1/0x420
> [ 399.051744] sync_dirty_buffer+0x17/0x20
> [ 399.051750] __fat_write_inode+0x289/0x310
> [ 399.051766] fat_write_inode+0x2a/0xa0
> [ 399.051783] __writeback_single_inode+0x53c/0x6f0
> [ 399.051795] writeback_single_inode+0x145/0x200
> [ 399.051803] sync_inode_metadata+0x45/0x70
> [ 399.051856] __generic_file_fsync+0xa3/0x150
> [ 399.051880] fat_file_fsync+0x1d/0x80
> [ 399.051895] vfs_fsync_range+0x40/0xb0
> [ 399.051929] __x64_sys_fsync+0x18/0x30
>
> In my test, 'need_resched()' (which is imported by 590dca3a71 "fs-writeback:
> unplug before cond_resched in writeback_sb_inodes") in function
> 'writeback_sb_inodes()' seldom comes true, unless cond_resched() is deleted
> from write_cache_pages().
>
> Fix it by correcting wrote number according number of skipped pages
> in writeback_sb_inodes().
>
> Goto Link to find a reproducer.
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=215837
> Cc: [email protected] # v4.3
> Signed-off-by: Zhihao Cheng <[email protected]>

Indeed, subtle. The fix looks good. Feel free to add:

Reviewed-by: Jan Kara <[email protected]>

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2022-05-20 03:26:40

by Jens Axboe

[permalink] [raw]
Subject: Re: [PATCH v3 1/1] fs-writeback: writeback_sb_inodes: Recalculate 'wrote' according to skipped pages

On 5/10/22 7:38 AM, Zhihao Cheng wrote:
> Commit 505a666ee3fc ("writeback: plug writeback in wb_writeback() and
> writeback_inodes_wb()") has us holding a plug during wb_writeback, which
> may cause a potential ABBA dead lock:
>
> wb_writeback fat_file_fsync
> blk_start_plug(&plug)
> for (;;) {
> iter i-1: some reqs have been added into plug->mq_list // LOCK A
> iter i:
> progress = __writeback_inodes_wb(wb, work)
> . writeback_sb_inodes // fat's bdev
> . __writeback_single_inode
> . . generic_writepages
> . . __block_write_full_page
> . . . . __generic_file_fsync
> . . . . sync_inode_metadata
> . . . . writeback_single_inode
> . . . . __writeback_single_inode
> . . . . fat_write_inode
> . . . . __fat_write_inode
> . . . . sync_dirty_buffer // fat's bdev
> . . . . lock_buffer(bh) // LOCK B
> . . . . submit_bh
> . . . . blk_mq_get_tag // LOCK A
> . . . trylock_buffer(bh) // LOCK B
> . . . redirty_page_for_writepage
> . . . wbc->pages_skipped++
> . . --wbc->nr_to_write
> . wrote += write_chunk - wbc.nr_to_write // wrote > 0
> . requeue_inode
> . redirty_tail_locked
> if (progress) // progress > 0
> continue;
> iter i+1:
> queue_io
> // similar process with iter i, infinite for-loop !
> }
> blk_finish_plug(&plug) // flush plug won't be called
>
> Above process triggers a hungtask like:
> [ 399.044861] INFO: task bb:2607 blocked for more than 30 seconds.
> [ 399.046824] Not tainted 5.18.0-rc1-00005-gefae4d9eb6a2-dirty
> [ 399.051539] task:bb state:D stack: 0 pid: 2607 ppid:
> 2426 flags:0x00004000
> [ 399.051556] Call Trace:
> [ 399.051570] __schedule+0x480/0x1050
> [ 399.051592] schedule+0x92/0x1a0
> [ 399.051602] io_schedule+0x22/0x50
> [ 399.051613] blk_mq_get_tag+0x1d3/0x3c0
> [ 399.051640] __blk_mq_alloc_requests+0x21d/0x3f0
> [ 399.051657] blk_mq_submit_bio+0x68d/0xca0
> [ 399.051674] __submit_bio+0x1b5/0x2d0
> [ 399.051708] submit_bio_noacct+0x34e/0x720
> [ 399.051718] submit_bio+0x3b/0x150
> [ 399.051725] submit_bh_wbc+0x161/0x230
> [ 399.051734] __sync_dirty_buffer+0xd1/0x420
> [ 399.051744] sync_dirty_buffer+0x17/0x20
> [ 399.051750] __fat_write_inode+0x289/0x310
> [ 399.051766] fat_write_inode+0x2a/0xa0
> [ 399.051783] __writeback_single_inode+0x53c/0x6f0
> [ 399.051795] writeback_single_inode+0x145/0x200
> [ 399.051803] sync_inode_metadata+0x45/0x70
> [ 399.051856] __generic_file_fsync+0xa3/0x150
> [ 399.051880] fat_file_fsync+0x1d/0x80
> [ 399.051895] vfs_fsync_range+0x40/0xb0
> [ 399.051929] __x64_sys_fsync+0x18/0x30
>
> In my test, 'need_resched()' (which is imported by 590dca3a71 "fs-writeback:
> unplug before cond_resched in writeback_sb_inodes") in function
> 'writeback_sb_inodes()' seldom comes true, unless cond_resched() is deleted
> from write_cache_pages().
>
> Fix it by correcting wrote number according number of skipped pages
> in writeback_sb_inodes().
>
> Goto Link to find a reproducer.

I can take this one for 5.19, thanks.

--
Jens Axboe