From: Ye Bin <[email protected]>
Diff V2 vs V1
Use ext4_error() when detect 'i_reserved_data_blocks' and pending tree abnormal.
Ye Bin (3):
ext4: fix incorrect calculate 'reserved' in '__es_remove_extent' when
enable bigalloc feature
ext4: record error when detect abnormal 'i_reserved_data_blocks'
ext4: add check pending tree when evict inode
fs/ext4/extents_status.c | 31 ++++++++++++++++++++++++++++++-
fs/ext4/extents_status.h | 1 +
fs/ext4/super.c | 10 ++++++----
3 files changed, 37 insertions(+), 5 deletions(-)
--
2.31.1
From: Ye Bin <[email protected]>
If 'i_reserved_data_blocks' is not cleared which mean something wrong with
code, free space accounting is likely wrong, according to Jan Kara's advice
use ext4_error() to record this abnormal let fsck to repair and also we can
capture this issue.
Signed-off-by: Ye Bin <[email protected]>
---
fs/ext4/super.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0690e2e0b74d..3d30007502a4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1387,10 +1387,9 @@ static void ext4_destroy_inode(struct inode *inode)
}
if (EXT4_I(inode)->i_reserved_data_blocks)
- ext4_msg(inode->i_sb, KERN_ERR,
- "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
- inode->i_ino, EXT4_I(inode),
- EXT4_I(inode)->i_reserved_data_blocks);
+ ext4_error(inode->i_sb, "Inode %lu (%p) i_reserved_data_blocks"
+ " (%u) not cleared!", inode->i_ino, EXT4_I(inode),
+ EXT4_I(inode)->i_reserved_data_blocks);
}
static void init_once(void *foo)
--
2.31.1
From: Ye Bin <[email protected]>
Syzbot found the following issue:
BUG: memory leak
unreferenced object 0xffff8881bde17420 (size 32):
comm "rep", pid 2327, jiffies 4295381963 (age 32.265s)
hex dump (first 32 bytes):
01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace:
[<00000000ac6d38f8>] __insert_pending+0x13c/0x2d0
[<00000000d717de3b>] ext4_es_insert_delayed_block+0x399/0x4e0
[<000000004be03913>] ext4_da_map_blocks.constprop.0+0x739/0xfa0
[<00000000885a832a>] ext4_da_get_block_prep+0x10c/0x440
[<0000000029b7f8ef>] __block_write_begin_int+0x28d/0x860
[<00000000e182ebc3>] ext4_da_write_inline_data_begin+0x2d1/0xf30
[<00000000ced0c8a2>] ext4_da_write_begin+0x612/0x860
[<000000008d5f27fa>] generic_perform_write+0x215/0x4d0
[<00000000552c1cde>] ext4_buffered_write_iter+0x101/0x3b0
[<0000000052177ae8>] do_iter_readv_writev+0x19f/0x340
[<000000004b9de834>] do_iter_write+0x13b/0x650
[<00000000e2401b9b>] iter_file_splice_write+0x5a5/0xab0
[<0000000023aa5d90>] direct_splice_actor+0x103/0x1e0
[<0000000089e00fc1>] splice_direct_to_actor+0x2c9/0x7b0
[<000000004386851e>] do_splice_direct+0x159/0x280
[<00000000b567e609>] do_sendfile+0x932/0x1200
Above issue fixed by 1b8f787ef547 "ext4: fix warning in 'ext4_da_release_space'"
in this scene. To make things better add check pending tree when evit inode.
To avoid possible memleak free pending tree when check tree isn't cleared.
Reported-by: [email protected]
Signed-off-by: Ye Bin <[email protected]>
---
fs/ext4/extents_status.c | 28 ++++++++++++++++++++++++++++
fs/ext4/extents_status.h | 1 +
fs/ext4/super.c | 3 +++
3 files changed, 32 insertions(+)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 4684eaea9471..a7a612eb70fe 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1948,6 +1948,34 @@ void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
write_unlock(&ei->i_es_lock);
}
+void ext4_check_inode_pending(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct pending_reservation *pr;
+ struct ext4_pending_tree *tree;
+ struct rb_node *node;
+ int count = 0;
+
+ write_lock(&ei->i_es_lock);
+ tree = &EXT4_I(inode)->i_pending_tree;
+ node = rb_first(&tree->root);
+ while (node) {
+ pr = rb_entry(node, struct pending_reservation, rb_node);
+ node = rb_next(node);
+ rb_erase(&pr->rb_node, &tree->root);
+ kmem_cache_free(ext4_pending_cachep, pr);
+ count++;
+ }
+ write_unlock(&ei->i_es_lock);
+
+ if (count)
+ ext4_error(inode->i_sb,
+ "Inode %lu: pending tree has %d not cleared!",
+ inode->i_ino, count);
+
+ return;
+}
+
/*
* ext4_is_pending - determine whether a cluster has a pending reservation
* on it
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 4ec30a798260..631267d45ab2 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -248,6 +248,7 @@ extern int __init ext4_init_pending(void);
extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
+extern void ext4_check_inode_pending(struct inode *inode);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3d30007502a4..0498feecf10d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1390,6 +1390,9 @@ static void ext4_destroy_inode(struct inode *inode)
ext4_error(inode->i_sb, "Inode %lu (%p) i_reserved_data_blocks"
" (%u) not cleared!", inode->i_ino, EXT4_I(inode),
EXT4_I(inode)->i_reserved_data_blocks);
+
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio != 1)
+ ext4_check_inode_pending(inode);
}
static void init_once(void *foo)
--
2.31.1
From: Ye Bin <[email protected]>
Syzbot report issue as follows:
EXT4-fs error (device loop0): ext4_validate_block_bitmap:398: comm rep: bg 0: block 5: invalid block bitmap
EXT4-fs (loop0): Delayed block allocation failed for inode 18 at logical offset 0 with max blocks 32 with error 28
EXT4-fs (loop0): This should not happen!! Data will be lost
EXT4-fs (loop0): Total free blocks count 0
EXT4-fs (loop0): Free/Dirty block details
EXT4-fs (loop0): free_blocks=0
EXT4-fs (loop0): dirty_blocks=32
EXT4-fs (loop0): Block reservation details
EXT4-fs (loop0): i_reserved_data_blocks=2
EXT4-fs (loop0): Inode 18 (00000000845cd634): i_reserved_data_blocks (1) not cleared!
Above issue happens as follows:
Assume:
sbi->s_cluster_ratio = 16
Step1: Insert delay block [0, 31] -> ei->i_reserved_data_blocks=2
Step2:
ext4_writepages
mpage_map_and_submit_extent -> return failed
mpage_release_unused_pages -> to release [0, 30]
ext4_es_remove_extent -> remove lblk=0 end=30
__es_remove_extent -> len1=0 len2=31-30=1
__es_remove_extent:
...
if (len2 > 0) {
...
if (len1 > 0) {
...
} else {
es->es_lblk = end + 1;
es->es_len = len2;
...
}
if (count_reserved)
count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, &orig_es, &rc);
goto out; -> will return but didn't calculate 'reserved'
...
Step3: ext4_destroy_inode -> trigger "i_reserved_data_blocks (1) not cleared!"
To solve above issue if 'len2>0' call 'get_rsvd()' before goto out.
Reported-by: [email protected]
Signed-off-by: Ye Bin <[email protected]>
---
fs/ext4/extents_status.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index cd0a861853e3..4684eaea9471 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1371,7 +1371,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (count_reserved)
count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
&orig_es, &rc);
- goto out;
+ goto count;
}
if (len1 > 0) {
@@ -1413,6 +1413,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
}
}
+count:
if (count_reserved)
*reserved = get_rsvd(inode, end, es, &rc);
out:
--
2.31.1
* Ye Bin <[email protected]>:
> From: Ye Bin <[email protected]>
>
> If 'i_reserved_data_blocks' is not cleared which mean something wrong with
> code, free space accounting is likely wrong, according to Jan Kara's advice
> use ext4_error() to record this abnormal let fsck to repair and also we can
> capture this issue.
>
> Signed-off-by: Ye Bin <[email protected]>
> ---
> fs/ext4/super.c | 7 +++----
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 0690e2e0b74d..3d30007502a4 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1387,10 +1387,9 @@ static void ext4_destroy_inode(struct inode *inode)
> }
>
> if (EXT4_I(inode)->i_reserved_data_blocks)
> - ext4_msg(inode->i_sb, KERN_ERR,
Per the coding standard, IIRC, the string should not be split across lines
for "greppability", so it should remain as is. It's always good to run
checkpatch to catch stuff like this.
> - "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
> - inode->i_ino, EXT4_I(inode),
> - EXT4_I(inode)->i_reserved_data_blocks);
> + ext4_error(inode->i_sb, "Inode %lu (%p) i_reserved_data_blocks"
> + " (%u) not cleared!", inode->i_ino, EXT4_I(inode),
> + EXT4_I(inode)->i_reserved_data_blocks);
> }
>
> static void init_once(void *foo)
This is an improvement over the first version. If i_reserved_data_blocks is
non-zero, something is definitely broken, but it's perhaps less likely to
indicate file system damage than if it hits zero while there are still
outstanding delayed blocks (handled well elsewhere). So, it's not clear
we need to escalate handling this case but it doesn't hurt, either.
Eric
> --
> 2.31.1
>
* Ye Bin <[email protected]>:
> From: Ye Bin <[email protected]>
>
> Syzbot found the following issue:
> BUG: memory leak
> unreferenced object 0xffff8881bde17420 (size 32):
> comm "rep", pid 2327, jiffies 4295381963 (age 32.265s)
> hex dump (first 32 bytes):
> 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
> backtrace:
> [<00000000ac6d38f8>] __insert_pending+0x13c/0x2d0
> [<00000000d717de3b>] ext4_es_insert_delayed_block+0x399/0x4e0
> [<000000004be03913>] ext4_da_map_blocks.constprop.0+0x739/0xfa0
> [<00000000885a832a>] ext4_da_get_block_prep+0x10c/0x440
> [<0000000029b7f8ef>] __block_write_begin_int+0x28d/0x860
> [<00000000e182ebc3>] ext4_da_write_inline_data_begin+0x2d1/0xf30
> [<00000000ced0c8a2>] ext4_da_write_begin+0x612/0x860
> [<000000008d5f27fa>] generic_perform_write+0x215/0x4d0
> [<00000000552c1cde>] ext4_buffered_write_iter+0x101/0x3b0
> [<0000000052177ae8>] do_iter_readv_writev+0x19f/0x340
> [<000000004b9de834>] do_iter_write+0x13b/0x650
> [<00000000e2401b9b>] iter_file_splice_write+0x5a5/0xab0
> [<0000000023aa5d90>] direct_splice_actor+0x103/0x1e0
> [<0000000089e00fc1>] splice_direct_to_actor+0x2c9/0x7b0
> [<000000004386851e>] do_splice_direct+0x159/0x280
> [<00000000b567e609>] do_sendfile+0x932/0x1200
>
> Above issue fixed by 1b8f787ef547 "ext4: fix warning in 'ext4_da_release_space'"
> in this scene. To make things better add check pending tree when evit inode.
> To avoid possible memleak free pending tree when check tree isn't cleared.
>
> Reported-by: [email protected]
> Signed-off-by: Ye Bin <[email protected]>
> ---
> fs/ext4/extents_status.c | 28 ++++++++++++++++++++++++++++
> fs/ext4/extents_status.h | 1 +
> fs/ext4/super.c | 3 +++
> 3 files changed, 32 insertions(+)
>
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 4684eaea9471..a7a612eb70fe 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -1948,6 +1948,34 @@ void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
> write_unlock(&ei->i_es_lock);
> }
>
> +void ext4_check_inode_pending(struct inode *inode)
> +{
> + struct ext4_inode_info *ei = EXT4_I(inode);
> + struct pending_reservation *pr;
> + struct ext4_pending_tree *tree;
> + struct rb_node *node;
> + int count = 0;
> +
> + write_lock(&ei->i_es_lock);
> + tree = &EXT4_I(inode)->i_pending_tree;
> + node = rb_first(&tree->root);
> + while (node) {
> + pr = rb_entry(node, struct pending_reservation, rb_node);
> + node = rb_next(node);
> + rb_erase(&pr->rb_node, &tree->root);
> + kmem_cache_free(ext4_pending_cachep, pr);
> + count++;
> + }
> + write_unlock(&ei->i_es_lock);
> +
> + if (count)
> + ext4_error(inode->i_sb,
> + "Inode %lu: pending tree has %d not cleared!",
> + inode->i_ino, count);
> +
> + return;
> +}
> +
> /*
> * ext4_is_pending - determine whether a cluster has a pending reservation
> * on it
> diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
> index 4ec30a798260..631267d45ab2 100644
> --- a/fs/ext4/extents_status.h
> +++ b/fs/ext4/extents_status.h
> @@ -248,6 +248,7 @@ extern int __init ext4_init_pending(void);
> extern void ext4_exit_pending(void);
> extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
> extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
> +extern void ext4_check_inode_pending(struct inode *inode);
> extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
> extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
> bool allocated);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 3d30007502a4..0498feecf10d 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1390,6 +1390,9 @@ static void ext4_destroy_inode(struct inode *inode)
> ext4_error(inode->i_sb, "Inode %lu (%p) i_reserved_data_blocks"
> " (%u) not cleared!", inode->i_ino, EXT4_I(inode),
> EXT4_I(inode)->i_reserved_data_blocks);
> +
> + if (EXT4_SB(inode->i_sb)->s_cluster_ratio != 1)
> + ext4_check_inode_pending(inode);
> }
>
> static void init_once(void *foo)
Ye Bin:
I think there's no need to add code that tears down non-empty pending trees
when an inode is destroyed. If one ever appears, that's an indication of a
bug (and a possibly damaged file system), and the better thing to do is to
simply find and fix that bug. Code that is never supposed to run, like what
you are proposing, is difficult to test and maintain over time and is prone
to bit rot.
If we do anything here, the better thing to do is to simply log an error
in the event of a non-empty pending tree and let the administrator/user
decide how they want to handle the problem. That would probably include
filing a bug report so we can fix the problem if it ever occurs.
My suggestion is to add an inline function to extents_status.c that uses
something like this to test for the presence of a non-empty pending
reservation tree (then called from ext4_destroy_inode):
if (RB_EMPTY_ROOT(&EXT4_I(inode)->i_pending_tree.root))
...
Please note that you've been testing a file system configuration that still
needs development work - bigalloc + inline. From the kernel stacks you've
been posting, that appears to be the case, anyway. When that work is
complete, we will have done enough testing to be confident we don't have
memory leaks. Until then, leaks and other bugs are possible.
Eric
> --
> 2.31.1
>
* Ye Bin <[email protected]>:
> From: Ye Bin <[email protected]>
>
> Syzbot report issue as follows:
> EXT4-fs error (device loop0): ext4_validate_block_bitmap:398: comm rep: bg 0: block 5: invalid block bitmap
> EXT4-fs (loop0): Delayed block allocation failed for inode 18 at logical offset 0 with max blocks 32 with error 28
> EXT4-fs (loop0): This should not happen!! Data will be lost
>
> EXT4-fs (loop0): Total free blocks count 0
> EXT4-fs (loop0): Free/Dirty block details
> EXT4-fs (loop0): free_blocks=0
> EXT4-fs (loop0): dirty_blocks=32
> EXT4-fs (loop0): Block reservation details
> EXT4-fs (loop0): i_reserved_data_blocks=2
> EXT4-fs (loop0): Inode 18 (00000000845cd634): i_reserved_data_blocks (1) not cleared!
>
> Above issue happens as follows:
> Assume:
> sbi->s_cluster_ratio = 16
> Step1: Insert delay block [0, 31] -> ei->i_reserved_data_blocks=2
> Step2:
> ext4_writepages
> mpage_map_and_submit_extent -> return failed
> mpage_release_unused_pages -> to release [0, 30]
> ext4_es_remove_extent -> remove lblk=0 end=30
> __es_remove_extent -> len1=0 len2=31-30=1
> __es_remove_extent:
> ...
> if (len2 > 0) {
> ...
> if (len1 > 0) {
> ...
> } else {
> es->es_lblk = end + 1;
> es->es_len = len2;
> ...
> }
> if (count_reserved)
> count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, &orig_es, &rc);
> goto out; -> will return but didn't calculate 'reserved'
> ...
> Step3: ext4_destroy_inode -> trigger "i_reserved_data_blocks (1) not cleared!"
>
> To solve above issue if 'len2>0' call 'get_rsvd()' before goto out.
>
> Reported-by: [email protected]
> Signed-off-by: Ye Bin <[email protected]>
> ---
> fs/ext4/extents_status.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index cd0a861853e3..4684eaea9471 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -1371,7 +1371,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
> if (count_reserved)
> count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
> &orig_es, &rc);
> - goto out;
> + goto count;
> }
>
> if (len1 > 0) {
> @@ -1413,6 +1413,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
> }
> }
>
> +count:
> if (count_reserved)
> *reserved = get_rsvd(inode, end, es, &rc);
> out:
> --
> 2.31.1
>
I'm unable to find the sysbot report for this patch, so I can't verify that
this fix works. The more serious problem would be whatever is causing
the invalid block bitmap and delayed allocation failure messages before the
i_reserved_data_blocks message. Perhaps that's simply what syzkaller set
up, but it's not clear from this posting. Have you looked for the cause
of those first two messages?
However, by inspection this patch should fix an obvious bug causing that last
message, introduced by 8fcc3a580651 ("ext4: rework reserved cluster accounting
when invalidating pages"). A Fixes tag should be added to the patch. Also,
the readability of the code should be improved by changing the label "count" to
the more descriptive "out_get_reserved".
With those two changes, feel free to add:
Reviewed-by: Eric Whitney <[email protected]>
Eric