Now that in the error patch of extent updating procedure cannot handle
error and roll-back partial updates properly, so we could access the
left inconsistent extent buffer later and lead to BUGON in
errors=continue mode. For example, we could get below BUGON if we
update leaf extent but failed to update index extent in
ext4_ext_insert_extent() and try to alloc block again.
kernel BUG at fs/ext4/mballoc.c:4085!
invalid opcode: 0000 [#1] SMP PTI
CPU: 30 PID: 1177 Comm: xfs_io Not tainted 5.14.0-00006-g555c93b65a81 #543
RIP: 0010:ext4_mb_normalize_request.constprop.0+0x72c/0x8a0
RSP: 0018:ffffb398c0abb8d8 EFLAGS: 00010202
RAX: 0000000000000000 RBX: ffff8d79d3125000 RCX: 0000000000001500
RDX: 0000000000001500 RSI: 0000000000000000 RDI: ffffb398c0abba50
RBP: 0000000000001500 R08: 0000000000000001 R09: 00000000000015c0
R10: 0000000000660000 R11: ffff8d79e21212d8 R12: ffff8d79e21215b8
R13: 0000000000001500 R14: ffffb398c0abba50 R15: ffff8d79e21215b8
FS: 00007f1bf519f800(0000) GS:ffff8d80e5d80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000560c5b96d000 CR3: 000000010f170000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
ext4_mb_new_blocks+0x8f3/0x1c90
? __read_extent_tree_block+0x20a/0x260
ext4_ext_map_blocks+0xb7c/0x1d30
ext4_map_blocks+0x15d/0xa30
? __cond_resched+0x1d/0x50
? kmem_cache_alloc+0x206/0x400
_ext4_get_block+0xa8/0x170
ext4_get_block+0x1a/0x30
ext4_block_write_begin+0x179/0x8c0
? ext4_get_block_unwritten+0x20/0x20
? __ext4_journal_start_sb+0x179/0x1d0
ext4_write_begin+0x42d/0x910
ext4_da_write_begin+0x2eb/0x750
generic_perform_write+0xcb/0x280
ext4_buffered_write_iter+0xc3/0x1e0
ext4_file_write_iter+0x70/0xac0
? _raw_spin_unlock+0x12/0x30
? __handle_mm_fault+0x13e1/0x2520
new_sync_write+0x166/0x220
vfs_write+0x1d7/0x3b0
ksys_pwrite64+0x85/0xf0
__x64_sys_pwrite64+0x22/0x30
do_syscall_64+0x3b/0x90
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f1bf5760378
This patch set address to enhance extent check in __ext4_ext_check() and
force check buffer again through clear buffer's verified bit if we
breaks off extent updating.
Thanks,
Yi.
Zhang Yi (3):
ext4: check for out-of-order index extents in
ext4_valid_extent_entries()
ext4: check for inconsistent extents between index and leaf block
ext4: prevent partial update of the extent blocks
fs/ext4/extents.c | 95 +++++++++++++++++++++++++++++++----------------
1 file changed, 64 insertions(+), 31 deletions(-)
--
2.31.1
Now that we can check out overlapping extents in leaf block and
out-of-order index extents in index block. But the .ee_block in the
first extent of one leaf block should equal to the .ei_block in it's
parent index extent entry. This patch add a check to verify such
inconsistent between the index and leaf block.
Signed-off-by: Zhang Yi <[email protected]>
---
fs/ext4/extents.c | 59 +++++++++++++++++++++++++++++------------------
1 file changed, 36 insertions(+), 23 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4bb1153c01b3..d2601194b462 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -354,7 +354,8 @@ static int ext4_valid_extent_idx(struct inode *inode,
static int ext4_valid_extent_entries(struct inode *inode,
struct ext4_extent_header *eh,
- ext4_fsblk_t *pblk, int depth)
+ ext4_lblk_t lblk, ext4_fsblk_t *pblk,
+ int depth)
{
unsigned short entries;
ext4_lblk_t lblock = 0;
@@ -368,6 +369,14 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext->ee_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
@@ -384,6 +393,14 @@ static int ext4_valid_extent_entries(struct inode *inode,
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
+
+ /*
+ * The logical block in the first entry should equal to
+ * the number in the parent index block.
+ */
+ if (depth != ext_depth(inode) &&
+ lblk != le32_to_cpu(ext_idx->ei_block))
+ return 0;
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
@@ -404,7 +421,7 @@ static int ext4_valid_extent_entries(struct inode *inode,
static int __ext4_ext_check(const char *function, unsigned int line,
struct inode *inode, struct ext4_extent_header *eh,
- int depth, ext4_fsblk_t pblk)
+ int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
{
const char *error_msg;
int max = 0, err = -EFSCORRUPTED;
@@ -430,7 +447,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid eh_entries";
goto corrupted;
}
- if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) {
+ if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
}
@@ -460,7 +477,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
}
#define ext4_ext_check(inode, eh, depth, pblk) \
- __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk))
+ __ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
int ext4_ext_check_inode(struct inode *inode)
{
@@ -493,16 +510,18 @@ static void ext4_cache_extents(struct inode *inode,
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
- struct inode *inode, ext4_fsblk_t pblk, int depth,
- int flags)
+ struct inode *inode, struct ext4_extent_idx *idx,
+ int depth, int flags)
{
struct buffer_head *bh;
int err;
gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS;
+ ext4_fsblk_t pblk;
if (flags & EXT4_EX_NOFAIL)
gfp_flags |= __GFP_NOFAIL;
+ pblk = ext4_idx_pblock(idx);
bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
@@ -515,8 +534,8 @@ __read_extent_tree_block(const char *function, unsigned int line,
}
if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh;
- err = __ext4_ext_check(function, line, inode,
- ext_block_hdr(bh), depth, pblk);
+ err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
+ depth, pblk, le32_to_cpu(idx->ei_block));
if (err)
goto errout;
set_buffer_verified(bh);
@@ -534,8 +553,8 @@ __read_extent_tree_block(const char *function, unsigned int line,
}
-#define read_extent_tree_block(inode, pblk, depth, flags) \
- __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
+#define read_extent_tree_block(inode, idx, depth, flags) \
+ __read_extent_tree_block(__func__, __LINE__, (inode), (idx), \
(depth), (flags))
/*
@@ -585,8 +604,7 @@ int ext4_ext_precache(struct inode *inode)
i--;
continue;
}
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx++),
+ bh = read_extent_tree_block(inode, path[i].p_idx++,
depth - i - 1,
EXT4_EX_FORCE_CACHE);
if (IS_ERR(bh)) {
@@ -891,8 +909,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
- flags);
+ bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
@@ -1501,7 +1518,6 @@ static int ext4_ext_search_right(struct inode *inode,
struct ext4_extent_header *eh;
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
- ext4_fsblk_t block;
int depth; /* Note, NOT eh_depth; depth from top of tree */
int ee_len;
@@ -1568,20 +1584,17 @@ static int ext4_ext_search_right(struct inode *inode,
* follow it and find the closest allocated
* block to the right */
ix++;
- block = ext4_idx_pblock(ix);
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, block,
- path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
ix = EXT_FIRST_INDEX(eh);
- block = ext4_idx_pblock(ix);
put_bh(bh);
}
- bh = read_extent_tree_block(inode, block, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -2960,9 +2973,9 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
ext_debug(inode, "move to level %d (block %llu)\n",
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
- bh = read_extent_tree_block(inode,
- ext4_idx_pblock(path[i].p_idx), depth - i - 1,
- EXT4_EX_NOCACHE);
+ bh = read_extent_tree_block(inode, path[i].p_idx,
+ depth - i - 1,
+ EXT4_EX_NOCACHE);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
--
2.31.1
In the most error path of current extents updating operations are not
roll back partial updates properly when some bad things happens(.e.g in
ext4_ext_insert_extent()). So we may get an inconsistent extents tree
if journal has been aborted due to IO error, which may probability lead
to BUGON later when we accessing these extent entries in errors=continue
mode. This patch drop extent buffer's verify flag before updatng the
contents in ext4_ext_get_access(), and reset it after updating in
__ext4_ext_dirty(). After this patch we could force to check the extent
buffer if extents tree updating was break off, make sure the extents are
consistent.
Signed-off-by: Zhang Yi <[email protected]>
---
fs/ext4/extents.c | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d2601194b462..9228de6950a2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -136,15 +136,25 @@ int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
+ int err = 0;
+
if (path->p_bh) {
/* path points to block */
BUFFER_TRACE(path->p_bh, "get_write_access");
- return ext4_journal_get_write_access(handle, inode->i_sb,
- path->p_bh, EXT4_JTR_NONE);
+ err = ext4_journal_get_write_access(handle, inode->i_sb,
+ path->p_bh, EXT4_JTR_NONE);
+ /*
+ * The extent buffer's verified bit will be set again in
+ * __ext4_ext_dirty(). We could leave an inconsistent
+ * buffer if the extents updating procudure break off du
+ * to some error happens, force to check it again.
+ */
+ if (!err)
+ clear_buffer_verified(path->p_bh);
}
/* path points to leaf/index in inode body */
/* we use in-core data, no need to protect them */
- return 0;
+ return err;
}
/*
@@ -165,6 +175,9 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
/* path points to block */
err = __ext4_handle_dirty_metadata(where, line, handle,
inode, path->p_bh);
+ /* Extents updating done, re-set verified flag */
+ if (!err)
+ set_buffer_verified(path->p_bh);
} else {
/* path points to leaf/index in inode body */
err = ext4_mark_inode_dirty(handle, inode);
--
2.31.1
After commit 5946d089379a ("ext4: check for overlapping extents in
ext4_valid_extent_entries()"), we can check out the overlapping extent
entry in leaf extent blocks. But the out-of-order extent entry in index
extent blocks could also trigger bad things if the filesystem is
inconsistent. So this patch add a check to figure out the out-of-order
index extents and return error.
Signed-off-by: Zhang Yi <[email protected]>
---
fs/ext4/extents.c | 17 ++++++++++++-----
1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c0de30f25185..4bb1153c01b3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -357,6 +357,9 @@ static int ext4_valid_extent_entries(struct inode *inode,
ext4_fsblk_t *pblk, int depth)
{
unsigned short entries;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t prev = 0;
+
if (eh->eh_entries == 0)
return 1;
@@ -365,31 +368,35 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
- ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
- int len = 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- len = ext4_ext_get_actual_len(ext);
if ((lblock <= prev) && prev) {
*pblk = ext4_ext_pblock(ext);
return 0;
}
+ prev = lblock + ext4_ext_get_actual_len(ext) - 1;
ext++;
entries--;
- prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
while (entries) {
if (!ext4_valid_extent_idx(inode, ext_idx))
return 0;
+
+ /* Check for overlapping index extents */
+ lblock = le32_to_cpu(ext_idx->ei_block);
+ if ((lblock <= prev) && prev) {
+ *pblk = ext4_idx_pblock(ext_idx);
+ return 0;
+ }
ext_idx++;
entries--;
+ prev = lblock;
}
}
return 1;
--
2.31.1
Gentle ping.
On 2021/9/8 20:08, Zhang Yi wrote:
> Now that in the error patch of extent updating procedure cannot handle
> error and roll-back partial updates properly, so we could access the
> left inconsistent extent buffer later and lead to BUGON in
> errors=continue mode. For example, we could get below BUGON if we
> update leaf extent but failed to update index extent in
> ext4_ext_insert_extent() and try to alloc block again.
>
> kernel BUG at fs/ext4/mballoc.c:4085!
> invalid opcode: 0000 [#1] SMP PTI
> CPU: 30 PID: 1177 Comm: xfs_io Not tainted 5.14.0-00006-g555c93b65a81 #543
> RIP: 0010:ext4_mb_normalize_request.constprop.0+0x72c/0x8a0
> RSP: 0018:ffffb398c0abb8d8 EFLAGS: 00010202
> RAX: 0000000000000000 RBX: ffff8d79d3125000 RCX: 0000000000001500
> RDX: 0000000000001500 RSI: 0000000000000000 RDI: ffffb398c0abba50
> RBP: 0000000000001500 R08: 0000000000000001 R09: 00000000000015c0
> R10: 0000000000660000 R11: ffff8d79e21212d8 R12: ffff8d79e21215b8
> R13: 0000000000001500 R14: ffffb398c0abba50 R15: ffff8d79e21215b8
> FS: 00007f1bf519f800(0000) GS:ffff8d80e5d80000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000560c5b96d000 CR3: 000000010f170000 CR4: 00000000000006e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
> ext4_mb_new_blocks+0x8f3/0x1c90
> ? __read_extent_tree_block+0x20a/0x260
> ext4_ext_map_blocks+0xb7c/0x1d30
> ext4_map_blocks+0x15d/0xa30
> ? __cond_resched+0x1d/0x50
> ? kmem_cache_alloc+0x206/0x400
> _ext4_get_block+0xa8/0x170
> ext4_get_block+0x1a/0x30
> ext4_block_write_begin+0x179/0x8c0
> ? ext4_get_block_unwritten+0x20/0x20
> ? __ext4_journal_start_sb+0x179/0x1d0
> ext4_write_begin+0x42d/0x910
> ext4_da_write_begin+0x2eb/0x750
> generic_perform_write+0xcb/0x280
> ext4_buffered_write_iter+0xc3/0x1e0
> ext4_file_write_iter+0x70/0xac0
> ? _raw_spin_unlock+0x12/0x30
> ? __handle_mm_fault+0x13e1/0x2520
> new_sync_write+0x166/0x220
> vfs_write+0x1d7/0x3b0
> ksys_pwrite64+0x85/0xf0
> __x64_sys_pwrite64+0x22/0x30
> do_syscall_64+0x3b/0x90
> entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x7f1bf5760378
>
> This patch set address to enhance extent check in __ext4_ext_check() and
> force check buffer again through clear buffer's verified bit if we
> breaks off extent updating.
>
> Thanks,
> Yi.
>
>
> Zhang Yi (3):
> ext4: check for out-of-order index extents in
> ext4_valid_extent_entries()
> ext4: check for inconsistent extents between index and leaf block
> ext4: prevent partial update of the extent blocks
>
> fs/ext4/extents.c | 95 +++++++++++++++++++++++++++++++----------------
> 1 file changed, 64 insertions(+), 31 deletions(-)
>
On Wed, Sep 08, 2021 at 08:08:49PM +0800, Zhang Yi wrote:
> Now that we can check out overlapping extents in leaf block and
> out-of-order index extents in index block. But the .ee_block in the
> first extent of one leaf block should equal to the .ei_block in it's
> parent index extent entry.
I don't believe this is always guaranteed.
The punch hole operation can remove some or part of the first entry in
the leaf block, and it won't update the parent index. So it's OK for
the first entry of the leaf block to be greater than entry in the
parent block. However, if the first entry of the leaf block is less
than the entry in the parent block, that's definitely going to be a
problem.
- Ted
On Wed, Sep 08, 2021 at 08:08:48PM +0800, Zhang Yi wrote:
> After commit 5946d089379a ("ext4: check for overlapping extents in
> ext4_valid_extent_entries()"), we can check out the overlapping extent
> entry in leaf extent blocks. But the out-of-order extent entry in index
> extent blocks could also trigger bad things if the filesystem is
> inconsistent. So this patch add a check to figure out the out-of-order
> index extents and return error.
>
> Signed-off-by: Zhang Yi <[email protected]>
Looks good,
Reviewed-by: Theodore Ts'o <[email protected]>
On Wed, Sep 08, 2021 at 08:08:50PM +0800, Zhang Yi wrote:
> In the most error path of current extents updating operations are not
> roll back partial updates properly when some bad things happens(.e.g in
> ext4_ext_insert_extent()). So we may get an inconsistent extents tree
> if journal has been aborted due to IO error, which may probability lead
> to BUGON later when we accessing these extent entries in errors=continue
> mode. This patch drop extent buffer's verify flag before updatng the
> contents in ext4_ext_get_access(), and reset it after updating in
> __ext4_ext_dirty(). After this patch we could force to check the extent
> buffer if extents tree updating was break off, make sure the extents are
> consistent.
>
> Signed-off-by: Zhang Yi <[email protected]>
Looks good, thanks
Reviewed-by: Theodore Ts'o <[email protected]>
- Ted
On 2021/10/8 0:37, Theodore Ts'o wrote:
> On Wed, Sep 08, 2021 at 08:08:49PM +0800, Zhang Yi wrote:
>> Now that we can check out overlapping extents in leaf block and
>> out-of-order index extents in index block. But the .ee_block in the
>> first extent of one leaf block should equal to the .ei_block in it's
>> parent index extent entry.
>
> I don't believe this is always guaranteed.
>
> The punch hole operation can remove some or part of the first entry in
> the leaf block, and it won't update the parent index. So it's OK for
> the first entry of the leaf block to be greater than entry in the
> parent block. However, if the first entry of the leaf block is less
> than the entry in the parent block, that's definitely going to be a
> problem.
>
Hi, Ted.
ext4_punch_hole()->ext4_ext_remove_space()->ext4_ext_rm_leaf() call
ext4_ext_correct_indexes() or ext4_ext_rm_idx() to update the parent index
if the removing extent entry is the first entry of the leaf block.
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct partial_cluster *partial,
ext4_lblk_t start, ext4_lblk_t end)
{
...
if (ex == EXT_FIRST_EXTENT(eh)) {
correct_index = 1;
...
if (correct_index && eh->eh_entries)
err = ext4_ext_correct_indexes(handle, inode, path);
...
}
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path, int depth)
{
...
while (--depth >= 0) {
...
path->p_idx->ei_block = (path+1)->p_idx->ei_block;
...
}
...
}
And the fsck does also check the mismatch case in scan_extent_node(), am I
missing something?
Thanks,
Yi.
On Wed, 8 Sep 2021 20:08:47 +0800, Zhang Yi wrote:
> Now that in the error patch of extent updating procedure cannot handle
> error and roll-back partial updates properly, so we could access the
> left inconsistent extent buffer later and lead to BUGON in
> errors=continue mode. For example, we could get below BUGON if we
> update leaf extent but failed to update index extent in
> ext4_ext_insert_extent() and try to alloc block again.
>
> [...]
Applied, thanks!
[1/3] ext4: check for out-of-order index extents in ext4_valid_extent_entries()
commit: efbcc1015b07e3e8bafa97394b743812c180a9dd
[2/3] ext4: check for inconsistent extents between index and leaf block
commit: a992bc717652fb15b435884c587ae5249415239c
[3/3] ext4: prevent partial update of the extent blocks
commit: 916ff8d5ea0e24fd43f113b6b5326d5ea8f68310
Best regards,
--
Theodore Ts'o <[email protected]>
Thanks, I've applied this patch series. You're right, we are always
updating the index node. There is a comment that we might try to skip
updating the index in some cases, but it's probably better to always
update the index node since we can count on knowing what logical
blocks could be mentioned in the leaf nodes.
- Ted