Hi,
Please have a look at the patch attempting to handle the problem with deep extent tree.
Thanks, Alex
In some cases when a lot of extents are created initially by sparse file writes, they
get merged over time, but there is no way to merge blocks in different indexes.
For example, if a file is written synchronously, all even blocks first, then odd blocks.
The resulting extents tree looks like the following in "debugfs stat" output, often
with only a single block in each index/leaf:
EXTENTS:
(ETB0):33796
(ETB1):33795
(0-677):2588672-2589349
(ETB1):2590753
(678):2589350
(ETB1):2590720
(679-1357):2589351-2590029
(ETB1):2590752
(1358):2590030
(ETB1):2590721
(1359-2037):2590031-2590709
(ETB1):2590751
(2038):2590710
(ETB1):2590722
:
:
With the patch applied the index and lead blocks are properly merged (0.6% slower
under this random sync write workload, but later read IOPS are greatly reduced):
EXTENTS:
(ETB0):33796
(ETB1):2590736
(0-2047):2588672-2590719
(2048-11999):2592768-2602719
Originally the problem was hit with a real application operating on huge datasets and with just
27371 extents "inode has invalid extent depth: 6” problem occurred.
With the patch applied the application succeeded having finally 73637 in 3-level tree.
Signed-off-by: Alex Zhuravlev <[email protected]>
---
fs/ext4/extents.c | 185 ++++++++++++++++++++++++++++++++++++++++--
fs/jbd2/transaction.c | 1 +
2 files changed, 180 insertions(+), 6 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e4115d338f10..5b0e05cd5595 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1885,7 +1885,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
* This function tries to merge the @ex extent to neighbours in the tree, then
* tries to collapse the extent tree into the inode.
*/
-static void ext4_ext_try_to_merge(handle_t *handle,
+static int ext4_ext_try_to_merge(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex)
@@ -1902,9 +1902,178 @@ static void ext4_ext_try_to_merge(handle_t *handle,
merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
if (!merge_done)
- (void) ext4_ext_try_to_merge_right(inode, path, ex);
+ merge_done = ext4_ext_try_to_merge_right(inode, path, ex);
ext4_ext_try_to_merge_up(handle, inode, path);
+
+ return merge_done;
+}
+
+/*
+ * This function tries to merge blocks from @path into @npath
+ */
+static int ext4_ext_merge_blocks(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_ext_path *npath)
+{
+ unsigned int depth = ext_depth(inode);
+ int used, nused, free, i, k, err;
+ ext4_lblk_t next;
+
+ if (path[depth].p_hdr == npath[depth].p_hdr)
+ return 0;
+
+ used = le16_to_cpu(path[depth].p_hdr->eh_entries);
+ free = le16_to_cpu(npath[depth].p_hdr->eh_max) -
+ le16_to_cpu(npath[depth].p_hdr->eh_entries);
+ if (free < used)
+ return 0;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ return err;
+ err = ext4_ext_get_access(handle, inode, npath + depth);
+ if (err)
+ return err;
+
+ /* move entries from the current leave to the next one */
+ nused = le16_to_cpu(npath[depth].p_hdr->eh_entries);
+ memmove(EXT_FIRST_EXTENT(npath[depth].p_hdr) + used,
+ EXT_FIRST_EXTENT(npath[depth].p_hdr),
+ nused * sizeof(struct ext4_extent));
+ memcpy(EXT_FIRST_EXTENT(npath[depth].p_hdr),
+ EXT_FIRST_EXTENT(path[depth].p_hdr),
+ used * sizeof(struct ext4_extent));
+ le16_add_cpu(&npath[depth].p_hdr->eh_entries, used);
+ le16_add_cpu(&path[depth].p_hdr->eh_entries, -used);
+ ext4_ext_try_to_merge_right(inode, npath,
+ EXT_FIRST_EXTENT(npath[depth].p_hdr));
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ return err;
+ err = ext4_ext_dirty(handle, inode, npath + depth);
+ if (err)
+ return err;
+
+ /* otherwise the index won't get corrected */
+ npath[depth].p_ext = EXT_FIRST_EXTENT(npath[depth].p_hdr);
+ err = ext4_ext_correct_indexes(handle, inode, npath);
+ if (err)
+ return err;
+
+ for (i = depth - 1; i >= 0; i--) {
+
+ next = ext4_idx_pblock(path[i].p_idx);
+ ext4_free_blocks(handle, inode, NULL, next, 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
+ err = ext4_ext_get_access(handle, inode, path + i);
+ if (err)
+ return err;
+ le16_add_cpu(&path[i].p_hdr->eh_entries, -1);
+ if (le16_to_cpu(path[i].p_hdr->eh_entries) == 0) {
+ /* whole index block collapsed, go up */
+ continue;
+ }
+ /* remove index pointer */
+ used = EXT_LAST_INDEX(path[i].p_hdr) - path[i].p_idx + 1;
+ memmove(path[i].p_idx, path[i].p_idx + 1,
+ used * sizeof(struct ext4_extent_idx));
+
+ err = ext4_ext_dirty(handle, inode, path + i);
+ if (err)
+ return err;
+
+ if (path[i].p_hdr == npath[i].p_hdr)
+ break;
+
+ /* try to move index pointers */
+ used = le16_to_cpu(path[i].p_hdr->eh_entries);
+ free = le16_to_cpu(npath[i].p_hdr->eh_max) -
+ le16_to_cpu(npath[i].p_hdr->eh_entries);
+ if (used > free)
+ break;
+ err = ext4_ext_get_access(handle, inode, npath + i);
+ if (err)
+ return err;
+ memmove(EXT_FIRST_INDEX(npath[i].p_hdr) + used,
+ EXT_FIRST_INDEX(npath[i].p_hdr),
+ npath[i].p_hdr->eh_entries * sizeof(struct ext4_extent_idx));
+ memcpy(EXT_FIRST_INDEX(npath[i].p_hdr), EXT_FIRST_INDEX(path[i].p_hdr),
+ used * sizeof(struct ext4_extent_idx));
+ le16_add_cpu(&path[i].p_hdr->eh_entries, -used);
+ le16_add_cpu(&npath[i].p_hdr->eh_entries, used);
+ err = ext4_ext_dirty(handle, inode, path + i);
+ if (err)
+ return err;
+ err = ext4_ext_dirty(handle, inode, npath + i);
+ if (err)
+ return err;
+
+ /* correct index above */
+ for (k = i; k > 0; k--) {
+ err = ext4_ext_get_access(handle, inode, npath + k - 1);
+ if (err)
+ return err;
+ npath[k-1].p_idx->ei_block =
+ EXT_FIRST_INDEX(npath[k].p_hdr)->ei_block;
+ err = ext4_ext_dirty(handle, inode, npath + k - 1);
+ if (err)
+ return err;
+ }
+ }
+
+ /*
+ * TODO: given we've got two paths, it should be possible to
+ * collapse those two blocks into the root one in some cases
+ */
+ return 1;
+}
+
+static int ext4_ext_try_to_merge_blocks(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ struct ext4_ext_path *npath = NULL;
+ unsigned int depth = ext_depth(inode);
+ ext4_lblk_t next;
+ int used, rc = 0;
+
+ if (depth == 0)
+ return 0;
+
+ used = le16_to_cpu(path[depth].p_hdr->eh_entries);
+ /* don't be too agressive as checking space in
+ * the next block is not free */
+ if (used > ext4_ext_space_block(inode, 0) / 4)
+ return 0;
+
+ /* try to merge to the next block */
+ next = ext4_ext_next_leaf_block(path);
+ if (next == EXT_MAX_BLOCKS)
+ return 0;
+ npath = ext4_find_extent(inode, next, NULL, 0);
+ if (IS_ERR(npath))
+ return 0;
+ rc = ext4_ext_merge_blocks(handle, inode, path, npath);
+ ext4_ext_drop_refs(npath);
+ kfree(npath);
+ if (rc)
+ return rc > 0 ? 0 : rc;
+
+ /* try to merge with the previous block */
+ if (EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block == 0)
+ return 0;
+ next = EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block - 1;
+ npath = ext4_find_extent(inode, next, NULL, 0);
+ if (IS_ERR(npath))
+ return 0;
+ rc = ext4_ext_merge_blocks(handle, inode, npath, path);
+ ext4_ext_drop_refs(npath);
+ kfree(npath);
+ return rc > 0 ? 0 : rc;
}
/*
@@ -1976,6 +2145,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
int depth, len, err;
ext4_lblk_t next;
int mb_flags = 0, unwritten;
+ int merged = 0;
if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
mb_flags |= EXT4_MB_DELALLOC_RESERVED;
@@ -2167,8 +2337,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
merge:
/* try to merge extents */
if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
- ext4_ext_try_to_merge(handle, inode, path, nearex);
-
+ merged = ext4_ext_try_to_merge(handle, inode, path, nearex);
/* time to correct all indexes above */
err = ext4_ext_correct_indexes(handle, inode, path);
@@ -2176,6 +2345,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
goto cleanup;
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+ if (!err && merged)
+ err = ext4_ext_try_to_merge_blocks(handle, inode, path);
cleanup:
ext4_free_ext_path(npath);
@@ -3765,7 +3936,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
/* note: ext4_ext_correct_indexes() isn't needed here because
* borders are not changed
*/
- ext4_ext_try_to_merge(handle, inode, path, ex);
+ if (ext4_ext_try_to_merge(handle, inode, path, ex))
+ ext4_ext_try_to_merge_blocks(handle, inode, path);
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
@@ -3828,7 +4000,8 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
/* note: ext4_ext_correct_indexes() isn't needed here because
* borders are not changed
*/
- ext4_ext_try_to_merge(handle, inode, path, ex);
+ if (ext4_ext_try_to_merge(handle, inode, path, ex))
+ ext4_ext_try_to_merge_blocks(handle, inode, path);
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 18611241f451..7421f2af9cf2 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -513,6 +513,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
}
rsv_handle->h_reserved = 1;
rsv_handle->h_journal = journal;
+ rsv_handle->h_revoke_credits = revoke_records;
handle->h_rsv_handle = rsv_handle;
}
handle->h_revoke_credits = revoke_records;
--
2.40.1
On Jul 19, 2023, at 12:49 PM, Alex Zhuravlev <[email protected]> wrote:
>
> In some cases when a lot of extents are created initially by sparse file
> writes, they get merged over time, but there is no way to merge blocks in
> different indexes.
>
> For example, if a file is written synchronously, all even blocks first,
> then odd blocks. The resulting extents tree looks like the following in
> "debugfs stat" output, often with only a single block in each index/leaf:
>
> EXTENTS:
> (ETB0):33796
> (ETB1):33795
> (0-677):2588672-2589349
> (ETB1):2590753
> (678):2589350
> (ETB1):2590720
> (679-1357):2589351-2590029
> (ETB1):2590752
> (1358):2590030
> (ETB1):2590721
> (1359-2037):2590031-2590709
> (ETB1):2590751
> (2038):2590710
> (ETB1):2590722
> :
> :
>
> With the patch applied the index and lead blocks are properly merged
> (0.6% slower under this random sync write workload, but later read
> IOPS are greatly reduced):
>
> EXTENTS:
> (ETB0):33796
> (ETB1):2590736
> (0-2047):2588672-2590719
> (2048-11999):2592768-2602719
>
> Originally the problem was hit with a real application operating on
> huge datasets and with just 27371 extents
>
> inode has invalid extent depth: 6
>
> problem occurred. With the patch applied the application succeeded
> having finally 73637 extents in 3-level tree.
>
> Signed-off-by: Alex Zhuravlev <[email protected]>
Reviewed-by: Andreas Dilger <[email protected]>
> ---
> fs/ext4/extents.c | 185 ++++++++++++++++++++++++++++++++++++++++--
> fs/jbd2/transaction.c | 1 +
> 2 files changed, 180 insertions(+), 6 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index e4115d338f10..5b0e05cd5595 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -1885,7 +1885,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
> * This function tries to merge the @ex extent to neighbours in the tree, then
> * tries to collapse the extent tree into the inode.
> */
> -static void ext4_ext_try_to_merge(handle_t *handle,
> +static int ext4_ext_try_to_merge(handle_t *handle,
> struct inode *inode,
> struct ext4_ext_path *path,
> struct ext4_extent *ex)
> @@ -1902,9 +1902,178 @@ static void ext4_ext_try_to_merge(handle_t *handle,
> merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
>
> if (!merge_done)
> - (void) ext4_ext_try_to_merge_right(inode, path, ex);
> + merge_done = ext4_ext_try_to_merge_right(inode, path, ex);
>
> ext4_ext_try_to_merge_up(handle, inode, path);
> +
> + return merge_done;
> +}
> +
> +/*
> + * This function tries to merge blocks from @path into @npath
> + */
> +static int ext4_ext_merge_blocks(handle_t *handle,
> + struct inode *inode,
> + struct ext4_ext_path *path,
> + struct ext4_ext_path *npath)
> +{
> + unsigned int depth = ext_depth(inode);
> + int used, nused, free, i, k, err;
> + ext4_lblk_t next;
> +
> + if (path[depth].p_hdr == npath[depth].p_hdr)
> + return 0;
> +
> + used = le16_to_cpu(path[depth].p_hdr->eh_entries);
> + free = le16_to_cpu(npath[depth].p_hdr->eh_max) -
> + le16_to_cpu(npath[depth].p_hdr->eh_entries);
> + if (free < used)
> + return 0;
> +
> + err = ext4_ext_get_access(handle, inode, path + depth);
> + if (err)
> + return err;
> + err = ext4_ext_get_access(handle, inode, npath + depth);
> + if (err)
> + return err;
> +
> + /* move entries from the current leave to the next one */
> + nused = le16_to_cpu(npath[depth].p_hdr->eh_entries);
> + memmove(EXT_FIRST_EXTENT(npath[depth].p_hdr) + used,
> + EXT_FIRST_EXTENT(npath[depth].p_hdr),
> + nused * sizeof(struct ext4_extent));
> + memcpy(EXT_FIRST_EXTENT(npath[depth].p_hdr),
> + EXT_FIRST_EXTENT(path[depth].p_hdr),
> + used * sizeof(struct ext4_extent));
> + le16_add_cpu(&npath[depth].p_hdr->eh_entries, used);
> + le16_add_cpu(&path[depth].p_hdr->eh_entries, -used);
> + ext4_ext_try_to_merge_right(inode, npath,
> + EXT_FIRST_EXTENT(npath[depth].p_hdr));
> +
> + err = ext4_ext_dirty(handle, inode, path + depth);
> + if (err)
> + return err;
> + err = ext4_ext_dirty(handle, inode, npath + depth);
> + if (err)
> + return err;
> +
> + /* otherwise the index won't get corrected */
> + npath[depth].p_ext = EXT_FIRST_EXTENT(npath[depth].p_hdr);
> + err = ext4_ext_correct_indexes(handle, inode, npath);
> + if (err)
> + return err;
> +
> + for (i = depth - 1; i >= 0; i--) {
> +
> + next = ext4_idx_pblock(path[i].p_idx);
> + ext4_free_blocks(handle, inode, NULL, next, 1,
> + EXT4_FREE_BLOCKS_METADATA |
> + EXT4_FREE_BLOCKS_FORGET);
> + err = ext4_ext_get_access(handle, inode, path + i);
> + if (err)
> + return err;
> + le16_add_cpu(&path[i].p_hdr->eh_entries, -1);
> + if (le16_to_cpu(path[i].p_hdr->eh_entries) == 0) {
> + /* whole index block collapsed, go up */
> + continue;
> + }
> + /* remove index pointer */
> + used = EXT_LAST_INDEX(path[i].p_hdr) - path[i].p_idx + 1;
> + memmove(path[i].p_idx, path[i].p_idx + 1,
> + used * sizeof(struct ext4_extent_idx));
> +
> + err = ext4_ext_dirty(handle, inode, path + i);
> + if (err)
> + return err;
> +
> + if (path[i].p_hdr == npath[i].p_hdr)
> + break;
> +
> + /* try to move index pointers */
> + used = le16_to_cpu(path[i].p_hdr->eh_entries);
> + free = le16_to_cpu(npath[i].p_hdr->eh_max) -
> + le16_to_cpu(npath[i].p_hdr->eh_entries);
> + if (used > free)
> + break;
> + err = ext4_ext_get_access(handle, inode, npath + i);
> + if (err)
> + return err;
> + memmove(EXT_FIRST_INDEX(npath[i].p_hdr) + used,
> + EXT_FIRST_INDEX(npath[i].p_hdr),
> + npath[i].p_hdr->eh_entries * sizeof(struct ext4_extent_idx));
> + memcpy(EXT_FIRST_INDEX(npath[i].p_hdr), EXT_FIRST_INDEX(path[i].p_hdr),
> + used * sizeof(struct ext4_extent_idx));
> + le16_add_cpu(&path[i].p_hdr->eh_entries, -used);
> + le16_add_cpu(&npath[i].p_hdr->eh_entries, used);
> + err = ext4_ext_dirty(handle, inode, path + i);
> + if (err)
> + return err;
> + err = ext4_ext_dirty(handle, inode, npath + i);
> + if (err)
> + return err;
> +
> + /* correct index above */
> + for (k = i; k > 0; k--) {
> + err = ext4_ext_get_access(handle, inode, npath + k - 1);
> + if (err)
> + return err;
> + npath[k-1].p_idx->ei_block =
> + EXT_FIRST_INDEX(npath[k].p_hdr)->ei_block;
> + err = ext4_ext_dirty(handle, inode, npath + k - 1);
> + if (err)
> + return err;
> + }
> + }
> +
> + /*
> + * TODO: given we've got two paths, it should be possible to
> + * collapse those two blocks into the root one in some cases
> + */
> + return 1;
> +}
> +
> +static int ext4_ext_try_to_merge_blocks(handle_t *handle,
> + struct inode *inode,
> + struct ext4_ext_path *path)
> +{
> + struct ext4_ext_path *npath = NULL;
> + unsigned int depth = ext_depth(inode);
> + ext4_lblk_t next;
> + int used, rc = 0;
> +
> + if (depth == 0)
> + return 0;
> +
> + used = le16_to_cpu(path[depth].p_hdr->eh_entries);
> + /* don't be too agressive as checking space in
> + * the next block is not free */
> + if (used > ext4_ext_space_block(inode, 0) / 4)
> + return 0;
> +
> + /* try to merge to the next block */
> + next = ext4_ext_next_leaf_block(path);
> + if (next == EXT_MAX_BLOCKS)
> + return 0;
> + npath = ext4_find_extent(inode, next, NULL, 0);
> + if (IS_ERR(npath))
> + return 0;
> + rc = ext4_ext_merge_blocks(handle, inode, path, npath);
> + ext4_ext_drop_refs(npath);
> + kfree(npath);
> + if (rc)
> + return rc > 0 ? 0 : rc;
> +
> + /* try to merge with the previous block */
> + if (EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block == 0)
> + return 0;
> + next = EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block - 1;
> + npath = ext4_find_extent(inode, next, NULL, 0);
> + if (IS_ERR(npath))
> + return 0;
> + rc = ext4_ext_merge_blocks(handle, inode, npath, path);
> + ext4_ext_drop_refs(npath);
> + kfree(npath);
> + return rc > 0 ? 0 : rc;
> }
>
> /*
> @@ -1976,6 +2145,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
> int depth, len, err;
> ext4_lblk_t next;
> int mb_flags = 0, unwritten;
> + int merged = 0;
>
> if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
> mb_flags |= EXT4_MB_DELALLOC_RESERVED;
> @@ -2167,8 +2337,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
> merge:
> /* try to merge extents */
> if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
> - ext4_ext_try_to_merge(handle, inode, path, nearex);
> -
> + merged = ext4_ext_try_to_merge(handle, inode, path, nearex);
>
> /* time to correct all indexes above */
> err = ext4_ext_correct_indexes(handle, inode, path);
> @@ -2176,6 +2345,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
> goto cleanup;
>
> err = ext4_ext_dirty(handle, inode, path + path->p_depth);
> + if (!err && merged)
> + err = ext4_ext_try_to_merge_blocks(handle, inode, path);
>
> cleanup:
> ext4_free_ext_path(npath);
> @@ -3765,7 +3936,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
> /* note: ext4_ext_correct_indexes() isn't needed here because
> * borders are not changed
> */
> - ext4_ext_try_to_merge(handle, inode, path, ex);
> + if (ext4_ext_try_to_merge(handle, inode, path, ex))
> + ext4_ext_try_to_merge_blocks(handle, inode, path);
>
> /* Mark modified extent as dirty */
> err = ext4_ext_dirty(handle, inode, path + path->p_depth);
> @@ -3828,7 +4000,8 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
> /* note: ext4_ext_correct_indexes() isn't needed here because
> * borders are not changed
> */
> - ext4_ext_try_to_merge(handle, inode, path, ex);
> + if (ext4_ext_try_to_merge(handle, inode, path, ex))
> + ext4_ext_try_to_merge_blocks(handle, inode, path);
>
> /* Mark modified extent as dirty */
> err = ext4_ext_dirty(handle, inode, path + path->p_depth);
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index 18611241f451..7421f2af9cf2 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -513,6 +513,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
> }
> rsv_handle->h_reserved = 1;
> rsv_handle->h_journal = journal;
> + rsv_handle->h_revoke_credits = revoke_records;
> handle->h_rsv_handle = rsv_handle;
> }
> handle->h_revoke_credits = revoke_records;
> --
> 2.40.1
>
Cheers, Andreas