2021-05-26 08:44:46

by Wang Jianchao

[permalink] [raw]
Subject: [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data

Split the code that inserts and merges ext4_free_data structures
into a new interface, ext4_insert_free_data. This prepares for the
following async background discard support.

Signed-off-by: Wang Jianchao <[email protected]>
---
fs/ext4/mballoc.c | 96 +++++++++++++++++++++++++++++--------------------------
1 file changed, 51 insertions(+), 45 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 85418cf..16f06d2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -350,6 +350,12 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
+static inline struct ext4_free_data *efd_entry(struct rb_node *n)
+{
+ return rb_entry_safe(n, struct ext4_free_data, efd_node);
+}
+static int ext4_insert_free_data(struct ext4_sb_info *sbi,
+ struct rb_root *root, struct ext4_free_data *nfd);

/*
* The algorithm using this percpu seq counter goes below:
@@ -5069,28 +5075,53 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
kmem_cache_free(ext4_free_data_cachep, entry);
}

+static int ext4_insert_free_data(struct ext4_sb_info *sbi,
+ struct rb_root *root, struct ext4_free_data *nfd)
+{
+ struct rb_node **n = &root->rb_node;
+ struct rb_node *p = NULL;
+ struct ext4_free_data *fd;
+
+ while (*n) {
+ p = *n;
+ fd = rb_entry(p, struct ext4_free_data, efd_node);
+ if (nfd->efd_start_cluster < fd->efd_start_cluster)
+ n = &(*n)->rb_left;
+ else if (nfd->efd_start_cluster >=
+ (fd->efd_start_cluster + fd->efd_count))
+ n = &(*n)->rb_right;
+ else
+ return -EINVAL;
+ }
+
+ rb_link_node(&nfd->efd_node, p, n);
+ rb_insert_color(&nfd->efd_node, root);
+
+ /* Now try to see the extent can be merged to left and right */
+ fd = efd_entry(rb_prev(&nfd->efd_node));
+ if (fd)
+ ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+
+ fd = efd_entry(rb_next(&nfd->efd_node));
+ if (fd)
+ ext4_try_merge_freed_extent(sbi, fd, nfd, root);
+
+ return 0;
+}
+
static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
- struct ext4_free_data *new_entry)
+ struct ext4_free_data *nfd)
{
- ext4_group_t group = e4b->bd_group;
- ext4_grpblk_t cluster;
- ext4_grpblk_t clusters = new_entry->efd_count;
- struct ext4_free_data *entry;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct rb_node **n = &db->bb_free_root.rb_node, *node;
- struct rb_node *parent = NULL, *new_node;

BUG_ON(!ext4_handle_valid(handle));
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);

- new_node = &new_entry->efd_node;
- cluster = new_entry->efd_start_cluster;
-
- if (!*n) {
+ if (!db->bb_free_root.rb_node) {
/* first free block exent. We need to
protect buddy cache from being freed,
* otherwise we'll refresh it from
@@ -5099,44 +5130,19 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
get_page(e4b->bd_buddy_page);
get_page(e4b->bd_bitmap_page);
}
- while (*n) {
- parent = *n;
- entry = rb_entry(parent, struct ext4_free_data, efd_node);
- if (cluster < entry->efd_start_cluster)
- n = &(*n)->rb_left;
- else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
- n = &(*n)->rb_right;
- else {
- ext4_grp_locked_error(sb, group, 0,
- ext4_group_first_block_no(sb, group) +
- EXT4_C2B(sbi, cluster),
- "Block already on to-be-freed list");
- kmem_cache_free(ext4_free_data_cachep, new_entry);
- return 0;
- }
- }
-
- rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &db->bb_free_root);
-
- /* Now try to see the extent can be merged to left and right */
- node = rb_prev(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
- }

- node = rb_next(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
+ if (ext4_insert_free_data(sbi, &db->bb_free_root, nfd)) {
+ ext4_grp_locked_error(sb, e4b->bd_group, 0,
+ ext4_group_first_block_no(sb, e4b->bd_group) +
+ EXT4_C2B(sbi, nfd->efd_start_cluster),
+ "Block already on to-be-freed list");
+ kmem_cache_free(ext4_free_data_cachep, nfd);
+ return 0;
}

spin_lock(&sbi->s_md_lock);
- list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
- sbi->s_mb_free_pending += clusters;
+ list_add_tail(&nfd->efd_list, &sbi->s_freed_data_list);
+ sbi->s_mb_free_pending += nfd->efd_count;
spin_unlock(&sbi->s_md_lock);
return 0;
}
--
1.8.3.1


2021-05-27 20:10:20

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data

On May 26, 2021, at 2:43 AM, Wang Jianchao <[email protected]> wrote:
>
> Split the codes that inserts and merges ext4_free_data structures
> into a new interface ext4_insert_free_data. This is preparing for
> following async background discard.

Thank you for your patch series. I think this is an important area to
improve, since the current "-o discard" option adds too much overhead
to be really usable in practice.

One problem with tracking the fine-grained freed extents and then using
them directly to submit TRIM requests is that the underlying device may
ignore TRIM requests that are too small. Submitting the TRIM right
after each transaction commit does not allow much time for freed blocks
to be aggregated (e.g. "rm -r" of a big directory tree), so it would be
better to delay TRIM requests until more freed extents can be merged.
Since most users only run fstrim once a day or every few days, it makes
sense to allow time to merge freed space (tunable, maybe 5-15 minutes).

However, tracking the rbtree for each group may be quite a lot of overhead
if this is kept in memory for minutes or hours, so minimizing the memory
usage to track freed extents is also important.

We discussed on the ext4 developer call today whether it is necessary
to track the fine-grained free extents in memory, or if it would be
better to only track min/max freed blocks within each group? Depending
on the fragmentation of the free blocks in the group, it may be enough
to just store a single bit in each group (as is done today), and only
clear this when there are blocks freed in the group.

Either way, the improvement would be that the kernel is scheduling
groups to be trimmed, and submitting TRIM requests at a much larger size,
instead of depending on userspace to run fstrim. This also allows the
fstrim scheduler to decide when the device is less busy and submit more
TRIM requests, and back off when the device is busy.

The other potential improvement is to track the TRIMMED state persistently
in the block groups, so that unmount/remount doesn't result in every group
being trimmed again. It would be good to refresh and include patches from:

"ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim"
https://patchwork.ozlabs.org/project/linux-ext4/list/?series=184981

and

e2fsprogs: add EXT2_FLAG_BG_WAS_TRIMMED to optimize fstrim
https://patchwork.ozlabs.org/project/linux-ext4/list/?series=179639

along with this series.

> Signed-off-by: Wang Jianchao <[email protected]>
> ---
> fs/ext4/mballoc.c | 96 +++++++++++++++++++++++++++++--------------------------
> 1 file changed, 51 insertions(+), 45 deletions(-)
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 85418cf..16f06d2 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -350,6 +350,12 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
> static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
> ext4_group_t group);
> static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
> +static inline struct ext4_free_data *efd_entry(struct rb_node *n)
> +{
> + return rb_entry_safe(n, struct ext4_free_data, efd_node);
> +}
> +static int ext4_insert_free_data(struct ext4_sb_info *sbi,
> + struct rb_root *root, struct ext4_free_data *nfd);
>
> /*
> * The algorithm using this percpu seq counter goes below:
> @@ -5069,28 +5075,53 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
> kmem_cache_free(ext4_free_data_cachep, entry);
> }
>
> +static int ext4_insert_free_data(struct ext4_sb_info *sbi,
> + struct rb_root *root, struct ext4_free_data *nfd)
> +{
> + struct rb_node **n = &root->rb_node;
> + struct rb_node *p = NULL;
> + struct ext4_free_data *fd;
> +
> + while (*n) {
> + p = *n;
> + fd = rb_entry(p, struct ext4_free_data, efd_node);
> + if (nfd->efd_start_cluster < fd->efd_start_cluster)
> + n = &(*n)->rb_left;
> + else if (nfd->efd_start_cluster >=
> + (fd->efd_start_cluster + fd->efd_count))
> + n = &(*n)->rb_right;
> + else
> + return -EINVAL;
> + }
> +
> + rb_link_node(&nfd->efd_node, p, n);
> + rb_insert_color(&nfd->efd_node, root);
> +
> + /* Now try to see the extent can be merged to left and right */
> + fd = efd_entry(rb_prev(&nfd->efd_node));
> + if (fd)
> + ext4_try_merge_freed_extent(sbi, fd, nfd, root);
> +
> + fd = efd_entry(rb_next(&nfd->efd_node));
> + if (fd)
> + ext4_try_merge_freed_extent(sbi, fd, nfd, root);
> +
> + return 0;
> +}
> +
> static noinline_for_stack int
> ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
> - struct ext4_free_data *new_entry)
> + struct ext4_free_data *nfd)
> {
> - ext4_group_t group = e4b->bd_group;
> - ext4_grpblk_t cluster;
> - ext4_grpblk_t clusters = new_entry->efd_count;
> - struct ext4_free_data *entry;
> struct ext4_group_info *db = e4b->bd_info;
> struct super_block *sb = e4b->bd_sb;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> - struct rb_node **n = &db->bb_free_root.rb_node, *node;
> - struct rb_node *parent = NULL, *new_node;
>
> BUG_ON(!ext4_handle_valid(handle));
> BUG_ON(e4b->bd_bitmap_page == NULL);
> BUG_ON(e4b->bd_buddy_page == NULL);
>
> - new_node = &new_entry->efd_node;
> - cluster = new_entry->efd_start_cluster;
> -
> - if (!*n) {
> + if (!db->bb_free_root.rb_node) {
> /* first free block exent. We need to
> protect buddy cache from being freed,
> * otherwise we'll refresh it from
> @@ -5099,44 +5130,19 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
> get_page(e4b->bd_buddy_page);
> get_page(e4b->bd_bitmap_page);
> }
> - while (*n) {
> - parent = *n;
> - entry = rb_entry(parent, struct ext4_free_data, efd_node);
> - if (cluster < entry->efd_start_cluster)
> - n = &(*n)->rb_left;
> - else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
> - n = &(*n)->rb_right;
> - else {
> - ext4_grp_locked_error(sb, group, 0,
> - ext4_group_first_block_no(sb, group) +
> - EXT4_C2B(sbi, cluster),
> - "Block already on to-be-freed list");
> - kmem_cache_free(ext4_free_data_cachep, new_entry);
> - return 0;
> - }
> - }
> -
> - rb_link_node(new_node, parent, n);
> - rb_insert_color(new_node, &db->bb_free_root);
> -
> - /* Now try to see the extent can be merged to left and right */
> - node = rb_prev(new_node);
> - if (node) {
> - entry = rb_entry(node, struct ext4_free_data, efd_node);
> - ext4_try_merge_freed_extent(sbi, entry, new_entry,
> - &(db->bb_free_root));
> - }
>
> - node = rb_next(new_node);
> - if (node) {
> - entry = rb_entry(node, struct ext4_free_data, efd_node);
> - ext4_try_merge_freed_extent(sbi, entry, new_entry,
> - &(db->bb_free_root));
> + if (ext4_insert_free_data(sbi, &db->bb_free_root, nfd)) {
> + ext4_grp_locked_error(sb, e4b->bd_group, 0,
> + ext4_group_first_block_no(sb, e4b->bd_group) +
> + EXT4_C2B(sbi, nfd->efd_start_cluster),
> + "Block already on to-be-freed list");
> + kmem_cache_free(ext4_free_data_cachep, nfd);
> + return 0;
> }
>
> spin_lock(&sbi->s_md_lock);
> - list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
> - sbi->s_mb_free_pending += clusters;
> + list_add_tail(&nfd->efd_list, &sbi->s_freed_data_list);
> + sbi->s_mb_free_pending += nfd->efd_count;
> spin_unlock(&sbi->s_md_lock);
> return 0;
> }
> --
> 1.8.3.1
>


Cheers, Andreas






Attachments:
signature.asc (890.00 B)
Message signed with OpenPGP

2021-05-28 06:11:00

by Wang Jianchao

[permalink] [raw]
Subject: Re: [PATCH V2 4/7] ext4: add new helper interface ext4_insert_free_data



On 2021/5/28 4:09 AM, Andreas Dilger wrote:
> On May 26, 2021, at 2:43 AM, Wang Jianchao <[email protected]> wrote:
>>
>> Split the codes that inserts and merges ext4_free_data structures
>> into a new interface ext4_insert_free_data. This is preparing for
>> following async background discard.
>
> Thank you for your patch series. I think this is an important area to
> improve, since the current "-o discard" option adds too much overhead
> to be really usable in practice.

Yes, indeed
Discard can help return unused space back to the storage cluster.
But doing a discard after every commit can be a disaster:
- the jbd2 commit kthread can sometimes be blocked for a long time, and
then all metadata-modifying operations are blocked due to lack of log
space
- the flood of discard requests can saturate the storage backend, and
then real write operations are blocked, especially the jbd2 log writes

Even on a system with this patch, we can still observe the log write IO
being blocked by the discard T_T...

>
> One problem with tracking the fine-grained freed extents and then using
> them directly to submit TRIM requests is that the underlying device may
> ignore TRIM requests that are too small. Submitting the TRIM right
> after each transaction commit does not allow much time for freed blocks
> to be aggregated (e.g. "rm -r" of a big directory tree), so it would be
> better to delay TRIM requests until more freed extents can be merged.
> Since most users only run fstrim once a day or every few days, it makes
> sense to allow time to merge freed space (tunable, maybe 5-15 minutes).
>
> However, tracking the rbtree for each group may be quite a lot of overhead
> if this is kept in memory for minutes or hours, so minimizing the memory
> usage to track freed extents is also important.
>
> We discussed on the ext4 developer call today whether it is necessary
> to track the fine-grained free extents in memory, or if it would be
> better to only track min/max freed blocks within each group? Depending
> on the fragmentation of the free blocks in the group, it may be enough
> to just store a single bit in each group (as is done today), and only
> clear this when there are blocks freed in the group.
>
> Either way, the improvement would be that the kernel is scheduling
> groups to be trimmed, and submitting TRIM requests at a much larger size,
> instead of depending on userspace to run fstrim. This also allows the
> fstrim scheduler to decide when the device is less busy and submit more
> TRIM requests, and back off when the device is busy.

Scheduling a background trim task in the kernel when the storage is not
so busy, and picking a block group that has enough free blocks --
this sounds fair.

>
> The other potential improvement is to track the TRIMMED state persistently
> in the block groups, so that unmount/remount doesn't result in every group
> being trimmed again. It would be good to refresh and include patches from:
>
> "ext4: introduce EXT4_BG_WAS_TRIMMED to optimize trim"
> https://patchwork.ozlabs.org/project/linux-ext4/list/?series=184981
>
> and
>
> e2fsprogs: add EXT2_FLAG_BG_WAS_TRIMMED to optimize fstrim
> https://patchwork.ozlabs.org/project/linux-ext4/list/?series=179639
>
> along with this series.
>

Yes, thanks a million

Best regard
Jianchao

>> Signed-off-by: Wang Jianchao <[email protected]>

>
>