For zoned filesytsems we heavily rely on relocation for garbage collecting
as we cannot do any in-place updates of disk blocks.
But there can be situations where we're running out of space for doing the
relocation.
To solve this, always have a zone reserved for relocation.
This is a subset of another approach to this problem I've submitted in
https://lore.kernel.org/r/[email protected]
---
Johannes Thumshirn (2):
btrfs: zoned: reserve relocation zone on mount
btrfs: reserve new relocation zone after successful relocation
fs/btrfs/disk-io.c | 2 ++
fs/btrfs/relocation.c | 4 ++++
fs/btrfs/zoned.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
fs/btrfs/zoned.h | 3 +++
4 files changed, 69 insertions(+)
---
base-commit: d52875a6df98dc77933853e8427bd77f4598a9a7
change-id: 20240514-zoned-gc-2ce793459eb7
Best regards,
--
Johannes Thumshirn <[email protected]>
From: Johannes Thumshirn <[email protected]>
Reserve one zone as a data relocation target on each mount. If we already
find one empty block group, there's no need to force a chunk allocation,
but we can use this empty data block group as our relocation target.
Signed-off-by: Johannes Thumshirn <[email protected]>
---
fs/btrfs/disk-io.c | 2 ++
fs/btrfs/zoned.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/btrfs/zoned.h | 3 +++
3 files changed, 65 insertions(+)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a91a8056758a..0490f2f45fb1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3558,6 +3558,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
btrfs_discard_resume(fs_info);
+ btrfs_reserve_relocation_zone(fs_info);
+
if (fs_info->uuid_root &&
(btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 4cba80b34387..b752f8c95f40 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -17,6 +17,7 @@
#include "fs.h"
#include "accessors.h"
#include "bio.h"
+#include "transaction.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -2634,3 +2635,62 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->zone_active_bgs_lock);
}
+
+static u64 find_empty_block_group(struct btrfs_space_info *sinfo, u64 flags)
+{
+ struct btrfs_block_group *bg;
+
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ list_for_each_entry(bg, &sinfo->block_groups[i], list) {
+ if (bg->flags != flags)
+ continue;
+ if (bg->used == 0)
+ return bg->start;
+ }
+ }
+
+ return 0;
+}
+
+void btrfs_reserve_relocation_zone(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+ struct btrfs_trans_handle *trans;
+ u64 flags = btrfs_get_alloc_profile(fs_info, sinfo->flags);
+ u64 bytenr = 0;
+
+ lockdep_assert_not_held(&fs_info->relocation_bg_lock);
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ bytenr = find_empty_block_group(sinfo, flags);
+ if (!bytenr) {
+ int ret;
+
+ trans = btrfs_join_transaction(tree_root);
+ if (IS_ERR(trans))
+ return;
+
+ ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
+
+ if (!ret) {
+ struct btrfs_block_group *bg;
+
+ bytenr = find_empty_block_group(sinfo, flags);
+ if (!bytenr)
+ goto out;
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ ASSERT(bg);
+
+ if (!btrfs_zone_activate(bg))
+ bytenr = 0;
+ btrfs_put_block_group(bg);
+ }
+ }
+
+out:
+ fs_info->data_reloc_bg = bytenr;
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 77c4321e331f..048ffada4549 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -97,6 +97,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
+void btrfs_reserve_relocation_zone(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -271,6 +272,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
+static inline void btrfs_reserve_relocation_zone(struct btrfs_fs_info *fs_info) { }
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
--
2.35.3
From: Johannes Thumshirn <[email protected]>
After we've committed a relocation transaction, we know we have just freed
up space. Set it as hint for the next relocation.
Signed-off-by: Johannes Thumshirn <[email protected]>
---
fs/btrfs/relocation.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8b24bb5a0aa1..d943abf5cb33 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3808,9 +3808,13 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
err = PTR_ERR(trans);
goto out_free;
}
+
ret = btrfs_commit_transaction(trans);
if (ret && !err)
err = ret;
+
+ /* We know we have just freed space, set it as hint for the next relocation */
+ btrfs_reserve_relocation_zone(fs_info);
out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
--
2.35.3
On Tue, May 14, 2024 at 11:13:21PM +0200, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <[email protected]>
>
> Reserve one zone as a data relocation target on each mount. If we already
> find one empty block group, there's no need to force a chunk allocation,
> but we can use this empty data block group as our relocation target.
>
> Signed-off-by: Johannes Thumshirn <[email protected]>
> ---
> fs/btrfs/disk-io.c | 2 ++
> fs/btrfs/zoned.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/zoned.h | 3 +++
> 3 files changed, 65 insertions(+)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index a91a8056758a..0490f2f45fb1 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3558,6 +3558,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
> }
> btrfs_discard_resume(fs_info);
>
> + btrfs_reserve_relocation_zone(fs_info);
> +
> if (fs_info->uuid_root &&
> (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
> fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index 4cba80b34387..b752f8c95f40 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -17,6 +17,7 @@
> #include "fs.h"
> #include "accessors.h"
> #include "bio.h"
> +#include "transaction.h"
>
> /* Maximum number of zones to report per blkdev_report_zones() call */
> #define BTRFS_REPORT_NR_ZONES 4096
> @@ -2634,3 +2635,62 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
> }
> spin_unlock(&fs_info->zone_active_bgs_lock);
> }
> +
> +static u64 find_empty_block_group(struct btrfs_space_info *sinfo, u64 flags)
> +{
> + struct btrfs_block_group *bg;
> +
> + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
> + list_for_each_entry(bg, &sinfo->block_groups[i], list) {
> + if (bg->flags != flags)
> + continue;
> + if (bg->used == 0)
> + return bg->start;
> + }
> + }
> +
> + return 0;
> +}
> +
> +void btrfs_reserve_relocation_zone(struct btrfs_fs_info *fs_info)
This function reserves the data relocation block group but not a zone. So,
I'd prefer "btrfs_reserve_relocation_block_group" or "..._bg".
> +{
> + struct btrfs_root *tree_root = fs_info->tree_root;
> + struct btrfs_space_info *sinfo = fs_info->data_sinfo;
> + struct btrfs_trans_handle *trans;
> + u64 flags = btrfs_get_alloc_profile(fs_info, sinfo->flags);
> + u64 bytenr = 0;
> +
> + lockdep_assert_not_held(&fs_info->relocation_bg_lock);
> +
> + if (!btrfs_is_zoned(fs_info))
> + return;
> +
Don't we need to check fs_info->data_reloc_bg first? If we don't find an
empty BG and we fail to allocate a new BG, this is going to kill the
existing data reloc BG. It's OK for mount time. But, this is going to be
called on runtime in the next patch.
> + bytenr = find_empty_block_group(sinfo, flags);
> + if (!bytenr) {
> + int ret;
> +
> + trans = btrfs_join_transaction(tree_root);
> + if (IS_ERR(trans))
> + return;
> +
> + ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
> + btrfs_end_transaction(trans);
I'd like to have a comment on the error case, especially relates to the
above point. Is it OK to override fs_info->data_reloc_bg to 0 in the error
case?
> +
> + if (!ret) {
> + struct btrfs_block_group *bg;
> +
> + bytenr = find_empty_block_group(sinfo, flags);
> + if (!bytenr)
Maybe, same here. This is a case of someone stealing the allocated
chunk. Isn't it worth retrying? Especially when this function is called on
runtime?
> + goto out;
> + bg = btrfs_lookup_block_group(fs_info, bytenr);
> + ASSERT(bg);
> +
> + if (!btrfs_zone_activate(bg))
> + bytenr = 0;
> + btrfs_put_block_group(bg);
> + }
> + }
> +
> +out:
> + fs_info->data_reloc_bg = bytenr;
> +}
> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> index 77c4321e331f..048ffada4549 100644
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -97,6 +97,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
> int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
> struct btrfs_space_info *space_info, bool do_finish);
> void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
> +void btrfs_reserve_relocation_zone(struct btrfs_fs_info *fs_info);
> #else /* CONFIG_BLK_DEV_ZONED */
> static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
> struct blk_zone *zone)
> @@ -271,6 +272,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
>
> static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
>
> +static inline void btrfs_reserve_relocation_zone(struct btrfs_fs_info *fs_info) { }
> +
> #endif
>
> static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
>
> --
> 2.35.3
>
On Tue, May 14, 2024 at 11:13:22PM +0200, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <[email protected]>
>
> After we've committed a relocation transaction, we know we have just freed
> up space. Set it as hint for the next relocation.
>
> Signed-off-by: Johannes Thumshirn <[email protected]>
> ---
> fs/btrfs/relocation.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index 8b24bb5a0aa1..d943abf5cb33 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -3808,9 +3808,13 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
> err = PTR_ERR(trans);
> goto out_free;
> }
> +
> ret = btrfs_commit_transaction(trans);
> if (ret && !err)
> err = ret;
> +
> + /* We know we have just freed space, set it as hint for the next relocation */
> + btrfs_reserve_relocation_zone(fs_info);
With "if (!err)"? We definitely bail out fast in an error case.
> out_free:
> ret = clean_dirty_subvols(rc);
> if (ret < 0 && !err)
>
> --
> 2.35.3
>
On 15.05.24 12:01, Naohiro Aota wrote:
>> +
>> + /* We know we have just freed space, set it as hint for the next relocation */
>> + btrfs_reserve_relocation_zone(fs_info);
>
> With "if (!err)"? We definitely bail out fast in an error case.
>
Right, I'll fix it in the next version.