For zoned filesystems we heavily rely on relocation for garbage collecting
as we cannot do any in-place updates of disk blocks.
But there can be situations where we're running out of space for doing the
relocation.
To solve this, always have a zone reserved for relocation.
This is a subset of another approach to this problem I've submitted in
https://lore.kernel.org/r/[email protected]
---
Changes in v3:
- Rename btrfs_reserve_relocation_zone -> btrfs_reserve_relocation_bg
- Bail out if we already have a relocation bg set
- Link to v2: https://lore.kernel.org/r/[email protected]
Changes in v2:
- Incorporate Naohiro's review
- Link to v1: https://lore.kernel.org/r/[email protected]
---
Johannes Thumshirn (2):
btrfs: zoned: reserve relocation block-group on mount
btrfs: reserve new relocation block-group after successful relocation
fs/btrfs/disk-io.c | 2 ++
fs/btrfs/relocation.c | 7 ++++++
fs/btrfs/zoned.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++
fs/btrfs/zoned.h | 3 +++
4 files changed, 77 insertions(+)
---
base-commit: d52875a6df98dc77933853e8427bd77f4598a9a7
change-id: 20240514-zoned-gc-2ce793459eb7
Best regards,
--
Johannes Thumshirn <[email protected]>
From: Johannes Thumshirn <[email protected]>
Reserve one zone as a data relocation target on each mount. If we already
find one empty block group, there's no need to force a chunk allocation,
but we can use this empty data block group as our relocation target.
Signed-off-by: Johannes Thumshirn <[email protected]>
---
fs/btrfs/disk-io.c | 2 ++
fs/btrfs/zoned.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/btrfs/zoned.h | 3 +++
3 files changed, 70 insertions(+)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a91a8056758a..19e7b4a59a9e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3558,6 +3558,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
btrfs_discard_resume(fs_info);
+ btrfs_reserve_relocation_bg(fs_info);
+
if (fs_info->uuid_root &&
(btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 4cba80b34387..9404cb32256f 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -17,6 +17,7 @@
#include "fs.h"
#include "accessors.h"
#include "bio.h"
+#include "transaction.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -2634,3 +2635,67 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->zone_active_bgs_lock);
}
+
+static u64 find_empty_block_group(struct btrfs_space_info *sinfo, u64 flags)
+{
+ struct btrfs_block_group *bg;
+
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ list_for_each_entry(bg, &sinfo->block_groups[i], list) {
+ if (bg->flags != flags)
+ continue;
+ if (bg->used == 0)
+ return bg->start;
+ }
+ }
+
+ return 0;
+}
+
+void btrfs_reserve_relocation_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *tree_root = fs_info->tree_root;
+ struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+ struct btrfs_trans_handle *trans;
+ u64 flags = btrfs_get_alloc_profile(fs_info, sinfo->flags);
+ u64 bytenr = 0;
+
+ lockdep_assert_not_held(&fs_info->relocation_bg_lock);
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ if (fs_info->data_reloc_bg)
+ return;
+
+ bytenr = find_empty_block_group(sinfo, flags);
+ if (!bytenr) {
+ int ret;
+
+ trans = btrfs_join_transaction(tree_root);
+ if (IS_ERR(trans))
+ return;
+
+ ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
+
+ if (!ret) {
+ struct btrfs_block_group *bg;
+
+ bytenr = find_empty_block_group(sinfo, flags);
+ if (!bytenr)
+ goto out;
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ ASSERT(bg);
+
+ if (!btrfs_zone_activate(bg))
+ bytenr = 0;
+ btrfs_put_block_group(bg);
+ }
+ }
+
+out:
+ spin_lock(&fs_info->relocation_bg_lock);
+ fs_info->data_reloc_bg = bytenr;
+ spin_unlock(&fs_info->relocation_bg_lock);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 77c4321e331f..b9935222bf7a 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -97,6 +97,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
+void btrfs_reserve_relocation_bg(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -271,6 +272,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
+static inline void btrfs_reserve_relocation_bg(struct btrfs_fs_info *fs_info) { }
+
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
--
2.43.0
From: Johannes Thumshirn <[email protected]>
After we've committed a relocation transaction, we know we have just freed
up space. Use it to reserve a new relocation block group as the target for
the next relocation.
Signed-off-by: Johannes Thumshirn <[email protected]>
---
fs/btrfs/relocation.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8b24bb5a0aa1..764317a1c55d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3811,6 +3811,13 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
ret = btrfs_commit_transaction(trans);
if (ret && !err)
err = ret;
+
+ /*
+ * We know we have just freed space, set it as hint for the
+ * next relocation.
+ */
+ if (!err)
+ btrfs_reserve_relocation_bg(fs_info);
out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
--
2.43.0
On Tue, May 21, 2024 at 3:58 PM Johannes Thumshirn <[email protected]> wrote:
>
> From: Johannes Thumshirn <[email protected]>
>
> Reserve one zone as a data relocation target on each mount. If we already
> find one empty block group, there's no need to force a chunk allocation,
> but we can use this empty data block group as our relocation target.
>
> Signed-off-by: Johannes Thumshirn <[email protected]>
> ---
> fs/btrfs/disk-io.c | 2 ++
> fs/btrfs/zoned.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/zoned.h | 3 +++
> 3 files changed, 70 insertions(+)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index a91a8056758a..19e7b4a59a9e 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3558,6 +3558,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
> }
> btrfs_discard_resume(fs_info);
>
> + btrfs_reserve_relocation_bg(fs_info);
> +
> if (fs_info->uuid_root &&
> (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
> fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index 4cba80b34387..9404cb32256f 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -17,6 +17,7 @@
> #include "fs.h"
> #include "accessors.h"
> #include "bio.h"
> +#include "transaction.h"
>
> /* Maximum number of zones to report per blkdev_report_zones() call */
> #define BTRFS_REPORT_NR_ZONES 4096
> @@ -2634,3 +2635,67 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
> }
> spin_unlock(&fs_info->zone_active_bgs_lock);
> }
> +
> +static u64 find_empty_block_group(struct btrfs_space_info *sinfo, u64 flags)
> +{
> + struct btrfs_block_group *bg;
> +
> + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
> + list_for_each_entry(bg, &sinfo->block_groups[i], list) {
> + if (bg->flags != flags)
> + continue;
> + if (bg->used == 0)
> + return bg->start;
> + }
> + }
I believe I commented about this in some previous patchset version,
but here goes again.
This happens at mount time, where we have already loaded all block groups.
When we load them, if we find unused ones, we add them to the list of
empty block groups, so that the next time the cleaner kthread runs it
deletes them.
I don't see any code here removing the selected block group from that
list, or anything at btrfs_delete_unused_bgs() that prevents deleting
a block group if it was selected as the data reloc bg.
Maybe I'm missing something?
How do we ensure the selected block group isn't deleted by the cleaner kthread?
Thanks.
> +
> + return 0;
> +}
> +
> +void btrfs_reserve_relocation_bg(struct btrfs_fs_info *fs_info)
> +{
> + struct btrfs_root *tree_root = fs_info->tree_root;
> + struct btrfs_space_info *sinfo = fs_info->data_sinfo;
> + struct btrfs_trans_handle *trans;
> + u64 flags = btrfs_get_alloc_profile(fs_info, sinfo->flags);
> + u64 bytenr = 0;
> +
> + lockdep_assert_not_held(&fs_info->relocation_bg_lock);
> +
> + if (!btrfs_is_zoned(fs_info))
> + return;
> +
> + if (fs_info->data_reloc_bg)
> + return;
> +
> + bytenr = find_empty_block_group(sinfo, flags);
> + if (!bytenr) {
> + int ret;
> +
> + trans = btrfs_join_transaction(tree_root);
> + if (IS_ERR(trans))
> + return;
> +
> + ret = btrfs_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
> + btrfs_end_transaction(trans);
> +
> + if (!ret) {
> + struct btrfs_block_group *bg;
> +
> + bytenr = find_empty_block_group(sinfo, flags);
> + if (!bytenr)
> + goto out;
> + bg = btrfs_lookup_block_group(fs_info, bytenr);
> + ASSERT(bg);
> +
> + if (!btrfs_zone_activate(bg))
> + bytenr = 0;
> + btrfs_put_block_group(bg);
> + }
> + }
> +
> +out:
> + spin_lock(&fs_info->relocation_bg_lock);
> + fs_info->data_reloc_bg = bytenr;
> + spin_unlock(&fs_info->relocation_bg_lock);
> +}
> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> index 77c4321e331f..b9935222bf7a 100644
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -97,6 +97,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
> int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
> struct btrfs_space_info *space_info, bool do_finish);
> void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
> +void btrfs_reserve_relocation_bg(struct btrfs_fs_info *fs_info);
> #else /* CONFIG_BLK_DEV_ZONED */
> static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
> struct blk_zone *zone)
> @@ -271,6 +272,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
>
> static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
>
> +static inline void btrfs_reserve_relocation_zone(struct btrfs_fs_info *fs_info) { }
> +
> #endif
>
> static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
>
> --
> 2.43.0
>
>
On Tue, May 21, 2024 at 04:58:08PM GMT, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <[email protected]>
>
> After we've committed a relocation transaction, we know we have just freed
> up space. Set it as hint for the next relocation.
>
> Signed-off-by: Johannes Thumshirn <[email protected]>
> ---
Looks good.
Reviewed-by: Naohiro Aota <[email protected]>
On 21.05.24 17:23, Filipe Manana wrote:
>> +static u64 find_empty_block_group(struct btrfs_space_info *sinfo, u64 flags)
>> +{
>> + struct btrfs_block_group *bg;
>> +
>> + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
>> + list_for_each_entry(bg, &sinfo->block_groups[i], list) {
>> + if (bg->flags != flags)
>> + continue;
>> + if (bg->used == 0)
>> + return bg->start;
>> + }
>> + }
> I believe I commented about this in some previous patchset version,
> but here goes again.
>
> This happens at mount time, where we have already loaded all block groups.
> When we load them, if we find unused ones, we add them to the list of
> empty block groups, so that the next time the cleaner kthread runs it
> deletes them.
>
> I don't see any code here removing the selected block group from that
> list, or anything at btrfs_delete_unused_bgs() that prevents deleting
> a block group if it was selected as the data reloc bg.
>
> Maybe I'm missing something?
> How do ensure the selected block group isn't deleted by the cleaner kthread?
Indeed, I forgot about that.