2019-08-22 09:37:25

by Li Dongyang

[permalink] [raw]
Subject: [PATCH v2 1/4] libext2fs: optimize ext2fs_convert_subcluster_bitmap()

For a bigalloc filesystem, converting the block bitmap from blocks
to chunks in ext2fs_convert_subcluster_bitmap() can take a long time
when the device is huge, because we test the bitmap
bit-by-bit using ext2fs_test_block_bitmap2().
Use ext2fs_find_first_set_block_bitmap2() which is more efficient
for mke2fs when the fs is mostly empty.

e2fsck can also benefit from this during pass1 block scanning.

Time taken for "mke2fs -O bigalloc,extent -C 131072 -b 4096" on a 1PB
device:

without patch:
real 27m49.457s
user 21m36.474s
sys 6m9.514s

with patch:
real 6m31.908s
user 0m1.806s
sys 6m29.697s

Signed-off-by: Li Dongyang <[email protected]>
---
lib/ext2fs/gen_bitmap64.c | 20 +++++++-------------
1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/lib/ext2fs/gen_bitmap64.c b/lib/ext2fs/gen_bitmap64.c
index 6e4d8b71..f1dd1891 100644
--- a/lib/ext2fs/gen_bitmap64.c
+++ b/lib/ext2fs/gen_bitmap64.c
@@ -799,8 +799,7 @@ errcode_t ext2fs_convert_subcluster_bitmap(ext2_filsys fs,
ext2fs_generic_bitmap_64 bmap, cmap;
ext2fs_block_bitmap gen_bmap = *bitmap, gen_cmap;
errcode_t retval;
- blk64_t i, b_end, c_end;
- int n, ratio;
+ blk64_t i, next, b_end, c_end;

bmap = (ext2fs_generic_bitmap_64) gen_bmap;
if (fs->cluster_ratio_bits == ext2fs_get_bitmap_granularity(gen_bmap))
@@ -817,18 +816,13 @@ errcode_t ext2fs_convert_subcluster_bitmap(ext2_filsys fs,
bmap->end = bmap->real_end;
c_end = cmap->end;
cmap->end = cmap->real_end;
- n = 0;
- ratio = 1 << fs->cluster_ratio_bits;
while (i < bmap->real_end) {
- if (ext2fs_test_block_bitmap2(gen_bmap, i)) {
- ext2fs_mark_block_bitmap2(gen_cmap, i);
- i += ratio - n;
- n = 0;
- continue;
- }
- i++; n++;
- if (n >= ratio)
- n = 0;
+ retval = ext2fs_find_first_set_block_bitmap2(gen_bmap,
+ i, bmap->real_end, &next);
+ if (retval)
+ break;
+ ext2fs_mark_block_bitmap2(gen_cmap, next);
+ i = EXT2FS_C2B(fs, EXT2FS_B2C(fs, next) + 1);
}
bmap->end = b_end;
cmap->end = c_end;
--
2.22.1


2019-08-22 09:37:25

by Li Dongyang

[permalink] [raw]
Subject: [PATCH v2 4/4] mke2fs: set overhead in super block for bigalloc

If overhead is not recorded in the super block, it is calculated
during mount in the kernel; for bigalloc file systems this takes
O(groups**2) time.
For a 1PB device with a 32K cluster size it takes ~12 mins to
mount, with most of the time spent on figuring out overhead.

While we can not improve the overhead algorithm in kernel
due to the nature of bigalloc, we can work out the overhead
during mke2fs and set it in the super block, avoiding calculating
it every time when it mounts.

Overhead is s_first_data_block plus internal journal blocks plus
the block and inode bitmaps, inode table, super block backups and
group descriptor blocks for every group. This patch introduces
ext2fs_count_used_clusters(), which calculates the clusters used
in the block bitmap for the given range.

When bad blocks are involved, it gets tricky because the blocks
counted as overhead and the bad blocks can end up in the same
allocation cluster. In this case we will unmark the bad blocks from
the block bitmap, convert to cluster bitmap and get the overhead,
then mark the bad blocks back in the cluster bitmap.

Signed-off-by: Li Dongyang <[email protected]>
---
lib/ext2fs/ext2fs.h | 2 ++
lib/ext2fs/gen_bitmap64.c | 35 +++++++++++++++++++++++++++
misc/mke2fs.c | 50 ++++++++++++++++++++++++++++++++++++++-
3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index 59fd9742..a8ddb9e4 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -1437,6 +1437,8 @@ errcode_t ext2fs_set_generic_bmap_range(ext2fs_generic_bitmap bmap,
void *in);
errcode_t ext2fs_convert_subcluster_bitmap(ext2_filsys fs,
ext2fs_block_bitmap *bitmap);
+errcode_t ext2fs_count_used_clusters(ext2_filsys fs, blk64_t start,
+ blk64_t end, blk64_t *out);

/* get_num_dirs.c */
extern errcode_t ext2fs_get_num_dirs(ext2_filsys fs, ext2_ino_t *ret_num_dirs);
diff --git a/lib/ext2fs/gen_bitmap64.c b/lib/ext2fs/gen_bitmap64.c
index f1dd1891..b2370667 100644
--- a/lib/ext2fs/gen_bitmap64.c
+++ b/lib/ext2fs/gen_bitmap64.c
@@ -940,3 +940,38 @@ errcode_t ext2fs_find_first_set_generic_bmap(ext2fs_generic_bitmap bitmap,

return ENOENT;
}
+
+errcode_t ext2fs_count_used_clusters(ext2_filsys fs, blk64_t start,
+ blk64_t end, blk64_t *out)
+{
+ blk64_t next;
+ blk64_t tot_set = 0;
+ errcode_t retval;
+
+ while (start < end) {
+ retval = ext2fs_find_first_set_block_bitmap2(fs->block_map,
+ start, end, &next);
+ if (retval) {
+ if (retval == ENOENT)
+ retval = 0;
+ break;
+ }
+ start = next;
+
+ retval = ext2fs_find_first_zero_block_bitmap2(fs->block_map,
+ start, end, &next);
+ if (retval == 0) {
+ tot_set += next - start;
+ start = next + 1;
+ } else if (retval == ENOENT) {
+ retval = 0;
+ tot_set += end - start + 1;
+ break;
+ } else
+ break;
+ }
+
+ if (!retval)
+ *out = EXT2FS_NUM_B2C(fs, tot_set);
+ return retval;
+}
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 30e353d3..1928c9bf 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -2912,6 +2912,8 @@ int main (int argc, char *argv[])
errcode_t retval = 0;
ext2_filsys fs;
badblocks_list bb_list = 0;
+ badblocks_iterate bb_iter;
+ blk_t blk;
unsigned int journal_blocks = 0;
unsigned int i, checkinterval;
int max_mnt_count;
@@ -2922,6 +2924,7 @@ int main (int argc, char *argv[])
char opt_string[40];
char *hash_alg_str;
int itable_zeroed = 0;
+ blk64_t overhead;

#ifdef ENABLE_NLS
setlocale(LC_MESSAGES, "");
@@ -3213,6 +3216,23 @@ int main (int argc, char *argv[])
if (!quiet)
printf("%s", _("done \n"));

+ /*
+ * Unmark bad blocks to calculate overhead, because metadata
+ * blocks and bad blocks can land on the same allocation cluster.
+ */
+ if (bb_list) {
+ retval = ext2fs_badblocks_list_iterate_begin(bb_list,
+ &bb_iter);
+ if (retval) {
+ com_err("ext2fs_badblocks_list_iterate_begin", retval,
+ "%s", _("while unmarking bad blocks"));
+ exit(1);
+ }
+ while (ext2fs_badblocks_list_iterate(bb_iter, &blk))
+ ext2fs_unmark_block_bitmap2(fs->block_map, blk);
+ ext2fs_badblocks_list_iterate_end(bb_iter);
+ }
+
retval = ext2fs_convert_subcluster_bitmap(fs, &fs->block_map);
if (retval) {
com_err(program_name, retval, "%s",
@@ -3220,6 +3240,28 @@ int main (int argc, char *argv[])
exit(1);
}

+ retval = ext2fs_count_used_clusters(fs, fs->super->s_first_data_block,
+ ext2fs_blocks_count(fs->super) - 1,
+ &overhead);
+ if (retval) {
+ com_err(program_name, retval, "%s",
+ _("while calculating overhead"));
+ exit(1);
+ }
+
+ if (bb_list) {
+ retval = ext2fs_badblocks_list_iterate_begin(bb_list,
+ &bb_iter);
+ if (retval) {
+ com_err("ext2fs_badblocks_list_iterate_begin", retval,
+ "%s", _("while marking bad blocks as used"));
+ exit(1);
+ }
+ while (ext2fs_badblocks_list_iterate(bb_iter, &blk))
+ ext2fs_mark_block_bitmap2(fs->block_map, blk);
+ ext2fs_badblocks_list_iterate_end(bb_iter);
+ }
+
if (super_only) {
check_plausibility(device_name, CHECK_FS_EXIST, NULL);
printf(_("%s may be further corrupted by superblock rewrite\n"),
@@ -3317,6 +3359,7 @@ int main (int argc, char *argv[])
free(journal_device);
} else if ((journal_size) ||
ext2fs_has_feature_journal(&fs_param)) {
+ overhead += EXT2FS_NUM_B2C(fs, journal_blocks);
if (super_only) {
printf("%s", _("Skipping journal creation in super-only mode\n"));
fs->super->s_journal_inum = EXT2_JOURNAL_INO;
@@ -3359,8 +3402,13 @@ no_journal:
fs->super->s_mmp_update_interval);
}

- if (ext2fs_has_feature_bigalloc(&fs_param))
+ overhead += fs->super->s_first_data_block;
+
+ if (ext2fs_has_feature_bigalloc(&fs_param)) {
+ if (!super_only)
+ fs->super->s_overhead_clusters = overhead;
fix_cluster_bg_counts(fs);
+ }
if (ext2fs_has_feature_quota(&fs_param))
create_quota_inodes(fs);

--
2.22.1

2019-08-22 09:39:00

by Li Dongyang

[permalink] [raw]
Subject: [PATCH v2 3/4] ext2fs: rename "s_overhead_blocks" to "s_overhead_clusters"

Rename the s_overhead_blocks field in struct ext2_super_block to
s_overhead_clusters to make it consistent with the kernel counterpart.

Signed-off-by: Li Dongyang <[email protected]>
---
debugfs/set_fields.c | 2 +-
lib/e2p/ls.c | 6 +++---
lib/ext2fs/ext2_fs.h | 2 +-
lib/ext2fs/swapfs.c | 2 +-
lib/ext2fs/tst_super_size.c | 2 +-
5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/debugfs/set_fields.c b/debugfs/set_fields.c
index 5142554d..f497bd92 100644
--- a/debugfs/set_fields.c
+++ b/debugfs/set_fields.c
@@ -160,7 +160,7 @@ static struct field_set_info super_fields[] = {
{ "usr_quota_inum", &set_sb.s_usr_quota_inum, NULL, 4, parse_uint },
{ "grp_quota_inum", &set_sb.s_grp_quota_inum, NULL, 4, parse_uint },
{ "prj_quota_inum", &set_sb.s_prj_quota_inum, NULL, 4, parse_uint },
- { "overhead_blocks", &set_sb.s_overhead_blocks, NULL, 4, parse_uint },
+ { "overhead_clusters", &set_sb.s_overhead_clusters, NULL, 4, parse_uint },
{ "backup_bgs", &set_sb.s_backup_bgs[0], NULL, 4, parse_uint,
FLAG_ARRAY, 2 },
{ "checksum", &set_sb.s_checksum, NULL, 4, parse_uint },
diff --git a/lib/e2p/ls.c b/lib/e2p/ls.c
index 5a446178..5ca750f6 100644
--- a/lib/e2p/ls.c
+++ b/lib/e2p/ls.c
@@ -272,9 +272,9 @@ void list_super2(struct ext2_super_block * sb, FILE *f)
fprintf(f, "Inode count: %u\n", sb->s_inodes_count);
fprintf(f, "Block count: %llu\n", e2p_blocks_count(sb));
fprintf(f, "Reserved block count: %llu\n", e2p_r_blocks_count(sb));
- if (sb->s_overhead_blocks)
- fprintf(f, "Overhead blocks: %u\n",
- sb->s_overhead_blocks);
+ if (sb->s_overhead_clusters)
+ fprintf(f, "Overhead clusters: %u\n",
+ sb->s_overhead_clusters);
fprintf(f, "Free blocks: %llu\n", e2p_free_blocks_count(sb));
fprintf(f, "Free inodes: %u\n", sb->s_free_inodes_count);
fprintf(f, "First block: %u\n", sb->s_first_data_block);
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index cbb44bdb..5737dc61 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -742,7 +742,7 @@ struct ext2_super_block {
/*200*/ __u8 s_mount_opts[64];
/*240*/ __u32 s_usr_quota_inum; /* inode number of user quota file */
__u32 s_grp_quota_inum; /* inode number of group quota file */
- __u32 s_overhead_blocks; /* overhead blocks/clusters in fs */
+ __u32 s_overhead_clusters; /* overhead blocks/clusters in fs */
/*24c*/ __u32 s_backup_bgs[2]; /* If sparse_super2 enabled */
/*254*/ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
/*258*/ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
diff --git a/lib/ext2fs/swapfs.c b/lib/ext2fs/swapfs.c
index a1560045..63b24330 100644
--- a/lib/ext2fs/swapfs.c
+++ b/lib/ext2fs/swapfs.c
@@ -121,7 +121,7 @@ void ext2fs_swap_super(struct ext2_super_block * sb)
/* sb->s_mount_opts is __u8 and does not need swabbing */
sb->s_usr_quota_inum = ext2fs_swab32(sb->s_usr_quota_inum);
sb->s_grp_quota_inum = ext2fs_swab32(sb->s_grp_quota_inum);
- sb->s_overhead_blocks = ext2fs_swab32(sb->s_overhead_blocks);
+ sb->s_overhead_clusters = ext2fs_swab32(sb->s_overhead_clusters);
sb->s_backup_bgs[0] = ext2fs_swab32(sb->s_backup_bgs[0]);
sb->s_backup_bgs[1] = ext2fs_swab32(sb->s_backup_bgs[1]);
/* sb->s_encrypt_algos is __u8 and does not need swabbing */
diff --git a/lib/ext2fs/tst_super_size.c b/lib/ext2fs/tst_super_size.c
index a932685d..ab38dd59 100644
--- a/lib/ext2fs/tst_super_size.c
+++ b/lib/ext2fs/tst_super_size.c
@@ -135,7 +135,7 @@ int main(int argc, char **argv)
check_field(s_mount_opts, 64);
check_field(s_usr_quota_inum, 4);
check_field(s_grp_quota_inum, 4);
- check_field(s_overhead_blocks, 4);
+ check_field(s_overhead_clusters, 4);
check_field(s_backup_bgs, 8);
check_field(s_encrypt_algos, 4);
check_field(s_encrypt_pw_salt, 16);
--
2.22.1

2019-08-26 03:16:59

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH v2 3/4] ext2fs: rename "s_overhead_blocks" to "s_overhead_clusters"

On Aug 22, 2019, at 2:26 AM, Dongyang Li <[email protected]> wrote:
>
> Rename s_overhead_blocks field from struct ext2_super_block to
> make it consistent with the kernel counterpart.
>
> Signed-off-by: Li Dongyang <[email protected]>

Reviewed-by: Andreas Dilger <[email protected]>

with one minor comment/question below...

> ---
> debugfs/set_fields.c | 2 +-
> lib/e2p/ls.c | 6 +++---
> lib/ext2fs/ext2_fs.h | 2 +-
> lib/ext2fs/swapfs.c | 2 +-
> lib/ext2fs/tst_super_size.c | 2 +-
> 5 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/debugfs/set_fields.c b/debugfs/set_fields.c
> index 5142554d..f497bd92 100644
> --- a/debugfs/set_fields.c
> +++ b/debugfs/set_fields.c
> @@ -160,7 +160,7 @@ static struct field_set_info super_fields[] = {
> { "usr_quota_inum", &set_sb.s_usr_quota_inum, NULL, 4, parse_uint },
> { "grp_quota_inum", &set_sb.s_grp_quota_inum, NULL, 4, parse_uint },
> { "prj_quota_inum", &set_sb.s_prj_quota_inum, NULL, 4, parse_uint },
> - { "overhead_blocks", &set_sb.s_overhead_blocks, NULL, 4, parse_uint },
> + { "overhead_clusters", &set_sb.s_overhead_clusters, NULL, 4, parse_uint },

Should we consider keeping the "overhead_blocks" name for compatibility? It
should be listed second, after "overhead_clusters", maybe with a comment.

> { "backup_bgs", &set_sb.s_backup_bgs[0], NULL, 4, parse_uint,
> FLAG_ARRAY, 2 },
> { "checksum", &set_sb.s_checksum, NULL, 4, parse_uint },
> diff --git a/lib/e2p/ls.c b/lib/e2p/ls.c
> index 5a446178..5ca750f6 100644
> --- a/lib/e2p/ls.c
> +++ b/lib/e2p/ls.c
> @@ -272,9 +272,9 @@ void list_super2(struct ext2_super_block * sb, FILE *f)
> fprintf(f, "Inode count: %u\n", sb->s_inodes_count);
> fprintf(f, "Block count: %llu\n", e2p_blocks_count(sb));
> fprintf(f, "Reserved block count: %llu\n", e2p_r_blocks_count(sb));
> - if (sb->s_overhead_blocks)
> - fprintf(f, "Overhead blocks: %u\n",
> - sb->s_overhead_blocks);
> + if (sb->s_overhead_clusters)
> + fprintf(f, "Overhead clusters: %u\n",
> + sb->s_overhead_clusters);
> fprintf(f, "Free blocks: %llu\n", e2p_free_blocks_count(sb));
> fprintf(f, "Free inodes: %u\n", sb->s_free_inodes_count);
> fprintf(f, "First block: %u\n", sb->s_first_data_block);
> diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
> index cbb44bdb..5737dc61 100644
> --- a/lib/ext2fs/ext2_fs.h
> +++ b/lib/ext2fs/ext2_fs.h
> @@ -742,7 +742,7 @@ struct ext2_super_block {
> /*200*/ __u8 s_mount_opts[64];
> /*240*/ __u32 s_usr_quota_inum; /* inode number of user quota file */
> __u32 s_grp_quota_inum; /* inode number of group quota file */
> - __u32 s_overhead_blocks; /* overhead blocks/clusters in fs */
> + __u32 s_overhead_clusters; /* overhead blocks/clusters in fs */
> /*24c*/ __u32 s_backup_bgs[2]; /* If sparse_super2 enabled */
> /*254*/ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
> /*258*/ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
> diff --git a/lib/ext2fs/swapfs.c b/lib/ext2fs/swapfs.c
> index a1560045..63b24330 100644
> --- a/lib/ext2fs/swapfs.c
> +++ b/lib/ext2fs/swapfs.c
> @@ -121,7 +121,7 @@ void ext2fs_swap_super(struct ext2_super_block * sb)
> /* sb->s_mount_opts is __u8 and does not need swabbing */
> sb->s_usr_quota_inum = ext2fs_swab32(sb->s_usr_quota_inum);
> sb->s_grp_quota_inum = ext2fs_swab32(sb->s_grp_quota_inum);
> - sb->s_overhead_blocks = ext2fs_swab32(sb->s_overhead_blocks);
> + sb->s_overhead_clusters = ext2fs_swab32(sb->s_overhead_clusters);
> sb->s_backup_bgs[0] = ext2fs_swab32(sb->s_backup_bgs[0]);
> sb->s_backup_bgs[1] = ext2fs_swab32(sb->s_backup_bgs[1]);
> /* sb->s_encrypt_algos is __u8 and does not need swabbing */
> diff --git a/lib/ext2fs/tst_super_size.c b/lib/ext2fs/tst_super_size.c
> index a932685d..ab38dd59 100644
> --- a/lib/ext2fs/tst_super_size.c
> +++ b/lib/ext2fs/tst_super_size.c
> @@ -135,7 +135,7 @@ int main(int argc, char **argv)
> check_field(s_mount_opts, 64);
> check_field(s_usr_quota_inum, 4);
> check_field(s_grp_quota_inum, 4);
> - check_field(s_overhead_blocks, 4);
> + check_field(s_overhead_clusters, 4);
> check_field(s_backup_bgs, 8);
> check_field(s_encrypt_algos, 4);
> check_field(s_encrypt_pw_salt, 16);
> --
> 2.22.1
>


Cheers, Andreas






Attachments:
signature.asc (890.00 B)
Message signed with OpenPGP

2019-08-26 03:17:00

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] libext2fs: optimize ext2fs_convert_subcluster_bitmap()

On Aug 22, 2019, at 2:26 AM, Dongyang Li <[email protected]> wrote:
>
> For a bigalloc filesystem, converting the block bitmap from blocks
> to chunks in ext2fs_convert_subcluster_bitmap() can take a long time
> when the device is huge, because we test the bitmap
> bit-by-bit using ext2fs_test_block_bitmap2().
> Use ext2fs_find_first_set_block_bitmap2() which is more efficient
> for mke2fs when the fs is mostly empty.
>
> e2fsck can also benefit from this during pass1 block scanning.
>
> Time taken for "mke2fs -O bigalloc,extent -C 131072 -b 4096" on a 1PB
> device:
>
> without patch:
> real 27m49.457s
> user 21m36.474s
> sys 6m9.514s
>
> with patch:
> real 6m31.908s
> user 0m1.806s
> sys 6m29.697s
>
> Signed-off-by: Li Dongyang <[email protected]>

Reviewed-by: Andreas Dilger <[email protected]>

> ---
> lib/ext2fs/gen_bitmap64.c | 20 +++++++-------------
> 1 file changed, 7 insertions(+), 13 deletions(-)
>
> diff --git a/lib/ext2fs/gen_bitmap64.c b/lib/ext2fs/gen_bitmap64.c
> index 6e4d8b71..f1dd1891 100644
> --- a/lib/ext2fs/gen_bitmap64.c
> +++ b/lib/ext2fs/gen_bitmap64.c
> @@ -799,8 +799,7 @@ errcode_t ext2fs_convert_subcluster_bitmap(ext2_filsys fs,
> ext2fs_generic_bitmap_64 bmap, cmap;
> ext2fs_block_bitmap gen_bmap = *bitmap, gen_cmap;
> errcode_t retval;
> - blk64_t i, b_end, c_end;
> - int n, ratio;
> + blk64_t i, next, b_end, c_end;
>
> bmap = (ext2fs_generic_bitmap_64) gen_bmap;
> if (fs->cluster_ratio_bits == ext2fs_get_bitmap_granularity(gen_bmap))
> @@ -817,18 +816,13 @@ errcode_t ext2fs_convert_subcluster_bitmap(ext2_filsys fs,
> bmap->end = bmap->real_end;
> c_end = cmap->end;
> cmap->end = cmap->real_end;
> - n = 0;
> - ratio = 1 << fs->cluster_ratio_bits;
> while (i < bmap->real_end) {
> - if (ext2fs_test_block_bitmap2(gen_bmap, i)) {
> - ext2fs_mark_block_bitmap2(gen_cmap, i);
> - i += ratio - n;
> - n = 0;
> - continue;
> - }
> - i++; n++;
> - if (n >= ratio)
> - n = 0;
> + retval = ext2fs_find_first_set_block_bitmap2(gen_bmap,
> + i, bmap->real_end, &next);
> + if (retval)
> + break;
> + ext2fs_mark_block_bitmap2(gen_cmap, next);
> + i = EXT2FS_C2B(fs, EXT2FS_B2C(fs, next) + 1);
> }
> bmap->end = b_end;
> cmap->end = c_end;
> --
> 2.22.1
>


Cheers, Andreas






Attachments:
signature.asc (890.00 B)
Message signed with OpenPGP

2019-08-26 03:38:13

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH v2 4/4] mke2fs: set overhead in super block for bigalloc

On Aug 22, 2019, at 2:26 AM, Dongyang Li <[email protected]> wrote:
>
> If overhead is not recorded in the super block, it is caculated
> during mount in kernel, for bigalloc file systems the it takes
> O(groups**2) in time.
> For a 1PB deivce with 32K cluste size it takes ~12 mins to
> mount, with most of the time spent on figuring out overhead.
>
> While we can not improve the overhead algorithm in kernel
> due to the nature of bigalloc, we can work out the overhead
> during mke2fs and set it in the super block, avoiding calculating
> it every time when it mounts.

It would also be good to get an ext4 patch to save the calculated
overhead to s_overhead_clusters if the kernel finds it unset?
That isn't any less accurate than recomputing it each time, and
avoids extra overhead on each mount for filesystems that did not
get it set at mke2fs time.

> Overhead is s_first_data_block plus internal journal blocks plus
> the block and inode bitmaps, inode table, super block backups and
> group descriptor blocks for every group. This patch introduces
> ext2fs_count_used_clusters(), which calculates the clusters used
> in the block bitmap for the given range.
>
> When bad blocks are involved, it gets tricky because the blocks
> counted as overhead and the bad blocks can end up in the same
> allocation cluster.

On the other hand, would it be wrong if the bad blocks are stored
in "s_overhead_clusters"?

> In this case we will unmark the bad blocks from
> the block bitmap, covert to cluster bitmap and get the overhead,

(typo) "convert"

> then mark the bad blocks back in the cluster bitmap.

In this case, should the bad block numbers be converted to
clusters during the second iteration?

> Signed-off-by: Li Dongyang <[email protected]>
> ---
> lib/ext2fs/ext2fs.h | 2 ++
> lib/ext2fs/gen_bitmap64.c | 35 +++++++++++++++++++++++++++
> misc/mke2fs.c | 50 ++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 86 insertions(+), 1 deletion(-)
>
> diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
> index 59fd9742..a8ddb9e4 100644
> --- a/lib/ext2fs/ext2fs.h
> +++ b/lib/ext2fs/ext2fs.h
> @@ -1437,6 +1437,8 @@ errcode_t ext2fs_set_generic_bmap_range(ext2fs_generic_bitmap bmap,
> void *in);
> errcode_t ext2fs_convert_subcluster_bitmap(ext2_filsys fs,
> ext2fs_block_bitmap *bitmap);
> +errcode_t ext2fs_count_used_clusters(ext2_filsys fs, blk64_t start,
> + blk64_t end, blk64_t *out);
>
> /* get_num_dirs.c */
> extern errcode_t ext2fs_get_num_dirs(ext2_filsys fs, ext2_ino_t *ret_num_dirs);
> diff --git a/lib/ext2fs/gen_bitmap64.c b/lib/ext2fs/gen_bitmap64.c
> index f1dd1891..b2370667 100644
> --- a/lib/ext2fs/gen_bitmap64.c
> +++ b/lib/ext2fs/gen_bitmap64.c
> @@ -940,3 +940,38 @@ errcode_t ext2fs_find_first_set_generic_bmap(ext2fs_generic_bitmap bitmap,
>
> return ENOENT;
> }
> +
> +errcode_t ext2fs_count_used_clusters(ext2_filsys fs, blk64_t start,
> + blk64_t end, blk64_t *out)
> +{
> + blk64_t next;
> + blk64_t tot_set = 0;
> + errcode_t retval;
> +
> + while (start < end) {
> + retval = ext2fs_find_first_set_block_bitmap2(fs->block_map,
> + start, end, &next);
> + if (retval) {
> + if (retval == ENOENT)
> + retval = 0;
> + break;
> + }
> + start = next;
> +
> + retval = ext2fs_find_first_zero_block_bitmap2(fs->block_map,
> + start, end, &next);
> + if (retval == 0) {
> + tot_set += next - start;
> + start = next + 1;
> + } else if (retval == ENOENT) {
> + retval = 0;
> + tot_set += end - start + 1;
> + break;
> + } else
> + break;
> + }
> +
> + if (!retval)
> + *out = EXT2FS_NUM_B2C(fs, tot_set);
> + return retval;
> +}
> diff --git a/misc/mke2fs.c b/misc/mke2fs.c
> index 30e353d3..1928c9bf 100644
> --- a/misc/mke2fs.c
> +++ b/misc/mke2fs.c
> @@ -2912,6 +2912,8 @@ int main (int argc, char *argv[])
> errcode_t retval = 0;
> ext2_filsys fs;
> badblocks_list bb_list = 0;
> + badblocks_iterate bb_iter;
> + blk_t blk;
> unsigned int journal_blocks = 0;
> unsigned int i, checkinterval;
> int max_mnt_count;
> @@ -2922,6 +2924,7 @@ int main (int argc, char *argv[])
> char opt_string[40];
> char *hash_alg_str;
> int itable_zeroed = 0;
> + blk64_t overhead;
>
> #ifdef ENABLE_NLS
> setlocale(LC_MESSAGES, "");
> @@ -3213,6 +3216,23 @@ int main (int argc, char *argv[])
> if (!quiet)
> printf("%s", _("done \n"));
>
> + /*
> + * Unmark bad blocks to calculate overhead, because metadata
> + * blocks and bad blocks can land on the same allocation cluster.
> + */
> + if (bb_list) {
> + retval = ext2fs_badblocks_list_iterate_begin(bb_list,
> + &bb_iter);
> + if (retval) {
> + com_err("ext2fs_badblocks_list_iterate_begin", retval,
> + "%s", _("while unmarking bad blocks"));
> + exit(1);
> + }
> + while (ext2fs_badblocks_list_iterate(bb_iter, &blk))
> + ext2fs_unmark_block_bitmap2(fs->block_map, blk);
> + ext2fs_badblocks_list_iterate_end(bb_iter);
> + }
> +
> retval = ext2fs_convert_subcluster_bitmap(fs, &fs->block_map);
> if (retval) {
> com_err(program_name, retval, "%s",
> @@ -3220,6 +3240,28 @@ int main (int argc, char *argv[])
> exit(1);
> }
>
> + retval = ext2fs_count_used_clusters(fs, fs->super->s_first_data_block,
> + ext2fs_blocks_count(fs->super) - 1,
> + &overhead);
> + if (retval) {
> + com_err(program_name, retval, "%s",
> + _("while calculating overhead"));
> + exit(1);
> + }
> +
> + if (bb_list) {
> + retval = ext2fs_badblocks_list_iterate_begin(bb_list,
> + &bb_iter);
> + if (retval) {
> + com_err("ext2fs_badblocks_list_iterate_begin", retval,
> + "%s", _("while marking bad blocks as used"));
> + exit(1);
> + }
> + while (ext2fs_badblocks_list_iterate(bb_iter, &blk))
> + ext2fs_mark_block_bitmap2(fs->block_map, blk);
> + ext2fs_badblocks_list_iterate_end(bb_iter);
> + }
> +
> if (super_only) {
> check_plausibility(device_name, CHECK_FS_EXIST, NULL);
> printf(_("%s may be further corrupted by superblock rewrite\n"),
> @@ -3317,6 +3359,7 @@ int main (int argc, char *argv[])
> free(journal_device);
> } else if ((journal_size) ||
> ext2fs_has_feature_journal(&fs_param)) {
> + overhead += EXT2FS_NUM_B2C(fs, journal_blocks);
> if (super_only) {
> printf("%s", _("Skipping journal creation in super-only mode\n"));
> fs->super->s_journal_inum = EXT2_JOURNAL_INO;
> @@ -3359,8 +3402,13 @@ no_journal:
> fs->super->s_mmp_update_interval);
> }
>
> - if (ext2fs_has_feature_bigalloc(&fs_param))
> + overhead += fs->super->s_first_data_block;
> +
> + if (ext2fs_has_feature_bigalloc(&fs_param)) {
> + if (!super_only)
> + fs->super->s_overhead_clusters = overhead;
> fix_cluster_bg_counts(fs);
> + }

Should we consider always storing the overhead value into the superblock,
regardless of whether bigalloc is enabled or not?

Cheers, Andreas






Attachments:
signature.asc (890.00 B)
Message signed with OpenPGP

2019-08-26 05:57:25

by Li Dongyang

[permalink] [raw]
Subject: Re: [PATCH v2 4/4] mke2fs: set overhead in super block for bigalloc

On Sun, 2019-08-25 at 21:29 -0600, Andreas Dilger wrote:
> On Aug 22, 2019, at 2:26 AM, Dongyang Li <[email protected]> wrote:
> > If overhead is not recorded in the super block, it is caculated
> > during mount in kernel, for bigalloc file systems the it takes
> > O(groups**2) in time.
> > For a 1PB deivce with 32K cluste size it takes ~12 mins to
> > mount, with most of the time spent on figuring out overhead.
> >
> > While we can not improve the overhead algorithm in kernel
> > due to the nature of bigalloc, we can work out the overhead
> > during mke2fs and set it in the super block, avoiding calculating
> > it every time when it mounts.
>
> It would also be good to get an ext4 patch to save the calculated
> overhead to s_overhead_clusters if the kernel finds it unset?
> That isn't any less accurate than recomputing it each time, and
> avoids extra overhead on each mount for filesystems that did not
> get it set at mke2fs time.
Sounds good, we also need to update the overhead when resize happens.
>
> > Overhead is s_first_data_block plus internal journal blocks plus
> > the block and inode bitmaps, inode table, super block backups and
> > group descriptor blocks for every group. This patch introduces
> > ext2fs_count_used_clusters(), which calculates the clusters used
> > in the block bitmap for the given range.
> >
> > When bad blocks are involved, it gets tricky because the blocks
> > counted as overhead and the bad blocks can end up in the same
> > allocation cluster.
>
> On the other hand, would it be wrong if the bad blocks are stored
> in "s_overhead_clusters"?
IMHO the bad blocks are considered as used blocks, overhead is the
filesystem structures, so they are different.
Someone please correct me if I'm wrong, considering bad blocks as
overhead will make this heaps easier.
>
> > In this case we will unmark the bad blocks from
> > the block bitmap, covert to cluster bitmap and get the overhead,
>
> (typo) "convert"
>
> > then mark the bad blocks back in the cluster bitmap.
>
> In this case, should the bad block numbers be converted to
> clusters during the second iteration?
ext2fs_mark_generic_bmap() will do that for us.
>
> > Signed-off-by: Li Dongyang <[email protected]>
> > ---
> > lib/ext2fs/ext2fs.h | 2 ++
> > lib/ext2fs/gen_bitmap64.c | 35 +++++++++++++++++++++++++++
> > misc/mke2fs.c | 50
> > ++++++++++++++++++++++++++++++++++++++-
> > 3 files changed, 86 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
> > index 59fd9742..a8ddb9e4 100644
> > --- a/lib/ext2fs/ext2fs.h
> > +++ b/lib/ext2fs/ext2fs.h
> > @@ -1437,6 +1437,8 @@ errcode_t
> > ext2fs_set_generic_bmap_range(ext2fs_generic_bitmap bmap,
> > void *in);
> > errcode_t ext2fs_convert_subcluster_bitmap(ext2_filsys fs,
> > ext2fs_block_bitmap
> > *bitmap);
> > +errcode_t ext2fs_count_used_clusters(ext2_filsys fs, blk64_t
> > start,
> > + blk64_t end, blk64_t *out);
> >
> > /* get_num_dirs.c */
> > extern errcode_t ext2fs_get_num_dirs(ext2_filsys fs, ext2_ino_t
> > *ret_num_dirs);
> > diff --git a/lib/ext2fs/gen_bitmap64.c b/lib/ext2fs/gen_bitmap64.c
> > index f1dd1891..b2370667 100644
> > --- a/lib/ext2fs/gen_bitmap64.c
> > +++ b/lib/ext2fs/gen_bitmap64.c
> > @@ -940,3 +940,38 @@ errcode_t
> > ext2fs_find_first_set_generic_bmap(ext2fs_generic_bitmap bitmap,
> >
> > return ENOENT;
> > }
> > +
> > +errcode_t ext2fs_count_used_clusters(ext2_filsys fs, blk64_t
> > start,
> > + blk64_t end, blk64_t *out)
> > +{
> > + blk64_t next;
> > + blk64_t tot_set = 0;
> > + errcode_t retval;
> > +
> > + while (start < end) {
> > + retval = ext2fs_find_first_set_block_bitmap2(fs-
> > >block_map,
> > + start, end,
> > &next);
> > + if (retval) {
> > + if (retval == ENOENT)
> > + retval = 0;
> > + break;
> > + }
> > + start = next;
> > +
> > + retval = ext2fs_find_first_zero_block_bitmap2(fs-
> > >block_map,
> > + start, end,
> > &next);
> > + if (retval == 0) {
> > + tot_set += next - start;
> > + start = next + 1;
> > + } else if (retval == ENOENT) {
> > + retval = 0;
> > + tot_set += end - start + 1;
> > + break;
> > + } else
> > + break;
> > + }
> > +
> > + if (!retval)
> > + *out = EXT2FS_NUM_B2C(fs, tot_set);
> > + return retval;
> > +}
> > diff --git a/misc/mke2fs.c b/misc/mke2fs.c
> > index 30e353d3..1928c9bf 100644
> > --- a/misc/mke2fs.c
> > +++ b/misc/mke2fs.c
> > @@ -2912,6 +2912,8 @@ int main (int argc, char *argv[])
> > errcode_t retval = 0;
> > ext2_filsys fs;
> > badblocks_list bb_list = 0;
> > + badblocks_iterate bb_iter;
> > + blk_t blk;
> > unsigned int journal_blocks = 0;
> > unsigned int i, checkinterval;
> > int max_mnt_count;
> > @@ -2922,6 +2924,7 @@ int main (int argc, char *argv[])
> > char opt_string[40];
> > char *hash_alg_str;
> > int itable_zeroed = 0;
> > + blk64_t overhead;
> >
> > #ifdef ENABLE_NLS
> > setlocale(LC_MESSAGES, "");
> > @@ -3213,6 +3216,23 @@ int main (int argc, char *argv[])
> > if (!quiet)
> > printf("%s", _("done \n"));
> >
> > + /*
> > + * Unmark bad blocks to calculate overhead, because metadata
> > + * blocks and bad blocks can land on the same allocation
> > cluster.
> > + */
> > + if (bb_list) {
> > + retval = ext2fs_badblocks_list_iterate_begin(bb_list,
> > + &bb_iter);
> > + if (retval) {
> > + com_err("ext2fs_badblocks_list_iterate_begin",
> > retval,
> > + "%s", _("while unmarking bad blocks"));
> > + exit(1);
> > + }
> > + while (ext2fs_badblocks_list_iterate(bb_iter, &blk))
> > + ext2fs_unmark_block_bitmap2(fs->block_map,
> > blk);
> > + ext2fs_badblocks_list_iterate_end(bb_iter);
> > + }
> > +
> > retval = ext2fs_convert_subcluster_bitmap(fs, &fs->block_map);
> > if (retval) {
> > com_err(program_name, retval, "%s",
> > @@ -3220,6 +3240,28 @@ int main (int argc, char *argv[])
> > exit(1);
> > }
> >
> > + retval = ext2fs_count_used_clusters(fs, fs->super-
> > >s_first_data_block,
> > + ext2fs_blocks_count(fs->super)
> > - 1,
> > + &overhead);
> > + if (retval) {
> > + com_err(program_name, retval, "%s",
> > + _("while calculating overhead"));
> > + exit(1);
> > + }
> > +
> > + if (bb_list) {
> > + retval = ext2fs_badblocks_list_iterate_begin(bb_list,
> > + &bb_iter);
> > + if (retval) {
> > + com_err("ext2fs_badblocks_list_iterate_begin",
> > retval,
> > + "%s", _("while marking bad blocks as
> > used"));
> > + exit(1);
> > + }
> > + while (ext2fs_badblocks_list_iterate(bb_iter, &blk))
> > + ext2fs_mark_block_bitmap2(fs->block_map, blk);
> > + ext2fs_badblocks_list_iterate_end(bb_iter);
> > + }
> > +
> > if (super_only) {
> > check_plausibility(device_name, CHECK_FS_EXIST, NULL);
> > printf(_("%s may be further corrupted by superblock
> > rewrite\n"),
> > @@ -3317,6 +3359,7 @@ int main (int argc, char *argv[])
> > free(journal_device);
> > } else if ((journal_size) ||
> > ext2fs_has_feature_journal(&fs_param)) {
> > + overhead += EXT2FS_NUM_B2C(fs, journal_blocks);
> > if (super_only) {
> > printf("%s", _("Skipping journal creation in
> > super-only mode\n"));
> > fs->super->s_journal_inum = EXT2_JOURNAL_INO;
> > @@ -3359,8 +3402,13 @@ no_journal:
> > fs->super->s_mmp_update_interval);
> > }
> >
> > - if (ext2fs_has_feature_bigalloc(&fs_param))
> > + overhead += fs->super->s_first_data_block;
> > +
> > + if (ext2fs_has_feature_bigalloc(&fs_param)) {
> > + if (!super_only)
> > + fs->super->s_overhead_clusters = overhead;
> > fix_cluster_bg_counts(fs);
> > + }
>
> Should we consider to always store the overhead value into the
> superblock,
> regardless of whether bigalloc is enabled or not?
>
> Cheers, Andreas
>
>
>
>
>