LinuxLists.cc - [PATCH] ext4: Add support for online resizing with bigalloc.

2017-10-25 04:42:27

Subject: [PATCH] ext4: Add support for online resizing with bigalloc.

This patch adds support for online resizing on bigalloc file systems by
implementing the EXT4_IOC_RESIZE_FS ioctl. Old resize interfaces (add
block groups and extend last block group) are left untouched. Tests
were performed with cluster sizes of 1, 2, 4 and 8 blocks (of size 4k)
per cluster. I will add these tests to xfstests.

Signed-off-by: Harshad Shirwadkar <[email protected]>
---
fs/ext4/ext4.h | 4 +--
fs/ext4/ioctl.c | 6 ----
fs/ext4/mballoc.c | 28 ++++++++-------
fs/ext4/resize.c | 104 ++++++++++++++++++++++++++++++++++--------------------
4 files changed, 84 insertions(+), 58 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b84aa1ca480a..53dcfd808567 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -534,8 +534,8 @@ struct ext4_new_group_data {
__u64 inode_table;
__u32 blocks_count;
__u16 reserved_blocks;
- __u16 unused;
- __u32 free_blocks_count;
+ __u16 mdata_blocks;
+ __u32 free_clusters_count;
};

/* Indexes used to index group tables in ext4_new_group_data */
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 28cc412852af..e165b06767ff 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -702,12 +702,6 @@ group_add_out:
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;

- if (ext4_has_feature_bigalloc(sb)) {
- ext4_msg(sb, KERN_ERR,
- "Online resizing not (yet) supported with bigalloc");
- return -EOPNOTSUPP;
- }
-
if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
sizeof(__u64))) {
return -EFAULT;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c1ab3ec30423..b3922e1c7eec 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4925,8 +4925,11 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- int err = 0, ret, blk_free_count;
- ext4_grpblk_t blocks_freed;
+ int err = 0, ret, free_clusters_count;
+ ext4_grpblk_t clusters_freed;
+ ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
+ ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
+ unsigned long cluster_count = last_cluster - first_cluster + 1;

ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);

@@ -4938,8 +4941,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
* Check to see if we are freeing blocks across a group
* boundary.
*/
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- ext4_warning(sb, "too much blocks added to group %u",
+ if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
+ ext4_warning(sb, "too many blocks added to group %u",
block_group);
err = -EINVAL;
goto error_return;
@@ -4985,14 +4988,14 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
if (err)
goto error_return;

- for (i = 0, blocks_freed = 0; i < count; i++) {
+ for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
ext4_error(sb, "bit already cleared for block %llu",
(ext4_fsblk_t)(block + i));
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
- blocks_freed++;
+ clusters_freed++;
}
}

@@ -5006,19 +5009,20 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
* them with group lock_held
*/
ext4_lock_group(sb, block_group);
- mb_clear_bits(bitmap_bh->b_data, bit, count);
- mb_free_blocks(NULL, &e4b, bit, count);
- blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
- ext4_free_group_clusters_set(sb, desc, blk_free_count);
+ mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
+ mb_free_blocks(NULL, &e4b, bit, cluster_count);
+ free_clusters_count = clusters_freed +
+ ext4_free_group_clusters(sb, desc);
+ ext4_free_group_clusters_set(sb, desc, free_clusters_count);
ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
- EXT4_NUM_B2C(sbi, blocks_freed));
+ clusters_freed);

if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
+ atomic64_add(clusters_freed,
&sbi->s_flex_groups[flex_group].free_clusters);
}

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index cf681004b196..b3c8c7be8838 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -105,7 +105,7 @@ static int verify_group_input(struct super_block *sb,

overhead = ext4_group_overhead_blocks(sb, group);
metaend = start + overhead;
- input->free_blocks_count = free_blocks_count =
+ input->free_clusters_count = free_blocks_count =
input->blocks_count - 2 - overhead - sbi->s_itb_per_group;

if (test_opt(sb, DEBUG))
@@ -256,6 +256,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
ext4_group_t last_group;
unsigned overhead;
__u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
+ int i;

BUG_ON(flex_gd->count == 0 || group_data == NULL);

@@ -292,7 +293,7 @@ next_group:
group_data[bb_index].block_bitmap = start_blk++;
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
- group_data[group].free_blocks_count--;
+ group_data[group].mdata_blocks++;
flex_gd->bg_flags[group] &= uninit_mask;
}

@@ -303,7 +304,7 @@ next_group:
group_data[ib_index].inode_bitmap = start_blk++;
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
- group_data[group].free_blocks_count--;
+ group_data[group].mdata_blocks++;
flex_gd->bg_flags[group] &= uninit_mask;
}

@@ -322,15 +323,22 @@ next_group:
if (start_blk + itb > next_group_start) {
flex_gd->bg_flags[group + 1] &= uninit_mask;
overhead = start_blk + itb - next_group_start;
- group_data[group + 1].free_blocks_count -= overhead;
+ group_data[group + 1].mdata_blocks += overhead;
itb -= overhead;
}

- group_data[group].free_blocks_count -= itb;
+ group_data[group].mdata_blocks += itb;
flex_gd->bg_flags[group] &= uninit_mask;
start_blk += EXT4_SB(sb)->s_itb_per_group;
}

+ /* Update free clusters count to exclude metadata blocks */
+ for (i = 0; i < flex_gd->count; i++) {
+ group_data[i].free_clusters_count -=
+ EXT4_NUM_B2C(EXT4_SB(sb),
+ group_data[i].mdata_blocks);
+ }
+
if (test_opt(sb, DEBUG)) {
int i;
group = group_data[0].group;
@@ -340,12 +348,13 @@ next_group:
flexbg_size);

for (i = 0; i < flex_gd->count; i++) {
- printk(KERN_DEBUG "adding %s group %u: %u "
- "blocks (%d free)\n",
+ ext4_debug(
+ "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n",
ext4_bg_has_super(sb, group + i) ? "normal" :
"no-super", group + i,
group_data[i].blocks_count,
- group_data[i].free_blocks_count);
+ group_data[i].free_clusters_count,
+ group_data[i].mdata_blocks);
}
}
return 0;
@@ -397,7 +406,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh)
}

/*
- * set_flexbg_block_bitmap() mark @count blocks starting from @block used.
+ * set_flexbg_block_bitmap() mark clusters [@first_cluster, @last_cluster] used.
*
* Helper function for ext4_setup_new_group_blocks() which set .
*
@@ -407,22 +416,26 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh)
*/
static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
struct ext4_new_flex_group_data *flex_gd,
- ext4_fsblk_t block, ext4_group_t count)
+ ext4_fsblk_t first_cluster, ext4_fsblk_t last_cluster)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t count = last_cluster - first_cluster + 1;
ext4_group_t count2;

- ext4_debug("mark blocks [%llu/%u] used\n", block, count);
- for (count2 = count; count > 0; count -= count2, block += count2) {
+ ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster,
+ last_cluster);
+ for (count2 = count; count > 0;
+ count -= count2, first_cluster += count2) {
ext4_fsblk_t start;
struct buffer_head *bh;
ext4_group_t group;
int err;

- group = ext4_get_group_number(sb, block);
- start = ext4_group_first_block_no(sb, group);
+ group = ext4_get_group_number(sb, EXT4_C2B(sbi, first_cluster));
+ start = EXT4_B2C(sbi, ext4_group_first_block_no(sb, group));
group -= flex_gd->groups[0].group;

- count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
+ count2 = EXT4_CLUSTERS_PER_GROUP(sb) - (first_cluster - start);
if (count2 > count)
count2 = count;

@@ -443,9 +456,9 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
err = ext4_journal_get_write_access(handle, bh);
if (err)
return err;
- ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
- block - start, count2);
- ext4_set_bits(bh->b_data, block - start, count2);
+ ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n",
+ first_cluster, first_cluster - start, count2);
+ ext4_set_bits(bh->b_data, first_cluster - start, count2);

err = ext4_handle_dirty_metadata(handle, NULL, bh);
if (unlikely(err))
@@ -594,9 +607,10 @@ handle_bb:
if (overhead != 0) {
ext4_debug("mark backup superblock %#04llx (+0)\n",
start);
- ext4_set_bits(bh->b_data, 0, overhead);
+ ext4_set_bits(bh->b_data, 0,
+ EXT4_NUM_B2C(sbi, overhead));
}
- ext4_mark_bitmap_end(group_data[i].blocks_count,
+ ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count),
sb->s_blocksize * 8, bh->b_data);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
if (err)
@@ -641,7 +655,11 @@ handle_ib:
continue;
}
err = set_flexbg_block_bitmap(sb, handle,
- flex_gd, start, count);
+ flex_gd,
+ EXT4_B2C(sbi, start),
+ EXT4_B2C(sbi,
+ start + count
+ - 1));
if (err)
goto out;
count = group_table_count[j];
@@ -651,7 +669,11 @@ handle_ib:

if (count) {
err = set_flexbg_block_bitmap(sb, handle,
- flex_gd, start, count);
+ flex_gd,
+ EXT4_B2C(sbi, start),
+ EXT4_B2C(sbi,
+ start + count
+ - 1));
if (err)
goto out;
}
@@ -839,7 +861,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
ext4_std_error(sb, err);
goto exit_inode;
}
- inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
+ inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >>
+ (9 - EXT4_SB(sb)->s_cluster_bits);
ext4_mark_iloc_dirty(handle, inode, &iloc);
memset(gdb_bh->b_data, 0, sb->s_blocksize);
err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
@@ -934,6 +957,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
{
struct super_block *sb = inode->i_sb;
int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+ int cluster_bits = EXT4_SB(sb)->s_cluster_bits;
struct buffer_head **primary;
struct buffer_head *dind;
struct ext4_iloc iloc;
@@ -1009,7 +1033,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
if (!err)
err = err2;
}
- inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
+
+ inode->i_blocks += reserved_gdb * sb->s_blocksize >> (9 - cluster_bits);
ext4_mark_iloc_dirty(handle, inode, &iloc);

exit_bh:
@@ -1243,7 +1268,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
ext4_group_t group;
__u16 *bg_flags = flex_gd->bg_flags;
int i, gdb_off, gdb_num, err = 0;
-
+

for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
group = group_data->group;
@@ -1270,7 +1295,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,

ext4_inode_table_set(sb, gdp, group_data->inode_table);
ext4_free_group_clusters_set(sb, gdp,
- EXT4_NUM_B2C(sbi, group_data->free_blocks_count));
+ group_data->free_clusters_count);
ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
if (ext4_has_group_desc_csum(sb))
ext4_itable_unused_set(sb, gdp,
@@ -1326,7 +1351,7 @@ static void ext4_update_super(struct super_block *sb,
*/
for (i = 0; i < flex_gd->count; i++) {
blocks_count += group_data[i].blocks_count;
- free_blocks += group_data[i].free_blocks_count;
+ free_blocks += EXT4_C2B(sbi, group_data[i].free_clusters_count);
}

reserved_blocks = ext4_r_blocks_count(es) * 100;
@@ -1498,17 +1523,18 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
ext4_fsblk_t n_blocks_count,
unsigned long flexbg_size)
{
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
struct ext4_new_group_data *group_data = flex_gd->groups;
ext4_fsblk_t o_blocks_count;
ext4_group_t n_group;
ext4_group_t group;
ext4_group_t last_group;
ext4_grpblk_t last;
- ext4_grpblk_t blocks_per_group;
+ ext4_grpblk_t clusters_per_group;
unsigned long i;

- blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
+ clusters_per_group = EXT4_CLUSTERS_PER_GROUP(sb);

o_blocks_count = ext4_blocks_count(es);

@@ -1529,9 +1555,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
int overhead;

group_data[i].group = group + i;
- group_data[i].blocks_count = blocks_per_group;
+ group_data[i].blocks_count = EXT4_BLOCKS_PER_GROUP(sb);
overhead = ext4_group_overhead_blocks(sb, group + i);
- group_data[i].free_blocks_count = blocks_per_group - overhead;
+ group_data[i].mdata_blocks = overhead;
+ group_data[i].free_clusters_count = EXT4_CLUSTERS_PER_GROUP(sb);
if (ext4_has_group_desc_csum(sb)) {
flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
EXT4_BG_INODE_UNINIT;
@@ -1545,10 +1572,10 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
/* We need to initialize block bitmap of last group. */
flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;

- if ((last_group == n_group) && (last != blocks_per_group - 1)) {
- group_data[i - 1].blocks_count = last + 1;
- group_data[i - 1].free_blocks_count -= blocks_per_group-
- last - 1;
+ if ((last_group == n_group) && (last != clusters_per_group - 1)) {
+ group_data[i - 1].blocks_count = EXT4_C2B(sbi, last + 1);
+ group_data[i - 1].free_clusters_count -= clusters_per_group -
+ last - 1;
}

return 1;
@@ -1795,7 +1822,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
}

/* Do a quick sanity check of the resize inode */
- if (inode->i_blocks != 1 << (inode->i_blkbits - 9))
+ if (inode->i_blocks != 1 << (inode->i_blkbits -
+ (9 - sbi->s_cluster_bits)))
goto invalid_resize_inode;
for (i = 0; i < EXT4_N_BLOCKS; i++) {
if (i == EXT4_DIND_BLOCK) {
@@ -1957,7 +1985,7 @@ retry:
if (n_group == o_group)
add = n_blocks_count - o_blocks_count;
else
- add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
+ add = EXT4_C2B(sbi, EXT4_CLUSTERS_PER_GROUP(sb) - (offset + 1));
if (add > 0) {
err = ext4_group_extend_no_check(sb, o_blocks_count, add);
if (err)
--
2.15.0.rc0.271.g36b669edcc-goog

2017-10-29 15:44:22

by Theodore Ts'o

[permalink] [raw]

Subject: Re: [PATCH] ext4: Add support for online resizing with bigalloc.

(If you work for a company which supports enterprise distros, please
keep reading. I have a question for you below.)

On Tue, Oct 24, 2017 at 09:42:08PM -0700, harshads wrote:
> This patch adds support for online resizing on bigalloc file system by
> implementing EXT4_IOC_RESIZE_FS ioctl. Old resize interfaces (add
> block groups and extend last block group) are left untouched. Tests
> performed with cluster sizes of 1, 2, 4 and 8 blocks (of size 4k) per
> cluster. I will add these tests to xfstests.
>
> Signed-off-by: Harshad Shirwadkar <[email protected]>

Can you talk more about what sort of testing you have done? What file
system sizes did you use?

I'm guessing you didn't test the old resize ioctls, because they are
currently broken. I did a quick test because the code paths that are
touched by this patch are also used by the old resize ioctls, and the
following is failing:

mke2fs -t ext4 -Fq /dev/vdc 1G
mount /dev/vdc
export RESIZE2FS_KERNEL_VERSION=3.2.0
strace -o /tmp/st-$RESIZE2FS_KERNEL_VERSION resize2fs /dev/vdc 5G
umount /dev/vdc

From a quick check, it looks like this broke sometime between 4.1
and 4.4. It works on 3.18 and 4.1, but it failed for me in 4.4.

The 4.1 kernel was released in June 2015, and 4.4 kernel was released
in January 2016. E2fsprogs v1.42 started using the new resize ioctl,
which was released in November 2011. So if we take an enterprise
distro which was initially released > 5-6 years ago, and it upgrades
to a kernel newer than 4.1-4.4, the resize2fs from e2fsprogs 1.41.x
(or earlier) will not be able to do online resize.

The question is whether or not we care at this point. We could use
the observation that, due to our lack of good testing, sometime in the
second half of 2015 we broke online resize backwards compatibility with
e2fsprogs from before 2011. Given that no one screamed, perhaps we can
just rip out support for the old resize ioctls entirely.

Or we can try to fix the old resize ioctls, and then add better
resize testing to xfstests.

What do the maintainers from the enterprise distros think?

- Ted

2017-10-30 21:01:48

by harshads

[permalink] [raw]

Subject: Re: [PATCH] ext4: Add support for online resizing with bigalloc.

On Sun, Oct 29, 2017 at 8:06 AM, Theodore Ts'o <[email protected]> wrote:
> (If you work for a company which supports enterprise distros, please
> keep reading. I have a question for you below.)
>
> On Tue, Oct 24, 2017 at 09:42:08PM -0700, harshads wrote:
>> This patch adds support for online resizing on bigalloc file system by
>> implementing EXT4_IOC_RESIZE_FS ioctl. Old resize interfaces (add
>> block groups and extend last block group) are left untouched. Tests
>> performed with cluster sizes of 1, 2, 4 and 8 blocks (of size 4k) per
>> cluster. I will add these tests to xfstests.
>>
>> Signed-off-by: Harshad Shirwadkar <[email protected]>
>
> Can you talk more about what sort of testing you have done? What file
> system sizes did you use?

In my testing, I have tried to ensure that all the different code paths
that the resize IOCTL triggers get executed. At a high level, these
different code paths are: 1) extending the last block group, 2) adding
new block groups, and 3) conversion of the file system to meta-bg.
I performed tests with combinations of different cluster sizes (1, 2,
4, 8 block(s) per cluster) and different file system sizes. The following
CSV table lists all the tests that were performed and were successful
with the new resize IOCTL.

Cluster size (# 4K blocks), Original Size(# clusters), Final Size(# clusters)
1, 16384(64M), 24576(96M)
1, 24576(96M), 32767(128M)
1, 24576(96M), 32768(128M)
1, 24576(96M), 32769(128M)
1, 24576(96M), 49152(192M)
1, 24576(96M), 65536(256M)
1, 24576(96M), 491521(1.9G)
1, 24576(96M), 507904(~1.9G)
1, 24576(96M), 527488(2G)
1, 24576(96M), 5274880(20G)
2, 16384(128M), 24576(192M)
2, 24576(192M), 32767(256M)
2, 24576(192M), 32768(256M)
2, 24576(192M), 32769(256M)
2, 24576(192M), 49152(384M)
2, 24576(192M), 65536(512M)
2, 24576(192M), 491521(3.8G)
2, 24576(192M), 507904(3.8G)
2, 24576(192M), 527488(4G)
2, 24576(192M), 5274880(40G)
4, 16384(256M), 24576(384M)
4, 24576(384M), 32767(512M)
4, 24576(384M), 32768(512M)
4, 24576(384M), 32769(512M)
4, 24576(384M), 49152(768M)
4, 24576(384M), 65536(1G)
4, 24576(384M), 491521(7.6G)
4, 24576(384M), 507904(7.6G)
4, 24576(384M), 527488(8G)
4, 24576(384M), 5274880(80G)
8, 16384(512M), 24576(768M)
8, 24576(768M), 32767(1G)
8, 24576(768M), 32768(1G)
8, 24576(768M), 32769(1G)
8, 24576(768M), 49152(1.5G)
8, 24576(768M), 65536(2G)
8, 24576(768M), 491521(15.2G)
8, 24576(768M), 507904(15.2G)
8, 24576(768M), 527488(16G)
8, 24576(768M), 5274880(160G)

>
> I'm guessing you didn't test the old resize ioctls, because they are
> currently broken. I did a quick test because the code paths that are
> touched by this patch are also used by the old resize ioctls, and the
> following is failing:

Yes, you are right. Also, in the latest kernel, there are code paths
that are present only for the old resize IOCTLs, and I have not touched
them yet. Let's see what others have to say, and then I can fix those
if required.

Thanks,
Harshad.

>
> mke2fs -t ext4 -Fq /dev/vdc 1G
> mount /dev/vdc
> export RESIZE2FS_KERNEL_VERSION=3.2.0
> strace -o /tmp/st-$RESIZE2FS_KERNEL_VERSION resize2fs /dev/vdc 5G
> umount /dev/vdc
>
> From a quick check, and it looks like this broke sometime between 4.1
> and 4.4. It works on 3.18 and 4.1, and but it failed for me in 4.4.
>
> The 4.1 kernel was released in June 2015, and 4.4 kernel was released
> in January 2016. E2fsprogs v1.42 started using the new resize ioctl,
> which was released in November 2011. So if we take an enterprise
> distro which was initially released > 5-6 years ago, and it upgrades
> to a kernel newer than 4.1-4.4, the resize2fs from e2fsprogs 1.41.x
> (or earlier) will not be able to do online resize.
>
> The question is whether or not we care at this point. We could use
> the observation that our lack of good testing, sometime in second half
> of 2015, we broke online resize backwards compatibility with e2fsprogs
> from before 2011, given that no one screamed --- perhaps we can just
> rip out support for the old resize ioctls entirely.
>
> Or we can try to fix the old resize ioctl's, and then add better
> resize testing to xfstests.
>
> What do the maintainers from the enterprise distro's think?
>
> - Ted

2017-10-30 23:14:14

by Darrick J. Wong

[permalink] [raw]

Subject: Re: [PATCH] ext4: Add support for online resizing with bigalloc.

On Mon, Oct 30, 2017 at 02:01:46PM -0700, Harshad Shirwadkar wrote:
> On Sun, Oct 29, 2017 at 8:06 AM, Theodore Ts'o <[email protected]> wrote:
> > (If you work for a company which supports enterprise distros, please
> > keep reading. I have a question for you below.)
> >
> > On Tue, Oct 24, 2017 at 09:42:08PM -0700, harshads wrote:
> >> This patch adds support for online resizing on bigalloc file system by
> >> implementing EXT4_IOC_RESIZE_FS ioctl. Old resize interfaces (add
> >> block groups and extend last block group) are left untouched. Tests
> >> performed with cluster sizes of 1, 2, 4 and 8 blocks (of size 4k) per
> >> cluster. I will add these tests to xfstests.
> >>
> >> Signed-off-by: Harshad Shirwadkar <[email protected]>
> >
> > Can you talk more about what sort of testing you have done? What file
> > system sizes did you use?
>
> In my testing, I have tried to ensure that all different code paths
> that resize IOCTL triggers get executed. At a high level, these
> different code paths are - 1) extending last block group, 2) adding
> new block groups 3) conversion of file system to meta-bg.
> I performed tests with combinations of different cluster sizes (1, 2,
> 4, 8 block(s) per cluster) and different file system sizes. Following
> csv table lists all the tests that were performed and were successful
> with new resize IOCTL.
>
> Cluster size (# 4K blocks), Original Size(# clusters), Final Size(# clusters)
> 1, 16384(64M), 24576(96M)
> 1, 24576(96M), 32767(128M)
> 1, 24576(96M), 32768(128M)
> 1, 24576(96M), 32769(128M)
> 1, 24576(96M), 49152(192M)
> 1, 24576(96M), 65536(256M)
> 1, 24576(96M), 491521(1.9G)
> 1, 24576(96M), 507904(~1.9G)
> 1, 24576(96M), 527488(2G)
> 1, 24576(96M), 5274880(20G)
> 2, 16384(128M), 24576(192M)
> 2, 24576(192M), 32767(256M)
> 2, 24576(192M), 32768(256M)
> 2, 24576(192M), 32769(256M)
> 2, 24576(192M), 49152(384M)
> 2, 24576(192M), 65536(512M)
> 2, 24576(192M), 491521(3.8G)
> 2, 24576(192M), 507904(3.8G)
> 2, 24576(192M), 527488(4G)
> 2, 24576(192M), 5274880(40G)
> 4, 16384(256M), 24576(384M)
> 4, 24576(384M), 32767(512M)
> 4, 24576(384M), 32768(512M)
> 4, 24576(384M), 32769(512M)
> 4, 24576(384M), 49152(768M)
> 4, 24576(384M), 65536(1G)
> 4, 24576(384M), 491521(7.6G)
> 4, 24576(384M), 507904(7.6G)
> 4, 24576(384M), 527488(8G)
> 4, 24576(384M), 5274880(80G)
> 8, 16384(512M), 24576(768M)
> 8, 24576(768M), 32767(1G)
> 8, 24576(768M), 32768(1G)
> 8, 24576(768M), 32769(1G)
> 8, 24576(768M), 49152(1.5G)
> 8, 24576(768M), 65536(2G)
> 8, 24576(768M), 491521(15.2G)
> 8, 24576(768M), 507904(15.2G)
> 8, 24576(768M), 527488(16G)
> 8, 24576(768M), 5274880(160G)
>
> >
> > I'm guessing you didn't test the old resize ioctls, because they are
> > currently broken. I did a quick test because the code paths that are
> > touched by this patch are also used by the old resize ioctls, and the
> > following is failing:
>
> Yes, you are right. Also in the latest kernel, there are code-paths
> that are present only for the old resize IOCTLs and I have not touch
> them yet. Let's see what others have to say and then I can fix those
> if required.
>
> Thanks,
> Harshad.
>
> >
> > mke2fs -t ext4 -Fq /dev/vdc 1G
> > mount /dev/vdc
> > export RESIZE2FS_KERNEL_VERSION=3.2.0
> > strace -o /tmp/st-$RESIZE2FS_KERNEL_VERSION resize2fs /dev/vdc 5G
> > umount /dev/vdc
> >
> > From a quick check, and it looks like this broke sometime between 4.1
> > and 4.4. It works on 3.18 and 4.1, and but it failed for me in 4.4.
> >
> > The 4.1 kernel was released in June 2015, and 4.4 kernel was released
> > in January 2016. E2fsprogs v1.42 started using the new resize ioctl,
> > which was released in November 2011. So if we take an enterprise
> > distro which was initially released > 5-6 years ago, and it upgrades
> > to a kernel newer than 4.1-4.4, the resize2fs from e2fsprogs 1.41.x
> > (or earlier) will not be able to do online resize.
> >
> > The question is whether or not we care at this point. We could use
> > the observation that our lack of good testing, sometime in second half
> > of 2015, we broke online resize backwards compatibility with e2fsprogs
> > from before 2011, given that no one screamed --- perhaps we can just
> > rip out support for the old resize ioctls entirely.

How about deprecating the old resize ioctls if you have any of the new
features that landed in e2fsprogs after online-resize2fs gained the
ability to call the new resize ioctl?

In other words, if your e2fsprogs is new enough to know about bigalloc,
it's new enough to call the new resize ioctl, so we can have the old
ioctl return EOPNOTSUPP if bigalloc is enabled.

--D

> >
> > Or we can try to fix the old resize ioctl's, and then add better
> > resize testing to xfstests.
> >
> > What do the maintainers from the enterprise distro's think?
> >
> > - Ted