From: Jose R. Santos <[email protected]>
New bitmap and inode table allocation for FLEX_BG
Change the way we allocate bitmaps and inode tables if the FLEX_BG
feature is used at mke2fs time. It places calculates a new offset for
bitmaps and inode table base on the number of groups that the user
wishes to pack together using the new "-G" option. Creating a
filesystem with 64 block groups in a flex group can be done by:
mke2fs -j -I 256 -O flex_bg -G 32 /dev/sdX
Signed-off-by: Jose R. Santos <[email protected]>
Signed-off-by: Valerie Clement <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
--
lib/ext2fs/alloc_tables.c | 110 ++++++++++++++++++++++++++++++++++++++++++++-
lib/ext2fs/ext2_fs.h | 5 ++
lib/ext2fs/initialize.c | 9 +++-
misc/mke2fs.8.in | 15 ++++++
misc/mke2fs.c | 51 +++++++++++++++++++--
5 files changed, 181 insertions(+), 9 deletions(-)
diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
index 9b4f0e5..c9f5e59 100644
--- a/lib/ext2fs/alloc_tables.c
+++ b/lib/ext2fs/alloc_tables.c
@@ -27,18 +27,79 @@
#include "ext2_fs.h"
#include "ext2fs.h"
+/*
+ * This routine searches for free blocks that can allocate a full
+ * group of bitmaps or inode tables for a flexbg group. Returns the
+ * block number with a correct offset were the bitmaps and inode
+ * tables can be allocated continously and in order.
+ */
+static blk_t flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
+ ext2fs_block_bitmap bmap, int offset, int size)
+{
+ int flexbg, flexbg_size, elem_size;
+ blk_t last_blk, first_free = 0;
+ dgrp_t last_grp;
+
+ flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+ flexbg = group / flexbg_size;
+
+ if (size > fs->super->s_blocks_per_group / 8)
+ size = fs->super->s_blocks_per_group / 8;
+
+ /*
+ * Dont do a long search if the previous block
+ * search is still valid.
+ */
+ if (start_blk && group % flexbg_size) {
+ if (size > flexbg_size)
+ elem_size = fs->inode_blocks_per_group;
+ else
+ elem_size = 1;
+ if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
+ size))
+ return start_blk + elem_size;
+ }
+
+ start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
+ last_grp = group | (flexbg_size - 1);
+ if (last_grp > fs->group_desc_count)
+ last_grp = fs->group_desc_count;
+ last_blk = ext2fs_group_last_block(fs, last_grp);
+
+ /* Find the first available block */
+ if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
+ &first_free))
+ return first_free;
+
+ if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
+ bmap, &first_free))
+ return first_free;
+
+ return first_free;
+}
+
errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
ext2fs_block_bitmap bmap)
{
errcode_t retval;
blk_t group_blk, start_blk, last_blk, new_blk, blk;
- int j;
+ dgrp_t last_grp;
+ int j, rem_grps, flexbg_size = 0;
group_blk = ext2fs_group_first_block(fs, group);
last_blk = ext2fs_group_last_block(fs, group);
if (!bmap)
bmap = fs->block_map;
+
+ if (EXT2_HAS_INCOMPAT_FEATURE(fs->super,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+ last_grp = group | (flexbg_size - 1);
+ rem_grps = last_grp - group;
+ if (last_grp > fs->group_desc_count)
+ last_grp = fs->group_desc_count;
+ }
/*
* Allocate the block and inode bitmaps, if necessary
@@ -56,6 +117,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
} else
start_blk = group_blk;
+ if (flexbg_size) {
+ int prev_block = 0;
+ if (group && fs->group_desc[group-1].bg_block_bitmap)
+ prev_block = fs->group_desc[group-1].bg_block_bitmap;
+ start_blk = flexbg_offset(fs, group, prev_block, bmap,
+ 0, rem_grps);
+ last_blk = ext2fs_group_last_block(fs, last_grp);
+ }
+
if (!fs->group_desc[group].bg_block_bitmap) {
retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
1, bmap, &new_blk);
@@ -66,6 +136,20 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
return retval;
ext2fs_mark_block_bitmap(bmap, new_blk);
fs->group_desc[group].bg_block_bitmap = new_blk;
+ if (flexbg_size) {
+ dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+ fs->group_desc[tmp].bg_free_blocks_count--;
+ fs->super->s_free_blocks_count--;
+ }
+ }
+
+ if (flexbg_size) {
+ int prev_block = 0;
+ if (group && fs->group_desc[group-1].bg_inode_bitmap)
+ prev_block = fs->group_desc[group-1].bg_inode_bitmap;
+ start_blk = flexbg_offset(fs, group, prev_block, bmap,
+ flexbg_size, rem_grps);
+ last_blk = ext2fs_group_last_block(fs, last_grp);
}
if (!fs->group_desc[group].bg_inode_bitmap) {
@@ -78,11 +162,27 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
return retval;
ext2fs_mark_block_bitmap(bmap, new_blk);
fs->group_desc[group].bg_inode_bitmap = new_blk;
+ if (flexbg_size) {
+ dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+ fs->group_desc[tmp].bg_free_blocks_count--;
+ fs->super->s_free_blocks_count--;
+ }
}
/*
* Allocate the inode table
*/
+ if (flexbg_size) {
+ int prev_block = 0;
+ if (group && fs->group_desc[group-1].bg_inode_table)
+ prev_block = fs->group_desc[group-1].bg_inode_table;
+ group_blk = flexbg_offset(fs, group, prev_block, bmap,
+ flexbg_size * 2,
+ fs->inode_blocks_per_group *
+ rem_grps);
+ last_blk = ext2fs_group_last_block(fs, last_grp);
+ }
+
if (!fs->group_desc[group].bg_inode_table) {
retval = ext2fs_get_free_blocks(fs, group_blk, last_blk,
fs->inode_blocks_per_group,
@@ -91,8 +191,14 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
return retval;
for (j=0, blk = new_blk;
j < fs->inode_blocks_per_group;
- j++, blk++)
+ j++, blk++) {
ext2fs_mark_block_bitmap(bmap, blk);
+ if (flexbg_size) {
+ dgrp_t tmp = ext2fs_group_of_blk(fs, blk);
+ fs->group_desc[tmp].bg_free_blocks_count--;
+ fs->super->s_free_blocks_count--;
+ }
+ }
fs->group_desc[group].bg_inode_table = new_blk;
}
ext2fs_group_desc_csum_set(fs, group);
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index 444211d..d4731f8 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -563,7 +563,10 @@ struct ext2_super_block {
__u16 s_mmp_interval; /* # seconds to wait in MMP checking */
__u64 s_mmp_block; /* Block for multi-mount protection */
__u32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad;
+ __u16 s_reserved_pad; /* Padding to next 32bits */
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
/*
diff --git a/lib/ext2fs/initialize.c b/lib/ext2fs/initialize.c
index c2e00e8..68cfbd8 100644
--- a/lib/ext2fs/initialize.c
+++ b/lib/ext2fs/initialize.c
@@ -158,6 +158,7 @@ errcode_t ext2fs_initialize(const char *name, int flags,
set_field(s_first_meta_bg, 0);
set_field(s_raid_stride, 0); /* default stride size: 0 */
set_field(s_raid_stripe_width, 0); /* default stripe width: 0 */
+ set_field(s_log_groups_per_flex, 0);
set_field(s_flags, 0);
if (super->s_feature_incompat & ~EXT2_LIB_FEATURE_INCOMPAT_SUPP) {
retval = EXT2_ET_UNSUPP_FEATURE;
@@ -366,12 +367,16 @@ ipg_retry:
* group, and fill in the correct group statistics for group.
* Note that although the block bitmap, inode bitmap, and
* inode table have not been allocated (and in fact won't be
- * by this routine), they are accounted for nevertheless.
+ * by this routine), they are accounted for nevertheless. If
+ * FLEX_BG meta-data grouping is used, only account for the
+ * superblock and group descriptors (the inode tables and
+ * bitmaps will be accounted for when allocated).
*/
super->s_free_blocks_count = 0;
for (i = 0; i < fs->group_desc_count; i++) {
numblocks = ext2fs_reserve_super_and_bgd(fs, i, fs->block_map);
-
+ if (fs->super->s_log_groups_per_flex)
+ numblocks += 2 + fs->inode_blocks_per_group;
super->s_free_blocks_count += numblocks;
fs->group_desc[i].bg_free_blocks_count = numblocks;
fs->group_desc[i].bg_free_inodes_count =
diff --git a/misc/mke2fs.8.in b/misc/mke2fs.8.in
index a32c34a..9cc3895 100644
--- a/misc/mke2fs.8.in
+++ b/misc/mke2fs.8.in
@@ -26,6 +26,10 @@ mke2fs \- create an ext2/ext3 filesystem
.I blocks-per-group
]
[
+.B \-G
+.I number-of-groups
+]
+[
.B \-i
.I bytes-per-inode
]
@@ -232,6 +236,12 @@ option rather than manipulating the number of blocks per group.)
This option is generally used by developers who
are developing test cases.
.TP
+.BI \-G " number-of-groups"
+Specify the number of block goups that will be packed together to
+create one large virtual block group on an ext4 filesystem. This
+improves meta-data locality and performance on meta-data heavy
+workloads. The number of goups must be a power of 2.
+.TP
.BI \-i " bytes-per-inode"
Specify the bytes/inode ratio.
.B mke2fs
@@ -425,6 +435,11 @@ Use hashed b-trees to speed up lookups in large directories.
.B filetype
Store file type information in directory entries.
.TP
+.B flex_bg
+Allow bitmaps and inode tables for a block group to be placed anywhere
+on the storage media (use with -G option to group meta-data in order
+to create a large virtual block group).
+.TP
.B has_journal
Create an ext3 journal (as if using the
.TP
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 3fd2e5d..7787341 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -98,8 +98,9 @@ static void usage(void)
fprintf(stderr, _("Usage: %s [-c|-l filename] [-b block-size] "
"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
"[-J journal-options]\n"
- "\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
- "[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
+ "\t[-G meta group size] [-N number-of-inodes]\n"
+ "\t[-m reserved-blocks-percentage] [-o creator-os]\n"
+ "\t[-g blocks-per-group] [-L volume-label] "
"[-M last-mounted-directory]\n\t[-O feature[,...]] "
"[-r fs-revision] [-E extended-option[,...]]\n"
"\t[-T fs-type] [-jnqvFSV] device [blocks-count]\n"),
@@ -448,6 +449,30 @@ static void write_inode_tables(ext2_filsys fs)
progress_close(&progress);
}
+static blk64_t expected_free_block_count(ext2_filsys fs, dgrp_t group)
+{
+ blk64_t tmp;
+ blk64_t blks;
+
+ blks = ext2fs_super_and_bgd_loc(fs, group, 0, 0, 0, 0);
+
+ if (EXT2_HAS_INCOMPAT_FEATURE(fs->super,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ tmp = fs->group_desc[group].bg_block_bitmap;
+ if (group != ext2fs_group_of_blk(fs, tmp))
+ blks++;
+
+ tmp = fs->group_desc[group].bg_inode_bitmap;
+ if (group != ext2fs_group_of_blk(fs, tmp))
+ blks++;
+
+ tmp = fs->group_desc[group].bg_inode_table;
+ if (group != ext2fs_group_of_blk(fs, tmp))
+ blks += fs->inode_blocks_per_group;
+ }
+ return blks;
+}
+
static void setup_lazy_bg(ext2_filsys fs)
{
dgrp_t i;
@@ -481,7 +506,7 @@ static void setup_lazy_bg(ext2_filsys fs)
i == fs->group_desc_count - 1)
continue;
- blks = ext2fs_super_and_bgd_loc(fs, i, 0, 0, 0, 0);
+ blks = expected_free_block_count(fs, i);
if (bg->bg_free_blocks_count == blks &&
bg->bg_flags & EXT2_BG_INODE_UNINIT) {
bg->bg_flags |= EXT2_BG_BLOCK_UNINIT;
@@ -1151,6 +1176,7 @@ static void PRS(int argc, char *argv[])
int blocksize = 0;
int inode_ratio = 0;
int inode_size = 0;
+ unsigned long flex_bg_size = 0;
double reserved_ratio = 5.0;
int sector_size = 0;
int show_version_only = 0;
@@ -1234,7 +1260,7 @@ static void PRS(int argc, char *argv[])
}
while ((c = getopt (argc, argv,
- "b:cf:g:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
+ "b:cf:g:G:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
switch (c) {
case 'b':
blocksize = strtol(optarg, &tmp, 0);
@@ -1285,6 +1311,20 @@ static void PRS(int argc, char *argv[])
exit(1);
}
break;
+ case 'G':
+ flex_bg_size = strtoul(optarg, &tmp, 0);
+ if (*tmp) {
+ com_err(program_name, 0,
+ _("Illegal number for Flex_BG size"));
+ exit(1);
+ }
+ if (flex_bg_size < 2 ||
+ (flex_bg_size & (flex_bg_size-1)) != 0) {
+ com_err(program_name, 0,
+ _("Flex_BG size must be a power of 2"));
+ exit(1);
+ }
+ break;
case 'i':
inode_ratio = strtoul(optarg, &tmp, 0);
if (inode_ratio < EXT2_MIN_BLOCK_SIZE ||
@@ -1685,6 +1725,9 @@ static void PRS(int argc, char *argv[])
}
}
+ if (flex_bg_size)
+ fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
+
if (inode_size == 0) {
inode_size = get_int_from_profile(fs_types, "inode_size", 0);
On Mon, Mar 31, 2008 at 10:13:11PM -0500, Jose R. Santos wrote:
> From: Jose R. Santos <[email protected]>
>
> New bitmap and inode table allocation for FLEX_BG
>
> Change the way we allocate bitmaps and inode tables if the FLEX_BG
> feature is used at mke2fs time. It places calculates a new offset for
> bitmaps and inode table base on the number of groups that the user
> wishes to pack together using the new "-G" option. Creating a
> filesystem with 64 block groups in a flex group can be done by:
I was just testing out this patch, and it looks like it creates a
filesystem with an incorrect free blocks count when creating a
filesystem with both uninit_groups and flex_bg. Can you look at this,
please?
- Ted
<tytso.root@closure> {/usr/projects/e2fsprogs/e2fsprogs/build/misc}, level 2 [js/flex-bg]
550# ./mke2fs -O uninit_groups,flex_bg /dev/closure/testext4
mke2fs 1.40.8 (13-Mar-2008)
Filesystem label=
OS type: Linux
Block size=4096 (log=2)
Fragment size=4096 (log=2)
65536 inodes, 262144 blocks
13107 blocks (5.00%) reserved for the super user
First data block=0
Maximum filesystem blocks=268435456
8 block groups
32768 blocks per group, 32768 fragments per group
8192 inodes per group
Superblock backups stored on blocks:
32768, 98304, 163840, 229376
Writing inode tables: done
Writing superblocks and filesystem accounting information: done
This filesystem will be automatically checked every 27 mounts or
180 days, whichever comes first. Use tune2fs -c or -i to override.
<tytso.root@closure> {/usr/projects/e2fsprogs/e2fsprogs/build/misc}, level 2 [js/flex-bg]
551# ../e2fsck/e2fsck -fy /dev/closure/testext4
e2fsck 1.40.8 (13-Mar-2008)
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 4: Checking reference counts
Pass 5: Checking group summary information
Free blocks count wrong for group #0 (31669, counted=32183).
Fix? yes
Free blocks count wrong for group #1 (31675, counted=32189).
Fix? yes
Free blocks count wrong for group #2 (31740, counted=32254).
Fix? yes
Free blocks count wrong for group #3 (31675, counted=32189).
Fix? yes
Free blocks count wrong for group #4 (31740, counted=32254).
Fix? yes
Free blocks count wrong for group #5 (31675, counted=32189).
Fix? yes
Free blocks count wrong for group #6 (31740, counted=32254).
Fix? yes
Free blocks count wrong for group #7 (31675, counted=32189).
Fix? yes
Free blocks count wrong (253589, counted=257701).
Fix? yes
/dev/closure/testext4: ***** FILE SYSTEM WAS MODIFIED *****
/dev/closure/testext4: 11/65536 files (9.1% non-contiguous), 4443/262144 blocks
On Thu, 3 Apr 2008 09:12:40 -0400
Theodore Tso <[email protected]> wrote:
> On Mon, Mar 31, 2008 at 10:13:11PM -0500, Jose R. Santos wrote:
> > From: Jose R. Santos <[email protected]>
> >
> > New bitmap and inode table allocation for FLEX_BG
> >
> > Change the way we allocate bitmaps and inode tables if the FLEX_BG
> > feature is used at mke2fs time. It places calculates a new offset for
> > bitmaps and inode table base on the number of groups that the user
> > wishes to pack together using the new "-G" option. Creating a
> > filesystem with 64 block groups in a flex group can be done by:
>
> I was just testing out this patch, and it looks like it creates a
> filesystem with an incorrect free blocks count when creating a
> filesystem with both uninit_groups and flex_bg. Can you look at this,
> please?
>
> - Ted
I blame Undo Manager for being so slow that cause me to skip some of
the testing needed to be done. I was incorrectly checking the feature
flag instead of checking the value of fs->super->s_log_groups_per_flex.
Fixed patch is on its way.
-JRS
From: Jose R. Santos <[email protected]>
mke2fs: New bitmap and inode table allocation for FLEX_BG
Change the way we allocate bitmaps and inode tables if the FLEX_BG
feature is used at mke2fs time. It places calculates a new offset for
bitmaps and inode table base on the number of groups that the user
wishes to pack together using the new "-G" option. Creating a
filesystem with 64 block groups in a flex group can be done by:
mke2fs -j -I 256 -O flex_bg -G 32 /dev/sdX
Signed-off-by: Jose R. Santos <[email protected]>
Signed-off-by: Valerie Clement <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
--
lib/ext2fs/alloc_tables.c | 109 ++++++++++++++++++++++++++++++++++++++++++++-
lib/ext2fs/ext2_fs.h | 5 ++
lib/ext2fs/initialize.c | 9 +++-
misc/mke2fs.8.in | 15 ++++++
misc/mke2fs.c | 51 +++++++++++++++++++--
5 files changed, 180 insertions(+), 9 deletions(-)
diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
index 9b4f0e5..070c960 100644
--- a/lib/ext2fs/alloc_tables.c
+++ b/lib/ext2fs/alloc_tables.c
@@ -27,18 +27,78 @@
#include "ext2_fs.h"
#include "ext2fs.h"
+/*
+ * This routine searches for free blocks that can allocate a full
+ * group of bitmaps or inode tables for a flexbg group. Returns the
+ * block number with a correct offset were the bitmaps and inode
+ * tables can be allocated continously and in order.
+ */
+static blk_t flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
+ ext2fs_block_bitmap bmap, int offset, int size)
+{
+ int flexbg, flexbg_size, elem_size;
+ blk_t last_blk, first_free = 0;
+ dgrp_t last_grp;
+
+ flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+ flexbg = group / flexbg_size;
+
+ if (size > fs->super->s_blocks_per_group / 8)
+ size = fs->super->s_blocks_per_group / 8;
+
+ /*
+ * Dont do a long search if the previous block
+ * search is still valid.
+ */
+ if (start_blk && group % flexbg_size) {
+ if (size > flexbg_size)
+ elem_size = fs->inode_blocks_per_group;
+ else
+ elem_size = 1;
+ if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
+ size))
+ return start_blk + elem_size;
+ }
+
+ start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
+ last_grp = group | (flexbg_size - 1);
+ if (last_grp > fs->group_desc_count)
+ last_grp = fs->group_desc_count;
+ last_blk = ext2fs_group_last_block(fs, last_grp);
+
+ /* Find the first available block */
+ if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
+ &first_free))
+ return first_free;
+
+ if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
+ bmap, &first_free))
+ return first_free;
+
+ return first_free;
+}
+
errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
ext2fs_block_bitmap bmap)
{
errcode_t retval;
blk_t group_blk, start_blk, last_blk, new_blk, blk;
- int j;
+ dgrp_t last_grp;
+ int j, rem_grps, flexbg_size = 0;
group_blk = ext2fs_group_first_block(fs, group);
last_blk = ext2fs_group_last_block(fs, group);
if (!bmap)
bmap = fs->block_map;
+
+ if (fs->super->s_log_groups_per_flex) {
+ flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+ last_grp = group | (flexbg_size - 1);
+ rem_grps = last_grp - group;
+ if (last_grp > fs->group_desc_count)
+ last_grp = fs->group_desc_count;
+ }
/*
* Allocate the block and inode bitmaps, if necessary
@@ -56,6 +116,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
} else
start_blk = group_blk;
+ if (flexbg_size) {
+ int prev_block = 0;
+ if (group && fs->group_desc[group-1].bg_block_bitmap)
+ prev_block = fs->group_desc[group-1].bg_block_bitmap;
+ start_blk = flexbg_offset(fs, group, prev_block, bmap,
+ 0, rem_grps);
+ last_blk = ext2fs_group_last_block(fs, last_grp);
+ }
+
if (!fs->group_desc[group].bg_block_bitmap) {
retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
1, bmap, &new_blk);
@@ -66,6 +135,20 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
return retval;
ext2fs_mark_block_bitmap(bmap, new_blk);
fs->group_desc[group].bg_block_bitmap = new_blk;
+ if (flexbg_size) {
+ dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+ fs->group_desc[tmp].bg_free_blocks_count--;
+ fs->super->s_free_blocks_count--;
+ }
+ }
+
+ if (flexbg_size) {
+ int prev_block = 0;
+ if (group && fs->group_desc[group-1].bg_inode_bitmap)
+ prev_block = fs->group_desc[group-1].bg_inode_bitmap;
+ start_blk = flexbg_offset(fs, group, prev_block, bmap,
+ flexbg_size, rem_grps);
+ last_blk = ext2fs_group_last_block(fs, last_grp);
}
if (!fs->group_desc[group].bg_inode_bitmap) {
@@ -78,11 +161,27 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
return retval;
ext2fs_mark_block_bitmap(bmap, new_blk);
fs->group_desc[group].bg_inode_bitmap = new_blk;
+ if (flexbg_size) {
+ dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+ fs->group_desc[tmp].bg_free_blocks_count--;
+ fs->super->s_free_blocks_count--;
+ }
}
/*
* Allocate the inode table
*/
+ if (flexbg_size) {
+ int prev_block = 0;
+ if (group && fs->group_desc[group-1].bg_inode_table)
+ prev_block = fs->group_desc[group-1].bg_inode_table;
+ group_blk = flexbg_offset(fs, group, prev_block, bmap,
+ flexbg_size * 2,
+ fs->inode_blocks_per_group *
+ rem_grps);
+ last_blk = ext2fs_group_last_block(fs, last_grp);
+ }
+
if (!fs->group_desc[group].bg_inode_table) {
retval = ext2fs_get_free_blocks(fs, group_blk, last_blk,
fs->inode_blocks_per_group,
@@ -91,8 +190,14 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
return retval;
for (j=0, blk = new_blk;
j < fs->inode_blocks_per_group;
- j++, blk++)
+ j++, blk++) {
ext2fs_mark_block_bitmap(bmap, blk);
+ if (flexbg_size) {
+ dgrp_t tmp = ext2fs_group_of_blk(fs, blk);
+ fs->group_desc[tmp].bg_free_blocks_count--;
+ fs->super->s_free_blocks_count--;
+ }
+ }
fs->group_desc[group].bg_inode_table = new_blk;
}
ext2fs_group_desc_csum_set(fs, group);
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index 444211d..d4731f8 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -563,7 +563,10 @@ struct ext2_super_block {
__u16 s_mmp_interval; /* # seconds to wait in MMP checking */
__u64 s_mmp_block; /* Block for multi-mount protection */
__u32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad;
+ __u16 s_reserved_pad; /* Padding to next 32bits */
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
/*
diff --git a/lib/ext2fs/initialize.c b/lib/ext2fs/initialize.c
index c2e00e8..68cfbd8 100644
--- a/lib/ext2fs/initialize.c
+++ b/lib/ext2fs/initialize.c
@@ -158,6 +158,7 @@ errcode_t ext2fs_initialize(const char *name, int flags,
set_field(s_first_meta_bg, 0);
set_field(s_raid_stride, 0); /* default stride size: 0 */
set_field(s_raid_stripe_width, 0); /* default stripe width: 0 */
+ set_field(s_log_groups_per_flex, 0);
set_field(s_flags, 0);
if (super->s_feature_incompat & ~EXT2_LIB_FEATURE_INCOMPAT_SUPP) {
retval = EXT2_ET_UNSUPP_FEATURE;
@@ -366,12 +367,16 @@ ipg_retry:
* group, and fill in the correct group statistics for group.
* Note that although the block bitmap, inode bitmap, and
* inode table have not been allocated (and in fact won't be
- * by this routine), they are accounted for nevertheless.
+ * by this routine), they are accounted for nevertheless. If
+ * FLEX_BG meta-data grouping is used, only account for the
+ * superblock and group descriptors (the inode tables and
+ * bitmaps will be accounted for when allocated).
*/
super->s_free_blocks_count = 0;
for (i = 0; i < fs->group_desc_count; i++) {
numblocks = ext2fs_reserve_super_and_bgd(fs, i, fs->block_map);
-
+ if (fs->super->s_log_groups_per_flex)
+ numblocks += 2 + fs->inode_blocks_per_group;
super->s_free_blocks_count += numblocks;
fs->group_desc[i].bg_free_blocks_count = numblocks;
fs->group_desc[i].bg_free_inodes_count =
diff --git a/misc/mke2fs.8.in b/misc/mke2fs.8.in
index 6cd10b1..c56013f 100644
--- a/misc/mke2fs.8.in
+++ b/misc/mke2fs.8.in
@@ -26,6 +26,10 @@ mke2fs \- create an ext2/ext3 filesystem
.I blocks-per-group
]
[
+.B \-G
+.I number-of-groups
+]
+[
.B \-i
.I bytes-per-inode
]
@@ -232,6 +236,12 @@ option rather than manipulating the number of blocks per group.)
This option is generally used by developers who
are developing test cases.
.TP
+.BI \-G " number-of-groups"
+Specify the number of block goups that will be packed together to
+create one large virtual block group on an ext4 filesystem. This
+improves meta-data locality and performance on meta-data heavy
+workloads. The number of goups must be a power of 2.
+.TP
.BI \-i " bytes-per-inode"
Specify the bytes/inode ratio.
.B mke2fs
@@ -425,6 +435,11 @@ Use hashed b-trees to speed up lookups in large directories.
.B filetype
Store file type information in directory entries.
.TP
+.B flex_bg
+Allow bitmaps and inode tables for a block group to be placed anywhere
+on the storage media (use with -G option to group meta-data in order
+to create a large virtual block group).
+.TP
.B has_journal
Create an ext3 journal (as if using the
.B \-j
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index a438403..c97e55d 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -98,8 +98,9 @@ static void usage(void)
fprintf(stderr, _("Usage: %s [-c|-l filename] [-b block-size] "
"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
"[-J journal-options]\n"
- "\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
- "[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
+ "\t[-G meta group size] [-N number-of-inodes]\n"
+ "\t[-m reserved-blocks-percentage] [-o creator-os]\n"
+ "\t[-g blocks-per-group] [-L volume-label] "
"[-M last-mounted-directory]\n\t[-O feature[,...]] "
"[-r fs-revision] [-E extended-option[,...]]\n"
"\t[-T fs-type] [-jnqvFSV] device [blocks-count]\n"),
@@ -448,6 +449,30 @@ static void write_inode_tables(ext2_filsys fs)
progress_close(&progress);
}
+static blk64_t expected_free_block_count(ext2_filsys fs, dgrp_t group)
+{
+ blk64_t tmp;
+ blk64_t blks;
+
+ blks = ext2fs_super_and_bgd_loc(fs, group, 0, 0, 0, 0);
+
+ if (EXT2_HAS_INCOMPAT_FEATURE(fs->super,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ tmp = fs->group_desc[group].bg_block_bitmap;
+ if (group != ext2fs_group_of_blk(fs, tmp))
+ blks++;
+
+ tmp = fs->group_desc[group].bg_inode_bitmap;
+ if (group != ext2fs_group_of_blk(fs, tmp))
+ blks++;
+
+ tmp = fs->group_desc[group].bg_inode_table;
+ if (group != ext2fs_group_of_blk(fs, tmp))
+ blks += fs->inode_blocks_per_group;
+ }
+ return blks;
+}
+
static void setup_lazy_bg(ext2_filsys fs)
{
dgrp_t i;
@@ -481,7 +506,7 @@ static void setup_lazy_bg(ext2_filsys fs)
i == fs->group_desc_count - 1)
continue;
- blks = ext2fs_super_and_bgd_loc(fs, i, 0, 0, 0, 0);
+ blks = expected_free_block_count(fs, i);
if (bg->bg_free_blocks_count == blks &&
bg->bg_flags & EXT2_BG_INODE_UNINIT) {
bg->bg_flags |= EXT2_BG_BLOCK_UNINIT;
@@ -1151,6 +1176,7 @@ static void PRS(int argc, char *argv[])
int blocksize = 0;
int inode_ratio = 0;
int inode_size = 0;
+ unsigned long flex_bg_size = 0;
double reserved_ratio = 5.0;
int sector_size = 0;
int show_version_only = 0;
@@ -1234,7 +1260,7 @@ static void PRS(int argc, char *argv[])
}
while ((c = getopt (argc, argv,
- "b:cf:g:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
+ "b:cf:g:G:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
switch (c) {
case 'b':
blocksize = strtol(optarg, &tmp, 0);
@@ -1285,6 +1311,20 @@ static void PRS(int argc, char *argv[])
exit(1);
}
break;
+ case 'G':
+ flex_bg_size = strtoul(optarg, &tmp, 0);
+ if (*tmp) {
+ com_err(program_name, 0,
+ _("Illegal number for Flex_BG size"));
+ exit(1);
+ }
+ if (flex_bg_size < 2 ||
+ (flex_bg_size & (flex_bg_size-1)) != 0) {
+ com_err(program_name, 0,
+ _("Flex_BG size must be a power of 2"));
+ exit(1);
+ }
+ break;
case 'i':
inode_ratio = strtoul(optarg, &tmp, 0);
if (inode_ratio < EXT2_MIN_BLOCK_SIZE ||
@@ -1685,6 +1725,9 @@ static void PRS(int argc, char *argv[])
}
}
+ if (flex_bg_size)
+ fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
+
if (inode_size == 0) {
inode_size = get_int_from_profile(fs_types, "inode_size", 0);
On Thu, Apr 03, 2008 at 09:28:58AM -0500, Jose R. Santos wrote:
> I blame Undo Manager for being so slow that cause me to skip some of
> the testing needed to be done.
If that means we need a patch to disable the undo manager, via a
command-line option, feel free. :-)
> I was incorrectly checking the feature
> flag instead of checking the value of fs->super->s_log_groups_per_flex.
Actually, you should check both, and we need to make mke2fs have an
intelligent default, which can be overridden via mke2fs.conf.
Also, it looks like this patch doesn't create a valid filesystem in
combination with meta_bg:
mke2fs G 32 -O meta_bg,flex_bg,uninit_groups,^resize_inode /tmp/foo.img
Then try running "e2fsck -f /tmp/foo.img" with the patch applied.
One obvious question is why is this patch so fragile....? Is there
some way we can make it more likely not to break given other changes
to e2fsprogs in the future.
- Ted
On Thu, 3 Apr 2008 23:24:43 -0400
Theodore Tso <[email protected]> wrote:
> On Thu, Apr 03, 2008 at 09:28:58AM -0500, Jose R. Santos wrote:
> > I blame Undo Manager for being so slow that cause me to skip some of
> > the testing needed to be done.
>
> If that means we need a patch to disable the undo manager, via a
> command-line option, feel free. :-)
Im sufficiently annoyed that I may just do that.
> > I was incorrectly checking the feature
> > flag instead of checking the value of fs->super->s_log_groups_per_flex.
>
> Actually, you should check both, and we need to make mke2fs have an
> intelligent default, which can be overridden via mke2fs.conf.
Yes it should check both(will fix). I was expecting more people to
give flexbg a test before trying to determined an intelligent default
though.
> Also, it looks like this patch doesn't create a valid filesystem in
> combination with meta_bg:
>
> mke2fs G 32 -O meta_bg,flex_bg,uninit_groups,^resize_inode /tmp/foo.img
>
> Then try running "e2fsck -f /tmp/foo.img" with the patch applied.
Wow, it really breaks. Throws e2fsck into an infinite loop.
> One obvious question is why is this patch so fragile....? Is there
> some way we can make it more likely not to break given other changes
> to e2fsprogs in the future.
Getting the right free block count for every group descriptor seems to
be the tricky part since libe2fs make all sort of assumptions about
number of used blocks that break when meta-data is no longer on the
same block group. Seems like I forgot a check for the adjusted flexbg
block group size in meta_bg since the first place it barfs is in group
127 which is the last group of the meta_bg with a group descriptor
block being used. This should be easy to find and fix.
> - Ted
-JRS
On Fri, Apr 04, 2008 at 12:37:57AM -0500, Jose R. Santos wrote:
> Getting the right free block count for every group descriptor seems to
> be the tricky part since libe2fs make all sort of assumptions about
> number of used blocks that break when meta-data is no longer on the
> same block group.
Originally the overlap you're referring to was very simple.
ext2fs_initialize() set the free block counts, and then
ext2fs_alloc_tables() actually did the allocation of the bitmaps and
inode tables.
Flex_bg made things more complex, but it transferred the
responsibility of setting the block number to the routines that
allocate the inode and bitmaps.
> Seems like I forgot a check for the adjusted flexbg
> block group size in meta_bg since the first place it barfs is in group
> 127 which is the last group of the meta_bg with a group descriptor
> block being used. This should be easy to find and fix.
It can't be just that since e2fsck is also reporting a block bitmap
discrepancy. And there's the question of why e2fsck is doing that in
an infinite loop.
Hmm.... So at least part of the problem is that BLOCK_UNINIT isn't
getting set when it should. That seems to be bug #1, and that was the
proximate cause of the failure, that was triggered by the flex-bg
allocation patch.
In e2fsck pass5, the uninit_groups patch changed it to compare the
actual bitmap against a virtual bitmap if the bitmap block hadn't been
initialized. (Around line 180, in e2fsck/pass5.c, in the function
check_block_bitmaps()). The problem is that it is using
ext2fs_bg_has_super(), and assuming that if it is set, that the
superblock and block group descriptors are present using the old-style
non-meta_bg layout. What it *should* have used is
ext2fs_super_and_bgd_loc(), and used it to set the pseudo-bitmap
correctly. That's bug #2.
Since it is wrong, it is attempting to fix it, but code to clear
BLOCK_UNINIT is only when the block is used but marked so in the
bitmap --- and not in the other case, when the block isn't used, but
is apparently marked as such. That's bug #3.
Bugs #2 and #3 is my fault, and reflects the fact that LAZY_BG was a
quick hack that I had coded up only for testing purposes for people
who wanted to play with really big filesystems. I'll take care of
that, which is why e2fsck was looping. In retrospect, lazy_bg was a
mistake, or at least should have been implemented *much* more
carefully, and I'm beginning to think it should be removed entirely in
favor of uninit_groups, per my other e-mail.
- Ted
On Fri, 4 Apr 2008 08:43:58 -0400
Theodore Tso <[email protected]> wrote:
> On Fri, Apr 04, 2008 at 12:37:57AM -0500, Jose R. Santos wrote:
> > Getting the right free block count for every group descriptor seems to
> > be the tricky part since libe2fs make all sort of assumptions about
> > number of used blocks that break when meta-data is no longer on the
> > same block group.
>
> Originally the overlap you're referring to was very simple.
> ext2fs_initialize() set the free block counts, and then
> ext2fs_alloc_tables() actually did the allocation of the bitmaps and
> inode tables.
>
> Flex_bg made things more complex, but it transferred the
> responsibility of setting the block number to the routines that
> allocate the inode and bitmaps.
>
> > Seems like I forgot a check for the adjusted flexbg
> > block group size in meta_bg since the first place it barfs is in group
> > 127 which is the last group of the meta_bg with a group descriptor
> > block being used. This should be easy to find and fix.
>
> It can't be just that since e2fsck is also reporting a block bitmap
> discrepancy. And there's the question of why e2fsck is doing that in
> an infinite loop.
>
> Hmm.... So at least part of the problem is that BLOCK_UNINIT isn't
> getting set when it should. That seems to be bug #1, and that was the
> proximate cause of the failure, that was triggered by the flex-bg
> allocation patch.
The problem is that ext2fs_super_and_bgd_loc() does not correctly
predict the size of a block group with no bitmaps and not inode
tables. So the BLOCK_UNINIT is being set correctly, but e2fsck just
gets confused when the predicted number of used blocks is different
than the actual. In mke2fs I go around this in
expected_free_block_count() but maybe the right way to fix it is in
ext2fs_super_and_bgd_loc(). The only problem is that predicting the
size of groups with all the meta-data is tricky especially when the
inode tables are too big to fit in a single block group. This is why I
currently do not mark BLOCK_UNINIT on block group with meta-data in
them. Does ext2fs_super_and_bgd_loc() need to return an exact number of
free blocks or is a guesstimate good enough?
>
> In e2fsck pass5, the uninit_groups patch changed it to compare the
> actual bitmap against a virtual bitmap if the bitmap block hadn't been
> initialized. (Around line 180, in e2fsck/pass5.c, in the function
> check_block_bitmaps()). The problem is that it is using
> ext2fs_bg_has_super(), and assuming that if it is set, that the
> superblock and block group descriptors are present using the old-style
> non-meta_bg layout. What it *should* have used is
> ext2fs_super_and_bgd_loc(), and used it to set the pseudo-bitmap
> correctly. That's bug #2.
It would still break since ext2fs_super_and_bgd_loc() does not know of
block groups with not meta-data in them.
> Since it is wrong, it is attempting to fix it, but code to clear
> BLOCK_UNINIT is only when the block is used but marked so in the
> bitmap --- and not in the other case, when the block isn't used, but
> is apparently marked as such. That's bug #3.
>
> Bugs #2 and #3 is my fault, and reflects the fact that LAZY_BG was a
> quick hack that I had coded up only for testing purposes for people
> who wanted to play with really big filesystems. I'll take care of
> that, which is why e2fsck was looping. In retrospect, lazy_bg was a
> mistake, or at least should have been implemented *much* more
> carefully, and I'm beginning to think it should be removed entirely in
> favor of uninit_groups, per my other e-mail.
>
> - Ted
Agree.
-JRS