2008-02-07 17:09:14

by Jose R. Santos

[permalink] [raw]
Subject: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG

New bitmap and inode table allocation for FLEX_BG

From: Jose R. Santos <[email protected]>

Change the way we allocate bitmaps and inode tables if the FLEX_BG
feature is used at mke2fs time. It places calculates a new offset for
bitmaps and inode table base on the number of groups that the user
wishes to pack together using the new "-G" option. Creating a
filesystem with 64 block groups in a flex group can be done by:

mke2fs -j -I 256 -O flex_bg -G 32 /dev/sdX

Signed-off-by: Jose R. Santos <[email protected]>
Signed-off-by: Valerie Clement <[email protected]>
---

lib/ext2fs/alloc_tables.c | 116 ++++++++++++++++++++++++++++++++++++++++++++-
lib/ext2fs/closefs.c | 6 ++
lib/ext2fs/ext2_fs.h | 6 ++
lib/ext2fs/initialize.c | 6 ++
misc/mke2fs.c | 25 +++++++++-
5 files changed, 151 insertions(+), 8 deletions(-)


diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
index 4ad2ba9..8281858 100644
--- a/lib/ext2fs/alloc_tables.c
+++ b/lib/ext2fs/alloc_tables.c
@@ -27,18 +27,82 @@
#include "ext2_fs.h"
#include "ext2fs.h"

+void ext2fs_bgd_set_flex_meta_flag(ext2_filsys fs, blk_t block)
+{
+ dgrp_t group;
+
+ group = ext2fs_group_of_blk(fs, block);
+ if (!(fs->group_desc[group].bg_flags & EXT2_BG_FLEX_METADATA))
+ fs->group_desc[group].bg_flags |= EXT2_BG_FLEX_METADATA;
+}
+
+blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
+ ext2fs_block_bitmap bmap, int offset, int size)
+{
+ int flexbg, flexbg_size, elem_size;
+ blk_t last_blk, first_free = 0;
+ dgrp_t last_grp;
+
+ flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+ flexbg = group / flexbg_size;
+
+ if (size > fs->super->s_blocks_per_group / 8)
+ size = fs->super->s_blocks_per_group / 8;
+
+ /*
+ * Dont do a long search if the previous block
+ * search is still valid.
+ */
+ if (start_blk && group % flexbg_size) {
+ if (size > flexbg_size)
+ elem_size = fs->inode_blocks_per_group;
+ else
+ elem_size = 1;
+ if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
+ size))
+ return start_blk + elem_size;
+ }
+
+ start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
+ last_grp = (group + (flexbg_size - (group % flexbg_size) - 1));
+ last_blk = ext2fs_group_last_block(fs, last_grp);
+ if (last_grp > fs->group_desc_count)
+ last_grp = fs->group_desc_count;
+
+ /* Find the first available block */
+ if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
+ &first_free))
+ return first_free;
+
+ if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
+ bmap, &first_free))
+ return first_free;
+
+ return first_free;
+}
+
errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
ext2fs_block_bitmap bmap)
{
errcode_t retval;
blk_t group_blk, start_blk, last_blk, new_blk, blk;
- int j;
+ dgrp_t last_grp;
+ int j, rem_grps, flexbg_size = 0;

group_blk = ext2fs_group_first_block(fs, group);
last_blk = ext2fs_group_last_block(fs, group);

if (!bmap)
bmap = fs->block_map;
+
+ if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ flexbg_size = 1 << fs->super->s_log_groups_per_flex;
+ rem_grps = flexbg_size - (group % flexbg_size);
+ last_grp = group + rem_grps - 1;
+ if (last_grp > fs->group_desc_count)
+ last_grp = fs->group_desc_count;
+ }

/*
* Allocate the block and inode bitmaps, if necessary
@@ -56,6 +120,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
} else
start_blk = group_blk;

+ if (flexbg_size) {
+ int prev_block = 0;
+ if (group && fs->group_desc[group-1].bg_block_bitmap)
+ prev_block = fs->group_desc[group-1].bg_block_bitmap;
+ start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
+ 0, rem_grps);
+ last_blk = ext2fs_group_last_block(fs, last_grp);
+ }
+
if (!fs->group_desc[group].bg_block_bitmap) {
retval = ext2fs_get_free_blocks(fs, start_blk, last_blk,
1, bmap, &new_blk);
@@ -66,6 +139,21 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
return retval;
ext2fs_mark_block_bitmap(bmap, new_blk);
fs->group_desc[group].bg_block_bitmap = new_blk;
+ if (flexbg_size) {
+ dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+ ext2fs_bgd_set_flex_meta_flag(fs, new_blk);
+ fs->group_desc[tmp].bg_free_blocks_count--;
+ fs->super->s_free_blocks_count--;
+ }
+ }
+
+ if (flexbg_size) {
+ int prev_block = 0;
+ if (group && fs->group_desc[group-1].bg_inode_bitmap)
+ prev_block = fs->group_desc[group-1].bg_inode_bitmap;
+ start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
+ flexbg_size, rem_grps);
+ last_blk = ext2fs_group_last_block(fs, last_grp);
}

if (!fs->group_desc[group].bg_inode_bitmap) {
@@ -78,11 +166,28 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
return retval;
ext2fs_mark_block_bitmap(bmap, new_blk);
fs->group_desc[group].bg_inode_bitmap = new_blk;
+ if (flexbg_size) {
+ dgrp_t tmp = ext2fs_group_of_blk(fs, new_blk);
+ ext2fs_bgd_set_flex_meta_flag(fs, new_blk);
+ fs->group_desc[tmp].bg_free_blocks_count--;
+ fs->super->s_free_blocks_count--;
+ }
}

/*
* Allocate the inode table
*/
+ if (flexbg_size) {
+ int prev_block = 0;
+ if (group && fs->group_desc[group-1].bg_inode_table)
+ prev_block = fs->group_desc[group-1].bg_inode_table;
+ group_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
+ flexbg_size * 2,
+ fs->inode_blocks_per_group *
+ rem_grps);
+ last_blk = ext2fs_group_last_block(fs, last_grp);
+ }
+
if (!fs->group_desc[group].bg_inode_table) {
retval = ext2fs_get_free_blocks(fs, group_blk, last_blk,
fs->inode_blocks_per_group,
@@ -91,8 +196,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
return retval;
for (j=0, blk = new_blk;
j < fs->inode_blocks_per_group;
- j++, blk++)
+ j++, blk++) {
ext2fs_mark_block_bitmap(bmap, blk);
+ if (flexbg_size) {
+ dgrp_t tmp = ext2fs_group_of_blk(fs, blk);
+ ext2fs_bgd_set_flex_meta_flag(fs, blk);
+ fs->group_desc[tmp].bg_free_blocks_count--;
+ fs->super->s_free_blocks_count--;
+ }
+ }
fs->group_desc[group].bg_inode_table = new_blk;
}

diff --git a/lib/ext2fs/closefs.c b/lib/ext2fs/closefs.c
index c8ef6ef..96b6af1 100644
--- a/lib/ext2fs/closefs.c
+++ b/lib/ext2fs/closefs.c
@@ -56,6 +56,7 @@ int ext2fs_super_and_bgd_loc(ext2_filsys fs,
unsigned int meta_bg, meta_bg_size;
blk_t numblocks, old_desc_blocks;
int has_super;
+ unsigned int flex_bg_size = 1 << fs->super->s_log_groups_per_flex;

group_block = ext2fs_group_first_block(fs, group);

@@ -99,8 +100,9 @@ int ext2fs_super_and_bgd_loc(ext2_filsys fs,
numblocks--;
}
}
-
- numblocks -= 2 + fs->inode_blocks_per_group;
+
+ if (!fs->super->s_log_groups_per_flex)
+ numblocks -= 2 + fs->inode_blocks_per_group;

if (ret_super_blk)
*ret_super_blk = super_blk;
diff --git a/lib/ext2fs/ext2_fs.h b/lib/ext2fs/ext2_fs.h
index a11e6be..a09bdd8 100644
--- a/lib/ext2fs/ext2_fs.h
+++ b/lib/ext2fs/ext2_fs.h
@@ -173,6 +173,7 @@ struct ext4_group_desc

#define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not initialized */
#define EXT2_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not initialized */
+#define EXT2_BG_FLEX_METADATA 0x0004 /* FLEX_BG block group contains meta-data */

/*
* Data structures used by the directory indexing feature
@@ -555,7 +556,10 @@ struct ext2_super_block {
__u16 s_mmp_interval; /* # seconds to wait in MMP checking */
__u64 s_mmp_block; /* Block for multi-mount protection */
__u32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad;
+ __u16 s_reserved_pad; /* Padding to next 32bits */
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};

/*
diff --git a/lib/ext2fs/initialize.c b/lib/ext2fs/initialize.c
index a7f777b..464737d 100644
--- a/lib/ext2fs/initialize.c
+++ b/lib/ext2fs/initialize.c
@@ -156,6 +156,7 @@ errcode_t ext2fs_initialize(const char *name, int flags,
set_field(s_feature_incompat, 0);
set_field(s_feature_ro_compat, 0);
set_field(s_first_meta_bg, 0);
+ set_field(s_log_groups_per_flex, 0);
if (super->s_feature_incompat & ~EXT2_LIB_FEATURE_INCOMPAT_SUPP) {
retval = EXT2_ET_UNSUPP_FEATURE;
goto cleanup;
@@ -363,7 +364,10 @@ ipg_retry:
* group, and fill in the correct group statistics for group.
* Note that although the block bitmap, inode bitmap, and
* inode table have not been allocated (and in fact won't be
- * by this routine), they are accounted for nevertheless.
+ * by this routine), they are accounted for nevertheless. If
+ * FLEX_BG meta-data grouping is used, only account for the
+ * superblock and group descriptors (the inode tables and
+ * bitmaps will be accounted for when allocated).
*/
super->s_free_blocks_count = 0;
for (i = 0; i < fs->group_desc_count; i++) {
diff --git a/misc/mke2fs.c b/misc/mke2fs.c
index 44f45aa..c043173 100644
--- a/misc/mke2fs.c
+++ b/misc/mke2fs.c
@@ -96,7 +96,7 @@ static void usage(void)
{
fprintf(stderr, _("Usage: %s [-c|-t|-l filename] [-b block-size] "
"[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
- "[-j] [-J journal-options]\n"
+ "[-j] [-J journal-options] [-G meta group size]\n"
"\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
"[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
"[-M last-mounted-directory]\n\t[-O feature[,...]] "
@@ -464,6 +464,8 @@ static void setup_lazy_bg(ext2_filsys fs)
sb->s_free_inodes_count -=
sb->s_inodes_per_group;
}
+ if ((bg->bg_flags & EXT2_BG_FLEX_METADATA))
+ continue;
blks = ext2fs_super_and_bgd_loc(fs, i, 0, 0, 0, 0);
if (bg->bg_free_blocks_count == blks) {
bg->bg_free_blocks_count = 0;
@@ -909,6 +911,7 @@ static void PRS(int argc, char *argv[])
int blocksize = 0;
int inode_ratio = 0;
int inode_size = 0;
+ unsigned long flex_bg_size = 0;
double reserved_ratio = 5.0;
int sector_size = 0;
int show_version_only = 0;
@@ -991,7 +994,7 @@ static void PRS(int argc, char *argv[])
}

while ((c = getopt (argc, argv,
- "b:cf:g:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
+ "b:cf:g:G:i:jl:m:no:qr:s:tvE:FI:J:L:M:N:O:R:ST:V")) != EOF) {
switch (c) {
case 'b':
blocksize = strtol(optarg, &tmp, 0);
@@ -1042,6 +1045,20 @@ static void PRS(int argc, char *argv[])
exit(1);
}
break;
+ case 'G':
+ flex_bg_size = strtoul(optarg, &tmp, 0);
+ if (*tmp) {
+ com_err(program_name, 0,
+ _("Illegal number for Flex_BG size"));
+ exit(1);
+ }
+ if (flex_bg_size < 2 ||
+ (flex_bg_size & (flex_bg_size-1)) != 0) {
+ com_err(program_name, 0,
+ _("Flex_BG size must be a power of 2"));
+ exit(1);
+ }
+ break;
case 'i':
inode_ratio = strtoul(optarg, &tmp, 0);
if (inode_ratio < EXT2_MIN_BLOCK_SIZE ||
@@ -1437,6 +1454,10 @@ static void PRS(int argc, char *argv[])
}
}

+ if(flex_bg_size) {
+ fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
+ }
+
if (!force && fs_param.s_blocks_count >= ((unsigned) 1 << 31)) {
com_err(program_name, 0,
_("Filesystem too large. No more than 2**31-1 blocks\n"



-JRS


2008-02-08 05:11:34

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG

On Feb 07, 2008 11:09 -0600, Jose R. Santos wrote:
> +blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
> + ext2fs_block_bitmap bmap, int offset, int size)
> +{

Can you add a comment on the intent of this function. By the name it seems
to be calculating some offset relative to the flex group, but that doesn't
convey anything about searching for free blocks???

> + /*
> + * Dont do a long search if the previous block
> + * search is still valid.
> + */

What condition here makse the previous search still valid?

> + if (start_blk && group % flexbg_size) {
> + if (size > flexbg_size)
> + elem_size = fs->inode_blocks_per_group;
> + else
> + elem_size = 1;
> + if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
> + size))
> + return start_blk + elem_size;
> + }
> +
> + start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
> + last_grp = (group + (flexbg_size - (group % flexbg_size) - 1));

This is a confusing calculation... Is it trying to find the last group
in the flex group that "group" is part of? I.e. round "group" to the
end of the flex group? Since flexbg_size is a power-of-two value, you
could just do "last_grp = group | (flexbg_size - 1)"?

> + last_blk = ext2fs_group_last_block(fs, last_grp);
> + if (last_grp > fs->group_desc_count)
> + last_grp = fs->group_desc_count;

Doesn't it make more sense to calculate "last_blk" AFTER limiting
"last_grp" to the actual number of groups in the filesystem?

> + /* Find the first available block */
> + if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
> + &first_free))
> + return first_free;
> +
> + if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
> + bmap, &first_free))
> + return first_free;

I don't quite understand this either? The first search is looking for a
single free block between "start_blk" and "last_blk", while the second
search is looking for "size" free blocks between "first_free + offset"
and "last_blk". What is the reason to do the second search after doing
the first one, or alternately just doing the second one directly?

Should both of these calls actually be saving the error return value and
returning that to a caller (returning first_free via a pointer arg like
ext2fs_get_free_blocks() does)?

> errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
> ext2fs_block_bitmap bmap)
> {
> if (!bmap)
> bmap = fs->block_map;
> +
> + if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> + EXT4_FEATURE_INCOMPAT_FLEX_BG)) {

No space between ..._FEATURE and "(".

> + flexbg_size = 1 << fs->super->s_log_groups_per_flex;
> + rem_grps = flexbg_size - (group % flexbg_size);
> + last_grp = group + rem_grps - 1;

Could this be written as:

last_grp = group | (flexbg_size - 1);
rem_grps = last_grp - group;

I'm also trying to see if you have an off-by-one error in your version?
Consider the case of group = 5, flexbg_size = 4. That gives us with my calc:

last_grp = 5 | (4 - 1) = 7;
rem_grps = 7 - 5 = 2;

This makes sense, since group 5 is in the second flexbg [4,7], and there
are 2 groups remaining after group 5 in that flexbg.

> @@ -56,6 +120,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs,
> } else
> start_blk = group_blk;
>
> + if (flexbg_size) {
> + int prev_block = 0;
> + if (group && fs->group_desc[group-1].bg_block_bitmap)
> + prev_block = fs->group_desc[group-1].bg_block_bitmap;
> + start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
> + 0, rem_grps);
> + last_blk = ext2fs_group_last_block(fs, last_grp);
> + }

This is presumably trying to allocate the block bitmap for "group" to be
after the block bitmap for "group - 1" (allocated in an earlier call).

> + if (group && fs->group_desc[group-1].bg_inode_bitmap)
> + prev_block = fs->group_desc[group-1].bg_inode_bitmap;
> + start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
> + flexbg_size, rem_grps);
> + last_blk = ext2fs_group_last_block(fs, last_grp);
> }

This is allocating the inode bitmap in the same manner. Would it make more
sense to change the mke2fs code to allocate all of the block bitmaps first
(well, after the sb+gdt backups), then all of the inode bitmaps, and finally
all of the inode tables?

> #define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not initialized */
> #define EXT2_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not initialized */
> +#define EXT2_BG_FLEX_METADATA 0x0004 /* FLEX_BG block group contains meta-data */

Hrm, I thought I had reserved that value in the uninit_groups patch?
+#define EXT3_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */

The kernel and e2fsck can use that to determine if the inode table was
zeroed out at mke2fs time (i.e. formatted with LAZY_BG or not). Then
the kernel can zero out the entire inode table and set INODE_ZEROED lazily
some time after mount, instead of mke2fs doing it at mke2fs time sucking
up all of the node's RAM and making it very slow.

This is still needed at some point to avoid the problem of e2fsck trying
to salvage ancient inodes if the group descriptor is corrupted for some
reason and the inode high watermark is lost.

Not implemented yet... but intended to be done by a helper thread started
at mount time to do GDT/bitmap/itable checksum validation, create the
mballoc buddy buffers, prefetch metadata, etc.

> @@ -96,7 +96,7 @@ static void usage(void)
> {
> fprintf(stderr, _("Usage: %s [-c|-t|-l filename] [-b block-size] "
> "[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
> - "[-j] [-J journal-options]\n"
> + "[-j] [-J journal-options] [-G meta group size]\n"
> "\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
> "[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
> "[-M last-mounted-directory]\n\t[-O feature[,...]] "

This also needs an update to the mke2fs.8.in man page.

> + if(flex_bg_size) {
> + fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
> + }

Whitespace, braces:

if (flex_bg_size)
fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);


Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2008-02-08 17:37:47

by Jose R. Santos

[permalink] [raw]
Subject: Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG

On Fri, 08 Feb 2008 00:11:16 -0500
Andreas Dilger <[email protected]> wrote:

> On Feb 07, 2008 11:09 -0600, Jose R. Santos wrote:
> > +blk_t ext2fs_flexbg_offset(ext2_filsys fs, dgrp_t group, blk_t start_blk,
> > + ext2fs_block_bitmap bmap, int offset, int size)
> > +{
>
> Can you add a comment on the intent of this function. By the name it seems
> to be calculating some offset relative to the flex group, but that doesn't
> convey anything about searching for free blocks???

I will add a comment. The function calculates where the search of
bitmaps/inode tables for a give block group starts by returning a block
number where all of the bitmaps/inode tables can be allocated in a
contiguous fashion. The search for free blocks is needed determine
where within the flex group we can allocated the meta-data.

>
> > + /*
> > + * Dont do a long search if the previous block
> > + * search is still valid.
> > + */
>
> What condition here makse the previous search still valid?

We pass the previous allocation as an argument to the function. If the
is enough space to allocate the rest of the inode tables after the
previous allocation, then no need to do a search. There are two
reasons why this is done.

1) If the size of the of a flexbg is big enough, searching for
inode_blocks_per_group * flexbg becomes very expensive if there happens
to be some blocks in the middle of where we started the search. This
easy happens if the size of all the inode tables in a flex group are
larger than a single block group. If the next block group has super
block backups or meta_bg blocks ext2fs_get_free_blocks() becomes very
expensive. If we have to do such an expensive search, better do it
once. This is also a problem when resizing since there is no telling
what block usage of those last groups would be.

2) This avoids having inode_tables or bitmaps being allocated out of
order. The search for very large blocks can leave empty space at the
begining of a flex group. The search for the last groups in the flex
group could actually be place in these smaller empty blocks which would
put things out of order.

> > + if (start_blk && group % flexbg_size) {
> > + if (size > flexbg_size)
> > + elem_size = fs->inode_blocks_per_group;
> > + else
> > + elem_size = 1;
> > + if (ext2fs_test_block_bitmap_range(bmap, start_blk + elem_size,
> > + size))
> > + return start_blk + elem_size;
> > + }
> > +
> > + start_blk = ext2fs_group_first_block(fs, flexbg_size * flexbg);
> > + last_grp = (group + (flexbg_size - (group % flexbg_size) - 1));
>
> This is a confusing calculation... Is it trying to find the last group
> in the flex group that "group" is part of? I.e. round "group" to the
> end of the flex group? Since flexbg_size is a power-of-two value, you
> could just do "last_grp = group | (flexbg_size - 1)"?

Yes, I will fix that.

> > + last_blk = ext2fs_group_last_block(fs, last_grp);
> > + if (last_grp > fs->group_desc_count)
> > + last_grp = fs->group_desc_count;
>
> Doesn't it make more sense to calculate "last_blk" AFTER limiting
> "last_grp" to the actual number of groups in the filesystem?

Ops... Thanks for catching.

> > + /* Find the first available block */
> > + if (ext2fs_get_free_blocks(fs, start_blk, last_blk, 1, bmap,
> > + &first_free))
> > + return first_free;
> > +
> > + if (ext2fs_get_free_blocks(fs, first_free + offset, last_blk, size,
> > + bmap, &first_free))
> > + return first_free;
>
> I don't quite understand this either? The first search is looking for a
> single free block between "start_blk" and "last_blk", while the second
> search is looking for "size" free blocks between "first_free + offset"
> and "last_blk". What is the reason to do the second search after doing
> the first one, or alternately just doing the second one directly?

Because the second search starts from the first free block + an
offset. This is used in order to have space to allocate each group of
inode/block bitmaps and inode tables contiguously. Cant do the second
search directly without knowing where I should start the search.

> Should both of these calls actually be saving the error return value and
> returning that to a caller (returning first_free via a pointer arg like
> ext2fs_get_free_blocks() does)?

Failure to find a contiguous set of blocks for all the groups meta-data
is not fatal since we don't allocate here. This function just helps
ext2fs_allocate_group_table() find where is should start its search. I
leet ext2fs_allocate_group_table() do the error handling if it truly
can find enough space for the allocation.

> > errcode_t ext2fs_allocate_group_table(ext2_filsys fs, dgrp_t group,
> > ext2fs_block_bitmap bmap)
> > {
> > if (!bmap)
> > bmap = fs->block_map;
> > +
> > + if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> > + EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
>
> No space between ..._FEATURE and "(".

Will fix.

> > + flexbg_size = 1 << fs->super->s_log_groups_per_flex;
> > + rem_grps = flexbg_size - (group % flexbg_size);
> > + last_grp = group + rem_grps - 1;
>
> Could this be written as:
>
> last_grp = group | (flexbg_size - 1);
> rem_grps = last_grp - group;

Will fix.

> I'm also trying to see if you have an off-by-one error in your version?
> Consider the case of group = 5, flexbg_size = 4. That gives us with my calc:
>
> last_grp = 5 | (4 - 1) = 7;
> rem_grps = 7 - 5 = 2;
>
> This makes sense, since group 5 is in the second flexbg [4,7], and there
> are 2 groups remaining after group 5 in that flexbg.

I think you're right. Thanks for catching.

> > @@ -56,6 +120,15 @@ errcode_t ext2fs_allocate_group_table(ext2_filsys fs,
> > } else
> > start_blk = group_blk;
> >
> > + if (flexbg_size) {
> > + int prev_block = 0;
> > + if (group && fs->group_desc[group-1].bg_block_bitmap)
> > + prev_block = fs->group_desc[group-1].bg_block_bitmap;
> > + start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
> > + 0, rem_grps);
> > + last_blk = ext2fs_group_last_block(fs, last_grp);
> > + }
>
> This is presumably trying to allocate the block bitmap for "group" to be
> after the block bitmap for "group - 1" (allocated in an earlier call).
>
> > + if (group && fs->group_desc[group-1].bg_inode_bitmap)
> > + prev_block = fs->group_desc[group-1].bg_inode_bitmap;
> > + start_blk = ext2fs_flexbg_offset(fs, group, prev_block, bmap,
> > + flexbg_size, rem_grps);
> > + last_blk = ext2fs_group_last_block(fs, last_grp);
> > }
>
> This is allocating the inode bitmap in the same manner. Would it make more
> sense to change the mke2fs code to allocate all of the block bitmaps first
> (well, after the sb+gdt backups), then all of the inode bitmaps, and finally
> all of the inode tables?

This is how I originally created the patch but since it was handle
outside ext2_allocate_table, it would mean that it could not be used
for resize2fs. I can be done from inside ext2_allocate_table() but it
seems wrong to allocate all of the groups in a flexbg when we expect
that this function only does it for one single group.

> > #define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not initialized */
> > #define EXT2_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not initialized */
> > +#define EXT2_BG_FLEX_METADATA 0x0004 /* FLEX_BG block group contains meta-data */
>
> Hrm, I thought I had reserved that value in the uninit_groups patch?
> +#define EXT3_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */

I may have been, I just based the patch on the next branch as Ted had
ask for new e2fsprog patches. The uninit group patch was not part of
the next branch when I pulled.

> The kernel and e2fsck can use that to determine if the inode table was
> zeroed out at mke2fs time (i.e. formatted with LAZY_BG or not). Then
> the kernel can zero out the entire inode table and set INODE_ZEROED lazily
> some time after mount, instead of mke2fs doing it at mke2fs time sucking
> up all of the node's RAM and making it very slow.
>
> This is still needed at some point to avoid the problem of e2fsck trying
> to salvage ancient inodes if the group descriptor is corrupted for some
> reason and the inode high watermark is lost.
>
> Not implemented yet... but intended to be done by a helper thread started
> at mount time to do GDT/bitmap/itable checksum validation, create the
> mballoc buddy buffers, prefetch metadata, etc.
>
> > @@ -96,7 +96,7 @@ static void usage(void)
> > {
> > fprintf(stderr, _("Usage: %s [-c|-t|-l filename] [-b block-size] "
> > "[-f fragment-size]\n\t[-i bytes-per-inode] [-I inode-size] "
> > - "[-j] [-J journal-options]\n"
> > + "[-j] [-J journal-options] [-G meta group size]\n"
> > "\t[-N number-of-inodes] [-m reserved-blocks-percentage] "
> > "[-o creator-os]\n\t[-g blocks-per-group] [-L volume-label] "
> > "[-M last-mounted-directory]\n\t[-O feature[,...]] "
>
> This also needs an update to the mke2fs.8.in man page.

Yes it does.

> > + if(flex_bg_size) {
> > + fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
> > + }
>
> Whitespace, braces:

Will fix.

>
> if (flex_bg_size)
> fs_param.s_log_groups_per_flex = int_log2(flex_bg_size);
>
>
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>

Thanks for the feedback.

-JRS

2008-02-11 04:33:54

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG

On Fri, Feb 08, 2008 at 11:37:40AM -0600, Jose R. Santos wrote:
> > > #define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not initialized */
> > > #define EXT2_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not initialized */
> > > +#define EXT2_BG_FLEX_METADATA 0x0004 /* FLEX_BG block group contains meta-data */
> >
> > Hrm, I thought I had reserved that value in the uninit_groups patch?
> > +#define EXT3_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
>
> I may have been, I just based the patch on the next branch as Ted had
> ask for new e2fsprog patches. The uninit group patch was not part of
> the next branch when I pulled.

Yes, but whenever you start reserving code points that impact the
on-disk format, you need to be careful and coordinate. Exactly is the
purpose of this flag, and why is it here?

And I don't see any patch in the kernel patch queue that uses this
flag. Is this intended for internal use inside e2fsprogs? If so,
this might not be the best place for it.....

- Ted

2008-02-11 15:05:50

by Jose R. Santos

[permalink] [raw]
Subject: Re: [PATCH] e2fsprogs: New bitmap and inode table allocation for FLEX_BG

On Sun, 10 Feb 2008 23:33:51 -0500
Theodore Tso <[email protected]> wrote:

> On Fri, Feb 08, 2008 at 11:37:40AM -0600, Jose R. Santos wrote:
> > > > #define EXT2_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not initialized */
> > > > #define EXT2_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not initialized */
> > > > +#define EXT2_BG_FLEX_METADATA 0x0004 /* FLEX_BG block group contains meta-data */
> > >
> > > Hrm, I thought I had reserved that value in the uninit_groups patch?
> > > +#define EXT3_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
> >
> > I may have been, I just based the patch on the next branch as Ted had
> > ask for new e2fsprog patches. The uninit group patch was not part of
> > the next branch when I pulled.
>
> Yes, but whenever you start reserving code points that impact the
> on-disk format, you need to be careful and coordinate. Exactly is the
> purpose of this flag, and why is it here?

Will fix.

> And I don't see any patch in the kernel patch queue that uses this
> flag. Is this intended for internal use inside e2fsprogs? If so,
> this might not be the best place for it.....
>
> - Ted

Currently, this is only used in e2fsprogs to determine which groups to
avoid when setting the EXT2_BG_BLOCK_UNINIT. It will be use on
ext4_init_block_bitmap() to return the right number of free block when
a block group does not have any meta-data in it. Eventually, it would
be nice to accurately and efficiently calculate the number of meta data
block used for a flexbg and be able to have these block groups
uninitialized as well. This flag will be use to determine which groups
need to have their meta data block usage calculated.

-JRS