2007-12-06 22:10:45

by Jose R. Santos

[permalink] [raw]
Subject: [RFC] [PATCH] Flex_BG ialloc awareness V2.

Hi folks,

New version of the Flex_BG ialloc allocation patch.

Changes from the last version:

- Size of the FLEX_BG in now written to the super block at mke2fs time
instead of calculating at mount time (testing patch for e2fsprog's
next branch attached).

- Rename a lots of the confusing "meta" terms as suggested by Andreas.

- Use the Orlov if the FLEX_BG size is 0.

- Use shift instead of divide in ext4_meta_group() as suggested by
Andreas.

- Use sb_bgl_lock() instead of one spinlock per FLEX_BG as suggested by
Andreas. (Needs more perf testing)

- Remove some dead/prototype code.

Signed-off-by: Jose R. Santos <[email protected]>
--

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index b102b0e..7ef9787 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -600,6 +600,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
struct ext4_sb_info *sbi;
int err = 0, ret;
ext4_grpblk_t group_freed;
+ ext4_group_t flex_group;

*pdquot_freed_blocks = 0;
sbi = EXT4_SB(sb);
@@ -745,6 +746,14 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);

+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1610,6 +1619,7 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
unsigned short windowsz = 0;
ext4_group_t ngroups;
unsigned long num = *count;
+ ext4_group_t flex_group;

*errp = -ENOSPC;
sb = inode->i_sb;
@@ -1815,6 +1825,14 @@ allocated:
spin_unlock(sb_bgl_lock(sbi, group_no));
percpu_counter_sub(&sbi->s_freeblocks_counter, num);

+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ flex_group = ext4_flex_group(sbi, group_no);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
if (!fatal)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 17b5df1..35ab8ff 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -158,6 +158,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
int fatal = 0, err;
+ ext4_group_t flex_group;

if (atomic_read(&inode->i_count) > 1) {
printk ("ext4_free_inode: inode has count=%d\n",
@@ -235,6 +236,13 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);

+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes++;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
}
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
@@ -289,6 +297,71 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
return ret;
}

+#define free_block_ratio 10
+
+int find_group_flex(struct super_block *sb, struct inode *parent)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ struct flex_groups *flex_group = sbi->s_flex_groups;
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+ ext4_group_t ngroups = sbi->s_groups_count;
+ int flex_size = ext4_flex_bg_size(sbi);
+ ext4_group_t best_flex = -1;
+ ext4_group_t best_group = -1;
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+ int flex_freeb_ratio;
+ ext4_group_t n_fbg_groups;
+ ext4_group_t i;
+
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) / flex_size;
+ best_flex = parent_fbg_group;
+
+find_close_to_parent:
+ flex_freeb_ratio = flex_group[best_flex].free_blocks*100/blocks_per_flex;
+ if (flex_group[best_flex].free_inodes &&
+ flex_freeb_ratio > free_block_ratio)
+ goto found_flexbg;
+
+ if (best_flex && best_flex == parent_fbg_group) {
+ best_flex--;
+ goto find_close_to_parent;
+ }
+
+ for (i = 0; i < n_fbg_groups; i++) {
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
+ continue;
+
+ flex_freeb_ratio = flex_group[i].free_blocks*100/blocks_per_flex;
+
+ if (flex_freeb_ratio > free_block_ratio &&
+ flex_group[i].free_inodes) {
+ best_flex = i;
+ break;
+ }
+
+ if (best_flex < 0 ||
+ (flex_group[i].free_blocks >
+ flex_group[best_flex].free_blocks &&
+ flex_group[i].free_inodes))
+ best_flex = i;
+ }
+
+found_flexbg:
+ for (i = best_flex * flex_size; i < ngroups &&
+ i < (best_flex + 1) * flex_size; i++) {
+ desc = ext4_get_group_desc(sb, i, &bh);
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ best_group = i;
+ goto out;
+ }
+ }
+out:
+ return best_group;
+}
+
/*
* Orlov's allocator for directories.
*
@@ -504,6 +577,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
struct inode *ret;
ext4_group_t i;
int free = 0;
+ ext4_group_t flex_group;

/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -517,6 +591,13 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)

sbi = EXT4_SB(sb);
es = sbi->s_es;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ group = find_group_flex(sb, dir);
+ goto got_group;
+ }
+
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
@@ -525,6 +606,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
} else
ret2 = find_group_other(sb, dir, &group);

+got_group:
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -681,6 +763,14 @@ got:
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;

+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_groups_per_flex_shift) {
+ flex_group = ext4_flex_group(sbi, group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes--;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
inode->i_gid = dir->i_gid;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b626339..81ad9b1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -518,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1423,6 +1424,61 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}

+static int ext4_fill_flex_info(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *bh;
+ ext4_group_t flex_group_count;
+ ext4_group_t flex_group;
+ unsigned int shift;
+ int groups_per_flex = 0;
+ int tmp = 0;
+ __u64 block_bitmap = 0, cur_block_bitmap;
+ int i;
+
+ groups_per_flex = le16_to_cpu(sbi->s_es->s_flex_bg_size);
+
+ if (!groups_per_flex) {
+ sbi->s_groups_per_flex_shift = 0;
+ return 1;
+ }
+
+ shift = 0;
+ tmp = groups_per_flex;
+ while ((tmp >>= 1UL) != 0UL)
+ shift++;
+
+ sbi->s_groups_per_flex_shift = shift;
+ flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+ groups_per_flex;
+ sbi->s_flex_groups = kmalloc(flex_group_count *
+ sizeof(struct flex_groups), GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
+ goto failed;
+ }
+ memset(sbi->s_flex_groups, 0, flex_group_count *
+ sizeof(struct flex_groups));
+
+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, &bh);
+
+ flex_group = ext4_flex_group(sbi, i);
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ le16_to_cpu(gdp->bg_free_inodes_count);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return 1;
+failed:
+ return 0;
+}
+
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
struct ext4_group_desc *gdp)
{
@@ -2037,6 +2093,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: unable to initialize flex_bg meta info!\n");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index bcdb59d..3b94dbf 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -152,6 +152,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};

+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ __u32 free_inodes;
+ __u32 free_blocks;
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -622,7 +631,9 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __le16 s_flex_bg_size; /* FLEX_BG group size */
+ __le16 padding; /* Padding to next 32bits */
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};

#ifdef __KERNEL__
@@ -1120,6 +1131,17 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
raw_inode->i_size_high = cpu_to_le32(i_size >> 32);
}

+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_groups_per_flex_shift;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_groups_per_flex_shift;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index 744e746..ac7af1b 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -147,6 +147,9 @@ struct ext4_sb_info {

/* locality groups */
struct ext4_locality_group *s_locality_groups;
+
+ unsigned int s_groups_per_flex_shift;
+ struct flex_groups *s_flex_groups;
};
#define EXT4_GROUP_INFO(sb, group) \
EXT4_SB(sb)->s_group_info[(group) >> EXT4_DESC_PER_BLOCK_BITS(sb)] \


Attachments:
(No filename) (11.34 kB)
e2fsprogs-flexbg-grouping.patch (7.74 kB)
Download all attachments

2007-12-07 10:14:34

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Dec 06, 2007 16:10 -0600, Jose R. Santos wrote:
> @@ -600,6 +600,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
> struct ext4_sb_info *sbi;
> int err = 0, ret;
> ext4_grpblk_t group_freed;
> + ext4_group_t flex_group;
>
> *pdquot_freed_blocks = 0;
> sbi = EXT4_SB(sb);
> @@ -745,6 +746,14 @@ do_more:
> spin_unlock(sb_bgl_lock(sbi, block_group));
> percpu_counter_add(&sbi->s_freeblocks_counter, count);
>
> + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
> + sbi->s_groups_per_flex_shift) {
> + flex_group = ext4_flex_group(sbi, block_group);
> + spin_lock(sb_bgl_lock(sbi, flex_group));
> + sbi->s_flex_groups[flex_group].free_blocks += count;
> + spin_unlock(sb_bgl_lock(sbi, flex_group));
> + }

In general, I prefer to keep variables in as local a scope as possible.
In this case, flex_group could be declared inside the "if (EXT4_HAS_INCOMPAT"
check.

> @@ -1610,6 +1619,7 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
> unsigned short windowsz = 0;
> ext4_group_t ngroups;
> unsigned long num = *count;
> + ext4_group_t flex_group;
>
> *errp = -ENOSPC;
> sb = inode->i_sb;
> @@ -1815,6 +1825,14 @@ allocated:
> spin_unlock(sb_bgl_lock(sbi, group_no));
> percpu_counter_sub(&sbi->s_freeblocks_counter, num);
>
> + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
> + sbi->s_groups_per_flex_shift) {
> + flex_group = ext4_flex_group(sbi, group_no);
> + spin_lock(sb_bgl_lock(sbi, flex_group));
> + sbi->s_flex_groups[flex_group].free_blocks -= num;
> + spin_unlock(sb_bgl_lock(sbi, flex_group));
> + }

Same as above.

> @@ -158,6 +158,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
> struct ext4_super_block * es;
> struct ext4_sb_info *sbi;
> int fatal = 0, err;
> + ext4_group_t flex_group;
>
> if (atomic_read(&inode->i_count) > 1) {
> printk ("ext4_free_inode: inode has count=%d\n",
> @@ -235,6 +236,13 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
> if (is_directory)
> percpu_counter_dec(&sbi->s_dirs_counter);
>
> + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
> + sbi->s_groups_per_flex_shift) {
> + flex_group = ext4_flex_group(sbi, block_group);
> + spin_lock(sb_bgl_lock(sbi, flex_group));
> + sbi->s_flex_groups[flex_group].free_inodes++;
> + spin_unlock(sb_bgl_lock(sbi, flex_group));
> + }

Same as above...

> +#define free_block_ratio 10
> +
> +int find_group_flex(struct super_block *sb, struct inode *parent)
> +{
> + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) / flex_size;
> + best_flex = parent_fbg_group;
> +
> +find_close_to_parent:
> + flex_freeb_ratio = flex_group[best_flex].free_blocks*100/blocks_per_flex;

There is no particular reason that this ratio needs to be "*100", it could
just as easily be a fraction of 256 and make the multiply into a shift.
The free_block_ratio would be 26 in that case.

> + for (i = 0; i < n_fbg_groups; i++) {
> + if (i == parent_fbg_group || i == parent_fbg_group - 1)
> + continue;

It seems this scans flex groups the way we used to scan groups?

> +found_flexbg:
> + for (i = best_flex * flex_size; i < ngroups &&
> + i < (best_flex + 1) * flex_size; i++) {

And now that we've found a suitable flex group, we need to find which
block group therein has some free inodes...

> +static int ext4_fill_flex_info(struct super_block *sb)
> +{

It still seems desirable to have a single per-group array instead of

> @@ -622,7 +631,9 @@ struct ext4_super_block {
> __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
> __le64 s_mmp_block; /* Block for multi-mount protection */
> __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
> - __u32 s_reserved[163]; /* Padding to the end of the block */
> + __le16 s_flex_bg_size; /* FLEX_BG group size */

Shouldn't this be "s_flex_bg_bits"?

> +{
> + return block_group >> sbi->s_groups_per_flex_shift;
> +}
> +
> +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
> +{
> + return 1 << sbi->s_groups_per_flex_shift;
> +}
> +
> #define ext4_std_error(sb, errno) \
> do { \
> if ((errno)) \

> diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
> --- a/lib/ext2fs/alloc_tables.c
> +++ b/lib/ext2fs/alloc_tables.c
> + if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> + EXT4_FEATURE_INCOMPAT_FLEX_BG))
> + ext2fs_allocate_flex_groups(fs);
> +
> + else {
> + for (i = 0; i < fs->group_desc_count; i++) {
> + retval = ext2fs_allocate_group_table(fs, i, fs->block_map);
> + if (retval)
> + return retval;
> + }
> }

My preference would be to have "if (EXT2_HAS_INCOMPAT...) { ... } else {"
(i.e. add { } for the first part) since there are { } on the second part,
and it is just easier to read.

> @@ -1045,6 +1046,19 @@ static void PRS(int argc, char *argv[])
> + if ((flex_bg_size & (flex_bg_size-1)) != 0) {
> + com_err(program_name, 0,
> + _("Flex_BG size must be a power of 2"));
> + exit(1);

If flex_bg_size is a power of two then there isn't any need to store anything
except __u8 s_flex_bg_bits in the superblock.

> @@ -1444,6 +1458,10 @@ static void PRS(int argc, char *argv[])
> + if(flex_bg_size) {
> + fs_param.s_flex_bg_size = ext2fs_swab16(flex_bg_size);
> + }

Space between if and (, and no need for braces for a single line body.

It would also be nice to get a m_flexbg test case along with this patch
that (at minimum) creates a filesystem with flexbg enabled, and then
runs e2fsck on it. This was broken for the lazy_bg feature for a long
time, so it makes sense to add a test to verify each new feature has
some basic functionality. If the f_random_corruption test is in the
git tree, it would be good to add the flex_bg option to the list of
possible feature combinations to test.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2007-12-07 15:52:39

by Jose R. Santos

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Fri, 7 Dec 2007 03:14:28 -0700
Andreas Dilger <[email protected]> wrote:

> On Dec 06, 2007 16:10 -0600, Jose R. Santos wrote:
> > @@ -600,6 +600,7 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
> > struct ext4_sb_info *sbi;
> > int err = 0, ret;
> > ext4_grpblk_t group_freed;
> > + ext4_group_t flex_group;
> >
> > *pdquot_freed_blocks = 0;
> > sbi = EXT4_SB(sb);
> > @@ -745,6 +746,14 @@ do_more:
> > spin_unlock(sb_bgl_lock(sbi, block_group));
> > percpu_counter_add(&sbi->s_freeblocks_counter, count);
> >
> > + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
> > + sbi->s_groups_per_flex_shift) {
> > + flex_group = ext4_flex_group(sbi, block_group);
> > + spin_lock(sb_bgl_lock(sbi, flex_group));
> > + sbi->s_flex_groups[flex_group].free_blocks += count;
> > + spin_unlock(sb_bgl_lock(sbi, flex_group));
> > + }
>
> In general, I prefer to keep variables in as local a scope as possible.
> In this case, flex_group could be declared inside the "if (EXT4_HAS_INCOMPAT"
> check.

Ok.

> > +#define free_block_ratio 10
> > +
> > +int find_group_flex(struct super_block *sb, struct inode *parent)
> > +{
> > + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) / flex_size;
> > + best_flex = parent_fbg_group;
> > +
> > +find_close_to_parent:
> > + flex_freeb_ratio = flex_group[best_flex].free_blocks*100/blocks_per_flex;
>
> There is no particular reason that this ratio needs to be "*100", it could
> just as easily be a fraction of 256 and make the multiply into a shift.
> The free_block_ratio would be 26 in that case.

The idea here is to reserve 10% (free_block_ratio) of free blocks in a
flexbg for allocation of new files and expansion of existing one. The
"*100" make the math here easy but this still something that need to be
tune further. I'm sure we can do this in a series of shifts, just
haven't spent the time thinking of a clever way to do this.

Although, given all the multiplies, divides, endian changes that occur
while using Orlov, I'm not so concern about this right now.

> > + for (i = 0; i < n_fbg_groups; i++) {
> > + if (i == parent_fbg_group || i == parent_fbg_group - 1)
> > + continue;
>
> It seems this scans flex groups the way we used to scan groups?

No. It does something slightly different, the scan does not start from
the parent group forward. This help compress data as much as possible
in the disk and helps avoid large seeks. Reclaiming as much used
groups as possible will also help uninitialized block groups by avoiding
using groups when there is perfectly good unused space at the beginning
of the disk. Currently the search starts at the first flexbg but for
very large filesystems, this should be tune to start at a location
closer to the parents flex group. This is another area where the patch
needs more tuning, but I was hopping people would give this patch a
try to see what deficiencies they find before going into lengthy disk
testing/tuning cycle.

> > +found_flexbg:
> > + for (i = best_flex * flex_size; i < ngroups &&
> > + i < (best_flex + 1) * flex_size; i++) {
>
> And now that we've found a suitable flex group, we need to find which
> block group therein has some free inodes...

Yes, but we treat all inode tables in a flex group as one giant table
to improve locality and reduce initialization of inode tables to
improve fsck time.

>
> > +static int ext4_fill_flex_info(struct super_block *sb)
> > +{
>
> It still seems desirable to have a single per-group array instead of

?

> > @@ -622,7 +631,9 @@ struct ext4_super_block {
> > __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
> > __le64 s_mmp_block; /* Block for multi-mount protection */
> > __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
> > - __u32 s_reserved[163]; /* Padding to the end of the block */
> > + __le16 s_flex_bg_size; /* FLEX_BG group size */
>
> Shouldn't this be "s_flex_bg_bits"?

I debated whether to store this as the s_flex_bg_size and calculate the
bits during the filesystem mount time or just stored the bit in the
super block to begging with. The reason I stored the size is that it
seemed more in line with the other fields in the super block. I don't
mind either way since this is more of a style issue, although saving an
extra 8bits in the super block may be good enough reason to change it.

> > +{
> > + return block_group >> sbi->s_groups_per_flex_shift;
> > +}
> > +
> > +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
> > +{
> > + return 1 << sbi->s_groups_per_flex_shift;
> > +}
> > +
> > #define ext4_std_error(sb, errno) \
> > do { \
> > if ((errno)) \
>
> > diff --git a/lib/ext2fs/alloc_tables.c b/lib/ext2fs/alloc_tables.c
> > --- a/lib/ext2fs/alloc_tables.c
> > +++ b/lib/ext2fs/alloc_tables.c
> > + if (EXT2_HAS_INCOMPAT_FEATURE (fs->super,
> > + EXT4_FEATURE_INCOMPAT_FLEX_BG))
> > + ext2fs_allocate_flex_groups(fs);
> > +
> > + else {
> > + for (i = 0; i < fs->group_desc_count; i++) {
> > + retval = ext2fs_allocate_group_table(fs, i, fs->block_map);
> > + if (retval)
> > + return retval;
> > + }
> > }
>
> My preference would be to have "if (EXT2_HAS_INCOMPAT...) { ... } else {"
> (i.e. add { } for the first part) since there are { } on the second part,
> and it is just easier to read.

Mine too, but checkpatch complained about this. :)

>
> > @@ -1045,6 +1046,19 @@ static void PRS(int argc, char *argv[])
> > + if ((flex_bg_size & (flex_bg_size-1)) != 0) {
> > + com_err(program_name, 0,
> > + _("Flex_BG size must be a power of 2"));
> > + exit(1);
>
> If flex_bg_size is a power of two then there isn't any need to store anything
> except __u8 s_flex_bg_bits in the superblock.

Agree.

> > @@ -1444,6 +1458,10 @@ static void PRS(int argc, char *argv[])
> > + if(flex_bg_size) {
> > + fs_param.s_flex_bg_size = ext2fs_swab16(flex_bg_size);
> > + }
>
> Space between if and (, and no need for braces for a single line body.

This patch is intended for testing and need a lot of work. I still
haven't looked at any of the styling issues, but I'm surprise this is
the only one you found. :)

> It would also be nice to get a m_flexbg test case along with this patch
> that (at minimum) creates a filesystem with flexbg enabled, and then
> runs e2fsck on it. This was broken for the lazy_bg feature for a long
> time, so it makes sense to add a test to verify each new feature has
> some basic functionality. If the f_random_corruption test is in the
> git tree, it would be good to add the flex_bg option to the list of
> possible feature combinations to test.

Yes, it's on my todo list. First I need to get a patch that meta-data
allocation patch that Ted is willing to put into the e2fsprog's next
branch. After that, I will add test case for the feature.

> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>

Thanks

-JRS

2007-12-11 11:00:35

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Dec 07, 2007 09:52 -0600, Jose R. Santos wrote:
> Andreas Dilger <[email protected]> wrote:
> > There is no particular reason that this ratio needs to be "*100", it could
> > just as easily be a fraction of 256 and make the multiply into a shift.
> > The free_block_ratio would be 26 in that case.
>
> The idea here is to reserve 10% (free_block_ratio) of free blocks in a
> flexbg for allocation of new files and expansion of existing one. The
> "*100" make the math here easy but this still something that need to be
> tune further. I'm sure we can do this in a series of shifts, just
> haven't spent the time thinking of a clever way to do this.

This is a common misconception for code to have 10% mean 10 / 100. It
is just as good to have 26/256

> > > @@ -622,7 +631,9 @@ struct ext4_super_block {
> > > __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
> > > __le64 s_mmp_block; /* Block for multi-mount protection */
> > > __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
> > > - __u32 s_reserved[163]; /* Padding to the end of the block */
> > > + __le16 s_flex_bg_size; /* FLEX_BG group size */
> >
> > Shouldn't this be "s_flex_bg_bits"?
>
> I debated whether to store this as the s_flex_bg_size and calculate the
> bits during the filesystem mount time or just stored the bit in the
> super block to begging with. The reason I stored the size is that it
> seemed more in line with the other fields in the super block. I don't
> mind either way since this is more of a style issue, although saving an
> extra 8bits in the super block may be good enough reason to change it.

I'd think being able to avoid the divide for every inode allocation is more
important than 8 bits in the superblock.

> > My preference would be to have "if (EXT2_HAS_INCOMPAT...) { ... } else {"
> > (i.e. add { } for the first part) since there are { } on the second part,
> > and it is just easier to read.
>
> Mine too, but checkpatch complained about this. :)

Time to fix checkpatch it would seem.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2007-12-11 18:18:26

by Jose R. Santos

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Tue, 11 Dec 2007 04:00:33 -0700
Andreas Dilger <[email protected]> wrote:

> On Dec 07, 2007 09:52 -0600, Jose R. Santos wrote:
> > Andreas Dilger <[email protected]> wrote:
> > > There is no particular reason that this ratio needs to be "*100", it could
> > > just as easily be a fraction of 256 and make the multiply into a shift.
> > > The free_block_ratio would be 26 in that case.
> >
> > The idea here is to reserve 10% (free_block_ratio) of free blocks in a
> > flexbg for allocation of new files and expansion of existing one. The
> > "*100" make the math here easy but this still something that need to be
> > tune further. I'm sure we can do this in a series of shifts, just
> > haven't spent the time thinking of a clever way to do this.
>
> This is a common misconception for code to have 10% mean 10 / 100. It
> is just as good to have 26/256

I understand that part, but my point is that changing the multiply to
256 doesn't do anything to eliminate the divide by blocks_per_flex.
Give that is more common to have an arch with no divide instruction
than one with no multiply, it seems more important to take care of the
divide by blocks_per_flex rather than the multiply by 100.

We could store the blocks_per_flex_bits in the sbi to do this but the
last flexbg is not guarantied to be the same size as the other flexbg
so it needs to be treated differently.

Hum... Now that I think of it, the last flexbg is not treated
differently on the current patch either. Looks like I found a bug. :)

> > > > @@ -622,7 +631,9 @@ struct ext4_super_block {
> > > > __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
> > > > __le64 s_mmp_block; /* Block for multi-mount protection */
> > > > __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
> > > > - __u32 s_reserved[163]; /* Padding to the end of the block */
> > > > + __le16 s_flex_bg_size; /* FLEX_BG group size */
> > >
> > > Shouldn't this be "s_flex_bg_bits"?
> >
> > I debated whether to store this as the s_flex_bg_size and calculate the
> > bits during the filesystem mount time or just stored the bit in the
> > super block to begging with. The reason I stored the size is that it
> > seemed more in line with the other fields in the super block. I don't
> > mind either way since this is more of a style issue, although saving an
> > extra 8bits in the super block may be good enough reason to change it.
>
> I'd think being able to avoid the divide for every inode allocation is more
> important than 8 bits in the superblock.

We already avoid the divide since what we store in the sbi IS the bits
which are calculated at mount time for each fs. Base on the other
fields in the super block struct, I decided to put explicit size of the
flexbg in the super block. The kernel code can decide how best to use
that number which in this case its used to calculate the number of bits
in order to avoid doing divides.

So this is really a styling issue in how to record data in the super
block. The only technical issue with this is whether it's important to
save those extra 8 bits in the super block struct.

> > > My preference would be to have "if (EXT2_HAS_INCOMPAT...) { ... } else {"
> > > (i.e. add { } for the first part) since there are { } on the second part,
> > > and it is just easier to read.
> >
> > Mine too, but checkpatch complained about this. :)
>
> Time to fix checkpatch it would seem.
>
> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>



-JRS

2007-12-11 23:15:32

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Dec 11, 2007 10:08 -0600, Jose R. Santos wrote:
> > I'd think being able to avoid the divide for every inode allocation is more
> > important than 8 bits in the superblock.
>
> We already avoid the divide since what we store in the sbi IS the bits
> which are calculated at mount time for each fs. Base on the other
> fields in the super block struct, I decided to put explicit size of the
> flexbg in the super block. The kernel code can decide how best to use
> that number which in this case its used to calculate the number of bits
> in order to avoid doing divides.
>
> So this is really a styling issue in how to record data in the super
> block. The only technical issue with this is whether it's important to
> save those extra 8 bits in the super block struct.

Well, if it is stored in the superblock as a non-power-of-two value, then
there always exists the possibility that it is set incorrectly (maybe by
a version of mke2fs that doesn't verify this) and the code will not do the
right thing. Storing it in bits (as is done with e.g. s_log_block_size and
s_log_frag_size) ensures there is no possibility of a value that isn't a
power-of-two.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2007-12-13 15:51:26

by Jose R. Santos

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Tue, 11 Dec 2007 16:15:28 -0700
Andreas Dilger <[email protected]> wrote:

> On Dec 11, 2007 10:08 -0600, Jose R. Santos wrote:
> > > I'd think being able to avoid the divide for every inode allocation is more
> > > important than 8 bits in the superblock.
> >
> > We already avoid the divide since what we store in the sbi IS the bits
> > which are calculated at mount time for each fs. Base on the other
> > fields in the super block struct, I decided to put explicit size of the
> > flexbg in the super block. The kernel code can decide how best to use
> > that number which in this case its used to calculate the number of bits
> > in order to avoid doing divides.
> >
> > So this is really a styling issue in how to record data in the super
> > block. The only technical issue with this is whether it's important to
> > save those extra 8 bits in the super block struct.
>
> Well, if it is stored in the superblock as a non-power-of-two value, then
> there always exists the possibility that it is set incorrectly (maybe by
> a version of mke2fs that doesn't verify this) and the code will not do the
> right thing. Storing it in bits (as is done with e.g. s_log_block_size and
> s_log_frag_size) ensures there is no possibility of a value that isn't a
> power-of-two.

While I don't necessary buy the mke2fs example (the only patch that
set this already checks for power-of-two), you are right about the
possibility of being set incorrectly. I will change it to store the
bits in the next release which I'll do after I fix the resize2fs issues
since this will require changes to the e2fsprogs as well.

Now, storing the bits only guaranties that the flexbg size is always a
power-of-two and does not guarantee that the super block flexbg size
represents the actual meta-data grouping on disk. For this we need to
verify that the bitmap offsets match what the super block reports. It
may be an unlikely scenario, but it may be worth it to check this as
well at mount time.

> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>


-JRS

2007-12-13 22:58:59

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Dec 13, 2007 09:51 -0600, Jose R. Santos wrote:
> Now, storing the bits only guaranties that the flexbg size is always a
> power-of-two and does not guarantee that the super block flexbg size
> represents the actual meta-data grouping on disk. For this we need to
> verify that the bitmap offsets match what the super block reports. It
> may be an unlikely scenario, but it may be worth it to check this as
> well at mount time.

I'm not sure what you mean... Isn't the flexbg size just a count of
the number of block groups? If it is always a power of two, and the
groups per metabg is always a power of two (it is) then they will
always be even multiples.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2007-12-14 02:36:20

by Jose R. Santos

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Thu, 13 Dec 2007 15:58:57 -0700
Andreas Dilger <[email protected]> wrote:

> On Dec 13, 2007 09:51 -0600, Jose R. Santos wrote:
> > Now, storing the bits only guaranties that the flexbg size is always a
> > power-of-two and does not guarantee that the super block flexbg size
> > represents the actual meta-data grouping on disk. For this we need to
> > verify that the bitmap offsets match what the super block reports. It
> > may be an unlikely scenario, but it may be worth it to check this as
> > well at mount time.
>
> I'm not sure what you mean... Isn't the flexbg size just a count of
> the number of block groups? If it is always a power of two, and the
> groups per metabg is always a power of two (it is) then they will
> always be even multiples.

What I mean is that if the value in the super block is corrupted and
does not represent the actual flexbg size, the inode allocation will
behave in weird unexpected ways. Just as we check that the bitmaps are
within the block group range (when not using flexbg), we should
probably sanity check the size of the flexbg as reported in the super
block.

Or do you believe the check is unnecessary?

> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>



-JRS

2007-12-14 17:01:08

by Andreas Dilger

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Dec 13, 2007 20:36 -0600, Jose R. Santos wrote:
> ... if the value in the super block is corrupted and
> does not represent the actual flexbg size, the inode allocation will
> behave in weird unexpected ways. Just as we check that the bitmaps are
> within the block group range (when not using flexbg), we should
> probably sanity check the size of the flexbg as reported in the super
> block.
>
> Or do you believe the check is unnecessary?

Well, I can imagine in some cases that the flexbg will not be completely
contiguous on disk (e.g. after a filesystem resize, if there are bad
blocks, etc). As long as the group descriptors themselves are correct
(i.e. referencing valid bitmaps/itable) then it shouldn't cause a mount
failure if the per-group data isn't strictly aligned according to the
superblock flexbg count.

We would need to validate the group descriptor separately though (e.g.
group checksums).

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.

2007-12-14 18:07:43

by Jose R. Santos

[permalink] [raw]
Subject: Re: [RFC] [PATCH] Flex_BG ialloc awareness V2.

On Fri, 14 Dec 2007 10:01:06 -0700
Andreas Dilger <[email protected]> wrote:
> Well, I can imagine in some cases that the flexbg will not be completely
> contiguous on disk (e.g. after a filesystem resize, if there are bad
> blocks, etc). As long as the group descriptors themselves are correct
> (i.e. referencing valid bitmaps/itable) then it shouldn't cause a mount
> failure if the per-group data isn't strictly aligned according to the
> superblock flexbg count.

Yes, the meta-data may not be completely contiguous on the disk as per
the definition of flexbg. What I was planing on doing was to check the
first, second and last-1 flexbg to see if how the meta-data is
arranged. If none of those flexbg matches the size of the flexbg size
in the super block the we can set sbi->s_groups_per_flex_shift to zero
which would make the fs fallback to Orlov.

> We would need to validate the group descriptor separately though (e.g.
> group checksums).

Agree

> Cheers, Andreas
> --
> Andreas Dilger
> Sr. Staff Engineer, Lustre Group
> Sun Microsystems of Canada, Inc.
>

-JRS