2009-02-18 15:43:26

by Theodore Ts'o

Subject: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

The find_group_flex() inode allocator is now only used if the
filesystem is mounted using the "oldalloc" mount option. It is
replaced with the original Orlov allocator that has been updated for
flex_bg filesystems (it should behave the same way if flex_bg is
disabled). The inode allocator now functions by taking into account
each flex_bg group, instead of each block group, when deciding whether
or not it's time to allocate a new directory into a fresh flex_bg.

The block allocator has also been changed so that the first block
group in each flex_bg is preferred for use for storing directory
blocks. This keeps directory blocks close together, which is good for
speeding up e2fsck since large directories are more likely to look
like this:

debugfs: stat /home/tytso/Maildir/cur
Inode: 1844562 Type: directory Mode: 0700 Flags: 0x81000
Generation: 1132745781 Version: 0x00000000:0000ad71
User: 15806 Group: 15806 Size: 1060864
File ACL: 0 Directory ACL: 0
Links: 2 Blockcount: 2072
Fragment: Address: 0 Number: 0 Size: 0
ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009
atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009
mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009
crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009
Size of extra inode fields: 28
BLOCKS:
(0):7348651, (1-258):7348654-7348911
TOTAL: 259

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4_i.h | 3 +
fs/ext4/extents.c | 17 ++++-
fs/ext4/ialloc.c | 199 ++++++++++++++++++++++++++++++++++++++++-------------
fs/ext4/inode.c | 18 +++++-
4 files changed, 186 insertions(+), 51 deletions(-)

diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 2d516c0..4ce2187 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -122,6 +122,9 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;

+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+
/* allocation reservation info for delalloc */
unsigned int i_reserved_data_blocks;
unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e2eab19..0aeb8c2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
int depth;

if (path) {
@@ -170,10 +172,23 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
}

/* OK. use inode's group */
- bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+ block_group = ei->i_block_group;
+ if (flex_size >= 4) {
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21080ab..212e5be 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -409,6 +409,42 @@ out:
}

/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg
+ */
+struct orlov_stats {
+ __u32 free_inodes;
+ __u32 free_blocks;
+ __u32 used_dirs;
+};
+
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+ int flex_size, struct orlov_stats *stats)
+{
+ struct ext4_group_desc *desc;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ int i;
+
+ stats->free_inodes = 0;
+ stats->free_blocks = 0;
+ stats->used_dirs = 0;
+
+ g *= flex_size;
+
+ for (i = 0; i < flex_size; i++) {
+ if (g >= ngroups)
+ break;
+ desc = ext4_get_group_desc(sb, g++, NULL);
+ if (!desc)
+ continue;
+
+ stats->free_inodes += ext4_free_inodes_count(sb, desc);
+ stats->free_blocks += ext4_free_blks_count(sb, desc);
+ stats->used_dirs += ext4_used_dirs_count(sb, desc);
+ }
+}
+
+/*
* Orlov's allocator for directories.
*
* We always try to spread first-level directories.
@@ -423,7 +459,6 @@ out:
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
@@ -437,7 +472,7 @@ out:
#define BLOCK_COST 256

static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -446,12 +481,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext4_fsblk_t freeb, avefreeb;
- ext4_fsblk_t blocks_per_dir;
unsigned int ndirs;
- int max_debt, max_dirs, min_inodes;
+ int max_dirs, min_inodes;
ext4_grpblk_t min_blocks;
- ext4_group_t i;
+ ext4_group_t i, grp, g;
struct ext4_group_desc *desc;
+ struct orlov_stats stats;
+ int flex_size = ext4_flex_bg_size(sbi);
+
+ if (flex_size < 4)
+ flex_size = 1;
+ else {
+ ngroups = (ngroups + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }

freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
@@ -460,71 +504,85 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
do_div(avefreeb, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

- if ((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+ if (S_ISDIR(mode) &&
+ ((parent == sb->s_root->d_inode) ||
+ (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
int best_ndir = inodes_per_group;
- ext4_group_t grp;
int ret = -1;

get_random_bytes(&grp, sizeof(grp));
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
- grp = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, grp, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
+ g = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, g, flex_size, &stats);
+ if (!stats.free_inodes)
continue;
- if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+ if (stats.used_dirs >= best_ndir)
continue;
- if (ext4_free_inodes_count(sb, desc) < avefreei)
+ if (stats.free_inodes < avefreei)
continue;
- if (ext4_free_blks_count(sb, desc) < avefreeb)
+ if (stats.free_blocks < avefreeb)
continue;
- *group = grp;
+ grp = g;
ret = 0;
- best_ndir = ext4_used_dirs_count(sb, desc);
+ best_ndir = stats.used_dirs;
+ }
+ if (ret)
+ goto fallback;
+ found_flex_bg:
+ if (flex_size == 1) {
+ *group = grp;
+ return 0;
+ }
+
+ grp *= flex_size;
+ for (i = 1; i < flex_size; i++) {
+ if (grp+i >= sbi->s_groups_count)
+ break;
+ desc = ext4_get_group_desc(sb, grp+i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = grp+i;
+ return 0;
+ }
+ }
+ desc = ext4_get_group_desc(sb, grp, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = grp;
+ return 0;
}
- if (ret == 0)
- return ret;
goto fallback;
}

- blocks_per_dir = ext4_blocks_count(es) - freeb;
- do_div(blocks_per_dir, ndirs);
-
max_dirs = ndirs / ngroups + inodes_per_group / 16;
- min_inodes = avefreei - inodes_per_group / 4;
- min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
- max_debt = EXT4_BLOCKS_PER_GROUP(sb);
- max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
- if (max_debt * INODE_COST > inodes_per_group)
- max_debt = inodes_per_group / INODE_COST;
- if (max_debt > 255)
- max_debt = 255;
- if (max_debt == 0)
- max_debt = 1;
+ min_inodes = avefreei - inodes_per_group*flex_size / 4;
+ if (min_inodes < 1)
+ min_inodes = 1;
+ min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;

for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
+ grp = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, grp, flex_size, &stats);
+ if (stats.used_dirs >= max_dirs)
continue;
- if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+ if (stats.free_inodes < min_inodes)
continue;
- if (ext4_free_inodes_count(sb, desc) < min_inodes)
+ if (stats.free_blocks < min_blocks)
continue;
- if (ext4_free_blks_count(sb, desc) < min_blocks)
- continue;
- return 0;
+ goto found_flex_bg;
}

fallback:
+ ngroups = sbi->s_groups_count;
+ avefreei = freei / ngroups;
+ parent_group = EXT4_I(parent)->i_block_group;
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
+ grp = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc(sb, grp, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_inodes_count(sb, desc) >= avefreei)
+ ext4_free_inodes_count(sb, desc) >= avefreei) {
+ *group = grp;
return 0;
+ }
}

if (avefreei) {
@@ -540,12 +598,54 @@ fallback:
}

static int find_group_other(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
struct ext4_group_desc *desc;
- ext4_group_t i;
+ ext4_group_t i, last;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+ /*
+ * If we are doing flex_bg style allocation, try to put
+ * special inodes in the first block group; start files and
+ * directories at the 2nd block group in the flex_bg.
+ */
+ if (flex_size >= 4) {
+ int ret, retry = 0;
+
+ try_again:
+ i = (parent_group & ~(flex_size-1));
+ last = i+flex_size;
+ if (last > ngroups)
+ last = ngroups;
+ if (S_ISDIR(mode) || S_ISREG(mode))
+ i++;
+ while (i < last) {
+ desc = ext4_get_group_desc(sb, i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = i;
+ return 0;
+ }
+ i++;
+ }
+ if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+ retry = 1;
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ goto try_again;
+ }
+ /*
+ * If this didn't work, use the Orlov search
+ * algorithm; we pass in the mode to avoid the subdir
+ * of topdir algorithms. If successful, save the
+ * group that we used so that future allocations in
+ * that directory are stable.
+ */
+ ret = find_group_orlov(sb, parent, &parent_group, mode);
+ if (ret == 0)
+ EXT4_I(parent)->i_last_alloc_group = parent_group;
+ return ret;
+ }

/*
* Try to place the inode in its parent directory
@@ -713,10 +813,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;

- if (sbi->s_log_groups_per_flex) {
+ if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
ret2 = find_group_flex(sb, dir, &group);
if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);
if (ret2 == 0)
printk(KERN_NOTICE "ext4: find_group_flex "
"failed, fallback succeeded dir %lu\n",
@@ -729,9 +829,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (test_opt(sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
else
- ret2 = find_group_orlov(sb, dir, &group);
+ ret2 = find_group_orlov(sb, dir, &group, mode);
} else
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);

got_group:
err = -ENOSPC;
@@ -890,6 +990,7 @@ got:
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_group = group;
+ ei->i_last_alloc_group = ~0;

ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cbd2ca9..12f1374 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -459,6 +459,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));

/* Try to find previous block */
for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +476,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
* It is going to be referred to from the inode itself? OK, just put it
* into the same cylinder group then.
*/
- bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+ block_group = ei->i_block_group;
+ if (flex_size >= 4) {
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -4250,6 +4265,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_disksize = inode->i_size;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
+ ei->i_last_alloc_group = ~0;
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
--
1.5.6.3



2009-02-24 09:00:04

by Aneesh Kumar K.V

Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

> 4 files changed, 186 insertions(+), 51 deletions(-)
>
> diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
> index 2d516c0..4ce2187 100644
> --- a/fs/ext4/ext4_i.h
> +++ b/fs/ext4/ext4_i.h
> @@ -122,6 +122,9 @@ struct ext4_inode_info {
> struct list_head i_prealloc_list;
> spinlock_t i_prealloc_lock;
>
> + /* ialloc */
> + ext4_group_t i_last_alloc_group;
> +
> /* allocation reservation info for delalloc */
> unsigned int i_reserved_data_blocks;
> unsigned int i_reserved_meta_blocks;
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index e2eab19..0aeb8c2 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
> ext4_fsblk_t bg_start;
> ext4_fsblk_t last_block;
> ext4_grpblk_t colour;
> + ext4_group_t block_group;
> + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
> int depth;
>
> if (path) {
> @@ -170,10 +172,23 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
> }
>
> /* OK. use inode's group */
> - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
> + block_group = ei->i_block_group;
> + if (flex_size >= 4) {
> + block_group &= ~(flex_size-1);
> + if (S_ISREG(inode->i_mode))
> + block_group++;
> + }


Can you explain why we select 4 here?


Also, add a comment explaining that directory/symlink block allocation
goes to the first group of a flex_bg and regular files go to the second
group, and which type of workload that would help. You have the comment
in the commit message.


> + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
> le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
> last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
>
> + /*
> + * If we are doing delayed allocation, we don't need take
> + * colour into account.
> + */
> + if (test_opt(inode->i_sb, DELALLOC))
> + return bg_start;
> +

Again, why don't we want to look at colour for delayed allocation?


> if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
> colour = (current->pid % 16) *
> (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 21080ab..212e5be 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -409,6 +409,42 @@ out:
> }
>
> /*
> + * Helper function for Orlov's allocator; returns critical information
> + * for a particular block group or flex_bg
> + */
> +struct orlov_stats {
> + __u32 free_inodes;
> + __u32 free_blocks;
> + __u32 used_dirs;
> +};
> +
> +void get_orlov_stats(struct super_block *sb, ext4_group_t g,
> + int flex_size, struct orlov_stats *stats)
> +{
> + struct ext4_group_desc *desc;
> + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
> + int i;
> +
> + stats->free_inodes = 0;
> + stats->free_blocks = 0;
> + stats->used_dirs = 0;
> +
> + g *= flex_size;


Can you add a comment to the function saying that g can be a flex group
number or an actual group number, depending on flex_size? Without that
comment the above operation can be confusing.

> +
> + for (i = 0; i < flex_size; i++) {
> + if (g >= ngroups)
> + break;
> + desc = ext4_get_group_desc(sb, g++, NULL);
> + if (!desc)
> + continue;
> +
> + stats->free_inodes += ext4_free_inodes_count(sb, desc);
> + stats->free_blocks += ext4_free_blks_count(sb, desc);
> + stats->used_dirs += ext4_used_dirs_count(sb, desc);
> + }
> +}
> +
> +/*
> * Orlov's allocator for directories.
> *
> * We always try to spread first-level directories.
> @@ -423,7 +459,6 @@ out:
> * it has too many directories already (max_dirs) or
> * it has too few free inodes left (min_inodes) or
> * it has too few free blocks left (min_blocks) or
> - * it's already running too large debt (max_debt).
> * Parent's group is preferred, if it doesn't satisfy these
> * conditions we search cyclically through the rest. If none
> * of the groups look good we just look for a group with more
> @@ -437,7 +472,7 @@ out:
> #define BLOCK_COST 256

You can also remove the further description about debt, and the
INODE_COST and BLOCK_COST defines



>
> static int find_group_orlov(struct super_block *sb, struct inode *parent,
> - ext4_group_t *group)
> + ext4_group_t *group, int mode)
> {
> ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -446,12 +481,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
> int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
> unsigned int freei, avefreei;
> ext4_fsblk_t freeb, avefreeb;
> - ext4_fsblk_t blocks_per_dir;
> unsigned int ndirs;
> - int max_debt, max_dirs, min_inodes;
> + int max_dirs, min_inodes;
> ext4_grpblk_t min_blocks;
> - ext4_group_t i;
> + ext4_group_t i, grp, g;
> struct ext4_group_desc *desc;
> + struct orlov_stats stats;
> + int flex_size = ext4_flex_bg_size(sbi);
> +
> + if (flex_size < 4)
> + flex_size = 1;
> + else {
> + ngroups = (ngroups + flex_size - 1) >>
> + sbi->s_log_groups_per_flex;
> + parent_group >>= sbi->s_log_groups_per_flex;
> + }
>
> freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
> avefreei = freei / ngroups;
> @@ -460,71 +504,85 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
> do_div(avefreeb, ngroups);
> ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
>
> - if ((parent == sb->s_root->d_inode) ||
> - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
> + if (S_ISDIR(mode) &&
> + ((parent == sb->s_root->d_inode) ||
> + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
> int best_ndir = inodes_per_group;
> - ext4_group_t grp;
> int ret = -1;
>
> get_random_bytes(&grp, sizeof(grp));
> parent_group = (unsigned)grp % ngroups;
> for (i = 0; i < ngroups; i++) {
> - grp = (parent_group + i) % ngroups;
> - desc = ext4_get_group_desc(sb, grp, NULL);
> - if (!desc || !ext4_free_inodes_count(sb, desc))
> + g = (parent_group + i) % ngroups;
> + get_orlov_stats(sb, g, flex_size, &stats);
> + if (!stats.free_inodes)
> continue;
> - if (ext4_used_dirs_count(sb, desc) >= best_ndir)
> + if (stats.used_dirs >= best_ndir)
> continue;
> - if (ext4_free_inodes_count(sb, desc) < avefreei)
> + if (stats.free_inodes < avefreei)
> continue;
> - if (ext4_free_blks_count(sb, desc) < avefreeb)
> + if (stats.free_blocks < avefreeb)
> continue;
> - *group = grp;
> + grp = g;
> ret = 0;
> - best_ndir = ext4_used_dirs_count(sb, desc);
> + best_ndir = stats.used_dirs;
> + }
> + if (ret)
> + goto fallback;
> + found_flex_bg:
> + if (flex_size == 1) {
> + *group = grp;
> + return 0;
> + }
> +
> + grp *= flex_size;
> + for (i = 1; i < flex_size; i++) {

Why do we start with i = 1?


> + if (grp+i >= sbi->s_groups_count)
> + break;
> + desc = ext4_get_group_desc(sb, grp+i, NULL);
> + if (desc && ext4_free_inodes_count(sb, desc)) {


Can you add a comment saying that we just pick the first group with a
free inode, because the goal block for the rest of the file/directory's
block allocation is derived from the flex block group number? (More
details in ext4_ext_find_goal.)


> + *group = grp+i;
> + return 0;
> + }
> + }
> + desc = ext4_get_group_desc(sb, grp, NULL);
> + if (desc && ext4_free_inodes_count(sb, desc)) {
> + *group = grp;
> + return 0;
> }
> - if (ret == 0)
> - return ret;
> goto fallback;
> }
>
> - blocks_per_dir = ext4_blocks_count(es) - freeb;
> - do_div(blocks_per_dir, ndirs);
> -
> max_dirs = ndirs / ngroups + inodes_per_group / 16;
> - min_inodes = avefreei - inodes_per_group / 4;
> - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
> -
> - max_debt = EXT4_BLOCKS_PER_GROUP(sb);
> - max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
> - if (max_debt * INODE_COST > inodes_per_group)
> - max_debt = inodes_per_group / INODE_COST;
> - if (max_debt > 255)
> - max_debt = 255;
> - if (max_debt == 0)
> - max_debt = 1;
> + min_inodes = avefreei - inodes_per_group*flex_size / 4;
> + if (min_inodes < 1)
> + min_inodes = 1;
> + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
>
> for (i = 0; i < ngroups; i++) {
> - *group = (parent_group + i) % ngroups;
> - desc = ext4_get_group_desc(sb, *group, NULL);
> - if (!desc || !ext4_free_inodes_count(sb, desc))
> + grp = (parent_group + i) % ngroups;
> + get_orlov_stats(sb, grp, flex_size, &stats);
> + if (stats.used_dirs >= max_dirs)
> continue;
> - if (ext4_used_dirs_count(sb, desc) >= max_dirs)
> + if (stats.free_inodes < min_inodes)
> continue;
> - if (ext4_free_inodes_count(sb, desc) < min_inodes)
> + if (stats.free_blocks < min_blocks)
> continue;
> - if (ext4_free_blks_count(sb, desc) < min_blocks)
> - continue;
> - return 0;
> + goto found_flex_bg;
> }
>
> fallback:
> + ngroups = sbi->s_groups_count;
> + avefreei = freei / ngroups;
> + parent_group = EXT4_I(parent)->i_block_group;
> for (i = 0; i < ngroups; i++) {
> - *group = (parent_group + i) % ngroups;
> - desc = ext4_get_group_desc(sb, *group, NULL);
> + grp = (parent_group + i) % ngroups;
> + desc = ext4_get_group_desc(sb, grp, NULL);
> if (desc && ext4_free_inodes_count(sb, desc) &&
> - ext4_free_inodes_count(sb, desc) >= avefreei)
> + ext4_free_inodes_count(sb, desc) >= avefreei) {
> + *group = grp;
> return 0;
> + }
> }
>
> if (avefreei) {
> @@ -540,12 +598,54 @@ fallback:
> }
>
> static int find_group_other(struct super_block *sb, struct inode *parent,
> - ext4_group_t *group)
> + ext4_group_t *group, int mode)
> {
> ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
> struct ext4_group_desc *desc;
> - ext4_group_t i;
> + ext4_group_t i, last;
> + int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
> +
> + /*
> + * If we are doing flex_bg style allocation, try to put
> + * special inodes in the first block group; start files and
> + * directories at the 2nd block group in the flex_bg.
> + */

Why? Can you explain whether this placement helps any specific
workload, or is it something where you have observed that this
placement helps?

> + if (flex_size >= 4) {
> + int ret, retry = 0;
> +
> + try_again:
> + i = (parent_group & ~(flex_size-1));
> + last = i+flex_size;
> + if (last > ngroups)
> + last = ngroups;
> + if (S_ISDIR(mode) || S_ISREG(mode))
> + i++;
> + while (i < last) {
> + desc = ext4_get_group_desc(sb, i, NULL);
> + if (desc && ext4_free_inodes_count(sb, desc)) {
> + *group = i;
> + return 0;
> + }
> + i++;
> + }
> + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
> + retry = 1;
> + parent_group = EXT4_I(parent)->i_last_alloc_group;
> + goto try_again;
> + }
> + /*
> + * If this didn't work, use the Orlov search
> + * algorithm; we pass in the mode to avoid the subdir
> + * of topdir algorithms. If successful, save the
> + * group that we used so that future allocations in
> + * that directory are stable.
> + */
> + ret = find_group_orlov(sb, parent, &parent_group, mode);
> + if (ret == 0)
> + EXT4_I(parent)->i_last_alloc_group = parent_group;
> + return ret;
> + }
>
> /*
> * Try to place the inode in its parent directory
> @@ -713,10 +813,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
> sbi = EXT4_SB(sb);
> es = sbi->s_es;
>
> - if (sbi->s_log_groups_per_flex) {
> + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
> ret2 = find_group_flex(sb, dir, &group);
> if (ret2 == -1) {
> - ret2 = find_group_other(sb, dir, &group);
> + ret2 = find_group_other(sb, dir, &group, mode);
> if (ret2 == 0)
> printk(KERN_NOTICE "ext4: find_group_flex "
> "failed, fallback succeeded dir %lu\n",
> @@ -729,9 +829,9 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
> if (test_opt(sb, OLDALLOC))
> ret2 = find_group_dir(sb, dir, &group);
> else
> - ret2 = find_group_orlov(sb, dir, &group);
> + ret2 = find_group_orlov(sb, dir, &group, mode);
> } else
> - ret2 = find_group_other(sb, dir, &group);
> + ret2 = find_group_other(sb, dir, &group, mode);
>
> got_group:
> err = -ENOSPC;
> @@ -890,6 +990,7 @@ got:
> ei->i_file_acl = 0;
> ei->i_dtime = 0;
> ei->i_block_group = group;
> + ei->i_last_alloc_group = ~0;
>
> ext4_set_inode_flags(inode);
> if (IS_DIRSYNC(inode))
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index cbd2ca9..12f1374 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -459,6 +459,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
> ext4_fsblk_t bg_start;
> ext4_fsblk_t last_block;
> ext4_grpblk_t colour;
> + ext4_group_t block_group;
> + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
>
> /* Try to find previous block */
> for (p = ind->p - 1; p >= start; p--) {
> @@ -474,9 +476,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
> * It is going to be referred to from the inode itself? OK, just put it
> * into the same cylinder group then.
> */
> - bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
> + block_group = ei->i_block_group;
> + if (flex_size >= 4) {
> + block_group &= ~(flex_size-1);
> + if (S_ISREG(inode->i_mode))
> + block_group++;
> + }
> + bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
> last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
>
> + /*
> + * If we are doing delayed allocation, we don't need take
> + * colour into account.
> + */
> + if (test_opt(inode->i_sb, DELALLOC))
> + return bg_start;
> +
> if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
> colour = (current->pid % 16) *
> (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
> @@ -4250,6 +4265,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
> ei->i_disksize = inode->i_size;
> inode->i_generation = le32_to_cpu(raw_inode->i_generation);
> ei->i_block_group = iloc.block_group;
> + ei->i_last_alloc_group = ~0;
> /*
> * NOTE! The in-memory inode i_data array is in little-endian order
> * even on big-endian machines: we do NOT byteswap the block numbers!
> --

-aneesh

2009-02-24 15:27:44

by Theodore Ts'o

Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

On Tue, Feb 24, 2009 at 02:29:31PM +0530, Aneesh Kumar K.V wrote:
> > /* OK. use inode's group */
> > - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
> > + block_group = ei->i_block_group;
> > + if (flex_size >= 4) {
> > + block_group &= ~(flex_size-1);
> > + if (S_ISREG(inode->i_mode))
> > + block_group++;
> > + }
>
>
> Can you explain why we select 4 here?
>
> Also, add a comment explaining that directory/symlink block allocation
> goes to the first group of a flex_bg and regular files go to the second
> group, and which type of workload that would help. You have the comment
> in the commit message.

Yeah, I'll add a comment there.

> > + /*
> > + * If we are doing delayed allocation, we don't need take
> > + * colour into account.
> > + */
> > + if (test_opt(inode->i_sb, DELALLOC))
> > + return bg_start;
> > +
>
> Again, why don't we want to look at colour for delayed allocation?

If we're doing delayed allocation, we'll be allocating data blocks in
large chunks at a time. Colour is much more important if you have
multiple processes allocating singleton blocks at a time, so you don't
get interleaved allocation (i.e., ABABAB or ABCABCABC). But if we're
grabbing chunks of blocks at a time, the benefits largely go away, and
in fact, artificially starting at a different location depending on the
process id can actually lead to greater fragmentation of the free
space.
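
To make that concrete, here is the patch's colour computation with
made-up numbers (assuming 32768 blocks per group, so each of the 16
colour slots is 2048 blocks wide):

    colour = (current->pid % 16) * (32768 / 16);
    /* pid 4097: 4097 % 16 == 1, goal starts 2048 blocks into the group */
    /* pid 4098: 4098 % 16 == 2, goal starts 4096 blocks into the group */

Two single-block writers thus start 2048 blocks apart instead of
contending for the same goal block; once delalloc is already handing
out large chunks, that spacing buys little and only scatters the free
space.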

> > /*
> > + * Helper function for Orlov's allocator; returns critical information
> > + * for a particular block group or flex_bg
> > + */
> > +struct orlov_stats {
> > + __u32 free_inodes;
> > + __u32 free_blocks;
> > + __u32 used_dirs;
> > +};
> > +
> > +void get_orlov_stats(struct super_block *sb, ext4_group_t g,
> > + int flex_size, struct orlov_stats *stats)
> > +{
...
> > + g *= flex_size;
>
> Can you add a comment to the function saying that g can be a flex group
> number or an actual group number, depending on flex_size? Without that
> comment the above operation can be confusing.

Sure.

> > +/*
> > * Orlov's allocator for directories.
> > *
>
> You can also remove the further description about debt, and the
> INODE_COST and BLOCK_COST defines
>

Yep, good point.


> > + found_flex_bg:
> > + if (flex_size == 1) {
> > + *group = grp;
> > + return 0;
> > + }
> > +
> > + grp *= flex_size;
> > + for (i = 1; i < flex_size; i++) {
>
> Why do we start with i = 1?
>

I was putting directories in the second bg of flexgroups; thinking
about it some more, there's really no good reason to do that. I'll
change that back so the scan starts at the first block group (i = 0).

> Can you add a comment saying that we just pick the first group with a
> free inode, because the goal block for the rest of the file/directory's
> block allocation is derived from the flex block group number? (More
> details in ext4_ext_find_goal.)

I'll do that.

> > + /*
> > + * If we are doing flex_bg style allocation, try to put
> > + * special inodes in the first block group; start files and
> > + * directories at the 2nd block group in the flex_bg.
> > + */
>
> Why? Can you explain whether this placement helps any specific
> workload, or is it something where you have observed that this
> placement helps?

This was left over from when I was using the inode number to influence
block allocation. We're not doing this any more, so this should go
away. Thanks for asking the question.

- Ted

2009-02-24 19:04:39

by Theodore Ts'o

Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

On Tue, Feb 24, 2009 at 10:27:34AM -0500, Theodore Tso wrote:
> > > + /*
> > > + * If we are doing flex_bg style allocation, try to put
> > > + * special inodes in the first block group; start files and
> > > + * directories at the 2nd block group in the flex_bg.
> > > + */
> >
> > Why? Can you explain whether this placement helps any specific
> > workload, or is it something where you have observed that this
> > placement helps?
>
> This was left over from when I was using the inode number to influence
> block allocation. We're not doing this any more, so this should go
> away. Thanks for asking the question.

Hm, I just tried taking it out, and it costs a 17% increase in e2fsck
time on my test filesystem. The reason is that in pass 2, we need to
check that the filetype information in the directory blocks is correct.
If the inode in question is a regular file or a directory, we can
determine that by looking at an inode bitmap. However, if it is a
named pipe, device file, or symlink, we can only determine what it is
by reading the inode. In the filesystem in question, which is an
Ubuntu Intrepid image, there are 5 character device files, 1 block
device file, 5 named pipes --- and 20,613 symbolic links. If we group
all of these inodes together, it saves about 4 seconds in pass 2 (13
seconds versus 17 seconds).

We can also solve this problem by caching the file type information.
For example, I could store a list of all symlink inodes, and if there
are only 20k symlinks, it will end up costing us 80k of memory. So
this might be a problem which is better solved with some e2fsck
hacking (which has the advantage that it will speed up ext3 fsck runs
as well).

- Ted

2009-02-24 22:41:33

by Andreas Dilger

Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

Ted Ts'o wrote something like the following (didn't get original email):
> @@ -122,6 +122,9 @@ struct ext4_inode_info {
> struct list_head i_prealloc_list;
> spinlock_t i_prealloc_lock;
>
> + /* ialloc */
> + ext4_group_t i_last_alloc_group;

Even better would be to store i_last_alloc_inode. In the past Eric
has demonstrated that workloads which allocate lots of inodes exhibit
O(n^2) behaviour, because the entire group bitmap is searched from the
start each time, and that can cumulatively be very slow. Having the
directory start searching from the most recently allocated inode would
make this O(n), and would not significantly alter behaviour.

If the inode is newly read from disk this would be initialized to 0,
or the inode number of the directory. If it was recently creating
files it should be the previously allocated inode. The only issue
is with inodes being repeatedly created and deleted in the same
directory, in which case we could reset this at delete time to be

max(first inode in group, directory inode)

so that the same inodes keep being reused by the directory. This could
be limited to clearing last_alloc_inode if the inode being freed is in
the parent group.

Hmm, it seems something like this is already done in find_group_orlov().
More discussion there.
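
To put numbers on the O(n^2) point, here is a userspace toy model of
the bitmap scan (an illustration only, not kernel code). For 65,536
creates, resuming from the last allocation does 131,071 probes;
restarting from the group start each time does about 2.1 billion:

    #include <stdio.h>
    #include <string.h>

    #define NINODES 65536
    static unsigned char used[NINODES];
    static long long probes;

    /* return the first free "inode" at or after start, marking it used */
    static int find_free(int start)
    {
        int i;

        for (i = start; i < NINODES; i++) {
            probes++;
            if (!used[i]) {
                used[i] = 1;
                return i;
            }
        }
        return -1;
    }

    int main(void)
    {
        int n, last = 0;

        /* resume the search from the last allocated inode: O(n) */
        for (n = 0; n < NINODES; n++)
            last = find_free(last);
        printf("resume-from-last:  %lld probes\n", probes);

        memset(used, 0, sizeof(used));
        probes = 0;

        /* restart from the start of the group each time: O(n^2) */
        for (n = 0; n < NINODES; n++)
            find_free(0);
        printf("restart-from-zero: %lld probes\n", probes);
        return 0;
    }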

> @@ -170,10 +172,23 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
> + if (flex_size >= 4) {
> + block_group &= ~(flex_size-1);
> + if (S_ISREG(inode->i_mode))
> + block_group++;
> + }

The usage of "4" in several places to threshold flexbg size is also
confusing to me.

> + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
> le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
> last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

Similar to the above proposal, Eric and I discussed that storing the
most recently selected (flex) group in the superblock would avoid
repeated group scanning for allocations in separate directories.

> @@ -446,12 +481,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
> + if (flex_size < 4)
> + flex_size = 1;
> + else {

Generally CodingStyle has { } around the first "if" clause if they are
around the "else" clause.
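
That is, the quoted hunk would become:

    if (flex_size < 4) {
        flex_size = 1;
    } else {
        ngroups = (ngroups + flex_size - 1) >>
            sbi->s_log_groups_per_flex;
        parent_group >>= sbi->s_log_groups_per_flex;
    }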

> @@ -460,71 +504,85 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
> + if (S_ISDIR(mode) &&
> + ((parent == sb->s_root->d_inode) ||
> + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
>
> get_random_bytes(&grp, sizeof(grp));
> parent_group = (unsigned)grp % ngroups;
> for (i = 0; i < ngroups; i++) {
> + g = (parent_group + i) % ngroups;
> + get_orlov_stats(sb, g, flex_size, &stats);

Two notes here -
- get_random_bytes() is actually a fairly CPU-intensive operation, though
I'm not sure it is significant in this context
- more importantly, this scans ALL groups in a very large filesystem to
find an optimal group. Large filesystems can have upwards of 60-100k
groups to scan, and even with flexbg it seems that get_orlov_stats() is
not using the flex_group[] stats in the common case where flex_size is
the same as the flexbg size, but is recalculating these for every
allocation.

> + if (!stats.free_inodes)
> continue;
> + if (stats.used_dirs >= best_ndir)
> continue;
> + if (stats.free_inodes < avefreei)
> continue;
> + if (stats.free_blocks < avefreeb)
> continue;
> + grp = g;
> ret = 0;
> + best_ndir = stats.used_dirs;

Based on the above, it would be preferable to settle for a "local"
optimal group after scanning N successful candidates, then checking
the "most recently selected" group saved in the superblock if it is
still better than the local optimum. Of course it should continue to
scan all groups if there are no suitable groups yet found.

Since the starting group is random, this will avoid being stuck in
a bad local minimum, while avoiding an exhaustive search when there
are a lot of groups.
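
One way that bounded search could look, as a standalone sketch (types
and names are made up; the "most recently selected" superblock check
would go after the loop):

    struct stats { unsigned free_inodes, free_blocks, used_dirs; };

    /* Scan cyclically from a random start; keep the best candidate
     * seen so far and stop early once 'ncandidates' acceptable groups
     * have been examined.  Returns -1 if nothing qualified, in which
     * case the caller falls back to an exhaustive scan. */
    static int pick_group(const struct stats *g, int ngroups, int start,
                          int ncandidates, unsigned avefreei,
                          unsigned avefreeb)
    {
        int i, best = -1, seen = 0;
        unsigned best_ndir = ~0U;

        for (i = 0; i < ngroups && seen < ncandidates; i++) {
            int n = (start + i) % ngroups;

            if (!g[n].free_inodes || g[n].free_inodes < avefreei ||
                g[n].free_blocks < avefreeb)
                continue;    /* unsuitable group */
            seen++;
            if (g[n].used_dirs < best_ndir) {
                best = n;
                best_ndir = g[n].used_dirs;
            }
        }
        return best;
    }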

> + min_inodes = avefreei - inodes_per_group*flex_size / 4;
> + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;

style - "inodes_per_group * flex_size / 4"

> static int find_group_other(struct super_block *sb, struct inode *parent,
> + ext4_group_t *group, int mode)
> {
> + /*
> + * If we are doing flex_bg style allocation, try to put
> + * special inodes in the first block group; start files and
> + * directories at the 2nd block group in the flex_bg.
> + */
> + if (flex_size >= 4) {
> + int ret, retry = 0;
> +
> + try_again:
> + i = (parent_group & ~(flex_size-1));
> + last = i+flex_size;

Can you please use a better variable name than "i", such as "grp" or
"new_group" or similar?

> + while (i < last) {
> + desc = ext4_get_group_desc(sb, i, NULL);
> + if (desc && ext4_free_inodes_count(sb, desc)) {
> + *group = i;
> + return 0;
> + }
> + i++;
> + }
> + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
> + retry = 1;
> + parent_group = EXT4_I(parent)->i_last_alloc_group;
> + goto try_again;
> + }

It would seem that using i_last_alloc_group FIRST (if set) would be better
than always checking all groups in the flexbg. Either i_last_alloc_group
is in the same flex group as parent_group (so no harm done) or a previous
allocation wasn't able to find any free inode in that flex group and it
had to do a more expensive search elsewhere.

All that needs to be changed is to possibly reset i_last_alloc_group
to be parent_group when there is a file unlinked from the directory,
though this wouldn't be great for huge directories since any unlink
would cause a full scan.

Hmm, maybe checking ONLY the parent group for free inodes, then the
i_last_alloc_group, then remaining groups in the flex group, then
all remaining groups is the best ordering. It allows inodes to be
clustered if there is space in the parent group (whether due to freeing
inodes in this directory or others), but avoids a potentially lengthy
scan for each inode (which might be noticable if there are 64 or 128
groups per flexbg).
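
As a standalone sketch, that probe order could look like this (all
names are made up; free-inode counts are passed in as a plain array,
and flex_size is assumed to be a power of two):

    /* 1. parent's group, 2. i_last_alloc_group, 3. the rest of the
     * parent's flex group, 4. all remaining groups. */
    static int choose_group(const unsigned *free_inodes, int ngroups,
                            int parent_group, int last_alloc,
                            int flex_size)
    {
        int g, flex_start = parent_group & ~(flex_size - 1);

        if (free_inodes[parent_group])
            return parent_group;
        if (last_alloc >= 0 && free_inodes[last_alloc])
            return last_alloc;
        for (g = flex_start; g < flex_start + flex_size; g++)
            if (g < ngroups && g != parent_group && free_inodes[g])
                return g;
        for (g = 0; g < ngroups; g++)
            if (free_inodes[g])
                return g;
        return -1;    /* ENOSPC */
    }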



Another possibility is basing the starting goal inode on the group that
holds the majority of dirent inode numbers in the same directory leaf.
This would require deferring inode number selection until the dirent is
about to be inserted (i.e. delalloc for inodes) by moving the inode
selection into ext4_add_entry(). Combining this with a hash->inode
range mapping (as I've proposed in the past, with range size based on
the directory size) would improve the htree->inode ordering and would
also improve performance for large directories.

It might not be a bad idea to separate the inode selection from the
inode initialization anyway, as ext4_new_inode() is pretty huge already,
and I can't see anything except the i_ino setting and insert_inode_hash()
that uses the inode number. That means it should be possible to defer
the inode selection to ext4_add_entry() and ext4_dx_add_entry() once the
leaf block is found.


Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


2009-02-25 00:57:56

by Eric Sandeen

Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

Andreas Dilger wrote:
> Ted Ts'o wrote something like the following (didn't get original email):
>> @@ -122,6 +122,9 @@ struct ext4_inode_info {
>> struct list_head i_prealloc_list;
>> spinlock_t i_prealloc_lock;
>>
>> + /* ialloc */
>> + ext4_group_t i_last_alloc_group;
>
> Even better would be to store i_last_alloc_inode. In the past Eric
> has demonstrated workloads that are allocating lots of inodes exhibit
> O(n^2) behaviour because the entire group bitmap is searched from the
> start each time, and that can cumulatively be very slow. Having the
> directory start searching from the most recently allocated inode would
> make this O(n), and would not significantly alter behaviour.

A very hacky benchmark I had to demonstrate this is at

It just creates a directory tree starting at 000/ under the dir it's run
in, and times iterations of creates.

The tree is created in order, like:

000/000/000/000/000/000
000/000/000/000/000/001
000/000/000/000/000/002
...
000/000/000/000/000/fff
000/000/000/000/001/000
....
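
Roughly, the load it generates looks like this (a compilable sketch of
the same sequential-create pattern, not the actual seq_mkdirs.c; the
tree depth and batch size here are made up):

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/time.h>

    static double now(void)
    {
        struct timeval tv;
        gettimeofday(&tv, NULL);
        return tv.tv_sec + tv.tv_usec / 1e6;
    }

    int main(void)
    {
        char path[64];
        double t0 = now();
        int a, b, iter = 0;

        for (a = 0; a < 16; a++) {
            snprintf(path, sizeof(path), "%03x", a);
            mkdir(path, 0755);
            /* create leaf directories in strict sequence */
            for (b = 0; b < 0x1000; b++) {
                snprintf(path, sizeof(path), "%03x/%03x", a, b);
                mkdir(path, 0755);
            }
            printf("iter %d: %f sec\n", iter++, now() - t0);
            t0 = now();
        }
        return 0;
    }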

On ext3:

# ./seq_mkdirs
iter 0: 6.191491 sec
iter 1: 8.455782 sec
iter 2: 9.435375 sec
iter 3: 10.198069 sec
iter 4: 10.922969 sec
iter 5: 10.800908 sec
iter 6: 12.940676 sec
iter 7: 15.513261 sec
...

On upstream ext4:

# ./seq_mkdirs
iter 0: 5.628331 sec
iter 1: 6.581043 sec
iter 2: 6.723445 sec
iter 3: 6.567891 sec
iter 4: 5.862526 sec
iter 5: 6.462064 sec
iter 6: 7.208110 sec
iter 7: 6.549735 sec
...


I did play with saving the last-allocated position, but if that's just
in-memory then it's a little odd that the first allocation will
potentially be much slower; that's probably acceptable, though. It also
wouldn't fill in gaps when inodes are deleted if you don't re-search
from the parent. ISTR that the constant create/delete didn't cause a
problem; I will need to remind myself why ...

-Eric

2009-02-25 00:59:25

by Eric Sandeen

Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

Eric Sandeen wrote:
> Andreas Dilger wrote:
>> Ted Ts'o wrote something like the following (didn't get original email):
>>> @@ -122,6 +122,9 @@ struct ext4_inode_info {
>>> struct list_head i_prealloc_list;
>>> spinlock_t i_prealloc_lock;
>>>
>>> + /* ialloc */
>>> + ext4_group_t i_last_alloc_group;
>> Even better would be to store i_last_alloc_inode. In the past Eric
>> has demonstrated workloads that are allocating lots of inodes exhibit
>> O(n^2) behaviour because the entire group bitmap is searched from the
>> start each time, and that can cumulatively be very slow. Having the
>> directory start searching from the most recently allocated inode would
>> make this O(n), and would not significantly alter behaviour.
>
> A very hacky benchmark I had to demonstrate this is at

Er, URL please, maestro...

http://people.redhat.com/esandeen/benchmarks/seq_mkdirs.c

> It just creates a directory tree starting at 000/ under the dir it's run
> in, and times iterations of creates.
>
> The tree is created in order, like:
>
> 000/000/000/000/000/000
> 000/000/000/000/000/001
> 000/000/000/000/000/002
> ...
> 000/000/000/000/000/fff
> 000/000/000/000/001/000
> ....
>
> On ext3:
>
> # ./seq_mkdirs
> iter 0: 6.191491 sec
> iter 1: 8.455782 sec
> iter 2: 9.435375 sec
> iter 3: 10.198069 sec
> iter 4: 10.922969 sec
> iter 5: 10.800908 sec
> iter 6: 12.940676 sec
> iter 7: 15.513261 sec
> ...
>
> On upstream ext4:
>
> # ./seq_mkdirs
> iter 0: 5.628331 sec
> iter 1: 6.581043 sec
> iter 2: 6.723445 sec
> iter 3: 6.567891 sec
> iter 4: 5.862526 sec
> iter 5: 6.462064 sec
> iter 6: 7.208110 sec
> iter 7: 6.549735 sec
> ...
>
>
> I did play with saving the last-allocated position but if that's just
> in-memory then it's a little odd that the first allocation will be
> potentially much slower, but that's probably acceptable. It also
> wouldn't fill in gaps when inodes are deleted if you don't re-search
> from the parent. ISTR that the constant create/delete didn't cause a
> problem, will need to remind myself why ...
>
> -Eric
>


2009-02-25 02:51:00

by Theodore Ts'o

Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

> Two notes here -
> - get_random_bytes() is actually a fairly CPU-intensive operation, though
> I'm not sure it is significant in this context
> - more importantly, scanning ALL groups in a very large filesystem to find
> an optimal group. Large filesystems can have upwards of 60-100k groups
> to scan, and even with flexbg it seems that get_orlov_stats() is not
> using the flex_group[] stats in the common case where flex_size is the
> same as the flexbg size, but recalculating these for every allocation.

Note that this only happens when we are creating directories. I'm
making the assumption here that creating directories is rare, and
certainly happens less often than allocating inodes or blocks. I was
deliberately not using flex_group[] stats, since I was thinking that
eventually it would be worthwhile to nuke the need to take a spinlock
for every inode and block allocation, and it wasn't keeping track of
the number of directories, anyway.

We also don't actually scan ALL the groups. For top-level directories
we do, yes. But most directories aren't top-level; and for non
top-level directories, we find the first block group which meets the
minimum criteria:

for (i = 0; i < ngroups; i++) {
grp = (parent_group + i) % ngroups;
get_orlov_stats(sb, grp, flex_size, &stats);
if (stats.used_dirs >= max_dirs)
continue;
if (stats.free_inodes < min_inodes)
continue;
if (stats.free_blocks < min_blocks)
continue;
goto found_flex_bg;
}

Looking at Eric's benchmark, it looks like each iteration creates
196,608 directories. If an iteration takes 6 seconds, then each
mkdir is taking about 30 microseconds, and if an iteration takes 15
seconds, then each mkdir is taking about 76 microseconds. So that
variance is large, but how bad does it really get?

Also, what workload is likely going to be creating tons and tons of
directories? Even a psycho squid cache hierarchy is only 65536
directories.

> Based on the above, it would be preferable to settle for a "local"
> optimal group after scanning N successful candidates, then checking
> the "most recently selected" group saved in the superblock if it is
> still better than the local optimum. Of course it should continue to
> scan all groups if there are no suitable groups yet found.
>
> Since the starting group is random, this will avoid being stuck in
> a bad local minimum, while avoiding an exhaustive search when there
> are a lot of groups.

The above algorithm is only used for top-level directories.

> It would seem that using i_last_alloc_group FIRST (if set) would be better
> than always checking all groups in the flexbg. Either i_last_alloc_group
> is in the same flex group as parent_group (so no harm done) or a previous
> allocation wasn't able to find any free inode in that flex group and it
> had to do a more expensive search elsewhere.

(This is the algorithm used for non-directories). Yeah, I agree that
keeping track of the last flex group and using it first makes sense.

I do think it makes sense to start at the beginning of each flex
group; checking each block group is pretty fast, since we're not
scanning the bitmap, just checking a count value. Even if there are
128 groups per flex group, we're in big trouble if checking the free
inodes count for 100+ groups is noticeable.

- Ted

2009-02-26 18:22:01

by Theodore Ts'o

Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

There is an updated patch adding the updated Orlov inode/block
allocators to ext4 in the ext4 patch queue, which I've also attached
here.

It seems to make a huge difference in e2fsck times, which I've written
about here:

http://thunk.org/tytso/blog/2009/02/26/fast-ext4-fsck-times-revisited/

I've also tried running Eric Sandeen's seq_mkdirs benchmark (which
I've suggested should be renamed mkdirs_mark, with a suitably tweaked
logo: http://graphics.ucsd.edu/courses/rendering/2003/joshwills.jpg)
and what I see is a fairly consistent time per iteration that does
*not* increase as more directories are added to the file system.

I tried adding some of Andreas' suggestions which would tend to pack
the inodes less aggressively, in the hopes that it might speed up the
mkdir operation, but at least for the seq_mkdirs/mkdirs_mark benchmark,
it really didn't help, because we are disk bound, not cpu bound. (At
least, on a 5400 rpm drive the limiting factor was the hard drive, not
the CPU; maybe things will be different on a big SAN-attached RAID
array.)

CPU utilization and speed per iteration go up significantly once
the disk is full, since at that point we really *do* scan all block
groups, but that seems to be an acceptable thing to do; I don't think
it's all that important whether mkdir() returns ENOSPC quickly or not. :-)

At this point, the major speed advantage of XFS for this benchmark
seems to be caused by the fact that it supports small form-factor
directories which are encoded in the inode structure, whereas ext4
needs to write a 4k directory block even for an empty directory (this
is what is causing us to be I/O bound).

So any improvements in mkdirs_mark would require special-case hacks
such as treating a zero-length directory inode as a synthetic empty
inode, and not actually trying to allocate the directory block until
the first time a file is created in the directory. But that would be
a file system format change that would probably only really be useful
for better benchmark results --- how common are file systems with
hundreds of thousands of empty directories, after all?

- Ted

ext4: New inode/block allocation algorithms for flex_bg file systems

The find_group_flex() inode allocator is now only used if the
file system is mounted using the "oldalloc" mount option. It is
replaced with the original Orlov allocator that has been updated for
flex_bg file systems (it should behave the same way if flex_bg is
disabled). The inode allocator now functions by taking into account
each flex_bg group, instead of each block group, when deciding whether
or not it's time to allocate a new directory into a fresh flex_bg.

The block allocator has also been changed so that the first block
group in each flex_bg is preferred for use for storing directory
blocks. This keeps directory blocks close together, which is good for
speeding up e2fsck since large directories are more likely to look
like this:

debugfs: stat /home/tytso/Maildir/cur
Inode: 1844562 Type: directory Mode: 0700 Flags: 0x81000
Generation: 1132745781 Version: 0x00000000:0000ad71
User: 15806 Group: 15806 Size: 1060864
File ACL: 0 Directory ACL: 0
Links: 2 Blockcount: 2072
Fragment: Address: 0 Number: 0 Size: 0
ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009
atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009
mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009
crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009
Size of extra inode fields: 28
BLOCKS:
(0):7348651, (1-258):7348654-7348911
TOTAL: 259

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 6 ++
fs/ext4/ext4_i.h | 3 +
fs/ext4/extents.c | 25 ++++++-
fs/ext4/ialloc.c | 215 +++++++++++++++++++++++++++++++++++++++--------------
fs/ext4/inode.c | 18 ++++-
fs/ext4/mballoc.c | 7 ++
6 files changed, 216 insertions(+), 58 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9840cad..684a063 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -827,6 +827,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */

/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
+
+/*
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 2d516c0..4ce2187 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -122,6 +122,9 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;

+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+
/* allocation reservation info for delalloc */
unsigned int i_reserved_data_blocks;
unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e2eab19..bd0092a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
int depth;

if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
}

/* OK. use inode's group */
- bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ /*
+ * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+ * block groups per flexgroup, reserve the first block
+ * group for directories and special files. Regular
+ * files will start at the second block group. This
+ * tends to speed up directory access and improves
+ * fsck times.
+ */
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f524d9b..c0d2bb6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -408,6 +408,43 @@ out:
return 0;
}

+struct orlov_stats {
+ __u32 free_inodes;
+ __u32 free_blocks;
+ __u32 used_dirs;
+};
+
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg. If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+ int flex_size, struct orlov_stats *stats)
+{
+ struct ext4_group_desc *desc;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ int i;
+
+ stats->free_inodes = 0;
+ stats->free_blocks = 0;
+ stats->used_dirs = 0;
+
+ g *= flex_size;
+
+ for (i = 0; i < flex_size; i++) {
+ if (g >= ngroups)
+ break;
+ desc = ext4_get_group_desc(sb, g++, NULL);
+ if (!desc)
+ continue;
+
+ stats->free_inodes += ext4_free_inodes_count(sb, desc);
+ stats->free_blocks += ext4_free_blks_count(sb, desc);
+ stats->used_dirs += ext4_used_dirs_count(sb, desc);
+ }
+}
+
/*
* Orlov's allocator for directories.
*
@@ -423,35 +460,34 @@ out:
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
* free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
*/

-#define INODE_COST 64
-#define BLOCK_COST 256
-
static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
ext4_group_t ngroups = sbi->s_groups_count;
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext4_fsblk_t freeb, avefreeb;
- ext4_fsblk_t blocks_per_dir;
unsigned int ndirs;
- int max_debt, max_dirs, min_inodes;
+ int max_dirs, min_inodes;
ext4_grpblk_t min_blocks;
- ext4_group_t i;
+ ext4_group_t i, grp, g;
struct ext4_group_desc *desc;
+ struct orlov_stats stats;
+ int flex_size = ext4_flex_bg_size(sbi);
+
+ if (flex_size > 1) {
+ ngroups = (ngroups + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }

freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
@@ -460,71 +496,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
do_div(avefreeb, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

- if ((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+ if (S_ISDIR(mode) &&
+ ((parent == sb->s_root->d_inode) ||
+ (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
int best_ndir = inodes_per_group;
- ext4_group_t grp;
int ret = -1;

get_random_bytes(&grp, sizeof(grp));
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
- grp = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, grp, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
+ g = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, g, flex_size, &stats);
+ if (!stats.free_inodes)
continue;
- if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+ if (stats.used_dirs >= best_ndir)
continue;
- if (ext4_free_inodes_count(sb, desc) < avefreei)
+ if (stats.free_inodes < avefreei)
continue;
- if (ext4_free_blks_count(sb, desc) < avefreeb)
+ if (stats.free_blocks < avefreeb)
continue;
- *group = grp;
+ grp = g;
ret = 0;
- best_ndir = ext4_used_dirs_count(sb, desc);
+ best_ndir = stats.used_dirs;
+ }
+ if (ret)
+ goto fallback;
+ found_flex_bg:
+ if (flex_size == 1) {
+ *group = grp;
+ return 0;
+ }
+
+ /*
+ * We pack inodes at the beginning of the flexgroup's
+ * inode tables. Block allocation decisions will do
+ * something similar, although regular files will
+ * start at the 2nd block group of the flexgroup. See
+ * ext4_ext_find_goal() and ext4_find_near().
+ */
+ grp *= flex_size;
+ for (i = 0; i < flex_size; i++) {
+ if (grp+i >= sbi->s_groups_count)
+ break;
+ desc = ext4_get_group_desc(sb, grp+i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = grp+i;
+ return 0;
+ }
}
- if (ret == 0)
- return ret;
goto fallback;
}

- blocks_per_dir = ext4_blocks_count(es) - freeb;
- do_div(blocks_per_dir, ndirs);
-
max_dirs = ndirs / ngroups + inodes_per_group / 16;
- min_inodes = avefreei - inodes_per_group / 4;
- min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
- max_debt = EXT4_BLOCKS_PER_GROUP(sb);
- max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
- if (max_debt * INODE_COST > inodes_per_group)
- max_debt = inodes_per_group / INODE_COST;
- if (max_debt > 255)
- max_debt = 255;
- if (max_debt == 0)
- max_debt = 1;
+ min_inodes = avefreei - inodes_per_group*flex_size / 4;
+ if (min_inodes < 1)
+ min_inodes = 1;
+ min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+
+ /*
+ * Start looking in the flex group where we last allocated an
+ * inode for this parent directory
+ */
+ if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ if (flex_size > 1)
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }

for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
- continue;
- if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+ grp = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, grp, flex_size, &stats);
+ if (stats.used_dirs >= max_dirs)
continue;
- if (ext4_free_inodes_count(sb, desc) < min_inodes)
+ if (stats.free_inodes < min_inodes)
continue;
- if (ext4_free_blks_count(sb, desc) < min_blocks)
+ if (stats.free_blocks < min_blocks)
continue;
- return 0;
+ goto found_flex_bg;
}

fallback:
+ ngroups = sbi->s_groups_count;
+ avefreei = freei / ngroups;
+ parent_group = EXT4_I(parent)->i_block_group;
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
+ grp = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc(sb, grp, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_inodes_count(sb, desc) >= avefreei)
+ ext4_free_inodes_count(sb, desc) >= avefreei) {
+ *group = grp;
return 0;
+ }
}

if (avefreei) {
@@ -540,12 +602,51 @@ fallback:
}

static int find_group_other(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
struct ext4_group_desc *desc;
- ext4_group_t i;
+ ext4_group_t i, last;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+ /*
+ * Try to place the inode in the same flex group as its
+ * parent. If we can't find space, use the Orlov algorithm to
+ * find another flex group, and store that information in the
+ * parent directory's inode information so that we use that flex
+ * group for future allocations.
+ */
+ if (flex_size > 1) {
+ int retry = 0;
+
+ try_again:
+ parent_group &= ~(flex_size-1);
+ last = parent_group + flex_size;
+ if (last > ngroups)
+ last = ngroups;
+ for (i = parent_group; i < last; i++) {
+ desc = ext4_get_group_desc(sb, i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = i;
+ return 0;
+ }
+ }
+ if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+ retry = 1;
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ goto try_again;
+ }
+ /*
+ * If this didn't work, use the Orlov search algorithm
+ * to find a new flex group; we pass in the mode to
+ * avoid the topdir algorithms.
+ */
+ *group = parent_group + flex_size;
+ if (*group > ngroups)
+ *group = 0;
+ return find_group_orlov(sb, parent, group, mode);
+ }

/*
* Try to place the inode in its parent directory
@@ -713,10 +814,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;

- if (sbi->s_log_groups_per_flex) {
+ if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
ret2 = find_group_flex(sb, dir, &group);
if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);
if (ret2 == 0 && printk_ratelimit())
printk(KERN_NOTICE "ext4: find_group_flex "
"failed, fallback succeeded dir %lu\n",
@@ -729,11 +830,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (test_opt(sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
else
- ret2 = find_group_orlov(sb, dir, &group);
+ ret2 = find_group_orlov(sb, dir, &group, mode);
} else
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);

got_group:
+ EXT4_I(dir)->i_last_alloc_group = group;
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -890,6 +992,7 @@ got:
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_group = group;
+ ei->i_last_alloc_group = ~0;

ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 51cdd13..51d70bc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -459,6 +459,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));

/* Try to find previous block */
for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +476,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
* It is going to be referred to from the inode itself? OK, just put it
* into the same cylinder group then.
*/
- bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;

+ /*
+ * If we are doing delayed allocation, we don't need to take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -4257,6 +4272,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_disksize = inode->i_size;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
+ ei->i_last_alloc_group = ~0;
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4415bee..0430143 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1726,6 +1726,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
{
unsigned free, fragments;
unsigned i, bits;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_desc *desc;
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);

@@ -1747,6 +1748,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
return 0;

+ /* Avoid using the first bg of a flexgroup for data files */
+ if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+ (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+ ((group % flex_size) == 0))
+ return 0;
+
bits = ac->ac_sb->s_blocksize_bits + 1;
for (i = ac->ac_2order; i <= bits; i++)
if (grp->bb_counters[i] > 0)

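The goal-group logic added above appears in both ext4_ext_find_goal()
and ext4_find_near(). As a minimal userspace sketch of just that
calculation (the ext4_group_t typedef and the threshold value 4, i.e.
EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME, are stubbed-out assumptions here, not
kernel code):

/* Round the inode's block group down to the start of its flexgroup,
 * then skip the first group for regular files so it stays free for
 * directory blocks and metadata.  flex_size must be a power of two. */
#include <stdio.h>

typedef unsigned int ext4_group_t;

static ext4_group_t goal_group(ext4_group_t inode_group,
			       int flex_size, int is_regular_file)
{
	ext4_group_t group = inode_group;

	if (flex_size >= 4) {	/* EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME */
		group &= ~(ext4_group_t)(flex_size - 1);
		if (is_regular_file)
			group++;
	}
	return group;
}

int main(void)
{
	/* An inode in block group 13, with 16 groups per flexgroup: */
	printf("dir goal:  group %u\n", goal_group(13, 16, 0)); /* 0 */
	printf("file goal: group %u\n", goal_group(13, 16, 1)); /* 1 */
	return 0;
}

With delayed allocation enabled the goal is returned as-is; otherwise
the PID-based colour offset shown in the hunks above is added on top.
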
2009-02-26 18:38:43

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

....

> static int find_group_orlov(struct super_block *sb, struct inode *parent,
> - ext4_group_t *group)
> + ext4_group_t *group, int mode)
> {
> ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> - struct ext4_super_block *es = sbi->s_es;
> ext4_group_t ngroups = sbi->s_groups_count;
> int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
> unsigned int freei, avefreei;
> ext4_fsblk_t freeb, avefreeb;
> - ext4_fsblk_t blocks_per_dir;
> unsigned int ndirs;
> - int max_debt, max_dirs, min_inodes;
> + int max_dirs, min_inodes;
> ext4_grpblk_t min_blocks;
> - ext4_group_t i;
> + ext4_group_t i, grp, g;
> struct ext4_group_desc *desc;
> + struct orlov_stats stats;
> + int flex_size = ext4_flex_bg_size(sbi);
> +
> + if (flex_size > 1) {
> + ngroups = (ngroups + flex_size - 1) >>
> + sbi->s_log_groups_per_flex;
> + parent_group >>= sbi->s_log_groups_per_flex;
> + }
>
> freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
> avefreei = freei / ngroups;
> @@ -460,71 +496,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
> do_div(avefreeb, ngroups);
> ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
>
> - if ((parent == sb->s_root->d_inode) ||
> - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
> + if (S_ISDIR(mode) &&
> + ((parent == sb->s_root->d_inode) ||
> + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
> int best_ndir = inodes_per_group;
> - ext4_group_t grp;
> int ret = -1;
>
> get_random_bytes(&grp, sizeof(grp));
> parent_group = (unsigned)grp % ngroups;
> for (i = 0; i < ngroups; i++) {
> - grp = (parent_group + i) % ngroups;
> - desc = ext4_get_group_desc(sb, grp, NULL);
> - if (!desc || !ext4_free_inodes_count(sb, desc))
> + g = (parent_group + i) % ngroups;
> + get_orlov_stats(sb, g, flex_size, &stats);
> + if (!stats.free_inodes)
> continue;
> - if (ext4_used_dirs_count(sb, desc) >= best_ndir)
> + if (stats.used_dirs >= best_ndir)
> continue;
> - if (ext4_free_inodes_count(sb, desc) < avefreei)
> + if (stats.free_inodes < avefreei)
> continue;
> - if (ext4_free_blks_count(sb, desc) < avefreeb)
> + if (stats.free_blocks < avefreeb)
> continue;
> - *group = grp;
> + grp = g;
> ret = 0;
> - best_ndir = ext4_used_dirs_count(sb, desc);
> + best_ndir = stats.used_dirs;
> + }
> + if (ret)
> + goto fallback;
> + found_flex_bg:
> + if (flex_size == 1) {
> + *group = grp;
> + return 0;
> + }
> +
> + /*
> + * We pack inodes at the beginning of the flexgroup's
> + * inode tables. Block allocation decisions will do
> + * something similar, although regular files will
> + * start at the 2nd block group of the flexgroup. See
> + * ext4_ext_find_goal() and ext4_find_near().
> + */
> + grp *= flex_size;
> + for (i = 0; i < flex_size; i++) {
> + if (grp+i >= sbi->s_groups_count)
> + break;
> + desc = ext4_get_group_desc(sb, grp+i, NULL);
> + if (desc && ext4_free_inodes_count(sb, desc)) {
> + *group = grp+i;
> + return 0;
> + }
> }
> - if (ret == 0)
> - return ret;
> goto fallback;
> }
>
> - blocks_per_dir = ext4_blocks_count(es) - freeb;
> - do_div(blocks_per_dir, ndirs);
> -
> max_dirs = ndirs / ngroups + inodes_per_group / 16;
> - min_inodes = avefreei - inodes_per_group / 4;
> - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
> -
> - max_debt = EXT4_BLOCKS_PER_GROUP(sb);
> - max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
> - if (max_debt * INODE_COST > inodes_per_group)
> - max_debt = inodes_per_group / INODE_COST;
> - if (max_debt > 255)
> - max_debt = 255;
> - if (max_debt == 0)
> - max_debt = 1;
> + min_inodes = avefreei - inodes_per_group*flex_size / 4;
> + if (min_inodes < 1)
> + min_inodes = 1;
> + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
> +
> + /*
> + * Start looking in the flex group where we last allocated an
> + * inode for this parent directory
> + */
> + if (EXT4_I(parent)->i_last_alloc_group != ~0) {
> + parent_group = EXT4_I(parent)->i_last_alloc_group;
> + if (flex_size > 1)
> + parent_group >>= sbi->s_log_groups_per_flex;
> + }
>
> for (i = 0; i < ngroups; i++) {
> - *group = (parent_group + i) % ngroups;
> - desc = ext4_get_group_desc(sb, *group, NULL);
> - if (!desc || !ext4_free_inodes_count(sb, desc))
> - continue;
> - if (ext4_used_dirs_count(sb, desc) >= max_dirs)
> + grp = (parent_group + i) % ngroups;
> + get_orlov_stats(sb, grp, flex_size, &stats);
> + if (stats.used_dirs >= max_dirs)
> continue;
> - if (ext4_free_inodes_count(sb, desc) < min_inodes)
> + if (stats.free_inodes < min_inodes)
> continue;
> - if (ext4_free_blks_count(sb, desc) < min_blocks)
> + if (stats.free_blocks < min_blocks)
> continue;
> - return 0;
> + goto found_flex_bg;
> }
>
> fallback:
> + ngroups = sbi->s_groups_count;
> + avefreei = freei / ngroups;
> + parent_group = EXT4_I(parent)->i_block_group;
> for (i = 0; i < ngroups; i++) {
> - *group = (parent_group + i) % ngroups;
> - desc = ext4_get_group_desc(sb, *group, NULL);
> + grp = (parent_group + i) % ngroups;
> + desc = ext4_get_group_desc(sb, grp, NULL);
> if (desc && ext4_free_inodes_count(sb, desc) &&
> - ext4_free_inodes_count(sb, desc) >= avefreei)
> + ext4_free_inodes_count(sb, desc) >= avefreei) {
> + *group = grp;
> return 0;
> + }
> }
>
> if (avefreei) {
> @@ -540,12 +602,51 @@ fallback:
> }
>
> static int find_group_other(struct super_block *sb, struct inode *parent,
> - ext4_group_t *group)
> + ext4_group_t *group, int mode)
> {
> ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
> struct ext4_group_desc *desc;
> - ext4_group_t i;
> + ext4_group_t i, last;
> + int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
> +
> + /*
> + * Try to place the inode in the same flex group as its
> + * parent. If we can't find space, use the Orlov algorithm to
> + * find another flex group, and store that information in the
> + * parent directory's inode information so that we use that flex
> + * group for future allocations.
> + */
> + if (flex_size > 1) {
> + int retry = 0;
> +
> + try_again:
> + parent_group &= ~(flex_size-1);
> + last = parent_group + flex_size;
> + if (last > ngroups)
> + last = ngroups;
> + for (i = parent_group; i < last; i++) {
> + desc = ext4_get_group_desc(sb, i, NULL);
> + if (desc && ext4_free_inodes_count(sb, desc)) {
> + *group = i;
> + return 0;
> + }
> + }
> + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
> + retry = 1;
> + parent_group = EXT4_I(parent)->i_last_alloc_group;
> + goto try_again;
> + }


For directories we try first with i_last_alloc_group and then with the
parent's i_block_group. For files we do it the other way round here.
Does it make sense to try i_last_alloc_group first?



> + /*
> + * If this didn't work, use the Orlov search algorithm
> + * to find a new flex group; we pass in the mode to
> + * avoid the topdir algorithms.
> + */
> + *group = parent_group + flex_size;
> + if (*group > ngroups)
> + *group = 0;
> + return find_group_orlov(sb, parent, group, mode);
> + }
>
> /*

If you want you can add
Reviewed-by: Aneesh Kumar K.V <[email protected]>

-aneesh

2009-02-27 00:15:39

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

On Feb 26, 2009 13:21 -0500, Theodore Ts'o wrote:
> So any improvements in mkdirs_mark would require special-case hacks
> such as treating a zero-length directory inode as a synthetic empty
> inode, and not actually trying to allocate the directory block until
> the first time a file is created in the directory. But that would be
> a file system format change that would probably only really be useful
> for better benchmark results --- how common are file systems with
> hundreds of thousands of empty directories, after all?

Actually, I often reference some online statistics for HPC storage:
http://www.pdsi-scidac.org/cgi-bin/fsstats-list.cgi

and while one would think that in HPC filesystems there are lots of huge
directories, the stats below show that a huge majority of DIRECTORIES
contain only a few entries. That said, a large percentage of
the FILES are in larger directories, but that doesn't change the fact
that there are a large number of directories with very few entries.

Stats from the filesystem (incorrectly marked ext3, but really Lustre):
http://www.pdsi-scidac.org/fsstats/approved/PNNL-Oct102007-233TB-ext3-EvanFelix_nwfs.out

directory size:
count=888082 average=14.936094
min=0 max=57114
entries: dirs dir pct cumulative entries ents pct cum. ents
[ 0- 1]: 127934 (14.41%) (14.41%) 86753 ( 0.65%) ( 0.65%)
[ 2- 3]: 126204 (14.21%) (28.62%) 305501 ( 2.30%) ( 2.96%)
[ 4- 7]: 268058 (30.18%) (58.80%) 1314419 ( 9.91%) (12.87%)
[ 8-15]: 228065 (25.68%) (84.48%) 2449552 (18.47%) (31.33%)
[16-31]: 88365 ( 9.95%) (94.43%) 1965719 (14.82%) (46.15%)
[32-63]: 30436 ( 3.43%) (97.86%) 1355962 (10.22%) (56.38%)

filename length:
count=13264476 average=21.981972
min=1 max=232
chars: files file pct cumulative bytes byte pct cum. bytes
[ 0- 7]: 1557016 (11.74%) (11.74%) 7772274 ( 2.67%) ( 2.67%)
[ 8-15]: 4826194 (36.38%) (48.12%) 53282606 (18.27%) (20.94%)
[16-23]: 2598854 (19.59%) (67.72%) 50042818 (17.16%) (38.10%)
[24-31]: 1346382 (10.15%) (77.87%) 36152231 (12.40%) (50.50%)
[32-39]: 572299 ( 4.31%) (82.18%) 20691279 ( 7.10%) (57.60%)
[40-47]: 873408 ( 6.58%) (88.76%) 37941162 (13.01%) (70.61%)
[48-55]: 814905 ( 6.14%) (94.91%) 41733619 (14.31%) (84.92%)

This shows that we could quite easily store most (57%) of average-named
files (24 chars or less) in average-sized directories (15 files or
less) within 480 bytes of directory data (including 8 bytes of dirent
overhead per name).
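
As a quick check of that arithmetic (the 8 bytes are the fixed part of
an ext4 dirent: a 4-byte inode number, a 2-byte rec_len, and 1 byte
each of name_len and file_type; this is only an illustrative sketch):

/* Directory body size for the "average" case above. */
#include <stdio.h>

int main(void)
{
	int entries = 15;	/* files in an average directory */
	int name_len = 24;	/* average filename length */
	int dirent_fixed = 8;	/* inode + rec_len + name_len + file_type */

	/* 15 * (24 + 8) = 480 bytes */
	printf("%d bytes\n", entries * (name_len + dirent_fixed));
	return 0;
}

(Names are padded to 4-byte boundaries in a real dirent, but 24 is
already a multiple of 4, so the 480-byte figure holds.)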

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


2009-02-27 09:17:23

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

On Feb 26, 2009 13:21 -0500, Theodore Ts'o wrote:
> I tried adding some of Andreas' suggestions which would tend to pack
> the inodes less agressively, in the hopes that it might speed up the
> mkdir operation, but at least for seq_mkdir/mkdirs_mark benchmark, it
> really didn't help, because we are disk bound, not cpu bound.

Could you explain in a bit more detail what you tried? In particular,
was this the "mapping hash range onto itable range" I have proposed in
the past?

As a rough outline of what I'm thinking, this kind of mapping might only
start once we exceed a single directory leaf block, as this coincides
with the start of htree hashing and hash-order vs. itable-order randomness.

Basically we would map the N leaf blocks of a directory into a range
of M itable blocks that had some number of free inodes. If we start
with 2 directory leaf blocks (right after split) that are 1/2 full:

4096 bytes/itable block / 512 bytes/inode = 8 inode/itable block
4096 bytes/leaf block / 40 bytes/dirent = 102 dirent/leaf block

102 dirent/leaf * 1/2 / 1 dirent/inode / 8 inode/itable = 6 itable/leaf

so that would mean filling the remaining 1/2 space in the 2 leaf blocks
would consume about 12 itable blocks. When there are 4 leaf blocks in the
directory we map to 24 itable blocks.

When we are scanning this directory (say at 4 leaf block size) for values
in the first leaf block (which is in hash order) the entries will likely
be in:
+ the first 12 itable blocks (there was no itable ordering at that time)
+ the first 3 blocks of the first 12-block range (1/4 of hash values)
+ the first 6 blocks of the second 24-block range (1/4 of hash values)
= 21 blocks

Contrast this with regular htree inode allocation: the first 1/4 of the
directory entries will likely (randomly) have entries in all 12+12+24=48
of the blocks, so we are loading/modifying about 1/2 of the itable blocks
when doing stat/unlink in the directory.

If we make a table for stat/unlink of all entries in the first leaf block:

directory size total 1 leaf blk leaf blocks access
blocks:files itable blocks file ratio accessed ratio
1 102 12 1/1 12 1
2 204 12+12=24 1/2 12+6=18 3/4
4 408 12+12+24=48 1/4 12+3+6=21 1/3
8 816 12+12+24+48=96 1/8 12+2+3+6=23 1/4
16 1632 12+12+24+48+96=192 1/16 12+1+2+3+6=24 1/8
32 3264 384 1/32 24+1=25 1/15
64 6528 768 1/64 25+1=26 1/30
128 13056 1536 1/128 27 1/57

While initially it seems that past a directory of size 8 blocks we would
only modify at most 102 itable blocks per dirent block (== number of
entries in the dirent block) and the "access ratio" would stick around 1/4,
in practice we should continue to get proportionately fewer itable blocks
loaded/modified per dirent block because the itable blocks allocated
at the beginning (12+...) are used/modified repeatedly for the first
N dirent blocks and do not further negatively impact performance (no
re-loads due to cache pressure, no re-dirtying in the journal).

In comparison, with the current "random" dirent->itable mapping we would
get another 102 new itable blocks touched for each leaf block, and for
larger directories the leaf blocks cannot even all fit into a single
journal transaction and the performance tanks because each unlink will
cause a separate 4kB block to be written into the journal.
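
For concreteness, the sizing arithmetic behind that table can be
written out as a tiny model; this only illustrates the proposed
mapping under the assumptions above (4096-byte blocks, 512-byte
inodes, ~40-byte dirents, leaves half full after a split) and is not
existing code:

/* Model of the leaf-to-itable mapping: each directory leaf block is
 * mapped onto a contiguous range of itable blocks big enough to hold
 * the inodes its remaining entries will need. */
#include <stdio.h>

int main(void)
{
	int block_size = 4096;
	int inode_size = 512;
	int dirent_size = 40;

	int inodes_per_itable = block_size / inode_size;	/* 8 */
	int dirents_per_leaf = block_size / dirent_size;	/* 102 */

	/* A freshly split leaf is about half full, so filling it
	 * consumes roughly this many itable blocks: */
	int itable_per_leaf = (dirents_per_leaf / 2) / inodes_per_itable;

	printf("%d itable blocks per half-full leaf\n", itable_per_leaf);
	printf("%d itable blocks for 2 fresh leaves\n", 2 * itable_per_leaf);
	return 0;
}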

> + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
>
> + /* Avoid using the first bg of a flexgroup for data files */
> + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&

Since these are both constants, wouldn't it make more sense to just
check the sbi->s_log_groups_per_flex against the lg of the threshold:

if (sbi->s_log_groups_per_flex > (2)) (as a #defined constant)
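
That is, something along these lines, where the _LOG companion
constant is hypothetical at this point (Ted picks up exactly this
naming in his reply below):

/* The two tests are equivalent for power-of-two flexgroup sizes;
 * comparing the stored log2 value avoids computing the shift. */
#include <stdio.h>

#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME		4
#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME_LOG	2	/* log2 of the above */

int main(void)
{
	int s_log_groups_per_flex = 4;	/* e.g. 16 groups per flexgroup */
	int flex_size = 1 << s_log_groups_per_flex;

	printf("shift form: %d, log form: %d\n",
	       flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME,
	       s_log_groups_per_flex >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME_LOG);
	return 0;
}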

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


2009-02-27 15:06:22

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

On Fri, Feb 27, 2009 at 02:17:04AM -0700, Andreas Dilger wrote:
> On Feb 26, 2009 13:21 -0500, Theodore Ts'o wrote:
> > I tried adding some of Andreas' suggestions which would tend to pack
> > the inodes less agressively, in the hopes that it might speed up the
> > mkdir operation, but at least for seq_mkdir/mkdirs_mark benchmark, it
> > really didn't help, because we are disk bound, not cpu bound.
>
> Could you explain in a bit more detail what you tried?

Sorry, no, that's not something which I tried. I was thinking more
about your complaints that Orlov was doing too much work to pack
inodes into the beginning of the flexgroup. So I tried some
experiments that reduced the number of times that we called
get_orlov_stats() (since you complained that we were using it instead
of the flex group statistics --- which involve too many spinlocks for
my taste, and which I want to nuke eventually), and what I found is
that at least for mkdirs_mark, we're not CPU bound, we're disk bound.
Therefore, spending the extra CPU time to make sure the inode table
is tightly packed is worth it, since crazy benchmarks like
TPC-C/TPC-H where you spend $$$ to add thousands of spindles so you
can be simultaneously CPU- and disk-bound don't create a large number
of directories or files. :-)

> In particular, was this the "mapping hash range onto itable range" I
> have proposed in the past?

No, it wasn't that. One of these days I'm going to have to track down
a formal write up of that idea (I'm trying to remember if you've
written it up fully or whether this is one of those things we've
talked about orally but which hasn't been formally written down to
explore all of the details).

The part which I'm not sure about is how we take into account the
fact that we don't know how big the directory will be when we are
creating the first couple of files, so we don't know how much room to
reserve in each itable range for each leaf directory. This is
critically important, since if we leave too much space in between
inodes, then we end up losing due to fragmentation, and it doesn't buy
us much.

For mid-sized directories, the optimization we have in ext4 where we
read some additional inode table blocks (since the difference between
4k and 32k is negligible) is more likely to speed up the
readdir+stat workload if the inodes are packed closely together, after
all. Spreading out the inodes could end up hurting more than it helps.

Now, if we had a way of providing a hint at mkdir time --- I'm
creating this directory, and very shortly thereafter, I will be
populating it with 600 files with an average filename length of 12 ---
then we could do some *really* interesting things; we could preallocate
blocks for the directory, and since we know the initial number of
files that will be dropped into the directory, we could reserve a
range of inodes, and then use the filename hash to determine
approximately what inode number to pick out of that reserved range for
that directory.
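
As a sketch of what that hint-driven placement could look like (purely
hypothetical; the hash below is a stand-in rather than ext4's htree
hash, and no such reservation interface exists):

/* Pick an inode out of a range reserved at mkdir time, positioned by
 * the filename hash so hash order and itable order roughly agree. */
#include <stdio.h>

static unsigned long name_hash(const char *name)
{
	unsigned long h = 5381;		/* djb2, as a stand-in */

	while (*name)
		h = h * 33 + (unsigned char)*name++;
	return h;
}

static unsigned long pick_inode(unsigned long reserved_base,
				unsigned long reserved_count,
				const char *name)
{
	return reserved_base + name_hash(name) % reserved_count;
}

int main(void)
{
	/* 600 inodes reserved starting at inode 20000: */
	printf("ino %lu\n", pick_inode(20000, 600, "datafile-042"));
	return 0;
}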

Without that hint, we would have to guess how many inodes would be
used by the directory, and it's that guesstimate which makes this hard.


Another thought: another way of solving this problem is to have a
structure at the beginning of each directory tracking the minimum and
maximum inode numbers in the directory, and the number of
directory entries in the directory. Then, you could have a heuristic
to detect the readdir+stat pattern, and if it is found, and if the
min/max inode range compares favorably to the number of directory
entries, the filesystem could simply arrange to do aggressive
pre-reading of the inodes in the min/max range on the theory that they
will likely be needed in the future.
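
A sketch of the bookkeeping that heuristic would need, again
hypothetical rather than anything in this patch:

/* Hypothetical per-directory summary for detecting when aggressive
 * inode preread pays off: if the directory's inodes are reasonably
 * dense within [min_ino, max_ino], read the whole range ahead. */
#include <stdio.h>

struct dir_ino_span {
	unsigned long min_ino;
	unsigned long max_ino;
	unsigned long nentries;
};

static int preread_worthwhile(const struct dir_ino_span *s)
{
	unsigned long span = s->max_ino - s->min_ino + 1;

	/* e.g. require the inodes to be at least half-dense */
	return s->nentries * 2 >= span;
}

int main(void)
{
	struct dir_ino_span s = { 20000, 20750, 600 };

	printf("preread: %s\n", preread_worthwhile(&s) ? "yes" : "no");
	return 0;
}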

> > + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
> >
> > + /* Avoid using the first bg of a flexgroup for data files */
> > + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
>
> Since these are both constants, wouldn't it make more sense to just
> check the sbi->s_log_groups_per_flex against the lg of the threshold:
>
> if (sbi->s_log_groups_per_flex > (2)) (as a #defined constant)

It would mean having two constants in ext4.h,
EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME and
EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME_LOG, but yeah, that would save a
shift-left instruction.

- Ted

2009-03-30 08:48:30

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH, RFC] ext4: New inode/block allocation algorithms for flex_bg filesystems

On Fri, Feb 27, 2009 at 12:08:12AM +0530, Aneesh Kumar K.V wrote:
> ....
>
> > static int find_group_orlov(struct super_block *sb, struct inode *parent,
> > - ext4_group_t *group)
> > + ext4_group_t *group, int mode)
> > {
> > ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> > struct ext4_sb_info *sbi = EXT4_SB(sb);
> > - struct ext4_super_block *es = sbi->s_es;
> > ext4_group_t ngroups = sbi->s_groups_count;
> > int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
> > unsigned int freei, avefreei;
> > ext4_fsblk_t freeb, avefreeb;
> > - ext4_fsblk_t blocks_per_dir;
> > unsigned int ndirs;
> > - int max_debt, max_dirs, min_inodes;
> > + int max_dirs, min_inodes;
> > ext4_grpblk_t min_blocks;
> > - ext4_group_t i;
> > + ext4_group_t i, grp, g;
> > struct ext4_group_desc *desc;
> > + struct orlov_stats stats;
> > + int flex_size = ext4_flex_bg_size(sbi);
> > +
> > + if (flex_size > 1) {
> > + ngroups = (ngroups + flex_size - 1) >>
> > + sbi->s_log_groups_per_flex;
> > + parent_group >>= sbi->s_log_groups_per_flex;
> > + }
> >
> > freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
> > avefreei = freei / ngroups;
> > @@ -460,71 +496,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
> > do_div(avefreeb, ngroups);
> > ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
> >
> > - if ((parent == sb->s_root->d_inode) ||
> > - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
> > + if (S_ISDIR(mode) &&
> > + ((parent == sb->s_root->d_inode) ||
> > + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
> > int best_ndir = inodes_per_group;
> > - ext4_group_t grp;
> > int ret = -1;
> >
> > get_random_bytes(&grp, sizeof(grp));
> > parent_group = (unsigned)grp % ngroups;
> > for (i = 0; i < ngroups; i++) {
> > - grp = (parent_group + i) % ngroups;
> > - desc = ext4_get_group_desc(sb, grp, NULL);
> > - if (!desc || !ext4_free_inodes_count(sb, desc))
> > + g = (parent_group + i) % ngroups;
> > + get_orlov_stats(sb, g, flex_size, &stats);
> > + if (!stats.free_inodes)
> > continue;
> > - if (ext4_used_dirs_count(sb, desc) >= best_ndir)
> > + if (stats.used_dirs >= best_ndir)
> > continue;
> > - if (ext4_free_inodes_count(sb, desc) < avefreei)
> > + if (stats.free_inodes < avefreei)
> > continue;
> > - if (ext4_free_blks_count(sb, desc) < avefreeb)
> > + if (stats.free_blocks < avefreeb)
> > continue;
> > - *group = grp;
> > + grp = g;
> > ret = 0;
> > - best_ndir = ext4_used_dirs_count(sb, desc);
> > + best_ndir = stats.used_dirs;
> > + }
> > + if (ret)
> > + goto fallback;
> > + found_flex_bg:
> > + if (flex_size == 1) {
> > + *group = grp;
> > + return 0;
> > + }
> > +
> > + /*
> > + * We pack inodes at the beginning of the flexgroup's
> > + * inode tables. Block allocation decisions will do
> > + * something similar, although regular files will
> > + * start at the 2nd block group of the flexgroup. See
> > + * ext4_ext_find_goal() and ext4_find_near().
> > + */
> > + grp *= flex_size;
> > + for (i = 0; i < flex_size; i++) {
> > + if (grp+i >= sbi->s_groups_count)
> > + break;
> > + desc = ext4_get_group_desc(sb, grp+i, NULL);
> > + if (desc && ext4_free_inodes_count(sb, desc)) {
> > + *group = grp+i;
> > + return 0;
> > + }
> > }
> > - if (ret == 0)
> > - return ret;
> > goto fallback;
> > }
> >
> > - blocks_per_dir = ext4_blocks_count(es) - freeb;
> > - do_div(blocks_per_dir, ndirs);
> > -
> > max_dirs = ndirs / ngroups + inodes_per_group / 16;
> > - min_inodes = avefreei - inodes_per_group / 4;
> > - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
> > -
> > - max_debt = EXT4_BLOCKS_PER_GROUP(sb);
> > - max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
> > - if (max_debt * INODE_COST > inodes_per_group)
> > - max_debt = inodes_per_group / INODE_COST;
> > - if (max_debt > 255)
> > - max_debt = 255;
> > - if (max_debt == 0)
> > - max_debt = 1;
> > + min_inodes = avefreei - inodes_per_group*flex_size / 4;
> > + if (min_inodes < 1)
> > + min_inodes = 1;
> > + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
> > +
> > + /*
> > + * Start looking in the flex group where we last allocated an
> > + * inode for this parent directory
> > + */
> > + if (EXT4_I(parent)->i_last_alloc_group != ~0) {
> > + parent_group = EXT4_I(parent)->i_last_alloc_group;
> > + if (flex_size > 1)
> > + parent_group >>= sbi->s_log_groups_per_flex;
> > + }
> >
> > for (i = 0; i < ngroups; i++) {
> > - *group = (parent_group + i) % ngroups;
> > - desc = ext4_get_group_desc(sb, *group, NULL);
> > - if (!desc || !ext4_free_inodes_count(sb, desc))
> > - continue;
> > - if (ext4_used_dirs_count(sb, desc) >= max_dirs)
> > + grp = (parent_group + i) % ngroups;
> > + get_orlov_stats(sb, grp, flex_size, &stats);
> > + if (stats.used_dirs >= max_dirs)
> > continue;
> > - if (ext4_free_inodes_count(sb, desc) < min_inodes)
> > + if (stats.free_inodes < min_inodes)
> > continue;
> > - if (ext4_free_blks_count(sb, desc) < min_blocks)
> > + if (stats.free_blocks < min_blocks)
> > continue;
> > - return 0;
> > + goto found_flex_bg;
> > }
> >
> > fallback:
> > + ngroups = sbi->s_groups_count;
> > + avefreei = freei / ngroups;
> > + parent_group = EXT4_I(parent)->i_block_group;
> > for (i = 0; i < ngroups; i++) {
> > - *group = (parent_group + i) % ngroups;
> > - desc = ext4_get_group_desc(sb, *group, NULL);
> > + grp = (parent_group + i) % ngroups;
> > + desc = ext4_get_group_desc(sb, grp, NULL);
> > if (desc && ext4_free_inodes_count(sb, desc) &&
> > - ext4_free_inodes_count(sb, desc) >= avefreei)
> > + ext4_free_inodes_count(sb, desc) >= avefreei) {
> > + *group = grp;
> > return 0;
> > + }
> > }
> >
> > if (avefreei) {
> > @@ -540,12 +602,51 @@ fallback:
> > }
> >
> > static int find_group_other(struct super_block *sb, struct inode *parent,
> > - ext4_group_t *group)
> > + ext4_group_t *group, int mode)
> > {
> > ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> > ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
> > struct ext4_group_desc *desc;
> > - ext4_group_t i;
> > + ext4_group_t i, last;
> > + int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
> > +
> > + /*
> > + * Try to place the inode in the same flex group as its
> > + * parent. If we can't find space, use the Orlov algorithm to
> > + * find another flex group, and store that information in the
> > + * parent directory's inode information so that we use that flex
> > + * group for future allocations.
> > + */
> > + if (flex_size > 1) {
> > + int retry = 0;
> > +
> > + try_again:
> > + parent_group &= ~(flex_size-1);
> > + last = parent_group + flex_size;
> > + if (last > ngroups)
> > + last = ngroups;
> > + for (i = parent_group; i < last; i++) {
> > + desc = ext4_get_group_desc(sb, i, NULL);
> > + if (desc && ext4_free_inodes_count(sb, desc)) {
> > + *group = i;
> > + return 0;
> > + }
> > + }
> > + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
> > + retry = 1;
> > + parent_group = EXT4_I(parent)->i_last_alloc_group;
> > + goto try_again;
> > + }
>
>
> For directories we try first with i_last_alloc_group and then with the
> parent's i_block_group. For files we do it the other way round here.
> Does it make sense to try i_last_alloc_group first?
>
>
>
> > + /*
> > + * If this didn't work, use the Orlov search algorithm
> > + * to find a new flex group; we pass in the mode to
> > + * avoid the topdir algorithms.
> > + */
> > + *group = parent_group + flex_size;
> > + if (*group > ngroups)
> > + *group = 0;
> > + return find_group_orlov(sb, parent, group, mode);
> > + }
> >
> > /*
>
> If you want you can add
> Reviewed-by: Aneesh Kumar K.V <[email protected]>
>

How about the below update for the patch?

commit 79561847d40ba84a8618f10445cb9f132c57bdd9
Author: Aneesh Kumar K.V <[email protected]>
Date: Mon Mar 30 13:04:16 2009 +0530

update for orlov allocator

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 47b84e8..4dac65c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -611,19 +611,26 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
static int find_group_other(struct super_block *sb, struct inode *parent,
ext4_group_t *group, int mode)
{
- ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t parent_group;
struct ext4_group_desc *desc;
ext4_group_t i, last;
int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
-
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
/*
 * Try to place the inode in the same flex group as its
* parent. If we can't find space, use the Orlov algorithm to
* find another flex group, and store that information in the
 * parent directory's inode information so that we use that flex
* group for future allocations.
+ *
+ * Start looking in the flex group where we last allocated an
+ * inode for this parent directory.
*/
+ if (EXT4_I(parent)->i_last_alloc_group != ~0)
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ else
+ parent_group = EXT4_I(parent)->i_block_group;
+
if (flex_size > 1) {
int retry = 0;

@@ -640,8 +647,13 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
}
}
if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+ /*
+ * With i_last_alloc_group we failed to find a
+ * group with free inodes, so try the parent
+ * directory's inode group.
+ */
+ parent_group = EXT4_I(parent)->i_block_group;
retry = 1;
- parent_group = EXT4_I(parent)->i_last_alloc_group;
goto try_again;
}
/*