2009-06-10 20:27:27

by Andreas Dilger

[permalink] [raw]
Subject: [PATCH] allow goal inode to be specified

Allow a goal inode to be specified when allocating new inodes.
If the goal is within the range of valid inodes use this in
preference to one of the other target inode heuristics (Orlov or
parent directory).

The goal inode can be specified via /sys/fs/{dev}/inode_goal for
testing inode allocation beyond 2^32 blocks on very large filesystems,
and in the future will be used for large xattr inode allocation.

Signed-off-by: Andreas Dilger <[email protected]>

Index: linux-stage/fs/ext4/ialloc.c
===================================================================
--- linux-stage.orig/fs/ext4/ialloc.c
+++ linux-stage/fs/ext4/ialloc.c
@@ -675,7 +675,8 @@ err_ret:
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
+struct inode *ext4_new_inode_goal(handle_t *handle, struct inode *dir,
+ int mode, unsigned goal)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -706,6 +707,14 @@ struct inode *ext4_new_inode(handle_t *h
sbi = EXT4_SB(sb);
es = sbi->s_es;

+ if (goal && goal < le32_to_cpu(es->s_inodes_count)) {
+ group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
+ ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
+
+ ret2 = 0;
+ goto got_group;
+ }
+
if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
ret2 = find_group_flex(sb, dir, &group);
if (ret2 == -1) {
@@ -724,7 +733,7 @@ got_group:
if (ret2 == -1)
goto out;

- for (i = 0; i < sbi->s_groups_count; i++) {
+ for (i = 0; i < sbi->s_groups_count; i++, ino = 0) {
err = -EIO;

gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -736,8 +745,6 @@ got_group:
if (!inode_bitmap_bh)
goto fail;

- ino = 0;
-
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
inode_bitmap_bh->b_data,
Index: linux-stage/fs/ext4/ext4.h
===================================================================
--- linux-stage.orig/fs/ext4/ext4.h
+++ linux-stage/fs/ext4/ext4.h
@@ -1032,7 +1032,14 @@ extern int ext4fs_dirhash(const char *na
dx_hash_info *hinfo);

/* ialloc.c */
-extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
+extern struct inode *ext4_new_inode_goal(handle_t *handle, struct inode *dir,
+ int mode, unsigned goal);
+static inline struct inode *ext4_new_inode(handle_t *handle, struct inode *dir,
+ int mode)
+{
+ return ext4_new_inode_goal(handle, dir, mode,
+ EXT4_SB(dir->i_sb)->s_inode_goal);
+}
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
Index: linux-stage/fs/ext4/super.c
===================================================================
--- linux-stage.orig/fs/ext4/super.c
+++ linux-stage/fs/ext4/super.c
@@ -2145,6 +2145,7 @@
EXT4_RO_ATTR(lifetime_write_kbytes);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
@@ -2157,6 +2158,7 @@
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
ATTR_LIST(mb_max_to_scan),
ATTR_LIST(mb_min_to_scan),
Index: linux-stage/fs/ext4/ext4_sb.h
===================================================================
--- linux-stage.orig/fs/ext4/ext4_sb.h
+++ linux-stage/fs/ext4/ext4_sb.h
@@ -53,6 +53,7 @@ struct ext4_sb_info {
int s_inode_size;
int s_first_ino;
unsigned int s_inode_readahead_blks;
+ unsigned int s_inode_goal;
spinlock_t s_next_gen_lock;
u32 s_next_generation;
u32 s_hash_seed[4];

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.



2009-06-13 15:48:14

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] allow goal inode to be specified

On Wed, Jun 10, 2009 at 02:27:02PM -0600, Andreas Dilger wrote:
> Allow a goal inode to be specified when allocating new inodes.
> If the goal is within the range of valid inodes use this in
> preference to one of the other target inode heuristics (Orlov or
> parent directory).
>
> The goal inode can be specified via /sys/fs/{dev}/inode_goal for
> testing inode allocation beyond 2^32 blocks on very large filesystems,
> and in the future will be used for large xattr inode allocation.

This patch didn't quite apply against the mainline ext4, and in
addition it conflicted with a patch to keep the choice of Orlov topdir
inode numbers stable for the purposes of benchmarking. So instead I
dropped the following two patches into the ext4 patch queue.

- Ted

2009-06-13 16:33:20

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2/2] ext4: teach the inode allocator to use a goal inode number

From: Andreas Dilger <[email protected]>

Enhance the inode allocator to take a goal inode number as a
paremeter; if it is specified, it takes precedence over Orlov or
parent directory inode allocation algorithms.

The extents migration function uses the goal inode number so that the
extent trees allocated the migration function use the correct flex_bg.
In the future, the goal inode functionality will also be used to
allocate an adjacent inode for the extended attributes.

Also, for testing purposes the goal inode number can be specified via
/sys/fs/{dev}/inode_goal. This can be useful for testing inode
allocation beyond 2^32 blocks on very large filesystems.

Signed-off-by: Andreas Dilger <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
Documentation/ABI/testing/sysfs-fs-ext4 | 10 ++++++++++
fs/ext4/ext4.h | 3 ++-
fs/ext4/ialloc.c | 16 ++++++++++++----
fs/ext4/migrate.c | 5 ++++-
fs/ext4/namei.c | 10 ++++++----
fs/ext4/super.c | 2 ++
6 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index 4e79074..5fb7099 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -79,3 +79,13 @@ Description:
This file is read-only and shows the number of
kilobytes of data that have been written to this
filesystem since it was mounted.
+
+What: /sys/fs/ext4/<disk>/inode_goal
+Date: June 2008
+Contact: "Theodore Ts'o" <[email protected]>
+Description:
+ Tuning parameter which (if non-zero) controls the goal
+ inode used by the inode allocator in p0reference to
+ all other allocation hueristics. This is intended for
+ debugging use only, and should be 0 on production
+ systems.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d035cf1..746cdcb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -863,6 +863,7 @@ struct ext4_sb_info {
int s_inode_size;
int s_first_ino;
unsigned int s_inode_readahead_blks;
+ unsigned int s_inode_goal;
spinlock_t s_next_gen_lock;
u32 s_next_generation;
u32 s_hash_seed[4];
@@ -1316,7 +1317,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct

/* ialloc.c */
extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
- const struct qstr *qstr);
+ const struct qstr *qstr, __u32 goal);
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3f98ee7..2f64573 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -799,7 +799,7 @@ err_ret:
* group to find a free inode.
*/
struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
- const struct qstr *qstr)
+ const struct qstr *qstr, __u32 goal)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -830,6 +830,16 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
ei = EXT4_I(inode);
sbi = EXT4_SB(sb);

+ if (!goal)
+ goal = sbi->s_inode_goal;
+
+ if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) {
+ group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
+ ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
+ ret2 = 0;
+ goto got_group;
+ }
+
if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
ret2 = find_group_flex(sb, dir, &group);
if (ret2 == -1) {
@@ -858,7 +868,7 @@ got_group:
if (ret2 == -1)
goto out;

- for (i = 0; i < ngroups; i++) {
+ for (i = 0; i < ngroups; i++, ino = 0) {
err = -EIO;

gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -870,8 +880,6 @@ got_group:
if (!inode_bitmap_bh)
goto fail;

- ino = 0;
-
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
inode_bitmap_bh->b_data,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 80d075b..313a50b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -458,6 +458,7 @@ int ext4_ext_migrate(struct inode *inode)
struct inode *tmp_inode = NULL;
struct list_blocks_struct lb;
unsigned long max_entries;
+ __u32 goal;

/*
* If the filesystem does not support extents, or the inode
@@ -483,8 +484,10 @@ int ext4_ext_migrate(struct inode *inode)
retval = PTR_ERR(handle);
return retval;
}
+ goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
+ EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
- S_IFREG, 0);
+ S_IFREG, 0, goal);
if (IS_ERR(tmp_inode)) {
retval = -ENOMEM;
ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5f00d24..de04013 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1782,7 +1782,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);

- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name);
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -1816,7 +1816,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);

- inode = ext4_new_inode(handle, dir, mode, &dentry->d_name);
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
@@ -1853,7 +1853,8 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);

- inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name);
+ inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
+ &dentry->d_name, 0);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -2264,7 +2265,8 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);

- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name);
+ inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
+ &dentry->d_name, 0);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04486a5..23013d3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2206,6 +2206,7 @@ EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
@@ -2218,6 +2219,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
ATTR_LIST(mb_max_to_scan),
ATTR_LIST(mb_min_to_scan),
--
1.6.3.2.1.gb9f7d.dirty


2009-06-13 16:33:20

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 1/2] ext4: Use a hash of the topdir directory name for the Orlov parent group

Instead of using a random number to determine the goal parent grop for
the Orlov top directories, use a hash of the directory name. This
allows for repeatable results when trying to benchmark filesystem
layout algorithms.

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 3 ++-
fs/ext4/ialloc.c | 19 ++++++++++++++-----
fs/ext4/migrate.c | 5 ++---
fs/ext4/namei.c | 8 ++++----
4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 06ee5a5..d035cf1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1315,7 +1315,8 @@ extern int ext4fs_dirhash(const char *name, int len, struct
dx_hash_info *hinfo);

/* ialloc.c */
-extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
+extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
+ const struct qstr *qstr);
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7d502f3..3f98ee7 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -470,7 +470,8 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g,
*/

static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group, int mode)
+ ext4_group_t *group, int mode,
+ const struct qstr *qstr)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -485,6 +486,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
struct ext4_group_desc *desc;
struct orlov_stats stats;
int flex_size = ext4_flex_bg_size(sbi);
+ struct dx_hash_info hinfo;

ngroups = real_ngroups;
if (flex_size > 1) {
@@ -506,7 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
int best_ndir = inodes_per_group;
int ret = -1;

- get_random_bytes(&grp, sizeof(grp));
+ if (qstr) {
+ hinfo.hash_version = DX_HASH_HALF_MD4;
+ hinfo.seed = sbi->s_hash_seed;
+ ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
+ grp = hinfo.hash;
+ } else
+ get_random_bytes(&grp, sizeof(grp));
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
@@ -649,7 +657,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*group = parent_group + flex_size;
if (*group > ngroups)
*group = 0;
- return find_group_orlov(sb, parent, group, mode);
+ return find_group_orlov(sb, parent, group, mode, 0);
}

/*
@@ -790,7 +798,8 @@ err_ret:
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
+ const struct qstr *qstr)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
@@ -839,7 +848,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (test_opt(sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
else
- ret2 = find_group_orlov(sb, dir, &group, mode);
+ ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
} else
ret2 = find_group_other(sb, dir, &group, mode);

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index fe64d9f..80d075b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -483,9 +483,8 @@ int ext4_ext_migrate(struct inode *inode)
retval = PTR_ERR(handle);
return retval;
}
- tmp_inode = ext4_new_inode(handle,
- inode->i_sb->s_root->d_inode,
- S_IFREG);
+ tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+ S_IFREG, 0);
if (IS_ERR(tmp_inode)) {
retval = -ENOMEM;
ext4_journal_stop(handle);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 07eb664..5f00d24 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1782,7 +1782,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);

- inode = ext4_new_inode (handle, dir, mode);
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -1816,7 +1816,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);

- inode = ext4_new_inode(handle, dir, mode);
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
@@ -1853,7 +1853,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);

- inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
+ inode = ext4_new_inode(handle, dir, S_IFDIR | mode, &dentry->d_name);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -2264,7 +2264,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);

- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
+ inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
--
1.6.3.2.1.gb9f7d.dirty