2008-04-29 10:57:56

by Aneesh Kumar K.V

Subject: [RFC PATCH] ext4: Group meta-data blocks together.

Hi Eric,

I haven't yet tested this. Let me know what you think.

-aneesh

This adds a per-inode meta-data prealloc space from which
meta-data block requests are served. This helps keep a file's
meta-data blocks close together, which speeds up unlink: unlink
has to read the file's extent index blocks, and that is faster
when they are clustered. Any new prealloc space is allocated
near the specified goal block. The goal block is the last block
allocated for the file, so data blocks and meta-data blocks do
not end up far apart.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/ext4_i.h | 1 +
fs/ext4/mballoc.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++-----
fs/ext4/mballoc.h | 7 +-
fs/ext4/super.c | 1 +
4 files changed, 230 insertions(+), 26 deletions(-)
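A toy userspace sketch of the window logic follows, for anyone who
wants the shape of the change without wading through the diff. This
is a simplification, not the patch code: the per-inode list, locking,
RCU and block-bitmap updates are all omitted, and the goal value in
main() is invented for the example.

#include <stdio.h>

/* Toy model of the per-inode meta-data prealloc window.  The real
 * logic is in ext4_mb_use_meta_block_pa() and
 * ext4_mb_new_meta_block_pa() in the diff below. */
struct meta_pa {
	unsigned long pstart;	/* next free physical block */
	unsigned int free;	/* blocks left in the window */
};

/* Window size policy, mirroring ext4_mb_normalize_meta_data_request():
 * 10 blocks for large files, 2 for small ones. */
static unsigned int meta_window_len(int large_file)
{
	return large_file ? 10 : 2;
}

/* Serve one meta-data block near @goal, opening a new window right
 * after the goal when the current window is exhausted. */
static unsigned long meta_alloc_block(struct meta_pa *pa,
				      unsigned long goal, int large_file)
{
	if (!pa->free) {
		pa->pstart = goal + 1;
		pa->free = meta_window_len(large_file);
	}
	pa->free--;
	return pa->pstart++;
}

int main(void)
{
	struct meta_pa pa = { 0, 0 };
	int i;

	/* Successive index-block requests come out contiguous. */
	for (i = 0; i < 4; i++)
		printf("meta block %lu\n",
		       meta_alloc_block(&pa, 45481, 1));
	return 0;
}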

diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae2..4f11ec4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -161,6 +161,7 @@ struct ext4_inode_info {

/* mballoc */
struct list_head i_prealloc_list;
+ struct list_head i_metaprealloc_list;
spinlock_t i_prealloc_lock;
};

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 2a6c814..0e7a9c5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1730,10 +1730,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
-
- /* searching for the right group start from the goal value specified */
- group = ac->ac_g_ex.fe_group;
-
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -1743,6 +1739,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;
@@ -2842,6 +2843,23 @@ out_err:
return err;
}

+static void
+ext4_mb_normalize_meta_data_request(struct ext4_allocation_context *ac)
+{
+ /*
+ * Need to find what the right normalized block count should be
+ */
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ if (i_size_read(ac->ac_inode) >= sbi->s_mb_stream_request) {
+ /* large inode which is using inode prealloc */
+ ac->ac_g_ex.fe_len = 10;
+ } else {
+ ac->ac_g_ex.fe_len = 2;
+ }
+ mb_debug("#%u: goal %lu blocks for meta-data group\n",
+ current->pid, ac->ac_g_ex.fe_len);
+}
+
/*
* here we normalize request for locality group
* Group request are normalized to s_strip size if we set the same via mount
@@ -2879,11 +2897,6 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_prealloc_space *pa;

- /* do normalize only data requests, metadata requests
- do not need preallocation */
- if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
- return;
-
/* sometime caller may want exact blocks */
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
@@ -2893,6 +2906,14 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
return;

+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ /* meta-data preallocation space
+ * depends on the file size.
+ */
+ ext4_mb_normalize_meta_data_request(ac);
+ return;
+ }
+
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
ext4_mb_normalize_group_request(ac);
return ;
@@ -3074,6 +3095,26 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)

ext4_mb_store_history(ac);
}
+/*
+ * use blocks preallocated to meta-data prealloc space
+ */
+static void ext4_mb_use_meta_block_pa(struct ext4_allocation_context *ac,
+ struct ext4_prealloc_space *pa)
+{
+ unsigned len = ac->ac_o_ex.fe_len;
+
+ ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
+ &ac->ac_b_ex.fe_group,
+ &ac->ac_b_ex.fe_start);
+ ac->ac_b_ex.fe_len = len;
+ ac->ac_status = AC_STATUS_FOUND;
+ ac->ac_pa = pa;
+
+ mb_debug("use %u/%u from meta group pa %p\n", pa->pa_pstart, len, pa);
+ pa->pa_pstart += ac->ac_b_ex.fe_len;
+ pa->pa_free -= ac->ac_b_ex.fe_len;
+ pa->pa_len -= ac->ac_b_ex.fe_len;
+}

/*
* use blocks preallocated to inode
@@ -3136,9 +3177,26 @@ static noinline int ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
struct ext4_locality_group *lg;
struct ext4_prealloc_space *pa;

- /* only data can be preallocated */
- if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) {
+ /* meta-data allocation request */
+ rcu_read_lock();
+ list_for_each_entry_rcu(pa, &ei->i_metaprealloc_list,
+ pa_inode_list) {
+ /* found preallocated blocks, use them */
+ spin_lock(&pa->pa_lock);
+ if (pa->pa_deleted == 0 && pa->pa_free) {
+ atomic_inc(&pa->pa_count);
+ ext4_mb_use_meta_block_pa(ac, pa);
+ spin_unlock(&pa->pa_lock);
+ ac->ac_criteria = 10;
+ rcu_read_unlock();
+ return 1;
+ }
+ spin_unlock(&pa->pa_lock);
+ }
+ rcu_read_unlock();
return 0;
+ }

/* first, try per-file preallocation */
rcu_read_lock();
@@ -3291,6 +3349,58 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}

+static noinline int
+ext4_mb_new_meta_block_pa(struct ext4_allocation_context *ac)
+{
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_prealloc_space *pa;
+ struct ext4_group_info *grp;
+ struct ext4_inode_info *ei;
+
+ /* preallocate only when found space is larger than requested */
+ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
+ BUG_ON(ac->ac_status != AC_STATUS_FOUND);
+
+ pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
+ if (pa == NULL)
+ return -ENOMEM;
+
+ /* preallocation can change ac_b_ex, thus we store actually
+ * allocated blocks for history */
+ ac->ac_f_ex = ac->ac_b_ex;
+
+ pa->pa_lstart = 0;
+ pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ pa->pa_len = ac->ac_b_ex.fe_len;
+ pa->pa_free = pa->pa_len;
+ atomic_set(&pa->pa_count, 1);
+ spin_lock_init(&pa->pa_lock);
+ pa->pa_deleted = 0;
+ pa->pa_type = PA_META_PA;
+
+ mb_debug("new meta pa %p: %llu/%u\n", pa,
+ pa->pa_pstart, pa->pa_len);
+
+ ext4_mb_use_meta_block_pa(ac, pa);
+ atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
+
+ ei = EXT4_I(ac->ac_inode);
+ grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+
+ pa->pa_obj_lock = &ei->i_prealloc_lock;
+ pa->pa_inode = ac->ac_inode;
+
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+ list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
+
+ spin_lock(pa->pa_obj_lock);
+ list_add_rcu(&pa->pa_inode_list, &ei->i_metaprealloc_list);
+ spin_unlock(pa->pa_obj_lock);
+
+ return 0;
+}
+
/*
* creates new preallocated space for given inode
*/
@@ -3353,7 +3463,7 @@ static noinline int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
pa->pa_deleted = 0;
- pa->pa_linear = 0;
+ pa->pa_type = PA_INODE_PA;

mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3409,7 +3519,7 @@ static noinline int ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
pa->pa_deleted = 0;
- pa->pa_linear = 1;
+ pa->pa_type = PA_GROUP_PA;

mb_debug("new group pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3439,7 +3549,9 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
{
int err;

- if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ err = ext4_mb_new_meta_block_pa(ac);
+ else if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
err = ext4_mb_new_group_pa(ac);
else
err = ext4_mb_new_inode_pa(ac);
@@ -3521,6 +3633,35 @@ static noinline int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,

return err;
}
+static noinline int ext4_mb_release_meta_block_pa(struct ext4_buddy *e4b,
+ struct ext4_prealloc_space *pa,
+ struct ext4_allocation_context *ac)
+{
+ struct super_block *sb = e4b->bd_sb;
+ ext4_group_t group;
+ ext4_grpblk_t bit;
+
+ if (ac)
+ ac->ac_op = EXT4_MB_HISTORY_DISCARD;
+
+ BUG_ON(pa->pa_deleted == 0);
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+ BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+ mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
+ atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
+
+ if (ac) {
+ ac->ac_sb = sb;
+ ac->ac_inode = NULL;
+ ac->ac_b_ex.fe_group = group;
+ ac->ac_b_ex.fe_start = bit;
+ ac->ac_b_ex.fe_len = pa->pa_len;
+ ac->ac_b_ex.fe_logical = 0;
+ ext4_mb_store_history(ac);
+ }
+
+ return 0;
+}

static noinline int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
struct ext4_prealloc_space *pa,
@@ -3649,11 +3790,18 @@ repeat:
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);

- if (pa->pa_linear)
+ switch (pa->pa_type) {
+ case PA_META_PA:
+ ext4_mb_release_meta_block_pa(&e4b, pa, ac);
+ break;
+ case PA_GROUP_PA:
ext4_mb_release_group_pa(&e4b, pa, ac);
- else
- ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
-
+ break;
+ case PA_INODE_PA:
+ ext4_mb_release_inode_pa(&e4b,
+ bitmap_bh, pa, ac);
+ break;
+ }
list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
@@ -3688,10 +3836,8 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
struct ext4_buddy e4b;
int err;

- if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
- /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
+ if (!test_opt(sb, MBALLOC))
return;
- }

mb_debug("discard preallocation for inode %lu\n", inode->i_ino);

@@ -3701,6 +3847,49 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
repeat:
/* first, collect all pa's in the inode */
spin_lock(&ei->i_prealloc_lock);
+ while (!list_empty(&ei->i_metaprealloc_list)) {
+ pa = list_entry(ei->i_metaprealloc_list.next,
+ struct ext4_prealloc_space, pa_inode_list);
+ BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+ spin_lock(&pa->pa_lock);
+ if (atomic_read(&pa->pa_count)) {
+ /* this shouldn't happen often - nobody should
+ * use preallocation while we're discarding it */
+ spin_unlock(&pa->pa_lock);
+ spin_unlock(&ei->i_prealloc_lock);
+ printk(KERN_ERR "uh-oh! used pa while discarding\n");
+ WARN_ON(1);
+ schedule_timeout_uninterruptible(HZ);
+ goto repeat;
+
+ }
+ if (pa->pa_deleted == 0) {
+ pa->pa_deleted = 1;
+ spin_unlock(&pa->pa_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ list_add(&pa->u.pa_tmp_list, &list);
+ continue;
+ }
+
+ /* someone is deleting pa right now */
+ spin_unlock(&pa->pa_lock);
+ spin_unlock(&ei->i_prealloc_lock);
+
+ /* we have to wait here because pa_deleted
+ * doesn't mean pa is already unlinked from
+ * the list. as we might be called from
+ * ->clear_inode() the inode will get freed
+ * and concurrent thread which is unlinking
+ * pa from inode's list may access already
+ * freed memory, bad-bad-bad */
+
+ /* XXX: if this happens too often, we can
+ * add a flag to force wait only in case
+ * of ->clear_inode(), but not in case of
+ * regular truncate */
+ schedule_timeout_uninterruptible(HZ);
+ goto repeat;
+ }
while (!list_empty(&ei->i_prealloc_list)) {
pa = list_entry(ei->i_prealloc_list.next,
struct ext4_prealloc_space, pa_inode_list);
@@ -3747,7 +3936,6 @@ repeat:
spin_unlock(&ei->i_prealloc_lock);

list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
- BUG_ON(pa->pa_linear != 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);

err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -3762,7 +3950,18 @@ repeat:

ext4_lock_group(sb, group);
list_del(&pa->pa_group_list);
- ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+
+ switch (pa->pa_type) {
+ case PA_META_PA:
+ ext4_mb_release_meta_block_pa(&e4b, pa, ac);
+ break;
+ case PA_INODE_PA:
+ ext4_mb_release_inode_pa(&e4b,
+ bitmap_bh, pa, ac);
+ break;
+ default:
+ BUG();
+ }
ext4_unlock_group(sb, group);

ext4_mb_release_desc(&e4b);
@@ -3966,7 +4165,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
if (ac->ac_pa) {
- if (ac->ac_pa->pa_linear) {
+ if (ac->ac_pa->pa_type == PA_GROUP_PA) {
/* see comment in ext4_mb_use_group_pa() */
spin_lock(&ac->ac_pa->pa_lock);
ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index bfe6add..2cc8440 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -131,6 +131,10 @@ struct ext4_group_info {
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))

+#define PA_INODE_PA 0
+#define PA_GROUP_PA 1
+#define PA_META_PA 2
+

struct ext4_prealloc_space {
struct list_head pa_inode_list;
@@ -146,8 +150,7 @@ struct ext4_prealloc_space {
ext4_lblk_t pa_lstart; /* log. block */
unsigned short pa_len; /* len of preallocated chunk */
unsigned short pa_free; /* how many blocks are free */
- unsigned short pa_linear; /* consumed in one direction
- * strictly, for grp prealloc */
+ unsigned short pa_type; /* type of prealloc space */
spinlock_t *pa_obj_lock;
struct inode *pa_inode; /* hack, for history only */
};
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6d54397..6d237ad 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -571,6 +571,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->vfs_inode.i_version = 1;
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
+ INIT_LIST_HEAD(&ei->i_metaprealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
return &ei->vfs_inode;
}
--
1.5.5.1.67.gbdb87.dirty



2008-04-29 20:46:21

by Eric Sandeen

Subject: Re: [RFC PATCH] ext4: Group meta-data blocks together.

Aneesh Kumar K.V wrote:
> Hi Eric,
>
> I haven't yet tested this. Let me know what you think.
>
> -aneesh

I'll look over it when I get a chance; I did do a quick test with my RM
scenario, and it came in about 1s faster than stock (5s vs. 6s) :)

http://people.redhat.com/esandeen/rm_test/ext4_aneesh_rm.png

-Eric

2008-04-30 10:19:36

by Aneesh Kumar K.V

Subject: Re: [RFC PATCH] ext4: Group meta-data blocks together.

On Tue, Apr 29, 2008 at 03:19:35PM -0500, Eric Sandeen wrote:
> Aneesh Kumar K.V wrote:
> > Hi Eric,
> >
> > I haven't yet tested this. Let me know what you think.
> >
> > -aneesh
>
> I'll look over it when I get a chance; I did do a quick test with my RM
> scenario, and it came in about 1s faster than stock (5s vs. 6s) :)
>
> http://people.redhat.com/esandeen/rm_test/ext4_aneesh_rm.png

I did minimal testing. The layout of the meta-data blocks with the
patch applied is shown below; the patch does cluster the meta-data
blocks together. The patch preallocates some blocks which are then
used to serve subsequent meta-data block requests. The number of
blocks preallocated is determined by:

static void
ext4_mb_normalize_meta_data_request(struct ext4_allocation_context *ac)
{
/*
* Need to find what the right normalized block count should be
*/
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
if (i_size_read(ac->ac_inode) >= sbi->s_mb_stream_request) {
/* large inode which is using inode prealloc */
ac->ac_g_ex.fe_len = 10;
} else {
ac->ac_g_ex.fe_len = 2;
}
mb_debug("#%u: goal %lu blocks for meta-data group\n",
current->pid, ac->ac_g_ex.fe_len);
}

i.e., for large files, which use inode preallocation, I preallocate
10 blocks, and for small files 2 blocks. The output also shows that
when we allocate a new preallocation space we try to place the window
close to the file's data blocks. Note how the index blocks below
cluster into a few narrow pblk ranges (45482-45491, 6205571-6205580,
3371269-3371278), matching the preallocated windows.

extent: lblk 0--335871, len 335872, pblk 45482, flags: 2ND_VISIT
extent: lblk 335872--679935, len 344064, pblk 45483, flags: (none)
extent: lblk 679936--1023999, len 344064, pblk 45484, flags: (none)
extent: lblk 1024000--1368063, len 344064, pblk 45485, flags: (none)
extent: lblk 1368064--1712127, len 344064, pblk 45487, flags: (none)
extent: lblk 1712128--2056191, len 344064, pblk 45488, flags: (none)
extent: lblk 2056192--2400255, len 344064, pblk 45489, flags: (none)
extent: lblk 2400256--2744319, len 344064, pblk 45490, flags: (none)
extent: lblk 2744320--3088383, len 344064, pblk 45491, flags: (none)
extent: lblk 3088384--3432447, len 344064, pblk 6205571, flags: (none)
extent: lblk 3432448--3776511, len 344064, pblk 6205572, flags: (none)
extent: lblk 3776512--4120575, len 344064, pblk 6205573, flags: (none)
extent: lblk 4120576--4464639, len 344064, pblk 6205574, flags: (none)
extent: lblk 4464640--4808703, len 344064, pblk 6205575, flags: (none)
extent: lblk 4808704--5013373, len 204670, pblk 6205576, flags: (none)
extent: lblk 5013374--5185405, len 172032, pblk 6205577, flags: (none)
extent: lblk 5185406--5357437, len 172032, pblk 6205578, flags: (none)
extent: lblk 5357438--5529469, len 172032, pblk 6205579, flags: (none)
extent: lblk 5529470--5701501, len 172032, pblk 6205580, flags: (none)
extent: lblk 5701502--5873533, len 172032, pblk 3371269, flags: (none)
extent: lblk 5873534--6045565, len 172032, pblk 3371270, flags: (none)
extent: lblk 6045566--6217597, len 172032, pblk 3371271, flags: (none)
extent: lblk 6217598--6389629, len 172032, pblk 3371272, flags: (none)
extent: lblk 6389630--6561661, len 172032, pblk 3371273, flags: (none)
extent: lblk 6561662--6733693, len 172032, pblk 3371274, flags: (none)
extent: lblk 6733694--6905725, len 172032, pblk 3371275, flags: (none)
extent: lblk 6905726--7077757, len 172032, pblk 3371276, flags: (none)
extent: lblk 7077758--7249789, len 172032, pblk 3371277, flags: (none)
extent: lblk 7249790--7421691, len 171902, pblk 3371278, flags: (none)
extent: lblk 7421692--7593723, len 172032, pblk 1094153, flags: (none)

The meta-data block 1094153 sits right next to its data: the leaf
extent shown below starts at pblk 1093893 and spans 260 blocks,
ending at pblk 1094152, so the index block at 1094153 is immediately
adjacent:

header: magic=f30a entries=84 max=84 depth=0 generation=0
Down to level 2/2, end_blk=7593724
extent: lblk 7421692--7421951, len 260, pblk 1093893, flags: LEAF

Eric,

For the test you ran, we may get better numbers if we tune the
preallocation window. Can you try different values in
ext4_mb_normalize_meta_data_request()? A sketch of one easy way to
vary the sizes follows.
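This is a hypothetical, untested tweak that turns the two hard-coded
sizes into module parameters so they can be varied without editing
the code each time; the parameter names are made up, only the 10/2
defaults come from the patch:

/* Hypothetical tuning knobs -- not part of the patch.  Defaults
 * match the hard-coded values above. */
static unsigned int meta_pa_large = 10;	/* window for large files */
static unsigned int meta_pa_small = 2;	/* window for small files */
module_param(meta_pa_large, uint, 0644);
module_param(meta_pa_small, uint, 0644);

static void
ext4_mb_normalize_meta_data_request(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);

	if (i_size_read(ac->ac_inode) >= sbi->s_mb_stream_request)
		ac->ac_g_ex.fe_len = meta_pa_large;
	else
		ac->ac_g_ex.fe_len = meta_pa_small;
}

The window sizes would then be adjustable through the module
parameters in sysfs between runs of the rm test.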

-aneesh



2008-05-02 17:49:12

by Andreas Dilger

Subject: Re: [RFC PATCH] ext4: Group meta-data blocks together.

On Apr 29, 2008 16:27 +0530, Aneesh Kumar K.V wrote:
> I haven't yet tested this. Let me know what you think.
>
> -aneesh
>
> This adds a per-inode meta-data prealloc space from which
> meta-data block requests are served. This helps keep a file's
> meta-data blocks close together, which speeds up unlink: unlink
> has to read the file's extent index blocks, and that is faster
> when they are clustered. Any new prealloc space is allocated
> near the specified goal block. The goal block is the last block
> allocated for the file, so data blocks and meta-data blocks do
> not end up far apart.

Does this code use the mballoc "group allocator" mechanism that is
already used for small files? It would be convenient to re-use that
code by creating a "metadata allocation group" instead of adding
another mechanism to aggregate small IO allocations.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


2008-05-02 18:12:45

by Aneesh Kumar K.V

Subject: Re: [RFC PATCH] ext4: Group meta-data blocks together.

On Fri, May 02, 2008 at 10:49:01AM -0700, Andreas Dilger wrote:
> On Apr 29, 2008 16:27 +0530, Aneesh Kumar K.V wrote:
> > I haven't yet tested this. Let me know what you think.
> >
> > -aneesh
> >
> > This adds a per-inode meta-data prealloc space from which
> > meta-data block requests are served. This helps keep a file's
> > meta-data blocks close together, which speeds up unlink: unlink
> > has to read the file's extent index blocks, and that is faster
> > when they are clustered. Any new prealloc space is allocated
> > near the specified goal block. The goal block is the last block
> > allocated for the file, so data blocks and meta-data blocks do
> > not end up far apart.
>
> Does this code use the mballoc "group allocator" mechanism that it
> already uses for small files? It would be convenient to re-use
> this code by creating a "metadata allocation group" instead of creating
> another mechanism to aggregate small IO allocations.
>

That is mostly what it does, except that the preallocation space
hangs off the inode rather than off the per-cpu locality group.
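
To make that concrete, here is where each kind of prealloc list is
anchored; the struct members are abbreviated from mballoc.h and from
this patch, not quoted verbatim:

/* Group allocation: one prealloc list per CPU, shared by all the
 * small files allocating on that CPU. */
struct ext4_locality_group {
	struct list_head lg_prealloc_list;
	/* ... */
};

/* This patch: one meta-data prealloc list per inode, so an inode's
 * index blocks land near that inode's own data blocks. */
struct ext4_inode_info {
	struct list_head i_prealloc_list;	/* data PAs (existing) */
	struct list_head i_metaprealloc_list;	/* meta-data PAs (new) */
	/* ... */
};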

-aneesh