2008-10-13 16:15:17

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V4] ext4: Use an rbtree for tracking blocks freed during transaction.

With this patch we track the block freed during a transaction using
rb tree. We also make sure contiguous blocks freed are collected
in one rb node.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/mballoc.c | 182 +++++++++++++++++++++++++++++++++-------------------
fs/ext4/mballoc.h | 29 +++++++--
2 files changed, 138 insertions(+), 73 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b580714..7d5f70c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ meta_group_info[i]->bb_free_root.rb_node = NULL;;

#ifdef DOUBLE_CHECK
{
@@ -2647,13 +2648,11 @@ int ext4_mb_release(struct super_block *sb)
static noinline_for_stack void
ext4_mb_free_committed_blocks(struct super_block *sb)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
- int i;
- int count = 0;
- int count2 = 0;
- struct ext4_free_metadata *md;
struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err, count = 0, count2 = 0;
+ struct ext4_free_data *entry;

if (list_empty(&sbi->s_committed_transaction))
return;
@@ -2661,44 +2660,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
/* there is committed blocks to be freed yet */
do {
/* get next array of blocks */
- md = NULL;
+ entry = NULL;
spin_lock(&sbi->s_md_lock);
if (!list_empty(&sbi->s_committed_transaction)) {
- md = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_metadata, list);
- list_del(&md->list);
+ entry = list_entry(sbi->s_committed_transaction.next,
+ struct ext4_free_data, list);
+ list_del(&entry->list);
}
spin_unlock(&sbi->s_md_lock);

- if (md == NULL)
+ if (entry == NULL)
break;

mb_debug("gonna free %u blocks in group %lu (0x%p):",
- md->num, md->group, md);
+ entry->count, entry->group, entry);

- err = ext4_mb_load_buddy(sb, md->group, &e4b);
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);

+ db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
- count += md->num;
+ count += entry->count;
count2++;
- ext4_lock_group(sb, md->group);
- for (i = 0; i < md->num; i++) {
- mb_debug(" %u", md->blocks[i]);
- mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+ ext4_lock_group(sb, entry->group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
}
- mb_debug("\n");
- ext4_unlock_group(sb, md->group);
-
- /* balance refcounts from ext4_mb_free_metadata() */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
+ ext4_unlock_group(sb, entry->group);

- kfree(md);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
-
- } while (md);
+ } while (1);

mb_debug("freed %u blocks in %u structures\n", count, count2);
}
@@ -2771,6 +2772,16 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
+
+ ext4_free_ext_cachep =
+ kmem_cache_create("ext4_free_block_extents",
+ sizeof(struct ext4_free_data),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ if (ext4_free_ext_cachep == NULL) {
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ return -ENOMEM;
+ }
return 0;
}

@@ -2779,6 +2790,7 @@ void exit_ext4_mballoc(void)
/* XXX: synchronize_rcu(); */
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
+ kmem_cache_destroy(ext4_free_ext_cachep);
}


@@ -4415,6 +4427,19 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb,
ext4_mb_free_committed_blocks(sb);
}

+static int can_merge(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ /*
+ * Even though blocks are contiguous, we don't want to
+ * merge if they don't belong to the same transaction
+ */
+ if (entry1->t_tid == entry2->t_tid &&
+ (entry1->start_blk + entry1->count) == entry2->start_blk)
+ return 1;
+ return 0;
+}
+
static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_t group, ext4_grpblk_t block, int count)
@@ -4422,57 +4447,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_metadata *md;
- int i;
+ struct ext4_free_data *entry, *new_entry;
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+

BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);

+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = block;
+ new_entry->group = group;
+ new_entry->count = count;
+ new_entry->t_tid = handle->h_transaction->t_tid;
+ new_node = &new_entry->node;
+
ext4_lock_group(sb, group);
- for (i = 0; i < count; i++) {
- md = db->bb_md_cur;
- if (md && db->bb_tid != handle->h_transaction->t_tid) {
- db->bb_md_cur = NULL;
- md = NULL;
+ if (!*n) {
+ /* first free block exent. We need to
+ protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, node);
+ if (block < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (block >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ ext4_error(sb, __func__,
+ "Double free of blocks %d (%d %d)\n",
+ block, entry->start_blk, entry->count);
+ return 0;
}
+ }

- if (md == NULL) {
- ext4_unlock_group(sb, group);
- md = kmalloc(sizeof(*md), GFP_NOFS);
- if (md == NULL)
- return -ENOMEM;
- md->num = 0;
- md->group = group;
-
- ext4_lock_group(sb, group);
- if (db->bb_md_cur == NULL) {
- spin_lock(&sbi->s_md_lock);
- list_add(&md->list, &sbi->s_active_transaction);
- spin_unlock(&sbi->s_md_lock);
- /* protect buddy cache from being freed,
- * otherwise we'll refresh it from
- * on-disk bitmap and lose not-yet-available
- * blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
- db->bb_md_cur = md;
- db->bb_tid = handle->h_transaction->t_tid;
- mb_debug("new md 0x%p for group %lu\n",
- md, md->group);
- } else {
- kfree(md);
- md = db->bb_md_cur;
- }
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);
+
+ /* Now try to see the extent can be merged to left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(entry, new_entry)) {
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
+ }

- BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- md->blocks[md->num] = block + i;
- md->num++;
- if (md->num == EXT4_BB_MAX_BLOCKS) {
- /* no more space, put full container on a sb's list */
- db->bb_md_cur = NULL;
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(new_entry, entry)) {
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
+ /* Add the extent to active_transaction list */
+ spin_lock(&sbi->s_md_lock);
+ list_add(&new_entry->list, &sbi->s_active_transaction);
+ spin_unlock(&sbi->s_md_lock);
ext4_unlock_group(sb, group);
return 0;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b3b4828..c82abb9 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -98,23 +98,40 @@

static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;

#ifdef EXT4_BB_MAX_BLOCKS
#undef EXT4_BB_MAX_BLOCKS
#endif
#define EXT4_BB_MAX_BLOCKS 30

-struct ext4_free_metadata {
- ext4_group_t group;
- unsigned short num;
- ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+struct ext4_free_data {
+ /* this link the free block
+ * information from group_info
+ */
+ struct rb_node node;
+
+ /* this link the free block
+ * information from ext4_sb_info
+ */
struct list_head list;
+
+ /* group this free block
+ * extent belong
+ */
+ ext4_group_t group;
+
+ /* free block extent */
+ ext4_grpblk_t start_blk;
+ ext4_grpblk_t count;
+
+ /* transaction this extent belong */
+ tid_t t_tid;
};

struct ext4_group_info {
unsigned long bb_state;
- unsigned long bb_tid;
- struct ext4_free_metadata *bb_md_cur;
+ struct rb_root bb_free_root;
unsigned short bb_first_free;
unsigned short bb_free;
unsigned short bb_fragments;
--
1.6.0.2.526.g5c283



2008-10-13 19:54:22

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V4] ext4: Use an rbtree for tracking blocks freed during transaction.

On Mon, Oct 13, 2008 at 09:45:10PM +0530, Aneesh Kumar K.V wrote:
> With this patch we track the block freed during a transaction using
> rb tree. We also make sure contiguous blocks freed are collected
> in one rb node.

One additional fix that I found while looking at the code was that you
shouldn't merge two extents to be freed if they are in different
groups. This could happen if one extent went to the end of one block
group, and the second extent starts at the beginning of a block group.
In that case, you don't want to merge the two extents given that the
current code depends on an freed extent not spanning groups.

I think this will be a problem with the following patch which treats
data blocks as metadata, because data extents *can* span block groups.
So even though with this fix below we will no longer merge extents
that are in different block groups, if a file has a extent that spans
multipe block groups, the resulting freed extent structure will also
span block groups, which will cause ext4_mb_free_committed_blocks() to
be very unhappy.

So we have two choices; one is that we change ext4_mb_free_metadata()
to break up freed extents into block group chunks, or we change
ext4_mb_free_committed_blocks() to be able to handle extents that span
multiple blocks. I suspect the latter is the best way to go, but in
case we go the former, then following patch will be needed to the
rbtree patch. (Also I cleaned up some of the comments documenting the
data structure, which I'd recommend you integrate regardless of how we
solve the problem of freed extents spanning multiple block groups.)

- Ted

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index acf6a32..1ba9586 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4427,10 +4427,17 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb,
ext4_mb_free_committed_blocks(sb);
}

+/*
+ * We can merge two free data extents of the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
static int can_merge(struct ext4_free_data *entry1,
struct ext4_free_data *entry2)
{
- if (entry1->start_blk + entry1->count == entry2->start_blk)
+ if ((entry1->t_tid == entry2->t_tid) &&
+ (entry1->group == entry2->group) &&
+ (entry1->start_blk + entry1->count) == entry2->start_blk)
return 1;
return 0;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 07dff39..23e08e5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -106,24 +106,21 @@ static struct kmem_cache *ext4_free_ext_cachep;
#define EXT4_BB_MAX_BLOCKS 30

struct ext4_free_data {
- /* this link the free block
- * information from group_info
- */
+ /* this links the free block information from group_info */
struct rb_node node;

- /* group this free block
- * extent belong
- */
+ /* this links the free block information from ext4_sb_info */
+ struct list_head list;
+
+ /* group which free block extent belongs */
ext4_group_t group;

/* free block extent */
ext4_grpblk_t start_blk;
ext4_grpblk_t count;

- /* this link the free block
- * information from ext4_sb_info
- */
- struct list_head list;
+ /* transaction which freed this extent */
+ tid_t t_tid;
};

struct ext4_group_info {

2008-10-14 03:58:42

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V4] ext4: Use an rbtree for tracking blocks freed during transaction.

On Mon, Oct 13, 2008 at 03:54:06PM -0400, Theodore Tso wrote:
> So we have two choices; one is that we change ext4_mb_free_metadata()
> to break up freed extents into block group chunks,

Never mind. I took a closer look and realized that
ext4_mb_free_blocks is already breaking up extents that span multiple
block groups.

So the only thing we need is the change to avoid merging freed extents
that cross block groups, per my patch. I've updated the patch queue
with such a fix so I can better test things out.

Aneesh, can you look and see if this makes sense?

- Ted

ext4: Use an rbtree for tracking blocks freed during transaction.

From: Aneesh Kumar K.V <[email protected]>

With this patch we track the block freed during a transaction using
rb tree. We also make sure contiguous blocks freed are collected
in one rb node.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: Theodore Ts'o <[email protected]>
---
fs/ext4/mballoc.c | 184 +++++++++++++++++++++++++++++++++-------------------
fs/ext4/mballoc.h | 23 +++++--
2 files changed, 134 insertions(+), 73 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 154f8de..1ba9586 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ meta_group_info[i]->bb_free_root.rb_node = NULL;;

#ifdef DOUBLE_CHECK
{
@@ -2647,13 +2648,11 @@ int ext4_mb_release(struct super_block *sb)
static noinline_for_stack void
ext4_mb_free_committed_blocks(struct super_block *sb)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
- int i;
- int count = 0;
- int count2 = 0;
- struct ext4_free_metadata *md;
struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err, count = 0, count2 = 0;
+ struct ext4_free_data *entry;

if (list_empty(&sbi->s_committed_transaction))
return;
@@ -2661,44 +2660,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
/* there is committed blocks to be freed yet */
do {
/* get next array of blocks */
- md = NULL;
+ entry = NULL;
spin_lock(&sbi->s_md_lock);
if (!list_empty(&sbi->s_committed_transaction)) {
- md = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_metadata, list);
- list_del(&md->list);
+ entry = list_entry(sbi->s_committed_transaction.next,
+ struct ext4_free_data, list);
+ list_del(&entry->list);
}
spin_unlock(&sbi->s_md_lock);

- if (md == NULL)
+ if (entry == NULL)
break;

mb_debug("gonna free %u blocks in group %lu (0x%p):",
- md->num, md->group, md);
+ entry->count, entry->group, entry);

- err = ext4_mb_load_buddy(sb, md->group, &e4b);
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);

+ db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
- count += md->num;
+ count += entry->count;
count2++;
- ext4_lock_group(sb, md->group);
- for (i = 0; i < md->num; i++) {
- mb_debug(" %u", md->blocks[i]);
- mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+ ext4_lock_group(sb, entry->group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
}
- mb_debug("\n");
- ext4_unlock_group(sb, md->group);
-
- /* balance refcounts from ext4_mb_free_metadata() */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
+ ext4_unlock_group(sb, entry->group);

- kfree(md);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
-
- } while (md);
+ } while (1);

mb_debug("freed %u blocks in %u structures\n", count, count2);
}
@@ -2771,6 +2772,16 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
+
+ ext4_free_ext_cachep =
+ kmem_cache_create("ext4_free_block_extents",
+ sizeof(struct ext4_free_data),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ if (ext4_free_ext_cachep == NULL) {
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ return -ENOMEM;
+ }
return 0;
}

@@ -2779,6 +2790,7 @@ void exit_ext4_mballoc(void)
/* XXX: synchronize_rcu(); */
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
+ kmem_cache_destroy(ext4_free_ext_cachep);
}


@@ -4415,6 +4427,21 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb,
ext4_mb_free_committed_blocks(sb);
}

+/*
+ * We can merge two free data extents of the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ if ((entry1->t_tid == entry2->t_tid) &&
+ (entry1->group == entry2->group) &&
+ (entry1->start_blk + entry1->count) == entry2->start_blk)
+ return 1;
+ return 0;
+}
+
static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_t group, ext4_grpblk_t block, int count)
@@ -4422,57 +4449,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_metadata *md;
- int i;
+ struct ext4_free_data *entry, *new_entry;
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+

BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);

+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = block;
+ new_entry->group = group;
+ new_entry->count = count;
+ new_entry->t_tid = handle->h_transaction->t_tid;
+ new_node = &new_entry->node;
+
ext4_lock_group(sb, group);
- for (i = 0; i < count; i++) {
- md = db->bb_md_cur;
- if (md && db->bb_tid != handle->h_transaction->t_tid) {
- db->bb_md_cur = NULL;
- md = NULL;
+ if (!*n) {
+ /* first free block exent. We need to
+ protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, node);
+ if (block < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (block >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ ext4_error(sb, __func__,
+ "Double free of blocks %d (%d %d)\n",
+ block, entry->start_blk, entry->count);
+ return 0;
}
+ }

- if (md == NULL) {
- ext4_unlock_group(sb, group);
- md = kmalloc(sizeof(*md), GFP_NOFS);
- if (md == NULL)
- return -ENOMEM;
- md->num = 0;
- md->group = group;
-
- ext4_lock_group(sb, group);
- if (db->bb_md_cur == NULL) {
- spin_lock(&sbi->s_md_lock);
- list_add(&md->list, &sbi->s_active_transaction);
- spin_unlock(&sbi->s_md_lock);
- /* protect buddy cache from being freed,
- * otherwise we'll refresh it from
- * on-disk bitmap and lose not-yet-available
- * blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
- db->bb_md_cur = md;
- db->bb_tid = handle->h_transaction->t_tid;
- mb_debug("new md 0x%p for group %lu\n",
- md, md->group);
- } else {
- kfree(md);
- md = db->bb_md_cur;
- }
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);
+
+ /* Now try to see the extent can be merged to left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(entry, new_entry)) {
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
+ }

- BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- md->blocks[md->num] = block + i;
- md->num++;
- if (md->num == EXT4_BB_MAX_BLOCKS) {
- /* no more space, put full container on a sb's list */
- db->bb_md_cur = NULL;
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(new_entry, entry)) {
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
+ /* Add the extent to active_transaction list */
+ spin_lock(&sbi->s_md_lock);
+ list_add(&new_entry->list, &sbi->s_active_transaction);
+ spin_unlock(&sbi->s_md_lock);
ext4_unlock_group(sb, group);
return 0;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b3b4828..23e08e5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -98,23 +98,34 @@

static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;

#ifdef EXT4_BB_MAX_BLOCKS
#undef EXT4_BB_MAX_BLOCKS
#endif
#define EXT4_BB_MAX_BLOCKS 30

-struct ext4_free_metadata {
- ext4_group_t group;
- unsigned short num;
- ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+struct ext4_free_data {
+ /* this links the free block information from group_info */
+ struct rb_node node;
+
+ /* this links the free block information from ext4_sb_info */
struct list_head list;
+
+ /* group which free block extent belongs */
+ ext4_group_t group;
+
+ /* free block extent */
+ ext4_grpblk_t start_blk;
+ ext4_grpblk_t count;
+
+ /* transaction which freed this extent */
+ tid_t t_tid;
};

struct ext4_group_info {
unsigned long bb_state;
- unsigned long bb_tid;
- struct ext4_free_metadata *bb_md_cur;
+ struct rb_root bb_free_root;
unsigned short bb_first_free;
unsigned short bb_free;
unsigned short bb_fragments;

2008-10-14 06:48:12

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V4] ext4: Use an rbtree for tracking blocks freed during transaction.

On Mon, Oct 13, 2008 at 11:12:25PM -0400, Theodore Tso wrote:
> On Mon, Oct 13, 2008 at 03:54:06PM -0400, Theodore Tso wrote:
> > So we have two choices; one is that we change ext4_mb_free_metadata()
> > to break up freed extents into block group chunks,
>
> Never mind. I took a closer look and realized that
> ext4_mb_free_blocks is already breaking up extents that span multiple
> block groups.
>
> So the only thing we need is the change to avoid merging freed extents
> that cross block groups, per my patch. I've updated the patch queue
> with such a fix so I can better test things out.
>
> Aneesh, can you look and see if this makes sense?

Yes the patch looks fine. Some changes I made on top of this is below
I have sent the patch -V5 with the above changes

>
> - Ted
>
> ext4: Use an rbtree for tracking blocks freed during transaction.
>
> From: Aneesh Kumar K.V <[email protected]>
>
> With this patch we track the block freed during a transaction using
> rb tree. We also make sure contiguous blocks freed are collected
> in one rb node.
>

.......

>
> +/*
> + * We can merge two free data extents of the physical blocks

s/of the/only if the/

> + * are contiguous, AND the extents were freed by the same transaction,
> + * AND the blocks are associated with the same group.
> + */
> +static int can_merge(struct ext4_free_data *entry1,
> + struct ext4_free_data *entry2)
> +{
> + if ((entry1->t_tid == entry2->t_tid) &&
> + (entry1->group == entry2->group) &&
> + (entry1->start_blk + entry1->count) == entry2->start_blk)
> + return 1;
> + return 0;
> +}
> +
......

> +++ b/fs/ext4/mballoc.h
> @@ -98,23 +98,34 @@
>
> static struct kmem_cache *ext4_pspace_cachep;
> static struct kmem_cache *ext4_ac_cachep;
> +static struct kmem_cache *ext4_free_ext_cachep;
>
> #ifdef EXT4_BB_MAX_BLOCKS
> #undef EXT4_BB_MAX_BLOCKS
> #endif
> #define EXT4_BB_MAX_BLOCKS 30
>

We don't need the above define.

> -struct ext4_free_metadata {
> - ext4_group_t group;
> - unsigned short num;
> - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
> +struct ext4_free_data {
> + /* this links the free block information from group_info */
> + struct rb_node node;
> +
> + /* this links the free block information from ext4_sb_info */
> struct list_head list;
> +
> + /* group which free block extent belongs */
> + ext4_group_t group;
> +
> + /* free block extent */
> + ext4_grpblk_t start_blk;
> + ext4_grpblk_t count;
> +
> + /* transaction which freed this extent */
> + tid_t t_tid;
> };
>
> struct ext4_group_info {
> unsigned long bb_state;
> - unsigned long bb_tid;
> - struct ext4_free_metadata *bb_md_cur;
> + struct rb_root bb_free_root;
> unsigned short bb_first_free;
> unsigned short bb_free;
> unsigned short bb_fragments;