2008-10-09 15:32:19

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: Use an rb tree for tracking blocks freed during transaction.

With this patch we track the block freed during a transaction using
rb tree. We also make sure contiguos blocks freed are collected
in one rb node.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/mballoc.c | 168 ++++++++++++++++++++++++++++++++---------------------
fs/ext4/mballoc.h | 24 ++++++--
2 files changed, 120 insertions(+), 72 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 28bf8ef..64eeb9a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ meta_group_info[i]->bb_free_root.rb_node = NULL;;

#ifdef DOUBLE_CHECK
{
@@ -2647,13 +2648,11 @@ int ext4_mb_release(struct super_block *sb)
static noinline_for_stack void
ext4_mb_free_committed_blocks(struct super_block *sb)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
- int i;
- int count = 0;
- int count2 = 0;
- struct ext4_free_metadata *md;
struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err, count = 0, count2 = 0;
+ struct ext4_free_data *entry;

if (list_empty(&sbi->s_committed_transaction))
return;
@@ -2661,44 +2660,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
/* there is committed blocks to be freed yet */
do {
/* get next array of blocks */
- md = NULL;
+ entry = NULL;
spin_lock(&sbi->s_md_lock);
if (!list_empty(&sbi->s_committed_transaction)) {
- md = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_metadata, list);
- list_del(&md->list);
+ entry = list_entry(sbi->s_committed_transaction.next,
+ struct ext4_free_data, list);
+ list_del(&entry->list);
}
spin_unlock(&sbi->s_md_lock);

- if (md == NULL)
+ if (entry == NULL)
break;

mb_debug("gonna free %u blocks in group %lu (0x%p):",
- md->num, md->group, md);
+ entry->count, entry->group, entry);

- err = ext4_mb_load_buddy(sb, md->group, &e4b);
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);

+ db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
- count += md->num;
+ count += entry->count;
count2++;
- ext4_lock_group(sb, md->group);
- for (i = 0; i < md->num; i++) {
- mb_debug(" %u", md->blocks[i]);
- mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+ ext4_lock_group(sb, entry->group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
}
- mb_debug("\n");
- ext4_unlock_group(sb, md->group);
-
- /* balance refcounts from ext4_mb_free_metadata() */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
+ ext4_unlock_group(sb, entry->group);

- kfree(md);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
-
- } while (md);
+ } while (1);

mb_debug("freed %u blocks in %u structures\n", count, count2);
}
@@ -2771,6 +2772,16 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
+
+ ext4_free_ext_cachep =
+ kmem_cache_create("ext4_free_block_extents",
+ sizeof(struct ext4_free_data),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ if (ext4_free_ext_cachep == NULL) {
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ return -ENOMEM;
+ }
return 0;
}

@@ -2779,6 +2790,7 @@ void exit_ext4_mballoc(void)
/* XXX: synchronize_rcu(); */
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
+ kmem_cache_destroy(ext4_free_ext_cachep);
}


@@ -4407,6 +4419,14 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb,
ext4_mb_free_committed_blocks(sb);
}

+static int can_merge(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ if (entry1->start_blk + entry1->count == entry2->start_blk)
+ return 1;
+ return 0;
+}
+
static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_t group, ext4_grpblk_t block, int count)
@@ -4414,57 +4434,71 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_metadata *md;
- int i;
+ struct ext4_free_data *entry, *new_entry;
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+

BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);

+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = block;
+ new_entry->group = group;
+ new_entry->count = count;
+ new_node = &new_entry->node;
+
ext4_lock_group(sb, group);
- for (i = 0; i < count; i++) {
- md = db->bb_md_cur;
- if (md && db->bb_tid != handle->h_transaction->t_tid) {
- db->bb_md_cur = NULL;
- md = NULL;
+ if (!*n) {
+ /* first free block exent. We need to
+ protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, node);
+ if (block < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (block >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ ext4_error(sb, __func__,
+ "Double free of blocks %d (%d %d)\n",
+ block, entry->start_blk, entry->count);
+ return 0;
}
+ }

- if (md == NULL) {
- ext4_unlock_group(sb, group);
- md = kmalloc(sizeof(*md), GFP_NOFS);
- if (md == NULL)
- return -ENOMEM;
- md->num = 0;
- md->group = group;
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);

- ext4_lock_group(sb, group);
- if (db->bb_md_cur == NULL) {
- spin_lock(&sbi->s_md_lock);
- list_add(&md->list, &sbi->s_active_transaction);
- spin_unlock(&sbi->s_md_lock);
- /* protect buddy cache from being freed,
- * otherwise we'll refresh it from
- * on-disk bitmap and lose not-yet-available
- * blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
- db->bb_md_cur = md;
- db->bb_tid = handle->h_transaction->t_tid;
- mb_debug("new md 0x%p for group %lu\n",
- md, md->group);
- } else {
- kfree(md);
- md = db->bb_md_cur;
- }
+ /* Now try to see the extent can be merged to left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(entry, new_entry)) {
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ list_del(&entry->list);
}
+ }

- BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- md->blocks[md->num] = block + i;
- md->num++;
- if (md->num == EXT4_BB_MAX_BLOCKS) {
- /* no more space, put full container on a sb's list */
- db->bb_md_cur = NULL;
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(new_entry, entry)) {
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ list_del(&entry->list);
}
}
+ /* Add the extent to active_transaction list */
+ list_add(&new_entry->list, &sbi->s_active_transaction);
ext4_unlock_group(sb, group);
return 0;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b3b4828..07dff39 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -98,23 +98,37 @@

static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;

#ifdef EXT4_BB_MAX_BLOCKS
#undef EXT4_BB_MAX_BLOCKS
#endif
#define EXT4_BB_MAX_BLOCKS 30

-struct ext4_free_metadata {
+struct ext4_free_data {
+ /* this link the free block
+ * information from group_info
+ */
+ struct rb_node node;
+
+ /* group this free block
+ * extent belong
+ */
ext4_group_t group;
- unsigned short num;
- ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+
+ /* free block extent */
+ ext4_grpblk_t start_blk;
+ ext4_grpblk_t count;
+
+ /* this link the free block
+ * information from ext4_sb_info
+ */
struct list_head list;
};

struct ext4_group_info {
unsigned long bb_state;
- unsigned long bb_tid;
- struct ext4_free_metadata *bb_md_cur;
+ struct rb_root bb_free_root;
unsigned short bb_first_free;
unsigned short bb_free;
unsigned short bb_fragments;
--
1.6.0.1.285.g1070



2008-10-09 15:32:22

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: Don't reuse released data blocks untill transaction commits

We need to make sure we don't reuse the data blocks released
during the transaction untill the transaction commits. We force
this mode only for ordered and journalled mode. Writeback mode
already don't provided data consistency.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 12 ++++++++++--
1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd2ece2..b9821be 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,

/* this isn't the right place to decide whether block is metadata
* inode.c/extents.c knows better, but for safety ... */
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
- ext4_should_journal_data(inode))
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ metadata = 1;
+
+ /* We need to make sure we don't reuse
+ * block released untill the transaction commit.
+ * writeback mode have weak data consistency so
+ * don't force data as metadata when freeing block
+ * for writeback mode.
+ */
+ if (metadata == 0 && !ext4_should_writeback_data(inode))
metadata = 1;

sb = inode->i_sb;
--
1.6.0.1.285.g1070


2008-10-09 15:32:26

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH] ext4: Do mballoc init before doing filesystem recovery

During filesystem recovery we may be doing a truncate
which expects some of the mballoc data structures to
be initialized. So do ext4_mb_init before recovery.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/super.c | 30 +++++++++++++++---------------
1 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c760818..468ff72 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2452,6 +2452,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"available.\n");
}

+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
+ ext4_ext_init(sb);
+ err = ext4_mb_init(sb, needs_recovery);
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+ err);
+ goto failed_mount4;
+ }
+
/*
* akpm: core read_super() calls in here with the superblock locked.
* That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2471,21 +2486,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");

- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
- "requested data journaling mode\n");
- clear_opt(sbi->s_mount_opt, DELALLOC);
- } else if (test_opt(sb, DELALLOC))
- printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
-
- ext4_ext_init(sb);
- err = ext4_mb_init(sb, needs_recovery);
- if (err) {
- printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
- err);
- goto failed_mount4;
- }

2008-10-11 04:27:52

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] ext4: Use an rb tree for tracking blocks freed during transaction.

On Thu, Oct 09, 2008 at 09:02:07PM +0530, Aneesh Kumar K.V wrote:
> With this patch we track the block freed during a transaction using
> rb tree. We also make sure contiguos blocks freed are collected
> in one rb node.

Thanks, I've added your recent set of patches to the ext4 patch queue.
I'm pretty sure I grabbed the correct (most recent) patches from the
mailing list, but please double check for me.

Thanks!!

- Ted

2008-10-11 18:08:18

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Use an rb tree for tracking blocks freed during transaction.

On Sat, Oct 11, 2008 at 12:27:38AM -0400, Theodore Tso wrote:
> On Thu, Oct 09, 2008 at 09:02:07PM +0530, Aneesh Kumar K.V wrote:
> > With this patch we track the block freed during a transaction using
> > rb tree. We also make sure contiguos blocks freed are collected
> > in one rb node.
>
> Thanks, I've added your recent set of patches to the ext4 patch queue.
> I'm pretty sure I grabbed the correct (most recent) patches from the
> mailing list, but please double check for me.
>

I verified the patch queue. It looks fine. I also tested the latest
patchqueue (I commented out the defrag patch because they were giving
error during apply) on ABAT and all the tests PASSED.

-aneesh

2008-10-12 20:31:58

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] ext4: Use an rb tree for tracking blocks freed during transaction.

On Thu, Oct 09, 2008 at 09:02:07PM +0530, Aneesh Kumar K.V wrote:
> With this patch we track the block freed during a transaction using
> rb tree. We also make sure contiguos blocks freed are collected
> in one rb node.

There seems to be a memory leak. Over time, the number of active
objects in ext4_free_block_extents goes up. You can check via:

grep ext4_free_block_extents /proc/slabinfo

I think the problem is here:

> + /* Now try to see the extent can be merged to left and right */
> + node = rb_prev(new_node);
> + if (node) {
> + entry = rb_entry(node, struct ext4_free_data, node);
> + if (can_merge(entry, new_entry)) {
> + new_entry->start_blk = entry->start_blk;
> + new_entry->count += entry->count;
> + rb_erase(node, &(db->bb_free_root));
> + list_del(&entry->list);
> }
> + }

We aren't freeing new_entry in ext4_mb_free_metadata() in the case
where the extent can be merged with an existing node in the rbtree.

- Ted

2008-10-12 20:34:07

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH] ext4: Don't reuse released data blocks untill transaction commits

On Thu, Oct 09, 2008 at 09:02:08PM +0530, Aneesh Kumar K.V wrote:
> We need to make sure we don't reuse the data blocks released
> during the transaction untill the transaction commits. We force
> this mode only for ordered and journalled mode. Writeback mode
> already don't provided data consistency.

It might be a good idea in ext4_free_mb_metadata() to add a comment
that the name of the function is a little bit of a misnomer...

BTW, you may want to check what I put in the patch queue, because
iirc, I fixed up a few spelling/grammar in the patch comments.

- Ted

2008-10-13 09:46:35

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH] ext4: Use an rb tree for tracking blocks freed during transaction.

On Sun, Oct 12, 2008 at 04:31:47PM -0400, Theodore Tso wrote:
> On Thu, Oct 09, 2008 at 09:02:07PM +0530, Aneesh Kumar K.V wrote:
> > With this patch we track the block freed during a transaction using
> > rb tree. We also make sure contiguos blocks freed are collected
> > in one rb node.
>
> There seems to be a memory leak. Over time, the number of active
> objects in ext4_free_block_extents goes up. You can check via:
>
> grep ext4_free_block_extents /proc/slabinfo
>
> I think the problem is here:
>
> > + /* Now try to see the extent can be merged to left and right */
> > + node = rb_prev(new_node);
> > + if (node) {
> > + entry = rb_entry(node, struct ext4_free_data, node);
> > + if (can_merge(entry, new_entry)) {
> > + new_entry->start_blk = entry->start_blk;
> > + new_entry->count += entry->count;
> > + rb_erase(node, &(db->bb_free_root));
> > + list_del(&entry->list);
> > }
> > + }
>
> We aren't freeing new_entry in ext4_mb_free_metadata() in the case
> where the extent can be merged with an existing node in the rbtree.
>

Updated with the below patch

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9c151f2..2f38754 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4492,7 +4492,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
new_entry->start_blk = entry->start_blk;
new_entry->count += entry->count;
rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
}

@@ -4502,7 +4505,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
if (can_merge(new_entry, entry)) {
new_entry->count += entry->count;
rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
/* Add the extent to active_transaction list */