2008-11-25 19:27:12

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V4 1/2] ext4: Use EXT4_GROUP_INFO_NEED_INIT_BIT during resize

The new groups added during resize are flagged as
need_init group. Make sure we properly initialize these
groups. When we have block size < page size and we are adding
new groups the page may still be marked uptodate even though
we haven't initialized the group. While forcing the init
of buddy cache we need to make sure other groups part of the
same page of buddy cache is not using the cache.
group_info->alloc_sem is added to ensure the same.

FILENAME: aneesh-3-use_EXT4_GROUP_INFO_NEED_INIT_BIT-during-resize

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/balloc.c | 21 +++--
fs/ext4/ext4.h | 2 +-
fs/ext4/mballoc.c | 217 ++++++++++++++++++++++++++++++++++++++--------------
fs/ext4/mballoc.h | 3 +
fs/ext4/resize.c | 41 +----------
5 files changed, 176 insertions(+), 108 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index fa69cb3..39df492 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -381,6 +381,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);

ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ grp = ext4_get_group_info(sb, block_group);
/*
* Check to see if we are freeing blocks across a group
* boundary.
@@ -425,7 +426,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
-
+ /*
+ * make sure we don't allow a parallel init on other groups in the
+ * same buddy cache
+ */
+ down_write(&grp->alloc_sem);
for (i = 0, blocks_freed = 0; i < count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
@@ -450,6 +455,13 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
spin_unlock(sb_bgl_lock(sbi, flex_group));
}
+ /*
+ * request to reload the buddy with the
+ * new bitmap information
+ */
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+ ext4_mb_update_group_info(grp, blocks_freed);
+ up_write(&grp->alloc_sem);

/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -461,13 +473,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
if (!err)
err = ret;
sb->s_dirt = 1;
- /*
- * request to reload the buddy with the
- * new bitmap information
- */
- grp = ext4_get_group_info(sb, block_group);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- ext4_mb_update_group_info(grp, blocks_freed);

error_return:
brelse(bitmap_bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 49b46e7..bc13445 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1058,7 +1058,7 @@ extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
-extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
ext4_grpblk_t add);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9cc93af..11fd4b9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -886,18 +886,20 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
int blocks_per_page;
int block;
int pnum;
int poff;
struct page *page;
int ret;
+ struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;

mb_debug("load group %u\n", group);

blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ grp = ext4_get_group_info(sb, group);

e4b->bd_blkbits = sb->s_blocksize_bits;
e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +907,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
+ e4b->alloc_semp = &grp->alloc_sem;
+
+ /* Take the read lock on the group alloc
+ * sem. This would make sure a parallel
+ * ext4_mb_init_group happening on other
+ * groups mapped by the page is blocked
+ * till we are done with allocation
+ */
+ down_read(e4b->alloc_semp);

/*
* the buddy cache inode stores the block bitmap
@@ -920,6 +931,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_get_page(inode->i_mapping, pnum);
if (page == NULL || !PageUptodate(page)) {
if (page)
+ /*
+ * drop the page reference and try
+ * to get the page with lock. If we
+ * are not uptodate that implies
+ * somebody just created the page but
+ * is yet to initialize the same. So
+ * wait for it to initialize.
+ */
page_cache_release(page);
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
@@ -985,6 +1004,9 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
+
+ /* Done with the buddy cache */
+ up_read(e4b->alloc_semp);
return ret;
}

@@ -994,6 +1016,8 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
+ /* Done with the buddy cache */
+ up_read(e4b->alloc_semp);
}


@@ -1694,6 +1718,129 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}

+static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+ int i, ret;
+ void *bitmap;
+ int blocks_per_page;
+ int groups_per_page;
+ int block, pnum, poff;
+ int num_grp_locked = 0;
+ ext4_group_t first_group;
+ struct ext4_group_info *grp, *this_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ struct page *page = NULL, *bitmap_page = NULL;
+
+ mb_debug("init group %lu\n", group);
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ first_group = pnum * blocks_per_page / 2;
+ this_grp = ext4_get_group_info(sb, group);
+
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+
+ /* read all groups the page covers into the cache */
+ for (i = 0; i < groups_per_page; i++) {
+
+ if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+ break;
+ grp = ext4_get_group_info(sb, first_group + i);
+ /* take all groups write allocation
+ * semaphore. This make sure there is
+ * no block allocation going on in any
+ * of that groups
+ */
+ down_write(&grp->alloc_sem);
+ }
+ /*
+ * make sure we look at only those groups
+ * that are locked. A resize can add more
+ * groups while this happen
+ */
+ num_grp_locked = i;
+ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ /*
+ * somebody initialized the group
+ * return without doing anything
+ */
+ ret = 0;
+ goto err;
+ }
+
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+ bitmap_page = page;
+ bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+ /* init buddy cache */
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page == bitmap_page) {
+ /*
+ * If both the bitmap and buddy are in
+ * the same page we don't need to force
+ * init the buddy
+ */
+ unlock_page(page);
+ } else if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+err:
+ /* release locks on all the groups */
+ for (i = 0; i < num_grp_locked; i++) {
+
+ grp = ext4_get_group_info(sb, first_group + i);
+ /* take all groups write allocation
+ * semaphore. This make sure there is
+ * no block allocation going on in any
+ * of that groups
+ */
+ up_write(&grp->alloc_sem);
+ }
+ if (bitmap_page)
+ page_cache_release(bitmap_page);
+ if (page)
+ page_cache_release(page);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -1777,7 +1924,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
group = 0;

/* quick check to skip empty groups */
- grp = ext4_get_group_info(ac->ac_sb, group);
+ grp = ext4_get_group_info(sb, group);
if (grp->bb_free == 0)
continue;

@@ -1790,10 +1937,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* we need full data about the group
* to make a good selection
*/
- err = ext4_mb_load_buddy(sb, group, &e4b);
+ err = ext4_mb_init_group(sb, group);
if (err)
goto out;
- ext4_mb_release_desc(&e4b);
}

/*
@@ -2244,7 +2390,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)


/* Create and initialize ext4_group_info data for the given group. */
-static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
{
int i, len;
@@ -2302,6 +2448,7 @@ static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root.rb_node = NULL;;

#ifdef DOUBLE_CHECK
@@ -2329,54 +2476,6 @@ static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
} /* ext4_mb_add_groupinfo */

/*
- * Add a group to the existing groups.
- * This function is used for online resize
- */
-int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
- struct ext4_group_desc *desc)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- int blocks_per_page;
- int block;
- int pnum;
- struct page *page;
- int err;
-
- /* Add group based on group descriptor*/
- err = ext4_mb_add_groupinfo(sb, group, desc);
- if (err)
- return err;
-
- /*
- * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
- * datas) are set not up to date so that they will be re-initilaized
- * during the next call to ext4_mb_load_buddy
- */
-
- /* Set buddy page as not up to date */
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- block = group * 2;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- /* Set bitmap page as not up to date */
- block++;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- return 0;
-}
-
-/*
* Update an existing group.
* This function is used for online resize
*/
@@ -4583,11 +4682,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
-
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
- if (err)
- goto error_return;
-
#ifdef AGGRESSIVE_CHECK
{
int i;
@@ -4601,6 +4695,8 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ if (err)
+ goto error_return;

if (ac) {
ac->ac_b_ex.fe_group = block_group;
@@ -4609,6 +4705,9 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
ext4_mb_store_history(ac);
}

+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
if (metadata) {
/* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed */
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1f..a931b6b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
#include <linux/version.h>
#include <linux/blkdev.h>
#include <linux/marker.h>
+#include <linux/mutex.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "group.h"
@@ -130,6 +131,7 @@ struct ext4_group_info {
#ifdef DOUBLE_CHECK
void *bb_bitmap;
#endif
+ struct rw_semaphore alloc_sem;
unsigned short bb_counters[];
};

@@ -250,6 +252,7 @@ struct ext4_buddy {
struct super_block *bd_sb;
__u16 bd_blkbits;
ext4_group_t bd_group;
+ struct rw_semaphore *alloc_semp;
};
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6a0609..939cf26 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -870,7 +870,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* We can allocate memory for mb_alloc based on the new group
* descriptor
*/
- err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ err = ext4_mb_add_groupinfo(sb, input->group, gdp);
if (err)
goto exit_journal;

@@ -1081,45 +1081,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
if ((err = ext4_journal_stop(handle)))
goto exit_put;

- /*
- * Mark mballoc pages as not up to date so that they will be updated
- * next time they are loaded by ext4_mb_load_buddy.
- *
- * XXX Bad, Bad, BAD!!! We should not be overloading the
- * Uptodate flag, particularly on thte bitmap bh, as way of
- * hinting to ext4_mb_load_buddy() that it needs to be
- * overloaded. A user could take a LVM snapshot, then do an
- * on-line fsck, and clear the uptodate flag, and this would
- * not be a bug in userspace, but a bug in the kernel. FIXME!!!
- */
- {
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- int blocks_per_page;
- int block;
- int pnum;
- struct page *page;
-
- /* Set buddy page as not up to date */
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- block = group * 2;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- /* Set bitmap page as not up to date */
- block++;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
- }


2008-11-25 19:27:47

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V4 1/2] ext4: Use EXT4_GROUP_INFO_NEED_INIT_BIT during resize

On Wed, Nov 26, 2008 at 12:55:44AM +0530, Aneesh Kumar K.V wrote:
> The new groups added during resize are flagged as
> need_init group. Make sure we properly initialize these
> groups. When we have block size < page size and we are adding
> new groups the page may still be marked uptodate even though
> we haven't initialized the group. While forcing the init
> of buddy cache we need to make sure other groups part of the
> same page of buddy cache is not using the cache.
> group_info->alloc_sem is added to ensure the same.
>
> FILENAME: aneesh-3-use_EXT4_GROUP_INFO_NEED_INIT_BIT-during-resize
>

The two patches only have comments updated after a really nice irc
discussion with Alex.

-aneesh

2008-11-25 19:31:01

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V4 2/2] ext4: cleanup mballoc header files

Move some of the forward declaration of the static functions
to mballoc.c where they are used. This enables us to include
mballoc.h in other .c files. Also correct the buddy cache
documentation.

FILENAME: aneesh-4-cleanup-mballoc-header-files

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 23 +++++++++++++++++++----
fs/ext4/mballoc.h | 18 +-----------------
2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 11fd4b9..598b418 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
* inode as:
*
* { page }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information. So for each group we
@@ -330,6 +330,16 @@
* object
*
*/
+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+
+

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -716,7 +726,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
* stored in the inode as
*
* { page }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information.
@@ -1322,8 +1332,13 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
ac->ac_tail = ret & 0xffff;
ac->ac_buddy = ret >> 16;

- /* XXXXXXX: SUCH A HORRIBLE **CK */
- /*FIXME!! Why ? */
+ /*
+ * take the page reference. We want the page to be pinned
+ * so that we don't get a ext4_mb_init_cache_call for this
+ * group untill we update the bitmap. That would mean we
+ * double allocate blocks. The reference is dropped
+ * in ext4_mb_release_context
+ */
ac->ac_bitmap_page = e4b->bd_bitmap_page;
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index a931b6b..997f78f 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -99,9 +99,6 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512

-static struct kmem_cache *ext4_pspace_cachep;
-static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;

struct ext4_free_data {
/* this links the free block information from group_info */
@@ -262,25 +259,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
{
return;
}
-#else
-static void ext4_mb_store_history(struct ext4_allocation_context *ac);
#endif

#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);

-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
- ext4_group_t group);
-static void ext4_mb_return_to_preallocation(struct inode *inode,
- struct ext4_buddy *e4b, sector_t block,
- int count);
-static void ext4_mb_put_pa(struct ext4_allocation_context *,
- struct super_block *, struct ext4_prealloc_space *pa);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);

2008-11-25 20:16:38

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V4 1/2] ext4: Use EXT4_GROUP_INFO_NEED_INIT_BIT during resize

On Wed, Nov 26, 2008 at 12:57:34AM +0530, Aneesh Kumar K.V wrote:
>
> The two patches only have comments updated after a really nice irc
> discussion with Alex.
>

Thanks, patch queue updated.

- Ted