2008-11-06 18:57:26

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH -V3 1/9] ext4: sparse fixes

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/ext4.h | 9 ++++++---
fs/ext4/extents.c | 4 ++--
fs/ext4/file.c | 3 ---
fs/ext4/inode.c | 2 +-
fs/ext4/mballoc.c | 4 +++-
fs/ext4/super.c | 10 +++++-----
6 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9415a1a..c05d639 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -726,11 +726,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
*/

#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
- (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_compat) & mask)
#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
- (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & mask)
#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
- (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & mask)
#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -1284,6 +1284,9 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int create,
int extend_disksize, int flag);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
+
#endif /* __KERNEL__ */

#endif /* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 6b8c687..228261e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3081,7 +3081,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
/*
* Callback function called for each extent to gather FIEMAP information.
*/
-int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
struct ext4_ext_cache *newex, struct ext4_extent *ex,
void *data)
{
@@ -3150,7 +3150,7 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
/* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)

-int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+static int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
{
__u64 physical = 0;
__u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fb..f731cb5 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}

-extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len);
-
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 810a862..b46bcb7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3842,7 +3842,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
ext4_fsblk_t block;
int inodes_per_block, inode_offset;

- iloc->bh = 0;
+ iloc->bh = NULL;
if (!ext4_valid_inum(sb, inode->i_ino))
return -EIO;

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 59151c1..9cc93af 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1056,6 +1056,8 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)

static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
+__releases(bitlock)
+__acquires(bitlock)
{
int block = 0;
int max = 0;
@@ -2242,7 +2244,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)


/* Create and initialize ext4_group_info data for the given group. */
-int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
{
int i, len;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d4c84f4..6bec662 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1876,7 +1876,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
int db_count;
int i;
int needs_recovery, has_huge_files;
- __le32 features;
+ int features;
__u64 blocks_count;
int err;

@@ -2006,14 +2006,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (features) {
printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
"unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ sb->s_id, features);
goto failed_mount;
}
features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
if (!(sb->s_flags & MS_RDONLY) && features) {
printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
"unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ sb->s_id, features);
goto failed_mount;
}
has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -3036,13 +3036,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_mark_recovery_complete(sb, es);
lock_super(sb);
} else {
- __le32 ret;
+ int ret;
if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
~EXT4_FEATURE_RO_COMPAT_SUPP))) {
printk(KERN_WARNING "EXT4-fs: %s: couldn't "
"remount RDWR because of unsupported "
"optional features (%x).\n",
- sb->s_id, le32_to_cpu(ret));
+ sb->s_id, ret);
err = -EROFS;
goto restore_opts;
}
--
1.6.0.3.514.g2f91b



2008-11-06 18:57:28

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH -V3 3/9] ext4: Use EXT4_GROUP_INFO_NEED_INIT_BIT during resize

The new groups added during resize are flagged as
need_init group. Make sure we properly initialize these
groups. When we have block size < page size and we are adding
new groups the page may still be marked uptodate even though
we haven't initialized the group. While forcing the init
of buddy cache we need to make sure other groups part of the
same page of buddy cache is not using the cache.
group_info->alloc_sem is added to ensure the same.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/balloc.c | 21 +++--
fs/ext4/ext4.h | 2 +-
fs/ext4/mballoc.c | 218 ++++++++++++++++++++++++++++++++++++++---------------
fs/ext4/mballoc.h | 3 +
fs/ext4/resize.c | 41 +----------
5 files changed, 176 insertions(+), 109 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index fa69cb3..39df492 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -381,6 +381,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);

ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ grp = ext4_get_group_info(sb, block_group);
/*
* Check to see if we are freeing blocks across a group
* boundary.
@@ -425,7 +426,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
-
+ /*
+ * make sure we don't allow a parallel init on other groups in the
+ * same buddy cache
+ */
+ down_write(&grp->alloc_sem);
for (i = 0, blocks_freed = 0; i < count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
@@ -450,6 +455,13 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
spin_unlock(sb_bgl_lock(sbi, flex_group));
}
+ /*
+ * request to reload the buddy with the
+ * new bitmap information
+ */
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+ ext4_mb_update_group_info(grp, blocks_freed);
+ up_write(&grp->alloc_sem);

/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -461,13 +473,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
if (!err)
err = ret;
sb->s_dirt = 1;
- /*
- * request to reload the buddy with the
- * new bitmap information
- */
- grp = ext4_get_group_info(sb, block_group);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- ext4_mb_update_group_info(grp, blocks_freed);

error_return:
brelse(bitmap_bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4514fa5..c21da63 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1058,7 +1058,7 @@ extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
-extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
ext4_grpblk_t add);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9cc93af..ade9d5e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -886,18 +886,20 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
int blocks_per_page;
int block;
int pnum;
int poff;
struct page *page;
int ret;
+ struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;

mb_debug("load group %u\n", group);

blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ grp = ext4_get_group_info(sb, group);

e4b->bd_blkbits = sb->s_blocksize_bits;
e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +907,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
+ e4b->alloc_semp = &grp->alloc_sem;
+
+ /* Take the read lock on the group alloc
+ * sem. This would make sure a parallel
+ * ext4_mb_init_group happening on other
+ * groups mapped by the page is blocked
+ * till we are done with allocation
+ */
+ down_read(e4b->alloc_semp);

/*
* the buddy cache inode stores the block bitmap
@@ -919,8 +930,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* what we'd like to avoid in fast path ... */
page = find_get_page(inode->i_mapping, pnum);
if (page == NULL || !PageUptodate(page)) {
- if (page)
+ if (page) {
+ /* If the page is not uptodate
+ * that would imply nobody is using
+ * it. So we should be able to drop
+ * page_cache without being worried
+ * about other group allocation
+ */
page_cache_release(page);
+ }
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
@@ -985,6 +1003,9 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
+
+ /* Done with the buddy cache */
+ up_read(e4b->alloc_semp);
return ret;
}

@@ -994,6 +1015,8 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
+ /* Done with the buddy cache */
+ up_read(e4b->alloc_semp);
}


@@ -1694,6 +1717,129 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}

+static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+ int i, ret;
+ void *bitmap;
+ int blocks_per_page;
+ int groups_per_page;
+ int block, pnum, poff;
+ int num_grp_locked = 0;
+ ext4_group_t first_group;
+ struct ext4_group_info *grp, *this_grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ struct page *page = NULL, *bitmap_page = NULL;
+
+ mb_debug("init group %lu\n", group);
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+ * So for each group we need two blocks.
+ */
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ first_group = pnum * blocks_per_page / 2;
+ this_grp = ext4_get_group_info(sb, group);
+
+ groups_per_page = blocks_per_page >> 1;
+ if (groups_per_page == 0)
+ groups_per_page = 1;
+
+ /* read all groups the page covers into the cache */
+ for (i = 0; i < groups_per_page; i++) {
+
+ if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+ break;
+ grp = ext4_get_group_info(sb, first_group + i);
+ /* take all groups write allocation
+ * semaphore. This make sure there is
+ * no block allocation going on in any
+ * of that groups
+ */
+ down_write(&grp->alloc_sem);
+ }
+ /*
+ * make sure we look at only those groups
+ * that are locked. A resize can add more
+ * groups while this happen
+ */
+ num_grp_locked = i;
+ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ /*
+ * somebody initialized the group
+ * return without doing anything
+ */
+ ret = 0;
+ goto err;
+ }
+
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+ bitmap_page = page;
+ bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+ /* init buddy cache */
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (page == bitmap_page) {
+ /*
+ * If both the bitmap and buddy are in
+ * the same page we don't need to force
+ * init the buddy
+ */
+ unlock_page(page);
+ } else if (page) {
+ BUG_ON(page->mapping != inode->i_mapping);
+ ret = ext4_mb_init_cache(page, bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ unlock_page(page);
+ }
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
+ goto err;
+ }
+ mark_page_accessed(page);
+err:
+ /* release locks on all the groups */
+ for (i = 0; i < num_grp_locked; i++) {
+
+ grp = ext4_get_group_info(sb, first_group + i);
+ /* take all groups write allocation
+ * semaphore. This make sure there is
+ * no block allocation going on in any
+ * of that groups
+ */
+ up_write(&grp->alloc_sem);
+ }
+ if (bitmap_page)
+ page_cache_release(bitmap_page);
+ if (page)
+ page_cache_release(page);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -1777,7 +1923,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
group = 0;

/* quick check to skip empty groups */
- grp = ext4_get_group_info(ac->ac_sb, group);
+ grp = ext4_get_group_info(sb, group);
if (grp->bb_free == 0)
continue;

@@ -1790,10 +1936,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* we need full data about the group
* to make a good selection
*/
- err = ext4_mb_load_buddy(sb, group, &e4b);
+ err = ext4_mb_init_group(sb, group);
if (err)
goto out;
- ext4_mb_release_desc(&e4b);
}

/*
@@ -2244,7 +2389,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)


/* Create and initialize ext4_group_info data for the given group. */
-static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
{
int i, len;
@@ -2302,6 +2447,7 @@ static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root.rb_node = NULL;;

#ifdef DOUBLE_CHECK
@@ -2329,54 +2475,6 @@ static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
} /* ext4_mb_add_groupinfo */

/*
- * Add a group to the existing groups.
- * This function is used for online resize
- */
-int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
- struct ext4_group_desc *desc)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- int blocks_per_page;
- int block;
- int pnum;
- struct page *page;
- int err;
-
- /* Add group based on group descriptor*/
- err = ext4_mb_add_groupinfo(sb, group, desc);
- if (err)
- return err;
-
- /*
- * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
- * datas) are set not up to date so that they will be re-initilaized
- * during the next call to ext4_mb_load_buddy
- */
-
- /* Set buddy page as not up to date */
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- block = group * 2;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- /* Set bitmap page as not up to date */
- block++;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- return 0;
-}
-
-/*
* Update an existing group.
* This function is used for online resize
*/
@@ -4583,11 +4681,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
err = ext4_journal_get_write_access(handle, gd_bh);
if (err)
goto error_return;
-
- err = ext4_mb_load_buddy(sb, block_group, &e4b);
- if (err)
- goto error_return;
-
#ifdef AGGRESSIVE_CHECK
{
int i;
@@ -4601,6 +4694,8 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ if (err)
+ goto error_return;

if (ac) {
ac->ac_b_ex.fe_group = block_group;
@@ -4609,6 +4704,9 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
ext4_mb_store_history(ac);
}

+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
if (metadata) {
/* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed */
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1f..a931b6b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
#include <linux/version.h>
#include <linux/blkdev.h>
#include <linux/marker.h>
+#include <linux/mutex.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "group.h"
@@ -130,6 +131,7 @@ struct ext4_group_info {
#ifdef DOUBLE_CHECK
void *bb_bitmap;
#endif
+ struct rw_semaphore alloc_sem;
unsigned short bb_counters[];
};

@@ -250,6 +252,7 @@ struct ext4_buddy {
struct super_block *bd_sb;
__u16 bd_blkbits;
ext4_group_t bd_group;
+ struct rw_semaphore *alloc_semp;
};
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6a0609..939cf26 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -870,7 +870,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
* We can allocate memory for mb_alloc based on the new group
* descriptor
*/
- err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ err = ext4_mb_add_groupinfo(sb, input->group, gdp);
if (err)
goto exit_journal;

@@ -1081,45 +1081,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
if ((err = ext4_journal_stop(handle)))
goto exit_put;

- /*
- * Mark mballoc pages as not up to date so that they will be updated
- * next time they are loaded by ext4_mb_load_buddy.
- *
- * XXX Bad, Bad, BAD!!! We should not be overloading the
- * Uptodate flag, particularly on thte bitmap bh, as way of
- * hinting to ext4_mb_load_buddy() that it needs to be
- * overloaded. A user could take a LVM snapshot, then do an
- * on-line fsck, and clear the uptodate flag, and this would
- * not be a bug in userspace, but a bug in the kernel. FIXME!!!
- */
- {
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- int blocks_per_page;
- int block;
- int pnum;
- struct page *page;
-
- /* Set buddy page as not up to date */
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- block = group * 2;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
-
- /* Set bitmap page as not up to date */
- block++;
- pnum = block / blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
- if (page != NULL) {
- ClearPageUptodate(page);
- page_cache_release(page);
- }
- }

2008-11-06 18:57:31

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH -V3 4/9] ext4: cleanup mballoc header files

Move some of the forward declaration of the static functions
to mballoc.c where they are used. This enables us to include
mballoc.h in other .c files. Also correct the buddy cache
documentation.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 14 ++++++++++++--
fs/ext4/mballoc.h | 18 +-----------------
2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ade9d5e..9211397 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
* inode as:
*
* { page }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information. So for each group we
@@ -330,6 +330,16 @@
* object
*
*/
+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+
+

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -716,7 +726,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
* stored in the inode as
*
* { page }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information.
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index a931b6b..997f78f 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -99,9 +99,6 @@
*/
#define MB_DEFAULT_GROUP_PREALLOC 512

-static struct kmem_cache *ext4_pspace_cachep;
-static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;

struct ext4_free_data {
/* this links the free block information from group_info */
@@ -262,25 +259,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
{
return;
}
-#else
-static void ext4_mb_store_history(struct ext4_allocation_context *ac);
#endif

#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);

-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
- ext4_group_t group);
-static void ext4_mb_return_to_preallocation(struct inode *inode,
- struct ext4_buddy *e4b, sector_t block,
- int count);
-static void ext4_mb_put_pa(struct ext4_allocation_context *,
- struct super_block *, struct ext4_prealloc_space *pa);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);

2008-11-06 18:57:39

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH -V3 7/9] ext4: don't use blocks freed but not yet committed in buddy cache init

When we generate buddy cache (especially during resize) we need to
make sure we don't use the blocks freed but not yet comitted. This
makes sure we have the right value of free blocks count in the group
info and also in the bitmap. This also ensures the ordered mode
consistency

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 83 ++++++++++++++++++++++++++++++++++++++--------------
1 files changed, 60 insertions(+), 23 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f4753bd..7293209 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -335,6 +335,8 @@
static struct kmem_cache *ext4_free_ext_cachep;
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
@@ -858,7 +860,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/*
* incore got set to the group block bitmap below
*/
+ ext4_lock_group(sb, group);
ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_unlock_group(sb, group);
incore = NULL;
} else {
/* this is block of bitmap */
@@ -872,6 +876,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)

/* mark all preallocated blks used in in-core bitmap */
ext4_mb_generate_from_pa(sb, data, group);
+ ext4_mb_generate_from_freelist(sb, data, group);
ext4_unlock_group(sb, group);

/* set incore so that the buddy information can be
@@ -3428,6 +3433,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
}

/*
+ * the function goes through all block freed in the group
+ * but not yet committed and marks them used in in-core bitmap.
+ * buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock (ext4_lock_group)
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+ ext4_group_t group)
+{
+ struct rb_node *n;
+ struct ext4_group_info *grp;
+ struct ext4_free_data *entry;
+
+ grp = ext4_get_group_info(sb, group);
+ n = rb_first(&(grp->bb_free_root));
+
+ while (n) {
+ entry = rb_entry(n, struct ext4_free_data, node);
+ mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+ bitmap, entry->start_blk,
+ entry->count);
+ n = rb_next(n);
+ }
+ return;
+}
+
+/*
* the function goes through all preallocation in this group and marks them
* used in in-core bitmap. buddy must be generated from this bitmap
* Need to be called with ext4 group lock (ext4_lock_group)
@@ -4527,27 +4558,22 @@ static int can_merge(struct ext4_free_data *entry1,

static noinline_for_stack int
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
- ext4_group_t group, ext4_grpblk_t block, int count)
+ struct ext4_free_data *new_entry)
{
+ ext4_grpblk_t block;
+ struct ext4_free_data *entry;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_data *entry, *new_entry;
struct rb_node **n = &db->bb_free_root.rb_node, *node;
struct rb_node *parent = NULL, *new_node;

-
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);

- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
- new_entry->start_blk = block;
- new_entry->group = group;
- new_entry->count = count;
- new_entry->t_tid = handle->h_transaction->t_tid;
new_node = &new_entry->node;
+ block = new_entry->start_blk;

- ext4_lock_group(sb, group);
if (!*n) {
/* first free block exent. We need to
protect buddy cache from being freed,
@@ -4565,7 +4591,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
else if (block >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
else {
- ext4_unlock_group(sb, group);
ext4_error(sb, __func__,
"Double free of blocks %d (%d %d)\n",
block, entry->start_blk, entry->count);
@@ -4607,7 +4632,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
spin_lock(&sbi->s_md_lock);
list_add(&new_entry->list, &handle->h_transaction->t_private_list);
spin_unlock(&sbi->s_md_lock);
- ext4_unlock_group(sb, group);
return 0;
}

@@ -4712,15 +4736,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
}
#endif
- mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
- bit, count);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
- if (err)
- goto error_return;
-
if (ac) {
ac->ac_b_ex.fe_group = block_group;
ac->ac_b_ex.fe_start = bit;
@@ -4734,11 +4749,29 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
goto error_return;
}
if (metadata) {
- /* blocks being freed are metadata. these blocks shouldn't
- * be used until this transaction is committed */
- ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+ struct ext4_free_data *new_entry;
+ /*
+ * blocks being freed are metadata. these blocks shouldn't
+ * be used until this transaction is committed
+ */
+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = bit;
+ new_entry->group = block_group;
+ new_entry->count = count;
+ new_entry->t_tid = handle->h_transaction->t_tid;
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+ bit, count);
+ ext4_mb_free_metadata(handle, &e4b, new_entry);
+ ext4_unlock_group(sb, block_group);
} else {
ext4_lock_group(sb, block_group);
+ /* need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ */
+ mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+ bit, count);
mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
ext4_unlock_group(sb, block_group);
@@ -4761,6 +4794,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,

*freed += count;

+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+
/* And the group descriptor block */
BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
ret = ext4_journal_dirty_metadata(handle, gd_bh);
--
1.6.0.3.514.g2f91b


2008-11-06 18:57:31

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH -V3 5/9] ext4: Add sparse annotations for the group info semaphore

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 24 ++++++++++++++++++++----
1 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9211397..f4753bd 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -895,6 +895,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
+__acquires(e4b->alloc_semp)
{
int blocks_per_page;
int block;
@@ -926,6 +927,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
* till we are done with allocation
*/
down_read(e4b->alloc_semp);
+ __acquire(e4b->alloc_semp);

/*
* the buddy cache inode stores the block bitmap
@@ -1020,6 +1022,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
}

static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+__releases(e4b->alloc_semp)
{
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
@@ -1027,6 +1030,7 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
page_cache_release(e4b->bd_buddy_page);
/* Done with the buddy cache */
up_read(e4b->alloc_semp);
+ __release(e4b->alloc_semp);
}


@@ -1468,8 +1472,10 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac,

BUG_ON(ex.fe_len <= 0);
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
- if (err)
+ if (err) {
+ __release(e4b->alloc_semp);
return err;
+ }

ext4_lock_group(ac->ac_sb, group);
max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
@@ -1499,8 +1505,10 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
return 0;

err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
- if (err)
+ if (err) {
+ __release(e4b->alloc_semp);
return err;
+ }

ext4_lock_group(ac->ac_sb, group);
max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
@@ -1959,8 +1967,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
continue;

err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err)
+ if (err) {
+ __release(e4b->alloc_semp);
goto out;
+ }

ext4_lock_group(sb, group);
if (!ext4_mb_good_group(ac, group, cr)) {
@@ -2271,6 +2281,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
sizeof(struct ext4_group_info);
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
+ __release(e4b->alloc_semp);
seq_printf(seq, "#%-5u: I/O error\n", group);
return 0;
}
@@ -3816,6 +3827,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,

err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
+ __release(e4b->alloc_semp);
ext4_error(sb, __func__, "Error in loading buddy "
"information for %u\n", group);
put_bh(bitmap_bh);
@@ -3983,6 +3995,7 @@ void ext4_discard_preallocations(struct inode *inode)

err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
+ __release(e4b->alloc_semp);
ext4_error(sb, __func__, "Error in loading buddy "
"information for %u\n", group);
continue;
@@ -4255,6 +4268,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,

ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
if (ext4_mb_load_buddy(sb, group, &e4b)) {
+ __release(e4b->alloc_semp);
ext4_error(sb, __func__, "Error in loading buddy "
"information for %u\n", group);
continue;
@@ -4715,8 +4729,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
}

err = ext4_mb_load_buddy(sb, block_group, &e4b);
- if (err)
+ if (err) {
+ __release(e4b->alloc_semp);
goto error_return;
+ }
if (metadata) {
/* blocks being freed are metadata. these blocks shouldn't
* be used until this transaction is committed */
--
1.6.0.3.514.g2f91b


2008-11-06 18:57:26

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH -V3 2/9] ext4: Add blocks added during resize to bitmap

With this change new blocks added during resize
are marked as free in the block bitmap and the
group is flagged with EXT4_GROUP_INFO_NEED_INIT_BIT
flag. This make sure when mballoc tries to allocate
blocks from the new group we would reload the
buddy information using the bitmap present in the disk.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/balloc.c | 136 ++++++++++++------------------------------------------
fs/ext4/ext4.h | 5 +-
fs/ext4/resize.c | 11 +----
3 files changed, 34 insertions(+), 118 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 2295f3e..fa69cb3 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
#include "ext4.h"
#include "ext4_jbd2.h"
#include "group.h"
+#include "mballoc.h"

/*
* balloc.c contains the blocks allocation and deallocation routines
@@ -350,62 +351,43 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
}

/**
- * ext4_free_blocks_sb() -- Free given blocks and update quota
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
- * @block: start physcial block to free
+ * @block: start physcial block to add to the block group
* @count: number of blocks to free
- * @pdquot_freed_blocks: pointer to quota
*
- * XXX This function is only used by the on-line resizing code, which
- * should probably be fixed up to call the mballoc variant. There
- * this needs to be cleaned up later; in fact, I'm not convinced this
- * is 100% correct in the face of the mballoc code. The online resizing
- * code needs to be fixed up to more tightly (and correctly) interlock
- * with the mballoc code.
+ * This marks the blocks as free in the bitmap. We ask the
+ * mballoc to reload the buddy after this by setting group
+ * EXT4_GROUP_INFO_NEED_INIT_BIT flag
*/
-void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count,
- unsigned long *pdquot_freed_blocks)
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count)
{
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *gd_bh;
ext4_group_t block_group;
ext4_grpblk_t bit;
unsigned int i;
- unsigned int overflow;
struct ext4_group_desc *desc;
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
int err = 0, ret;
- ext4_grpblk_t group_freed;
+ ext4_grpblk_t blocks_freed;
+ struct ext4_group_info *grp;

- *pdquot_freed_blocks = 0;
sbi = EXT4_SB(sb);
es = sbi->s_es;
- if (block < le32_to_cpu(es->s_first_data_block) ||
- block + count < block ||
- block + count > ext4_blocks_count(es)) {
- ext4_error(sb, "ext4_free_blocks",
- "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
- goto error_return;
- }
-
- ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
+ ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);

-do_more:
- overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
/*
* Check to see if we are freeing blocks across a group
* boundary.
*/
if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
- count -= overflow;
+ goto error_return;
}
- brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
@@ -418,18 +400,17 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, desc),
sbi->s_itb_per_group)) {
- ext4_error(sb, "ext4_free_blocks",
- "Freeing blocks in system zones - "
+ ext4_error(sb, __func__,
+ "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
goto error_return;
}

/*
- * We are about to start releasing blocks in the bitmap,
+ * We are about to add blocks to the bitmap,
* so we need undo access.
*/
- /* @@@ check errors */
BUFFER_TRACE(bitmap_bh, "getting undo access");
err = ext4_journal_get_undo_access(handle, bitmap_bh);
if (err)
@@ -445,87 +426,28 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
if (err)
goto error_return;

- jbd_lock_bh_state(bitmap_bh);
-
- for (i = 0, group_freed = 0; i < count; i++) {
- /*
- * An HJ special. This is expensive...
- */
-#ifdef CONFIG_JBD2_DEBUG
- jbd_unlock_bh_state(bitmap_bh);
- {
- struct buffer_head *debug_bh;
- debug_bh = sb_find_get_block(sb, block + i);
- if (debug_bh) {
- BUFFER_TRACE(debug_bh, "Deleted!");
- if (!bh2jh(bitmap_bh)->b_committed_data)
- BUFFER_TRACE(debug_bh,
- "No commited data in bitmap");
- BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
- __brelse(debug_bh);
- }
- }
- jbd_lock_bh_state(bitmap_bh);
-#endif
- if (need_resched()) {
- jbd_unlock_bh_state(bitmap_bh);
- cond_resched();
- jbd_lock_bh_state(bitmap_bh);
- }
- /* @@@ This prevents newly-allocated data from being
- * freed and then reallocated within the same
- * transaction.
- *
- * Ideally we would want to allow that to happen, but to
- * do so requires making jbd2_journal_forget() capable of
- * revoking the queued write of a data block, which
- * implies blocking on the journal lock. *forget()
- * cannot block due to truncate races.
- *
- * Eventually we can fix this by making jbd2_journal_forget()
- * return a status indicating whether or not it was able
- * to revoke the buffer. On successful revoke, it is
- * safe not to set the allocation bit in the committed
- * bitmap, because we know that there is no outstanding
- * activity on the buffer any more and so it is safe to
- * reallocate it.
- */
- BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
- J_ASSERT_BH(bitmap_bh,
- bh2jh(bitmap_bh)->b_committed_data != NULL);
- ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
- bh2jh(bitmap_bh)->b_committed_data);
-
- /*
- * We clear the bit in the bitmap after setting the committed
- * data bit, because this is the reverse order to that which
- * the allocator uses.
- */
+ for (i = 0, blocks_freed = 0; i < count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
bit + i, bitmap_bh->b_data)) {
- jbd_unlock_bh_state(bitmap_bh);
ext4_error(sb, __func__,
"bit already cleared for block %llu",
(ext4_fsblk_t)(block + i));
- jbd_lock_bh_state(bitmap_bh);
BUFFER_TRACE(bitmap_bh, "bit already cleared");
} else {
- group_freed++;
+ blocks_freed++;
}
}
- jbd_unlock_bh_state(bitmap_bh);
-
spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
+ le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);

if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += count;
+ sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
spin_unlock(sb_bgl_lock(sbi, flex_group));
}

@@ -536,15 +458,17 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
/* And the group descriptor block */
BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
ret = ext4_journal_dirty_metadata(handle, gd_bh);
- if (!err) err = ret;
- *pdquot_freed_blocks += group_freed;
-
- if (overflow && !err) {
- block += count;
- count = overflow;
- goto do_more;
- }
+ if (!err)
+ err = ret;
sb->s_dirt = 1;
+ /*
+ * request to reload the buddy with the
+ * new bitmap information
+ */
+ grp = ext4_get_group_info(sb, block_group);
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+ ext4_mb_update_group_info(grp, blocks_freed);
+
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, err);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c05d639..4514fa5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1012,9 +1012,8 @@ extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count,
- unsigned long *pdquot_freed_blocks);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count);
extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 498ccbc..b6a0609 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -975,9 +975,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
struct buffer_head *bh;
handle_t *handle;
int err;
- unsigned long freed_blocks;
ext4_group_t group;
- struct ext4_group_info *grp;

/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
@@ -1076,7 +1074,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
unlock_super(sb);
ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
- ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
+ /* We add the blocks to the bitmap and set the group need init bit */
+ ext4_add_groupblocks(handle, sb, o_blocks_count, add);
ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
if ((err = ext4_journal_stop(handle)))
@@ -1119,12 +1118,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ClearPageUptodate(page);
page_cache_release(page);
}
-
- /* Get the info on the last group */
- grp = ext4_get_group_info(sb, group);

2008-11-06 18:57:35

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH -V3 6/9] jbd2: Call journal commit callback without holding j_list_lock

Avoid freeing the transaction in __jbd2_journal_drop_transaction() so
the journal commit callback can run without holding j_list_lock, to
avoid lock contention on this spinlock.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/jbd2/checkpoint.c | 2 +-
fs/jbd2/commit.c | 13 ++++++++-----
include/linux/jbd2.h | 4 ++--
3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1c10ce5..ad1d2d9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -682,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
safely remove this transaction from the log */

__jbd2_journal_drop_transaction(journal, transaction);
+ kfree(transaction);

/* Just in case anybody was waiting for more transactions to be
checkpointed... */
@@ -756,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(journal->j_running_transaction != transaction);

jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
- kfree(transaction);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b3bcb26..5befb2d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -340,7 +340,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int space_left = 0;
int first_tag = 0;
int tag_flag;
- int i;
+ int i, to_free = 0;
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
@@ -988,12 +988,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_average_commit_time = commit_time;
spin_unlock(&journal->j_state_lock);

- if (journal->j_commit_callback)
- journal->j_commit_callback(journal, commit_transaction);
-
if (commit_transaction->t_checkpoint_list == NULL &&
commit_transaction->t_checkpoint_io_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
+ to_free = 1;
} else {
if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction;
@@ -1012,11 +1010,16 @@ void jbd2_journal_commit_transaction(journal_t *journal)
}
spin_unlock(&journal->j_list_lock);

+ if (journal->j_commit_callback)
+ journal->j_commit_callback(journal, commit_transaction);
+
trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
- journal->j_devname, journal->j_commit_sequence,
+ journal->j_devname, commit_transaction->t_tid,
journal->j_tail_sequence);
jbd_debug(1, "JBD: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
+ if (to_free)
+ kfree(commit_transaction);

wake_up(&journal->j_wait_done_commit);
}
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index fdbd0bf..4932b34 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1170,8 +1170,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);

void __jbd2_log_wait_for_space(journal_t *journal);
-extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
-extern int jbd2_cleanup_journal_tail(journal_t *);
+extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *);
+extern int jbd2_cleanup_journal_tail(journal_t *);

/* Debugging code only: */

--
1.6.0.3.514.g2f91b


2008-11-06 18:57:43

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH -V3 8/9] ext4: Fix double free of blocks

Blocks freed but not yet committed will be marked free in disk bitmap.
We need to consider them as used when releasing inode prealloc
space. Otherwise we would double free them via mb_free_blocks
ext4_mb_release_inode_pa looks at the block bitmap and mark the
block found free in the bitmap withing the inode prealloc space
as unused. If we have blocks allocated out of a prealloc space
and later freed we would have called ext4_mb_free_blocks on them
That would mark blocks as free in bitmap and later on journal commit
we would call mb_free_blocks on them, thus resuling in double free

ext4_lock_group is used to make sure nothing gets added to the free
list when we are doing a release_inode_pa

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 75 ++++++++++++++++++++++++++++++++++-------------------
1 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7293209..f58de20 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3725,19 +3725,19 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
* TODO: optimize the case when there are no in-core structures yet
*/
static noinline_for_stack int
-ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
- struct ext4_prealloc_space *pa,
- struct ext4_allocation_context *ac)
+ext4_mb_release_inode_pa(struct ext4_buddy *e4b, void *bitmap,
+ struct ext4_prealloc_space *pa,
+ struct ext4_allocation_context *ac)
{
- struct super_block *sb = e4b->bd_sb;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err = 0;
+ int free = 0;
+ sector_t start;
unsigned int end;
unsigned int next;
ext4_group_t group;
ext4_grpblk_t bit;
- sector_t start;
- int err = 0;
- int free = 0;
+ struct super_block *sb = e4b->bd_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);

BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3749,12 +3749,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
ac->ac_inode = pa->pa_inode;
ac->ac_op = EXT4_MB_HISTORY_DISCARD;
}
-
while (bit < end) {
- bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
+ bit = mb_find_next_zero_bit(bitmap, end, bit);
if (bit >= end)
break;
- next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
+ next = mb_find_next_bit(bitmap, end, bit);
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3773,18 +3772,12 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
bit = next + 1;
}
- if (free != pa->pa_free) {
- printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
- pa, (unsigned long) pa->pa_lstart,
- (unsigned long) pa->pa_pstart,
- (unsigned long) pa->pa_len);
- ext4_error(sb, __func__, "free %u, pa_free %u\n",
- free, pa->pa_free);
- /*
- * pa is already deleted so we use the value obtained
- * from the bitmap and continue.
- */
- }
+ /*
+ * The blocks allocated and later freed from this pa
+ * can result in pa_free being different from the
+ * bitmap free block count. This is because we don't
+ * update pa_len on releasing blocks.
+ */
atomic_add(free, &sbi->s_mb_discarded);

return err;
@@ -3840,6 +3833,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
struct ext4_allocation_context *ac;
struct list_head list;
struct ext4_buddy e4b;
+ void *bitmap;
int err;
int busy = 0;
int free = 0;
@@ -3855,12 +3849,21 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
"bitmap for %u\n", group);
return 0;
}
-
+ /*
+ * Blocks freed but not yet committed will be marked free in
+ * disk bitmap. We need to consider them as used when
+ * releasing inode pa. Otherwise we would double free them via
+ * mb_free_blocks
+ */
+ bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+ if (!bitmap)
+ return 0;
err = ext4_mb_load_buddy(sb, group, &e4b);
if (err) {
__release(e4b->alloc_semp);
ext4_error(sb, __func__, "Error in loading buddy "
"information for %u\n", group);
+ kfree(bitmap);
put_bh(bitmap_bh);
return 0;
}
@@ -3915,6 +3918,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
goto out;
}

+ memcpy(bitmap, bitmap_bh->b_data, sb->s_blocksize);
+ ext4_mb_generate_from_freelist(sb, bitmap, group);
/* now free all selected PAs */
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {

@@ -3926,7 +3931,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
if (pa->pa_linear)
ext4_mb_release_group_pa(&e4b, pa, ac);
else
- ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+ ext4_mb_release_inode_pa(&e4b, bitmap, pa, ac);

list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3937,6 +3942,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
ext4_mb_release_desc(&e4b);
+ kfree(bitmap);
put_bh(bitmap_bh);
return free;
}
@@ -3961,6 +3967,7 @@ void ext4_discard_preallocations(struct inode *inode)
struct list_head list;
struct ext4_buddy e4b;
int err;
+ void *bitmap;

if (!S_ISREG(inode->i_mode)) {
/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
@@ -4039,14 +4046,28 @@ void ext4_discard_preallocations(struct inode *inode)
ext4_mb_release_desc(&e4b);
continue;
}
-
+ /* blocks freed but not yet committed will
+ * be marked free in disk bitmap. We need to
+ * consider them as used when releasing inode
+ * pa. Otherwise we would double free them
+ * via mb_free_blocks
+ */
+ bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+ if (!bitmap) {
+ ext4_mb_release_desc(&e4b);
+ put_bh(bitmap_bh);
+ continue;
+ }
+ memcpy(bitmap, bitmap_bh->b_data, sb->s_blocksize);
ext4_lock_group(sb, group);
+ ext4_mb_generate_from_freelist(sb, bitmap, group);
list_del(&pa->pa_group_list);
- ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+ ext4_mb_release_inode_pa(&e4b, bitmap, pa, ac);
ext4_unlock_group(sb, group);

ext4_mb_release_desc(&e4b);
put_bh(bitmap_bh);
+ kfree(bitmap);

list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
--
1.6.0.3.514.g2f91b


2008-11-06 18:57:53

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [RFC PATCH -V3 9/9] ext4: Fix lockdep recursive locking warning

Indicate that the group locks can be taken in loop.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 5 ++++-
1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f58de20..34b9b7c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2413,6 +2413,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
#define ext4_mb_history_init(sb)
#endif

+static struct lock_class_key alloc_sem_key[NR_BG_LOCKS];

/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2473,7 +2474,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
- init_rwsem(&meta_group_info[i]->alloc_sem);
+ __init_rwsem(&meta_group_info[i]->alloc_sem,
+ "&meta_group_info[i]->alloc_sem",
+ &alloc_sem_key[i]);
meta_group_info[i]->bb_free_root.rb_node = NULL;;

#ifdef DOUBLE_CHECK
--
1.6.0.3.514.g2f91b


2008-11-06 22:09:59

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [RFC PATCH -V3 1/9] ext4: sparse fixes

On Fri, Nov 07, 2008 at 12:19:26AM +0530, Aneesh Kumar K.V wrote:
> #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
> - (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
> + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_compat) & mask)
> #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
> - (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
> + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & mask)
> #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
> - (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
> + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & mask)

This is going to force a byte-swap instruction on every single
big-endian system out there, which could potentially be costly, or at
least undeed given that all of the times that these functions are
called, the result gets used only as a booelan.

Can we shut up sparse by using a explicit cast to an unsigned int?

- Ted

2008-11-06 22:37:07

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [RFC PATCH -V3 8/9] ext4: Fix double free of blocks

On Fri, Nov 07, 2008 at 12:19:33AM +0530, Aneesh Kumar K.V wrote:
> Blocks freed but not yet committed will be marked free in disk bitmap.

That should probably read, "marked free in the on-disk block
allocation bitmap".

And I'm having real trouble parsing this below

> ext4_mb_release_inode_pa looks at the block bitmap and mark the
> block found free in the bitmap withing the inode prealloc space

"the block" should that be plural? Which block is it?

"found free in the bitmap"? which bitmap? The block allocation
bitmap?

"withing"? Is that supposed to be within?

"withing the prealloc space"? What is that supposed to be modifying?
It is cloest to "the bitmap", but that doesn't make any sense; is this
phrase supposed to be modifying "the block"?

> as unused. If we have blocks allocated out of a prealloc space

"mark as unused" Mark as unused where? In the block allocation
bitmap? But I thought these were "blocks found free in the bitmap",
so aren't they already unused?

It's better to try to explain things at a higher level. What what
you've said on the conference call, I *think* what happens is that the
preallocation range is range of blocks which is reserved for
allocation for that particular inode. To support this, the blocks in
question are made to appear as though they are not available in the
mballoc's in-memory buddy bitmap, which is what mballoc consults when
making allocations.

For some strange reason which I don't understand, when blocks are
served out of the preallocation area and allocated to the inode, they
are not removed from the preallocation area. (This seems like the
real bug, but whatever...) So when we release a preallocation area
for an inode, for each block in the inode's preallocation area, we
check and see if the block is marked as free according to the on-disk
disk allocation bitmap, and if so we make it look available in
mballoc's in-memory buddy bitmap.

Unfortunately, if the disk blocks in question are freed before the
commit transmit, the blocks would look free in the on-disk block
allocation bitmap, and so there would be an attempt to mark them as
available twice in mballoc's buddy bitmap.

To get around this problem, the patch allocates a temporary bitmap
which also includes the blocks in the "release on commit" linked list.

Did I get this right? If so, we should put an abbreviated version of
the above in the commit, and more of this needs to be explicitly
documented in comments in mballoc.c

So ---- stupid question. Instead of creating the temporary bitmap,
which I fear will be very expensive --- why not remove the block from
the inode's preallocation area when it is served up? Then, it becomes
easier when releasing the inode preallocation area; we wouldn't need
to consult the on-disk block allocation bitmap at all, and merely just
iterate over all of the blocks in the inode preallocation space, and
mark them as available in the mballoc buddy bitmap?

- Ted

2008-11-07 08:07:27

by Li Zefan

[permalink] [raw]
Subject: Re: [RFC PATCH -V3 9/9] ext4: Fix lockdep recursive locking warning

I got compile error on IA64:

fs/ext4/mballoc.c: In function ‘ext4_mb_add_groupinfo’:
fs/ext4/mballoc.c:2482: error: implicit declaration of function ‘__init_rwsem’


Aneesh Kumar K.V wrote:
> Indicate that the group locks can be taken in loop.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> Signed-off-by: "Theodore Ts'o" <[email protected]>
> ---
> fs/ext4/mballoc.c | 5 ++++-
> 1 files changed, 4 insertions(+), 1 deletions(-)
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index f58de20..34b9b7c 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -2413,6 +2413,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
> #define ext4_mb_history_init(sb)
> #endif
>
> +static struct lock_class_key alloc_sem_key[NR_BG_LOCKS];
>
> /* Create and initialize ext4_group_info data for the given group. */
> int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
> @@ -2473,7 +2474,9 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
> }
>
> INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
> - init_rwsem(&meta_group_info[i]->alloc_sem);
> + __init_rwsem(&meta_group_info[i]->alloc_sem,
> + "&meta_group_info[i]->alloc_sem",
> + &alloc_sem_key[i]);
> meta_group_info[i]->bb_free_root.rb_node = NULL;;
>
> #ifdef DOUBLE_CHECK

2008-11-07 14:20:39

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [RFC PATCH -V3 1/9] ext4: sparse fixes

On Thu, Nov 06, 2008 at 05:09:54PM -0500, Theodore Tso wrote:
> On Fri, Nov 07, 2008 at 12:19:26AM +0530, Aneesh Kumar K.V wrote:
> > #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
> > - (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
> > + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_compat) & mask)
> > #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
> > - (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
> > + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & mask)
> > #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
> > - (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
> > + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & mask)
>
> This is going to force a byte-swap instruction on every single
> big-endian system out there, which could potentially be costly, or at
> least undeed given that all of the times that these functions are
> called, the result gets used only as a booelan.
>
> Can we shut up sparse by using a explicit cast to an unsigned int?

I did it that way first But found the error path needing conversion.
But i guess it is ok since it happen in the error path printk.

ext4: sparse fixes

From: Aneesh Kumar K.V <[email protected]>

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/ext4.h | 9 ++++++---
fs/ext4/extents.c | 4 ++--
fs/ext4/file.c | 3 ---
fs/ext4/inode.c | 2 +-
fs/ext4/mballoc.c | 4 +++-
fs/ext4/super.c | 19 +++++++++++--------
6 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9415a1a..5fc8def 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -726,11 +726,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
*/

#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
- (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
+ (__force __u32)(EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
- (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
+ (__force __u32)(EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
- (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
+ (__force __u32)(EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -1284,6 +1284,9 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int create,
int extend_disksize, int flag);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
+
#endif /* __KERNEL__ */

#endif /* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 6b8c687..228261e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3081,7 +3081,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
/*
* Callback function called for each extent to gather FIEMAP information.
*/
-int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
struct ext4_ext_cache *newex, struct ext4_extent *ex,
void *data)
{
@@ -3150,7 +3150,7 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
/* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)

-int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+static int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
{
__u64 physical = 0;
__u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fb..f731cb5 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}

-extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- __u64 start, __u64 len);
-
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1ef1205..d42f748 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3856,7 +3856,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
ext4_fsblk_t block;
int inodes_per_block, inode_offset;

- iloc->bh = 0;
+ iloc->bh = NULL;
if (!ext4_valid_inum(sb, inode->i_ino))
return -EIO;

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 59151c1..9cc93af 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1056,6 +1056,8 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)

static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
+__releases(bitlock)
+__acquires(bitlock)
{
int block = 0;
int max = 0;
@@ -2242,7 +2244,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)


/* Create and initialize ext4_group_info data for the given group. */
-int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
{
int i, len;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d4c84f4..d597c1f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1876,7 +1876,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
int db_count;
int i;
int needs_recovery, has_huge_files;
- __le32 features;
+ int features;
__u64 blocks_count;
int err;

@@ -2005,15 +2005,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
if (features) {
printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ "unsupported optional features (%x).\n", sb->s_id,
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+ ~EXT4_FEATURE_INCOMPAT_SUPP));
goto failed_mount;
}
features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
if (!(sb->s_flags & MS_RDONLY) && features) {
printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
- "unsupported optional features (%x).\n",
- sb->s_id, le32_to_cpu(features));
+ "unsupported optional features (%x).\n", sb->s_id,
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+ ~EXT4_FEATURE_RO_COMPAT_SUPP));
goto failed_mount;
}
has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -3036,13 +3038,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_mark_recovery_complete(sb, es);
lock_super(sb);
} else {
- __le32 ret;
+ int ret;
if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
~EXT4_FEATURE_RO_COMPAT_SUPP))) {
printk(KERN_WARNING "EXT4-fs: %s: couldn't "
"remount RDWR because of unsupported "
- "optional features (%x).\n",
- sb->s_id, le32_to_cpu(ret));
+ "optional features (%x).\n", sb->s_id,
+ (le32_to_cpu(sbi->s_es->s_feature_ro_compat)
+ & ~EXT4_FEATURE_RO_COMPAT_SUPP));
err = -EROFS;
goto restore_opts;
}

2008-11-07 14:45:24

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [RFC PATCH -V3 1/9] ext4: sparse fixes

On Fri, Nov 07, 2008 at 07:50:22PM +0530, Aneesh Kumar K.V wrote:
> I did it that way first But found the error path needing conversion.
> But i guess it is ok since it happen in the error path printk.

If there is an error path that needs conversion (I assume you mean the
fs/ext4/super.c unknown feature printk?) we can just open code it
instead of using the cpp macro --- as I see you've already done.

- Ted

2008-11-07 15:08:20

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [RFC PATCH -V3 1/9] ext4: sparse fixes

On Fri, Nov 07, 2008 at 07:50:22PM +0530, Aneesh Kumar K.V wrote:
> On Thu, Nov 06, 2008 at 05:09:54PM -0500, Theodore Tso wrote:
> > On Fri, Nov 07, 2008 at 12:19:26AM +0530, Aneesh Kumar K.V wrote:
> > > #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
> > > - (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
> > > + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_compat) & mask)
> > > #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
> > > - (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
> > > + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & mask)
> > > #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
> > > - (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
> > > + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & mask)
> >
> > This is going to force a byte-swap instruction on every single
> > big-endian system out there, which could potentially be costly, or at
> > least undeed given that all of the times that these functions are
> > called, the result gets used only as a booelan.
> >
> > Can we shut up sparse by using a explicit cast to an unsigned int?
>
> I did it that way first But found the error path needing conversion.
> But i guess it is ok since it happen in the error path printk.


> #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
> - (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
> + (__force __u32)(EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))

Umm, no. You're not returning a __u32 here, but a flag.

((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)

should do the right thing.


2008-11-07 16:01:37

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [RFC PATCH -V3 8/9] ext4: Fix double free of blocks

On Thu, Nov 06, 2008 at 05:37:02PM -0500, Theodore Tso wrote:
> On Fri, Nov 07, 2008 at 12:19:33AM +0530, Aneesh Kumar K.V wrote:
> > Blocks freed but not yet committed will be marked free in disk bitmap.
>
> That should probably read, "marked free in the on-disk block
> allocation bitmap".
>
> And I'm having real trouble parsing this below
>
> > ext4_mb_release_inode_pa looks at the block bitmap and mark the
> > block found free in the bitmap withing the inode prealloc space
>
> "the block" should that be plural? Which block is it?
>
> "found free in the bitmap"? which bitmap? The block allocation
> bitmap?
>
> "withing"? Is that supposed to be within?
>
> "withing the prealloc space"? What is that supposed to be modifying?
> It is cloest to "the bitmap", but that doesn't make any sense; is this
> phrase supposed to be modifying "the block"?
>
> > as unused. If we have blocks allocated out of a prealloc space
>
> "mark as unused" Mark as unused where? In the block allocation
> bitmap? But I thought these were "blocks found free in the bitmap",
> so aren't they already unused?
>
> It's better to try to explain things at a higher level. What what
> you've said on the conference call, I *think* what happens is that the
> preallocation range is range of blocks which is reserved for
> allocation for that particular inode. To support this, the blocks in
> question are made to appear as though they are not available in the
> mballoc's in-memory buddy bitmap, which is what mballoc consults when
> making allocations.


Yes. That is correct

>
> For some strange reason which I don't understand, when blocks are
> served out of the preallocation area and allocated to the inode, they
> are not removed from the preallocation area. (This seems like the
> real bug, but whatever...)

This happens for inode prealloc space. Because i node prealloc space
is mapped using the logical block number. Ie if we have a prealloc space
of contiguous 'x' blocks starting at physical block 'p'. When creating
this prealloc space we would have been requesting for blocks for logical
block 'l'. ie

[ p .........(p+k).............p+x]
l

Now request for 2 blocks at logical file block 'l' get allocated
from above prealloc space. Again request for 3 blocks at logical block
'l+k' get allocated from the above prealloc space. Also request for
3 blocks at logical block 'p+x+1' WON't use the above prealloc space.
Inshort inode prealloc space use logical block number to find which
prealloc space to use. We also don't reduce the pa_len of the prealloc
space. But we reduce pa_free. Now when we discard this prealloc space
we need to look at on-block bitmap and find out how many blocks actually
got allocated out of the prealloc space. So we start with the first zero
block on the block bitmap starting from physical block 'p'. We search
for next set bit. All the blocks in that range are free. We them mark
them free in the buddy bitmap.


>So when we release a preallocation area
> for an inode, for each block in the inode's preallocation area, we
> check and see if the block is marked as free according to the on-disk
> disk allocation bitmap, and if so we make it look available in
> mballoc's in-memory buddy bitmap.
>
> Unfortunately, if the disk blocks in question are freed before the
> commit transmit, the blocks would look free in the on-disk block
> allocation bitmap, and so there would be an attempt to mark them as
> available twice in mballoc's buddy bitmap.

exactly.

>
> To get around this problem, the patch allocates a temporary bitmap
> which also includes the blocks in the "release on commit" linked list.
>
> Did I get this right? If so, we should put an abbreviated version of
> the above in the commit, and more of this needs to be explicitly
> documented in comments in mballoc.c

I am tempted to add your mail above as it is :)

>
> So ---- stupid question. Instead of creating the temporary bitmap,
> which I fear will be very expensive --- why not remove the block from
> the inode's preallocation area when it is served up?

We can't do that because inode prealloc space look at logical block
of the file.


>Then, it becomes
> easier when releasing the inode preallocation area; we wouldn't need
> to consult the on-disk block allocation bitmap at all, and merely just
> iterate over all of the blocks in the inode preallocation space, and
> mark them as available in the mballoc buddy bitmap?

-aneesh

2008-11-07 16:26:09

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [RFC PATCH -V3 9/9] ext4: Fix lockdep recursive locking warning

On Fri, Nov 07, 2008 at 04:04:40PM +0800, Li Zefan wrote:
> I got compile error on IA64:
>
> fs/ext4/mballoc.c: In function ‘ext4_mb_add_groupinfo’:
> fs/ext4/mballoc.c:2482: error: implicit declaration of function ‘__init_rwsem’
>
>

Can you try this patch. I guess IA64 doesn't support lockdep

ext4: Fix lockdep recursive locking warning

From: Aneesh Kumar K.V <[email protected]>

Indicate that the group locks can be taken in loop.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 11 ++++++++++-
1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f58de20..cb0ccbe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2413,6 +2413,9 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
#define ext4_mb_history_init(sb)
#endif

+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key alloc_sem_key[NR_BG_LOCKS];
+#endif

/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2473,8 +2476,14 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
- init_rwsem(&meta_group_info[i]->alloc_sem);
+#ifdef CONFIG_LOCKDEP
+ __init_rwsem(&meta_group_info[i]->alloc_sem,
+ "&meta_group_info[i]->alloc_sem",
+ &alloc_sem_key[i]);
meta_group_info[i]->bb_free_root.rb_node = NULL;;
+#else
+ init_rwsem(&meta_group_info[i]->alloc_sem);
+#endif

#ifdef DOUBLE_CHECK
{

2008-11-08 01:24:40

by Li Zefan

[permalink] [raw]
Subject: Re: [RFC PATCH -V3 9/9] ext4: Fix lockdep recursive locking warning

Aneesh Kumar K.V wrote:
> On Fri, Nov 07, 2008 at 04:04:40PM +0800, Li Zefan wrote:
>> I got compile error on IA64:
>>
>> fs/ext4/mballoc.c: In function ‘ext4_mb_add_groupinfo’:
>> fs/ext4/mballoc.c:2482: error: implicit declaration of function ‘__init_rwsem’
>>
>>
>
> Can you try this patch. I guess IA64 doesn't support lockdep
>

Yes, it's not supported on IA64.

> ext4: Fix lockdep recursive locking warning
>

I manually did the same as this patch yesterday, and I've confirmed the
patch past compile.