2008-11-21 16:45:17

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V2 1/5] ext4: Remove unneeded code.

This also caused an ext4_error when mounting a file system
with only one block group.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/super.c | 4 ----
1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0582522..b5c7658 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1451,7 +1451,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
ext4_group_t flex_group_count;
ext4_group_t flex_group;
int groups_per_flex = 0;
- __u64 block_bitmap = 0;
int i;

if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1474,9 +1473,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
goto failed;
}

- gdp = ext4_get_group_desc(sb, 1, &bh);
- block_bitmap = ext4_block_bitmap(sb, gdp) - 1;


2008-11-21 16:44:44

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V2 2/5] ext4: unlock group before ext4_error

Otherwise ext4_error will cause BUG because of
scheduling in atomic context.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/ext4.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/mballoc.c | 30 +++++++++++++++---------------
fs/ext4/mballoc.h | 47 -----------------------------------------------
fs/ext4/super.c | 38 ++++++++++++++++++++++++++++++++++++++
4 files changed, 100 insertions(+), 62 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 774762e..ca88bd1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1184,6 +1184,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
extern void ext4_warning(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
+extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
+ const char *, const char *, ...)
+ __attribute__ ((format (printf, 4, 5)));
extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
@@ -1316,6 +1319,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
return ;
}

+struct ext4_group_info {
+ unsigned long bb_state;
+ struct rb_root bb_free_root;
+ unsigned short bb_first_free;
+ unsigned short bb_free;
+ unsigned short bb_fragments;
+ struct list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+ void *bb_bitmap;
+#endif
+ struct rw_semaphore alloc_sem;
+ unsigned short bb_counters[];
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
+#define EXT4_GROUP_INFO_LOCKED_BIT 1
+
+#define EXT4_MB_GRP_NEED_INIT(grp) \
+ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+ struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+ bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline void ext4_unlock_group(struct super_block *sb,
+ ext4_group_t group)
+{
+ struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+ bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline int ext4_is_group_locked(struct super_block *sb,
+ ext4_group_t group)
+{
+ struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+ return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
+ &(grinfo->bb_state));
+}
+
/*
* Inodes and files operations
*/
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e648283..c5dcdf0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -457,8 +457,8 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
blocknr += first + i;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
- ext4_error(sb, __func__, "double-free of inode"
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ __func__, "double-free of inode"
" %lu's block %llu(bit %u in group %u)\n",
inode ? inode->i_ino : 0, blocknr,
first + i, e4b->bd_group);
@@ -702,7 +702,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
grp->bb_fragments = fragments;

if (free != grp->bb_free) {
- ext4_error(sb, __func__,
+ ext4_grp_locked_error(sb, group, __func__,
"EXT4-fs: group %u: %u blocks in bitmap, %u in gd\n",
group, free, grp->bb_free);
/*
@@ -1098,8 +1098,6 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)

static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
-__releases(bitlock)
-__acquires(bitlock)
{
int block = 0;
int max = 0;
@@ -1138,12 +1136,11 @@ __acquires(bitlock)
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- ext4_unlock_group(sb, e4b->bd_group);
- ext4_error(sb, __func__, "double-free of inode"
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ __func__, "double-free of inode"
" %lu's block %llu(bit %u in group %u)\n",
inode ? inode->i_ino : 0, blocknr, block,
e4b->bd_group);
- ext4_lock_group(sb, e4b->bd_group);
}
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
e4b->bd_info->bb_counters[order]++;
@@ -1623,7 +1620,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
* free blocks even though group info says we
* we have free blocks
*/
- ext4_error(sb, __func__, "%d free blocks as per "
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ __func__, "%d free blocks as per "
"group info. But bitmap says 0\n",
free);
break;
@@ -1632,7 +1630,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
- ext4_error(sb, __func__, "%d free blocks as per "
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ __func__, "%d free blocks as per "
"group info. But got %d blocks\n",
free, ex.fe_len);
/*
@@ -3791,8 +3790,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
pa, (unsigned long) pa->pa_lstart,
(unsigned long) pa->pa_pstart,
(unsigned long) pa->pa_len);
- ext4_error(sb, __func__, "free %u, pa_free %u\n",
- free, pa->pa_free);
+ ext4_grp_locked_error(sb, group,
+ __func__, "free %u, pa_free %u\n",
+ free, pa->pa_free);
/*
* pa is already deleted so we use the value obtained
* from the bitmap and continue.
@@ -4605,9 +4605,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
else if (block >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
else {
- ext4_error(sb, __func__,
- "Double free of blocks %d (%d %d)\n",
- block, entry->start_blk, entry->count);
+ ext4_grp_locked_error(sb, e4b->bd_group, __func__,
+ "Double free of blocks %d (%d %d)\n",
+ block, entry->start_blk, entry->count);
return 0;
}
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 407b39a..efb20b5 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -118,27 +118,6 @@ struct ext4_free_data {
tid_t t_tid;
};

-struct ext4_group_info {
- unsigned long bb_state;
- struct rb_root bb_free_root;
- unsigned short bb_first_free;
- unsigned short bb_free;
- unsigned short bb_fragments;
- struct list_head bb_prealloc_list;
-#ifdef DOUBLE_CHECK
- void *bb_bitmap;
-#endif
- struct rw_semaphore alloc_sem;
- unsigned short bb_counters[];
-};
-
-#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
-#define EXT4_GROUP_INFO_LOCKED_BIT 1
-
-#define EXT4_MB_GRP_NEED_INIT(grp) \
- (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
-
-
struct ext4_prealloc_space {
struct list_head pa_inode_list;
struct list_head pa_group_list;
@@ -265,32 +244,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
-
-
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
-{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline void ext4_unlock_group(struct super_block *sb,
- ext4_group_t group)
-{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline int ext4_is_group_locked(struct super_block *sb,
- ext4_group_t group)
-{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
- &(grinfo->bb_state));
-}
-
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
{
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b5c7658..42597b7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -350,6 +350,44 @@ void ext4_warning(struct super_block *sb, const char *function,
va_end(args);
}

+void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
+ const char *function, const char *fmt, ...)
+__releases(bitlock)
+__acquires(bitlock)
+{
+ va_list args;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ va_start(args, fmt);
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+ vprintk(fmt, args);
+ printk("\n");
+ va_end(args);
+
+ if (test_opt(sb, ERRORS_CONT)) {
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ ext4_commit_super(sb, es, 0);
+ return;
+ }
+ ext4_unlock_group(sb, grp);
+ ext4_handle_error(sb);
+ /*
+ * We only get here in the ERRORS_RO case; relocking the group
+ * may be dangerous, but nothing bad will happen since the
+ * filesystem will have already been marked read/only and the
+ * journal has been aborted. We return 1 as a hint to callers
+ * who might what to use the return value from
+ * ext4_grp_locked_error() to distinguish beween the
+ * ERRORS_CONT and ERRORS_RO case, and perhaps return more
+ * aggressively from the ext4 function in question, with a
+ * more appropriate error code.
+ */
+ ext4_lock_group(sb, grp);
+ return;
+}
+
+
void ext4_update_dynamic_rev(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
--
1.6.0.4.735.gea4f


2008-11-21 16:44:48

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V2 4/5] ext4: Use both hi and lo bits of the group desc values.

Rename the lower bits with suffix _lo and add helper
to access the values. Also rename bg_itable_unused_hi
to bg_pad as in e2fsprogs. bg_itable_unused_hi is never
used before

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 13 ++++----
fs/ext4/ext4.h | 24 +++++++++++++--
fs/ext4/ialloc.c | 83 ++++++++++++++++++++++++++++-------------------------
fs/ext4/inode.c | 2 +-
fs/ext4/mballoc.c | 15 +++++----
fs/ext4/resize.c | 4 +-
fs/ext4/super.c | 64 +++++++++++++++++++++++++++++++++++++++-
7 files changed, 144 insertions(+), 61 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d268f5f..f2de3da 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -102,9 +102,9 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
ext4_error(sb, __func__,
"Checksum bad for group %u\n", block_group);
- gdp->bg_free_blocks_count = 0;
- gdp->bg_free_inodes_count = 0;
- gdp->bg_itable_unused = 0;
+ ext4_free_blks_set(sb, gdp, 0);
+ ext4_free_inodes_set(sb, gdp, 0);
+ ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
return 0;
}
@@ -372,7 +372,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
struct ext4_group_desc *desc;
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
- int err = 0, ret;
+ int err = 0, ret, blk_free_count;
ext4_grpblk_t blocks_freed;
struct ext4_group_info *grp;

@@ -444,7 +444,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
}
}
spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
+ blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+ ext4_free_blks_set(sb, desc, blk_free_count);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
@@ -742,7 +743,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ desc_count += ext4_free_blks_count(sb, gdp);
}

return desc_count;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ca88bd1..c1b2e41 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -161,9 +161,9 @@ struct ext4_group_desc
__le32 bg_block_bitmap_lo; /* Blocks bitmap block */
__le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
__le32 bg_inode_table_lo; /* Inodes table block */
- __le16 bg_free_blocks_count; /* Free blocks count */
- __le16 bg_free_inodes_count; /* Free inodes count */
- __le16 bg_used_dirs_count; /* Directories count */
+ __le16 bg_free_blocks_count_lo;/* Free blocks count */
+ __le16 bg_free_inodes_count_lo;/* Free inodes count */
+ __le16 bg_used_dirs_count_lo; /* Directories count */
__le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
__u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
__le16 bg_itable_unused; /* Unused inodes count */
@@ -174,7 +174,7 @@ struct ext4_group_desc
__le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
__le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
__le16 bg_used_dirs_count_hi; /* Directories count MSB */
- __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
+ __le16 bg_pad;
__u32 bg_reserved2[3];
};

@@ -1200,12 +1200,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
struct ext4_group_desc *bg);
+extern __u32 ext4_free_blks_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u16 ext4_itable_unused_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
extern void ext4_block_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_blks_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u16 count);
/* extents.c */
extern int ext4_ext_journal_restart(handle_t *handle, int needed);
/* defrag.c */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4039af6..84f060b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,9 +76,9 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
ext4_error(sb, __func__, "Checksum bad for group %u\n",
block_group);
- gdp->bg_free_blocks_count = 0;
- gdp->bg_free_inodes_count = 0;
- gdp->bg_itable_unused = 0;
+ ext4_free_blks_set(sb, gdp, 0);
+ ext4_free_inodes_set(sb, gdp, 0);
+ ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
return 0;
}
@@ -168,7 +168,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_group_desc *gdp;
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
- int fatal = 0, err;
+ int fatal = 0, err, count;
ext4_group_t flex_group;

if (atomic_read(&inode->i_count) > 1) {
@@ -236,9 +236,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)

if (gdp) {
spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&gdp->bg_free_inodes_count, 1);
- if (is_directory)
- le16_add_cpu(&gdp->bg_used_dirs_count, -1);
+ count = ext4_free_inodes_count(sb, gdp) + 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (is_directory) {
+ count = ext4_used_dirs_count(sb, gdp) - 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ }
gdp->bg_checksum = ext4_group_desc_csum(sbi,
block_group, gdp);
spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -291,13 +294,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,

for (group = 0; group < ngroups; group++) {
desc = ext4_get_group_desc(sb, group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+ if (ext4_free_inodes_count(sb, desc) < avefreei)
continue;
if (!best_desc ||
- (le16_to_cpu(desc->bg_free_blocks_count) >
- le16_to_cpu(best_desc->bg_free_blocks_count))) {
+ (ext4_free_blks_count(sb, desc) >
+ ext4_free_blks_count(sb, best_desc))) {
*best_group = group;
best_desc = desc;
ret = 0;
@@ -369,7 +372,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
for (i = best_flex * flex_size; i < ngroups &&
i < (best_flex + 1) * flex_size; i++) {
desc = ext4_get_group_desc(sb, i, &bh);
- if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ if (ext4_free_inodes_count(sb, desc)) {
*best_group = i;
goto out;
}
@@ -443,17 +446,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
for (i = 0; i < ngroups; i++) {
grp = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, grp, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+ if (ext4_used_dirs_count(sb, desc) >= best_ndir)
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+ if (ext4_free_inodes_count(sb, desc) < avefreei)
continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+ if (ext4_free_blks_count(sb, desc) < avefreeb)
continue;
*group = grp;
ret = 0;
- best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+ best_ndir = ext4_used_dirs_count(sb, desc);
}
if (ret == 0)
return ret;
@@ -479,13 +482,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
for (i = 0; i < ngroups; i++) {
*group = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+ if (ext4_used_dirs_count(sb, desc) >= max_dirs)
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+ if (ext4_free_inodes_count(sb, desc) < min_inodes)
continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+ if (ext4_free_blks_count(sb, desc) < min_blocks)
continue;
return 0;
}
@@ -494,8 +497,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
for (i = 0; i < ngroups; i++) {
*group = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && desc->bg_free_inodes_count &&
- le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_inodes_count(sb, desc) >= avefreei)
return 0;
}

@@ -524,8 +527,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*/
*group = parent_group;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_blks_count(sb, desc))
return 0;

/*
@@ -548,8 +551,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
if (*group >= ngroups)
*group -= ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_blks_count(sb, desc))
return 0;
}

@@ -562,7 +565,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
if (++*group >= ngroups)
*group = 0;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+ if (desc && ext4_free_inodes_count(sb, desc))
return 0;
}

@@ -591,7 +594,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
struct ext4_super_block *es;
struct ext4_inode_info *ei;
struct ext4_sb_info *sbi;
- int ret2, err = 0;
+ int ret2, err = 0, count;
struct inode *ret;
ext4_group_t i;
int free = 0;
@@ -717,7 +720,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
free = ext4_free_blocks_after_init(sb, group, gdp);
- gdp->bg_free_blocks_count = cpu_to_le16(free);
+ ext4_free_blks_set(sb, gdp, free);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
gdp);
}
@@ -751,7 +754,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
free = 0;
} else {
free = EXT4_INODES_PER_GROUP(sb) -
- le16_to_cpu(gdp->bg_itable_unused);
+ ext4_itable_unused_count(sb, gdp);
}

/*
@@ -761,13 +764,15 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
*
*/
if (ino > free)
- gdp->bg_itable_unused =
- cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - ino));
}

- le16_add_cpu(&gdp->bg_free_inodes_count, -1);
+ count = ext4_free_inodes_count(sb, gdp) - 1;
+ ext4_free_inodes_set(sb, gdp, count);
if (S_ISDIR(mode)) {
- le16_add_cpu(&gdp->bg_used_dirs_count, 1);
+ count = ext4_used_dirs_count(sb, gdp) + 1;
+ ext4_used_dirs_set(sb, gdp, count);
}
gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
spin_unlock(sb_bgl_lock(sbi, group));
@@ -981,7 +986,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ desc_count += ext4_free_inodes_count(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_inode_bitmap(sb, i);
if (!bitmap_bh)
@@ -989,7 +994,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)

x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
- i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+ i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
@@ -1003,7 +1008,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ desc_count += ext4_free_inodes_count(sb, gdp);
cond_resched();
}
return desc_count;
@@ -1020,7 +1025,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- count += le16_to_cpu(gdp->bg_used_dirs_count);
+ count += ext4_used_dirs_count(sb, gdp);
}
return count;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 76b1d3a..2d31597 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3987,7 +3987,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
num = EXT4_INODES_PER_GROUP(sb);
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
- num -= le16_to_cpu(gdp->bg_itable_unused);
+ num -= ext4_itable_unused_count(sb, gdp);
table += num / inodes_per_block;
if (end > table)
end = table;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1ed949c..cbba8f4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2483,7 +2483,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
ext4_free_blocks_after_init(sb, group, desc);
} else {
meta_group_info[i]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
+ ext4_free_blks_count(sb, desc);
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -3017,12 +3017,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- gdp->bg_free_blocks_count =
- cpu_to_le16(ext4_free_blocks_after_init(sb,
- ac->ac_b_ex.fe_group,
- gdp));
+ ext4_free_blks_set(sb, gdp,
+ ext4_free_blocks_after_init(sb,
+ ac->ac_b_ex.fe_group, gdp));
}
- le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
+ len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
+ ext4_free_blks_set(sb, gdp, len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -4799,7 +4799,8 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
}

spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&gdp->bg_free_blocks_count, count);
+ ret = ext4_free_blks_count(sb, gdp) + count;
+ ext4_free_blks_set(sb, gdp, ret);
gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 939cf26..6e61613 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -862,8 +862,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
- gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
- gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+ ext4_free_blks_set(sb, gdp, input->free_blocks_count);
+ ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);

/*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 42597b7..046471f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -93,6 +93,36 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}

+__u32 ext4_free_blks_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_free_blocks_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+}
+
+__u32 ext4_free_inodes_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+}
+
+__u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_used_dirs_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+}
+
+__u16 ext4_itable_unused_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_itable_unused);
+}
+
void ext4_block_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
@@ -117,6 +147,36 @@ void ext4_inode_table_set(struct super_block *sb,
bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}

+void ext4_free_blks_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_free_inodes_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_used_dirs_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_itable_unused_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u16 count)
+{
+ bg->bg_itable_unused = cpu_to_le16((__u16)count);
+}
+
/*
* Wrappers for jbd2_journal_start/end.
*
@@ -1516,9 +1576,9 @@ static int ext4_fill_flex_info(struct super_block *sb)

flex_group = ext4_flex_group(sbi, i);
sbi->s_flex_groups[flex_group].free_inodes +=
- le16_to_cpu(gdp->bg_free_inodes_count);
+ ext4_free_inodes_count(sb, gdp);
sbi->s_flex_groups[flex_group].free_blocks +=
- le16_to_cpu(gdp->bg_free_blocks_count);
+ ext4_free_blks_count(sb, gdp);
}

return 1;
--
1.6.0.4.735.gea4f


2008-11-21 16:44:50

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

We need to make sure we update the block bitmap and clear
EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held. We look
at EXT4_BG_BLOCK_UNINIT and reinit the block bitmap each
time in ext4_read_block_bitmap (introduced by
c806e68f5647109350ec546fee5b526962970fd2 )

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/mballoc.c | 17 ++++++++++++-----
1 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c5dcdf0..1ed949c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1065,6 +1065,8 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
__u32 *addr;

len = cur + len;
+ if (lock)
+ spin_lock(lock); \
while (cur < len) {
if ((cur & 31) == 0 && (len - cur) >= 32) {
/* fast path: clear whole word at once */
@@ -1073,9 +1075,11 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_clear_bit_atomic(lock, cur, bm);
+ mb_clear_bit(cur, bm);
cur++;
}
+ if (lock)
+ spin_unlock(lock); \
}

static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
@@ -1083,6 +1087,8 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
__u32 *addr;

len = cur + len;
+ if (lock)
+ spin_lock(lock); \
while (cur < len) {
if ((cur & 31) == 0 && (len - cur) >= 32) {
/* fast path: set whole word at once */
@@ -1091,9 +1097,11 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_set_bit_atomic(lock, cur, bm);
+ mb_set_bit(cur, bm);
cur++;
}
+ if (lock)
+ spin_unlock(lock); \
}

static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
@@ -3004,10 +3012,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
- ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);

2008-11-21 16:44:49

by Aneesh Kumar K.V

[permalink] [raw]
Subject: [PATCH -V2 5/5] ext4: Fix the race between read_inode_bitmap and ext4_new_inode

We need to make sure we update the inode bitmap and clear
EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held. We look
at EXT4_BG_INODE_UNINIT and reinit the inode bitmap each
time in ext4_read_inode_bitmap (introduced by
c806e68f5647109350ec546fee5b526962970fd2 )

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 2 +-
fs/ext4/ialloc.c | 191 ++++++++++++++++++++++++++++++-----------------------
fs/ext4/mballoc.c | 2 +-
3 files changed, 109 insertions(+), 86 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f2de3da..2189d3b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -329,8 +329,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
set_buffer_uptodate(bh);
- unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ unlock_buffer(bh);
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 84f060b..752fea2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -124,8 +124,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
ext4_init_inode_bitmap(sb, bh, block_group, desc);
set_buffer_uptodate(bh);
- unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ unlock_buffer(bh);
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
@@ -572,6 +572,71 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
return -1;
}

+static int ext4_claim_inode(struct super_block *sb,
+ struct buffer_head *inode_bitmap_bh,
+ unsigned long ino, ext4_group_t group, int mode)
+{
+ int free = 0, retval = 0, count;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+ spin_lock(sb_bgl_lock(sbi, group));
+ if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+ /* not a free inode */
+ retval = 1;
+ goto err_ret;
+ }
+ ino++;
+ if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+ ino > EXT4_INODES_PER_GROUP(sb)) {
+ spin_unlock(sb_bgl_lock(sbi, group));
+ ext4_error(sb, __func__,
+ "reserved inode or inode > inodes count - "
+ "block_group = %u, inode=%lu", group,
+ ino + group * EXT4_INODES_PER_GROUP(sb));
+ return 1;
+ }
+ /* If we didn't allocate from within the initialized part of the inode
+ * table then we need to initialize up to this inode. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ /* When marking the block group with
+ * ~EXT4_BG_INODE_UNINIT we don't want to depend
+ * on the value of bg_itable_unused even though
+ * mke2fs could have initialized the same for us.
+ * Instead we calculated the value below
+ */
+
+ free = 0;
+ } else {
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ *
+ */
+ if (ino > free)
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - ino));
+ }
+ count = ext4_free_inodes_count(sb, gdp) - 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (S_ISDIR(mode)) {
+ count = ext4_used_dirs_count(sb, gdp) + 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ }
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+err_ret:
+ spin_unlock(sb_bgl_lock(sbi, group));
+ return retval;
+}
+
/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
@@ -585,8 +650,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
{
struct super_block *sb;
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *bh2;
+ struct buffer_head *inode_bitmap_bh = NULL;
+ struct buffer_head *group_desc_bh;
ext4_group_t group = 0;
unsigned long ino = 0;
struct inode *inode;
@@ -594,7 +659,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
struct ext4_super_block *es;
struct ext4_inode_info *ei;
struct ext4_sb_info *sbi;
- int ret2, err = 0, count;
+ int ret2, err = 0;
struct inode *ret;
ext4_group_t i;
int free = 0;
@@ -634,40 +699,50 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
for (i = 0; i < sbi->s_groups_count; i++) {
err = -EIO;

- gdp = ext4_get_group_desc(sb, group, &bh2);
+ gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp)
goto fail;

- brelse(bitmap_bh);
- bitmap_bh = ext4_read_inode_bitmap(sb, group);
- if (!bitmap_bh)
+ brelse(inode_bitmap_bh);
+ inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+ if (!inode_bitmap_bh)
goto fail;

ino = 0;

repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
- bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
+ inode_bitmap_bh->b_data,
+ EXT4_INODES_PER_GROUP(sb), ino);
if (ino < EXT4_INODES_PER_GROUP(sb)) {

- BUFFER_TRACE(bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bitmap_bh);
+ BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle,
+ inode_bitmap_bh);
if (err)
goto fail;

- if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
- ino, bitmap_bh->b_data)) {
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle,
+ group_desc_bh);
+ if (err)
+ goto fail;
+ if (!ext4_claim_inode(sb, inode_bitmap_bh,
+ ino, group, mode)) {
/* we won it */
- BUFFER_TRACE(bitmap_bh,
+ BUFFER_TRACE(inode_bitmap_bh,
"call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle,
- bitmap_bh);
+ inode_bitmap_bh);
if (err)
goto fail;
+ /* zero bit is inode number 1*/
+ ino++;
goto got;
}
/* we lost it */
- jbd2_journal_release_buffer(handle, bitmap_bh);
+ jbd2_journal_release_buffer(handle, inode_bitmap_bh);
+ jbd2_journal_release_buffer(handle, group_desc_bh);

if (++ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
@@ -687,30 +762,16 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
goto out;

got:
- ino++;
- if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
- ino > EXT4_INODES_PER_GROUP(sb)) {
- ext4_error(sb, __func__,
- "reserved inode or inode > inodes count - "
- "block_group = %u, inode=%lu", group,
- ino + group * EXT4_INODES_PER_GROUP(sb));
- err = -EIO;
- goto fail;
- }
-
- BUFFER_TRACE(bh2, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh2);
- if (err) goto fail;
-
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
+ struct buffer_head *block_bitmap_bh;

- BUFFER_TRACE(block_bh, "get block bitmap access");
- err = ext4_journal_get_write_access(handle, block_bh);
+ block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+ err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
- brelse(block_bh);
+ brelse(block_bitmap_bh);
goto fail;
}

@@ -718,8 +779,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
spin_lock(sb_bgl_lock(sbi, group));
/* recheck and clear flag under lock if we still need to */
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
free = ext4_free_blocks_after_init(sb, group, gdp);
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
ext4_free_blks_set(sb, gdp, free);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
gdp);
@@ -728,57 +789,19 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)

/* Don't need to dirty bitmap block if we didn't change it */
if (free) {
- BUFFER_TRACE(block_bh, "dirty block bitmap");
- err = ext4_journal_dirty_metadata(handle, block_bh);
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_journal_dirty_metadata(handle,
+ block_bitmap_bh);
}

- brelse(block_bh);
+ brelse(block_bitmap_bh);
if (err)
goto fail;
}
-
- spin_lock(sb_bgl_lock(sbi, group));
- /* If we didn't allocate from within the initialized part of the inode
- * table then we need to initialize up to this inode. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-
- /* When marking the block group with
- * ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unused even though
- * mke2fs could have initialized the same for us.
- * Instead we calculated the value below
- */
-
- free = 0;
- } else {
- free = EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp);
- }
-
- /*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- *
- */
- if (ino > free)
- ext4_itable_unused_set(sb, gdp,
- (EXT4_INODES_PER_GROUP(sb) - ino));
- }
-
- count = ext4_free_inodes_count(sb, gdp) - 1;
- ext4_free_inodes_set(sb, gdp, count);
- if (S_ISDIR(mode)) {
- count = ext4_used_dirs_count(sb, gdp) + 1;
- ext4_used_dirs_set(sb, gdp, count);
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
- spin_unlock(sb_bgl_lock(sbi, group));
- BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
- err = ext4_journal_dirty_metadata(handle, bh2);
- if (err) goto fail;
+ BUFFER_TRACE(group_desc_bh, "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle, group_desc_bh);
+ if (err)
+ goto fail;

percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
@@ -876,7 +899,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
iput(inode);
ret = ERR_PTR(err);
really_out:
- brelse(bitmap_bh);
+ brelse(inode_bitmap_bh);
return ret;

fail_free_drop:
@@ -887,7 +910,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
iput(inode);
- brelse(bitmap_bh);
+ brelse(inode_bitmap_bh);
return ERR_PTR(err);
}

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cbba8f4..93df76a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -804,8 +804,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
ext4_init_block_bitmap(sb, bh[i],
first_group + i, desc);
set_buffer_uptodate(bh[i]);
- unlock_buffer(bh[i]);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+ unlock_buffer(bh[i]);
continue;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
--
1.6.0.4.735.gea4f


2008-11-21 17:21:06

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH -V2 1/5] ext4: Remove unneeded code.

Aneesh Kumar K.V wrote:
> This also caused an ext4_error when mounting a file system
> with only one block group.

To make the commit log more useful, could you change it to something
more like:

ext4: Fix ext4_error case when only one block group exists

Leftover code in ext4_fill_flex_info was unconditionally calling
ext4_get_group_desc() for block group 1, and the result of that call was
never actually used. Worse, if we have a filesystem with only 1 block
group, (bg 0), the call would fail and call ext4_error() with the message:

"EXT4-fs error (device XXX): ext4_get_group_desc: block_group >=
groups_count - block_group = 1, groups_count = 1"

when attempting to mount a 1-blockgroup filesystem.

Just remove the unused call to fix this problem.

> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> fs/ext4/super.c | 4 ----
> 1 files changed, 0 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 0582522..b5c7658 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1451,7 +1451,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
> ext4_group_t flex_group_count;
> ext4_group_t flex_group;
> int groups_per_flex = 0;
> - __u64 block_bitmap = 0;
> int i;
>
> if (!sbi->s_es->s_log_groups_per_flex) {
> @@ -1474,9 +1473,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
> goto failed;
> }
>
> - gdp = ext4_get_group_desc(sb, 1, &bh);
> - block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
> -
> for (i = 0; i < sbi->s_groups_count; i++) {
> gdp = ext4_get_group_desc(sb, i, &bh);
>


2008-11-21 17:22:28

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

Aneesh Kumar K.V wrote:
> We need to make sure we update the block bitmap and clear
> EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held. We look
> at EXT4_BG_BLOCK_UNINIT and reinit the block bitmap each
> time in ext4_read_block_bitmap (introduced by
> c806e68f5647109350ec546fee5b526962970fd2 )

Can you add details about the failure mode(s) of this race, so people
(i.e. me) have an idea which bugs they've seen that it might address?

Thanks,
-Eric

2008-11-21 17:30:13

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH -V2 4/5] ext4: Use both hi and lo bits of the group desc values.

Aneesh Kumar K.V wrote:
> Rename the lower bits with suffix _lo and add helper
> to access the values. Also rename bg_itable_unused_hi
> to bg_pad as in e2fsprogs. bg_itable_unused_hi is never
> used before

Maybe you can add info about under which circumstances (what sort of
fielsystem geometry) the hi bits are needed? And the failure mode when
we don't use them, if that's known?

Sorry, I'm being nitpicky about commitlogs today :)

-Eric

2008-11-21 17:30:51

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH -V2 5/5] ext4: Fix the race between read_inode_bitmap and ext4_new_inode

Aneesh Kumar K.V wrote:
> We need to make sure we update the inode bitmap and clear
> EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held. We look
> at EXT4_BG_INODE_UNINIT and reinit the inode bitmap each
> time in ext4_read_inode_bitmap (introduced by
> c806e68f5647109350ec546fee5b526962970fd2 )

Same here as 3/5, if you can mention the failure modes this fixes,
that'd be great.

-Eric

2008-11-21 17:37:28

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

On Fri, Nov 21, 2008 at 11:22:04AM -0600, Eric Sandeen wrote:
> Aneesh Kumar K.V wrote:
> > We need to make sure we update the block bitmap and clear
> > EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held. We look
> > at EXT4_BG_BLOCK_UNINIT and reinit the block bitmap each
> > time in ext4_read_block_bitmap (introduced by
> > c806e68f5647109350ec546fee5b526962970fd2 )
>
> Can you add details about the failure mode(s) of this race, so people
> (i.e. me) have an idea which bugs they've seen that it might address?
>

ext4_read_block_bitmap does

spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);

the above ext4_init_block_bitmap actually zero out the block bitmap.

Now on the block allocation side we do

mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);

spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);

ie on allocation we update the bitmap then we take the sb_bgl_lock
and clear the EXT4_BG_BLOCK_UNINIT flag. What can happen is a
parallel ext4_read_block_bitmap can zero out the bitmap in between
the above mb_set_bits and spin_lock(sb_bg_lock..)

Result of this race is
a) blocks getting allocated multiple times
b) File corruption because two files have same blocks allocated
c) mb_free_blocks called multiple times on the same block

....

Same is true with inode bitmap also.

-aneesh

2008-11-21 17:40:09

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

Aneesh Kumar K.V wrote:
> On Fri, Nov 21, 2008 at 11:22:04AM -0600, Eric Sandeen wrote:
>> Aneesh Kumar K.V wrote:
>>> We need to make sure we update the block bitmap and clear
>>> EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held. We look
>>> at EXT4_BG_BLOCK_UNINIT and reinit the block bitmap each
>>> time in ext4_read_block_bitmap (introduced by
>>> c806e68f5647109350ec546fee5b526962970fd2 )
>> Can you add details about the failure mode(s) of this race, so people
>> (i.e. me) have an idea which bugs they've seen that it might address?
>>
>
> ext4_read_block_bitmap does
>
> spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
> if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> ext4_init_block_bitmap(sb, bh, block_group, desc);
>
> the above ext4_init_block_bitmap actually zero out the block bitmap.
>
> Now on the block allocation side we do
>
> mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
> ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
>
> spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
> if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
>
> ie on allocation we update the bitmap then we take the sb_bgl_lock
> and clear the EXT4_BG_BLOCK_UNINIT flag. What can happen is a
> parallel ext4_read_block_bitmap can zero out the bitmap in between
> the above mb_set_bits and spin_lock(sb_bg_lock..)
>
> Result of this race is
> a) blocks getting allocated multiple times
> b) File corruption because two files have same blocks allocated
> c) mb_free_blocks called multiple times on the same block

Thanks -

And do any of these cases lead to BUG(), WARNING(), ext3_error(), etc
messages that people may one day google for?

-Eric

2008-11-21 17:41:12

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

Aneesh Kumar K.V wrote:
> On Fri, Nov 21, 2008 at 11:01:35PM +0530, Aneesh Kumar K.V wrote:
>> On Fri, Nov 21, 2008 at 11:22:04AM -0600, Eric Sandeen wrote:
>>> Aneesh Kumar K.V wrote:
>>>> We need to make sure we update the block bitmap and clear
>>>> EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held. We look
>>>> at EXT4_BG_BLOCK_UNINIT and reinit the block bitmap each
>>>> time in ext4_read_block_bitmap (introduced by
>>>> c806e68f5647109350ec546fee5b526962970fd2 )
>>> Can you add details about the failure mode(s) of this race, so people
>>> (i.e. me) have an idea which bugs they've seen that it might address?
>>>
>
> The errors I have seen are

Ah, there we go. IMHO, putting a few of these errors into the commit
would be helpful.

Thanks,
-Eric

> a)
> 3795 if (free != pa->pa_free) {
> 3796 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
> 3797 pa, (unsigned long) pa->pa_lstart,
> 3798 (unsigned long) pa->pa_pstart,
> 3799 (unsigned long) pa->pa_len);
>
> b)
>
> 1091 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
> 1092 ext4_fsblk_t blocknr;
> 1093 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
> 1094 blocknr += block;
> 1095 blocknr +=
> 1096 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
> 1097 ext4_unlock_group(sb, e4b->bd_group);
> 1098 ext4_error(sb, __func__, "double-free of
> inode"
>
> For inode bitmap i have seen
>
> [[email protected] tmp]# ls -al /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71
> ls: /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71: Stale NFS file handle
> [[email protected] tmp]# ls -al /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/
> total 411
> drwxrwxrwx 3 689933 root 1024 Nov 18 05:59 .
> drwxrwxrwx 3 8391 root 1024 Nov 18 05:59 ..
> drwxrwxrwx 2 root root 1024 Nov 18 05:33 d83
> -rw-rw-rw- 1 root root 0 Nov 18 05:06 fb4
> -rw-rw-rw- 1 root root 3350138 Nov 18 05:33 fb9
> ?--------- ? ? ? ? ? l71
> lrwxrwxrwx 1 root root 509 Nov 18 05:23 ld9 -> xxxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxx
> [[email protected] tmp]#
>
> dmesg gives:
>
> EXT4-fs error (device sdb1): ext4_free_inode: bit already cleared for inode 168449
>
>
> Some other message i got before. But i didn't capture the info fully
>
> a) "Deleting nonexistent file ..." warning in ext4_unlink
>
> b) "Empty directory has too many links..." in ext4_rmdir
>


2008-11-21 17:43:53

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 4/5] ext4: Use both hi and lo bits of the group desc values.

On Fri, Nov 21, 2008 at 11:29:49AM -0600, Eric Sandeen wrote:
> Aneesh Kumar K.V wrote:
> > Rename the lower bits with suffix _lo and add helper
> > to access the values. Also rename bg_itable_unused_hi
> > to bg_pad as in e2fsprogs. bg_itable_unused_hi is never
> > used before
>
> Maybe you can add info about under which circumstances (what sort of
> fielsystem geometry) the hi bits are needed? And the failure mode when
> we don't use them, if that's known?

Coly actually said it was added for inode reservation. But he agreed to
drop the high bits now and add them later once the performance issue
with the patch is addressed.

-aneesh

2008-11-21 17:45:54

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

On Fri, Nov 21, 2008 at 11:01:35PM +0530, Aneesh Kumar K.V wrote:
> On Fri, Nov 21, 2008 at 11:22:04AM -0600, Eric Sandeen wrote:
> > Aneesh Kumar K.V wrote:
> > > We need to make sure we update the block bitmap and clear
> > > EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held. We look
> > > at EXT4_BG_BLOCK_UNINIT and reinit the block bitmap each
> > > time in ext4_read_block_bitmap (introduced by
> > > c806e68f5647109350ec546fee5b526962970fd2 )
> >
> > Can you add details about the failure mode(s) of this race, so people
> > (i.e. me) have an idea which bugs they've seen that it might address?
> >
>

The errors I have seen are

a)
3795 if (free != pa->pa_free) {
3796 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
3797 pa, (unsigned long) pa->pa_lstart,
3798 (unsigned long) pa->pa_pstart,
3799 (unsigned long) pa->pa_len);

b)

1091 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1092 ext4_fsblk_t blocknr;
1093 blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb);
1094 blocknr += block;
1095 blocknr +=
1096 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1097 ext4_unlock_group(sb, e4b->bd_group);
1098 ext4_error(sb, __func__, "double-free of
inode"

For inode bitmap i have seen

[[email protected] tmp]# ls -al /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71
ls: /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71: Stale NFS file handle
[[email protected] tmp]# ls -al /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/
total 411
drwxrwxrwx 3 689933 root 1024 Nov 18 05:59 .
drwxrwxrwx 3 8391 root 1024 Nov 18 05:59 ..
drwxrwxrwx 2 root root 1024 Nov 18 05:33 d83
-rw-rw-rw- 1 root root 0 Nov 18 05:06 fb4
-rw-rw-rw- 1 root root 3350138 Nov 18 05:33 fb9
?--------- ? ? ? ? ? l71
lrwxrwxrwx 1 root root 509 Nov 18 05:23 ld9 -> xxxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxxx/xxxxxxxx
[[email protected] tmp]#

dmesg gives:

EXT4-fs error (device sdb1): ext4_free_inode: bit already cleared for inode 168449


Some other message i got before. But i didn't capture the info fully

a) "Deleting nonexistent file ..." warning in ext4_unlink

b) "Empty directory has too many links..." in ext4_rmdir


2008-11-21 17:53:58

by Eric Sandeen

[permalink] [raw]
Subject: Re: [PATCH -V2 4/5] ext4: Use both hi and lo bits of the group desc values.

Aneesh Kumar K.V wrote:
> On Fri, Nov 21, 2008 at 11:29:49AM -0600, Eric Sandeen wrote:
>> Aneesh Kumar K.V wrote:
>>> Rename the lower bits with suffix _lo and add helper
>>> to access the values. Also rename bg_itable_unused_hi
>>> to bg_pad as in e2fsprogs. bg_itable_unused_hi is never
>>> used before
>> Maybe you can add info about under which circumstances (what sort of
>> fielsystem geometry) the hi bits are needed? And the failure mode when
>> we don't use them, if that's known?
>
> Coly actually said it was added for inode reservation. But he agreed to
> drop the high bits now and add them later once the performance issue
> with the patch is addressed.
>
> -aneesh

Ok, and for the rest? (free_blocks, free_inodes) - I guess these are
only needed for > 2^32 block filesystems?

Thanks,
-eric

2008-11-23 04:09:47

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH -V2 4/5] ext4: Use both hi and lo bits of the group desc values.

On Nov 21, 2008 11:53 -0600, Eric Sandeen wrote:
> Ok, and for the rest? (free_blocks, free_inodes) - I guess these are
> only needed for > 2^32 block filesystems?

Actually, since this is a per-group number, I'm not sure why this is
needed at all. I don't object to keeping the helper functions though.

Cheers, Andreas
--
Andreas Dilger
Sr. Staff Engineer, Lustre Group
Sun Microsystems of Canada, Inc.


2008-11-23 13:37:24

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V2 2/5] ext4: unlock group before ext4_error

On Fri, Nov 21, 2008 at 10:14:32PM +0530, Aneesh Kumar K.V wrote:
> Otherwise ext4_error will cause BUG because of
> scheduling in atomic context.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>

When I tried adding this patch to the patch queue, I got the following
rejected chunk:

diff a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c (rejected hunks)
@@ -3791,8 +3790,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
pa, (unsigned long) pa->pa_lstart,
(unsigned long) pa->pa_pstart,
(unsigned long) pa->pa_len);
- ext4_error(sb, __func__, "free %u, pa_free %u\n",
- free, pa->pa_free);
+ ext4_grp_locked_error(sb, group,
+ __func__, "free %u, pa_free %u\n",
+ free, pa->pa_free);
/*
* pa is already deleted so we use the value obtained
* from the bitmap and continue.

This looks like it came from the patch
aneesh-8-fix-double-free-of-blocks which in a message sent roughly at
the same time you told me to drop from the patch queue. So I'm adding
this to the patch queue without this rejected hunk.

One of the challenges right now with applying your patches is that you
have a large number of patches in the unstable part of the tree, and
when you send me new patches, it's not clear where they apply and
since they are entagled with each other, I may get some of the wrong.

If there are certain patches which we are sure are OK, such as the
sparse fixes, I can move them into the stable part of the tree and
assume they aren't going to change moving forward. But for patches
which are unstable, I'm going to need some help from you to make sure
they get applied in a sane and stable order....

- Ted

2008-11-23 13:45:39

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V2 2/5] ext4: unlock group before ext4_error

On Sun, Nov 23, 2008 at 08:37:04AM -0500, Theodore Tso wrote:
> This looks like it came from the patch
> aneesh-8-fix-double-free-of-blocks which in a message sent roughly at
> the same time you told me to drop from the patch queue. So I'm adding
> this to the patch queue without this rejected hunk.

Sorry, I managed to confuse myself. The patch conflict went away
after I removed the patch per your e-mail. I have so many *.rej files
in my tree from failed patches that I managed to confuse myself. I'll
have to clean them out....

My comments about how difficult it is to figure out where your patches
apply are still relevant...... I'm still trying to figure this all
out, but you'll need to check the patch queue afterwards. Can you
please give me a status readout about which patches I can move into
the stable part of the tree, please? And what the status is of the
other patches? Thanks!!!

- Ted

2008-11-23 14:00:56

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

On Fri, Nov 21, 2008 at 10:14:33PM +0530, Aneesh Kumar K.V wrote:
> We need to make sure we update the block bitmap and clear
> EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held. We look
> at EXT4_BG_BLOCK_UNINIT and reinit the block bitmap each
> time in ext4_read_block_bitmap (introduced by
> c806e68f5647109350ec546fee5b526962970fd2 )

You are changing mb_clear_bits() and and mb_set_bits() so they take
the spinlock over the entire operaiton, instead of over each
particular bit. These function are used in a largish number of
places, not just for updating the block bitmap, but also the mb buddy
bitmaps, etc. So there may be a scalability impact here, although
taking the spinlock once instead of multiple times is probably a win.

My bigger concern is given that we are playing games like *this*:

if ((cur & 31) == 0 && (len - cur) >= 32) {
/* fast path: set whole word at once */
addr = bm + (cur >> 3);
*addr = 0xffffffff;
cur += 32;
continue;
}

without taking a lock, I'm a little surprised we haven't been
seriously burned by other race conditions. What's the point of
calling mb_set_bit_atomic() and passing in a spinlock if we are doing
this kind of check without the protection of the same spinlock?!?

Andreas, if you are using mb_clear_bits() and mb_set_bits() in
Lustre's mballoc.c with this in production, you may want to take a
look at this patch.

- Ted

2008-11-23 14:00:57

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 2/5] ext4: unlock group before ext4_error

On Sun, Nov 23, 2008 at 08:43:04AM -0500, Theodore Tso wrote:
> On Sun, Nov 23, 2008 at 08:37:04AM -0500, Theodore Tso wrote:
> > This looks like it came from the patch
> > aneesh-8-fix-double-free-of-blocks which in a message sent roughly at
> > the same time you told me to drop from the patch queue. So I'm adding
> > this to the patch queue without this rejected hunk.
>
> Sorry, I managed to confuse myself. The patch conflict went away
> after I removed the patch per your e-mail. I have so many *.rej files
> in my tree from failed patches that I managed to confuse myself. I'll
> have to clean them out....
>
> My comments about how difficult it is to figure out where your patches
> apply are still relevant...... I'm still trying to figure this all
> out, but you'll need to check the patch queue afterwards. Can you
> please give me a status readout about which patches I can move into
> the stable part of the tree, please? And what the status is of the
> other patches? Thanks!!!
>

All the patches I do are mostly after applying all the patches in the
patch queue. I would wait for some review and wider testing
on the patches before they can be moved to stable part of the
patch queue.

-aneesh

2008-11-23 19:02:43

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

On Fri, Nov 21, 2008 at 11:01:35PM +0530, Aneesh Kumar K.V wrote:
> Same is true with inode bitmap also.
>

I don't see how this patch could affect any races with the inode
bitmaps; the patch only affects mballoc.c, and only changes the
locking around bitmaps.

This is one that I think we may want to send to Linus soon as a 2.6.28
bugfix.

- Ted

2008-11-23 19:28:18

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V2 5/5] ext4: Fix the race between read_inode_bitmap and ext4_new_inode

It's a lot easier to review patches if you explain why new functions
(such as ext4_claim_inode(), in this case) are required, and to add a
comment at the beginning of the new function explaining what it does,
what its return code it needs, whether it needs to be called with any
locks held, whether it exits with locks taken or released, etc.

Thanks,

- Ted

2008-11-24 01:26:29

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V2 4/5] ext4: Use both hi and lo bits of the group desc values.

On Sat, Nov 22, 2008 at 10:09:39PM -0600, Andreas Dilger wrote:
> On Nov 21, 2008 11:53 -0600, Eric Sandeen wrote:
> > Ok, and for the rest? (free_blocks, free_inodes) - I guess these are
> > only needed for > 2^32 block filesystems?
>
> Actually, since this is a per-group number, I'm not sure why this is
> needed at all. I don't object to keeping the helper functions though.

For most systems, it wn't be needed at all. However, for filesystems
with block sizes >= 8k, we run into a limit where the maximum number
of blocks and inodes we can support per block group is 65528 (we can't
support 65536 since that would cause bg_free_blocks_count and/or
bg_free_blocks_count, which are both __u16 types, to overflow). So on
systems with a page size > 4k, or if Christoph's patch to support fs
block sizes > page size ever makes it into mainline, this support
could be useful.

- Ted

2008-11-24 02:15:33

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V2 4/5] ext4: Use both hi and lo bits of the group desc values.

On Fri, Nov 21, 2008 at 11:11:10PM +0530, Aneesh Kumar K.V wrote:
> On Fri, Nov 21, 2008 at 11:29:49AM -0600, Eric Sandeen wrote:
> > Aneesh Kumar K.V wrote:
> > > Rename the lower bits with suffix _lo and add helper
> > > to access the values. Also rename bg_itable_unused_hi
> > > to bg_pad as in e2fsprogs. bg_itable_unused_hi is never
> > > used before
> >
> > Maybe you can add info about under which circumstances (what sort of
> > fielsystem geometry) the hi bits are needed? And the failure mode when
> > we don't use them, if that's known?
>
> Coly actually said it was added for inode reservation. But he agreed to
> drop the high bits now and add them later once the performance issue
> with the patch is addressed.

The bg_itable_unused field is used for the lazy initialization of
inodes. If the units of this field was blocks was blocks, we wouldn't
need bg_itable_unused_hi for quite a larger set of blocks.

I don't think we should nuke bg_itable_unused_hi, especially given
that we have a patch to support the kernel initializing the inodes
(yet to be reviewed). As near as I can tell at the time when the
fields were added to e2fsprogs, we hadn't yet assigned the pad field
to bg_itable_unused_hi, but it's clearly needed if we want to support
a larger number of blocks or inodes per block group.

Instead we should create the helper functions for bg_itable_unused,
and update e2fsprogs to also support these fields. Adding e2fsprogs
for large block groups is lower priority than large block numbers,
though.

- Ted

2008-11-24 04:10:35

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH -V2 5/5] ext4: Fix the race between read_inode_bitmap and ext4_new_inode

On Fri, Nov 21, 2008 at 10:14:35PM +0530, Aneesh Kumar K.V wrote:
> We need to make sure we update the inode bitmap and clear
> EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held. We look
> at EXT4_BG_INODE_UNINIT and reinit the inode bitmap each
> time in ext4_read_inode_bitmap (introduced by
> c806e68f5647109350ec546fee5b526962970fd2 )

OK, I believe I've checked in all of your patches in this series into
the ext4 patch queue

Some of them have comments that still need to be cleared; this one in
particular needs a better commit comment, and ideally a comment for
the new function ext4_claim_inode().

Also, please don't rename variables unnecessarily; if you really think
it's needed, please do so in a separate patch. The renaming of
variables makes it much harder to review the patch, since it bloats
the patch, and obscures the true changes happening in the patch.
Please explain why you are making some of the changes you made in the
patch. In particular, why does it matter the order in which you
unlock the bh and sb_bgl_lock in balloc.c, mballoc.c and inode.c?


Thanks,

- Ted

2008-11-24 06:15:21

by Alex Zhuravlev

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

Theodore Tso wrote:
> My bigger concern is given that we are playing games like *this*:
>
> if ((cur & 31) == 0 && (len - cur) >= 32) {
> /* fast path: set whole word at once */
> addr = bm + (cur >> 3);
> *addr = 0xffffffff;
> cur += 32;
> continue;
> }

this is to avoid expensive LOCK prefix in some cases.

> without taking a lock, I'm a little surprised we haven't been
> seriously burned by other race conditions. What's the point of
> calling mb_set_bit_atomic() and passing in a spinlock if we are doing
> this kind of check without the protection of the same spinlock?!?

why would we need a lock for a whole word bitop ?

thanks, Alex


2008-11-24 06:41:04

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

On Sun, Nov 23, 2008 at 02:02:24PM -0500, Theodore Tso wrote:
> On Fri, Nov 21, 2008 at 11:01:35PM +0530, Aneesh Kumar K.V wrote:
> > Same is true with inode bitmap also.
> >
>
> I don't see how this patch could affect any races with the inode
> bitmaps; the patch only affects mballoc.c, and only changes the
> locking around bitmaps.

The intent was to explain why we need
[PATCH -V2 5/5] ext4: Fix the race between read_inode_bitmap
and ext4_new_inode


>
> This is one that I think we may want to send to Linus soon as a 2.6.28
> bugfix.
>

Yes.

-aneesh

2008-11-24 10:38:30

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 4/5] ext4: Use both hi and lo bits of the group desc values.

On Sun, Nov 23, 2008 at 09:13:02PM -0500, Theodore Tso wrote:
> On Fri, Nov 21, 2008 at 11:11:10PM +0530, Aneesh Kumar K.V wrote:
> > On Fri, Nov 21, 2008 at 11:29:49AM -0600, Eric Sandeen wrote:
> > > Aneesh Kumar K.V wrote:
> > > > Rename the lower bits with suffix _lo and add helper
> > > > to access the values. Also rename bg_itable_unused_hi
> > > > to bg_pad as in e2fsprogs. bg_itable_unused_hi is never
> > > > used before
> > >
> > > Maybe you can add info about under which circumstances (what sort of
> > > fielsystem geometry) the hi bits are needed? And the failure mode when
> > > we don't use them, if that's known?
> >
> > Coly actually said it was added for inode reservation. But he agreed to
> > drop the high bits now and add them later once the performance issue
> > with the patch is addressed.
>
> The bg_itable_unused field is used for the lazy initialization of
> inodes. If the units of this field was blocks was blocks, we wouldn't
> need bg_itable_unused_hi for quite a larger set of blocks.
>
> I don't think we should nuke bg_itable_unused_hi, especially given
> that we have a patch to support the kernel initializing the inodes
> (yet to be reviewed). As near as I can tell at the time when the
> fields were added to e2fsprogs, we hadn't yet assigned the pad field
> to bg_itable_unused_hi, but it's clearly needed if we want to support
> a larger number of blocks or inodes per block group.
>
> Instead we should create the helper functions for bg_itable_unused,
> and update e2fsprogs to also support these fields. Adding e2fsprogs
> for large block groups is lower priority than large block numbers,
> though.

The patch already contain the helper function. Initially I had the patch
to use bg_itable_unused_hi. But later dropped it after discussion with
Coly. Updated patch below.

ext4: Use high 16 bits of the block group descriptor's free counts fields

From: Aneesh Kumar K.V <[email protected]>

Rename the lower bits with suffix _lo and add helper
to access the values. Also rename bg_itable_unused_hi
to bg_pad as in e2fsprogs.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/balloc.c | 13 ++++----
fs/ext4/ext4.h | 26 +++++++++++++----
fs/ext4/ialloc.c | 83 ++++++++++++++++++++++++++++-------------------------
fs/ext4/inode.c | 2 +
fs/ext4/mballoc.c | 15 +++++-----
fs/ext4/resize.c | 4 +--
fs/ext4/super.c | 68 ++++++++++++++++++++++++++++++++++++++++++-
7 files changed, 149 insertions(+), 62 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 39df492..8dd7b0d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -102,9 +102,9 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
ext4_error(sb, __func__,
"Checksum bad for group %u\n", block_group);
- gdp->bg_free_blocks_count = 0;
- gdp->bg_free_inodes_count = 0;
- gdp->bg_itable_unused = 0;
+ ext4_free_blks_set(sb, gdp, 0);
+ ext4_free_inodes_set(sb, gdp, 0);
+ ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
return 0;
}
@@ -372,7 +372,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
struct ext4_group_desc *desc;
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
- int err = 0, ret;
+ int err = 0, ret, blk_free_count;
ext4_grpblk_t blocks_freed;
struct ext4_group_info *grp;

@@ -444,7 +444,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
}
}
spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
+ blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+ ext4_free_blks_set(sb, desc, blk_free_count);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
@@ -741,7 +742,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ desc_count += ext4_free_blks_count(sb, gdp);
}

return desc_count;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3f0c2e1..1f03808 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -156,12 +156,12 @@ struct ext4_group_desc
__le32 bg_block_bitmap_lo; /* Blocks bitmap block */
__le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
__le32 bg_inode_table_lo; /* Inodes table block */
- __le16 bg_free_blocks_count; /* Free blocks count */
- __le16 bg_free_inodes_count; /* Free inodes count */
- __le16 bg_used_dirs_count; /* Directories count */
+ __le16 bg_free_blocks_count_lo;/* Free blocks count */
+ __le16 bg_free_inodes_count_lo;/* Free inodes count */
+ __le16 bg_used_dirs_count_lo; /* Directories count */
__le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
__u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
- __le16 bg_itable_unused; /* Unused inodes count */
+ __le16 bg_itable_unused_lo; /* Unused inodes count */
__le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
__le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
__le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
@@ -169,7 +169,7 @@ struct ext4_group_desc
__le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
__le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
__le16 bg_used_dirs_count_hi; /* Directories count MSB */
- __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
+ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
__u32 bg_reserved2[3];
};

@@ -1139,12 +1139,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
struct ext4_group_desc *bg);
+extern __u32 ext4_free_blks_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
+extern __u32 ext4_itable_unused_count(struct super_block *sb,
+ struct ext4_group_desc *bg);
extern void ext4_block_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_blks_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count);

static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4039af6..84f060b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,9 +76,9 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
ext4_error(sb, __func__, "Checksum bad for group %u\n",
block_group);
- gdp->bg_free_blocks_count = 0;
- gdp->bg_free_inodes_count = 0;
- gdp->bg_itable_unused = 0;
+ ext4_free_blks_set(sb, gdp, 0);
+ ext4_free_inodes_set(sb, gdp, 0);
+ ext4_itable_unused_set(sb, gdp, 0);
memset(bh->b_data, 0xff, sb->s_blocksize);
return 0;
}
@@ -168,7 +168,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_group_desc *gdp;
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
- int fatal = 0, err;
+ int fatal = 0, err, count;
ext4_group_t flex_group;

if (atomic_read(&inode->i_count) > 1) {
@@ -236,9 +236,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)

if (gdp) {
spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&gdp->bg_free_inodes_count, 1);
- if (is_directory)
- le16_add_cpu(&gdp->bg_used_dirs_count, -1);
+ count = ext4_free_inodes_count(sb, gdp) + 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (is_directory) {
+ count = ext4_used_dirs_count(sb, gdp) - 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ }
gdp->bg_checksum = ext4_group_desc_csum(sbi,
block_group, gdp);
spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -291,13 +294,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,

for (group = 0; group < ngroups; group++) {
desc = ext4_get_group_desc(sb, group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+ if (ext4_free_inodes_count(sb, desc) < avefreei)
continue;
if (!best_desc ||
- (le16_to_cpu(desc->bg_free_blocks_count) >
- le16_to_cpu(best_desc->bg_free_blocks_count))) {
+ (ext4_free_blks_count(sb, desc) >
+ ext4_free_blks_count(sb, best_desc))) {
*best_group = group;
best_desc = desc;
ret = 0;
@@ -369,7 +372,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
for (i = best_flex * flex_size; i < ngroups &&
i < (best_flex + 1) * flex_size; i++) {
desc = ext4_get_group_desc(sb, i, &bh);
- if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ if (ext4_free_inodes_count(sb, desc)) {
*best_group = i;
goto out;
}
@@ -443,17 +446,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
for (i = 0; i < ngroups; i++) {
grp = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, grp, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+ if (ext4_used_dirs_count(sb, desc) >= best_ndir)
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+ if (ext4_free_inodes_count(sb, desc) < avefreei)
continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+ if (ext4_free_blks_count(sb, desc) < avefreeb)
continue;
*group = grp;
ret = 0;
- best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+ best_ndir = ext4_used_dirs_count(sb, desc);
}
if (ret == 0)
return ret;
@@ -479,13 +482,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
for (i = 0; i < ngroups; i++) {
*group = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (!desc || !desc->bg_free_inodes_count)
+ if (!desc || !ext4_free_inodes_count(sb, desc))
continue;
- if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+ if (ext4_used_dirs_count(sb, desc) >= max_dirs)
continue;
- if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+ if (ext4_free_inodes_count(sb, desc) < min_inodes)
continue;
- if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+ if (ext4_free_blks_count(sb, desc) < min_blocks)
continue;
return 0;
}
@@ -494,8 +497,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
for (i = 0; i < ngroups; i++) {
*group = (parent_group + i) % ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && desc->bg_free_inodes_count &&
- le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_inodes_count(sb, desc) >= avefreei)
return 0;
}

@@ -524,8 +527,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*/
*group = parent_group;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_blks_count(sb, desc))
return 0;

/*
@@ -548,8 +551,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
if (*group >= ngroups)
*group -= ngroups;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
- le16_to_cpu(desc->bg_free_blocks_count))
+ if (desc && ext4_free_inodes_count(sb, desc) &&
+ ext4_free_blks_count(sb, desc))
return 0;
}

@@ -562,7 +565,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
if (++*group >= ngroups)
*group = 0;
desc = ext4_get_group_desc(sb, *group, NULL);
- if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+ if (desc && ext4_free_inodes_count(sb, desc))
return 0;
}

@@ -591,7 +594,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
struct ext4_super_block *es;
struct ext4_inode_info *ei;
struct ext4_sb_info *sbi;
- int ret2, err = 0;
+ int ret2, err = 0, count;
struct inode *ret;
ext4_group_t i;
int free = 0;
@@ -717,7 +720,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
free = ext4_free_blocks_after_init(sb, group, gdp);
- gdp->bg_free_blocks_count = cpu_to_le16(free);
+ ext4_free_blks_set(sb, gdp, free);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
gdp);
}
@@ -751,7 +754,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
free = 0;
} else {
free = EXT4_INODES_PER_GROUP(sb) -
- le16_to_cpu(gdp->bg_itable_unused);
+ ext4_itable_unused_count(sb, gdp);
}

/*
@@ -761,13 +764,15 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
*
*/
if (ino > free)
- gdp->bg_itable_unused =
- cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - ino));
}

- le16_add_cpu(&gdp->bg_free_inodes_count, -1);
+ count = ext4_free_inodes_count(sb, gdp) - 1;
+ ext4_free_inodes_set(sb, gdp, count);
if (S_ISDIR(mode)) {
- le16_add_cpu(&gdp->bg_used_dirs_count, 1);
+ count = ext4_used_dirs_count(sb, gdp) + 1;
+ ext4_used_dirs_set(sb, gdp, count);
}
gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
spin_unlock(sb_bgl_lock(sbi, group));
@@ -981,7 +986,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ desc_count += ext4_free_inodes_count(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_inode_bitmap(sb, i);
if (!bitmap_bh)
@@ -989,7 +994,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)

x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
- i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+ i, ext4_free_inodes_count(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
@@ -1003,7 +1008,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ desc_count += ext4_free_inodes_count(sb, gdp);
cond_resched();
}
return desc_count;
@@ -1020,7 +1025,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- count += le16_to_cpu(gdp->bg_used_dirs_count);
+ count += ext4_used_dirs_count(sb, gdp);
}
return count;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 76b1d3a..2d31597 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3987,7 +3987,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
num = EXT4_INODES_PER_GROUP(sb);
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
- num -= le16_to_cpu(gdp->bg_itable_unused);
+ num -= ext4_itable_unused_count(sb, gdp);
table += num / inodes_per_block;
if (end > table)
end = table;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f6c1b0b..f97dfd8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2476,7 +2476,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
ext4_free_blocks_after_init(sb, group, desc);
} else {
meta_group_info[i]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
+ ext4_free_blks_count(sb, desc);
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -3004,12 +3004,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- gdp->bg_free_blocks_count =
- cpu_to_le16(ext4_free_blocks_after_init(sb,
- ac->ac_b_ex.fe_group,
- gdp));
+ ext4_free_blks_set(sb, gdp,
+ ext4_free_blocks_after_init(sb,
+ ac->ac_b_ex.fe_group, gdp));
}
- le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
+ len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
+ ext4_free_blks_set(sb, gdp, len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -4785,7 +4785,8 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
}

spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&gdp->bg_free_blocks_count, count);
+ ret = ext4_free_blks_count(sb, gdp) + count;
+ ext4_free_blks_set(sb, gdp, ret);
gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 939cf26..6e61613 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -862,8 +862,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
- gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
- gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+ ext4_free_blks_set(sb, gdp, input->free_blocks_count);
+ ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);

/*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14cd87..5acd386 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -93,6 +93,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}

+__u32 ext4_free_blks_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_free_blocks_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+}
+
+__u32 ext4_free_inodes_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+}
+
+__u32 ext4_used_dirs_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_used_dirs_count_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+}
+
+__u32 ext4_itable_unused_count(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le16_to_cpu(bg->bg_itable_unused_lo) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+}
+
void ext4_block_bitmap_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk)
{
@@ -117,6 +149,38 @@ void ext4_inode_table_set(struct super_block *sb,
bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
}

+void ext4_free_blks_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_free_inodes_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_used_dirs_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_itable_unused_set(struct super_block *sb,
+ struct ext4_group_desc *bg, __u32 count)
+{
+ bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
+}
+
/*
* Wrappers for jbd2_journal_start/end.
*
@@ -1510,9 +1574,9 @@ static int ext4_fill_flex_info(struct super_block *sb)

flex_group = ext4_flex_group(sbi, i);
sbi->s_flex_groups[flex_group].free_inodes +=
- le16_to_cpu(gdp->bg_free_inodes_count);
+ ext4_free_inodes_count(sb, gdp);
sbi->s_flex_groups[flex_group].free_blocks +=
- le16_to_cpu(gdp->bg_free_blocks_count);
+ ext4_free_blks_count(sb, gdp);
}

return 1;

2008-11-24 11:16:00

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 5/5] ext4: Fix the race between read_inode_bitmap and ext4_new_inode

On Sun, Nov 23, 2008 at 11:05:24PM -0500, Theodore Tso wrote:
> On Fri, Nov 21, 2008 at 10:14:35PM +0530, Aneesh Kumar K.V wrote:
> > We need to make sure we update the inode bitmap and clear
> > EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held. We look
> > at EXT4_BG_INODE_UNINIT and reinit the inode bitmap each
> > time in ext4_read_inode_bitmap (introduced by
> > c806e68f5647109350ec546fee5b526962970fd2 )
>
> OK, I believe I've checked in all of your patches in this series into
> the ext4 patch queue
>
> Some of them have comments that still need to be cleared; this one in
> particular needs a better commit comment, and ideally a comment for
> the new function ext4_claim_inode().

I will add a comment to the above function.

>
> Also, please don't rename variables unnecessarily; if you really think
> it's needed, please do so in a separate patch. The renaming of
> variables makes it much harder to review the patch, since it bloats
> the patch, and obscures the true changes happening in the patch.
> Please explain why you are making some of the changes you made in the
> patch. In particular, why does it matter the order in which you
> unlock the bh and sb_bgl_lock in balloc.c, mballoc.c and inode.c?
>
>

I will put the variable name cleanup and unlock cleanup into separate
patch. The unlock is done as a cleanup so that unlock appears in the
reverse order with which we did locking. It doesn't make any difference.

-aneesh

Updated patch below

ext4: Fix the race between read_inode_bitmap() and ext4_new_inode()

From: Aneesh Kumar K.V <[email protected]>

We need to make sure we update the inode bitmap and clear
EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held. We look
at EXT4_BG_INODE_UNINIT and reinit the inode bitmap each
time in ext4_read_inode_bitmap (introduced by
c806e68f5647109350ec546fee5b526962970fd2 )

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
fs/ext4/ialloc.c | 146 ++++++++++++++++++++++++++++++++----------------------
1 files changed, 86 insertions(+), 60 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 229708b..d1ccae5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -573,6 +573,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
}

/*
+ * claim the inode from the inode bitmap. If the group
+ * is uninit we need to take the groups's sb_bgl_lock
+ * and clear the uninit flag. The inode bitmap update
+ * and group desc uninit flag clear should be done
+ * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * doesn't race with the ext4_claim_inode
+ */
+static int ext4_claim_inode(struct super_block *sb,
+ struct buffer_head *inode_bitmap_bh,
+ unsigned long ino, ext4_group_t group, int mode)
+{
+ int free = 0, retval = 0, count;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+ spin_lock(sb_bgl_lock(sbi, group));
+ if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+ /* not a free inode */
+ retval = 1;
+ goto err_ret;
+ }
+ ino++;
+ if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+ ino > EXT4_INODES_PER_GROUP(sb)) {
+ spin_unlock(sb_bgl_lock(sbi, group));
+ ext4_error(sb, __func__,
+ "reserved inode or inode > inodes count - "
+ "block_group = %u, inode=%lu", group,
+ ino + group * EXT4_INODES_PER_GROUP(sb));
+ return 1;
+ }
+ /* If we didn't allocate from within the initialized part of the inode
+ * table then we need to initialize up to this inode. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ /* When marking the block group with
+ * ~EXT4_BG_INODE_UNINIT we don't want to depend
+ * on the value of bg_itable_unused even though
+ * mke2fs could have initialized the same for us.
+ * Instead we calculated the value below
+ */
+
+ free = 0;
+ } else {
+ free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ *
+ */
+ if (ino > free)
+ ext4_itable_unused_set(sb, gdp,
+ (EXT4_INODES_PER_GROUP(sb) - ino));
+ }
+ count = ext4_free_inodes_count(sb, gdp) - 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (S_ISDIR(mode)) {
+ count = ext4_used_dirs_count(sb, gdp) + 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ }
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+err_ret:
+ spin_unlock(sb_bgl_lock(sbi, group));
+ return retval;
+}
+
+/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -594,7 +667,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
struct ext4_super_block *es;
struct ext4_inode_info *ei;
struct ext4_sb_info *sbi;
- int ret2, err = 0, count;
+ int ret2, err = 0;
struct inode *ret;
ext4_group_t i;
int free = 0;
@@ -657,8 +730,13 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (err)
goto fail;

- if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
- ino, inode_bitmap_bh->b_data)) {
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle,
+ group_desc_bh);
+ if (err)
+ goto fail;
+ if (!ext4_claim_inode(sb, inode_bitmap_bh,
+ ino, group, mode)) {
/* we won it */
BUFFER_TRACE(inode_bitmap_bh,
"call ext4_journal_dirty_metadata");
@@ -666,10 +744,13 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
inode_bitmap_bh);
if (err)
goto fail;
+ /* zero bit is inode number 1*/
+ ino++;
goto got;
}
/* we lost it */
jbd2_journal_release_buffer(handle, inode_bitmap_bh);
+ jbd2_journal_release_buffer(handle, group_desc_bh);

if (++ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
@@ -689,22 +770,6 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
goto out;

got:
- ino++;
- if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
- ino > EXT4_INODES_PER_GROUP(sb)) {
- ext4_error(sb, __func__,
- "reserved inode or inode > inodes count - "
- "block_group = %u, inode=%lu", group,
- ino + group * EXT4_INODES_PER_GROUP(sb));
- err = -EIO;
- goto fail;
- }
-
- BUFFER_TRACE(group_desc_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, group_desc_bh);
- if (err)
- goto fail;
-
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -741,49 +806,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (err)
goto fail;
}
-
- spin_lock(sb_bgl_lock(sbi, group));
- /* If we didn't allocate from within the initialized part of the inode
- * table then we need to initialize up to this inode. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-
- /* When marking the block group with
- * ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unused even though
- * mke2fs could have initialized the same for us.
- * Instead we calculated the value below
- */
-
- free = 0;
- } else {
- free = EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp);
- }
-
- /*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- *
- */
- if (ino > free)
- ext4_itable_unused_set(sb, gdp,
- (EXT4_INODES_PER_GROUP(sb) - ino));
- }

2008-11-24 11:37:38

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

On Mon, Nov 24, 2008 at 10:14:27AM +0300, Alex Zhuravlev wrote:
> Theodore Tso wrote:
>> My bigger concern is given that we are playing games like *this*:
>>
>> if ((cur & 31) == 0 && (len - cur) >= 32) {
>> /* fast path: set whole word at once */
>> addr = bm + (cur >> 3);
>> *addr = 0xffffffff;
>> cur += 32;
>> continue;
>> }
>
> this is to avoid expensive LOCK prefix in some cases.
>
>> without taking a lock, I'm a little surprised we haven't been
>> seriously burned by other race conditions. What's the point of
>> calling mb_set_bit_atomic() and passing in a spinlock if we are doing
>> this kind of check without the protection of the same spinlock?!?
>
> why would we need a lock for a whole word bitop ?

Ok the changes was not done for this purpose. I need to make sure we
update bitmap and clear group_desc uninit flag after taking sb_bgl_lock
That means when we claim blocks we can't use mb_set_bits with
sb_bgl_lock because we would already be holding it. How about the below
change

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad99..53180b1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1031,7 +1031,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_clear_bit_atomic(lock, cur, bm);
+ if (lock)
+ mb_clear_bit_atomic(lock, cur, bm);
+ else
+ mb_clear_bit(cur, bm);
cur++;
}
}
@@ -1049,7 +1052,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- mb_set_bit_atomic(lock, cur, bm);
+ if (lock)
+ mb_set_bit_atomic(lock, cur, bm);
+ else
+ mb_set_bit(cur, bm);
cur++;
}
}

2008-11-24 16:36:59

by Alex Zhuravlev

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

Aneesh Kumar K.V wrote:
> Ok the changes was not done for this purpose. I need to make sure we
> update bitmap and clear group_desc uninit flag after taking sb_bgl_lock
> That means when we claim blocks we can't use mb_set_bits with
> sb_bgl_lock because we would already be holding it. How about the below
> change

may I have a look at the original patch?

thanks, Alex


2008-11-24 16:49:31

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

On Mon, Nov 24, 2008 at 07:36:49PM +0300, Alex Zhuravlev wrote:
> Aneesh Kumar K.V wrote:
>> Ok the changes was not done for this purpose. I need to make sure we
>> update bitmap and clear group_desc uninit flag after taking sb_bgl_lock
>> That means when we claim blocks we can't use mb_set_bits with
>> sb_bgl_lock because we would already be holding it. How about the below
>> change
>
> may I have a look at the original patch?

http://patchwork.ozlabs.org/patch/10065/

-aneesh

2008-11-24 18:03:42

by Alex Zhuravlev

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

Aneesh Kumar K.V wrote:
> On Mon, Nov 24, 2008 at 07:36:49PM +0300, Alex Zhuravlev wrote:
>> Aneesh Kumar K.V wrote:
>>> Ok the changes was not done for this purpose. I need to make sure we
>>> update bitmap and clear group_desc uninit flag after taking sb_bgl_lock
>>> That means when we claim blocks we can't use mb_set_bits with
>>> sb_bgl_lock because we would already be holding it. How about the below
>>> change
>> may I have a look at the original patch?
>
> http://patchwork.ozlabs.org/patch/10065/

I don't understand how a group can be "uninit" if we do some manipulations
inside. both allocation and preallocation initialize group first, see in
ext4_mb_init_cache()

thanks, Alex


2008-11-24 18:13:04

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

On Mon, Nov 24, 2008 at 09:03:21PM +0300, Alex Zhuravlev wrote:
> Aneesh Kumar K.V wrote:
>> On Mon, Nov 24, 2008 at 07:36:49PM +0300, Alex Zhuravlev wrote:
>>> Aneesh Kumar K.V wrote:
>>>> Ok the changes was not done for this purpose. I need to make sure we
>>>> update bitmap and clear group_desc uninit flag after taking sb_bgl_lock
>>>> That means when we claim blocks we can't use mb_set_bits with
>>>> sb_bgl_lock because we would already be holding it. How about the below
>>>> change
>>> may I have a look at the original patch?
>>
>> http://patchwork.ozlabs.org/patch/10065/
>
> I don't understand how a group can be "uninit" if we do some manipulations
> inside. both allocation and preallocation initialize group first, see in
> ext4_mb_init_cache()
>

With commit c806e68f we do a init_bitmap every time we do a
read_block_bitmap.

To quote the update commit message that i have

ext4: Fix race between read_block_bitmap() and mark_diskspace_used()

We need to make sure we update the block bitmap and clear
EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held. We look at
EXT4_BG_BLOCK_UNINIT and reinit the block bitmap each time in
ext4_read_block_bitmap (introduced by commit c806e68f), and this can
race with block allocations in ext4_mb_mark_diskspace_used().

ext4_read_block_bitmap does:

spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);

Now on the block allocation side we do

mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
....
spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);

ie on allocation we update the bitmap then we take the sb_bgl_lock
and clear the EXT4_BG_BLOCK_UNINIT flag. What can happen is a
parallel ext4_read_block_bitmap can zero out the bitmap in between
the above mb_set_bits and spin_lock(sb_bg_lock..)

The race results in below user visible errors
EXT4-fs error (device sdb1): ext4_mb_release_inode_pa: free 100, pa_free 105
EXT4-fs error (device sdb1): mb_free_blocks: double-free of inode 0's block 50(bit 100 in group 0)

-aneesh

2008-11-24 18:18:09

by Alex Zhuravlev

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

Aneesh Kumar K.V wrote:
> With commit c806e68f we do a init_bitmap every time we do a
> read_block_bitmap.

can you explain why do we need to init it every time?

thanks, Alex

>
> To quote the update commit message that i have
>
> ext4: Fix race between read_block_bitmap() and mark_diskspace_used()
>
> We need to make sure we update the block bitmap and clear
> EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held. We look at
> EXT4_BG_BLOCK_UNINIT and reinit the block bitmap each time in
> ext4_read_block_bitmap (introduced by commit c806e68f), and this can
> race with block allocations in ext4_mb_mark_diskspace_used().
>
> ext4_read_block_bitmap does:
>
> spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
> if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> ext4_init_block_bitmap(sb, bh, block_group, desc);
>
> Now on the block allocation side we do
>
> mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
> ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
> ....
> spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
> if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
>
> ie on allocation we update the bitmap then we take the sb_bgl_lock
> and clear the EXT4_BG_BLOCK_UNINIT flag. What can happen is a
> parallel ext4_read_block_bitmap can zero out the bitmap in between
> the above mb_set_bits and spin_lock(sb_bg_lock..)
>
> The race results in below user visible errors
> EXT4-fs error (device sdb1): ext4_mb_release_inode_pa: free 100, pa_free 105
> EXT4-fs error (device sdb1): mb_free_blocks: double-free of inode 0's block 50(bit 100 in group 0)
>
> -aneesh


2008-11-24 18:22:18

by Aneesh Kumar K.V

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

On Mon, Nov 24, 2008 at 09:17:53PM +0300, Alex Zhuravlev wrote:
> Aneesh Kumar K.V wrote:
>> With commit c806e68f we do a init_bitmap every time we do a
>> read_block_bitmap.
>
> can you explain why do we need to init it every time?
>

The commit message explains it well. It is because the buffer_head
can be marked uptodate by a read from userspace. So we would skip doing
a init_bitmap on the uninit group during resize.

commit c806e68f5647109350ec546fee5b526962970fd2
Author: Frederic Bohe <[email protected]>
Date: Fri Oct 10 08:09:18 2008 -0400

ext4: fix initialization of UNINIT bitmap blocks

This fixes a bug which caused on-line resizing of filesystems with a
1k blocksize to fail. The root cause of this bug was the fact that if
an uninitalized bitmap block gets read in by userspace (which
e2fsprogs does try to avoid, but can happen when the blocksize is less
than the pagesize and an adjacent blocks is read into memory)
ext4_read_block_bitmap() was erroneously depending on the buffer
uptodate flag to decide whether it needed to initialize the bitmap
block in memory --- i.e., to set the standard set of blocks in use by
a block group (superblock, bitmaps, inode table, etc.). Essentially,
ext4_read_block_bitmap() assumed it was the only routine that might
try to read a block containing a block bitmap, which is simply not
true.

To fix this, ext4_read_block_bitmap() and ext4_read_inode_bitmap()
must always initialize uninitialized bitmap blocks. Once a block or
inode is allocated out of that bitmap, it will be marked as
initialized in the block group descriptor, so in general this won't
result any extra unnecessary work.

Signed-off-by: Frederic Bohe <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 59566c0..bd2ece2 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -319,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
block_group, bitmap_blk);
return NULL;
}
- if (bh_uptodate_or_lock(bh))
+ if (buffer_uptodate(bh) &&
+ !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
return bh;

+ lock_buffer(bh);
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1343bf1..fe34d74 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -115,9 +115,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
block_group, bitmap_blk);
return NULL;
}
- if (bh_uptodate_or_lock(bh))
+ if (buffer_uptodate(bh) &&
+ !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
return bh;

+ lock_buffer(bh);
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
ext4_init_inode_bitmap(sb, bh, block_group, desc);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 335faee..b580714 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -782,9 +782,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (bh[i] == NULL)
goto out;

- if (bh_uptodate_or_lock(bh[i]))
+ if (buffer_uptodate(bh[i]) &&
+ !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
continue;

+ lock_buffer(bh[i]);
spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh[i],

2008-11-24 18:28:15

by Alex Zhuravlev

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

strange ... I'd expect the code to check "uptodate" once buffer is locked. you?

thanks, Alex

Aneesh Kumar K.V wrote:
> @@ -319,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
> block_group, bitmap_blk);
> return NULL;
> }
> - if (bh_uptodate_or_lock(bh))
> + if (buffer_uptodate(bh) &&
> + !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
> return bh;
>
> + lock_buffer(bh);
> spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
> if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> ext4_init_block_bitmap(sb, bh, block_group, desc);


2008-11-24 18:41:33

by Alex Zhuravlev

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

looks even more strange, IMHO. do I understand correct that two processes
doing allocation in the same group can do two initializations? what if one
process just allocated block(s) and not cleared UNINIT bit yet?

thanks, Alex



Aneesh Kumar K.V wrote:
> On Mon, Nov 24, 2008 at 09:17:53PM +0300, Alex Zhuravlev wrote:
>> Aneesh Kumar K.V wrote:
>>> With commit c806e68f we do a init_bitmap every time we do a
>>> read_block_bitmap.
>> can you explain why do we need to init it every time?
>>
>
> The commit message explains it well. It is because the buffer_head
> can be marked uptodate by a read from userspace. So we would skip doing
> a init_bitmap on the uninit group during resize.
>
> commit c806e68f5647109350ec546fee5b526962970fd2
> Author: Frederic Bohe <[email protected]>
> Date: Fri Oct 10 08:09:18 2008 -0400
>
> ext4: fix initialization of UNINIT bitmap blocks
>
> This fixes a bug which caused on-line resizing of filesystems with a
> 1k blocksize to fail. The root cause of this bug was the fact that if
> an uninitalized bitmap block gets read in by userspace (which
> e2fsprogs does try to avoid, but can happen when the blocksize is less
> than the pagesize and an adjacent blocks is read into memory)
> ext4_read_block_bitmap() was erroneously depending on the buffer
> uptodate flag to decide whether it needed to initialize the bitmap
> block in memory --- i.e., to set the standard set of blocks in use by
> a block group (superblock, bitmaps, inode table, etc.). Essentially,
> ext4_read_block_bitmap() assumed it was the only routine that might
> try to read a block containing a block bitmap, which is simply not
> true.
>
> To fix this, ext4_read_block_bitmap() and ext4_read_inode_bitmap()
> must always initialize uninitialized bitmap blocks. Once a block or
> inode is allocated out of that bitmap, it will be marked as
> initialized in the block group descriptor, so in general this won't
> result any extra unnecessary work.
>
> Signed-off-by: Frederic Bohe <[email protected]>
> Signed-off-by: "Theodore Ts'o" <[email protected]>
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 59566c0..bd2ece2 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -319,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
> block_group, bitmap_blk);
> return NULL;
> }
> - if (bh_uptodate_or_lock(bh))
> + if (buffer_uptodate(bh) &&
> + !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
> return bh;
>
> + lock_buffer(bh);
> spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
> if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> ext4_init_block_bitmap(sb, bh, block_group, desc);
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 1343bf1..fe34d74 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -115,9 +115,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
> block_group, bitmap_blk);
> return NULL;
> }
> - if (bh_uptodate_or_lock(bh))
> + if (buffer_uptodate(bh) &&
> + !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
> return bh;
>
> + lock_buffer(bh);
> spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
> if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
> ext4_init_inode_bitmap(sb, bh, block_group, desc);
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 335faee..b580714 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -782,9 +782,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
> if (bh[i] == NULL)
> goto out;
>
> - if (bh_uptodate_or_lock(bh[i]))
> + if (buffer_uptodate(bh[i]) &&
> + !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
> continue;
>
> + lock_buffer(bh[i]);
> spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
> if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
> ext4_init_block_bitmap(sb, bh[i],


2008-11-25 14:27:48

by Frédéric Bohé

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

> Ok the changes was not done for this purpose. I need to make sure we
> update bitmap and clear group_desc uninit flag after taking sb_bgl_lock
> That means when we claim blocks we can't use mb_set_bits with
> sb_bgl_lock because we would already be holding it. How about the below
> change
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 444ad99..53180b1 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -1031,7 +1031,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
> cur += 32;
> continue;
> }
> - mb_clear_bit_atomic(lock, cur, bm);
> + if (lock)
> + mb_clear_bit_atomic(lock, cur, bm);
> + else
> + mb_clear_bit(cur, bm);
> cur++;
> }
> }
> @@ -1049,7 +1052,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
> cur += 32;
> continue;
> }
> - mb_set_bit_atomic(lock, cur, bm);
> + if (lock)
> + mb_set_bit_atomic(lock, cur, bm);
> + else
> + mb_set_bit(cur, bm);
> cur++;
> }
> }


Aneesh,

I've just tried your patch on top of the patch queue and I still have
this error when I resize the fs while allocating blocks concurrently
(with dd):

EXT4-fs error (device md0): ext4_mb_mark_diskspace_used: Allocating
block in system zone - block = 9117697
EXT4-fs error (device md0): ext4_mb_generate_buddy: EXT4-fs: group 1113:
6144 blocks in bitmap, 6014 in gd


Anyway I'm not sure if it is related to the race you try to fix.

FYI this error first appeared with the first set of patches concerning
resize and mballoc.

Regards,

Frederic








2008-11-25 16:39:42

by Alex Zhuravlev

[permalink] [raw]
Subject: Re: [PATCH -V2 3/5] ext4: Fix the race between read_block_bitmap and mark_diskspace_used

Fr?d?ric Boh? wrote:
> I've just tried your patch on top of the patch queue and I still have
> this error when I resize the fs while allocating blocks concurrently
> (with dd):

I believe this is because original fix is wrong. instead of uptodate we
could use different flag.

thanks, Alex