2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 01/40] ext4: Use bitops to read/modify EXT4_I(inode)->i_state

commit 19f5fb7ad679bb361222c7916086435020c37cce upstream (as of v2.6.33-git11)

At several places we modify EXT4_I(inode)->i_state without holding
i_mutex (ext4_release_file, ext4_bmap, ext4_journalled_writepage,
ext4_do_update_inode, ...). These modifications are racy and we can
lose updates to i_state. So convert handling of i_state to use bitops
which are atomic.

Cc: Jan Kara <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 41 +++++++++++++++++++++++++++++------------
fs/ext4/extents.c | 8 ++++----
fs/ext4/file.c | 4 ++--
fs/ext4/ialloc.c | 3 ++-
fs/ext4/inode.c | 38 ++++++++++++++++++++------------------
fs/ext4/migrate.c | 6 +++---
fs/ext4/xattr.c | 22 +++++++++++-----------
7 files changed, 71 insertions(+), 51 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 602d5ad..8e8783a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -313,17 +313,6 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
return flags & EXT4_OTHER_FLMASK;
}

-/*
- * Inode dynamic state flags
- */
-#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
-#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
-#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
-#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
-#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
-#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
-
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
__u32 group; /* Group number for this data */
@@ -630,7 +619,7 @@ struct ext4_inode_info {
* near to their parent directory's inode.
*/
ext4_group_t i_block_group;
- __u32 i_state; /* Dynamic state flags for ext4 */
+ unsigned long i_state_flags; /* Dynamic state flags */

ext4_lblk_t i_dir_start_lookup;
#ifdef CONFIG_EXT4_FS_XATTR
@@ -1050,6 +1039,34 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
(ino >= EXT4_FIRST_INO(sb) &&
ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
}
+
+/*
+ * Inode dynamic state flags
+ */
+enum {
+ EXT4_STATE_JDATA, /* journaled data exists */
+ EXT4_STATE_NEW, /* inode is newly created */
+ EXT4_STATE_XATTR, /* has in-inode xattrs */
+ EXT4_STATE_NO_EXPAND, /* No space for expansion */
+ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
+ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
+ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
+};
+
+static inline int ext4_test_inode_state(struct inode *inode, int bit)
+{
+ return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_set_inode_state(struct inode *inode, int bit)
+{
+ set_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
+
+static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+{
+ clear_bit(bit, &EXT4_I(inode)->i_state_flags);
+}
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cae75c1..9015ecb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3076,7 +3076,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
if (io)
io->flag = DIO_AIO_UNWRITTEN;
else
- EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+ ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
goto out;
}
/* async DIO end_io complete, convert the filled extent to written */
@@ -3362,8 +3362,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
if (io)
io->flag = DIO_AIO_UNWRITTEN;
else
- EXT4_I(inode)->i_state |=
- EXT4_STATE_DIO_UNWRITTEN;;
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
}
}
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
@@ -3739,7 +3739,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
int error = 0;

/* in-inode? */
- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
struct ext4_iloc iloc;
int offset; /* offset of xattr in inode */

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 9630583..f6071ce 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -35,9 +35,9 @@
*/
static int ext4_release_file(struct inode *inode, struct file *filp)
{
- if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
ext4_alloc_da_blocks(inode);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+ ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
}
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f3624ea..2fab5ad 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1029,7 +1029,8 @@ got:
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);

- ei->i_state = EXT4_STATE_NEW;
+ ei->i_state_flags = 0;
+ ext4_set_inode_state(inode, EXT4_STATE_NEW);

ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2059c34..6a7694e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1306,7 +1306,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
}

/*
@@ -1793,7 +1793,7 @@ static int ext4_journalled_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
if (new_i_size > EXT4_I(inode)->i_disksize) {
ext4_update_i_disksize(inode, new_i_size);
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2630,7 +2630,7 @@ static int __ext4_journalled_writepage(struct page *page,
ret = err;

walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
return ret;
}
@@ -3301,7 +3301,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
filemap_write_and_wait(mapping);
}

- if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+ if (EXT4_JOURNAL(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
/*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
@@ -3320,7 +3321,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
* everything they get.
*/

- EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+ ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
journal = EXT4_JOURNAL(inode);
jbd2_journal_lock_updates(journal);
err = jbd2_journal_flush(journal);
@@ -3788,8 +3789,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
ext4_free_io_end(iocb->private);
iocb->private = NULL;
- } else if (ret > 0 && (EXT4_I(inode)->i_state &
- EXT4_STATE_DIO_UNWRITTEN)) {
+ } else if (ret > 0 && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
int err;
/*
* for non AIO case, since the IO is already
@@ -3799,7 +3800,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
offset, ret);
if (err < 0)
ret = err;
- EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
}
return ret;
}
@@ -4434,7 +4435,7 @@ void ext4_truncate(struct inode *inode)
return;

if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
- ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
ext4_ext_truncate(inode);
@@ -4720,7 +4721,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext4_get_inode_loc(inode, iloc,
- !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+ !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}

void ext4_set_inode_flags(struct inode *inode)
@@ -4814,7 +4815,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);

- ei->i_state = 0;
+ ei->i_state_flags = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -4897,7 +4898,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize;
if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ei->i_state |= EXT4_STATE_XATTR;
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
}
} else
ei->i_extra_isize = 0;
@@ -5037,7 +5038,7 @@ static int ext4_do_update_inode(handle_t *handle,

/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT4_STATE_NEW)
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

ext4_get_inode_flags(ei);
@@ -5133,7 +5134,7 @@ static int ext4_do_update_inode(handle_t *handle,
rc = ext4_handle_dirty_metadata(handle, inode, bh);
if (!err)
err = rc;
- ei->i_state &= ~EXT4_STATE_NEW;
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);

ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse:
@@ -5557,8 +5558,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
entry = IFIRST(header);

/* No extended attributes present */
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
- header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
new_extra_isize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5602,7 +5603,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
- !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+ !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
/*
* We need extra buffer credits since we may write into EA block
* with this same handle. If journal_extend fails, then it will
@@ -5616,7 +5617,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
sbi->s_want_extra_isize,
iloc, handle);
if (ret) {
- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ ext4_set_inode_state(inode,
+ EXT4_STATE_NO_EXPAND);
if (mnt_count !=
le16_to_cpu(sbi->s_es->s_mnt_count)) {
ext4_warning(inode->i_sb, __func__,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8141581..46a4101 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -365,12 +365,12 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* happened after we started the migrate. We need to
* fail the migrate
*/
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
retval = -EAGAIN;
up_write(&EXT4_I(inode)->i_data_sem);
goto err_out;
} else
- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
/*
* We have the extent map build with the tmp inode.
* Now copy the i_data across
@@ -533,7 +533,7 @@ int ext4_ext_migrate(struct inode *inode)
* allocation.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
+ ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));

handle = ext4_journal_start(inode, 1);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f3a2f7e..c619a7e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -267,7 +267,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *end;
int error;

- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
return -ENODATA;
error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -396,7 +396,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
void *end;
int error;

- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
return 0;
error = ext4_get_inode_loc(inode, &iloc);
if (error)
@@ -908,7 +908,7 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
error = ext4_xattr_check_names(IFIRST(header), is->s.end);
if (error)
return error;
@@ -940,10 +940,10 @@ ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
header = IHDR(inode, ext4_raw_inode(&is->iloc));
if (!IS_LAST_ENTRY(s->first)) {
header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
- EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
} else {
header->h_magic = cpu_to_le32(0);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+ ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
}
return 0;
}
@@ -986,8 +986,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (strlen(name) > 255)
return -ERANGE;
down_write(&EXT4_I(inode)->xattr_sem);
- no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+ ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);

error = ext4_get_inode_loc(inode, &is.iloc);
if (error)
@@ -997,10 +997,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;

- if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
}

error = ext4_xattr_ibody_find(inode, &i, &is);
@@ -1052,7 +1052,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
ext4_xattr_update_super_block(handle, inode->i_sb);
inode->i_ctime = ext4_current_time(inode);
if (!value)
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
/*
* The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1067,7 +1067,7 @@ cleanup:
brelse(is.iloc.bh);
brelse(bs.bh);
if (no_expand == 0)
- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
up_write(&EXT4_I(inode)->xattr_sem);
return error;
}
--
1.6.6.1.1.g974db.dirty



2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 38/40] ext4: Conditionally define compat ioctl numbers

From: Ben Hutchings <[email protected]>

commit 899ad0cea6ad7ff4ba24b16318edbc3cbbe03fad upstream (as of v2.6.34-git13)

It is unnecessary, and in general impossible, to define the compat
ioctl numbers except when building the filesystem with CONFIG_COMPAT
defined.

Signed-off-by: Ben Hutchings <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1b4835f..ef12e95 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -468,6 +468,7 @@ struct ext4_new_group_data {
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)

+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
*/
@@ -483,6 +484,7 @@ struct ext4_new_group_data {
#endif
#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+#endif


/*
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 06/40] ext4: explicitly remove inode from orphan list after failed direct io

From: Dmitry Monakhov <[email protected]>

commit da1dafca84413145f5ac59998b4cdd06fb89f721 upstream (as of v2.6.33-git11)

Otherwise non-empty orphan list will be triggered on umount.

Signed-off-by: Dmitry Monakhov <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/inode.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 26a3d54..4e8759a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3437,6 +3437,9 @@ retry:
* but cannot extend i_size. Bail out and pretend
* the write failed... */
ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
goto out;
}
if (inode->i_nlink)
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 33/40] ext4: Show journal_checksum option

From: Jan Kara <[email protected]>

commit 39a4bade8c1826b658316d66ee81c09b0a4d7d42 upstream (as of v2.6.34-git13)

We failed to show journal_checksum option in /proc/mounts. Fix it.

Signed-off-by: Jan Kara <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/super.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 459da7c..9803da8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -890,6 +890,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
seq_puts(seq, ",journal_async_commit");
+ else if (test_opt(sb, JOURNAL_CHECKSUM))
+ seq_puts(seq, ",journal_checksum");
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
if (test_opt(sb, I_VERSION))
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 07/40] ext4: Handle non empty on-disk orphan link

From: Dmitry Monakhov <[email protected]>

commit 6e3617e579e070d3655a93ee9ed7149113e795e0 upstream (as of v2.6.33-git11)

In case of truncate errors we explicitly remove inode from in-core
orphan list via orphan_del(NULL, inode) without modifying the on-disk list.

But later on, the same inode may be inserted in the orphan list again
which will result the on-disk linked list getting corrupted. If inode
i_dtime contains valid value, then skip on-disk list modification.

Signed-off-by: Dmitry Monakhov <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/namei.c | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 8bd1e20..1dd84f6 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2020,6 +2020,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
goto out_unlock;
+ /*
+ * Due to previous errors inode may be already a part of on-disk
+ * orphan list. If so skip on-disk list modification.
+ */
+ if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+ goto mem_insert;

/* Insert this inode at the head of the on-disk orphan list... */
NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
@@ -2037,6 +2044,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
*
* This is safe: on error we're going to ignore the orphan list
* anyway on the next recovery. */
+mem_insert:
if (!err)
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);

--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 08/40] ext4: make "offset" consistent in ext4_check_dir_entry()

From: Toshiyuki Okajima <[email protected]>

commit b8b8afe236e97b6359d46d3a3f8c46455e192271 upstream (as of v2.6.33-git11)

The callers of ext4_check_dir_entry() usually pass in the "file
offset" (ext4_readdir, htree_dirblock_to_tree, search_dirblock,
ext4_dx_find_entry, empty_dir), but a few callers (add_dirent_to_buf,
ext4_delete_entry) only pass in the buffer offset.

To accomodate those last two (which would be hard to fix otherwise),
this patch changes ext4_check_dir_entry() to print the physical block
number and the relative offset as well as the passed-in offset.

Signed-off-by: Toshiyuki Okajima <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/dir.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 9dc9316..92ae84f 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -84,9 +84,11 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,

if (error_msg != NULL)
ext4_error(dir->i_sb, function,
- "bad entry in directory #%lu: %s - "
- "offset=%u, inode=%u, rec_len=%d, name_len=%d",
- dir->i_ino, error_msg, offset,
+ "bad entry in directory #%lu: %s - block=%llu"
+ "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
+ dir->i_ino, error_msg,
+ (unsigned long long) bh->b_blocknr,
+ (unsigned) (offset%bh->b_size), offset,
le32_to_cpu(de->inode),
rlen, de->name_len);
return error_msg == NULL ? 1 : 0;
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 36/40] ext4: Clear the EXT4_EOFBLOCKS_FL flag only when warranted

commit 786ec7915e530936b9eb2e3d12274145cab7aa7d upstream (as of v2.6.34-git13)

Dimitry Monakhov discovered an edge case where it was possible for the
EXT4_EOFBLOCKS_FL flag could get cleared unnecessarily. This is true;
I have a test case that can be exercised via downloading and
decompressing the file:

wget ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/ext4-testcases/eofblocks-fl-test-case.img.bz2
bunzip2 eofblocks-fl-test-case.img
dd if=/dev/zero of=eofblocks-fl-test-case.img bs=1k seek=17925 bs=1k count=1 conv=notrunc

However, triggering it in real life is highly unlikely since it
requires an extremely fragmented sparse file with a hole in exactly
the right place in the extent tree. (It actually took quite a bit of
work to generate this test case.) Still, it's nice to get even
extreme corner cases to be correct, so this patch makes sure that we
don't clear the EXT4_EOFBLOCKS_FL incorrectly even in this corner
case.

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/extents.c | 26 ++++++++++++++++++--------
1 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e2d9bfb..799416f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3223,7 +3223,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent_header *eh;
struct ext4_extent newex, *ex, *last_ex;
ext4_fsblk_t newblock;
- int err = 0, depth, ret, cache_type;
+ int i, err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3404,19 +3404,29 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
}

if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
- if (eh->eh_entries) {
- last_ex = EXT_LAST_EXTENT(eh);
- if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
- + ext4_ext_get_actual_len(last_ex))
- ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
- } else {
- WARN_ON(eh->eh_entries == 0);
+ if (unlikely(!eh->eh_entries)) {
ext4_error(inode->i_sb, __func__,
"inode#%lu, eh->eh_entries = 0 and "
"EOFBLOCKS_FL set", inode->i_ino);
err = -EIO;
goto out2;
}
+ last_ex = EXT_LAST_EXTENT(eh);
+ /*
+ * If the current leaf block was reached by looking at
+ * the last index block all the way down the tree, and
+ * we are extending the inode beyond the last extent
+ * in the current leaf block, then clear the
+ * EOFBLOCKS_FL flag.
+ */
+ for (i = depth-1; i >= 0; i--) {
+ if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+ break;
+ }
+ if ((i < 0) &&
+ (iblock + ar.len > le32_to_cpu(last_ex->ee_block) +
+ ext4_ext_get_actual_len(last_ex)))
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 04/40] ext4: Fix fencepost error in chosing choosing group vs file preallocation.

From: Tao Ma <[email protected]>

commit cc483f102c3f703e853c96f95a654f0106fb2603 upstream (as of v2.6.33-git11)

The ext4 multiblock allocator decides whether to use group or file
preallocation based on the file size. When the file size reaches
s_mb_stream_request (default is 16 blocks), it changes to use a
file-specific preallocation. This is cool, but it has a tiny problem.

See a simple script:
mkfs.ext4 -b 1024 /dev/sda8 1000000
mount -t ext4 -o nodelalloc /dev/sda8 /mnt/ext4
for((i=0;i<5;i++))
do
cat /mnt/4096>>/mnt/ext4/a #4096 is a file with 4096 characters.
cat /mnt/4096>>/mnt/ext4/b
done
debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1

And you get
BLOCKS:
(0-14):8705-8719, (15):2356, (16-19):8465-8468

So there are 3 extents, a bit strange for the lonely 15th logical
block. As we write to the 16 blocks, we choose file preallocation in
ext4_mb_group_or_file, but in ext4_mb_normalize_request, we meet with
the 16*1024 range, so no preallocation will be carried. file b then
reserves the space after '2356', so when when write 16, we start from
another part.

This patch just change the check in ext4_mb_group_or_file, so
that for the lonely 15 we will still use group preallocation.
After the patch, we will get:
debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1
BLOCKS:
(0-15):8705-8720, (16-19):8465-8468

Looks more sane. Thanks.

Signed-off-by: Tao Ma <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d34afad..301b173 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3938,7 +3938,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)

/* don't use group allocation for large files */
size = max(size, isize);
- if (size >= sbi->s_mb_stream_request) {
+ if (size > sbi->s_mb_stream_request) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
}
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 37/40] ext4: restart ext4_ext_remove_space() after transaction restart

From: Dmitry Monakhov <[email protected]>

commit 0617b83fa239db9743a18ce6cc0e556f4d0fd567 upstream (as of v2.6.34-git13)

If i_data_sem was internally dropped due to transaction restart, it is
necessary to restart path look-up because extents tree was possibly
modified by ext4_get_block().

https://bugzilla.kernel.org/show_bug.cgi?id=15827

Signed-off-by: Dmitry Monakhov <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
Acked-by: Jan Kara <[email protected]>
---
fs/ext4/extents.c | 16 +++++++++-------
1 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 799416f..1fca430 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
if (err <= 0)
return err;
err = ext4_truncate_restart_trans(handle, inode, needed);
- /*
- * We have dropped i_data_sem so someone might have cached again
- * an extent we are going to truncate.
- */
- ext4_ext_invalidate_cache(inode);
+ if (err == 0)
+ err = -EAGAIN;

return err;
}
@@ -2257,7 +2254,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
int depth = ext_depth(inode);
struct ext4_ext_path *path;
handle_t *handle;
- int i = 0, err = 0;
+ int i, err;

ext_debug("truncate since %u\n", start);

@@ -2266,23 +2263,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
if (IS_ERR(handle))
return PTR_ERR(handle);

+again:
ext4_ext_invalidate_cache(inode);

/*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
*/
+ depth = ext_depth(inode);
path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
if (path == NULL) {
ext4_journal_stop(handle);
return -ENOMEM;
}
+ path[0].p_depth = depth;
path[0].p_hdr = ext_inode_hdr(inode);
if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
err = -EIO;
goto out;
}
- path[0].p_depth = depth;
+ i = err = 0;

while (i >= 0 && err == 0) {
if (i == depth) {
@@ -2376,6 +2376,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
out:
ext4_ext_drop_refs(path);
kfree(path);
+ if (err == -EAGAIN)
+ goto again;
ext4_journal_stop(handle);

return err;
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 35/40] ext4: Avoid crashing on NULL ptr dereference on a filesystem error

commit f70f362b4a6fe47c239dbfb3efc0cc2c10e4f09c upstream (as of v2.6.34-git13)

If the EOFBLOCK_FL flag is set when it should not be and the inode is
zero length, then eh_entries is zero, and ex is NULL, so dereferencing
ex to print ex->ee_block causes a kernel OOPS in
ext4_ext_map_blocks().

On top of that, the error message which is printed isn't very helpful.
So we fix this by printing something more explanatory which doesn't
involve trying to print ex->ee_block.

Addresses-Google-Bug: #2655740

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/extents.c | 11 +++++++----
1 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5b1c104..e2d9bfb 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3275,8 +3275,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (path[depth].p_ext == NULL && depth != 0) {
ext4_error(inode->i_sb, __func__, "bad extent address "
- "inode: %lu, iblock: %d, depth: %d",
- inode->i_ino, iblock, depth);
+ "inode: %lu, iblock: %lu, depth: %d",
+ inode->i_ino, (unsigned long) iblock, depth);
err = -EIO;
goto out2;
}
@@ -3412,8 +3412,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
} else {
WARN_ON(eh->eh_entries == 0);
ext4_error(inode->i_sb, __func__,
- "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
- }
+ "inode#%lu, eh->eh_entries = 0 and "
+ "EOFBLOCKS_FL set", inode->i_ino);
+ err = -EIO;
+ goto out2;
+ }
}
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:37

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 25/40] ext4: stop issuing discards if not supported by device

From: Eric Sandeen <[email protected]>

commit a30eec2a8650a77f754e84b2e15f062fe652baa7 upstream (as of v2.6.34-git13)

Turn off issuance of discard requests if the device does
not support it - similar to the action we take for barriers.
This will save a little computation time if a non-discardable
device is mounted with -o discard, and also makes it obvious
that it's not doing what was asked at mount time ...

Signed-off-by: Eric Sandeen <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 8 +++++++-
1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e71593f..d0ddea0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2536,6 +2536,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
entry->count, entry->group, entry);

if (test_opt(sb, DISCARD)) {
+ int ret;
ext4_fsblk_t discard_block;

discard_block = entry->start_blk +
@@ -2543,7 +2544,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
trace_ext4_discard_blocks(sb,
(unsigned long long)discard_block,
entry->count);
- sb_issue_discard(sb, discard_block, entry->count);
+ ret = sb_issue_discard(sb, discard_block, entry->count);
+ if (ret == EOPNOTSUPP) {
+ ext4_warning(sb, __func__,
+ "discard not supported, disabling");
+ clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+ }
}

err = ext4_mb_load_buddy(sb, entry->group, &e4b);
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 02/40] ext4: Fix BUG_ON at fs/buffer.c:652 in no journal mode

From: Curt Wohlgemuth <[email protected]>

commit 73b50c1c92666d326b5fa2c945d46509f2f6d91f upstream (as of v2.6.33-git11)

Calls to ext4_handle_dirty_metadata should only pass in an inode
pointer for inode-specific metadata, and not for shared metadata
blocks such as inode table blocks, block group descriptors, the
superblock, etc.

The BUG_ON can get tripped when updating a special device (such as a
block device) that is opened (so that i_mapping is set in
fs/block_dev.c) and the file system is mounted in no journal mode.

Addresses-Google-Bug: #2404870

Signed-off-by: Curt Wohlgemuth <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4_jbd2.c | 2 +-
fs/ext4/ialloc.c | 2 +-
fs/ext4/inode.c | 6 +++---
fs/ext4/namei.c | 4 ++--
4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index b57e5c7..98033d9 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -125,7 +125,7 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
ext4_journal_abort_handle(where, __func__, bh,
handle, err);
} else {
- if (inode && bh)
+ if (inode)
mark_buffer_dirty_inode(bh, inode);
else
mark_buffer_dirty(bh);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 2fab5ad..179e45c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -904,7 +904,7 @@ repeat_in_this_group:
BUFFER_TRACE(inode_bitmap_bh,
"call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle,
- inode,
+ NULL,
inode_bitmap_bh);
if (err)
goto fail;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6a7694e..7128506 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5102,7 +5102,7 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
sb->s_dirt = 1;
ext4_handle_sync(handle);
- err = ext4_handle_dirty_metadata(handle, inode,
+ err = ext4_handle_dirty_metadata(handle, NULL,
EXT4_SB(sb)->s_sbh);
}
}
@@ -5131,7 +5131,7 @@ static int ext4_do_update_inode(handle_t *handle,
}

BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, inode, bh);
+ rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
@@ -5685,7 +5685,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
err = jbd2_journal_get_write_access(handle, iloc.bh);
if (!err)
err = ext4_handle_dirty_metadata(handle,
- inode,
+ NULL,
iloc.bh);
brelse(iloc.bh);
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 17a17e1..8bd1e20 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2024,7 +2024,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
/* Insert this inode at the head of the on-disk orphan list... */
NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
err = rc;
@@ -2096,7 +2096,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
if (err)
goto out_brelse;
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
+ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
} else {
struct ext4_iloc iloc2;
struct inode *i_prev =
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:38

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 24/40] ext4: don't return to userspace after freezing the fs with a mutex held

From: Eric Sandeen <[email protected]>

commit 6b0310fbf087ad6e9e3b8392adca97cd77184084 upstream (as of v2.6.34-git13)

ext4_freeze() used jbd2_journal_lock_updates() which takes
the j_barrier mutex, and then returns to userspace. The
kernel does not like this:

================================================
[ BUG: lock held when returning to user space! ]
------------------------------------------------
lvcreate/1075 is leaving the kernel with locks still held!
1 lock held by lvcreate/1075:
#0: (&journal->j_barrier){+.+...}, at: [<ffffffff811c6214>]
jbd2_journal_lock_updates+0xe1/0xf0

Use vfs_check_frozen() added to ext4_journal_start_sb() and
ext4_force_commit() instead.

Addresses-Red-Hat-Bugzilla: #568503

Signed-off-by: Eric Sandeen <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/super.c | 20 ++++++++++----------
1 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 735c20d..a6a8e5b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -227,6 +227,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);

+ vfs_check_frozen(sb, SB_FREEZE_WRITE);
/* Special case here: if the journal has aborted behind our
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly. */
@@ -3386,8 +3387,10 @@ int ext4_force_commit(struct super_block *sb)
return 0;

journal = EXT4_SB(sb)->s_journal;
- if (journal)
+ if (journal) {
+ vfs_check_frozen(sb, SB_FREEZE_WRITE);
ret = ext4_journal_force_commit(journal);
+ }

return ret;
}
@@ -3436,18 +3439,16 @@ static int ext4_freeze(struct super_block *sb)
* the journal.
*/
error = jbd2_journal_flush(journal);
- if (error < 0) {
- out:
- jbd2_journal_unlock_updates(journal);
- return error;
- }
+ if (error < 0)
+ goto out;

/* Journal blocked and flushed, clear needs_recovery flag. */
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
error = ext4_commit_super(sb, 1);
- if (error)
- goto out;
- return 0;
+out:
+ /* we rely on s_frozen to stop further updates */
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ return error;
}

/*
@@ -3464,7 +3465,6 @@ static int ext4_unfreeze(struct super_block *sb)
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
unlock_super(sb);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
return 0;
}

--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:46

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 11/40] ext4: Code cleanup for EXT4_IOC_MOVE_EXT ioctl

From: Akira Fujita <[email protected]>

commit c437b2733520599a2c6e0dbcdeae611319f84707 upstream (as of v2.6.33-git11)

a) Fix sparse warning in ext4_ioctl()
b) Remove unneeded variable in mext_leaf_block()
c) Fix spelling typo in mext_check_arguments()

Signed-off-by: Akira Fujita <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ioctl.c | 3 ++-
fs/ext4/move_extent.c | 4 +---
2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2220feb..016d024 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,8 @@ setversion_out:
if (me.moved_len > 0)
file_remove_suid(donor_filp);

- if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+ if (copy_to_user((struct move_extent __user *)arg,
+ &me, sizeof(me)))
err = -EFAULT;
mext_out:
fput(donor_filp);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 379c0c0..d7e9688 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -477,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
struct ext4_extent new_ext, start_ext, end_ext;
ext4_lblk_t new_ext_end;
- ext4_fsblk_t new_phys_end;
int oext_alen, new_ext_alen, end_ext_alen;
int depth = ext_depth(orig_inode);
int ret;
@@ -491,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
new_ext.ee_len = dext->ee_len;
new_ext_alen = ext4_ext_get_actual_len(&new_ext);
new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
- new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;

/*
* Case: original extent is first
@@ -932,7 +930,7 @@ out2:
}

/**
- * mext_check_argumants - Check whether move extent can be done
+ * mext_check_arguments - Check whether move extent can be done
*
* @orig_inode: original inode
* @donor_inode: donor inode
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:43

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 15/40] ext4: Fix buffer head leaks after calls to ext4_get_inode_loc()

From: Curt Wohlgemuth <[email protected]>

commit fd2dd9fbaf9e498ec63eef298921e36556f7214c upstream (as of v2.6.34-rc6)

Calls to ext4_get_inode_loc() returns with a reference to a buffer
head in iloc->bh. The callers of this function in ext4_write_inode()
when in no journal mode and in ext4_xattr_fiemap() don't release the
buffer head after using it.

Addresses-Google-Bug: #2548165

Signed-off-by: Curt Wohlgemuth <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/extents.c | 1 +
fs/ext4/inode.c | 1 +
2 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b38058a..3afdded 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3772,6 +3772,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
physical += offset;
length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
flags |= FIEMAP_EXTENT_DATA_INLINE;
+ brelse(iloc.bh);
} else { /* external block */
physical = EXT4_I(inode)->i_file_acl << blockbits;
length = inode->i_sb->s_blocksize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28152f8..3e06a5d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5217,6 +5217,7 @@ int ext4_write_inode(struct inode *inode, int wait)
(unsigned long long)iloc.bh->b_blocknr);
err = -EIO;
}
+ brelse(iloc.bh);
}
return err;
}
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:43

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 31/40] ext4: Prevent creation of files larger than RLIMIT_FSIZE using fallocate

From: Nikanth Karthikesan <[email protected]>

commit 6d19c42b7cf81c39632b6d4dbc514e8449bcd346 upstream (as of v2.6.34-git13)

Currently using posix_fallocate one can bypass an RLIMIT_FSIZE limit
and create a file larger than the limit. Add a check for that.

Signed-off-by: Nikanth Karthikesan <[email protected]>
Signed-off-by: Amit Arora <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/extents.c | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 39aab5c..2ff79e8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3601,6 +3601,11 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
*/
credits = ext4_chunk_trans_blocks(inode, max_blocks);
mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, (len + offset));
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
retry:
while (ret >= 0 && ret < max_blocks) {
block = block + ret;
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:41

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 22/40] ext4: fix quota accounting in case of fallocate

From: Dmitry Monakhov <[email protected]>

commit 35121c9860316d7799cea0fbc359a9186e7c2747 upstream (as of v2.6.34-git13)

allocated_meta_data is already included in 'used' variable.

Signed-off-by: Dmitry Monakhov <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/inode.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e06a5d..f01fe28 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1107,7 +1107,8 @@ void ext4_da_update_reserve_space(struct inode *inode,
*/
if (allocated_meta_blocks)
vfs_dq_claim_block(inode, allocated_meta_blocks);
- vfs_dq_release_reservation_block(inode, mdb_free + used);
+ vfs_dq_release_reservation_block(inode, mdb_free + used -
+ allocated_meta_blocks);
}

/*
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:35

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 28/40] ext4: clean up inode bitmaps manipulation in ext4_free_inode

From: Dmitry Monakhov <[email protected]>

commit d17413c08cd2b1dd2bf2cfdbb0f7b736b2b2b15c upstrea (as of v2..34-git13)

- Reorganize locking scheme to batch two atomic operation in to one.
This also allow us to state what healthy group must obey following rule
ext4_free_inodes_count(sb, gdp) == ext4_count_free(inode_bitmap, NUM);
- Fix possible undefined pointer dereference.
- Even if group descriptor stats aren't accessible we have to update
inode bitmaps.
- Move non-group members update out of group_lock.

Note: this commit has been observed to fix fs corruption problems
under heavy fs load

Signed-off-by: Dmitry Monakhov <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ialloc.c | 83 ++++++++++++++++++++++++-----------------------------
1 files changed, 38 insertions(+), 45 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8b4bb11..c55275a 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -244,57 +244,50 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (fatal)
goto error_return;

- /* Ok, now we can actually update the inode bitmaps.. */
- cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
- bit, bitmap_bh->b_data);
- if (!cleared)
- ext4_error(sb, "ext4_free_inode",
- "bit already cleared for inode %lu", ino);
- else {
- gdp = ext4_get_group_desc(sb, block_group, &bh2);
-
+ fatal = -ESRCH;
+ gdp = ext4_get_group_desc(sb, block_group, &bh2);
+ if (gdp) {
BUFFER_TRACE(bh2, "get_write_access");
fatal = ext4_journal_get_write_access(handle, bh2);
- if (fatal) goto error_return;
-
- if (gdp) {
- ext4_lock_group(sb, block_group);
- count = ext4_free_inodes_count(sb, gdp) + 1;
- ext4_free_inodes_set(sb, gdp, count);
- if (is_directory) {
- count = ext4_used_dirs_count(sb, gdp) - 1;
- ext4_used_dirs_set(sb, gdp, count);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f;
-
- f = ext4_flex_group(sbi, block_group);
- atomic_dec(&sbi->s_flex_groups[f].used_dirs);
- }
+ }
+ ext4_lock_group(sb, block_group);
+ cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+ if (fatal || !cleared) {
+ ext4_unlock_group(sb, block_group);
+ goto out;
+ }

- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi,
- block_group, gdp);
- ext4_unlock_group(sb, block_group);
- percpu_counter_inc(&sbi->s_freeinodes_counter);
- if (is_directory)
- percpu_counter_dec(&sbi->s_dirs_counter);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f;
-
- f = ext4_flex_group(sbi, block_group);
- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
- }
- }
- BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, bh2);
- if (!fatal) fatal = err;
+ count = ext4_free_inodes_count(sb, gdp) + 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (is_directory) {
+ count = ext4_used_dirs_count(sb, gdp) - 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ percpu_counter_dec(&sbi->s_dirs_counter);
}
- BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
- if (!fatal)
- fatal = err;
- sb->s_dirt = 1;
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ext4_unlock_group(sb, block_group);
+
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ if (is_directory)
+ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ }
+ BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+ fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+ if (cleared) {
+ BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (!fatal)
+ fatal = err;
+ sb->s_dirt = 1;
+ } else
+ ext4_error(sb, "ext4_free_inode",
+ "bit already cleared for inode %lu", ino);
+
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, fatal);
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:45

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 20/40] ext4: rename ext4_mb_release_desc() to ext4_mb_unload_buddy()

From: Jing Zhang <[email protected]>

commit e39e07fdfd98be8650385f12a7b81d6adc547510 upstream (as of v2.6.34-git13)

This function cleans up after ext4_mb_load_buddy(), so the renaming
makes the code clearer.

Signed-off-by: Jing Zhang <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 24 ++++++++++++------------
1 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8fdcfc7..e71593f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1150,7 +1150,7 @@ err:
return ret;
}

-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
@@ -1618,7 +1618,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
}

ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);

return 0;
}
@@ -1674,7 +1674,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);

return 0;
}
@@ -2044,7 +2044,7 @@ repeat:
if (!ext4_mb_good_group(ac, group, cr)) {
/* someone did allocation from this group */
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
continue;
}

@@ -2058,7 +2058,7 @@ repeat:
ext4_mb_complex_scan_group(ac, &e4b);

ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);

if (ac->ac_status != AC_STATUS_CONTINUE)
break;
@@ -2148,7 +2148,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
ext4_lock_group(sb, group);
memcpy(&sg, ext4_get_group_info(sb, group), i);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);

seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2568,7 +2568,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
}
ext4_unlock_group(sb, entry->group);
kmem_cache_free(ext4_free_ext_cachep, entry);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
}

mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -3705,7 +3705,7 @@ out:
ext4_unlock_group(sb, group);
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
return free;
}
@@ -3809,7 +3809,7 @@ repeat:
if (bitmap_bh == NULL) {
ext4_error(sb, __func__, "Error in reading block "
"bitmap for %u", group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
continue;
}

@@ -3818,7 +3818,7 @@ repeat:
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
ext4_unlock_group(sb, group);

- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);

list_del(&pa->u.pa_tmp_list);
@@ -4082,7 +4082,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
ext4_mb_release_group_pa(&e4b, pa, ac);
ext4_unlock_group(sb, group);

- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
@@ -4619,7 +4619,7 @@ do_more:
atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
}

- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);

freed += count;

--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:41

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 26/40] ext4: don't scan/accumulate more pages than mballoc will allocate

From: Eric Sandeen <[email protected]>

commit c445e3e0a5c2804524dec6e55f66d63f6bc5bc3e upstream (as of v2.6.34-git13)

There was a bug reported on RHEL5 that a 10G dd on a 12G box
had a very, very slow sync after that.

At issue was the loop in write_cache_pages scanning all the way
to the end of the 10G file, even though the subsequent call
to mpage_da_submit_io would only actually write a smallish amt; then
we went back to the write_cache_pages loop ... wasting tons of time
in calling __mpage_da_writepage for thousands of pages we would
just revisit (many times) later.

Upstream it's not such a big issue for sys_sync because we get
to the loop with a much smaller nr_to_write, which limits the loop.

However, talking with Aneesh he realized that fsync upstream still
gets here with a very large nr_to_write and we face the same problem.

This patch makes mpage_add_bh_to_extent stop the loop after we've
accumulated 2048 pages, by setting mpd->io_done = 1; which ultimately
causes the write_cache_pages loop to break.

Repeating the test with a dirty_ratio of 80 (to leave something for
fsync to do), I don't see huge IO performance gains, but the reduction
in cpu usage is striking: 80% usage with stock, and 2% with the
below patch. Instrumenting the loop in write_cache_pages clearly
shows that we are wasting time here.

Eventually we need to change mpage_da_map_pages() also submit its I/O
to the block layer, subsuming mpage_da_submit_io(), and then change it
call ext4_get_blocks() multiple times.

Signed-off-by: Eric Sandeen <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/inode.c | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f01fe28..3c16ab8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2319,6 +2319,15 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
sector_t next;
int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;

+ /*
+ * XXX Don't go larger than mballoc is willing to allocate
+ * This is a stopgap solution. We eventually need to fold
+ * mpage_da_submit_io() into this function and then call
+ * ext4_get_blocks() multiple times in a loop
+ */
+ if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+ goto flush_it;
+
/* check if thereserved journal credits might overflow */
if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
if (nrblocks >= EXT4_MAX_TRANS_DATA) {
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:37

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 32/40] ext4: check for a good block group before loading buddy pages

From: Curt Wohlgemuth <[email protected]>

commit 8a57d9d61a6e361c7bb159dda797672c1df1a691 upstream (as of v2.6.34-git13)

This adds a new field in ext4_group_info to cache the largest available
block range in a block group; and don't load the buddy pages until *after*
we've done a sanity check on the block group.

With large allocation requests (e.g., fallocate(), 8MiB) and relatively full
partitions, it's easy to have no block groups with a block extent large
enough to satisfy the input request length. This currently causes the loop
during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages
for EVERY block group. That can be a lot of pages. The patch below allows
us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we
have check again after we lock the block group).

Addresses-Google-Bug: #2578108
Addresses-Google-Bug: #2704453

Signed-off-by: Curt Wohlgemuth <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 1 +
fs/ext4/mballoc.c | 70 +++++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 81b11cf..bc4013b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1660,6 +1660,7 @@ struct ext4_group_info {
ext4_grpblk_t bb_first_free; /* first free block */
ext4_grpblk_t bb_free; /* total free blocks */
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d0ddea0..665e1c9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
}
}

+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+ int i;
+ int bits;
+
+ grp->bb_largest_free_order = -1; /* uninit */
+
+ bits = sb->s_blocksize_bits + 1;
+ for (i = bits; i >= 0; i--) {
+ if (grp->bb_counters[i] > 0) {
+ grp->bb_largest_free_order = i;
+ break;
+ }
+ }
+}
+
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
*/
grp->bb_free = free;
}
+ mb_set_largest_free_order(sb, grp);

clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));

@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
* contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
* So it can have information regarding groups_per_page which
* is blocks_per_page/2
+ *
+ * Locking note: This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
*/

static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -910,6 +935,11 @@ out:
return err;
}

+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{
@@ -1004,6 +1034,11 @@ err:
return ret;
}

+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
@@ -1300,6 +1335,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
buddy = buddy2;
} while (1);
}
+ mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}

@@ -1428,6 +1464,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
e4b->bd_info->bb_counters[ord]++;
e4b->bd_info->bb_counters[ord]++;
}
+ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);

mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
mb_check_buddy(e4b);
@@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
}

+/* This is now called BEFORE we load the buddy bitmap. */
static int ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
unsigned free, fragments;
- unsigned i, bits;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);

BUG_ON(cr < 0 || cr >= 4);
- BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ int ret = ext4_mb_init_group(ac->ac_sb, group);
+ if (ret)
+ return 0;
+ }

free = grp->bb_free;
fragments = grp->bb_fragments;
@@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
case 0:
BUG_ON(ac->ac_2order == 0);

+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
return 0;

- bits = ac->ac_sb->s_blocksize_bits + 1;
- for (i = ac->ac_2order; i <= bits; i++)
- if (grp->bb_counters[i] > 0)
- return 1;
- break;
+ return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
return 1;
@@ -2026,14 +2068,11 @@ repeat:
group = ac->ac_g_ex.fe_group;

for (i = 0; i < ngroups; group++, i++) {
- struct ext4_group_info *grp;
-
if (group == ngroups)
group = 0;

- /* quick check to skip empty groups */
- grp = ext4_get_group_info(sb, group);
- if (grp->bb_free == 0)
+ /* This now checks without needing the buddy page */
+ if (!ext4_mb_good_group(ac, group, cr))
continue;

err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2041,8 +2080,12 @@ repeat:
goto out;

ext4_lock_group(sb, group);
+
+ /*
+ * We need to check again after locking the
+ * block group
+ */
if (!ext4_mb_good_group(ac, group, cr)) {
- /* someone did allocation from this group */
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
continue;
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root.rb_node = NULL;
+ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */

#ifdef DOUBLE_CHECK
{
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:49

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 16/40] ext4: Issue the discard operation *before* releasing the blocks to be reused

commit b90f687018e6d6c77d981b09203780f7001407e5 upstream (as of v2.6.34-rc6)

Otherwise, we can end up having data corruption because the blocks
could get reused and then discarded!

https://bugzilla.kernel.org/show_bug.cgi?id=15579

Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 24 +++++++++++-------------
1 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 301b173..2e98130 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2537,6 +2537,17 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry);

+ if (test_opt(sb, DISCARD)) {
+ ext4_fsblk_t discard_block;
+
+ discard_block = entry->start_blk +
+ ext4_group_first_block_no(sb, entry->group);
+ trace_ext4_discard_blocks(sb,
+ (unsigned long long)discard_block,
+ entry->count);
+ sb_issue_discard(sb, discard_block, entry->count);
+ }
+
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
@@ -2558,19 +2569,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
page_cache_release(e4b.bd_bitmap_page);
}
ext4_unlock_group(sb, entry->group);
- if (test_opt(sb, DISCARD)) {
- ext4_fsblk_t discard_block;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;

2010-06-01 12:03:38

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 27/40] ext4: Do not zero out uninitialized extents beyond i_size

From: Dmitry Monakhov <[email protected]>

commit 21ca087a3891efab4d45488db8febee474d26c68 upstream (as of v2.6.34-git13)

The extents code will sometimes zero out blocks and mark them as
initialized instead of splitting an extent into several smaller ones.
This optimization however, causes problems if the extent is beyond
i_size because fsck will complain if there are uninitialized blocks
after i_size as this can not be distinguished from an inode that has
an incorrect i_size field.

https://bugzilla.kernel.org/show_bug.cgi?id=15742

Signed-off-by: Dmitry Monakhov <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/extents.c | 67 ++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 032c95b..39aab5c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2527,11 +2527,21 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_extent *ex2 = NULL;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
int ret = 0;
+ int may_zeroout;
+
+ ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)iblock, max_blocks);
+
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < iblock + max_blocks)
+ eof_block = iblock + max_blocks;

depth = ext_depth(inode);
eh = path[depth].p_hdr;
@@ -2540,16 +2550,23 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (iblock - ee_block);
newblock = iblock - ee_block + ext_pblock(ex);
+
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));

+ /*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully insde i_size or new_size.
+ */
+ may_zeroout = ee_block + ee_len <= eof_block;
+
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
+ if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2580,7 +2597,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
if (allocated > max_blocks) {
unsigned int newdepth;
/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
- if (allocated <= EXT4_EXT_ZERO_LEN) {
+ if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
/*
* iblock == ee_block is handled by the zerouout
* at the beginning.
@@ -2656,7 +2673,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2680,8 +2697,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* update the extent length after successful insert of the
* split extent
*/
- orig_ex.ee_len = cpu_to_le16(ee_len -
- ext4_ext_get_actual_len(ex3));
+ ee_len -= ext4_ext_get_actual_len(ex3);
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ may_zeroout = ee_block + ee_len <= eof_block;
+
depth = newdepth;
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode, iblock, path);
@@ -2705,7 +2724,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* otherwise give the extent a chance to merge to left
*/
if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
- iblock != ee_block) {
+ iblock != ee_block && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2774,7 +2793,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2834,14 +2853,21 @@ static int ext4_split_unwritten_extents(handle_t *handle,
struct ext4_extent *ex2 = NULL;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
+ int may_zeroout;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)iblock, max_blocks);
+
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < iblock + max_blocks)
+ eof_block = iblock + max_blocks;

- ext_debug("ext4_split_unwritten_extents: inode %lu,"
- "iblock %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)iblock, max_blocks);
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
@@ -2849,12 +2875,19 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (iblock - ee_block);
newblock = iblock - ee_block + ext_pblock(ex);
+
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));

/*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully insde i_size or new_size.
+ */
+ may_zeroout = ee_block + ee_len <= eof_block;
+
+ /*
* If the uninitialized extent begins at the same logical
* block where the write begins, and the write completely
* covers the extent, then we don't need to split it.
@@ -2888,7 +2921,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2912,8 +2945,10 @@ static int ext4_split_unwritten_extents(handle_t *handle,
* update the extent length after successful insert of the
* split extent
*/
- orig_ex.ee_len = cpu_to_le16(ee_len -
- ext4_ext_get_actual_len(ex3));
+ ee_len -= ext4_ext_get_actual_len(ex3);
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ may_zeroout = ee_block + ee_len <= eof_block;
+
depth = newdepth;
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode, iblock, path);
@@ -2959,7 +2994,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:35

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 29/40] ext4: init statistics after journal recovery

From: Dmitry Monakhov <[email protected]>

commit 84061e07c5fbbbf9dc8aef8fb750fc3a2dfc31f3 upstream (as of v2.6.34-git13)

Currently block/inode/dir counters initialized before journal was
recovered. In fact after journal recovery this info will probably
change. And freeblocks it critical for correct delalloc mode
accounting.

https://bugzilla.kernel.org/show_bug.cgi?id=15768

Signed-off-by: Dmitry Monakhov <[email protected]>
Acked-by: Jan Kara <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/super.c | 41 ++++++++++++++++++-----------------------
1 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a6a8e5b..11e9376 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2710,24 +2710,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);

- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
- if (!err) {
- err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext4_count_free_inodes(sb));
- }
- if (!err) {
- err = percpu_counter_init(&sbi->s_dirs_counter,
- ext4_count_dirs(sb));
- }
- if (!err) {
- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
- }
- if (err) {
- ext4_msg(sb, KERN_ERR, "insufficient memory");
- goto failed_mount3;
- }
-
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;

@@ -2827,7 +2809,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);

no_journal:
-
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
+ ext4_count_free_blocks(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "insufficient memory");
+ goto failed_mount_wq;
+ }
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -2960,6 +2955,10 @@ failed_mount_wq:
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
+ percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount3:
if (sbi->s_flex_groups) {
if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -2967,10 +2966,6 @@ failed_mount3:
else
kfree(sbi->s_flex_groups);
}
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:49

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 21/40] ext4: allow defrag (EXT4_IOC_MOVE_EXT) in 32bit compat mode

From: Christian Borntraeger <[email protected]>

commit b684b2ee9409f2890a8b3aea98525bbe5f84e276 upstream (as of v2.6.34-git13)

I have an x86_64 kernel with i386 userspace. e4defrag fails on the
EXT4_IOC_MOVE_EXT ioctl because it is not wired up for the compat
case. It seems that struct move_extent is compat save, only types
with fixed widths are used:
{
__u32 reserved; /* should be zero */
__u32 donor_fd; /* donor file descriptor */
__u64 orig_start; /* logical start offset in block for orig */
__u64 donor_start; /* logical start offset in block for donor */
__u64 len; /* block length to be moved */
__u64 moved_len; /* moved block length */
};

Lets just wire up EXT4_IOC_MOVE_EXT for the compat case.

Signed-off-by: Christian Borntraeger <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
Reviewed-by: Eric Sandeen <[email protected]>
CC: Akira Fujita <[email protected]>
---
fs/ext4/ioctl.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d024..66fa0b0 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -375,6 +375,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
break;
case EXT4_IOC_GROUP_ADD:
break;
+ case EXT4_IOC_MOVE_EXT:
+ break;
default:
return -ENOIOCTLCMD;
}
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:41

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 18/40] ext4: fix memory leaks in error path handling of ext4_ext_zeroout()

From: Jing Zhang <[email protected]>

commit b720303df7352d4a7a1f61e467e0a124913c0d41 upstream (as of v2.6.34-git13)

When EIO occurs after bio is submitted, there is no memory free
operation for bio, which results in memory leakage. And there is also
no check against bio_alloc() for bio.

Acked-by: Dave Kleikamp <[email protected]>
Signed-off-by: Jing Zhang <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/extents.c | 15 ++++++++-------
1 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3afdded..032c95b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2440,7 +2440,7 @@ static void bi_complete(struct bio *bio, int error)
/* FIXME!! we need to try to merge to left or right after zero-out */
static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
- int ret = -EIO;
+ int ret;
struct bio *bio;
int blkbits, blocksize;
sector_t ee_pblock;
@@ -2464,6 +2464,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
len = ee_len;

bio = bio_alloc(GFP_NOIO, len);
+ if (!bio)
+ return -ENOMEM;
+
bio->bi_sector = ee_pblock;
bio->bi_bdev = inode->i_sb->s_bdev;

@@ -2491,17 +2494,15 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
submit_bio(WRITE, bio);
wait_for_completion(&event);

- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
- ret = 0;
- else {
- ret = -EIO;
- break;
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ bio_put(bio);
+ return -EIO;
}
bio_put(bio);
ee_len -= done;
ee_pblock += done << (blkbits - 9);
}
- return ret;
+ return 0;
}

#define EXT4_EXT_ZERO_LEN 7
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:35

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 30/40] ext4: Remove extraneous newlines in ext4_msg() calls

From: Curt Wohlgemuth <[email protected]>

commit fbe845ddf368f77f86aa7500f8fd2690f54c66a8 upstream (as of v2.6.34-git13)

Addresses-Google-Bug: #2562325

Signed-off-by: Curt Wohlgemuth <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/inode.c | 6 +++---
fs/ext4/super.c | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3c16ab8..25950b1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2252,7 +2252,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
ext4_msg(mpd->inode->i_sb, KERN_CRIT,
"delayed block allocation failed for inode %lu at "
"logical offset %llu with max blocks %zd with "
- "error %d\n", mpd->inode->i_ino,
+ "error %d", mpd->inode->i_ino,
(unsigned long long) next,
mpd->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_CRIT "This should not happen!! "
@@ -2913,7 +2913,7 @@ retry:
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
- "%ld pages, ino %lu; err %d\n", __func__,
+ "%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
goto out_writepages;
}
@@ -2988,7 +2988,7 @@ retry:
if (pages_skipped != wbc->pages_skipped)
ext4_msg(inode->i_sb, KERN_CRIT,
"This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d\n",
+ "with nr_to_write = %ld ret = %d",
__func__, wbc->nr_to_write, ret);

/* Update index */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 11e9376..459da7c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2897,7 +2897,7 @@ no_journal:
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
- "zone (%d)\n", err);
+ "zone (%d)", err);
goto failed_mount4;
}

--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:38

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 23/40] ext4: check s_log_groups_per_flex in online resize code

From: Eric Sandeen <[email protected]>

commit 42007efd569f1cf3bfb9a61da60ef6c2179508ca upstream (as of v2.6.34-git13)

If groups_per_flex < 2, sbi->s_flex_groups[] doesn't get filled out,
and every other access to this first tests s_log_groups_per_flex;
same thing needs to happen in resize or we'll wander off into
a null pointer when doing an online resize of the file system.

Thanks to Christoph Biedl, who came up with the trivial testcase:

# truncate --size 128M fsfile
# mkfs.ext3 -F fsfile
# tune2fs -O extents,uninit_bg,dir_index,flex_bg,huge_file,dir_nlink,extra_isize fsfile
# e2fsck -yDf -C0 fsfile
# truncate --size 132M fsfile
# losetup /dev/loop0 fsfile
# mount /dev/loop0 mnt
# resize2fs -p /dev/loop0

https://bugzilla.kernel.org/show_bug.cgi?id=13549

Reported-by: Alessandro Polverini <[email protected]>
Test-case-by: Christoph Biedl <[email protected]>
Signed-off-by: Eric Sandeen <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/resize.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3b2c554..433ea27 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -930,7 +930,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb));

- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, input->group);
atomic_add(input->free_blocks_count,
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:49

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 14/40] ext4: Fix possible lost inode write in no journal mode

From: Curt Wohlgemuth <[email protected]>

commit 8b472d739b2ddd8ab7fb278874f696cd95b25a5e upstream (as of v2.6.34-rc6)

In the no-journal case, ext4_write_inode() will fetch the bh and call
sync_dirty_buffer() on it. However, if the bh has already been
written and the bh reclaimed for some other purpose, AND if the inode
is the only one in the inode table block in use, then
ext4_get_inode_loc() will not read the inode table block from disk,
but as an optimization, fill the block with zero's assuming that its
caller will copy in the on-disk version of the inode. This is not
done by ext4_write_inode(), so the contents of the inode can simply
get lost. The fix is to use __ext4_get_inode_loc() with in_mem set to
0, instead of ext4_get_inode_loc(). Long term the API needs to be
fixed so it's obvious why latter is not safe.

Addresses-Google-Bug: #2526446

Signed-off-by: Curt Wohlgemuth <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/inode.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ff04c74..28152f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5204,7 +5204,7 @@ int ext4_write_inode(struct inode *inode, int wait)
} else {
struct ext4_iloc iloc;

- err = ext4_get_inode_loc(inode, &iloc);
+ err = __ext4_get_inode_loc(inode, &iloc, 0);
if (err)
return err;
if (wait)
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:43

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 17/40] ext4: check missed return value in ext4_sync_file()

From: Dmitry Monakhov <[email protected]>

commit 0671e704658b9f26f85e78d51176daa861f955c7 upstream (as of v2.6.34-git13)

Signed-off-by: Dmitry Monakhov <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/fsync.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 98bd140..eb52837 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -101,7 +101,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
- jbd2_log_wait_commit(journal, commit_tid);
+ ret = jbd2_log_wait_commit(journal, commit_tid);
} else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret;
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:56

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 19/40] ext4: Remove unnecessary call to ext4_get_group_desc() in mballoc

From: Jing Zhang <[email protected]>

commit 62e823a2cba18509ee826d775270e8ef9071b5bc upstream (as of v2.6.34-git13)

Signed-off-by: Jing Zhang <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/mballoc.c | 2 --
1 files changed, 0 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 2e98130..8fdcfc7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2027,7 +2027,6 @@ repeat:

for (i = 0; i < ngroups; group++, i++) {
struct ext4_group_info *grp;
- struct ext4_group_desc *desc;

if (group == ngroups)
group = 0;
@@ -2050,7 +2049,6 @@ repeat:
}

ac->ac_groups_scanned++;
- desc = ext4_get_group_desc(sb, group, NULL);
if (cr == 0)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 &&
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 34/40] ext4: Use bitops to read/modify i_flags in struct ext4_inode_info

From: Dmitry Monakhov <[email protected]>

commit 12e9b892002d9af057655d35b44db8ee9243b0dc upstream (as of v2.6.34-git13)

At several places we modify EXT4_I(inode)->i_flags without holding
i_mutex (ext4_do_update_inode, ...). These modifications are racy and
we can lose updates to i_flags. So convert handling of i_flags to use
bitops which are atomic.

https://bugzilla.kernel.org/show_bug.cgi?id=15792

Signed-off-by: Dmitry Monakhov <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/dir.c | 4 +-
fs/ext4/ext4.h | 109 ++++++++++++++++++++++++++++++++++++++++++-------
fs/ext4/ext4_jbd2.h | 6 +-
fs/ext4/extents.c | 10 ++--
fs/ext4/file.c | 2 +-
fs/ext4/ialloc.c | 4 +-
fs/ext4/inode.c | 30 +++++++-------
fs/ext4/mballoc.c | 4 +-
fs/ext4/migrate.c | 2 +-
fs/ext4/move_extent.c | 4 +-
fs/ext4/namei.c | 10 ++--
fs/ext4/super.c | 1 +
fs/ext4/xattr.c | 4 +-
13 files changed, 135 insertions(+), 55 deletions(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 92ae84f..c14f136 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -111,7 +111,7 @@ static int ext4_readdir(struct file *filp,

if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
- ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+ ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1))) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
@@ -122,7 +122,7 @@ static int ext4_readdir(struct file *filp,
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
}
stored = 0;
offset = filp->f_pos & (sb->s_blocksize - 1);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bc4013b..1b4835f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -315,6 +315,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
return flags & EXT4_OTHER_FLMASK;
}

+/*
+ * Inode flags used for atomic set/get
+ */
+enum {
+ EXT4_INODE_SECRM = 0, /* Secure deletion */
+ EXT4_INODE_UNRM = 1, /* Undelete */
+ EXT4_INODE_COMPR = 2, /* Compress file */
+ EXT4_INODE_SYNC = 3, /* Synchronous updates */
+ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
+ EXT4_INODE_APPEND = 5, /* writes to file may only append */
+ EXT4_INODE_NODUMP = 6, /* do not dump file */
+ EXT4_INODE_NOATIME = 7, /* do not update atime */
+/* Reserved for compression usage... */
+ EXT4_INODE_DIRTY = 8,
+ EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
+ EXT4_INODE_NOCOMPR = 10, /* Don't compress */
+ EXT4_INODE_ECOMPR = 11, /* Compression error */
+/* End compression flags --- maybe not all used */
+ EXT4_INODE_INDEX = 12, /* hash-indexed directory */
+ EXT4_INODE_IMAGIC = 13, /* AFS directory */
+ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
+ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
+ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
+ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
+ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
+ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
+ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
+};
+
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
+ printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
+ EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, and we
+ * can't do a compile-time test for ENUM values, we use a run-time
+ * test to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
+ * out so it won't cost any extra space in the compiled kernel image.
+ * But it's important that these values are the same, since we are
+ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
+ * must be consistent with the values of FS_XXX_FL defined in
+ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
+ * ext4 filesystems, and of course the values defined in e2fsprogs.
+ *
+ * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
+ */
+static inline void ext4_check_flag_values(void)
+{
+ CHECK_FLAG_VALUE(SECRM);
+ CHECK_FLAG_VALUE(UNRM);
+ CHECK_FLAG_VALUE(COMPR);
+ CHECK_FLAG_VALUE(SYNC);
+ CHECK_FLAG_VALUE(IMMUTABLE);
+ CHECK_FLAG_VALUE(APPEND);
+ CHECK_FLAG_VALUE(NODUMP);
+ CHECK_FLAG_VALUE(NOATIME);
+ CHECK_FLAG_VALUE(DIRTY);
+ CHECK_FLAG_VALUE(COMPRBLK);
+ CHECK_FLAG_VALUE(NOCOMPR);
+ CHECK_FLAG_VALUE(ECOMPR);
+ CHECK_FLAG_VALUE(INDEX);
+ CHECK_FLAG_VALUE(IMAGIC);
+ CHECK_FLAG_VALUE(JOURNAL_DATA);
+ CHECK_FLAG_VALUE(NOTAIL);
+ CHECK_FLAG_VALUE(DIRSYNC);
+ CHECK_FLAG_VALUE(TOPDIR);
+ CHECK_FLAG_VALUE(HUGE_FILE);
+ CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(EA_INODE);
+ CHECK_FLAG_VALUE(EOFBLOCKS);
+ CHECK_FLAG_VALUE(RESERVED);
+}
+
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
__u32 group; /* Group number for this data */
@@ -609,9 +686,8 @@ struct ext4_ext_cache {
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
- __u32 i_flags;
- ext4_fsblk_t i_file_acl;
__u32 i_dtime;
+ ext4_fsblk_t i_file_acl;

/*
* i_block_group is the number of the block group which contains
@@ -622,6 +698,7 @@ struct ext4_inode_info {
*/
ext4_group_t i_block_group;
unsigned long i_state_flags; /* Dynamic state flags */
+ unsigned long i_flags;

ext4_lblk_t i_dir_start_lookup;
#ifdef CONFIG_EXT4_FS_XATTR
@@ -1055,20 +1132,22 @@ enum {
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
};

-static inline int ext4_test_inode_state(struct inode *inode, int bit)
-{
- return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+#define EXT4_INODE_BIT_FNS(name, field) \
+static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
+{ \
+ return test_bit(bit, &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
+{ \
+ set_bit(bit, &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{ \
+ clear_bit(bit, &EXT4_I(inode)->i_##field); \
}

-static inline void ext4_set_inode_state(struct inode *inode, int bit)
-{
- set_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
-
-static inline void ext4_clear_inode_state(struct inode *inode, int bit)
-{
- clear_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
+EXT4_INODE_BIT_FNS(flag, flags)
+EXT4_INODE_BIT_FNS(state, state_flags)
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
@@ -1253,7 +1332,7 @@ struct ext4_dir_entry_2 {

#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
EXT4_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
+ ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 05eca81..91e8748 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
return 1;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
return 1;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 1;
return 0;
}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return 0;
if (EXT4_JOURNAL(inode) == NULL)
return 1;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
return 1;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2ff79e8..5b1c104 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3403,12 +3403,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
}
}

- if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+ if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
if (eh->eh_entries) {
last_ex = EXT_LAST_EXTENT(eh);
if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+ ext4_ext_get_actual_len(last_ex))
- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
} else {
WARN_ON(eh->eh_entries == 0);
ext4_error(inode->i_sb, __func__,
@@ -3554,7 +3554,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
* can proceed even if the new size is the same as i_size.
*/
if (new_size > i_size_read(inode))
- EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}

}
@@ -3582,7 +3582,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
* currently supporting (pre)allocate mode for extent-based
* files _only_
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;

/* preallocation to directories is currently not supported */
@@ -3832,7 +3832,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int error = 0;

/* fallback to generic here if not in extents fmt */
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len,
ext4_get_block);

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f6071ce..2a60541 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -65,7 +65,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
* is smaller than s_maxbytes, which is for extent-mapped files.
*/

- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
size_t length = iov_length(iov, nr_segs);

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c55275a..55a93f5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -497,7 +497,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,

if (S_ISDIR(mode) &&
((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
+ (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
int best_ndir = inodes_per_group;
int ret = -1;

@@ -1044,7 +1044,7 @@ got:
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
/* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
- EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode);
}
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 25950b1..9dd4022 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -915,7 +915,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
int count = 0;
ext4_fsblk_t first_block = 0;

- J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+ J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
depth = ext4_block_to_path(inode, iblock, offsets,
&blocks_to_boundary);
@@ -1043,7 +1043,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
*/
static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return ext4_ext_calc_metadata_amount(inode, lblock);

return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1232,7 +1232,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* file system block.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
bh, 0);
} else {
@@ -1294,7 +1294,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
*/
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
bh, flags);
} else {
@@ -2329,7 +2329,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
goto flush_it;

/* check if thereserved journal credits might overflow */
- if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
if (nrblocks >= EXT4_MAX_TRANS_DATA) {
/*
* With non-extent format we are limited by the journal
@@ -2793,7 +2793,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* number of contiguous block. So we will limit
* number of contiguous block to a sane value
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
(max_blocks > EXT4_MAX_TRANS_DATA))
max_blocks = EXT4_MAX_TRANS_DATA;

@@ -3829,7 +3829,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;

- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);

return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4447,12 +4447,12 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;

- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);

if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ext4_ext_truncate(inode);
return;
}
@@ -5294,7 +5294,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}

if (attr->ia_valid & ATTR_SIZE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5307,7 +5307,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE &&
(attr->ia_size < inode->i_size ||
- (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
+ (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
handle_t *handle;

handle = ext4_journal_start(inode, 3);
@@ -5339,7 +5339,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
}
/* ext4_truncate will clear the flag */
- if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+ if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
ext4_truncate(inode);
}

@@ -5415,7 +5415,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,

static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
}
@@ -5750,9 +5750,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
*/

if (val)
- EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else
- EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
ext4_set_aops(inode);

jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 665e1c9..01ad04a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2008,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
/* non-extent files are limited to low blocks/groups */
- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
ngroups = sbi->s_blockfile_groups;

BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -3176,7 +3176,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
continue;

/* non-extent files can't have physical blocks past 2^32 */
- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
continue;

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 8b87bd0..f8ea28d 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -474,7 +474,7 @@ int ext4_ext_migrate(struct inode *inode)
*/
if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_INCOMPAT_EXTENTS) ||
- (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;

if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d7e9688..34b7a52 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -975,11 +975,11 @@ mext_check_arguments(struct inode *orig_inode,
}

/* Ext4 move extent supports only extent based file */
- if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
"based file [ino:orig %lu]\n", orig_inode->i_ino);
return -EOPNOTSUPP;
- } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: donor file is not extents "
"based file [ino:donor %lu]\n", donor_inode->i_ino);
return -EOPNOTSUPP;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1dd84f6..e99c4f1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -660,7 +660,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
start_hash, start_minor_hash));
dir = dir_file->f_path.dentry->d_inode;
- if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+ if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
hinfo.hash_version +=
@@ -805,7 +805,7 @@ static void ext4_update_dx_flag(struct inode *inode)
{
if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX))
- EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}

/*
@@ -1424,7 +1424,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
brelse(bh);
return retval;
}
- EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+ ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
data1 = bh2->b_data;

memcpy (data1, de, len);
@@ -1497,7 +1497,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
retval = ext4_dx_add_entry(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
return retval;
- EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
}
@@ -2292,7 +2292,7 @@ retry:
}
} else {
/* clear the extent format for fast symlink */
- EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
inode->i_op = &ext4_fast_symlink_inode_operations;
memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
inode->i_size = l-1;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9803da8..0e7b09b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4048,6 +4048,7 @@ static int __init init_ext4_fs(void)
{
int err;

+ ext4_check_flag_values();
err = init_ext4_system_zone();
if (err)
return err;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c619a7e..872617e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,7 +820,7 @@ inserted:
EXT4_I(inode)->i_block_group);

/* non-extent files can't have physical blocks past 2^32 */
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;

block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +828,7 @@ inserted:
if (error)
goto cleanup;

- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);

ea_idebug(inode, "creating block %d", block);
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 39/40] ext4: Fix compat EXT4_IOC_ADD_GROUP

From: Ben Hutchings <[email protected]>

commit 4d92dc0f00a775dc2e1267b0e00befb783902fe7 upstream (as of v2.6.34-git13)

struct ext4_new_group_input needs to be converted because u64 has
only 32-bit alignment on some 32-bit architectures, notably i386.

Signed-off-by: Ben Hutchings <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 16 ++++++++++++++++
fs/ext4/ioctl.c | 25 +++++++++++++++++++++++--
2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ef12e95..e231c63 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif

/*
* The fourth extended filesystem constants/structures
@@ -403,6 +406,18 @@ struct ext4_new_group_input {
__u16 unused;
};

+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+ u32 group;
+ compat_u64 block_bitmap;
+ compat_u64 inode_bitmap;
+ compat_u64 inode_table;
+ u32 blocks_count;
+ u16 reserved_blocks;
+ u16 unused;
+};
+#endif
+
/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
struct ext4_new_group_data {
__u32 group;
@@ -479,6 +494,7 @@ struct ext4_new_group_data {
#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
#ifdef CONFIG_JBD2_DEBUG
#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
#endif
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 66fa0b0..6ddec84 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -373,8 +373,29 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC32_SETRSVSZ:
cmd = EXT4_IOC_SETRSVSZ;
break;
- case EXT4_IOC_GROUP_ADD:
- break;
+ case EXT4_IOC32_GROUP_ADD: {
+ struct compat_ext4_new_group_input __user *uinput;
+ struct ext4_new_group_input input;
+ mm_segment_t old_fs;
+ int err;
+
+ uinput = compat_ptr(arg);
+ err = get_user(input.group, &uinput->group);
+ err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+ err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+ err |= get_user(input.inode_table, &uinput->inode_table);
+ err |= get_user(input.blocks_count, &uinput->blocks_count);
+ err |= get_user(input.reserved_blocks,
+ &uinput->reserved_blocks);
+ if (err)
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+ (unsigned long) &input);
+ set_fs(old_fs);
+ return err;
+ }
case EXT4_IOC_MOVE_EXT:
break;
default:
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:35

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 40/40] ext4: Make fsync sync new parent directories in no-journal mode

From: Frank Mayhar <[email protected]>

commit 14ece1028b3ed53ffec1b1213ffc6acaf79ad77c upstream (as of v2.6.34-git13)

Add a new ext4 state to tell us when a file has been newly created; use
that state in ext4_sync_file in no-journal mode to tell us when we need
to sync the parent directory as well as the inode and data itself. This
fixes a problem in which a panic or power failure may lose the entire
file even when using fsync, since the parent directory entry is lost.

Addresses-Google-Bug: #2480057

Signed-off-by: Frank Mayhar <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 1 +
fs/ext4/fsync.c | 31 +++++++++++++++++++++++++++++--
fs/ext4/namei.c | 2 ++
3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e231c63..2ec215f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1148,6 +1148,7 @@ enum {
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
+ EXT4_STATE_NEWENTRY, /* File just added to dir */
};

#define EXT4_INODE_BIT_FNS(name, field) \
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index eb52837..5240354 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
#include <trace/events/ext4.h>

/*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file. This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static void ext4_sync_parent(struct inode *inode)
+{
+ struct dentry *dentry = NULL;
+
+ while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+ dentry = list_entry(inode->i_dentry.next,
+ struct dentry, d_alias);
+ if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
+ break;
+ inode = dentry->d_parent->d_inode;
+ sync_mapping_buffers(inode->i_mapping);
+ }
+}
+
+/*
* akpm: A new design for ext4_sync_file().
*
* This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -67,8 +90,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (ret < 0)
return ret;

- if (!journal)
- return simple_fsync(file, dentry, datasync);
+ if (!journal) {
+ ret = simple_fsync(file, dentry, datasync);
+ if (!ret && !list_empty(&inode->i_dentry))
+ ext4_sync_parent(inode);
+ return ret;
+ }

/*
* data=writeback,ordered:
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e99c4f1..c3b6ad0 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1525,6 +1525,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
+ if (retval == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
return retval;
}

--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 09/40] ext4: Fix insertion point of extent in mext_insert_across_blocks()

From: Akira Fujita <[email protected]>

commit 5fd5249aa36fad98c9fd5edced352939e54f9324 upstream (as of v2.6.33-git11)

If the leaf node has 2 extent space or fewer and EXT4_IOC_MOVE_EXT
ioctl is called with the file offset where after the 2nd extent
covers, mext_insert_across_blocks() always tries to insert extent into
the first extent. As a result, the file gets corrupted because of
wrong extent order. The patch fixes this problem.

Signed-off-by: Akira Fujita <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/move_extent.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 82c415b..b2b2e67 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
}

o_start->ee_len = start_ext->ee_len;
+ eblock = le32_to_cpu(start_ext->ee_block);
new_flag = 1;

} else if (start_ext->ee_len && new_ext->ee_len &&
@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
* orig |------------------------------|
*/
o_start->ee_len = start_ext->ee_len;
+ eblock = le32_to_cpu(start_ext->ee_block);
new_flag = 1;

} else if (!start_ext->ee_len && new_ext->ee_len &&
@@ -502,6 +504,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
le32_to_cpu(oext->ee_block) + oext_alen) {
start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
le32_to_cpu(oext->ee_block));
+ start_ext.ee_block = oext->ee_block;
copy_extent_status(oext, &start_ext);
} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
prev_ext = oext - 1;
@@ -515,6 +518,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
start_ext.ee_len = cpu_to_le16(
ext4_ext_get_actual_len(prev_ext) +
new_ext_alen);
+ start_ext.ee_block = oext->ee_block;
copy_extent_status(prev_ext, &start_ext);
new_ext.ee_len = 0;
}
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 03/40] ext4: Add flag to files with blocks intentionally past EOF

From: Jiaying Zhang <[email protected]>

commit c8d46e41bc744c8fa0092112af3942fcd46c8b18 upstream (as of v2.6.33-git11)

fallocate() may potentially instantiate blocks past EOF, depending
on the flags used when it is called.

e2fsck currently has a test for blocks past i_size, and it
sometimes trips up - noticeably on xfstests 013 which runs fsstress.

This patch from Jiayang does fix it up - it (along with
e2fsprogs updates and other patches recently from Aneesh) has
survived many fsstress runs in a row.

Signed-off-by: Eric Sandeen <[email protected]>
Signed-off-by: Jiaying Zhang <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ext4.h | 6 ++++--
fs/ext4/extents.c | 22 +++++++++++++++++++++-
fs/ext4/inode.c | 9 ++++++++-
fs/ext4/ioctl.c | 9 +++++++++
4 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8e8783a..81b11cf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -284,10 +284,12 @@ struct flex_groups {
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
+#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */

-#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */

/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9015ecb..b38058a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3185,7 +3185,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
{
struct ext4_ext_path *path = NULL;
struct ext4_extent_header *eh;
- struct ext4_extent newex, *ex;
+ struct ext4_extent newex, *ex, *last_ex;
ext4_fsblk_t newblock;
int err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
@@ -3366,6 +3366,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
EXT4_STATE_DIO_UNWRITTEN);
}
}
+
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+ if (eh->eh_entries) {
+ last_ex = EXT_LAST_EXTENT(eh);
+ if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+ + ext4_ext_get_actual_len(last_ex))
+ EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+ } else {
+ WARN_ON(eh->eh_entries == 0);
+ ext4_error(inode->i_sb, __func__,
+ "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+ }
+ }
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
/* free data blocks we just allocated */
@@ -3499,6 +3512,13 @@ static void ext4_falloc_update_inode(struct inode *inode,
i_size_write(inode, new_size);
if (new_size > EXT4_I(inode)->i_disksize)
ext4_update_i_disksize(inode, new_size);
+ } else {
+ /*
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if (new_size > i_size_read(inode))
+ EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
}

}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7128506..26a3d54 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4434,6 +4434,8 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;

+ EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

@@ -5289,7 +5291,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}

if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+ attr->ia_valid & ATTR_SIZE &&
+ (attr->ia_size < inode->i_size ||
+ (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
handle_t *handle;

handle = ext4_journal_start(inode, 3);
@@ -5320,6 +5324,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
goto err_out;
}
}
+ /* ext4_truncate will clear the flag */
+ if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+ ext4_truncate(inode);
}

rc = inode_setattr(inode, attr);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index b63d193..2220feb 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags &= ~EXT4_EXTENTS_FL;
}

+ if (flags & EXT4_EOFBLOCKS_FL) {
+ /* we don't support adding EOFBLOCKS flag */
+ if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+ } else if (oldflags & EXT4_EOFBLOCKS_FL)
+ ext4_truncate(inode);
+
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 10/40] ext4: Fix the NULL reference in double_down_write_data_sem()

From: Akira Fujita <[email protected]>

commit 7247c0caa23d94a1cb6b307edba9dc45fb0798d4 upstream (as of v2.6.33-git11)

If EXT4_IOC_MOVE_EXT ioctl is called with NULL donor_fd, fget() in
ext4_ioctl() gets inappropriate file structure for donor; so we need
to do this check earlier, before calling double_down_write_data_sem().

Signed-off-by: Akira Fujita <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/move_extent.c | 16 ++++++++--------
1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b2b2e67..379c0c0 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -953,14 +953,6 @@ mext_check_arguments(struct inode *orig_inode,
unsigned int blkbits = orig_inode->i_blkbits;
unsigned int blocksize = 1 << blkbits;

- /* Regular file check */
- if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
- ext4_debug("ext4 move extent: The argument files should be "
- "regular file [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }

2010-06-01 12:04:05

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 13/40] ext4: Fixed inode allocator to correctly track a flex_bg's used_dirs

From: Eric Sandeen <[email protected]>

commit c4caae25187ff3f5e837c6f04eb1acc2723c72d3 upstream (as of v2.6.34-rc3)

When used_dirs was introduced for the flex_groups struct, it looks
like the accounting was not put into place properly, in some places
manipulating free_inodes rather than used_dirs.

Signed-off-by: Eric Sandeen <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/ialloc.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 179e45c..8b4bb11 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -268,7 +268,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
ext4_group_t f;

f = ext4_flex_group(sbi, block_group);
- atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
}

}
@@ -779,7 +779,7 @@ static int ext4_claim_inode(struct super_block *sb,
if (sbi->s_log_groups_per_flex) {
ext4_group_t f = ext4_flex_group(sbi, group);

- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ atomic_inc(&sbi->s_flex_groups[f].used_dirs);
}
}
gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
--
1.6.6.1.1.g974db.dirty


2010-06-01 12:04:05

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 12/40] ext4: Fix estimate of # of blocks needed to write indirect-mapped files

From: Jan Kara <[email protected]>

commit d330a5befb88875a9b3d2db62f9b74dadf660b13 upstream (as of v2.6.34-rc3)

http://bugzilla.kernel.org/show_bug.cgi?id=15420

Signed-off-by: Jan Kara <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/inode.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4e8759a..ff04c74 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1018,7 +1018,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
sector_t lblock)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+ sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
int blk_bits;

if (lblock < EXT4_NDIR_BLOCKS)
@@ -1033,7 +1033,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
}
ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
ei->i_da_metadata_calc_len = 1;
- blk_bits = roundup_pow_of_two(lblock + 1);
+ blk_bits = order_base_2(lblock);
return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}

--
1.6.6.1.1.g974db.dirty


2010-06-01 12:03:33

by Theodore Ts'o

[permalink] [raw]
Subject: [PATCH 2.6.33.y 05/40] ext4: fix error handling in migrate

From: Dmitry Monakhov <[email protected]>

commit f39490bcd1691d65dc33689222a12e1fc13dd824 upstream (as of v2.6.33-git11)

Set i_nlink to zero for temporary inode from very beginning.
otherwise we may fail to start new journal handle and this
inode will be unreferenced but with i_nlink == 1
Since we hold inode reference it can not be pruned.

Also add missed journal_start retval check.

Signed-off-by: Dmitry Monakhov <[email protected]>
Signed-off-by: "Theodore Ts'o" <[email protected]>
---
fs/ext4/migrate.c | 29 ++++++++++++++---------------
1 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 46a4101..8b87bd0 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -503,14 +503,10 @@ int ext4_ext_migrate(struct inode *inode)
}
i_size_write(tmp_inode, i_size_read(inode));
/*
- * We don't want the inode to be reclaimed
- * if we got interrupted in between. We have
- * this tmp inode carrying reference to the
- * data blocks of the original file. We set
- * the i_nlink to zero at the last stage after
- * switching the original file to extent format
+ * Set the i_nlink to zero so it will be deleted later
+ * when we drop inode reference.
*/
- tmp_inode->i_nlink = 1;
+ tmp_inode->i_nlink = 0;

ext4_ext_tree_init(handle, tmp_inode);
ext4_orphan_add(handle, tmp_inode);
@@ -537,6 +533,16 @@ int ext4_ext_migrate(struct inode *inode)
up_read((&EXT4_I(inode)->i_data_sem));

handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ /*
+ * It is impossible to update on-disk structures without
+ * a handle, so just rollback in-core changes and live other
+ * work to orphan_list_cleanup()
+ */
+ ext4_orphan_del(NULL, tmp_inode);
+ retval = PTR_ERR(handle);
+ goto out;
+ }

ei = EXT4_I(inode);
i_data = ei->i_data;
@@ -618,15 +624,8 @@ err_out:

/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
-
- /*
- * Set the i_nlink to zero so that
- * generic_drop_inode really deletes the
- * inode
- */
- tmp_inode->i_nlink = 0;

2010-07-28 23:30:14

by Greg KH

[permalink] [raw]
Subject: Re: [stable] [PATCH 2.6.33.y 01/40] ext4: Use bitops to read/modify EXT4_I(inode)->i_state

I only received patch 1/40 of this series, care to resend all of them?

And I don't see any series for the .34 kernel tree, was there one? If
so, care to resend it as well?

thanks,

greg k-h

2012-09-28 03:09:04

by Yongqiang Yang

[permalink] [raw]
Subject: Re: [PATCH 2.6.33.y 14/40] ext4: Fix possible lost inode write in no journal mode

Hi Curt,

In journal mode, IMHO the case you described also happens. Could you
shed light on journal mode case?

Thanks,
Yongqiang.

On Tue, Jun 1, 2010 at 8:03 PM, Theodore Ts'o <[email protected]> wrote:
> From: Curt Wohlgemuth <[email protected]>
>
> commit 8b472d739b2ddd8ab7fb278874f696cd95b25a5e upstream (as of v2.6.34-rc6)
>
> In the no-journal case, ext4_write_inode() will fetch the bh and call
> sync_dirty_buffer() on it. However, if the bh has already been
> written and the bh reclaimed for some other purpose, AND if the inode
> is the only one in the inode table block in use, then
> ext4_get_inode_loc() will not read the inode table block from disk,
> but as an optimization, fill the block with zero's assuming that its
> caller will copy in the on-disk version of the inode. This is not
> done by ext4_write_inode(), so the contents of the inode can simply
> get lost. The fix is to use __ext4_get_inode_loc() with in_mem set to
> 0, instead of ext4_get_inode_loc(). Long term the API needs to be
> fixed so it's obvious why latter is not safe.
>
> Addresses-Google-Bug: #2526446
>
> Signed-off-by: Curt Wohlgemuth <[email protected]>
> Signed-off-by: "Theodore Ts'o" <[email protected]>
> ---
> fs/ext4/inode.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ff04c74..28152f8 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5204,7 +5204,7 @@ int ext4_write_inode(struct inode *inode, int wait)
> } else {
> struct ext4_iloc iloc;
>
> - err = ext4_get_inode_loc(inode, &iloc);
> + err = __ext4_get_inode_loc(inode, &iloc, 0);
> if (err)
> return err;
> if (wait)
> --
> 1.6.6.1.1.g974db.dirty
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html



--
Best Wishes
Yongqiang Yang

2012-09-28 03:21:38

by Yongqiang Yang

[permalink] [raw]
Subject: Re: [PATCH 2.6.33.y 14/40] ext4: Fix possible lost inode write in no journal mode

I got it!

Thanks,
Yongqiang.
On Fri, Sep 28, 2012 at 11:08 AM, Yongqiang Yang <[email protected]> wrote:
> Hi Curt,
>
> In journal mode, IMHO the case you described also happens. Could you
> shed light on journal mode case?
>
> Thanks,
> Yongqiang.
>
> On Tue, Jun 1, 2010 at 8:03 PM, Theodore Ts'o <[email protected]> wrote:
>> From: Curt Wohlgemuth <[email protected]>
>>
>> commit 8b472d739b2ddd8ab7fb278874f696cd95b25a5e upstream (as of v2.6.34-rc6)
>>
>> In the no-journal case, ext4_write_inode() will fetch the bh and call
>> sync_dirty_buffer() on it. However, if the bh has already been
>> written and the bh reclaimed for some other purpose, AND if the inode
>> is the only one in the inode table block in use, then
>> ext4_get_inode_loc() will not read the inode table block from disk,
>> but as an optimization, fill the block with zero's assuming that its
>> caller will copy in the on-disk version of the inode. This is not
>> done by ext4_write_inode(), so the contents of the inode can simply
>> get lost. The fix is to use __ext4_get_inode_loc() with in_mem set to
>> 0, instead of ext4_get_inode_loc(). Long term the API needs to be
>> fixed so it's obvious why latter is not safe.
>>
>> Addresses-Google-Bug: #2526446
>>
>> Signed-off-by: Curt Wohlgemuth <[email protected]>
>> Signed-off-by: "Theodore Ts'o" <[email protected]>
>> ---
>> fs/ext4/inode.c | 2 +-
>> 1 files changed, 1 insertions(+), 1 deletions(-)
>>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index ff04c74..28152f8 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -5204,7 +5204,7 @@ int ext4_write_inode(struct inode *inode, int wait)
>> } else {
>> struct ext4_iloc iloc;
>>
>> - err = ext4_get_inode_loc(inode, &iloc);
>> + err = __ext4_get_inode_loc(inode, &iloc, 0);
>> if (err)
>> return err;
>> if (wait)
>> --
>> 1.6.6.1.1.g974db.dirty
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
>
> --
> Best Wishes
> Yongqiang Yang



--
Best Wishes
Yongqiang Yang