2007-07-01 07:38:03

by Mingming Cao

[permalink] [raw]
Subject: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

This patch is on top of the nanosecond timestamp and i_version_hi
patches.

We need to make sure that existing filesystems can also avail the new
fields that have been added to the inode. We use s_want_extra_isize and
s_min_extra_isize to decide by how much we should expand the inode. If
EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature is set then we expand by
max(s_want_extra_isize, s_min_extra_isize , sizeof(ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE) bytes. Actually it is still an open question
about whether users should be able to set s_*_extra_isize smaller than
the known fields or not.

This patch also adds the functionality to expand inodes to include the
newly added fields. We start by trying to expand by s_want_extra_isize
bytes and if its fails we try to expand by s_min_extra_isize bytes. This
is done by changing the i_extra_isize if enough space is available in
the inode and no EAs are present. If EAs are present and there is enough
space in the inode then the EAs in the inode are shifted to make space.
If enough space is not available in the inode due to the EAs then 1 or
more EAs are shifted to the external EA block. In the worst case when
even the external EA block does not have enough space we inform the user
that some EA would need to be deleted or s_min_extra_isize would have to
be reduced.

This would be online expansion of inodes. I am also working on adding an
"expand_inodes" option to e2fsck which will expand all the used inodes.

Signed-off-by: Andreas Dilger <[email protected]>
Signed-off-by: Kalpak Shah <[email protected]>
Signed-off-by: Mingming Cao <[email protected]>

Index: linux-2.6.22-rc4/fs/ext4/inode.c
===================================================================
--- linux-2.6.22-rc4.orig/fs/ext4/inode.c 2007-06-14 17:32:27.000000000 -0700
+++ linux-2.6.22-rc4/fs/ext4/inode.c 2007-06-14 17:32:41.000000000 -0700
@@ -3120,6 +3120,40 @@ ext4_reserve_inode_write(handle_t *handl
}

/*
+ * Expand an inode by new_extra_isize bytes.
+ * Returns 0 on success or negative error number on failure.
+ */
+int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize,
+ struct ext4_iloc iloc, handle_t *handle)
+{
+ struct ext4_inode *raw_inode;
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_entry *entry;
+
+ if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
+ return 0;
+ }
+
+ raw_inode = ext4_raw_inode(&iloc);
+
+ header = IHDR(inode, raw_inode);
+ entry = IFIRST(header);
+
+ /* No extended attributes present */
+ if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
+ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+ new_extra_isize);
+ EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ return 0;
+ }
+
+ /* try to expand with EA present */
+ return ext4_expand_extra_isize_ea(inode, new_extra_isize,
+ raw_inode, handle);
+}
+
+/*
* What we do here is to mark the in-core inode as clean with respect to inode
* dirtiness (it may still be data-dirty).
* This means that the in-core inode may be reaped by prune_icache
@@ -3143,10 +3177,33 @@ ext4_reserve_inode_write(handle_t *handl
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
{
struct ext4_iloc iloc;
- int err;
+ int err, ret;
+ static int expand_message;

might_sleep();
err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (EXT4_I(inode)->i_extra_isize <
+ EXT4_SB(inode->i_sb)->s_want_extra_isize &&
+ !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+ /* We need extra buffer credits since we may write into EA block
+ * with this same handle */
+ if ((jbd2_journal_extend(handle,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
+ ret = ext4_expand_extra_isize(inode,
+ EXT4_SB(inode->i_sb)->s_want_extra_isize,
+ iloc, handle);
+ if (ret) {
+ EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ if (!expand_message) {
+ ext4_warning(inode->i_sb, __FUNCTION__,
+ "Unable to expand inode %lu. Delete"
+ " some EAs or run e2fsck.",
+ inode->i_ino);
+ expand_message = 1;
+ }
+ }
+ }
+ }
if (!err)
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
return err;
Index: linux-2.6.22-rc4/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.22-rc4.orig/include/linux/ext4_fs.h 2007-06-14 17:32:27.000000000 -0700
+++ linux-2.6.22-rc4/include/linux/ext4_fs.h 2007-06-14 17:32:41.000000000 -0700
@@ -202,6 +202,7 @@ struct ext4_group_desc
#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
+#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */

/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
Index: linux-2.6.22-rc4/fs/ext4/xattr.c
===================================================================
--- linux-2.6.22-rc4.orig/fs/ext4/xattr.c 2007-06-14 17:30:47.000000000 -0700
+++ linux-2.6.22-rc4/fs/ext4/xattr.c 2007-06-14 17:32:41.000000000 -0700
@@ -66,13 +66,6 @@
#define BFIRST(bh) ENTRY(BHDR(bh)+1)
#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)

-#define IHDR(inode, raw_inode) \
- ((struct ext4_xattr_ibody_header *) \
- ((void *)raw_inode + \
- EXT4_GOOD_OLD_INODE_SIZE + \
- EXT4_I(inode)->i_extra_isize))
-#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
-
#ifdef EXT4_XATTR_DEBUG
# define ea_idebug(inode, f...) do { \
printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -508,6 +501,20 @@ out:
return;
}

+static inline size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
+ size_t *min_offs, void *base, int *total)
+{
+ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+ *total += EXT4_XATTR_LEN(last->e_name_len);
+ if (!last->e_value_block && last->e_value_size) {
+ size_t offs = le16_to_cpu(last->e_value_offs);
+ if (offs < *min_offs)
+ *min_offs = offs;
+ }
+ }
+ return (*min_offs - ((void *)last - base) - sizeof(__u32));
+}
+
struct ext4_xattr_info {
int name_index;
const char *name;
@@ -606,6 +613,7 @@ ext4_xattr_set_entry(struct ext4_xattr_i
memmove(s->here, (void *)s->here + size,
(void *)last - (void *)s->here + sizeof(__u32));
memset(last, 0, size);
+
}
}

@@ -1014,6 +1022,8 @@ ext4_xattr_set_handle(handle_t *handle,
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
inode->i_ctime = ext4_current_time(inode);
+ if (!value)
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
/*
* The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1066,6 +1076,239 @@ retry:
return error;
}

+static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
+ int value_offs_shift, void *to,
+ void *from, size_t n, int blocksize)
+{
+ struct ext4_xattr_entry *last = entry;
+ int new_offs;
+
+ /* Adjust the value offsets of the entries */
+ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+ if (!last->e_value_block && last->e_value_size) {
+ new_offs = le16_to_cpu(last->e_value_offs) +
+ value_offs_shift;
+ BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
+ > blocksize);
+ last->e_value_offs = cpu_to_le16(new_offs);
+ }
+ }
+ /* Shift the entries by n bytes */
+ memmove(to, from, n);
+}
+
+/*
+ * Expand an inode by new_extra_isize bytes when EA presents.
+ * Returns 0 on success or negative error number on failure.
+ *
+ */
+int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ struct ext4_inode *raw_inode, handle_t *handle)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_entry *entry, *last, *first;
+ struct buffer_head *bh = NULL;
+ struct ext4_xattr_ibody_find *is = NULL;
+ struct ext4_xattr_block_find *bs = NULL;
+ char *buffer = NULL, *b_entry_name = NULL;
+ size_t min_offs, free;
+ int total_ino, total_blk;
+ void *base, *start, *end;
+ int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
+ int s_min_extra_isize = EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+retry:
+ if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
+ up_write(&EXT4_I(inode)->xattr_sem);
+ return 0;
+ }
+
+ header = IHDR(inode, raw_inode);
+ entry = IFIRST(header);
+
+ /*
+ * Check if enough free space is available in the inode to shift the
+ * entries ahead by new_extra_isize.
+ */
+
+ base = start = entry;
+ end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ min_offs = end - base;
+ last = entry;
+ total_ino = sizeof(struct ext4_xattr_ibody_header);
+
+ free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
+ if (free >= new_extra_isize) {
+ entry = IFIRST(header);
+ ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
+ - new_extra_isize, (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
+ (void *)header, total_ino,
+ inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ error = 0;
+ goto cleanup;
+ }
+
+ /*
+ * Enough free space isn't available in the inode, check if
+ * EA block can hold new_extra_isize bytes.
+ */
+ if (EXT4_I(inode)->i_file_acl) {
+ bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+ error = -EIO;
+ if (!bh)
+ goto cleanup;
+ if (ext4_xattr_check_block(bh)) {
+ ext4_error(inode->i_sb, __FUNCTION__,
+ "inode %lu: bad block %llu", inode->i_ino,
+ EXT4_I(inode)->i_file_acl);
+ error = -EIO;
+ goto cleanup;
+ }
+ base = BHDR(bh);
+ first = BFIRST(bh);
+ end = bh->b_data + bh->b_size;
+ min_offs = end - base;
+ free = ext4_xattr_free_space(first, &min_offs, base,
+ &total_blk);
+ if (free < new_extra_isize) {
+ if (!tried_min_extra_isize && s_min_extra_isize) {
+ tried_min_extra_isize++;
+ new_extra_isize = s_min_extra_isize;
+ goto retry;
+ }
+ error = -1;
+ goto cleanup;
+ }
+ } else {
+ free = inode->i_sb->s_blocksize;
+ }
+
+ while (new_extra_isize > 0) {
+ size_t offs, size, entry_size;
+ struct ext4_xattr_entry *small_entry = NULL;
+ struct ext4_xattr_info i = {
+ .value = NULL,
+ .value_len = 0,
+ };
+ unsigned int total_size, shift_bytes, temp = ~0U;
+
+ is = (struct ext4_xattr_ibody_find *) kmalloc(sizeof(struct
+ ext4_xattr_ibody_find), GFP_KERNEL);
+ bs = (struct ext4_xattr_block_find *) kmalloc(sizeof(struct
+ ext4_xattr_block_find), GFP_KERNEL);
+ memset((void *)is, 0, sizeof(struct ext4_xattr_ibody_find));
+ memset((void *)bs, 0, sizeof(struct ext4_xattr_block_find));
+
+ is->s.not_found = bs->s.not_found = -ENODATA;
+ is->iloc.bh = NULL;
+ bs->bh = NULL;
+
+ last = IFIRST(header);
+ /* Find the entry best suited to be pushed into EA block */
+ entry = NULL;
+ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+ total_size =
+ EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
+ EXT4_XATTR_LEN(last->e_name_len);
+ if (total_size <= free && total_size < temp) {
+ if (total_size < new_extra_isize) {
+ small_entry = last;
+ } else {
+ entry = last;
+ temp = total_size;
+ }
+ }
+ }
+
+ if (entry == NULL) {
+ if (small_entry) {
+ entry = small_entry;
+ } else {
+ if (!tried_min_extra_isize &&
+ s_min_extra_isize) {
+ tried_min_extra_isize++;
+ new_extra_isize = s_min_extra_isize;
+ goto retry;
+ }
+ error = -1;
+ goto cleanup;
+ }
+ }
+ offs = le16_to_cpu(entry->e_value_offs);
+ size = le32_to_cpu(entry->e_value_size);
+ entry_size = EXT4_XATTR_LEN(entry->e_name_len);
+ i.name_index = entry->e_name_index,
+ buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_KERNEL);
+ b_entry_name = kmalloc(entry->e_name_len + 1, GFP_KERNEL);
+ /* Save the entry name and the entry value */
+ memcpy((void *)buffer, (void *)IFIRST(header) + offs,
+ EXT4_XATTR_SIZE(size));
+ memcpy((void *)b_entry_name, (void *)entry->e_name,
+ entry->e_name_len);
+ b_entry_name[entry->e_name_len] = '\0';
+ i.name = b_entry_name;
+
+ error = ext4_get_inode_loc(inode, &is->iloc);
+ if (error)
+ goto cleanup;
+
+ error = ext4_xattr_ibody_find(inode, &i, is);
+ if (error)
+ goto cleanup;
+
+ /* Remove the chosen entry from the inode */
+ error = ext4_xattr_ibody_set(handle, inode, &i, is);
+
+ entry = IFIRST(header);
+ if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
+ shift_bytes = new_extra_isize;
+ else
+ shift_bytes = entry_size + size;
+ /* Adjust the offsets and shift the remaining entries ahead */
+ ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize -
+ shift_bytes, (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
+ (void *)header, total_ino - entry_size,
+ inode->i_sb->s_blocksize);
+
+ extra_isize += shift_bytes;
+ new_extra_isize -= shift_bytes;
+ EXT4_I(inode)->i_extra_isize = extra_isize;
+
+ i.name = b_entry_name;
+ i.value = buffer;
+ i.value_len = cpu_to_le32(size);
+ error = ext4_xattr_block_find(inode, &i, bs);
+ if (error)
+ goto cleanup;
+
+ /* Add entry which was removed from the inode into the block */
+ error = ext4_xattr_block_set(handle, inode, &i, bs);
+ if (error)
+ goto cleanup;
+ }
+
+cleanup:
+ if (b_entry_name)
+ kfree(b_entry_name);
+ if (buffer)
+ kfree(buffer);
+ if (is) {
+ brelse(is->iloc.bh);
+ kfree(is);
+ }
+ if (bs)
+ kfree(bs);
+ brelse(bh);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ return error;
+}
+
+
+
/*
* ext4_xattr_delete_inode()
*
Index: linux-2.6.22-rc4/fs/ext4/xattr.h
===================================================================
--- linux-2.6.22-rc4.orig/fs/ext4/xattr.h 2007-06-14 17:30:37.000000000 -0700
+++ linux-2.6.22-rc4/fs/ext4/xattr.h 2007-06-14 17:32:41.000000000 -0700
@@ -56,6 +56,13 @@ struct ext4_xattr_entry {
#define EXT4_XATTR_SIZE(size) \
(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)

+#define IHDR(inode, raw_inode) \
+ ((struct ext4_xattr_ibody_header *) \
+ ((void *)raw_inode + \
+ EXT4_GOOD_OLD_INODE_SIZE + \
+ EXT4_I(inode)->i_extra_isize))
+#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
+
# ifdef CONFIG_EXT4DEV_FS_XATTR

extern struct xattr_handler ext4_xattr_user_handler;
@@ -74,6 +81,9 @@ extern int ext4_xattr_set_handle(handle_
extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
extern void ext4_xattr_put_super(struct super_block *);

+extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ struct ext4_inode *raw_inode, handle_t *handle);
+
extern int init_ext4_xattr(void);
extern void exit_ext4_xattr(void);

@@ -129,6 +139,13 @@ exit_ext4_xattr(void)
{
}

+static inline int
+ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ struct ext4_inode *raw_inode, handle_t *handle)
+{
+ return -EOPNOTSUPP;
+}
+
#define ext4_xattr_handlers NULL

# endif /* CONFIG_EXT4DEV_FS_XATTR */


2007-07-10 23:32:47

by Andrew Morton

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Sun, 01 Jul 2007 03:38:01 -0400
Mingming Cao <[email protected]> wrote:

> This patch is on top of the nanosecond timestamp and i_version_hi
> patches.

This sort of information isn't needed (or desired) when this patch hits the
git tree. Please ensure that things like this are cleaned up before the
patches go to Linus.

> We need to make sure that existing filesystems can also avail the new
> fields that have been added to the inode. We use s_want_extra_isize and
> s_min_extra_isize to decide by how much we should expand the inode. If
> EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE feature is set then we expand by
> max(s_want_extra_isize, s_min_extra_isize , sizeof(ext4_inode) -
> EXT4_GOOD_OLD_INODE_SIZE) bytes. Actually it is still an open question
> about whether users should be able to set s_*_extra_isize smaller than
> the known fields or not.
>
> This patch also adds the functionality to expand inodes to include the
> newly added fields. We start by trying to expand by s_want_extra_isize
> bytes and if its fails we try to expand by s_min_extra_isize bytes. This
> is done by changing the i_extra_isize if enough space is available in
> the inode and no EAs are present. If EAs are present and there is enough
> space in the inode then the EAs in the inode are shifted to make space.
> If enough space is not available in the inode due to the EAs then 1 or
> more EAs are shifted to the external EA block. In the worst case when
> even the external EA block does not have enough space we inform the user
> that some EA would need to be deleted or s_min_extra_isize would have to
> be reduced.
>
> This would be online expansion of inodes. I am also working on adding an
> "expand_inodes" option to e2fsck which will expand all the used inodes.
>
> Signed-off-by: Andreas Dilger <[email protected]>
> Signed-off-by: Kalpak Shah <[email protected]>
> Signed-off-by: Mingming Cao <[email protected]>
>
> Index: linux-2.6.22-rc4/fs/ext4/inode.c
> ===================================================================
> --- linux-2.6.22-rc4.orig/fs/ext4/inode.c 2007-06-14 17:32:27.000000000 -0700
> +++ linux-2.6.22-rc4/fs/ext4/inode.c 2007-06-14 17:32:41.000000000 -0700
> @@ -3120,6 +3120,40 @@ ext4_reserve_inode_write(handle_t *handl
> }
>
> /*
> + * Expand an inode by new_extra_isize bytes.
> + * Returns 0 on success or negative error number on failure.
> + */
> +int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize,
> + struct ext4_iloc iloc, handle_t *handle)
> +{
> + struct ext4_inode *raw_inode;
> + struct ext4_xattr_ibody_header *header;
> + struct ext4_xattr_entry *entry;
> +
> + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
> + return 0;
> + }

This patch has lots of unneeded braces in it. checkpatch appears to catch
them.


> + raw_inode = ext4_raw_inode(&iloc);
> +
> + header = IHDR(inode, raw_inode);
> + entry = IFIRST(header);
> +
> + /* No extended attributes present */
> + if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
> + header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
> + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
> + new_extra_isize);
> + EXT4_I(inode)->i_extra_isize = new_extra_isize;
> + return 0;
> + }
> +
> + /* try to expand with EA present */
> + return ext4_expand_extra_isize_ea(inode, new_extra_isize,
> + raw_inode, handle);
> +}
> +
> +/*
> * What we do here is to mark the in-core inode as clean with respect to inode
> * dirtiness (it may still be data-dirty).
> * This means that the in-core inode may be reaped by prune_icache
> @@ -3143,10 +3177,33 @@ ext4_reserve_inode_write(handle_t *handl
> int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
> {
> struct ext4_iloc iloc;
> - int err;
> + int err, ret;
> + static int expand_message;
>
> might_sleep();
> err = ext4_reserve_inode_write(handle, inode, &iloc);
> + if (EXT4_I(inode)->i_extra_isize <
> + EXT4_SB(inode->i_sb)->s_want_extra_isize &&
> + !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
> + /* We need extra buffer credits since we may write into EA block
> + * with this same handle */
> + if ((jbd2_journal_extend(handle,
> + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
> + ret = ext4_expand_extra_isize(inode,
> + EXT4_SB(inode->i_sb)->s_want_extra_isize,
> + iloc, handle);
> + if (ret) {
> + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
> + if (!expand_message) {
> + ext4_warning(inode->i_sb, __FUNCTION__,
> + "Unable to expand inode %lu. Delete"
> + " some EAs or run e2fsck.",
> + inode->i_ino);
> + expand_message = 1;
> + }
> + }
> + }
> + }

Maybe that message should come out once per mount rather than once per boot
(or once per modprobe)?

What are the consequences of a jbd2_journal_extend() failure in here?

> +static inline size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
> + size_t *min_offs, void *base, int *total)
> +{
> + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> + *total += EXT4_XATTR_LEN(last->e_name_len);
> + if (!last->e_value_block && last->e_value_size) {
> + size_t offs = le16_to_cpu(last->e_value_offs);
> + if (offs < *min_offs)
> + *min_offs = offs;
> + }
> + }
> + return (*min_offs - ((void *)last - base) - sizeof(__u32));
> +}

This is far too large to be inlined.

This function needs a comment explaining what it does. Most functions do..

> struct ext4_xattr_info {
> int name_index;
> const char *name;
> @@ -606,6 +613,7 @@ ext4_xattr_set_entry(struct ext4_xattr_i
> memmove(s->here, (void *)s->here + size,
> (void *)last - (void *)s->here + sizeof(__u32));
> memset(last, 0, size);
> +
> }
> }

Unneeded newline.

> @@ -1014,6 +1022,8 @@ ext4_xattr_set_handle(handle_t *handle,
> if (!error) {
> ext4_xattr_update_super_block(handle, inode->i_sb);
> inode->i_ctime = ext4_current_time(inode);
> + if (!value)
> + EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;

What is the locking for i_state?

> error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
> /*
> * The bh is consumed by ext4_mark_iloc_dirty, even with
> @@ -1066,6 +1076,239 @@ retry:
> return error;
> }
>
> +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
> + int value_offs_shift, void *to,
> + void *from, size_t n, int blocksize)
> +{
> + struct ext4_xattr_entry *last = entry;
> + int new_offs;
> +
> + /* Adjust the value offsets of the entries */
> + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> + if (!last->e_value_block && last->e_value_size) {
> + new_offs = le16_to_cpu(last->e_value_offs) +
> + value_offs_shift;
> + BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
> + > blocksize);
> + last->e_value_offs = cpu_to_le16(new_offs);
> + }
> + }
> + /* Shift the entries by n bytes */
> + memmove(to, from, n);
> +}

This function could do with an overall comment.

Under what circumstances will that new BUG_ON trigger? Can it be triggered
via incorrect disk contents? If so, it should not be there.

> +/*
> + * Expand an inode by new_extra_isize bytes when EA presents.

"when an EA is present"?

> + * Returns 0 on success or negative error number on failure.
> + *
> + */
> +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
> + struct ext4_inode *raw_inode, handle_t *handle)
> +{
> + struct ext4_xattr_ibody_header *header;
> + struct ext4_xattr_entry *entry, *last, *first;
> + struct buffer_head *bh = NULL;
> + struct ext4_xattr_ibody_find *is = NULL;
> + struct ext4_xattr_block_find *bs = NULL;
> + char *buffer = NULL, *b_entry_name = NULL;
> + size_t min_offs, free;
> + int total_ino, total_blk;
> + void *base, *start, *end;
> + int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
> + int s_min_extra_isize = EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize;
> +
> + down_write(&EXT4_I(inode)->xattr_sem);
> +retry:
> + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
> + up_write(&EXT4_I(inode)->xattr_sem);
> + return 0;
> + }

So.. xattr_sem is a lock which protects i_extra_isize? That seems a bit
strange to me - I'd have expected i_mutex.

> + header = IHDR(inode, raw_inode);
> + entry = IFIRST(header);
> +
> + /*
> + * Check if enough free space is available in the inode to shift the
> + * entries ahead by new_extra_isize.
> + */
> +
> + base = start = entry;
> + end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
> + min_offs = end - base;
> + last = entry;
> + total_ino = sizeof(struct ext4_xattr_ibody_header);
> +
> + free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
> + if (free >= new_extra_isize) {
> + entry = IFIRST(header);
> + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
> + - new_extra_isize, (void *)raw_inode +
> + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize,
> + (void *)header, total_ino,
> + inode->i_sb->s_blocksize);
> + EXT4_I(inode)->i_extra_isize = new_extra_isize;
> + error = 0;
> + goto cleanup;
> + }
> +
> + /*
> + * Enough free space isn't available in the inode, check if
> + * EA block can hold new_extra_isize bytes.
> + */
> + if (EXT4_I(inode)->i_file_acl) {
> + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> + error = -EIO;
> + if (!bh)
> + goto cleanup;
> + if (ext4_xattr_check_block(bh)) {
> + ext4_error(inode->i_sb, __FUNCTION__,
> + "inode %lu: bad block %llu", inode->i_ino,
> + EXT4_I(inode)->i_file_acl);
> + error = -EIO;
> + goto cleanup;
> + }
> + base = BHDR(bh);
> + first = BFIRST(bh);
> + end = bh->b_data + bh->b_size;
> + min_offs = end - base;
> + free = ext4_xattr_free_space(first, &min_offs, base,
> + &total_blk);
> + if (free < new_extra_isize) {
> + if (!tried_min_extra_isize && s_min_extra_isize) {
> + tried_min_extra_isize++;
> + new_extra_isize = s_min_extra_isize;

Aren't we missing a brelse(bh) here?

> + goto retry;
> + }
> + error = -1;
> + goto cleanup;
> + }
> + } else {
> + free = inode->i_sb->s_blocksize;
> + }
> +
> + while (new_extra_isize > 0) {
> + size_t offs, size, entry_size;
> + struct ext4_xattr_entry *small_entry = NULL;
> + struct ext4_xattr_info i = {
> + .value = NULL,
> + .value_len = 0,

These two initialisations actually aren't needed, but they serve as
commentary if that is what was intended.

> + };
> + unsigned int total_size, shift_bytes, temp = ~0U;

It would be better to do

unsigned int total_size; /* description */
unsigned int shift_bytes; /* description */
unsigned int temp = ~0U; /* description */

because a) it is more readable, b) it makes subsequent patches cleaner and
c) it leaves room for descriptions. c) is particularly important for
variables which are called "tmp" and "temp".

Please don't call variables "tmp" or "temp".

> +
> + is = (struct ext4_xattr_ibody_find *) kmalloc(sizeof(struct
> + ext4_xattr_ibody_find), GFP_KERNEL);
> + bs = (struct ext4_xattr_block_find *) kmalloc(sizeof(struct
> + ext4_xattr_block_find), GFP_KERNEL);

Two unneeded (and undesirable) casts of void*

> + memset((void *)is, 0, sizeof(struct ext4_xattr_ibody_find));
> + memset((void *)bs, 0, sizeof(struct ext4_xattr_block_find));

Two more unneeded casts.

This code should use kzalloc().

> + is->s.not_found = bs->s.not_found = -ENODATA;

Multiple assignement.

> + is->iloc.bh = NULL;
> + bs->bh = NULL;
> +
> + last = IFIRST(header);
> + /* Find the entry best suited to be pushed into EA block */
> + entry = NULL;
> + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> + total_size =
> + EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
> + EXT4_XATTR_LEN(last->e_name_len);
> + if (total_size <= free && total_size < temp) {
> + if (total_size < new_extra_isize) {
> + small_entry = last;
> + } else {
> + entry = last;
> + temp = total_size;
> + }
> + }
> + }
> +
> + if (entry == NULL) {
> + if (small_entry) {
> + entry = small_entry;
> + } else {
> + if (!tried_min_extra_isize &&
> + s_min_extra_isize) {
> + tried_min_extra_isize++;
> + new_extra_isize = s_min_extra_isize;
> + goto retry;
> + }
> + error = -1;
> + goto cleanup;
> + }
> + }
> + offs = le16_to_cpu(entry->e_value_offs);
> + size = le32_to_cpu(entry->e_value_size);
> + entry_size = EXT4_XATTR_LEN(entry->e_name_len);
> + i.name_index = entry->e_name_index,
> + buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_KERNEL);
> + b_entry_name = kmalloc(entry->e_name_len + 1, GFP_KERNEL);

Check kmalloc() failure, add recovery code. The fault-injection framework
might be useful when testing that recovery code.

> + /* Save the entry name and the entry value */
> + memcpy((void *)buffer, (void *)IFIRST(header) + offs,
> + EXT4_XATTR_SIZE(size));
> + memcpy((void *)b_entry_name, (void *)entry->e_name,
> + entry->e_name_len);

Lots of unneeded casts there.

> + b_entry_name[entry->e_name_len] = '\0';
> + i.name = b_entry_name;
> +
> + error = ext4_get_inode_loc(inode, &is->iloc);
> + if (error)
> + goto cleanup;
> +
> + error = ext4_xattr_ibody_find(inode, &i, is);
> + if (error)
> + goto cleanup;
> +
> + /* Remove the chosen entry from the inode */
> + error = ext4_xattr_ibody_set(handle, inode, &i, is);
> +
> + entry = IFIRST(header);
> + if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
> + shift_bytes = new_extra_isize;
> + else
> + shift_bytes = entry_size + size;
> + /* Adjust the offsets and shift the remaining entries ahead */
> + ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize -
> + shift_bytes, (void *)raw_inode +
> + EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
> + (void *)header, total_ino - entry_size,
> + inode->i_sb->s_blocksize);
> +
> + extra_isize += shift_bytes;
> + new_extra_isize -= shift_bytes;
> + EXT4_I(inode)->i_extra_isize = extra_isize;
> +
> + i.name = b_entry_name;
> + i.value = buffer;
> + i.value_len = cpu_to_le32(size);
> + error = ext4_xattr_block_find(inode, &i, bs);
> + if (error)
> + goto cleanup;
> +
> + /* Add entry which was removed from the inode into the block */
> + error = ext4_xattr_block_set(handle, inode, &i, bs);
> + if (error)
> + goto cleanup;
> + }
> +
> +cleanup:
> + if (b_entry_name)
> + kfree(b_entry_name);
> + if (buffer)
> + kfree(buffer);
> + if (is) {
> + brelse(is->iloc.bh);
> + kfree(is);
> + }
> + if (bs)
> + kfree(bs);

kfree(NULL) is legal. (Can we teach checkpatch about this?)

> + brelse(bh);
> + up_write(&EXT4_I(inode)->xattr_sem);
> + return error;
> +}
> +

We're doing GFP_KERNEL memory allocations while holding xattr_sem. This
can cause the VM to reenter the filesystem, perhaps taking i_mutex and/or
i_truncate_sem and/or journal_start() (I forget whether this still
happens). Have we checked whether this can occur and if so, whether we are
OK from a lock ranking POV? Bear in mind that journalled-data mode is more
complex in this regard.

> +
> /*
> * ext4_xattr_delete_inode()
> *
> Index: linux-2.6.22-rc4/fs/ext4/xattr.h
> ===================================================================
> --- linux-2.6.22-rc4.orig/fs/ext4/xattr.h 2007-06-14 17:30:37.000000000 -0700
> +++ linux-2.6.22-rc4/fs/ext4/xattr.h 2007-06-14 17:32:41.000000000 -0700
> @@ -56,6 +56,13 @@ struct ext4_xattr_entry {
> #define EXT4_XATTR_SIZE(size) \
> (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
>
> +#define IHDR(inode, raw_inode) \
> + ((struct ext4_xattr_ibody_header *) \
> + ((void *)raw_inode + \
> + EXT4_GOOD_OLD_INODE_SIZE + \
> + EXT4_I(inode)->i_extra_isize))
> +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))

Neither of these _have_ to be implemented as macros and hence they should
not be.



2007-07-11 12:10:58

by Andreas Dilger

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Jul 10, 2007 16:32 -0700, Andrew Morton wrote:
> > err = ext4_reserve_inode_write(handle, inode, &iloc);
> > + if (EXT4_I(inode)->i_extra_isize <
> > + EXT4_SB(inode->i_sb)->s_want_extra_isize &&
> > + !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
> > + /* We need extra buffer credits since we may write into EA block
> > + * with this same handle */
> > + if ((jbd2_journal_extend(handle,
> > + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
> > + ret = ext4_expand_extra_isize(inode,
> > + EXT4_SB(inode->i_sb)->s_want_extra_isize,
> > + iloc, handle);
> > + if (ret) {
> > + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
> > + if (!expand_message) {
> > + ext4_warning(inode->i_sb, __FUNCTION__,
> > + "Unable to expand inode %lu. Delete"
> > + " some EAs or run e2fsck.",
> > + inode->i_ino);
> > + expand_message = 1;
> > + }
> > + }
> > + }
> > + }
>
> Maybe that message should come out once per mount rather than once per boot
> (or once per modprobe)?

Probably true.

> What are the consequences of a jbd2_journal_extend() failure in here?

Not fatal, just like every user of i_extra_isize. If the inode isn't a
large inode, or it can't be expanded then there will be a minor loss of
functionality on that inode. If the i_extra_isize is critical, then
the sysadmin will have run e2fsck to force s_min_extra_isize large enough.

Note that this is only applicable for filesystems which are upgraded. For
new inodes (i.e. all inodes that exist in the filesystem if it was always
run on a kernel with the currently understood extra fields) then this will
never be invoked (until such a time new extra fields are added).

> > +static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
> > + int value_offs_shift, void *to,
> > + void *from, size_t n, int blocksize)
> > +{
> > + /* Adjust the value offsets of the entries */
> > + for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
> > + if (!last->e_value_block && last->e_value_size) {
> > + new_offs = le16_to_cpu(last->e_value_offs) +
> > + value_offs_shift;
> > + BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
> > + > blocksize);
> > + last->e_value_offs = cpu_to_le16(new_offs);
> > + }
> > + }
> > + /* Shift the entries by n bytes */
> > + memmove(to, from, n);
> > +}
>
> Under what circumstances will that new BUG_ON trigger? Can it be triggered
> via incorrect disk contents? If so, it should not be there.

Only under code defect or memory corruption. The needed extra space was
already checked in ext4_expand_extra_isize_ea(), but I asked for this
BUG_ON() because it would otherwise seem that there was a chance from
reading just the above code that the shifted EA might overflow the buffer.

> > +int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
> > + struct ext4_inode *raw_inode, handle_t *handle)
> > +{
> > + down_write(&EXT4_I(inode)->xattr_sem);
> > +retry:
> > + if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
> > + up_write(&EXT4_I(inode)->xattr_sem);
> > + return 0;
> > + }
>
> So.. xattr_sem is a lock which protects i_extra_isize? That seems a bit
> strange to me - I'd have expected i_mutex.

Well, since this is the only code that ever changes i_extra_isize, and it
also needs to move the EAs around, it makes sense to use the EA lock for it.
This is also a relatively low-contention lock, and it needs to be held to
access any of the EAs (which are the only things beyond i_extra_isize).

> > + if (EXT4_I(inode)->i_file_acl) {
> > + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> > + error = -EIO;
> > + if (!bh)
> > + goto cleanup;
> > + if (ext4_xattr_check_block(bh)) {
> > + ext4_error(inode->i_sb, __FUNCTION__,
> > + "inode %lu: bad block %llu", inode->i_ino,
> > + EXT4_I(inode)->i_file_acl);
> > + error = -EIO;
> > + goto cleanup;
> > + }
> > + base = BHDR(bh);
> > + first = BFIRST(bh);
> > + end = bh->b_data + bh->b_size;
> > + min_offs = end - base;
> > + free = ext4_xattr_free_space(first, &min_offs, base,
> > + &total_blk);
> > + if (free < new_extra_isize) {
> > + if (!tried_min_extra_isize && s_min_extra_isize) {
> > + tried_min_extra_isize++;
> > + new_extra_isize = s_min_extra_isize;
>
> Aren't we missing a brelse(bh) here?

Seems likely, yes.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2007-07-11 17:34:05

by Andrew Morton

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Wed, 11 Jul 2007 06:10:56 -0600 Andreas Dilger <[email protected]> wrote:

> On Jul 10, 2007 16:32 -0700, Andrew Morton wrote:
> > > err = ext4_reserve_inode_write(handle, inode, &iloc);
> > > + if (EXT4_I(inode)->i_extra_isize <
> > > + EXT4_SB(inode->i_sb)->s_want_extra_isize &&
> > > + !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
> > > + /* We need extra buffer credits since we may write into EA block
> > > + * with this same handle */
> > > + if ((jbd2_journal_extend(handle,
> > > + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
> > > + ret = ext4_expand_extra_isize(inode,
> > > + EXT4_SB(inode->i_sb)->s_want_extra_isize,
> > > + iloc, handle);
> > > + if (ret) {
> > > + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
> > > + if (!expand_message) {
> > > + ext4_warning(inode->i_sb, __FUNCTION__,
> > > + "Unable to expand inode %lu. Delete"
> > > + " some EAs or run e2fsck.",
> > > + inode->i_ino);
> > > + expand_message = 1;
> > > + }
> > > + }
> > > + }
> > > + }
> >
> > Maybe that message should come out once per mount rather than once per boot
> > (or once per modprobe)?
>
> Probably true.
>
> > What are the consequences of a jbd2_journal_extend() failure in here?
>
> Not fatal, just like every user of i_extra_isize. If the inode isn't a
> large inode, or it can't be expanded then there will be a minor loss of
> functionality on that inode. If the i_extra_isize is critical, then
> the sysadmin will have run e2fsck to force s_min_extra_isize large enough.
>
> Note that this is only applicable for filesystems which are upgraded. For
> new inodes (i.e. all inodes that exist in the filesystem if it was always
> run on a kernel with the currently understood extra fields) then this will
> never be invoked (until such a time new extra fields are added).

I'd suggest that we get a comment in the code explaining this: this
unchecked error does rather stand out.

> > > + if (EXT4_I(inode)->i_file_acl) {
> > > + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> > > + error = -EIO;
> > > + if (!bh)
> > > + goto cleanup;
> > > + if (ext4_xattr_check_block(bh)) {
> > > + ext4_error(inode->i_sb, __FUNCTION__,
> > > + "inode %lu: bad block %llu", inode->i_ino,
> > > + EXT4_I(inode)->i_file_acl);
> > > + error = -EIO;
> > > + goto cleanup;
> > > + }
> > > + base = BHDR(bh);
> > > + first = BFIRST(bh);
> > > + end = bh->b_data + bh->b_size;
> > > + min_offs = end - base;
> > > + free = ext4_xattr_free_space(first, &min_offs, base,
> > > + &total_blk);
> > > + if (free < new_extra_isize) {
> > > + if (!tried_min_extra_isize && s_min_extra_isize) {
> > > + tried_min_extra_isize++;
> > > + new_extra_isize = s_min_extra_isize;
> >
> > Aren't we missing a brelse(bh) here?
>
> Seems likely, yes.

OK - could we get a positive ack from someone indicating that this will get
looked at? Because I am about to forget about it.


2007-07-11 19:24:13

by Kalpak Shah

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Wed, 2007-07-11 at 10:34 -0700, Andrew Morton wrote:
> On Wed, 11 Jul 2007 06:10:56 -0600 Andreas Dilger <[email protected]> wrote:
>
> > On Jul 10, 2007 16:32 -0700, Andrew Morton wrote:
> > > > err = ext4_reserve_inode_write(handle, inode, &iloc);
> > > > + if (EXT4_I(inode)->i_extra_isize <
> > > > + EXT4_SB(inode->i_sb)->s_want_extra_isize &&
> > > > + !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
> > > > + /* We need extra buffer credits since we may write into EA block
> > > > + * with this same handle */
> > > > + if ((jbd2_journal_extend(handle,
> > > > + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
> > > > + ret = ext4_expand_extra_isize(inode,
> > > > + EXT4_SB(inode->i_sb)->s_want_extra_isize,
> > > > + iloc, handle);
> > > > + if (ret) {
> > > > + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
> > > > + if (!expand_message) {
> > > > + ext4_warning(inode->i_sb, __FUNCTION__,
> > > > + "Unable to expand inode %lu. Delete"
> > > > + " some EAs or run e2fsck.",
> > > > + inode->i_ino);
> > > > + expand_message = 1;
> > > > + }
> > > > + }
> > > > + }
> > > > + }
> > >
> > > Maybe that message should come out once per mount rather than once per boot
> > > (or once per modprobe)?
> >
> > Probably true.
> >
> > > What are the consequences of a jbd2_journal_extend() failure in here?
> >
> > Not fatal, just like every user of i_extra_isize. If the inode isn't a
> > large inode, or it can't be expanded then there will be a minor loss of
> > functionality on that inode. If the i_extra_isize is critical, then
> > the sysadmin will have run e2fsck to force s_min_extra_isize large enough.
> >
> > Note that this is only applicable for filesystems which are upgraded. For
> > new inodes (i.e. all inodes that exist in the filesystem if it was always
> > run on a kernel with the currently understood extra fields) then this will
> > never be invoked (until such a time new extra fields are added).
>
> I'd suggest that we get a comment in the code explaining this: this
> unchecked error does rather stand out.
>
> > > > + if (EXT4_I(inode)->i_file_acl) {
> > > > + bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
> > > > + error = -EIO;
> > > > + if (!bh)
> > > > + goto cleanup;
> > > > + if (ext4_xattr_check_block(bh)) {
> > > > + ext4_error(inode->i_sb, __FUNCTION__,
> > > > + "inode %lu: bad block %llu", inode->i_ino,
> > > > + EXT4_I(inode)->i_file_acl);
> > > > + error = -EIO;
> > > > + goto cleanup;
> > > > + }
> > > > + base = BHDR(bh);
> > > > + first = BFIRST(bh);
> > > > + end = bh->b_data + bh->b_size;
> > > > + min_offs = end - base;
> > > > + free = ext4_xattr_free_space(first, &min_offs, base,
> > > > + &total_blk);
> > > > + if (free < new_extra_isize) {
> > > > + if (!tried_min_extra_isize && s_min_extra_isize) {
> > > > + tried_min_extra_isize++;
> > > > + new_extra_isize = s_min_extra_isize;
> > >
> > > Aren't we missing a brelse(bh) here?
> >
> > Seems likely, yes.
>
> OK - could we get a positive ack from someone indicating that this will get
> looked at? Because I am about to forget about it.

I will send a patch to add the comments and make the suggested
corrections.

Thanks,
Kalpak.

> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html


2007-07-12 11:23:08

by Andy Whitcroft

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

The next version of checkpatch.pl (0.08) should have support for a
number of the missed sylistics you mention. Will let them soak for a
bit to ensure we're not majorly regressing anything else.

-apw

ERROR: braces {} are not necessary for single statements
#4: FILE: Z11.c:1:
+if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {

WARNING: declaring multiple variables together should be avoided
#9: FILE: Z11.c:6:
+unsigned int total_size, shift_bytes, temp = ~0U;

WARNING: multiple assignments should be avoided
#12: FILE: Z11.c:9:
+is->s.not_found = bs->s.not_found = -ENODATA;

WARNING: kfree(NULL) is safe this check is probabally not required
#16: FILE: Z11.c:13:
+if (bs)
+ kfree(bs);

2007-07-12 12:14:35

by Kalpak Shah

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Tue, 2007-07-10 at 16:32 -0700, Andrew Morton wrote:
> On Sun, 01 Jul 2007 03:38:01 -0400
> Mingming Cao <[email protected]> wrote:
>
> > This patch is on top of the nanosecond timestamp and i_version_hi
> > patches.
>
> This sort of information isn't needed (or desired) when this patch hits the
> git tree. Please ensure that things like this are cleaned up before the
> patches go to Linus.

The incremental patch I have attached contains an updated Changelog
entry which cleans this up.

> > + !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
> > + /* We need extra buffer credits since we may write into EA block
> > + * with this same handle */
> > + if ((jbd2_journal_extend(handle,
> > + EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) {
> > + ret = ext4_expand_extra_isize(inode,
> > + EXT4_SB(inode->i_sb)->s_want_extra_isize,
> > + iloc, handle);
> > + if (ret) {
> > + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
> > + if (!expand_message) {
> > + ext4_warning(inode->i_sb, __FUNCTION__,
> > + "Unable to expand inode %lu. Delete"
> > + " some EAs or run e2fsck.",
> > + inode->i_ino);
> > + expand_message = 1;
> > + }
> > + }
> > + }
> > + }
>
> Maybe that message should come out once per mount rather than once per boot
> (or once per modprobe)?

Done. Now the message gets printed the first time s_mnt_count changes,
which means once per mount.

> > + if (free < new_extra_isize) {
> > + if (!tried_min_extra_isize && s_min_extra_isize) {
> > + tried_min_extra_isize++;
> > + new_extra_isize = s_min_extra_isize;
>
> Aren't we missing a brelse(bh) here?

I have corrected this.

> >
> > +#define IHDR(inode, raw_inode) \
> > + ((struct ext4_xattr_ibody_header *) \
> > + ((void *)raw_inode + \
> > + EXT4_GOOD_OLD_INODE_SIZE + \
> > + EXT4_I(inode)->i_extra_isize))
> > +#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
>
> Neither of these _have_ to be implemented as macros and hence they should
> not be.

These macros existed before and have just been moved. There are similar
such macros in the ext3/4 xattr code and they should probably be changed
to inlines. But that should be done in a different patch.

Thanks,
Kalpak.


Attachments:
ext4_expand_inode_extra_isize_correction.patch (8.20 kB)

2007-07-13 09:05:29

by Andrew Morton

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Tue, 10 Jul 2007 16:32:47 -0700 Andrew Morton <[email protected]> wrote:

> > + brelse(bh);
> > + up_write(&EXT4_I(inode)->xattr_sem);
> > + return error;
> > +}
> > +
>
> We're doing GFP_KERNEL memory allocations while holding xattr_sem. This
> can cause the VM to reenter the filesystem, perhaps taking i_mutex and/or
> i_truncate_sem and/or journal_start() (I forget whether this still
> happens). Have we checked whether this can occur and if so, whether we are
> OK from a lock ranking POV? Bear in mind that journalled-data mode is more
> complex in this regard.

I notice that everyone carefully avoided addressing this ;)

Oh well, hopefully people are testing with lockdep enabled. As long
as the fs is put under extreme memory pressure, most bugs should be reported.

Except lockdep doesn't know about journal_start(), which has ranking
requirements similar to a semaphore. Nor does it know about lock_page().
We already have hard-to-hit but deadlockable bugs in this area.



2007-07-13 13:33:48

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Fri, 2007-07-13 at 02:05 -0700, Andrew Morton wrote:

> Except lockdep doesn't know about journal_start(), which has ranking
> requirements similar to a semaphore.

Something like so?

Or can journal_stop() be done by a different task than the one that did
journal_start()? - in which case nothing much can be done :-/

This seems to boot... albeit I did not push it hard.

Signed-off-by: Peter Zijlstra <[email protected]>
---
fs/jbd/transaction.c | 9 +++++++++
include/linux/jbd.h | 5 +++++
2 files changed, 14 insertions(+)

Index: linux-2.6/fs/jbd/transaction.c
===================================================================
--- linux-2.6.orig/fs/jbd/transaction.c
+++ linux-2.6/fs/jbd/transaction.c
@@ -233,6 +233,8 @@ out:
return ret;
}

+static struct lock_class_key jbd_handle_key;
+
/* Allocate a new handle. This should probably be in a slab... */
static handle_t *new_handle(int nblocks)
{
@@ -243,6 +245,8 @@ static handle_t *new_handle(int nblocks)
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;

+ lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0);
+
return handle;
}

@@ -286,6 +290,9 @@ handle_t *journal_start(journal_t *journ
current->journal_info = NULL;
handle = ERR_PTR(err);
}
+
+ lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+
return handle;
}

@@ -1411,6 +1418,8 @@ int journal_stop(handle_t *handle)
spin_unlock(&journal->j_state_lock);
}

+ lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+
jbd_free_handle(handle);
return err;
}
Index: linux-2.6/include/linux/jbd.h
===================================================================
--- linux-2.6.orig/include/linux/jbd.h
+++ linux-2.6/include/linux/jbd.h
@@ -30,6 +30,7 @@
#include <linux/bit_spinlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
+#include <linux/lockdep.h>

#include <asm/semaphore.h>
#endif
@@ -405,6 +406,10 @@ struct handle_s
unsigned int h_sync: 1; /* sync-on-close */
unsigned int h_jdata: 1; /* force data journaling */
unsigned int h_aborted: 1; /* fatal error on handle */
+
+#ifdef CONFIG_LOCKDEP
+ struct lockdep_map h_lockdep_map;
+#endif
};



2007-07-13 15:43:46

by Andreas Dilger

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Jul 13, 2007 15:33 +0200, Peter Zijlstra wrote:
> On Fri, 2007-07-13 at 02:05 -0700, Andrew Morton wrote:
> Or can journal_stop() be done by a different task than the one that did
> journal_start()? - in which case nothing much can be done :-/

The call to journal_stop() has to be in the same process, since the
journal handle is also held in current->journal_info so the handle
does not need to be passed as an argument all over the VFS.

> This seems to boot... albeit I did not push it hard.

Can you please also make a patch for jbd2.


Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2007-07-13 19:14:32

by Andrew Morton

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Fri, 13 Jul 2007 15:33:41 +0200
Peter Zijlstra <[email protected]> wrote:

> On Fri, 2007-07-13 at 02:05 -0700, Andrew Morton wrote:
>
> > Except lockdep doesn't know about journal_start(), which has ranking
> > requirements similar to a semaphore.
>
> Something like so?

Looks OK.

> Or can journal_stop() be done by a different task than the one that did
> journal_start()? - in which case nothing much can be done :-/

Yeah, journal_start() and journal_stop() are well-behaved.

> This seems to boot... albeit I did not push it hard.

I fear the consequences of this change :(

Oh well, please keep it alive, maybe beat on it a bit, resend it
later on?

2007-07-13 21:49:05

by Zach Brown

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

> I fear the consequences of this change :(

I love it. In the past I've lost time by working with patches which
didn't quite realize that ext3 holds a transaction open during
->direct_IO.

> Oh well, please keep it alive, maybe beat on it a bit, resend it
> later on?

I can test the patch to make sure that it catches mistakes I've made in
the past. Peter, do you have any interest in seeing how far we can get
at tracking lock_page()? I'm not holding my breath, but any little bit
would probably help.

- z

2007-07-14 04:21:55

by Andreas Dilger

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Jul 13, 2007 02:05 -0700, Andrew Morton wrote:
> On Tue, 10 Jul 2007 16:32:47 -0700 Andrew Morton <[email protected]> wrote:
>
> > > + brelse(bh);
> > > + up_write(&EXT4_I(inode)->xattr_sem);
> > > + return error;
> > > +}
> > > +
> >
> > We're doing GFP_KERNEL memory allocations while holding xattr_sem. This
> > can cause the VM to reenter the filesystem, perhaps taking i_mutex and/or
> > i_truncate_sem and/or journal_start() (I forget whether this still
> > happens). Have we checked whether this can occur and if so, whether we are
> > OK from a lock ranking POV? Bear in mind that journalled-data mode is more
> > complex in this regard.
>
> I notice that everyone carefully avoided addressing this ;)
>
> Oh well, hopefully people are testing with lockdep enabled. As long
> as the fs is put under extreme memory pressure, most bugs should be reported.

I have no objection to changing these to GFP_NOFS or GFP_ATOMIC, because
the number of times this function is called is really quite small (only
for existing inodes when the size of the fixed fields in the inode is
increasing) and the buffers are freed immediately so this won't put any
undue strain on the atomic memory pools.

That said, there is also a GFP_KERNEL allocations in ext3_xattr_block_set()
under xattr_sem, so the same problem would exist there.

I also just noticed that "buffer" and "b_entry_name" are leaked in
ext4_expand_extra_isize() if the while loop is run more than one time
(again a relatively rare event).

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2007-07-14 07:43:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:
> > I fear the consequences of this change :(
>
> I love it. In the past I've lost time by working with patches which
> didn't quite realize that ext3 holds a transaction open during
> ->direct_IO.
>
> > Oh well, please keep it alive, maybe beat on it a bit, resend it
> > later on?
>
> I can test the patch to make sure that it catches mistakes I've made in
> the past.

That would be much appreciated.

> Peter, do you have any interest in seeing how far we can get
> at tracking lock_page()? I'm not holding my breath, but any little bit
> would probably help.

Yeah I'm going to try that one next..


2007-07-15 10:21:45

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:

> Peter, do you have any interest in seeing how far we can get
> at tracking lock_page()? I'm not holding my breath, but any little bit
> would probably help.

I ran headfirst into the fact the unlock_page() need not be called by
the same task that did lock_page().

Esp IO-completion interrupts love to unlock pages they did not lock
themselves. Not at all sure that is fixable, it seems to be the nature
of the async structure of the problem :-(

2007-07-15 13:02:23

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:

> Peter, do you have any interest in seeing how far we can get
> at tracking lock_page()? I'm not holding my breath, but any little bit
> would probably help.

Would this be a valid report?

( /me goes hunt a x86_64 unwinder patch that will apply to this tree.
These stacktraces are pain )

=======================================================
[ INFO: possible circular locking dependency detected ]
[ 2.6.22-rt3-dirty #34
-------------------------------------------------------
mount/1296 is trying to acquire lock:
(&ei->truncate_mutex){--..}, at: [<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7

but task is already holding lock:
(lock_page_0){--..}, at: [<ffffffff80267107>] generic_file_buffered_write+0x1ee/0x646

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (lock_page_0){--..}:
[<ffffffff80251b26>] __lock_acquire+0xa72/0xc35
[<ffffffff802520c9>] lock_acquire+0x48/0x61
[<ffffffff80265e22>] add_to_page_cache_lru+0xe/0x23
[<ffffffff80265d31>] add_to_page_cache+0x1de/0x2c1
[<ffffffff80265e22>] add_to_page_cache_lru+0xe/0x23
[<ffffffff80266985>] find_or_create_page+0x4c/0x73
[<ffffffff802ae716>] __getblk+0x118/0x23c
[<ffffffff802afa91>] __bread+0x6/0x9c
[<ffffffff802f382d>] read_block_bitmap+0x34/0x65
[<ffffffff802f3e1b>] ext3_free_blocks_sb+0xec/0x3d4
[<ffffffff802f4131>] ext3_free_blocks+0x2e/0x61
[<ffffffff802f82bc>] ext3_free_data+0xaa/0xda
[<ffffffff802f8976>] ext3_truncate+0x4d2/0x84e
[<ffffffff8026df5a>] pagevec_lookup+0x17/0x1e
[<ffffffff8026e7b1>] truncate_inode_pages_range+0x1f4/0x323
[<ffffffff802614b4>] add_preempt_count+0x14/0xe4
[<ffffffff80304d13>] journal_stop+0x1fe/0x21d
[<ffffffff8027661a>] vmtruncate+0xa2/0xc0
[<ffffffff802a292b>] inode_setattr+0x22/0x10a
[<ffffffff802f9b51>] ext3_setattr+0x136/0x18f
[<ffffffff802a2b1d>] notify_change+0x10a/0x241
[<ffffffff802a2b3b>] notify_change+0x128/0x241
[<ffffffff8028e35e>] do_truncate+0x56/0x7f
[<ffffffff8028e369>] do_truncate+0x61/0x7f
[<ffffffff80296278>] get_write_access+0x3f/0x45
[<ffffffff802973c7>] may_open+0x193/0x1af
[<ffffffff80299869>] open_namei+0x2cb/0x63e
[<ffffffff8025718b>] rt_up_read+0x53/0x5c
[<ffffffff8056da59>] do_page_fault+0x479/0x7cc
[<ffffffff8028dce1>] do_filp_open+0x1c/0x38
[<ffffffff8056a4f9>] rt_spin_unlock+0x17/0x47
[<ffffffff8028da05>] get_unused_fd+0xf9/0x107
[<ffffffff8028dd45>] do_sys_open+0x48/0xd5
[<ffffffff8020950e>] system_call+0x7e/0x83
[<ffffffffffffffff>] 0xffffffffffffffff

-> #0 (&ei->truncate_mutex){--..}:
[<ffffffff802503b9>] print_circular_bug_header+0xcc/0xd3
[<ffffffff80251a22>] __lock_acquire+0x96e/0xc35
[<ffffffff802520c9>] lock_acquire+0x48/0x61
[<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
[<ffffffff8056a6d4>] _mutex_lock+0x26/0x52
[<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
[<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
[<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
[<ffffffff80250d7c>] debug_check_no_locks_freed+0x11d/0x129
[<ffffffff80250c33>] trace_hardirqs_on_caller+0x115/0x138
[<ffffffff8024efdc>] lockdep_init_map+0xac/0x41f
[<ffffffff802614b4>] add_preempt_count+0x14/0xe4
[<ffffffff802f8035>] ext3_get_block+0xc2/0xe4
[<ffffffff802aeed3>] __block_prepare_write+0x195/0x442
[<ffffffff802f7f73>] ext3_get_block+0x0/0xe4
[<ffffffff802af19a>] block_prepare_write+0x1a/0x25
[<ffffffff802f93e9>] ext3_prepare_write+0xb2/0x17b
[<ffffffff802671b1>] generic_file_buffered_write+0x298/0x646
[<ffffffff8023944e>] current_fs_time+0x3b/0x40
[<ffffffff802614b4>] add_preempt_count+0x14/0xe4
[<ffffffff802678ae>] __generic_file_aio_write_nolock+0x34f/0x3b9
[<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
[<ffffffff80267964>] generic_file_aio_write+0x4c/0xc4
[<ffffffff80267979>] generic_file_aio_write+0x61/0xc4
[<ffffffff802fcf18>] ext3_orphan_del+0x53/0x19f
[<ffffffff802f5768>] ext3_file_write+0x1c/0x9d
[<ffffffff8028ef31>] do_sync_write+0xcc/0x10f
[<ffffffff80246f9c>] autoremove_wake_function+0x0/0x2e
[<ffffffff8024ecfe>] get_lock_stats+0xe/0x3f
[<ffffffff8024ed9a>] lock_release_holdtime+0x41/0x4f
[<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
[<ffffffff8028dfb1>] sys_fchmod+0xa3/0xbd
[<ffffffff8056a717>] _mutex_unlock+0x17/0x20
[<ffffffff8028f6cd>] vfs_write+0xb6/0x148
[<ffffffff8028fc61>] sys_write+0x48/0x74
[<ffffffff8020950e>] system_call+0x7e/0x83
[<ffffffffffffffff>] 0xffffffffffffffff

other info that might help us debug this:

2 locks held by mount/1296:
#0: (&inode->i_mutex){--..}, at: [<ffffffff80267964>] generic_file_aio_write+0x4c/0xc4
#1: (lock_page_0){--..}, at: [<ffffffff80267107>] generic_file_buffered_write+0x1ee/0x646

stack backtrace:

Call Trace:
[<ffffffff8024ffbd>] print_circular_bug_tail+0x69/0x72
[<ffffffff802503b9>] print_circular_bug_header+0xcc/0xd3
[<ffffffff80251a22>] __lock_acquire+0x96e/0xc35
[<ffffffff802520c9>] lock_acquire+0x48/0x61
[<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
[<ffffffff8056a6d4>] _mutex_lock+0x26/0x52
[<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
[<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
[<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
[<ffffffff80250d7c>] debug_check_no_locks_freed+0x11d/0x129
[<ffffffff80250c33>] trace_hardirqs_on_caller+0x115/0x138
[<ffffffff8024efdc>] lockdep_init_map+0xac/0x41f
[<ffffffff802614b4>] add_preempt_count+0x14/0xe4
[<ffffffff802f8035>] ext3_get_block+0xc2/0xe4
[<ffffffff802aeed3>] __block_prepare_write+0x195/0x442
[<ffffffff802f7f73>] ext3_get_block+0x0/0xe4
[<ffffffff802af19a>] block_prepare_write+0x1a/0x25
[<ffffffff802f93e9>] ext3_prepare_write+0xb2/0x17b
[<ffffffff802671b1>] generic_file_buffered_write+0x298/0x646
[<ffffffff8023944e>] current_fs_time+0x3b/0x40
[<ffffffff802614b4>] add_preempt_count+0x14/0xe4
[<ffffffff802678ae>] __generic_file_aio_write_nolock+0x34f/0x3b9
[<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
[<ffffffff80267964>] generic_file_aio_write+0x4c/0xc4
[<ffffffff80267979>] generic_file_aio_write+0x61/0xc4
[<ffffffff802fcf18>] ext3_orphan_del+0x53/0x19f
[<ffffffff802f5768>] ext3_file_write+0x1c/0x9d
[<ffffffff8028ef31>] do_sync_write+0xcc/0x10f
[<ffffffff80246f9c>] autoremove_wake_function+0x0/0x2e
[<ffffffff8024ecfe>] get_lock_stats+0xe/0x3f
[<ffffffff8024ed9a>] lock_release_holdtime+0x41/0x4f
[<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
[<ffffffff8028dfb1>] sys_fchmod+0xa3/0xbd
[<ffffffff8056a717>] _mutex_unlock+0x17/0x20
[<ffffffff8028f6cd>] vfs_write+0xb6/0x148
[<ffffffff8028fc61>] sys_write+0x48/0x74
[<ffffffff8020950e>] system_call+0x7e/0x83

INFO: lockdep is turned off.
---------------------------
| preempt count: 00000000 ]
| 0-level deep critical section nesting:
----------------------------------------




2007-07-15 13:14:02

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Sun, 2007-07-15 at 15:02 +0200, Peter Zijlstra wrote:
> On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:
>
> > Peter, do you have any interest in seeing how far we can get
> > at tracking lock_page()? I'm not holding my breath, but any little bit
> > would probably help.
>
> Would this be a valid report?

=======================================================
[ INFO: possible circular locking dependency detected ]
[ 2.6.22-rt3-dirty #35
-------------------------------------------------------
mkdir/1662 is trying to acquire lock:
(lock_page_0){--..}, at: [<ffffffff80265df6>] add_to_page_cache_lru+0xe/0x23

but task is already holding lock:
(jbd_handle){--..}, at: [<ffffffff80305797>] journal_start+0x108/0x12c

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (jbd_handle){--..}:
[<ffffffff80251b16>] __lock_acquire+0xa72/0xc35
[<ffffffff802520b9>] lock_acquire+0x48/0x61
[<ffffffff80305797>] journal_start+0x108/0x12c
[<ffffffff803057b3>] journal_start+0x124/0x12c
[<ffffffff802f92e9>] ext3_prepare_write+0x42/0x17b
[<ffffffff80267185>] generic_file_buffered_write+0x298/0x646
[<ffffffff8023943e>] current_fs_time+0x3b/0x40
[<ffffffff802614a4>] add_preempt_count+0x14/0xe4
[<ffffffff80267882>] __generic_file_aio_write_nolock+0x34f/0x3b9
[<ffffffff8024ed2d>] put_lock_stats+0xe/0x2a
[<ffffffff80267938>] generic_file_aio_write+0x4c/0xc4
[<ffffffff8026794d>] generic_file_aio_write+0x61/0xc4
[<ffffffff802fce88>] ext3_orphan_del+0x53/0x19f
[<ffffffff802f56d8>] ext3_file_write+0x1c/0x9d
[<ffffffff8028eedd>] do_sync_write+0xcc/0x10f
[<ffffffff80246f8d>] autoremove_wake_function+0x0/0x2e
[<ffffffff8024ecee>] get_lock_stats+0xe/0x3f
[<ffffffff8024ed8a>] lock_release_holdtime+0x41/0x4f
[<ffffffff8024ed2d>] put_lock_stats+0xe/0x2a
[<ffffffff8028df5d>] sys_fchmod+0xa3/0xbd
[<ffffffff8056a6d7>] _mutex_unlock+0x17/0x20
[<ffffffff8028f679>] vfs_write+0xb6/0x148
[<ffffffff8028fc0d>] sys_write+0x48/0x74
[<ffffffff8020950e>] system_call+0x7e/0x83
[<ffffffffffffffff>] 0xffffffffffffffff

-> #0 (lock_page_0){--..}:
[<ffffffff802503a9>] print_circular_bug_header+0xcc/0xd3
[<ffffffff80251a12>] __lock_acquire+0x96e/0xc35
[<ffffffff802520b9>] lock_acquire+0x48/0x61
[<ffffffff80265df6>] add_to_page_cache_lru+0xe/0x23
[<ffffffff80265d05>] add_to_page_cache+0x1de/0x2c1
[<ffffffff80265df6>] add_to_page_cache_lru+0xe/0x23
[<ffffffff80266959>] find_or_create_page+0x4c/0x73
[<ffffffff802ae6c2>] __getblk+0x118/0x23c
[<ffffffff802f7d9a>] ext3_getblk+0xf2/0x23b
[<ffffffff80306337>] journal_dirty_metadata+0x1a8/0x1b3
[<ffffffff80301e3e>] __ext3_journal_dirty_metadata+0x1e/0x46
[<ffffffff802f6c63>] ext3_mark_iloc_dirty+0x293/0x30a
[<ffffffff802f70a1>] ext3_mark_inode_dirty+0x3f/0x48
[<ffffffff802f644e>] ext3_new_inode+0x8ff/0x943
[<ffffffff802f8c9c>] ext3_bread+0x11/0x84
[<ffffffff802fccc3>] ext3_mkdir+0xdd/0x24f
[<ffffffff80296893>] vfs_mkdir+0x6d/0xb5
[<ffffffff8029902e>] sys_mkdirat+0xa1/0xec
[<ffffffff80569f4d>] trace_hardirqs_on_thunk+0x3a/0x3c
[<ffffffff80250c23>] trace_hardirqs_on_caller+0x115/0x138
[<ffffffff80569f4d>] trace_hardirqs_on_thunk+0x3a/0x3c
[<ffffffff8020950e>] system_call+0x7e/0x83
[<ffffffffffffffff>] 0xffffffffffffffff

other info that might help us debug this:

2 locks held by mkdir/1662:
#0: (&inode->i_mutex/1){--..}, at: [<ffffffff80297154>] lookup_create+0x26/0x8b
#1: (jbd_handle){--..}, at: [<ffffffff80305797>] journal_start+0x108/0x12c

stack backtrace:

Call Trace:
[<ffffffff8024ffad>] print_circular_bug_tail+0x69/0x72
[<ffffffff802503a9>] print_circular_bug_header+0xcc/0xd3
[<ffffffff80251a12>] __lock_acquire+0x96e/0xc35
[<ffffffff802520b9>] lock_acquire+0x48/0x61
[<ffffffff80265df6>] add_to_page_cache_lru+0xe/0x23
[<ffffffff80265d05>] add_to_page_cache+0x1de/0x2c1
[<ffffffff80265df6>] add_to_page_cache_lru+0xe/0x23
[<ffffffff80266959>] find_or_create_page+0x4c/0x73
[<ffffffff802ae6c2>] __getblk+0x118/0x23c
[<ffffffff802f7d9a>] ext3_getblk+0xf2/0x23b
[<ffffffff80306337>] journal_dirty_metadata+0x1a8/0x1b3
[<ffffffff80301e3e>] __ext3_journal_dirty_metadata+0x1e/0x46
[<ffffffff802f6c63>] ext3_mark_iloc_dirty+0x293/0x30a
[<ffffffff802f70a1>] ext3_mark_inode_dirty+0x3f/0x48
[<ffffffff802f644e>] ext3_new_inode+0x8ff/0x943
[<ffffffff802f8c9c>] ext3_bread+0x11/0x84
[<ffffffff802fccc3>] ext3_mkdir+0xdd/0x24f
[<ffffffff80296893>] vfs_mkdir+0x6d/0xb5
[<ffffffff8029902e>] sys_mkdirat+0xa1/0xec
[<ffffffff80569f4d>] trace_hardirqs_on_thunk+0x3a/0x3c
[<ffffffff80250c23>] trace_hardirqs_on_caller+0x115/0x138
[<ffffffff80569f4d>] trace_hardirqs_on_thunk+0x3a/0x3c
[<ffffffff8020950e>] system_call+0x7e/0x83

INFO: lockdep is turned off.
---------------------------
| preempt count: 00000000 ]
| 0-level deep critical section nesting:
----------------------------------------




2007-07-15 18:12:34

by Andrew Morton

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Sun, 15 Jul 2007 15:02:23 +0200 Peter Zijlstra <[email protected]> wrote:

> On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:
>
> > Peter, do you have any interest in seeing how far we can get
> > at tracking lock_page()? I'm not holding my breath, but any little bit
> > would probably help.
>
> Would this be a valid report?
>
> ( /me goes hunt a x86_64 unwinder patch that will apply to this tree.
> These stacktraces are pain )

They are. lockdep reports are a pain too. It's still a struggle to
understand wtf they're trying to tell you. Mabe it's just me.

> =======================================================
> [ INFO: possible circular locking dependency detected ]
> [ 2.6.22-rt3-dirty #34
> -------------------------------------------------------
> mount/1296 is trying to acquire lock:
> (&ei->truncate_mutex){--..}, at: [<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
>
> but task is already holding lock:
> (lock_page_0){--..}, at: [<ffffffff80267107>] generic_file_buffered_write+0x1ee/0x646
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #1 (lock_page_0){--..}:
> [<ffffffff80251b26>] __lock_acquire+0xa72/0xc35
> [<ffffffff802520c9>] lock_acquire+0x48/0x61
> [<ffffffff80265e22>] add_to_page_cache_lru+0xe/0x23
> [<ffffffff80265d31>] add_to_page_cache+0x1de/0x2c1
> [<ffffffff80265e22>] add_to_page_cache_lru+0xe/0x23
> [<ffffffff80266985>] find_or_create_page+0x4c/0x73
> [<ffffffff802ae716>] __getblk+0x118/0x23c
> [<ffffffff802afa91>] __bread+0x6/0x9c
> [<ffffffff802f382d>] read_block_bitmap+0x34/0x65
> [<ffffffff802f3e1b>] ext3_free_blocks_sb+0xec/0x3d4
> [<ffffffff802f4131>] ext3_free_blocks+0x2e/0x61
> [<ffffffff802f82bc>] ext3_free_data+0xaa/0xda
> [<ffffffff802f8976>] ext3_truncate+0x4d2/0x84e
> [<ffffffff8026df5a>] pagevec_lookup+0x17/0x1e
> [<ffffffff8026e7b1>] truncate_inode_pages_range+0x1f4/0x323
> [<ffffffff802614b4>] add_preempt_count+0x14/0xe4
> [<ffffffff80304d13>] journal_stop+0x1fe/0x21d
> [<ffffffff8027661a>] vmtruncate+0xa2/0xc0
> [<ffffffff802a292b>] inode_setattr+0x22/0x10a
> [<ffffffff802f9b51>] ext3_setattr+0x136/0x18f
> [<ffffffff802a2b1d>] notify_change+0x10a/0x241
> [<ffffffff802a2b3b>] notify_change+0x128/0x241
> [<ffffffff8028e35e>] do_truncate+0x56/0x7f
> [<ffffffff8028e369>] do_truncate+0x61/0x7f
> [<ffffffff80296278>] get_write_access+0x3f/0x45
> [<ffffffff802973c7>] may_open+0x193/0x1af
> [<ffffffff80299869>] open_namei+0x2cb/0x63e
> [<ffffffff8025718b>] rt_up_read+0x53/0x5c
> [<ffffffff8056da59>] do_page_fault+0x479/0x7cc
> [<ffffffff8028dce1>] do_filp_open+0x1c/0x38
> [<ffffffff8056a4f9>] rt_spin_unlock+0x17/0x47
> [<ffffffff8028da05>] get_unused_fd+0xf9/0x107
> [<ffffffff8028dd45>] do_sys_open+0x48/0xd5
> [<ffffffff8020950e>] system_call+0x7e/0x83
> [<ffffffffffffffff>] 0xffffffffffffffff

I guess we're doing lock_page() against a blockdev pagecache page here
while holding truncate_mutex against some S_ISREG file.

> -> #0 (&ei->truncate_mutex){--..}:
> [<ffffffff802503b9>] print_circular_bug_header+0xcc/0xd3
> [<ffffffff80251a22>] __lock_acquire+0x96e/0xc35
> [<ffffffff802520c9>] lock_acquire+0x48/0x61
> [<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
> [<ffffffff8056a6d4>] _mutex_lock+0x26/0x52
> [<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
> [<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
> [<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
> [<ffffffff80250d7c>] debug_check_no_locks_freed+0x11d/0x129
> [<ffffffff80250c33>] trace_hardirqs_on_caller+0x115/0x138
> [<ffffffff8024efdc>] lockdep_init_map+0xac/0x41f
> [<ffffffff802614b4>] add_preempt_count+0x14/0xe4
> [<ffffffff802f8035>] ext3_get_block+0xc2/0xe4
> [<ffffffff802aeed3>] __block_prepare_write+0x195/0x442
> [<ffffffff802f7f73>] ext3_get_block+0x0/0xe4
> [<ffffffff802af19a>] block_prepare_write+0x1a/0x25
> [<ffffffff802f93e9>] ext3_prepare_write+0xb2/0x17b
> [<ffffffff802671b1>] generic_file_buffered_write+0x298/0x646
> [<ffffffff8023944e>] current_fs_time+0x3b/0x40
> [<ffffffff802614b4>] add_preempt_count+0x14/0xe4
> [<ffffffff802678ae>] __generic_file_aio_write_nolock+0x34f/0x3b9
> [<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
> [<ffffffff80267964>] generic_file_aio_write+0x4c/0xc4
> [<ffffffff80267979>] generic_file_aio_write+0x61/0xc4
> [<ffffffff802fcf18>] ext3_orphan_del+0x53/0x19f
> [<ffffffff802f5768>] ext3_file_write+0x1c/0x9d
> [<ffffffff8028ef31>] do_sync_write+0xcc/0x10f
> [<ffffffff80246f9c>] autoremove_wake_function+0x0/0x2e
> [<ffffffff8024ecfe>] get_lock_stats+0xe/0x3f
> [<ffffffff8024ed9a>] lock_release_holdtime+0x41/0x4f
> [<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
> [<ffffffff8028dfb1>] sys_fchmod+0xa3/0xbd
> [<ffffffff8056a717>] _mutex_unlock+0x17/0x20
> [<ffffffff8028f6cd>] vfs_write+0xb6/0x148
> [<ffffffff8028fc61>] sys_write+0x48/0x74
> [<ffffffff8020950e>] system_call+0x7e/0x83
> [<ffffffffffffffff>] 0xffffffffffffffff

Here we're taking a file's truncate_mutex while holding lock_page() against
one of its pages. This is the correct lock ranking, I suppose.

This is one of those fairly common cross-inode notabugs, I suspect.

2007-07-15 19:21:03

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Sun, 2007-07-15 at 11:11 -0700, Andrew Morton wrote:
> On Sun, 15 Jul 2007 15:02:23 +0200 Peter Zijlstra <[email protected]> wrote:
>
> > On Fri, 2007-07-13 at 14:47 -0700, Zach Brown wrote:
> >
> > > Peter, do you have any interest in seeing how far we can get
> > > at tracking lock_page()? I'm not holding my breath, but any little bit
> > > would probably help.
> >
> > Would this be a valid report?
> >
> > ( /me goes hunt a x86_64 unwinder patch that will apply to this tree.
> > These stacktraces are pain )
>
> They are. lockdep reports are a pain too. It's still a struggle to
> understand wtf they're trying to tell you. Mabe it's just me.

It got you confused alright,..

> > =======================================================
> > [ INFO: possible circular locking dependency detected ]
> > [ 2.6.22-rt3-dirty #34
> > -------------------------------------------------------
> > mount/1296 is trying to acquire lock:
> > (&ei->truncate_mutex){--..}, at: [<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
> >
> > but task is already holding lock:
> > (lock_page_0){--..}, at: [<ffffffff80267107>] generic_file_buffered_write+0x1ee/0x646
> >
> > which lock already depends on the new lock.
> >

So, the offence is trying to acquire &ei->truncate_mutex while already
holding lock_page_0.

These traces show how the previous (reverse) dependancy came into being

> >
> > the existing dependency chain (in reverse order) is:
> >
> > -> #1 (lock_page_0){--..}:
> > [<ffffffff80251b26>] __lock_acquire+0xa72/0xc35
> > [<ffffffff802520c9>] lock_acquire+0x48/0x61
> > [<ffffffff80265e22>] add_to_page_cache_lru+0xe/0x23
> > [<ffffffff80265d31>] add_to_page_cache+0x1de/0x2c1
> > [<ffffffff80265e22>] add_to_page_cache_lru+0xe/0x23
> > [<ffffffff80266985>] find_or_create_page+0x4c/0x73
> > [<ffffffff802ae716>] __getblk+0x118/0x23c
> > [<ffffffff802afa91>] __bread+0x6/0x9c
> > [<ffffffff802f382d>] read_block_bitmap+0x34/0x65
> > [<ffffffff802f3e1b>] ext3_free_blocks_sb+0xec/0x3d4
> > [<ffffffff802f4131>] ext3_free_blocks+0x2e/0x61
> > [<ffffffff802f82bc>] ext3_free_data+0xaa/0xda
> > [<ffffffff802f8976>] ext3_truncate+0x4d2/0x84e
> > [<ffffffff8026df5a>] pagevec_lookup+0x17/0x1e
> > [<ffffffff8026e7b1>] truncate_inode_pages_range+0x1f4/0x323
> > [<ffffffff802614b4>] add_preempt_count+0x14/0xe4
> > [<ffffffff80304d13>] journal_stop+0x1fe/0x21d
> > [<ffffffff8027661a>] vmtruncate+0xa2/0xc0
> > [<ffffffff802a292b>] inode_setattr+0x22/0x10a
> > [<ffffffff802f9b51>] ext3_setattr+0x136/0x18f
> > [<ffffffff802a2b1d>] notify_change+0x10a/0x241
> > [<ffffffff802a2b3b>] notify_change+0x128/0x241
> > [<ffffffff8028e35e>] do_truncate+0x56/0x7f
> > [<ffffffff8028e369>] do_truncate+0x61/0x7f
> > [<ffffffff80296278>] get_write_access+0x3f/0x45
> > [<ffffffff802973c7>] may_open+0x193/0x1af
> > [<ffffffff80299869>] open_namei+0x2cb/0x63e
> > [<ffffffff8025718b>] rt_up_read+0x53/0x5c
> > [<ffffffff8056da59>] do_page_fault+0x479/0x7cc
> > [<ffffffff8028dce1>] do_filp_open+0x1c/0x38
> > [<ffffffff8056a4f9>] rt_spin_unlock+0x17/0x47
> > [<ffffffff8028da05>] get_unused_fd+0xf9/0x107
> > [<ffffffff8028dd45>] do_sys_open+0x48/0xd5
> > [<ffffffff8020950e>] system_call+0x7e/0x83
> > [<ffffffffffffffff>] 0xffffffffffffffff
>
> I guess we're doing lock_page() against a blockdev pagecache page here
> while holding truncate_mutex against some S_ISREG file.

So this trace ( -> #1 ) shows how lock_page_0 became to depend on
&ei->truncate_mutex ( -> #0 ).

> > -> #0 (&ei->truncate_mutex){--..}:
> > [<ffffffff802503b9>] print_circular_bug_header+0xcc/0xd3
> > [<ffffffff80251a22>] __lock_acquire+0x96e/0xc35
> > [<ffffffff802520c9>] lock_acquire+0x48/0x61
> > [<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
> > [<ffffffff8056a6d4>] _mutex_lock+0x26/0x52
> > [<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
> > [<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
> > [<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
> > [<ffffffff80250d7c>] debug_check_no_locks_freed+0x11d/0x129
> > [<ffffffff80250c33>] trace_hardirqs_on_caller+0x115/0x138
> > [<ffffffff8024efdc>] lockdep_init_map+0xac/0x41f
> > [<ffffffff802614b4>] add_preempt_count+0x14/0xe4
> > [<ffffffff802f8035>] ext3_get_block+0xc2/0xe4
> > [<ffffffff802aeed3>] __block_prepare_write+0x195/0x442
> > [<ffffffff802f7f73>] ext3_get_block+0x0/0xe4
> > [<ffffffff802af19a>] block_prepare_write+0x1a/0x25
> > [<ffffffff802f93e9>] ext3_prepare_write+0xb2/0x17b
> > [<ffffffff802671b1>] generic_file_buffered_write+0x298/0x646
> > [<ffffffff8023944e>] current_fs_time+0x3b/0x40
> > [<ffffffff802614b4>] add_preempt_count+0x14/0xe4
> > [<ffffffff802678ae>] __generic_file_aio_write_nolock+0x34f/0x3b9
> > [<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
> > [<ffffffff80267964>] generic_file_aio_write+0x4c/0xc4
> > [<ffffffff80267979>] generic_file_aio_write+0x61/0xc4
> > [<ffffffff802fcf18>] ext3_orphan_del+0x53/0x19f
> > [<ffffffff802f5768>] ext3_file_write+0x1c/0x9d
> > [<ffffffff8028ef31>] do_sync_write+0xcc/0x10f
> > [<ffffffff80246f9c>] autoremove_wake_function+0x0/0x2e
> > [<ffffffff8024ecfe>] get_lock_stats+0xe/0x3f
> > [<ffffffff8024ed9a>] lock_release_holdtime+0x41/0x4f
> > [<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
> > [<ffffffff8028dfb1>] sys_fchmod+0xa3/0xbd
> > [<ffffffff8056a717>] _mutex_unlock+0x17/0x20
> > [<ffffffff8028f6cd>] vfs_write+0xb6/0x148
> > [<ffffffff8028fc61>] sys_write+0x48/0x74
> > [<ffffffff8020950e>] system_call+0x7e/0x83
> > [<ffffffffffffffff>] 0xffffffffffffffff
>
> Here we're taking a file's truncate_mutex while holding lock_page() against
> one of its pages. This is the correct lock ranking, I suppose.
>
> This is one of those fairly common cross-inode notabugs, I suspect.

So #0 and #1 are compatible, and build the dependancy:

&ei->truncate_mutex
lock_page_0

Then the part you cut out:

> > stack backtrace:
> >
> > Call Trace:
> > [<ffffffff8024ffbd>] print_circular_bug_tail+0x69/0x72
> > [<ffffffff802503b9>] print_circular_bug_header+0xcc/0xd3
> > [<ffffffff80251a22>] __lock_acquire+0x96e/0xc35
> > [<ffffffff802520c9>] lock_acquire+0x48/0x61
> > [<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
> > [<ffffffff8056a6d4>] _mutex_lock+0x26/0x52

Here we take: &ei->truncate_mutex

> > [<ffffffff802f75e5>] ext3_get_blocks_handle+0x1a4/0x8f7
> > [<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
> > [<ffffffff802504b2>] find_usage_backwards+0xb0/0xd9
> > [<ffffffff80250d7c>] debug_check_no_locks_freed+0x11d/0x129
> > [<ffffffff80250c33>] trace_hardirqs_on_caller+0x115/0x138
> > [<ffffffff8024efdc>] lockdep_init_map+0xac/0x41f
> > [<ffffffff802614b4>] add_preempt_count+0x14/0xe4
> > [<ffffffff802f8035>] ext3_get_block+0xc2/0xe4
> > [<ffffffff802aeed3>] __block_prepare_write+0x195/0x442
> > [<ffffffff802f7f73>] ext3_get_block+0x0/0xe4
> > [<ffffffff802af19a>] block_prepare_write+0x1a/0x25
> > [<ffffffff802f93e9>] ext3_prepare_write+0xb2/0x17b
> > [<ffffffff802671b1>] generic_file_buffered_write+0x298/0x646

while some path from generic_file_buffered_write() took: lock_page_0
probably __grab_cache_page()
> >
> > [<ffffffff8023944e>] current_fs_time+0x3b/0x40
> > [<ffffffff802614b4>] add_preempt_count+0x14/0xe4
> > [<ffffffff802678ae>] __generic_file_aio_write_nolock+0x34f/0x3b9
> > [<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
> > [<ffffffff80267964>] generic_file_aio_write+0x4c/0xc4
> > [<ffffffff80267979>] generic_file_aio_write+0x61/0xc4
> > [<ffffffff802fcf18>] ext3_orphan_del+0x53/0x19f
> > [<ffffffff802f5768>] ext3_file_write+0x1c/0x9d
> > [<ffffffff8028ef31>] do_sync_write+0xcc/0x10f
> > [<ffffffff80246f9c>] autoremove_wake_function+0x0/0x2e
> > [<ffffffff8024ecfe>] get_lock_stats+0xe/0x3f
> > [<ffffffff8024ed9a>] lock_release_holdtime+0x41/0x4f
> > [<ffffffff8024ed3d>] put_lock_stats+0xe/0x2a
> > [<ffffffff8028dfb1>] sys_fchmod+0xa3/0xbd
> > [<ffffffff8056a717>] _mutex_unlock+0x17/0x20
> > [<ffffffff8028f6cd>] vfs_write+0xb6/0x148
> > [<ffffffff8028fc61>] sys_write+0x48/0x74
> > [<ffffffff8020950e>] system_call+0x7e/0x83
> >
Shows the current stacktrace where we violate the previously established
locking order.


2007-07-15 19:59:21

by Andrew Morton

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Sun, 15 Jul 2007 21:21:03 +0200 Peter Zijlstra <[email protected]> wrote:

> Shows the current stacktrace where we violate the previously established
> locking order.

yup, but the lock_page() which we did inside truncate_mutex was a
lock_page() against a different address_space: the blockdev mapping.

So this is OK - we'll never take truncate_mutex against the blockdev
mapping (it doesn't have one, for a start ;))

This is similar to the quite common case where we take inode A's
i_mutex inside inode B's i_mutex, which needs special lockdep annotations.

I think. I haven't looked into this in detail.

2007-07-15 20:14:15

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Sun, 2007-07-15 at 12:59 -0700, Andrew Morton wrote:
> On Sun, 15 Jul 2007 21:21:03 +0200 Peter Zijlstra <[email protected]> wrote:
>
> > Shows the current stacktrace where we violate the previously established
> > locking order.
>
> yup, but the lock_page() which we did inside truncate_mutex was a
> lock_page() against a different address_space: the blockdev mapping.
>
> So this is OK - we'll never take truncate_mutex against the blockdev
> mapping (it doesn't have one, for a start ;))
>
> This is similar to the quite common case where we take inode A's
> i_mutex inside inode B's i_mutex, which needs special lockdep annotations.
>
> I think. I haven't looked into this in detail.

Right, I can make lock_page classes per address space. Lets see if this
one goes away.

2007-07-16 23:52:03

by Mingming Cao

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Fri, 2007-07-13 at 02:05 -0700, Andrew Morton wrote:
> On Tue, 10 Jul 2007 16:32:47 -0700 Andrew Morton <[email protected]> wrote:
>
> > > + brelse(bh);
> > > + up_write(&EXT4_I(inode)->xattr_sem);
> > > + return error;
> > > +}
> > > +
> >
> > We're doing GFP_KERNEL memory allocations while holding xattr_sem. This
> > can cause the VM to reenter the filesystem, perhaps taking i_mutex and/or
> > i_truncate_sem and/or journal_start() (I forget whether this still
> > happens). Have we checked whether this can occur and if so, whether we are
> > OK from a lock ranking POV? Bear in mind that journalled-data mode is more
> > complex in this regard.
>
> I notice that everyone carefully avoided addressing this ;)

I am not sure why we need GFP_KERNEL flag here. I think we should use
GFP_NOFS instead. The following patch use the GFP_NOFS flag, as well as
fixing memory leak issue introduced by the ext4 expand inode extra isize
patch.

Fixing memory allocation issue with expand inode extra isize patch.

- use GFP_NOFS instead of GFP_KERNEL flag for memory allocation
- use kzalloc instead of kmalloc
- fix memory leak in the success case, at the end of while loop.


Signed-off-by: Mingming Cao <[email protected]>
---
fs/ext4/xattr.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)

Index: linux-2.6.22/fs/ext4/xattr.c
===================================================================
--- linux-2.6.22.orig/fs/ext4/xattr.c 2007-07-16 16:12:18.000000000 -0700
+++ linux-2.6.22/fs/ext4/xattr.c 2007-07-16 16:35:59.000000000 -0700
@@ -1204,8 +1204,8 @@ retry:
unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
unsigned int min_total_size = ~0U;

- is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_KERNEL);
- bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_KERNEL);
+ is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
+ bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
if (!is || !bs) {
error = -ENOMEM;
goto cleanup;
@@ -1251,8 +1251,8 @@ retry:
size = le32_to_cpu(entry->e_value_size);
entry_size = EXT4_XATTR_LEN(entry->e_name_len);
i.name_index = entry->e_name_index,
- buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_KERNEL);
- b_entry_name = kmalloc(entry->e_name_len + 1, GFP_KERNEL);
+ buffer = kzalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
+ b_entry_name = kzalloc(entry->e_name_len + 1, GFP_NOFS);
if (!buffer || !b_entry_name) {
error = -ENOMEM;
goto cleanup;
@@ -1302,7 +1302,15 @@ retry:
error = ext4_xattr_block_set(handle, inode, &i, bs);
if (error)
goto cleanup;
+ kfree(b_entry_name);
+ kfree(buffer);
+ brelse(is->iloc.bh);
+ kfree(is);
+ kfree(bs);
+ brelse(bh);
}
+ up_write(&EXT4_I(inode)->xattr_sem);
+ return 0;

cleanup:
kfree(b_entry_name);

2007-07-17 00:06:36

by Andreas Dilger

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Jul 16, 2007 16:52 -0700, Mingming Cao wrote:
> I am not sure why we need GFP_KERNEL flag here. I think we should use
> GFP_NOFS instead. The following patch use the GFP_NOFS flag, as well as
> fixing memory leak issue introduced by the ext4 expand inode extra isize
> patch.
>
> Fixing memory allocation issue with expand inode extra isize patch.
>
> - use GFP_NOFS instead of GFP_KERNEL flag for memory allocation
> - use kzalloc instead of kmalloc

This doesn't need kzalloc() for buffer and b_entry_name, since they are
immediately overwritten by memcpy().

> - fix memory leak in the success case, at the end of while loop.
> goto cleanup;
> @@ -1302,7 +1302,15 @@ retry:
> error = ext4_xattr_block_set(handle, inode, &i, bs);
> if (error)
> goto cleanup;
> + kfree(b_entry_name);
> + kfree(buffer);
> + brelse(is->iloc.bh);
> + kfree(is);
> + kfree(bs);
> + brelse(bh);
> }
> + up_write(&EXT4_I(inode)->xattr_sem);
> + return 0;
>
> cleanup:
> kfree(b_entry_name);

I don't think you should have brelse(bh) inside the loop, since it is
allocated before the loop starts.

Cheers, Andreas
--
Andreas Dilger
Principal Software Engineer
Cluster File Systems, Inc.

2007-07-17 00:24:37

by Mingming Cao

[permalink] [raw]
Subject: Re: [EXT4 set 5][PATCH 1/1] expand inode i_extra_isize to support features in larger inode

On Mon, 2007-07-16 at 18:06 -0600, Andreas Dilger wrote:
> On Jul 16, 2007 16:52 -0700, Mingming Cao wrote:
> > I am not sure why we need GFP_KERNEL flag here. I think we should use
> > GFP_NOFS instead. The following patch use the GFP_NOFS flag, as well as
> > fixing memory leak issue introduced by the ext4 expand inode extra isize
> > patch.
> >
> > Fixing memory allocation issue with expand inode extra isize patch.
> >
> > - use GFP_NOFS instead of GFP_KERNEL flag for memory allocation
> > - use kzalloc instead of kmalloc
>
> This doesn't need kzalloc() for buffer and b_entry_name, since they are
> immediately overwritten by memcpy().
>
ok.
> > - fix memory leak in the success case, at the end of while loop.
> > goto cleanup;
> > @@ -1302,7 +1302,15 @@ retry:
> > error = ext4_xattr_block_set(handle, inode, &i, bs);
> > if (error)
> > goto cleanup;
> > + kfree(b_entry_name);
> > + kfree(buffer);
> > + brelse(is->iloc.bh);
> > + kfree(is);
> > + kfree(bs);
> > + brelse(bh);
> > }
> > + up_write(&EXT4_I(inode)->xattr_sem);
> > + return 0;
> >
> > cleanup:
> > kfree(b_entry_name);
>
> I don't think you should have brelse(bh) inside the loop, since it is
> allocated before the loop starts.
>
Ahh right. thanks.

Updated fix.

Fixing memory allocation issue with expand inode extra isize patch.

- use GFP_NOFS instead of GFP_KERNEL to prevent fs reenter
- fix memory leak in the success case, at the end of while loop.


Signed-off-by: Mingming Cao <[email protected]>
---
fs/ext4/xattr.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)

Index: linux-2.6.22/fs/ext4/xattr.c
===================================================================
--- linux-2.6.22.orig/fs/ext4/xattr.c 2007-07-16 17:21:14.000000000 -0700
+++ linux-2.6.22/fs/ext4/xattr.c 2007-07-16 17:22:25.000000000 -0700
@@ -1204,8 +1204,8 @@ retry:
unsigned int shift_bytes; /* No. of bytes to shift EAs by? */
unsigned int min_total_size = ~0U;

- is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_KERNEL);
- bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_KERNEL);
+ is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
+ bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
if (!is || !bs) {
error = -ENOMEM;
goto cleanup;
@@ -1251,8 +1251,8 @@ retry:
size = le32_to_cpu(entry->e_value_size);
entry_size = EXT4_XATTR_LEN(entry->e_name_len);
i.name_index = entry->e_name_index,
- buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_KERNEL);
- b_entry_name = kmalloc(entry->e_name_len + 1, GFP_KERNEL);
+ buffer = kmalloc(EXT4_XATTR_SIZE(size), GFP_NOFS);
+ b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
if (!buffer || !b_entry_name) {
error = -ENOMEM;
goto cleanup;
@@ -1302,7 +1302,15 @@ retry:
error = ext4_xattr_block_set(handle, inode, &i, bs);
if (error)
goto cleanup;
+ kfree(b_entry_name);
+ kfree(buffer);
+ brelse(is->iloc.bh);
+ kfree(is);
+ kfree(bs);
}
+ brelse(bh);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ return 0;

cleanup:
kfree(b_entry_name);