From: Andreas Dilger Subject: Re: [PATCH 5/8] ext4: call ext4_forget() from ext4_free_blocks() Date: Mon, 23 Nov 2009 12:22:29 -0700 Message-ID: References: <1258942710-31930-1-git-send-email-tytso@mit.edu> <1258942710-31930-6-git-send-email-tytso@mit.edu> Mime-Version: 1.0 Content-Type: text/plain; CHARSET=US-ASCII; delsp=yes; format=flowed Content-Transfer-Encoding: 7BIT Cc: Ext4 Developers List , "Aneesh Kumar K.V" , Curt Wohlgemuth To: "Theodore Ts'o" Return-path: Received: from sca-es-mail-1.Sun.COM ([192.18.43.132]:32832 "EHLO sca-es-mail-1.sun.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753568AbZKWTWT (ORCPT ); Mon, 23 Nov 2009 14:22:19 -0500 Received: from fe-sfbay-09.sun.com ([192.18.43.129]) by sca-es-mail-1.sun.com (8.13.7+Sun/8.12.9) with ESMTP id nANJMPkD021330 for ; Mon, 23 Nov 2009 11:22:25 -0800 (PST) Received: from conversion-daemon.fe-sfbay-09.sun.com by fe-sfbay-09.sun.com (Sun Java(tm) System Messaging Server 7u2-7.04 64bit (built Jul 2 2009)) id <0KTK00I00TPPD300@fe-sfbay-09.sun.com> for linux-ext4@vger.kernel.org; Mon, 23 Nov 2009 11:22:24 -0800 (PST) In-reply-to: <1258942710-31930-6-git-send-email-tytso@mit.edu> Sender: linux-ext4-owner@vger.kernel.org List-ID: On 2009-11-22, at 19:18, Theodore Ts'o wrote: > Add the facility for ext4_forget() to be called form > ext4_free_blocks(). This simplifies the code in a large number of > places, and centralizes most of the work of calling ext4_forget() into > a single place. > > Also fix a bug in the extents migration code; it wasn't calling > ext4_forget() when releasing the indirect blocks during the > conversion. Would this also solve Curt's problem, mentioned in "Bug in extent zeroout: blocks not marked as new" where bforget was not being called when the blocks are freed? > As a result, if the system cashed during or shortly after the > extents migration, and the released indirect blocks get reused as > data blocks, the journal replay would corrupt the data blocks. > With this new patch, fixing this bug was as simple as adding the > EXT4_FREE_BLOCKS_FORGET flags to the call to ext4_free_blocks(). > > Signed-off-by: "Theodore Ts'o" > Cc: "Aneesh Kumar K.V" > --- > fs/ext4/ext4.h | 10 +++++- > fs/ext4/extents.c | 24 ++++++--------- > fs/ext4/inode.c | 67 +++++++++++++++ > +-------------------------- > fs/ext4/mballoc.c | 49 +++++++++++++++++++++++-------- > fs/ext4/migrate.c | 23 ++++++++++---- > fs/ext4/xattr.c | 8 +++-- > include/trace/events/ext4.h | 16 ++++++---- > 7 files changed, 109 insertions(+), 88 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 210e1b5..4cfc2f0 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -376,6 +376,12 @@ struct ext4_new_group_data { > EXT4_GET_BLOCKS_DIO_CREATE_EXT) > > /* > + * Flags used by ext4_free_blocks > + */ > +#define EXT4_FREE_BLOCKS_METADATA 0x0001 > +#define EXT4_FREE_BLOCKS_FORGET 0x0002 > + > +/* > * ioctl commands > */ > #define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS > @@ -1384,8 +1390,8 @@ extern void ext4_discard_preallocations(struct > inode *); > extern int __init init_ext4_mballoc(void); > extern void exit_ext4_mballoc(void); > extern void ext4_free_blocks(handle_t *handle, struct inode *inode, > - ext4_fsblk_t block, unsigned long count, > - int metadata); > + struct buffer_head *bh, ext4_fsblk_t block, > + unsigned long count, int flags); > extern int ext4_mb_add_groupinfo(struct super_block *sb, > ext4_group_t i, struct ext4_group_desc *desc); > extern int ext4_mb_get_buddy_cache_lock(struct super_block *, > ext4_group_t); > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index 74dcff8..2c4a932 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -1007,7 +1007,8 @@ cleanup: > for (i = 0; i < depth; i++) { > if (!ablocks[i]) > continue; > - ext4_free_blocks(handle, inode, ablocks[i], 1, 1); > + ext4_free_blocks(handle, inode, 0, ablocks[i], 1, > + EXT4_FREE_BLOCKS_METADATA); > } > } > kfree(ablocks); > @@ -1957,7 +1958,6 @@ errout: > static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, > struct ext4_ext_path *path) > { > - struct buffer_head *bh; > int err; > ext4_fsblk_t leaf; > > @@ -1973,9 +1973,8 @@ static int ext4_ext_rm_idx(handle_t *handle, > struct inode *inode, > if (err) > return err; > ext_debug("index is empty, remove it, free block %llu\n", leaf); > - bh = sb_find_get_block(inode->i_sb, leaf); > - ext4_forget(handle, 1, inode, bh, leaf); > - ext4_free_blocks(handle, inode, leaf, 1, 1); > + ext4_free_blocks(handle, inode, 0, leaf, 1, > + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); > return err; > } > > @@ -2042,12 +2041,11 @@ static int ext4_remove_blocks(handle_t > *handle, struct inode *inode, > struct ext4_extent *ex, > ext4_lblk_t from, ext4_lblk_t to) > { > - struct buffer_head *bh; > unsigned short ee_len = ext4_ext_get_actual_len(ex); > - int i, metadata = 0; > + int flags = EXT4_FREE_BLOCKS_FORGET; > > if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) > - metadata = 1; > + flags |= EXT4_FREE_BLOCKS_METADATA; > #ifdef EXTENTS_STATS > { > struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > @@ -2072,11 +2070,7 @@ static int ext4_remove_blocks(handle_t > *handle, struct inode *inode, > num = le32_to_cpu(ex->ee_block) + ee_len - from; > start = ext_pblock(ex) + ee_len - num; > ext_debug("free last %u blocks starting %llu\n", num, start); > - for (i = 0; i < num; i++) { > - bh = sb_find_get_block(inode->i_sb, start + i); > - ext4_forget(handle, metadata, inode, bh, start + i); > - } > - ext4_free_blocks(handle, inode, start, num, metadata); > + ext4_free_blocks(handle, inode, 0, start, num, flags); > } else if (from == le32_to_cpu(ex->ee_block) > && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { > printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", > @@ -3319,8 +3313,8 @@ int ext4_ext_get_blocks(handle_t *handle, > struct inode *inode, > /* not a good idea to call discard here directly, > * but otherwise we'd need to call it every free() */ > ext4_discard_preallocations(inode); > - ext4_free_blocks(handle, inode, ext_pblock(&newex), > - ext4_ext_get_actual_len(&newex), 0); > + ext4_free_blocks(handle, inode, 0, ext_pblock(&newex), > + ext4_ext_get_actual_len(&newex), 0); > goto out2; > } > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 72c6943..3b28e1f 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -669,7 +669,7 @@ allocated: > return ret; > failed_out: > for (i = 0; i < index; i++) > - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); > + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); > return ret; > } > > @@ -765,20 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, > struct inode *inode, > return err; > failed: > /* Allocation failed, free what we already allocated */ > + ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); > for (i = 1; i <= n ; i++) { > - BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); > /* > - * Note: is_metadata is 0 because branch[i].bh is > - * newly allocated, so there is no need to revoke the > - * block. If we do, it's harmless, but not necessary. > + * branch[i].bh is newly allocated, so there is no > + * need to revoke the block, which is why we don't > + * need to set EXT4_FREE_BLOCKS_METADATA. > */ > - ext4_forget(handle, 0, inode, branch[i].bh, > - branch[i].bh->b_blocknr); > + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, > + EXT4_FREE_BLOCKS_FORGET); > } > - for (i = 0; i < indirect_blks; i++) > - ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); > + for (i = n+1; i < indirect_blks; i++) > + ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); > > - ext4_free_blocks(handle, inode, new_blocks[i], num, 0); > + ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); > > return err; > } > @@ -857,18 +857,16 @@ static int ext4_splice_branch(handle_t > *handle, struct inode *inode, > > err_out: > for (i = 1; i <= num; i++) { > - BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); > /* > - * Note: is_metadata is 0 because branch[i].bh is > - * newly allocated, so there is no need to revoke the > - * block. If we do, it's harmless, but not necessary. > + * branch[i].bh is newly allocated, so there is no > + * need to revoke the block, which is why we don't > + * need to set EXT4_FREE_BLOCKS_METADATA. > */ > - ext4_forget(handle, 0, inode, where[i].bh, > - where[i].bh->b_blocknr); > - ext4_free_blocks(handle, inode, > - le32_to_cpu(where[i-1].key), 1, 0); > + ext4_free_blocks(handle, inode, where[i].bh, 0, 1, > + EXT4_FREE_BLOCKS_FORGET); > } > - ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, > 0); > + ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), > + blks, 0); > > return err; > } > @@ -4080,7 +4078,10 @@ static void ext4_clear_blocks(handle_t > *handle, struct inode *inode, > __le32 *last) > { > __le32 *p; > - int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); > + int flags = EXT4_FREE_BLOCKS_FORGET; > + > + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) > + flags |= EXT4_FREE_BLOCKS_METADATA; > > if (try_to_extend_transaction(handle, inode)) { > if (bh) { > @@ -4096,27 +4097,10 @@ static void ext4_clear_blocks(handle_t > *handle, struct inode *inode, > } > } > > - /* > - * Any buffers which are on the journal will be in memory. We > - * find them on the hash table so jbd2_journal_revoke() will > - * run jbd2_journal_forget() on them. We've already detached > - * each block from the file, so bforget() in > - * jbd2_journal_forget() should be safe. > - * > - * AKPM: turn on bforget in jbd2_journal_forget()!!! > - */ > - for (p = first; p < last; p++) { > - u32 nr = le32_to_cpu(*p); > - if (nr) { > - struct buffer_head *tbh; > - > - *p = 0; > - tbh = sb_find_get_block(inode->i_sb, nr); > - ext4_forget(handle, is_metadata, inode, tbh, nr); > - } > - } > + for (p = first; p < last; p++) > + *p = 0; > > - ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); > + ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); > } > > /** > @@ -4304,7 +4288,8 @@ static void ext4_free_branches(handle_t > *handle, struct inode *inode, > blocks_for_truncate(inode)); > } > > - ext4_free_blocks(handle, inode, nr, 1, 1); > + ext4_free_blocks(handle, inode, 0, nr, 1, > + EXT4_FREE_BLOCKS_METADATA); > > if (parent_bh) { > /* > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c > index 0dca90b..78de5d3 100644 > --- a/fs/ext4/mballoc.c > +++ b/fs/ext4/mballoc.c > @@ -4436,8 +4436,8 @@ ext4_mb_free_metadata(handle_t *handle, struct > ext4_buddy *e4b, > * @metadata: Are these metadata blocks > */ > void ext4_free_blocks(handle_t *handle, struct inode *inode, > - ext4_fsblk_t block, unsigned long count, > - int metadata) > + struct buffer_head *bh, ext4_fsblk_t block, > + unsigned long count, int flags) > { > struct buffer_head *bitmap_bh = NULL; > struct super_block *sb = inode->i_sb; > @@ -4454,15 +4454,12 @@ void ext4_free_blocks(handle_t *handle, > struct inode *inode, > int err = 0; > int ret; > > - /* > - * We need to make sure we don't reuse the freed block until > - * after the transaction is committed, which we can do by > - * treating the block as metadata, below. We make an > - * exception if the inode is to be written in writeback mode > - * since writeback mode has weak data consistency guarantees. > - */ > - if (!ext4_should_writeback_data(inode)) > - metadata = 1; > + if (bh) { > + if (block) > + BUG_ON(block != bh->b_blocknr); > + else > + block = bh->b_blocknr; > + } > > sbi = EXT4_SB(sb); > es = EXT4_SB(sb)->s_es; > @@ -4476,7 +4473,32 @@ void ext4_free_blocks(handle_t *handle, > struct inode *inode, > } > > ext4_debug("freeing block %llu\n", block); > - trace_ext4_free_blocks(inode, block, count, metadata); > + trace_ext4_free_blocks(inode, block, count, flags); > + > + if (flags & EXT4_FREE_BLOCKS_FORGET) { > + struct buffer_head *tbh = bh; > + int i; > + > + BUG_ON(bh && (count > 1)); > + > + for (i = 0; i < count; i++) { > + if (!bh) > + tbh = sb_find_get_block(inode->i_sb, > + block + i); > + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, > + inode, tbh, block + i); > + } > + } > + > + /* > + * We need to make sure we don't reuse the freed block until > + * after the transaction is committed, which we can do by > + * treating the block as metadata, below. We make an > + * exception if the inode is to be written in writeback mode > + * since writeback mode has weak data consistency guarantees. > + */ > + if (!ext4_should_writeback_data(inode)) > + flags |= EXT4_FREE_BLOCKS_METADATA; > > ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); > if (ac) { > @@ -4552,7 +4574,8 @@ do_more: > err = ext4_mb_load_buddy(sb, block_group, &e4b); > if (err) > goto error_return; > - if (metadata && ext4_handle_valid(handle)) { > + > + if ((flags & EXT4_FREE_BLOCKS_METADATA) && > ext4_handle_valid(handle)) { > struct ext4_free_data *new_entry; > /* > * blocks being freed are metadata. these blocks shouldn't > diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c > index a93d5b8..d641e13 100644 > --- a/fs/ext4/migrate.c > +++ b/fs/ext4/migrate.c > @@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle, > for (i = 0; i < max_entries; i++) { > if (tmp_idata[i]) { > extend_credit_for_blkdel(handle, inode); > - ext4_free_blocks(handle, inode, > - le32_to_cpu(tmp_idata[i]), 1, 1); > + ext4_free_blocks(handle, inode, 0, > + le32_to_cpu(tmp_idata[i]), 1, > + EXT4_FREE_BLOCKS_METADATA | > + EXT4_FREE_BLOCKS_FORGET); > } > } > put_bh(bh); > extend_credit_for_blkdel(handle, inode); > - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); > + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, > + EXT4_FREE_BLOCKS_METADATA | > + EXT4_FREE_BLOCKS_FORGET); > return 0; > } > > @@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle, > } > put_bh(bh); > extend_credit_for_blkdel(handle, inode); > - ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); > + ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, > + EXT4_FREE_BLOCKS_METADATA | > + EXT4_FREE_BLOCKS_FORGET); > return 0; > } > > @@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, > struct inode *inode, __le32 *i_data) > /* ei->i_data[EXT4_IND_BLOCK] */ > if (i_data[0]) { > extend_credit_for_blkdel(handle, inode); > - ext4_free_blocks(handle, inode, > - le32_to_cpu(i_data[0]), 1, 1); > + ext4_free_blocks(handle, inode, 0, > + le32_to_cpu(i_data[0]), 1, > + EXT4_FREE_BLOCKS_METADATA | > + EXT4_FREE_BLOCKS_FORGET); > } > > /* ei->i_data[EXT4_DIND_BLOCK] */ > @@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct > inode *inode, > } > put_bh(bh); > extend_credit_for_blkdel(handle, inode); > - ext4_free_blocks(handle, inode, block, 1, 1); > + ext4_free_blocks(handle, inode, 0, block, 1, > + EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); > return retval; > } > > diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c > index 0257019..910bf9a 100644 > --- a/fs/ext4/xattr.c > +++ b/fs/ext4/xattr.c > @@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle, > struct inode *inode, > ea_bdebug(bh, "refcount now=0; freeing"); > if (ce) > mb_cache_entry_free(ce); > - ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1); > get_bh(bh); > - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); > + ext4_free_blocks(handle, inode, bh, 0, 1, > + EXT4_FREE_BLOCKS_METADATA | > + EXT4_FREE_BLOCKS_FORGET); > } else { > le32_add_cpu(&BHDR(bh)->h_refcount, -1); > error = ext4_handle_dirty_metadata(handle, inode, bh); > @@ -832,7 +833,8 @@ inserted: > new_bh = sb_getblk(sb, block); > if (!new_bh) { > getblk_failed: > - ext4_free_blocks(handle, inode, block, 1, 1); > + ext4_free_blocks(handle, inode, 0, block, 1, > + EXT4_FREE_BLOCKS_METADATA); > error = -EIO; > goto cleanup; > } > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h > index b390e1f..74f628b 100644 > --- a/include/trace/events/ext4.h > +++ b/include/trace/events/ext4.h > @@ -650,30 +650,32 @@ TRACE_EVENT(ext4_allocate_blocks, > > TRACE_EVENT(ext4_free_blocks, > TP_PROTO(struct inode *inode, __u64 block, unsigned long count, > - int metadata), > + int flags), > > - TP_ARGS(inode, block, count, metadata), > + TP_ARGS(inode, block, count, flags), > > TP_STRUCT__entry( > __field( dev_t, dev ) > __field( ino_t, ino ) > + __field( umode_t, mode ) > __field( __u64, block ) > __field( unsigned long, count ) > - __field( int, metadata ) > - > + __field( int, flags ) > ), > > TP_fast_assign( > __entry->dev = inode->i_sb->s_dev; > __entry->ino = inode->i_ino; > + __entry->mode = inode->i_mode; > __entry->block = block; > __entry->count = count; > - __entry->metadata = metadata; > + __entry->flags = flags; > ), > > - TP_printk("dev %s ino %lu block %llu count %lu metadata %d", > + TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d", > jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, > - __entry->block, __entry->count, __entry->metadata) > + __entry->mode, __entry->block, __entry->count, > + __entry->flags) > ); > > TRACE_EVENT(ext4_sync_file, > -- > 1.6.5.216.g5288a.dirty Cheers, Andreas -- Andreas Dilger Sr. Staff Engineer, Lustre Group Sun Microsystems of Canada, Inc.