From: Alex Tomas Subject: [RFC] basic delayed allocation in ext4 Date: Thu, 26 Jul 2007 13:00:04 +0400 Message-ID: <46A86294.6050608@clusterfs.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit To: ext4 development , linux-fsdevel@vger.kernel.org Return-path: Received: from mail.chehov.net ([80.71.245.247]:50374 "EHLO mail.rialcom.ru" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1764556AbXGZJA2 (ORCPT ); Thu, 26 Jul 2007 05:00:28 -0400 Sender: linux-ext4-owner@vger.kernel.org List-Id: linux-ext4.vger.kernel.org Good day, please review ... thanks, Alex Basic delayed allocation in ext4 Two special ->get_block() methods are introduced: * ext4_da_get_block_prep() to be used with ->prepare_write(), defers allocation till flush * ext4_da_get_block_write() to be used with mpage_da_writepages(), allocates blocks and corrects the on-disk size The current implementation works with data=writeback only; you should mount the filesystem with the delalloc,data=writeback options. 
TODO: * reservation * data=ordered * quota * bmap Signed-off-by: Alex Tomas Index: linux-2.6.22/include/linux/ext4_fs.h =================================================================== --- linux-2.6.22.orig/include/linux/ext4_fs.h 2007-07-26 12:30:25.000000000 +0400 +++ linux-2.6.22/include/linux/ext4_fs.h 2007-07-26 12:32:04.000000000 +0400 @@ -488,6 +488,7 @@ do { \ #define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_DELALLOC 0x2000000 /* Delalloc support */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt Index: linux-2.6.22/fs/ext4/super.c =================================================================== --- linux-2.6.22.orig/fs/ext4/super.c 2007-07-26 12:30:25.000000000 +0400 +++ linux-2.6.22/fs/ext4/super.c 2007-07-26 12:32:04.000000000 +0400 @@ -728,7 +728,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_extents, Opt_noextents, + Opt_grpquota, Opt_extents, Opt_noextents, Opt_delalloc, }; static match_table_t tokens = { @@ -782,6 +782,7 @@ static match_table_t tokens = { {Opt_barrier, "barrier=%u"}, {Opt_extents, "extents"}, {Opt_noextents, "noextents"}, + {Opt_delalloc, "delalloc"}, {Opt_err, NULL}, {Opt_resize, "resize"}, }; @@ -1127,6 +1128,9 @@ clear_qf_name: case Opt_noextents: clear_opt (sbi->s_mount_opt, EXTENTS); break; + case Opt_delalloc: + set_opt (sbi->s_mount_opt, DELALLOC); + break; default: printk (KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " Index: linux-2.6.22/fs/ext4/inode.c =================================================================== --- linux-2.6.22.orig/fs/ext4/inode.c 2007-07-26 
12:30:22.000000000 +0400 +++ linux-2.6.22/fs/ext4/inode.c 2007-07-26 12:32:04.000000000 +0400 @@ -39,6 +39,8 @@ #include "xattr.h" #include "acl.h" +static void ext4_invalidatepage(struct page *page, unsigned long offset); + /* * Test whether an inode is a fast symlink. */ @@ -1291,6 +1293,142 @@ static int ext4_journalled_commit_write( } /* + * this is a special get_block callback for ->prepare_write() only; create must be nonzero + * its intention is to return the mapped block or, if none is allocated yet, to mark the buffer new and delayed (reserve space) + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* first, we need to know whether the block is allocated already + * XXX: when the filesystem has a lot of free blocks, we could + * reserve even allocated blocks to save this lookup */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0); + if (ret >= 0) { + if (buffer_mapped(bh_result)) { + bh_result->b_size = (ret << inode->i_blkbits); + } else { + /* OK, the block isn't allocated yet, let's reserve space */ + /* XXX: call reservation here */ + /* XXX: __block_prepare_write() unmaps passed block, is it OK? 
*/ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } + ret = 0; + } + + return ret; +} + + +static int ext4_da_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + return block_prepare_write(page, from, to, ext4_da_get_block_prep); +} + +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret, needed_blocks = ext4_writepage_trans_blocks(inode); + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + if (create) { + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + } + + ret = ext4_get_blocks_wrap(handle, inode, iblock, + max_blocks, bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + + /* + * Update on-disk size along with block allocation; the new size is capped at i_size below + * we don't use 'extend_disksize' as size may change + * within an already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + /* + * i_disksize is re-checked and updated under truncate_mutex; XXX: replace with spinlock if seen contended -bzzz + */ + mutex_lock(&EXT4_I(inode)->truncate_mutex); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + mutex_unlock(&EXT4_I(inode)->truncate_mutex); + + if (EXT4_I(inode)->i_disksize == disksize) { + if (handle == NULL) + handle = ext4_journal_start(inode, 1); + if (!IS_ERR(handle)) + ext4_mark_inode_dirty(handle, inode); /* NOTE(review): a failed ext4_journal_start() here and the result of ext4_mark_inode_dirty() are ignored, 0 is still returned; confirm this is intended */ + } + } + + ret = 0; + } + +out: + if (handle && !IS_ERR(handle)) + ext4_journal_stop(handle); + + return ret; +} + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write); +} + +static void 
ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + /* + * Drop reserved blocks (for now only the delay flag is cleared) + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + bh = head = page_buffers(page); + do { + unsigned int next_off = curr_off + bh->b_size; + + /* + * is this block fully invalidated, i.e. does it start at or after offset? + */ + if (offset <= curr_off && buffer_delay(bh)) { + clear_buffer_delay(bh); + /* XXX: add real stuff here */ + } + curr_off = next_off; + bh = bh->b_this_page; + } while (bh != head); + +out: + ext4_invalidatepage(page, offset); + + return; +} + + +/* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * @@ -1741,10 +1879,28 @@ static const struct address_space_operat .releasepage = ext4_releasepage, }; +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writeback_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .prepare_write = ext4_da_prepare_write, + .commit_write = generic_commit_write, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, +}; + void ext4_set_aops(struct inode *inode) { if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; + else if (ext4_should_writeback_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; else if (ext4_should_writeback_data(inode)) inode->i_mapping->a_ops = &ext4_writeback_aops; else