From: Akira Fujita
Subject: [RFC][PATCH 2/8] read and write file data with memory page
Date: Fri, 04 Apr 2008 20:19:26 +0900
Message-ID: <47F60EBE.7060703@rs.jp.nec.com>
To: linux-ext4@vger.kernel.org, Theodore Tso, Mingming Cao,
        "Aneesh Kumar K.V"
Cc: linux-fsdevel@vger.kernel.org, Akira Fujita

ext4: online defrag -- Read and write file data with memory page

From: Akira Fujita

Read the file data from the old blocks into the page and write the file
data on the page into the new blocks.

Signed-off-by: Akira Fujita
Signed-off-by: Takashi Sato
---
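Note (not part of the change itself): every page-to-block conversion in
this patch follows the same pattern,

        block index = page index << (PAGE_CACHE_SHIFT - inode->i_blkbits)

which assumes the filesystem block size is not larger than the page size.
A small stand-alone sketch of that arithmetic; the 4KB-page and 1KB-block
values below are illustrative only, not taken from this patch:

        #include <stdio.h>

        int main(void)
        {
                unsigned int page_shift = 12;   /* PAGE_CACHE_SHIFT, 4KB pages */
                unsigned int blkbits = 10;      /* inode->i_blkbits, 1KB blocks */
                unsigned long page_index = 3;

                /* One page covers 1 << (page_shift - blkbits) blocks
                 * (4 here), so page 3 starts at block 12. */
                unsigned long block = page_index << (page_shift - blkbits);

                printf("page %lu starts at block %lu\n", page_index, block);
                return 0;
        }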
 fs/ext4/defrag.c |  466 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ext4.h   |    2 +
 fs/ext4/inode.c  |    3 +-
 3 files changed, 469 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index 5cdf610..a03da84 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -270,3 +270,469 @@ ext4_defrag_merge_extents(handle_t *handle, struct inode *inode,
 	return 0;
 }
+
+/**
+ * ext4_defrag_leaf_block - Defragmentation for one leaf extent block
+ *
+ * @handle	journal handle
+ * @org_inode	target inode
+ * @org_path	path indicating the first extent to be defragged
+ * @dext	destination extent
+ * @from	start offset on the target file
+ *
+ * This function returns 0 if it succeeds, otherwise returns an error value.
+ */
+static int
+ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
+		struct ext4_ext_path *org_path, struct ext4_extent *dext,
+		ext4_lblk_t *from)
+{
+	unsigned long depth;
+	ext4_fsblk_t replaced = 0;
+	struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
+	struct ext4_extent new_ext, start_ext, end_ext;
+	ext4_lblk_t new_end, lblock;
+	unsigned short len;
+	ext4_fsblk_t new_phys_end;
+	int ret;
+
+	depth = ext_depth(org_inode);
+	start_ext.ee_len = end_ext.ee_len = 0;
+	o_start = o_end = oext = org_path[depth].p_ext;
+	ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+	new_ext.ee_len = dext->ee_len;
+	len = le16_to_cpu(new_ext.ee_len);
+	new_ext.ee_block = cpu_to_le32(*from);
+	lblock = le32_to_cpu(oext->ee_block);
+	new_end = le32_to_cpu(new_ext.ee_block)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+	new_phys_end = ext_pblock(&new_ext)
+		+ le16_to_cpu(new_ext.ee_len) - 1;
+
+	/*
+	 * First original extent
+	 * dest   |---------------|
+	 * org  |---------------|
+	 */
+	if (le32_to_cpu(new_ext.ee_block) >
+		le32_to_cpu(oext->ee_block) &&
+	    le32_to_cpu(new_ext.ee_block) <
+		le32_to_cpu(oext->ee_block)
+		+ le16_to_cpu(oext->ee_len)) {
+		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block)
+			- le32_to_cpu(oext->ee_block));
+		replaced += le16_to_cpu(oext->ee_len)
+			- le16_to_cpu(start_ext.ee_len);
+	} else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
+		/* We can merge with the previous extent. */
+		prev_ext = oext - 1;
+		if (((ext_pblock(prev_ext) + le16_to_cpu(prev_ext->ee_len))
+			== ext_pblock(&new_ext))
+		    && (le32_to_cpu(prev_ext->ee_block)
+			+ le16_to_cpu(prev_ext->ee_len)
+			== le32_to_cpu(new_ext.ee_block))) {
+			o_start = prev_ext;
+			start_ext.ee_len = cpu_to_le16(
+				le16_to_cpu(prev_ext->ee_len)
+				+ le16_to_cpu(new_ext.ee_len));
+			new_ext.ee_len = 0;
+		}
+	}
+
+	for (;;) {
+		/* The extent for the destination must be found. */
+		BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
+		lblock += le16_to_cpu(oext->ee_len);
+
+		/*
+		 * Middle of original extent
+		 * dest |-------------------|
+		 * org    |-----------------|
+		 */
+		if (le32_to_cpu(new_ext.ee_block) <=
+			le32_to_cpu(oext->ee_block) &&
+		    new_end >= le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) - 1)
+			replaced += le16_to_cpu(oext->ee_len);
+
+		/*
+		 * Last original extent
+		 * dest |----------------|
+		 * org      |---------------|
+		 */
+		if (new_end >= le32_to_cpu(oext->ee_block) &&
+		    new_end < le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) - 1) {
+			end_ext.ee_len
+				= cpu_to_le16(le32_to_cpu(oext->ee_block)
+				+ le16_to_cpu(oext->ee_len) - 1 - new_end);
+			ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len)));
+			end_ext.ee_block
+				= cpu_to_le32(le32_to_cpu(o_end->ee_block)
+				+ le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len));
+			replaced += le16_to_cpu(oext->ee_len)
+				- le16_to_cpu(end_ext.ee_len);
+		}
+
+		/*
+		 * We have reached the last extent in this leaf or the end
+		 * of the destination extent (the number of replaced blocks
+		 * reaches dext->ee_len), so merge what has been collected.
+		 */
+		if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
+		    new_end <= le32_to_cpu(oext->ee_block)
+			+ le16_to_cpu(oext->ee_len) - 1) {
+			ret = ext4_defrag_merge_extents(handle, org_inode,
+				org_path, o_start, o_end, &start_ext,
+				&new_ext, &end_ext, replaced);
+			if (ret < 0)
+				return ret;
+
+			/* All expected blocks are replaced */
+			if (le16_to_cpu(new_ext.ee_len) <= 0) {
+				if (DQUOT_ALLOC_BLOCK(org_inode, len))
+					return -EDQUOT;
+				return 0;
+			}
+
+			/* Re-calculate new_ext */
+			new_ext.ee_len = cpu_to_le16(le16_to_cpu(new_ext.ee_len)
+				- replaced);
+			new_ext.ee_block =
+				cpu_to_le32(le32_to_cpu(new_ext.ee_block)
+				+ replaced);
+			ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
+				+ replaced);
+			replaced = 0;
+			start_ext.ee_len = end_ext.ee_len = 0;
+			o_start = NULL;
+
+			/* All expected blocks are replaced. */
+			if (le16_to_cpu(new_ext.ee_len) <= 0) {
+				if (DQUOT_ALLOC_BLOCK(org_inode, len))
+					return -EDQUOT;
+				return 0;
+			}
+		}
+
+		/* Get the next extent of the original file. */
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		org_path = ext4_ext_find_extent(org_inode, lblock, org_path);
+		if (IS_ERR(org_path)) {
+			ret = PTR_ERR(org_path);
+			org_path = NULL;
+			return ret;
+		}
+		depth = ext_depth(org_inode);
+		oext = org_path[depth].p_ext;
+		if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
+			<= lblock)
+			return -ENOENT;
+
+		o_end = oext;
+		if (!o_start)
+			o_start = oext;
+	}
+}
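+
+/*
+ * In ext4_defrag_leaf_block() above, each original leaf extent is split
+ * into up to three pieces: start_ext keeps the head of the first
+ * original extent that lies in front of the target range, new_ext holds
+ * the replacement blocks themselves, and end_ext keeps the tail of the
+ * last original extent behind the target range.  The three pieces are
+ * handed to ext4_defrag_merge_extents() to rewrite the leaf.
+ */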
+
+/**
+ * ext4_defrag_replace_branches - Replace original extents with new extents
+ *
+ * @handle		journal handle
+ * @org_inode		original inode
+ * @dest_inode		temporary inode
+ * @from_page		page offset of org_inode
+ * @dest_from_page	page offset of dest_inode
+ * @count_page		page count to be replaced
+ *
+ * This function returns 0 if it succeeds, otherwise returns an error value.
+ * Replace extents for blocks from "from" to "from + count - 1".
+ */
+static int
+ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,
+		struct inode *dest_inode, pgoff_t from_page,
+		pgoff_t dest_from_page, pgoff_t count_page)
+{
+	struct ext4_ext_path *org_path = NULL;
+	struct ext4_ext_path *dest_path = NULL;
+	struct ext4_extent *oext, *dext, *swap_ext;
+	struct ext4_extent tmp_ext, tmp_ext2;
+	ext4_lblk_t from, count, dest_off, diff, org_diff;
+	int err = 0;
+	int depth;
+	int replaced_count = 0;
+
+	from = (ext4_lblk_t)from_page <<
+		(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	count = (ext4_lblk_t)count_page <<
+		(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	dest_off = (ext4_lblk_t)dest_from_page <<
+		(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+
+	/* Get the original extent for the block "from" */
+	org_path = ext4_ext_find_extent(org_inode, from, NULL);
+	if (IS_ERR(org_path)) {
+		err = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get the destination extent for the head */
+	dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
+	if (IS_ERR(dest_path)) {
+		err = PTR_ERR(dest_path);
+		dest_path = NULL;
+		goto out;
+	}
+	depth = ext_depth(dest_inode);
+	dext = dest_path[depth].p_ext;
+	/* When dext is too large, pick up only the target range. */
+	diff = dest_off - le32_to_cpu(dext->ee_block);
+	ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+	tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+	tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+	if (count < le16_to_cpu(tmp_ext.ee_len))
+		tmp_ext.ee_len = cpu_to_le16(count);
+	dext = &tmp_ext;
+
+	depth = ext_depth(org_inode);
+	oext = org_path[depth].p_ext;
+	org_diff = from - le32_to_cpu(oext->ee_block);
+	ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
+	tmp_ext2.ee_block = tmp_ext.ee_block;
+
+	/* Adjust extent length when blocksize != pagesize */
+	if (le16_to_cpu(tmp_ext.ee_len) <=
+		(le16_to_cpu(oext->ee_len) - org_diff)) {
+		tmp_ext2.ee_len = tmp_ext.ee_len;
+	} else {
+		tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
+			- org_diff);
+		tmp_ext.ee_len = tmp_ext2.ee_len;
+	}
+	swap_ext = &tmp_ext2;
+
+	/* Loop over the destination extents */
+	while (1) {
+		/* The extent for the destination must be found. */
+		BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
+
+		/* Loop over the original extent blocks */
+		err = ext4_defrag_leaf_block(handle, org_inode,
+			org_path, dext, &from);
+		if (err < 0)
+			goto out;
+
+		/*
+		 * We need a function that fixes up the extent information
+		 * for the insert, e.g. ext4_defrag_merge_extents().
+		 */
+		err = ext4_defrag_leaf_block(handle, dest_inode,
+			dest_path, swap_ext, &dest_off);
+		if (err < 0)
+			goto out;
+
+		replaced_count += le16_to_cpu(dext->ee_len);
+		dest_off += le16_to_cpu(dext->ee_len);
+		from += le16_to_cpu(dext->ee_len);
+
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		org_path = ext4_ext_find_extent(org_inode, from, NULL);
+		if (IS_ERR(org_path)) {
+			err = PTR_ERR(org_path);
+			org_path = NULL;
+			goto out;
+		}
+		depth = ext_depth(org_inode);
+		oext = org_path[depth].p_ext;
+		if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
+			<= from) {
+			err = 0;
+			goto out;
+		}
+
+		if (dest_path)
+			ext4_ext_drop_refs(dest_path);
+		dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
+		if (IS_ERR(dest_path)) {
+			err = PTR_ERR(dest_path);
+			dest_path = NULL;
+			goto out;
+		}
+		depth = ext_depth(dest_inode);
+		dext = dest_path[depth].p_ext;
+		if (le32_to_cpu(dext->ee_block) + le16_to_cpu(dext->ee_len)
+			<= dest_off) {
+			err = 0;
+			goto out;
+		}
+
+		/* When dext is too large, pick up only the target range. */
+		diff = dest_off - le32_to_cpu(dext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+		tmp_ext.ee_block =
+			cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+		tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+
+		if ((count - replaced_count) < le16_to_cpu(tmp_ext.ee_len))
+			tmp_ext.ee_len = cpu_to_le16(count - replaced_count);
+
+		dext = &tmp_ext;
+
+		org_diff = from - le32_to_cpu(oext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
+		tmp_ext2.ee_block = tmp_ext.ee_block;
+
+		/* Adjust extent length when blocksize != pagesize */
+		if (le16_to_cpu(tmp_ext.ee_len) <=
+			le16_to_cpu(oext->ee_len) - org_diff) {
+			tmp_ext2.ee_len = tmp_ext.ee_len;
+		} else {
+			tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
+				- org_diff);
+			tmp_ext.ee_len = tmp_ext2.ee_len;
+		}
+		swap_ext = &tmp_ext2;
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_partial - Defrag a file per page
+ *
+ * @tmp_inode:	the inode which has blocks to swap with the original
+ * @filp:	pointer to the file
+ * @org_offset:	page index on the original file
+ * @dest_offset: page index on the temporary file
+ *
+ * This function returns 0 if it succeeds, otherwise returns an error value.
+ */
+static int
+ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
+		pgoff_t org_offset, pgoff_t dest_offset)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = inode->i_mapping;
+	struct buffer_head *bh;
+	struct page *page;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	handle_t *handle;
+	pgoff_t offset_in_page = PAGE_SIZE;
+	int jblocks;
+	int ret = 0;
+	int blocksize = inode->i_sb->s_blocksize;
+	int blocks_per_page = 0;
+	int i = 0;
+	long long offs = (long long)org_offset << PAGE_CACHE_SHIFT;
+	unsigned long blk_off = 0;
+	unsigned int w_flags = 0;
+	void *fsdata;
+
+	/*
+	 * We need twice the number of ordinary journal buffers because
+	 * inode and tmp_inode may each modify different metadata blocks.
+	 */
+	jblocks = ext4_writepage_trans_blocks(inode) * 2;
+	handle = ext4_journal_start(inode, jblocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		return ret;
+	}
+
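+	/*
+	 * With KERNEL_DS the data is copied from kernel space, so the
+	 * copy into the page cannot be interrupted by a signal; tell
+	 * ->write_begin() that the write is uninterruptible.
+	 */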
+	if (segment_eq(get_fs(), KERNEL_DS))
+		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	if (org_offset == ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
+		offset_in_page = (inode->i_size & (PAGE_CACHE_SIZE - 1));
+		/*
+		 * Set offset_in_page to PAGE_CACHE_SIZE so that it does
+		 * not become 0 when org_offset is the last page and
+		 * i_size is a multiple of PAGE_CACHE_SIZE.
+		 */
+		if (offset_in_page == 0)
+			offset_in_page = PAGE_CACHE_SIZE;
+	}
+
+	up_write(&EXT4_I(inode)->i_data_sem);
+	ret = a_ops->write_begin(filp, mapping, offs,
+		offset_in_page, w_flags, &page, &fsdata);
+	down_write(&EXT4_I(inode)->i_data_sem);
+
+	if (unlikely(ret < 0))
+		goto out;
+
+	if (!PageUptodate(page)) {
+		mapping->a_ops->readpage(filp, page);
+		lock_page(page);
+	}
+
+	/*
+	 * try_to_release_page() does not call ->releasepage() for a page
+	 * under writeback.  We also have to care about the order of
+	 * writes to the same file by multiple defrag processes, so call
+	 * wait_on_page_writeback() to wait for the writeback of the page.
+	 */
+	if (PageWriteback(page))
+		wait_on_page_writeback(page);
+
+	/* Release the old bh and drop refs */
+	try_to_release_page(page, 0);
+	ret = ext4_defrag_replace_branches(handle, inode, tmp_inode,
+		org_offset, dest_offset, 1);
+
+	if (ret < 0)
+		goto out;
+
+	/* Invalidate the extent cache so it does not refer to the old data */
+	ext4_ext_invalidate_cache(inode);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+
+	blocks_per_page = PAGE_SIZE / blocksize;
+	blk_off = org_offset * blocks_per_page;
+
+	bh = page_buffers(page);
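+	/*
+	 * The block mappings of this page changed when the extents were
+	 * swapped above, so re-map every buffer in the page with
+	 * ext4_get_block() before ->write_end() writes the data out to
+	 * the new blocks.
+	 */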
+ */ + jblocks = ext4_writepage_trans_blocks(inode) * 2; + handle = ext4_journal_start(inode, jblocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + + if (segment_eq(get_fs(), KERNEL_DS)) + w_flags |= AOP_FLAG_UNINTERRUPTIBLE; + + if (org_offset == ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { + offset_in_page = (inode->i_size & (PAGE_CACHE_SIZE - 1)); + /* + * Set PAGE_CACHE_SIZE to offset_in_page not be 0 + * if org_offset is the last page and i_size is + * multiples of PAGE_CACHE_SIZE. + */ + if (offset_in_page == 0) + offset_in_page = PAGE_CACHE_SIZE; + } + + up_write(&EXT4_I(inode)->i_data_sem); + ret = a_ops->write_begin(filp, mapping, offs, + offset_in_page, w_flags, &page, &fsdata); + down_write(&EXT4_I(inode)->i_data_sem); + + if (unlikely(ret < 0)) + goto out; + + if (!PageUptodate(page)) { + mapping->a_ops->readpage(filp, page); + lock_page(page); + } + + /* + * try_to_release_page() doesn't call relasepage in writeback mode. + * We should care about the order of writing to the same file + * by multiple defrag processes. + * It needs to call wait_on_page_writeback() to wait for the + * writeback of the page. + */ + if (PageWriteback(page)) + wait_on_page_writeback(page); + + /* Release old bh and drop refs */ + try_to_release_page(page, 0); + ret = ext4_defrag_replace_branches(handle, inode, tmp_inode, + org_offset, dest_offset, 1); + + if (ret < 0) + goto out; + + /* Clear the inode cache not to refer to the old data */ + ext4_ext_invalidate_cache(inode); + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + + blocks_per_page = PAGE_SIZE / blocksize; + blk_off = org_offset * blocks_per_page; + + bh = page_buffers(page); + for (i = 0; i < blocks_per_page; i++) { + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_get_block(inode, blk_off++, bh, 0); + down_write(&EXT4_I(inode)->i_data_sem); + + if (ret < 0) + goto out; + + if (bh->b_this_page != NULL) + bh = bh->b_this_page; + } + + ret = a_ops->write_end(filp, mapping, offs, offset_in_page, + offset_in_page, page, fsdata); + + if (unlikely(ret < 0)) + goto out; +out: + ext4_journal_stop(handle); + + return (ret < 0 ? ret : 0); +} diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 07c05b3..92162f9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1060,6 +1060,8 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_block_truncate_page(handle_t *handle, struct page *page, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); +extern int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 858329d..53943b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1001,8 +1001,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, up_write((&EXT4_I(inode)->i_data_sem)); return retval; }