ext4: online defrag-- Read and write file data with memory page
From: Akira Fujita <[email protected]>
Read the file data from the old blocks into a memory page, then
write the data held in that page out to the new blocks.
Signed-off-by: Akira Fujita <[email protected]>
Signed-off-by: Takashi Sato <[email protected]>
---
fs/ext4/defrag.c | 464 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/ext4.h | 2 +
fs/ext4/inode.c | 3 +-
3 files changed, 466 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index 621276b..f5d75c2 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -134,6 +134,368 @@ int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
}
/**
+ * ext4_defrag_merge_extents - Merge new extent
+ *
+ * @handle: journal handle
+ * @org_inode: original inode
+ * @org_path: path to the first extent to be defragmented
+ * @o_start: first original extent to be defragmented
+ * @o_end: last original extent to be defragmented
+ * @start_ext: first new extent to be merged
+ * @new_ext: middle of new extent to be merged
+ * @end_ext: last new extent to be merged
+ * @replaced: the number of blocks which will be replaced with new_ext
+ *
+ * Placeholder in this patch: the body is empty and always returns 0.
+ */
+static int
+ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,
+ struct ext4_ext_path *org_path,
+ struct ext4_extent *o_start, struct ext4_extent *o_end,
+ struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+ struct ext4_extent *end_ext, ext4_fsblk_t replaced)
+{
+ return 0;
+}
+
+/**
+ * ext4_defrag_leaf_block - Defragmentation for one leaf extent block
+ *
+ * @handle: journal handle
+ * @org_inode: original inode
+ * @org_path: path to the first extent to be defragmented (reused for lookups)
+ * @dext: destination extent; supplies the new physical start and the length
+ * @from: logical start block of the new extent (only read, never updated here)
+ *
+ * Returns 0 on success, -ENOENT if the next extent is missing, else an error.
+ */
+static int
+ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
+ struct ext4_ext_path *org_path, struct ext4_extent *dext,
+ ext4_lblk_t *from)
+{
+ struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
+ struct ext4_extent new_ext, start_ext, end_ext;
+ ext4_fsblk_t replaced = 0;
+ ext4_lblk_t new_end, lblock;
+ unsigned long depth;
+ unsigned short len;
+ ext4_fsblk_t new_phys_end;
+ int ret;
+
+ depth = ext_depth(org_inode);
+ start_ext.ee_len = end_ext.ee_len = 0;
+ o_start = o_end = oext = org_path[depth].p_ext;
+ ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+ new_ext.ee_len = dext->ee_len;
+ len = le16_to_cpu(new_ext.ee_len);
+ new_ext.ee_block = cpu_to_le32(*from);
+ lblock = le32_to_cpu(oext->ee_block);
+ new_end = le32_to_cpu(new_ext.ee_block)
+ + le16_to_cpu(new_ext.ee_len) - 1;
+ new_phys_end = ext_pblock(&new_ext)
+ + le16_to_cpu(new_ext.ee_len) - 1;
+
+ /*
+ * First original extent: new_ext starts strictly inside oext
+ * dest         |---------------|
+ * org  |---------------|
+ */
+ if (le32_to_cpu(new_ext.ee_block) >
+ le32_to_cpu(oext->ee_block) &&
+ le32_to_cpu(new_ext.ee_block) <
+ le32_to_cpu(oext->ee_block)
+ + le16_to_cpu(oext->ee_len)) {
+ start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block)
+ - le32_to_cpu(oext->ee_block));
+ replaced += le16_to_cpu(oext->ee_len)
+ - le16_to_cpu(start_ext.ee_len);
+ } else if (oext > EXT_FIRST_EXTENT(org_path[depth].p_hdr)) {
+ /* Fold new_ext into the previous extent if both are contiguous. */
+ prev_ext = oext - 1;
+ if (((ext_pblock(prev_ext) + le16_to_cpu(prev_ext->ee_len))
+ == ext_pblock(&new_ext))
+ && (le32_to_cpu(prev_ext->ee_block)
+ + le16_to_cpu(prev_ext->ee_len)
+ == le32_to_cpu(new_ext.ee_block))) {
+ o_start = prev_ext;
+ start_ext.ee_len = cpu_to_le16(
+ le16_to_cpu(prev_ext->ee_len)
+ + le16_to_cpu(new_ext.ee_len));
+ new_ext.ee_len = 0;
+ }
+ }
+
+ for (;;) {
+ /* Extents are walked back to back: oext must start at lblock. */
+ BUG_ON(!oext || lblock != le32_to_cpu(oext->ee_block));
+ lblock += le16_to_cpu(oext->ee_len);
+
+ /*
+ * Middle of original extent: new_ext covers oext completely
+ * dest |-------------------|
+ * org   |-----------------|
+ */
+ if (le32_to_cpu(new_ext.ee_block) <=
+ le32_to_cpu(oext->ee_block) &&
+ new_end >= le32_to_cpu(oext->ee_block)
+ + le16_to_cpu(oext->ee_len) - 1)
+ replaced += le16_to_cpu(oext->ee_len);
+
+ /*
+ * Last original extent: new_ext ends before the last block of oext
+ * dest |----------------|
+ * org           |---------------|
+ */
+ if (new_end >= le32_to_cpu(oext->ee_block) &&
+ new_end < le32_to_cpu(oext->ee_block)
+ + le16_to_cpu(oext->ee_len) - 1) {
+ end_ext.ee_len
+ = cpu_to_le16(le32_to_cpu(oext->ee_block)
+ + le16_to_cpu(oext->ee_len) - 1 - new_end);
+ ext4_ext_store_pblock(&end_ext, (ext_pblock(o_end)
+ + le16_to_cpu(oext->ee_len)
+ - le16_to_cpu(end_ext.ee_len)));
+ end_ext.ee_block
+ = cpu_to_le32(le32_to_cpu(o_end->ee_block)
+ + le16_to_cpu(oext->ee_len)
+ - le16_to_cpu(end_ext.ee_len));
+ replaced += le16_to_cpu(oext->ee_len)
+ - le16_to_cpu(end_ext.ee_len);
+ }
+
+ /*
+ * Reached the last extent of this leaf, or new_ext ends at or before
+ * the end of this extent: hand the collected range to the merge helper.
+ */
+ if (oext == EXT_LAST_EXTENT(org_path[depth].p_hdr) ||
+ new_end <= le32_to_cpu(oext->ee_block)
+ + le16_to_cpu(oext->ee_len) - 1) {
+ ret = ext4_defrag_merge_extents(handle, org_inode,
+ org_path, o_start, o_end, &start_ext,
+ &new_ext, &end_ext, replaced);
+ if (ret < 0)
+ return ret;
+
+ /* All expected blocks are replaced */
+ if (le16_to_cpu(new_ext.ee_len) <= 0)
+ return 0;
+
+ /* Advance new_ext past the blocks that were just replaced */
+ le16_add_cpu(&new_ext.ee_len, -replaced);
+ le32_add_cpu(&new_ext.ee_block, replaced);
+ ext4_ext_store_pblock(&new_ext, ext_pblock(&new_ext)
+ + replaced);
+ replaced = 0;
+ start_ext.ee_len = end_ext.ee_len = 0;
+ o_start = NULL;
+
+ /* All expected blocks are replaced. */
+ if (le16_to_cpu(new_ext.ee_len) <= 0)
+ return 0;
+ }
+
+ /* Look up the extent at lblock, reusing the org_path buffer. */
+ if (org_path)
+ ext4_ext_drop_refs(org_path);
+ org_path = ext4_ext_find_extent(org_inode, lblock, org_path);
+ if (IS_ERR(org_path)) {
+ ret = PTR_ERR(org_path);
+ org_path = NULL;
+ return ret;
+ }
+ depth = ext_depth(org_inode);
+ oext = org_path[depth].p_ext;
+ if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
+ <= lblock)
+ return -ENOENT;
+
+ o_end = oext;
+ if (!o_start)
+ o_start = oext;
+ }
+}
+
+/**
+ * ext4_defrag_replace_branches - Replace original extents with new extents
+ *
+ * @handle:		journal handle
+ * @org_inode:		original inode
+ * @dest_inode:		temporary inode
+ * @from_page:		page offset of org_inode
+ * @dest_from_page:	page offset of dest_inode
+ * @count_page:		page count to be replaced
+ *
+ * Returns 0 on success (also when no further extent is found), else an error.
+ * Replace extents for blocks from "from" to "from + count - 1".
+ */
+static int
+ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,
+			struct inode *dest_inode, pgoff_t from_page,
+			pgoff_t dest_from_page, pgoff_t count_page)
+{
+	struct ext4_ext_path *org_path = NULL;
+	struct ext4_ext_path *dest_path = NULL, *path;
+	struct ext4_extent *oext, *dext, *swap_ext;
+	struct ext4_extent tmp_ext, tmp_ext2;
+	ext4_lblk_t from, count, dest_off, diff, org_diff;
+	int err = 0;
+	int depth;
+	int replaced_count = 0;
+
+	from = (ext4_lblk_t)from_page <<
+			(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	count = (ext4_lblk_t)count_page <<
+			(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+	dest_off = (ext4_lblk_t)dest_from_page <<
+			(PAGE_CACHE_SHIFT - dest_inode->i_blkbits);
+
+	/* Get the original extent for the block "from" */
+	org_path = ext4_ext_find_extent(org_inode, from, NULL);
+	if (IS_ERR(org_path)) {
+		err = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get the destination extent for the head */
+	dest_path = ext4_ext_find_extent(dest_inode, dest_off, NULL);
+	if (IS_ERR(dest_path)) {
+		err = PTR_ERR(dest_path);
+		dest_path = NULL;
+		goto out;
+	}
+	depth = ext_depth(dest_inode);
+	dext = dest_path[depth].p_ext;
+	/* dext may start before dest_off or exceed count: keep the target range */
+	diff = dest_off - le32_to_cpu(dext->ee_block);
+	ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+	tmp_ext.ee_block = cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+	tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+	if (count < le16_to_cpu(tmp_ext.ee_len))
+		tmp_ext.ee_len = cpu_to_le16(count);
+	dext = &tmp_ext;
+
+	depth = ext_depth(org_inode);
+	oext = org_path[depth].p_ext;
+	org_diff = from - le32_to_cpu(oext->ee_block);
+	ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
+	tmp_ext2.ee_block = tmp_ext.ee_block;
+
+	/* Clamp both lengths to what is left of oext (blocksize != pagesize) */
+	if (le16_to_cpu(tmp_ext.ee_len) <=
+			le16_to_cpu(oext->ee_len) - org_diff) {
+		tmp_ext2.ee_len = tmp_ext.ee_len;
+	} else {
+		tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
+						- org_diff);
+		tmp_ext.ee_len = tmp_ext2.ee_len;
+	}
+	swap_ext = &tmp_ext2;
+
+	/* Loop for the destination extents */
+	while (1) {
+		/* dext must exist and start exactly at dest_off. */
+		BUG_ON(!dext || dest_off != le32_to_cpu(dext->ee_block));
+
+		/* Hand dext to the leaf walker of org_inode, starting at from */
+		err = ext4_defrag_leaf_block(handle, org_inode,
+						org_path, dext, &from);
+		if (err < 0)
+			goto out;
+
+		/*
+		 * Do the same for the temporary inode, using swap_ext (the
+		 * original inode's blocks).  The original note said a helper
+		 * that fixes up extent information for inserting is needed
+		 * here, e.g. ext4_defrag_merge_extents().
+		 */
+		err = ext4_defrag_leaf_block(handle, dest_inode,
+						dest_path, swap_ext, &dest_off);
+		if (err < 0)
+			goto out;
+
+		replaced_count += le16_to_cpu(dext->ee_len);
+		dest_off += le16_to_cpu(dext->ee_len);
+		from += le16_to_cpu(dext->ee_len);
+
+		/* Already moved the expected blocks */
+		if (replaced_count >= count)
+			break;
+		/* Reuse the paths: a NULL path would leak the old buffer. */
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		path = ext4_ext_find_extent(org_inode, from, org_path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
+		}
+		org_path = path;
+		depth = ext_depth(org_inode);
+		oext = org_path[depth].p_ext;
+		if (le32_to_cpu(oext->ee_block) + le16_to_cpu(oext->ee_len)
+				<= from) {
+			err = 0;
+			goto out;
+		}
+
+		if (dest_path)
+			ext4_ext_drop_refs(dest_path);
+		path = ext4_ext_find_extent(dest_inode, dest_off, dest_path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
+		}
+		dest_path = path;
+		depth = ext_depth(dest_inode);
+		dext = dest_path[depth].p_ext;
+		if (le32_to_cpu(dext->ee_block) + le16_to_cpu(dext->ee_len)
+				<= dest_off) {
+			err = 0;
+			goto out;
+		}
+
+		/* Keep only the part of dext that is still to be moved. */
+		diff = dest_off - le32_to_cpu(dext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext, ext_pblock(dext) + diff);
+		tmp_ext.ee_block =
+			cpu_to_le32(le32_to_cpu(dext->ee_block) + diff);
+		tmp_ext.ee_len = cpu_to_le16(le16_to_cpu(dext->ee_len) - diff);
+
+		if (count - replaced_count < le16_to_cpu(tmp_ext.ee_len))
+			tmp_ext.ee_len = cpu_to_le16(count - replaced_count);
+
+		dext = &tmp_ext;
+
+		org_diff = from - le32_to_cpu(oext->ee_block);
+		ext4_ext_store_pblock(&tmp_ext2, ext_pblock(oext) + org_diff);
+		tmp_ext2.ee_block = tmp_ext.ee_block;
+
+		/* Clamp both lengths to what is left of oext. */
+		if (le16_to_cpu(tmp_ext.ee_len) <=
+				le16_to_cpu(oext->ee_len) - org_diff) {
+			tmp_ext2.ee_len = tmp_ext.ee_len;
+		} else {
+			tmp_ext2.ee_len = cpu_to_le16(le16_to_cpu(oext->ee_len)
+							- org_diff);
+			tmp_ext.ee_len = tmp_ext2.ee_len;
+		}
+		swap_ext = &tmp_ext2;
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (dest_path) {
+		ext4_ext_drop_refs(dest_path);
+		kfree(dest_path);
+	}
+
+	return err;
+}
+
+/**
* ext4_defrag_fill_ar - Prepare to multiple block allocate for tmp inode
*
* @org_inode: original inode
@@ -228,7 +590,126 @@ static int
ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
pgoff_t org_offset, pgoff_t dest_offset)
{
- return 0;
+	struct inode *org_inode = filp->f_dentry->d_inode;
+	struct address_space *mapping = org_inode->i_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct buffer_head *bh;
+	struct page *page = NULL;
+	handle_t *handle;
+	pgoff_t offset_in_page = PAGE_CACHE_SIZE;
+	int ret, i, jblocks, blocks_per_page;
+	int blocksize = org_inode->i_sb->s_blocksize;
+	/* Widen before shifting so the byte offset is not truncated. */
+	long long offs = (long long)org_offset << PAGE_CACHE_SHIFT;
+	unsigned long blk_off;
+	unsigned int w_flags = 0;
+	void *fsdata;
+
+	/*
+	 * Move one page of the file: get the page through write_begin(),
+	 * read it in if it is not up to date, swap the extents of org_inode
+	 * and tmp_inode for that page, map the page's buffers again through
+	 * ext4_get_block() and finish with write_end().
+	 */
+
+	/*
+	 * It needs twice the amount of ordinary journal buffers because
+	 * inode and tmp_inode may change each different metadata blocks.
+	 */
+	jblocks = ext4_writepage_trans_blocks(org_inode) * 2;
+	handle = ext4_journal_start(org_inode, jblocks);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (segment_eq(get_fs(), KERNEL_DS))
+		w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+	if (org_offset == ((org_inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
+		offset_in_page = (org_inode->i_size & (PAGE_CACHE_SIZE - 1));
+		/*
+		 * The last page may be partial; if i_size is a multiple of
+		 * PAGE_CACHE_SIZE the mask yields 0, so use the full page.
+		 */
+		if (offset_in_page == 0)
+			offset_in_page = PAGE_CACHE_SIZE;
+	}
+
+	/* i_data_sem is released around write_begin() and retaken after. */
+	up_write(&EXT4_I(org_inode)->i_data_sem);
+	ret = a_ops->write_begin(filp, mapping, offs,
+				 offset_in_page, w_flags, &page, &fsdata);
+	down_write(&EXT4_I(org_inode)->i_data_sem);
+
+	if (unlikely(ret < 0)) {
+		/* Nothing of ours to unlock or release if write_begin() failed. */
+		page = NULL;
+		goto out;
+	}
+
+	if (!PageUptodate(page)) {
+		mapping->a_ops->readpage(filp, page);
+		lock_page(page);
+	}
+
+	/*
+	 * try_to_release_page() doesn't call releasepage in writeback mode.
+	 * We should care about the order of writing to the same file
+	 * by multiple defrag processes.
+	 * It needs to call wait_on_page_writeback() to wait for the
+	 * writeback of the page.
+	 */
+	if (PageWriteback(page))
+		wait_on_page_writeback(page);
+
+	/* Release old bh and drop refs */
+	try_to_release_page(page, 0);
+	ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode,
+					   org_offset, dest_offset, 1);
+	if (ret < 0)
+		goto out;
+
+	/* Clear the inode cache not to refer to the old data */
+	ext4_ext_invalidate_cache(org_inode);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << org_inode->i_blkbits, 0);
+
+	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
+	blk_off = org_offset * blocks_per_page;
+
+	/* Map every buffer of the page again now that the extents changed. */
+	bh = page_buffers(page);
+	for (i = 0; i < blocks_per_page; i++) {
+		up_write(&EXT4_I(org_inode)->i_data_sem);
+		ret = ext4_get_block(org_inode, blk_off++, bh, 0);
+		down_write(&EXT4_I(org_inode)->i_data_sem);
+
+		if (ret < 0)
+			goto out;
+
+		if (bh->b_this_page != NULL)
+			bh = bh->b_this_page;
+	}
+
+	ret = a_ops->write_end(filp, mapping, offs, offset_in_page,
+			       offset_in_page, page, fsdata);
+	/* write_end() is responsible for unlocking and releasing the page. */
+	page = NULL;
+
+out:
+	/*
+	 * Bailing out between write_begin() and write_end(): the page is
+	 * still locked and referenced, so undo that here.
+	 * NOTE(review): if write_begin() also opened a journal handle that
+	 * only write_end() stops, it is still open on this path - confirm.
+	 */
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	ext4_journal_stop(handle);
+
+	return (ret < 0 ? ret : 0);
}
/**
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d64a4ae..1e9ce39 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1079,6 +1079,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71db3d6..a30f56c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1037,8 +1037,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
up_write((&EXT4_I(inode)->i_data_sem));
return retval;
}