ext4: online defrag-- Main function of defrag and ioctl implementation
From: Akira Fujita <[email protected]>
Create a temporary inode and defragment the file in units of
defrag_size (default 64MB).
Signed-off-by: Akira Fujita <[email protected]>
Signed-off-by: Takashi Sato <[email protected]>
---
fs/ext4/defrag.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/ext4.h | 19 ++++-
fs/ext4/ioctl.c | 3 +
3 files changed, 310 insertions(+), 1 deletions(-)
diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index 09b2c56..baa04d9 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -1,3 +1,18 @@
+/*
+ * Copyright (c) 2008, NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <[email protected]>
+ * Akira Fujita <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
/* Online defragmentation for EXT4 */
#include <linux/quotaops.h>
@@ -74,6 +89,27 @@ ext4_defrag_next_extent(struct inode *inode,
return 1;
}
+int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+			unsigned long arg)
+{
+	struct ext4_ext_defrag_data req;
+
+	/* Defrag ioctl entry: the inode must have EXT4_EXTENTS_FL set. */
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+		return -EINVAL;
+
+	/* Every command except EXT4_IOC_DEFRAG is a no-op returning 0. */
+	if (cmd != EXT4_IOC_DEFRAG)
+		return 0;
+
+	/* Only start_offset and defrag_size of the request are used. */
+	if (copy_from_user(&req, (struct ext4_ext_defrag_data __user *)arg,
+				sizeof(req)))
+		return -EFAULT;
+
+	return ext4_defrag(filp, req.start_offset, req.defrag_size);
+}
+
/**
* ext4_defrag_merge_across_blocks - Merge extents across leaf block
*
@@ -1022,3 +1058,256 @@ ext4_defrag_new_extent_tree(struct inode *inode, struct inode *tmp_inode,
out:
return ret;
}
+
+/**
+ * ext4_defrag - Defrag the specified range of a file
+ *
+ * @filp:		pointer to file
+ * @block_start:	starting offset to defrag in blocks
+ * @defrag_size:	size of defrag in blocks
+ *
+ * Returns the number of blocks in the range, trimmed at the end of the
+ * file (0 if the range starts beyond it), or an error value on failure.
+ */
+int
+ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+			ext4_lblk_t defrag_size)
+{
+	struct inode *inode = filp->f_dentry->d_inode, *tmp_inode = NULL;
+	struct ext4_ext_path *path = NULL, *holecheck_path = NULL;
+	struct ext4_extent *ext_prev = NULL, *ext_cur = NULL, *ext_dummy = NULL;
+	handle_t *handle;
+	ext4_lblk_t block_end = block_start + defrag_size - 1;
+	ext4_lblk_t seq_blocks = 0, seq_start = 0;
+	ext4_lblk_t add_blocks = 0;
+	ext4_lblk_t file_end = (inode->i_size - 1) >> inode->i_blkbits;
+	pgoff_t page_offset = 0, dest_offset = 0, seq_end_page = 0;
+	int ret = 0, depth = 0, last_extent = 0, seq_extents = 0;
+
+	/* ext4 online defrag supports only 4KB block size */
+	if (inode->i_sb->s_blocksize != DEFRAG_BLOCK_SIZE) {
+		printk(KERN_ERR "ext4 defrag: ext4 online defrag supports "
+				"only 4KB block size for the moment.\n");
+		return -EINVAL;
+	}
+
+	/* ext4 online defrag needs mballoc mount option. */
+	if (!test_opt(inode->i_sb, MBALLOC)) {
+		printk(KERN_ERR "ext4 defrag: multiblock allocation "
+				"is disabled\n");
+		return -EINVAL;
+	}
+
+	/* Trim at EOF; a start beyond EOF gives 0, not an underflowed count */
+	if (file_end < block_end)
+		defrag_size = block_start > file_end ? 0 :
+				file_end - block_start + 1;
+
+	mutex_lock(&inode->i_mutex);
+	down_write(&EXT4_I(inode)->i_data_sem);
+
+	path = ext4_ext_find_extent(inode, block_start, NULL);
+	if (IS_ERR(path)) {
+		ret = PTR_ERR(path);
+		path = NULL;
+		goto out;
+	}
+
+	/* Second path to the same block, used only to look ahead for holes */
+	holecheck_path = ext4_ext_find_extent(inode, block_start, NULL);
+	if (IS_ERR(holecheck_path)) {
+		ret = PTR_ERR(holecheck_path);
+		holecheck_path = NULL;
+		goto out;
+	}
+
+	depth = ext_depth(inode);
+	ext_cur = holecheck_path[depth].p_ext;
+	if (ext_cur == NULL)
+		goto out;
+
+	/*
+	 * Get proper extent whose ee_block is beyond block_start
+	 * if block_start was within the hole (both paths are advanced).
+	 */
+	if (le32_to_cpu(ext_cur->ee_block) +
+		le16_to_cpu(ext_cur->ee_len) - 1 < block_start) {
+		last_extent = ext4_defrag_next_extent(inode, holecheck_path,
+				&ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+		last_extent = ext4_defrag_next_extent(inode, path, &ext_dummy);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+	}
+	seq_extents = 1;
+	seq_start = le32_to_cpu(ext_cur->ee_block);
+
+	/* No blocks within the specified range. */
+	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+		printk(KERN_INFO "ext4 defrag: The specified range of file"
+				" may be the hole\n");
+		goto out;
+	}
+
+	/* Count only the part of the first extent that lies in the range */
+	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+			 le16_to_cpu(ext_cur->ee_len), block_end + 1) -
+		     max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+		seq_blocks += add_blocks;
+
+		/* Make an unlinked temporary inode, kept on the orphan list */
+		handle = ext4_journal_start(inode,
+			EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
+			EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+			2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) + 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		tmp_inode = ext4_new_inode(handle,
+				inode->i_sb->s_root->d_inode, S_IFREG);
+		if (IS_ERR(tmp_inode)) {
+			/* Report the real failure, which need not be ENOMEM */
+			ret = PTR_ERR(tmp_inode);
+			ext4_journal_stop(handle);
+			tmp_inode = NULL;
+			goto out;
+		}
+
+		i_size_write(tmp_inode, i_size_read(inode));
+		tmp_inode->i_nlink = 0;
+		ext4_ext_tree_init(handle, tmp_inode);
+		ext4_orphan_add(handle, tmp_inode);
+		ext4_journal_stop(handle);
+
+		/* Adjust tail blocks so the run never passes block_end */
+		if (seq_start + seq_blocks - 1 > block_end)
+			seq_blocks = block_end - seq_start + 1;
+
+		ext_prev = ext_cur;
+		last_extent = ext4_defrag_next_extent(inode, holecheck_path,
+				&ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			break;
+		}
+		if (!last_extent)
+			seq_extents++;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+
+		/*
+		 * Extend the length of contiguous block (seq_blocks)
+		 * if extents are contiguous.
+		 */
+		if ((le32_to_cpu(ext_prev->ee_block) +
+				le16_to_cpu(ext_prev->ee_len) ==
+				le32_to_cpu(ext_cur->ee_block) &&
+				block_end >= le32_to_cpu(ext_cur->ee_block) &&
+				!last_extent)) {
+			/* Run still growing: release this pass's temp inode */
+			iput(tmp_inode);
+			tmp_inode = NULL;
+			continue;
+		}
+
+		/* Found an isolated block: skip the defrag step for it */
+		if (seq_extents == 1) {
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		ret = ext4_defrag_new_extent_tree(inode, tmp_inode, path,
+					seq_start, seq_blocks, block_start);
+
+		if (ret < 0) {
+			break;
+		} else if (ret == 1) {
+			ret = 0;
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		page_offset = seq_start >>
+				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+		seq_end_page = (seq_start + seq_blocks - 1) >>
+				(PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		dest_offset = 0;
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+
+		/*
+		 * Discard all preallocations.
+		 * This is provisional solution.
+		 * When true ext4_mb_return_to_preallocation() is
+		 * implemented, this will be removed.
+		 */
+		ext4_mb_discard_inode_preallocations(inode);
+
+		while (page_offset <= seq_end_page) {
+			/* Swap original branches with new branches */
+			ret = ext4_defrag_partial(tmp_inode, filp,
+					page_offset, dest_offset);
+			if (ret < 0)
+				goto out;
+
+			page_offset++;
+			dest_offset++;
+		}
+
+		/* Decrease buffer counter, then look up the next run's start */
+		if (holecheck_path)
+			ext4_ext_drop_refs(holecheck_path);
+		holecheck_path =
+			ext4_ext_find_extent(inode, seq_start, holecheck_path);
+		if (IS_ERR(holecheck_path)) {
+			ret = PTR_ERR(holecheck_path);
+			holecheck_path = NULL;
+			break;
+		}
+		depth = holecheck_path->p_depth;
+
+CLEANUP:
+		/* Decrease buffer counter and re-seek path to the next run */
+		if (path)
+			ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode, seq_start, path);
+		if (IS_ERR(path)) {
+			ret = PTR_ERR(path);
+			path = NULL;
+			break;
+		}
+
+		ext_cur = holecheck_path[depth].p_ext;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+		seq_blocks = 0;
+		dest_offset = 0;
+		seq_extents = 1;
+
+		iput(tmp_inode);
+		tmp_inode = NULL;
+	}
+
+out:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	if (holecheck_path) {
+		ext4_ext_drop_refs(holecheck_path);
+		kfree(holecheck_path);
+	}
+
+	up_write(&EXT4_I(inode)->i_data_sem);
+	mutex_unlock(&inode->i_mutex);
+
+	iput(tmp_inode);
+
+	return (ret ? ret : defrag_size);
+}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8dc174f..18c1fcf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -298,6 +298,7 @@ struct ext4_new_group_data {
#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
#define EXT4_IOC_MIGRATE _IO('f', 7)
+#define EXT4_IOC_DEFRAG _IOW('f', 10, struct ext4_ext_defrag_data)
/*
* ioctl commands in 32 bit emulation
@@ -315,8 +316,19 @@ struct ext4_new_group_data {
#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
-#define EXT4_TRANS_META_BLOCKS 4 /* bitmap + group desc + sb + inode */
+/*
+ * Temporary restriction, will go away:
+ * ext4 online defrag supports only 4KB block size.
+ */
+#define DEFRAG_BLOCK_SIZE 4096
+struct ext4_ext_defrag_data { /* argument block of EXT4_IOC_DEFRAG */
+ ext4_lblk_t start_offset; /* start offset to defrag in blocks */
+ ext4_lblk_t defrag_size; /* size of defrag in blocks */
+ ext4_fsblk_t goal; /* block offset for allocation; not read by ext4_defrag_ioctl() */
+};
+
+#define EXT4_TRANS_META_BLOCKS 4 /* bitmap + group desc + sb + inode */
/*
* Mount options
@@ -1114,6 +1126,11 @@ extern void ext4_inode_table_set(struct super_block *sb,
struct ext4_group_desc *bg, ext4_fsblk_t blk);
/* extents.c */
extern handle_t *ext4_ext_journal_restart(handle_t *handle, int needed);
+/* defrag.c */
+extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+ ext4_lblk_t defrag_size);
+extern int ext4_defrag_ioctl(struct inode *, struct file *, unsigned int,
+ unsigned long);
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0d7430a..98b6f4a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -231,6 +231,9 @@ flags_err:
return err;
}
+ case EXT4_IOC_DEFRAG: {
+ return ext4_defrag_ioctl(inode, filp, cmd, arg);
+ }
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
struct super_block *sb = inode->i_sb;