2008-09-27 07:26:47

by Akira Fujita

[permalink] [raw]
Subject: [RFC][PATCH 4/12]ext4: exchange the blocks between two inodes

ext4: online defrag -- Exchange the blocks between two inodes.

From: Akira Fujita <[email protected]>

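Exchange the data blocks between the temporary inode and
the original inode. Also pass an optional goal block offset from
the defrag ioctl down to the block allocation request.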

Signed-off-by: Akira Fujita <[email protected]>
Signed-off-by: Takashi Sato <[email protected]>
---
fs/ext4/defrag.c | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++---
fs/ext4/ext4.h | 2 +-
2 files changed, 274 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index d3ff958..4dd4318 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -103,6 +103,7 @@ int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,

if (cmd == EXT4_IOC_DEFRAG) {
struct ext4_ext_defrag_data defrag;
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;

if (!capable(CAP_DAC_OVERRIDE)) {
if ((inode->i_mode & S_IRUSR) != S_IRUSR)
@@ -116,14 +117,205 @@ int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
sizeof(defrag)))
return -EFAULT;

+ /* Check goal offset if goal offset was given from userspace */
+ if (defrag.goal != -1 &&
+ ext4_blocks_count(es) <= defrag.goal) {
+ printk(KERN_ERR "ext4 defrag: Invalid goal offset"
+ " %llu, you can set goal offset up to %llu\n",
+ defrag.goal, ext4_blocks_count(es) - 1);
+ return -EINVAL;
+ }
+
err = ext4_defrag(filp, defrag.start_offset,
- defrag.defrag_size);
+ defrag.defrag_size, defrag.goal);
}

return err;
}

/**
+ * ext4_defrag_merge_across_blocks - Merge extents across leaf block
+ *
+ * @handle:	journal handle
+ * @org_inode:	original inode
+ * @o_start:	first original extent to be defragmented
+ * @o_end:	last original extent to be defragmented
+ * @start_ext:	first new extent to be merged
+ * @new_ext:	middle of new extent to be merged
+ * @end_ext:	last new extent to be merged
+ *
+ * An extent whose ee_len is zero is treated as "not present".  Depending
+ * on which of @start_ext, @new_ext and @end_ext are present, @o_start and
+ * @o_end are rewritten in place and the remaining extents are added with
+ * ext4_ext_insert_extent().
+ *
+ * This function returns 0 on success, -EIO if the combination of extents
+ * is not one of the handled cases, otherwise the error reported by
+ * ext4_ext_find_extent() or ext4_ext_insert_extent().
+ */
+static int
+ext4_defrag_merge_across_blocks(handle_t *handle, struct inode *org_inode,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext)
+{
+	struct ext4_ext_path *org_path = NULL;
+	struct ext4_ext_path *path;
+	ext4_lblk_t eblock = 0;
+	int new_flag = 0;
+	int end_flag = 0;
+	int err = 0;
+
+	if (le16_to_cpu(start_ext->ee_len) &&
+	    le16_to_cpu(new_ext->ee_len) &&
+	    le16_to_cpu(end_ext->ee_len)) {
+
+		if (o_start == o_end) {
+
+			/*       start_ext   new_ext    end_ext
+			 * dest |---------|-----------|--------|
+			 * org  |------------------------------|
+			 */
+
+			/* One original slot only: end_ext is inserted below */
+			end_flag = 1;
+		} else {
+
+			/*       start_ext   new_ext   end_ext
+			 * dest |---------|----------|---------|
+			 * org  |---------------|--------------|
+			 */
+
+			/* Reuse the last original slot for end_ext */
+			o_end->ee_block = end_ext->ee_block;
+			o_end->ee_len = end_ext->ee_len;
+			ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+		}
+
+		/* The first original slot keeps its start, shrinks to start_ext */
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if (le16_to_cpu(start_ext->ee_len) &&
+		   le16_to_cpu(new_ext->ee_len) &&
+		   !le16_to_cpu(end_ext->ee_len) &&
+		   o_start == o_end) {
+
+		/*	     start_ext	      new_ext
+		 * dest |--------------|---------------|
+		 * org  |------------------------------|
+		 */
+
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if (!le16_to_cpu(start_ext->ee_len) &&
+		   le16_to_cpu(new_ext->ee_len) &&
+		   le16_to_cpu(end_ext->ee_len) &&
+		   o_start == o_end) {
+
+		/*	      new_ext	      end_ext
+		 * dest |--------------|---------------|
+		 * org  |------------------------------|
+		 */
+
+		o_end->ee_block = end_ext->ee_block;
+		o_end->ee_len = end_ext->ee_len;
+		ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+
+		/*
+		 * Look up the path at the logical block where new_ext
+		 * starts (this is 0 when new_ext is the first block).
+		 */
+		eblock = le32_to_cpu(new_ext->ee_block);
+
+		new_flag = 1;
+	} else {
+		printk(KERN_ERR "ext4 defrag: Unexpected merge case\n");
+		return -EIO;
+	}
+
+	if (new_flag) {
+		org_path = ext4_ext_find_extent(org_inode, eblock, NULL);
+		if (IS_ERR(org_path)) {
+			err = PTR_ERR(org_path);
+			org_path = NULL;
+			goto out;
+		}
+		err = ext4_ext_insert_extent(handle, org_inode,
+					org_path, new_ext);
+		if (err)
+			goto out;
+	}
+
+	if (end_flag) {
+		/*
+		 * Release the buffer references still held in org_path
+		 * before handing it back to ext4_ext_find_extent() for
+		 * reuse.
+		 * NOTE(review): assumes ext4_ext_find_extent() overwrites
+		 * the p_bh slots of a caller-supplied path without
+		 * releasing them, and does not free a caller-supplied path
+		 * on failure -- confirm against fs/ext4/extents.c.
+		 */
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+
+		path = ext4_ext_find_extent(org_inode,
+				le32_to_cpu(end_ext->ee_block) - 1, org_path);
+		if (IS_ERR(path)) {
+			/* org_path is kept so that it is freed at "out" */
+			err = PTR_ERR(path);
+			goto out;
+		}
+		org_path = path;
+
+		err = ext4_ext_insert_extent(handle, org_inode,
+					org_path, end_ext);
+		if (err)
+			goto out;
+	}
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_merge_inside_block - Merge new extent to the extent block
+ *
+ * @o_start:		first original extent to be merged
+ * @o_end:		last original extent to be merged
+ * @start_ext:		first new extent to be merged
+ * @new_ext:		middle of new extent to be merged
+ * @end_ext:		last new extent to be merged
+ * @eh:			extent header of target leaf block
+ * @replaced:		the number of blocks which will be replaced with new_ext
+ * @range_to_move:	number of slots by which the entries following @o_end
+ *			are shifted; also added to the header entry count
+ *
+ * Writes up to three entries starting at @o_start: the length of
+ * @start_ext, then @new_ext (with its length taken from @replaced), then
+ * a copy of @end_ext.  Each one is skipped when its ee_len is zero.
+ *
+ * This function always returns 0.
+ */
+static int
+ext4_defrag_merge_inside_block(struct ext4_extent *o_start,
+		struct ext4_extent *o_end, struct ext4_extent *start_ext,
+		struct ext4_extent *new_ext, struct ext4_extent *end_ext,
+		struct ext4_extent_header *eh, ext4_fsblk_t replaced,
+		int range_to_move)
+{
+	struct ext4_extent *slot = o_start;
+	struct ext4_extent *tail = o_end + 1;
+	unsigned tail_bytes;
+
+	/*
+	 * Shift the entries that follow o_end before any slot is
+	 * written, so that they are not overwritten below.
+	 */
+	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+		tail_bytes = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
+			     (unsigned long)tail;
+		memmove(tail + range_to_move, tail, tail_bytes);
+	}
+
+	/* Start entry: only the length of the existing slot changes */
+	if (le16_to_cpu(start_ext->ee_len)) {
+		slot->ee_len = start_ext->ee_len;
+		slot++;
+	}
+
+	/* New entry: logical block, replaced length and physical block */
+	if (le16_to_cpu(new_ext->ee_len)) {
+		slot->ee_block = new_ext->ee_block;
+		slot->ee_len = cpu_to_le16(replaced);
+		ext4_ext_store_pblock(slot, ext_pblock(new_ext));
+		slot++;
+	}
+
+	/* End entry: copied as a whole (a zero test needs no byte swap) */
+	if (end_ext->ee_len)
+		*slot = *end_ext;
+
+	/* Adjust the total entries counter on the extent block */
+	le16_add_cpu(&eh->eh_entries, range_to_move);
+
+	return 0;
+}
+
+/**
* ext4_defrag_merge_extents - Merge new extent
*
* @handle: journal handle
@@ -145,6 +337,63 @@ ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,
struct ext4_extent *start_ext, struct ext4_extent *new_ext,
struct ext4_extent *end_ext, ext4_fsblk_t replaced)
{
+ struct ext4_extent_header *eh;
+ unsigned need_slots, slots_range;
+ int range_to_move, depth, ret;
+
+ /*
+ * The extents need to be inserted
+ * start_extent + new_extent + end_extent.
+ */
+ need_slots = (le16_to_cpu(start_ext->ee_len) ? 1 : 0) +
+ (le16_to_cpu(end_ext->ee_len) ? 1 : 0) +
+ (le16_to_cpu(new_ext->ee_len) ? 1 : 0);
+
+ /* The number of slots between start and end */
+ slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
+ / sizeof(struct ext4_extent);
+
+ /* Range to move the end of extent */
+ range_to_move = need_slots - slots_range;
+ depth = org_path->p_depth;
+ org_path += depth;
+ eh = org_path->p_hdr;
+
+ if (depth) {
+ /* Register to journal */
+ ret = ext4_journal_get_write_access(handle, org_path->p_bh);
+ if (ret)
+ return ret;
+ }
+
+ /* Expansion */
+ if (range_to_move > 0 &&
+ (range_to_move > le16_to_cpu(eh->eh_max)
+ - le16_to_cpu(eh->eh_entries))) {
+
+ ret = ext4_defrag_merge_across_blocks(handle, org_inode,
+ o_start, o_end, start_ext, new_ext,
+ end_ext);
+ if (ret < 0)
+ return ret;
+ } else {
+ ret = ext4_defrag_merge_inside_block(o_start, o_end,
+ start_ext, new_ext, end_ext, eh,
+ replaced, range_to_move);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (depth) {
+ ret = ext4_journal_dirty_metadata(handle, org_path->p_bh);
+ if (ret)
+ return ret;
+ } else {
+ ret = ext4_mark_inode_dirty(handle, org_inode);
+ if (ret < 0)
+ return ret;
+ }
+
return 0;

}
@@ -489,6 +738,7 @@ out:
* @dest_path: indicating the temporary inode's extent
* @req_blocks: contiguous blocks count we need
* @iblock: target file offset
+ * @goal: goal offset
*
*/
static void
@@ -496,7 +746,8 @@ ext4_defrag_fill_ar(struct inode *org_inode, struct inode *dest_inode,
struct ext4_allocation_request *ar,
struct ext4_ext_path *org_path,
struct ext4_ext_path *dest_path,
- ext4_fsblk_t req_blocks, ext4_lblk_t iblock)
+ ext4_fsblk_t req_blocks, ext4_lblk_t iblock,
+ ext4_fsblk_t goal)
{
ar->inode = dest_inode;
ar->len = req_blocks;
@@ -508,7 +759,10 @@ ext4_defrag_fill_ar(struct inode *org_inode, struct inode *dest_inode,
ar->lright = 0;
ar->pright = 0;

- ar->goal = ext4_ext_find_goal(dest_inode, dest_path, iblock);
+ if (goal)
+ ar->goal = goal;
+ else
+ ar->goal = ext4_ext_find_goal(dest_inode, dest_path, iblock);
}

/**
@@ -705,6 +959,7 @@ out:
* original extent tree
* @tar_end: the last block number of the allocated blocks
* @sum_tmp: the extents count in the allocated blocks
+ * @goal: block offset for allocation
*
* This function returns the values as below.
* 0 (improved)
@@ -714,7 +969,7 @@ out:
static int
ext4_defrag_comp_ext_count(struct inode *org_inode,
struct ext4_ext_path *org_path, ext4_lblk_t tar_end,
- int sum_tmp)
+ int sum_tmp, ext4_fsblk_t goal)
{
struct ext4_extent *ext = NULL;
int depth = ext_depth(org_inode);
@@ -738,7 +993,7 @@ ext4_defrag_comp_ext_count(struct inode *org_inode,
* Fail if goal is not set and the fragmentation
* is not improved.
*/
- if (sum_org == sum_tmp) {
+ if (sum_org == sum_tmp && !goal) {
/* Not improved */
ret = 1;
} else if (sum_org < sum_tmp) {
@@ -769,6 +1024,7 @@ ext4_defrag_comp_ext_count(struct inode *org_inode,
* @req_start: starting offset to allocate in blocks
* @req_blocks: the number of blocks to allocate
* @iblock: file related offset
+ * @goal: block offset for allocation
*
* This function returns the value as below:
* 0 (succeed)
@@ -778,7 +1034,8 @@ ext4_defrag_comp_ext_count(struct inode *org_inode,
static int
ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
struct ext4_ext_path *org_path, ext4_lblk_t req_start,
- ext4_lblk_t req_blocks, ext4_lblk_t iblock)
+ ext4_lblk_t req_blocks, ext4_lblk_t iblock,
+ ext4_fsblk_t goal)
{
handle_t *handle;
struct ext4_sb_info *sbi = EXT4_SB(org_inode->i_sb);
@@ -806,7 +1063,7 @@ ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,

/* Fill struct ext4_allocation_request with necessary info */
ext4_defrag_fill_ar(org_inode, tmp_inode, &ar, org_path,
- dest_path, req_blocks, iblock);
+ dest_path, req_blocks, iblock, goal);

handle = ext4_journal_start(tmp_inode, 0);
if (IS_ERR(handle)) {
@@ -841,7 +1098,7 @@ ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
}

ret = ext4_defrag_comp_ext_count(org_inode, org_path, req_end,
- sum_tmp);
+ sum_tmp, goal);

out:
if (ret < 0 && ar.len)
@@ -869,11 +1126,13 @@ out2:
*
* @org_inode: original inode
* @defrag_size: size of defrag in blocks
+ * @goal: pointer to block offset for allocation
*
* This function returns 0 if succeed, otherwise returns error value.
*/
static int
-ext4_defrag_check(struct inode *org_inode, ext4_lblk_t defrag_size)
+ext4_defrag_check(struct inode *org_inode, ext4_lblk_t defrag_size,
+ ext4_fsblk_t *goal)
{
/* ext4 online defrag needs mballoc mount option. */
if (!test_opt(org_inode->i_sb, MBALLOC)) {
@@ -962,13 +1221,14 @@ out:
* @filp: pointer to file
* @block_start: starting offset to defrag in blocks
* @defrag_size: size of defrag in blocks
+ * @goal: block offset for allocation
*
* This function returns the number of blocks if succeed, otherwise
* returns error value.
*/
int
ext4_defrag(struct file *filp, ext4_lblk_t block_start,
- ext4_lblk_t defrag_size)
+ ext4_lblk_t defrag_size, ext4_fsblk_t goal)
{
struct inode *org_inode = filp->f_dentry->d_inode, *tmp_inode = NULL;
struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL;
@@ -983,7 +1243,7 @@ ext4_defrag(struct file *filp, ext4_lblk_t block_start,
int block_len_in_page;

/* Check the filesystem environment whether defrag can be done */
- ret = ext4_defrag_check(org_inode, defrag_size);
+ ret = ext4_defrag_check(org_inode, defrag_size, &goal);
if (ret < 0)
return ret;

@@ -1093,14 +1353,14 @@ ext4_defrag(struct file *filp, ext4_lblk_t block_start,
}

/* Found an isolated block */
- if (seq_extents == 1) {
+ if (seq_extents == 1 && !goal) {
seq_start = le32_to_cpu(ext_cur->ee_block);
goto CLEANUP;
}

ret = ext4_defrag_new_extent_tree(org_inode, tmp_inode,
org_path, seq_start, seq_blocks,
- block_start);
+ block_start, goal);

if (ret < 0) {
break;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c6a194..556ff5e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1142,7 +1142,7 @@ extern void ext4_inode_table_set(struct super_block *sb,
extern int ext4_ext_journal_restart(handle_t *handle, int needed);
/* defrag.c */
extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start,
- ext4_lblk_t defrag_size);
+ ext4_lblk_t defrag_size, ext4_fsblk_t goal);
extern int ext4_defrag_ioctl(struct inode *, struct file *, unsigned int,
unsigned long);