2008-09-27 07:27:18

by Akira Fujita

[permalink] [raw]
Subject: [RFC][PATCH 10/12]ext4: Add the EXT4_IOC_MOVE_VICTIM ioctl

ext4: online defrag -- Add the EXT4_IOC_MOVE_VICTIM ioctl.

From: Akira Fujita <[email protected]>

The EXT4_IOC_MOVE_VICTIM ioctl moves the victim extents into another block
group, so that contiguous free space is created in the target block group.
This ioctl is used only in force defrag mode (-f).

Signed-off-by: Akira Fujita <[email protected]>
Signed-off-by: Takashi Sato <[email protected]>
---
fs/ext4/balloc.c | 1 +
fs/ext4/defrag.c | 262 ++++++++++++++++++++++++++++++++++++++++++------
fs/ext4/ext4.h | 18 +++-
fs/ext4/ext4_extents.h | 5 +
fs/ext4/extents.c | 54 ++++++++--
fs/ext4/ioctl.c | 3 +-
fs/ext4/mballoc.c | 5 +
fs/ext4/mballoc.h | 1 +
8 files changed, 307 insertions(+), 42 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 2344a96..969e996 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -2026,6 +2026,7 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
ar.goal = goal;
ar.len = *count;
ar.logical = iblock;
+ ar.excepted_group = -1;

if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
/* enable in-core preallocation for data block allocation */
diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index 26fb4a6..a2b17c5 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -380,6 +380,99 @@ err:
}

/**
+ * ext4_defrag_move_victim - Create free space for defrag
+ *
+ * @target_filp:	target file
+ * @ext_info:		victim inode number, goal and extents to move
+ *
+ * Each extent in @ext_info is handed to ext4_defrag() in the
+ * DEFRAG_FORCE_VICTIM phase on the victim inode (ext_info->ino), and
+ * a journal commit is forced after every extent.  If ext_info->goal
+ * is -1, the goal is set to the first block of the block group that
+ * follows the group containing ext_info->ext[0].start.
+ *
+ * On failure the reservation of the target inode is discarded while
+ * holding its i_data_sem.
+ *
+ * This function returns 0 if succeed, -EINVAL if ext_info->entries
+ * is larger than DEFRAG_MAX_ENT, otherwise returns error value.
+ */
+static int
+ext4_defrag_move_victim(struct file *target_filp,
+			struct ext4_extents_info *ext_info)
+{
+	struct inode *org_inode = target_filp->f_dentry->d_inode;
+	struct super_block *sb = org_inode->i_sb;
+	/* Zeroed so that fields not set below never hold stack garbage */
+	struct file victim_file = { };
+	struct dentry victim_dent = { };
+	struct inode *victim_inode;
+	struct ext4_extent_data ext;
+	ext4_fsblk_t goal = ext_info->goal;
+	ext4_group_t group;
+	ext4_grpblk_t grp_off;
+	int ret, i;
+
+	/*
+	 * @ext_info was filled in by user space.  ext4.h documents
+	 * DEFRAG_MAX_ENT as the per-ioctl maximum number of extents, so
+	 * refuse larger counts before ext_info->ext[] is indexed.
+	 */
+	if (ext_info->entries > DEFRAG_MAX_ENT)
+		return -EINVAL;
+
+	/* Setup dummy extent data */
+	ext.len = 0;
+
+	/* Get the inode of the victim file */
+	victim_inode = ext4_iget(sb, ext_info->ino);
+	if (IS_ERR(victim_inode))
+		return PTR_ERR(victim_inode);
+
+	/* Setup file for the victim file */
+	victim_dent.d_inode = victim_inode;
+	victim_file.f_dentry = &victim_dent;
+	victim_file.f_mapping = victim_inode->i_mapping;
+
+	/* Set the goal appropriate offset */
+	if (goal == -1) {
+		ext4_get_group_no_and_offset(victim_inode->i_sb,
+				ext_info->ext[0].start, &group, &grp_off);
+		goal = ext4_group_first_block_no(sb, group + 1);
+	}
+
+	for (i = 0; i < ext_info->entries; i++) {
+		/* Move original blocks to another block group */
+		ret = ext4_defrag(&victim_file, ext_info->ext[i].block,
+			ext_info->ext[i].len, goal, DEFRAG_FORCE_VICTIM, &ext);
+		if (ret < 0) {
+			printk(KERN_ERR "ext4 defrag: "
+				"Moving victim file failed. ino [%llu]\n",
+				ext_info->ino);
+			goto err;
+		}
+
+		/* Sync journal blocks before reservation */
+		ret = ext4_force_commit(sb);
+		if (ret) {
+			printk(KERN_ERR "ext4 defrag: "
+				"ext4_force_commit failed(%d)\n", ret);
+			goto err;
+		}
+	}
+
+	iput(victim_inode);
+	return 0;
+err:
+	/* Drop the reservation held by the target inode */
+	down_write(&EXT4_I(org_inode)->i_data_sem);
+	ext4_discard_reservation(org_inode);
+	up_write(&EXT4_I(org_inode)->i_data_sem);
+	iput(victim_inode);
+	return ret;
+}
+
+/**
* ext4_defrag_fblocks_distribution - Search free blocks distribution
*
* @org_inode: original inode
@@ -538,6 +612,16 @@ int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
return -EFAULT;

err = ext4_defrag_reserve_fblocks(inode, &ext_info);
+ } else if (cmd == EXT4_IOC_MOVE_VICTIM) {
+ struct ext4_extents_info ext_info;
+
+ if (copy_from_user(&ext_info,
+ (struct ext4_extents_info __user *)arg,
+ sizeof(ext_info)))
+ return -EFAULT;
+
+ err = ext4_defrag_move_victim(filp, &ext_info);
+
} else if (cmd == EXT4_IOC_DEFRAG) {
struct ext4_ext_defrag_data defrag;
struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
@@ -564,7 +648,8 @@ int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
}

err = ext4_defrag(filp, defrag.start_offset,
- defrag.defrag_size, defrag.goal);
+ defrag.defrag_size, defrag.goal, defrag.flag,
+ &defrag.ext);
}

return err;
@@ -580,6 +665,7 @@ int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
* @start_ext: first new extent to be merged
* @new_ext: middle of new extent to be merged
* @end_ext: last new extent to be merged
+ * @phase: phase of the force defrag mode
*
* This function returns 0 if succeed, otherwise returns error value.
*/
@@ -587,14 +673,20 @@ static int
ext4_defrag_merge_across_blocks(handle_t *handle, struct inode *org_inode,
struct ext4_extent *o_start, struct ext4_extent *o_end,
struct ext4_extent *start_ext, struct ext4_extent *new_ext,
- struct ext4_extent *end_ext)
+ struct ext4_extent *end_ext, int phase)
{
struct ext4_ext_path *org_path = NULL;
ext4_lblk_t eblock = 0;
int new_flag = 0;
int end_flag = 0;
+ int defrag_flag;
int err;

+ if (phase == DEFRAG_FORCE_VICTIM)
+ defrag_flag = 1;
+ else
+ defrag_flag = 0;
+
if (le16_to_cpu(start_ext->ee_len) &&
le16_to_cpu(new_ext->ee_len) &&
le16_to_cpu(end_ext->ee_len)) {
@@ -671,8 +763,8 @@ ext4_defrag_merge_across_blocks(handle_t *handle, struct inode *org_inode,
org_path = NULL;
goto out;
}
- err = ext4_ext_insert_extent(handle, org_inode,
- org_path, new_ext);
+ err = ext4_ext_insert_extent_defrag(handle, org_inode,
+ org_path, new_ext, defrag_flag);
if (err)
goto out;
}
@@ -685,8 +777,8 @@ ext4_defrag_merge_across_blocks(handle_t *handle, struct inode *org_inode,
org_path = NULL;
goto out;
}
- err = ext4_ext_insert_extent(handle, org_inode,
- org_path, end_ext);
+ err = ext4_ext_insert_extent_defrag(handle, org_inode,
+ org_path, end_ext, defrag_flag);
if (err)
goto out;
}
@@ -764,6 +856,7 @@ ext4_defrag_merge_inside_block(struct ext4_extent *o_start,
* @new_ext: middle of new extent to be merged
* @end_ext: last new extent to be merged
* @replaced: the number of blocks which will be replaced with new_ext
+ * @phase: phase of the force defrag mode
*
* This function returns 0 if succeed, otherwise returns error value.
*/
@@ -772,7 +865,7 @@ ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,
struct ext4_ext_path *org_path,
struct ext4_extent *o_start, struct ext4_extent *o_end,
struct ext4_extent *start_ext, struct ext4_extent *new_ext,
- struct ext4_extent *end_ext, ext4_fsblk_t replaced)
+ struct ext4_extent *end_ext, ext4_fsblk_t replaced, int phase)
{
struct ext4_extent_header *eh;
unsigned need_slots, slots_range;
@@ -810,7 +903,7 @@ ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,

ret = ext4_defrag_merge_across_blocks(handle, org_inode,
o_start, o_end, start_ext, new_ext,
- end_ext);
+ end_ext, phase);
if (ret < 0)
return ret;
} else {
@@ -843,13 +936,14 @@ ext4_defrag_merge_extents(handle_t *handle, struct inode *org_inode,
* @org_path: path indicates first extent to be defraged
* @dext: destination extent
* @from: start offset on the target file
+ * @phase: phase of the force defrag mode
*
* This function returns 0 if succeed, otherwise returns error value.
*/
static int
ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
struct ext4_ext_path *org_path, struct ext4_extent *dext,
- ext4_lblk_t *from)
+ ext4_lblk_t *from, int phase)
{
struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext;
struct ext4_extent new_ext, start_ext, end_ext;
@@ -950,7 +1044,7 @@ ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
+ le16_to_cpu(oext->ee_len) - 1) {
ret = ext4_defrag_merge_extents(handle, org_inode,
org_path, o_start, o_end, &start_ext,
- &new_ext, &end_ext, replaced);
+ &new_ext, &end_ext, replaced, phase);
if (ret < 0)
return ret;

@@ -1002,6 +1096,7 @@ ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
* @from: block offset of org_inode
* @dest_off: block offset of dest_inode
* @count: block count to be replaced
+ * @phase: phase of the force defrag mode
*
* This function returns 0 if succeed, otherwise returns error value.
* Replace extents for blocks from "from" to "from + count - 1".
@@ -1009,7 +1104,7 @@ ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode,
static int
ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,
struct inode *dest_inode, ext4_lblk_t from,
- ext4_lblk_t dest_off, ext4_lblk_t count)
+ ext4_lblk_t dest_off, ext4_lblk_t count, int phase)
{
struct ext4_ext_path *org_path = NULL;
struct ext4_ext_path *dest_path = NULL;
@@ -1070,7 +1165,7 @@ ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,

/* Loop for the original extent blocks */
err = ext4_defrag_leaf_block(handle, org_inode,
- org_path, dext, &from);
+ org_path, dext, &from, phase);
if (err < 0)
goto out;

@@ -1080,7 +1175,7 @@ ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode,
* e.g. ext4_defrag_merge_extents()
*/
err = ext4_defrag_leaf_block(handle, dest_inode,
- dest_path, swap_ext, &dest_off);
+ dest_path, swap_ext, &dest_off, -1);
if (err < 0)
goto out;

@@ -1176,6 +1271,7 @@ out:
* @req_blocks: contiguous blocks count we need
* @iblock: target file offset
* @goal: goal offset
+ * @phase: phase of the force defrag mode
*
*/
static void
@@ -1184,8 +1280,22 @@ ext4_defrag_fill_ar(struct inode *org_inode, struct inode *dest_inode,
struct ext4_ext_path *org_path,
struct ext4_ext_path *dest_path,
ext4_fsblk_t req_blocks, ext4_lblk_t iblock,
- ext4_fsblk_t goal)
+ ext4_fsblk_t goal, int phase)
{
+ ext4_group_t org_grp_no;
+ ext4_grpblk_t org_blk_off;
+ int org_depth = ext_depth(org_inode);
+
+ if (phase == DEFRAG_FORCE_VICTIM) {
+ ext4_get_group_no_and_offset(org_inode->i_sb,
+ ext_pblock(org_path[org_depth].p_ext),
+ &org_grp_no, &org_blk_off);
+ ar->excepted_group = org_grp_no;
+ } else {
+ /* Allocate contiguous blocks to any block group */
+ ar->excepted_group = -1;
+ }
+
ar->inode = dest_inode;
ar->len = req_blocks;
ar->logical = iblock;
@@ -1249,6 +1359,56 @@ ext4_defrag_alloc_blocks(handle_t *handle, struct inode *org_inode,
}

/**
+ * ext4_defrag_check_phase
+ *	- Validate an allocation result against the force defrag phase
+ *
+ * @ar:			allocation request; ar->len and ar->excepted_group are read
+ * @dest_grp_no:	block group num of the allocated blocks
+ * @goal_grp_no:	block group num of the destination of block allocation
+ * @alloc_total:	sum total of the allocated blocks
+ * @req_blocks:		contiguous blocks count we need
+ * @phase:		phase of the force defrag mode
+ *
+ * Returns 0 when the check for @phase passes (or @phase is not one of
+ * the DEFRAG_FORCE_* values), -ENOSPC or -EIO otherwise.
+ */
+static int
+ext4_defrag_check_phase(struct ext4_allocation_request *ar,
+			ext4_group_t dest_grp_no, ext4_group_t goal_grp_no,
+			ext4_fsblk_t alloc_total, ext4_lblk_t req_blocks,
+			int phase)
+{
+	if (phase == DEFRAG_FORCE_TRY) {
+		/*
+		 * Not enough space: -ENOSPC triggers the
+		 * DEFRAG_FORCE_VICTIM phase.
+		 */
+		return (ar->len != req_blocks) ? -ENOSPC : 0;
+	}
+
+	if (phase == DEFRAG_FORCE_VICTIM) {
+		/* New blocks must not land in the excepted block group. */
+		if (dest_grp_no != ar->excepted_group)
+			return 0;
+		printk(KERN_ERR "ext4 defrag: Failed to allocate"
+			" victim file to other block group\n");
+		return -ENOSPC;
+	}
+
+	if (phase == DEFRAG_FORCE_GATHER) {
+		/* Maybe reserved blocks are already used by other process. */
+		if (dest_grp_no == goal_grp_no && alloc_total == req_blocks)
+			return 0;
+		printk(KERN_ERR "ext4 defrag: Reserved blocks are"
+			" already used by other process\n");
+		return -EIO;
+	}
+
+	/* Any other phase value is not checked. */
+	return 0;
+}
+
+/**
* ext4_defrag_partial - Defrag a file per page
*
* @tmp_inode: temporary inode
@@ -1257,13 +1417,15 @@ ext4_defrag_alloc_blocks(handle_t *handle, struct inode *org_inode,
* @dest_blk_offset: block index on temporary file
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
+ * @phase: phase of the force defrag mode
*
* This function returns 0 if succeed, otherwise returns error value.
*/
static int
ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
pgoff_t org_page_offset, ext4_lblk_t dest_blk_offset,
- int data_offset_in_page, int block_len_in_page)
+ int data_offset_in_page, int block_len_in_page,
+ int phase)
{
struct inode *org_inode = filp->f_dentry->d_inode;
struct address_space *mapping = org_inode->i_mapping;
@@ -1346,7 +1508,7 @@ ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
try_to_release_page(page, 0);
ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode,
org_blk_offset, dest_blk_offset,
- block_len_in_page);
+ block_len_in_page, phase);
if (ret < 0)
goto out;

@@ -1397,6 +1559,7 @@ out:
* @tar_end: the last block number of the allocated blocks
* @sum_tmp: the extents count in the allocated blocks
* @goal: block offset for allocation
+ * @phase: phase of the force defrag mode
*
* This function returns the values as below.
* 0 (improved)
@@ -1406,7 +1569,7 @@ out:
static int
ext4_defrag_comp_ext_count(struct inode *org_inode,
struct ext4_ext_path *org_path, ext4_lblk_t tar_end,
- int sum_tmp, ext4_fsblk_t goal)
+ int sum_tmp, ext4_fsblk_t goal, int phase)
{
struct ext4_extent *ext = NULL;
int depth = ext_depth(org_inode);
@@ -1433,7 +1596,8 @@ ext4_defrag_comp_ext_count(struct inode *org_inode,
if (sum_org == sum_tmp && !goal) {
/* Not improved */
ret = 1;
- } else if (sum_org < sum_tmp) {
+ } else if (sum_org < sum_tmp &&
+ phase != DEFRAG_FORCE_VICTIM) {
/* Fragment increased */
ret = -ENOSPC;
printk(KERN_ERR "ext4 defrag: "
@@ -1462,6 +1626,7 @@ ext4_defrag_comp_ext_count(struct inode *org_inode,
* @req_blocks: the number of blocks to allocate
* @iblock: file related offset
* @goal: block offset for allocation
+ * @phase: phase of the force defrag mode
*
* This function returns the value as below:
* 0 (succeed)
@@ -1472,7 +1637,7 @@ static int
ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
struct ext4_ext_path *org_path, ext4_lblk_t req_start,
ext4_lblk_t req_blocks, ext4_lblk_t iblock,
- ext4_fsblk_t goal)
+ ext4_fsblk_t goal, int phase)
{
handle_t *handle;
struct ext4_sb_info *sbi = EXT4_SB(org_inode->i_sb);
@@ -1484,6 +1649,8 @@ ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
ext4_fsblk_t newblock = 0;
ext4_lblk_t req_end = req_start + req_blocks - 1;
ext4_lblk_t rest_blocks = 0;
+ ext4_group_t dest_group_no, goal_group_no;
+ ext4_grpblk_t dest_blk_off, goal_blk_off;
int sum_tmp = 0;
int metadata = 1;
int ret;
@@ -1500,7 +1667,7 @@ ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,

/* Fill struct ext4_allocation_request with necessary info */
ext4_defrag_fill_ar(org_inode, tmp_inode, &ar, org_path,
- dest_path, req_blocks, iblock, goal);
+ dest_path, req_blocks, iblock, goal, phase);

handle = ext4_journal_start(tmp_inode, 0);
if (IS_ERR(handle)) {
@@ -1508,6 +1675,9 @@ ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
goto out2;
}

+ ext4_get_group_no_and_offset(tmp_inode->i_sb, goal,
+ &goal_group_no, &goal_blk_off);
+
while (alloc_total != req_blocks) {
/* Allocate blocks */
ret = ext4_defrag_alloc_blocks(handle, org_inode, tmp_inode,
@@ -1517,9 +1687,21 @@ ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
/* Claimed blocks are already reserved */
EXT4_I(ar.inode)->i_delalloc_reserved_flag = 1;

+ ext4_get_group_no_and_offset(tmp_inode->i_sb, newblock,
+ &dest_group_no, &dest_blk_off);
+
alloc_total += ar.len;
rest_blocks = req_blocks - alloc_total;

+ /* Checks that are done only in force mode */
+ if (phase) {
+ ret = ext4_defrag_check_phase(&ar, dest_group_no,
+ goal_group_no, alloc_total,
+ req_blocks, phase);
+ if (ret < 0)
+ goto out;
+ }
+
newex.ee_block = cpu_to_le32(alloc_total - ar.len);
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(ar.len);
@@ -1529,13 +1711,14 @@ ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
if (ret < 0)
goto out;

- ar.goal = newblock + ar.len;
+ if (!phase)
+ ar.goal = newblock + ar.len;
ar.len = req_blocks - alloc_total;
sum_tmp++;
}

ret = ext4_defrag_comp_ext_count(org_inode, org_path, req_end,
- sum_tmp, goal);
+ sum_tmp, goal, phase);

out:
if (ret < 0 && ar.len)
@@ -1562,14 +1745,16 @@ out2:
* ext4_defrag_check - Check the environment whether a defrag can be done
*
* @org_inode: original inode
+ * @ext: extent to be moved (only defrag force mode)
* @defrag_size: size of defrag in blocks
* @goal: pointer to block offset for allocation
+ * @phase: phase of the force defrag mode
*
* This function returns 0 if succeed, otherwise returns error value.
*/
static int
-ext4_defrag_check(struct inode *org_inode, ext4_lblk_t defrag_size,
- ext4_fsblk_t *goal)
+ext4_defrag_check(struct inode *org_inode, struct ext4_extent_data *ext,
+ ext4_lblk_t defrag_size, ext4_fsblk_t *goal, int *phase)
{
/* ext4 online defrag needs mballoc mount option. */
if (!test_opt(org_inode->i_sb, MBALLOC)) {
@@ -1578,6 +1763,17 @@ ext4_defrag_check(struct inode *org_inode, ext4_lblk_t defrag_size,
return -EOPNOTSUPP;
}

+ if (ext->len) {
+ /* Setup for the force defrag mode */
+ if (ext->len < defrag_size) {
+ printk(KERN_ERR "ext4 defrag: "
+ "Invalid length of extent\n");
+ return -EINVAL;
+ }
+ *phase = DEFRAG_FORCE_GATHER;
+ *goal = ext->start;
+ }
+
return 0;
}

@@ -1659,13 +1855,16 @@ out:
* @block_start: starting offset to defrag in blocks
* @defrag_size: size of defrag in blocks
* @goal: block offset for allocation
+ * @phase: phase of the force defrag mode
+ * @ext: extent to be moved (only defrag force mode)
*
* This function returns the number of blocks if succeed, otherwise
* returns error value.
*/
int
ext4_defrag(struct file *filp, ext4_lblk_t block_start,
- ext4_lblk_t defrag_size, ext4_fsblk_t goal)
+ ext4_lblk_t defrag_size, ext4_fsblk_t goal, int phase,
+ struct ext4_extent_data *ext)
{
struct inode *org_inode = filp->f_dentry->d_inode, *tmp_inode = NULL;
struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL;
@@ -1680,7 +1879,7 @@ ext4_defrag(struct file *filp, ext4_lblk_t block_start,
int block_len_in_page;

/* Check the filesystem environment whether defrag can be done */
- ret = ext4_defrag_check(org_inode, defrag_size, &goal);
+ ret = ext4_defrag_check(org_inode, ext, defrag_size, &goal, &phase);
if (ret < 0)
return ret;

@@ -1797,11 +1996,11 @@ ext4_defrag(struct file *filp, ext4_lblk_t block_start,

ret = ext4_defrag_new_extent_tree(org_inode, tmp_inode,
org_path, seq_start, seq_blocks,
- block_start, goal);
+ block_start, goal, phase);

if (ret < 0) {
break;
- } else if (ret == 1) {
+ } else if (ret == 1 && (!goal || (goal && !phase))) {
ret = 0;
seq_start = le32_to_cpu(ext_cur->ee_block);
goto CLEANUP;
@@ -1846,7 +2045,8 @@ ext4_defrag(struct file *filp, ext4_lblk_t block_start,
org_page_offset,
dest_block_offset,
data_offset_in_page,
- block_len_in_page);
+ block_len_in_page,
+ phase);
if (ret < 0)
goto out;

@@ -1905,6 +2105,10 @@ out:
kfree(holecheck_path);
}

+ if (phase == DEFRAG_FORCE_GATHER)
+ /* Release reserved block in force mode */
+ ext4_discard_reservation(org_inode);
+
up_write(&EXT4_I(org_inode)->i_data_sem);
mutex_unlock(&org_inode->i_mutex);

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index eef7885..4e54eb4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -97,6 +97,11 @@ struct ext4_allocation_request {
unsigned long len;
/* flags. see above EXT4_MB_HINT_* */
unsigned long flags;
+ /*
+ * for ext4 online defrag:
+ * the block group which is excepted from allocation target
+ */
+ long long excepted_group;
};

/*
@@ -308,6 +313,7 @@ struct ext4_new_group_data {
#define EXT4_IOC_FREE_BLOCKS_INFO _IOW('f', 18, struct ext4_extents_info)
#define EXT4_IOC_FIEMAP_INO _IOW('f', 19, struct fiemap_ino)
#define EXT4_IOC_RESERVE_BLOCK _IOW('f', 20, struct ext4_extents_info)
+#define EXT4_IOC_MOVE_VICTIM _IOW('f', 21, struct ext4_extents_info)

/*
* ioctl commands in 32 bit emulation
@@ -330,8 +336,15 @@ struct ext4_new_group_data {
*
* DEFRAG_MAX_ENT: the maximum number of extents for exchanging between
* kernel-space and user-space per an ioctl
+ * DEFRAG_FORCE_TRY: check whether we have free space fragmentation or not
+ * DEFRAG_FORCE_VICTIM: move victim extents to make sufficient space
+ * DEFRAG_FORCE_GATHER: move the target file into the free space made in the
+ * DEFRAG_FORCE_VICTIM phase
*/
#define DEFRAG_MAX_ENT 32
+#define DEFRAG_FORCE_TRY 1
+#define DEFRAG_FORCE_VICTIM 2
+#define DEFRAG_FORCE_GATHER 3

struct ext4_extent_data {
ext4_lblk_t block; /* start logical block number */
@@ -343,6 +356,8 @@ struct ext4_ext_defrag_data {
ext4_lblk_t start_offset; /* start offset to defrag in blocks */
ext4_lblk_t defrag_size; /* size of defrag in blocks */
ext4_fsblk_t goal; /* block offset for allocation */
+ int flag; /* free space mode flag */
+ struct ext4_extent_data ext;
};

struct ext4_group_data_info {
@@ -1193,7 +1208,8 @@ extern void ext4_inode_table_set(struct super_block *sb,
extern int ext4_ext_journal_restart(handle_t *handle, int needed);
/* defrag.c */
extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start,
- ext4_lblk_t defrag_size, ext4_fsblk_t goal);
+ ext4_lblk_t defrag_size, ext4_fsblk_t goal,
+ int flag, struct ext4_extent_data *ext);
extern int ext4_defrag_ioctl(struct inode *, struct file *, unsigned int,
unsigned long);

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 6407222..fbe34b4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -251,5 +251,10 @@ extern void ext4_ext_drop_refs(struct ext4_ext_path *path);
extern ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t block);
+extern int ext4_ext_insert_extent_defrag(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int defrag);
+extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
+
#endif /* _EXT4_EXTENTS */

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7fcf72d..32c1aa9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -186,11 +186,17 @@ ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *ex, int *err)
+ struct ext4_extent *ex, int *err,
+ ext4_fsblk_t defrag_goal)
{
ext4_fsblk_t goal, newblock;

- goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
+ if (defrag_goal)
+ goal = defrag_goal;
+ else
+ goal = ext4_ext_find_goal(inode, path,
+ le32_to_cpu(ex->ee_block));
+
newblock = ext4_new_meta_block(handle, inode, goal, err);
return newblock;
}
@@ -675,7 +681,8 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
*/
static int ext4_ext_split(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext, int at)
+ struct ext4_extent *newext, int at,
+ ext4_fsblk_t defrag_goal)
{
struct buffer_head *bh = NULL;
int depth = ext_depth(inode);
@@ -726,7 +733,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
newblock = ext4_ext_new_meta_block(handle, inode, path,
- newext, &err);
+ newext, &err, defrag_goal);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -913,7 +920,8 @@ cleanup:
*/
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ struct ext4_extent *newext,
+ ext4_fsblk_t defrag_goal)
{
struct ext4_ext_path *curp = path;
struct ext4_extent_header *neh;
@@ -922,7 +930,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;

- newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err, defrag_goal);
if (newblock == 0)
return err;

@@ -998,7 +1007,8 @@ out:
*/
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ struct ext4_extent *newext,
+ ext4_fsblk_t defrag_goal)
{
struct ext4_ext_path *curp;
int depth, i, err = 0;
@@ -1018,7 +1028,8 @@ repeat:
if (EXT_HAS_FREE_INDEX(curp)) {
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
- err = ext4_ext_split(handle, inode, path, newext, i);
+ err = ext4_ext_split(handle, inode, path, newext, i,
+ defrag_goal);
if (err)
goto out;

@@ -1031,7 +1042,8 @@ repeat:
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, path, newext);
+ err = ext4_ext_grow_indepth(handle, inode, path,
+ newext, defrag_goal);
if (err)
goto out;

@@ -1211,7 +1223,7 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
* allocated block. Thus, index entries have to be consistent
* with leaves.
*/
-static ext4_lblk_t
+ext4_lblk_t
ext4_ext_next_allocated_block(struct ext4_ext_path *path)
{
int depth;
@@ -1477,6 +1489,19 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *newext)
{
+ return ext4_ext_insert_extent_defrag(handle, inode, path, newext, 0);
+}
+
+/*
+ * ext4_ext_insert_extent_defrag:
+ * The difference from ext4_ext_insert_extent is to use the first block
+ * in newext as the goal of the new index block.
+ */
+int
+ext4_ext_insert_extent_defrag(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int defrag)
+{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
@@ -1484,6 +1509,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
int depth, len, err;
ext4_lblk_t next;
unsigned uninitialized = 0;
+ ext4_fsblk_t defrag_goal;

BUG_ON(ext4_ext_get_actual_len(newext) == 0);
depth = ext_depth(inode);
@@ -1544,11 +1570,16 @@ repeat:
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
}

+ if (defrag)
+ defrag_goal = ext_pblock(newext);
+ else
+ defrag_goal = 0;
/*
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+ err = ext4_ext_create_new_leaf(handle, inode, path,
+ newext, defrag_goal);
if (err)
goto cleanup;
depth = ext_depth(inode);
@@ -2848,6 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ar.goal = ext4_ext_find_goal(inode, path, iblock);
ar.logical = iblock;
ar.len = allocated;
+ ar.excepted_group = -1;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
else
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9c992d8..a596785 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -261,7 +261,8 @@ setversion_out:
case EXT4_IOC_GROUP_INFO:
case EXT4_IOC_FREE_BLOCKS_INFO:
case EXT4_IOC_FIEMAP_INO:
- case EXT4_IOC_RESERVE_BLOCK: {
+ case EXT4_IOC_RESERVE_BLOCK:
+ case EXT4_IOC_MOVE_VICTIM: {
return ext4_defrag_ioctl(inode, filp, cmd, arg);
}
case EXT4_IOC_GROUP_ADD: {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 01a7daa..78f76da 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1775,6 +1775,10 @@ repeat:
if (group == EXT4_SB(sb)->s_groups_count)
group = 0;

+ if (ac->ac_excepted_group != -1 &&
+ group == ac->ac_excepted_group)
+ continue;
+
/* quick check to skip empty groups */
grp = ext4_get_group_info(ac->ac_sb, group);
if (grp->bb_free == 0)
@@ -4160,6 +4164,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ac->ac_bitmap_page = NULL;
ac->ac_buddy_page = NULL;
ac->ac_lg = NULL;
+ ac->ac_excepted_group = ar->excepted_group;

/* we have to define context: we'll we work with a file or
* locality group. this is a policy, actually */
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c7c9906..6b46c86 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -211,6 +211,7 @@ struct ext4_allocation_context {
struct page *ac_buddy_page;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
+ long long ac_excepted_group;
};

#define AC_STATUS_CONTINUE 1