2009-05-22 07:06:32

by Akira Fujita

[permalink] [raw]
Subject: [RFC][PATCH 1/3] Add EXT4_IOC_MOVE_EXT ioctl and related functions

ext4: online defrag -- Add EXT4_IOC_MOVE_EXT ioctl and related functions.

From: Akira Fujita <[email protected]>

The EXT4_IOC_MOVE_EXT ioctl exchanges the blocks between orig_fd and donor_fd,
and then writes the file data of orig_fd to donor_fd.
ext4_mext_move_extent() is the main function of ext4 online defrag,
and this patch includes all functions related to ext4 online defrag.

Signed-off-by: Akira Fujita <[email protected]>
Signed-off-by: Takashi Sato <[email protected]>
Signed-off-by: Kazuya Mio <[email protected]>
---
Makefile | 2
ext4.h | 15
ext4_extents.h | 4
extents.c | 4
ioctl.c | 36 +
move_extent.c | 1296 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 1354 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 8a34710..8867b2a 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o

ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o block_validity.o
+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o

ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9f0e482..ff55657 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -354,6 +354,7 @@ struct ext4_new_group_data {
/* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
+#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)

/*
* ioctl commands in 32 bit emulation
@@ -451,6 +452,15 @@ struct ext4_inode {
__le32 i_version_hi; /* high 32 bits for 64-bit version */
};

+struct move_extent { /* argument block of the EXT4_IOC_MOVE_EXT ioctl */
+ int orig_fd; /* original file descriptor (the ioctl handler in this patch operates on the file the ioctl was issued on) */
+ int donor_fd; /* donor file descriptor */
+ __u64 orig_start; /* logical start offset in block for orig */
+ __u64 donor_start; /* logical start offset in block for donor */
+ __u64 len; /* block length to be moved */
+ __u64 moved_len; /* moved block length (filled in by the kernel and copied back to user space on success) */
+};
+#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)

#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -1652,6 +1662,11 @@ extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, int flags);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+/* move_extent.c */
+extern int ext4_mext_move_extent(struct file *o_filp, struct file *d_filp,
+ __u64 start_orig, __u64 start_donor,
+ __u64 len, __u64 *moved_len);
+

/*
* Add new method to test wether block and inode bitmaps are properly
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index f0c3ec8..20a8410 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -221,12 +221,16 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
}

extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2);
extern int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d4e99e9..1903d9b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -49,7 +49,7 @@
* ext_pblock:
* combine low and high parts of physical block number into ext4_fsblk_t
*/
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
{
ext4_fsblk_t block;

@@ -1417,7 +1417,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
return err;
}

-static int
+int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 3b7faeb..68a7ddf 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/smp_lock.h>
#include <linux/mount.h>
+#include <linux/file.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -224,6 +225,41 @@ setversion_out:

return err;
}
+
+	case EXT4_IOC_MOVE_EXT: {
+		/*
+		 * Online defrag: exchange blocks between the file this ioctl
+		 * was issued on (the "original" file) and the donor file
+		 * named by me.donor_fd.  On success the structure, including
+		 * the number of blocks actually moved, is copied back to
+		 * user space.
+		 */
+		struct move_extent me;
+		struct file *donor_filp;
+		int err;
+
+		if (copy_from_user(&me,
+			(struct move_extent __user *)arg, sizeof(me)))
+			return -EFAULT;
+
+		donor_filp = fget(me.donor_fd);
+		if (!donor_filp)
+			return -EBADF;
+
+		/*
+		 * The donor's blocks are swapped with those of the original
+		 * file, i.e. the donor file is modified.  Refuse a donor the
+		 * caller did not open for writing; otherwise any file that
+		 * can merely be opened could have its contents replaced.
+		 */
+		if (!(donor_filp->f_mode & FMODE_WRITE)) {
+			fput(donor_filp);
+			return -EBADF;
+		}
+
+		/*
+		 * Without CAP_DAC_OVERRIDE the caller must own the original
+		 * file, and both files must carry the owner-read mode bit.
+		 */
+		if (!capable(CAP_DAC_OVERRIDE)) {
+			if ((current->real_cred->fsuid != inode->i_uid) ||
+				!(inode->i_mode & S_IRUSR) ||
+				!(donor_filp->f_dentry->d_inode->i_mode &
+				S_IRUSR)) {
+				fput(donor_filp);
+				return -EACCES;
+			}
+		}
+
+		err = ext4_mext_move_extent(filp, donor_filp, me.orig_start,
+					me.donor_start, me.len, &me.moved_len);
+		fput(donor_filp);
+
+		/* arg is a user-space pointer: keep the __user annotation */
+		if (!err)
+			if (copy_to_user((struct move_extent __user *)arg,
+						&me, sizeof(me)))
+				return -EFAULT;
+		return err;
+	}
+
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
struct super_block *sb = inode->i_sb;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e69de29..5b563f2 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -0,0 +1,1296 @@
+/*
+ * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <[email protected]>
+ * Akira Fujita <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "ext4.h"
+
+/*
+ * ext4_mext_get_extpath - look up the extent path for @block of @inode
+ *
+ * @path:	lvalue holding the path; passed to ext4_ext_find_extent() and
+ *		overwritten with its result
+ * @inode:	inode whose extent tree is searched
+ * @block:	logical block to look up
+ * @ret:	lvalue that receives the negative error code on failure
+ *
+ * On failure @ret is set to PTR_ERR() of the result and @path becomes NULL,
+ * so callers test "path == NULL".  On success @ret is left untouched, so
+ * callers must have initialised it.  @path and @ret are evaluated more than
+ * once and must be plain variables without side effects.
+ */
+#define ext4_mext_get_extpath(path, inode, block, ret)			\
+	do {								\
+		(path) = ext4_ext_find_extent((inode), (block), (path));	\
+		if (IS_ERR(path)) {					\
+			(ret) = PTR_ERR(path);				\
+			(path) = NULL;					\
+		}							\
+	} while (0)
+
+/**
+ * ext4_mext_copy_extent_status - Make @dest carry the same init state as @src
+ *
+ * @src:	extent whose initialized/uninitialized state is read
+ * @dest:	extent whose state is updated
+ *
+ * When @src is uninitialized, @dest is marked uninitialized as well.
+ * Otherwise the length field of @dest is rewritten with its actual length.
+ */
+static void
+ext4_mext_copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
+{
+	if (!ext4_ext_is_uninitialized(src)) {
+		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
+		return;
+	}
+
+	ext4_ext_mark_uninitialized(dest);
+}
+
+/**
+ * ext4_mext_next_extent - Search for the next extent and set it to "extent"
+ *
+ * @inode: inode which is searched
+ * @path: this will obtain data for the next extent
+ * @extent: pointer to the next extent we have just gotten
+ *
+ * Search the next extent in the array of ext4_ext_path structure (@path)
+ * and set it to ext4_extent structure (@extent). In addition, the member of
+ * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
+ * ext4_ext_path structure refers to the last extent, or a negative error
+ * value on failure.
+ *
+ * If the current leaf still has entries after ->p_ext, ->p_ext of the leaf
+ * level is simply advanced. Otherwise the path is walked upwards to the
+ * nearest index level that has a following index entry. That entry is
+ * selected and the blocks below it are re-read with sb_bread(), taking the
+ * first index entry of every intermediate level and the first extent of the
+ * new leaf. Buffer heads held in the path slots being replaced are released
+ * with brelse() first.
+ *
+ * -EIO is returned when sb_bread() fails. When 1 is returned, @extent is
+ * not written.
+ */
+static int
+ext4_mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent)
+{
+ int ppos, leaf_ppos = path->p_depth;
+
+ ppos = leaf_ppos;
+ if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
+ /* leaf block */
+ *extent = ++path[ppos].p_ext;
+ return 0;
+ }
+
+ while (--ppos >= 0) {
+ if (EXT_LAST_INDEX(path[ppos].p_hdr) >
+ path[ppos].p_idx) {
+ int cur_ppos = ppos;
+
+ /* index block */
+ path[ppos].p_idx++;
+ path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+ if (path[ppos+1].p_bh)
+ brelse(path[ppos+1].p_bh);
+ path[ppos+1].p_bh =
+ sb_bread(inode->i_sb, path[ppos].p_block);
+ if (!path[ppos+1].p_bh)
+ return -EIO;
+ path[ppos+1].p_hdr =
+ ext_block_hdr(path[ppos+1].p_bh);
+
+ /* Halfway index block */
+ while (++cur_ppos < leaf_ppos) {
+ path[cur_ppos].p_idx =
+ EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
+ path[cur_ppos].p_block =
+ idx_pblock(path[cur_ppos].p_idx);
+ if (path[cur_ppos+1].p_bh)
+ brelse(path[cur_ppos+1].p_bh);
+ path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
+ path[cur_ppos].p_block);
+ if (!path[cur_ppos+1].p_bh)
+ return -EIO;
+ path[cur_ppos+1].p_hdr =
+ ext_block_hdr(path[cur_ppos+1].p_bh);
+ }
+
+ /* leaf block */
+ path[leaf_ppos].p_ext = *extent =
+ EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ return 0;
+ }
+ }
+ /* We found the last extent */
+ return 1;
+}
+
+/**
+ * ext4_mext_double_down_read - Take i_data_sem of two inodes for reading
+ *
+ * @orig_inode:		original inode structure
+ * @donor_inode:	donor inode structure
+ *
+ * Both semaphores are taken in ascending i_ino order, so that every caller
+ * locking the same pair of inodes uses the same order.
+ */
+static void
+ext4_mext_double_down_read(struct inode *orig_inode,
+			struct inode *donor_inode)
+{
+	struct inode *first, *second;
+
+	BUG_ON(!orig_inode || !donor_inode);
+
+	/*
+	 * Order by inode number rather than by address: the C language does
+	 * not guarantee that pointers which do not come from the same array
+	 * can be compared.
+	 */
+	if (donor_inode->i_ino < orig_inode->i_ino) {
+		first = donor_inode;
+		second = orig_inode;
+	} else {
+		first = orig_inode;
+		second = donor_inode;
+	}
+
+	down_read(&EXT4_I(first)->i_data_sem);
+	down_read(&EXT4_I(second)->i_data_sem);
+}
+
+/**
+ * ext4_mext_double_down_write - Take i_data_sem of two inodes for writing
+ *
+ * @orig_inode:		original inode structure
+ * @donor_inode:	donor inode structure
+ *
+ * Both semaphores are taken in ascending i_ino order, so that every caller
+ * locking the same pair of inodes uses the same order.
+ */
+static void
+ext4_mext_double_down_write(struct inode *orig_inode,
+			struct inode *donor_inode)
+{
+	struct inode *first, *second;
+
+	BUG_ON(!orig_inode || !donor_inode);
+
+	/*
+	 * Order by inode number rather than by address: the C language does
+	 * not guarantee that pointers which do not come from the same array
+	 * can be compared.
+	 */
+	if (donor_inode->i_ino < orig_inode->i_ino) {
+		first = donor_inode;
+		second = orig_inode;
+	} else {
+		first = orig_inode;
+		second = donor_inode;
+	}
+
+	down_write(&EXT4_I(first)->i_data_sem);
+	down_write(&EXT4_I(second)->i_data_sem);
+}
+
+/**
+ * ext4_mext_double_up_read - Drop the read hold on i_data_sem of two inodes
+ *
+ * @orig_inode:		original inode; its semaphore is released first
+ * @donor_inode:	donor inode; its semaphore is released second
+ */
+static void
+ext4_mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
+{
+	BUG_ON(!orig_inode || !donor_inode);
+
+	up_read(&EXT4_I(orig_inode)->i_data_sem);
+	up_read(&EXT4_I(donor_inode)->i_data_sem);
+}
+
+/**
+ * ext4_mext_double_up_write - Drop the write hold on i_data_sem of two inodes
+ *
+ * @orig_inode:		original inode; its semaphore is released first
+ * @donor_inode:	donor inode; its semaphore is released second
+ */
+static void
+ext4_mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+{
+	BUG_ON(!orig_inode || !donor_inode);
+
+	up_write(&EXT4_I(orig_inode)->i_data_sem);
+	up_write(&EXT4_I(donor_inode)->i_data_sem);
+}
+
+/**
+ * ext4_mext_insert_across_blocks - Insert extents across leaf block
+ *
+ * @handle:	journal handle
+ * @orig_inode:	original inode
+ * @o_start:	first original extent to be changed
+ * @o_end:	last original extent to be changed
+ * @start_ext:	first new extent to be inserted
+ * @new_ext:	middle of new extent to be inserted
+ * @end_ext:	last new extent to be inserted
+ *
+ * Used when the leaf holding @o_start/@o_end has no room for the additional
+ * entries.  The existing entries are first adjusted in place (@o_start is
+ * shortened to @start_ext's length and/or @o_end is rewritten from
+ * @end_ext).  The extents that do not fit are then added with
+ * ext4_ext_insert_extent() after a fresh path lookup.
+ *
+ * Return 0 on success, -EIO for an unexpected combination of extents, or
+ * the negative error value of the failed path lookup or extent insertion.
+ */
+static int
+ext4_mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext)
+{
+	struct ext4_ext_path *orig_path = NULL;
+	ext4_lblk_t eblock = 0;
+	int new_flag = 0;
+	int end_flag = 0;
+	int err = 0;
+
+	if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
+		if (o_start == o_end) {
+
+			/*       start_ext   new_ext    end_ext
+			 * donor |---------|-----------|--------|
+			 * orig  |------------------------------|
+			 */
+			end_flag = 1;
+		} else {
+
+			/*       start_ext   new_ext   end_ext
+			 * donor |---------|----------|---------|
+			 * orig  |---------------|--------------|
+			 */
+			o_end->ee_block = end_ext->ee_block;
+			o_end->ee_len = end_ext->ee_len;
+			ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+		}
+
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if (start_ext->ee_len && new_ext->ee_len &&
+		   !end_ext->ee_len && o_start == o_end) {
+
+		/*	 start_ext	new_ext
+		 * donor |--------------|---------------|
+		 * orig  |------------------------------|
+		 */
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if (!start_ext->ee_len && new_ext->ee_len &&
+		   end_ext->ee_len && o_start == o_end) {
+
+		/*	  new_ext	end_ext
+		 * donor |--------------|---------------|
+		 * orig  |------------------------------|
+		 */
+		o_end->ee_block = end_ext->ee_block;
+		o_end->ee_len = end_ext->ee_len;
+		ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+
+		/*
+		 * Set 0 to the extent block if new_ext was
+		 * the first block.
+		 */
+		if (new_ext->ee_block)
+			eblock = le32_to_cpu(new_ext->ee_block);
+
+		new_flag = 1;
+	} else {
+		ext4_debug("ext4 move extent: Unexpected insert case\n");
+		return -EIO;
+	}
+
+	if (new_flag) {
+		ext4_mext_get_extpath(orig_path, orig_inode, eblock, err);
+		if (orig_path == NULL)
+			goto out;
+
+		/* Report an insertion failure instead of returning 0 */
+		err = ext4_ext_insert_extent(handle, orig_inode,
+					orig_path, new_ext);
+		if (err)
+			goto out;
+	}
+
+	if (end_flag) {
+		ext4_mext_get_extpath(orig_path, orig_inode,
+				le32_to_cpu(end_ext->ee_block) - 1, err);
+		if (orig_path == NULL)
+			goto out;
+
+		/* Report an insertion failure instead of returning 0 */
+		err = ext4_ext_insert_extent(handle, orig_inode,
+					orig_path, end_ext);
+		if (err)
+			goto out;
+	}
+out:
+	if (orig_path) {
+		ext4_ext_drop_refs(orig_path);
+		kfree(orig_path);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_mext_insert_inside_block - Insert new extent to the extent block
+ *
+ * @o_start: first original extent to be moved
+ * @o_end: last original extent to be moved
+ * @start_ext: first new extent to be inserted
+ * @new_ext: middle of new extent to be inserted
+ * @end_ext: last new extent to be inserted
+ * @eh: extent header of target leaf block
+ * @range_to_move: used to decide how to insert extent
+ *
+ * Insert extents into the leaf block. The extent (@o_start) is overwritten
+ * by inserted extents.
+ *
+ * If @range_to_move is non-zero and entries follow @o_end in the leaf, those
+ * entries are first shifted by @range_to_move slots with memmove(); the
+ * length passed to memmove() is a byte count. Then, starting at @o_start:
+ * only the ee_len of the first slot is replaced when @start_ext is
+ * non-empty, @new_ext is copied into the next slot when non-empty, and
+ * @end_ext is copied into the following slot when non-empty. Finally
+ * @range_to_move is added to eh_entries of @eh.
+ *
+ * This function does no journaling itself.
+ */
+static void
+ext4_mext_insert_inside_block(struct ext4_extent *o_start,
+ struct ext4_extent *o_end, struct ext4_extent *start_ext,
+ struct ext4_extent *new_ext, struct ext4_extent *end_ext,
+ struct ext4_extent_header *eh, int range_to_move)
+{
+ int i = 0;
+ unsigned long len;
+
+ /* Move the existing extents */
+ if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+ len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
+ (unsigned long)(o_end + 1);
+ memmove(o_end + 1 + range_to_move, o_end + 1, len);
+ }
+
+ /* Insert start entry */
+ if (start_ext->ee_len)
+ o_start[i++].ee_len = start_ext->ee_len;
+
+ /* Insert new entry */
+ if (new_ext->ee_len) {
+ o_start[i] = *new_ext;
+ ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+ }
+
+ /* Insert end entry */
+ if (end_ext->ee_len)
+ o_start[i] = *end_ext;
+
+ /* Increment the total entries counter on the extent block */
+ le16_add_cpu(&eh->eh_entries, range_to_move);
+}
+
+/**
+ * ext4_mext_insert_extents - Insert new extent
+ *
+ * @handle:	journal handle
+ * @orig_inode:	original inode
+ * @orig_path:	path indicates first extent to be changed
+ * @o_start:	first original extent to be changed
+ * @o_end:	last original extent to be changed
+ * @start_ext:	first new extent to be inserted
+ * @new_ext:	middle of new extent to be inserted
+ * @end_ext:	last new extent to be inserted
+ *
+ * Work out how many leaf slots the non-empty extents among @start_ext,
+ * @new_ext and @end_ext need compared with the slots @o_start..@o_end
+ * already occupy.  If the leaf cannot take the extra entries,
+ * ext4_mext_insert_across_blocks() is used; otherwise the leaf is edited in
+ * place by ext4_mext_insert_inside_block().  When the tree has depth, the
+ * leaf buffer is journaled around the change; otherwise the inode is marked
+ * dirty.  Return 0 on success, or a negative error value on failure.
+ */
+static int
+ext4_mext_insert_extents(handle_t *handle, struct inode *orig_inode,
+		struct ext4_ext_path *orig_path, struct ext4_extent *o_start,
+		struct ext4_extent *o_end, struct ext4_extent *start_ext,
+		struct ext4_extent *new_ext, struct ext4_extent *end_ext)
+{
+	struct ext4_ext_path *leaf;
+	struct ext4_extent_header *eh;
+	int need_slots = 0, slots_range, range_to_move, free_slots;
+	int depth, ret;
+
+	/* One slot for each of start/new/end that is not empty */
+	if (start_ext->ee_len)
+		need_slots++;
+	if (new_ext->ee_len)
+		need_slots++;
+	if (end_ext->ee_len)
+		need_slots++;
+
+	/* Slots currently occupied by o_start .. o_end, inclusive */
+	slots_range = o_end - o_start + 1;
+
+	/* How far the entries behind o_end have to be shifted */
+	range_to_move = need_slots - slots_range;
+
+	depth = orig_path->p_depth;
+	leaf = orig_path + depth;
+	eh = leaf->p_hdr;
+
+	if (depth) {
+		/* Register to journal */
+		ret = ext4_journal_get_write_access(handle, leaf->p_bh);
+		if (ret)
+			return ret;
+	}
+
+	free_slots = le16_to_cpu(eh->eh_max) - le16_to_cpu(eh->eh_entries);
+	if (range_to_move > 0 && range_to_move > free_slots) {
+		/* Expansion: the leaf is too full for the new entries */
+		ret = ext4_mext_insert_across_blocks(handle, orig_inode,
+				o_start, o_end, start_ext, new_ext, end_ext);
+		if (ret < 0)
+			return ret;
+	} else {
+		ext4_mext_insert_inside_block(o_start, o_end, start_ext,
+				new_ext, end_ext, eh, range_to_move);
+	}
+
+	if (!depth) {
+		ret = ext4_mark_inode_dirty(handle, orig_inode);
+		return ret < 0 ? ret : 0;
+	}
+
+	ret = ext4_handle_dirty_metadata(handle, orig_inode, leaf->p_bh);
+	return ret;
+}
+
+/**
+ * ext4_mext_leaf_block - Move one leaf extent block into the inode.
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @orig_path: path indicates first extent to be changed
+ * @dext: donor extent
+ * @from: start offset on the target file
+ *
+ * In order to insert extents into the leaf block, we must divide the extent
+ * in the leaf block into three extents. The one is located to be inserted
+ * extents, and the others are located around it.
+ *
+ * Therefore, this function creates structures to save extents of the leaf
+ * block, and inserts extents by calling ext4_mext_insert_extents() with
+ * created extents. Return 0 on success, or a negative error value on failure.
+ *
+ * The new extent starts at logical block *@from and takes its physical
+ * start block and its length from @dext; *@from is only read here.
+ * When the new extent begins inside the existing extent (oext), the leading
+ * remainder of oext becomes start_ext. Otherwise, if oext is not the first
+ * entry of its leaf and ext4_can_extents_be_merged() accepts the previous
+ * entry together with the new extent, the new extent is folded into that
+ * previous entry (start_ext covers both and new_ext becomes empty).
+ * The trailing remainder of oext, if any, becomes end_ext.
+ *
+ * BUG()s if the new extent ends beyond the end of oext.
+ * Note: new_phys_end is computed but not used in this function.
+ */
+static int
+ext4_mext_leaf_block(handle_t *handle, struct inode *orig_inode,
+ struct ext4_ext_path *orig_path, struct ext4_extent *dext,
+ ext4_lblk_t *from)
+{
+ struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
+ struct ext4_extent new_ext, start_ext, end_ext;
+ ext4_lblk_t new_ext_end;
+ ext4_fsblk_t new_phys_end;
+ int oext_alen, new_ext_alen, end_ext_alen;
+ int depth = ext_depth(orig_inode);
+ int ret;
+
+ o_start = o_end = oext = orig_path[depth].p_ext;
+ oext_alen = ext4_ext_get_actual_len(oext);
+ start_ext.ee_len = end_ext.ee_len = 0;
+
+ new_ext.ee_block = cpu_to_le32(*from);
+ ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+ new_ext.ee_len = dext->ee_len;
+ new_ext_alen = ext4_ext_get_actual_len(&new_ext);
+ new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
+ new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
+
+ /*
+ * Case: original extent is first
+ * oext |--------|
+ * new_ext |--|
+ * start_ext |--|
+ */
+ if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
+ le32_to_cpu(new_ext.ee_block) <
+ le32_to_cpu(oext->ee_block) + oext_alen) {
+ start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
+ le32_to_cpu(oext->ee_block));
+ ext4_mext_copy_extent_status(oext, &start_ext);
+ } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
+ prev_ext = oext - 1;
+ /*
+ * We can merge new_ext into previous extent,
+ * if these are contiguous and same extent type.
+ */
+ if (ext4_can_extents_be_merged(orig_inode, prev_ext,
+ &new_ext)) {
+ o_start = prev_ext;
+ start_ext.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(prev_ext) +
+ new_ext_alen);
+ ext4_mext_copy_extent_status(prev_ext, &start_ext);
+ new_ext.ee_len = 0;
+ }
+ }
+
+ /*
+ * Case: new_ext_end must be less than oext
+ * oext |-----------|
+ * new_ext |-------|
+ */
+ BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+
+ /*
+ * Case: new_ext is smaller than original extent
+ * oext |---------------|
+ * new_ext |-----------|
+ * end_ext |---|
+ */
+ if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
+ new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
+ end_ext.ee_len =
+ cpu_to_le16(le32_to_cpu(oext->ee_block) +
+ oext_alen - 1 - new_ext_end);
+ ext4_mext_copy_extent_status(oext, &end_ext);
+ end_ext_alen = ext4_ext_get_actual_len(&end_ext);
+ ext4_ext_store_pblock(&end_ext,
+ (ext_pblock(o_end) + oext_alen - end_ext_alen));
+ end_ext.ee_block =
+ cpu_to_le32(le32_to_cpu(o_end->ee_block) +
+ oext_alen - end_ext_alen);
+ }
+
+ ret = ext4_mext_insert_extents(handle, orig_inode,
+ orig_path, o_start, o_end, &start_ext,
+ &new_ext, &end_ext);
+ return ret;
+}
+
+/**
+ * ext4_mext_get_replaced_extent - Compute extents for extent swapping.
+ *
+ * @tmp_dext: the extent that will belong to the original inode
+ * @tmp_oext: the extent that will belong to the donor inode
+ * @orig_off: block offset of original inode
+ * @donor_off: block offset of donor inode
+ * @max_count: the maximum length of extents
+ *
+ * Both extents are modified in place. @tmp_dext is advanced so that it
+ * starts at @donor_off (logical block, physical block and length are all
+ * adjusted) and is then capped at @max_count blocks. The physical start of
+ * @tmp_oext is advanced by the distance from its first logical block to
+ * @orig_off; its ee_block is left unchanged. @tmp_dext is further shortened
+ * if it is longer than what remains of @tmp_oext, and @tmp_oext is given
+ * the same length as @tmp_dext.
+ *
+ * Finally the initialization status is crossed over: @tmp_dext takes the
+ * status @tmp_oext had on entry, and @tmp_oext takes the status @tmp_dext
+ * had on entry.
+ */
+static void
+ext4_mext_get_replaced_extent(struct ext4_extent *tmp_dext,
+ struct ext4_extent *tmp_oext,
+ ext4_lblk_t orig_off, ext4_lblk_t donor_off,
+ ext4_lblk_t max_count)
+{
+ ext4_lblk_t diff, orig_diff;
+ struct ext4_extent dext_old, oext_old;
+
+ dext_old = *tmp_dext;
+ oext_old = *tmp_oext;
+
+ /* When tmp_dext is too large, pick up the target range. */
+ diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
+
+ ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
+ tmp_dext->ee_block =
+ cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
+ tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
+
+ if (max_count < ext4_ext_get_actual_len(tmp_dext))
+ tmp_dext->ee_len = cpu_to_le16(max_count);
+
+ orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
+ ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
+
+ /* Adjust extent length if donor extent is larger than orig */
+ if (ext4_ext_get_actual_len(tmp_dext) >
+ ext4_ext_get_actual_len(tmp_oext) - orig_diff)
+ tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
+ orig_diff);
+
+ tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
+
+ ext4_mext_copy_extent_status(&oext_old, tmp_dext);
+ ext4_mext_copy_extent_status(&dext_old, tmp_oext);
+}
+
+/**
+ * ext4_mext_replace_branches - Replace original extents with new extents
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @donor_inode: donor inode
+ * @from: block offset of orig_inode (the same offset is used for the donor)
+ * @count: block count to be replaced
+ *
+ * Replace original inode extents and donor inode extents page by page.
+ * We implement this replacement in the following three steps:
+ * 1. Save the block information of original and donor inodes into
+ * dummy extents.
+ * 2. Change the block information of original inode to point at the
+ * donor inode blocks.
+ * 3. Change the block information of donor inode to point at the saved
+ * original inode blocks in the dummy extents.
+ *
+ * Return 0 on success, or a negative error value on failure.
+ *
+ * i_data_sem of both inodes is held for writing from entry until return.
+ * Each loop iteration swaps one pair of extents, advances both offsets by
+ * the length of the donor extent just moved, and looks both paths up again
+ * after dropping their buffer references. The loop ends once @count blocks
+ * have been replaced. It also ends, with 0, when the extent found for the
+ * next offset in either inode ends at or before that offset.
+ *
+ * BUG()s if the donor extent is missing or does not start at donor_off.
+ */
+static int
+ext4_mext_replace_branches(handle_t *handle, struct inode *orig_inode,
+ struct inode *donor_inode, ext4_lblk_t from,
+ ext4_lblk_t count)
+{
+ struct ext4_ext_path *orig_path = NULL;
+ struct ext4_ext_path *donor_path = NULL;
+ struct ext4_extent *oext, *dext;
+ struct ext4_extent tmp_dext, tmp_oext;
+ ext4_lblk_t orig_off = from, donor_off = from;
+ int err = 0;
+ int depth;
+ int replaced_count = 0;
+ int dext_alen;
+
+ ext4_mext_double_down_write(orig_inode, donor_inode);
+
+ /* Get the original extent for the block "orig_off" */
+ ext4_mext_get_extpath(orig_path, orig_inode, orig_off, err);
+ if (orig_path == NULL)
+ goto out;
+
+ /* Get the donor extent for the head */
+ ext4_mext_get_extpath(donor_path, donor_inode, donor_off, err);
+ if (donor_path == NULL)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+ tmp_oext = *oext;
+
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+ tmp_dext = *dext;
+
+ ext4_mext_get_replaced_extent(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count);
+
+ /* Loop for the donor extents */
+ while (1) {
+ /* The extent for donor must be found. */
+ BUG_ON(!dext || donor_off != le32_to_cpu(dext->ee_block));
+
+ /* Set donor extent to orig extent */
+ err = ext4_mext_leaf_block(handle, orig_inode,
+ orig_path, &tmp_dext, &orig_off);
+ if (err < 0)
+ goto out;
+
+ /* Set orig extent to donor extent */
+ err = ext4_mext_leaf_block(handle, donor_inode,
+ donor_path, &tmp_oext, &donor_off);
+ if (err < 0)
+ goto out;
+
+ dext_alen = ext4_ext_get_actual_len(&tmp_dext);
+ replaced_count += dext_alen;
+ donor_off += dext_alen;
+ orig_off += dext_alen;
+
+ /* Already moved the expected blocks */
+ if (replaced_count >= count)
+ break;
+
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+ ext4_mext_get_extpath(orig_path, orig_inode, orig_off, err);
+ if (orig_path == NULL)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+ if (le32_to_cpu(oext->ee_block) +
+ ext4_ext_get_actual_len(oext) <= orig_off) {
+ err = 0;
+ goto out;
+ }
+ tmp_oext = *oext;
+
+ if (donor_path)
+ ext4_ext_drop_refs(donor_path);
+ ext4_mext_get_extpath(donor_path, donor_inode,
+ donor_off, err);
+ if (donor_path == NULL)
+ goto out;
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+ if (le32_to_cpu(dext->ee_block) +
+ ext4_ext_get_actual_len(dext) <= donor_off) {
+ err = 0;
+ goto out;
+ }
+ tmp_dext = *dext;
+
+ ext4_mext_get_replaced_extent(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count - replaced_count);
+ }
+
+out:
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+ }
+ if (donor_path) {
+ ext4_ext_drop_refs(donor_path);
+ kfree(donor_path);
+ }
+
+ ext4_mext_double_up_write(orig_inode, donor_inode);
+ return err;
+}
+
+/**
+ * ext4_mext_partial - Move extent data per page
+ *
+ * @o_filp: file structure of original file
+ * @donor_inode: donor inode
+ * @orig_page_offset: page index on original file
+ * @data_offset_in_page: block index where data swapping starts
+ * @block_len_in_page: the number of blocks to be swapped
+ * @uninit: orig extent is uninitialized or not
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling ext4_mext_replace_branches().
+ * Finally, write out the saved data in new original inode blocks. Return 0
+ * on success, or a negative error value on failure.
+ *
+ * A journal handle with twice ext4_writepage_trans_blocks() credits is
+ * started on the original inode and stopped before returning.
+ *
+ * When @uninit is set, only the extents are swapped and both inodes' extent
+ * caches are invalidated; no page is touched. Otherwise the page is obtained
+ * through ->write_begin(), read in if it is not up to date, any writeback is
+ * waited for, its old buffers are released, the extents are swapped, the
+ * buffers for the swapped range are mapped again with ext4_get_block(), and
+ * ->write_end() is called. The byte length handed to write_begin/write_end
+ * is shortened when the range ends in the last block of the file, so that
+ * it does not extend past i_size.
+ *
+ * Non-negative results of the last step are reported as 0.
+ *
+ * NOTE(review): the return value of ->readpage() is ignored and the page is
+ * not re-checked for being up to date afterwards - confirm this is intended.
+ * NOTE(review): the initial value of offs is never used; offs is assigned
+ * again before its first use and the @uninit path does not read it.
+ */
+static int
+ext4_mext_partial(struct file *o_filp, struct inode *donor_inode,
+ pgoff_t orig_page_offset, int data_offset_in_page,
+ int block_len_in_page, int uninit)
+{
+ struct inode *orig_inode = o_filp->f_dentry->d_inode;
+ struct address_space *mapping = orig_inode->i_mapping;
+ struct buffer_head *bh;
+ struct page *page = NULL;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ handle_t *handle;
+ ext4_lblk_t orig_blk_offset;
+ long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
+ unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+ unsigned int w_flags = 0;
+ unsigned int tmp_data_len, data_len;
+ void *fsdata;
+ int ret, i, jblocks;
+ int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+
+ /*
+ * It needs twice the amount of ordinary journal buffers because
+ * inode and donor_inode may change each different metadata blocks.
+ */
+ jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+ handle = ext4_journal_start(orig_inode, jblocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+
+ if (segment_eq(get_fs(), KERNEL_DS))
+ w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ orig_blk_offset = orig_page_offset * blocks_per_page +
+ data_offset_in_page;
+
+ /*
+ * If orig extent is uninitialized one,
+ * it's not necessary force the page into memory
+ * and then force it to be written out again.
+ * Just swap data blocks between orig and donor.
+ */
+ if (uninit) {
+ ret = ext4_mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page);
+
+ /* Clear the inode cache not to refer to the old data */
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+ goto out2;
+ }
+
+ offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
+
+ /* Calculate data_len */
+ if ((orig_blk_offset + block_len_in_page - 1) ==
+ ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+ /* Replace the last block */
+ tmp_data_len = orig_inode->i_size & (blocksize - 1);
+ /*
+ * If data_len equal zero, it shows data_len is multiples of
+ * blocksize. So we set appropriate value.
+ */
+ if (tmp_data_len == 0)
+ tmp_data_len = blocksize;
+
+ data_len = tmp_data_len +
+ ((block_len_in_page - 1) << orig_inode->i_blkbits);
+ } else {
+ data_len = block_len_in_page << orig_inode->i_blkbits;
+ }
+
+ ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+ &page, &fsdata);
+ if (unlikely(ret < 0))
+ goto out;
+
+ if (!PageUptodate(page)) {
+ mapping->a_ops->readpage(o_filp, page);
+ lock_page(page);
+ }
+
+ /*
+ * try_to_release_page() doesn't call releasepage in writeback mode.
+ * We should care about the order of writing to the same file
+ * by multiple move extent processes.
+ * It needs to call wait_on_page_writeback() to wait for the
+ * writeback of the page.
+ */
+ if (PageWriteback(page))
+ wait_on_page_writeback(page);
+
+ /* Release old bh and drop refs */
+ try_to_release_page(page, 0);
+
+ ret = ext4_mext_replace_branches(handle, orig_inode, donor_inode,
+ orig_blk_offset, block_len_in_page);
+ if (ret < 0)
+ goto out;
+
+ /* Clear the inode cache not to refer to the old data */
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
+
+ bh = page_buffers(page);
+ for (i = 0; i < data_offset_in_page; i++)
+ bh = bh->b_this_page;
+
+ for (i = 0; i < block_len_in_page; i++) {
+ ret = ext4_get_block(orig_inode,
+ (sector_t)(orig_blk_offset + i), bh, 0);
+ if (ret < 0)
+ goto out;
+
+ if (bh->b_this_page != NULL)
+ bh = bh->b_this_page;
+ }
+
+ ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, page,
+ fsdata);
+ page = NULL;
+
+out:
+ if (unlikely(page)) {
+ if (PageLocked(page))
+ unlock_page(page);
+ page_cache_release(page);
+ }
+out2:
+ ext4_journal_stop(handle);
+
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * ext4_mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode: original inode
+ * @donor_inode: donor inode
+ * @orig_start: logical start offset in block for orig
+ * @donor_start: logical start offset in block for donor
+ * @len: the number of blocks to be moved
+ * @moved_len: moved block length
+ *
+ * Check the arguments of ext4_mext_move_extent() whether the files can be
+ * exchanged each other.
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+ext4_mext_check_arguments(struct inode *orig_inode,
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 len, __u64 moved_len)
+{
+ /* Regular file check */
+ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+ ext4_debug("ext4 move extent: The argument files should be "
+ "regular file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Ext4 move extent does not support swapfile */
+ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should "
+ "not be swapfile [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Files should be in the same ext4 FS */
+ if (orig_inode->i_sb != donor_inode->i_sb) {
+ ext4_debug("ext4 move extent: The argument files "
+ "should be in same FS [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* orig and donor should be different file */
+ if (orig_inode->i_ino == donor_inode->i_ino) {
+ ext4_debug("ext4 move extent: The argument files should not "
+ "be same file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Ext4 move extent supports only extent based file */
+ if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ ext4_debug("ext4 move extent: orig file is not extents "
+ "based file [ino:orig %lu]\n", orig_inode->i_ino);
+ return -EOPNOTSUPP;
+ } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ ext4_debug("ext4 move extent: donor file is not extents "
+ "based file [ino:donor %lu]\n", donor_inode->i_ino);
+ return -EOPNOTSUPP;
+ }
+
+ /* Start offset should be same */
+ if (orig_start != donor_start) {
+ ext4_debug("ext4 move extent: orig and donor's start "
+ "offset are not same [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (moved_len) {
+ ext4_debug("ext4 move extent: moved_len should be 0 "
+ "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+ donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if ((orig_start > MAX_DEFRAG_SIZE) ||
+ (donor_start > MAX_DEFRAG_SIZE) ||
+ (len > MAX_DEFRAG_SIZE) ||
+ (orig_start + len > MAX_DEFRAG_SIZE)) {
+ ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
+ "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (orig_inode->i_size > donor_inode->i_size) {
+ if (orig_start >= donor_inode->i_size) {
+ ext4_debug("ext4 move extent: orig start offset "
+ "[%llu] should be less than donor file size "
+ "[%lld] [ino:orig %lu, donor_inode %lu]\n",
+ orig_start, donor_inode->i_size,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (orig_start + len > donor_inode->i_size) {
+ ext4_debug("ext4 move extent: End offset [%llu] should "
+ "be less than donor file size [%lld]."
+ "So adjust length from %llu to %lld "
+ "[ino:orig %lu, donor %lu]\n",
+ orig_start + len, donor_inode->i_size,
+ len, donor_inode->i_size - orig_start,
+ orig_inode->i_ino, donor_inode->i_ino);
+ len = donor_inode->i_size - orig_start;
+ }
+ } else {
+ if (orig_start >= orig_inode->i_size) {
+ ext4_debug("ext4 move extent: start offset [%llu] "
+ "should be less than original file size "
+ "[%lld] [inode:orig %lu, donor %lu]\n",
+ orig_start, orig_inode->i_size,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ if (orig_start + len > orig_inode->i_size) {
+ ext4_debug("ext4 move extent: Adjust length "
+ "from %llu to %lld. Because it should be "
+ "less than original file size "
+ "[ino:orig %lu, donor %lu]\n",
+ len, orig_inode->i_size - orig_start,
+ orig_inode->i_ino, donor_inode->i_ino);
+ len = orig_inode->i_size - orig_start;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * ext4_mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ *
+ * @inode1: the inode structure
+ * @inode2: the inode structure
+ *
+ * Lock two inodes' i_mutex by i_ino order. This function is moved from
+ * fs/inode.c.
+ */
+void ext4_mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
+ if (inode1)
+ mutex_lock(&inode1->i_mutex);
+ else if (inode2)
+ mutex_lock(&inode2->i_mutex);
+ return;
+ }
+
+ if (inode1->i_ino < inode2->i_ino) {
+ mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
+ }
+}
+
+/**
+ * ext4_mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ *
+ * @inode1: the inode that is released first
+ * @inode2: the inode that is released second
+ *
+ * This function is moved from fs/inode.c.
+ */
+
+void ext4_mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1)
+ mutex_unlock(&inode1->i_mutex);
+
+ if (inode2 && inode2 != inode1)
+ mutex_unlock(&inode2->i_mutex);
+}
+
+/**
+ * ext4_mext_move_extent - Exchange the specified range of a file
+ *
+ * @o_filp: file structure of the original file
+ * @d_filp: file structure of the donor file
+ * @orig_start: start offset in block for orig
+ * @donor_start: start offset in block for donor
+ * @len: the number of blocks to be moved
+ * @moved_len: moved block length
+ *
+ * This function returns 0 and moved block length is set in moved_len
+ * if succeed, otherwise returns error value.
+ *
+ * Note: ext4_mext_move_extent() proceeds the following order.
+ * 1:ext4_mext_move_extent() calculates the last block number of moving extent
+ * function by the start block number (orig_start) and the number of blocks
+ * to be moved (len) specified as arguments.
+ * If the {orig, donor}_start points a hole, the extent's start offset
+ * pointed by ext_cur (current extent), holecheck_path, orig_path are set
+ * after hole behind.
+ * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
+ * or the ext_cur exceeds the block_end which is last logical block number.
+ * 3:To get the length of continues area, call ext4_mext_next_extent()
+ * specified with the ext_cur (initial value is holecheck_path) re-cursive,
+ * until find un-continuous extent, the start logical block number exceeds
+ * the block_end or the extent points to the last extent.
+ * 4:Exchange the original inode data with donor inode data
+ * from orig_page_offset to seq_end_page.
+ * The start indexes of data are specified as arguments.
+ * That of the original inode is orig_page_offset,
+ * and the donor inode is also orig_page_offset
+ * (To easily handle blocksize != pagesize case, the offset for the
+ * donor inode is block unit).
+ * 5:Update holecheck_path and orig_path to points a next proceeding extent,
+ * then returns to step 2.
+ * 6:Release holecheck_path, orig_path and set the len to moved_len
+ * which shows the number of moved blocks.
+ * The moved_len is useful for the command to calculate the file offset
+ * for starting next move extent ioctl.
+ * 7:Return 0 on success, or a negative error value on failure.
+ */
+int
+ext4_mext_move_extent(struct file *o_filp, struct file *d_filp,
+ __u64 orig_start, __u64 donor_start, __u64 len, __u64 *moved_len)
+{
+ struct inode *orig_inode = o_filp->f_dentry->d_inode;
+ struct inode *donor_inode = d_filp->f_dentry->d_inode;
+ struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
+ struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+ ext4_lblk_t block_start = orig_start;
+ ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+ ext4_lblk_t rest_blocks;
+ pgoff_t orig_page_offset = 0, seq_end_page;
+ int ret, depth, last_extent = 0;
+ int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ int data_offset_in_page;
+ int block_len_in_page;
+ int uninit;
+
+ /* protect orig and donor against a truncate */
+ ext4_mext_inode_double_lock(orig_inode, donor_inode);
+
+ ext4_mext_double_down_read(orig_inode, donor_inode);
+ /* Check the filesystem environment whether move_extent can be done */
+ ret = ext4_mext_check_arguments(orig_inode, donor_inode,
+ orig_start, donor_start, len, *moved_len);
+ ext4_mext_double_up_read(orig_inode, donor_inode);
+ if (ret)
+ goto out2;
+
+ file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
+ block_end = block_start + len - 1;
+ if (file_end < block_end)
+ len -= block_end - file_end;
+
+ ext4_mext_get_extpath(orig_path, orig_inode, block_start, ret);
+ if (orig_path == NULL)
+ goto out2;
+
+ /* Get path structure to check the hole */
+ ext4_mext_get_extpath(holecheck_path, orig_inode, block_start, ret);
+ if (holecheck_path == NULL)
+ goto out;
+
+ depth = ext_depth(orig_inode);
+ ext_cur = holecheck_path[depth].p_ext;
+ if (ext_cur == NULL)
+ goto out;
+
+ /*
+ * Get proper extent whose ee_block is beyond block_start
+ * if block_start was within the hole.
+ */
+ if (le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ last_extent = ext4_mext_next_extent(orig_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ last_extent = ext4_mext_next_extent(orig_inode, orig_path,
+ &ext_dummy);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ }
+ seq_start = block_start;
+
+ /* No blocks within the specified range. */
+ if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+ ext4_debug("ext4 move extent: The specified range of file "
+ "may be the hole\n");
+ goto out;
+ }
+
+ /* Adjust start blocks */
+ add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+ max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+ while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+ seq_blocks += add_blocks;
+
+ /* Adjust tail blocks */
+ if (seq_start + seq_blocks - 1 > block_end)
+ seq_blocks = block_end - seq_start + 1;
+
+ ext_prev = ext_cur;
+ last_extent = ext4_mext_next_extent(orig_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ break;
+ }
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+
+ /*
+ * Extend the length of contiguous block (seq_blocks)
+ * if extents are contiguous.
+ */
+ if (ext4_can_extents_be_merged(orig_inode,
+ ext_prev, ext_cur) &&
+ block_end >= le32_to_cpu(ext_cur->ee_block) &&
+ !last_extent)
+ continue;
+
+ /* Is original extent is uninitialized */
+ uninit = ext4_ext_is_uninitialized(ext_prev);
+
+ data_offset_in_page = seq_start % blocks_per_page;
+
+ /*
+ * Calculate data blocks count that should be swapped
+ * at the first page.
+ */
+ if (data_offset_in_page + seq_blocks > blocks_per_page) {
+ /* Swapped blocks are across pages */
+ block_len_in_page =
+ blocks_per_page - data_offset_in_page;
+ } else {
+ /* Swapped blocks are in a page */
+ block_len_in_page = seq_blocks;
+ }
+
+ orig_page_offset = seq_start >>
+ (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+ seq_end_page = (seq_start + seq_blocks - 1) >>
+ (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ rest_blocks = seq_blocks;
+
+ /* Discard preallocations of two inodes */
+ down_write(&EXT4_I(orig_inode)->i_data_sem);
+ ext4_discard_preallocations(orig_inode);
+ up_write(&EXT4_I(orig_inode)->i_data_sem);
+
+ down_write(&EXT4_I(donor_inode)->i_data_sem);
+ ext4_discard_preallocations(donor_inode);
+ up_write(&EXT4_I(donor_inode)->i_data_sem);
+
+ while (orig_page_offset <= seq_end_page) {
+
+ /* Swap original branches with new branches */
+ ret = ext4_mext_partial(o_filp, donor_inode,
+ orig_page_offset, data_offset_in_page,
+ block_len_in_page, uninit);
+ if (ret < 0)
+ goto out;
+ orig_page_offset++;
+
+ data_offset_in_page = 0;
+ rest_blocks -= block_len_in_page;
+ if (rest_blocks > blocks_per_page)
+ block_len_in_page = blocks_per_page;
+ else
+ block_len_in_page = rest_blocks;
+ }
+
+ /* Decrease buffer counter */
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+ ext4_mext_get_extpath(holecheck_path, orig_inode,
+ seq_start, ret);
+ if (holecheck_path == NULL)
+ break;
+ depth = holecheck_path->p_depth;
+
+ /* Decrease buffer counter */
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+ ext4_mext_get_extpath(orig_path, orig_inode, seq_start, ret);
+ if (orig_path == NULL)
+ break;
+
+ ext_cur = holecheck_path[depth].p_ext;
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+ seq_blocks = 0;
+
+ }
+out:
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+ }
+ if (holecheck_path) {
+ ext4_ext_drop_refs(holecheck_path);
+ kfree(holecheck_path);
+ }
+out2:
+ ext4_mext_inode_double_unlock(orig_inode, donor_inode);
+
+ if (ret) {
+ *moved_len = orig_page_offset * blocks_per_page;
+ return ret;
+ }
+
+ /* Set moved block length */
+ *moved_len = len;
+
+ return 0;
+}


2009-06-13 13:21:52

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/3] Add EXT4_IOC_MOVE_EXT ioctl and related functions

On Fri, May 22, 2009 at 04:06:16PM +0900, Akira Fujita wrote:
> ext4: online defrag -- Add EXT4_IOC_MOVE_EXT ioctl and related functions.
>
> From: Akira Fujita <[email protected]>
>
> The EXT4_IOC_MOVE_EXT exchanges the blocks between orig_fd and donor_fd,
> and then write the file data of orig_fd to donor_fd.
> ext4_mext_move_extent() is the main fucntion of ext4 online defrag,
> and this patch includes all functions related to ext4 online defrag.

Akira-san,

Thank you for all of the hard work and perseverance with the online
defrag work! This patch is much, *much* better; I've done a quick
review, and I've only noted two things, which I've updated in the
version I've now moved into the stable portion of the patch queue.
One is that nothing actually uses orig_fd in the move_extent
structure; so, to avoid confusion, I've renamed it to "reserved",
and used explicit __u32 fields for the reserved and donor_fd fields.
Also, I've renamed ext4_mext_move_extent() to ext4_move_extents();
since it is the one published interface, I wanted it to have an
easier-to-understand name.

As a side note, the static functions in fs/ext4/move_extent.c really
don't need the ext4_mext prefix, since static functions don't have
namespace issues that require a consistent naming scheme. (Sometimes
a shorter name can also be useful since it avoids needing to line wrap
function calls with a long list of parameters.)

I haven't done extensive testing on the patch to make sure that
nothing bad happens if the file is actively being modified while the
defrag program is running, but the interfaces look good, which is what
matters in terms of pushing it to the stable queue.

Again, many thanks,

Regards,

- Ted

2009-06-15 08:03:55

by Akira Fujita

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/3] Add EXT4_IOC_MOVE_EXT ioctl and related functions

Hi Ted,

Thank you for your time to review my patch.

Theodore Tso wrote:
> On Fri, May 22, 2009 at 04:06:16PM +0900, Akira Fujita wrote:
>> ext4: online defrag -- Add EXT4_IOC_MOVE_EXT ioctl and related functions.
>>
>> From: Akira Fujita <[email protected]>
>>
>> The EXT4_IOC_MOVE_EXT exchanges the blocks between orig_fd and donor_fd,
>> and then write the file data of orig_fd to donor_fd.
>> ext4_mext_move_extent() is the main fucntion of ext4 online defrag,
>> and this patch includes all functions related to ext4 online defrag.
>
> Akira-san,
>
> Thank you for all of the hard work and perseverance with the online
> defrag work! This patch is much, *much* better; I've done a quick
> review, and I've only noted two things, which I've updated in the
> version I've now moved into the stable portion of the patch queue.
> One is that nothing actually uses orig_fd in the move_extent
> structure; so, to avoid confusion, I've renamed it to "reserved",
> and used explicit __u32 fields for the reserved and donor_fd fields.
> Also, I've renamed ext4_mext_move_extent() to ext4_move_extents();
> since it is the one published interface, I wanted it to have an
> easier-to-understand name.

Ok. Certainly orig_fd of move_extent structure is not used
since fd of original file is passed via ioctl directly.
My understanding of the interface after the change is as follows:

struct move_extent {
__u32 reserved; /* reserved field */
__u32 donor_fd; /* donor file descriptor */
__u64 orig_start; /* logical start offset in block for orig */
__u64 donor_start; /* logical start offset in block for donor */
__u64 len; /* block length to be moved */
__u64 moved_len; /* moved block length */
};

int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
__u64 len, __u64 *moved_len);

A few small changes are needed in the command that runs ext4 online defrag,
so I will resend the patch in a few days.


> As a side note, the static functions in fs/ext4/move_extent.c really
> don't need the ext4_mext prefix, since static functions don't have
> namespace issues that require a consistent naming scheme. (Sometimes
> a shorter name can also be useful since it avoids needing to line wrap
> function calls with a long list of parameters.)

I will go through all of the functions in move_extent.c
and check whether their names can be shortened.

Regards,
Akira Fujita

2009-06-17 05:51:46

by Akira Fujita

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/3] Add EXT4_IOC_MOVE_EXT ioctl and related functions

Hi Ted,

Theodore Tso wrote:
> As a side note, the static functions in fs/ext4/move_extent.c really
> don't need the ext4_mext prefix, since static functions don't have
> namespace issues that require a consistent naming scheme. (Sometimes
> a shorter name can also be useful since it avoids needing to line wrap
> function calls with a long list of parameters.)

This patch is for "online-defrag" in the ext4 patch queue,
and changes are as follows:

- Remove unneeded function prefix (ext4_mext_ or ext4_)
in fs/ext4/move_extent.c to make function name shorter.
Also rename some functions.
- Fix error handling issue.
- Add some argument checks.

If this patch does not seem to have any problem,
could you add this change to the ext4 patch queue?

Best regards,
Akira Fujita

---
ext4: online defrag -- Change function prefix and some fixes

From: Akira Fujita <[email protected]>

- Remove unneeded function prefix to make function name shorter.
e.g ext4_mext_copy_extent_status() -> copy_extent_status()
Also rename some functions.
- Fix error handling issue.
- Add some argument checks.

Signed-off-by: Akira Fujita <[email protected]>
---
move_extent.c | 214 ++++++++++++++++++++++++++++++----------------------------
1 file changed, 113 insertions(+), 101 deletions(-)
--- linux-2.6.30-git4/fs/ext4/move_extent.c 2009-06-16 11:43:41.000000000 +0900
+++ linux-2.6.30-git4-fix/fs/ext4/move_extent.c 2009-06-16 13:45:14.000000000 +0900
@@ -19,7 +19,7 @@
#include "ext4_extents.h"
#include "ext4.h"

-#define ext4_mext_get_extpath(path, inode, block, ret) \
+#define get_ext_path(path, inode, block, ret) \
do { \
path = ext4_ext_find_extent(inode, block, path); \
if (IS_ERR(path)) { \
@@ -29,13 +29,13 @@
} while (0)

/**
- * ext4_mext_copy_extent_status - Copy the extent's initialization status
+ * copy_extent_status - Copy the extent's initialization status
*
* @src: an extent for getting initialize status
* @dest: an extent to be set the status
*/
static void
-ext4_mext_copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
+copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
{
if (ext4_ext_is_uninitialized(src))
ext4_ext_mark_uninitialized(dest);
@@ -44,7 +44,7 @@ ext4_mext_copy_extent_status(struct ext4
}

/**
- * ext4_mext_next_extent - Search for the next extent and set it to "extent"
+ * mext_next_extent - Search for the next extent and set it to "extent"
*
* @inode: inode which is searched
* @path: this will obtain data for the next extent
@@ -57,7 +57,7 @@ ext4_mext_copy_extent_status(struct ext4
* value on failure.
*/
static int
-ext4_mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent **extent)
{
int ppos, leaf_ppos = path->p_depth;
@@ -113,15 +113,14 @@ ext4_mext_next_extent(struct inode *inod
}

/**
- * ext4_mext_double_down_read - Acquire two inodes' read semaphore
+ * mext_double_down_read - Acquire two inodes' read semaphore
*
* @orig_inode: original inode structure
* @donor_inode: donor inode structure
* Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
*/
static void
-ext4_mext_double_down_read(struct inode *orig_inode,
- struct inode *donor_inode)
+mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;

@@ -142,15 +141,14 @@ ext4_mext_double_down_read(struct inode
}

/**
- * ext4_mext_double_down_write - Acquire two inodes' write semaphore
+ * mext_double_down_write - Acquire two inodes' write semaphore
*
* @orig_inode: original inode structure
* @donor_inode: donor inode structure
* Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
*/
static void
-ext4_mext_double_down_write(struct inode *orig_inode,
- struct inode *donor_inode)
+mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;

@@ -171,14 +169,14 @@ ext4_mext_double_down_write(struct inode
}

/**
- * ext4_mext_double_up_read - Release two inodes' read semaphore
+ * mext_double_up_read - Release two inodes' read semaphore
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
* Release read semaphore of two inodes (orig and donor).
*/
static void
-ext4_mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
+mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
{
BUG_ON(orig_inode == NULL || donor_inode == NULL);

@@ -187,14 +185,14 @@ ext4_mext_double_up_read(struct inode *o
}

/**
- * ext4_mext_double_up_write - Release two inodes' write semaphore
+ * mext_double_up_write - Release two inodes' write semaphore
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
* Release write semaphore of two inodes (orig and donor).
*/
static void
-ext4_mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
{
BUG_ON(orig_inode == NULL || donor_inode == NULL);

@@ -203,7 +201,7 @@ ext4_mext_double_up_write(struct inode *
}

/**
- * ext4_mext_insert_across_blocks - Insert extents across leaf block
+ * mext_insert_across_blocks - Insert extents across leaf block
*
* @handle: journal handle
* @orig_inode: original inode
@@ -217,7 +215,7 @@ ext4_mext_double_up_write(struct inode *
* or a negative error value on failure.
*/
static int
-ext4_mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
+mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
struct ext4_extent *o_start, struct ext4_extent *o_end,
struct ext4_extent *start_ext, struct ext4_extent *new_ext,
struct ext4_extent *end_ext)
@@ -285,7 +283,7 @@ ext4_mext_insert_across_blocks(handle_t
}

if (new_flag) {
- ext4_mext_get_extpath(orig_path, orig_inode, eblock, err);
+ get_ext_path(orig_path, orig_inode, eblock, err);
if (orig_path == NULL)
goto out;

@@ -295,7 +293,7 @@ ext4_mext_insert_across_blocks(handle_t
}

if (end_flag) {
- ext4_mext_get_extpath(orig_path, orig_inode,
+ get_ext_path(orig_path, orig_inode,
le32_to_cpu(end_ext->ee_block) - 1, err);
if (orig_path == NULL)
goto out;
@@ -315,7 +313,7 @@ out:
}

/**
- * ext4_mext_insert_inside_block - Insert new extent to the extent block
+ * mext_insert_inside_block - Insert new extent to the extent block
*
* @o_start: first original extent to be moved
* @o_end: last original extent to be moved
@@ -329,7 +327,7 @@ out:
* by inserted extents.
*/
static void
-ext4_mext_insert_inside_block(struct ext4_extent *o_start,
+mext_insert_inside_block(struct ext4_extent *o_start,
struct ext4_extent *o_end,
struct ext4_extent *start_ext,
struct ext4_extent *new_ext,
@@ -366,7 +364,7 @@ ext4_mext_insert_inside_block(struct ext
}

/**
- * ext4_mext_insert_extents - Insert new extent
+ * mext_insert_extents - Insert new extent
*
* @handle: journal handle
* @orig_inode: original inode
@@ -378,12 +376,12 @@ ext4_mext_insert_inside_block(struct ext
* @end_ext: last new extent to be inserted
*
* Call the function to insert extents. If we cannot add more extents into
- * the leaf block, we call ext4_mext_insert_across_blocks() to create a
- * new leaf block. Otherwise call ext4_mext_insert_inside_block(). Return 0
+ * the leaf block, we call mext_insert_across_blocks() to create a
+ * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
* on success, or a negative error value on failure.
*/
static int
-ext4_mext_insert_extents(handle_t *handle, struct inode *orig_inode,
+mext_insert_extents(handle_t *handle, struct inode *orig_inode,
struct ext4_ext_path *orig_path,
struct ext4_extent *o_start,
struct ext4_extent *o_end,
@@ -424,15 +422,13 @@ ext4_mext_insert_extents(handle_t *handl
(range_to_move > le16_to_cpu(eh->eh_max)
- le16_to_cpu(eh->eh_entries))) {

- ret = ext4_mext_insert_across_blocks(handle, orig_inode,
- o_start, o_end, start_ext,
- new_ext, end_ext);
+ ret = mext_insert_across_blocks(handle, orig_inode, o_start,
+ o_end, start_ext, new_ext, end_ext);
if (ret < 0)
return ret;
} else
- ext4_mext_insert_inside_block(o_start, o_end, start_ext,
- new_ext, end_ext, eh,
- range_to_move);
+ mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
+ end_ext, eh, range_to_move);

if (depth) {
ret = ext4_handle_dirty_metadata(handle, orig_inode,
@@ -449,7 +445,7 @@ ext4_mext_insert_extents(handle_t *handl
}

/**
- * ext4_mext_leaf_block - Move one leaf extent block into the inode.
+ * mext_leaf_block - Move one leaf extent block into the inode.
*
* @handle: journal handle
* @orig_inode: original inode
@@ -462,11 +458,11 @@ ext4_mext_insert_extents(handle_t *handl
* extents, and the others are located around it.
*
* Therefore, this function creates structures to save extents of the leaf
- * block, and inserts extents by calling ext4_mext_insert_extents() with
+ * block, and inserts extents by calling mext_insert_extents() with
* created extents. Return 0 on success, or a negative error value on failure.
*/
static int
-ext4_mext_leaf_block(handle_t *handle, struct inode *orig_inode,
+mext_leaf_block(handle_t *handle, struct inode *orig_inode,
struct ext4_ext_path *orig_path, struct ext4_extent *dext,
ext4_lblk_t *from)
{
@@ -500,7 +496,7 @@ ext4_mext_leaf_block(handle_t *handle, s
le32_to_cpu(oext->ee_block) + oext_alen) {
start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
le32_to_cpu(oext->ee_block));
- ext4_mext_copy_extent_status(oext, &start_ext);
+ copy_extent_status(oext, &start_ext);
} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
prev_ext = oext - 1;
/*
@@ -513,7 +509,7 @@ ext4_mext_leaf_block(handle_t *handle, s
start_ext.ee_len = cpu_to_le16(
ext4_ext_get_actual_len(prev_ext) +
new_ext_alen);
- ext4_mext_copy_extent_status(prev_ext, &start_ext);
+ copy_extent_status(prev_ext, &start_ext);
new_ext.ee_len = 0;
}
}
@@ -536,7 +532,7 @@ ext4_mext_leaf_block(handle_t *handle, s
end_ext.ee_len =
cpu_to_le16(le32_to_cpu(oext->ee_block) +
oext_alen - 1 - new_ext_end);
- ext4_mext_copy_extent_status(oext, &end_ext);
+ copy_extent_status(oext, &end_ext);
end_ext_alen = ext4_ext_get_actual_len(&end_ext);
ext4_ext_store_pblock(&end_ext,
(ext_pblock(o_end) + oext_alen - end_ext_alen));
@@ -545,14 +541,13 @@ ext4_mext_leaf_block(handle_t *handle, s
oext_alen - end_ext_alen);
}

- ret = ext4_mext_insert_extents(handle, orig_inode,
- orig_path, o_start, o_end, &start_ext,
- &new_ext, &end_ext);
+ ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
+ o_end, &start_ext, &new_ext, &end_ext);
return ret;
}

/**
- * ext4_mext_get_replaced_extent - Compute extents for extent swapping.
+ * mext_calc_swap_extents - Calculate extents for extent swapping.
*
* @tmp_dext: the extent that will belong to the original inode
* @tmp_oext: the extent that will belong to the donor inode
@@ -561,7 +556,7 @@ ext4_mext_leaf_block(handle_t *handle, s
* @max_count: the maximun length of extents
*/
static void
-ext4_mext_get_replaced_extent(struct ext4_extent *tmp_dext,
+mext_calc_swap_extents(struct ext4_extent *tmp_dext,
struct ext4_extent *tmp_oext,
ext4_lblk_t orig_off, ext4_lblk_t donor_off,
ext4_lblk_t max_count)
@@ -594,12 +589,12 @@ ext4_mext_get_replaced_extent(struct ext

tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));

- ext4_mext_copy_extent_status(&oext_old, tmp_dext);
- ext4_mext_copy_extent_status(&dext_old, tmp_oext);
+ copy_extent_status(&oext_old, tmp_dext);
+ copy_extent_status(&dext_old, tmp_oext);
}

/**
- * ext4_mext_replace_branches - Replace original extents with new extents
+ * mext_replace_branches - Replace original extents with new extents
*
* @handle: journal handle
* @orig_inode: original inode
@@ -619,7 +614,7 @@ ext4_mext_get_replaced_extent(struct ext
* Return 0 on success, or a negative error value on failure.
*/
static int
-ext4_mext_replace_branches(handle_t *handle, struct inode *orig_inode,
+mext_replace_branches(handle_t *handle, struct inode *orig_inode,
struct inode *donor_inode, ext4_lblk_t from,
ext4_lblk_t count)
{
@@ -633,15 +628,15 @@ ext4_mext_replace_branches(handle_t *han
int replaced_count = 0;
int dext_alen;

- ext4_mext_double_down_write(orig_inode, donor_inode);
+ mext_double_down_write(orig_inode, donor_inode);

/* Get the original extent for the block "orig_off" */
- ext4_mext_get_extpath(orig_path, orig_inode, orig_off, err);
+ get_ext_path(orig_path, orig_inode, orig_off, err);
if (orig_path == NULL)
goto out;

/* Get the donor extent for the head */
- ext4_mext_get_extpath(donor_path, donor_inode, donor_off, err);
+ get_ext_path(donor_path, donor_inode, donor_off, err);
if (donor_path == NULL)
goto out;
depth = ext_depth(orig_inode);
@@ -652,22 +647,22 @@ ext4_mext_replace_branches(handle_t *han
dext = donor_path[depth].p_ext;
tmp_dext = *dext;

- ext4_mext_get_replaced_extent(&tmp_dext, &tmp_oext, orig_off,
+ mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count);

/* Loop for the donor extents */
while (1) {
/* The extent for donor must be found. */
- BUG_ON(!dext || donor_off != le32_to_cpu(dext->ee_block));
+ BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));

/* Set donor extent to orig extent */
- err = ext4_mext_leaf_block(handle, orig_inode,
+ err = mext_leaf_block(handle, orig_inode,
orig_path, &tmp_dext, &orig_off);
if (err < 0)
goto out;

/* Set orig extent to donor extent */
- err = ext4_mext_leaf_block(handle, donor_inode,
+ err = mext_leaf_block(handle, donor_inode,
donor_path, &tmp_oext, &donor_off);
if (err < 0)
goto out;
@@ -683,7 +678,7 @@ ext4_mext_replace_branches(handle_t *han

if (orig_path)
ext4_ext_drop_refs(orig_path);
- ext4_mext_get_extpath(orig_path, orig_inode, orig_off, err);
+ get_ext_path(orig_path, orig_inode, orig_off, err);
if (orig_path == NULL)
goto out;
depth = ext_depth(orig_inode);
@@ -697,7 +692,7 @@ ext4_mext_replace_branches(handle_t *han

if (donor_path)
ext4_ext_drop_refs(donor_path);
- ext4_mext_get_extpath(donor_path, donor_inode,
+ get_ext_path(donor_path, donor_inode,
donor_off, err);
if (donor_path == NULL)
goto out;
@@ -710,7 +705,7 @@ ext4_mext_replace_branches(handle_t *han
}
tmp_dext = *dext;

- ext4_mext_get_replaced_extent(&tmp_dext, &tmp_oext, orig_off,
+ mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off,
count - replaced_count);
}
@@ -725,12 +720,12 @@ out:
kfree(donor_path);
}

- ext4_mext_double_up_write(orig_inode, donor_inode);
+ mext_double_up_write(orig_inode, donor_inode);
return err;
}

/**
- * ext4_mext_partial - Move extent data per page
+ * move_extent_par_page - Move extent data per page
*
* @o_filp: file structure of original file
* @donor_inode: donor inode
@@ -740,12 +735,12 @@ out:
* @uninit: orig extent is uninitialized or not
*
* Save the data in original inode blocks and replace original inode extents
- * with donor inode extents by calling ext4_mext_replace_branches().
+ * with donor inode extents by calling mext_replace_branches().
* Finally, write out the saved data in new original inode blocks. Return 0
* on success, or a negative error value on failure.
*/
static int
-ext4_mext_partial(struct file *o_filp, struct inode *donor_inode,
+move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
int block_len_in_page, int uninit)
{
@@ -788,7 +783,7 @@ ext4_mext_partial(struct file *o_filp, s
* Just swap data blocks between orig and donor.
*/
if (uninit) {
- ret = ext4_mext_replace_branches(handle, orig_inode,
+ ret = mext_replace_branches(handle, orig_inode,
donor_inode, orig_blk_offset,
block_len_in_page);

@@ -841,7 +836,7 @@ ext4_mext_partial(struct file *o_filp, s
/* Release old bh and drop refs */
try_to_release_page(page, 0);

- ret = ext4_mext_replace_branches(handle, orig_inode, donor_inode,
+ ret = mext_replace_branches(handle, orig_inode, donor_inode,
orig_blk_offset, block_len_in_page);
if (ret < 0)
goto out;
@@ -884,7 +879,7 @@ out2:
}

/**
- * ext4_mext_check_argumants - Check whether move extent can be done
+ * mext_check_arguments - Check whether move extent can be done
*
* @orig_inode: original inode
* @donor_inode: donor inode
@@ -898,9 +893,9 @@ out2:
* Return 0 on success, or a negative error value on failure.
*/
static int
-ext4_mext_check_arguments(struct inode *orig_inode,
+mext_check_arguments(struct inode *orig_inode,
struct inode *donor_inode, __u64 orig_start,
- __u64 donor_start, __u64 len, __u64 moved_len)
+ __u64 donor_start, __u64 *len, __u64 moved_len)
{
/* Regular file check */
if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
@@ -945,6 +940,11 @@ ext4_mext_check_arguments(struct inode *
return -EOPNOTSUPP;
}

+ if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
+ ext4_debug("ext4 move extent: File size is 0 byte\n");
+ return -EINVAL;
+ }
+
/* Start offset should be same */
if (orig_start != donor_start) {
ext4_debug("ext4 move extent: orig and donor's start "
@@ -962,8 +962,8 @@ ext4_mext_check_arguments(struct inode *

if ((orig_start > MAX_DEFRAG_SIZE) ||
(donor_start > MAX_DEFRAG_SIZE) ||
- (len > MAX_DEFRAG_SIZE) ||
- (orig_start + len > MAX_DEFRAG_SIZE)) {
+ (*len > MAX_DEFRAG_SIZE) ||
+ (orig_start + *len > MAX_DEFRAG_SIZE)) {
ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
"[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
orig_inode->i_ino, donor_inode->i_ino);
@@ -980,15 +980,15 @@ ext4_mext_check_arguments(struct inode *
return -EINVAL;
}

- if (orig_start + len > donor_inode->i_size) {
+ if (orig_start + *len > donor_inode->i_size) {
ext4_debug("ext4 move extent: End offset [%llu] should "
"be less than donor file size [%lld]."
"So adjust length from %llu to %lld "
"[ino:orig %lu, donor %lu]\n",
- orig_start + len, donor_inode->i_size,
- len, donor_inode->i_size - orig_start,
+ orig_start + *len, donor_inode->i_size,
+ *len, donor_inode->i_size - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- len = donor_inode->i_size - orig_start;
+ *len = donor_inode->i_size - orig_start;
}
} else {
if (orig_start >= orig_inode->i_size) {
@@ -1000,22 +1000,29 @@ ext4_mext_check_arguments(struct inode *
return -EINVAL;
}

- if (orig_start + len > orig_inode->i_size) {
+ if (orig_start + *len > orig_inode->i_size) {
ext4_debug("ext4 move extent: Adjust length "
"from %llu to %lld. Because it should be "
"less than original file size "
"[ino:orig %lu, donor %lu]\n",
- len, orig_inode->i_size - orig_start,
+ *len, orig_inode->i_size - orig_start,
orig_inode->i_ino, donor_inode->i_ino);
- len = orig_inode->i_size - orig_start;
+ *len = orig_inode->i_size - orig_start;
}
}

+ if (!*len) {
+ ext4_debug("ext4 move extent: len shoudld not be 0 "
+ "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+ donor_inode->i_ino);
+ return -EINVAL;
+ }
+
return 0;
}

/**
- * ext4_mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
*
* @inode1: the inode structure
* @inode2: the inode structure
@@ -1023,7 +1030,8 @@ ext4_mext_check_arguments(struct inode *
* Lock two inodes' i_mutex by i_ino order. This function is moved from
* fs/inode.c.
*/
-void ext4_mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+static void
+mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
if (inode1)
@@ -1043,7 +1051,7 @@ void ext4_mext_inode_double_lock(struct
}

/**
- * ext4_mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
*
* @inode1: the inode that is released first
* @inode2: the inode that is released second
@@ -1051,7 +1059,8 @@ void ext4_mext_inode_double_lock(struct
* This function is moved from fs/inode.c.
*/

-void ext4_mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+static void
+mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
if (inode1)
mutex_unlock(&inode1->i_mutex);
@@ -1082,7 +1091,7 @@ void ext4_mext_inode_double_unlock(struc
* after hole behind.
* 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
* or the ext_cur exceeds the block_end which is last logical block number.
- * 3:To get the length of continues area, call ext4_mext_next_extent()
+ * 3:To get the length of continuous area, call mext_next_extent()
* specified with the ext_cur (initial value is holecheck_path) re-cursive,
* until find un-continuous extent, the start logical block number exceeds
* the block_end or the extent points to the last extent.
@@ -1121,14 +1130,13 @@ ext4_move_extents(struct file *o_filp, s
int uninit;

/* protect orig and donor against a truncate */
- ext4_mext_inode_double_lock(orig_inode, donor_inode);
+ mext_inode_double_lock(orig_inode, donor_inode);

- ext4_mext_double_down_read(orig_inode, donor_inode);
+ mext_double_down_read(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
- ret = ext4_mext_check_arguments(orig_inode, donor_inode,
- orig_start, donor_start, len,
- *moved_len);
- ext4_mext_double_up_read(orig_inode, donor_inode);
+ ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ donor_start, &len, *moved_len);
+ mext_double_up_read(orig_inode, donor_inode);
if (ret)
goto out2;

@@ -1137,19 +1145,21 @@ ext4_move_extents(struct file *o_filp, s
if (file_end < block_end)
len -= block_end - file_end;

- ext4_mext_get_extpath(orig_path, orig_inode, block_start, ret);
+ get_ext_path(orig_path, orig_inode, block_start, ret);
if (orig_path == NULL)
goto out2;

/* Get path structure to check the hole */
- ext4_mext_get_extpath(holecheck_path, orig_inode, block_start, ret);
+ get_ext_path(holecheck_path, orig_inode, block_start, ret);
if (holecheck_path == NULL)
goto out;

depth = ext_depth(orig_inode);
ext_cur = holecheck_path[depth].p_ext;
- if (ext_cur == NULL)
+ if (ext_cur == NULL) {
+ ret = -EINVAL;
goto out;
+ }

/*
* Get proper extent whose ee_block is beyond block_start
@@ -1157,13 +1167,13 @@ ext4_move_extents(struct file *o_filp, s
*/
if (le32_to_cpu(ext_cur->ee_block) +
ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
- last_extent = ext4_mext_next_extent(orig_inode,
+ last_extent = mext_next_extent(orig_inode,
holecheck_path, &ext_cur);
if (last_extent < 0) {
ret = last_extent;
goto out;
}
- last_extent = ext4_mext_next_extent(orig_inode, orig_path,
+ last_extent = mext_next_extent(orig_inode, orig_path,
&ext_dummy);
if (last_extent < 0) {
ret = last_extent;
@@ -1176,6 +1186,7 @@ ext4_move_extents(struct file *o_filp, s
if (le32_to_cpu(ext_cur->ee_block) > block_end) {
ext4_debug("ext4 move extent: The specified range of file "
"may be the hole\n");
+ ret = -EINVAL;
goto out;
}

@@ -1192,8 +1203,8 @@ ext4_move_extents(struct file *o_filp, s
seq_blocks = block_end - seq_start + 1;

ext_prev = ext_cur;
- last_extent = ext4_mext_next_extent(orig_inode,
- holecheck_path, &ext_cur);
+ last_extent = mext_next_extent(orig_inode, holecheck_path,
+ &ext_cur);
if (last_extent < 0) {
ret = last_extent;
break;
@@ -1247,13 +1258,16 @@ ext4_move_extents(struct file *o_filp, s
while (orig_page_offset <= seq_end_page) {

/* Swap original branches with new branches */
- ret = ext4_mext_partial(o_filp, donor_inode,
+ ret = move_extent_par_page(o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
block_len_in_page, uninit);
if (ret < 0)
goto out;
orig_page_offset++;
+ /* Count how many blocks we have exchanged */
+ *moved_len += block_len_in_page;
+ BUG_ON(*moved_len > len);

data_offset_in_page = 0;
rest_blocks -= block_len_in_page;
@@ -1266,7 +1280,7 @@ ext4_move_extents(struct file *o_filp, s
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
- ext4_mext_get_extpath(holecheck_path, orig_inode,
+ get_ext_path(holecheck_path, orig_inode,
seq_start, ret);
if (holecheck_path == NULL)
break;
@@ -1275,7 +1289,7 @@ ext4_move_extents(struct file *o_filp, s
/* Decrease buffer counter */
if (orig_path)
ext4_ext_drop_refs(orig_path);
- ext4_mext_get_extpath(orig_path, orig_inode, seq_start, ret);
+ get_ext_path(orig_path, orig_inode, seq_start, ret);
if (orig_path == NULL)
break;

@@ -1294,15 +1308,13 @@ out:
kfree(holecheck_path);
}
out2:
- ext4_mext_inode_double_unlock(orig_inode, donor_inode);
+ mext_inode_double_unlock(orig_inode, donor_inode);

- if (ret) {
- *moved_len = orig_page_offset * blocks_per_page;
+ if (ret)
return ret;
- }

- /* Set moved block length */
- *moved_len = len;
+ /* All of the specified blocks must have been exchanged on success */
+ BUG_ON(*moved_len != len);

return 0;
}

2009-06-18 00:12:39

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [RFC][PATCH 1/3] Add EXT4_IOC_MOVE_EXT ioctl and related functions

On Wed, Jun 17, 2009 at 02:51:30PM +0900, Akira Fujita wrote:
> Hi Ted,
>
> Theodore Tso wrote:
> > As a side note, the static functions in fs/ext4/move_extent.c really
> > don't need the ext4_mext prefix, since static functions don't have
> > namespace issues that require a consistent naming scheme. (Sometimes
> > a shorter name can also be useful since it avoids needing to line wrap
> > function calls with a long list of parameters.)
>
> This patch is for "online-defrag" in the ext4 patch queue,
> and changes are as follows:
>
> - Remove unneeded function prefix (ext4_mext_ or ext4_)
> in fs/ext4/move_extent.c to make function name shorter.
> And change some name of functions.
> - Fix error handling issue.
> - Add some argument checks.
>
> If this patch does not seem to have any problem,
> could you add this change to the ext4 patch queue?

Thanks for your patch; I've folded your improvements into the
online-defrag patch in the ext4 patch queue.

Best regards,

- Ted