2014-02-02 05:44:34

by Namjae Jeon

[permalink] [raw]
Subject: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

From: Namjae Jeon <[email protected]>

Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

Signed-off-by: Namjae Jeon <[email protected]>
Signed-off-by: Ashish Sangwan <[email protected]>
---
fs/ext4/ext4.h | 3 +
fs/ext4/extents.c | 297 ++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/move_extent.c | 2 +-
include/trace/events/ext4.h | 25 ++++
4 files changed, 325 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e618503..5cc015a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2745,6 +2745,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
+extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);

/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2754,6 +2755,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
__u64 len, __u64 *moved_len);
+extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent);

/* page-io.c */
extern int __init ext4_init_pageio(void);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 267c9fb..12338c1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4566,12 +4566,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
unsigned int credits, blkbits = inode->i_blkbits;

/* Return error if mode is not supported */
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE))
return -EOPNOTSUPP;

if (mode & FALLOC_FL_PUNCH_HOLE)
return ext4_punch_hole(inode, offset, len);

+ if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ return ext4_collapse_range(inode, offset, len);
+
ret = ext4_convert_inline_data(inode);
if (ret)
return ret;
@@ -4870,3 +4874,294 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
ext4_es_lru_add(inode);
return error;
}
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ int credits, err;
+
+ /*
+ * Check if need to extend journal credits
+ * 3 for leaf, sb, and inode plus 2 (bmap and group
+ * descriptor) for each block group; assume two block
+ * groups
+ */
+ if (handle->h_buffer_credits < 7) {
+ credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ /* EAGAIN is success */
+ if (err && err != -EAGAIN)
+ return err;
+ }
+
+ err = ext4_ext_get_access(handle, inode, path);
+ return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+ struct inode *inode, handle_t *handle,
+ ext4_lblk_t *start)
+{
+ int depth, err = 0;
+ struct ext4_extent *ex_start, *ex_last;
+ bool update = 0;
+ depth = path->p_depth;
+
+ while (depth >= 0) {
+ if (depth == path->p_depth) {
+ ex_start = path[depth].p_ext;
+ if (!ex_start)
+ return -EIO;
+
+ ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ if (!ex_last)
+ return -EIO;
+
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+ update = 1;
+
+ *start = ex_last->ee_block +
+ ext4_ext_get_actual_len(ex_last);
+
+ while (ex_start <= ex_last) {
+ ex_start->ee_block -= shift;
+ if (ex_start >
+ EXT_FIRST_EXTENT(path[depth].p_hdr)) {
+ if (ext4_ext_try_to_merge_right(inode,
+ path, ex_start - 1))
+ ex_last--;
+ }
+ ex_start++;
+ }
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (--depth < 0 || !update)
+ break;
+ }
+
+ /* Update index too */
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ path[depth].p_idx->ei_block -= shift;
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ /* we are done if current index is not a starting index */
+ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+ break;
+
+ depth--;
+ }
+
+out:
+ return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lies in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+ ext4_lblk_t start, ext4_lblk_t shift)
+{
+ struct ext4_ext_path *path;
+ int ret = 0, depth;
+ struct ext4_extent *extent;
+ ext4_lblk_t stop_block, current_block;
+ ext4_lblk_t ex_start, ex_end;
+
+ /* Let path point to the last extent */
+ path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (!extent) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+ }
+
+ stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ /* Nothing to shift, if hole is at the end of file */
+ if (start >= stop_block)
+ return ret;
+
+ /*
+ * Don't start shifting extents until we make sure the hole is big
+ * enough to accomodate the shift.
+ */
+ path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ ex_start = extent->ee_block;
+ ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ if ((ex_start > start - 1 && shift > ex_start) ||
+ (ex_end > start - shift))
+ return -EINVAL;
+
+ /* Its safe to start updating extents */
+ while (start < stop_block) {
+ path = ext4_ext_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ current_block = extent->ee_block;
+ if (start > current_block) {
+ /* Hole, move to the next extent */
+ ret = mext_next_extent(inode, path, &extent);
+ if (ret != 0) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret == 1)
+ ret = 0;
+ break;
+ }
+ }
+ ret = ext4_ext_shift_path_extents(path, shift, inode,
+ handle, &start);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * ext4_collapse_range:
+ * This implements the fallocate's collapse range functionality for ext4
+ * Returns: 0 and non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t punch_start, punch_stop;
+ handle_t *handle;
+ unsigned int credits;
+ unsigned int rounding;
+ loff_t ioffset, new_size;
+ int ret;
+ unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+ BUG_ON(offset + len > i_size_read(inode));
+
+ /* Collapse range works only on fs block size aligned offsets. */
+ if (offset & blksize_mask || len & blksize_mask)
+ return -EINVAL;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (EXT4_SB(sb)->s_cluster_ratio > 1)
+ return -EOPNOTSUPP;
+
+ /* Currently just for extent based files */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return -EOPNOTSUPP;
+
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
+ trace_ext4_collapse_range(inode, offset, len);
+
+ punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ rounding = max_t(uint, 1 << EXT4_BLOCK_SIZE_BITS(sb), PAGE_CACHE_SIZE);
+ ioffset = offset & ~(rounding - 1);
+
+ /* Write out all dirty pages */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, -1);
+ if (ret)
+ return ret;
+
+ /* Take mutex lock */
+ mutex_lock(&inode->i_mutex);
+
+ /* Wait for existing dio to complete */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ truncate_pagecache_range(inode, ioffset, -1);
+
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_dio;
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_es_remove_extent(inode, punch_start,
+ EXT_MAX_BLOCKS - punch_start - 1);
+ if (ret)
+ goto journal_stop;
+
+ ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ if (ret)
+ goto journal_stop;
+
+ ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+ punch_stop - punch_start);
+ if (ret)
+ goto journal_stop;
+
+ if ((offset + len) > i_size_read(inode))
+ new_size = offset;
+ else
+ new_size = i_size_read(inode) - len;
+
+ truncate_setsize(inode, new_size);
+ EXT4_I(inode)->i_disksize = new_size;
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+
+journal_stop:
+ ext4_journal_stop(handle);
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 773b503..b474558 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
* ext4_ext_path structure refers to the last extent, or a negative error
* value on failure.
*/
-static int
+int
mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent **extent)
{
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 197d312..90e2f71 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2410,6 +2410,31 @@ TRACE_EVENT(ext4_es_shrink_exit,
__entry->shrunk_nr, __entry->cache_cnt)
);

+TRACE_EVENT(ext4_collapse_range,
+ TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
+
+ TP_ARGS(inode, offset, len),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(loff_t, offset)
+ __field(loff_t, len)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->offset = offset;
+ __entry->len = len;
+ ),
+
+ TP_printk("dev %d,%d ino %lu offset %lld len %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ __entry->offset, __entry->len)
+);
+
#endif /* _TRACE_EXT4_H */

/* This part must be outside protection */
--
1.7.9.5

_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs


2014-02-10 23:35:05

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

On Sun, Feb 02, 2014 at 02:44:34PM +0900, Namjae Jeon wrote:
> From: Namjae Jeon <[email protected]>
>
> Add support FALLOC_FL_COLLAPSE_RANGE for fallocate
>
> Signed-off-by: Namjae Jeon <[email protected]>
> Signed-off-by: Ashish Sangwan <[email protected]>

Can someone more familiar with ext4 please look at this?

I'm happy to stage this through the XFS tree if it's ok, but I'll
need a reviewed-by tag before I take it. If the XFS code is ready to
go before the ext4 stuff, then I'll just take the XFS functionality
and the ext4 stuff can go through the ext4 tree at a later date...

Cheers,

Dave.

> ---
> fs/ext4/ext4.h | 3 +
> fs/ext4/extents.c | 297 ++++++++++++++++++++++++++++++++++++++++++-
> fs/ext4/move_extent.c | 2 +-
> include/trace/events/ext4.h | 25 ++++
> 4 files changed, 325 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index e618503..5cc015a 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2745,6 +2745,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
> extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> __u64 start, __u64 len);
> extern int ext4_ext_precache(struct inode *inode);
> +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
>
> /* move_extent.c */
> extern void ext4_double_down_write_data_sem(struct inode *first,
> @@ -2754,6 +2755,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
> extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
> __u64 start_orig, __u64 start_donor,
> __u64 len, __u64 *moved_len);
> +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
> + struct ext4_extent **extent);
>
> /* page-io.c */
> extern int __init ext4_init_pageio(void);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 267c9fb..12338c1 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4566,12 +4566,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> unsigned int credits, blkbits = inode->i_blkbits;
>
> /* Return error if mode is not supported */
> - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> + FALLOC_FL_COLLAPSE_RANGE))
> return -EOPNOTSUPP;
>
> if (mode & FALLOC_FL_PUNCH_HOLE)
> return ext4_punch_hole(inode, offset, len);
>
> + if (mode & FALLOC_FL_COLLAPSE_RANGE)
> + return ext4_collapse_range(inode, offset, len);
> +
> ret = ext4_convert_inline_data(inode);
> if (ret)
> return ret;
> @@ -4870,3 +4874,294 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> ext4_es_lru_add(inode);
> return error;
> }
> +
> +/*
> + * ext4_access_path:
> + * Function to access the path buffer for marking it dirty.
> + * It also checks if there are sufficient credits left in the journal handle
> + * to update path.
> + */
> +static int
> +ext4_access_path(handle_t *handle, struct inode *inode,
> + struct ext4_ext_path *path)
> +{
> + int credits, err;
> +
> + /*
> + * Check if need to extend journal credits
> + * 3 for leaf, sb, and inode plus 2 (bmap and group
> + * descriptor) for each block group; assume two block
> + * groups
> + */
> + if (handle->h_buffer_credits < 7) {
> + credits = ext4_writepage_trans_blocks(inode);
> + err = ext4_ext_truncate_extend_restart(handle, inode, credits);
> + /* EAGAIN is success */
> + if (err && err != -EAGAIN)
> + return err;
> + }
> +
> + err = ext4_ext_get_access(handle, inode, path);
> + return err;
> +}
> +
> +/*
> + * ext4_ext_shift_path_extents:
> + * Shift the extents of a path structure lying between path[depth].p_ext
> + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
> + * from starting block for each extent.
> + */
> +static int
> +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
> + struct inode *inode, handle_t *handle,
> + ext4_lblk_t *start)
> +{
> + int depth, err = 0;
> + struct ext4_extent *ex_start, *ex_last;
> + bool update = 0;
> + depth = path->p_depth;
> +
> + while (depth >= 0) {
> + if (depth == path->p_depth) {
> + ex_start = path[depth].p_ext;
> + if (!ex_start)
> + return -EIO;
> +
> + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
> + if (!ex_last)
> + return -EIO;
> +
> + err = ext4_access_path(handle, inode, path + depth);
> + if (err)
> + goto out;
> +
> + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
> + update = 1;
> +
> + *start = ex_last->ee_block +
> + ext4_ext_get_actual_len(ex_last);
> +
> + while (ex_start <= ex_last) {
> + ex_start->ee_block -= shift;
> + if (ex_start >
> + EXT_FIRST_EXTENT(path[depth].p_hdr)) {
> + if (ext4_ext_try_to_merge_right(inode,
> + path, ex_start - 1))
> + ex_last--;
> + }
> + ex_start++;
> + }
> + err = ext4_ext_dirty(handle, inode, path + depth);
> + if (err)
> + goto out;
> +
> + if (--depth < 0 || !update)
> + break;
> + }
> +
> + /* Update index too */
> + err = ext4_access_path(handle, inode, path + depth);
> + if (err)
> + goto out;
> +
> + path[depth].p_idx->ei_block -= shift;
> + err = ext4_ext_dirty(handle, inode, path + depth);
> + if (err)
> + goto out;
> +
> + /* we are done if current index is not a starting index */
> + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
> + break;
> +
> + depth--;
> + }
> +
> +out:
> + return err;
> +}
> +
> +/*
> + * ext4_ext_shift_extents:
> + * All the extents which lies in the range from start to the last allocated
> + * block for the file are shifted downwards by shift blocks.
> + * On success, 0 is returned, error otherwise.
> + */
> +static int
> +ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
> + ext4_lblk_t start, ext4_lblk_t shift)
> +{
> + struct ext4_ext_path *path;
> + int ret = 0, depth;
> + struct ext4_extent *extent;
> + ext4_lblk_t stop_block, current_block;
> + ext4_lblk_t ex_start, ex_end;
> +
> + /* Let path point to the last extent */
> + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
> + if (IS_ERR(path))
> + return PTR_ERR(path);
> +
> + depth = path->p_depth;
> + extent = path[depth].p_ext;
> + if (!extent) {
> + ext4_ext_drop_refs(path);
> + kfree(path);
> + return ret;
> + }
> +
> + stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
> + ext4_ext_drop_refs(path);
> + kfree(path);
> +
> + /* Nothing to shift, if hole is at the end of file */
> + if (start >= stop_block)
> + return ret;
> +
> + /*
> + * Don't start shifting extents until we make sure the hole is big
> + * enough to accomodate the shift.
> + */
> + path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
> + depth = path->p_depth;
> + extent = path[depth].p_ext;
> + ex_start = extent->ee_block;
> + ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
> + ext4_ext_drop_refs(path);
> + kfree(path);
> +
> + if ((ex_start > start - 1 && shift > ex_start) ||
> + (ex_end > start - shift))
> + return -EINVAL;
> +
> + /* Its safe to start updating extents */
> + while (start < stop_block) {
> + path = ext4_ext_find_extent(inode, start, NULL, 0);
> + if (IS_ERR(path))
> + return PTR_ERR(path);
> + depth = path->p_depth;
> + extent = path[depth].p_ext;
> + current_block = extent->ee_block;
> + if (start > current_block) {
> + /* Hole, move to the next extent */
> + ret = mext_next_extent(inode, path, &extent);
> + if (ret != 0) {
> + ext4_ext_drop_refs(path);
> + kfree(path);
> + if (ret == 1)
> + ret = 0;
> + break;
> + }
> + }
> + ret = ext4_ext_shift_path_extents(path, shift, inode,
> + handle, &start);
> + ext4_ext_drop_refs(path);
> + kfree(path);
> + if (ret)
> + break;
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * ext4_collapse_range:
> + * This implements the fallocate's collapse range functionality for ext4
> + * Returns: 0 and non-zero on error.
> + */
> +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
> +{
> + struct super_block *sb = inode->i_sb;
> + ext4_lblk_t punch_start, punch_stop;
> + handle_t *handle;
> + unsigned int credits;
> + unsigned int rounding;
> + loff_t ioffset, new_size;
> + int ret;
> + unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
> +
> + BUG_ON(offset + len > i_size_read(inode));
> +
> + /* Collapse range works only on fs block size aligned offsets. */
> + if (offset & blksize_mask || len & blksize_mask)
> + return -EINVAL;
> +
> + if (!S_ISREG(inode->i_mode))
> + return -EOPNOTSUPP;
> +
> + if (EXT4_SB(sb)->s_cluster_ratio > 1)
> + return -EOPNOTSUPP;
> +
> + /* Currently just for extent based files */
> + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> + return -EOPNOTSUPP;
> +
> + if (IS_SWAPFILE(inode))
> + return -ETXTBSY;
> +
> + trace_ext4_collapse_range(inode, offset, len);
> +
> + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
> + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
> +
> + rounding = max_t(uint, 1 << EXT4_BLOCK_SIZE_BITS(sb), PAGE_CACHE_SIZE);
> + ioffset = offset & ~(rounding - 1);
> +
> + /* Write out all dirty pages */
> + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, -1);
> + if (ret)
> + return ret;
> +
> + /* Take mutex lock */
> + mutex_lock(&inode->i_mutex);
> +
> + /* Wait for existing dio to complete */
> + ext4_inode_block_unlocked_dio(inode);
> + inode_dio_wait(inode);
> +
> + truncate_pagecache_range(inode, ioffset, -1);
> +
> + credits = ext4_writepage_trans_blocks(inode);
> + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out_dio;
> + }
> +
> + down_write(&EXT4_I(inode)->i_data_sem);
> +
> + ext4_discard_preallocations(inode);
> +
> + ret = ext4_es_remove_extent(inode, punch_start,
> + EXT_MAX_BLOCKS - punch_start - 1);
> + if (ret)
> + goto journal_stop;
> +
> + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
> + if (ret)
> + goto journal_stop;
> +
> + ret = ext4_ext_shift_extents(inode, handle, punch_stop,
> + punch_stop - punch_start);
> + if (ret)
> + goto journal_stop;
> +
> + if ((offset + len) > i_size_read(inode))
> + new_size = offset;
> + else
> + new_size = i_size_read(inode) - len;
> +
> + truncate_setsize(inode, new_size);
> + EXT4_I(inode)->i_disksize = new_size;
> +
> + inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
> + ext4_mark_inode_dirty(handle, inode);
> +
> +journal_stop:
> + ext4_journal_stop(handle);
> + up_write(&EXT4_I(inode)->i_data_sem);
> +
> +out_dio:
> + ext4_inode_resume_unlocked_dio(inode);
> + mutex_unlock(&inode->i_mutex);
> + return ret;
> +}
> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
> index 773b503..b474558 100644
> --- a/fs/ext4/move_extent.c
> +++ b/fs/ext4/move_extent.c
> @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
> * ext4_ext_path structure refers to the last extent, or a negative error
> * value on failure.
> */
> -static int
> +int
> mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
> struct ext4_extent **extent)
> {
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index 197d312..90e2f71 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -2410,6 +2410,31 @@ TRACE_EVENT(ext4_es_shrink_exit,
> __entry->shrunk_nr, __entry->cache_cnt)
> );
>
> +TRACE_EVENT(ext4_collapse_range,
> + TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
> +
> + TP_ARGS(inode, offset, len),
> +
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(ino_t, ino)
> + __field(loff_t, offset)
> + __field(loff_t, len)
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = inode->i_sb->s_dev;
> + __entry->ino = inode->i_ino;
> + __entry->offset = offset;
> + __entry->len = len;
> + ),
> +
> + TP_printk("dev %d,%d ino %lu offset %lld len %lld",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + (unsigned long) __entry->ino,
> + __entry->offset, __entry->len)
> +);
> +
> #endif /* _TRACE_EXT4_H */
>
> /* This part must be outside protection */
> --
> 1.7.9.5
>
>

--
Dave Chinner
[email protected]

2014-02-11 11:59:13

by Lukas Czerner

[permalink] [raw]
Subject: Re: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

On Sun, 2 Feb 2014, Namjae Jeon wrote:

> Date: Sun, 2 Feb 2014 14:44:34 +0900
> From: Namjae Jeon <[email protected]>
> To: [email protected], [email protected], [email protected], [email protected],
> [email protected], [email protected], [email protected]
> Cc: [email protected], [email protected],
> [email protected], [email protected],
> Namjae Jeon <[email protected]>, Namjae Jeon <[email protected]>,
> Ashish Sangwan <[email protected]>
> Subject: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for
> fallocate
>
> From: Namjae Jeon <[email protected]>
>
> Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

Description is missing here, please provide FALLOC_FL_COLLAPSE_RANGE
description so people know what it's supposed to be doing.

more comments bellow.

Thanks!
-Lukas

>
> Signed-off-by: Namjae Jeon <[email protected]>
> Signed-off-by: Ashish Sangwan <[email protected]>
> ---
> fs/ext4/ext4.h | 3 +
> fs/ext4/extents.c | 297 ++++++++++++++++++++++++++++++++++++++++++-
> fs/ext4/move_extent.c | 2 +-
> include/trace/events/ext4.h | 25 ++++
> 4 files changed, 325 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index e618503..5cc015a 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2745,6 +2745,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
> extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> __u64 start, __u64 len);
> extern int ext4_ext_precache(struct inode *inode);
> +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
>
> /* move_extent.c */
> extern void ext4_double_down_write_data_sem(struct inode *first,
> @@ -2754,6 +2755,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
> extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
> __u64 start_orig, __u64 start_donor,
> __u64 len, __u64 *moved_len);
> +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
> + struct ext4_extent **extent);
>
> /* page-io.c */
> extern int __init ext4_init_pageio(void);
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 267c9fb..12338c1 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4566,12 +4566,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
> unsigned int credits, blkbits = inode->i_blkbits;
>
> /* Return error if mode is not supported */
> - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
> + FALLOC_FL_COLLAPSE_RANGE))
> return -EOPNOTSUPP;
>
> if (mode & FALLOC_FL_PUNCH_HOLE)
> return ext4_punch_hole(inode, offset, len);
>
> + if (mode & FALLOC_FL_COLLAPSE_RANGE)
> + return ext4_collapse_range(inode, offset, len);
> +
> ret = ext4_convert_inline_data(inode);
> if (ret)
> return ret;
> @@ -4870,3 +4874,294 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> ext4_es_lru_add(inode);
> return error;
> }
> +
> +/*
> + * ext4_access_path:
> + * Function to access the path buffer for marking it dirty.
> + * It also checks if there are sufficient credits left in the journal handle
> + * to update path.
> + */
> +static int
> +ext4_access_path(handle_t *handle, struct inode *inode,
> + struct ext4_ext_path *path)
> +{
> + int credits, err;
> +
> + /*
> + * Check if need to extend journal credits
> + * 3 for leaf, sb, and inode plus 2 (bmap and group
> + * descriptor) for each block group; assume two block
> + * groups
> + */
> + if (handle->h_buffer_credits < 7) {
> + credits = ext4_writepage_trans_blocks(inode);
> + err = ext4_ext_truncate_extend_restart(handle, inode, credits);
> + /* EAGAIN is success */
> + if (err && err != -EAGAIN)
> + return err;
> + }
> +
> + err = ext4_ext_get_access(handle, inode, path);
> + return err;
> +}
> +
> +/*
> + * ext4_ext_shift_path_extents:
> + * Shift the extents of a path structure lying between path[depth].p_ext
> + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
> + * from starting block for each extent.
> + */
> +static int
> +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
> + struct inode *inode, handle_t *handle,
> + ext4_lblk_t *start)
> +{
> + int depth, err = 0;
> + struct ext4_extent *ex_start, *ex_last;
> + bool update = 0;
> + depth = path->p_depth;
> +
> + while (depth >= 0) {
> + if (depth == path->p_depth) {
> + ex_start = path[depth].p_ext;
> + if (!ex_start)
> + return -EIO;
> +
> + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
> + if (!ex_last)
> + return -EIO;
> +
> + err = ext4_access_path(handle, inode, path + depth);
> + if (err)
> + goto out;
> +
> + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
> + update = 1;
> +
> + *start = ex_last->ee_block +
> + ext4_ext_get_actual_len(ex_last);
> +
> + while (ex_start <= ex_last) {
> + ex_start->ee_block -= shift;
> + if (ex_start >
> + EXT_FIRST_EXTENT(path[depth].p_hdr)) {
> + if (ext4_ext_try_to_merge_right(inode,
> + path, ex_start - 1))
> + ex_last--;
> + }
> + ex_start++;
> + }
> + err = ext4_ext_dirty(handle, inode, path + depth);
> + if (err)
> + goto out;
> +
> + if (--depth < 0 || !update)
> + break;
> + }
> +
> + /* Update index too */
> + err = ext4_access_path(handle, inode, path + depth);
> + if (err)
> + goto out;
> +
> + path[depth].p_idx->ei_block -= shift;
> + err = ext4_ext_dirty(handle, inode, path + depth);
> + if (err)
> + goto out;
> +
> + /* we are done if current index is not a starting index */
> + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
> + break;
> +
> + depth--;
> + }
> +
> +out:
> + return err;
> +}
> +
> +/*
> + * ext4_ext_shift_extents:
> + * All the extents which lies in the range from start to the last allocated
> + * block for the file are shifted downwards by shift blocks.
> + * On success, 0 is returned, error otherwise.
> + */
> +static int
> +ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
> + ext4_lblk_t start, ext4_lblk_t shift)
> +{
> + struct ext4_ext_path *path;
> + int ret = 0, depth;
> + struct ext4_extent *extent;
> + ext4_lblk_t stop_block, current_block;
> + ext4_lblk_t ex_start, ex_end;
> +
> + /* Let path point to the last extent */
> + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
> + if (IS_ERR(path))
> + return PTR_ERR(path);
> +
> + depth = path->p_depth;
> + extent = path[depth].p_ext;
> + if (!extent) {
> + ext4_ext_drop_refs(path);
> + kfree(path);
> + return ret;
> + }
> +
> + stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
> + ext4_ext_drop_refs(path);
> + kfree(path);
> +
> + /* Nothing to shift, if hole is at the end of file */
> + if (start >= stop_block)
> + return ret;
> +
> + /*
> + * Don't start shifting extents until we make sure the hole is big
> + * enough to accomodate the shift.
> + */
> + path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
> + depth = path->p_depth;
> + extent = path[depth].p_ext;
> + ex_start = extent->ee_block;
> + ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
> + ext4_ext_drop_refs(path);
> + kfree(path);
> +
> + if ((ex_start > start - 1 && shift > ex_start) ||
> + (ex_end > start - shift))
> + return -EINVAL;
> +
> + /* Its safe to start updating extents */
> + while (start < stop_block) {
> + path = ext4_ext_find_extent(inode, start, NULL, 0);
> + if (IS_ERR(path))
> + return PTR_ERR(path);
> + depth = path->p_depth;
> + extent = path[depth].p_ext;
> + current_block = extent->ee_block;
> + if (start > current_block) {
> + /* Hole, move to the next extent */
> + ret = mext_next_extent(inode, path, &extent);
> + if (ret != 0) {
> + ext4_ext_drop_refs(path);
> + kfree(path);
> + if (ret == 1)
> + ret = 0;
> + break;
> + }
> + }
> + ret = ext4_ext_shift_path_extents(path, shift, inode,
> + handle, &start);
> + ext4_ext_drop_refs(path);
> + kfree(path);
> + if (ret)
> + break;
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * ext4_collapse_range:
> + * This implements the fallocate's collapse range functionality for ext4
> + * Returns: 0 and non-zero on error.
> + */
> +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
> +{
> + struct super_block *sb = inode->i_sb;
> + ext4_lblk_t punch_start, punch_stop;
> + handle_t *handle;
> + unsigned int credits;
> + unsigned int rounding;
> + loff_t ioffset, new_size;
> + int ret;
> + unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
> +
> + BUG_ON(offset + len > i_size_read(inode));

So if anyone provides offset + len which exceeds the i_size then it
crashes the kernel ? That does not sound right, or am I missing a
check anywhere ?

Also in the patch 0/3 you're saying that:

" It wokrs beyond "EOF", so the extents which are pre-allocated
beyond "EOF" are also updated. "

so which is true ? Again it would be better to have the description
in this patch as well.

Moreover offset and len are loff_t which means that the addition
operation can easily overflow.

Also you're not holding any locks which would prevent i_size from
changing.


> +
> + /* Collapse range works only on fs block size aligned offsets. */
> + if (offset & blksize_mask || len & blksize_mask)
> + return -EINVAL;
> +
> + if (!S_ISREG(inode->i_mode))
> + return -EOPNOTSUPP;
> +
> + if (EXT4_SB(sb)->s_cluster_ratio > 1)
> + return -EOPNOTSUPP;

Please if you're going to write the support for it, make it
complete and provide support for bigalloc file system as well.

What is the problem when it comes to bigalloc fs ? It should
concern allocation only, extent tree manipulation should be the
same.

> +
> + /* Currently just for extent based files */
> + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> + return -EOPNOTSUPP;

That's fine indirect block addressing is pretty much obsolete now.
Ext3 uses it, but we do not need to add functionality to "ext3"
code.

However the inode flag can change so you should do this under the
mutex lock.

> +
> + if (IS_SWAPFILE(inode))
> + return -ETXTBSY;

Again, this should be done under the lock.

> +
> + trace_ext4_collapse_range(inode, offset, len);
> +
> + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
> + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);

So far you've been using blksize_mask instead of
EXT4_BLOCK_SIZE_BITS(sb), please use only one to make it easier for
reader. I suggest using EXT4_BLOCK_SIZE_BITS(sb) since you actually
have sb available.

> +
> + rounding = max_t(uint, 1 << EXT4_BLOCK_SIZE_BITS(sb), PAGE_CACHE_SIZE);
> + ioffset = offset & ~(rounding - 1);

Why do you need to round it to the whole page ?

> +
> + /* Write out all dirty pages */
> + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, -1);
> + if (ret)
> + return ret;
> +
> + /* Take mutex lock */
> + mutex_lock(&inode->i_mutex);

What about append only and immutable files ? You probably need to
check for that we well right ? See ext4_punch_hole()

> +
> + /* Wait for existing dio to complete */
> + ext4_inode_block_unlocked_dio(inode);
> + inode_dio_wait(inode);
> +
> + truncate_pagecache_range(inode, ioffset, -1);
> +
> + credits = ext4_writepage_trans_blocks(inode);
> + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
> + if (IS_ERR(handle)) {
> + ret = PTR_ERR(handle);
> + goto out_dio;
> + }
> +
> + down_write(&EXT4_I(inode)->i_data_sem);
> +
> + ext4_discard_preallocations(inode);
> +
> + ret = ext4_es_remove_extent(inode, punch_start,
> + EXT_MAX_BLOCKS - punch_start - 1);
> + if (ret)
> + goto journal_stop;
> +
> + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
> + if (ret)
> + goto journal_stop;
> +
> + ret = ext4_ext_shift_extents(inode, handle, punch_stop,
> + punch_stop - punch_start);
> + if (ret)
> + goto journal_stop;
> +
> + if ((offset + len) > i_size_read(inode))
> + new_size = offset;

You've hit BUG_ON() on this case at the beginning of the function. I
am confused, please provide proper commit description.

Thanks!
-Lukas

> + else
> + new_size = i_size_read(inode) - len;
> +
> + truncate_setsize(inode, new_size);
> + EXT4_I(inode)->i_disksize = new_size;
> +
> + inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
> + ext4_mark_inode_dirty(handle, inode);
> +
> +journal_stop:
> + ext4_journal_stop(handle);
> + up_write(&EXT4_I(inode)->i_data_sem);
> +
> +out_dio:
> + ext4_inode_resume_unlocked_dio(inode);
> + mutex_unlock(&inode->i_mutex);
> + return ret;
> +}
> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
> index 773b503..b474558 100644
> --- a/fs/ext4/move_extent.c
> +++ b/fs/ext4/move_extent.c
> @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
> * ext4_ext_path structure refers to the last extent, or a negative error
> * value on failure.
> */
> -static int
> +int
> mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
> struct ext4_extent **extent)
> {
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index 197d312..90e2f71 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -2410,6 +2410,31 @@ TRACE_EVENT(ext4_es_shrink_exit,
> __entry->shrunk_nr, __entry->cache_cnt)
> );
>
> +TRACE_EVENT(ext4_collapse_range,
> + TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
> +
> + TP_ARGS(inode, offset, len),
> +
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(ino_t, ino)
> + __field(loff_t, offset)
> + __field(loff_t, len)
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = inode->i_sb->s_dev;
> + __entry->ino = inode->i_ino;
> + __entry->offset = offset;
> + __entry->len = len;
> + ),
> +
> + TP_printk("dev %d,%d ino %lu offset %lld len %lld",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + (unsigned long) __entry->ino,
> + __entry->offset, __entry->len)
> +);
> +
> #endif /* _TRACE_EXT4_H */
>
> /* This part must be outside protection */
>

_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs

2014-02-12 02:28:35

by Namjae Jeon

[permalink] [raw]
Subject: Re: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

2014-02-11 20:59 GMT+09:00, Lukáš Czerner <[email protected]>:
> On Sun, 2 Feb 2014, Namjae Jeon wrote:
>
>> Date: Sun, 2 Feb 2014 14:44:34 +0900
>> From: Namjae Jeon <[email protected]>
>> To: [email protected], [email protected], [email protected],
>> [email protected],
>> [email protected], [email protected], [email protected]
>> Cc: [email protected], [email protected],
>> [email protected], [email protected],
>> Namjae Jeon <[email protected]>, Namjae Jeon
>> <[email protected]>,
>> Ashish Sangwan <[email protected]>
>> Subject: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE
>> for
>> fallocate
>>
>> From: Namjae Jeon <[email protected]>
>>
>> Add support FALLOC_FL_COLLAPSE_RANGE for fallocate
Hi Lukas.
>
> Description is missing here, please provide FALLOC_FL_COLLAPSE_RANGE
> description so people know what it's supposed to be doing.
>
> more comments bellow.
Okay, I will udpate.
>
> Thanks!
> -Lukas
>
>>
>> Signed-off-by: Namjae Jeon <[email protected]>
>> Signed-off-by: Ashish Sangwan <[email protected]>
>> ---
>> fs/ext4/ext4.h | 3 +
>> fs/ext4/extents.c | 297
>> ++++++++++++++++++++++++++++++++++++++++++-
>> fs/ext4/move_extent.c | 2 +-
>> include/trace/events/ext4.h | 25 ++++
>> 4 files changed, 325 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index e618503..5cc015a 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -2745,6 +2745,7 @@ extern int ext4_find_delalloc_cluster(struct inode
>> *inode, ext4_lblk_t lblk);
>> extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info
>> *fieinfo,
>> __u64 start, __u64 len);
>> extern int ext4_ext_precache(struct inode *inode);
>> +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t
>> len);
>>
>> /* move_extent.c */
>> extern void ext4_double_down_write_data_sem(struct inode *first,
>> @@ -2754,6 +2755,8 @@ extern void ext4_double_up_write_data_sem(struct
>> inode *orig_inode,
>> extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
>> __u64 start_orig, __u64 start_donor,
>> __u64 len, __u64 *moved_len);
>> +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path
>> *path,
>> + struct ext4_extent **extent);
>>
>> /* page-io.c */
>> extern int __init ext4_init_pageio(void);
>> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
>> index 267c9fb..12338c1 100644
>> --- a/fs/ext4/extents.c
>> +++ b/fs/ext4/extents.c
>> @@ -4566,12 +4566,16 @@ long ext4_fallocate(struct file *file, int mode,
>> loff_t offset, loff_t len)
>> unsigned int credits, blkbits = inode->i_blkbits;
>>
>> /* Return error if mode is not supported */
>> - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
>> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
>> + FALLOC_FL_COLLAPSE_RANGE))
>> return -EOPNOTSUPP;
>>
>> if (mode & FALLOC_FL_PUNCH_HOLE)
>> return ext4_punch_hole(inode, offset, len);
>>
>> + if (mode & FALLOC_FL_COLLAPSE_RANGE)
>> + return ext4_collapse_range(inode, offset, len);
>> +
>> ret = ext4_convert_inline_data(inode);
>> if (ret)
>> return ret;
>> @@ -4870,3 +4874,294 @@ int ext4_fiemap(struct inode *inode, struct
>> fiemap_extent_info *fieinfo,
>> ext4_es_lru_add(inode);
>> return error;
>> }
>> +
>> +/*
>> + * ext4_access_path:
>> + * Function to access the path buffer for marking it dirty.
>> + * It also checks if there are sufficient credits left in the journal
>> handle
>> + * to update path.
>> + */
>> +static int
>> +ext4_access_path(handle_t *handle, struct inode *inode,
>> + struct ext4_ext_path *path)
>> +{
>> + int credits, err;
>> +
>> + /*
>> + * Check if need to extend journal credits
>> + * 3 for leaf, sb, and inode plus 2 (bmap and group
>> + * descriptor) for each block group; assume two block
>> + * groups
>> + */
>> + if (handle->h_buffer_credits < 7) {
>> + credits = ext4_writepage_trans_blocks(inode);
>> + err = ext4_ext_truncate_extend_restart(handle, inode, credits);
>> + /* EAGAIN is success */
>> + if (err && err != -EAGAIN)
>> + return err;
>> + }
>> +
>> + err = ext4_ext_get_access(handle, inode, path);
>> + return err;
>> +}
>> +
>> +/*
>> + * ext4_ext_shift_path_extents:
>> + * Shift the extents of a path structure lying between path[depth].p_ext
>> + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting
>> shift
>> + * from starting block for each extent.
>> + */
>> +static int
>> +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t
>> shift,
>> + struct inode *inode, handle_t *handle,
>> + ext4_lblk_t *start)
>> +{
>> + int depth, err = 0;
>> + struct ext4_extent *ex_start, *ex_last;
>> + bool update = 0;
>> + depth = path->p_depth;
>> +
>> + while (depth >= 0) {
>> + if (depth == path->p_depth) {
>> + ex_start = path[depth].p_ext;
>> + if (!ex_start)
>> + return -EIO;
>> +
>> + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
>> + if (!ex_last)
>> + return -EIO;
>> +
>> + err = ext4_access_path(handle, inode, path + depth);
>> + if (err)
>> + goto out;
>> +
>> + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
>> + update = 1;
>> +
>> + *start = ex_last->ee_block +
>> + ext4_ext_get_actual_len(ex_last);
>> +
>> + while (ex_start <= ex_last) {
>> + ex_start->ee_block -= shift;
>> + if (ex_start >
>> + EXT_FIRST_EXTENT(path[depth].p_hdr)) {
>> + if (ext4_ext_try_to_merge_right(inode,
>> + path, ex_start - 1))
>> + ex_last--;
>> + }
>> + ex_start++;
>> + }
>> + err = ext4_ext_dirty(handle, inode, path + depth);
>> + if (err)
>> + goto out;
>> +
>> + if (--depth < 0 || !update)
>> + break;
>> + }
>> +
>> + /* Update index too */
>> + err = ext4_access_path(handle, inode, path + depth);
>> + if (err)
>> + goto out;
>> +
>> + path[depth].p_idx->ei_block -= shift;
>> + err = ext4_ext_dirty(handle, inode, path + depth);
>> + if (err)
>> + goto out;
>> +
>> + /* we are done if current index is not a starting index */
>> + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
>> + break;
>> +
>> + depth--;
>> + }
>> +
>> +out:
>> + return err;
>> +}
>> +
>> +/*
>> + * ext4_ext_shift_extents:
>> + * All the extents which lies in the range from start to the last
>> allocated
>> + * block for the file are shifted downwards by shift blocks.
>> + * On success, 0 is returned, error otherwise.
>> + */
>> +static int
>> +ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
>> + ext4_lblk_t start, ext4_lblk_t shift)
>> +{
>> + struct ext4_ext_path *path;
>> + int ret = 0, depth;
>> + struct ext4_extent *extent;
>> + ext4_lblk_t stop_block, current_block;
>> + ext4_lblk_t ex_start, ex_end;
>> +
>> + /* Let path point to the last extent */
>> + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
>> + if (IS_ERR(path))
>> + return PTR_ERR(path);
>> +
>> + depth = path->p_depth;
>> + extent = path[depth].p_ext;
>> + if (!extent) {
>> + ext4_ext_drop_refs(path);
>> + kfree(path);
>> + return ret;
>> + }
>> +
>> + stop_block = extent->ee_block + ext4_ext_get_actual_len(extent);
>> + ext4_ext_drop_refs(path);
>> + kfree(path);
>> +
>> + /* Nothing to shift, if hole is at the end of file */
>> + if (start >= stop_block)
>> + return ret;
>> +
>> + /*
>> + * Don't start shifting extents until we make sure the hole is big
>> + * enough to accomodate the shift.
>> + */
>> + path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
>> + depth = path->p_depth;
>> + extent = path[depth].p_ext;
>> + ex_start = extent->ee_block;
>> + ex_end = extent->ee_block + ext4_ext_get_actual_len(extent);
>> + ext4_ext_drop_refs(path);
>> + kfree(path);
>> +
>> + if ((ex_start > start - 1 && shift > ex_start) ||
>> + (ex_end > start - shift))
>> + return -EINVAL;
>> +
>> + /* Its safe to start updating extents */
>> + while (start < stop_block) {
>> + path = ext4_ext_find_extent(inode, start, NULL, 0);
>> + if (IS_ERR(path))
>> + return PTR_ERR(path);
>> + depth = path->p_depth;
>> + extent = path[depth].p_ext;
>> + current_block = extent->ee_block;
>> + if (start > current_block) {
>> + /* Hole, move to the next extent */
>> + ret = mext_next_extent(inode, path, &extent);
>> + if (ret != 0) {
>> + ext4_ext_drop_refs(path);
>> + kfree(path);
>> + if (ret == 1)
>> + ret = 0;
>> + break;
>> + }
>> + }
>> + ret = ext4_ext_shift_path_extents(path, shift, inode,
>> + handle, &start);
>> + ext4_ext_drop_refs(path);
>> + kfree(path);
>> + if (ret)
>> + break;
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * ext4_collapse_range:
>> + * This implements the fallocate's collapse range functionality for ext4
>> + * Returns: 0 and non-zero on error.
>> + */
>> +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
>> +{
>> + struct super_block *sb = inode->i_sb;
>> + ext4_lblk_t punch_start, punch_stop;
>> + handle_t *handle;
>> + unsigned int credits;
>> + unsigned int rounding;
>> + loff_t ioffset, new_size;
>> + int ret;
>> + unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
>> +
>> + BUG_ON(offset + len > i_size_read(inode));
>
> So if anyone provides offset + len which exceeds the i_size then it
> crashes the kernel ? That does not sound right, or am I missing a
> check anywhere ?
>
> Also in the patch 0/3 you're saying that:
>
> " It wokrs beyond "EOF", so the extents which are pre-allocated
> beyond "EOF" are also updated. "
>
> so which is true ? Again it would be better to have the description
> in this patch as well.
You might misunderstand by reading old patch-set. please look at this one.
https://lkml.org/lkml/2014/2/2/12
Since then, it has been decided to limit collapse range within file
size and there is check in VFS patch for this condition.
If user wants to collapse a range that is overlapping with EOF,
truncate(2) is better suited.
>
> Moreover offset and len are loff_t which means that the addition
> operation can easily overflow.
Okay, I will check.
>
> Also you're not holding any locks which would prevent i_size from
> changing.
Okay.
>
>
>> +
>> + /* Collapse range works only on fs block size aligned offsets. */
>> + if (offset & blksize_mask || len & blksize_mask)
>> + return -EINVAL;
>> +
>> + if (!S_ISREG(inode->i_mode))
>> + return -EOPNOTSUPP;
>> +
>> + if (EXT4_SB(sb)->s_cluster_ratio > 1)
>> + return -EOPNOTSUPP;
>
> Please if you're going to write the support for it, make it
> complete and provide support for bigalloc file system as well.
>
> What is the problem when it comes to bigalloc fs ? It should
> concern allocation only, extent tree manipulation should be the
> same.
Acutally, I didn't consider bigalloc feature on collpase range because
when I implmemented collapse range, punch hole didn't provide bigalloc
support.
As you said, bigalloc is only related with allocation, so I will
remove this check.
There has not been much change in ext4 patch since it was posted first
time due to lack of review.
>
>> +
>> + /* Currently just for extent based files */
>> + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>> + return -EOPNOTSUPP;
>
> That's fine indirect block addressing is pretty much obsolete now.
> Ext3 uses it, but we do not need to add functionality to "ext3"
> code.
>
> However the inode flag can change so you should do this under the
> mutex lock.
Okay.
>
>> +
>> + if (IS_SWAPFILE(inode))
>> + return -ETXTBSY;
>
> Again, this should be done under the lock.
Right.

>
>> +
>> + trace_ext4_collapse_range(inode, offset, len);
>> +
>> + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
>> + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
>
> So far you've been using blksize_mask instead of
> EXT4_BLOCK_SIZE_BITS(sb), please use only one to make it easier for
> reader. I suggest using EXT4_BLOCK_SIZE_BITS(sb) since you actually
> have sb available.
Okay, I will use EXT4_BLOCK_SIZE_BITS(sb).

>
>> +
>> + rounding = max_t(uint, 1 << EXT4_BLOCK_SIZE_BITS(sb), PAGE_CACHE_SIZE);
>> + ioffset = offset & ~(rounding - 1);
>
> Why do you need to round it to the whole page ?
We don't actually need to round it to page sized boundary but we are
using truncate_pagecache_range to truncate pages from offset till EOF.
All other places where truncate_pagecache_range is used in kernel, do
this sort of rounding, so we just follow the norm.
Currently it seems un-necessary, I will remove it.
>
>> +
>> + /* Write out all dirty pages */
>> + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, -1);
>> + if (ret)
>> + return ret;
>> +
>> + /* Take mutex lock */
>> + mutex_lock(&inode->i_mutex);
>
> What about append only and immutable files ? You probably need to
> check for that we well right ? See ext4_punch_hole()
Okay, I will check it.
>
>> +
>> + /* Wait for existing dio to complete */
>> + ext4_inode_block_unlocked_dio(inode);
>> + inode_dio_wait(inode);
>> +
>> + truncate_pagecache_range(inode, ioffset, -1);
>> +
>> + credits = ext4_writepage_trans_blocks(inode);
>> + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
>> + if (IS_ERR(handle)) {
>> + ret = PTR_ERR(handle);
>> + goto out_dio;
>> + }
>> +
>> + down_write(&EXT4_I(inode)->i_data_sem);
>> +
>> + ext4_discard_preallocations(inode);
>> +
>> + ret = ext4_es_remove_extent(inode, punch_start,
>> + EXT_MAX_BLOCKS - punch_start - 1);
>> + if (ret)
>> + goto journal_stop;
>> +
>> + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
>> + if (ret)
>> + goto journal_stop;
>> +
>> + ret = ext4_ext_shift_extents(inode, handle, punch_stop,
>> + punch_stop - punch_start);
>> + if (ret)
>> + goto journal_stop;
>> +
>> + if ((offset + len) > i_size_read(inode))
>> + new_size = offset;
>
> You've hit BUG_ON() on this case at the beginning of the function. I
> am confused, please provide proper commit description.
Yes, Right. this condition is obsolete as collapse range semantics
have been changed since this condition was added. I will remove this
one.

Thanks for your review!
>
> Thanks!
> -Lukas
>
>> + else
>> + new_size = i_size_read(inode) - len;
>> +
>> + truncate_setsize(inode, new_size);
>> + EXT4_I(inode)->i_disksize = new_size;
>> +
>> + inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
>> + ext4_mark_inode_dirty(handle, inode);
>> +
>> +journal_stop:
>> + ext4_journal_stop(handle);
>> + up_write(&EXT4_I(inode)->i_data_sem);
>> +
>> +out_dio:
>> + ext4_inode_resume_unlocked_dio(inode);
>> + mutex_unlock(&inode->i_mutex);
>> + return ret;
>> +}
>> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
>> index 773b503..b474558 100644
>> --- a/fs/ext4/move_extent.c
>> +++ b/fs/ext4/move_extent.c
>> @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct
>> ext4_extent *dest)
>> * ext4_ext_path structure refers to the last extent, or a negative
>> error
>> * value on failure.
>> */
>> -static int
>> +int
>> mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
>> struct ext4_extent **extent)
>> {
>> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
>> index 197d312..90e2f71 100644
>> --- a/include/trace/events/ext4.h
>> +++ b/include/trace/events/ext4.h
>> @@ -2410,6 +2410,31 @@ TRACE_EVENT(ext4_es_shrink_exit,
>> __entry->shrunk_nr, __entry->cache_cnt)
>> );
>>
>> +TRACE_EVENT(ext4_collapse_range,
>> + TP_PROTO(struct inode *inode, loff_t offset, loff_t len),
>> +
>> + TP_ARGS(inode, offset, len),
>> +
>> + TP_STRUCT__entry(
>> + __field(dev_t, dev)
>> + __field(ino_t, ino)
>> + __field(loff_t, offset)
>> + __field(loff_t, len)
>> + ),
>> +
>> + TP_fast_assign(
>> + __entry->dev = inode->i_sb->s_dev;
>> + __entry->ino = inode->i_ino;
>> + __entry->offset = offset;
>> + __entry->len = len;
>> + ),
>> +
>> + TP_printk("dev %d,%d ino %lu offset %lld len %lld",
>> + MAJOR(__entry->dev), MINOR(__entry->dev),
>> + (unsigned long) __entry->ino,
>> + __entry->offset, __entry->len)
>> +);
>> +
>> #endif /* _TRACE_EXT4_H */
>>
>> /* This part must be outside protection */
>>
>

_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs

2014-02-18 09:46:44

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

On Tue, Feb 18, 2014 at 10:05:49AM +0100, Lukáš Czerner wrote:
> On Wed, 12 Feb 2014, Namjae Jeon wrote:
>
> > Date: Wed, 12 Feb 2014 11:28:35 +0900
> > From: Namjae Jeon <[email protected]>
> > To: Lukáš Czerner <[email protected]>
> > Cc: [email protected], [email protected], [email protected], [email protected],
> > [email protected], [email protected], [email protected],
> > [email protected], [email protected],
> > [email protected], [email protected],
> > Namjae Jeon <[email protected]>,
> > Ashish Sangwan <[email protected]>
> > Subject: Re: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE
> > for fallocate
> >
> > 2014-02-11 20:59 GMT+09:00, Lukáš Czerner <[email protected]>:
> > > On Sun, 2 Feb 2014, Namjae Jeon wrote:
> > >
> > >> Date: Sun, 2 Feb 2014 14:44:34 +0900
> > >> From: Namjae Jeon <[email protected]>
> > >> To: [email protected], [email protected], [email protected],
> > >> [email protected],
> > >> [email protected], [email protected], [email protected]
> > >> Cc: [email protected], [email protected],
> > >> [email protected], [email protected],
> > >> Namjae Jeon <[email protected]>, Namjae Jeon
> > >> <[email protected]>,
> > >> Ashish Sangwan <[email protected]>
> > >> Subject: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE
> > >> for
> > >> fallocate
> > >>
> > >> From: Namjae Jeon <[email protected]>
> > >>
> > >> Add support FALLOC_FL_COLLAPSE_RANGE for fallocate
> > Hi Lukas.
> > >
> > > Description is missing here, please provide FALLOC_FL_COLLAPSE_RANGE
> > > description so people know what it's supposed to be doing.
> > >
> > > more comments bellow.
> > Okay, I will udpate.
> > >
> > > Thanks!
> > > -Lukas
>
> Hi,
>
> you may have noticed my patches for new FALLOC_FL_ZERO_RANGE
> fallocate flag. This changes things around the same area as your
> patches does so we should probably figure out which are going to
> land in first and then rebase the other on top of that.
>
> One concern I have is that I have not seen any tests provided to
> verify the feature. But I just may have missed it. Do you have any
> xfstests test or at least fsx and fsstress patches to provide
> support for testing FALLOC_FL_COLLAPSE_RANGE ? Patches for
> util_linux might also be handy.

There were 5 xfstests patches posted that used the
_test_generic_punch infrastructure to test this functionality.
See the december 08 index here, search for "shared/00[1-5]":

http://oss.sgi.com/archives/xfs/2013-12/index.html

They need a little rework, but otherwise the test code is there.
Having the feature added to fsx and fsstress would probably be a
good idea, though.

Cheers,

Dave.
--
Dave Chinner
[email protected]

_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs

2014-02-18 09:50:54

by Namjae Jeon

[permalink] [raw]
Subject: Re: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

2014-02-18 18:05 GMT+09:00, Lukáš Czerner <[email protected]>:
> On Wed, 12 Feb 2014, Namjae Jeon wrote:
>
>> Date: Wed, 12 Feb 2014 11:28:35 +0900
>> From: Namjae Jeon <[email protected]>
>> To: Lukáš Czerner <[email protected]>
>> Cc: [email protected], [email protected], [email protected],
>> [email protected],
>> [email protected], [email protected], [email protected],
>> [email protected], [email protected],
>> [email protected], [email protected],
>> Namjae Jeon <[email protected]>,
>> Ashish Sangwan <[email protected]>
>> Subject: Re: [PATCH RESEND 3/10] ext4: Add support
>> FALLOC_FL_COLLAPSE_RANGE
>> for fallocate
>>
>> 2014-02-11 20:59 GMT+09:00, Lukáš Czerner <[email protected]>:
>> > On Sun, 2 Feb 2014, Namjae Jeon wrote:
>> >
>> >> Date: Sun, 2 Feb 2014 14:44:34 +0900
>> >> From: Namjae Jeon <[email protected]>
>> >> To: [email protected], [email protected], [email protected],
>> >> [email protected],
>> >> [email protected], [email protected], [email protected]
>> >> Cc: [email protected], [email protected],
>> >> [email protected], [email protected],
>> >> Namjae Jeon <[email protected]>, Namjae Jeon
>> >> <[email protected]>,
>> >> Ashish Sangwan <[email protected]>
>> >> Subject: [PATCH RESEND 3/10] ext4: Add support
>> >> FALLOC_FL_COLLAPSE_RANGE
>> >> for
>> >> fallocate
>> >>
>> >> From: Namjae Jeon <[email protected]>
>> >>
>> >> Add support FALLOC_FL_COLLAPSE_RANGE for fallocate
>> Hi Lukas.
>> >
>> > Description is missing here, please provide FALLOC_FL_COLLAPSE_RANGE
>> > description so people know what it's supposed to be doing.
>> >
>> > more comments bellow.
>> Okay, I will udpate.
>> >
>> > Thanks!
>> > -Lukas
>
> Hi,
Hi Lukas.
>
> you may have noticed my patches for new FALLOC_FL_ZERO_RANGE
> fallocate flag. This changes things around the same area as your
> patches does so we should probably figure out which are going to
> land in first and then rebase the other on top of that.
Yes, I have seen your patches. I think that VFS + XFS patches for
FALLOC_FL_COLLAPSE_RANGE have got a fair amount of review from Dave.
So they are almost ready. I will post collapse range v5 patches after
fixing some minor comments tonight or tommorow.
If there will be no more comments for XFS patch, VFS + XFS patches can
go through to xfs tree by Dave.
I think that Ext4 collapse range could go at a later time.(Need your
reviewed-by tag)
If still there are comments for xfs patch, I will rebase on top of your patch.
>
> One concern I have is that I have not seen any tests provided to
> verify the feature. But I just may have missed it. Do you have any
> xfstests test or at least fsx and fsstress patches to provide
> support for testing FALLOC_FL_COLLAPSE_RANGE ? Patches for
> util_linux might also be handy.
Yes, There are 5 new xfstests cases in this patch-set. these will also
be posted with collapse range patch-set soon.

Thanks :)
>
> Thanks!
> -Lukas

_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs

2014-02-18 14:30:05

by Theodore Ts'o

[permalink] [raw]
Subject: Re: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

Namjae,

Did you respond to Matthew Wilcox's comments/question from Feb. 2nd?

> > What if the file is mmaped at the time somebody issues this command?
> > Seems to me we should drop pagecache pages that overlap with the
> > removed blocks. If the removed range is not a multiple of PAGE_SIZE,
> > then we should also drop any pagecache pages after the removed range.
>
> Oops, forgot to add "and if it is a multiple of page size, then we need
> to update the offsets of any pages after the removed page".

Dave responded that XFS does the right thing when doing a punch hole
operation, but it wasn't obvious to me whether FL_COLLAPSE_RANGE does
the right thing with ext4.

Thanks,

- Ted

_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs

2014-02-19 01:08:39

by Namjae Jeon

[permalink] [raw]
Subject: Re: [PATCH RESEND 3/10] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate

2014-02-18 23:30 GMT+09:00, Theodore Ts'o <[email protected]>:
> Namjae,
Hi Ted.
>
> Did you respond to Matthew Wilcox's comments/question from Feb. 2nd?
Sorry, I didn't catch about this.
I just replied from Matthew's mail.
Thanks for your remind.
>
>> > What if the file is mmaped at the time somebody issues this command?
>> > Seems to me we should drop pagecache pages that overlap with the
>> > removed blocks. If the removed range is not a multiple of PAGE_SIZE,
>> > then we should also drop any pagecache pages after the removed range.
>>
>> Oops, forgot to add "and if it is a multiple of page size, then we need
>> to update the offsets of any pages after the removed page".
>
> Dave responded that XFS does the right thing when doing a punch hole
> operation, but it wasn't obvious to me whether FL_COLLAPSE_RANGE does
> the right thing with ext4.
>
> Thanks,
>
> - Ted
>

_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs

2014-02-25 19:08:39

by Dongsu Park

[permalink] [raw]
Subject: [PATCH] util-linux/fallocate: introduce an option -c to support FALLOC_FL_COLLAPSE_RANGE

Introduce a new option -c (or --collapse-range) to support a new flag
FALLOC_FL_COLLAPSE_RANGE for fallocate(2). It will nullify a particular
range [offset, offset+len] by shifting extents beyond the range to the
beginning of the hole.

To test that, it's necessary to apply kernel patches in the patchset
"fs: Introduce new flag (FALLOC_FL_COLLAPSE_RANGE) for fallocate" [1],
as well as "[PATCH v6] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for
fallocate". [2]

As discussed in the thread "[PATCH RESEND 3/10] ext4: Add support
FALLOC_FL_COLLAPSE_RANGE for fallocate", [3] this patch to util-linux
will be useful for testing collapse-range inside xfstests.

[1] https://lkml.org/lkml/2014/2/18/374
[2] https://lkml.org/lkml/2014/2/20/318
[3] https://lkml.org/lkml/2014/2/18/83

Cc: Lukas Czerner <[email protected]>
Cc: Namjae Jeon <[email protected]>
Cc: Ashish Sangwan <[email protected]>
Signed-off-by: Dongsu Park <[email protected]>
---
sys-utils/fallocate.1 | 7 +++++++
sys-utils/fallocate.c | 45 ++++++++++++++++++++++++++++-----------------
2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/sys-utils/fallocate.1 b/sys-utils/fallocate.1
index 8a3aa4f..634c595 100644
--- a/sys-utils/fallocate.1
+++ b/sys-utils/fallocate.1
@@ -6,6 +6,7 @@ fallocate \- preallocate or deallocate space to a file
.B fallocate
.RB [ \-n ]
.RB [ \-p ]
+.RB [ \-c ]
.RB [ \-o
.IR offset ]
.B \-l
@@ -54,6 +55,12 @@ implied.
.IP
You can think of this as doing a "\fBcp --sparse\fP" and
renaming the dest file as the original, without the need for extra disk space.
+.IP "\fB\-c, \-\-collapse-range\fP"
+Collapse a particular file range to nullify the hole. Extents beyond the range
+[offset, offset+length] will be shifted to the beginning of hole. Hence this
+command does not leave a hole, while \fI\-\-punch-hole\fP leaves a hole
+instead of shifting extents. Both offset and length should be aligned to
+the block size of filesystem.
.IP "\fB\-o, \-\-offset\fP \fIoffset\fP
Specifies the beginning offset of the allocation, in bytes.
.IP "\fB\-l, \-\-length\fP \fIlength\fP
diff --git a/sys-utils/fallocate.c b/sys-utils/fallocate.c
index d8a74bf..3c8a8d2 100644
--- a/sys-utils/fallocate.c
+++ b/sys-utils/fallocate.c
@@ -39,7 +39,8 @@
#endif

#if defined(HAVE_LINUX_FALLOC_H) && \
- (!defined(FALLOC_FL_KEEP_SIZE) || !defined(FALLOC_FL_PUNCH_HOLE))
+ (!defined(FALLOC_FL_KEEP_SIZE) || !defined(FALLOC_FL_PUNCH_HOLE) || \
+ !defined(FALLOC_FL_COLLAPSE_RANGE))
# include <linux/falloc.h> /* non-libc fallback for FALLOC_FL_* flags */
#endif

@@ -51,6 +52,10 @@
# define FALLOC_FL_PUNCH_HOLE 2
#endif

+#ifndef FALLOC_FL_COLLAPSE_RANGE
+# define FALLOC_FL_COLLAPSE_RANGE 8
+#endif
+
#include "nls.h"
#include "strutils.h"
#include "c.h"
@@ -66,12 +71,13 @@ static void __attribute__((__noreturn__)) usage(FILE *out)
fprintf(out,
_(" %s [options] <filename>\n"), program_invocation_short_name);
fputs(USAGE_OPTIONS, out);
- fputs(_(" -d, --dig-holes detect and dig holes\n"), out);
- fputs(_(" -l, --length <num> length of the (de)allocation, in bytes\n"), out);
- fputs(_(" -n, --keep-size don't modify the length of the file\n"), out);
- fputs(_(" -o, --offset <num> offset of the (de)allocation, in bytes\n"), out);
- fputs(_(" -p, --punch-hole punch holes in the file\n"), out);
- fputs(_(" -v, --verbose verbose mode\n"), out);
+ fputs(_(" -c, --collapse-range collapse a hole in the file\n"), out);
+ fputs(_(" -d, --dig-holes detect and dig holes\n"), out);
+ fputs(_(" -l, --length <num> length of the (de)allocation, in bytes\n"), out);
+ fputs(_(" -n, --keep-size don't modify the length of the file\n"), out);
+ fputs(_(" -o, --offset <num> offset of the (de)allocation, in bytes\n"), out);
+ fputs(_(" -p, --punch-hole punch holes in the file\n"), out);
+ fputs(_(" -v, --verbose verbose mode\n"), out);

fputs(USAGE_SEPARATOR, out);
fputs(USAGE_HELP, out);
@@ -258,15 +264,16 @@ int main(int argc, char **argv)
loff_t offset = 0;

static const struct option longopts[] = {
- { "help", 0, 0, 'h' },
- { "version", 0, 0, 'V' },
- { "keep-size", 0, 0, 'n' },
- { "punch-hole", 0, 0, 'p' },
- { "dig-holes", 0, 0, 'd' },
- { "offset", 1, 0, 'o' },
- { "length", 1, 0, 'l' },
- { "verbose", 0, 0, 'v' },
- { NULL, 0, 0, 0 }
+ { "help", 0, 0, 'h' },
+ { "version", 0, 0, 'V' },
+ { "keep-size", 0, 0, 'n' },
+ { "punch-hole", 0, 0, 'p' },
+ { "collapse-range", 0, 0, 'c' },
+ { "dig-holes", 0, 0, 'd' },
+ { "offset", 1, 0, 'o' },
+ { "length", 1, 0, 'l' },
+ { "verbose", 0, 0, 'v' },
+ { NULL, 0, 0, 0 }
};

setlocale(LC_ALL, "");
@@ -274,7 +281,8 @@ int main(int argc, char **argv)
textdomain(PACKAGE);
atexit(close_stdout);

- while ((c = getopt_long(argc, argv, "hvVnpdl:o:", longopts, NULL)) != -1) {
+ while ((c = getopt_long(argc, argv, "hvVncpdl:o:", longopts, NULL))
+ != -1) {
switch(c) {
case 'h':
usage(stdout);
@@ -282,6 +290,9 @@ int main(int argc, char **argv)
case 'V':
printf(UTIL_LINUX_VERSION);
return EXIT_SUCCESS;
+ case 'c':
+ mode |= FALLOC_FL_COLLAPSE_RANGE;
+ break;
case 'p':
mode |= FALLOC_FL_PUNCH_HOLE;
/* fall through */
--
1.8.5.3


2014-02-25 19:31:01

by Dave Chinner

[permalink] [raw]
Subject: Re: [PATCH] util-linux/fallocate: introduce an option -c to support FALLOC_FL_COLLAPSE_RANGE

On Tue, Feb 25, 2014 at 08:08:25PM +0100, Dongsu Park wrote:
> Introduce a new option -c (or --collapse-range) to support a new flag
> FALLOC_FL_COLLAPSE_RANGE for fallocate(2). It will nullify a particular
> range [offset, offset+len] by shifting extents beyond the range to the
> beginning of the hole.
>
> To test that, it's necessary to apply kernel patches in the patchset
> "fs: Introduce new flag (FALLOC_FL_COLLAPSE_RANGE) for fallocate" [1],
> as well as "[PATCH v6] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for
> fallocate". [2]
>
> As discussed in the thread "[PATCH RESEND 3/10] ext4: Add support
> FALLOC_FL_COLLAPSE_RANGE for fallocate", [3] this patch to util-linux
> will be useful for testing collapse-range inside xfstests.

FYI, we've already got support in xfs_io for collapse-range tests
inside xfstests.

http://oss.sgi.com/cgi-bin/gitweb.cgi?p=xfs/cmds/xfsprogs.git;a=commit;h=ca692f162d36c871c9c1b6169136b2c70503f2d8

There are already several tests that use it.

Cheers,

Dave.
--
Dave Chinner
[email protected]

2014-02-27 03:59:52

by Namjae Jeon

[permalink] [raw]
Subject: Re: [PATCH] util-linux/fallocate: introduce an option -c to support FALLOC_FL_COLLAPSE_RANGE

Hi Dongsu.
> As discussed in the thread "[PATCH RESEND 3/10] ext4: Add support
> FALLOC_FL_COLLAPSE_RANGE for fallocate", [3] this patch to util-linux
> will be useful for testing collapse-range inside xfstests.
As Dave pointed, this patch is needed for fsstress not xfstests.
>
> [1] https://lkml.org/lkml/2014/2/18/374
> [2] https://lkml.org/lkml/2014/2/20/318
> [3] https://lkml.org/lkml/2014/2/18/83
>
> @@ -51,6 +52,10 @@
> # define FALLOC_FL_PUNCH_HOLE 2
> #endif
>
> +#ifndef FALLOC_FL_COLLAPSE_RANGE
> +# define FALLOC_FL_COLLAPSE_RANGE 8
> +#endif
> +
> #include "nls.h"
> #include "strutils.h"
> #include "c.h"
> @@ -66,12 +71,13 @@ static void __attribute__((__noreturn__)) usage(FILE
> *out)
> fprintf(out,
> _(" %s [options] <filename>\n"), program_invocation_short_name);
> fputs(USAGE_OPTIONS, out);
> - fputs(_(" -d, --dig-holes detect and dig holes\n"), out);
> - fputs(_(" -l, --length <num> length of the (de)allocation, in bytes\n"),
> out);
> - fputs(_(" -n, --keep-size don't modify the length of the file\n"),
> out);
> - fputs(_(" -o, --offset <num> offset of the (de)allocation, in bytes\n"),
> out);
> - fputs(_(" -p, --punch-hole punch holes in the file\n"), out);
> - fputs(_(" -v, --verbose verbose mode\n"), out);
> + fputs(_(" -c, --collapse-range collapse a hole in the file\n"), out);
I think that it is better to change like this => "collapse space in the file"
And I add util-liunx's maintainer in this mail loop.

Thanks.

2014-02-27 09:00:05

by Karel Zak

[permalink] [raw]
Subject: Re: [PATCH] util-linux/fallocate: introduce an option -c to support FALLOC_FL_COLLAPSE_RANGE

On Tue, Feb 25, 2014 at 08:08:25PM +0100, Dongsu Park wrote:
> Introduce a new option -c (or --collapse-range) to support a new flag
> FALLOC_FL_COLLAPSE_RANGE for fallocate(2). It will nullify a particular
> range [offset, offset+len] by shifting extents beyond the range to the
> beginning of the hole.

Thanks!

> To test that, it's necessary to apply kernel patches in the patchset

We (util-linux) always waiting for new stuff in Linus tree :-)

Karel

--
Karel Zak <[email protected]>
http://karelzak.blogspot.com

2014-02-27 10:27:05

by Dongsu Park

[permalink] [raw]
Subject: Re: [PATCH] util-linux/fallocate: introduce an option -c to support FALLOC_FL_COLLAPSE_RANGE

Hi Dave,

On 26.02.2014 06:31, Dave Chinner wrote:
> FYI, we've already got support in xfs_io for collapse-range tests
> inside xfstests.
> http://oss.sgi.com/cgi-bin/gitweb.cgi?p=xfs/cmds/xfsprogs.git;a=commit;h=ca692f162d36c871c9c1b6169136b2c70503f2d8
> There are already several tests that use it.

Thanks for the info!

Dongsu

_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs

2014-02-27 10:35:07

by Dongsu Park

[permalink] [raw]
Subject: [PATCH v2] util-linux/fallocate: introduce an option -c to support COLLAPSE_RANGE

From: Dongsu Park <[email protected]>

Introduce a new option -c (or --collapse-range) to support a new flag
FALLOC_FL_COLLAPSE_RANGE for fallocate(2). It will nullify a particular
range [offset, offset+len] by shifting extents beyond the range to the
beginning of the hole.

To test that, it's necessary to apply kernel patches in the patchset
"fs: Introduce new flag (FALLOC_FL_COLLAPSE_RANGE) for fallocate" [1],
as well as "[PATCH v6] ext4: Add support FALLOC_FL_COLLAPSE_RANGE for
fallocate". [2]

As discussed in the thread "[PATCH RESEND 3/10] ext4: Add support
FALLOC_FL_COLLAPSE_RANGE for fallocate", [3] this patch to util-linux
will be useful for testing collapse-range with fsstress.

[1] https://lkml.org/lkml/2014/2/18/374
[2] https://lkml.org/lkml/2014/2/20/318
[3] https://lkml.org/lkml/2014/2/18/83

Cc: Karel Zak <[email protected]>
Cc: Lukas Czerner <[email protected]>
Cc: Namjae Jeon <[email protected]>
Cc: Ashish Sangwan <[email protected]>
Signed-off-by: Dongsu Park <[email protected]>
---

Changelog:

v2:
- update patch description by replacing xfstests with fsstress
- update help message by replacing hole with space

sys-utils/fallocate.1 | 7 +++++++
sys-utils/fallocate.c | 45 ++++++++++++++++++++++++++++-----------------
2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/sys-utils/fallocate.1 b/sys-utils/fallocate.1
index 8a3aa4f..634c595 100644
--- a/sys-utils/fallocate.1
+++ b/sys-utils/fallocate.1
@@ -6,6 +6,7 @@ fallocate \- preallocate or deallocate space to a file
.B fallocate
.RB [ \-n ]
.RB [ \-p ]
+.RB [ \-c ]
.RB [ \-o
.IR offset ]
.B \-l
@@ -54,6 +55,12 @@ implied.
.IP
You can think of this as doing a "\fBcp --sparse\fP" and
renaming the dest file as the original, without the need for extra disk space.
+.IP "\fB\-c, \-\-collapse-range\fP"
+Collapse a particular file range to nullify the hole. Extents beyond the range
+[offset, offset+length] will be shifted to the beginning of hole. Hence this
+command does not leave a hole, while \fI\-\-punch-hole\fP leaves a hole
+instead of shifting extents. Both offset and length should be aligned to
+the block size of filesystem.
.IP "\fB\-o, \-\-offset\fP \fIoffset\fP
Specifies the beginning offset of the allocation, in bytes.
.IP "\fB\-l, \-\-length\fP \fIlength\fP
diff --git a/sys-utils/fallocate.c b/sys-utils/fallocate.c
index d8a74bf..3c8a8d2 100644
--- a/sys-utils/fallocate.c
+++ b/sys-utils/fallocate.c
@@ -39,7 +39,8 @@
#endif

#if defined(HAVE_LINUX_FALLOC_H) && \
- (!defined(FALLOC_FL_KEEP_SIZE) || !defined(FALLOC_FL_PUNCH_HOLE))
+ (!defined(FALLOC_FL_KEEP_SIZE) || !defined(FALLOC_FL_PUNCH_HOLE) || \
+ !defined(FALLOC_FL_COLLAPSE_RANGE))
# include <linux/falloc.h> /* non-libc fallback for FALLOC_FL_* flags */
#endif

@@ -51,6 +52,10 @@
# define FALLOC_FL_PUNCH_HOLE 2
#endif

+#ifndef FALLOC_FL_COLLAPSE_RANGE
+# define FALLOC_FL_COLLAPSE_RANGE 8
+#endif
+
#include "nls.h"
#include "strutils.h"
#include "c.h"
@@ -66,12 +71,13 @@ static void __attribute__((__noreturn__)) usage(FILE *out)
fprintf(out,
_(" %s [options] <filename>\n"), program_invocation_short_name);
fputs(USAGE_OPTIONS, out);
- fputs(_(" -d, --dig-holes detect and dig holes\n"), out);
- fputs(_(" -l, --length <num> length of the (de)allocation, in bytes\n"), out);
- fputs(_(" -n, --keep-size don't modify the length of the file\n"), out);
- fputs(_(" -o, --offset <num> offset of the (de)allocation, in bytes\n"), out);
- fputs(_(" -p, --punch-hole punch holes in the file\n"), out);
- fputs(_(" -v, --verbose verbose mode\n"), out);
+ fputs(_(" -c, --collapse-range collapse space in the file\n"), out);
+ fputs(_(" -d, --dig-holes detect and dig holes\n"), out);
+ fputs(_(" -l, --length <num> length of the (de)allocation, in bytes\n"), out);
+ fputs(_(" -n, --keep-size don't modify the length of the file\n"), out);
+ fputs(_(" -o, --offset <num> offset of the (de)allocation, in bytes\n"), out);
+ fputs(_(" -p, --punch-hole punch holes in the file\n"), out);
+ fputs(_(" -v, --verbose verbose mode\n"), out);

fputs(USAGE_SEPARATOR, out);
fputs(USAGE_HELP, out);
@@ -258,15 +264,16 @@ int main(int argc, char **argv)
loff_t offset = 0;

static const struct option longopts[] = {
- { "help", 0, 0, 'h' },
- { "version", 0, 0, 'V' },
- { "keep-size", 0, 0, 'n' },
- { "punch-hole", 0, 0, 'p' },
- { "dig-holes", 0, 0, 'd' },
- { "offset", 1, 0, 'o' },
- { "length", 1, 0, 'l' },
- { "verbose", 0, 0, 'v' },
- { NULL, 0, 0, 0 }
+ { "help", 0, 0, 'h' },
+ { "version", 0, 0, 'V' },
+ { "keep-size", 0, 0, 'n' },
+ { "punch-hole", 0, 0, 'p' },
+ { "collapse-range", 0, 0, 'c' },
+ { "dig-holes", 0, 0, 'd' },
+ { "offset", 1, 0, 'o' },
+ { "length", 1, 0, 'l' },
+ { "verbose", 0, 0, 'v' },
+ { NULL, 0, 0, 0 }
};

setlocale(LC_ALL, "");
@@ -274,7 +281,8 @@ int main(int argc, char **argv)
textdomain(PACKAGE);
atexit(close_stdout);

- while ((c = getopt_long(argc, argv, "hvVnpdl:o:", longopts, NULL)) != -1) {
+ while ((c = getopt_long(argc, argv, "hvVncpdl:o:", longopts, NULL))
+ != -1) {
switch(c) {
case 'h':
usage(stdout);
@@ -282,6 +290,9 @@ int main(int argc, char **argv)
case 'V':
printf(UTIL_LINUX_VERSION);
return EXIT_SUCCESS;
+ case 'c':
+ mode |= FALLOC_FL_COLLAPSE_RANGE;
+ break;
case 'p':
mode |= FALLOC_FL_PUNCH_HOLE;
/* fall through */
--
1.8.5.3


2014-04-18 10:52:32

by Karel Zak

[permalink] [raw]
Subject: Re: [PATCH v2] util-linux/fallocate: introduce an option -c to support COLLAPSE_RANGE

On Thu, Feb 27, 2014 at 11:35:07AM +0100, Dongsu Park wrote:
> sys-utils/fallocate.1 | 7 +++++++
> sys-utils/fallocate.c | 45 ++++++++++++++++++++++++++++-----------------
> 2 files changed, 35 insertions(+), 17 deletions(-)

Applied, thanks.

--
Karel Zak <[email protected]>
http://karelzak.blogspot.com

_______________________________________________
xfs mailing list
[email protected]
http://oss.sgi.com/mailman/listinfo/xfs