2018-05-07 08:21:40

by Miklos Szeredi

[permalink] [raw]
Subject: [PATCH 0/3] vfs: dedupe: cleanup API and implementation

Hi Al,

Please apply this is a small cleanup/fix series. It is also a prerequisite
to the overlayfs file stacking patchset.

Git tree is here:

git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs.git dedupe-cleanup

Thanks,
Miklos

---
Miklos Szeredi (3):
vfs: dedpue: return s64
vfs: dedupe: rationalize args
vfs: dedupe: extract helper for a single dedup

fs/btrfs/ctree.h | 4 +--
fs/btrfs/ioctl.c | 6 ++--
fs/ocfs2/file.c | 10 +++---
fs/read_write.c | 91 ++++++++++++++++++++++++++++++------------------------
fs/xfs/xfs_file.c | 8 ++---
include/linux/fs.h | 2 +-
6 files changed, 65 insertions(+), 56 deletions(-)

--
2.14.3



2018-05-07 08:21:53

by Miklos Szeredi

[permalink] [raw]
Subject: [PATCH 1/3] vfs: dedpue: return s64

f_op->dedupe_file_range() gets a u64 length to dedup and returns an ssize_t
actual length deduped. This breaks badly on 32bit archs since the returned
length will be truncated and possibly overflow into the sign bit (xfs and
ocfs2 are affected, btrfs limits actual length to 16MiB).

Returning s64 should be good, since clone_verify_area() makes sure that the
supplied length doesn't overflow.

Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/btrfs/ctree.h | 4 ++--
fs/btrfs/ioctl.c | 6 +++---
fs/ocfs2/file.c | 10 +++++-----
fs/read_write.c | 2 +-
fs/xfs/xfs_file.c | 2 +-
include/linux/fs.h | 2 +-
6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2771cc56a622..019a25962ac8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3269,8 +3269,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
struct btrfs_ioctl_balance_args *bargs);
-ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
- struct file *dst_file, u64 dst_loff);
+s64 btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff);

/* file.c */
int __init btrfs_auto_defrag_init(void);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 632e26d6f7ce..b6a1df6bb895 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3194,13 +3194,13 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,

#define BTRFS_MAX_DEDUPE_LEN SZ_16M

-ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
- struct file *dst_file, u64 dst_loff)
+s64 btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+ struct file *dst_file, u64 dst_loff)
{
struct inode *src = file_inode(src_file);
struct inode *dst = file_inode(dst_file);
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
- ssize_t res;
+ int res;

if (olen > BTRFS_MAX_DEDUPE_LEN)
olen = BTRFS_MAX_DEDUPE_LEN;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6ee94bc23f5b..cf3732df4c2e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2537,11 +2537,11 @@ static int ocfs2_file_clone_range(struct file *file_in,
len, false);
}

-static ssize_t ocfs2_file_dedupe_range(struct file *src_file,
- u64 loff,
- u64 len,
- struct file *dst_file,
- u64 dst_loff)
+static s64 ocfs2_file_dedupe_range(struct file *src_file,
+ u64 loff,
+ u64 len,
+ struct file *dst_file,
+ u64 dst_loff)
{
int error;

diff --git a/fs/read_write.c b/fs/read_write.c
index c4eabbfc90df..6a79c7ec2bb2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1976,7 +1976,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
u16 count = same->dest_count;
struct file *dst_file;
loff_t dst_off;
- ssize_t deduped;
+ s64 deduped;

if (!(file->f_mode & FMODE_READ))
return -EINVAL;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 299aee4b7b0b..d06dd1557d0e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -868,7 +868,7 @@ xfs_file_clone_range(
len, false);
}

-STATIC ssize_t
+STATIC s64
xfs_file_dedupe_range(
struct file *src_file,
u64 loff,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 760d8da1b6c7..6feb121ae48c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1738,7 +1738,7 @@ struct file_operations {
loff_t, size_t, unsigned int);
int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
u64);
- ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
+ s64 (*dedupe_file_range)(struct file *, u64, u64, struct file *,
u64);
} __randomize_layout;

--
2.14.3


2018-05-07 08:22:16

by Miklos Szeredi

[permalink] [raw]
Subject: [PATCH 2/3] vfs: dedupe: rationalize args

Clean up f_op->dedupe_file_range() interface.

1) Use loff_t for offsets instead of u64
2) Order the arguments the same way as {copy|clone}_file_range().

Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/btrfs/ctree.h | 4 ++--
fs/btrfs/ioctl.c | 4 ++--
fs/ocfs2/file.c | 6 +++---
fs/read_write.c | 4 ++--
fs/xfs/xfs_file.c | 6 +++---
include/linux/fs.h | 2 +-
6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 019a25962ac8..a8f1ab69158b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3269,8 +3269,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
struct btrfs_ioctl_balance_args *bargs);
-s64 btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
- struct file *dst_file, u64 dst_loff);
+s64 btrfs_dedupe_file_range(struct file *src_file, loff_t loff,
+ struct file *dst_file, loff_t dst_loff, u64 olen);

/* file.c */
int __init btrfs_auto_defrag_init(void);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b6a1df6bb895..10658c0a5ac9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3194,8 +3194,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,

#define BTRFS_MAX_DEDUPE_LEN SZ_16M

-s64 btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
- struct file *dst_file, u64 dst_loff)
+s64 btrfs_dedupe_file_range(struct file *src_file, loff_t loff,
+ struct file *dst_file, loff_t dst_loff, u64 olen)
{
struct inode *src = file_inode(src_file);
struct inode *dst = file_inode(dst_file);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index cf3732df4c2e..00656f4b6059 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2538,10 +2538,10 @@ static int ocfs2_file_clone_range(struct file *file_in,
}

static s64 ocfs2_file_dedupe_range(struct file *src_file,
- u64 loff,
- u64 len,
+ loff_t loff,
struct file *dst_file,
- u64 dst_loff)
+ loff_t dst_loff,
+ u64 len)
{
int error;

diff --git a/fs/read_write.c b/fs/read_write.c
index 6a79c7ec2bb2..0cdef381d9d9 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -2046,8 +2046,8 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
info->status = -EINVAL;
} else {
deduped = dst_file->f_op->dedupe_file_range(file, off,
- len, dst_file,
- info->dest_offset);
+ dst_file,
+ info->dest_offset, len);
if (deduped == -EBADE)
info->status = FILE_DEDUPE_RANGE_DIFFERS;
else if (deduped < 0)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d06dd1557d0e..e2620a00d132 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -871,10 +871,10 @@ xfs_file_clone_range(
STATIC s64
xfs_file_dedupe_range(
struct file *src_file,
- u64 loff,
- u64 len,
+ loff_t loff,
struct file *dst_file,
- u64 dst_loff)
+ loff_t dst_loff,
+ u64 len)
{
int error;

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6feb121ae48c..8007a31c4d3c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1738,7 +1738,7 @@ struct file_operations {
loff_t, size_t, unsigned int);
int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
u64);
- s64 (*dedupe_file_range)(struct file *, u64, u64, struct file *,
+ s64 (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
u64);
} __randomize_layout;

--
2.14.3


2018-05-07 08:22:34

by Miklos Szeredi

[permalink] [raw]
Subject: [PATCH 3/3] vfs: dedupe: extract helper for a single dedup

Extract vfs_dedupe_file_range_one() helper to deal with a single dedup
request.

Signed-off-by: Miklos Szeredi <[email protected]>
---
fs/read_write.c | 89 +++++++++++++++++++++++++++++++--------------------------
1 file changed, 49 insertions(+), 40 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 0cdef381d9d9..023df230e2a0 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1964,6 +1964,44 @@ int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
}
EXPORT_SYMBOL(vfs_dedupe_file_range_compare);

+static s64 vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+ struct file *dst_file, loff_t dst_pos,
+ u64 len)
+{
+ s64 ret;
+
+ ret = mnt_want_write_file(dst_file);
+ if (ret)
+ return ret;
+
+ ret = clone_verify_area(dst_file, dst_pos, len, true);
+ if (ret < 0)
+ goto out_drop_write;
+
+ ret = -EINVAL;
+ if (!(capable(CAP_SYS_ADMIN) || (dst_file->f_mode & FMODE_WRITE)))
+ goto out_drop_write;
+
+ ret = -EXDEV;
+ if (src_file->f_path.mnt != dst_file->f_path.mnt)
+ goto out_drop_write;
+
+ ret = -EISDIR;
+ if (S_ISDIR(file_inode(dst_file)->i_mode))
+ goto out_drop_write;
+
+ ret = -EINVAL;
+ if (!dst_file->f_op->dedupe_file_range)
+ goto out_drop_write;
+
+ ret = dst_file->f_op->dedupe_file_range(src_file, src_pos,
+ dst_file, dst_pos, len);
+out_drop_write:
+ mnt_drop_write_file(dst_file);
+
+ return ret;
+}
+
int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
{
struct file_dedupe_range_info *info;
@@ -1972,10 +2010,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
u64 len;
int i;
int ret;
- bool is_admin = capable(CAP_SYS_ADMIN);
u16 count = same->dest_count;
- struct file *dst_file;
- loff_t dst_off;
s64 deduped;

if (!(file->f_mode & FMODE_READ))
@@ -2010,54 +2045,28 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
}

for (i = 0, info = same->info; i < count; i++, info++) {
- struct inode *dst;
struct fd dst_fd = fdget(info->dest_fd);
+ struct file *dst_file = dst_fd.file;

- dst_file = dst_fd.file;
if (!dst_file) {
info->status = -EBADF;
goto next_loop;
}
- dst = file_inode(dst_file);
-
- ret = mnt_want_write_file(dst_file);
- if (ret) {
- info->status = ret;
- goto next_loop;
- }
-
- dst_off = info->dest_offset;
- ret = clone_verify_area(dst_file, dst_off, len, true);
- if (ret < 0) {
- info->status = ret;
- goto next_file;
- }
- ret = 0;

if (info->reserved) {
info->status = -EINVAL;
- } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
- info->status = -EINVAL;
- } else if (file->f_path.mnt != dst_file->f_path.mnt) {
- info->status = -EXDEV;
- } else if (S_ISDIR(dst->i_mode)) {
- info->status = -EISDIR;
- } else if (dst_file->f_op->dedupe_file_range == NULL) {
- info->status = -EINVAL;
- } else {
- deduped = dst_file->f_op->dedupe_file_range(file, off,
- dst_file,
- info->dest_offset, len);
- if (deduped == -EBADE)
- info->status = FILE_DEDUPE_RANGE_DIFFERS;
- else if (deduped < 0)
- info->status = deduped;
- else
- info->bytes_deduped += deduped;
+ goto next_loop;
}

-next_file:
- mnt_drop_write_file(dst_file);
+ deduped = vfs_dedupe_file_range_one(file, off, dst_file,
+ info->dest_offset, len);
+ if (deduped == -EBADE)
+ info->status = FILE_DEDUPE_RANGE_DIFFERS;
+ else if (deduped < 0)
+ info->status = deduped;
+ else
+ info->bytes_deduped += deduped;
+
next_loop:
fdput(dst_fd);

--
2.14.3


2018-05-07 11:12:43

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/3] vfs: dedpue: return s64

On Mon, May 07, 2018 at 10:21:06AM +0200, Miklos Szeredi wrote:
> f_op->dedupe_file_range() gets a u64 length to dedup and returns an ssize_t
> actual length deduped. This breaks badly on 32bit archs since the returned
> length will be truncated and possibly overflow into the sign bit (xfs and
> ocfs2 are affected, btrfs limits actual length to 16MiB).
>
> Returning s64 should be good, since clone_verify_area() makes sure that the
> supplied length doesn't overflow.

Why s64 rather than loff_t? Particularly since the next patch turns
the paramters into loff_t.

2018-05-07 11:17:25

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 2/3] vfs: dedupe: rationalize args

On Mon, May 07, 2018 at 10:21:07AM +0200, Miklos Szeredi wrote:
> @@ -1738,7 +1738,7 @@ struct file_operations {
> loff_t, size_t, unsigned int);
> int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
> u64);
> - s64 (*dedupe_file_range)(struct file *, u64, u64, struct file *,
> + s64 (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
> u64);

Please name the parameters here ...

+ loff_t (*dedupe_file_range)(struct file *src, loff_t src_off,
+ struct file *dst, loff_t dst_off, loff_t len);

2018-05-07 11:33:58

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 1/3] vfs: dedpue: return s64

On Mon, May 7, 2018 at 1:11 PM, Matthew Wilcox <[email protected]> wrote:
> On Mon, May 07, 2018 at 10:21:06AM +0200, Miklos Szeredi wrote:
>> f_op->dedupe_file_range() gets a u64 length to dedup and returns an ssize_t
>> actual length deduped. This breaks badly on 32bit archs since the returned
>> length will be truncated and possibly overflow into the sign bit (xfs and
>> ocfs2 are affected, btrfs limits actual length to 16MiB).
>>
>> Returning s64 should be good, since clone_verify_area() makes sure that the
>> supplied length doesn't overflow.
>
> Why s64 rather than loff_t? Particularly since the next patch turns
> the paramters into loff_t.

Next patch turns the offsets into loff_t and leaves "len" as u64. A
size is definitely not an offset, I'd consider changing the type of
"len" to loff_t a misuse.

Thanks,
Miklos

2018-05-07 11:37:37

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 2/3] vfs: dedupe: rationalize args

On Mon, May 7, 2018 at 1:16 PM, Matthew Wilcox <[email protected]> wrote:
> On Mon, May 07, 2018 at 10:21:07AM +0200, Miklos Szeredi wrote:
>> @@ -1738,7 +1738,7 @@ struct file_operations {
>> loff_t, size_t, unsigned int);
>> int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
>> u64);
>> - s64 (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>> + s64 (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t,
>> u64);
>
> Please name the parameters here ...
>
> + loff_t (*dedupe_file_range)(struct file *src, loff_t src_off,
> + struct file *dst, loff_t dst_off, loff_t len);

It's the convention here. Going against the convention looks odd and
has dubious value.

Fixing the convention is okay by me, but I'd leave that to some
kernelnewbie to worry about.

Thanks,
Miklos

2018-05-07 12:18:38

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH 1/3] vfs: dedpue: return s64

On Mon, May 07, 2018 at 01:32:09PM +0200, Miklos Szeredi wrote:
> On Mon, May 7, 2018 at 1:11 PM, Matthew Wilcox <[email protected]> wrote:
> > On Mon, May 07, 2018 at 10:21:06AM +0200, Miklos Szeredi wrote:
> >> f_op->dedupe_file_range() gets a u64 length to dedup and returns an ssize_t
> >> actual length deduped. This breaks badly on 32bit archs since the returned
> >> length will be truncated and possibly overflow into the sign bit (xfs and
> >> ocfs2 are affected, btrfs limits actual length to 16MiB).
> >>
> >> Returning s64 should be good, since clone_verify_area() makes sure that the
> >> supplied length doesn't overflow.
> >
> > Why s64 rather than loff_t? Particularly since the next patch turns
> > the paramters into loff_t.
>
> Next patch turns the offsets into loff_t and leaves "len" as u64. A
> size is definitely not an offset, I'd consider changing the type of
> "len" to loff_t a misuse.

Usually a size is the size of something in memory. The length of
something on storage is definitely an loff_t. Look at fallocate()
for an example. You could also argue that lseek where whence is set to
anything other than SEEK_SET is also being used as a length rather than
an absolute offset. We also already use 'loff_t len' as an argument to
vfs_dedupe_file_range_compare().

2018-05-07 14:26:39

by Miklos Szeredi

[permalink] [raw]
Subject: Re: [PATCH 1/3] vfs: dedpue: return s64

On Mon, May 7, 2018 at 2:17 PM, Matthew Wilcox <[email protected]> wrote:
> On Mon, May 07, 2018 at 01:32:09PM +0200, Miklos Szeredi wrote:
>> On Mon, May 7, 2018 at 1:11 PM, Matthew Wilcox <[email protected]> wrote:
>> > On Mon, May 07, 2018 at 10:21:06AM +0200, Miklos Szeredi wrote:
>> >> f_op->dedupe_file_range() gets a u64 length to dedup and returns an ssize_t
>> >> actual length deduped. This breaks badly on 32bit archs since the returned
>> >> length will be truncated and possibly overflow into the sign bit (xfs and
>> >> ocfs2 are affected, btrfs limits actual length to 16MiB).
>> >>
>> >> Returning s64 should be good, since clone_verify_area() makes sure that the
>> >> supplied length doesn't overflow.
>> >
>> > Why s64 rather than loff_t? Particularly since the next patch turns
>> > the paramters into loff_t.
>>
>> Next patch turns the offsets into loff_t and leaves "len" as u64. A
>> size is definitely not an offset, I'd consider changing the type of
>> "len" to loff_t a misuse.
>
> Usually a size is the size of something in memory. The length of
> something on storage is definitely an loff_t. Look at fallocate()
> for an example. You could also argue that lseek where whence is set to
> anything other than SEEK_SET is also being used as a length rather than
> an absolute offset. We also already use 'loff_t len' as an argument to
> vfs_dedupe_file_range_compare().

Fair enough. Will fix.

Thanks,
Miklos