2021-02-15 15:48:01

by Luis Henriques

[permalink] [raw]
Subject: [PATCH v2] vfs: prevent copy_file_range to copy across devices

Nicolas Boichat reported an issue when trying to use the copy_file_range
syscall on a tracefs file. It failed silently because the file content is
generated on-the-fly (reporting a size of zero) and copy_file_range needs
to know in advance how much data is present.

This commit restores the cross-fs restrictions that existed prior to
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") and
removes generic_copy_file_range() calls from ceph, cifs, fuse, and nfs.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
Cc: Nicolas Boichat <[email protected]>
Signed-off-by: Luis Henriques <[email protected]>
---
Changes since v1 (after Amir review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description

fs/ceph/file.c | 21 +++-----------------
fs/cifs/cifsfs.c | 3 ---
fs/fuse/file.c | 21 +++-----------------
fs/nfs/nfs4file.c | 20 +++----------------
fs/read_write.c | 49 ++++++++++------------------------------------
include/linux/fs.h | 3 ---
6 files changed, 19 insertions(+), 98 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 209535d5b8d3..639bd7bfaea9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -2261,9 +2261,9 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off
return bytes;
}

-static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
- struct file *dst_file, loff_t dst_off,
- size_t len, unsigned int flags)
+static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
+ struct file *dst_file, loff_t dst_off,
+ size_t len, unsigned int flags)
{
struct inode *src_inode = file_inode(src_file);
struct inode *dst_inode = file_inode(dst_file);
@@ -2456,21 +2456,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
return ret;
}

-static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
- struct file *dst_file, loff_t dst_off,
- size_t len, unsigned int flags)
-{
- ssize_t ret;
-
- ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
- len, flags);
-
- if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(src_file, src_off, dst_file,
- dst_off, len, flags);
- return ret;
-}
-
const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ab883e84e116..7aa3d20f21c0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1229,9 +1229,6 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
len, flags);
free_xid(xid);

- if (rc == -EOPNOTSUPP || rc == -EXDEV)
- rc = generic_copy_file_range(src_file, off, dst_file,
- destoff, len, flags);
return rc;
}

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8cccecb55fb8..0dd703278e49 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3330,9 +3330,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
return err;
}

-static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags)
+static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t len, unsigned int flags)
{
struct fuse_file *ff_in = file_in->private_data;
struct fuse_file *ff_out = file_out->private_data;
@@ -3439,21 +3439,6 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
return err;
}

-static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
- struct file *dst_file, loff_t dst_off,
- size_t len, unsigned int flags)
-{
- ssize_t ret;
-
- ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
- len, flags);
-
- if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(src_file, src_off, dst_file,
- dst_off, len, flags);
- return ret;
-}
-
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
.read_iter = fuse_file_read_iter,
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 57b3821d975a..60998209e310 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -133,9 +133,9 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
}

#ifdef CONFIG_NFS_V4_2
-static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t count, unsigned int flags)
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t count, unsigned int flags)
{
struct nfs42_copy_notify_res *cn_resp = NULL;
struct nl4_server *nss = NULL;
@@ -189,20 +189,6 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
return ret;
}

-static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t count, unsigned int flags)
-{
- ssize_t ret;
-
- ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count,
- flags);
- if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(file_in, pos_in, file_out,
- pos_out, count, flags);
- return ret;
-}
-
static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
{
loff_t ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..b217cd62ae0d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1358,40 +1358,12 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
}
#endif

-/**
- * generic_copy_file_range - copy data between two files
- * @file_in: file structure to read from
- * @pos_in: file offset to read from
- * @file_out: file structure to write data to
- * @pos_out: file offset to write data to
- * @len: amount of data to copy
- * @flags: copy flags
- *
- * This is a generic filesystem helper to copy data from one file to another.
- * It has no constraints on the source or destination file owners - the files
- * can belong to different superblocks and different filesystem types. Short
- * copies are allowed.
- *
- * This should be called from the @file_out filesystem, as per the
- * ->copy_file_range() method.
- *
- * Returns the number of bytes copied or a negative error indicating the
- * failure.
- */
-
-ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags)
-{
- return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
- len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
-}
-EXPORT_SYMBOL(generic_copy_file_range);
-
static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags)
{
+ ssize_t ret = -EXDEV;
+
/*
* Although we now allow filesystems to handle cross sb copy, passing
* a file of the wrong filesystem type to filesystem driver can result
@@ -1400,14 +1372,14 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
* several different file_system_type structures, but they all end up
* using the same ->copy_file_range() function pointer.
*/
- if (file_out->f_op->copy_file_range &&
- file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
- return file_out->f_op->copy_file_range(file_in, pos_in,
- file_out, pos_out,
- len, flags);
+ if (!file_out->f_op->copy_file_range)
+ ret = -EOPNOTSUPP;
+ else if (file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
+ ret = file_out->f_op->copy_file_range(file_in, pos_in,
+ file_out, pos_out,
+ len, flags);

- return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
+ return ret;
}

/*
@@ -1514,8 +1486,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
}

ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
- WARN_ON_ONCE(ret == -EOPNOTSUPP);
+ flags);
done:
if (ret > 0) {
fsnotify_access(file_in);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..3aaf627be409 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1910,9 +1910,6 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
loff_t, size_t, unsigned int);
-extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags);
extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *count,


2021-02-15 16:12:02

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Mon, 2021-02-15 at 15:43 +0000, Luis Henriques wrote:
> Nicolas Boichat reported an issue when trying to use the
> copy_file_range
> syscall on a tracefs file.  It failed silently because the file
> content is
> generated on-the-fly (reporting a size of zero) and copy_file_range
> needs
> to know in advance how much data is present.

That explanation makes no sense whatsoever. copy_file_range is a non-
atomic operation and so the file can change while being copied. Any
determination of 'how much data is present' that is made in advance
would therefore be a flaw in the copy process being used (i.e.
do_splice_direct()). Does sendfile() also 'issue' in the same way?


--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2021-02-15 16:40:03

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <[email protected]> wrote:
>
> Nicolas Boichat reported an issue when trying to use the copy_file_range
> syscall on a tracefs file. It failed silently because the file content is
> generated on-the-fly (reporting a size of zero) and copy_file_range needs
> to know in advance how much data is present.
>
> This commit restores the cross-fs restrictions that existed prior to
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") and
> removes generic_copy_file_range() calls from ceph, cifs, fuse, and nfs.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
> Cc: Nicolas Boichat <[email protected]>
> Signed-off-by: Luis Henriques <[email protected]>

Code looks ok.
You may add:

Reviewed-by: Amir Goldstein <[email protected]>

I agree with Trond that the first paragraph of the commit message could
be improved.
The purpose of this change is to fix the change of behavior that
caused the regression.

Before v5.3, behavior was -EXDEV and userspace could fallback to read.
After v5.3, behavior is zero size copy.

It does not matter so much what makes sense for CFR to do in this
case (generic cross-fs copy). What matters is that nobody asked for
this change and that it caused problems.

Thanks,
Amir.

2021-02-15 16:57:02

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <[email protected]>
> wrote:
> >
> > Nicolas Boichat reported an issue when trying to use the
> > copy_file_range
> > syscall on a tracefs file.  It failed silently because the file
> > content is
> > generated on-the-fly (reporting a size of zero) and copy_file_range
> > needs
> > to know in advance how much data is present.
> >
> > This commit restores the cross-fs restrictions that existed prior
> > to
> > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > and
> > removes generic_copy_file_range() calls from ceph, cifs, fuse, and
> > nfs.
> >
> > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > devices")
> > Link:
> > https://lore.kernel.org/linux-fsdevel/[email protected]/
> > Cc: Nicolas Boichat <[email protected]>
> > Signed-off-by: Luis Henriques <[email protected]>
>
> Code looks ok.
> You may add:
>
> Reviewed-by: Amir Goldstein <[email protected]>
>
> I agree with Trond that the first paragraph of the commit message
> could
> be improved.
> The purpose of this change is to fix the change of behavior that
> caused the regression.
>
> Before v5.3, behavior was -EXDEV and userspace could fallback to
> read.
> After v5.3, behavior is zero size copy.
>
> It does not matter so much what makes sense for CFR to do in this
> case (generic cross-fs copy).  What matters is that nobody asked for
> this change and that it caused problems.
>

No. I'm saying that this patch should be NACKed unless there is a real
explanation for why we give crap about this tracefs corner case and why
it can't be fixed.

There are plenty of reasons why copy offload across filesystems makes
sense, and particularly when you're doing NAS. Clone just doesn't cut
it when it comes to disaster recovery (whereas backup to a different
storage unit does). If the client has to do the copy, then you're
effectively doubling the load on the server, and you're adding
potentially unnecessary network traffic (or at the very least you are
doubling that traffic).

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2021-02-15 17:28:09

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <[email protected]> wrote:
>
> On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <[email protected]>
> > wrote:
> > >
> > > Nicolas Boichat reported an issue when trying to use the
> > > copy_file_range
> > > syscall on a tracefs file. It failed silently because the file
> > > content is
> > > generated on-the-fly (reporting a size of zero) and copy_file_range
> > > needs
> > > to know in advance how much data is present.
> > >
> > > This commit restores the cross-fs restrictions that existed prior
> > > to
> > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > > and
> > > removes generic_copy_file_range() calls from ceph, cifs, fuse, and
> > > nfs.
> > >
> > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > devices")
> > > Link:
> > > https://lore.kernel.org/linux-fsdevel/[email protected]/
> > > Cc: Nicolas Boichat <[email protected]>
> > > Signed-off-by: Luis Henriques <[email protected]>
> >
> > Code looks ok.
> > You may add:
> >
> > Reviewed-by: Amir Goldstein <[email protected]>
> >
> > I agree with Trond that the first paragraph of the commit message
> > could
> > be improved.
> > The purpose of this change is to fix the change of behavior that
> > caused the regression.
> >
> > Before v5.3, behavior was -EXDEV and userspace could fallback to
> > read.
> > After v5.3, behavior is zero size copy.
> >
> > It does not matter so much what makes sense for CFR to do in this
> > case (generic cross-fs copy). What matters is that nobody asked for
> > this change and that it caused problems.
> >
>
> No. I'm saying that this patch should be NACKed unless there is a real
> explanation for why we give crap about this tracefs corner case and why
> it can't be fixed.
>
> There are plenty of reasons why copy offload across filesystems makes
> sense, and particularly when you're doing NAS. Clone just doesn't cut
> it when it comes to disaster recovery (whereas backup to a different
> storage unit does). If the client has to do the copy, then you're
> effectively doubling the load on the server, and you're adding
> potentially unnecessary network traffic (or at the very least you are
> doubling that traffic).
>

I don't understand the use case you are describing.

Which filesystem types are you talking about for source and target
of copy_file_range()?

To be clear, the original change was done to support NFS/CIFS server-side
copy and those should not be affected by this change.

Thanks,
Amir.

2021-02-15 18:59:36

by Trond Myklebust

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Mon, 2021-02-15 at 19:24 +0200, Amir Goldstein wrote:
> On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <
> [email protected]> wrote:
> >
> > On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> > > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <
> > > [email protected]>
> > > wrote:
> > > >
> > > > Nicolas Boichat reported an issue when trying to use the
> > > > copy_file_range
> > > > syscall on a tracefs file.  It failed silently because the file
> > > > content is
> > > > generated on-the-fly (reporting a size of zero) and
> > > > copy_file_range
> > > > needs
> > > > to know in advance how much data is present.
> > > >
> > > > This commit restores the cross-fs restrictions that existed
> > > > prior
> > > > to
> > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > > devices")
> > > > and
> > > > removes generic_copy_file_range() calls from ceph, cifs, fuse,
> > > > and
> > > > nfs.
> > > >
> > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > > devices")
> > > > Link:
> > > > https://lore.kernel.org/linux-fsdevel/[email protected]/
> > > > Cc: Nicolas Boichat <[email protected]>
> > > > Signed-off-by: Luis Henriques <[email protected]>
> > >
> > > Code looks ok.
> > > You may add:
> > >
> > > Reviewed-by: Amir Goldstein <[email protected]>
> > >
> > > I agree with Trond that the first paragraph of the commit message
> > > could
> > > be improved.
> > > The purpose of this change is to fix the change of behavior that
> > > caused the regression.
> > >
> > > Before v5.3, behavior was -EXDEV and userspace could fallback to
> > > read.
> > > After v5.3, behavior is zero size copy.
> > >
> > > It does not matter so much what makes sense for CFR to do in this
> > > case (generic cross-fs copy).  What matters is that nobody asked
> > > for
> > > this change and that it caused problems.
> > >
> >
> > No. I'm saying that this patch should be NACKed unless there is a
> > real
> > explanation for why we give crap about this tracefs corner case and
> > why
> > it can't be fixed.
> >
> > There are plenty of reasons why copy offload across filesystems
> > makes
> > sense, and particularly when you're doing NAS. Clone just doesn't
> > cut
> > it when it comes to disaster recovery (whereas backup to a
> > different
> > storage unit does). If the client has to do the copy, then you're
> > effectively doubling the load on the server, and you're adding
> > potentially unnecessary network traffic (or at the very least you
> > are
> > doubling that traffic).
> >
>
> I don't understand the use case you are describing.
>
> Which filesystem types are you talking about for source and target
> of copy_file_range()?
>
> To be clear, the original change was done to support NFS/CIFS server-
> side
> copy and those should not be affected by this change.
>

That is incorrect:

ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file
*dst,
u64 dst_pos, u64 count)
{

/*
* Limit copy to 4MB to prevent indefinitely blocking an nfsd
* thread and client rpc slot. The choice of 4MB is somewhat
* arbitrary. We might instead base this on r/wsize, or make it
* tunable, or use a time instead of a byte limit, or implement
* asynchronous copy. In theory a client could also recognize a
* limit like this and pipeline multiple COPY requests.
*/
count = min_t(u64, count, 1 << 22);
return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
}

You are now explicitly changing the behaviour of knfsd when the source
and destination filesystem differ.

For one thing, you are disallowing the NFSv4.2 copy offload use case of
copying from a local filesystem to a remote NFS server. However you are
also disallowing the copy from, say, an XFS formatted partition to an
ext4 partition.

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
[email protected]


2021-02-16 00:27:37

by Steve French

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Mon, Feb 15, 2021 at 10:11 AM Trond Myklebust
<[email protected]> wrote:
>
> On Mon, 2021-02-15 at 15:43 +0000, Luis Henriques wrote:
> > Nicolas Boichat reported an issue when trying to use the
> > copy_file_range
> > syscall on a tracefs file. It failed silently because the file
> > content is
> > generated on-the-fly (reporting a size of zero) and copy_file_range
> > needs
> > to know in advance how much data is present.
>
> That explanation makes no sense whatsoever. copy_file_range is a non-
> atomic operation and so the file can change while being copied. Any
> determination of 'how much data is present' that is made in advance
> would therefore be a flaw in the copy process being used (i.e.
> do_splice_direct()). Does sendfile() also 'issue' in the same way?

I agree that the explanation of the tracefs problem motivating this
patch doesn't make sense.


--
Thanks,

Steve

2021-02-16 11:31:14

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Tue, Feb 16, 2021 at 11:17:34AM +0000, Luis Henriques wrote:
> Amir Goldstein <[email protected]> writes:
>
> > On Mon, Feb 15, 2021 at 8:57 PM Trond Myklebust <[email protected]> wrote:
> >>
> >> On Mon, 2021-02-15 at 19:24 +0200, Amir Goldstein wrote:
> >> > On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <
> >> > [email protected]> wrote:
> >> > >
> >> > > On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
> >> > > > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <
> >> > > > [email protected]>
> >> > > > wrote:
> >> > > > >
> >> > > > > Nicolas Boichat reported an issue when trying to use the
> >> > > > > copy_file_range
> >> > > > > syscall on a tracefs file. It failed silently because the file
> >> > > > > content is
> >> > > > > generated on-the-fly (reporting a size of zero) and
> >> > > > > copy_file_range
> >> > > > > needs
> >> > > > > to know in advance how much data is present.
> >> > > > >
> >> > > > > This commit restores the cross-fs restrictions that existed
> >> > > > > prior
> >> > > > > to
> >> > > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> >> > > > > devices")
> >> > > > > and
> >> > > > > removes generic_copy_file_range() calls from ceph, cifs, fuse,
> >> > > > > and
> >> > > > > nfs.
> >> > > > >
> >> > > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> >> > > > > devices")
> >> > > > > Link:
> >> > > > > https://lore.kernel.org/linux-fsdevel/[email protected]/
> >> > > > > Cc: Nicolas Boichat <[email protected]>
> >> > > > > Signed-off-by: Luis Henriques <[email protected]>
> >> > > >
> >> > > > Code looks ok.
> >> > > > You may add:
> >> > > >
> >> > > > Reviewed-by: Amir Goldstein <[email protected]>
> >> > > >
> >> > > > I agree with Trond that the first paragraph of the commit message
> >> > > > could
> >> > > > be improved.
> >> > > > The purpose of this change is to fix the change of behavior that
> >> > > > caused the regression.
> >> > > >
> >> > > > Before v5.3, behavior was -EXDEV and userspace could fallback to
> >> > > > read.
> >> > > > After v5.3, behavior is zero size copy.
> >> > > >
> >> > > > It does not matter so much what makes sense for CFR to do in this
> >> > > > case (generic cross-fs copy). What matters is that nobody asked
> >> > > > for
> >> > > > this change and that it caused problems.
> >> > > >
> >> > >
> >> > > No. I'm saying that this patch should be NACKed unless there is a
> >> > > real
> >> > > explanation for why we give crap about this tracefs corner case and
> >> > > why
> >> > > it can't be fixed.
> >> > >
> >> > > There are plenty of reasons why copy offload across filesystems
> >> > > makes
> >> > > sense, and particularly when you're doing NAS. Clone just doesn't
> >> > > cut
> >> > > it when it comes to disaster recovery (whereas backup to a
> >> > > different
> >> > > storage unit does). If the client has to do the copy, then you're
> >> > > effectively doubling the load on the server, and you're adding
> >> > > potentially unnecessary network traffic (or at the very least you
> >> > > are
> >> > > doubling that traffic).
> >> > >
> >> >
> >> > I don't understand the use case you are describing.
> >> >
> >> > Which filesystem types are you talking about for source and target
> >> > of copy_file_range()?
> >> >
> >> > To be clear, the original change was done to support NFS/CIFS server-
> >> > side
> >> > copy and those should not be affected by this change.
> >> >
> >>
> >> That is incorrect:
> >>
> >> ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file
> >> *dst,
> >> u64 dst_pos, u64 count)
> >> {
> >>
> >> /*
> >> * Limit copy to 4MB to prevent indefinitely blocking an nfsd
> >> * thread and client rpc slot. The choice of 4MB is somewhat
> >> * arbitrary. We might instead base this on r/wsize, or make it
> >> * tunable, or use a time instead of a byte limit, or implement
> >> * asynchronous copy. In theory a client could also recognize a
> >> * limit like this and pipeline multiple COPY requests.
> >> */
> >> count = min_t(u64, count, 1 << 22);
> >> return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> >> }
> >>
> >> You are now explicitly changing the behaviour of knfsd when the source
> >> and destination filesystem differ.
> >>
> >> For one thing, you are disallowing the NFSv4.2 copy offload use case of
> >> copying from a local filesystem to a remote NFS server. However you are
> >> also disallowing the copy from, say, an XFS formatted partition to an
> >> ext4 partition.
> >>
> >
> > Got it.
>
> Ugh. And I guess overlayfs may have a similar problem.
>
> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > is internal to kernel users.
> >
> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > for improvements to nfsd_copy_file_range().
> >
> > We can move the check out to copy_file_range syscall:
> >
> > if (flags != 0)
> > return -EINVAL;
> >
> > Leave the fallback from all filesystems and check for the
> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
>
> Ok, the diff bellow is just to make sure I understood your suggestion.
>
> The patch will also need to:
>
> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> use the new flag.
>
> - check flags in generic_copy_file_checks() to make sure only valid flags
> are used (COPY_FILE_SPLICE at the moment).
>
> Also, where should this flag be defined? include/uapi/linux/fs.h?

Why would userspace want/need this flag?

2021-02-16 12:04:49

by Luis Henriques

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

"[email protected]" <[email protected]> writes:

> On Tue, Feb 16, 2021 at 11:17:34AM +0000, Luis Henriques wrote:
>> Amir Goldstein <[email protected]> writes:
>>
>> > On Mon, Feb 15, 2021 at 8:57 PM Trond Myklebust <[email protected]> wrote:
>> >>
>> >> On Mon, 2021-02-15 at 19:24 +0200, Amir Goldstein wrote:
>> >> > On Mon, Feb 15, 2021 at 6:53 PM Trond Myklebust <
>> >> > [email protected]> wrote:
>> >> > >
>> >> > > On Mon, 2021-02-15 at 18:34 +0200, Amir Goldstein wrote:
>> >> > > > On Mon, Feb 15, 2021 at 5:42 PM Luis Henriques <
>> >> > > > [email protected]>
>> >> > > > wrote:
>> >> > > > >
>> >> > > > > Nicolas Boichat reported an issue when trying to use the
>> >> > > > > copy_file_range
>> >> > > > > syscall on a tracefs file. It failed silently because the file
>> >> > > > > content is
>> >> > > > > generated on-the-fly (reporting a size of zero) and
>> >> > > > > copy_file_range
>> >> > > > > needs
>> >> > > > > to know in advance how much data is present.
>> >> > > > >
>> >> > > > > This commit restores the cross-fs restrictions that existed
>> >> > > > > prior
>> >> > > > > to
>> >> > > > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> >> > > > > devices")
>> >> > > > > and
>> >> > > > > removes generic_copy_file_range() calls from ceph, cifs, fuse,
>> >> > > > > and
>> >> > > > > nfs.
>> >> > > > >
>> >> > > > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
>> >> > > > > devices")
>> >> > > > > Link:
>> >> > > > > https://lore.kernel.org/linux-fsdevel/[email protected]/
>> >> > > > > Cc: Nicolas Boichat <[email protected]>
>> >> > > > > Signed-off-by: Luis Henriques <[email protected]>
>> >> > > >
>> >> > > > Code looks ok.
>> >> > > > You may add:
>> >> > > >
>> >> > > > Reviewed-by: Amir Goldstein <[email protected]>
>> >> > > >
>> >> > > > I agree with Trond that the first paragraph of the commit message
>> >> > > > could
>> >> > > > be improved.
>> >> > > > The purpose of this change is to fix the change of behavior that
>> >> > > > caused the regression.
>> >> > > >
>> >> > > > Before v5.3, behavior was -EXDEV and userspace could fallback to
>> >> > > > read.
>> >> > > > After v5.3, behavior is zero size copy.
>> >> > > >
>> >> > > > It does not matter so much what makes sense for CFR to do in this
>> >> > > > case (generic cross-fs copy). What matters is that nobody asked
>> >> > > > for
>> >> > > > this change and that it caused problems.
>> >> > > >
>> >> > >
>> >> > > No. I'm saying that this patch should be NACKed unless there is a
>> >> > > real
>> >> > > explanation for why we give crap about this tracefs corner case and
>> >> > > why
>> >> > > it can't be fixed.
>> >> > >
>> >> > > There are plenty of reasons why copy offload across filesystems
>> >> > > makes
>> >> > > sense, and particularly when you're doing NAS. Clone just doesn't
>> >> > > cut
>> >> > > it when it comes to disaster recovery (whereas backup to a
>> >> > > different
>> >> > > storage unit does). If the client has to do the copy, then you're
>> >> > > effectively doubling the load on the server, and you're adding
>> >> > > potentially unnecessary network traffic (or at the very least you
>> >> > > are
>> >> > > doubling that traffic).
>> >> > >
>> >> >
>> >> > I don't understand the use case you are describing.
>> >> >
>> >> > Which filesystem types are you talking about for source and target
>> >> > of copy_file_range()?
>> >> >
>> >> > To be clear, the original change was done to support NFS/CIFS server-
>> >> > side
>> >> > copy and those should not be affected by this change.
>> >> >
>> >>
>> >> That is incorrect:
>> >>
>> >> ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file
>> >> *dst,
>> >> u64 dst_pos, u64 count)
>> >> {
>> >>
>> >> /*
>> >> * Limit copy to 4MB to prevent indefinitely blocking an nfsd
>> >> * thread and client rpc slot. The choice of 4MB is somewhat
>> >> * arbitrary. We might instead base this on r/wsize, or make it
>> >> * tunable, or use a time instead of a byte limit, or implement
>> >> * asynchronous copy. In theory a client could also recognize a
>> >> * limit like this and pipeline multiple COPY requests.
>> >> */
>> >> count = min_t(u64, count, 1 << 22);
>> >> return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
>> >> }
>> >>
>> >> You are now explicitly changing the behaviour of knfsd when the source
>> >> and destination filesystem differ.
>> >>
>> >> For one thing, you are disallowing the NFSv4.2 copy offload use case of
>> >> copying from a local filesystem to a remote NFS server. However you are
>> >> also disallowing the copy from, say, an XFS formatted partition to an
>> >> ext4 partition.
>> >>
>> >
>> > Got it.
>>
>> Ugh. And I guess overlayfs may have a similar problem.
>>
>> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
>> > is internal to kernel users.
>> >
>> > FWIW, you may want to look at the loop in ovl_copy_up_data()
>> > for improvements to nfsd_copy_file_range().
>> >
>> > We can move the check out to copy_file_range syscall:
>> >
>> > if (flags != 0)
>> > return -EINVAL;
>> >
>> > Leave the fallback from all filesystems and check for the
>> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
>>
>> Ok, the diff bellow is just to make sure I understood your suggestion.
>>
>> The patch will also need to:
>>
>> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
>> use the new flag.
>>
>> - check flags in generic_copy_file_checks() to make sure only valid flags
>> are used (COPY_FILE_SPLICE at the moment).
>>
>> Also, where should this flag be defined? include/uapi/linux/fs.h?
>
> Why would userspace want/need this flag?

In fact, my question sort of implied yours :-)

What I wanted to know was whether we would like to allow userspace to
_explicitly_ revert to the current behaviour (i.e. use the flag to allow
cross-fs copies) or to continue to return -EINVAL to userspace if flags
are != 0 (in which case this check would need to move to the syscall
definition).

Cheers,
--
Luis

2021-02-16 17:48:33

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <[email protected]> wrote:
>
> Amir Goldstein <[email protected]> writes:
>
> >> Ugh. And I guess overlayfs may have a similar problem.
> >
> > Not exactly.
> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > with the flags it got from layer above, so if called from nfsd it
> > will allow cross fs copy and when called from syscall it won't.
> >
> > There are some corner cases where overlayfs could benefit from
> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > let's leave those for now. Just leave overlayfs code as is.
>
> Got it, thanks for clarifying.
>
> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> >> > is internal to kernel users.
> >> >
> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> >> > for improvements to nfsd_copy_file_range().
> >> >
> >> > We can move the check out to copy_file_range syscall:
> >> >
> >> > if (flags != 0)
> >> > return -EINVAL;
> >> >
> >> > Leave the fallback from all filesystems and check for the
> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> >>
> >> Ok, the diff bellow is just to make sure I understood your suggestion.
> >>
> >> The patch will also need to:
> >>
> >> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> >> use the new flag.
> >>
> >> - check flags in generic_copy_file_checks() to make sure only valid flags
> >> are used (COPY_FILE_SPLICE at the moment).
> >>
> >> Also, where should this flag be defined? include/uapi/linux/fs.h?
> >
> > Grep for REMAP_FILE_
> > Same header file, same Documentation rst file.
> >
> >>
> >> Cheers,
> >> --
> >> Luis
> >>
> >> diff --git a/fs/read_write.c b/fs/read_write.c
> >> index 75f764b43418..341d315d2a96 100644
> >> --- a/fs/read_write.c
> >> +++ b/fs/read_write.c
> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> >> struct file *file_out, loff_t pos_out,
> >> size_t len, unsigned int flags)
> >> {
> >> + if (!(flags & COPY_FILE_SPLICE)) {
> >> + if (!file_out->f_op->copy_file_range)
> >> + return -EOPNOTSUPP;
> >> + else if (file_out->f_op->copy_file_range !=
> >> + file_in->f_op->copy_file_range)
> >> + return -EXDEV;
> >> + }
> >
> > That looks strange, because you are duplicating the logic in
> > do_copy_file_range(). Maybe better:
> >
> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > return -EINVAL;
> > if (flags & COPY_FILE_SPLICE)
> > return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>
> My initial reasoning for duplicating the logic in do_copy_file_range() was
> to allow the generic_copy_file_range() callers to be left unmodified and
> allow the filesystems to default to this implementation.
>
> With this change, I guess that the calls to generic_copy_file_range() from
> the different filesystems can be dropped, as in my initial patch, as they
> will always get -EINVAL. The other option would be to set the
> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> problem we're trying to solve.

I don't understand the problem.

What exactly is wrong with the code I suggested?
Why should any filesystem be changed?

Maybe I am missing something.

Thanks,
Amir.

2021-02-16 18:55:31

by Luis Henriques

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

Amir Goldstein <[email protected]> writes:

> On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <[email protected]> wrote:
>>
>> Amir Goldstein <[email protected]> writes:
>>
>> >> Ugh. And I guess overlayfs may have a similar problem.
>> >
>> > Not exactly.
>> > Generally speaking, overlayfs should call vfs_copy_file_range()
>> > with the flags it got from layer above, so if called from nfsd it
>> > will allow cross fs copy and when called from syscall it won't.
>> >
>> > There are some corner cases where overlayfs could benefit from
>> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
>> > let's leave those for now. Just leave overlayfs code as is.
>>
>> Got it, thanks for clarifying.
>>
>> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
>> >> > is internal to kernel users.
>> >> >
>> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
>> >> > for improvements to nfsd_copy_file_range().
>> >> >
>> >> > We can move the check out to copy_file_range syscall:
>> >> >
>> >> > if (flags != 0)
>> >> > return -EINVAL;
>> >> >
>> >> > Leave the fallback from all filesystems and check for the
>> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
>> >>
>> >> Ok, the diff bellow is just to make sure I understood your suggestion.
>> >>
>> >> The patch will also need to:
>> >>
>> >> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
>> >> use the new flag.
>> >>
>> >> - check flags in generic_copy_file_checks() to make sure only valid flags
>> >> are used (COPY_FILE_SPLICE at the moment).
>> >>
>> >> Also, where should this flag be defined? include/uapi/linux/fs.h?
>> >
>> > Grep for REMAP_FILE_
>> > Same header file, same Documentation rst file.
>> >
>> >>
>> >> Cheers,
>> >> --
>> >> Luis
>> >>
>> >> diff --git a/fs/read_write.c b/fs/read_write.c
>> >> index 75f764b43418..341d315d2a96 100644
>> >> --- a/fs/read_write.c
>> >> +++ b/fs/read_write.c
>> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
>> >> struct file *file_out, loff_t pos_out,
>> >> size_t len, unsigned int flags)
>> >> {
>> >> + if (!(flags & COPY_FILE_SPLICE)) {
>> >> + if (!file_out->f_op->copy_file_range)
>> >> + return -EOPNOTSUPP;
>> >> + else if (file_out->f_op->copy_file_range !=
>> >> + file_in->f_op->copy_file_range)
>> >> + return -EXDEV;
>> >> + }
>> >
>> > That looks strange, because you are duplicating the logic in
>> > do_copy_file_range(). Maybe better:
>> >
>> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
>> > return -EINVAL;
>> > if (flags & COPY_FILE_SPLICE)
>> > return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> > len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>>
>> My initial reasoning for duplicating the logic in do_copy_file_range() was
>> to allow the generic_copy_file_range() callers to be left unmodified and
>> allow the filesystems to default to this implementation.
>>
>> With this change, I guess that the calls to generic_copy_file_range() from
>> the different filesystems can be dropped, as in my initial patch, as they
>> will always get -EINVAL. The other option would be to set the
>> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
>> problem we're trying to solve.
>
> I don't understand the problem.
>
> What exactly is wrong with the code I suggested?
> Why should any filesystem be changed?
>
> Maybe I am missing something.

Ok, I have to do a full brain reboot and start all over.

Before that, I picked the code you suggested and tested it. I've mounted
a cephfs filesystem and used xfs_io to execute a 'copy_range' command
using /sys/kernel/debug/sched_features as source. The result was a
0-sized file in cephfs. And the reason is thevfs_copy_file_range()
early exit in:

if (len == 0)
return 0;

'len' is set in generic_copy_file_checks().

This means that we're not solving the original problem anymore (probably
since v1 of this patch, haven't checked).

Also, re-reading Trond's emails, I read: "... also disallowing the copy
from, say, an XFS formatted partition to an ext4 partition". Isn't that
*exactly* what we're trying to do here? I.e. _prevent_ these copies from
happening so that tracefs files can't be CFR'ed?

/me stops now and waits to see if the morning brings some sun :-)

Cheers,
--
Luis

2021-02-16 19:22:06

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <[email protected]> wrote:
>
> Amir Goldstein <[email protected]> writes:
>
> > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <[email protected]> wrote:
> >>
> >> Amir Goldstein <[email protected]> writes:
> >>
> >> >> Ugh. And I guess overlayfs may have a similar problem.
> >> >
> >> > Not exactly.
> >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> >> > with the flags it got from layer above, so if called from nfsd it
> >> > will allow cross fs copy and when called from syscall it won't.
> >> >
> >> > There are some corner cases where overlayfs could benefit from
> >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> >> > let's leave those for now. Just leave overlayfs code as is.
> >>
> >> Got it, thanks for clarifying.
> >>
> >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> >> >> > is internal to kernel users.
> >> >> >
> >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> >> >> > for improvements to nfsd_copy_file_range().
> >> >> >
> >> >> > We can move the check out to copy_file_range syscall:
> >> >> >
> >> >> > if (flags != 0)
> >> >> > return -EINVAL;
> >> >> >
> >> >> > Leave the fallback from all filesystems and check for the
> >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> >> >>
> >> >> Ok, the diff bellow is just to make sure I understood your suggestion.
> >> >>
> >> >> The patch will also need to:
> >> >>
> >> >> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> >> >> use the new flag.
> >> >>
> >> >> - check flags in generic_copy_file_checks() to make sure only valid flags
> >> >> are used (COPY_FILE_SPLICE at the moment).
> >> >>
> >> >> Also, where should this flag be defined? include/uapi/linux/fs.h?
> >> >
> >> > Grep for REMAP_FILE_
> >> > Same header file, same Documentation rst file.
> >> >
> >> >>
> >> >> Cheers,
> >> >> --
> >> >> Luis
> >> >>
> >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> >> >> index 75f764b43418..341d315d2a96 100644
> >> >> --- a/fs/read_write.c
> >> >> +++ b/fs/read_write.c
> >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> >> >> struct file *file_out, loff_t pos_out,
> >> >> size_t len, unsigned int flags)
> >> >> {
> >> >> + if (!(flags & COPY_FILE_SPLICE)) {
> >> >> + if (!file_out->f_op->copy_file_range)
> >> >> + return -EOPNOTSUPP;
> >> >> + else if (file_out->f_op->copy_file_range !=
> >> >> + file_in->f_op->copy_file_range)
> >> >> + return -EXDEV;
> >> >> + }
> >> >
> >> > That looks strange, because you are duplicating the logic in
> >> > do_copy_file_range(). Maybe better:
> >> >
> >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> >> > return -EINVAL;
> >> > if (flags & COPY_FILE_SPLICE)
> >> > return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> >> > len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> >>
> >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> >> to allow the generic_copy_file_range() callers to be left unmodified and
> >> allow the filesystems to default to this implementation.
> >>
> >> With this change, I guess that the calls to generic_copy_file_range() from
> >> the different filesystems can be dropped, as in my initial patch, as they
> >> will always get -EINVAL. The other option would be to set the
> >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> >> problem we're trying to solve.
> >
> > I don't understand the problem.
> >
> > What exactly is wrong with the code I suggested?
> > Why should any filesystem be changed?
> >
> > Maybe I am missing something.
>
> Ok, I have to do a full brain reboot and start all over.
>
> Before that, I picked the code you suggested and tested it. I've mounted
> a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> using /sys/kernel/debug/sched_features as source. The result was a
> 0-sized file in cephfs. And the reason is thevfs_copy_file_range()
> early exit in:
>
> if (len == 0)
> return 0;
>
> 'len' is set in generic_copy_file_checks().

Good point.. I guess we will need to do all the checks earlier in
generic_copy_file_checks() including the logic of:

if (file_in->f_op->remap_file_range &&
file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)


>
> This means that we're not solving the original problem anymore (probably
> since v1 of this patch, haven't checked).
>
> Also, re-reading Trond's emails, I read: "... also disallowing the copy
> from, say, an XFS formatted partition to an ext4 partition". Isn't that
> *exactly* what we're trying to do here? I.e. _prevent_ these copies from
> happening so that tracefs files can't be CFR'ed?
>

We want to address the report which means calls coming from
copy_file_range() syscall.

Trond's use case is vfs_copy_file_range() coming from nfsd.
When he writes about copy from XFS to ext4, he means an
NFS client is issuing server side copy (on same or different NFS mounts)
and the NFS server is executing nfsd_copy_file_range() on a source
file that happens to be on XFS and destination happens to be on ext4.

We can undo the copy_file_range() syscall change of behavior from
v5.3 without regressing the NFS use case.

We just need to be careful and look at all the affected code paths.

Thanks,
Amir.

2021-02-16 19:32:05

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <[email protected]> wrote:
>
> On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <[email protected]> wrote:
> >
> > Amir Goldstein <[email protected]> writes:
> >
> > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <[email protected]> wrote:
> > >>
> > >> Amir Goldstein <[email protected]> writes:
> > >>
> > >> >> Ugh. And I guess overlayfs may have a similar problem.
> > >> >
> > >> > Not exactly.
> > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > >> > with the flags it got from layer above, so if called from nfsd it
> > >> > will allow cross fs copy and when called from syscall it won't.
> > >> >
> > >> > There are some corner cases where overlayfs could benefit from
> > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > >> > let's leave those for now. Just leave overlayfs code as is.
> > >>
> > >> Got it, thanks for clarifying.
> > >>
> > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > >> >> > is internal to kernel users.
> > >> >> >
> > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > >> >> > for improvements to nfsd_copy_file_range().
> > >> >> >
> > >> >> > We can move the check out to copy_file_range syscall:
> > >> >> >
> > >> >> > if (flags != 0)
> > >> >> > return -EINVAL;
> > >> >> >
> > >> >> > Leave the fallback from all filesystems and check for the
> > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > >> >>
> > >> >> Ok, the diff bellow is just to make sure I understood your suggestion.
> > >> >>
> > >> >> The patch will also need to:
> > >> >>
> > >> >> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > >> >> use the new flag.
> > >> >>
> > >> >> - check flags in generic_copy_file_checks() to make sure only valid flags
> > >> >> are used (COPY_FILE_SPLICE at the moment).
> > >> >>
> > >> >> Also, where should this flag be defined? include/uapi/linux/fs.h?
> > >> >
> > >> > Grep for REMAP_FILE_
> > >> > Same header file, same Documentation rst file.
> > >> >
> > >> >>
> > >> >> Cheers,
> > >> >> --
> > >> >> Luis
> > >> >>
> > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > >> >> index 75f764b43418..341d315d2a96 100644
> > >> >> --- a/fs/read_write.c
> > >> >> +++ b/fs/read_write.c
> > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > >> >> struct file *file_out, loff_t pos_out,
> > >> >> size_t len, unsigned int flags)
> > >> >> {
> > >> >> + if (!(flags & COPY_FILE_SPLICE)) {
> > >> >> + if (!file_out->f_op->copy_file_range)
> > >> >> + return -EOPNOTSUPP;
> > >> >> + else if (file_out->f_op->copy_file_range !=
> > >> >> + file_in->f_op->copy_file_range)
> > >> >> + return -EXDEV;
> > >> >> + }
> > >> >
> > >> > That looks strange, because you are duplicating the logic in
> > >> > do_copy_file_range(). Maybe better:
> > >> >
> > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > >> > return -EINVAL;
> > >> > if (flags & COPY_FILE_SPLICE)
> > >> > return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > >> > len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > >>
> > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > >> allow the filesystems to default to this implementation.
> > >>
> > >> With this change, I guess that the calls to generic_copy_file_range() from
> > >> the different filesystems can be dropped, as in my initial patch, as they
> > >> will always get -EINVAL. The other option would be to set the
> > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > >> problem we're trying to solve.
> > >
> > > I don't understand the problem.
> > >
> > > What exactly is wrong with the code I suggested?
> > > Why should any filesystem be changed?
> > >
> > > Maybe I am missing something.
> >
> > Ok, I have to do a full brain reboot and start all over.
> >
> > Before that, I picked the code you suggested and tested it. I've mounted
> > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > using /sys/kernel/debug/sched_features as source. The result was a
> > 0-sized file in cephfs. And the reason is thevfs_copy_file_range()
> > early exit in:
> >
> > if (len == 0)
> > return 0;
> >
> > 'len' is set in generic_copy_file_checks().
>
> Good point.. I guess we will need to do all the checks earlier in
> generic_copy_file_checks() including the logic of:
>
> if (file_in->f_op->remap_file_range &&
> file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
>
>
> >
> > This means that we're not solving the original problem anymore (probably
> > since v1 of this patch, haven't checked).
> >
> > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > from, say, an XFS formatted partition to an ext4 partition". Isn't that
> > *exactly* what we're trying to do here? I.e. _prevent_ these copies from
> > happening so that tracefs files can't be CFR'ed?
> >
>
> We want to address the report which means calls coming from
> copy_file_range() syscall.
>
> Trond's use case is vfs_copy_file_range() coming from nfsd.
> When he writes about copy from XFS to ext4, he means an
> NFS client is issuing server side copy (on same or different NFS mounts)
> and the NFS server is executing nfsd_copy_file_range() on a source
> file that happens to be on XFS and destination happens to be on ext4.

NFS also supports a server-to-server copy where the destination server
mounts the source server and reads the data to be copied. Please don't
break that either :)

Anna

>
> We can undo the copy_file_range() syscall change of behavior from
> v5.3 without regressing the NFS use case.
>
> We just need to be careful and look at all the affected code paths.
>
> Thanks,
> Amir.

2021-02-16 19:33:06

by Steve French

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Tue, Feb 16, 2021 at 1:29 PM Anna Schumaker
<[email protected]> wrote:
>
> On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <[email protected]> wrote:
> >
> > On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <[email protected]> wrote:
> > >
> > > Amir Goldstein <[email protected]> writes:
> > >
> > > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <[email protected]> wrote:
> > > >>
> > > >> Amir Goldstein <[email protected]> writes:
> > > >>
> > > >> >> Ugh. And I guess overlayfs may have a similar problem.
> > > >> >
> > > >> > Not exactly.
> > > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > > >> > with the flags it got from layer above, so if called from nfsd it
> > > >> > will allow cross fs copy and when called from syscall it won't.
> > > >> >
> > > >> > There are some corner cases where overlayfs could benefit from
> > > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > > >> > let's leave those for now. Just leave overlayfs code as is.
> > > >>
> > > >> Got it, thanks for clarifying.
> > > >>
> > > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > > >> >> > is internal to kernel users.
> > > >> >> >
> > > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > > >> >> > for improvements to nfsd_copy_file_range().
> > > >> >> >
> > > >> >> > We can move the check out to copy_file_range syscall:
> > > >> >> >
> > > >> >> > if (flags != 0)
> > > >> >> > return -EINVAL;
> > > >> >> >
> > > >> >> > Leave the fallback from all filesystems and check for the
> > > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > > >> >>
> > > >> >> Ok, the diff bellow is just to make sure I understood your suggestion.
> > > >> >>
> > > >> >> The patch will also need to:
> > > >> >>
> > > >> >> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > > >> >> use the new flag.
> > > >> >>
> > > >> >> - check flags in generic_copy_file_checks() to make sure only valid flags
> > > >> >> are used (COPY_FILE_SPLICE at the moment).
> > > >> >>
> > > >> >> Also, where should this flag be defined? include/uapi/linux/fs.h?
> > > >> >
> > > >> > Grep for REMAP_FILE_
> > > >> > Same header file, same Documentation rst file.
> > > >> >
> > > >> >>
> > > >> >> Cheers,
> > > >> >> --
> > > >> >> Luis
> > > >> >>
> > > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > > >> >> index 75f764b43418..341d315d2a96 100644
> > > >> >> --- a/fs/read_write.c
> > > >> >> +++ b/fs/read_write.c
> > > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > > >> >> struct file *file_out, loff_t pos_out,
> > > >> >> size_t len, unsigned int flags)
> > > >> >> {
> > > >> >> + if (!(flags & COPY_FILE_SPLICE)) {
> > > >> >> + if (!file_out->f_op->copy_file_range)
> > > >> >> + return -EOPNOTSUPP;
> > > >> >> + else if (file_out->f_op->copy_file_range !=
> > > >> >> + file_in->f_op->copy_file_range)
> > > >> >> + return -EXDEV;
> > > >> >> + }
> > > >> >
> > > >> > That looks strange, because you are duplicating the logic in
> > > >> > do_copy_file_range(). Maybe better:
> > > >> >
> > > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > > >> > return -EINVAL;
> > > >> > if (flags & COPY_FILE_SPLICE)
> > > >> > return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > >> > len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > >>
> > > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > > >> allow the filesystems to default to this implementation.
> > > >>
> > > >> With this change, I guess that the calls to generic_copy_file_range() from
> > > >> the different filesystems can be dropped, as in my initial patch, as they
> > > >> will always get -EINVAL. The other option would be to set the
> > > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > > >> problem we're trying to solve.
> > > >
> > > > I don't understand the problem.
> > > >
> > > > What exactly is wrong with the code I suggested?
> > > > Why should any filesystem be changed?
> > > >
> > > > Maybe I am missing something.
> > >
> > > Ok, I have to do a full brain reboot and start all over.
> > >
> > > Before that, I picked the code you suggested and tested it. I've mounted
> > > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > > using /sys/kernel/debug/sched_features as source. The result was a
> > > 0-sized file in cephfs. And the reason is thevfs_copy_file_range()
> > > early exit in:
> > >
> > > if (len == 0)
> > > return 0;
> > >
> > > 'len' is set in generic_copy_file_checks().
> >
> > Good point.. I guess we will need to do all the checks earlier in
> > generic_copy_file_checks() including the logic of:
> >
> > if (file_in->f_op->remap_file_range &&
> > file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
> >
> >
> > >
> > > This means that we're not solving the original problem anymore (probably
> > > since v1 of this patch, haven't checked).
> > >
> > > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > > from, say, an XFS formatted partition to an ext4 partition". Isn't that
> > > *exactly* what we're trying to do here? I.e. _prevent_ these copies from
> > > happening so that tracefs files can't be CFR'ed?
> > >
> >
> > We want to address the report which means calls coming from
> > copy_file_range() syscall.
> >
> > Trond's use case is vfs_copy_file_range() coming from nfsd.
> > When he writes about copy from XFS to ext4, he means an
> > NFS client is issuing server side copy (on same or different NFS mounts)
> > and the NFS server is executing nfsd_copy_file_range() on a source
> > file that happens to be on XFS and destination happens to be on ext4.
>
> NFS also supports a server-to-server copy where the destination server
> mounts the source server and reads the data to be copied. Please don't
> break that either :)

This is a case we will eventually need to support for cifs (SMB3) as well.


--
Thanks,

Steve

2021-02-16 19:42:13

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Tue, Feb 16, 2021 at 9:31 PM Steve French <[email protected]> wrote:
>
> On Tue, Feb 16, 2021 at 1:29 PM Anna Schumaker
> <[email protected]> wrote:
> >
> > On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <[email protected]> wrote:
> > >
> > > On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <[email protected]> wrote:
> > > >
> > > > Amir Goldstein <[email protected]> writes:
> > > >
> > > > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <[email protected]> wrote:
> > > > >>
> > > > >> Amir Goldstein <[email protected]> writes:
> > > > >>
> > > > >> >> Ugh. And I guess overlayfs may have a similar problem.
> > > > >> >
> > > > >> > Not exactly.
> > > > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > > > >> > with the flags it got from layer above, so if called from nfsd it
> > > > >> > will allow cross fs copy and when called from syscall it won't.
> > > > >> >
> > > > >> > There are some corner cases where overlayfs could benefit from
> > > > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > > > >> > let's leave those for now. Just leave overlayfs code as is.
> > > > >>
> > > > >> Got it, thanks for clarifying.
> > > > >>
> > > > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > > > >> >> > is internal to kernel users.
> > > > >> >> >
> > > > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > > > >> >> > for improvements to nfsd_copy_file_range().
> > > > >> >> >
> > > > >> >> > We can move the check out to copy_file_range syscall:
> > > > >> >> >
> > > > >> >> > if (flags != 0)
> > > > >> >> > return -EINVAL;
> > > > >> >> >
> > > > >> >> > Leave the fallback from all filesystems and check for the
> > > > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > > > >> >>
> > > > >> >> Ok, the diff bellow is just to make sure I understood your suggestion.
> > > > >> >>
> > > > >> >> The patch will also need to:
> > > > >> >>
> > > > >> >> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > > > >> >> use the new flag.
> > > > >> >>
> > > > >> >> - check flags in generic_copy_file_checks() to make sure only valid flags
> > > > >> >> are used (COPY_FILE_SPLICE at the moment).
> > > > >> >>
> > > > >> >> Also, where should this flag be defined? include/uapi/linux/fs.h?
> > > > >> >
> > > > >> > Grep for REMAP_FILE_
> > > > >> > Same header file, same Documentation rst file.
> > > > >> >
> > > > >> >>
> > > > >> >> Cheers,
> > > > >> >> --
> > > > >> >> Luis
> > > > >> >>
> > > > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > > > >> >> index 75f764b43418..341d315d2a96 100644
> > > > >> >> --- a/fs/read_write.c
> > > > >> >> +++ b/fs/read_write.c
> > > > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > > > >> >> struct file *file_out, loff_t pos_out,
> > > > >> >> size_t len, unsigned int flags)
> > > > >> >> {
> > > > >> >> + if (!(flags & COPY_FILE_SPLICE)) {
> > > > >> >> + if (!file_out->f_op->copy_file_range)
> > > > >> >> + return -EOPNOTSUPP;
> > > > >> >> + else if (file_out->f_op->copy_file_range !=
> > > > >> >> + file_in->f_op->copy_file_range)
> > > > >> >> + return -EXDEV;
> > > > >> >> + }
> > > > >> >
> > > > >> > That looks strange, because you are duplicating the logic in
> > > > >> > do_copy_file_range(). Maybe better:
> > > > >> >
> > > > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > > > >> > return -EINVAL;
> > > > >> > if (flags & COPY_FILE_SPLICE)
> > > > >> > return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > > >> > len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > > >>
> > > > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > > > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > > > >> allow the filesystems to default to this implementation.
> > > > >>
> > > > >> With this change, I guess that the calls to generic_copy_file_range() from
> > > > >> the different filesystems can be dropped, as in my initial patch, as they
> > > > >> will always get -EINVAL. The other option would be to set the
> > > > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > > > >> problem we're trying to solve.
> > > > >
> > > > > I don't understand the problem.
> > > > >
> > > > > What exactly is wrong with the code I suggested?
> > > > > Why should any filesystem be changed?
> > > > >
> > > > > Maybe I am missing something.
> > > >
> > > > Ok, I have to do a full brain reboot and start all over.
> > > >
> > > > Before that, I picked the code you suggested and tested it. I've mounted
> > > > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > > > using /sys/kernel/debug/sched_features as source. The result was a
> > > > 0-sized file in cephfs. And the reason is thevfs_copy_file_range()
> > > > early exit in:
> > > >
> > > > if (len == 0)
> > > > return 0;
> > > >
> > > > 'len' is set in generic_copy_file_checks().
> > >
> > > Good point.. I guess we will need to do all the checks earlier in
> > > generic_copy_file_checks() including the logic of:
> > >
> > > if (file_in->f_op->remap_file_range &&
> > > file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
> > >
> > >
> > > >
> > > > This means that we're not solving the original problem anymore (probably
> > > > since v1 of this patch, haven't checked).
> > > >
> > > > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > > > from, say, an XFS formatted partition to an ext4 partition". Isn't that
> > > > *exactly* what we're trying to do here? I.e. _prevent_ these copies from
> > > > happening so that tracefs files can't be CFR'ed?
> > > >
> > >
> > > We want to address the report which means calls coming from
> > > copy_file_range() syscall.
> > >
> > > Trond's use case is vfs_copy_file_range() coming from nfsd.
> > > When he writes about copy from XFS to ext4, he means an
> > > NFS client is issuing server side copy (on same or different NFS mounts)
> > > and the NFS server is executing nfsd_copy_file_range() on a source
> > > file that happens to be on XFS and destination happens to be on ext4.
> >
> > NFS also supports a server-to-server copy where the destination server
> > mounts the source server and reads the data to be copied. Please don't
> > break that either :)
>

As long as the copy is via nfsd_copy_file_range() and not from the syscall
it should not regress.

> This is a case we will eventually need to support for cifs (SMB3) as well.
>

samba already does server side copy very well without needing any support
from the kernel.

nfsd also doesn't *need* to use vfs_copy_file_range() it can use kernel APIs
like the loop in ovl_copy_up_data(). But it does, so we should not regress it.

samba/nfsd can try to use copy_file_range() and it will work if the
source/target
fs support it. Otherwise, the server can perfectly well do the copy via other
available interfaces, just like userspace copy tools.

Thanks,
Amir.

2021-02-16 21:17:36

by Steve French

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Tue, Feb 16, 2021 at 1:40 PM Amir Goldstein <[email protected]> wrote:
>
> On Tue, Feb 16, 2021 at 9:31 PM Steve French <[email protected]> wrote:
> >
> > On Tue, Feb 16, 2021 at 1:29 PM Anna Schumaker
> > <[email protected]> wrote:
> > >
> > > On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <[email protected]> wrote:
> > > >
> > > > On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <[email protected]> wrote:
> > > > >
> > > > > Amir Goldstein <[email protected]> writes:
> > > > >
> > > > > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <[email protected]> wrote:
> > > > > >>
> > > > > >> Amir Goldstein <[email protected]> writes:
> > > > > >>
> > > > > >> >> Ugh. And I guess overlayfs may have a similar problem.
> > > > > >> >
> > > > > >> > Not exactly.
> > > > > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > > > > >> > with the flags it got from layer above, so if called from nfsd it
> > > > > >> > will allow cross fs copy and when called from syscall it won't.
> > > > > >> >
> > > > > >> > There are some corner cases where overlayfs could benefit from
> > > > > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > > > > >> > let's leave those for now. Just leave overlayfs code as is.
> > > > > >>
> > > > > >> Got it, thanks for clarifying.
> > > > > >>
> > > > > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > > > > >> >> > is internal to kernel users.
> > > > > >> >> >
> > > > > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > > > > >> >> > for improvements to nfsd_copy_file_range().
> > > > > >> >> >
> > > > > >> >> > We can move the check out to copy_file_range syscall:
> > > > > >> >> >
> > > > > >> >> > if (flags != 0)
> > > > > >> >> > return -EINVAL;
> > > > > >> >> >
> > > > > >> >> > Leave the fallback from all filesystems and check for the
> > > > > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > > > > >> >>
> > > > > >> >> Ok, the diff bellow is just to make sure I understood your suggestion.
> > > > > >> >>
> > > > > >> >> The patch will also need to:
> > > > > >> >>
> > > > > >> >> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > > > > >> >> use the new flag.
> > > > > >> >>
> > > > > >> >> - check flags in generic_copy_file_checks() to make sure only valid flags
> > > > > >> >> are used (COPY_FILE_SPLICE at the moment).
> > > > > >> >>
> > > > > >> >> Also, where should this flag be defined? include/uapi/linux/fs.h?
> > > > > >> >
> > > > > >> > Grep for REMAP_FILE_
> > > > > >> > Same header file, same Documentation rst file.
> > > > > >> >
> > > > > >> >>
> > > > > >> >> Cheers,
> > > > > >> >> --
> > > > > >> >> Luis
> > > > > >> >>
> > > > > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > > > > >> >> index 75f764b43418..341d315d2a96 100644
> > > > > >> >> --- a/fs/read_write.c
> > > > > >> >> +++ b/fs/read_write.c
> > > > > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > > > > >> >> struct file *file_out, loff_t pos_out,
> > > > > >> >> size_t len, unsigned int flags)
> > > > > >> >> {
> > > > > >> >> + if (!(flags & COPY_FILE_SPLICE)) {
> > > > > >> >> + if (!file_out->f_op->copy_file_range)
> > > > > >> >> + return -EOPNOTSUPP;
> > > > > >> >> + else if (file_out->f_op->copy_file_range !=
> > > > > >> >> + file_in->f_op->copy_file_range)
> > > > > >> >> + return -EXDEV;
> > > > > >> >> + }
> > > > > >> >
> > > > > >> > That looks strange, because you are duplicating the logic in
> > > > > >> > do_copy_file_range(). Maybe better:
> > > > > >> >
> > > > > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > > > > >> > return -EINVAL;
> > > > > >> > if (flags & COPY_FILE_SPLICE)
> > > > > >> > return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > > > >> > len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > > > >>
> > > > > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > > > > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > > > > >> allow the filesystems to default to this implementation.
> > > > > >>
> > > > > >> With this change, I guess that the calls to generic_copy_file_range() from
> > > > > >> the different filesystems can be dropped, as in my initial patch, as they
> > > > > >> will always get -EINVAL. The other option would be to set the
> > > > > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > > > > >> problem we're trying to solve.
> > > > > >
> > > > > > I don't understand the problem.
> > > > > >
> > > > > > What exactly is wrong with the code I suggested?
> > > > > > Why should any filesystem be changed?
> > > > > >
> > > > > > Maybe I am missing something.
> > > > >
> > > > > Ok, I have to do a full brain reboot and start all over.
> > > > >
> > > > > Before that, I picked the code you suggested and tested it. I've mounted
> > > > > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > > > > using /sys/kernel/debug/sched_features as source. The result was a
> > > > > 0-sized file in cephfs. And the reason is thevfs_copy_file_range()
> > > > > early exit in:
> > > > >
> > > > > if (len == 0)
> > > > > return 0;
> > > > >
> > > > > 'len' is set in generic_copy_file_checks().
> > > >
> > > > Good point.. I guess we will need to do all the checks earlier in
> > > > generic_copy_file_checks() including the logic of:
> > > >
> > > > if (file_in->f_op->remap_file_range &&
> > > > file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
> > > >
> > > >
> > > > >
> > > > > This means that we're not solving the original problem anymore (probably
> > > > > since v1 of this patch, haven't checked).
> > > > >
> > > > > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > > > > from, say, an XFS formatted partition to an ext4 partition". Isn't that
> > > > > *exactly* what we're trying to do here? I.e. _prevent_ these copies from
> > > > > happening so that tracefs files can't be CFR'ed?
> > > > >
> > > >
> > > > We want to address the report which means calls coming from
> > > > copy_file_range() syscall.
> > > >
> > > > Trond's use case is vfs_copy_file_range() coming from nfsd.
> > > > When he writes about copy from XFS to ext4, he means an
> > > > NFS client is issuing server side copy (on same or different NFS mounts)
> > > > and the NFS server is executing nfsd_copy_file_range() on a source
> > > > file that happens to be on XFS and destination happens to be on ext4.
> > >
> > > NFS also supports a server-to-server copy where the destination server
> > > mounts the source server and reads the data to be copied. Please don't
> > > break that either :)
> >
>
> As long as the copy is via nfsd_copy_file_range() and not from the syscall
> it should not regress.
>
> > This is a case we will eventually need to support for cifs (SMB3) as well.
> >
>
> samba already does server side copy very well without needing any support
> from the kernel.
>
> nfsd also doesn't *need* to use vfs_copy_file_range() it can use kernel APIs
> like the loop in ovl_copy_up_data(). But it does, so we should not regress it.
>
> samba/nfsd can try to use copy_file_range() and it will work if the
> source/target
> fs support it. Otherwise, the server can perfectly well do the copy via other
> available interfaces, just like userspace copy tools.

I was thinking about cifsd ("ksmbd") the kernel server from
Namjae/Sergey etc. which is making excellent progress.

--
Thanks,

Steve

2021-02-17 04:48:34

by Nicolas Boichat

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Mon, Feb 15, 2021 at 11:42 PM Luis Henriques <[email protected]> wrote:
>
> Nicolas Boichat reported an issue when trying to use the copy_file_range
> syscall on a tracefs file. It failed silently because the file content is
> generated on-the-fly (reporting a size of zero) and copy_file_range needs
> to know in advance how much data is present.

Not sure if you have the whole history, these links and discussion can
help if you want to expand on the commit message:
[1] http://issuetracker.google.com/issues/178332739
[2] https://lkml.org/lkml/2021/1/25/64
[3] https://lkml.org/lkml/2021/1/26/1736
[4] https://patchwork.kernel.org/project/linux-fsdevel/cover/[email protected]/

> This commit restores the cross-fs restrictions that existed prior to
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") and
> removes generic_copy_file_range() calls from ceph, cifs, fuse, and nfs.

It goes beyond that, I think this also prevents copies within the same
FS if copy_file_range is not implemented. Which is IMHO a good thing
since this has been broken on procfs and friends ever since
copy_file_range was implemented (but I assume that nobody ever hit
that before cross-fs became available).

>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
> Cc: Nicolas Boichat <[email protected]>

You could replace that with Reported-by: Nicolas Boichat <[email protected]>

> Signed-off-by: Luis Henriques <[email protected]>
> ---
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
> fs/ceph/file.c | 21 +++-----------------
> fs/cifs/cifsfs.c | 3 ---
> fs/fuse/file.c | 21 +++-----------------
> fs/nfs/nfs4file.c | 20 +++----------------
> fs/read_write.c | 49 ++++++++++------------------------------------
> include/linux/fs.h | 3 ---
> 6 files changed, 19 insertions(+), 98 deletions(-)
>
[snip]
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..b217cd62ae0d 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1358,40 +1358,12 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
> }
> #endif
>
> -/**
> - * generic_copy_file_range - copy data between two files
> - * @file_in: file structure to read from
> - * @pos_in: file offset to read from
> - * @file_out: file structure to write data to
> - * @pos_out: file offset to write data to
> - * @len: amount of data to copy
> - * @flags: copy flags
> - *
> - * This is a generic filesystem helper to copy data from one file to another.
> - * It has no constraints on the source or destination file owners - the files
> - * can belong to different superblocks and different filesystem types. Short
> - * copies are allowed.
> - *
> - * This should be called from the @file_out filesystem, as per the
> - * ->copy_file_range() method.
> - *
> - * Returns the number of bytes copied or a negative error indicating the
> - * failure.
> - */
> -
> -ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> - struct file *file_out, loff_t pos_out,
> - size_t len, unsigned int flags)
> -{
> - return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> - len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> -}
> -EXPORT_SYMBOL(generic_copy_file_range);
> -
> static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> struct file *file_out, loff_t pos_out,
> size_t len, unsigned int flags)
> {
> + ssize_t ret = -EXDEV;
> +
> /*
> * Although we now allow filesystems to handle cross sb copy, passing
> * a file of the wrong filesystem type to filesystem driver can result
> @@ -1400,14 +1372,14 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> * several different file_system_type structures, but they all end up
> * using the same ->copy_file_range() function pointer.
> */
> - if (file_out->f_op->copy_file_range &&
> - file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> - return file_out->f_op->copy_file_range(file_in, pos_in,
> - file_out, pos_out,
> - len, flags);
> + if (!file_out->f_op->copy_file_range)
> + ret = -EOPNOTSUPP;

This doesn't work as the 0-filesize check is done before that in
vfs_copy_file_range (so the syscall still returns 0, works fine if you
comment out `if (len == 0)`).

Also, you need to check for file_in->f_op->copy_file_range instead,
the problem is if the _input_ filesystem doesn't report sizes or can't
seek properly.

> + else if (file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
> + ret = file_out->f_op->copy_file_range(file_in, pos_in,
> + file_out, pos_out,
> + len, flags);
>
> - return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> - flags);
> + return ret;
> }
>
> /*
> @@ -1514,8 +1486,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> }
>
> ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> - flags);
> - WARN_ON_ONCE(ret == -EOPNOTSUPP);
> + flags);
> done:
> if (ret > 0) {
> fsnotify_access(file_in);

2021-02-17 08:11:28

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Tue, Feb 16, 2021 at 11:15 PM Steve French <[email protected]> wrote:
>
> On Tue, Feb 16, 2021 at 1:40 PM Amir Goldstein <[email protected]> wrote:
> >
> > On Tue, Feb 16, 2021 at 9:31 PM Steve French <[email protected]> wrote:
> > >
> > > On Tue, Feb 16, 2021 at 1:29 PM Anna Schumaker
> > > <[email protected]> wrote:
> > > >
> > > > On Tue, Feb 16, 2021 at 2:22 PM Amir Goldstein <[email protected]> wrote:
> > > > >
> > > > > On Tue, Feb 16, 2021 at 8:54 PM Luis Henriques <[email protected]> wrote:
> > > > > >
> > > > > > Amir Goldstein <[email protected]> writes:
> > > > > >
> > > > > > > On Tue, Feb 16, 2021 at 6:41 PM Luis Henriques <[email protected]> wrote:
> > > > > > >>
> > > > > > >> Amir Goldstein <[email protected]> writes:
> > > > > > >>
> > > > > > >> >> Ugh. And I guess overlayfs may have a similar problem.
> > > > > > >> >
> > > > > > >> > Not exactly.
> > > > > > >> > Generally speaking, overlayfs should call vfs_copy_file_range()
> > > > > > >> > with the flags it got from layer above, so if called from nfsd it
> > > > > > >> > will allow cross fs copy and when called from syscall it won't.
> > > > > > >> >
> > > > > > >> > There are some corner cases where overlayfs could benefit from
> > > > > > >> > COPY_FILE_SPLICE (e.g. copy from lower file to upper file), but
> > > > > > >> > let's leave those for now. Just leave overlayfs code as is.
> > > > > > >>
> > > > > > >> Got it, thanks for clarifying.
> > > > > > >>
> > > > > > >> >> > This is easy to solve with a flag COPY_FILE_SPLICE (or something) that
> > > > > > >> >> > is internal to kernel users.
> > > > > > >> >> >
> > > > > > >> >> > FWIW, you may want to look at the loop in ovl_copy_up_data()
> > > > > > >> >> > for improvements to nfsd_copy_file_range().
> > > > > > >> >> >
> > > > > > >> >> > We can move the check out to copy_file_range syscall:
> > > > > > >> >> >
> > > > > > >> >> > if (flags != 0)
> > > > > > >> >> > return -EINVAL;
> > > > > > >> >> >
> > > > > > >> >> > Leave the fallback from all filesystems and check for the
> > > > > > >> >> > COPY_FILE_SPLICE flag inside generic_copy_file_range().
> > > > > > >> >>
> > > > > > >> >> Ok, the diff bellow is just to make sure I understood your suggestion.
> > > > > > >> >>
> > > > > > >> >> The patch will also need to:
> > > > > > >> >>
> > > > > > >> >> - change nfs and overlayfs calls to vfs_copy_file_range() so that they
> > > > > > >> >> use the new flag.
> > > > > > >> >>
> > > > > > >> >> - check flags in generic_copy_file_checks() to make sure only valid flags
> > > > > > >> >> are used (COPY_FILE_SPLICE at the moment).
> > > > > > >> >>
> > > > > > >> >> Also, where should this flag be defined? include/uapi/linux/fs.h?
> > > > > > >> >
> > > > > > >> > Grep for REMAP_FILE_
> > > > > > >> > Same header file, same Documentation rst file.
> > > > > > >> >
> > > > > > >> >>
> > > > > > >> >> Cheers,
> > > > > > >> >> --
> > > > > > >> >> Luis
> > > > > > >> >>
> > > > > > >> >> diff --git a/fs/read_write.c b/fs/read_write.c
> > > > > > >> >> index 75f764b43418..341d315d2a96 100644
> > > > > > >> >> --- a/fs/read_write.c
> > > > > > >> >> +++ b/fs/read_write.c
> > > > > > >> >> @@ -1383,6 +1383,13 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
> > > > > > >> >> struct file *file_out, loff_t pos_out,
> > > > > > >> >> size_t len, unsigned int flags)
> > > > > > >> >> {
> > > > > > >> >> + if (!(flags & COPY_FILE_SPLICE)) {
> > > > > > >> >> + if (!file_out->f_op->copy_file_range)
> > > > > > >> >> + return -EOPNOTSUPP;
> > > > > > >> >> + else if (file_out->f_op->copy_file_range !=
> > > > > > >> >> + file_in->f_op->copy_file_range)
> > > > > > >> >> + return -EXDEV;
> > > > > > >> >> + }
> > > > > > >> >
> > > > > > >> > That looks strange, because you are duplicating the logic in
> > > > > > >> > do_copy_file_range(). Maybe better:
> > > > > > >> >
> > > > > > >> > if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> > > > > > >> > return -EINVAL;
> > > > > > >> > if (flags & COPY_FILE_SPLICE)
> > > > > > >> > return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > > > > >> > len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > > > > >>
> > > > > > >> My initial reasoning for duplicating the logic in do_copy_file_range() was
> > > > > > >> to allow the generic_copy_file_range() callers to be left unmodified and
> > > > > > >> allow the filesystems to default to this implementation.
> > > > > > >>
> > > > > > >> With this change, I guess that the calls to generic_copy_file_range() from
> > > > > > >> the different filesystems can be dropped, as in my initial patch, as they
> > > > > > >> will always get -EINVAL. The other option would be to set the
> > > > > > >> COPY_FILE_SPLICE flag in those calls, but that would get us back to the
> > > > > > >> problem we're trying to solve.
> > > > > > >
> > > > > > > I don't understand the problem.
> > > > > > >
> > > > > > > What exactly is wrong with the code I suggested?
> > > > > > > Why should any filesystem be changed?
> > > > > > >
> > > > > > > Maybe I am missing something.
> > > > > >
> > > > > > Ok, I have to do a full brain reboot and start all over.
> > > > > >
> > > > > > Before that, I picked the code you suggested and tested it. I've mounted
> > > > > > a cephfs filesystem and used xfs_io to execute a 'copy_range' command
> > > > > > using /sys/kernel/debug/sched_features as source. The result was a
> > > > > > 0-sized file in cephfs. And the reason is thevfs_copy_file_range()
> > > > > > early exit in:
> > > > > >
> > > > > > if (len == 0)
> > > > > > return 0;
> > > > > >
> > > > > > 'len' is set in generic_copy_file_checks().
> > > > >
> > > > > Good point.. I guess we will need to do all the checks earlier in
> > > > > generic_copy_file_checks() including the logic of:
> > > > >
> > > > > if (file_in->f_op->remap_file_range &&
> > > > > file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)
> > > > >
> > > > >
> > > > > >
> > > > > > This means that we're not solving the original problem anymore (probably
> > > > > > since v1 of this patch, haven't checked).
> > > > > >
> > > > > > Also, re-reading Trond's emails, I read: "... also disallowing the copy
> > > > > > from, say, an XFS formatted partition to an ext4 partition". Isn't that
> > > > > > *exactly* what we're trying to do here? I.e. _prevent_ these copies from
> > > > > > happening so that tracefs files can't be CFR'ed?
> > > > > >
> > > > >
> > > > > We want to address the report which means calls coming from
> > > > > copy_file_range() syscall.
> > > > >
> > > > > Trond's use case is vfs_copy_file_range() coming from nfsd.
> > > > > When he writes about copy from XFS to ext4, he means an
> > > > > NFS client is issuing server side copy (on same or different NFS mounts)
> > > > > and the NFS server is executing nfsd_copy_file_range() on a source
> > > > > file that happens to be on XFS and destination happens to be on ext4.
> > > >
> > > > NFS also supports a server-to-server copy where the destination server
> > > > mounts the source server and reads the data to be copied. Please don't
> > > > break that either :)
> > >
> >
> > As long as the copy is via nfsd_copy_file_range() and not from the syscall
> > it should not regress.
> >
> > > This is a case we will eventually need to support for cifs (SMB3) as well.
> > >
> >
> > samba already does server side copy very well without needing any support
> > from the kernel.
> >
> > nfsd also doesn't *need* to use vfs_copy_file_range() it can use kernel APIs
> > like the loop in ovl_copy_up_data(). But it does, so we should not regress it.
> >
> > samba/nfsd can try to use copy_file_range() and it will work if the
> > source/target
> > fs support it. Otherwise, the server can perfectly well do the copy via other
> > available interfaces, just like userspace copy tools.
>
> I was thinking about cifsd ("ksmbd") the kernel server from
> Namjae/Sergey etc. which is making excellent progress.
>

You are missing my point.
Never mind which server. The server does not *need* to rely on
vfs_copy_file_range() to copy files from XFS to ext4.
The server is very capable of implementing the fallback generic copy
in case source/target fs do not support native {copy,remap}_file_range().

w.r.t semantics of copy_file_range() syscall vs. the fallback to userespace
'cp' tool (check source file size before copy or not), please note that the
semantics of CIFS_IOC_COPYCHUNK_FILE are that of the former:

rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
src_inode->i_size, 0);

It will copy zero bytes if advertised source file size if zero.

NFS server side copy semantics are currently de-facto the same
because both the client and the server will have to pass through this
line in vfs_copy_file_range():

if (len == 0)
return 0;

IMO, and this opinion was voiced by several other filesystem developers,
the shortend copy semantics are the correct semantics for copy_file_range()
syscall as well as for vfs_copy_file_range() for internal kernel users.

I guess what this means is that if the 'cp' tool ever tries an opportunistic
copy_file_range() syscall (e.g. --cfr=auto), it may result in zero size copy.

Thanks,
Amir.

2021-02-17 20:30:12

by Luis Henriques

[permalink] [raw]
Subject: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies

A regression has been reported by Nicolas Boichat, found while using the
copy_file_range syscall to copy a tracefs file. Before commit
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
kernel would return -EXDEV to userspace when trying to copy a file across
different filesystems. After this commit, the syscall doesn't fail anymore
and instead returns zero (zero bytes copied), as this file's content is
generated on-the-fly and thus reports a size of zero.

This patch restores some cross-filesystems copy restrictions that existed
prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
devices"). It also introduces a flag (COPY_FILE_SPLICE) that can be used
by filesystems calling directly into the vfs copy_file_range to override
these restrictions. Right now, only NFS needs to set this flag.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
Reported-by: Nicolas Boichat <[email protected]>
Signed-off-by: Luis Henriques <[email protected]>
---
Ok, I've tried to address all the issues and comments. Hopefully this v3
is a bit closer to the final fix.

Changes since v2
- do all the required checks earlier, in generic_copy_file_checks(),
adding new checks for ->remap_file_range
- new COPY_FILE_SPLICE flag
- don't remove filesystem's fallback to generic_copy_file_range()
- updated commit changelog (and subject)
Changes since v1 (after Amir review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description

fs/nfsd/vfs.c | 3 ++-
fs/read_write.c | 44 +++++++++++++++++++++++++++++++++++++++++---
include/linux/fs.h | 7 +++++++
3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..14e55822c223 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -578,7 +578,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
* limit like this and pipeline multiple COPY requests.
*/
count = min_t(u64, count, 1 << 22);
- return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+ return vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
+ COPY_FILE_SPLICE);
}

__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..40a16003fb05 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1410,6 +1410,33 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
flags);
}

+/*
+ * This helper function checks whether copy_file_range can actually be used,
+ * depending on the source and destination filesystems being the same.
+ *
+ * In-kernel callers may set COPY_FILE_SPLICE to override these checks.
+ */
+static int fops_copy_file_checks(struct file *file_in, struct file *file_out,
+ unsigned int flags)
+{
+ if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
+ return -EINVAL;
+
+ if (flags & COPY_FILE_SPLICE)
+ return 0;
+ /*
+ * We got here from userspace, so forbid copies if copy_file_range isn't
+ * implemented or if we're doing a cross-fs copy.
+ */
+ if (!file_out->f_op->copy_file_range)
+ return -EOPNOTSUPP;
+ else if (file_out->f_op->copy_file_range !=
+ file_in->f_op->copy_file_range)
+ return -EXDEV;
+
+ return 0;
+}
+
/*
* Performs necessary checks before doing a file copy
*
@@ -1427,6 +1454,14 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
loff_t size_in;
int ret;

+ /* Only check f_ops if we're not trying to clone */
+ if (!file_in->f_op->remap_file_range ||
+ (file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)) {
+ ret = fops_copy_file_checks(file_in, file_out, flags);
+ if (ret)
+ return ret;
+ }
+
ret = generic_file_rw_checks(file_in, file_out);
if (ret)
return ret;
@@ -1474,9 +1509,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
{
ssize_t ret;

- if (flags != 0)
- return -EINVAL;
-
ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
flags);
if (unlikely(ret))
@@ -1511,6 +1543,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
ret = cloned;
goto done;
}
+ ret = fops_copy_file_checks(file_in, file_out, flags);
+ if (ret)
+ return ret;
}

ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
@@ -1543,6 +1578,9 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
struct fd f_out;
ssize_t ret = -EBADF;

+ if (flags != 0)
+ return -EINVAL;
+
f_in = fdget(fd_in);
if (!f_in.file)
goto out2;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd47deea7c17..6f604926d955 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1815,6 +1815,13 @@ struct dir_context {
*/
#define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN)

+/*
+ * This flag control the behavior of copy_file_range from internal (kernel)
+ * users. It can be used to override the policy of forbidding copies when
+ * source and destination filesystems are different.
+ */
+#define COPY_FILE_SPLICE (1 << 0)
+
struct iov_iter;

struct file_operations {

2021-02-17 21:27:13

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies

On Wed, Feb 17, 2021 at 7:25 PM Luis Henriques <[email protected]> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file. Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems. After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystems copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices"). It also introduces a flag (COPY_FILE_SPLICE) that can be used
> by filesystems calling directly into the vfs copy_file_range to override
> these restrictions. Right now, only NFS needs to set this flag.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <[email protected]>
> Signed-off-by: Luis Henriques <[email protected]>
> ---
> Ok, I've tried to address all the issues and comments. Hopefully this v3
> is a bit closer to the final fix.
>
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
> adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
> fs/nfsd/vfs.c | 3 ++-
> fs/read_write.c | 44 +++++++++++++++++++++++++++++++++++++++++---
> include/linux/fs.h | 7 +++++++
> 3 files changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..14e55822c223 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -578,7 +578,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
> * limit like this and pipeline multiple COPY requests.
> */
> count = min_t(u64, count, 1 << 22);
> - return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> + return vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
> + COPY_FILE_SPLICE);
> }
>
> __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..40a16003fb05 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1410,6 +1410,33 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> flags);
> }
>
> +/*
> + * This helper function checks whether copy_file_range can actually be used,
> + * depending on the source and destination filesystems being the same.
> + *
> + * In-kernel callers may set COPY_FILE_SPLICE to override these checks.
> + */
> +static int fops_copy_file_checks(struct file *file_in, struct file *file_out,
> + unsigned int flags)
> +{
> + if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> + return -EINVAL;
> +
> + if (flags & COPY_FILE_SPLICE)
> + return 0;
> + /*
> + * We got here from userspace, so forbid copies if copy_file_range isn't
> + * implemented or if we're doing a cross-fs copy.
> + */

Suggest:

if (!file_in->f_op->copy_file_range) {
if (file_in->f_op->copy_file_range !=
file_out->f_op->copy_file_range)
return -EXDEV;
} else if (file_in->f_op->remap_file_range) {
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
} else {
return -EOPNOTSUPP;
}

return 0;
}

> +
> /*
> * Performs necessary checks before doing a file copy
> *
> @@ -1427,6 +1454,14 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
> loff_t size_in;
> int ret;
>
> + /* Only check f_ops if we're not trying to clone */
> + if (!file_in->f_op->remap_file_range ||
> + (file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)) {
> + ret = fops_copy_file_checks(file_in, file_out, flags);
> + if (ret)
> + return ret;
> + }
> +

and then you don't need this special casing of clone here.

> ret = generic_file_rw_checks(file_in, file_out);
> if (ret)
> return ret;
> @@ -1474,9 +1509,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> {
> ssize_t ret;
>
> - if (flags != 0)
> - return -EINVAL;
> -
> ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
> flags);
> if (unlikely(ret))
> @@ -1511,6 +1543,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> ret = cloned;
> goto done;
> }
> + ret = fops_copy_file_checks(file_in, file_out, flags);
> + if (ret)
> + return ret;

and you don't need this here (right?)

and you can remove the checks for same i_sb and same copy_file_range
op that were already tested from vfs_copy_file_range().

Hope I am not missing anything.

Thanks,
Amir.

2021-02-18 01:00:05

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Feb 17, 2021, at 1:08 AM, Amir Goldstein <[email protected]> wrote:
>
> You are missing my point.
> Never mind which server. The server does not *need* to rely on
> vfs_copy_file_range() to copy files from XFS to ext4.
> The server is very capable of implementing the fallback generic copy
> in case source/target fs do not support native {copy,remap}_file_range().
>
> w.r.t semantics of copy_file_range() syscall vs. the fallback to userespace
> 'cp' tool (check source file size before copy or not), please note that the
> semantics of CIFS_IOC_COPYCHUNK_FILE are that of the former:
>
> rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
> src_inode->i_size, 0);
>
> It will copy zero bytes if advertised source file size if zero.
>
> NFS server side copy semantics are currently de-facto the same
> because both the client and the server will have to pass through this
> line in vfs_copy_file_range():
>
> if (len == 0)
> return 0;
>
> IMO, and this opinion was voiced by several other filesystem developers,
> the shortend copy semantics are the correct semantics for copy_file_range()
> syscall as well as for vfs_copy_file_range() for internal kernel users.
>
> I guess what this means is that if the 'cp' tool ever tries an opportunistic
> copy_file_range() syscall (e.g. --cfr=auto), it may result in zero size copy.

Having a syscall that does the "wrong thing" when called on two files
doesn't make sense. Expecting userspace to check whether source/target
files supports CFR is also not practical. This is trivial for the
kernel to determine and return -EOPNOTSUPP to the caller if the source
file (procfs/sysfs/etc) does not work with CFR properly.

Applications must already handle -EOPNOTSUPP with a fallback, but
expecting all applications that may call copy_file_range() to be
properly coded to handle corner cases is just asking for trouble.
That is doubly true given that an existing widely-used tool like
cp and mv are using this syscall if it is available in the kernel.

Cheers, Andreas






Attachments:
signature.asc (890.00 B)
Message signed with OpenPGP

2021-02-18 01:01:32

by Nicolas Boichat

[permalink] [raw]
Subject: Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies

On Thu, Feb 18, 2021 at 1:25 AM Luis Henriques <[email protected]> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file. Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems. After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystems copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices").

Note that you also fix intra-filesystem copy_file_range on these
generated filesystems. This is IMHO great, but needs to be mentioned
in the commit message.

> It also introduces a flag (COPY_FILE_SPLICE) that can be used
> by filesystems calling directly into the vfs copy_file_range to override
> these restrictions. Right now, only NFS needs to set this flag.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")

So technically this fixes something much older, presumably ever since
copy_file_range was introduced.

> Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <[email protected]>

Tested-by: Nicolas Boichat <[email protected]>
but I guess you should not add to the next revision, I'll keep testing
further revisions ,-)

> Signed-off-by: Luis Henriques <[email protected]>
> ---
> Ok, I've tried to address all the issues and comments. Hopefully this v3
> is a bit closer to the final fix.
>
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
> adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description
>
> fs/nfsd/vfs.c | 3 ++-
> fs/read_write.c | 44 +++++++++++++++++++++++++++++++++++++++++---
> include/linux/fs.h | 7 +++++++
> 3 files changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..14e55822c223 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -578,7 +578,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
> * limit like this and pipeline multiple COPY requests.
> */
> count = min_t(u64, count, 1 << 22);
> - return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> + return vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
> + COPY_FILE_SPLICE);
> }
>
> __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..40a16003fb05 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1410,6 +1410,33 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> flags);
> }
>
> +/*
> + * This helper function checks whether copy_file_range can actually be used,
> + * depending on the source and destination filesystems being the same.
> + *
> + * In-kernel callers may set COPY_FILE_SPLICE to override these checks.
> + */
> +static int fops_copy_file_checks(struct file *file_in, struct file *file_out,

fops_copy_file_range_checks ?

> + unsigned int flags)
> +{
> + if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> + return -EINVAL;
> +
> + if (flags & COPY_FILE_SPLICE)
> + return 0;
> + /*
> + * We got here from userspace, so forbid copies if copy_file_range isn't
> + * implemented or if we're doing a cross-fs copy.
> + */
> + if (!file_out->f_op->copy_file_range)
> + return -EOPNOTSUPP;

After this is merged, should this be added as an error code to the man page?

> + else if (file_out->f_op->copy_file_range !=
> + file_in->f_op->copy_file_range)

Just note, this could be a cross-fs copy (just not a cross-fs_type copy).

> + return -EXDEV;
> +
> + return 0;
> +}
> +
> /*
> * Performs necessary checks before doing a file copy
> *
> @@ -1427,6 +1454,14 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
> loff_t size_in;
> int ret;
>
> + /* Only check f_ops if we're not trying to clone */
> + if (!file_in->f_op->remap_file_range ||
> + (file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)) {
> + ret = fops_copy_file_checks(file_in, file_out, flags);
> + if (ret)
> + return ret;
> + }
> +
> ret = generic_file_rw_checks(file_in, file_out);
> if (ret)
> return ret;
> @@ -1474,9 +1509,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> {
> ssize_t ret;
>
> - if (flags != 0)
> - return -EINVAL;
> -
> ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
> flags);
> if (unlikely(ret))
> @@ -1511,6 +1543,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> ret = cloned;
> goto done;
> }
> + ret = fops_copy_file_checks(file_in, file_out, flags);
> + if (ret)
> + return ret;
> }
>
> ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> @@ -1543,6 +1578,9 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
> struct fd f_out;
> ssize_t ret = -EBADF;
>
> + if (flags != 0)
> + return -EINVAL;
> +
> f_in = fdget(fd_in);
> if (!f_in.file)
> goto out2;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index fd47deea7c17..6f604926d955 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1815,6 +1815,13 @@ struct dir_context {
> */
> #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN)
>
> +/*
> + * This flag control the behavior of copy_file_range from internal (kernel)
> + * users. It can be used to override the policy of forbidding copies when
> + * source and destination filesystems are different.
> + */
> +#define COPY_FILE_SPLICE (1 << 0)

nit: BIT(0) ?


> +
> struct iov_iter;
>
> struct file_operations {

2021-02-18 05:38:18

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies

On Wed, Feb 17, 2021 at 3:30 PM Luis Henriques <[email protected]> wrote:
>
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file. Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems. After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystems copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices"). It also introduces a flag (COPY_FILE_SPLICE) that can be used
> by filesystems calling directly into the vfs copy_file_range to override
> these restrictions. Right now, only NFS needs to set this flag.
>
> Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
> Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> Reported-by: Nicolas Boichat <[email protected]>
> Signed-off-by: Luis Henriques <[email protected]>
> ---
> Ok, I've tried to address all the issues and comments. Hopefully this v3
> is a bit closer to the final fix.
>
> Changes since v2
> - do all the required checks earlier, in generic_copy_file_checks(),
> adding new checks for ->remap_file_range
> - new COPY_FILE_SPLICE flag
> - don't remove filesystem's fallback to generic_copy_file_range()
> - updated commit changelog (and subject)
> Changes since v1 (after Amir review)
> - restored do_copy_file_range() helper
> - return -EOPNOTSUPP if fs doesn't implement CFR
> - updated commit description

In my testing, this patch breaks NFS server-to-server copy file.

>
> fs/nfsd/vfs.c | 3 ++-
> fs/read_write.c | 44 +++++++++++++++++++++++++++++++++++++++++---
> include/linux/fs.h | 7 +++++++
> 3 files changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 04937e51de56..14e55822c223 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -578,7 +578,8 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
> * limit like this and pipeline multiple COPY requests.
> */
> count = min_t(u64, count, 1 << 22);
> - return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
> + return vfs_copy_file_range(src, src_pos, dst, dst_pos, count,
> + COPY_FILE_SPLICE);
> }
>
> __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 75f764b43418..40a16003fb05 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1410,6 +1410,33 @@ static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
> flags);
> }
>
> +/*
> + * This helper function checks whether copy_file_range can actually be used,
> + * depending on the source and destination filesystems being the same.
> + *
> + * In-kernel callers may set COPY_FILE_SPLICE to override these checks.
> + */
> +static int fops_copy_file_checks(struct file *file_in, struct file *file_out,
> + unsigned int flags)
> +{
> + if (WARN_ON_ONCE(flags & ~COPY_FILE_SPLICE))
> + return -EINVAL;
> +
> + if (flags & COPY_FILE_SPLICE)
> + return 0;
> + /*
> + * We got here from userspace, so forbid copies if copy_file_range isn't
> + * implemented or if we're doing a cross-fs copy.
> + */
> + if (!file_out->f_op->copy_file_range)
> + return -EOPNOTSUPP;
> + else if (file_out->f_op->copy_file_range !=
> + file_in->f_op->copy_file_range)
> + return -EXDEV;
> +
> + return 0;
> +}
> +
> /*
> * Performs necessary checks before doing a file copy
> *
> @@ -1427,6 +1454,14 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
> loff_t size_in;
> int ret;
>
> + /* Only check f_ops if we're not trying to clone */
> + if (!file_in->f_op->remap_file_range ||
> + (file_inode(file_in)->i_sb == file_inode(file_out)->i_sb)) {
> + ret = fops_copy_file_checks(file_in, file_out, flags);
> + if (ret)
> + return ret;
> + }
> +
> ret = generic_file_rw_checks(file_in, file_out);
> if (ret)
> return ret;
> @@ -1474,9 +1509,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> {
> ssize_t ret;
>
> - if (flags != 0)
> - return -EINVAL;
> -
> ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
> flags);
> if (unlikely(ret))
> @@ -1511,6 +1543,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> ret = cloned;
> goto done;
> }
> + ret = fops_copy_file_checks(file_in, file_out, flags);
> + if (ret)
> + return ret;
> }
>
> ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
> @@ -1543,6 +1578,9 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
> struct fd f_out;
> ssize_t ret = -EBADF;
>
> + if (flags != 0)
> + return -EINVAL;
> +
> f_in = fdget(fd_in);
> if (!f_in.file)
> goto out2;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index fd47deea7c17..6f604926d955 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1815,6 +1815,13 @@ struct dir_context {
> */
> #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN)
>
> +/*
> + * This flag control the behavior of copy_file_range from internal (kernel)
> + * users. It can be used to override the policy of forbidding copies when
> + * source and destination filesystems are different.
> + */
> +#define COPY_FILE_SPLICE (1 << 0)
> +
> struct iov_iter;
>
> struct file_operations {

2021-02-18 07:02:44

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies

On Thu, Feb 18, 2021 at 7:33 AM Olga Kornievskaia <[email protected]> wrote:
>
> On Wed, Feb 17, 2021 at 3:30 PM Luis Henriques <[email protected]> wrote:
> >
> > A regression has been reported by Nicolas Boichat, found while using the
> > copy_file_range syscall to copy a tracefs file. Before commit
> > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > kernel would return -EXDEV to userspace when trying to copy a file across
> > different filesystems. After this commit, the syscall doesn't fail anymore
> > and instead returns zero (zero bytes copied), as this file's content is
> > generated on-the-fly and thus reports a size of zero.
> >
> > This patch restores some cross-filesystems copy restrictions that existed
> > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > devices"). It also introduces a flag (COPY_FILE_SPLICE) that can be used
> > by filesystems calling directly into the vfs copy_file_range to override
> > these restrictions. Right now, only NFS needs to set this flag.
> >
> > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
> > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> > Reported-by: Nicolas Boichat <[email protected]>
> > Signed-off-by: Luis Henriques <[email protected]>
> > ---
> > Ok, I've tried to address all the issues and comments. Hopefully this v3
> > is a bit closer to the final fix.
> >
> > Changes since v2
> > - do all the required checks earlier, in generic_copy_file_checks(),
> > adding new checks for ->remap_file_range
> > - new COPY_FILE_SPLICE flag
> > - don't remove filesystem's fallback to generic_copy_file_range()
> > - updated commit changelog (and subject)
> > Changes since v1 (after Amir review)
> > - restored do_copy_file_range() helper
> > - return -EOPNOTSUPP if fs doesn't implement CFR
> > - updated commit description
>
> In my testing, this patch breaks NFS server-to-server copy file.

Hi Olga,

Can you please provide more details on the failed tests.

Does it fail on the client between two nfs mounts or does it fail
on the server? If the latter, between which two filesystems on the server?

Thanks,
Amir.

2021-02-18 07:42:23

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Wed, Feb 17, 2021 at 05:50:35PM -0700, Andreas Dilger wrote:
> On Feb 17, 2021, at 1:08 AM, Amir Goldstein <[email protected]> wrote:
> >
> > You are missing my point.
> > Never mind which server. The server does not *need* to rely on
> > vfs_copy_file_range() to copy files from XFS to ext4.
> > The server is very capable of implementing the fallback generic copy
> > in case source/target fs do not support native {copy,remap}_file_range().
> >
> > w.r.t semantics of copy_file_range() syscall vs. the fallback to userespace
> > 'cp' tool (check source file size before copy or not), please note that the
> > semantics of CIFS_IOC_COPYCHUNK_FILE are that of the former:
> >
> > rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
> > src_inode->i_size, 0);
> >
> > It will copy zero bytes if advertised source file size if zero.
> >
> > NFS server side copy semantics are currently de-facto the same
> > because both the client and the server will have to pass through this
> > line in vfs_copy_file_range():
> >
> > if (len == 0)
> > return 0;
> >
> > IMO, and this opinion was voiced by several other filesystem developers,
> > the shortend copy semantics are the correct semantics for copy_file_range()
> > syscall as well as for vfs_copy_file_range() for internal kernel users.
> >
> > I guess what this means is that if the 'cp' tool ever tries an opportunistic
> > copy_file_range() syscall (e.g. --cfr=auto), it may result in zero size copy.
>
> Having a syscall that does the "wrong thing" when called on two files
> doesn't make sense. Expecting userspace to check whether source/target
> files supports CFR is also not practical. This is trivial for the
> kernel to determine and return -EOPNOTSUPP to the caller if the source
> file (procfs/sysfs/etc) does not work with CFR properly.

How does the kernel "know" that a specific file in a specific filesystem
will not work with CFR "properly"? That goes back to the original patch
which tried to label each and every filesystem type with a
"supported/not supported" type of flag, which was going to be a mess,
especially as it seems that this might be a file-specific thing, not a
filesystem-specific thing.

The goal of the patch _should_ be that the kernel figure it out itself,
but so far no one seems to be able to explain how that can be done :(

So, any hints?

thanks,

greg k-h

2021-02-18 07:47:50

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

Looks good:

Reviewed-by: Christoph Hellwig <[email protected]>

This whole idea of cross-device copie has always been a horrible idea,
and I've been arguing against it since the patches were posted.

2021-02-18 07:51:18

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies

On Wed, Feb 17, 2021 at 05:26:54PM +0000, Luis Henriques wrote:
> A regression has been reported by Nicolas Boichat, found while using the
> copy_file_range syscall to copy a tracefs file. Before commit
> 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> kernel would return -EXDEV to userspace when trying to copy a file across
> different filesystems. After this commit, the syscall doesn't fail anymore
> and instead returns zero (zero bytes copied), as this file's content is
> generated on-the-fly and thus reports a size of zero.
>
> This patch restores some cross-filesystems copy restrictions that existed
> prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> devices"). It also introduces a flag (COPY_FILE_SPLICE) that can be used
> by filesystems calling directly into the vfs copy_file_range to override
> these restrictions. Right now, only NFS needs to set this flag.

No need for the flag. Jyst fall back to splicing in the only caller
that wants it.

2021-02-18 10:14:12

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Thu, Feb 18, 2021 at 9:42 AM Christoph Hellwig <[email protected]> wrote:
>
> Looks good:
>
> Reviewed-by: Christoph Hellwig <[email protected]>
>
> This whole idea of cross-device copie has always been a horrible idea,
> and I've been arguing against it since the patches were posted.

Ok. I'm good with this v2 as well, but need to add the fallback to
do_splice_direct()
in nfsd_copy_file_range(), because this patch breaks it.

And the commit message of v3 is better in describing the reported issue.

Thanks,
Amir.

2021-02-18 12:58:15

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Thu, Feb 18, 2021 at 2:14 PM Luis Henriques <[email protected]> wrote:
>
> Luis Henriques <[email protected]> writes:
>
> > Amir Goldstein <[email protected]> writes:
> >
> >> On Thu, Feb 18, 2021 at 9:42 AM Christoph Hellwig <[email protected]> wrote:
> >>>
> >>> Looks good:
> >>>
> >>> Reviewed-by: Christoph Hellwig <[email protected]>
> >>>
> >>> This whole idea of cross-device copie has always been a horrible idea,
> >>> and I've been arguing against it since the patches were posted.
> >>
> >> Ok. I'm good with this v2 as well, but need to add the fallback to
> >> do_splice_direct()
> >> in nfsd_copy_file_range(), because this patch breaks it.
> >>
> >> And the commit message of v3 is better in describing the reported issue.
> >
> > Except that, as I said in a previous email, v2 doesn't really fix the
> > issue: all the checks need to be done earlier in generic_copy_file_checks().
> >
> > I'll work on getting v4, based on v2 and but moving the checks and
> > implementing your review suggestions to v3 (plus this nfs change).
>
> There's something else:
>
> The filesystems (nfs, ceph, cifs, fuse) rely on the fallback to
> generic_copy_file_range() if something's wrong. And this "something's
> wrong" is fs specific. For example: in ceph it is possible to offload the
> file copy to the OSDs even if the files are in different filesystems as
> long as these filesystems are on the *same* ceph cluster. If the copy
> being done is across two different clusters, then the copy reverts to
> splice. This means that the boilerplate code being removed in v2 of this
> patch needs to be restored and replace by:
>
> ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
> len, flags);
>
> if (ret == -EOPNOTSUPP || ret == -EXDEV)
> ret = do_splice_direct(src_file, &src_off, dst_file, &dst_off,
> len > MAX_RW_COUNT ? MAX_RW_COUNT : len,
> flags);
> return ret;
>

Why not leave the filesystem code as is and leave the
generic_copy_file_range() helper? Less churn.

Then nfsd_copy_file_range() can also fallback to generic_copy_file_range().

Thanks,
Amir.

2021-02-18 13:03:21

by Luis Henriques

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

Luis Henriques <[email protected]> writes:

> Amir Goldstein <[email protected]> writes:
>
>> On Thu, Feb 18, 2021 at 9:42 AM Christoph Hellwig <[email protected]> wrote:
>>>
>>> Looks good:
>>>
>>> Reviewed-by: Christoph Hellwig <[email protected]>
>>>
>>> This whole idea of cross-device copie has always been a horrible idea,
>>> and I've been arguing against it since the patches were posted.
>>
>> Ok. I'm good with this v2 as well, but need to add the fallback to
>> do_splice_direct()
>> in nfsd_copy_file_range(), because this patch breaks it.
>>
>> And the commit message of v3 is better in describing the reported issue.
>
> Except that, as I said in a previous email, v2 doesn't really fix the
> issue: all the checks need to be done earlier in generic_copy_file_checks().
>
> I'll work on getting v4, based on v2 and but moving the checks and
> implementing your review suggestions to v3 (plus this nfs change).

There's something else:

The filesystems (nfs, ceph, cifs, fuse) rely on the fallback to
generic_copy_file_range() if something's wrong. And this "something's
wrong" is fs specific. For example: in ceph it is possible to offload the
file copy to the OSDs even if the files are in different filesystems as
long as these filesystems are on the *same* ceph cluster. If the copy
being done is across two different clusters, then the copy reverts to
splice. This means that the boilerplate code being removed in v2 of this
patch needs to be restored and replace by:

ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off,
len, flags);

if (ret == -EOPNOTSUPP || ret == -EXDEV)
ret = do_splice_direct(src_file, &src_off, dst_file, &dst_off,
len > MAX_RW_COUNT ? MAX_RW_COUNT : len,
flags);
return ret;

A quick look at the other filesystems code indicate similar patterns.
Since at this point we've gone through all the syscall checks already,
calling do_splice_direct() shouldn't be a huge change. But I may be
missing something. Again. Which is quite likely :-)

Cheers,
--
Luis

2021-02-18 17:05:25

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH v3] vfs: fix copy_file_range regression in cross-fs copies

On Thu, Feb 18, 2021 at 1:48 AM Amir Goldstein <[email protected]> wrote:
>
> On Thu, Feb 18, 2021 at 7:33 AM Olga Kornievskaia <[email protected]> wrote:
> >
> > On Wed, Feb 17, 2021 at 3:30 PM Luis Henriques <[email protected]> wrote:
> > >
> > > A regression has been reported by Nicolas Boichat, found while using the
> > > copy_file_range syscall to copy a tracefs file. Before commit
> > > 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
> > > kernel would return -EXDEV to userspace when trying to copy a file across
> > > different filesystems. After this commit, the syscall doesn't fail anymore
> > > and instead returns zero (zero bytes copied), as this file's content is
> > > generated on-the-fly and thus reports a size of zero.
> > >
> > > This patch restores some cross-filesystems copy restrictions that existed
> > > prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
> > > devices"). It also introduces a flag (COPY_FILE_SPLICE) that can be used
> > > by filesystems calling directly into the vfs copy_file_range to override
> > > these restrictions. Right now, only NFS needs to set this flag.
> > >
> > > Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
> > > Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
> > > Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
> > > Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
> > > Reported-by: Nicolas Boichat <[email protected]>
> > > Signed-off-by: Luis Henriques <[email protected]>
> > > ---
> > > Ok, I've tried to address all the issues and comments. Hopefully this v3
> > > is a bit closer to the final fix.
> > >
> > > Changes since v2
> > > - do all the required checks earlier, in generic_copy_file_checks(),
> > > adding new checks for ->remap_file_range
> > > - new COPY_FILE_SPLICE flag
> > > - don't remove filesystem's fallback to generic_copy_file_range()
> > > - updated commit changelog (and subject)
> > > Changes since v1 (after Amir review)
> > > - restored do_copy_file_range() helper
> > > - return -EOPNOTSUPP if fs doesn't implement CFR
> > > - updated commit description
> >
> > In my testing, this patch breaks NFS server-to-server copy file.
>
> Hi Olga,
>
> Can you please provide more details on the failed tests.
>
> Does it fail on the client between two nfs mounts or does it fail
> on the server? If the latter, between which two filesystems on the server?
>

It was a pilot error. V3 worked. I'm having some other issues with
server to server copy code but they seem to be unrelated to this. I
will test the new v6 versions when it comes out.

> Thanks,
> Amir.

2021-02-18 17:06:28

by Luis Henriques

[permalink] [raw]
Subject: [PATCH v4] vfs: fix copy_file_range regression in cross-fs copies

A regression has been reported by Nicolas Boichat, found while using the
copy_file_range syscall to copy a tracefs file. Before commit
5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices") the
kernel would return -EXDEV to userspace when trying to copy a file across
different filesystems. After this commit, the syscall doesn't fail anymore
and instead returns zero (zero bytes copied), as this file's content is
generated on-the-fly and thus reports a size of zero.

This patch restores some cross-filesystem copy restrictions that existed
prior to commit 5dae222a5ff0 ("vfs: allow copy_file_range to copy across
devices"). Filesystems are still allowed to fall-back to the VFS
generic_copy_file_range() implementation, but that has now to be done
explicitly.

nfsd is also modified to use generic_copy_file_range() instead of
vfs_copy_file_range() so that it can still fall-back to splice without going
through all the checks.

Fixes: 5dae222a5ff0 ("vfs: allow copy_file_range to copy across devices")
Link: https://lore.kernel.org/linux-fsdevel/[email protected]/
Link: https://lore.kernel.org/linux-fsdevel/CANMq1KDZuxir2LM5jOTm0xx+BnvW=ZmpsG47CyHFJwnw7zSX6Q@mail.gmail.com/
Link: https://lore.kernel.org/linux-fsdevel/20210126135012.1.If45b7cdc3ff707bc1efa17f5366057d60603c45f@changeid/
Reported-by: Nicolas Boichat <[email protected]>
Signed-off-by: Luis Henriques <[email protected]>
---
And here's v4. I'd like to request help for testing. I know Nicolas is
doing that (thanks! and thanks for the reviews). But it would be great to
get at least the nfs code tested. Olga, can you help here?

Changes since v3
- dropped the COPY_FILE_SPLICE flag
- kept the f_op's checks early in generic_copy_file_checks, implementing
Amir's suggestions
- modified nfsd to use generic_copy_file_range()
Changes since v2
- do all the required checks earlier, in generic_copy_file_checks(),
adding new checks for ->remap_file_range
- new COPY_FILE_SPLICE flag
- don't remove filesystem's fallback to generic_copy_file_range()
- updated commit changelog (and subject)
Changes since v1 (after Amir review)
- restored do_copy_file_range() helper
- return -EOPNOTSUPP if fs doesn't implement CFR
- updated commit description

fs/nfsd/vfs.c | 2 +-
fs/read_write.c | 50 +++++++++++++++++++++++--------------------------
2 files changed, 24 insertions(+), 28 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 04937e51de56..49dd28ee2602 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -578,7 +578,7 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
* limit like this and pipeline multiple COPY requests.
*/
count = min_t(u64, count, 1 << 22);
- return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+ return generic_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
}

__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
diff --git a/fs/read_write.c b/fs/read_write.c
index 75f764b43418..214d44f7cbfa 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1388,28 +1388,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
}
EXPORT_SYMBOL(generic_copy_file_range);

-static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags)
-{
- /*
- * Although we now allow filesystems to handle cross sb copy, passing
- * a file of the wrong filesystem type to filesystem driver can result
- * in an attempt to dereference the wrong type of ->private_data, so
- * avoid doing that until we really have a good reason. NFS defines
- * several different file_system_type structures, but they all end up
- * using the same ->copy_file_range() function pointer.
- */
- if (file_out->f_op->copy_file_range &&
- file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
- return file_out->f_op->copy_file_range(file_in, pos_in,
- file_out, pos_out,
- len, flags);
-
- return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
-}
-
/*
* Performs necessary checks before doing a file copy
*
@@ -1427,6 +1405,25 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
loff_t size_in;
int ret;

+ /*
+ * Although we now allow filesystems to handle cross sb copy, passing
+ * a file of the wrong filesystem type to filesystem driver can result
+ * in an attempt to dereference the wrong type of ->private_data, so
+ * avoid doing that until we really have a good reason. NFS defines
+ * several different file_system_type structures, but they all end up
+ * using the same ->copy_file_range() function pointer.
+ */
+ if (file_out->f_op->copy_file_range) {
+ if (file_in->f_op->copy_file_range !=
+ file_out->f_op->copy_file_range)
+ return -EXDEV;
+ } else if (file_in->f_op->remap_file_range) {
+ if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
+ return -EXDEV;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
ret = generic_file_rw_checks(file_in, file_out);
if (ret)
return ret;
@@ -1499,8 +1496,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* Try cloning first, this is supported by more file systems, and
* more efficient if both clone and copy are supported (e.g. NFS).
*/
- if (file_in->f_op->remap_file_range &&
- file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
+ if (file_in->f_op->remap_file_range) {
loff_t cloned;

cloned = file_in->f_op->remap_file_range(file_in, pos_in,
@@ -1513,9 +1509,9 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
}
}

- ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
- WARN_ON_ONCE(ret == -EOPNOTSUPP);
+ ret = file_out->f_op->copy_file_range(file_in, pos_in,
+ file_out, pos_out,
+ len, flags);
done:
if (ret > 0) {
fsnotify_access(file_in);

2021-02-18 20:46:22

by Steve French

[permalink] [raw]
Subject: Re: [PATCH v2] vfs: prevent copy_file_range to copy across devices

On Thu, Feb 18, 2021 at 4:03 AM Amir Goldstein <[email protected]> wrote:
>
> On Thu, Feb 18, 2021 at 9:42 AM Christoph Hellwig <[email protected]> wrote:
> >
> > Looks good:
> >
> > Reviewed-by: Christoph Hellwig <[email protected]>
> >
> > This whole idea of cross-device copie has always been a horrible idea,
> > and I've been arguing against it since the patches were posted.
>
> Ok. I'm good with this v2 as well, but need to add the fallback to
> do_splice_direct()
> in nfsd_copy_file_range(), because this patch breaks it.

Interestingly, for ksmbd (cifsd) looks like they already do splice not
copy_file_range


--
Thanks,

Steve