2017-06-14 17:06:56

by Olga Kornievskaia

[permalink] [raw]
Subject: [PATCH 1/1] [RFC] 64bit copy_file_range system call

This is a proposal to allow 64bit count to copy and return as
a result of a copy_file_range. No attempt was made to share code
with the 32bit function because 32bit interface should probably
get depreciated.

Why use 64bit? Current uses of 32bit are by clone_file_range()
which could use 64bit count and NFS copy_file_range also supports
64bit count value.

Also with this proposal off-the-bet allow the userland to copy
between different mount points.

Signed-off-by: Olga Kornievskaia <[email protected]>
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
fs/read_write.c | 136 +++++++++++++++++++++++++++++++++
include/linux/fs.h | 4 +
include/linux/syscalls.h | 3 +
include/uapi/asm-generic/unistd.h | 4 +-
kernel/sys_ni.c | 1 +
7 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac21..1f5f1e7 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
382 i386 pkey_free sys_pkey_free
383 i386 statx sys_statx
384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
+385 i386 copy_file_range64 sys_copy_file_range64
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183..e658bbe 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
332 common statx sys_statx
+333 64 copy_file_range64 sys_copy_file_range64

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/read_write.c b/fs/read_write.c
index 47c1d44..e2665c1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
return ret;
}

+u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ u64 len, unsigned int flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ u64 ret;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ return -EINVAL;
+
+ ret = rw_verify_area(READ, file_in, &pos_in, len);
+ if (unlikely(ret))
+ return ret;
+
+ ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+ if (unlikely(ret))
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
+
+ if (len == 0)
+ return 0;
+
+ file_start_write(file_out);
+
+ /*
+ * Try cloning first, this is supported by more file systems, and
+ * more efficient if both clone and copy are supported (e.g. NFS).
+ */
+ if (file_in->f_op->clone_file_range) {
+ ret = file_in->f_op->clone_file_range(file_in, pos_in,
+ file_out, pos_out, len);
+ if (ret == 0) {
+ ret = len;
+ goto done;
+ }
+ }
+
+ if (file_out->f_op->copy_file_range64) {
+ ret = file_out->f_op->copy_file_range64(file_in, pos_in,
+ file_out, pos_out, len, flags);
+ if (ret != -EOPNOTSUPP)
+ goto done;
+ }
+
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+ len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+
+done:
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+
+ inc_syscr(current);
+ inc_syscw(current);
+
+ file_end_write(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range64);
+
+SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ u64, len, unsigned int, flags)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ u64 ret = -EBADF;
+
+ f_in = fdget(fd_in);
+ if (!f_in.file)
+ goto out2;
+
+ f_out = fdget(fd_out);
+ if (!f_out.file)
+ goto out1;
+
+ ret = -EFAULT;
+ if (off_in) {
+ if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_in = f_in.file->f_pos;
+ }
+
+ if (off_out) {
+ if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_out = f_out.file->f_pos;
+ }
+
+ ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
+ flags);
+ if (ret > 0) {
+ pos_in += ret;
+ pos_out += ret;
+
+ if (off_in) {
+ if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_in.file->f_pos = pos_in;
+ }
+
+ if (off_out) {
+ if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_out.file->f_pos = pos_out;
+ }
+ }
+
+out:
+ fdput(f_out);
+out1:
+ fdput(f_in);
+out2:
+ return ret;
+}
+
static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
{
struct inode *inode = file_inode(file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 803e5a9..2727a98 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1690,6 +1690,8 @@ struct file_operations {
u64);
ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
u64);
+ u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
+ loff_t, u64, unsigned int);
};

struct inode_operations {
@@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
loff_t len, bool *is_same);
extern int vfs_dedupe_file_range(struct file *file,
struct file_dedupe_range *same);
+extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
+ loff_t, u64, unsigned int);

struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9..f7ea78e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
asmlinkage long sys_pkey_free(int pkey);
asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
+ int fd_out, loff_t __user *off_out,
+ u64 len, unsigned int flags);

#endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a..e385a76 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -731,9 +731,11 @@
__SYSCALL(__NR_pkey_free, sys_pkey_free)
#define __NR_statx 291
__SYSCALL(__NR_statx, sys_statx)
+#define __NR_copy_file_range64 292
+__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)

#undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293

/*
* All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8acef85..8e0cfd4 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
cond_syscall(sys_capget);
cond_syscall(sys_capset);
cond_syscall(sys_copy_file_range);
+cond_syscall(sys_copy_file_range64);

/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
1.8.3.1



2017-06-14 17:24:11

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

Hi Olga,

On 06/14/2017 01:06 PM, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
>
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
>
> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.
>
> Signed-off-by: Olga Kornievskaia <[email protected]>
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> fs/read_write.c | 136 +++++++++++++++++++++++++++++++++
> include/linux/fs.h | 4 +
> include/linux/syscalls.h | 3 +
> include/uapi/asm-generic/unistd.h | 4 +-
> kernel/sys_ni.c | 1 +
> 7 files changed, 149 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 448ac21..1f5f1e7 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -391,3 +391,4 @@
> 382 i386 pkey_free sys_pkey_free
> 383 i386 statx sys_statx
> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
> +385 i386 copy_file_range64 sys_copy_file_range64
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183..e658bbe 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
> 330 common pkey_alloc sys_pkey_alloc
> 331 common pkey_free sys_pkey_free
> 332 common statx sys_statx
> +333 64 copy_file_range64 sys_copy_file_range64
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 47c1d44..e2665c1 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> return ret;
> }
>
> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + u64 len, unsigned int flags)
> +{
> + struct inode *inode_in = file_inode(file_in);
> + struct inode *inode_out = file_inode(file_out);
> + u64 ret;
> +
> + if (flags != 0)
> + return -EINVAL;
> +
> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> + return -EISDIR;
> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> + return -EINVAL;
> +
> + ret = rw_verify_area(READ, file_in, &pos_in, len);
> + if (unlikely(ret))
> + return ret;

Doesn't rw_verify_area() only accept 32-bit ranges? You might have to use clone_verify_area() instead (maybe consider renaming that function to something generic?)

Thanks,
Anna

> +
> + ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> + if (unlikely(ret))
> + return ret;
> +
> + if (!(file_in->f_mode & FMODE_READ) ||
> + !(file_out->f_mode & FMODE_WRITE) ||
> + (file_out->f_flags & O_APPEND))
> + return -EBADF;
> +
> + if (len == 0)
> + return 0;
> +
> + file_start_write(file_out);
> +
> + /*
> + * Try cloning first, this is supported by more file systems, and
> + * more efficient if both clone and copy are supported (e.g. NFS).
> + */
> + if (file_in->f_op->clone_file_range) {
> + ret = file_in->f_op->clone_file_range(file_in, pos_in,
> + file_out, pos_out, len);
> + if (ret == 0) {
> + ret = len;
> + goto done;
> + }
> + }
> +
> + if (file_out->f_op->copy_file_range64) {
> + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> + file_out, pos_out, len, flags);
> + if (ret != -EOPNOTSUPP)
> + goto done;
> + }
> +
> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +
> +done:
> + if (ret > 0) {
> + fsnotify_access(file_in);
> + add_rchar(current, ret);
> + fsnotify_modify(file_out);
> + add_wchar(current, ret);
> + }
> +
> + inc_syscr(current);
> + inc_syscw(current);
> +
> + file_end_write(file_out);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(vfs_copy_file_range64);
> +
> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> + int, fd_out, loff_t __user *, off_out,
> + u64, len, unsigned int, flags)
> +{
> + loff_t pos_in;
> + loff_t pos_out;
> + struct fd f_in;
> + struct fd f_out;
> + u64 ret = -EBADF;
> +
> + f_in = fdget(fd_in);
> + if (!f_in.file)
> + goto out2;
> +
> + f_out = fdget(fd_out);
> + if (!f_out.file)
> + goto out1;
> +
> + ret = -EFAULT;
> + if (off_in) {
> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> + goto out;
> + } else {
> + pos_in = f_in.file->f_pos;
> + }
> +
> + if (off_out) {
> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> + goto out;
> + } else {
> + pos_out = f_out.file->f_pos;
> + }
> +
> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> + flags);
> + if (ret > 0) {
> + pos_in += ret;
> + pos_out += ret;
> +
> + if (off_in) {
> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> + ret = -EFAULT;
> + } else {
> + f_in.file->f_pos = pos_in;
> + }
> +
> + if (off_out) {
> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> + ret = -EFAULT;
> + } else {
> + f_out.file->f_pos = pos_out;
> + }
> + }
> +
> +out:
> + fdput(f_out);
> +out1:
> + fdput(f_in);
> +out2:
> + return ret;
> +}
> +
> static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> {
> struct inode *inode = file_inode(file);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 803e5a9..2727a98 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1690,6 +1690,8 @@ struct file_operations {
> u64);
> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
> u64);
> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> + loff_t, u64, unsigned int);
> };
>
> struct inode_operations {
> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> loff_t len, bool *is_same);
> extern int vfs_dedupe_file_range(struct file *file,
> struct file_dedupe_range *same);
> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> + loff_t, u64, unsigned int);
>
> struct super_operations {
> struct inode *(*alloc_inode)(struct super_block *sb);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9..f7ea78e 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
> asmlinkage long sys_pkey_free(int pkey);
> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> + int fd_out, loff_t __user *off_out,
> + u64 len, unsigned int flags);
>
> #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 061185a..e385a76 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -731,9 +731,11 @@
> __SYSCALL(__NR_pkey_free, sys_pkey_free)
> #define __NR_statx 291
> __SYSCALL(__NR_statx, sys_statx)
> +#define __NR_copy_file_range64 292
> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 292
> +#define __NR_syscalls 293
>
> /*
> * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef85..8e0cfd4 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
> cond_syscall(sys_capget);
> cond_syscall(sys_capset);
> cond_syscall(sys_copy_file_range);
> +cond_syscall(sys_copy_file_range64);
>
> /* arch-specific weak syscall entries */
> cond_syscall(sys_pciconfig_read);
>

2017-06-14 18:53:40

by Olga Kornievskaia

[permalink] [raw]
Subject: [PATCH 1/1] [RFC] 64bit copy_file_range system call

This is a proposal to allow 64bit count to copy and return as
a result of a copy_file_range. No attempt was made to share code
with the 32bit function because 32bit interface should probably
get depreciated.

Why use 64bit? Current uses of 32bit are by clone_file_range()
which could use 64bit count and NFS copy_file_range also supports
64bit count value.

Also with this proposal off-the-bet allow the userland to copy
between different mount points.

Signed-off-by: Olga Kornievskaia <[email protected]>
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
fs/read_write.c | 146 +++++++++++++++++++++++++++++++--
include/linux/fs.h | 4 +
include/linux/syscalls.h | 3 +
include/uapi/asm-generic/unistd.h | 4 +-
kernel/sys_ni.c | 1 +
7 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac21..1f5f1e7 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
382 i386 pkey_free sys_pkey_free
383 i386 statx sys_statx
384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
+385 i386 copy_file_range64 sys_copy_file_range64
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183..e658bbe 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
332 common statx sys_statx
+333 64 copy_file_range64 sys_copy_file_range64

#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/read_write.c b/fs/read_write.c
index e3ee985..c9c072b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
return ret;
}

-static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
{
struct inode *inode = file_inode(file);

@@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
}

+u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ u64 len, unsigned int flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ u64 ret;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ return -EINVAL;
+
+ ret = rw64_verify_area(file_in, pos_in, len, false);
+ if (unlikely(ret))
+ return ret;
+
+ ret = rw64_verify_area(file_out, pos_out, len, true);
+ if (unlikely(ret))
+ return ret;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
+
+ if (len == 0)
+ return 0;
+
+ file_start_write(file_out);
+
+ /*
+ * Try cloning first, this is supported by more file systems, and
+ * more efficient if both clone and copy are supported (e.g. NFS).
+ */
+ if (file_in->f_op->clone_file_range) {
+ ret = file_in->f_op->clone_file_range(file_in, pos_in,
+ file_out, pos_out, len);
+ if (ret == 0) {
+ ret = len;
+ goto done;
+ }
+ }
+
+ if (file_out->f_op->copy_file_range64) {
+ ret = file_out->f_op->copy_file_range64(file_in, pos_in,
+ file_out, pos_out, len, flags);
+ if (ret != -EOPNOTSUPP)
+ goto done;
+ }
+
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+ len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+
+done:
+ if (ret > 0) {
+ fsnotify_access(file_in);
+ add_rchar(current, ret);
+ fsnotify_modify(file_out);
+ add_wchar(current, ret);
+ }
+
+ inc_syscr(current);
+ inc_syscw(current);
+
+ file_end_write(file_out);
+
+ return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range64);
+
+SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ u64, len, unsigned int, flags)
+{
+ loff_t pos_in;
+ loff_t pos_out;
+ struct fd f_in;
+ struct fd f_out;
+ u64 ret = -EBADF;
+
+ f_in = fdget(fd_in);
+ if (!f_in.file)
+ goto out2;
+
+ f_out = fdget(fd_out);
+ if (!f_out.file)
+ goto out1;
+
+ ret = -EFAULT;
+ if (off_in) {
+ if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_in = f_in.file->f_pos;
+ }
+
+ if (off_out) {
+ if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+ goto out;
+ } else {
+ pos_out = f_out.file->f_pos;
+ }
+
+ ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
+ flags);
+ if (ret > 0) {
+ pos_in += ret;
+ pos_out += ret;
+
+ if (off_in) {
+ if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_in.file->f_pos = pos_in;
+ }
+
+ if (off_out) {
+ if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+ ret = -EFAULT;
+ } else {
+ f_out.file->f_pos = pos_out;
+ }
+ }
+
+out:
+ fdput(f_out);
+out1:
+ fdput(f_in);
+out2:
+ return ret;
+}
+
/*
* Check that the two inodes are eligible for cloning, the ranges make
* sense, and then flush all dirty data. Caller must ensure that the
@@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
if (!file_in->f_op->clone_file_range)
return -EOPNOTSUPP;

- ret = clone_verify_area(file_in, pos_in, len, false);
+ ret = rw64_verify_area(file_in, pos_in, len, false);
if (ret)
return ret;

- ret = clone_verify_area(file_out, pos_out, len, true);
+ ret = rw64_verify_area(file_out, pos_out, len, true);
if (ret)
return ret;

@@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
if (!S_ISREG(src->i_mode))
goto out;

- ret = clone_verify_area(file, off, len, false);
+ ret = rw64_verify_area(file, off, len, false);
if (ret < 0)
goto out;
ret = 0;
@@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
}

dst_off = info->dest_offset;
- ret = clone_verify_area(dst_file, dst_off, len, true);
+ ret = rw64_verify_area(dst_file, dst_off, len, true);
if (ret < 0) {
info->status = ret;
goto next_file;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 803e5a9..41fce43 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1690,6 +1690,8 @@ struct file_operations {
u64);
ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
u64);
+ u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
+ loff_t, u64, unsigned int);
};

struct inode_operations {
@@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
loff_t len, bool *is_same);
extern int vfs_dedupe_file_range(struct file *file,
struct file_dedupe_range *same);
+extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
+ loff_t, u64, unsigned int);

struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9..f7ea78e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
asmlinkage long sys_pkey_free(int pkey);
asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
+asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
+ int fd_out, loff_t __user *off_out,
+ u64 len, unsigned int flags);

#endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a..e385a76 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -731,9 +731,11 @@
__SYSCALL(__NR_pkey_free, sys_pkey_free)
#define __NR_statx 291
__SYSCALL(__NR_statx, sys_statx)
+#define __NR_copy_file_range64 292
+__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)

#undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293

/*
* All syscalls below here should go away really,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 8acef85..8e0cfd4 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
cond_syscall(sys_capget);
cond_syscall(sys_capset);
cond_syscall(sys_copy_file_range);
+cond_syscall(sys_copy_file_range64);

/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
--
1.8.3.1


2017-06-14 19:32:38

by Amir Goldstein

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Wed, Jun 14, 2017 at 9:53 PM, Olga Kornievskaia <[email protected]> wrote:
>
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
>
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
>
> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.
>
> Signed-off-by: Olga Kornievskaia <[email protected]>
> ---
...
> +
> + /*
> + * Try cloning first, this is supported by more file systems, and
> + * more efficient if both clone and copy are supported (e.g. NFS).
> + */
> + if (file_in->f_op->clone_file_range) {
> + ret = file_in->f_op->clone_file_range(file_in, pos_in,
> + file_out, pos_out, len);
> + if (ret == 0) {
> + ret = len;
> + goto done;
> + }
> + }
> +
> + if (file_out->f_op->copy_file_range64) {
> + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> + file_out, pos_out, len, flags);
> + if (ret != -EOPNOTSUPP)
> + goto done;
> + }
> +
> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +

Olga,

I know this is copied from vfs_copy_file_range() as is, so my comment
applies to the origin function as well, but it is easier to change the new
function without changing userspace behavior, so..

A while back, either Dave Chinner or Christoph suggested that I use
vfs_copy_file_range() from ovl_copy_up_data() instead of calling
vfs_clone_file_range() and falling back to do_splice_direct() loop.
Then Christoph added the clone_file_range attempt into
vfs_copy_file_range(), which was one part of the work.

However, ovl_copy_up_data() uses a killable loop of do_splice_direct()
calls with smaller chunks, so it is not the same as vfs_copy_file_range().

I wonder if it makes sense to use a similar killable loop in the new
function?
I wonder, for reference, if NFS implementation of copy_file_range64()
is killable?

Amir.

2017-06-14 20:08:16

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Wed, Jun 14, 2017 at 3:32 PM, Amir Goldstein <[email protected]> wrote:
> On Wed, Jun 14, 2017 at 9:53 PM, Olga Kornievskaia <[email protected]> wrote:
>>
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>>
>> Also with this proposal off-the-bet allow the userland to copy
>> between different mount points.
>>
>> Signed-off-by: Olga Kornievskaia <[email protected]>
>> ---
> ...
>> +
>> + /*
>> + * Try cloning first, this is supported by more file systems, and
>> + * more efficient if both clone and copy are supported (e.g. NFS).
>> + */
>> + if (file_in->f_op->clone_file_range) {
>> + ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> + file_out, pos_out, len);
>> + if (ret == 0) {
>> + ret = len;
>> + goto done;
>> + }
>> + }
>> +
>> + if (file_out->f_op->copy_file_range64) {
>> + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> + file_out, pos_out, len, flags);
>> + if (ret != -EOPNOTSUPP)
>> + goto done;
>> + }
>> +
>> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> +
>
> Olga,

Hi Amir,

Thanks for the comments.

> I know this is copied from vfs_copy_file_range() as is, so my comment
> applies to the origin function as well, but it is easier to change the new
> function without changing userspace behavior, so..
>
> A while back, either Dave Chinner or Christoph suggested that I use
> vfs_copy_file_range() from ovl_copy_up_data() instead of calling
> vfs_clone_file_range() and falling back to do_splice_direct() loop.
> Then Christoph added the clone_file_range attempt into
> vfs_copy_file_range(), which was one part of the work.
>
> However, ovl_copy_up_data() uses a killable loop of do_splice_direct()
> calls with smaller chunks, so it is not the same as vfs_copy_file_range().
>
> I wonder if it makes sense to use a similar killable loop in the new
> function?

I'm currently not proposing to change the semantics of
copy_file_range() call to in some cases return a partial copy and thus
making the application responsible for looping to finish the copy.
Basically copy_file_range() follows the same semantics as read/write
where it is possible that not all requested bytes are completed.
However, if it seems more desirable to shift the responsibility to
completing the copy into the kernel, then I can definitely add the
same looping as does ovl_copy_up_data().

> I wonder, for reference, if NFS implementation of copy_file_range64()
> is killable?

I believe the current implementation of "intra" (same server) copy is
not killable. The outstanding proposal to add asynchronous copy as
well as "inter" (server to server) copy is killable.

2017-06-14 21:50:48

by Randy Dunlap

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On 06/14/17 11:53, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.

deprecated.

>
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
>
> Also with this proposal off-the-bet allow the userland to copy

what does "off-the-bet" mean?

> between different mount points.
>
> Signed-off-by: Olga Kornievskaia <[email protected]>
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> fs/read_write.c | 146 +++++++++++++++++++++++++++++++--
> include/linux/fs.h | 4 +
> include/linux/syscalls.h | 3 +
> include/uapi/asm-generic/unistd.h | 4 +-
> kernel/sys_ni.c | 1 +
> 7 files changed, 154 insertions(+), 6 deletions(-)


--
~Randy

2017-06-15 03:59:27

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
>
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
>
> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.
>
> Signed-off-by: Olga Kornievskaia <[email protected]>
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> fs/read_write.c | 146 +++++++++++++++++++++++++++++++--
> include/linux/fs.h | 4 +
> include/linux/syscalls.h | 3 +
> include/uapi/asm-generic/unistd.h | 4 +-
> kernel/sys_ni.c | 1 +
> 7 files changed, 154 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 448ac21..1f5f1e7 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -391,3 +391,4 @@
> 382 i386 pkey_free sys_pkey_free
> 383 i386 statx sys_statx
> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
> +385 i386 copy_file_range64 sys_copy_file_range64
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183..e658bbe 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
> 330 common pkey_alloc sys_pkey_alloc
> 331 common pkey_free sys_pkey_free
> 332 common statx sys_statx
> +333 64 copy_file_range64 sys_copy_file_range64
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/read_write.c b/fs/read_write.c
> index e3ee985..c9c072b 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> return ret;
> }
>
> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> {
> struct inode *inode = file_inode(file);
>
> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
> }
>
> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + u64 len, unsigned int flags)

AFAICT the difference between vfs_copy_file_range() and
vfs_copy_file_range64() is that you've replaced 'size_t len' with
'u64 len', correct?

sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as
the userspace interface is concerned, this only makes a difference on
32-bit userlands, right? So, uh, how many 32-bit user programs need to
c_f_r more than 2GB at a time?

Follow up question -- does c_f_r not work for > 2GB of data when
userspace is 64-bit?

--D

> +{
> + struct inode *inode_in = file_inode(file_in);
> + struct inode *inode_out = file_inode(file_out);
> + u64 ret;
> +
> + if (flags != 0)
> + return -EINVAL;
> +
> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> + return -EISDIR;
> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> + return -EINVAL;
> +
> + ret = rw64_verify_area(file_in, pos_in, len, false);
> + if (unlikely(ret))
> + return ret;
> +
> + ret = rw64_verify_area(file_out, pos_out, len, true);
> + if (unlikely(ret))
> + return ret;
> +
> + if (!(file_in->f_mode & FMODE_READ) ||
> + !(file_out->f_mode & FMODE_WRITE) ||
> + (file_out->f_flags & O_APPEND))
> + return -EBADF;
> +
> + if (len == 0)
> + return 0;
> +
> + file_start_write(file_out);
> +
> + /*
> + * Try cloning first, this is supported by more file systems, and
> + * more efficient if both clone and copy are supported (e.g. NFS).
> + */
> + if (file_in->f_op->clone_file_range) {
> + ret = file_in->f_op->clone_file_range(file_in, pos_in,
> + file_out, pos_out, len);
> + if (ret == 0) {
> + ret = len;
> + goto done;
> + }
> + }
> +
> + if (file_out->f_op->copy_file_range64) {
> + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> + file_out, pos_out, len, flags);
> + if (ret != -EOPNOTSUPP)
> + goto done;
> + }
> +
> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +
> +done:
> + if (ret > 0) {
> + fsnotify_access(file_in);
> + add_rchar(current, ret);
> + fsnotify_modify(file_out);
> + add_wchar(current, ret);
> + }
> +
> + inc_syscr(current);
> + inc_syscw(current);
> +
> + file_end_write(file_out);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(vfs_copy_file_range64);
> +
> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> + int, fd_out, loff_t __user *, off_out,
> + u64, len, unsigned int, flags)
> +{
> + loff_t pos_in;
> + loff_t pos_out;
> + struct fd f_in;
> + struct fd f_out;
> + u64 ret = -EBADF;
> +
> + f_in = fdget(fd_in);
> + if (!f_in.file)
> + goto out2;
> +
> + f_out = fdget(fd_out);
> + if (!f_out.file)
> + goto out1;
> +
> + ret = -EFAULT;
> + if (off_in) {
> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> + goto out;
> + } else {
> + pos_in = f_in.file->f_pos;
> + }
> +
> + if (off_out) {
> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> + goto out;
> + } else {
> + pos_out = f_out.file->f_pos;
> + }
> +
> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> + flags);
> + if (ret > 0) {
> + pos_in += ret;
> + pos_out += ret;
> +
> + if (off_in) {
> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> + ret = -EFAULT;
> + } else {
> + f_in.file->f_pos = pos_in;
> + }
> +
> + if (off_out) {
> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> + ret = -EFAULT;
> + } else {
> + f_out.file->f_pos = pos_out;
> + }
> + }
> +
> +out:
> + fdput(f_out);
> +out1:
> + fdput(f_in);
> +out2:
> + return ret;
> +}
> +
> /*
> * Check that the two inodes are eligible for cloning, the ranges make
> * sense, and then flush all dirty data. Caller must ensure that the
> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
> if (!file_in->f_op->clone_file_range)
> return -EOPNOTSUPP;
>
> - ret = clone_verify_area(file_in, pos_in, len, false);
> + ret = rw64_verify_area(file_in, pos_in, len, false);
> if (ret)
> return ret;
>
> - ret = clone_verify_area(file_out, pos_out, len, true);
> + ret = rw64_verify_area(file_out, pos_out, len, true);
> if (ret)
> return ret;
>
> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
> if (!S_ISREG(src->i_mode))
> goto out;
>
> - ret = clone_verify_area(file, off, len, false);
> + ret = rw64_verify_area(file, off, len, false);
> if (ret < 0)
> goto out;
> ret = 0;
> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
> }
>
> dst_off = info->dest_offset;
> - ret = clone_verify_area(dst_file, dst_off, len, true);
> + ret = rw64_verify_area(dst_file, dst_off, len, true);
> if (ret < 0) {
> info->status = ret;
> goto next_file;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 803e5a9..41fce43 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1690,6 +1690,8 @@ struct file_operations {
> u64);
> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
> u64);
> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> + loff_t, u64, unsigned int);
> };
>
> struct inode_operations {
> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> loff_t len, bool *is_same);
> extern int vfs_dedupe_file_range(struct file *file,
> struct file_dedupe_range *same);
> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> + loff_t, u64, unsigned int);
>
> struct super_operations {
> struct inode *(*alloc_inode)(struct super_block *sb);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9..f7ea78e 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
> asmlinkage long sys_pkey_free(int pkey);
> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> + int fd_out, loff_t __user *off_out,
> + u64 len, unsigned int flags);
>
> #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 061185a..e385a76 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -731,9 +731,11 @@
> __SYSCALL(__NR_pkey_free, sys_pkey_free)
> #define __NR_statx 291
> __SYSCALL(__NR_statx, sys_statx)
> +#define __NR_copy_file_range64 292
> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 292
> +#define __NR_syscalls 293
>
> /*
> * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef85..8e0cfd4 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
> cond_syscall(sys_capget);
> cond_syscall(sys_capset);
> cond_syscall(sys_copy_file_range);
> +cond_syscall(sys_copy_file_range64);
>
> /* arch-specific weak syscall entries */
> cond_syscall(sys_pciconfig_read);
> --
> 1.8.3.1
>

2017-06-15 08:15:04

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
>
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.

Please provide a very good use case that actually matters to a large
portion of the Linux users.

> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.

Totally different issue that we've discussed N times. Trying to sneak
it in behind peoples backs isn't going to improve the situation in any way.

2017-06-15 13:07:42

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Wed, Jun 14, 2017 at 5:50 PM, Randy Dunlap <[email protected]> wrote:
> On 06/14/17 11:53, Olga Kornievskaia wrote:
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>
> deprecated.
>
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>>
>> Also with this proposal off-the-bet allow the userland to copy
>
> what does "off-the-bet" mean?

from the beginning.

>
>> between different mount points.
>>
>> Signed-off-by: Olga Kornievskaia <[email protected]>
>> ---
>> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>> fs/read_write.c | 146 +++++++++++++++++++++++++++++++--
>> include/linux/fs.h | 4 +
>> include/linux/syscalls.h | 3 +
>> include/uapi/asm-generic/unistd.h | 4 +-
>> kernel/sys_ni.c | 1 +
>> 7 files changed, 154 insertions(+), 6 deletions(-)
>
>
> --
> ~Randy
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2017-06-15 13:07:35

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <[email protected]> wrote:
> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote:
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>
> Please provide a very good use case that actually matters to a large
> portion of the Linux users.

Reason: it will not hurt any user but it will help for
server-to-server NFS copy because for each invocation of of
copy_file_range() it requires that the client sends COPY_NOTIFY, then
COPY and then a copy triggers a establishment of clientid/session
before the reading of the source file can happen. All that could be
saved by have a 64bit value.

>> Also with this proposal off-the-bet allow the userland to copy
>> between different mount points.
>
> Totally different issue that we've discussed N times. Trying to sneak
> it in behind peoples backs isn't going to improve the situation in any way.

I would think that sneaking would have been to remove the check and
not mention it in the commit description. I have explicitly brought
the topic up in the commit description to bring the attention to the
issue as a way to restart the conversation about the issue.

I have several times have asked for the reason to why the check can
not be removed. I'm asking again can you please explain why.

> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2017-06-15 14:39:44

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong
<[email protected]> wrote:
> On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>>
>> Also with this proposal off-the-bet allow the userland to copy
>> between different mount points.
>>
>> Signed-off-by: Olga Kornievskaia <[email protected]>
>> ---
>> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>> fs/read_write.c | 146 +++++++++++++++++++++++++++++++--
>> include/linux/fs.h | 4 +
>> include/linux/syscalls.h | 3 +
>> include/uapi/asm-generic/unistd.h | 4 +-
>> kernel/sys_ni.c | 1 +
>> 7 files changed, 154 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> index 448ac21..1f5f1e7 100644
>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> @@ -391,3 +391,4 @@
>> 382 i386 pkey_free sys_pkey_free
>> 383 i386 statx sys_statx
>> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
>> +385 i386 copy_file_range64 sys_copy_file_range64
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index 5aef183..e658bbe 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -339,6 +339,7 @@
>> 330 common pkey_alloc sys_pkey_alloc
>> 331 common pkey_free sys_pkey_free
>> 332 common statx sys_statx
>> +333 64 copy_file_range64 sys_copy_file_range64
>>
>> #
>> # x32-specific system call numbers start at 512 to avoid cache impact
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index e3ee985..c9c072b 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> return ret;
>> }
>>
>> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> {
>> struct inode *inode = file_inode(file);
>>
>> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
>> }
>>
>> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>> + struct file *file_out, loff_t pos_out,
>> + u64 len, unsigned int flags)
>
> AFAICT the difference between vfs_copy_file_range() and
> vfs_copy_file_range64() is that you've replaced 'size_t len' with
> 'u64 len', correct?

That and I'm asking to allow for cross-device copy to be allowed by
the system call.

> sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as
> the userspace interface is concerned, this only makes a difference on
> 32-bit userlands, right? So, uh, how many 32-bit user programs need to
> c_f_r more than 2GB at a time?

Isn't that 4GB? I thought that max of ssize_t is 4GB on 64bit. I'm
under the impression that sizeof (ssize_t) != sizeof(u64) on 64bit
machine. If I'm wrong, then this patch is indeed not needed.

> Follow up question -- does c_f_r not work for > 2GB of data when
> userspace is 64-bit?

Current copy_file_range would work for any file size, it would just
need to be called in a loop by the application. With the given
proposal, there wouldn't be a need for the application to loop due to
the API size limitation. It might still loop if a partial copy was
done.

>
> --D
>
>> +{
>> + struct inode *inode_in = file_inode(file_in);
>> + struct inode *inode_out = file_inode(file_out);
>> + u64 ret;
>> +
>> + if (flags != 0)
>> + return -EINVAL;
>> +
>> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>> + return -EISDIR;
>> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
>> + return -EINVAL;
>> +
>> + ret = rw64_verify_area(file_in, pos_in, len, false);
>> + if (unlikely(ret))
>> + return ret;
>> +
>> + ret = rw64_verify_area(file_out, pos_out, len, true);
>> + if (unlikely(ret))
>> + return ret;
>> +
>> + if (!(file_in->f_mode & FMODE_READ) ||
>> + !(file_out->f_mode & FMODE_WRITE) ||
>> + (file_out->f_flags & O_APPEND))
>> + return -EBADF;
>> +
>> + if (len == 0)
>> + return 0;
>> +
>> + file_start_write(file_out);
>> +
>> + /*
>> + * Try cloning first, this is supported by more file systems, and
>> + * more efficient if both clone and copy are supported (e.g. NFS).
>> + */
>> + if (file_in->f_op->clone_file_range) {
>> + ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> + file_out, pos_out, len);
>> + if (ret == 0) {
>> + ret = len;
>> + goto done;
>> + }
>> + }
>> +
>> + if (file_out->f_op->copy_file_range64) {
>> + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> + file_out, pos_out, len, flags);
>> + if (ret != -EOPNOTSUPP)
>> + goto done;
>> + }
>> +
>> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> +
>> +done:
>> + if (ret > 0) {
>> + fsnotify_access(file_in);
>> + add_rchar(current, ret);
>> + fsnotify_modify(file_out);
>> + add_wchar(current, ret);
>> + }
>> +
>> + inc_syscr(current);
>> + inc_syscw(current);
>> +
>> + file_end_write(file_out);
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(vfs_copy_file_range64);
>> +
>> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
>> + int, fd_out, loff_t __user *, off_out,
>> + u64, len, unsigned int, flags)
>> +{
>> + loff_t pos_in;
>> + loff_t pos_out;
>> + struct fd f_in;
>> + struct fd f_out;
>> + u64 ret = -EBADF;
>> +
>> + f_in = fdget(fd_in);
>> + if (!f_in.file)
>> + goto out2;
>> +
>> + f_out = fdget(fd_out);
>> + if (!f_out.file)
>> + goto out1;
>> +
>> + ret = -EFAULT;
>> + if (off_in) {
>> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
>> + goto out;
>> + } else {
>> + pos_in = f_in.file->f_pos;
>> + }
>> +
>> + if (off_out) {
>> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
>> + goto out;
>> + } else {
>> + pos_out = f_out.file->f_pos;
>> + }
>> +
>> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
>> + flags);
>> + if (ret > 0) {
>> + pos_in += ret;
>> + pos_out += ret;
>> +
>> + if (off_in) {
>> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
>> + ret = -EFAULT;
>> + } else {
>> + f_in.file->f_pos = pos_in;
>> + }
>> +
>> + if (off_out) {
>> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
>> + ret = -EFAULT;
>> + } else {
>> + f_out.file->f_pos = pos_out;
>> + }
>> + }
>> +
>> +out:
>> + fdput(f_out);
>> +out1:
>> + fdput(f_in);
>> +out2:
>> + return ret;
>> +}
>> +
>> /*
>> * Check that the two inodes are eligible for cloning, the ranges make
>> * sense, and then flush all dirty data. Caller must ensure that the
>> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
>> if (!file_in->f_op->clone_file_range)
>> return -EOPNOTSUPP;
>>
>> - ret = clone_verify_area(file_in, pos_in, len, false);
>> + ret = rw64_verify_area(file_in, pos_in, len, false);
>> if (ret)
>> return ret;
>>
>> - ret = clone_verify_area(file_out, pos_out, len, true);
>> + ret = rw64_verify_area(file_out, pos_out, len, true);
>> if (ret)
>> return ret;
>>
>> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>> if (!S_ISREG(src->i_mode))
>> goto out;
>>
>> - ret = clone_verify_area(file, off, len, false);
>> + ret = rw64_verify_area(file, off, len, false);
>> if (ret < 0)
>> goto out;
>> ret = 0;
>> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>> }
>>
>> dst_off = info->dest_offset;
>> - ret = clone_verify_area(dst_file, dst_off, len, true);
>> + ret = rw64_verify_area(dst_file, dst_off, len, true);
>> if (ret < 0) {
>> info->status = ret;
>> goto next_file;
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 803e5a9..41fce43 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -1690,6 +1690,8 @@ struct file_operations {
>> u64);
>> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>> u64);
>> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
>> + loff_t, u64, unsigned int);
>> };
>>
>> struct inode_operations {
>> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>> loff_t len, bool *is_same);
>> extern int vfs_dedupe_file_range(struct file *file,
>> struct file_dedupe_range *same);
>> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
>> + loff_t, u64, unsigned int);
>>
>> struct super_operations {
>> struct inode *(*alloc_inode)(struct super_block *sb);
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index 980c3c9..f7ea78e 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>> asmlinkage long sys_pkey_free(int pkey);
>> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>> unsigned mask, struct statx __user *buffer);
>> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
>> + int fd_out, loff_t __user *off_out,
>> + u64 len, unsigned int flags);
>>
>> #endif
>> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
>> index 061185a..e385a76 100644
>> --- a/include/uapi/asm-generic/unistd.h
>> +++ b/include/uapi/asm-generic/unistd.h
>> @@ -731,9 +731,11 @@
>> __SYSCALL(__NR_pkey_free, sys_pkey_free)
>> #define __NR_statx 291
>> __SYSCALL(__NR_statx, sys_statx)
>> +#define __NR_copy_file_range64 292
>> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>>
>> #undef __NR_syscalls
>> -#define __NR_syscalls 292
>> +#define __NR_syscalls 293
>>
>> /*
>> * All syscalls below here should go away really,
>> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
>> index 8acef85..8e0cfd4 100644
>> --- a/kernel/sys_ni.c
>> +++ b/kernel/sys_ni.c
>> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>> cond_syscall(sys_capget);
>> cond_syscall(sys_capset);
>> cond_syscall(sys_copy_file_range);
>> +cond_syscall(sys_copy_file_range64);
>>
>> /* arch-specific weak syscall entries */
>> cond_syscall(sys_pciconfig_read);
>> --
>> 1.8.3.1
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2017-06-15 18:35:42

by Andreas Dilger

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call


> On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <[email protected]> wrote:
>
> On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <[email protected]> wrote:
>> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote:
>>> This is a proposal to allow 64bit count to copy and return as
>>> a result of a copy_file_range. No attempt was made to share code
>>> with the 32bit function because 32bit interface should probably
>>> get depreciated.
>>>
>>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>>> which could use 64bit count and NFS copy_file_range also supports
>>> 64bit count value.
>>
>> Please provide a very good use case that actually matters to a large
>> portion of the Linux users.
>
> Reason: it will not hurt any user but it will help for
> server-to-server NFS copy because for each invocation of of
> copy_file_range() it requires that the client sends COPY_NOTIFY, then
> COPY and then a copy triggers a establishment of clientid/session
> before the reading of the source file can happen. All that could be
> saved by have a 64bit value.

What is the overhead of sending a couple of RPCs vs. copying 4GB of
data on the server?

> Current copy_file_range would work for any file size, it would just
> need to be called in a loop by the application. With the given
> proposal, there wouldn't be a need for the application to loop due to
> the API size limitation. It might still loop if a partial copy was
> done.


Given that there always needs to be a loop to handle partial completion,
adding the 64-bit syscall doesn't really simplify the application at
all, but adds duplicate code to the kernel for little benefit.

Cheers, Andreas






Attachments:
signature.asc (195.00 B)
Message signed with OpenPGP

2017-06-15 19:11:14

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Thu, Jun 15, 2017 at 2:35 PM, Andreas Dilger <[email protected]> wrote:
>
>> On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <[email protected]> wrote:
>>
>> On Thu, Jun 15, 2017 at 4:15 AM, Christoph Hellwig <[email protected]> wrote:
>>> On Wed, Jun 14, 2017 at 01:06:53PM -0400, Olga Kornievskaia wrote:
>>>> This is a proposal to allow 64bit count to copy and return as
>>>> a result of a copy_file_range. No attempt was made to share code
>>>> with the 32bit function because 32bit interface should probably
>>>> get depreciated.
>>>>
>>>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>>>> which could use 64bit count and NFS copy_file_range also supports
>>>> 64bit count value.
>>>
>>> Please provide a very good use case that actually matters to a large
>>> portion of the Linux users.
>>
>> Reason: it will not hurt any user but it will help for
>> server-to-server NFS copy because for each invocation of of
>> copy_file_range() it requires that the client sends COPY_NOTIFY, then
>> COPY and then a copy triggers a establishment of clientid/session
>> before the reading of the source file can happen. All that could be
>> saved by have a 64bit value.
>
> What is the overhead of sending a couple of RPCs vs. copying 4GB of
> data on the server?

We get about 11% improvement having an interface that removes the need
for multiple calls. That's testing 8gb copy that normally would have
needed 2 copies.

>> Current copy_file_range would work for any file size, it would just
>> need to be called in a loop by the application. With the given
>> proposal, there wouldn't be a need for the application to loop due to
>> the API size limitation. It might still loop if a partial copy was
>> done.
>
>
> Given that there always needs to be a loop to handle partial completion,
> adding the 64-bit syscall doesn't really simplify the application at
> all, but adds duplicate code to the kernel for little benefit.

If 32bit version would be deprecated then there won't be duplicate code.

2017-06-15 20:54:15

by Darrick J. Wong

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Thu, Jun 15, 2017 at 10:39:42AM -0400, Olga Kornievskaia wrote:
> On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong
> <[email protected]> wrote:
> > On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
> >> This is a proposal to allow 64bit count to copy and return as
> >> a result of a copy_file_range. No attempt was made to share code
> >> with the 32bit function because 32bit interface should probably
> >> get depreciated.
> >>
> >> Why use 64bit? Current uses of 32bit are by clone_file_range()
> >> which could use 64bit count and NFS copy_file_range also supports
> >> 64bit count value.
> >>
> >> Also with this proposal off-the-bet allow the userland to copy
> >> between different mount points.
> >>
> >> Signed-off-by: Olga Kornievskaia <[email protected]>
> >> ---
> >> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> >> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> >> fs/read_write.c | 146 +++++++++++++++++++++++++++++++--
> >> include/linux/fs.h | 4 +
> >> include/linux/syscalls.h | 3 +
> >> include/uapi/asm-generic/unistd.h | 4 +-
> >> kernel/sys_ni.c | 1 +
> >> 7 files changed, 154 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> >> index 448ac21..1f5f1e7 100644
> >> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> >> @@ -391,3 +391,4 @@
> >> 382 i386 pkey_free sys_pkey_free
> >> 383 i386 statx sys_statx
> >> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
> >> +385 i386 copy_file_range64 sys_copy_file_range64
> >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> >> index 5aef183..e658bbe 100644
> >> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> >> @@ -339,6 +339,7 @@
> >> 330 common pkey_alloc sys_pkey_alloc
> >> 331 common pkey_free sys_pkey_free
> >> 332 common statx sys_statx
> >> +333 64 copy_file_range64 sys_copy_file_range64
> >>
> >> #
> >> # x32-specific system call numbers start at 512 to avoid cache impact
> >> diff --git a/fs/read_write.c b/fs/read_write.c
> >> index e3ee985..c9c072b 100644
> >> --- a/fs/read_write.c
> >> +++ b/fs/read_write.c
> >> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> >> return ret;
> >> }
> >>
> >> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> >> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> >> {
> >> struct inode *inode = file_inode(file);
> >>
> >> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> >> return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
> >> }
> >>
> >> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> >> + struct file *file_out, loff_t pos_out,
> >> + u64 len, unsigned int flags)
> >
> > AFAICT the difference between vfs_copy_file_range() and
> > vfs_copy_file_range64() is that you've replaced 'size_t len' with
> > 'u64 len', correct?
>
> That and I'm asking to allow for cross-device copy to be allowed by
> the system call.

I was fine letting each fs decide if it was going to allow cross-device
copy or not, but Christoph argued that since we don't generally allow
operations across mountpoints we shouldn't allow cross-mountpoint
{clone,copy}_file_range either.

Roughly translated, XFS has no cross-device copy abilities, there aren't
any plans to add it, so I don't feel motivated to argue for it... but if
the nfs community is so motivated (it sounds like it) then y'all could
try one more time.

> > sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as
> > the userspace interface is concerned, this only makes a difference on
> > 32-bit userlands, right? So, uh, how many 32-bit user programs need to
> > c_f_r more than 2GB at a time?
>
> Isn't that 4GB? I thought that max of ssize_t is 4GB on 64bit. I'm
> under the impression that sizeof (ssize_t) != sizeof(u64) on 64bit
> machine. If I'm wrong, then this patch is indeed not needed.

2017-06-16 17:36:31

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Thu, Jun 15, 2017 at 12:35:29PM -0600, Andreas Dilger wrote:
>
> > On Jun 15, 2017, at 7:07 AM, Olga Kornievskaia <[email protected]> wrote:
> > Reason: it will not hurt any user but it will help for
> > server-to-server NFS copy because for each invocation of of
> > copy_file_range() it requires that the client sends COPY_NOTIFY, then
> > COPY and then a copy triggers a establishment of clientid/session
> > before the reading of the source file can happen. All that could be
> > saved by have a 64bit value.
>
> What is the overhead of sending a couple of RPCs vs. copying 4GB of
> data on the server?

It makes a difference in the clone case, doesn't it? (Though I'd think
best there would be a special "copy to end of file" value.)

--b.

2017-06-17 10:03:16

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote:
> It makes a difference in the clone case, doesn't it? (Though I'd think
> best there would be a special "copy to end of file" value.)

Clones uses a 0 length as "whole" file. Both for the NFS operation
and the Linux ioctl.

2017-06-17 12:43:03

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call


> On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <[email protected]> wrote:
>
> On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote:
>> It makes a difference in the clone case, doesn't it? (Though I'd think
>> best there would be a special "copy to end of file" value.)
>
> Clones uses a 0 length as "whole" file. Both for the NFS operation
> and the Linux ioctl.

Does that actually work? There is check vfs_copy_file_range()

if (len == 0)
return 0;


2017-06-17 15:09:42

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Sat, Jun 17, 2017 at 08:42:54AM -0400, Olga Kornievskaia wrote:
>
> > On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <[email protected]> wrote:
> >
> > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote:
> >> It makes a difference in the clone case, doesn't it? (Though I'd think
> >> best there would be a special "copy to end of file" value.)
> >
> > Clones uses a 0 length as "whole" file. Both for the NFS operation
> > and the Linux ioctl.
>
> Does that actually work? There is check vfs_copy_file_range()

Please re-read the above sentence. I'm talking about NFS clone and
IOC_CLONE_RANGE, not copy_file_range. copy_file_range had to follow
the NFS semantics that don't have the special 0 case.

2017-06-17 18:24:12

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Sat, Jun 17, 2017 at 08:09:38AM -0700, Christoph Hellwig wrote:
> On Sat, Jun 17, 2017 at 08:42:54AM -0400, Olga Kornievskaia wrote:
> >
> > > On Jun 17, 2017, at 6:03 AM, Christoph Hellwig <[email protected]> wrote:
> > >
> > > On Fri, Jun 16, 2017 at 01:36:30PM -0400, J. Bruce Fields wrote:
> > >> It makes a difference in the clone case, doesn't it? (Though I'd think
> > >> best there would be a special "copy to end of file" value.)
> > >
> > > Clones uses a 0 length as "whole" file. Both for the NFS operation
> > > and the Linux ioctl.
> >
> > Does that actually work? There is check vfs_copy_file_range()
>
> Please re-read the above sentence. I'm talking about NFS clone and
> IOC_CLONE_RANGE, not copy_file_range. copy_file_range had to follow
> the NFS semantics that don't have the special 0 case.

Nope, COPY has it:

https://tools.ietf.org/html/rfc7862#section-15.2.3

A count of 0 (zero) requests that all bytes from ca_src_offset
through EOF be copied to the destination.

I think leaving it out of copy_file_range was just an oversight.

--b.

2017-06-19 18:35:10

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Thu, Jun 15, 2017 at 4:53 PM, Darrick J. Wong
<[email protected]> wrote:
> On Thu, Jun 15, 2017 at 10:39:42AM -0400, Olga Kornievskaia wrote:
>> On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong
>> <[email protected]> wrote:
>> > On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
>> >> This is a proposal to allow 64bit count to copy and return as
>> >> a result of a copy_file_range. No attempt was made to share code
>> >> with the 32bit function because 32bit interface should probably
>> >> get depreciated.
>> >>
>> >> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> >> which could use 64bit count and NFS copy_file_range also supports
>> >> 64bit count value.
>> >>
>> >> Also with this proposal off-the-bet allow the userland to copy
>> >> between different mount points.
>> >>
>> >> Signed-off-by: Olga Kornievskaia <[email protected]>
>> >> ---
>> >> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
>> >> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>> >> fs/read_write.c | 146 +++++++++++++++++++++++++++++++--
>> >> include/linux/fs.h | 4 +
>> >> include/linux/syscalls.h | 3 +
>> >> include/uapi/asm-generic/unistd.h | 4 +-
>> >> kernel/sys_ni.c | 1 +
>> >> 7 files changed, 154 insertions(+), 6 deletions(-)
>> >>
>> >> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> >> index 448ac21..1f5f1e7 100644
>> >> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> >> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> >> @@ -391,3 +391,4 @@
>> >> 382 i386 pkey_free sys_pkey_free
>> >> 383 i386 statx sys_statx
>> >> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
>> >> +385 i386 copy_file_range64 sys_copy_file_range64
>> >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> >> index 5aef183..e658bbe 100644
>> >> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> >> @@ -339,6 +339,7 @@
>> >> 330 common pkey_alloc sys_pkey_alloc
>> >> 331 common pkey_free sys_pkey_free
>> >> 332 common statx sys_statx
>> >> +333 64 copy_file_range64 sys_copy_file_range64
>> >>
>> >> #
>> >> # x32-specific system call numbers start at 512 to avoid cache impact
>> >> diff --git a/fs/read_write.c b/fs/read_write.c
>> >> index e3ee985..c9c072b 100644
>> >> --- a/fs/read_write.c
>> >> +++ b/fs/read_write.c
>> >> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> >> return ret;
>> >> }
>> >>
>> >> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> >> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> >> {
>> >> struct inode *inode = file_inode(file);
>> >>
>> >> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> >> return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
>> >> }
>> >>
>> >> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>> >> + struct file *file_out, loff_t pos_out,
>> >> + u64 len, unsigned int flags)
>> >
>> > AFAICT the difference between vfs_copy_file_range() and
>> > vfs_copy_file_range64() is that you've replaced 'size_t len' with
>> > 'u64 len', correct?
>>
>> That and I'm asking to allow for cross-device copy to be allowed by
>> the system call.
>
> I was fine letting each fs decide if it was going to allow cross-device
> copy or not, but Christoph argued that since we don't generally allow
> operations across mountpoints we shouldn't allow cross-mountpoint
> {clone,copy}_file_range either.
>
> Roughly translated, XFS has no cross-device copy abilities, there aren't
> any plans to add it, so I don't feel motivated to argue for it... but if
> the nfs community is so motivated (it sounds like it) then y'all could
> try one more time.

Yes NFS community does need one for doing a server-to-server copy
(performance) feature.

There was an argument that none of the VFS operations allow for
cross-device operations. What about splice, doesn't it allow to cross
mount points? So why not allow copy_file_range to do it too?

>> > sizeof(size_t) == sizeof(u64) on 64-bit machines, so at least as far as
>> > the userspace interface is concerned, this only makes a difference on
>> > 32-bit userlands, right? So, uh, how many 32-bit user programs need to
>> > c_f_r more than 2GB at a time?
>>
>> Isn't that 4GB? I thought that max of ssize_t is 4GB on 64bit. I'm
>> under the impression that sizeof (ssize_t) != sizeof(u64) on 64bit
>> machine. If I'm wrong, then this patch is indeed not needed.
>
> From include/uapi/asm-generic/posix_types.h:
>
> /*
> * Most 32 bit architectures use "unsigned int" size_t,
> * and all 64 bit architectures use "unsigned long" size_t.
> */
> #ifndef __kernel_size_t
> #if __BITS_PER_LONG != 64
> typedef unsigned int __kernel_size_t;
> typedef int __kernel_ssize_t;
> typedef int __kernel_ptrdiff_t;
> #else
> typedef __kernel_ulong_t __kernel_size_t;
> typedef __kernel_long_t __kernel_ssize_t;
> typedef __kernel_long_t __kernel_ptrdiff_t;
> #endif
> #endif
>
> Note that 'long' is 64-bit, at least on x64.

Ok, thank you for pointing this out. I see now that it's not important
to change the size_t to allow for larger than 4GB copies.

>
>> > Follow up question -- does c_f_r not work for > 2GB of data when
>> > userspace is 64-bit?
>>
>> Current copy_file_range would work for any file size, it would just
>> need to be called in a loop by the application. With the given
>> proposal, there wouldn't be a need for the application to loop due to
>> the API size limitation. It might still loop if a partial copy was
>> done.
>
> ...which is what happens if we fall back to do_splice_direct, since a
> single call won't pagecache-copy more than INT_MAX bytes from one file
> to another. Therefore, any serious user of this syscall has to check
> the arguments and loop if it wants the whole range done, similar to how
> one is (supposed) to call write().
>
> --D
>
>>
>> >
>> > --D
>> >
>> >> +{
>> >> + struct inode *inode_in = file_inode(file_in);
>> >> + struct inode *inode_out = file_inode(file_out);
>> >> + u64 ret;
>> >> +
>> >> + if (flags != 0)
>> >> + return -EINVAL;
>> >> +
>> >> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>> >> + return -EISDIR;
>> >> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
>> >> + return -EINVAL;
>> >> +
>> >> + ret = rw64_verify_area(file_in, pos_in, len, false);
>> >> + if (unlikely(ret))
>> >> + return ret;
>> >> +
>> >> + ret = rw64_verify_area(file_out, pos_out, len, true);
>> >> + if (unlikely(ret))
>> >> + return ret;
>> >> +
>> >> + if (!(file_in->f_mode & FMODE_READ) ||
>> >> + !(file_out->f_mode & FMODE_WRITE) ||
>> >> + (file_out->f_flags & O_APPEND))
>> >> + return -EBADF;
>> >> +
>> >> + if (len == 0)
>> >> + return 0;
>> >> +
>> >> + file_start_write(file_out);
>> >> +
>> >> + /*
>> >> + * Try cloning first, this is supported by more file systems, and
>> >> + * more efficient if both clone and copy are supported (e.g. NFS).
>> >> + */
>> >> + if (file_in->f_op->clone_file_range) {
>> >> + ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> >> + file_out, pos_out, len);
>> >> + if (ret == 0) {
>> >> + ret = len;
>> >> + goto done;
>> >> + }
>> >> + }
>> >> +
>> >> + if (file_out->f_op->copy_file_range64) {
>> >> + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> >> + file_out, pos_out, len, flags);
>> >> + if (ret != -EOPNOTSUPP)
>> >> + goto done;
>> >> + }
>> >> +
>> >> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> >> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> >> +
>> >> +done:
>> >> + if (ret > 0) {
>> >> + fsnotify_access(file_in);
>> >> + add_rchar(current, ret);
>> >> + fsnotify_modify(file_out);
>> >> + add_wchar(current, ret);
>> >> + }
>> >> +
>> >> + inc_syscr(current);
>> >> + inc_syscw(current);
>> >> +
>> >> + file_end_write(file_out);
>> >> +
>> >> + return ret;
>> >> +}
>> >> +EXPORT_SYMBOL(vfs_copy_file_range64);
>> >> +
>> >> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
>> >> + int, fd_out, loff_t __user *, off_out,
>> >> + u64, len, unsigned int, flags)
>> >> +{
>> >> + loff_t pos_in;
>> >> + loff_t pos_out;
>> >> + struct fd f_in;
>> >> + struct fd f_out;
>> >> + u64 ret = -EBADF;
>> >> +
>> >> + f_in = fdget(fd_in);
>> >> + if (!f_in.file)
>> >> + goto out2;
>> >> +
>> >> + f_out = fdget(fd_out);
>> >> + if (!f_out.file)
>> >> + goto out1;
>> >> +
>> >> + ret = -EFAULT;
>> >> + if (off_in) {
>> >> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
>> >> + goto out;
>> >> + } else {
>> >> + pos_in = f_in.file->f_pos;
>> >> + }
>> >> +
>> >> + if (off_out) {
>> >> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
>> >> + goto out;
>> >> + } else {
>> >> + pos_out = f_out.file->f_pos;
>> >> + }
>> >> +
>> >> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
>> >> + flags);
>> >> + if (ret > 0) {
>> >> + pos_in += ret;
>> >> + pos_out += ret;
>> >> +
>> >> + if (off_in) {
>> >> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
>> >> + ret = -EFAULT;
>> >> + } else {
>> >> + f_in.file->f_pos = pos_in;
>> >> + }
>> >> +
>> >> + if (off_out) {
>> >> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
>> >> + ret = -EFAULT;
>> >> + } else {
>> >> + f_out.file->f_pos = pos_out;
>> >> + }
>> >> + }
>> >> +
>> >> +out:
>> >> + fdput(f_out);
>> >> +out1:
>> >> + fdput(f_in);
>> >> +out2:
>> >> + return ret;
>> >> +}
>> >> +
>> >> /*
>> >> * Check that the two inodes are eligible for cloning, the ranges make
>> >> * sense, and then flush all dirty data. Caller must ensure that the
>> >> @@ -1860,11 +1996,11 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
>> >> if (!file_in->f_op->clone_file_range)
>> >> return -EOPNOTSUPP;
>> >>
>> >> - ret = clone_verify_area(file_in, pos_in, len, false);
>> >> + ret = rw64_verify_area(file_in, pos_in, len, false);
>> >> if (ret)
>> >> return ret;
>> >>
>> >> - ret = clone_verify_area(file_out, pos_out, len, true);
>> >> + ret = rw64_verify_area(file_out, pos_out, len, true);
>> >> if (ret)
>> >> return ret;
>> >>
>> >> @@ -2009,7 +2145,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>> >> if (!S_ISREG(src->i_mode))
>> >> goto out;
>> >>
>> >> - ret = clone_verify_area(file, off, len, false);
>> >> + ret = rw64_verify_area(file, off, len, false);
>> >> if (ret < 0)
>> >> goto out;
>> >> ret = 0;
>> >> @@ -2041,7 +2177,7 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
>> >> }
>> >>
>> >> dst_off = info->dest_offset;
>> >> - ret = clone_verify_area(dst_file, dst_off, len, true);
>> >> + ret = rw64_verify_area(dst_file, dst_off, len, true);
>> >> if (ret < 0) {
>> >> info->status = ret;
>> >> goto next_file;
>> >> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> >> index 803e5a9..41fce43 100644
>> >> --- a/include/linux/fs.h
>> >> +++ b/include/linux/fs.h
>> >> @@ -1690,6 +1690,8 @@ struct file_operations {
>> >> u64);
>> >> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>> >> u64);
>> >> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
>> >> + loff_t, u64, unsigned int);
>> >> };
>> >>
>> >> struct inode_operations {
>> >> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>> >> loff_t len, bool *is_same);
>> >> extern int vfs_dedupe_file_range(struct file *file,
>> >> struct file_dedupe_range *same);
>> >> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
>> >> + loff_t, u64, unsigned int);
>> >>
>> >> struct super_operations {
>> >> struct inode *(*alloc_inode)(struct super_block *sb);
>> >> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> >> index 980c3c9..f7ea78e 100644
>> >> --- a/include/linux/syscalls.h
>> >> +++ b/include/linux/syscalls.h
>> >> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>> >> asmlinkage long sys_pkey_free(int pkey);
>> >> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>> >> unsigned mask, struct statx __user *buffer);
>> >> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
>> >> + int fd_out, loff_t __user *off_out,
>> >> + u64 len, unsigned int flags);
>> >>
>> >> #endif
>> >> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
>> >> index 061185a..e385a76 100644
>> >> --- a/include/uapi/asm-generic/unistd.h
>> >> +++ b/include/uapi/asm-generic/unistd.h
>> >> @@ -731,9 +731,11 @@
>> >> __SYSCALL(__NR_pkey_free, sys_pkey_free)
>> >> #define __NR_statx 291
>> >> __SYSCALL(__NR_statx, sys_statx)
>> >> +#define __NR_copy_file_range64 292
>> >> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>> >>
>> >> #undef __NR_syscalls
>> >> -#define __NR_syscalls 292
>> >> +#define __NR_syscalls 293
>> >>
>> >> /*
>> >> * All syscalls below here should go away really,
>> >> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
>> >> index 8acef85..8e0cfd4 100644
>> >> --- a/kernel/sys_ni.c
>> >> +++ b/kernel/sys_ni.c
>> >> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>> >> cond_syscall(sys_capget);
>> >> cond_syscall(sys_capset);
>> >> cond_syscall(sys_copy_file_range);
>> >> +cond_syscall(sys_copy_file_range64);
>> >>
>> >> /* arch-specific weak syscall entries */
>> >> cond_syscall(sys_pciconfig_read);
>> >> --
>> >> 1.8.3.1
>> >>
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
>> > the body of a message to [email protected]
>> > More majordomo info at http://vger.kernel.org/majordomo-info.html

2017-06-19 19:39:26

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
> Yes NFS community does need one for doing a server-to-server copy
> (performance) feature.

s/NFS community/NetApp/

2017-06-19 19:48:16

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Mon, Jun 19, 2017 at 3:39 PM, Christoph Hellwig <[email protected]> wrote:
> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>> Yes NFS community does need one for doing a server-to-server copy
>> (performance) feature.
>
> s/NFS community/NetApp/

NetApp = users. Is there another reason for having a feature in the
kernel if not for the users?

2017-06-20 12:38:52

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

Hello,

On 06/19/2017 03:39 PM, Christoph Hellwig wrote:
> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>> Yes NFS community does need one for doing a server-to-server copy
>> (performance) feature.
>
> s/NFS community/NetApp/
Wrong... Red Hat is also very interested in this work
being that this support is in both Fedora and upcoming
RHEL releases...

I must admin I just don't understand what there is *any*
push back on a feature that would dramatically increase
the I/O speed of NFS. That would be good for the
*entire* Linux community... IMHO.

I'm coming to the party late... So is the only
sticking point is to have this vfs call
write to a different filesystem, correct?
Why? (please point to a email that already
explains it) Does that break some type of
boundary or is it because it was never
needed before??

Again, looking at the big picture... Allowing
NFS to copy files by only sending meta-data
is HUGE! It is such a win for the entire
community so I'm so surprise this community
is not trying to help the process instead
of push it back.

my two cents,

steved.


2017-06-20 12:44:37

by Steve Dickson

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call



On 06/15/2017 04:53 PM, Darrick J. Wong wrote:
> On Thu, Jun 15, 2017 at 10:39:42AM -0400, Olga Kornievskaia wrote:
>> On Wed, Jun 14, 2017 at 11:59 PM, Darrick J. Wong
>> <[email protected]> wrote:
>>> On Wed, Jun 14, 2017 at 02:53:35PM -0400, Olga Kornievskaia wrote:
>>>> This is a proposal to allow 64bit count to copy and return as
>>>> a result of a copy_file_range. No attempt was made to share code
>>>> with the 32bit function because 32bit interface should probably
>>>> get depreciated.
>>>>
>>>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>>>> which could use 64bit count and NFS copy_file_range also supports
>>>> 64bit count value.
>>>>
>>>> Also with this proposal off-the-bet allow the userland to copy
>>>> between different mount points.
>>>>
>>>> Signed-off-by: Olga Kornievskaia <[email protected]>
>>>> ---
>>>> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
>>>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>>>> fs/read_write.c | 146 +++++++++++++++++++++++++++++++--
>>>> include/linux/fs.h | 4 +
>>>> include/linux/syscalls.h | 3 +
>>>> include/uapi/asm-generic/unistd.h | 4 +-
>>>> kernel/sys_ni.c | 1 +
>>>> 7 files changed, 154 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>>>> index 448ac21..1f5f1e7 100644
>>>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>>>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>>>> @@ -391,3 +391,4 @@
>>>> 382 i386 pkey_free sys_pkey_free
>>>> 383 i386 statx sys_statx
>>>> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
>>>> +385 i386 copy_file_range64 sys_copy_file_range64
>>>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>>>> index 5aef183..e658bbe 100644
>>>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>>>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>>>> @@ -339,6 +339,7 @@
>>>> 330 common pkey_alloc sys_pkey_alloc
>>>> 331 common pkey_free sys_pkey_free
>>>> 332 common statx sys_statx
>>>> +333 64 copy_file_range64 sys_copy_file_range64
>>>>
>>>> #
>>>> # x32-specific system call numbers start at 512 to avoid cache impact
>>>> diff --git a/fs/read_write.c b/fs/read_write.c
>>>> index e3ee985..c9c072b 100644
>>>> --- a/fs/read_write.c
>>>> +++ b/fs/read_write.c
>>>> @@ -1700,7 +1700,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>>>> return ret;
>>>> }
>>>>
>>>> -static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>>>> +static int rw64_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>>>> {
>>>> struct inode *inode = file_inode(file);
>>>>
>>>> @@ -1723,6 +1723,142 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>>>> return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
>>>> }
>>>>
>>>> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>>>> + struct file *file_out, loff_t pos_out,
>>>> + u64 len, unsigned int flags)
>>>
>>> AFAICT the difference between vfs_copy_file_range() and
>>> vfs_copy_file_range64() is that you've replaced 'size_t len' with
>>> 'u64 len', correct?
>>
>> That and I'm asking to allow for cross-device copy to be allowed by
>> the system call.
>
> I was fine letting each fs decide if it was going to allow cross-device
> copy or not, but Christoph argued that since we don't generally allow
> operations across mountpoints we shouldn't allow cross-mountpoint
> {clone,copy}_file_range either.
>
> Roughly translated, XFS has no cross-device copy abilities, there aren't
> any plans to add it, so I don't feel motivated to argue for it... but if
> the nfs community is so motivated (it sounds like it) then y'all could
> try one more time.
>
So since XFS (or any other local fs) does not have this ability,
NFS does not needed it?? That can't be the reason for the
push back that is denying NFS this ability..

steved.

2017-06-20 19:34:13

by Chuck Lever

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call


> On Jun 19, 2017, at 3:39 PM, Christoph Hellwig <[email protected]> wrote:
>
> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>> Yes NFS community does need one for doing a server-to-server copy
>> (performance) feature.
>
> s/NFS community/NetApp/

I don't have an opinion on the API details, but Oracle is interested
in server-to-server copy, especially if there is an implementation
of it in the Linux NFS server.


--
Chuck Lever




2017-06-20 19:37:45

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Tue, Jun 20, 2017 at 3:33 PM, Chuck Lever <[email protected]> wrote:
>
>> On Jun 19, 2017, at 3:39 PM, Christoph Hellwig <[email protected]> wrote:
>>
>> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>>> Yes NFS community does need one for doing a server-to-server copy
>>> (performance) feature.
>>
>> s/NFS community/NetApp/
>
> I don't have an opinion on the API details, but Oracle is interested
> in server-to-server copy, especially if there is an implementation
> of it in the Linux NFS server.

The proposed patch set is the linux client AND linux server. We need
removal of the VFS cross-device check on both side (client and
destination server) to make the server-to-server copy work. On the
server side, we also call vfs_copy_file_range() that (when check is
removed) ends up reading from the NFS source file and writing to the
local file system.

2017-06-24 11:29:10

by Mkrtchyan, Tigran

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call



----- Original Message -----
> From: "Christoph Hellwig" <[email protected]>
> To: "Olga Kornievskaia" <[email protected]>
> Cc: "Darrick J. Wong" <[email protected]>, "Olga Kornievskaia" <[email protected]>, [email protected],
> "linux-nfs" <[email protected]>
> Sent: Monday, June 19, 2017 9:39:03 PM
> Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

> On Mon, Jun 19, 2017 at 02:34:58PM -0400, Olga Kornievskaia wrote:
>> Yes NFS community does need one for doing a server-to-server copy
>> (performance) feature.
>
> s/NFS community/NetApp/

As being probably the first person who have raised the need in server-to-server copy
in NFS (see slide 15 http://nfsv4bat.org/Documents/ConnectAThon/2008/managedstorage.pdf),
let me disagree with you.

Having end-user hat on me I can say, that over a decade we are moving scientific
data around the world with server-side-copy, or 3rd-party copy as we call
it. While we more-and-more use NFS (pNFS) for data access, we always need a
second protocol, like FTP or WebDAV, to perform efficient data movement.

I am not deep in kernel internals, but there must be a very good reason to block
such functionality in NFS. Such functionality will drastically simplify our
workflow. And I am sure, we are not the only users of it.

Tigran.


> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2017-06-24 13:23:27

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
> This is a proposal to allow 64bit count to copy and return as
> a result of a copy_file_range. No attempt was made to share code
> with the 32bit function because 32bit interface should probably
> get depreciated.
>
> Why use 64bit? Current uses of 32bit are by clone_file_range()
> which could use 64bit count and NFS copy_file_range also supports
> 64bit count value.
>
> Also with this proposal off-the-bet allow the userland to copy
> between different mount points.
>
> Signed-off-by: Olga Kornievskaia <[email protected]>
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> fs/read_write.c | 136 +++++++++++++++++++++++++++++++++
> include/linux/fs.h | 4 +
> include/linux/syscalls.h | 3 +
> include/uapi/asm-generic/unistd.h | 4 +-
> kernel/sys_ni.c | 1 +
> 7 files changed, 149 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 448ac21..1f5f1e7 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -391,3 +391,4 @@
> 382 i386 pkey_free sys_pkey_free
> 383 i386 statx sys_statx
> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
> +385 i386 copy_file_range64 sys_copy_file_range64
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183..e658bbe 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
> 330 common pkey_alloc sys_pkey_alloc
> 331 common pkey_free sys_pkey_free
> 332 common statx sys_statx
> +333 64 copy_file_range64 sys_copy_file_range64
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 47c1d44..e2665c1 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> return ret;
> }
>
> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out,
> + u64 len, unsigned int flags)
> +{
> + struct inode *inode_in = file_inode(file_in);
> + struct inode *inode_out = file_inode(file_out);
> + u64 ret;
> +
> + if (flags != 0)
> + return -EINVAL;
> +
> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> + return -EISDIR;
> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> + return -EINVAL;
> +
> + ret = rw_verify_area(READ, file_in, &pos_in, len);
> + if (unlikely(ret))
> + return ret;
> +
> + ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> + if (unlikely(ret))
> + return ret;
> +
> + if (!(file_in->f_mode & FMODE_READ) ||
> + !(file_out->f_mode & FMODE_WRITE) ||
> + (file_out->f_flags & O_APPEND))
> + return -EBADF;
> +
> + if (len == 0)
> + return 0;
> +
> + file_start_write(file_out);
> +
> + /*
> + * Try cloning first, this is supported by more file systems, and
> + * more efficient if both clone and copy are supported (e.g. NFS).
> + */
> + if (file_in->f_op->clone_file_range) {
> + ret = file_in->f_op->clone_file_range(file_in, pos_in,
> + file_out, pos_out, len);
> + if (ret == 0) {
> + ret = len;
> + goto done;
> + }
> + }
> +
> + if (file_out->f_op->copy_file_range64) {
> + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> + file_out, pos_out, len, flags);
> + if (ret != -EOPNOTSUPP)
> + goto done;
> + }
> +
> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> +
> +done:
> + if (ret > 0) {
> + fsnotify_access(file_in);
> + add_rchar(current, ret);
> + fsnotify_modify(file_out);
> + add_wchar(current, ret);
> + }
> +
> + inc_syscr(current);
> + inc_syscw(current);
> +
> + file_end_write(file_out);
> +
> + return ret;
> +}
> +EXPORT_SYMBOL(vfs_copy_file_range64);
> +
> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> + int, fd_out, loff_t __user *, off_out,
> + u64, len, unsigned int, flags)
> +{
> + loff_t pos_in;
> + loff_t pos_out;
> + struct fd f_in;
> + struct fd f_out;
> + u64 ret = -EBADF;
> +
> + f_in = fdget(fd_in);
> + if (!f_in.file)
> + goto out2;
> +
> + f_out = fdget(fd_out);
> + if (!f_out.file)
> + goto out1;
> +
> + ret = -EFAULT;
> + if (off_in) {
> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> + goto out;
> + } else {
> + pos_in = f_in.file->f_pos;
> + }
> +
> + if (off_out) {
> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> + goto out;
> + } else {
> + pos_out = f_out.file->f_pos;
> + }
> +
> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> + flags);
> + if (ret > 0) {
> + pos_in += ret;
> + pos_out += ret;
> +
> + if (off_in) {
> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> + ret = -EFAULT;
> + } else {
> + f_in.file->f_pos = pos_in;
> + }
> +
> + if (off_out) {
> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> + ret = -EFAULT;
> + } else {
> + f_out.file->f_pos = pos_out;
> + }
> + }
> +
> +out:
> + fdput(f_out);
> +out1:
> + fdput(f_in);
> +out2:
> + return ret;
> +}
> +
> static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> {
> struct inode *inode = file_inode(file);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 803e5a9..2727a98 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1690,6 +1690,8 @@ struct file_operations {
> u64);
> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
> u64);
> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> + loff_t, u64, unsigned int);
> };
>
> struct inode_operations {
> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> loff_t len, bool *is_same);
> extern int vfs_dedupe_file_range(struct file *file,
> struct file_dedupe_range *same);
> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> + loff_t, u64, unsigned int);
>
> struct super_operations {
> struct inode *(*alloc_inode)(struct super_block *sb);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9..f7ea78e 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
> asmlinkage long sys_pkey_free(int pkey);
> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> unsigned mask, struct statx __user *buffer);
> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> + int fd_out, loff_t __user *off_out,
> + u64 len, unsigned int flags);
>
> #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 061185a..e385a76 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -731,9 +731,11 @@
> __SYSCALL(__NR_pkey_free, sys_pkey_free)
> #define __NR_statx 291
> __SYSCALL(__NR_statx, sys_statx)
> +#define __NR_copy_file_range64 292
> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>
> #undef __NR_syscalls
> -#define __NR_syscalls 292
> +#define __NR_syscalls 293
>
> /*
> * All syscalls below here should go away really,
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 8acef85..8e0cfd4 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
> cond_syscall(sys_capget);
> cond_syscall(sys_capset);
> cond_syscall(sys_copy_file_range);
> +cond_syscall(sys_copy_file_range64);
>
> /* arch-specific weak syscall entries */
> cond_syscall(sys_pciconfig_read);

Hi Olga,

We discussed this a bit privately, but I'll note it here too.

Server-to-server copy seems like a nice thing to me as well. There are
several filesystems that can likely make use of that ability.

I don't have a real opinion on the API change either, but you're making
a very subtle change with this patch. Prior to this, the existing
copy_file_range calls could count on the superblocks being the same.
Now, it looks like you can pass them any old file pointer.

The exsiting copy_file_range file ops need to be fixed to vet the struct
files they've been handed if you want to do this.

It would also be nice to see something on copy_file_range and
clone_file_range in Documentation/filesystems/vfs.txt.

Cheers,
--
Jeff Layton <[email protected]>

2017-06-26 13:22:00

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call



On 06/24/2017 09:23 AM, Jeff Layton wrote:
> On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
>> This is a proposal to allow 64bit count to copy and return as
>> a result of a copy_file_range. No attempt was made to share code
>> with the 32bit function because 32bit interface should probably
>> get depreciated.
>>
>> Why use 64bit? Current uses of 32bit are by clone_file_range()
>> which could use 64bit count and NFS copy_file_range also supports
>> 64bit count value.
>>
>> Also with this proposal off-the-bet allow the userland to copy
>> between different mount points.
>>
>> Signed-off-by: Olga Kornievskaia <[email protected]>
>> ---
>> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
>> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>> fs/read_write.c | 136 +++++++++++++++++++++++++++++++++
>> include/linux/fs.h | 4 +
>> include/linux/syscalls.h | 3 +
>> include/uapi/asm-generic/unistd.h | 4 +-
>> kernel/sys_ni.c | 1 +
>> 7 files changed, 149 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> index 448ac21..1f5f1e7 100644
>> --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> @@ -391,3 +391,4 @@
>> 382 i386 pkey_free sys_pkey_free
>> 383 i386 statx sys_statx
>> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
>> +385 i386 copy_file_range64 sys_copy_file_range64
>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> index 5aef183..e658bbe 100644
>> --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> @@ -339,6 +339,7 @@
>> 330 common pkey_alloc sys_pkey_alloc
>> 331 common pkey_free sys_pkey_free
>> 332 common statx sys_statx
>> +333 64 copy_file_range64 sys_copy_file_range64
>>
>> #
>> # x32-specific system call numbers start at 512 to avoid cache impact
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 47c1d44..e2665c1 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> return ret;
>> }
>>
>> +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>> + struct file *file_out, loff_t pos_out,
>> + u64 len, unsigned int flags)
>> +{
>> + struct inode *inode_in = file_inode(file_in);
>> + struct inode *inode_out = file_inode(file_out);
>> + u64 ret;
>> +
>> + if (flags != 0)
>> + return -EINVAL;
>> +
>> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>> + return -EISDIR;
>> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
>> + return -EINVAL;
>> +
>> + ret = rw_verify_area(READ, file_in, &pos_in, len);
>> + if (unlikely(ret))
>> + return ret;
>> +
>> + ret = rw_verify_area(WRITE, file_out, &pos_out, len);
>> + if (unlikely(ret))
>> + return ret;
>> +
>> + if (!(file_in->f_mode & FMODE_READ) ||
>> + !(file_out->f_mode & FMODE_WRITE) ||
>> + (file_out->f_flags & O_APPEND))
>> + return -EBADF;
>> +
>> + if (len == 0)
>> + return 0;
>> +
>> + file_start_write(file_out);
>> +
>> + /*
>> + * Try cloning first, this is supported by more file systems, and
>> + * more efficient if both clone and copy are supported (e.g. NFS).
>> + */
>> + if (file_in->f_op->clone_file_range) {
>> + ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> + file_out, pos_out, len);
>> + if (ret == 0) {
>> + ret = len;
>> + goto done;
>> + }
>> + }
>> +
>> + if (file_out->f_op->copy_file_range64) {
>> + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> + file_out, pos_out, len, flags);
>> + if (ret != -EOPNOTSUPP)
>> + goto done;
>> + }
>> +
>> + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> +
>> +done:
>> + if (ret > 0) {
>> + fsnotify_access(file_in);
>> + add_rchar(current, ret);
>> + fsnotify_modify(file_out);
>> + add_wchar(current, ret);
>> + }
>> +
>> + inc_syscr(current);
>> + inc_syscw(current);
>> +
>> + file_end_write(file_out);
>> +
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(vfs_copy_file_range64);
>> +
>> +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
>> + int, fd_out, loff_t __user *, off_out,
>> + u64, len, unsigned int, flags)
>> +{
>> + loff_t pos_in;
>> + loff_t pos_out;
>> + struct fd f_in;
>> + struct fd f_out;
>> + u64 ret = -EBADF;
>> +
>> + f_in = fdget(fd_in);
>> + if (!f_in.file)
>> + goto out2;
>> +
>> + f_out = fdget(fd_out);
>> + if (!f_out.file)
>> + goto out1;
>> +
>> + ret = -EFAULT;
>> + if (off_in) {
>> + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
>> + goto out;
>> + } else {
>> + pos_in = f_in.file->f_pos;
>> + }
>> +
>> + if (off_out) {
>> + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
>> + goto out;
>> + } else {
>> + pos_out = f_out.file->f_pos;
>> + }
>> +
>> + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
>> + flags);
>> + if (ret > 0) {
>> + pos_in += ret;
>> + pos_out += ret;
>> +
>> + if (off_in) {
>> + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
>> + ret = -EFAULT;
>> + } else {
>> + f_in.file->f_pos = pos_in;
>> + }
>> +
>> + if (off_out) {
>> + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
>> + ret = -EFAULT;
>> + } else {
>> + f_out.file->f_pos = pos_out;
>> + }
>> + }
>> +
>> +out:
>> + fdput(f_out);
>> +out1:
>> + fdput(f_in);
>> +out2:
>> + return ret;
>> +}
>> +
>> static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> {
>> struct inode *inode = file_inode(file);
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 803e5a9..2727a98 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -1690,6 +1690,8 @@ struct file_operations {
>> u64);
>> ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>> u64);
>> + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
>> + loff_t, u64, unsigned int);
>> };
>>
>> struct inode_operations {
>> @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>> loff_t len, bool *is_same);
>> extern int vfs_dedupe_file_range(struct file *file,
>> struct file_dedupe_range *same);
>> +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
>> + loff_t, u64, unsigned int);
>>
>> struct super_operations {
>> struct inode *(*alloc_inode)(struct super_block *sb);
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index 980c3c9..f7ea78e 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>> asmlinkage long sys_pkey_free(int pkey);
>> asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>> unsigned mask, struct statx __user *buffer);
>> +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
>> + int fd_out, loff_t __user *off_out,
>> + u64 len, unsigned int flags);
>>
>> #endif
>> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
>> index 061185a..e385a76 100644
>> --- a/include/uapi/asm-generic/unistd.h
>> +++ b/include/uapi/asm-generic/unistd.h
>> @@ -731,9 +731,11 @@
>> __SYSCALL(__NR_pkey_free, sys_pkey_free)
>> #define __NR_statx 291
>> __SYSCALL(__NR_statx, sys_statx)
>> +#define __NR_copy_file_range64 292
>> +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>>
>> #undef __NR_syscalls
>> -#define __NR_syscalls 292
>> +#define __NR_syscalls 293
>>
>> /*
>> * All syscalls below here should go away really,
>> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
>> index 8acef85..8e0cfd4 100644
>> --- a/kernel/sys_ni.c
>> +++ b/kernel/sys_ni.c
>> @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>> cond_syscall(sys_capget);
>> cond_syscall(sys_capset);
>> cond_syscall(sys_copy_file_range);
>> +cond_syscall(sys_copy_file_range64);
>>
>> /* arch-specific weak syscall entries */
>> cond_syscall(sys_pciconfig_read);
>
> Hi Olga,
>
> We discussed this a bit privately, but I'll note it here too.
>
> Server-to-server copy seems like a nice thing to me as well. There are
> several filesystems that can likely make use of that ability.
>
> I don't have a real opinion on the API change either, but you're making
> a very subtle change with this patch. Prior to this, the existing
> copy_file_range calls could count on the superblocks being the same.
> Now, it looks like you can pass them any old file pointer.

What if we add a new file op for xdev copy that gets called when superblocks are different, but filesystem type is the same? We could keep the current copy_file_range fop to fall back on if superblocks are the same.

Thoughts?
Anna

>
> The exsiting copy_file_range file ops need to be fixed to vet the struct
> files they've been handed if you want to do this.
>
> It would also be nice to see something on copy_file_range and
> clone_file_range in Documentation/filesystems/vfs.txt.
>
> Cheers,
>

2017-06-26 13:46:34

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote:
>
> On 06/24/2017 09:23 AM, Jeff Layton wrote:
> > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
> > > This is a proposal to allow 64bit count to copy and return as
> > > a result of a copy_file_range. No attempt was made to share code
> > > with the 32bit function because 32bit interface should probably
> > > get depreciated.
> > >
> > > Why use 64bit? Current uses of 32bit are by clone_file_range()
> > > which could use 64bit count and NFS copy_file_range also supports
> > > 64bit count value.
> > >
> > > Also with this proposal off-the-bet allow the userland to copy
> > > between different mount points.
> > >
> > > Signed-off-by: Olga Kornievskaia <[email protected]>
> > > ---
> > > arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> > > arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> > > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++
> > > include/linux/fs.h | 4 +
> > > include/linux/syscalls.h | 3 +
> > > include/uapi/asm-generic/unistd.h | 4 +-
> > > kernel/sys_ni.c | 1 +
> > > 7 files changed, 149 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> > > index 448ac21..1f5f1e7 100644
> > > --- a/arch/x86/entry/syscalls/syscall_32.tbl
> > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> > > @@ -391,3 +391,4 @@
> > > 382 i386 pkey_free sys_pkey_free
> > > 383 i386 statx sys_statx
> > > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
> > > +385 i386 copy_file_range64 sys_copy_file_range64
> > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> > > index 5aef183..e658bbe 100644
> > > --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > > @@ -339,6 +339,7 @@
> > > 330 common pkey_alloc sys_pkey_alloc
> > > 331 common pkey_free sys_pkey_free
> > > 332 common statx sys_statx
> > > +333 64 copy_file_range64 sys_copy_file_range64
> > >
> > > #
> > > # x32-specific system call numbers start at 512 to avoid cache impact
> > > diff --git a/fs/read_write.c b/fs/read_write.c
> > > index 47c1d44..e2665c1 100644
> > > --- a/fs/read_write.c
> > > +++ b/fs/read_write.c
> > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> > > return ret;
> > > }
> > >
> > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> > > + struct file *file_out, loff_t pos_out,
> > > + u64 len, unsigned int flags)
> > > +{
> > > + struct inode *inode_in = file_inode(file_in);
> > > + struct inode *inode_out = file_inode(file_out);
> > > + u64 ret;
> > > +
> > > + if (flags != 0)
> > > + return -EINVAL;
> > > +
> > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> > > + return -EISDIR;
> > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> > > + return -EINVAL;
> > > +
> > > + ret = rw_verify_area(READ, file_in, &pos_in, len);
> > > + if (unlikely(ret))
> > > + return ret;
> > > +
> > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> > > + if (unlikely(ret))
> > > + return ret;
> > > +
> > > + if (!(file_in->f_mode & FMODE_READ) ||
> > > + !(file_out->f_mode & FMODE_WRITE) ||
> > > + (file_out->f_flags & O_APPEND))
> > > + return -EBADF;
> > > +
> > > + if (len == 0)
> > > + return 0;
> > > +
> > > + file_start_write(file_out);
> > > +
> > > + /*
> > > + * Try cloning first, this is supported by more file systems, and
> > > + * more efficient if both clone and copy are supported (e.g. NFS).
> > > + */
> > > + if (file_in->f_op->clone_file_range) {
> > > + ret = file_in->f_op->clone_file_range(file_in, pos_in,
> > > + file_out, pos_out, len);
> > > + if (ret == 0) {
> > > + ret = len;
> > > + goto done;
> > > + }
> > > + }
> > > +
> > > + if (file_out->f_op->copy_file_range64) {
> > > + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> > > + file_out, pos_out, len, flags);
> > > + if (ret != -EOPNOTSUPP)
> > > + goto done;
> > > + }
> > > +
> > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > +
> > > +done:
> > > + if (ret > 0) {
> > > + fsnotify_access(file_in);
> > > + add_rchar(current, ret);
> > > + fsnotify_modify(file_out);
> > > + add_wchar(current, ret);
> > > + }
> > > +
> > > + inc_syscr(current);
> > > + inc_syscw(current);
> > > +
> > > + file_end_write(file_out);
> > > +
> > > + return ret;
> > > +}
> > > +EXPORT_SYMBOL(vfs_copy_file_range64);
> > > +
> > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> > > + int, fd_out, loff_t __user *, off_out,
> > > + u64, len, unsigned int, flags)
> > > +{
> > > + loff_t pos_in;
> > > + loff_t pos_out;
> > > + struct fd f_in;
> > > + struct fd f_out;
> > > + u64 ret = -EBADF;
> > > +
> > > + f_in = fdget(fd_in);
> > > + if (!f_in.file)
> > > + goto out2;
> > > +
> > > + f_out = fdget(fd_out);
> > > + if (!f_out.file)
> > > + goto out1;
> > > +
> > > + ret = -EFAULT;
> > > + if (off_in) {
> > > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> > > + goto out;
> > > + } else {
> > > + pos_in = f_in.file->f_pos;
> > > + }
> > > +
> > > + if (off_out) {
> > > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> > > + goto out;
> > > + } else {
> > > + pos_out = f_out.file->f_pos;
> > > + }
> > > +
> > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> > > + flags);
> > > + if (ret > 0) {
> > > + pos_in += ret;
> > > + pos_out += ret;
> > > +
> > > + if (off_in) {
> > > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> > > + ret = -EFAULT;
> > > + } else {
> > > + f_in.file->f_pos = pos_in;
> > > + }
> > > +
> > > + if (off_out) {
> > > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> > > + ret = -EFAULT;
> > > + } else {
> > > + f_out.file->f_pos = pos_out;
> > > + }
> > > + }
> > > +
> > > +out:
> > > + fdput(f_out);
> > > +out1:
> > > + fdput(f_in);
> > > +out2:
> > > + return ret;
> > > +}
> > > +
> > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> > > {
> > > struct inode *inode = file_inode(file);
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index 803e5a9..2727a98 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -1690,6 +1690,8 @@ struct file_operations {
> > > u64);
> > > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
> > > u64);
> > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> > > + loff_t, u64, unsigned int);
> > > };
> > >
> > > struct inode_operations {
> > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > loff_t len, bool *is_same);
> > > extern int vfs_dedupe_file_range(struct file *file,
> > > struct file_dedupe_range *same);
> > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> > > + loff_t, u64, unsigned int);
> > >
> > > struct super_operations {
> > > struct inode *(*alloc_inode)(struct super_block *sb);
> > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > > index 980c3c9..f7ea78e 100644
> > > --- a/include/linux/syscalls.h
> > > +++ b/include/linux/syscalls.h
> > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
> > > asmlinkage long sys_pkey_free(int pkey);
> > > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> > > unsigned mask, struct statx __user *buffer);
> > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> > > + int fd_out, loff_t __user *off_out,
> > > + u64 len, unsigned int flags);
> > >
> > > #endif
> > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> > > index 061185a..e385a76 100644
> > > --- a/include/uapi/asm-generic/unistd.h
> > > +++ b/include/uapi/asm-generic/unistd.h
> > > @@ -731,9 +731,11 @@
> > > __SYSCALL(__NR_pkey_free, sys_pkey_free)
> > > #define __NR_statx 291
> > > __SYSCALL(__NR_statx, sys_statx)
> > > +#define __NR_copy_file_range64 292
> > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
> > >
> > > #undef __NR_syscalls
> > > -#define __NR_syscalls 292
> > > +#define __NR_syscalls 293
> > >
> > > /*
> > > * All syscalls below here should go away really,
> > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> > > index 8acef85..8e0cfd4 100644
> > > --- a/kernel/sys_ni.c
> > > +++ b/kernel/sys_ni.c
> > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
> > > cond_syscall(sys_capget);
> > > cond_syscall(sys_capset);
> > > cond_syscall(sys_copy_file_range);
> > > +cond_syscall(sys_copy_file_range64);
> > >
> > > /* arch-specific weak syscall entries */
> > > cond_syscall(sys_pciconfig_read);
> >
> > Hi Olga,
> >
> > We discussed this a bit privately, but I'll note it here too.
> >
> > Server-to-server copy seems like a nice thing to me as well. There are
> > several filesystems that can likely make use of that ability.
> >
> > I don't have a real opinion on the API change either, but you're making
> > a very subtle change with this patch. Prior to this, the existing
> > copy_file_range calls could count on the superblocks being the same.
> > Now, it looks like you can pass them any old file pointer.
>
> What if we add a new file op for xdev copy that gets called when
> superblocks are different, but filesystem type is the same? We could
> keep the current copy_file_range fop to fall back on if superblocks
> are the same.

I don't think that would really help much. There are only two
filesystems with copy_file_range operations -- nfsv4 and cifs. I don't
think we really need another special case op for this.

What I would probably suggest is to just push the check for the same
superblock down into the copy_file_range ops in one patch (with no
functional change). Then, you could go and lift that restriction in the
NFSv4 operation without regressing cifs. The CIFS folks could eventually
do the same for theirs.

--
Jeff Layton <[email protected]>

2017-06-26 14:46:59

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Mon, Jun 26, 2017 at 9:46 AM, Jeff Layton <[email protected]> wrote:
> On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote:
>>
>> On 06/24/2017 09:23 AM, Jeff Layton wrote:
>> > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
>> > > This is a proposal to allow 64bit count to copy and return as
>> > > a result of a copy_file_range. No attempt was made to share code
>> > > with the 32bit function because 32bit interface should probably
>> > > get depreciated.
>> > >
>> > > Why use 64bit? Current uses of 32bit are by clone_file_range()
>> > > which could use 64bit count and NFS copy_file_range also supports
>> > > 64bit count value.
>> > >
>> > > Also with this proposal off-the-bet allow the userland to copy
>> > > between different mount points.
>> > >
>> > > Signed-off-by: Olga Kornievskaia <[email protected]>
>> > > ---
>> > > arch/x86/entry/syscalls/syscall_32.tbl | 1 +
>> > > arch/x86/entry/syscalls/syscall_64.tbl | 1 +
>> > > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++
>> > > include/linux/fs.h | 4 +
>> > > include/linux/syscalls.h | 3 +
>> > > include/uapi/asm-generic/unistd.h | 4 +-
>> > > kernel/sys_ni.c | 1 +
>> > > 7 files changed, 149 insertions(+), 1 deletion(-)
>> > >
>> > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
>> > > index 448ac21..1f5f1e7 100644
>> > > --- a/arch/x86/entry/syscalls/syscall_32.tbl
>> > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl
>> > > @@ -391,3 +391,4 @@
>> > > 382 i386 pkey_free sys_pkey_free
>> > > 383 i386 statx sys_statx
>> > > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
>> > > +385 i386 copy_file_range64 sys_copy_file_range64
>> > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
>> > > index 5aef183..e658bbe 100644
>> > > --- a/arch/x86/entry/syscalls/syscall_64.tbl
>> > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
>> > > @@ -339,6 +339,7 @@
>> > > 330 common pkey_alloc sys_pkey_alloc
>> > > 331 common pkey_free sys_pkey_free
>> > > 332 common statx sys_statx
>> > > +333 64 copy_file_range64 sys_copy_file_range64
>> > >
>> > > #
>> > > # x32-specific system call numbers start at 512 to avoid cache impact
>> > > diff --git a/fs/read_write.c b/fs/read_write.c
>> > > index 47c1d44..e2665c1 100644
>> > > --- a/fs/read_write.c
>> > > +++ b/fs/read_write.c
>> > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> > > return ret;
>> > > }
>> > >
>> > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
>> > > + struct file *file_out, loff_t pos_out,
>> > > + u64 len, unsigned int flags)
>> > > +{
>> > > + struct inode *inode_in = file_inode(file_in);
>> > > + struct inode *inode_out = file_inode(file_out);
>> > > + u64 ret;
>> > > +
>> > > + if (flags != 0)
>> > > + return -EINVAL;
>> > > +
>> > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
>> > > + return -EISDIR;
>> > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
>> > > + return -EINVAL;
>> > > +
>> > > + ret = rw_verify_area(READ, file_in, &pos_in, len);
>> > > + if (unlikely(ret))
>> > > + return ret;
>> > > +
>> > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len);
>> > > + if (unlikely(ret))
>> > > + return ret;
>> > > +
>> > > + if (!(file_in->f_mode & FMODE_READ) ||
>> > > + !(file_out->f_mode & FMODE_WRITE) ||
>> > > + (file_out->f_flags & O_APPEND))
>> > > + return -EBADF;
>> > > +
>> > > + if (len == 0)
>> > > + return 0;
>> > > +
>> > > + file_start_write(file_out);
>> > > +
>> > > + /*
>> > > + * Try cloning first, this is supported by more file systems, and
>> > > + * more efficient if both clone and copy are supported (e.g. NFS).
>> > > + */
>> > > + if (file_in->f_op->clone_file_range) {
>> > > + ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> > > + file_out, pos_out, len);
>> > > + if (ret == 0) {
>> > > + ret = len;
>> > > + goto done;
>> > > + }
>> > > + }
>> > > +
>> > > + if (file_out->f_op->copy_file_range64) {
>> > > + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
>> > > + file_out, pos_out, len, flags);
>> > > + if (ret != -EOPNOTSUPP)
>> > > + goto done;
>> > > + }
>> > > +
>> > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
>> > > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
>> > > +
>> > > +done:
>> > > + if (ret > 0) {
>> > > + fsnotify_access(file_in);
>> > > + add_rchar(current, ret);
>> > > + fsnotify_modify(file_out);
>> > > + add_wchar(current, ret);
>> > > + }
>> > > +
>> > > + inc_syscr(current);
>> > > + inc_syscw(current);
>> > > +
>> > > + file_end_write(file_out);
>> > > +
>> > > + return ret;
>> > > +}
>> > > +EXPORT_SYMBOL(vfs_copy_file_range64);
>> > > +
>> > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
>> > > + int, fd_out, loff_t __user *, off_out,
>> > > + u64, len, unsigned int, flags)
>> > > +{
>> > > + loff_t pos_in;
>> > > + loff_t pos_out;
>> > > + struct fd f_in;
>> > > + struct fd f_out;
>> > > + u64 ret = -EBADF;
>> > > +
>> > > + f_in = fdget(fd_in);
>> > > + if (!f_in.file)
>> > > + goto out2;
>> > > +
>> > > + f_out = fdget(fd_out);
>> > > + if (!f_out.file)
>> > > + goto out1;
>> > > +
>> > > + ret = -EFAULT;
>> > > + if (off_in) {
>> > > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
>> > > + goto out;
>> > > + } else {
>> > > + pos_in = f_in.file->f_pos;
>> > > + }
>> > > +
>> > > + if (off_out) {
>> > > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
>> > > + goto out;
>> > > + } else {
>> > > + pos_out = f_out.file->f_pos;
>> > > + }
>> > > +
>> > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
>> > > + flags);
>> > > + if (ret > 0) {
>> > > + pos_in += ret;
>> > > + pos_out += ret;
>> > > +
>> > > + if (off_in) {
>> > > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
>> > > + ret = -EFAULT;
>> > > + } else {
>> > > + f_in.file->f_pos = pos_in;
>> > > + }
>> > > +
>> > > + if (off_out) {
>> > > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
>> > > + ret = -EFAULT;
>> > > + } else {
>> > > + f_out.file->f_pos = pos_out;
>> > > + }
>> > > + }
>> > > +
>> > > +out:
>> > > + fdput(f_out);
>> > > +out1:
>> > > + fdput(f_in);
>> > > +out2:
>> > > + return ret;
>> > > +}
>> > > +
>> > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
>> > > {
>> > > struct inode *inode = file_inode(file);
>> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
>> > > index 803e5a9..2727a98 100644
>> > > --- a/include/linux/fs.h
>> > > +++ b/include/linux/fs.h
>> > > @@ -1690,6 +1690,8 @@ struct file_operations {
>> > > u64);
>> > > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
>> > > u64);
>> > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
>> > > + loff_t, u64, unsigned int);
>> > > };
>> > >
>> > > struct inode_operations {
>> > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
>> > > loff_t len, bool *is_same);
>> > > extern int vfs_dedupe_file_range(struct file *file,
>> > > struct file_dedupe_range *same);
>> > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
>> > > + loff_t, u64, unsigned int);
>> > >
>> > > struct super_operations {
>> > > struct inode *(*alloc_inode)(struct super_block *sb);
>> > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> > > index 980c3c9..f7ea78e 100644
>> > > --- a/include/linux/syscalls.h
>> > > +++ b/include/linux/syscalls.h
>> > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
>> > > asmlinkage long sys_pkey_free(int pkey);
>> > > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
>> > > unsigned mask, struct statx __user *buffer);
>> > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
>> > > + int fd_out, loff_t __user *off_out,
>> > > + u64 len, unsigned int flags);
>> > >
>> > > #endif
>> > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
>> > > index 061185a..e385a76 100644
>> > > --- a/include/uapi/asm-generic/unistd.h
>> > > +++ b/include/uapi/asm-generic/unistd.h
>> > > @@ -731,9 +731,11 @@
>> > > __SYSCALL(__NR_pkey_free, sys_pkey_free)
>> > > #define __NR_statx 291
>> > > __SYSCALL(__NR_statx, sys_statx)
>> > > +#define __NR_copy_file_range64 292
>> > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
>> > >
>> > > #undef __NR_syscalls
>> > > -#define __NR_syscalls 292
>> > > +#define __NR_syscalls 293
>> > >
>> > > /*
>> > > * All syscalls below here should go away really,
>> > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
>> > > index 8acef85..8e0cfd4 100644
>> > > --- a/kernel/sys_ni.c
>> > > +++ b/kernel/sys_ni.c
>> > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
>> > > cond_syscall(sys_capget);
>> > > cond_syscall(sys_capset);
>> > > cond_syscall(sys_copy_file_range);
>> > > +cond_syscall(sys_copy_file_range64);
>> > >
>> > > /* arch-specific weak syscall entries */
>> > > cond_syscall(sys_pciconfig_read);
>> >
>> > Hi Olga,
>> >
>> > We discussed this a bit privately, but I'll note it here too.
>> >
>> > Server-to-server copy seems like a nice thing to me as well. There are
>> > several filesystems that can likely make use of that ability.
>> >
>> > I don't have a real opinion on the API change either, but you're making
>> > a very subtle change with this patch. Prior to this, the existing
>> > copy_file_range calls could count on the superblocks being the same.
>> > Now, it looks like you can pass them any old file pointer.
>>
>> What if we add a new file op for xdev copy that gets called when
>> superblocks are different, but filesystem type is the same? We could
>> keep the current copy_file_range fop to fall back on if superblocks
>> are the same.
>
> I don't think that would really help much. There are only two
> filesystems with copy_file_range operations -- nfsv4 and cifs. I don't
> think we really need another special case op for this.
>
> What I would probably suggest is to just push the check for the same
> superblock down into the copy_file_range ops in one patch (with no
> functional change). Then, you could go and lift that restriction in the
> NFSv4 operation without regressing cifs. The CIFS folks could eventually
> do the same for theirs.

CIFS folks already check if their target and source destination are
different then they return with EXDEV. So I believe the check that
file system ops that are the same for the fd_in and fd_out would be
sufficient for the current uses?

2017-06-26 14:49:52

by Olga Kornievskaia

[permalink] [raw]
Subject: [PATCH 1/1] VFS permit cross device vfs_copy_file_range

Allow copy_file_range to copy between different superblocks but only
of the same file system types.

This feature is needed by NFSv4.2 to perform file copy operation on
the same server or file copy between different NFSv4.2 servers.

If a file system's fileoperations copy_file_range operation prohibits
cross-device copies, fall back to do_splice_direct. This is needed for
the NFS (destination) server side implementation of the file copy.

Currently there is only 1 implementor of the copy_file_range FS
operation -- CIFS. CIFS assumes incoming file descriptors are both
CIFS but it will check if they are coming from different servers.

NFS will allow for copies between different NFS servers.

Adding to the vfs.txt documentation to explicitly warn about allowing
for different superblocks of the same file type to be passed into the
copy_file_range for the future users of copy_file_range method.

Signed-off-by: Olga Kornievskaia <[email protected]>
---
Documentation/filesystems/vfs.txt | 7 +++++++
fs/read_write.c | 12 +++++-------
2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f42b906..cf22424 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -845,6 +845,8 @@ struct file_operations {
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
+ ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
+ loff_t, size_t, unsigned int);
};

Again, all methods are called without any locks being held, unless
@@ -913,6 +915,11 @@ otherwise noted.

fallocate: called by the VFS to preallocate blocks or punch a hole.

+ copy_file_range: called by copy_file_range(2) system call. This method
+ works on two file descriptors that might reside on
+ different superblocks of the same type of file system.
+
+
Note that the file operations are implemented by the specific
filesystem in which the inode resides. When opening a device node
(character or block special) most filesystems will call special
diff --git a/fs/read_write.c b/fs/read_write.c
index 47c1d44..effbfb2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1589,10 +1589,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
(file_out->f_flags & O_APPEND))
return -EBADF;

- /* this could be relaxed once a method supports cross-fs copies */
- if (inode_in->i_sb != inode_out->i_sb)
- return -EXDEV;
-
if (len == 0)
return 0;

@@ -1602,7 +1598,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* Try cloning first, this is supported by more file systems, and
* more efficient if both clone and copy are supported (e.g. NFS).
*/
- if (file_in->f_op->clone_file_range) {
+ if (inode_in->i_sb == inode_out->i_sb &&
+ file_in->f_op->clone_file_range) {
ret = file_in->f_op->clone_file_range(file_in, pos_in,
file_out, pos_out, len);
if (ret == 0) {
@@ -1611,10 +1608,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
}
}

- if (file_out->f_op->copy_file_range) {
+ if (file_out->f_op->copy_file_range &&
+ (file_in->f_op == file_out->f_op)) {
ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
pos_out, len, flags);
- if (ret != -EOPNOTSUPP)
+ if (ret != -EOPNOTSUPP && ret != -EXDEV)
goto done;
}

--
1.8.3.1


2017-06-26 14:51:03

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 1/1] [RFC] 64bit copy_file_range system call

On Mon, 2017-06-26 at 10:40 -0400, Olga Kornievskaia wrote:
> > On Jun 26, 2017, at 9:46 AM, Jeff Layton <[email protected]> wrote:
> >
> > On Mon, 2017-06-26 at 09:21 -0400, Anna Schumaker wrote:
> > > On 06/24/2017 09:23 AM, Jeff Layton wrote:
> > > > On Wed, 2017-06-14 at 13:06 -0400, Olga Kornievskaia wrote:
> > > > > This is a proposal to allow 64bit count to copy and return as
> > > > > a result of a copy_file_range. No attempt was made to share code
> > > > > with the 32bit function because 32bit interface should probably
> > > > > get depreciated.
> > > > >
> > > > > Why use 64bit? Current uses of 32bit are by clone_file_range()
> > > > > which could use 64bit count and NFS copy_file_range also supports
> > > > > 64bit count value.
> > > > >
> > > > > Also with this proposal off-the-bet allow the userland to copy
> > > > > between different mount points.
> > > > >
> > > > > Signed-off-by: Olga Kornievskaia <[email protected]>
> > > > > ---
> > > > > arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> > > > > arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> > > > > fs/read_write.c | 136 +++++++++++++++++++++++++++++++++
> > > > > include/linux/fs.h | 4 +
> > > > > include/linux/syscalls.h | 3 +
> > > > > include/uapi/asm-generic/unistd.h | 4 +-
> > > > > kernel/sys_ni.c | 1 +
> > > > > 7 files changed, 149 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> > > > > index 448ac21..1f5f1e7 100644
> > > > > --- a/arch/x86/entry/syscalls/syscall_32.tbl
> > > > > +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> > > > > @@ -391,3 +391,4 @@
> > > > > 382 i386 pkey_free sys_pkey_free
> > > > > 383 i386 statx sys_statx
> > > > > 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
> > > > > +385 i386 copy_file_range64 sys_copy_file_range64
> > > > > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> > > > > index 5aef183..e658bbe 100644
> > > > > --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > > > > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > > > > @@ -339,6 +339,7 @@
> > > > > 330 common pkey_alloc sys_pkey_alloc
> > > > > 331 common pkey_free sys_pkey_free
> > > > > 332 common statx sys_statx
> > > > > +333 64 copy_file_range64 sys_copy_file_range64
> > > > >
> > > > > #
> > > > > # x32-specific system call numbers start at 512 to avoid cache impact
> > > > > diff --git a/fs/read_write.c b/fs/read_write.c
> > > > > index 47c1d44..e2665c1 100644
> > > > > --- a/fs/read_write.c
> > > > > +++ b/fs/read_write.c
> > > > > @@ -1700,6 +1700,142 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> > > > > return ret;
> > > > > }
> > > > >
> > > > > +u64 vfs_copy_file_range64(struct file *file_in, loff_t pos_in,
> > > > > + struct file *file_out, loff_t pos_out,
> > > > > + u64 len, unsigned int flags)
> > > > > +{
> > > > > + struct inode *inode_in = file_inode(file_in);
> > > > > + struct inode *inode_out = file_inode(file_out);
> > > > > + u64 ret;
> > > > > +
> > > > > + if (flags != 0)
> > > > > + return -EINVAL;
> > > > > +
> > > > > + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> > > > > + return -EISDIR;
> > > > > + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> > > > > + return -EINVAL;
> > > > > +
> > > > > + ret = rw_verify_area(READ, file_in, &pos_in, len);
> > > > > + if (unlikely(ret))
> > > > > + return ret;
> > > > > +
> > > > > + ret = rw_verify_area(WRITE, file_out, &pos_out, len);
> > > > > + if (unlikely(ret))
> > > > > + return ret;
> > > > > +
> > > > > + if (!(file_in->f_mode & FMODE_READ) ||
> > > > > + !(file_out->f_mode & FMODE_WRITE) ||
> > > > > + (file_out->f_flags & O_APPEND))
> > > > > + return -EBADF;
> > > > > +
> > > > > + if (len == 0)
> > > > > + return 0;
> > > > > +
> > > > > + file_start_write(file_out);
> > > > > +
> > > > > + /*
> > > > > + * Try cloning first, this is supported by more file systems, and
> > > > > + * more efficient if both clone and copy are supported (e.g. NFS).
> > > > > + */
> > > > > + if (file_in->f_op->clone_file_range) {
> > > > > + ret = file_in->f_op->clone_file_range(file_in, pos_in,
> > > > > + file_out, pos_out, len);
> > > > > + if (ret == 0) {
> > > > > + ret = len;
> > > > > + goto done;
> > > > > + }
> > > > > + }
> > > > > +
> > > > > + if (file_out->f_op->copy_file_range64) {
> > > > > + ret = file_out->f_op->copy_file_range64(file_in, pos_in,
> > > > > + file_out, pos_out, len, flags);
> > > > > + if (ret != -EOPNOTSUPP)
> > > > > + goto done;
> > > > > + }
> > > > > +
> > > > > + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
> > > > > + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
> > > > > +
> > > > > +done:
> > > > > + if (ret > 0) {
> > > > > + fsnotify_access(file_in);
> > > > > + add_rchar(current, ret);
> > > > > + fsnotify_modify(file_out);
> > > > > + add_wchar(current, ret);
> > > > > + }
> > > > > +
> > > > > + inc_syscr(current);
> > > > > + inc_syscw(current);
> > > > > +
> > > > > + file_end_write(file_out);
> > > > > +
> > > > > + return ret;
> > > > > +}
> > > > > +EXPORT_SYMBOL(vfs_copy_file_range64);
> > > > > +
> > > > > +SYSCALL_DEFINE6(copy_file_range64, int, fd_in, loff_t __user *, off_in,
> > > > > + int, fd_out, loff_t __user *, off_out,
> > > > > + u64, len, unsigned int, flags)
> > > > > +{
> > > > > + loff_t pos_in;
> > > > > + loff_t pos_out;
> > > > > + struct fd f_in;
> > > > > + struct fd f_out;
> > > > > + u64 ret = -EBADF;
> > > > > +
> > > > > + f_in = fdget(fd_in);
> > > > > + if (!f_in.file)
> > > > > + goto out2;
> > > > > +
> > > > > + f_out = fdget(fd_out);
> > > > > + if (!f_out.file)
> > > > > + goto out1;
> > > > > +
> > > > > + ret = -EFAULT;
> > > > > + if (off_in) {
> > > > > + if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
> > > > > + goto out;
> > > > > + } else {
> > > > > + pos_in = f_in.file->f_pos;
> > > > > + }
> > > > > +
> > > > > + if (off_out) {
> > > > > + if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
> > > > > + goto out;
> > > > > + } else {
> > > > > + pos_out = f_out.file->f_pos;
> > > > > + }
> > > > > +
> > > > > + ret = vfs_copy_file_range64(f_in.file, pos_in, f_out.file, pos_out, len,
> > > > > + flags);
> > > > > + if (ret > 0) {
> > > > > + pos_in += ret;
> > > > > + pos_out += ret;
> > > > > +
> > > > > + if (off_in) {
> > > > > + if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
> > > > > + ret = -EFAULT;
> > > > > + } else {
> > > > > + f_in.file->f_pos = pos_in;
> > > > > + }
> > > > > +
> > > > > + if (off_out) {
> > > > > + if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
> > > > > + ret = -EFAULT;
> > > > > + } else {
> > > > > + f_out.file->f_pos = pos_out;
> > > > > + }
> > > > > + }
> > > > > +
> > > > > +out:
> > > > > + fdput(f_out);
> > > > > +out1:
> > > > > + fdput(f_in);
> > > > > +out2:
> > > > > + return ret;
> > > > > +}
> > > > > +
> > > > > static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> > > > > {
> > > > > struct inode *inode = file_inode(file);
> > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > index 803e5a9..2727a98 100644
> > > > > --- a/include/linux/fs.h
> > > > > +++ b/include/linux/fs.h
> > > > > @@ -1690,6 +1690,8 @@ struct file_operations {
> > > > > u64);
> > > > > ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
> > > > > u64);
> > > > > + u64 (*copy_file_range64)(struct file *, loff_t, struct file *,
> > > > > + loff_t, u64, unsigned int);
> > > > > };
> > > > >
> > > > > struct inode_operations {
> > > > > @@ -1770,6 +1772,8 @@ extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
> > > > > loff_t len, bool *is_same);
> > > > > extern int vfs_dedupe_file_range(struct file *file,
> > > > > struct file_dedupe_range *same);
> > > > > +extern u64 vfs_copy_file_range64(struct file *, loff_t, struct file *,
> > > > > + loff_t, u64, unsigned int);
> > > > >
> > > > > struct super_operations {
> > > > > struct inode *(*alloc_inode)(struct super_block *sb);
> > > > > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > > > > index 980c3c9..f7ea78e 100644
> > > > > --- a/include/linux/syscalls.h
> > > > > +++ b/include/linux/syscalls.h
> > > > > @@ -905,5 +905,8 @@ asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
> > > > > asmlinkage long sys_pkey_free(int pkey);
> > > > > asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
> > > > > unsigned mask, struct statx __user *buffer);
> > > > > +asmlinkage long sys_copy_file_range64(int fd_in, loff_t __user *off_in,
> > > > > + int fd_out, loff_t __user *off_out,
> > > > > + u64 len, unsigned int flags);
> > > > >
> > > > > #endif
> > > > > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> > > > > index 061185a..e385a76 100644
> > > > > --- a/include/uapi/asm-generic/unistd.h
> > > > > +++ b/include/uapi/asm-generic/unistd.h
> > > > > @@ -731,9 +731,11 @@
> > > > > __SYSCALL(__NR_pkey_free, sys_pkey_free)
> > > > > #define __NR_statx 291
> > > > > __SYSCALL(__NR_statx, sys_statx)
> > > > > +#define __NR_copy_file_range64 292
> > > > > +__SYSCALL(__NR_copy_file_range64, sys_copy_file_range64)
> > > > >
> > > > > #undef __NR_syscalls
> > > > > -#define __NR_syscalls 292
> > > > > +#define __NR_syscalls 293
> > > > >
> > > > > /*
> > > > > * All syscalls below here should go away really,
> > > > > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> > > > > index 8acef85..8e0cfd4 100644
> > > > > --- a/kernel/sys_ni.c
> > > > > +++ b/kernel/sys_ni.c
> > > > > @@ -178,6 +178,7 @@ asmlinkage long sys_ni_syscall(void)
> > > > > cond_syscall(sys_capget);
> > > > > cond_syscall(sys_capset);
> > > > > cond_syscall(sys_copy_file_range);
> > > > > +cond_syscall(sys_copy_file_range64);
> > > > >
> > > > > /* arch-specific weak syscall entries */
> > > > > cond_syscall(sys_pciconfig_read);
> > > >
> > > > Hi Olga,
> > > >
> > > > We discussed this a bit privately, but I'll note it here too.
> > > >
> > > > Server-to-server copy seems like a nice thing to me as well. There are
> > > > several filesystems that can likely make use of that ability.
> > > >
> > > > I don't have a real opinion on the API change either, but you're making
> > > > a very subtle change with this patch. Prior to this, the existing
> > > > copy_file_range calls could count on the superblocks being the same.
> > > > Now, it looks like you can pass them any old file pointer.
> > >
> > > What if we add a new file op for xdev copy that gets called when
> > > superblocks are different, but filesystem type is the same? We could
> > > keep the current copy_file_range fop to fall back on if superblocks
> > > are the same.
> >
> > I don't think that would really help much. There are only two
> > filesystems with copy_file_range operations -- nfsv4 and cifs. I don't
> > think we really need another special case op for this.
> >
> > What I would probably suggest is to just push the check for the same
> > superblock down into the copy_file_range ops in one patch (with no
> > functional change). Then, you could go and lift that restriction in the
> > NFSv4 operation without regressing cifs. The CIFS folks could eventually
> > do the same for theirs.
>
> CIFS folks already check if their target and source destination are different then they return with EXDEV. So I believe the check that file system ops that are the same for the fd_in and fd_out would be sufficient for the current uses?
>

Yes that should cover it.
--
Jeff Layton <[email protected]>

2017-06-27 11:47:46

by Jeff Layton

[permalink] [raw]
Subject: Re: [PATCH 1/1] VFS permit cross device vfs_copy_file_range

On Mon, 2017-06-26 at 10:49 -0400, Olga Kornievskaia wrote:
> Allow copy_file_range to copy between different superblocks but only
> of the same file system types.
>
> This feature is needed by NFSv4.2 to perform file copy operation on
> the same server or file copy between different NFSv4.2 servers.
>
> If a file system's fileoperations copy_file_range operation prohibits
> cross-device copies, fall back to do_splice_direct. This is needed for
> the NFS (destination) server side implementation of the file copy.
>
> Currently there is only 1 implementor of the copy_file_range FS
> operation -- CIFS. CIFS assumes incoming file descriptors are both
> CIFS but it will check if they are coming from different servers.
>
> NFS will allow for copies between different NFS servers.
>
> Adding to the vfs.txt documentation to explicitly warn about allowing
> for different superblocks of the same file type to be passed into the
> copy_file_range for the future users of copy_file_range method.
>
> Signed-off-by: Olga Kornievskaia <[email protected]>
> ---
> Documentation/filesystems/vfs.txt | 7 +++++++
> fs/read_write.c | 12 +++++-------
> 2 files changed, 12 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
> index f42b906..cf22424 100644
> --- a/Documentation/filesystems/vfs.txt
> +++ b/Documentation/filesystems/vfs.txt
> @@ -845,6 +845,8 @@ struct file_operations {
> #ifndef CONFIG_MMU
> unsigned (*mmap_capabilities)(struct file *);
> #endif
> + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
> + loff_t, size_t, unsigned int);
> };
>
> Again, all methods are called without any locks being held, unless
> @@ -913,6 +915,11 @@ otherwise noted.
>
> fallocate: called by the VFS to preallocate blocks or punch a hole.
>
> + copy_file_range: called by copy_file_range(2) system call. This method
> + works on two file descriptors that might reside on
> + different superblocks of the same type of file system.
> +
> +
> Note that the file operations are implemented by the specific
> filesystem in which the inode resides. When opening a device node
> (character or block special) most filesystems will call special
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 47c1d44..effbfb2 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1589,10 +1589,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> (file_out->f_flags & O_APPEND))
> return -EBADF;
>
> - /* this could be relaxed once a method supports cross-fs copies */
> - if (inode_in->i_sb != inode_out->i_sb)
> - return -EXDEV;
> -
> if (len == 0)
> return 0;
>
> @@ -1602,7 +1598,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> * Try cloning first, this is supported by more file systems, and
> * more efficient if both clone and copy are supported (e.g. NFS).
> */
> - if (file_in->f_op->clone_file_range) {
> + if (inode_in->i_sb == inode_out->i_sb &&
> + file_in->f_op->clone_file_range) {

I do wonder if we really ought to check for the same superblock here.

ISTM that it might be possible for some fs' (btrfs?) to clone blocks
across superblocks in some situations?

Oh well -- I guess we can cross that bridge when we come to it.

> ret = file_in->f_op->clone_file_range(file_in, pos_in,
> file_out, pos_out, len);
> if (ret == 0) {
> @@ -1611,10 +1608,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
> }
> }
>
> - if (file_out->f_op->copy_file_range) {
> + if (file_out->f_op->copy_file_range &&
> + (file_in->f_op == file_out->f_op)) {

It's possible that they might not have the same f_op pointer, but have
the same copy_file_range operation (e.g. cifs.ko has a bunch of
different f_op structures for different cases, so they might not be the
same even though they have the same copy_file_range op).

Maybe this should be:

file_in->f_op->copy_file_range == file_out->f_op->copy_file_range

?

> ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
> pos_out, len, flags);
> - if (ret != -EOPNOTSUPP)
> + if (ret != -EOPNOTSUPP && ret != -EXDEV)
> goto done;
> }
>

--
Jeff Layton <[email protected]>

2017-06-27 16:12:32

by Olga Kornievskaia

[permalink] [raw]
Subject: Re: [PATCH 1/1] VFS permit cross device vfs_copy_file_range

On Tue, Jun 27, 2017 at 7:47 AM, Jeff Layton <[email protected]> wrote:
> On Mon, 2017-06-26 at 10:49 -0400, Olga Kornievskaia wrote:
>> Allow copy_file_range to copy between different superblocks but only
>> of the same file system types.
>>
>> This feature is needed by NFSv4.2 to perform file copy operation on
>> the same server or file copy between different NFSv4.2 servers.
>>
>> If a file system's fileoperations copy_file_range operation prohibits
>> cross-device copies, fall back to do_splice_direct. This is needed for
>> the NFS (destination) server side implementation of the file copy.
>>
>> Currently there is only 1 implementor of the copy_file_range FS
>> operation -- CIFS. CIFS assumes incoming file descriptors are both
>> CIFS but it will check if they are coming from different servers.
>>
>> NFS will allow for copies between different NFS servers.
>>
>> Adding to the vfs.txt documentation to explicitly warn about allowing
>> for different superblocks of the same file type to be passed into the
>> copy_file_range for the future users of copy_file_range method.
>>
>> Signed-off-by: Olga Kornievskaia <[email protected]>
>> ---
>> Documentation/filesystems/vfs.txt | 7 +++++++
>> fs/read_write.c | 12 +++++-------
>> 2 files changed, 12 insertions(+), 7 deletions(-)
>>
>> diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
>> index f42b906..cf22424 100644
>> --- a/Documentation/filesystems/vfs.txt
>> +++ b/Documentation/filesystems/vfs.txt
>> @@ -845,6 +845,8 @@ struct file_operations {
>> #ifndef CONFIG_MMU
>> unsigned (*mmap_capabilities)(struct file *);
>> #endif
>> + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
>> + loff_t, size_t, unsigned int);
>> };
>>
>> Again, all methods are called without any locks being held, unless
>> @@ -913,6 +915,11 @@ otherwise noted.
>>
>> fallocate: called by the VFS to preallocate blocks or punch a hole.
>>
>> + copy_file_range: called by copy_file_range(2) system call. This method
>> + works on two file descriptors that might reside on
>> + different superblocks of the same type of file system.
>> +
>> +
>> Note that the file operations are implemented by the specific
>> filesystem in which the inode resides. When opening a device node
>> (character or block special) most filesystems will call special
>> diff --git a/fs/read_write.c b/fs/read_write.c
>> index 47c1d44..effbfb2 100644
>> --- a/fs/read_write.c
>> +++ b/fs/read_write.c
>> @@ -1589,10 +1589,6 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> (file_out->f_flags & O_APPEND))
>> return -EBADF;
>>
>> - /* this could be relaxed once a method supports cross-fs copies */
>> - if (inode_in->i_sb != inode_out->i_sb)
>> - return -EXDEV;
>> -
>> if (len == 0)
>> return 0;
>>
>> @@ -1602,7 +1598,8 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> * Try cloning first, this is supported by more file systems, and
>> * more efficient if both clone and copy are supported (e.g. NFS).
>> */
>> - if (file_in->f_op->clone_file_range) {
>> + if (inode_in->i_sb == inode_out->i_sb &&
>> + file_in->f_op->clone_file_range) {
>
> I do wonder if we really ought to check for the same superblock here.
>
> ISTM that it might be possible for some fs' (btrfs?) to clone blocks
> across superblocks in some situations?
>
> Oh well -- I guess we can cross that bridge when we come to it.

I was trying to keep with the expectations of the underlying function
calls. Since I'm not proposing to change clone_file_range, I added
that check.

>> ret = file_in->f_op->clone_file_range(file_in, pos_in,
>> file_out, pos_out, len);
>> if (ret == 0) {
>> @@ -1611,10 +1608,11 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
>> }
>> }
>>
>> - if (file_out->f_op->copy_file_range) {
>> + if (file_out->f_op->copy_file_range &&
>> + (file_in->f_op == file_out->f_op)) {
>
> It's possible that they might not have the same f_op pointer, but have
> the same copy_file_range operation (e.g. cifs.ko has a bunch of
> different f_op structures for different cases, so they might not be the
> same even though they have the same copy_file_range op).
>
> Maybe this should be:
>
> file_in->f_op->copy_file_range == file_out->f_op->copy_file_range
>
> ?

Yeah, I was debating between those two options and decided to go with
f_op pointer but I agree the other way is the way to go. I'll wait to
hear if there are any other comments and if not make this change and
send the next version.

thank you for the comments!

>> ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
>> pos_out, len, flags);
>> - if (ret != -EOPNOTSUPP)
>> + if (ret != -EOPNOTSUPP && ret != -EXDEV)
>> goto done;
>> }
>>
>
> --
> Jeff Layton <[email protected]>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html