2015-07-13 09:07:04

by Peng Tao

[permalink] [raw]
Subject: [PATCH v2 0/8] nfs: minor cleanups and NFSv42 CLONE support

Hi all,

The first two patches are minor cleanups. The remaining six add NFSv42 CLONE support via
a new ioctl interface. There's a minor conflict between the cleanups and the new functionality,
so I ended up sending them together.

Tested with the following program as well as `cp --reflink=always`.

Cheers,
Tao

v1->v2 changelog:
1. change NFS_IOC_CLONE definition to match BTRFS_IOC_CLONE
2. add NFS_IOC_CLONE_RANGE, which is the same as BTRFS_IOC_CLONE_RANGE
3. add Christoph's reviewed-by lines on the first two patches

[lear@tests]$cat copy_reflink.c
#include <sys/ioctl.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* Print a one-line synopsis of the expected command-line arguments to stdout. */
void usage(const char *cmd)
{
	fprintf(stdout, "%s: <src_file_name> <dst_file_name>\n", cmd);
}

/*
 * Clone the whole of argv[1] into argv[2] using the NFS_IOC_CLONE ioctl.
 *
 * The destination is created (mode 0644) if it does not exist. The ioctl is
 * issued on the destination descriptor with the source descriptor as its
 * argument.
 *
 * Returns 0 on success, -1 on a usage error, when either file cannot be
 * opened, or when the ioctl fails.
 */
int main(int argc, char **argv)
{
	int srcfd, dstfd;
	int ret = 0;

	if (argc < 3) {
		usage(argv[0]);
		return -1;
	}

	srcfd = open(argv[1], O_RDONLY);
	if (srcfd < 0) {
		perror("open failed");
		return -1;
	}
	dstfd = open(argv[2], O_WRONLY|O_CREAT, 0644);
	if (dstfd < 0) {
		/* report first: close() below may overwrite errno */
		perror("open failed");
		close(srcfd);
		return -1;
	}

	/* Fallback for system headers that do not provide NFS_IOC_CLONE yet. */
#ifndef NFS_IOC_CLONE
#define NFS_IOC_CLONE _IOW(0x94, 9, int)
#endif
	if (ioctl(dstfd, NFS_IOC_CLONE, srcfd) < 0) {
		/* no trailing newline: perror appends ": <errno text>\n" itself */
		perror("ioctl CLONE failed");
		ret = -1;
	}

	close(srcfd);
	close(dstfd);
	return ret;
}


Peng Tao (8):
nfs42: decode_layoutstats does not need res parameter
nfs42: remove unused declaration
nfs42: add CLONE xdr functions
nfs42: add CLONE proc functions
nfs42: add NFS_IOC_CLONE ioctl
nfs: get clone_blksize when probing fsinfo
nfs42: respect clone_blksize
nfs42: add NFS_IOC_CLONE_RANGE ioctl

fs/nfs/client.c | 1 +
fs/nfs/nfs42.h | 3 +-
fs/nfs/nfs42proc.c | 71 ++++++++++++++++++++++++++
fs/nfs/nfs42xdr.c | 102 +++++++++++++++++++++++++++++++++++--
fs/nfs/nfs4file.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++
fs/nfs/nfs4proc.c | 4 +-
fs/nfs/nfs4xdr.c | 26 ++++++++++
include/linux/nfs4.h | 3 ++
include/linux/nfs_fs_sb.h | 2 +
include/linux/nfs_xdr.h | 20 ++++++++
include/uapi/linux/nfs.h | 11 ++++
11 files changed, 363 insertions(+), 7 deletions(-)

--
1.8.3.1



2015-07-13 09:07:08

by Peng Tao

[permalink] [raw]
Subject: [PATCH v2 1/8] nfs42: decode_layoutstats does not need res parameter

Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/nfs42xdr.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index a6bd27d..0eb29e1 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -238,8 +238,7 @@ out_overflow:
return -EIO;
}

-static int decode_layoutstats(struct xdr_stream *xdr,
- struct nfs42_layoutstat_res *res)
+static int decode_layoutstats(struct xdr_stream *xdr)
{
return decode_op_hdr(xdr, OP_LAYOUTSTATS);
}
@@ -343,7 +342,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
goto out;
WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
for (i = 0; i < res->num_dev; i++) {
- status = decode_layoutstats(xdr, res);
+ status = decode_layoutstats(xdr);
if (status)
goto out;
}
--
1.8.3.1


2015-07-13 09:07:12

by Peng Tao

[permalink] [raw]
Subject: [PATCH v2 2/8] nfs42: remove unused declaration

Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/nfs42.h | 2 --
1 file changed, 2 deletions(-)

diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index ff66ae7..814c125 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,7 +17,5 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
-/* nfs4.2xdr.h */
-extern struct rpc_procinfo nfs4_2_procedures[];

#endif /* __LINUX_FS_NFS_NFS4_2_H */
--
1.8.3.1


2015-07-13 09:07:15

by Peng Tao

[permalink] [raw]
Subject: [PATCH v2 3/8] nfs42: add CLONE xdr functions

xdr definitions per draft-ietf-nfsv4-minorversion2-38.txt

Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/nfs42xdr.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++-
fs/nfs/nfs4xdr.c | 1 +
include/linux/nfs4.h | 2 +
include/linux/nfs_xdr.h | 19 ++++++++++
4 files changed, 118 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0eb29e1..0ca482a 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -34,6 +34,12 @@
1 /* opaque devaddr4 length */ + \
XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_clone_maxsz (encode_stateid_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* src offset */ + \
+ 2 /* dst offset */ + \
+ 2 /* count */)
+#define decode_clone_maxsz (op_decode_hdr_maxsz)

#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
@@ -65,7 +71,20 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
-
+#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_clone_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_clone_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_clone_maxsz + \
+ decode_getattr_maxsz)

static void encode_fallocate(struct xdr_stream *xdr,
struct nfs42_falloc_args *args)
@@ -128,6 +147,21 @@ static void encode_layoutstats(struct xdr_stream *xdr,
encode_uint32(xdr, 0);
}

+static void encode_clone(struct xdr_stream *xdr,
+ struct nfs42_clone_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_CLONE, decode_clone_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+ p = reserve_space(xdr, 3*8);
+ p = xdr_encode_hyper(p, args->src_offset);
+ p = xdr_encode_hyper(p, args->dst_offset);
+ xdr_encode_hyper(p, args->count);
+}
+
/*
* Encode ALLOCATE request
*/
@@ -206,6 +240,27 @@ static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
encode_nops(&hdr);
}

+/*
+ * Encode CLONE request
+ */
+static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_clone(xdr, args, &hdr);
+ encode_getfattr(xdr, args->dst_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -243,6 +298,11 @@ static int decode_layoutstats(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_LAYOUTSTATS);
}

+static int decode_clone(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_CLONE);
+}
+
/*
* Decode ALLOCATE request
*/
@@ -351,4 +411,39 @@ out:
return status;
}

+/*
+ * Decode CLONE request
+ */
+static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_clone(xdr);
+ if (status)
+ goto out;
+ status = decode_getfattr(xdr, res->dst_fattr, res->server);
+
+out:
+ res->rpc_status = status;
+ return status;
+}
+
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 558cd65d..d9ea209 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7432,6 +7432,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(ALLOCATE, enc_allocate, dec_allocate),
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
+ PROC(CLONE, enc_clone, dec_clone),
#endif /* CONFIG_NFS_V4_2 */
};

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index b8e72aa..7ccb06a 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -130,6 +130,7 @@ enum nfs_opnum4 {
OP_READ_PLUS = 68,
OP_SEEK = 69,
OP_WRITE_SAME = 70,
+ OP_CLONE = 71,

OP_ILLEGAL = 10044,
};
@@ -501,6 +502,7 @@ enum {
NFSPROC4_CLNT_ALLOCATE,
NFSPROC4_CLNT_DEALLOCATE,
NFSPROC4_CLNT_LAYOUTSTATS,
+ NFSPROC4_CLNT_CLONE,
};

/* nfs41 types */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7bbe505..838b7b6 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -359,6 +359,25 @@ struct nfs42_layoutstat_data {
struct nfs42_layoutstat_res res;
};

+struct nfs42_clone_args {
+ struct nfs4_sequence_args seq_args;
+ struct nfs_fh *src_fh;
+ struct nfs_fh *dst_fh;
+ nfs4_stateid src_stateid;
+ nfs4_stateid dst_stateid;
+ __u64 src_offset;
+ __u64 dst_offset;
+ __u64 count;
+ const u32 *dst_bitmask;
+};
+
+struct nfs42_clone_res {
+ struct nfs4_sequence_res seq_res;
+ unsigned int rpc_status;
+ struct nfs_fattr *dst_fattr;
+ const struct nfs_server *server;
+};
+
struct stateowner_id {
__u64 create_time;
__u32 uniquifier;
--
1.8.3.1


2015-07-13 09:07:19

by Peng Tao

[permalink] [raw]
Subject: [PATCH v2 4/8] nfs42: add CLONE proc functions

Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/nfs42.h | 1 +
fs/nfs/nfs42proc.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++
fs/nfs/nfs4proc.c | 3 +-
include/linux/nfs_fs_sb.h | 1 +
4 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 814c125..b587ccd 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,5 +17,6 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
+int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);

#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index dd33a24..01031f7 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -253,3 +253,74 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
rpc_put_task(task);
return 0;
}
+
+static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
+ struct file *dst_f, loff_t src_offset,
+ loff_t dst_offset, loff_t count)
+{
+ struct inode *src_inode = file_inode(src_f);
+ struct inode *dst_inode = file_inode(dst_f);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct nfs42_clone_args args = {
+ .src_fh = NFS_FH(src_inode),
+ .dst_fh = NFS_FH(dst_inode),
+ .src_offset = src_offset,
+ .dst_offset = dst_offset,
+ .dst_bitmask = server->cache_consistency_bitmask,
+ };
+ struct nfs42_clone_res res = {
+ .server = server,
+ };
+ int status;
+
+ msg->rpc_argp = &args;
+ msg->rpc_resp = &res;
+
+ status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
+ if (status)
+ return status;
+
+ status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
+ if (status)
+ return status;
+
+ res.dst_fattr = nfs_alloc_fattr();
+ if (!res.dst_fattr)
+ return -ENOMEM;
+
+ status = nfs4_call_sync(server->client, server, msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == 0)
+ status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
+
+ kfree(res.dst_fattr);
+ return status;
+}
+
+int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
+ loff_t src_offset, loff_t dst_offset, loff_t count)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
+ };
+ struct inode *inode = file_inode(src_f);
+ struct nfs_server *server = NFS_SERVER(file_inode(src_f));
+ struct nfs4_exception exception = { };
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_CLONE))
+ return -EOPNOTSUPP;
+
+ do {
+ err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
+ dst_offset, count);
+ if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
+ return -EOPNOTSUPP;
+ }
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+
+ return err;
+
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6f228b5..a61e815 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8646,7 +8646,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_ALLOCATE
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
- | NFS_CAP_LAYOUTSTATS,
+ | NFS_CAP_LAYOUTSTATS
+ | NFS_CAP_CLONE,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index a2ea149..682b751 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -238,5 +238,6 @@ struct nfs_server {
#define NFS_CAP_ALLOCATE (1U << 20)
#define NFS_CAP_DEALLOCATE (1U << 21)
#define NFS_CAP_LAYOUTSTATS (1U << 22)
+#define NFS_CAP_CLONE (1U << 23)

#endif
--
1.8.3.1


2015-07-13 09:07:23

by Peng Tao

[permalink] [raw]
Subject: [PATCH v2 5/8] nfs42: add NFS_IOC_CLONE ioctl

It can be called by user space to CLONE two files.
Follow btrfs lead and define NFS_IOC_CLONE same as BTRFS_IOC_CLONE.
Thus we don't mess up userspace with too many ioctls.

Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/nfs4file.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++
include/uapi/linux/nfs.h | 4 ++
2 files changed, 106 insertions(+)

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index dcd39d4..dfa6620 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -4,6 +4,7 @@
* Copyright (C) 1992 Rick Sladkey
*/
#include <linux/fs.h>
+#include <linux/file.h>
#include <linux/falloc.h>
#include <linux/nfs_fs.h>
#include "internal.h"
@@ -166,8 +167,104 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
return nfs42_proc_deallocate(filep, offset, len);
return nfs42_proc_allocate(filep, offset, len);
}
+
+static noinline long
+nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
+ u64 src_off, u64 dst_off, u64 count)
+{
+ struct inode *dst_inode = file_inode(dst_file);
+ struct fd src_file;
+ struct inode *src_inode;
+ int ret;
+
+ /* dst file must be opened for writing */
+ if (!(dst_file->f_mode & FMODE_WRITE))
+ return -EINVAL;
+
+ ret = mnt_want_write_file(dst_file);
+ if (ret)
+ return ret;
+
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
+ ret = -EBADF;
+ goto out_drop_write;
+ }
+
+ src_inode = file_inode(src_file.file);
+
+ /* src and dst must be different files */
+ ret = -EINVAL;
+ if (src_inode == dst_inode)
+ goto out_fput;
+
+ /* src file must be opened for reading */
+ if (!(src_file.file->f_mode & FMODE_READ))
+ goto out_fput;
+
+ /* src and dst must be regular files */
+ ret = -EISDIR;
+ if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode))
+ goto out_fput;
+
+ ret = -EXDEV;
+ if (src_file.file->f_path.mnt != dst_file->f_path.mnt ||
+ src_inode->i_sb != dst_inode->i_sb)
+ goto out_fput;
+
+ /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
+ if (dst_inode < src_inode) {
+ mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
+ }
+
+ /* flush all pending writes on both src and dst so that server
+ * has the latest data */
+ ret = nfs_sync_inode(src_inode);
+ if (ret)
+ goto out_unlock;
+ ret = nfs_sync_inode(dst_inode);
+ if (ret)
+ goto out_unlock;
+
+ ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count);
+
+ /* truncate inode page cache of the dst range so that future reads can fetch
+ * new data from server */
+ if (!ret)
+ truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
+
+out_unlock:
+ if (dst_inode < src_inode) {
+ mutex_unlock(&src_inode->i_mutex);
+ mutex_unlock(&dst_inode->i_mutex);
+ } else {
+ mutex_unlock(&dst_inode->i_mutex);
+ mutex_unlock(&src_inode->i_mutex);
+ }
+out_fput:
+ fdput(src_file);
+out_drop_write:
+ mnt_drop_write_file(dst_file);
+ return ret;
+}
#endif /* CONFIG_NFS_V4_2 */

+long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+#ifdef CONFIG_NFS_V4_2
+ case NFS_IOC_CLONE:
+ return nfs42_ioctl_clone(file, arg, 0, 0, 0);
+#endif
+ }
+
+ return -ENOTTY;
+}
+
const struct file_operations nfs4_file_operations = {
#ifdef CONFIG_NFS_V4_2
.llseek = nfs4_file_llseek,
@@ -190,4 +287,9 @@ const struct file_operations nfs4_file_operations = {
#endif /* CONFIG_NFS_V4_2 */
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
+#ifdef CONFIG_COMPAT
+ .unlocked_ioctl = nfs4_ioctl,
+#else
+ .compat_ioctl = nfs4_ioctl,
+#endif /* CONFIG_COMPAT */
};
diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
index 5199a36..d85748d 100644
--- a/include/uapi/linux/nfs.h
+++ b/include/uapi/linux/nfs.h
@@ -31,6 +31,10 @@

#define NFS_PIPE_DIRNAME "nfs"

+/* NFS ioctls */
+/* Let's follow btrfs lead on CLONE to avoid messing userspace */
+#define NFS_IOC_CLONE _IOW(0x94, 9, int)
+
/*
* NFS stats. The good thing with these values is that NFSv3 errors are
* a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which
--
1.8.3.1


2015-07-13 09:07:27

by Peng Tao

[permalink] [raw]
Subject: [PATCH v2 6/8] nfs: get clone_blksize when probing fsinfo

NFSv42 CLONE operation is supposed to respect it.

Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/client.c | 1 +
fs/nfs/nfs4proc.c | 1 +
fs/nfs/nfs4xdr.c | 25 +++++++++++++++++++++++++
include/linux/nfs4.h | 1 +
include/linux/nfs_fs_sb.h | 1 +
include/linux/nfs_xdr.h | 1 +
6 files changed, 30 insertions(+)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ecebb40..51793ab 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -873,6 +873,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,

server->time_delta = fsinfo->time_delta;

+ server->clone_blksize = fsinfo->clone_blksize;
/* We're airborne Set socket buffersize */
rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a61e815..04c8adc 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -239,6 +239,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
FATTR4_WORD1_TIME_DELTA
| FATTR4_WORD1_FS_LAYOUT_TYPES,
FATTR4_WORD2_LAYOUT_BLKSIZE
+ | FATTR4_WORD2_CLONE_BLKSIZE
};

const u32 nfs4_fs_locations_bitmap[3] = {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d9ea209..8d621e9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4735,6 +4735,28 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
return 0;
}

+/*
+ * The granularity of a CLONE operation.
+ */
+static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p)) {
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+ }
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
+ }
+ return 0;
+}
+
static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
{
unsigned int savep;
@@ -4769,6 +4791,9 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
if (status)
goto xdr_error;
+ status = decode_attr_clone_blksize(xdr, bitmap, &fsinfo->clone_blksize);
+ if (status)
+ goto xdr_error;

status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 7ccb06a..e865be4 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -422,6 +422,7 @@ enum lock_type4 {
#define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0)
#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1)
#define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4)
+#define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13)
#define FATTR4_WORD2_SECURITY_LABEL (1UL << 16)

/* MDS threshold bitmap bits */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 682b751..4fb8fca 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -147,6 +147,7 @@ struct nfs_server {
unsigned int acdirmax;
unsigned int namelen;
unsigned int options; /* extra options enabled by mount */
+ unsigned int clone_blksize; /* granularity of a CLONE operation */
#define NFS_OPTION_FSCACHE 0x00000001 /* - local caching enabled */
#define NFS_OPTION_MIGRATION 0x00000002 /* - NFSv4 migration enabled */

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 838b7b6..7998133 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -141,6 +141,7 @@ struct nfs_fsinfo {
__u32 lease_time; /* in seconds */
__u32 layouttype; /* supported pnfs layout driver */
__u32 blksize; /* preferred pnfs io block size */
+ __u32 clone_blksize; /* granularity of a CLONE operation */
};

struct nfs_fsstat {
--
1.8.3.1


2015-07-13 09:07:31

by Peng Tao

[permalink] [raw]
Subject: [PATCH v2 7/8] nfs42: respect clone_blksize

draft-ietf-nfsv4-minorversion2-38.txt says:
Both cl_src_offset and
cl_dst_offset must be aligned to the clone block size Section 12.2.1.
The number of bytes to be cloned must be a multiple of the clone
block size, except in the case in which cl_src_offset plus the number
of bytes to be cloned is equal to the source file size.

Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/nfs4file.c | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index dfa6620..e450a5f 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -173,8 +173,10 @@ nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
u64 src_off, u64 dst_off, u64 count)
{
struct inode *dst_inode = file_inode(dst_file);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
struct fd src_file;
struct inode *src_inode;
+ unsigned int bs = server->clone_blksize;
int ret;

/* dst file must be opened for writing */
@@ -212,6 +214,15 @@ nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
src_inode->i_sb != dst_inode->i_sb)
goto out_fput;

+ /* check alignment w.r.t. clone_blksize */
+ ret = -EINVAL;
+ if (bs) {
+ if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs))
+ goto out_fput;
+ if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count))
+ goto out_fput;
+ }
+
/* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
if (dst_inode < src_inode) {
mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
--
1.8.3.1


2015-07-13 09:07:35

by Peng Tao

[permalink] [raw]
Subject: [PATCH v2 8/8] nfs42: add NFS_IOC_CLONE_RANGE ioctl

It follows btrfs BTRFS_IOC_CLONE_RANGE lead on ioctl number and
arguments.

Signed-off-by: Peng Tao <[email protected]>
---
fs/nfs/nfs4file.c | 14 ++++++++++++++
include/uapi/linux/nfs.h | 9 ++++++++-
2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e450a5f..f37b74c 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -262,14 +262,28 @@ out_drop_write:
mnt_drop_write_file(dst_file);
return ret;
}
+
+static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp)
+{
+ struct nfs_ioctl_clone_range_args args;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ return nfs42_ioctl_clone(dst_file, args.src_fd, args.src_off, args.dst_off, args.count);
+}
#endif /* CONFIG_NFS_V4_2 */

long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = (void __user *)arg;
+
switch (cmd) {
#ifdef CONFIG_NFS_V4_2
case NFS_IOC_CLONE:
return nfs42_ioctl_clone(file, arg, 0, 0, 0);
+ case NFS_IOC_CLONE_RANGE:
+ return nfs42_ioctl_clone_range(file, argp);
#endif
}

diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
index d85748d..c6b86cc 100644
--- a/include/uapi/linux/nfs.h
+++ b/include/uapi/linux/nfs.h
@@ -33,7 +33,14 @@

/* NFS ioctls */
/* Let's follow btrfs lead on CLONE to avoid messing userspace */
-#define NFS_IOC_CLONE _IOW(0x94, 9, int)
+#define NFS_IOC_CLONE _IOW(0x94, 9, int)
+#define NFS_IOC_CLONE_RANGE _IOW(0x94, 13, int)
+
+struct nfs_ioctl_clone_range_args {
+ __s64 src_fd;
+ __u64 src_off, count;
+ __u64 dst_off;
+};

/*
* NFS stats. The good thing with these values is that NFSv3 errors are
--
1.8.3.1


2015-07-13 13:53:54

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v2 5/8] nfs42: add NFS_IOC_CLONE ioctl

Hi Tao,

One question inline (below):

On 07/13/2015 05:06 AM, Peng Tao wrote:
> It can be called by user space to CLONE two files.
> Follow btrfs lead and define NFS_IOC_CLONE same as BTRFS_IOC_CLONE.
> Thus we don't mess up userspace with too many ioctls.
>
> Signed-off-by: Peng Tao <[email protected]>
> ---
> fs/nfs/nfs4file.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++
> include/uapi/linux/nfs.h | 4 ++
> 2 files changed, 106 insertions(+)
>
> diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
> index dcd39d4..dfa6620 100644
> --- a/fs/nfs/nfs4file.c
> +++ b/fs/nfs/nfs4file.c
> @@ -4,6 +4,7 @@
> * Copyright (C) 1992 Rick Sladkey
> */
> #include <linux/fs.h>
> +#include <linux/file.h>
> #include <linux/falloc.h>
> #include <linux/nfs_fs.h>
> #include "internal.h"
> @@ -166,8 +167,104 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
> return nfs42_proc_deallocate(filep, offset, len);
> return nfs42_proc_allocate(filep, offset, len);
> }
> +
> +static noinline long
> +nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
> + u64 src_off, u64 dst_off, u64 count)
> +{
> + struct inode *dst_inode = file_inode(dst_file);
> + struct fd src_file;
> + struct inode *src_inode;
> + int ret;
> +
> + /* dst file must be opened for writing */
> + if (!(dst_file->f_mode & FMODE_WRITE))
> + return -EINVAL;
> +
> + ret = mnt_want_write_file(dst_file);
> + if (ret)
> + return ret;
> +
> + src_file = fdget(srcfd);
> + if (!src_file.file) {
> + ret = -EBADF;
> + goto out_drop_write;
> + }
> +
> + src_inode = file_inode(src_file.file);
> +
> + /* src and dst must be different files */
> + ret = -EINVAL;
> + if (src_inode == dst_inode)
> + goto out_fput;
> +
> + /* src file must be opened for reading */
> + if (!(src_file.file->f_mode & FMODE_READ))
> + goto out_fput;
> +
> + /* src and dst must be regular files */
> + ret = -EISDIR;
> + if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode))
> + goto out_fput;
> +
> + ret = -EXDEV;
> + if (src_file.file->f_path.mnt != dst_file->f_path.mnt ||
> + src_inode->i_sb != dst_inode->i_sb)
> + goto out_fput;
> +
> + /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
> + if (dst_inode < src_inode) {

Why is the order of inode numbers important?

Thanks,
Anna

> + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
> + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
> + } else {
> + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
> + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
> + }
> +
> + /* flush all pending writes on both src and dst so that server
> + * has the latest data */
> + ret = nfs_sync_inode(src_inode);
> + if (ret)
> + goto out_unlock;
> + ret = nfs_sync_inode(dst_inode);
> + if (ret)
> + goto out_unlock;
> +
> + ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count);
> +
> + /* truncate inode page cache of the dst range so that future reads can fetch
> + * new data from server */
> + if (!ret)
> + truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
> +
> +out_unlock:
> + if (dst_inode < src_inode) {
> + mutex_unlock(&src_inode->i_mutex);
> + mutex_unlock(&dst_inode->i_mutex);
> + } else {
> + mutex_unlock(&dst_inode->i_mutex);
> + mutex_unlock(&src_inode->i_mutex);
> + }
> +out_fput:
> + fdput(src_file);
> +out_drop_write:
> + mnt_drop_write_file(dst_file);
> + return ret;
> +}
> #endif /* CONFIG_NFS_V4_2 */
>
> +long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> +{
> + switch (cmd) {
> +#ifdef CONFIG_NFS_V4_2
> + case NFS_IOC_CLONE:
> + return nfs42_ioctl_clone(file, arg, 0, 0, 0);
> +#endif
> + }
> +
> + return -ENOTTY;
> +}
> +
> const struct file_operations nfs4_file_operations = {
> #ifdef CONFIG_NFS_V4_2
> .llseek = nfs4_file_llseek,
> @@ -190,4 +287,9 @@ const struct file_operations nfs4_file_operations = {
> #endif /* CONFIG_NFS_V4_2 */
> .check_flags = nfs_check_flags,
> .setlease = simple_nosetlease,
> +#ifdef CONFIG_COMPAT
> + .unlocked_ioctl = nfs4_ioctl,
> +#else
> + .compat_ioctl = nfs4_ioctl,
> +#endif /* CONFIG_COMPAT */
> };
> diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
> index 5199a36..d85748d 100644
> --- a/include/uapi/linux/nfs.h
> +++ b/include/uapi/linux/nfs.h
> @@ -31,6 +31,10 @@
>
> #define NFS_PIPE_DIRNAME "nfs"
>
> +/* NFS ioctls */
> +/* Let's follow btrfs lead on CLONE to avoid messing userspace */
> +#define NFS_IOC_CLONE _IOW(0x94, 9, int)
> +
> /*
> * NFS stats. The good thing with these values is that NFSv3 errors are
> * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which
>


2015-07-13 15:46:45

by Peng Tao

[permalink] [raw]
Subject: Re: [PATCH v2 5/8] nfs42: add NFS_IOC_CLONE ioctl

On Mon, Jul 13, 2015 at 9:53 PM, Anna Schumaker
<[email protected]> wrote:
> Hi Tao,
>
> One question inline (below):
>
> On 07/13/2015 05:06 AM, Peng Tao wrote:
>> It can be called by user space to CLONE two files.
>> Follow btrfs lead and define NFS_IOC_CLONE same as BTRFS_IOC_CLONE.
>> Thus we don't mess up userspace with too many ioctls.
>>
>> Signed-off-by: Peng Tao <[email protected]>
>> ---
>> fs/nfs/nfs4file.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++
>> include/uapi/linux/nfs.h | 4 ++
>> 2 files changed, 106 insertions(+)
>>
>> diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
>> index dcd39d4..dfa6620 100644
>> --- a/fs/nfs/nfs4file.c
>> +++ b/fs/nfs/nfs4file.c
>> @@ -4,6 +4,7 @@
>> * Copyright (C) 1992 Rick Sladkey
>> */
>> #include <linux/fs.h>
>> +#include <linux/file.h>
>> #include <linux/falloc.h>
>> #include <linux/nfs_fs.h>
>> #include "internal.h"
>> @@ -166,8 +167,104 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
>> return nfs42_proc_deallocate(filep, offset, len);
>> return nfs42_proc_allocate(filep, offset, len);
>> }
>> +
>> +static noinline long
>> +nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
>> + u64 src_off, u64 dst_off, u64 count)
>> +{
>> + struct inode *dst_inode = file_inode(dst_file);
>> + struct fd src_file;
>> + struct inode *src_inode;
>> + int ret;
>> +
>> + /* dst file must be opened for writing */
>> + if (!(dst_file->f_mode & FMODE_WRITE))
>> + return -EINVAL;
>> +
>> + ret = mnt_want_write_file(dst_file);
>> + if (ret)
>> + return ret;
>> +
>> + src_file = fdget(srcfd);
>> + if (!src_file.file) {
>> + ret = -EBADF;
>> + goto out_drop_write;
>> + }
>> +
>> + src_inode = file_inode(src_file.file);
>> +
>> + /* src and dst must be different files */
>> + ret = -EINVAL;
>> + if (src_inode == dst_inode)
>> + goto out_fput;
>> +
>> + /* src file must be opened for reading */
>> + if (!(src_file.file->f_mode & FMODE_READ))
>> + goto out_fput;
>> +
>> + /* src and dst must be regular files */
>> + ret = -EISDIR;
>> + if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode))
>> + goto out_fput;
>> +
>> + ret = -EXDEV;
>> + if (src_file.file->f_path.mnt != dst_file->f_path.mnt ||
>> + src_inode->i_sb != dst_inode->i_sb)
>> + goto out_fput;
>> +
>> + /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
>> + if (dst_inode < src_inode) {
>
> Why is the order of inode numbers important?
If we get two concurrent clone ioctls CLONE(foo,bar) and
CLONE(bar,foo), the lock ordering avoids deadlock since we always lock
the inode with the lower address first (the comparison is on the inode
pointers, not on inode numbers).

Cheers,
Tao

>
> Thanks,
> Anna
>
>> + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
>> + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
>> + } else {
>> + mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
>> + mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
>> + }
>> +
>> + /* flush all pending writes on both src and dst so that server
>> + * has the latest data */
>> + ret = nfs_sync_inode(src_inode);
>> + if (ret)
>> + goto out_unlock;
>> + ret = nfs_sync_inode(dst_inode);
>> + if (ret)
>> + goto out_unlock;
>> +
>> + ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count);
>> +
>> + /* truncate inode page cache of the dst range so that future reads can fetch
>> + * new data from server */
>> + if (!ret)
>> + truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
>> +
>> +out_unlock:
>> + if (dst_inode < src_inode) {
>> + mutex_unlock(&src_inode->i_mutex);
>> + mutex_unlock(&dst_inode->i_mutex);
>> + } else {
>> + mutex_unlock(&dst_inode->i_mutex);
>> + mutex_unlock(&src_inode->i_mutex);
>> + }
>> +out_fput:
>> + fdput(src_file);
>> +out_drop_write:
>> + mnt_drop_write_file(dst_file);
>> + return ret;
>> +}
>> #endif /* CONFIG_NFS_V4_2 */
>>
>> +long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>> +{
>> + switch (cmd) {
>> +#ifdef CONFIG_NFS_V4_2
>> + case NFS_IOC_CLONE:
>> + return nfs42_ioctl_clone(file, arg, 0, 0, 0);
>> +#endif
>> + }
>> +
>> + return -ENOTTY;
>> +}
>> +
>> const struct file_operations nfs4_file_operations = {
>> #ifdef CONFIG_NFS_V4_2
>> .llseek = nfs4_file_llseek,
>> @@ -190,4 +287,9 @@ const struct file_operations nfs4_file_operations = {
>> #endif /* CONFIG_NFS_V4_2 */
>> .check_flags = nfs_check_flags,
>> .setlease = simple_nosetlease,
>> +#ifdef CONFIG_COMPAT
>> + .unlocked_ioctl = nfs4_ioctl,
>> +#else
>> + .compat_ioctl = nfs4_ioctl,
>> +#endif /* CONFIG_COMPAT */
>> };
>> diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
>> index 5199a36..d85748d 100644
>> --- a/include/uapi/linux/nfs.h
>> +++ b/include/uapi/linux/nfs.h
>> @@ -31,6 +31,10 @@
>>
>> #define NFS_PIPE_DIRNAME "nfs"
>>
>> +/* NFS ioctls */
>> +/* Let's follow btrfs lead on CLONE to avoid messing userspace */
>> +#define NFS_IOC_CLONE _IOW(0x94, 9, int)
>> +
>> /*
>> * NFS stats. The good thing with these values is that NFSv3 errors are
>> * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2015-07-26 16:59:58

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v2 0/8] nfs: minor cleanups and NFSv42 CLONE support

How is your progress on the server side patches to allow testing this?

Also can you move the btrfs ioctls to the VFS and dispatch them through
a file operation?

And please add Zack and Darren to the Cc list as they are involved with
local filesystem cloning, thanks!