2016-04-29 20:42:20

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

These patches add client and server support for the NFS v4.2 COPY operation,
and make use of the new copy_file_range() system call.

Changes in v4:
- Rename nfsd_copy_range() -> nfsd_copy_file_range() to match clone
- Remove extra size checks that are handled by the VFS

Still missing:
- xfstests unit tests


Questions, comments, and other testing ideas would be greatly appreciated!

Thanks,
Anna



Anna Schumaker (4):
NFS: Add nfs_commit_file()
NFS: Add COPY nfs operation
NFSD: Implement the COPY call
vfs_copy_range() test program

fs/nfs/internal.h | 1 +
fs/nfs/nfs42.h | 1 +
fs/nfs/nfs42proc.c | 105 +++++++++++++++++++++++++++++++++
fs/nfs/nfs42xdr.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++
fs/nfs/nfs4file.c | 23 ++++++++
fs/nfs/nfs4proc.c | 1 +
fs/nfs/nfs4xdr.c | 1 +
fs/nfs/pagelist.c | 6 +-
fs/nfs/write.c | 41 +++++++++++--
fs/nfsd/nfs4proc.c | 90 ++++++++++++++++++++++++----
fs/nfsd/nfs4xdr.c | 63 +++++++++++++++++++-
fs/nfsd/vfs.c | 6 ++
fs/nfsd/vfs.h | 2 +
fs/nfsd/xdr4.h | 23 ++++++++
include/linux/nfs4.h | 1 +
include/linux/nfs_fs_sb.h | 1 +
include/linux/nfs_xdr.h | 26 +++++++++
nfscopy.c | 59 +++++++++++++++++++
18 files changed, 577 insertions(+), 19 deletions(-)
create mode 100644 nfscopy.c

--
2.8.0



2016-04-29 20:42:22

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v4 1/3] NFS: Add nfs_commit_file()

Copy will use this to set up a commit request for a generic range. I
don't want to allocate a new pagecache entry for the file, so I needed
to change parts of the commit path to handle requests with a null
wb_page.

Signed-off-by: Anna Schumaker <[email protected]>
---
Please let me know if there is a better way to do this!
---
fs/nfs/internal.h | 1 +
fs/nfs/pagelist.c | 6 ++++--
fs/nfs/write.c | 41 +++++++++++++++++++++++++++++++++++++----
3 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f1d1d2c..5154fa6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -477,6 +477,7 @@ void nfs_mark_request_commit(struct nfs_page *req,
u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 1f6db42..174dd4c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
* long write-back delay. This will be adjusted in
* update_nfs_request below if the region is not locked. */
req->wb_page = page;
- req->wb_index = page_file_index(page);
- get_page(page);
+ if (page) {
+ req->wb_index = page_file_index(page);
+ get_page(page);
+ }
req->wb_offset = offset;
req->wb_pgbase = offset;
req->wb_bytes = count;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5f4fd53..b5e18f2 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -737,7 +737,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
head = req->wb_head;

spin_lock(&inode->i_lock);
- if (likely(!PageSwapCache(head->wb_page))) {
+ if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
set_page_private(head->wb_page, 0);
ClearPagePrivate(head->wb_page);
smp_mb__after_atomic();
@@ -759,7 +759,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
static void
nfs_mark_request_dirty(struct nfs_page *req)
{
- __set_page_dirty_nobuffers(req->wb_page);
+ if (req->wb_page)
+ __set_page_dirty_nobuffers(req->wb_page);
}

/*
@@ -835,7 +836,8 @@ nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
spin_lock(cinfo->lock);
nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
spin_unlock(cinfo->lock);
- nfs_mark_page_unstable(req->wb_page, cinfo);
+ if (req->wb_page)
+ nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);

@@ -1724,6 +1726,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
return -ENOMEM;
}

+int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
+{
+ struct inode *inode = file_inode(file);
+ struct nfs_open_context *open;
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req;
+ int ret;
+
+ open = get_nfs_open_context(nfs_file_open_context(file));
+ req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode));
+ if (!req) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ nfs_init_cinfo_from_inode(&cinfo, inode);
+
+ memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier));
+ nfs_request_add_commit_list(req, &cinfo);
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret > 0)
+ ret = 0;
+
+ nfs_free_request(req);
+out_put:
+ put_nfs_open_context(open);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_commit_file);
+
/*
* COMMIT call returned
*/
@@ -1748,7 +1780,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- nfs_clear_page_commit(req->wb_page);
+ if (req->wb_page)
+ nfs_clear_page_commit(req->wb_page);

dprintk("NFS: commit (%s/%llu %d@%lld)",
req->wb_context->dentry->d_sb->s_id,
--
2.8.0


2016-04-29 20:42:24

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v4 2/3] NFS: Add COPY nfs operation

From: Anna Schumaker <[email protected]>

This adds the copy_file_range file_operations pointer used by the
copy_file_range() system call. This patch only implements synchronous
copies: if the server replies with an asynchronous copy, we decode the
stateid and ignore it.

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfs/nfs42.h | 1 +
fs/nfs/nfs42proc.c | 105 +++++++++++++++++++++++++++++++++
fs/nfs/nfs42xdr.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++
fs/nfs/nfs4file.c | 23 ++++++++
fs/nfs/nfs4proc.c | 1 +
fs/nfs/nfs4xdr.c | 1 +
include/linux/nfs4.h | 1 +
include/linux/nfs_fs_sb.h | 1 +
include/linux/nfs_xdr.h | 26 +++++++++
9 files changed, 305 insertions(+)

diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index b587ccd..b6cd153 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -13,6 +13,7 @@

/* nfs4.2proc.c */
int nfs42_proc_allocate(struct file *, loff_t, loff_t);
+ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t);
int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index dff8346..579ee20 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
return err;
}

+static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
+ struct nfs_lock_context *src_lock,
+ struct file *dst, loff_t pos_dst,
+ struct nfs_lock_context *dst_lock,
+ size_t count)
+{
+ struct nfs42_copy_args args = {
+ .src_fh = NFS_FH(file_inode(src)),
+ .src_pos = pos_src,
+ .dst_fh = NFS_FH(file_inode(dst)),
+ .dst_pos = pos_dst,
+ .count = count,
+ };
+ struct nfs42_copy_res res;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ };
+ struct inode *dst_inode = file_inode(dst);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ int status;
+
+ status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+ src_lock, FMODE_READ);
+ if (status)
+ return status;
+
+ status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+ dst_lock, FMODE_WRITE);
+ if (status)
+ return status;
+
+ status = nfs4_call_sync(server->client, server, &msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == -ENOTSUPP)
+ server->caps &= ~NFS_CAP_COPY;
+ if (status)
+ return status;
+
+ if (res.write_res.verifier.committed != NFS_FILE_SYNC) {
+ status = nfs_commit_file(dst, &res.write_res.verifier.verifier);
+ if (status)
+ return status;
+ }
+
+ truncate_pagecache_range(dst_inode, pos_dst,
+ pos_dst + res.write_res.count);
+
+ return res.write_res.count;
+}
+
+ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
+ struct file *dst, loff_t pos_dst,
+ size_t count)
+{
+ struct nfs_server *server = NFS_SERVER(file_inode(dst));
+ struct nfs_lock_context *src_lock;
+ struct nfs_lock_context *dst_lock;
+ struct nfs4_exception src_exception = { };
+ struct nfs4_exception dst_exception = { };
+ ssize_t err, err2;
+
+ if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
+ return -EOPNOTSUPP;
+
+ src_lock = nfs_get_lock_context(nfs_file_open_context(src));
+ if (IS_ERR(src_lock))
+ return PTR_ERR(src_lock);
+
+ src_exception.inode = file_inode(src);
+ src_exception.state = src_lock->open_context->state;
+
+ dst_lock = nfs_get_lock_context(nfs_file_open_context(dst));
+ if (IS_ERR(dst_lock)) {
+ err = PTR_ERR(dst_lock);
+ goto out_put_src_lock;
+ }
+
+ dst_exception.inode = file_inode(dst);
+ dst_exception.state = dst_lock->open_context->state;
+
+ do {
+ mutex_lock(&file_inode(dst)->i_mutex);
+ err = _nfs42_proc_copy(src, pos_src, src_lock,
+ dst, pos_dst, dst_lock, count);
+ mutex_unlock(&file_inode(dst)->i_mutex);
+
+ if (err == -ENOTSUPP) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ err2 = nfs4_handle_exception(server, err, &src_exception);
+ err = nfs4_handle_exception(server, err, &dst_exception);
+ if (!err)
+ err = err2;
+ } while (src_exception.retry || dst_exception.retry);
+
+ nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+ nfs_put_lock_context(src_lock);
+ return err;
+}
+
static loff_t _nfs42_proc_llseek(struct file *filep,
struct nfs_lock_context *lock, loff_t offset, int whence)
{
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0ca482a..6dc6f2a 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -9,9 +9,22 @@
#define encode_fallocate_maxsz (encode_stateid_maxsz + \
2 /* offset */ + \
2 /* length */)
+#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 /* wr_count */ + \
+ 1 /* wr_committed */ + \
+ XDR_QUADLEN(NFS4_VERIFIER_SIZE))
#define encode_allocate_maxsz (op_encode_hdr_maxsz + \
encode_fallocate_maxsz)
#define decode_allocate_maxsz (op_decode_hdr_maxsz)
+#define encode_copy_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE) + \
+ 2 + 2 + 2 + 1 + 1 + 1)
+#define decode_copy_maxsz (op_decode_hdr_maxsz + \
+ NFS42_WRITE_RES_SIZE + \
+ 1 /* cr_consecutive */ + \
+ 1 /* cr_synchronous */)
#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \
encode_fallocate_maxsz)
#define decode_deallocate_maxsz (op_decode_hdr_maxsz)
@@ -49,6 +62,16 @@
decode_putfh_maxsz + \
decode_allocate_maxsz + \
decode_getattr_maxsz)
+#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_copy_maxsz)
+#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_copy_maxsz)
#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
encode_deallocate_maxsz + \
@@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr,
encode_fallocate(xdr, args);
}

+static void encode_copy(struct xdr_stream *xdr,
+ struct nfs42_copy_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+
+ encode_uint64(xdr, args->src_pos);
+ encode_uint64(xdr, args->dst_pos);
+ encode_uint64(xdr, args->count);
+
+ encode_uint32(xdr, 1); /* consecutive = true */
+ encode_uint32(xdr, 1); /* synchronous = true */
+ encode_uint32(xdr, 0); /* src server list */
+}
+
static void encode_deallocate(struct xdr_stream *xdr,
struct nfs42_falloc_args *args,
struct compound_hdr *hdr)
@@ -182,6 +222,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req,
}

/*
+ * Encode COPY request
+ */
+static void nfs4_xdr_enc_copy(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_copy_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_copy(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode DEALLOCATE request
*/
static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
@@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
return decode_op_hdr(xdr, OP_ALLOCATE);
}

+static int decode_write_response(struct xdr_stream *xdr,
+ struct nfs42_write_res *res)
+{
+ __be32 *p;
+ int stateids;
+
+ p = xdr_inline_decode(xdr, 4 + 8 + 4);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ stateids = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &res->count);
+ res->verifier.committed = be32_to_cpup(p);
+ return decode_verifier(xdr, &res->verifier.verifier);
+
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_copy_requirements(struct xdr_stream *xdr,
+ struct nfs42_copy_res *res) {
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 + 4);
+ if (unlikely(!p))
+ goto out_overflow;
+
+ res->consecutive = be32_to_cpup(p++);
+ res->synchronous = be32_to_cpup(p++);
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res)
+{
+ int status;
+
+ status = decode_op_hdr(xdr, OP_COPY);
+ if (status == NFS4ERR_OFFLOAD_NO_REQS) {
+ status = decode_copy_requirements(xdr, res);
+ if (status)
+ return status;
+ return NFS4ERR_OFFLOAD_NO_REQS;
+ } else if (status)
+ return status;
+
+ status = decode_write_response(xdr, &res->write_res);
+ if (status)
+ return status;
+
+ return decode_copy_requirements(xdr, res);
+}
+
static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_DEALLOCATE);
@@ -331,6 +447,36 @@ out:
}

/*
+ * Decode COPY response
+ */
+static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_copy_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_copy(xdr, res);
+out:
+ return status;
+}
+
+/*
* Decode DEALLOCATE request
*/
static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp,
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index d039051..014b0e4 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
}

#ifdef CONFIG_NFS_V4_2
+static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t count, unsigned int flags)
+{
+ struct inode *in_inode = file_inode(file_in);
+ struct inode *out_inode = file_inode(file_out);
+ int ret;
+
+ if (in_inode == out_inode)
+ return -EINVAL;
+
+ /* flush any pending writes */
+ ret = nfs_sync_inode(in_inode);
+ if (ret)
+ return ret;
+ ret = nfs_sync_inode(out_inode);
+ if (ret)
+ return ret;
+
+ return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
+}
+
static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
{
loff_t ret;
@@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = {
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
#ifdef CONFIG_NFS_V4_2
+ .copy_file_range = nfs4_copy_file_range,
.llseek = nfs4_file_llseek,
.fallocate = nfs42_fallocate,
.clone_file_range = nfs42_clone_file_range,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 327b8c3..ec4cd62 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8793,6 +8793,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_STATEID_NFSV41
| NFS_CAP_ATOMIC_OPEN_V1
| NFS_CAP_ALLOCATE
+ | NFS_CAP_COPY
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 88474a4..d1c96fc 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7515,6 +7515,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
PROC(CLONE, enc_clone, dec_clone),
+ PROC(COPY, enc_copy, dec_copy),
#endif /* CONFIG_NFS_V4_2 */
};

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 0114334..7225094 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -504,6 +504,7 @@ enum {
NFSPROC4_CLNT_DEALLOCATE,
NFSPROC4_CLNT_LAYOUTSTATS,
NFSPROC4_CLNT_CLONE,
+ NFSPROC4_CLNT_COPY,
};

/* nfs41 types */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 7fcc13c..14a762d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -246,5 +246,6 @@ struct nfs_server {
#define NFS_CAP_DEALLOCATE (1U << 21)
#define NFS_CAP_LAYOUTSTATS (1U << 22)
#define NFS_CAP_CLONE (1U << 23)
+#define NFS_CAP_COPY (1U << 24)

#endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d320906..d7adcab 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1343,6 +1343,32 @@ struct nfs42_falloc_res {
const struct nfs_server *falloc_server;
};

+struct nfs42_copy_args {
+ struct nfs4_sequence_args seq_args;
+
+ struct nfs_fh *src_fh;
+ nfs4_stateid src_stateid;
+ u64 src_pos;
+
+ struct nfs_fh *dst_fh;
+ nfs4_stateid dst_stateid;
+ u64 dst_pos;
+
+ u64 count;
+};
+
+struct nfs42_write_res {
+ u64 count;
+ struct nfs_writeverf verifier;
+};
+
+struct nfs42_copy_res {
+ struct nfs4_sequence_res seq_res;
+ struct nfs42_write_res write_res;
+ bool consecutive;
+ bool synchronous;
+};
+
struct nfs42_seek_args {
struct nfs4_sequence_args seq_args;

--
2.8.0


2016-04-29 20:42:26

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v4 3/3] NFSD: Implement the COPY call

From: Anna Schumaker <[email protected]>

I only implemented the synchronous version of this call, since it's the
easiest. I can simply call vfs_copy_file_range() and have the VFS do the
right thing for the filesystem being exported.

Signed-off-by: Anna Schumaker <[email protected]>
---
v4:
- Rename nfsd_copy_range() -> nfsd_copy_file_range()
- Remove size checks handled by VFS
---
fs/nfsd/nfs4proc.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++-------
fs/nfsd/nfs4xdr.c | 63 ++++++++++++++++++++++++++++++++++++--
fs/nfsd/vfs.c | 6 ++++
fs/nfsd/vfs.h | 2 ++
fs/nfsd/xdr4.h | 23 ++++++++++++++
5 files changed, 171 insertions(+), 13 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index de1ff1d..f54ed58 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1011,47 +1011,97 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}

static __be32
-nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
- struct nfsd4_clone *clone)
+nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ stateid_t *src_stateid, struct file **src,
+ stateid_t *dst_stateid, struct file **dst)
{
- struct file *src, *dst;
__be32 status;

status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
- &clone->cl_src_stateid, RD_STATE,
- &src, NULL);
+ src_stateid, RD_STATE, src, NULL);
if (status) {
dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
goto out;
}

status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
- &clone->cl_dst_stateid, WR_STATE,
- &dst, NULL);
+ dst_stateid, WR_STATE, dst, NULL);
if (status) {
dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
goto out_put_src;
}

/* fix up for NFS-specific error code */
- if (!S_ISREG(file_inode(src)->i_mode) ||
- !S_ISREG(file_inode(dst)->i_mode)) {
+ if (!S_ISREG(file_inode(*src)->i_mode) ||
+ !S_ISREG(file_inode(*dst)->i_mode)) {
status = nfserr_wrong_type;
goto out_put_dst;
}

+out:
+ return status;
+out_put_dst:
+ fput(*dst);
+out_put_src:
+ fput(*src);
+ goto out;
+}
+
+static __be32
+nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_clone *clone)
+{
+ struct file *src, *dst;
+ __be32 status;
+
+ status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src,
+ &clone->cl_dst_stateid, &dst);
+ if (status)
+ goto out;
+
status = nfsd4_clone_file_range(src, clone->cl_src_pos,
dst, clone->cl_dst_pos, clone->cl_count);

-out_put_dst:
fput(dst);
-out_put_src:
fput(src);
out:
return status;
}

static __be32
+nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd4_copy *copy)
+{
+ struct file *src, *dst;
+ __be32 status;
+ ssize_t bytes;
+
+ status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src,
+ &copy->cp_dst_stateid, &dst);
+ if (status)
+ goto out;
+
+ bytes = nfsd_copy_file_range(src, copy->cp_src_pos,
+ dst, copy->cp_dst_pos, copy->cp_count);
+
+ if (bytes < 0)
+ status = nfserrno(bytes);
+ else {
+ copy->cp_res.wr_bytes_written = bytes;
+ copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+ copy->cp_consecutive = 1;
+ copy->cp_synchronous = 1;
+ gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
+ status = nfs_ok;
+ }
+
+ fput(src);
+ fput(dst);
+out:
+ return status;
+}
+
+static __be32
nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_fallocate *fallocate, int flags)
{
@@ -1967,6 +2017,18 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
op_encode_channel_attrs_maxsz) * sizeof(__be32);
}

+static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 1 /* wr_callback */ +
+ op_encode_stateid_maxsz /* wr_callback */ +
+ 2 /* wr_count */ +
+ 1 /* wr_committed */ +
+ op_encode_verifier_maxsz +
+ 1 /* cr_consecutive */ +
+ 1 /* cr_synchronous */) * sizeof(__be32);
+}
+
#ifdef CONFIG_NFSD_PNFS
/*
* At this stage we don't really know what layout driver will handle the request,
@@ -2329,6 +2391,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_CLONE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+ [OP_COPY] = {
+ .op_func = (nfsd4op_func)nfsd4_copy,
+ .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+ .op_name = "OP_COPY",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_copy_rsize,
+ },
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9df898b..dff11c6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1696,6 +1696,30 @@ nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
}

static __be32
+nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
+{
+ DECODE_HEAD;
+ unsigned int tmp;
+
+ status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
+ if (status)
+ return status;
+ status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid);
+ if (status)
+ return status;
+
+ READ_BUF(8 + 8 + 8 + 4 + 4 + 4);
+ p = xdr_decode_hyper(p, &copy->cp_src_pos);
+ p = xdr_decode_hyper(p, &copy->cp_dst_pos);
+ p = xdr_decode_hyper(p, &copy->cp_count);
+ copy->cp_consecutive = be32_to_cpup(p++);
+ copy->cp_synchronous = be32_to_cpup(p++);
+ tmp = be32_to_cpup(p); /* Source server list not supported */
+
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
{
DECODE_HEAD;
@@ -1795,7 +1819,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {

/* new operations for NFSv4.2 */
[OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
- [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy,
[OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
[OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -4209,6 +4233,41 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
#endif /* CONFIG_NFSD_PNFS */

static __be32
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = cpu_to_be32(0);
+ p = xdr_encode_hyper(p, write->wr_bytes_written);
+ *p++ = cpu_to_be32(write->wr_stable_how);
+ p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
+ NFS4_VERIFIER_SIZE);
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_copy *copy)
+{
+ __be32 *p;
+
+ if (!nfserr) {
+ nfserr = nfsd42_encode_write_res(resp, &copy->cp_res);
+ if (nfserr)
+ return nfserr;
+
+ p = xdr_reserve_space(&resp->xdr, 4 + 4);
+ *p++ = cpu_to_be32(copy->cp_consecutive);
+ *p++ = cpu_to_be32(copy->cp_synchronous);
+ }
+ return nfserr;
+}
+
+static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_seek *seek)
{
@@ -4307,7 +4366,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {

/* NFSv4.2 operations */
[OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy,
[OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop,
[OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
[OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d40010e..cac7394 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -513,6 +513,12 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
count));
}

+ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
+ u64 dst_pos, u64 count)
+{
+ return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+}
+
__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset, loff_t len,
int flags)
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 2d573ec..d5afa83 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -93,6 +93,8 @@ __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
struct svc_fh *res);
__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
char *, int, struct svc_fh *);
+ssize_t nfsd_copy_file_range(struct file *, u64,
+ struct file *, u64, u64);
__be32 nfsd_rename(struct svc_rqst *,
struct svc_fh *, char *, int,
struct svc_fh *, char *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d955481..2cad349 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -500,6 +500,28 @@ struct nfsd4_clone {
u64 cl_count;
};

+struct nfsd42_write_res {
+ u64 wr_bytes_written;
+ u32 wr_stable_how;
+ nfs4_verifier wr_verifier;
+};
+
+struct nfsd4_copy {
+ /* request */
+ stateid_t cp_src_stateid;
+ stateid_t cp_dst_stateid;
+ u64 cp_src_pos;
+ u64 cp_dst_pos;
+ u64 cp_count;
+
+ /* both */
+ bool cp_consecutive;
+ bool cp_synchronous;
+
+ /* response */
+ struct nfsd42_write_res cp_res;
+};
+
struct nfsd4_seek {
/* request */
stateid_t seek_stateid;
@@ -565,6 +587,7 @@ struct nfsd4_op {
struct nfsd4_fallocate allocate;
struct nfsd4_fallocate deallocate;
struct nfsd4_clone clone;
+ struct nfsd4_copy copy;
struct nfsd4_seek seek;
} u;
struct nfs4_replay * replay;
--
2.8.0


2016-04-29 20:42:27

by Anna Schumaker

[permalink] [raw]
Subject: [RFC v4 4/3] vfs_copy_range() test program

This is a simple C program that I used for calling the copy_file_range() system call.
Usage: ./nfscopy /nfs/original.txt /nfs/copy.txt
---
nfscopy.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 59 insertions(+)
create mode 100644 nfscopy.c

diff --git a/nfscopy.c b/nfscopy.c
new file mode 100644
index 0000000..3417a14
--- /dev/null
+++ b/nfscopy.c
@@ -0,0 +1,59 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+loff_t copy_file_range(int fd_in, loff_t *off_in, int fd_out,
+ loff_t *off_out, size_t len, unsigned int flags)
+{
+ return syscall(__NR_copy_file_range, fd_in, off_in, fd_out,
+ off_out, len, flags);
+}
+
+int main(int argc, char **argv)
+{
+ int fd_in, fd_out;
+ struct stat stat;
+ loff_t len, ret;
+ char buf[2];
+
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s <source> <destination>\n", argv[0]);
+ exit(EXIT_FAILURE);
+ }
+
+ fd_in = open(argv[1], O_RDONLY);
+ if (fd_in == -1) {
+ perror("open (argv[1])");
+ exit(EXIT_FAILURE);
+ }
+
+ if (fstat(fd_in, &stat) == -1) {
+ perror("fstat");
+ exit(EXIT_FAILURE);
+ }
+ len = stat.st_size;
+
+ fd_out = open(argv[2], O_CREAT|O_WRONLY|O_TRUNC, 0644);
+ if (fd_out == -1) {
+ perror("open (argv[2])");
+ exit(EXIT_FAILURE);
+ }
+
+ do {
+ ret = copy_file_range(fd_in, NULL, fd_out, NULL, len, 0);
+ if (ret == -1) {
+ perror("copy_file_range");
+ exit(EXIT_FAILURE);
+ }
+
+ len -= ret;
+ } while (len > 0);
+
+ close(fd_in);
+ close(fd_out);
+ exit(EXIT_SUCCESS);
+}
--
2.8.0


2016-05-01 17:37:38

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

I might sound like a broken record, but I'd feel much happier if this
had extensive xfstests coverage. Xfstests has over one hundred tests for
file clones, and many of them should be easily adaptable.

2016-05-13 20:31:39

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On Sun, May 01, 2016 at 10:37:33AM -0700, Christoph Hellwig wrote:
> I might sound like a broken record, but I'd feel much happier if this
> had extensive xfstests coverage. Xfstests has over one hundred tests for
> file clones, and many of them should be easily adaptable.

Anna, have you looked at this yet?

I don't see any obvious problem with the nfsd code, other than the
obvious issue with large synchronous copies tying up server threads and
leaving clients waiting--but maybe we should just see how people end up
using it and deal with the problems as they come up.

--b.

2016-05-13 20:58:13

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On 05/13/2016 04:31 PM, J. Bruce Fields wrote:
> On Sun, May 01, 2016 at 10:37:33AM -0700, Christoph Hellwig wrote:
>> I might sound like a broken record, but I'd feel much happier if this
>> had extensive xfstests coverage. Xfstests has over one hundred tests for
>> file clones, and many of them should be easily adaptable.
>
> Anna, have you looked at this yet?

Yep! I just sent out what I came up with :)

Anna

>
> I don't see any obvious problem with the nfsd code, other than the
> obvious issue with large synchronous copies tying up server threads and
> leaving clients waiting--but maybe we should just see how people end up
> using it and deal with the problems as they come up.
>
> --b.


2016-07-29 18:59:35

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On Fri, May 13, 2016 at 04:58:06PM -0400, Anna Schumaker wrote:
> On 05/13/2016 04:31 PM, J. Bruce Fields wrote:
> > On Sun, May 01, 2016 at 10:37:33AM -0700, Christoph Hellwig wrote:
> >> I might sound like a broken record, but I'd feel much happier if this
> >> had extensive xfstests coverage. Xfstests has over one hundred tests for
> >> file clones, and many of them should be easily adaptable.
> >
> > Anna, have you looked at this yet?
>
> Yep! I just sent out what I came up with :)

Sorry for the lack of response. For some reason I don't seem to have
the updated version in my mailboxes. Do you have a more recent version?

> > I don't see any obvious problem with the nfsd code, other than the
> > obvious issue with large synchronous copies tying up server threads and
> > leaving clients waiting--but maybe we should just see how people end up
> > using it and deal with the problems as they come up.

I'm still worrying about this, though.

As a simple stopgap, could we just set *some* maximum on the size of the
copy? Or better yet on the time?--that'd let filesystems with
clone-like features copy the whole file without blocking an nfsd thread
indefinitely in the case of other filesystems.

--b.

2016-07-29 19:40:08

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On 07/29/2016 02:59 PM, J. Bruce Fields wrote:
> On Fri, May 13, 2016 at 04:58:06PM -0400, Anna Schumaker wrote:
>> On 05/13/2016 04:31 PM, J. Bruce Fields wrote:
>>> On Sun, May 01, 2016 at 10:37:33AM -0700, Christoph Hellwig wrote:
>>>> I might sound like a broken record, but I'd feel much happier if this
>>>> had extensive xfstests coverage. Xfstests has over one hundred tests for
>>>> file clones, and many of them should be easily adaptable.
>>>
>>> Anna, have you looked at this yet?
>>
>> Yep! I just sent out what I came up with :)
>
> Sorry for the lack of response. For some reason I don't seem to have
> the updated version in my mailboxes. Do you have a more recent version?

I'm not sure, so I'll make sure my code still works and then resubmit!

>
>>> I don't see any obvious problem with the nfsd code, other than the
>>> obvious issue with large synchronous copies tying up server threads and
>>> leaving clients waiting--but maybe we should just see how people end up
>>> using it and deal with the problems as they come up.
>
> I'm still worrying about this, though.
>
> As a simple stopgap, could we just set *some* maximum on the size of the
> copy? Or better yet on the time?--that'd let filesystems with
> clone-like features copy the whole file without blocking an nfsd thread
> indefinitely in the case of other filesystems.

Would there be a good way of figuring out the time a copy would take? Capping with an arbitrary size would definitely be simpler, so I'll look into adding that.

Anna

>
> --b.
>


2016-07-29 20:20:28

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On Fri, Jul 29, 2016 at 03:40:00PM -0400, Anna Schumaker wrote:
> On 07/29/2016 02:59 PM, J. Bruce Fields wrote:
> > On Fri, May 13, 2016 at 04:58:06PM -0400, Anna Schumaker wrote:
> >> On 05/13/2016 04:31 PM, J. Bruce Fields wrote:
> >>> On Sun, May 01, 2016 at 10:37:33AM -0700, Christoph Hellwig wrote:
> >>>> I might sound like a broken record, but I'd feel much happier if this
> >>>> had extensive xfstests coverage. Xfstests has over one hundred tests for
> >>>> file clones, and many of them should be easily adaptable.
> >>>
> >>> Anna, have you looked at this yet?
> >>
> >> Yep! I just sent out what I came up with :)
> >
> > Sorry for the lack of response. For some reason I don't seem to have
> > the updated version in my mailboxes. Do you have a more recent version?
>
> I'm not sure, so I'll make sure my code still works and then resubmit!
>
> >
> >>> I don't see any obvious problem with the nfsd code, other than the
> >>> obvious issue with large synchronous copies tying up server threads and
> >>> leaving clients waiting--but maybe we should just see how people end up
> >>> using it and deal with the problems as they come up.
> >
> > I'm still worrying about this, though.
> >
> > As a simple stopgap, could we just set *some* maximum on the size of the
> > copy? Or better yet on the time?--that'd let filesystems with
> > clone-like features copy the whole file without blocking an nfsd thread
> > indefinitely in the case of other filesystems.
>
> Would there be a good way of figuring out the time a copy would take?

Can we set some sort of timer to signal our thread after a limit? Then
hopefully the copy loop gets interrupted and we can return the amount
copied so far. (And hopefully the client has actually set the
contiguous flag so it can continue where it left off.)

> Capping with an arbitrary size would definitely be simpler, so I'll
> look into adding that.

I'm not sure how to set the limit. The downside (assuming the
client/application handle the short copy correctly) is that data can
stop flowing while we wait for the client to send us the next copy, but
I'm not sure how high the cap needs to be before that becomes
negligible.

--b.

2016-07-29 20:45:02

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On 07/29/2016 04:20 PM, J. Bruce Fields wrote:
> On Fri, Jul 29, 2016 at 03:40:00PM -0400, Anna Schumaker wrote:
>> On 07/29/2016 02:59 PM, J. Bruce Fields wrote:
>>> On Fri, May 13, 2016 at 04:58:06PM -0400, Anna Schumaker wrote:
>>>> On 05/13/2016 04:31 PM, J. Bruce Fields wrote:
>>>>> On Sun, May 01, 2016 at 10:37:33AM -0700, Christoph Hellwig wrote:
>>>>>> I might sound like a broken record, but I'd feel much happier if this
>>>>>> had extensive xfstests coverage. Xfstests has over one hundred tests for
>>>>>> file clones, and many of them should be easily adaptable.
>>>>>
>>>>> Anna, have you looked at this yet?
>>>>
>>>> Yep! I just sent out what I came up with :)
>>>
>>> Sorry for the lack of response. For some reason I don't seem to have
>>> the updated version in my mailboxes. Do you have a more recent version?
>>
>> I'm not sure, so I'll make sure my code still works and then resubmit!
>>
>>>
>>>>> I don't see any obvious problem with the nfsd code, other than the
>>>>> obvious issue with large synchronous copies tying up server threads and
>>>>> leaving clients waiting--but maybe we should just see how people end up
>>>>> using it and deal with the problems as they come up.
>>>
>>> I'm still worrying about this, though.
>>>
>>> As a simple stopgap, could we just set *some* maximum on the size of the
>>> copy? Or better yet on the time?--that'd let filesystems with
>>> clone-like features copy the whole file without blocking an nfsd thread
>>> indefinitely in the case of other filesystems.
>>
>> Would there be a good way of figuring out the time a copy would take?
>
> Can we set some sort of timer to signal our thread after a limit? Then
> hopefully the copy loop gets interrupted and we can return the amount
> copied so far. (And hopefully the client has actually set the
> contiguous flag so it can continue where it left off.)

There are a lot of "hopefullys" there... I'll look into timers and signals, since I haven't needed to use them yet. What do you think would be a good maximum amount of time to copy before replying, assuming this way works out?

>
>> Capping with an arbitrary size would definitely be simpler, so I'll
>> look into adding that.
>
> I'm not sure how to set the limit. The downside (assuming the
> client/application handle the short copy correctly) is that data can
> stop flowing while we wait for the client to send us the next copy, but
> I'm not sure how high the cap needs to be before that becomes
> negligible.

This probably changes based on whether the underlying storage is a spinning disk or flash. I'll poke around with the timer solution to see if I can figure that out, since it sounds more reliable.

Alternatively, would it be better for me to implement the callback portion and prepare the client for that? Then you could blame the client for tying up an RPC slot if they request a large, synchronous copy :)

Anna

>
> --b.
>


2016-07-29 21:21:39

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On Fri, Jul 29, 2016 at 04:44:39PM -0400, Anna Schumaker wrote:
> On 07/29/2016 04:20 PM, J. Bruce Fields wrote:
> > On Fri, Jul 29, 2016 at 03:40:00PM -0400, Anna Schumaker wrote:
> >> On 07/29/2016 02:59 PM, J. Bruce Fields wrote:
> >>> On Fri, May 13, 2016 at 04:58:06PM -0400, Anna Schumaker wrote:
> >>>> On 05/13/2016 04:31 PM, J. Bruce Fields wrote:
> >>>>> On Sun, May 01, 2016 at 10:37:33AM -0700, Christoph Hellwig wrote:
> >>>>>> I might sound like a broken record, but I'd feel much happier if this
> >>>>>> had extensive xfstests coverage. Xfstests has over one hundred tests for
> >>>>>> file clones, and many of them should be easily adaptable.
> >>>>>
> >>>>> Anna, have you looked at this yet?
> >>>>
> >>>> Yep! I just sent out what I came up with :)
> >>>
> >>> Sorry for the lack of response. For some reason I don't seem to have
> >>> the updated version in my mailboxes. Do you have a more recent version?
> >>
> >> I'm not sure, so I'll make sure my code still works and then resubmit!
> >>
> >>>
> >>>>> I don't see any obvious problem with the nfsd code, other than the
> >>>>> obvious issue with large synchronous copies tying up server threads and
> >>>>> leaving clients waiting--but maybe we should just see how people end up
> >>>>> using it and deal with the problems as they come up.
> >>>
> >>> I'm still worrying about this, though.
> >>>
> >>> As a simple stopgap, could we just set *some* maximum on the size of the
> >>> copy? Or better yet on the time?--that'd let filesystems with
> >>> clone-like features copy the whole file without blocking an nfsd thread
> >>> indefinitely in the case of other filesystems.
> >>
> >> Would there be a good way of figuring out the time a copy would take?
> >
> > Can we set some sort of timer to signal our thread after a limit? Then
> > hopefully the copy loop gets interrupted and we can return the amount
> > copied so far. (And hopefully the client has actually set the
> > contiguous flag so it can continue where it left off.)
>
> There are a lot of "hopefullys" there... I'll look into timers and signals, since I haven't needed to use them yet. What do you think would be a good maximum amount of time to copy before replying, assuming this way works out?
>
> >
> >> Capping with an arbitrary size would definitely be simpler, so I'll
> >> look into adding that.
> >
> > I'm not sure how to set the limit. The downside (assuming the
> > client/application handle the short copy correctly) is that data can
> > stop flowing while we wait for the client to send us the next copy, but
> > I'm not sure how high the cap needs to be before that becomes
> > negligible.
>
> This probably changes based on whether the underlying storage is a spinning disk or flash. I'll poke around with the timer solution to see if I can figure that out, since it sounds more reliable.

So if D is the bandwidth of the disk copy, L is the client-server
round-trip time, and B is the cap on the number of bytes per copy, then
DL is the amount of data you miss copying while waiting for the client
to issue the next copy. So to a first approximation I think you lose
roughly DL/B by not doing the whole copy at once. So you'd like B to be
large relative to likely values for DL.
Uh, but that internal copy bandwidth could be pretty huge. Maybe the
better goal is to make sure we still beat a network copy. I guess I'll
think about it over the weekend. My first reaction is just to pick
something pretty large (a gig?) and then at least we've got *some*
bound on how long a thread can block.

> Alternatively, would it be better for me to implement the callback portion and prepare the client for that? Then you could blame the client for tying up an RPC slot if they request a large, synchronous copy :)

We still tie up a server thread and an rpc slot, and we've got no user
interface on the client to take advantage of the callback information,
so I'm not sure that helps.

--b.

2016-08-01 13:30:47

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On Fri, Jul 29, 2016 at 02:59:33PM -0400, J. Bruce Fields wrote:
> > > I don't see any obvious problem with the nfsd code, other than the
> > > obvious issue with large synchronous copies tying up server threads and
> > > leaving clients waiting--but maybe we should just see how people end up
> > > using it and deal with the problems as they come up.
>
> I'm still worrying about this, though.
>
> As a simple stopgap, could we just set *some* maximum on the size of the
> copy? Or better yet on the time?--that'd let filesystems with
> clone-like features copy the whole file without blocking an nfsd thread
> indefinitely in the case of other filesystems.

I'm still really worried about corner cases in the copy_file_range
syscall and the COPY nfs implementation. When Darrick implemented
xfstests support for clone we found various bugs in the existing
implementation and corner cases handled differently by xfs/btrfs/nfs and
the documentation. I'd really like to see something similar for
the copy side. Especially as clone should always be a valid
implementation for copy as well.

2016-08-02 18:49:14

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On Fri, Jul 29, 2016 at 05:21:36PM -0400, J. Bruce Fields wrote:
> On Fri, Jul 29, 2016 at 04:44:39PM -0400, Anna Schumaker wrote:
> > On 07/29/2016 04:20 PM, J. Bruce Fields wrote:
> > > On Fri, Jul 29, 2016 at 03:40:00PM -0400, Anna Schumaker wrote:
> > >> On 07/29/2016 02:59 PM, J. Bruce Fields wrote:
> > >>> On Fri, May 13, 2016 at 04:58:06PM -0400, Anna Schumaker wrote:
> > >>>> On 05/13/2016 04:31 PM, J. Bruce Fields wrote:
> > >>>>> On Sun, May 01, 2016 at 10:37:33AM -0700, Christoph Hellwig wrote:
> > >>>>>> I might sound like a broken record, but I'd feel much happier if this
> > >>>>>> had extensive xfstests coverage. Xfstests has over one hundred tests for
> > >>>>>> file clones, and many of them should be easily adaptable.
> > >>>>>
> > >>>>> Anna, have you looked at this yet?
> > >>>>
> > >>>> Yep! I just sent out what I came up with :)
> > >>>
> > >>> Sorry for the lack of response. For some reason I don't seem to have
> > >>> the updated version in my mailboxes. Do you have a more recent version?
> > >>
> > >> I'm not sure, so I'll make sure my code still works and then resubmit!
> > >>
> > >>>
> > >>>>> I don't see any obvious problem with the nfsd code, other than the
> > >>>>> obvious issue with large synchronous copies tying up server threads and
> > >>>>> leaving clients waiting--but maybe we should just see how people end up
> > >>>>> using it and deal with the problems as they come up.
> > >>>
> > >>> I'm still worrying about this, though.
> > >>>
> > >>> As a simple stopgap, could we just set *some* maximum on the size of the
> > >>> copy? Or better yet on the time?--that'd let filesystems with
> > >>> clone-like features copy the whole file without blocking an nfsd thread
> > >>> indefinitely in the case of other filesystems.
> > >>
> > >> Would there be a good way of figuring out the time a copy would take?
> > >
> > > Can we set some sort of timer to signal our thread after a limit? Then
> > > hopefully the copy loop gets interrupted and we can return the amount
> > > copied so far. (And hopefully the client has actually set the
> > > contiguous flag so it can continue where it left off.)
> >
> > There are a lot of "hopefullys" there... I'll look into timers and signals, since I haven't needed to use them yet. What do you think would be a good maximum amount of time to copy before replying, assuming this way works out?
> >
> > >
> > >> Capping with an arbitrary size would definitely be simpler, so I'll
> > >> look into adding that.
> > >
> > > I'm not sure how to set the limit. The downside (assuming the
> > > client/application handle the short copy correctly) is that data can
> > > stop flowing while we wait for the client to send us the next copy, but
> > > I'm not sure how high the cap needs to be before that becomes
> > > negligible.
> >
> > This probably changes based on whether the underlying storage is a spinning disk or flash. I'll poke around with the timer solution to see if I can figure that out, since it sounds more reliable.
>
> So if D is the bandwidth of the disk copy, L is the client-server
> round-trip time, and B is the cap on the number of bytes per copy, then
> DL is the amount of data you miss copying while waiting for the client
> to issue the next copy. So to a first approximation I think you lose
> roughly DL/B by not doing the whole copy at once. So you'd like B to be
> large relative to likely values for DL.
> Uh, but that internal copy bandwidth could be pretty huge. Maybe the
> better goal is to make sure we still beat a network copy. I guess I'll
> think about it over the weekend. My first reaction is just to pick
> something pretty large (a gig?) and then at least we've got *some*
> bound on how long a thread can block.

Sorry, but you were asking about how to set a timeout, not how to set a
maximum byte value. I agree that a timeout would be better. The goal
is to spend most of our time moving data, so the timeout should be large
relative to the client-server roundtrip time. I don't know, is it safe
to assume that most client-server roundtrip times are less than 10ms?
In which case a 1/10th second timeout would usually result in less than
10% of the server's time spent waiting for the next copy call. Well,
assuming a pretty simplistic model of how this works. But it seems like
a starting point at least.

--b.

2016-08-26 20:36:16

by Anna Schumaker

[permalink] [raw]
Subject: Re: [PATCH v4 0/3] NFSv4.2: Add support for the COPY operation

On 07/29/2016 04:20 PM, J. Bruce Fields wrote:
> On Fri, Jul 29, 2016 at 03:40:00PM -0400, Anna Schumaker wrote:
>> On 07/29/2016 02:59 PM, J. Bruce Fields wrote:
>>> On Fri, May 13, 2016 at 04:58:06PM -0400, Anna Schumaker wrote:
>>>> On 05/13/2016 04:31 PM, J. Bruce Fields wrote:
>>>>> On Sun, May 01, 2016 at 10:37:33AM -0700, Christoph Hellwig wrote:
>>>>>> I might sound like a broken record, but I'd feel much happier if this
>>>>>> had extensive xfstests coverage. Xfstests has over one hundred tests for
>>>>>> file clones, and many of them should be easily adaptable.
>>>>>
>>>>> Anna, have you looked at this yet?
>>>>
>>>> Yep! I just sent out what I came up with :)
>>>
>>> Sorry for the lack of response. For some reason I don't seem to have
>>> the updated version in my mailboxes. Do you have a more recent version?
>>
>> I'm not sure, so I'll make sure my code still works and then resubmit!
>>
>>>
>>>>> I don't see any obvious problem with the nfsd code, other than the
>>>>> obvious issue with large synchronous copies tying up server threads and
>>>>> leaving clients waiting--but maybe we should just see how people end up
>>>>> using it and deal with the problems as they come up.
>>>
>>> I'm still worrying about this, though.
>>>
>>> As a simple stopgap, could we just set *some* maximum on the size of the
>>> copy? Or better yet on the time?--that'd let filesystems with
>>> clone-like features copy the whole file without blocking an nfsd thread
>>> indefinitely in the case of other filesystems.
>>
>> Would there be a good way of figuring out the time a copy would take?
>
> Can we set some sort of timer to signal our thread after a limit? Then
> hopefully the copy loop gets interrupted and we can return the amount
> copied so far. (And hopefully the client has actually set the
> contiguous flag so it can continue where it left off.)
>
>> Capping with an arbitrary size would definitely be simpler, so I'll
>> look into adding that.
>
> I'm not sure how to set the limit. The downside (assuming the
> client/application handle the short copy correctly) is that data can
> stop flowing while we wait for the client to send us the next copy, but
> I'm not sure how high the cap needs to be before that becomes
> negligible.

I've been playing around with copy caps and I found that a 4MB cap has performance pretty close to no cap at all. Here is my performance data (averages across several runs):

|-------|---------|---------|---------|---------|---------|---------|---------|
|NFSv4.1| 512 MB | 1024 MB | 1536 MB | 2048 MB | 2560 MB | 3072 MB | 5120 MB |
|-------|---------|---------|---------|---------|---------|---------|---------|
|user | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s |
|system | 0.28s | 0.48s | 0.72s | 0.94s | 1.17s | 1.42s | 2.37s |
|cpu | 18% | 14% | 14% | 14% | 14% | 13% | 13% |
|total | 1.547s | 3.269s | 5.040s | 6.690s | 8.363s | 10.146s | 16.729s |
|-------|---------|---------|---------|---------|---------|---------|---------|
|read | 4096 | 8192 | 12288 | 16384 | 20480 | 24576 | 40960 |
|write | 4098 | 8203 | 12302 | 16402 | 20497 | 24668 | 40996 |
|commit | 9 | 18 | 27 | 36 | 46 | 55 | 93 |
|-------|---------|---------|---------|---------|---------|---------|---------|

|-------|---------|---------|---------|---------|---------|---------|---------|
|No Cap | 512 MB | 1024 MB | 1536 MB | 2048 MB | 2560 MB | 3072 MB | 5120 MB |
|-------|---------|---------|---------|---------|---------|---------|---------|
|user | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s |
|system | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s |
|cpu | 0% | 0% | 0% | 0% | 0% | 0% | 0% |
|total | 1.476s | 3.068s | 4.569s | 6.123s | 7.701s | 9.265s | 15.438s |
|-------|---------|---------|---------|---------|---------|---------|---------|
|copy | 1 | 1 | 1 | 2 | 2 | 2 | 3 |
|commit | 1 | 1 | 1 | 2 | 2 | 2 | 3 |
|-------|---------|---------|---------|---------|---------|---------|---------|

|-------|---------|---------|---------|---------|---------|---------|---------|
|1MB Cap| 512 MB | 1024 MB | 1536 MB | 2048 MB | 2560 MB | 3072 MB | 5120 MB |
|-------|---------|---------|---------|---------|---------|---------|---------|
|user | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s |
|system | 0.01s | 0.02s | 0.03s | 0.04s | 0.05s | 0.07s | 0.10s |
|cpu | 0% | 0% | 0% | 0% | 0% | 0% | 0% |
|total | 1.659s | 3.118s | 4.930s | 6.647s | 8.110s | 9.637s | 16.030s |
|-------|---------|---------|---------|---------|---------|---------|---------|
|copy | 512 | 1024 | 1536 | 2048 | 2560 | 3072 | 5120 |
|commit | 512 | 1024 | 1536 | 2048 | 2560 | 3072 | 5120 |
|-------|---------|---------|---------|---------|---------|---------|---------|

|-------|---------|---------|---------|---------|---------|---------|---------|
|2MB Cap| 512 MB | 1024 MB | 1536 MB | 2048 MB | 2560 MB | 3072 MB | 5120 MB |
|-------|---------|---------|---------|---------|---------|---------|---------|
|user | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s |
|system | 0.01s | 0.01s | 0.02s | 0.03s | 0.03s | 0.04s | 0.07s |
|cpu | 0% | 0% | 0% | 0% | 0% | 0% | 0% |
|total | 1.757s | 3.351s | 4.791s | 6.522s | 8.090s | 9.594s | 15.977s |
|-------|---------|---------|---------|---------|---------|---------|---------|
|copy | 256 | 512 | 768 | 1024 | 1280 | 1536 | 2560 |
|commit | 256 | 512 | 768 | 1024 | 1280 | 1536 | 2560 |
|-------|---------|---------|---------|---------|---------|---------|---------|

|-------|---------|---------|---------|---------|---------|---------|---------|
|4MB Cap| 512 MB | 1024 MB | 1536 MB | 2048 MB | 2560 MB | 3072 MB | 5120 MB |
|-------|---------|---------|---------|---------|---------|---------|---------|
|user | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s | 0.00s |
|system | 0.00s | 0.01s | 0.01s | 0.01s | 0.02s | 0.02s | 0.04s |
|cpu | 0% | 0% | 0% | 0% | 0% | 0% | 0% |
|total | 1.475s | 3.046s | 4.643s | 6.159s | 7.779s | 9.340s | 15.476s |
|-------|---------|---------|---------|---------|---------|---------|---------|
|copy | 128 | 256 | 384 | 512 | 640 | 768 | 1280 |
|commit | 128 | 256 | 384 | 512 | 640 | 768 | 1280 |
|-------|---------|---------|---------|---------|---------|---------|---------|


If you don't have any objections then I'll put in the 4MB cap with a comment that it's to keep from holding open the RPC slot too long. Then I'll resubmit that along with the xfstests I've written (and one vfs fix).

Thoughts?
Anna

>
> --b.
>