From: Anna Schumaker <[email protected]>
These patches add server support for the NFS v4.2 operation READ_PLUS.
I noticed a race condition in the 3rd patch when I start needing vfs_llseek()
to determine if I should encode the next segment as either data or a hole. It
is possible that the file could change on us between each of the seek calls,
so we don't know if we are actually at a hole or data segment. I don't want to
add new locks to the NFS server for this case, so instead I've decided to
encode any "quantum data" segments as if they were actually data.
I tested these patches using xfstests, specifically generic/075, generic/091,
generic/112, generic/127, generic/210, and generic/263. Additionally, three
new tests are run once READ_PLUS support has been added: generic/213,
generic/214, and generic/228.
Changes since v1:
- Set correct read_rsize to suppress a buffer overflow warning.
These patches and the corresponding client changes are available in the
[read_plus] branch of
git://git.linux-nfs.org/projects/anna/linux-nfs.git
Questions? Comments? Thoughts?
Anna
Anna Schumaker (4):
NFSD: nfsd4_encode_read() should encode eof and maxcount
NFSD: Add READ_PLUS support for data segments
NFSD: Add READ_PLUS support for hole segments
NFSD: Add support for encoding multiple segments
fs/nfsd/nfs4proc.c | 16 +++++
fs/nfsd/nfs4xdr.c | 174 +++++++++++++++++++++++++++++++++++++++++------------
2 files changed, 152 insertions(+), 38 deletions(-)
--
2.2.2
From: Anna Schumaker <[email protected]>
I intend to reuse nfsd4_encode_readv() for READ_PLUS, so I need an
alternate way to encode the eof and maxcount values. I think it makes sense for
nfsd4_encode_read() to handle this in a single place for both READ
paths.
Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4xdr.c | 59 ++++++++++++++++++++++---------------------------------
1 file changed, 23 insertions(+), 36 deletions(-)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73..01248b2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3122,21 +3122,19 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
static __be32 nfsd4_encode_splice_read(
struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
- struct file *file, unsigned long maxcount)
+ struct file *file, unsigned long *maxcount)
{
struct xdr_stream *xdr = &resp->xdr;
struct xdr_buf *buf = xdr->buf;
- u32 eof;
int space_left;
__be32 nfserr;
- __be32 *p = xdr->p - 2;
/* Make sure there will be room for padding if needed */
if (xdr->end - xdr->p < 1)
return nfserr_resource;
nfserr = nfsd_splice_read(read->rd_rqstp, file,
- read->rd_offset, &maxcount);
+ read->rd_offset, maxcount);
if (nfserr) {
/*
* nfsd_splice_actor may have already messed with the
@@ -3147,27 +3145,21 @@ static __be32 nfsd4_encode_splice_read(
return nfserr;
}
- eof = (read->rd_offset + maxcount >=
- read->rd_fhp->fh_dentry->d_inode->i_size);
-
- *(p++) = htonl(eof);
- *(p++) = htonl(maxcount);
-
- buf->page_len = maxcount;
- buf->len += maxcount;
- xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
+ buf->page_len = *maxcount;
+ buf->len += *maxcount;
+ xdr->page_ptr += (buf->page_base + *maxcount + PAGE_SIZE - 1)
/ PAGE_SIZE;
/* Use rest of head for padding and remaining ops: */
buf->tail[0].iov_base = xdr->p;
buf->tail[0].iov_len = 0;
xdr->iov = buf->tail;
- if (maxcount&3) {
- int pad = 4 - (maxcount&3);
+ if (*maxcount&3) {
+ int pad = 4 - (*maxcount&3);
*(xdr->p++) = 0;
- buf->tail[0].iov_base += maxcount&3;
+ buf->tail[0].iov_base += *maxcount&3;
buf->tail[0].iov_len = pad;
buf->len += pad;
}
@@ -3182,21 +3174,19 @@ static __be32 nfsd4_encode_splice_read(
static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
- struct file *file, unsigned long maxcount)
+ struct file *file, unsigned long *maxcount)
{
struct xdr_stream *xdr = &resp->xdr;
- u32 eof;
int v;
- int starting_len = xdr->buf->len - 8;
+ int starting_len = xdr->buf->len;
long len;
int thislen;
__be32 nfserr;
- __be32 tmp;
__be32 *p;
u32 zzz = 0;
int pad;
- len = maxcount;
+ len = *maxcount;
v = 0;
thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
@@ -3219,22 +3209,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
read->rd_vlen = v;
nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
- read->rd_vlen, &maxcount);
+ read->rd_vlen, maxcount);
if (nfserr)
return nfserr;
- xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
+ xdr_truncate_encode(xdr, starting_len + ((*maxcount+3)&~3));
- eof = (read->rd_offset + maxcount >=
- read->rd_fhp->fh_dentry->d_inode->i_size);
-
- tmp = htonl(eof);
- write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
- tmp = htonl(maxcount);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
-
- pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
- &zzz, pad);
+ pad = (*maxcount&3) ? 4 - (*maxcount&3) : 0;
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + *maxcount, &zzz, pad);
return 0;
}
@@ -3250,6 +3231,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
struct raparms *ra;
__be32 *p;
__be32 err;
+ u32 eof;
if (nfserr)
return nfserr;
@@ -3277,9 +3259,14 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
}
if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
- err = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
else
- err = nfsd4_encode_readv(resp, read, file, maxcount);
+ err = nfsd4_encode_readv(resp, read, file, &maxcount);
+
+ eof = (read->rd_offset + maxcount >=
+ read->rd_fhp->fh_dentry->d_inode->i_size);
+ *p++ = cpu_to_be32(eof);
+ *p++ = cpu_to_be32(maxcount);
if (!read->rd_filp)
nfsd_put_tmp_read_open(file, ra);
--
2.2.2
From: Anna Schumaker <[email protected]>
This patch adds READ_PLUS support by encoding file contents as a single
data segment, matching the behavior of the READ operation.
Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4proc.c | 16 +++++++++++
fs/nfsd/nfs4xdr.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 93 insertions(+), 2 deletions(-)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13..fdf9ce3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1603,6 +1603,16 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
+static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 maxcount = svc_max_payload(rqstp);
+ u32 rlen = min(op->u.read.rd_length, maxcount);
+ /* extra xdr space for encoding read plus segments. */
+ u32 xdr = 4;
+
+ return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
+}
+
static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
u32 maxcount = 0, rlen = 0;
@@ -1980,6 +1990,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_DEALLOCATE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize,
},
+ [OP_READ_PLUS] = {
+ .op_func = (nfsd4op_func)nfsd4_read,
+ .op_name = "OP_READ_PLUS",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_plus_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid,
+ },
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 01248b2..480b07f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1630,7 +1630,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read,
[OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
[OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
};
@@ -3802,6 +3802,81 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
}
static __be32
+nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
+ struct file *file)
+{
+ __be32 *p, err;
+ unsigned long maxcount;
+ struct xdr_stream *xdr = &resp->xdr;
+
+ p = xdr_reserve_space(xdr, 4 + 8 + 4); /* content type, offset, maxcount */
+ if (!p)
+ return nfserr_resource;
+ xdr_commit_encode(xdr);
+
+ maxcount = svc_max_payload(resp->rqstp);
+ maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
+ maxcount = min_t(unsigned long, maxcount, read->rd_length);
+
+ err = nfsd4_encode_readv(resp, read, file, &maxcount);
+
+ *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
+ p = xdr_encode_hyper(p, read->rd_offset);
+ *p++ = cpu_to_be32(maxcount);
+
+ read->rd_offset += maxcount;
+ return err;
+}
+
+static __be32
+nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_read *read)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file = read->rd_filp;
+ int starting_len = xdr->buf->len;
+ struct raparms *ra;
+ __be32 *p;
+ __be32 err;
+ u32 eof, segments = 0;
+
+ if (nfserr)
+ return nfserr;
+
+ /* eof flag, segment count */
+ p = xdr_reserve_space(xdr, 4 + 4 );
+ if (!p)
+ return nfserr_resource;
+ xdr_commit_encode(xdr);
+
+ if (!read->rd_filp) {
+ err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
+ &file, &ra);
+ if (err)
+ goto err_truncate;
+ }
+
+ if (read->rd_offset >= i_size_read(file_inode(file)))
+ goto out_encode;
+
+ err = nfsd4_encode_read_plus_data(resp, read, file);
+ segments++;
+
+out_encode:
+ eof = (read->rd_offset >= i_size_read(file_inode(file)));
+ *p++ = cpu_to_be32(eof);
+ *p++ = cpu_to_be32(segments);
+
+ if (!read->rd_filp)
+ nfsd_put_tmp_read_open(file, ra);
+
+err_truncate:
+ if (err)
+ xdr_truncate_encode(xdr, starting_len);
+ return err;
+}
+
+static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_seek *seek)
{
@@ -3900,7 +3975,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus,
[OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
[OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
};
--
2.2.2
From: Anna Schumaker <[email protected]>
This patch adds support for returning NFS4_CONTENT_HOLE segments to the
client. I did notice a race condition where the same offset can be
reported as both a hole and as data, and this patch gets around this
without a lock by encoding questionable regions as data.
Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4proc.c | 4 ++--
fs/nfsd/nfs4xdr.c | 34 +++++++++++++++++++++++++++++++---
2 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index fdf9ce3..3db34ee 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1607,8 +1607,8 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
{
u32 maxcount = svc_max_payload(rqstp);
u32 rlen = min(op->u.read.rd_length, maxcount);
- /* extra xdr space for encoding read plus segments. */
- u32 xdr = 4;
+ /* enough extra xdr space for encoding either a hole or data segment. */
+ u32 xdr = 5;
return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 480b07f..c85e658 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3803,7 +3803,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
- struct file *file)
+ struct file *file, loff_t hole_pos)
{
__be32 *p, err;
unsigned long maxcount;
@@ -3817,6 +3817,7 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *r
maxcount = svc_max_payload(resp->rqstp);
maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
maxcount = min_t(unsigned long, maxcount, read->rd_length);
+ maxcount = min_t(unsigned long, maxcount, hole_pos - read->rd_offset);
err = nfsd4_encode_readv(resp, read, file, &maxcount);
@@ -3829,6 +3830,22 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *r
}
static __be32
+nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
+ struct file *file, loff_t data_pos)
+{
+ __be32 *p;
+ unsigned long maxcount = data_pos - read->rd_offset;
+
+ p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
+ *p++ = cpu_to_be32(NFS4_CONTENT_HOLE);
+ p = xdr_encode_hyper(p, read->rd_offset);
+ p = xdr_encode_hyper(p, maxcount);
+
+ read->rd_offset += maxcount;
+ return nfs_ok;
+}
+
+static __be32
nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_read *read)
{
@@ -3836,6 +3853,7 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
struct file *file = read->rd_filp;
int starting_len = xdr->buf->len;
struct raparms *ra;
+ loff_t hole_pos, data_pos;
__be32 *p;
__be32 err;
u32 eof, segments = 0;
@@ -3856,10 +3874,20 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
goto err_truncate;
}
- if (read->rd_offset >= i_size_read(file_inode(file)))
+ hole_pos = vfs_llseek(read->rd_filp, read->rd_offset, SEEK_HOLE);
+ if (hole_pos == -ENXIO)
goto out_encode;
- err = nfsd4_encode_read_plus_data(resp, read, file);
+ data_pos = vfs_llseek(read->rd_filp, read->rd_offset, SEEK_DATA);
+ if (data_pos == -ENXIO)
+ data_pos = i_size_read(file_inode(file));
+
+ if ((data_pos == read->rd_offset) && (hole_pos > data_pos))
+ err = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
+ else if ((hole_pos == read->rd_offset) && (data_pos > hole_pos))
+ err = nfsd4_encode_read_plus_hole(resp, read, file, data_pos);
+ else /* The file probably changed on us between seeks. */
+ err = nfsd4_encode_read_plus_data(resp, read, file, i_size_read(file_inode(file)));
segments++;
out_encode:
--
2.2.2
From: Anna Schumaker <[email protected]>
This patch implements sending an array of segments back to the client.
Clients should be prepared to handle multiple segment reads to make this
useful.
Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4proc.c | 4 ++--
fs/nfsd/nfs4xdr.c | 36 ++++++++++++++++++++++--------------
2 files changed, 24 insertions(+), 16 deletions(-)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 3db34ee..5f3c75a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1607,8 +1607,8 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
{
u32 maxcount = svc_max_payload(rqstp);
u32 rlen = min(op->u.read.rd_length, maxcount);
- /* enough extra xdr space for encoding either a hole or data segment. */
- u32 xdr = 5;
+ /* Extra xdr padding for encoding multiple segments. */
+ u32 xdr = 20;
return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c85e658..dbd4580 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3826,6 +3826,7 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *r
*p++ = cpu_to_be32(maxcount);
read->rd_offset += maxcount;
+ read->rd_length -= maxcount;
return err;
}
@@ -3842,6 +3843,10 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
p = xdr_encode_hyper(p, maxcount);
read->rd_offset += maxcount;
+ if (maxcount > read->rd_length)
+ read->rd_length = 0;
+ else
+ read->rd_length -= maxcount;
return nfs_ok;
}
@@ -3874,23 +3879,26 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
goto err_truncate;
}
- hole_pos = vfs_llseek(read->rd_filp, read->rd_offset, SEEK_HOLE);
- if (hole_pos == -ENXIO)
- goto out_encode;
+ do {
+ hole_pos = vfs_llseek(read->rd_filp, read->rd_offset, SEEK_HOLE);
+ if (hole_pos == -ENXIO)
+ break;
- data_pos = vfs_llseek(read->rd_filp, read->rd_offset, SEEK_DATA);
- if (data_pos == -ENXIO)
- data_pos = i_size_read(file_inode(file));
+ data_pos = vfs_llseek(read->rd_filp, read->rd_offset, SEEK_DATA);
+ if (data_pos == -ENXIO)
+ data_pos = i_size_read(file_inode(file));
- if ((data_pos == read->rd_offset) && (hole_pos > data_pos))
- err = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
- else if ((hole_pos == read->rd_offset) && (data_pos > hole_pos))
- err = nfsd4_encode_read_plus_hole(resp, read, file, data_pos);
- else /* The file probably changed on us between seeks. */
- err = nfsd4_encode_read_plus_data(resp, read, file, i_size_read(file_inode(file)));
- segments++;
+ if ((data_pos == read->rd_offset) && (hole_pos > data_pos))
+ err = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
+ else if ((hole_pos == read->rd_offset) && (data_pos > hole_pos))
+ err = nfsd4_encode_read_plus_hole(resp, read, file, data_pos);
+ else /* The file probably changed on us between seeks. */
+ err = nfsd4_encode_read_plus_data(resp, read, file, i_size_read(file_inode(file)));
+ if (err)
+ break;
+ segments++;
+ } while (read->rd_length > 0);
-out_encode:
eof = (read->rd_offset >= i_size_read(file_inode(file)));
*p++ = cpu_to_be32(eof);
*p++ = cpu_to_be32(segments);
--
2.2.2
On Wed, Jan 28, 2015 at 03:42:53PM -0500, [email protected] wrote:
> From: Anna Schumaker <[email protected]>
>
> These patches add server support for the NFS v4.2 operation READ_PLUS.
>
> I noticed a race condition in the 3rd patch when I start needing vfs_llseek()
> to determine if I should encode the next segment as either data or a hole. It
> is possible that the file could change on us between each of the seek calls,
> so we don't know if we are actually at a hole or data segment. I don't want to
> add new locks to the NFS server for this case, so instead I've decided to
> encode any "quantum data" segments as if they were actually data.
>
> I tested these patches using xfstests, specificially generic/075, generic/091,
> generic/112, generic/127, generic/210, and generic/263. Additionally, three
> new tests are run once READ_PLUS support has been added: generic/213,
> generic/214, and generic/228.
Why do these tests start working when you use READ_PLUS? They should
only depend on ALLOCATE support, and maybe SEEK if we're trying to
figure out information about mappings somehow.
On 01/28/2015 04:38 PM, Christoph Hellwig wrote:
> On Wed, Jan 28, 2015 at 03:42:53PM -0500, [email protected] wrote:
>> From: Anna Schumaker <[email protected]>
>>
>> These patches add server support for the NFS v4.2 operation READ_PLUS.
>>
>> I noticed a race condition in the 3rd patch when I start needing vfs_llseek()
>> to determine if I should encode the next segment as either data or a hole. It
>> is possible that the file could change on us between each of the seek calls,
>> so we don't know if we are actually at a hole or data segment. I don't want to
>> add new locks to the NFS server for this case, so instead I've decided to
>> encode any "quantum data" segments as if they were actually data.
>>
>> I tested these patches using xfstests, specificially generic/075, generic/091,
>> generic/112, generic/127, generic/210, and generic/263. Additionally, three
>> new tests are run once READ_PLUS support has been added: generic/213,
>> generic/214, and generic/228.
>
> Why do theses test start working when you use READ_PLUS? They should
> only depend on ALLOCATE support, and maybe SEEK if we're trying to
> figure out information about mappings somehow.
I'm not sure. I'll rerun the tests to make sure I wasn't just confused when I typed that!
Anna
>
On 01/28/2015 04:45 PM, Anna Schumaker wrote:
> On 01/28/2015 04:38 PM, Christoph Hellwig wrote:
>> On Wed, Jan 28, 2015 at 03:42:53PM -0500, [email protected] wrote:
>>> From: Anna Schumaker <[email protected]>
>>>
>>> These patches add server support for the NFS v4.2 operation READ_PLUS.
>>>
>>> I noticed a race condition in the 3rd patch when I start needing vfs_llseek()
>>> to determine if I should encode the next segment as either data or a hole. It
>>> is possible that the file could change on us between each of the seek calls,
>>> so we don't know if we are actually at a hole or data segment. I don't want to
>>> add new locks to the NFS server for this case, so instead I've decided to
>>> encode any "quantum data" segments as if they were actually data.
>>>
>>> I tested these patches using xfstests, specificially generic/075, generic/091,
>>> generic/112, generic/127, generic/210, and generic/263. Additionally, three
>>> new tests are run once READ_PLUS support has been added: generic/213,
>>> generic/214, and generic/228.
>>
>> Why do theses test start working when you use READ_PLUS? They should
>> only depend on ALLOCATE support, and maybe SEEK if we're trying to
>> figure out information about mappings somehow.
>
> I'm not sure. I'll rerun the tests to make sure I wasn't just confused when I typed that!
I must have been comparing v4.1 to v4.2 when I wrote that, because those tests still run without the READ_PLUS patches. Thanks for catching that!
Anna
>
> Anna
>>
>
The subject line should include readv instead of read as well, shouldn't
it?
On 02/05/2015 09:11 AM, Christoph Hellwig wrote:
> The subject line should include readv instead of read as well, shouldn't
> it?
>
Probably, yeah. I'll fix that up.
> + p = xdr_reserve_space(xdr, 4 + 8 + 4); /* content type, offset, maxcount */
> + if (!p)
> + return nfserr_resource;
> + xdr_commit_encode(xdr);
> +
> + maxcount = svc_max_payload(resp->rqstp);
> + maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
> + maxcount = min_t(unsigned long, maxcount, read->rd_length);
> +
> + err = nfsd4_encode_readv(resp, read, file, &maxcount);
If the READ_PLUS implementation can't use the splice read path it's
probably a loss for most workloads.
On 02/05/2015 09:13 AM, Christoph Hellwig wrote:
>> + p = xdr_reserve_space(xdr, 4 + 8 + 4); /* content type, offset, maxcount */
>> + if (!p)
>> + return nfserr_resource;
>> + xdr_commit_encode(xdr);
>> +
>> + maxcount = svc_max_payload(resp->rqstp);
>> + maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
>> + maxcount = min_t(unsigned long, maxcount, read->rd_length);
>> +
>> + err = nfsd4_encode_readv(resp, read, file, &maxcount);
>
> If the READ_PLUS implementation can't use the splice read path it's
> probably a loss for most workloads.
>
I'll play around with the splice path, but I don't think it was designed for encoding multiple read segments.
On Thu, Feb 05, 2015 at 11:06:04AM -0500, Anna Schumaker wrote:
> > If the READ_PLUS implementation can't use the splice read path it's
> > probably a loss for most workloads.
> >
>
> I'll play around with the splice path, but I don't think it was designed for encoding multiple read segments.
The problem is that the typical case of all data won't use splice
ever with your patches, as the 4.2 client will always send a READ_PLUS.
So we'll have to find a way to use it where it helps. While we might be
able to add some hacks to only use splice for the first segment I guess
we just need to make the splice support generic enough in the long run.
On 02/05/2015 11:23 AM, Christoph Hellwig wrote:
> On Thu, Feb 05, 2015 at 11:06:04AM -0500, Anna Schumaker wrote:
>>> If the READ_PLUS implementation can't use the splice read path it's
>>> probably a loss for most workloads.
>>>
>>
>> I'll play around with the splice path, but I don't think it was designed for encoding multiple read segments.
>
> The problem is that the typical case of all data won't use splice
> every with your patches as the 4.2 client will always send a READ_PLUS.
>
> So we'll have to find a way to use it where it helps. While we might be
> able to add some hacks to only use splice for the first segment I guess
> we just need to make the splice support generic enough in the long run.
>
I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
> On 02/05/2015 11:23 AM, Christoph Hellwig wrote:
> > On Thu, Feb 05, 2015 at 11:06:04AM -0500, Anna Schumaker wrote:
> >>> If the READ_PLUS implementation can't use the splice read path it's
> >>> probably a loss for most workloads.
> >>>
> >>
> >> I'll play around with the splice path, but I don't think it was designed for encoding multiple read segments.
> >
> > The problem is that the typical case of all data won't use splice
> > every with your patches as the 4.2 client will always send a READ_PLUS.
> >
> > So we'll have to find a way to use it where it helps. While we might be
> > able to add some hacks to only use splice for the first segment I guess
> > we just need to make the splice support generic enough in the long run.
> >
>
> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
Oh, good thought, yes maybe that would be enough.
But I still wish we had more evidence here.
--b.
On 02/05/2015 11:48 AM, J. Bruce Fields wrote:
> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
>> On 02/05/2015 11:23 AM, Christoph Hellwig wrote:
>>> On Thu, Feb 05, 2015 at 11:06:04AM -0500, Anna Schumaker wrote:
>>>>> If the READ_PLUS implementation can't use the splice read path it's
>>>>> probably a loss for most workloads.
>>>>>
>>>>
>>>> I'll play around with the splice path, but I don't think it was designed for encoding multiple read segments.
>>>
>>> The problem is that the typical case of all data won't use splice
>>> every with your patches as the 4.2 client will always send a READ_PLUS.
>>>
>>> So we'll have to find a way to use it where it helps. While we might be
>>> able to add some hacks to only use splice for the first segment I guess
>>> we just need to make the splice support generic enough in the long run.
>>>
>>
>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
>
> Oh, good thought, yes maybe that would be enough.
>
> But I still wish he had more evidence here.
I'm planning to do some performance testing with this change:
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e2a4702..f516694 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3876,7 +3876,10 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *r
maxcount = min_t(unsigned long, maxcount, read->rd_length);
maxcount = min_t(unsigned long, maxcount, hole_pos - read->rd_offset);
- err = nfsd4_encode_readv(resp, read, file, &maxcount);
+ if ((read->rd_offset == 0) && (read->rd_length == maxcount))
+ err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
+ else
+ err = nfsd4_encode_readv(resp, read, file, &maxcount);
*p++ = cpu_to_be32(NFS4_CONTENT_DATA);
p = xdr_encode_hyper(p, read->rd_offset);
>
> --b.
>
On 02/05/2015 11:48 AM, J. Bruce Fields wrote:
> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
>> On 02/05/2015 11:23 AM, Christoph Hellwig wrote:
>>> On Thu, Feb 05, 2015 at 11:06:04AM -0500, Anna Schumaker wrote:
>>>>> If the READ_PLUS implementation can't use the splice read path it's
>>>>> probably a loss for most workloads.
>>>>>
>>>>
>>>> I'll play around with the splice path, but I don't think it was designed for encoding multiple read segments.
>>>
>>> The problem is that the typical case of all data won't use splice
>>> every with your patches as the 4.2 client will always send a READ_PLUS.
>>>
>>> So we'll have to find a way to use it where it helps. While we might be
>>> able to add some hacks to only use splice for the first segment I guess
>>> we just need to make the splice support generic enough in the long run.
>>>
>>
>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
>
> Oh, good thought, yes maybe that would be enough.
>
> But I still wish he had more evidence here.
I switched to using splice and fixed a few other bugs that happened to pop up. xfstests definitely runs faster compared to the old code, but I haven't had a chance to compare against other NFS versions yet.
Anna
>
> --b.
>
On 02/05/2015 11:48 AM, J. Bruce Fields wrote:
> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
>> On 02/05/2015 11:23 AM, Christoph Hellwig wrote:
>>> On Thu, Feb 05, 2015 at 11:06:04AM -0500, Anna Schumaker wrote:
>>>>> If the READ_PLUS implementation can't use the splice read path it's
>>>>> probably a loss for most workloads.
>>>>>
>>>>
>>>> I'll play around with the splice path, but I don't think it was designed for encoding multiple read segments.
>>>
>>> The problem is that the typical case of all data won't use splice
>>> every with your patches as the 4.2 client will always send a READ_PLUS.
>>>
>>> So we'll have to find a way to use it where it helps. While we might be
>>> able to add some hacks to only use splice for the first segment I guess
>>> we just need to make the splice support generic enough in the long run.
>>>
>>
>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
>
> Oh, good thought, yes maybe that would be enough.
>
> But I still wish he had more evidence here.
I'm not seeing a huge performance increase with READ_PLUS compared to READ (in fact, it's often a bit slower compared to READ, even when using splice). My guess is that the problem is mostly on the client's end since we have to do a memory shift on each segment to get everything lined up properly. I'm playing around with code that cuts down the number of memory shifts, but I still have a few bugs to work out before I'll know if it actually helps.
Anna
>
> --b.
>
On Wed, Feb 11, 2015 at 11:04 AM, Anna Schumaker
<[email protected]> wrote:
> On 02/05/2015 11:48 AM, J. Bruce Fields wrote:
>> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
>>> On 02/05/2015 11:23 AM, Christoph Hellwig wrote:
>>>> On Thu, Feb 05, 2015 at 11:06:04AM -0500, Anna Schumaker wrote:
>>>>>> If the READ_PLUS implementation can't use the splice read path it's
>>>>>> probably a loss for most workloads.
>>>>>>
>>>>>
>>>>> I'll play around with the splice path, but I don't think it was designed for encoding multiple read segments.
>>>>
>>>> The problem is that the typical case of all data won't use splice
>>>> every with your patches as the 4.2 client will always send a READ_PLUS.
>>>>
>>>> So we'll have to find a way to use it where it helps. While we might be
>>>> able to add some hacks to only use splice for the first segment I guess
>>>> we just need to make the splice support generic enough in the long run.
>>>>
>>>
>>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
>>
>> Oh, good thought, yes maybe that would be enough.
>>
>> But I still wish he had more evidence here.
>
> I'm not seeing a huge performance increase with READ_PLUS compared to READ (in fact, it's often a bit slower compared to READ, even when using splice). My guess is that the problem is mostly on the client's end since we have to do a memory shift on each segment to get everything lined up properly. I'm playing around with code that cuts down the number of memory shifts, but I still have a few bugs to work out before I'll know if it actually helps.
>
I'm wondering if the right way to do READ_PLUS would have been to
instead have a separate function READ_SPARSE, that will return a list
of all sparse areas in the supplied range. We could even make that a
READ_SAME, that can do the same for patterned data.
The thing is that READ works just fine for what we want it to do. The
real win here would be if given a very large file, we could request a
list of all the sparse areas in, say, a 100GB range, and then use that
data to build up a bitmap of unallocated blocks for which we can skip
the READ requests.
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Wed, Feb 11, 2015 at 11:13:38AM -0500, Trond Myklebust wrote:
> On Wed, Feb 11, 2015 at 11:04 AM, Anna Schumaker
> <[email protected]> wrote:
> > I'm not seeing a huge performance increase with READ_PLUS compared to READ (in fact, it's often a bit slower compared to READ, even when using splice). My guess is that the problem is mostly on the client's end since we have to do a memory shift on each segment to get everything lined up properly. I'm playing around with code that cuts down the number of memory shifts, but I still have a few bugs to work out before I'll know if it actually helps.
> >
>
> I'm wondering if the right way to do READ_PLUS would have been to
> instead have a separate function READ_SPARSE, that will return a list
> of all sparse areas in the supplied range. We could even make that a
> READ_SAME, that can do the same for patterned data.
I worry about ending up with incoherent results, but perhaps it's no
different from the current behavior since we're already piecing together
our idea of the file content from multiple reads sent in parallel.
> The thing is that READ works just fine for what we want it to do. The
> real win here would be if given a very large file, we could request a
> list of all the sparse areas in, say, a 100GB range, and then use that
> data to build up a bitmap of unallocated blocks for which we can skip
> the READ requests.
Can we start by having the server return a single data extent covering
the whole read request, with the single exception of the case where the
read falls entirely within a hole?
I think that should help in the case of large holes without interfering
with the client's zero-copy logic in the case there are no large holes.
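Very roughly, and reusing the helper names from Anna's patch 3 (a completely
untested sketch; the -ENXIO/past-EOF handling and clamping the hole to
rd_length are glossed over):

	hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
	data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
	if (data_pos == -ENXIO)
		data_pos = i_size_read(file_inode(file));

	if (hole_pos == read->rd_offset &&
	    data_pos >= read->rd_offset + read->rd_length)
		/* request falls entirely within a hole */
		err = nfsd4_encode_read_plus_hole(resp, read, file, data_pos);
	else
		/* anything else: a single data segment for the whole range */
		err = nfsd4_encode_read_plus_data(resp, read, file,
						  i_size_read(file_inode(file)));
	segments++;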
--b.
On Wed, Feb 11, 2015 at 11:22 AM, J. Bruce Fields <[email protected]> wrote:
> On Wed, Feb 11, 2015 at 11:13:38AM -0500, Trond Myklebust wrote:
>> On Wed, Feb 11, 2015 at 11:04 AM, Anna Schumaker
>> <[email protected]> wrote:
>> > I'm not seeing a huge performance increase with READ_PLUS compared to READ (in fact, it's often a bit slower compared to READ, even when using splice). My guess is that the problem is mostly on the client's end since we have to do a memory shift on each segment to get everything lined up properly. I'm playing around with code that cuts down the number of memory shifts, but I still have a few bugs to work out before I'll know if it actually helps.
>> >
>>
>> I'm wondering if the right way to do READ_PLUS would have been to
>> instead have a separate function READ_SPARSE, that will return a list
>> of all sparse areas in the supplied range. We could even make that a
>> READ_SAME, that can do the same for patterned data.
>
> I worry about ending up with incoherent results, but perhaps it's no
> different from the current behavior since we're already piecing together
> our idea of the file content from multiple reads sent in parallel.
I don't see what the problem is. The client sends a READ_SPARSE, and
caches the existence or not of a hole. How is that in any way
different from caching the results of a read that returns no data?
>> The thing is that READ works just fine for what we want it to do. The
>> real win here would be if given a very large file, we could request a
>> list of all the sparse areas in, say, a 100GB range, and then use that
>> data to build up a bitmap of unallocated blocks for which we can skip
>> the READ requests.
>
> Can we start by having the server return a single data extent covering
> the whole read request, with the single exception of the case where the
> read falls entirely within a hole?
>
> I think that should help in the case of large holes without interfering
> with the client's zero-copy logic in the case there are no large holes.
>
That still forces the server to do extra work on each read: it has to
check for the presence of a hole or not instead of just filling the
buffer with data.
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Wed, Feb 11, 2015 at 11:31:43AM -0500, Trond Myklebust wrote:
> On Wed, Feb 11, 2015 at 11:22 AM, J. Bruce Fields <[email protected]> wrote:
> > On Wed, Feb 11, 2015 at 11:13:38AM -0500, Trond Myklebust wrote:
> >> On Wed, Feb 11, 2015 at 11:04 AM, Anna Schumaker
> >> <[email protected]> wrote:
> >> > I'm not seeing a huge performance increase with READ_PLUS compared to READ (in fact, it's often a bit slower compared to READ, even when using splice). My guess is that the problem is mostly on the client's end since we have to do a memory shift on each segment to get everything lined up properly. I'm playing around with code that cuts down the number of memory shifts, but I still have a few bugs to work out before I'll know if it actually helps.
> >> >
> >>
> >> I'm wondering if the right way to do READ_PLUS would have been to
> >> instead have a separate function READ_SPARSE, that will return a list
> >> of all sparse areas in the supplied range. We could even make that a
> >> READ_SAME, that can do the same for patterned data.
> >
> > I worry about ending up with incoherent results, but perhaps it's no
> > different from the current behavior since we're already piecing together
> > our idea of the file content from multiple reads sent in parallel.
>
> I don't see what the problem is. The client sends a READ_SPARSE, and
> caches the existence or not of a hole. How is that in any way
> different from caching the results of a read that returns no data?
>
> >> The thing is that READ works just fine for what we want it to do. The
> >> real win here would be if given a very large file, we could request a
> >> list of all the sparse areas in, say, a 100GB range, and then use that
> >> data to build up a bitmap of unallocated blocks for which we can skip
> >> the READ requests.
> >
> > Can we start by having the server return a single data extent covering
> > the whole read request, with the single exception of the case where the
> > read falls entirely within a hole?
> >
> > I think that should help in the case of large holes without interfering
> > with the client's zero-copy logic in the case there are no large holes.
> >
>
> That still forces the server to do extra work on each read: it has to
> check for the presence of a hole or not instead of just filling the
> buffer with data.
I guess. Presumably the filesystem already does something like that on
read, so I find it hard to imagine that extra check is expensive,
especially if it's amortized over large-ish reads. Seems worth checking
at least.
--b.
On Wed, Feb 11, 2015 at 11:42:19AM -0500, J. Bruce Fields wrote:
> I guess. Presumably the filesystem already does something like that on
> read, so I find it hard to imagine that extra check is expensive,
> especially if it's amortized over large-ish reads. Seems worth checking
> at least.
The filesystem zero-fills blocks when reading them from disk, which is
several layers away from where it interacts with nfsd. We've been
looking at an extension to read that returns EAGAIN when data can't be read
at the moment; it would be logical to base on that and just return a
hole size if there isn't any data, but implementing that would be a fair
amount of work.
On Wed, Feb 11, 2015 at 12:39 PM, Marc Eshel <[email protected]> wrote:
> [email protected] wrote on 02/11/2015 08:31:43 AM:
>
>>
>> On Wed, Feb 11, 2015 at 11:22 AM, J. Bruce Fields
>> <[email protected]> wrote:
>> > On Wed, Feb 11, 2015 at 11:13:38AM -0500, Trond Myklebust wrote:
>> >> On Wed, Feb 11, 2015 at 11:04 AM, Anna Schumaker
>> >> <[email protected]> wrote:
>> >> > I'm not seeing a huge performance increase with READ_PLUS
>> compared to READ (in fact, it's often a bit slower compared to READ,
>> even when using splice). My guess is that the problem is mostly on
>> the client's end since we have to do a memory shift on each segment
>> to get everything lined up properly. I'm playing around with code
>> that cuts down the number of memory shifts, but I still have a few
>> bugs to work out before I'll know if it actually helps.
>> >> >
>> >>
>> >> I'm wondering if the right way to do READ_PLUS would have been to
>> >> instead have a separate function READ_SPARSE, that will return a list
>> >> of all sparse areas in the supplied range. We could even make that a
>> >> READ_SAME, that can do the same for patterned data.
>> >
>> > I worry about ending up with incoherent results, but perhaps it's no
>> > different from the current behavior since we're already piecing together
>> > our idea of the file content from multiple reads sent in parallel.
>>
>> I don't see what the problem is. The client sends a READ_SPARSE, and
>> caches the existence or not of a hole. How is that in any way
>> different from caching the results of a read that returns no data?
>>
>> >> The thing is that READ works just fine for what we want it to do. The
>> >> real win here would be if given a very large file, we could request a
>> >> list of all the sparse areas in, say, a 100GB range, and then use that
>> >> data to build up a bitmap of unallocated blocks for which we can skip
>> >> the READ requests.
>> >
>> > Can we start by having the server return a single data extent covering
>> > the whole read request, with the single exception of the case where the
>> > read falls entirely within a hole?
>> >
>> > I think that should help in the case of large holes without interfering
>> > with the client's zero-copy logic in the case there are no large holes.
>> >
>>
>> That still forces the server to do extra work on each read: it has to
>> check for the presence of a hole or not instead of just filling the
>> buffer with data.
>
> A good hint that we are dealing with a sparse file is the the number of
> blocks don't add up to the reported filesize
> Marc.
Sure, but that still adds up to an unnecessary inefficiency on each
READ_PLUS call to that file.
My point is that the best way for the client to process this
information (and for the server too) is to read the sparseness map in
once as a bulk operation on a very large chunk of the file, and then
to use that map as a guide for when it needs to call READ.
--
Trond Myklebust
Linux NFS client maintainer, PrimaryData
[email protected]
On Wed, Feb 11, 2015 at 12:47:26PM -0500, Trond Myklebust wrote:
> On Wed, Feb 11, 2015 at 12:39 PM, Marc Eshel <[email protected]> wrote:
> >
> > A good hint that we are dealing with a sparse file is the the number of
> > blocks don't add up to the reported filesize
>
> Sure, but that still adds up to an unnecessary inefficiency on each
> READ_PLUS call to that file.
>
> My point is that the best way for the client to process this
> information (and for the server too) is to read the sparseness map in
> once as a bulk operation on a very large chunk of the file, and then
> to use that map as a guide for when it needs to call READ.
I thought we agreed that the sparseness map made no sense because
the information was immediately stale?
Anyway, the client could build that map if it wanted to using SEEK. I'm
not arguing that it would be efficient, but that it could be done
with a cycle of looking for holes.
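Something like this userspace sketch, where each lseek() would turn into a
SEEK operation on the wire (record_hole() is made up, and the error handling
is minimal):

	#define _GNU_SOURCE
	#include <unistd.h>

	extern void record_hole(off_t off, off_t len);	/* made-up callback */

	/* Record every hole in [start, end) of fd. */
	static void map_holes(int fd, off_t start, off_t end)
	{
		off_t pos = start;

		while (pos < end) {
			off_t hole = lseek(fd, pos, SEEK_HOLE);
			if (hole == (off_t)-1 || hole >= end)
				break;			/* no more holes in range */
			off_t data = lseek(fd, hole, SEEK_DATA);
			if (data == (off_t)-1)
				data = end;		/* hole runs out to EOF */
			record_hole(hole, data - hole);
			pos = data;
		}
	}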
On Wed, Feb 11, 2015 at 1:17 PM, Tom Haynes
<[email protected]> wrote:
> On Wed, Feb 11, 2015 at 12:47:26PM -0500, Trond Myklebust wrote:
>> On Wed, Feb 11, 2015 at 12:39 PM, Marc Eshel <[email protected]> wrote:
>> >
>> > A good hint that we are dealing with a sparse file is the the number of
>> > blocks don't add up to the reported filesize
>>
>> Sure, but that still adds up to an unnecessary inefficiency on each
>> READ_PLUS call to that file.
>>
>> My point is that the best way for the client to process this
>> information (and for the server too) is to read the sparseness map in
>> once as a bulk operation on a very large chunk of the file, and then
>> to use that map as a guide for when it needs to call READ.
>
> I thought we agreed that the sparseness map made no sense because
> the information was immeadiately stale?
Right now, we're caching the zero reads, aren't we? What makes caching
hole information so special?
> Anyway, the client could build that map if it wanted to using SEEK. I'm
> not arguing that it would be efficient, but that it could be done
> with a cycle of looking for holes.
Sure, however that's not necessarily useful as an optimiser either.
On 02/11/2015 01:49 PM, Trond Myklebust wrote:
> On Wed, Feb 11, 2015 at 1:17 PM, Tom Haynes
> <[email protected]> wrote:
>> On Wed, Feb 11, 2015 at 12:47:26PM -0500, Trond Myklebust wrote:
>>> On Wed, Feb 11, 2015 at 12:39 PM, Marc Eshel <[email protected]> wrote:
>>>>
>>>> A good hint that we are dealing with a sparse file is the the number of
>>>> blocks don't add up to the reported filesize
>>>
>>> Sure, but that still adds up to an unnecessary inefficiency on each
>>> READ_PLUS call to that file.
>>>
>>> My point is that the best way for the client to process this
>>> information (and for the server too) is to read the sparseness map in
>>> once as a bulk operation on a very large chunk of the file, and then
>>> to use that map as a guide for when it needs to call READ.
>>
>> I thought we agreed that the sparseness map made no sense because
>> the information was immeadiately stale?
>
> Right now, we're caching the zero reads, aren't we? What makes caching
> hole information so special?
>
>> Anyway, the client could build that map if it wanted to using SEEK. I'm
>> not arguing that it would be efficient, but that it could be done
>> with a cycle of looking for holes.
>
> Sure, however that's not necessarily useful as an optimiser either.
The vfs on Linux doesn't keep a sparse map either, so the server would be stuck doing seeks to build this up anyway. I've already seen that this can be somewhat unreliable while working on READ_PLUS.
On 02/11/2015 11:22 AM, J. Bruce Fields wrote:
> On Wed, Feb 11, 2015 at 11:13:38AM -0500, Trond Myklebust wrote:
>> On Wed, Feb 11, 2015 at 11:04 AM, Anna Schumaker
>> <[email protected]> wrote:
>>> I'm not seeing a huge performance increase with READ_PLUS compared to READ (in fact, it's often a bit slower compared to READ, even when using splice). My guess is that the problem is mostly on the client's end since we have to do a memory shift on each segment to get everything lined up properly. I'm playing around with code that cuts down the number of memory shifts, but I still have a few bugs to work out before I'll know if it actually helps.
>>>
>>
>> I'm wondering if the right way to do READ_PLUS would have been to
>> instead have a separate function READ_SPARSE, that will return a list
>> of all sparse areas in the supplied range. We could even make that a
>> READ_SAME, that can do the same for patterned data.
>
> I worry about ending up with incoherent results, but perhaps it's no
> different from the current behavior since we're already piecing together
> our idea of the file content from multiple reads sent in parallel.
>
>> The thing is that READ works just fine for what we want it to do. The
>> real win here would be if given a very large file, we could request a
>> list of all the sparse areas in, say, a 100GB range, and then use that
>> data to build up a bitmap of unallocated blocks for which we can skip
>> the READ requests.
>
> Can we start by having the server return a single data extent covering
> the whole read request, with the single exception of the case where the
> read falls entirely within a hole?
I'm trying this and it's still giving me pretty bad performance. I picked out 6 xfstests that read sparse files, and v4.2 takes almost a minute longer to run compared to v4.1. (1:30 vs 2:22).
I'm going to look into how I zero pages on the client - maybe that can be combined with the right-shift function so pages only need to be mapped into memory once.
Anna
>
> I think that should help in the case of large holes without interfering
> with the client's zero-copy logic in the case there are no large holes.
>
> --b.
>
On 02/11/2015 02:22 PM, Anna Schumaker wrote:
> On 02/11/2015 11:22 AM, J. Bruce Fields wrote:
>> On Wed, Feb 11, 2015 at 11:13:38AM -0500, Trond Myklebust wrote:
>>> On Wed, Feb 11, 2015 at 11:04 AM, Anna Schumaker
>>> <[email protected]> wrote:
>>>> I'm not seeing a huge performance increase with READ_PLUS compared to READ (in fact, it's often a bit slower compared to READ, even when using splice). My guess is that the problem is mostly on the client's end since we have to do a memory shift on each segment to get everything lined up properly. I'm playing around with code that cuts down the number of memory shifts, but I still have a few bugs to work out before I'll know if it actually helps.
>>>>
>>>
>>> I'm wondering if the right way to do READ_PLUS would have been to
>>> instead have a separate function READ_SPARSE, that will return a list
>>> of all sparse areas in the supplied range. We could even make that a
>>> READ_SAME, that can do the same for patterned data.
>>
>> I worry about ending up with incoherent results, but perhaps it's no
>> different from the current behavior since we're already piecing together
>> our idea of the file content from multiple reads sent in parallel.
>>
>>> The thing is that READ works just fine for what we want it to do. The
>>> real win here would be if given a very large file, we could request a
>>> list of all the sparse areas in, say, a 100GB range, and then use that
>>> data to build up a bitmap of unallocated blocks for which we can skip
>>> the READ requests.
>>
>> Can we start by having the server return a single data extent covering
>> the whole read request, with the single exception of the case where the
>> read falls entirely within a hole?
>
> I'm trying this and it's still giving me pretty bad performance. I picked out 6 xfstests that read sparse files, and v4.2 takes almost a minute longer to run compared to v4.1. (1:30 vs 2:22).
>
> I'm going to look into how I zero pages on the client - maybe that can be combined with the right-shift function so pages only need to be mapped into memory once.
Today I learned all about how to use operf to identify where the bottleneck is :). It looks like the problem is in the hole zeroing code on the client side. Is there a better way than memset() to mark a page as all zeros?
Anna
>
> Anna
>
>>
>> I think that should help in the case of large holes without interfering
>> with the client's zero-copy logic in the case there are no large holes.
>>
>> --b.
>>
>
On Thu, Feb 12, 2015 at 02:59:45PM -0500, Anna Schumaker wrote:
> On 02/11/2015 02:22 PM, Anna Schumaker wrote:
> > On 02/11/2015 11:22 AM, J. Bruce Fields wrote:
> >> On Wed, Feb 11, 2015 at 11:13:38AM -0500, Trond Myklebust wrote:
> >>> On Wed, Feb 11, 2015 at 11:04 AM, Anna Schumaker
> >>> <[email protected]> wrote:
> >>>> I'm not seeing a huge performance increase with READ_PLUS compared to READ (in fact, it's often a bit slower compared to READ, even when using splice). My guess is that the problem is mostly on the client's end since we have to do a memory shift on each segment to get everything lined up properly. I'm playing around with code that cuts down the number of memory shifts, but I still have a few bugs to work out before I'll know if it actually helps.
> >>>>
> >>>
> >>> I'm wondering if the right way to do READ_PLUS would have been to
> >>> instead have a separate function READ_SPARSE, that will return a list
> >>> of all sparse areas in the supplied range. We could even make that a
> >>> READ_SAME, that can do the same for patterned data.
> >>
> >> I worry about ending up with incoherent results, but perhaps it's no
> >> different from the current behavior since we're already piecing together
> >> our idea of the file content from multiple reads sent in parallel.
> >>
> >>> The thing is that READ works just fine for what we want it to do. The
> >>> real win here would be if given a very large file, we could request a
> >>> list of all the sparse areas in, say, a 100GB range, and then use that
> >>> data to build up a bitmap of unallocated blocks for which we can skip
> >>> the READ requests.
> >>
> >> Can we start by having the server return a single data extent covering
> >> the whole read request, with the single exception of the case where the
> >> read falls entirely within a hole?
> >
> > I'm trying this and it's still giving me pretty bad performance. I picked out 6 xfstests that read sparse files, and v4.2 takes almost a minute longer to run compared to v4.1. (1:30 vs 2:22).
> >
> > I'm going to look into how I zero pages on the client - maybe that can be combined with the right-shift function so pages only need to be mapped into memory once.
>
> Today I learned all about how to use operf to identify where the bottleneck is :). It looks like the problem is in the hole zeroing code on the client side. Is there a better way than memset() to mark a page as all zeros?
I'm a little surprised that replacing a READ with a READ_PLUS + memset
is slower, and I'm really surprised that it's a minute slower. Are we
sure there isn't something else going on?
Does comparing the operation counts (from /proc/self/mountstats) show
anything interesting?
--b.
On Wed, Feb 11, 2015 at 11:13:38AM -0500, Trond Myklebust wrote:
> I'm wondering if the right way to do READ_PLUS would have been to
> instead have a separate function READ_SPARSE, that will return a list
> of all sparse areas in the supplied range. We could even make that a
> READ_SAME, that can do the same for patterned data.
That sounds like the protocol-level equivalent of the fiemap ioctl.
While fiemap is useful for filesystem debugging, using it for detection
of sparse ranges in coreutils turned out to be a disaster, and we instead
moved to supporting lseek SEEK_DATA/SEEK_HOLE, which would map to SEEK in
NFSv4.2.
The userspace tools use the difference in file size vs allocated blocks
in the stat data to decide whether to use SEEK_DATA/SEEK_HOLE, which should
work just fine over NFSv4.2 as well.
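Roughly this check, give or take (a sketch only; coreutils has more
refinements and special cases than this):

	#include <stdbool.h>
	#include <sys/stat.h>

	/* Heuristic: if fewer 512-byte blocks are allocated than the file
	 * size implies, the file is probably sparse and a
	 * SEEK_HOLE/SEEK_DATA pass is worth the trouble.
	 */
	static bool probably_sparse(const struct stat *st)
	{
		return (off_t)st->st_blocks * 512 < st->st_size;
	}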
On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
> > The problem is that the typical case of all data won't use splice
> > every with your patches as the 4.2 client will always send a READ_PLUS.
> >
> > So we'll have to find a way to use it where it helps. While we might be
> > able to add some hacks to only use splice for the first segment I guess
> > we just need to make the splice support generic enough in the long run.
> >
>
> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
You could also elect to never return more than one data segment as a
start:
    In all situations, the
    server may choose to return fewer bytes than specified by the client.
    The client needs to check for this condition and handle the
    condition appropriately.
But doing any of these for a call that's really just an optimization
sounds odd. I'd really like to see an evaluation of the READ_PLUS
impact on various workloads before offering it.
On Fri, Feb 06, 2015 at 03:54:56AM -0800, Christoph Hellwig wrote:
> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
> > > The problem is that the typical case of all data won't use splice
> > > every with your patches as the 4.2 client will always send a READ_PLUS.
> > >
> > > So we'll have to find a way to use it where it helps. While we might be
> > > able to add some hacks to only use splice for the first segment I guess
> > > we just need to make the splice support generic enough in the long run.
> > >
> >
> > I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
>
> You could also elect to never return more than one data segment as a
> start:
>
> In all situations, the
> server may choose to return fewer bytes than specified by the client.
> The client needs to check for this condition and handle the
> condition appropriately.
Yeah, I think that was more-or-less what Anna's first attempt did and I
said "what if that means more round trips"? The client can't anticipate
the short reads so it can't make up for this with parallelism.
> But doing any of these for a call that's really just an optimization
> soudns odd. I'd really like to see an evaluation of the READ_PLUS
> impact on various workloads before offering it.
Yes, unfortunately I don't see a way to make this just an obvious win.
(Is there any way we could make it so with better protocol? Maybe RDMA
could help get the alignment right in multiple-segment cases? But then
I think there needs to be some sort of language about RDMA, or else
we're stuck with:
https://tools.ietf.org/html/rfc5667#section-5
which I think forces us to return READ_PLUS data inline, another
possible READ_PLUS regression.)
Anyway, ignoring RDMA, I guess the interesting questions to me are:
- How much does READ_PLUS actually help in the cases it was
meant to? Perhaps create and age a VM image somehow, and then
time a read of the entire image?
- What are the worst-case regressions? Read a large file with a
lot of small holes and measure time and cpu usage?
--b.
On Fri, Feb 06, 2015 at 11:08:48AM -0500, J. Bruce Fields wrote:
> On Fri, Feb 06, 2015 at 03:54:56AM -0800, Christoph Hellwig wrote:
> > On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
> > > > The problem is that the typical case of all data won't use splice
> > > > every with your patches as the 4.2 client will always send a READ_PLUS.
> > > >
> > > > So we'll have to find a way to use it where it helps. While we might be
> > > > able to add some hacks to only use splice for the first segment I guess
> > > > we just need to make the splice support generic enough in the long run.
> > > >
> > >
> > > I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
> >
> > You could also elect to never return more than one data segment as a
> > start:
> >
> > In all situations, the
> > server may choose to return fewer bytes than specified by the client.
> > The client needs to check for this condition and handle the
> > condition appropriately.
>
> Yeah, I think that was more-or-less what Anna's first attempt did and I
> said "what if that means more round trips"? The client can't anticipate
> the short reads so it can't make up for this with parallelism.
>
> > But doing any of these for a call that's really just an optimization
> > soudns odd. I'd really like to see an evaluation of the READ_PLUS
> > impact on various workloads before offering it.
>
> Yes, unfortunately I don't see a way to make this just an obvious win.
Actually, the other fallback is to return a single data segment, which
shouldn't be any worse than READ as long as the client assumes that's
the normal case.
So as a first pass another idea might be to return a hole *only* if it
covers the entire requested range and otherwise represent the result as
a single data segment?
(Or you could imagine various ways to make it more complicated: e.g.
also allow 1 data segment + 1 hole, also representable with an xdr buf
and without screwing up the client's alignment expectations. Or allow
additional data segments if they're small enough that we think the
copying cost would be negligible.)
And turn off READ_PLUS over RDMA, but that seems like something that
should be fixed in the spec.
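(A rough sketch of that first-pass policy, assuming the server can call
vfs_llseek() with SEEK_DATA; the helper name is hypothetical and this is
not code from Anna's series:)

   static bool nfsd4_read_plus_is_hole(struct file *file, loff_t offset,
                                       unsigned long count)
   {
           loff_t next_data = vfs_llseek(file, offset, SEEK_DATA);

           if (next_data == -ENXIO)
                   return true;    /* no data at or beyond offset */
           if (next_data < 0)
                   return false;   /* error or unsupported: encode as data */

           /* Only claim a hole if data starts past the requested range. */
           return next_data >= offset + (loff_t)count;
   }

Anything that fails this test would then be encoded as a single plain
data segment, keeping the splice path usable.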
--b.
>
> (Is there any way we could make it so with better protocol? Maybe RDMA
> could help get the alignment right in multiple-segment cases? But then
> I think there needs to be some sort of language about RDMA, or else
> we're stuck with:
>
> https://tools.ietf.org/html/rfc5667#section-5
>
> which I think forces us to return READ_PLUS data inline, another
> possible READ_PLUS regression.)
>
> Anyway, ignoring RDMA, I guess the interesting questions to me are:
>
> - How much does READ_PLUS actually help in the cases it was
> meant to? Perhaps create and age a VM image somehow, and then
> time a read of the entire image?
> - What are the worst-case regressions? Read a large file with a
> lot of small holes and measure time and cpu usage?
>
> --b.
On Feb 6, 2015, at 11:08 AM, J. Bruce Fields <[email protected]> wrote:
> On Fri, Feb 06, 2015 at 03:54:56AM -0800, Christoph Hellwig wrote:
>> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
>>>> The problem is that the typical case of all data won't use splice
>>>> every with your patches as the 4.2 client will always send a READ_PLUS.
>>>>
>>>> So we'll have to find a way to use it where it helps. While we might be
>>>> able to add some hacks to only use splice for the first segment I guess
>>>> we just need to make the splice support generic enough in the long run.
>>>>
>>>
>>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
>>
>> You could also elect to never return more than one data segment as a
>> start:
>>
>> In all situations, the
>> server may choose to return fewer bytes than specified by the client.
>> The client needs to check for this condition and handle the
>> condition appropriately.
>
> Yeah, I think that was more-or-less what Anna's first attempt did and I
> said "what if that means more round trips"? The client can't anticipate
> the short reads so it can't make up for this with parallelism.
>
>> But doing any of these for a call that's really just an optimization
>> soudns odd. I'd really like to see an evaluation of the READ_PLUS
>> impact on various workloads before offering it.
>
> Yes, unfortunately I don't see a way to make this just an obvious win.
I don’t think a “win” is necessary. It simply needs to be no worse than
READ for current use cases.
READ_PLUS should be a win for the particular use cases it was
designed for (large sparsely-populated datasets). Without a
demonstrated benefit I think there’s no point in keeping it.
> (Is there any way we could make it so with better protocol? Maybe RDMA
> could help get the alignment right in multiple-segment cases? But then
> I think there needs to be some sort of language about RDMA, or else
> we're stuck with:
>
> https://tools.ietf.org/html/rfc5667#section-5
>
> which I think forces us to return READ_PLUS data inline, another
> possible READ_PLUS regression.)
NFSv4.2 currently does not have a binding to RPC/RDMA. It’s hard to
say at this point what a READ_PLUS on RPC/RDMA might look like.
RDMA clearly provides no advantage for moving a pattern that a
client must re-inflate into data itself. I can guess that only the
CONTENT_DATA case is interesting for RDMA bulk transfers.
But don’t forget that NFSv4.1 and later don’t yet work over RDMA,
thanks to missing support for bi-directional RPC/RDMA. I wouldn’t
worry about special cases for it at this point.
--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com
On Feb 6, 2015, at 11:46 AM, Chuck Lever <[email protected]> wrote:
>
> On Feb 6, 2015, at 11:08 AM, J. Bruce Fields <[email protected]> wrote:
>
>> On Fri, Feb 06, 2015 at 03:54:56AM -0800, Christoph Hellwig wrote:
>>> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
>>>>> The problem is that the typical case of all data won't use splice
>>>>> every with your patches as the 4.2 client will always send a READ_PLUS.
>>>>>
>>>>> So we'll have to find a way to use it where it helps. While we might be
>>>>> able to add some hacks to only use splice for the first segment I guess
>>>>> we just need to make the splice support generic enough in the long run.
>>>>>
>>>>
>>>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
>>>
>>> You could also elect to never return more than one data segment as a
>>> start:
>>>
>>> In all situations, the
>>> server may choose to return fewer bytes than specified by the client.
>>> The client needs to check for this condition and handle the
>>> condition appropriately.
>>
>> Yeah, I think that was more-or-less what Anna's first attempt did and I
>> said "what if that means more round trips"? The client can't anticipate
>> the short reads so it can't make up for this with parallelism.
>>
>>> But doing any of these for a call that's really just an optimization
>>> soudns odd. I'd really like to see an evaluation of the READ_PLUS
>>> impact on various workloads before offering it.
>>
>> Yes, unfortunately I don't see a way to make this just an obvious win.
>
> I don’t think a “win” is necessary. It simply needs to be no worse than
> READ for current use cases.
>
> READ_PLUS should be a win for the particular use cases it was
> designed for (large sparsely-populated datasets). Without a
> demonstrated benefit I think there’s no point in keeping it.
>
>> (Is there any way we could make it so with better protocol? Maybe RDMA
>> could help get the alignment right in multiple-segment cases? But then
>> I think there needs to be some sort of language about RDMA, or else
>> we're stuck with:
>>
>> https://tools.ietf.org/html/rfc5667#section-5
>>
>> which I think forces us to return READ_PLUS data inline, another
>> possible READ_PLUS regression.)
Btw, if I understand this correctly:
Without a spec update, a large NFS READ_PLUS reply would be returned
in a reply list, which is moved via RDMA WRITE, just like READ
replies.
The difference is NFS READ payload is placed directly into the
client’s page cache by the adapter. With a reply list, the client
transport would need to copy the returned data into the page cache.
And a large reply buffer would be needed.
So, slower, yes. But not inline.
> NFSv4.2 currently does not have a binding to RPC/RDMA.
Right, this means a spec update is needed. I agree with you, and
it’s on our list.
> It’s hard to
> say at this point what a READ_PLUS on RPC/RDMA might look like.
>
> RDMA clearly provides no advantage for moving a pattern that a
> client must re-inflate into data itself. I can guess that only the
> CONTENT_DATA case is interesting for RDMA bulk transfers.
>
> But don’t forget that NFSv4.1 and later don’t yet work over RDMA,
> thanks to missing support for bi-directional RPC/RDMA. I wouldn’t
> worry about special cases for it at this point.
>
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com
On Fri, Feb 06, 2015 at 12:04:13PM -0500, Chuck Lever wrote:
>
> On Feb 6, 2015, at 11:46 AM, Chuck Lever <[email protected]> wrote:
>
> >
> > On Feb 6, 2015, at 11:08 AM, J. Bruce Fields <[email protected]> wrote:
> >
> >> On Fri, Feb 06, 2015 at 03:54:56AM -0800, Christoph Hellwig wrote:
> >>> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
> >>>>> The problem is that the typical case of all data won't use splice
> >>>>> every with your patches as the 4.2 client will always send a READ_PLUS.
> >>>>>
> >>>>> So we'll have to find a way to use it where it helps. While we might be
> >>>>> able to add some hacks to only use splice for the first segment I guess
> >>>>> we just need to make the splice support generic enough in the long run.
> >>>>>
> >>>>
> >>>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
> >>>
> >>> You could also elect to never return more than one data segment as a
> >>> start:
> >>>
> >>> In all situations, the
> >>> server may choose to return fewer bytes than specified by the client.
> >>> The client needs to check for this condition and handle the
> >>> condition appropriately.
> >>
> >> Yeah, I think that was more-or-less what Anna's first attempt did and I
> >> said "what if that means more round trips"? The client can't anticipate
> >> the short reads so it can't make up for this with parallelism.
> >>
> >>> But doing any of these for a call that's really just an optimization
> >>> soudns odd. I'd really like to see an evaluation of the READ_PLUS
> >>> impact on various workloads before offering it.
> >>
> >> Yes, unfortunately I don't see a way to make this just an obvious win.
> >
> > I don’t think a “win” is necessary. It simply needs to be no worse than
> > READ for current use cases.
> >
> > READ_PLUS should be a win for the particular use cases it was
> > designed for (large sparsely-populated datasets). Without a
> > demonstrated benefit I think there’s no point in keeping it.
> >
> >> (Is there any way we could make it so with better protocol? Maybe RDMA
> >> could help get the alignment right in multiple-segment cases? But then
> >> I think there needs to be some sort of language about RDMA, or else
> >> we're stuck with:
> >>
> >> https://tools.ietf.org/html/rfc5667#section-5
> >>
> >> which I think forces us to return READ_PLUS data inline, another
> >> possible READ_PLUS regression.)
>
> Btw, if I understand this correctly:
>
> Without a spec update, a large NFS READ_PLUS reply would be returned
> in a reply list, which is moved via RDMA WRITE, just like READ
> replies.
>
> The difference is NFS READ payload is placed directly into the
> client’s page cache by the adapter. With a reply list, the client
> transport would need to copy the returned data into the page cache.
> And a large reply buffer would be needed.
>
> So, slower, yes. But not inline.
I'm not very good at this, bear with me, but: the above-referenced
section doesn't talk about "reply lists", only "write lists", and only
explains how to use write lists for READ and READLINK data, and seems to
expect everything else to be sent inline.
> > NFSv4.2 currently does not have a binding to RPC/RDMA.
>
> Right, this means a spec update is needed. I agree with you, and
> it’s on our list.
OK, so that would go in some kind of update to 5667 rather than in the
minor version 2 spec?
Discussing this in the READ_PLUS description would also seem helpful to
me, but OK I don't really have a strong opinion.
--b.
>
> > It’s hard to
> > say at this point what a READ_PLUS on RPC/RDMA might look like.
> >
> > RDMA clearly provides no advantage for moving a pattern that a
> > client must re-inflate into data itself. I can guess that only the
> > CONTENT_DATA case is interesting for RDMA bulk transfers.
> >
> > But don’t forget that NFSv4.1 and later don’t yet work over RDMA,
> > thanks to missing support for bi-directional RPC/RDMA. I wouldn’t
> > worry about special cases for it at this point.
> >
> > --
> > Chuck Lever
> > chuck[dot]lever[at]oracle[dot]com
>
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
>
>
On Feb 6, 2015, at 12:59 PM, J. Bruce Fields <[email protected]> wrote:
> On Fri, Feb 06, 2015 at 12:04:13PM -0500, Chuck Lever wrote:
>>
>> On Feb 6, 2015, at 11:46 AM, Chuck Lever <[email protected]> wrote:
>>
>>>
>>> On Feb 6, 2015, at 11:08 AM, J. Bruce Fields <[email protected]> wrote:
>>>
>>>> On Fri, Feb 06, 2015 at 03:54:56AM -0800, Christoph Hellwig wrote:
>>>>> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
>>>>>>> The problem is that the typical case of all data won't use splice
>>>>>>> every with your patches as the 4.2 client will always send a READ_PLUS.
>>>>>>>
>>>>>>> So we'll have to find a way to use it where it helps. While we might be
>>>>>>> able to add some hacks to only use splice for the first segment I guess
>>>>>>> we just need to make the splice support generic enough in the long run.
>>>>>>>
>>>>>>
>>>>>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
>>>>>
>>>>> You could also elect to never return more than one data segment as a
>>>>> start:
>>>>>
>>>>> In all situations, the
>>>>> server may choose to return fewer bytes than specified by the client.
>>>>> The client needs to check for this condition and handle the
>>>>> condition appropriately.
>>>>
>>>> Yeah, I think that was more-or-less what Anna's first attempt did and I
>>>> said "what if that means more round trips"? The client can't anticipate
>>>> the short reads so it can't make up for this with parallelism.
>>>>
>>>>> But doing any of these for a call that's really just an optimization
>>>>> soudns odd. I'd really like to see an evaluation of the READ_PLUS
>>>>> impact on various workloads before offering it.
>>>>
>>>> Yes, unfortunately I don't see a way to make this just an obvious win.
>>>
>>> I don’t think a “win” is necessary. It simply needs to be no worse than
>>> READ for current use cases.
>>>
>>> READ_PLUS should be a win for the particular use cases it was
>>> designed for (large sparsely-populated datasets). Without a
>>> demonstrated benefit I think there’s no point in keeping it.
>>>
>>>> (Is there any way we could make it so with better protocol? Maybe RDMA
>>>> could help get the alignment right in multiple-segment cases? But then
>>>> I think there needs to be some sort of language about RDMA, or else
>>>> we're stuck with:
>>>>
>>>> https://tools.ietf.org/html/rfc5667#section-5
>>>>
>>>> which I think forces us to return READ_PLUS data inline, another
>>>> possible READ_PLUS regression.)
>>
>> Btw, if I understand this correctly:
>>
>> Without a spec update, a large NFS READ_PLUS reply would be returned
>> in a reply list, which is moved via RDMA WRITE, just like READ
>> replies.
>>
>> The difference is NFS READ payload is placed directly into the
>> client’s page cache by the adapter. With a reply list, the client
>> transport would need to copy the returned data into the page cache.
>> And a large reply buffer would be needed.
>>
>> So, slower, yes. But not inline.
>
> I'm not very good at this, bear with me, but: the above-referenced
> section doesn't talk about "reply lists", only "write lists", and only
> explains how to use write lists for READ and READLINK data, and seems to expect everything else to be sent inline.
I may have some details wrong, but this is my understanding.
Small replies are sent inline. There is a size maximum for inline
messages, however. I guess 5667 section 5 assumes this context, which
appears throughout RFC 5666.
If an expected reply exceeds the inline size, then a client will
set up a reply list for the server. A memory region on the client is
registered as a target for RDMA WRITE operations, and the co-ordinates
of that region are sent to the server in the RPC call.
If the server finds the reply will indeed be larger than the inline
maximum, it plants the reply in the client memory region described by
the request’s reply list, and repeats the co-ordinates of that region
back to the client in the RPC reply.
A server may also choose to send a small reply inline, even if the
client provided a reply list. In that case, the server does not
repeat the reply list in the reply, and the full reply appears
inline.
Linux registers part of the RPC reply buffer for the reply list. After
it is received on the client, the reply payload is copied by the client
CPU to its final destination.
Inline and reply list are the mechanisms used when the upper layer
has some processing to do to the incoming data (eg READDIR). When
a request just needs raw data to be simply dropped off in the client’s
memory, then the write list is preferred. A write list is basically a
zero-copy I/O.
But these choices are fixed by the specified RPC/RDMA binding of the
upper layer protocol (that’s what RFC 5667 is). NFS READ and READLINK
are the only NFS operations allowed to use a write list. (NFSv4
compounds are somewhat ambiguous, and that too needs to be addressed).
As READ_PLUS conveys both kinds of data (zero-copy and data that
might require some processing) IMO RFC 5667 does not provide adequate
guidance about how to convey READ_PLUS. It will need to be added
somewhere.
>>> NFSv4.2 currently does not have a binding to RPC/RDMA.
>>
>> Right, this means a spec update is needed. I agree with you, and
>> it’s on our list.
>
> OK, so that would go in some kind of update to 5667 rather than in the
> minor version 2 spec?
The WG has to decide whether an update to 5667 or a new document will
be the ultimate vehicle.
> Discussing this in the READ_PLUS description would also seem helpful to
> me, but OK I don't really have a strong opinion.
If there is a precedent, it’s probably that the RPC/RDMA binding is
specified in a separate document. I suspect there won’t be much
appetite for holding up NFSv4.2 for an RPC/RDMA binding.
> --b.
>
>>
>>> It’s hard to
>>> say at this point what a READ_PLUS on RPC/RDMA might look like.
>>>
>>> RDMA clearly provides no advantage for moving a pattern that a
>>> client must re-inflate into data itself. I can guess that only the
>>> CONTENT_DATA case is interesting for RDMA bulk transfers.
>>>
>>> But don’t forget that NFSv4.1 and later don’t yet work over RDMA,
>>> thanks to missing support for bi-directional RPC/RDMA. I wouldn’t
>>> worry about special cases for it at this point.
>>>
>>> --
>>> Chuck Lever
>>> chuck[dot]lever[at]oracle[dot]com
>>
>> --
>> Chuck Lever
>> chuck[dot]lever[at]oracle[dot]com
>>
>>
--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com
On Fri, Feb 06, 2015 at 01:44:15PM -0500, Chuck Lever wrote:
>
> On Feb 6, 2015, at 12:59 PM, J. Bruce Fields <[email protected]> wrote:
>
> > On Fri, Feb 06, 2015 at 12:04:13PM -0500, Chuck Lever wrote:
> >>
> >> On Feb 6, 2015, at 11:46 AM, Chuck Lever <[email protected]> wrote:
> >>
> >>>
> >>> On Feb 6, 2015, at 11:08 AM, J. Bruce Fields <[email protected]> wrote:
> >>>
> >>>> On Fri, Feb 06, 2015 at 03:54:56AM -0800, Christoph Hellwig wrote:
> >>>>> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
> >>>>>>> The problem is that the typical case of all data won't use splice
> >>>>>>> every with your patches as the 4.2 client will always send a READ_PLUS.
> >>>>>>>
> >>>>>>> So we'll have to find a way to use it where it helps. While we might be
> >>>>>>> able to add some hacks to only use splice for the first segment I guess
> >>>>>>> we just need to make the splice support generic enough in the long run.
> >>>>>>>
> >>>>>>
> >>>>>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
> >>>>>
> >>>>> You could also elect to never return more than one data segment as a
> >>>>> start:
> >>>>>
> >>>>> In all situations, the
> >>>>> server may choose to return fewer bytes than specified by the client.
> >>>>> The client needs to check for this condition and handle the
> >>>>> condition appropriately.
> >>>>
> >>>> Yeah, I think that was more-or-less what Anna's first attempt did and I
> >>>> said "what if that means more round trips"? The client can't anticipate
> >>>> the short reads so it can't make up for this with parallelism.
> >>>>
> >>>>> But doing any of these for a call that's really just an optimization
> >>>>> soudns odd. I'd really like to see an evaluation of the READ_PLUS
> >>>>> impact on various workloads before offering it.
> >>>>
> >>>> Yes, unfortunately I don't see a way to make this just an obvious win.
> >>>
> >>> I don’t think a “win” is necessary. It simply needs to be no worse than
> >>> READ for current use cases.
> >>>
> >>> READ_PLUS should be a win for the particular use cases it was
> >>> designed for (large sparsely-populated datasets). Without a
> >>> demonstrated benefit I think there’s no point in keeping it.
> >>>
> >>>> (Is there any way we could make it so with better protocol? Maybe RDMA
> >>>> could help get the alignment right in multiple-segment cases? But then
> >>>> I think there needs to be some sort of language about RDMA, or else
> >>>> we're stuck with:
> >>>>
> >>>> https://tools.ietf.org/html/rfc5667#section-5
> >>>>
> >>>> which I think forces us to return READ_PLUS data inline, another
> >>>> possible READ_PLUS regression.)
> >>
> >> Btw, if I understand this correctly:
> >>
> >> Without a spec update, a large NFS READ_PLUS reply would be returned
> >> in a reply list, which is moved via RDMA WRITE, just like READ
> >> replies.
> >>
> >> The difference is NFS READ payload is placed directly into the
> >> client’s page cache by the adapter. With a reply list, the client
> >> transport would need to copy the returned data into the page cache.
> >> And a large reply buffer would be needed.
> >>
> >> So, slower, yes. But not inline.
> >
> > I'm not very good at this, bear with me, but: the above-referenced
> > section doesn't talk about "reply lists", only "write lists", and only
> > explains how to use write lists for READ and READLINK data, and seems to expect everything else to be sent inline.
>
> I may have some details wrong, but this is my understanding.
>
> Small replies are sent inline. There is a size maximum for inline
> messages, however. I guess 5667 section 5 assumes this context, which
> appears throughout RFC 5666.
>
> If an expected reply exceeds the inline size, then a client will
> set up a reply list for the server. A memory region on the client is
> registered as a target for RDMA WRITE operations, and the co-ordinates
> of that region are sent to the server in the RPC call.
>
> If the server finds the reply will indeed be larger than the inline
> maximum, it plants the reply in the client memory region described by
> the request’s reply list, and repeats the co-ordinates of that region
> back to the client in the RPC reply.
>
> A server may also choose to send a small reply inline, even if the
> client provided a reply list. In that case, the server does not
> repeat the reply list in the reply, and the full reply appears
> inline.
>
> Linux registers part of the RPC reply buffer for the reply list. After
> it is received on the client, the reply payload is copied by the client
> CPU to its final destination.
>
> Inline and reply list are the mechanisms used when the upper layer
> has some processing to do to the incoming data (eg READDIR). When
> a request just needs raw data to be simply dropped off in the client’s
> memory, then the write list is preferred. A write list is basically a
> zero-copy I/O.
The term "reply list" doesn't appear in either RFC. I believe you mean
"client-posted write list" in most of the above, except this last
paragraph, which should have started with "Inline and server-posted read
list..." ?
> But these choices are fixed by the specified RPC/RDMA binding of the
> upper layer protocol (that’s what RFC 5667 is). NFS READ and READLINK
> are the only NFS operations allowed to use a write list. (NFSv4
> compounds are somewhat ambiguous, and that too needs to be addressed).
>
> As READ_PLUS conveys both kinds of data (zero-copy and data that
> might require some processing) IMO RFC 5667 does not provide adequate
> guidance about how to convey READ_PLUS. It will need to be added
> somewhere.
OK, good. I wonder how it would do this. The best the client could do,
I guess, is provide the same write list it would for a READ of the same
extent. Could the server then write just the pieces of that extent it
needs to, send the hole information inline, and leave it to the client
to do any necessary zeroing? (And is any of this worth it?)
> >>> NFSv4.2 currently does not have a binding to RPC/RDMA.
> >>
> >> Right, this means a spec update is needed. I agree with you, and
> >> it’s on our list.
> >
> > OK, so that would go in some kind of update to 5667 rather than in the
> > minor version 2 spec?
>
> The WG has to decide whether an update to 5667 or a new document will
> be the ultimate vehicle.
>
> > Discussing this in the READ_PLUS description would also seem helpful to
> > me, but OK I don't really have a strong opinion.
>
> If there is a precedent, it’s probably that the RPC/RDMA binding is
> specified in a separate document. I suspect there won’t be much
> appetite for holding up NFSv4.2 for an RPC/RDMA binding.
Alright, I guess that makes sense, thanks.
--b.
On Feb 6, 2015, at 2:35 PM, J. Bruce Fields <[email protected]> wrote:
> On Fri, Feb 06, 2015 at 01:44:15PM -0500, Chuck Lever wrote:
>>
>> On Feb 6, 2015, at 12:59 PM, J. Bruce Fields <[email protected]> wrote:
>>
>>> On Fri, Feb 06, 2015 at 12:04:13PM -0500, Chuck Lever wrote:
>>>>
>>>> On Feb 6, 2015, at 11:46 AM, Chuck Lever <[email protected]> wrote:
>>>>
>>>>>
>>>>> On Feb 6, 2015, at 11:08 AM, J. Bruce Fields <[email protected]> wrote:
>>>>>
>>>>>> On Fri, Feb 06, 2015 at 03:54:56AM -0800, Christoph Hellwig wrote:
>>>>>>> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
>>>>>>>>> The problem is that the typical case of all data won't use splice
>>>>>>>>> every with your patches as the 4.2 client will always send a READ_PLUS.
>>>>>>>>>
>>>>>>>>> So we'll have to find a way to use it where it helps. While we might be
>>>>>>>>> able to add some hacks to only use splice for the first segment I guess
>>>>>>>>> we just need to make the splice support generic enough in the long run.
>>>>>>>>>
>>>>>>>>
>>>>>>>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
>>>>>>>
>>>>>>> You could also elect to never return more than one data segment as a
>>>>>>> start:
>>>>>>>
>>>>>>> In all situations, the
>>>>>>> server may choose to return fewer bytes than specified by the client.
>>>>>>> The client needs to check for this condition and handle the
>>>>>>> condition appropriately.
>>>>>>
>>>>>> Yeah, I think that was more-or-less what Anna's first attempt did and I
>>>>>> said "what if that means more round trips"? The client can't anticipate
>>>>>> the short reads so it can't make up for this with parallelism.
>>>>>>
>>>>>>> But doing any of these for a call that's really just an optimization
>>>>>>> soudns odd. I'd really like to see an evaluation of the READ_PLUS
>>>>>>> impact on various workloads before offering it.
>>>>>>
>>>>>> Yes, unfortunately I don't see a way to make this just an obvious win.
>>>>>
>>>>> I don’t think a “win” is necessary. It simply needs to be no worse than
>>>>> READ for current use cases.
>>>>>
>>>>> READ_PLUS should be a win for the particular use cases it was
>>>>> designed for (large sparsely-populated datasets). Without a
>>>>> demonstrated benefit I think there’s no point in keeping it.
>>>>>
>>>>>> (Is there any way we could make it so with better protocol? Maybe RDMA
>>>>>> could help get the alignment right in multiple-segment cases? But then
>>>>>> I think there needs to be some sort of language about RDMA, or else
>>>>>> we're stuck with:
>>>>>>
>>>>>> https://tools.ietf.org/html/rfc5667#section-5
>>>>>>
>>>>>> which I think forces us to return READ_PLUS data inline, another
>>>>>> possible READ_PLUS regression.)
>>>>
>>>> Btw, if I understand this correctly:
>>>>
>>>> Without a spec update, a large NFS READ_PLUS reply would be returned
>>>> in a reply list, which is moved via RDMA WRITE, just like READ
>>>> replies.
>>>>
>>>> The difference is NFS READ payload is placed directly into the
>>>> client’s page cache by the adapter. With a reply list, the client
>>>> transport would need to copy the returned data into the page cache.
>>>> And a large reply buffer would be needed.
>>>>
>>>> So, slower, yes. But not inline.
>>>
>>> I'm not very good at this, bear with me, but: the above-referenced
>>> section doesn't talk about "reply lists", only "write lists", and only
>>> explains how to use write lists for READ and READLINK data, and seems to expect everything else to be sent inline.
>>
>> I may have some details wrong, but this is my understanding.
>>
>> Small replies are sent inline. There is a size maximum for inline
>> messages, however. I guess 5667 section 5 assumes this context, which
>> appears throughout RFC 5666.
>>
>> If an expected reply exceeds the inline size, then a client will
>> set up a reply list for the server. A memory region on the client is
>> registered as a target for RDMA WRITE operations, and the co-ordinates
>> of that region are sent to the server in the RPC call.
>>
>> If the server finds the reply will indeed be larger than the inline
>> maximum, it plants the reply in the client memory region described by
>> the request’s reply list, and repeats the co-ordinates of that region
>> back to the client in the RPC reply.
>>
>> A server may also choose to send a small reply inline, even if the
>> client provided a reply list. In that case, the server does not
>> repeat the reply list in the reply, and the full reply appears
>> inline.
>>
>> Linux registers part of the RPC reply buffer for the reply list. After
>> it is received on the client, the reply payload is copied by the client
>> CPU to its final destination.
>>
>> Inline and reply list are the mechanisms used when the upper layer
>> has some processing to do to the incoming data (eg READDIR). When
>> a request just needs raw data to be simply dropped off in the client’s
>> memory, then the write list is preferred. A write list is basically a
>> zero-copy I/O.
>
> The term "reply list" doesn't appear in either RFC. I believe you mean
> "client-posted write list" in most of the above, except this last
> paragraph, which should have started with "Inline and server-posted read list...” ?
No, I meant “reply list.” Definitely not read list.
The terms used in the RFCs and the implementations vary,
unfortunately, and only the read list is an actual list. The write and
reply lists are actually two separate counted arrays that are both
expressed using xdr_write_list.
Have a look at RFC 5666, section 5.2, where it is referred to as
either a “long reply” or a “reply chunk.”
>> But these choices are fixed by the specified RPC/RDMA binding of the
>> upper layer protocol (that’s what RFC 5667 is). NFS READ and READLINK
>> are the only NFS operations allowed to use a write list. (NFSv4
>> compounds are somewhat ambiguous, and that too needs to be addressed).
>>
>> As READ_PLUS conveys both kinds of data (zero-copy and data that
>> might require some processing) IMO RFC 5667 does not provide adequate
>> guidance about how to convey READ_PLUS. It will need to be added
>> somewhere.
>
> OK, good. I wonder how it would do this. The best the client could do,
> I guess, is provide the same write list it would for a READ of the same
> extent. Could the server then write just the pieces of that extent it
> needs to, send the hole information inline, and leave it to the client
> to do any necessary zeroing? (And is any of this worth it?)
Conveying large data payloads using zero-copy techniques should be
beneficial.
Since hole information could appear in a reply list if it were large,
and thus would not be inline, technically speaking, the best we can
say is that hole information wouldn’t be eligible for the write list.
--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com
On Fri, Feb 06, 2015 at 03:07:08PM -0500, Chuck Lever wrote:
>
> On Feb 6, 2015, at 2:35 PM, J. Bruce Fields <[email protected]> wrote:
>
> > On Fri, Feb 06, 2015 at 01:44:15PM -0500, Chuck Lever wrote:
> >>
> >> On Feb 6, 2015, at 12:59 PM, J. Bruce Fields <[email protected]> wrote:
> >>
> >>> On Fri, Feb 06, 2015 at 12:04:13PM -0500, Chuck Lever wrote:
> >>>>
> >>>> On Feb 6, 2015, at 11:46 AM, Chuck Lever <[email protected]> wrote:
> >>>>
> >>>>>
> >>>>> On Feb 6, 2015, at 11:08 AM, J. Bruce Fields <[email protected]> wrote:
> >>>>>
> >>>>>> On Fri, Feb 06, 2015 at 03:54:56AM -0800, Christoph Hellwig wrote:
> >>>>>>> On Thu, Feb 05, 2015 at 11:43:46AM -0500, Anna Schumaker wrote:
> >>>>>>>>> The problem is that the typical case of all data won't use splice
> >>>>>>>>> every with your patches as the 4.2 client will always send a READ_PLUS.
> >>>>>>>>>
> >>>>>>>>> So we'll have to find a way to use it where it helps. While we might be
> >>>>>>>>> able to add some hacks to only use splice for the first segment I guess
> >>>>>>>>> we just need to make the splice support generic enough in the long run.
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> I should be able to use splice if I detect that we're only returning a single DATA segment easily enough.
> >>>>>>>
> >>>>>>> You could also elect to never return more than one data segment as a
> >>>>>>> start:
> >>>>>>>
> >>>>>>> In all situations, the
> >>>>>>> server may choose to return fewer bytes than specified by the client.
> >>>>>>> The client needs to check for this condition and handle the
> >>>>>>> condition appropriately.
> >>>>>>
> >>>>>> Yeah, I think that was more-or-less what Anna's first attempt did and I
> >>>>>> said "what if that means more round trips"? The client can't anticipate
> >>>>>> the short reads so it can't make up for this with parallelism.
> >>>>>>
> >>>>>>> But doing any of these for a call that's really just an optimization
> >>>>>>> soudns odd. I'd really like to see an evaluation of the READ_PLUS
> >>>>>>> impact on various workloads before offering it.
> >>>>>>
> >>>>>> Yes, unfortunately I don't see a way to make this just an obvious win.
> >>>>>
> >>>>> I don’t think a “win” is necessary. It simply needs to be no worse than
> >>>>> READ for current use cases.
> >>>>>
> >>>>> READ_PLUS should be a win for the particular use cases it was
> >>>>> designed for (large sparsely-populated datasets). Without a
> >>>>> demonstrated benefit I think there’s no point in keeping it.
> >>>>>
> >>>>>> (Is there any way we could make it so with better protocol? Maybe RDMA
> >>>>>> could help get the alignment right in multiple-segment cases? But then
> >>>>>> I think there needs to be some sort of language about RDMA, or else
> >>>>>> we're stuck with:
> >>>>>>
> >>>>>> https://tools.ietf.org/html/rfc5667#section-5
> >>>>>>
> >>>>>> which I think forces us to return READ_PLUS data inline, another
> >>>>>> possible READ_PLUS regression.)
> >>>>
> >>>> Btw, if I understand this correctly:
> >>>>
> >>>> Without a spec update, a large NFS READ_PLUS reply would be returned
> >>>> in a reply list, which is moved via RDMA WRITE, just like READ
> >>>> replies.
> >>>>
> >>>> The difference is NFS READ payload is placed directly into the
> >>>> client’s page cache by the adapter. With a reply list, the client
> >>>> transport would need to copy the returned data into the page cache.
> >>>> And a large reply buffer would be needed.
> >>>>
> >>>> So, slower, yes. But not inline.
> >>>
> >>> I'm not very good at this, bear with me, but: the above-referenced
> >>> section doesn't talk about "reply lists", only "write lists", and only
> >>> explains how to use write lists for READ and READLINK data, and seems to expect everything else to be sent inline.
> >>
> >> I may have some details wrong, but this is my understanding.
> >>
> >> Small replies are sent inline. There is a size maximum for inline
> >> messages, however. I guess 5667 section 5 assumes this context, which
> >> appears throughout RFC 5666.
> >>
> >> If an expected reply exceeds the inline size, then a client will
> >> set up a reply list for the server. A memory region on the client is
> >> registered as a target for RDMA WRITE operations, and the co-ordinates
> >> of that region are sent to the server in the RPC call.
> >>
> >> If the server finds the reply will indeed be larger than the inline
> >> maximum, it plants the reply in the client memory region described by
> >> the request’s reply list, and repeats the co-ordinates of that region
> >> back to the client in the RPC reply.
> >>
> >> A server may also choose to send a small reply inline, even if the
> >> client provided a reply list. In that case, the server does not
> >> repeat the reply list in the reply, and the full reply appears
> >> inline.
> >>
> >> Linux registers part of the RPC reply buffer for the reply list. After
> >> it is received on the client, the reply payload is copied by the client
> >> CPU to its final destination.
> >>
> >> Inline and reply list are the mechanisms used when the upper layer
> >> has some processing to do to the incoming data (eg READDIR). When
> >> a request just needs raw data to be simply dropped off in the client’s
> >> memory, then the write list is preferred. A write list is basically a
> >> zero-copy I/O.
> >
> > The term "reply list" doesn't appear in either RFC. I believe you mean
> > "client-posted write list" in most of the above, except this last
> > paragraph, which should have started with "Inline and server-posted read list...” ?
>
> No, I meant “reply list.” Definitely not read list.
>
> The terms used in the RFCs and the implementations vary,
OK. Would you mind defining the term "reply list" for me? Google's not
helping.
--b.
> unfortunately, and only the read list is an actual list. The write and
> reply lists are actually two separate counted arrays that are both
> expressed using xdr_write_list.
>
> Have a look at RFC 5666, section 5.2, where it is referred to as
> either a “long reply” or a “reply chunk.”
>
> >> But these choices are fixed by the specified RPC/RDMA binding of the
> >> upper layer protocol (that’s what RFC 5667 is). NFS READ and READLINK
> >> are the only NFS operations allowed to use a write list. (NFSv4
> >> compounds are somewhat ambiguous, and that too needs to be addressed).
> >>
> >> As READ_PLUS conveys both kinds of data (zero-copy and data that
> >> might require some processing) IMO RFC 5667 does not provide adequate
> >> guidance about how to convey READ_PLUS. It will need to be added
> >> somewhere.
> >
> > OK, good. I wonder how it would do this. The best the client could do,
> > I guess, is provide the same write list it would for a READ of the same
> > extent. Could the server then write just the pieces of that extent it
> > needs to, send the hole information inline, and leave it to the client
> > to do any necessary zeroing? (And is any of this worth it?)
>
> Conveying large data payloads using zero-copy techniques should be
> beneficial.
>
> Since hole information could appear in a reply list if it were large,
> and thus would not be inline, technically speaking, the best we can
> say is that hole information wouldn’t be eligible for the write list.
>
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
On Feb 6, 2015, at 3:28 PM, J. Bruce Fields <[email protected]> wrote:
> On Fri, Feb 06, 2015 at 03:07:08PM -0500, Chuck Lever wrote:
>>
>> On Feb 6, 2015, at 2:35 PM, J. Bruce Fields <[email protected]> wrote:
>>>>
>>>> Small replies are sent inline. There is a size maximum for inline
>>>> messages, however. I guess 5667 section 5 assumes this context, which
>>>> appears throughout RFC 5666.
>>>>
>>>> If an expected reply exceeds the inline size, then a client will
>>>> set up a reply list for the server. A memory region on the client is
>>>> registered as a target for RDMA WRITE operations, and the co-ordinates
>>>> of that region are sent to the server in the RPC call.
>>>>
>>>> If the server finds the reply will indeed be larger than the inline
>>>> maximum, it plants the reply in the client memory region described by
>>>> the request’s reply list, and repeats the co-ordinates of that region
>>>> back to the client in the RPC reply.
>>>>
>>>> A server may also choose to send a small reply inline, even if the
>>>> client provided a reply list. In that case, the server does not
>>>> repeat the reply list in the reply, and the full reply appears
>>>> inline.
>>>>
>>>> Linux registers part of the RPC reply buffer for the reply list. After
>>>> it is received on the client, the reply payload is copied by the client
>>>> CPU to its final destination.
>>>>
>>>> Inline and reply list are the mechanisms used when the upper layer
>>>> has some processing to do to the incoming data (eg READDIR). When
>>>> a request just needs raw data to be simply dropped off in the client’s
>>>> memory, then the write list is preferred. A write list is basically a
>>>> zero-copy I/O.
>>>
>>> The term "reply list" doesn't appear in either RFC. I believe you mean
>>> "client-posted write list" in most of the above, except this last
>>> paragraph, which should have started with "Inline and server-posted read list...” ?
>>
>> No, I meant “reply list.” Definitely not read list.
>>
>> The terms used in the RFCs and the implementations vary,
>
> OK. Would you mind defining the term "reply list" for me? Google's not helping.
Let’s look at section 4.3 of RFC 5666. Each RPC/RDMA header begins
with this:

   struct rdma_msg {
      uint32 rdma_xid;      /* Mirrors the RPC header xid */
      uint32 rdma_vers;     /* Version of this protocol */
      uint32 rdma_credit;   /* Buffers requested/granted */
      rdma_body rdma_body;
   };

rdma_body starts with a uint32 which discriminates a union:

   union rdma_body switch (rdma_proc proc) {
      . . .
      case RDMA_NOMSG:
         rpc_rdma_header_nomsg rdma_nomsg;
      . . .
   };

When “proc” == RDMA_NOMSG, rdma_body is made up of three lists:

   struct rpc_rdma_header_nomsg {
      struct xdr_read_list *rdma_reads;
      struct xdr_write_list *rdma_writes;
      struct xdr_write_chunk *rdma_reply;
   };

The “reply list” is that last part: rdma_reply, which is a counted
array of xdr_rdma_segment’s.
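(For reference, the chunk types named here are defined earlier in that
section; I'm reproducing them from memory, so double-check against
RFC 5666 itself:)

   struct xdr_rdma_segment {
      uint32 handle;    /* Registered memory handle */
      uint32 length;    /* Length of the chunk in bytes */
      uint64 offset;    /* Chunk virtual address or offset */
   };

   struct xdr_write_chunk {
      struct xdr_rdma_segment target<>;
   };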
Large replies for non-NFS READ operations are sent using RDMA_NOMSG.
The RPC/RDMA header is sent as the inline portion of the message.
The RPC reply message (the part we are all familiar with) is planted
in the memory region described by rdma_reply; it’s not inline.
rdma_reply is a write chunk. The server WRITEs its RPC reply into the
memory region described by rdma_reply. That description was provided
by the client in the matching RPC call message.
--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com
On Fri, Feb 06, 2015 at 04:12:51PM -0500, Chuck Lever wrote:
>
> On Feb 6, 2015, at 3:28 PM, J. Bruce Fields <[email protected]> wrote:
>
> > On Fri, Feb 06, 2015 at 03:07:08PM -0500, Chuck Lever wrote:
> >>
> >> On Feb 6, 2015, at 2:35 PM, J. Bruce Fields <[email protected]> wrote:
> >>>>
> >>>> Small replies are sent inline. There is a size maximum for inline
> >>>> messages, however. I guess 5667 section 5 assumes this context, which
> >>>> appears throughout RFC 5666.
> >>>>
> >>>> If an expected reply exceeds the inline size, then a client will
> >>>> set up a reply list for the server. A memory region on the client is
> >>>> registered as a target for RDMA WRITE operations, and the co-ordinates
> >>>> of that region are sent to the server in the RPC call.
> >>>>
> >>>> If the server finds the reply will indeed be larger than the inline
> >>>> maximum, it plants the reply in the client memory region described by
> >>>> the request’s reply list, and repeats the co-ordinates of that region
> >>>> back to the client in the RPC reply.
> >>>>
> >>>> A server may also choose to send a small reply inline, even if the
> >>>> client provided a reply list. In that case, the server does not
> >>>> repeat the reply list in the reply, and the full reply appears
> >>>> inline.
> >>>>
> >>>> Linux registers part of the RPC reply buffer for the reply list. After
> >>>> it is received on the client, the reply payload is copied by the client
> >>>> CPU to its final destination.
> >>>>
> >>>> Inline and reply list are the mechanisms used when the upper layer
> >>>> has some processing to do to the incoming data (eg READDIR). When
> >>>> a request just needs raw data to be simply dropped off in the client’s
> >>>> memory, then the write list is preferred. A write list is basically a
> >>>> zero-copy I/O.
> >>>
> >>> The term "reply list" doesn't appear in either RFC. I believe you mean
> >>> "client-posted write list" in most of the above, except this last
> >>> paragraph, which should have started with "Inline and server-posted read list...” ?
> >>
> >> No, I meant “reply list.” Definitely not read list.
> >>
> >> The terms used in the RFCs and the implementations vary,
> >
> > OK. Would you mind defining the term "reply list" for me? Google's not helping.
>
> Let’s look at section 4.3 of RFC 5666. Each RPC/RDMA header begins
> with this:
>
> struct rdma_msg {
> uint32 rdma_xid; /* Mirrors the RPC header xid */
> uint32 rdma_vers; /* Version of this protocol */
> uint32 rdma_credit; /* Buffers requested/granted */
> rdma_body rdma_body;
> };
>
> rdma_body starts with a uint32 which discriminates a union:
>
> union rdma_body switch (rdma_proc proc) {
> . . .
> case RDMA_NOMSG:
> rpc_rdma_header_nomsg rdma_nomsg;
> . . .
> };
>
> When “proc” == RDMA_NOMSG, rdma_body is made up of three lists:
>
> struct rpc_rdma_header_nomsg {
> struct xdr_read_list *rdma_reads;
> struct xdr_write_list *rdma_writes;
> struct xdr_write_chunk *rdma_reply;
> };
>
> The “reply list” is that last part: rdma_reply, which is a counted
> array of xdr_rdma_segment’s.
>
> Large replies for non-NFS READ operations are sent using RDMA_NOMSG.
> The RPC/RDMA header is sent as the inline portion of the message.
> The RPC reply message (the part we are all familiar with) is planted
> in the memory region described by rdma_reply, it’s not inline.
>
> rdma_reply is a write chunk. The server WRITEs its RPC reply into the
> memory region described by rdma_reply. That description was provided
> by the client in the matching RPC call message.
Thanks! Gah, my apologies, obviously I didn't understand the reference
to section 5.2 before. I think I understand now....
And I'll be interested to see what we come up with for READ_PLUS case.
--b.
On Thu, Feb 05, 2015 at 11:06:04AM -0500, Anna Schumaker wrote:
> On 02/05/2015 09:13 AM, Christoph Hellwig wrote:
> >> + p = xdr_reserve_space(xdr, 4 + 8 + 4); /* content type, offset, maxcount */
> >> + if (!p)
> >> + return nfserr_resource;
> >> + xdr_commit_encode(xdr);
> >> +
> >> + maxcount = svc_max_payload(resp->rqstp);
> >> + maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
> >> + maxcount = min_t(unsigned long, maxcount, read->rd_length);
> >> +
> >> + err = nfsd4_encode_readv(resp, read, file, &maxcount);
> >
> > If the READ_PLUS implementation can't use the splice read path it's
> > probably a loss for most workloads.
> >
>
> I'll play around with the splice path, but I don't think it was designed for encoding multiple read segments.
The advantage of splice is it can hand us references to page cache pages
that already hold the read data, which we can in turn hand off to the
network layer when we send the reply, minimizing copying.
But then we need a data structure that can keep track of all the pieces
of the resulting reply, which consists of the read data plus any
protocol xdr that surrounds it.  The xdr_buf represents this
fine: head, (pages[], page_base, page_len), tail.
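(Roughly, and abridged from include/linux/sunrpc/xdr.h; the comments
are mine:)

   struct xdr_buf {
           struct kvec     head[1];   /* RPC header plus any inline data */
           struct kvec     tail[1];   /* XDR that follows the page data */

           struct page     **pages;   /* read data, e.g. spliced pages */
           unsigned int    page_base; /* offset into the first page */
           unsigned int    page_len;  /* length of the page data */

           unsigned int    buflen;    /* total buffer space available */
           unsigned int    len;       /* length of the encoded message */
   };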
If you want to do zero-copy READ_PLUS then you need a data structure
that will represent a reply with multiple data segments.
I guess you could add more fields to the xdr_buf, or leave it alone and
keep a list of xdr_buf's instead.
The latter sounds better to me, and I imagine that would be doable but
tedious--but I haven't honestly thought it through.
There's also a tradeoff on the client side, isn't there? Worst case if
the data to be read is a small hole followed by a bunch of data, then a
READ_PLUS would end up needing to copy all the data in return for only a
small savings in data transferred.
I feel like READ_PLUS needs more of an argument.
--b.
On Thu, Feb 12, 2015 at 02:59:45PM -0500, Anna Schumaker wrote:
> Today I learned all about how to use operf to identify where the bottleneck is :). It looks like the problem is in the hole zeroing code on the client side. Is there a better way than memset() to mark a page as all zeros?
clear_highpage(), but I don't expect it to make a huge difference.
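(Rough sketch of what that would look like on the client, with a
hypothetical helper name rather than anything from the client patches;
per the note above it may not beat memset() by much:)

   static void nfs_zero_hole_pages(struct page **pages, unsigned int npages)
   {
           unsigned int i;

           /* clear_highpage() kmaps, zeroes, and kunmaps each page. */
           for (i = 0; i < npages; i++)
                   clear_highpage(pages[i]);
   }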