2015-09-04 19:07:25

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v1 0/3] NFSD: Add READ_PLUS support

This is an updated posting of my NFS v4.2 READ_PLUS patches from several
months ago, and it should apply against current development trees.

I tested this code using dd to read a file between two KVMs, and I compared
the reported transfer rates against various NFS versions and the local XFS
filesystem. I tested using four 5 GB files, the first two consisting entirely
of data or entirely of holes. The others alternate between data and hole pages,
at either 4 KB (one page) or 8 KB (two page) intervals.

I found that dd uses a default block size of 512 bytes, which could cause
reads over NFS to take forever. I bumped this up to 128K during my testing
by passing the argument "bs=128K" to dd. My results are below:

| XFS | NFS v3 | NFS v4.0 | NFS v4.1 | NFS v4.2
-------|------------|------------|------------|------------|------------
Data | 2.7 GB/s | 845 MB/s | 864 MB/s | 847 MB/s | 807 MB/s
Hole | 4.1 GB/s | 980 MB/s | 1.1 GB/s | 989 MB/s | 2.3 GB/s
1 Page | 1.6 GB/s | 681 MB/s | 683 MB/s | 688 MB/s | 672 MB/s
2 Page | 2.5 GB/s | 760 MB/s | 760 MB/s | 755 MB/s | 836 MB/s

The pure data case is slightly slower on NFS v4.2, most likely due to
additional server-side lseeks while reading. The alternating 1 page
regions test didn't see as large a slowdown, most likely because of
the additional savings from not transferring zeroes over the wire. Any
slowdown caused by additional seeks is more than made up for by the time
we reach 2 page intervals.

These patches and the corresponding client changes are available in the
[read_plus] branch of

git://git.linux-nfs.org/projects/anna/linux-nfs.git

Questions? Comments? Thoughts?

Anna



Anna Schumaker (3):
NFSD: nfsd4_encode_read{v}() should encode eof and maxcount
NFSD: Add basic READ_PLUS support
NFSD: Add support for encoding multiple segments

fs/nfsd/nfs4proc.c | 16 +++++
fs/nfsd/nfs4xdr.c | 183 ++++++++++++++++++++++++++++++++++++++++++-----------
2 files changed, 161 insertions(+), 38 deletions(-)

--
2.5.1



2015-09-04 19:07:26

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v1 1/3] NFSD: nfsd4_encode_read{v}() should encode eof and maxcount

I intend to reuse nfsd4_encode_readv() for READ_PLUS, so I need an
alternate way to encode these values. I think it makes sense for
nfsd4_encode_read() to handle this in a single place for both splice and
readv paths.

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4xdr.c | 60 ++++++++++++++++++++++---------------------------------
1 file changed, 24 insertions(+), 36 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5463385..2af54fa 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3294,21 +3294,19 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc
static __be32 nfsd4_encode_splice_read(
struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
- struct file *file, unsigned long maxcount)
+ struct file *file, unsigned long *maxcount)
{
struct xdr_stream *xdr = &resp->xdr;
struct xdr_buf *buf = xdr->buf;
- u32 eof;
int space_left;
__be32 nfserr;
- __be32 *p = xdr->p - 2;

/* Make sure there will be room for padding if needed */
if (xdr->end - xdr->p < 1)
return nfserr_resource;

nfserr = nfsd_splice_read(read->rd_rqstp, file,
- read->rd_offset, &maxcount);
+ read->rd_offset, maxcount);
if (nfserr) {
/*
* nfsd_splice_actor may have already messed with the
@@ -3319,27 +3317,21 @@ static __be32 nfsd4_encode_splice_read(
return nfserr;
}

- eof = (read->rd_offset + maxcount >=
- d_inode(read->rd_fhp->fh_dentry)->i_size);
-
- *(p++) = htonl(eof);
- *(p++) = htonl(maxcount);
-
- buf->page_len = maxcount;
- buf->len += maxcount;
- xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
+ buf->page_len = *maxcount;
+ buf->len += *maxcount;
+ xdr->page_ptr += (buf->page_base + *maxcount + PAGE_SIZE - 1)
/ PAGE_SIZE;

/* Use rest of head for padding and remaining ops: */
buf->tail[0].iov_base = xdr->p;
buf->tail[0].iov_len = 0;
xdr->iov = buf->tail;
- if (maxcount&3) {
- int pad = 4 - (maxcount&3);
+ if (*maxcount&3) {
+ int pad = 4 - (*maxcount&3);

*(xdr->p++) = 0;

- buf->tail[0].iov_base += maxcount&3;
+ buf->tail[0].iov_base += *maxcount&3;
buf->tail[0].iov_len = pad;
buf->len += pad;
}
@@ -3354,21 +3346,19 @@ static __be32 nfsd4_encode_splice_read(

static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct nfsd4_read *read,
- struct file *file, unsigned long maxcount)
+ struct file *file, unsigned long *maxcount)
{
struct xdr_stream *xdr = &resp->xdr;
- u32 eof;
int v;
- int starting_len = xdr->buf->len - 8;
+ int starting_len = xdr->buf->len;
long len;
int thislen;
__be32 nfserr;
- __be32 tmp;
__be32 *p;
u32 zzz = 0;
int pad;

- len = maxcount;
+ len = *maxcount;
v = 0;

thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
@@ -3391,22 +3381,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
read->rd_vlen = v;

nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
- read->rd_vlen, &maxcount);
+ read->rd_vlen, maxcount);
if (nfserr)
return nfserr;
- xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
+ xdr_truncate_encode(xdr, starting_len + ((*maxcount+3)&~3));

- eof = (read->rd_offset + maxcount >=
- d_inode(read->rd_fhp->fh_dentry)->i_size);
-
- tmp = htonl(eof);
- write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
- tmp = htonl(maxcount);
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
-
- pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
- write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
- &zzz, pad);
+ pad = (*maxcount&3) ? 4 - (*maxcount&3) : 0;
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + *maxcount, &zzz, pad);
return 0;

}
@@ -3421,6 +3402,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
int starting_len = xdr->buf->len;
struct raparms *ra = NULL;
__be32 *p;
+ __be32 err;
+ u32 eof;

if (nfserr)
goto out;
@@ -3449,9 +3432,14 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,

if (file->f_op->splice_read &&
test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
- nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
else
- nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
+ err = nfsd4_encode_readv(resp, read, file, &maxcount);
+
+ eof = (read->rd_offset + maxcount >=
+ d_inode(read->rd_fhp->fh_dentry)->i_size);
+ *p++ = cpu_to_be32(eof);
+ *p++ = cpu_to_be32(maxcount);

if (ra)
nfsd_put_raparams(file, ra);
--
2.5.1


2015-09-04 19:07:26

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v1 2/3] NFSD: Add basic READ_PLUS support

This patch adds READ_PLUS support for both NFS4_CONTENT_DATA and
NFS4_CONTENT_HOLE segments. I keep things simple for now by only
returning one segment at a time to clients issuing the READ_PLUS call.

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4proc.c | 16 ++++++++
fs/nfsd/nfs4xdr.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 90cfda7..fe4fc24 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1849,6 +1849,16 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}

+static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ u32 maxcount = svc_max_payload(rqstp);
+ u32 rlen = min(op->u.read.rd_length, maxcount);
+ /* enough extra xdr space for encoding either a hole or data segment. */
+ u32 xdr = 5;
+
+ return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
+}
+
static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
u32 maxcount = 0, rlen = 0;
@@ -2281,6 +2291,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_DEALLOCATE",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+ [OP_READ_PLUS] = {
+ .op_func = (nfsd4op_func)nfsd4_read,
+ .op_name = "OP_READ_PLUS",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_read_plus_rsize,
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid,
+ },
[OP_SEEK] = {
.op_func = (nfsd4op_func)nfsd4_seek,
.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2af54fa..7ab4033 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1782,7 +1782,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read,
[OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
[OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
};
@@ -4128,6 +4128,116 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
#endif /* CONFIG_NFSD_PNFS */

static __be32
+nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
+ struct file *file, loff_t hole_pos)
+{
+ __be32 *p, err;
+ unsigned long maxcount;
+ struct xdr_stream *xdr = &resp->xdr;
+
+ p = xdr_reserve_space(xdr, 4 + 8 + 4);
+ if (!p)
+ return nfserr_resource;
+ xdr_commit_encode(xdr);
+
+ if (hole_pos <= read->rd_offset)
+ hole_pos = i_size_read(file_inode(file));
+
+ maxcount = svc_max_payload(resp->rqstp);
+ maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
+ maxcount = min_t(unsigned long, maxcount, read->rd_length);
+ maxcount = min_t(unsigned long, maxcount, hole_pos - read->rd_offset);
+
+ if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
+ err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
+ else
+ err = nfsd4_encode_readv(resp, read, file, &maxcount);
+
+ *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
+ p = xdr_encode_hyper(p, read->rd_offset);
+ *p++ = cpu_to_be32(maxcount);
+
+ read->rd_offset += maxcount;
+ return err;
+}
+
+static __be32
+nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *read,
+ struct file *file)
+{
+ __be32 *p;
+ unsigned long maxcount;
+ loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
+
+ if (data_pos == -ENXIO)
+ data_pos = i_size_read(file_inode(file));
+ if (data_pos <= read->rd_offset)
+ return nfsd4_encode_read_plus_data(resp, read, file, 0);
+
+ maxcount = data_pos - read->rd_offset;
+ p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
+ *p++ = cpu_to_be32(NFS4_CONTENT_HOLE);
+ p = xdr_encode_hyper(p, read->rd_offset);
+ p = xdr_encode_hyper(p, maxcount);
+
+ read->rd_offset += maxcount;
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_read *read)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file = read->rd_filp;
+ int starting_len = xdr->buf->len;
+ struct raparms *ra = NULL;
+ loff_t hole_pos;
+ __be32 *p;
+ u32 eof, segments = 0;
+
+ if (nfserr)
+ goto out;
+
+ /* eof flag, segment count */
+ p = xdr_reserve_space(xdr, 4 + 4 );
+ if (!p) {
+ nfserr = nfserr_resource;
+ goto out;
+ }
+ xdr_commit_encode(xdr);
+
+ if (read->rd_tmp_file)
+ ra = nfsd_init_raparms(file);
+
+ hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+ if (hole_pos == -ENXIO)
+ goto out_encode;
+
+ if (hole_pos == read->rd_offset)
+ nfserr = nfsd4_encode_read_plus_hole(resp, read, file);
+ else
+ nfserr = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
+ segments++;
+
+out_encode:
+ eof = (read->rd_offset >= i_size_read(file_inode(file)));
+ *p++ = cpu_to_be32(eof);
+ *p++ = cpu_to_be32(segments);
+
+ if (ra)
+ nfsd_put_raparams(file, ra);
+
+ if (nfserr)
+ xdr_truncate_encode(xdr, starting_len);
+
+out:
+ if (file)
+ fput(file);
+ return nfserr;
+}
+
+static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_seek *seek)
{
@@ -4234,7 +4344,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
[OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
- [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus,
[OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
[OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
};
--
2.5.1


2015-09-04 19:07:27

by Anna Schumaker

[permalink] [raw]
Subject: [PATCH v1 3/3] NFSD: Add support for encoding multiple segments

This patch implements sending an array of segments back to the client.
Clients should be prepared to handle multiple segment reads to make this
useful. We try to splice the first data segment into the XDR result,
and remaining segments are encoded directly.

Signed-off-by: Anna Schumaker <[email protected]>
---
fs/nfsd/nfs4proc.c | 4 ++--
fs/nfsd/nfs4xdr.c | 27 ++++++++++++++++++---------
2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index fe4fc24..0e34ae9 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1853,8 +1853,8 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
{
u32 maxcount = svc_max_payload(rqstp);
u32 rlen = min(op->u.read.rd_length, maxcount);
- /* enough extra xdr space for encoding either a hole or data segment. */
- u32 xdr = 5;
+ /* Extra xdr padding for encoding multiple segments. */
+ u32 xdr = 20;

return (op_encode_hdr_size + 2 + xdr + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7ab4033..735a4ff 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4152,12 +4152,14 @@ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp, struct nfsd4_read *r
err = nfsd4_encode_splice_read(resp, read, file, &maxcount);
else
err = nfsd4_encode_readv(resp, read, file, &maxcount);
+ clear_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);

*p++ = cpu_to_be32(NFS4_CONTENT_DATA);
p = xdr_encode_hyper(p, read->rd_offset);
*p++ = cpu_to_be32(maxcount);

read->rd_offset += maxcount;
+ read->rd_length -= maxcount;
return err;
}

@@ -4181,6 +4183,10 @@ nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp, struct nfsd4_read *r
p = xdr_encode_hyper(p, maxcount);

read->rd_offset += maxcount;
+ if (maxcount > read->rd_length)
+ read->rd_length = 0;
+ else
+ read->rd_length -= maxcount;
return nfs_ok;
}

@@ -4210,17 +4216,20 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
if (read->rd_tmp_file)
ra = nfsd_init_raparms(file);

- hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
- if (hole_pos == -ENXIO)
- goto out_encode;
+ do {
+ hole_pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+ if (hole_pos == -ENXIO)
+ break;

- if (hole_pos == read->rd_offset)
- nfserr = nfsd4_encode_read_plus_hole(resp, read, file);
- else
- nfserr = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
- segments++;
+ if (hole_pos == read->rd_offset)
+ nfserr = nfsd4_encode_read_plus_hole(resp, read, file);
+ else
+ nfserr = nfsd4_encode_read_plus_data(resp, read, file, hole_pos);
+ if (nfserr)
+ break;
+ segments++;
+ } while (read->rd_length > 0);

-out_encode:
eof = (read->rd_offset >= i_size_read(file_inode(file)));
*p++ = cpu_to_be32(eof);
*p++ = cpu_to_be32(segments);
--
2.5.1


2015-09-17 18:15:45

by J. Bruce Fields

[permalink] [raw]
Subject: Re: [PATCH v1 0/3] NFSD: Add READ_PLUS support

On Fri, Sep 04, 2015 at 03:07:18PM -0400, Anna Schumaker wrote:
> This is an updated posting of my NFS v4.2 READ_PLUS patches from several
> months ago, and it should apply against current development trees.
>
> I tested this code using dd to read a file between two KVMs, and I compared
> the reported transfer rates against various NFS versions and the local XFS
> filesystem. I tested using four 5 GB files, the first two simply entirely
> data or entirely hole. The others alternate between data and hole pages, at
> either 4 KB (one page) or 8 KB (two page) intervals.

The exports are over XFS too, right?

I seem to remember a more severe regression in the ext4 case--am I
remembering right, and is that still an issue?

My main concern is to avoid significant performance regressions.
Without that we'll be stuck having to make people manually tune use of
READ_PLUS for their specific workload, and I think we'd rather avoid
that at least for common cases.

So your numbers show about a 5% performance drop in the (probably very
common) case of a non-sparse file. I guess that on its own doesn't look
like a deal-breaker to me.

But we don't know how much of these results are noise, and if your setup
is really representative. It would be more reassuring to have:

- some information about variance of these results across runs.

- results on a wider variety of setups. (E.g at least one case
involving "real" server and client and network, not just VMs.)

- some profiling to understand exactly where the time is going
if there still appears to be any significant loss.

But this certainly seems promising.

--b.

>
> I found that dd uses a default block size of 512 bytes, which could cause
> reads over NFS to take forever. I bumped this up to 128K during my testing
> by passing the argument "bs=128K" to dd. My results are below:
>
> | XFS | NFS v3 | NFS v4.0 | NFS v4.1 | NFS v4.2
> -------|------------|------------|------------|------------|------------
> Data | 2.7 GB/s | 845 MB/s | 864 MB/s | 847 MB/s | 807 MB/s
> Hole | 4.1 GB/s | 980 MB/s | 1.1 GB/s | 989 MB/s | 2.3 GB/s
> 1 Page | 1.6 GB/s | 681 MB/s | 683 MB/s | 688 MB/s | 672 MB/s
> 2 Page | 2.5 GB/s | 760 MB/s | 760 MB/s | 755 MB/s | 836 MB/s
>
> The pure data case is slightly slower on NFS v4.2, most likely due to
> additional server-side lseeks while reading. The alternating 1 page
> regions test didn't see as large a slowdown, most likely because of to
> the additional savings by not transferring zeroes over the wire. Any
> slowdown caused by additional seeks is more than made up for by the time
> we reach 2 page intervals.
>
> These patches and the corresponding client changes are available in the
> [read_plus] branch of
>
> git://git.linux-nfs.org/projects/anna/linux-nfs.git
>
> Questions? Comments? Thoughts?
>
> Anna
>
>
>
> Anna Schumaker (3):
> NFSD: nfsd4_encode_read{v}() should encode eof and maxcount
> NFSD: Add basic READ_PLUS support
> NFSD: Add support for encoding multiple segments
>
> fs/nfsd/nfs4proc.c | 16 +++++
> fs/nfsd/nfs4xdr.c | 183 ++++++++++++++++++++++++++++++++++++++++++-----------
> 2 files changed, 161 insertions(+), 38 deletions(-)
>
> --
> 2.5.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html